-
Notifications
You must be signed in to change notification settings - Fork 4.2k
/
weather.py
255 lines (187 loc) · 7.98 KB
/
weather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
########################################################################
#
# Functions for downloading and re-sampling weather-data
# for 5 cities in Denmark between 1980-2018.
#
# The raw data was obtained from:
#
# National Climatic Data Center (NCDC) in USA
# https://www7.ncdc.noaa.gov/CDO/cdoselect.cmd
#
# Note that the NCDC's database functionality may change soon, and
# that the CSV-file needed some manual editing before it could be read.
# See the function _convert_raw_data() below for inspiration if you
# want to convert a new data-file from NCDC's database.
#
# Implemented in Python 3.6
#
# Usage:
# 1) Set the desired storage directory in the data_dir variable.
# 2) Call maybe_download_and_extract() to download the data-set
# if it is not already located in the given data_dir.
# 3) Either call load_original_data() or load_resampled_data()
# to load the original or resampled data for use in your program.
#
# Format:
# The raw data-file from NCDC is not included in the downloaded archive,
# which instead contains a cleaned-up version of the raw data-file
# referred to as the "original data". This data has not yet been resampled.
# The original data-file is available as a pickled file for fast reloading
# with Pandas, and as a CSV-file for broad compatibility.
#
########################################################################
#
# This file is part of the TensorFlow Tutorials available at:
#
# https://github.com/Hvass-Labs/TensorFlow-Tutorials
#
# Published under the MIT License. See the file LICENSE for details.
#
# Copyright 2018 by Magnus Erik Hvass Pedersen
#
########################################################################
import pandas as pd
import os
import download
########################################################################
# Directory where you want to download and save the data-set.
# Set this before you start calling any of the functions below.
# The path-functions below join this directory with the file-names
# every time they are called, so they always use the current value.
data_dir = "data/weather-denmark/"
# Full path for the pickled data-file. (Original data).
def path_original_data_pickle():
    """
    Return the full path of the pickled file with the original data.
    The file is located inside the directory given by data_dir.
    """
    filename = "weather-denmark.pkl"
    return os.path.join(data_dir, filename)
# Full path for the comma-separated text-file. (Original data).
def path_original_data_csv():
    """
    Return the full path of the comma-separated text-file with the
    original data. The file is located inside the directory given
    by data_dir.
    """
    filename = "weather-denmark.csv"
    return os.path.join(data_dir, filename)
# Full path for the resampled data as a pickled file.
def path_resampled_data_pickle():
    """
    Return the full path of the pickled file with the resampled data.
    The file is located inside the directory given by data_dir.
    """
    filename = "weather-denmark-resampled.pkl"
    return os.path.join(data_dir, filename)
# URL for the data-set on the internet.
# This points to a tar.gz archive and is passed on to the download-module
# by the function maybe_download_and_extract() below.
data_url = "https://github.com/Hvass-Labs/weather-denmark/raw/master/weather-denmark.tar.gz"
# List of the cities in this data-set. These are cities in Denmark.
# The names match the ones returned by _usaf_to_city() below, and
# load_resampled_data() uses them both to select each city's data
# and as the keys when joining the resampled data.
cities = ['Aalborg', 'Aarhus', 'Esbjerg', 'Odense', 'Roskilde']
########################################################################
# Private helper-functions.
def _date_string(x):
    """
    Convert two integers to a string for the date and time.

    :param x:
        Sequence-like object whose first value is the date as an
        integer (example: 19801231) and whose second value is the
        time as an integer (example: 1230). This can be a tuple, a
        list, or a row of a Pandas data-frame as passed by
        DataFrame.apply(..., axis=1).

    :return:
        String with the date followed by the time zero-padded to
        four digits. Example: "198012311230"

    :raises IndexError:
        If x holds fewer than two values.
    """
    # Read the values by iteration instead of x[0] and x[1].
    # When x is a row of a data-frame it is a Pandas Series indexed by
    # the column-names, and looking up integer keys by position in such
    # a Series is deprecated in newer versions of Pandas.
    values = list(x)

    date = values[0]  # Date. Example: 19801231
    time = values[1]  # Time. Example: 1230

    return "{0}{1:04d}".format(date, time)
def _usaf_to_city(usaf):
    """
    Translate the USAF-code of a weather-station to its city-name.

    The raw data-file identifies the weather-stations by USAF-codes.
    If you use another data-set from NCDC then this function must be
    changed to hold the USAF-codes found in that data-file.

    A KeyError is raised if the USAF-code is not in the table.
    """
    usaf_codes = (60300, 60700, 60800, 61200, 61700)
    city_names = ('Aalborg', 'Aarhus', 'Esbjerg', 'Odense', 'Roskilde')

    # Pair each USAF-code with the city-name in the same position.
    lookup = dict(zip(usaf_codes, city_names))

    city = lookup[usaf]
    return city
def _convert_raw_data(path):
    """
    Convert a raw data-file from the NCDC database into the
    "original data" used by the rest of this module.

    The converted data-frame is indexed by city-name and date-time,
    and it is saved both as a pickled file and as a CSV-file at the
    paths given by path_original_data_pickle() and
    path_original_data_csv(). The data-frame is also returned.

    You can use this function as inspiration for converting another
    raw data-file from NCDC, but it must then be modified to match
    that data. You may also have to manually edit the raw data-file,
    e.g. because its header is not in a proper comma-separated format.
    """

    # Markers that the raw CSV-file uses for "not-available" (NA).
    # (This is one of several oddities with NCDC's file-format.)
    missing_markers = ['999', '999.0', '999.9', '9999.9']

    # Load the comma-separated file with Pandas. The file's header
    # may need manual editing for this to load correctly.
    raw = pd.read_csv(path, sep=',', header=1,
                      index_col=False, na_values=missing_markers)

    # New data-frame that only holds the data we are interested in.
    converted = pd.DataFrame()

    # City-name / weather-station name looked up from the USAF code.
    # Note that the column-names of the raw data are matched exactly
    # as written here, including trailing white-space.
    converted['City'] = raw['USAF '].apply(_usaf_to_city)

    # Combine the integer date and time of each row into a string
    # and parse it into a proper date-time object.
    stamps = raw[['Date ', 'HrMn']].apply(_date_string, axis=1)
    converted['DateTime'] = pd.to_datetime(stamps, format='%Y%m%d%H%M')

    # Copy the measurements using more descriptive column-names.
    renamed_columns = (
        ('Temp', 'Temp '),
        ('Pressure', 'Slp '),
        ('WindSpeed', 'Spd '),
        ('WindDir', 'Dir'),
    )
    for new_name, raw_name in renamed_columns:
        converted[new_name] = raw[raw_name]

    # Use the city-name and date-time as the index.
    converted = converted.set_index(['City', 'DateTime'])

    # Save as a pickle for fast reloading, and as a CSV-file
    # for general readability.
    converted.to_pickle(path_original_data_pickle())
    converted.to_csv(path_original_data_csv())

    return converted
def _resample(df):
    """
    Resample the contents of a Pandas data-frame to regular
    60-minute intervals.

    This is done by first removing rows where all values are missing,
    then up-sampling to 1-minute intervals and interpolating the
    missing values, and finally down-sampling to 60-minute intervals.
    Note that only empty rows are removed, empty columns are kept.

    :param df:
        Pandas data-frame with a date-time index, which is required
        by the resample-function in Pandas.

    :return:
        New Pandas data-frame with the resampled data.
    """

    # Remove all empty rows.
    df_res = df.dropna(how='all')

    # Upsample so the time-series has data for every minute.
    # The alias 'min' is used for minutes because the alias 'T'
    # is deprecated in newer versions of Pandas.
    df_res = df_res.resample('1min')

    # Fill in missing values, weighted by the time between samples.
    df_res = df_res.interpolate(method='time')

    # Downsample so the time-series has data for every hour.
    df_res = df_res.resample('60min')

    # Finalize the resampling. This is necessary because resample()
    # only returns a Resampler-object, so a method must be called on
    # it to get a data-frame again.
    df_res = df_res.interpolate()

    # Remove all empty rows.
    df_res = df_res.dropna(how='all')

    return df_res
########################################################################
# Public functions that you may call to download the data-set from
# the internet and load the data into memory.
def maybe_download_and_extract():
    """
    Make sure the weather-data is available in the data_dir.

    The work is delegated to the download-module, which is given
    the URL of the data-set and the directory to store it in.
    """
    download.maybe_download_and_extract(download_dir=data_dir,
                                        url=data_url)
def load_original_data():
    """
    Load and return the original data from its pickled file.

    The original data has not been resampled. It is not the raw
    data obtained from NCDC, but the cleaned-up version that is
    written by the function _convert_raw_data() above.
    """
    pickle_path = path_original_data_pickle()

    # NOTE: Loading a pickled file can execute arbitrary code,
    # so only load data-files from a source that you trust.
    return pd.read_pickle(pickle_path)
def load_resampled_data():
    """
    Load and return the resampled weather-data.

    The data has one group of columns for each city, with
    data-points at regular 60-minute intervals where missing
    data has been linearly interpolated.

    A cache-file is used for saving and quickly reloading the data,
    so the original data is only resampled when the cache-file
    does not exist.
    """

    # Path for the cache-file with the resampled data.
    cache_path = path_resampled_data_pickle()

    # Reload the cache-file if it exists, then we are done.
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)

    # Otherwise the original data must be loaded and resampled.
    original = load_original_data()

    # Separate data-frame for each city, taken from the original data.
    per_city = list(map(original.xs, cities))

    # Resample the data for each city.
    per_city_resampled = list(map(_resample, per_city))

    # Join the resampled data into a single data-frame that only
    # holds the time-steps which exist for all the cities.
    resampled = pd.concat(per_city_resampled, keys=cities,
                          axis=1, join='inner')

    # Save the resampled data in the cache-file for quick reloading.
    resampled.to_pickle(cache_path)

    return resampled
########################################################################