-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathparse.py
More file actions
281 lines (235 loc) · 9.56 KB
/
parse.py
File metadata and controls
281 lines (235 loc) · 9.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
##### parse.py
##### Parsing Script converting HADCrut3 Climate Data to GEOJSON
##### AUTHOR: Teoman (Ted) Yavuzkurt
##### www.github.com/teomandavid
##### www.teomandavid.com
##### IMPORTS
import os, re, sys, time, geojson, argparse, subprocess
##### CONSTANTS
# NOTE: these can be set via command line arguments
# But these are good defaults (so you can just run python3 parse.py)
# NOTE: climate anomaly data is available from 1850 to 2009
# So this is a good range.
# NOTE(review): this whole group of defaults is assigned again further down the
# file (just above init()); those later assignments are the values in effect
# when init() runs.
START_YEAR = 1850           # year to start outputting data
END_YEAR = 2010             # year to end data output (non-inclusive)
VERBOSE = False             # print extra info to command line
PRETTY_PRINT = True         # pretty print JSON output (was assigned twice; once is enough)
PATH = "./climate-data"     # directory containing climate data set
OUTPATH = "./output/"       # directory to store output, will be created
OUTPREFIX = ""              # filename prefix. Will append .json if needed
LIMIT = 25000               # maximum # of stations to parse (>6000 == all of them)
AVERAGE_TEMPS = True        # averages missing data points
MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
##### REGEX -- precompiled for speed
# Raw string literals are used for every pattern containing a backslash so that
# regex escapes such as \s and \d reach the regex engine untouched instead of
# being read as (invalid) Python string escapes. The patterns themselves are
# unchanged.
# Matches the "Key= value" header fields we want to capture
reHeaders = re.compile(r"(Number|Name|Country|Lat|Long|Height|Start\syear|End\syear)=\s*(.*)")
# Matches values that start like a number (optional minus signs, then digits)
reNum = re.compile(r"\-*\d+.*")
# Matches a four-digit year followed by the rest of the line
reYear = re.compile(r"(\d{4})\s+(.*)")
# Matches one temperature reading followed by the rest of the line
# NOTE(review): the character between the two digit runs is an unescaped '.',
# so it matches any character, not only a decimal point.
reTemp = re.compile(r"(\-*\d+.\d+)\s*(.*)")
# Matches the word "Obs:" (placed before temp readings)
reObs = re.compile("Obs:")
# Matches names that start with a digit (i.e. names of files) -- prevents matching DS_STORE etc
reInclude = re.compile("[0-9]+")
##### MAIN CODE
##### I've segmented these into functions
##### So it's clearer what each part is doing
##### Not adding a lot of comments in this file as it's pretty self explanatory
##### Could've done this with Pandas.
# init():
# Arguments: None
# Description: Parses script arguments and starts parsing
# Called on startup
# Default settings read (and possibly overwritten) by init().
# NOTE(review): these repeat the defaults declared near the top of the file;
# being assigned last, the values here are the ones init() starts from.
START_YEAR = 1850           # year to start outputting data
END_YEAR = 2010             # year to end data output (non-inclusive)
VERBOSE = False             # print extra info to command line
PRETTY_PRINT = True         # pretty print JSON output (was assigned twice; once is enough)
PATH = "./climate-data/"    # directory containing climate data set
OUTPATH = "./output/"       # directory to store output, will be created
OUTPREFIX = ""              # filename prefix. Will append .json if needed
LIMIT = 25000               # maximum # of stations to parse (>6000 == all of them)
AVERAGE_TEMPS = True        # averages missing data points
MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
def init():
    """Read the command line, overwrite the module-level settings, start parsing.

    Each option defaults to the current value of the matching module-level
    constant. Once the settings are stored, parseData() is called.
    """
    global LIMIT, VERBOSE, START_YEAR, END_YEAR, PRETTY_PRINT, OUTPATH, PATH, AVERAGE_TEMPS

    cli = argparse.ArgumentParser(description='Parse climate data set into GEOJson.')
    cli.add_argument(
        '-l', type=int, nargs=1, default=[LIMIT],
        help='Max # of stations to process. (>6000 == all) Default: ' + str(LIMIT))
    cli.add_argument(
        '-v', action="store_true", default=VERBOSE,
        help='Print extra info during parsing. Default: ' + str(VERBOSE))
    cli.add_argument(
        '-p', type=int, nargs=2, default=[START_YEAR, END_YEAR],
        help="Period for output. Default: " + str(START_YEAR) + ", " + str(END_YEAR))
    # NOTE(review): -a and -f are store_true flags whose default is already
    # True with the defaults declared in this file, so passing them cannot
    # switch the setting off.
    cli.add_argument(
        '-a', action="store_true", default=AVERAGE_TEMPS,
        help="Average missing data points or not. Default: " + str(AVERAGE_TEMPS))
    cli.add_argument(
        '-f', action="store_true", default=PRETTY_PRINT,
        help="Format JSON output nicely. Default: " + str(PRETTY_PRINT))
    cli.add_argument(
        '-o', type=str, nargs=1, default=[OUTPATH],
        help="Directory for output. Default: " + OUTPATH)
    cli.add_argument(
        '-i', type=str, nargs=1, default=[PATH],
        help="Directory containing climate data. Default: " + PATH)

    options = vars(cli.parse_args())
    period = options['p']

    # nargs=1 options arrive as one-element lists, hence the [0]
    LIMIT = options['l'][0]
    VERBOSE = options['v']
    START_YEAR = period[0]
    END_YEAR = period[1]
    AVERAGE_TEMPS = options['a']
    PRETTY_PRINT = options['f']
    OUTPATH = options['o'][0]
    PATH = options['i'][0]

    parseData()
# parseData():
# Arguments: None
# Description: Calls other parsing functions
def parseData():
    """Parse all station files, then write one JSON file per month plus a headers file."""
    stations = parseFiles()

    # one output file per calendar month; the month index doubles as the
    # offset passed down to stationsToFeatures()
    for monthIndex in range(12):
        monthName = MONTHS[monthIndex]
        print("Parsing month: " + monthName)
        outputJson(stationsToFeatures(stations, monthIndex), monthName)

    # a final file carrying only the station header properties
    headerFeatures = stationsToFeatures(stations, None, True)
    outputJson(headerFeatures, "headers")
def parseFiles():
    """Parse every station file returned by getAllFilenames().

    Returns:
        A list of station dicts. Files for which parseFile() returns a falsy
        value (rejected stations) are left out.
    """
    filenames = getAllFilenames()
    stations = []
    # time.clock() was removed in Python 3.8; perf_counter() is the documented
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    for filename in filenames:
        station = parseFile(filename)
        if station:
            stations.append(station)
            printv("PARSE COMPLETE, file: %s\t station %20s\t start %i\t end %i" % (filename[-6:], station['Name'], station['Start year'], station['End year']))
    print("%i files parsed in %i seconds" % (len(stations), time.perf_counter() - start))
    return stations
def getAllFilenames():
    """Collect the paths of station files found under PATH.

    Only files whose name starts with a digit (reInclude) are kept, and at
    most LIMIT paths are returned. The limit is checked right after each
    append, so the result never exceeds LIMIT (previously LIMIT + 1 files
    could be returned).

    Returns:
        A list of file paths, each joined with the directory it was found in.
    """
    filenames = []
    if LIMIT <= 0:
        return filenames
    for root, dirs, files in os.walk(PATH, topdown=False):
        for name in files:
            if reInclude.match(name):
                filenames.append(os.path.join(root, name))
                if len(filenames) >= LIMIT:
                    return filenames
    return filenames
def parseFile(filename):
    """Parse one station file into a station dict.

    Header lines matched by reHeaders are stored under their field name.
    Once the "Obs:" marker line is seen, every following line is handed to
    parseTemp() and its readings are appended to station['temperatures'].
    padTemperatures() is called when the marker is seen and again after the
    last line.

    Args:
        filename: path of the station file to read.

    Returns:
        The station dict, or an empty list when validStation() rejects it.
    """
    station = {}
    temperatures = []
    station['temperatures'] = temperatures
    # are we capturing the actual data yet
    capturing = False
    # the with-block closes the file even if a line fails to parse
    with open(filename) as stationFile:
        for line in stationFile:
            if reObs.match(line):
                capturing = True
                padTemperatures(station)
                continue
            if capturing:
                temperatures.extend(parseTemp(line))
            else:
                # reuse the match object rather than running the regex twice
                header = reHeaders.match(line)
                if header:
                    station[header.group(1)] = parseHeader(header.group(2))
    padTemperatures(station, True)
    return station if validStation(station) else []
# filter out bad station data
def validStation(station):
    """Return True when the station has coordinates inside the valid range.

    A station is valid when |Lat| <= 90 and |Long| <= 180. A station whose
    'Lat' or 'Long' entry is missing or not numeric is reported as invalid
    instead of raising.
    """
    try:
        return abs(station['Lat']) <= 90 and abs(station['Long']) <= 180
    except (KeyError, TypeError):
        # missing header, or a header value that did not parse as a number
        return False
def parseHeader(string):
    """Convert a raw header value to a number when possible, else title-case it.

    reNum only checks that the value starts like a number, so a value such as
    "100 mile house" passes the check but cannot be converted; such values
    fall back to the title-cased string instead of raising ValueError.
    """
    if reNum.match(string):
        try:
            return num(string)
        except ValueError:
            # numeric-looking prefix only; treat the value as text
            pass
    return string.title()
def num(s):
    """Convert s to an int if it parses as one, otherwise to a float.

    Raises:
        ValueError: if s parses as neither an int nor a float.
    """
    try:
        whole = int(s)
    except ValueError:
        # not an integer literal; let float() have the final say
        return float(s)
    else:
        return whole
def padTemperatures(station, padEnd = False):
    """Append -99.0 placeholder readings (12 per year) to station['temperatures'].

    With padEnd False the padded span runs from START_YEAR up to the
    station's 'Start year'; with padEnd True it runs from the station's
    'End year' up to END_YEAR. Nothing is appended when the span is empty.
    """
    if padEnd:
        firstYear, lastYear = station['End year'], END_YEAR
    else:
        firstYear, lastYear = START_YEAR, station['Start year']

    if lastYear > firstYear:
        for _ in range(firstYear, lastYear):
            # one placeholder per month of the missing year
            station['temperatures'].extend([-99.0] * 12)
def parseTemp(line):
    """Extract the 12 monthly readings from one data line.

    Args:
        line: a data line made of a four-digit year followed by readings.

    Returns:
        A list of 12 floats when the line's year lies in
        [START_YEAR, END_YEAR), otherwise an empty list. END_YEAR is
        documented as non-inclusive, so later years are dropped here as
        well. A line with no year on it (e.g. a blank line) also yields an
        empty list.

    Raises:
        AttributeError: if an in-range line holds fewer than 12 readings
            (reTemp finds no further match).
    """
    temps = []
    yearMatch = reYear.search(line)
    if yearMatch is None:
        # nothing to read on this line
        return temps
    year = int(yearMatch.group(1))
    line = yearMatch.group(2)
    if START_YEAR <= year < END_YEAR:
        for i in range(0, 12):
            temp = reTemp.search(line)
            temps.append(float(temp.group(1)))
            # continue with whatever follows the reading just consumed
            line = temp.group(2)
    return temps
def ifelse(ddict, key, default):
    """Return ddict[key] when key is present in ddict, otherwise default."""
    if key in ddict.keys():
        return ddict[key]
    return default
def interpolate(temps, index):
    """Estimate the missing reading at temps[index] from its neighbours.

    The two adjacent readings (index - 1, index + 1) are tried first; if
    either is unavailable, the readings 12 positions away on each side are
    tried. A neighbour is usable when its position lies inside the list and
    its value is not the -99 missing-data marker. Position 0 counts as a
    usable neighbour (it was previously excluded by a `> 0` check).

    Args:
        temps: list of readings, with -99 marking missing values.
        index: position of the reading to estimate.

    Returns:
        The mean of the chosen pair rounded to one decimal, or -99 when
        neither pair is usable.
    """
    def usable(pos):
        # the explicit lower bound keeps negative positions from wrapping around
        return 0 <= pos < len(temps) and temps[pos] != -99

    # average first by surrounding months
    # if not possible, average by last year's reading and next year's
    for rng in (1, 12):
        before, after = index - rng, index + rng
        if usable(before) and usable(after):
            return round((temps[before] + temps[after]) / 2, 1)

    # data points missing for both surrounding months and same month in surrounding years
    return -99
def stationToGeojson(station, month, headersOnly = False):
    """Build a geojson Feature for one station.

    The geometry is a Point at (negated 'Long', 'Lat'); the properties come
    from generateProperties() called with the same month and headersOnly.
    """
    # negative Longitude in data is East, in MapBox it's West
    flippedLongitude = -1 * station['Long']
    location = geojson.Point((flippedLongitude, station['Lat']))
    feature = geojson.Feature(
        geometry=location,
        properties=generateProperties(station, month, headersOnly),
    )
    printv("FEATURE ENCODING COMPLETE, id: %s\t station: %20s\t" % (station['Number'], station['Name']))
    return feature
def generateProperties(station, month, headersOnly = False):
    """Build the properties dict attached to a station's feature.

    Args:
        station: station dict holding header fields and 'temperatures'.
        month: offset (0-11) of the month within each 12-reading year;
            not used when headersOnly is True.
        headersOnly: when True, return only the station's header fields.

    Returns:
        With headersOnly, a dict of id/name/country/elevation/start_year/
        end_year, using a default for each absent header. Otherwise a dict
        mapping the year offset (as a string, "0" for the first year) to
        that year's reading for the given month. When AVERAGE_TEMPS is set,
        a -99 reading is first replaced in station['temperatures'] by
        interpolate()'s result.
    """
    if headersOnly:
        return {
            # time.clock() was removed in Python 3.8 and this default is
            # evaluated on every call; perf_counter() is the documented
            # replacement.
            "id"        : ifelse(station, 'Number', time.perf_counter()),
            "name"      : ifelse(station, 'Name', ""),
            "country"   : ifelse(station, 'Country', ""),
            "elevation" : ifelse(station, 'Height', 0.0),
            "start_year": ifelse(station, 'Start year', START_YEAR),
            "end_year"  : ifelse(station, 'End year', END_YEAR)
        }
    temps = station['temperatures']
    properties = {}
    # month == offset
    for yearStart in range(0, len(temps), 12):
        tempIndex = yearStart + month
        # >= (not >): an index equal to the length is already out of range,
        # which happens when the last year is incomplete
        if tempIndex >= len(temps):
            break
        if temps[tempIndex] == -99 and AVERAGE_TEMPS:
            temps[tempIndex] = interpolate(temps, tempIndex)
        properties[str(yearStart // 12)] = temps[tempIndex]
    return properties
def stationsToFeatures(stations, month, headersOnly = False):
    """Convert every station to a geojson feature, keeping the input order."""
    return [stationToGeojson(entry, month, headersOnly) for entry in stations]
def outputJson(features, suffix = ""):
    """Write the features as a GeoJSON FeatureCollection to a .json file.

    The file is named OUTPREFIX + suffix + ".json" inside OUTPATH. The path
    is built with os.path.join so an OUTPATH given without a trailing slash
    still names a directory, and the directory is only created when the
    path actually has one.

    Args:
        features: list of geojson features to serialise.
        suffix: text placed after OUTPREFIX in the file name.
    """
    print("OUTPUTING JSON DATA")
    geoData = geojson.FeatureCollection(features)
    printv("GeoJSON FeatureCollection Created")
    filename = os.path.join(OUTPATH, OUTPREFIX + str(suffix) + ".json")
    directory = os.path.dirname(filename)
    if directory:
        # makedirs("") raises, so skip it when writing to the current directory
        os.makedirs(directory, exist_ok=True)
    printv("Created Directory for " + filename)
    extraArgs = {"indent":4, "separators":(',', ': ')} if PRETTY_PRINT else {}
    rawJson = geojson.dumps(geoData, sort_keys=True, **extraArgs)
    printv("Raw JSON Generated. Writing to file " + filename)
    with open(filename, 'w') as outfile:
        outfile.write(rawJson)
    print("SUCCESSFULLY WROTE JSON FILE: " + filename)
# printv():
# Arguments: Any number of string parameters
# Description: Prints the arguments if VERBOSE is enabled
def printv(*args):
    """Forward the arguments to print(), but only when VERBOSE is set."""
    if not VERBOSE:
        return
    print(*args)
# Script entry point: this call runs whenever the module is executed (or
# imported), because it is not wrapped in a __main__ guard.
init()