"""
Module: process.py
For various processing tasks of contract data.
Definitions:
Activity- A DoD technical term that I'm currently (mis)using to refer to
subgroups within the DoD, e.g. Army, Navy, Defense Information
Systems Agency, Missile Defense Agency, etc.
"""
##
# Download a series of contract announcements for contracts worth more than
# $6.5 million.
#
# Data taken from http://www.defense.gov/contracts/
#
# contracts start at number 391 (October 07, 1994)
#
import os
from bs4 import BeautifulSoup
from collections import defaultdict
from datetime import datetime
from fuzzywuzzy import fuzz
from joblib import Parallel, delayed
from json import dumps
from re import findall
from urllib2 import urlopen
def download_announcements(first_id=1842, last_id=5442, save_dir="data/html/"):
    """
    Download contract announcements for a range of contract IDs (inclusive)
    from http://www.defense.gov/Contracts.
    """
    url_root = 'http://www.defense.gov/contracts/contract.aspx?contractid='
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # download newest first because that's handier; first_id - 1 keeps the
    # descending range inclusive of first_id
    ids = [str(valid_num) for valid_num in range(last_id, first_id - 1, -1)]
    for id_ in ids:
        url = url_root + id_
        page = urlopen(url)
        html = page.read()
        save_file = save_dir + id_ + ".html"
        with open(save_file, 'w') as file_:
            file_.write(html)
        print "Downloaded ", url
def make_agency_dict(html_file, save_file=None):
    """
    Parse the defense.gov contract html.
    """
    soup = BeautifulSoup(open(html_file, 'r'))
    # the first paragraph says "CONTRACTS"
    pars = soup.find_all("p")[1:]
    contract_dict = defaultdict(list)
    # Not all IDs in the range actually have data. For example,
    # http://www.defense.gov/Contracts/Contract.aspx?ContractID=5052
    # redirects to the defense.gov homepage
    if "defense.gov" not in soup.title.get_text():
        # one case
        # (http://www.defense.gov/Contracts/Contract.aspx?ContractID=5067)
        # was found to not have a first agency, only "contracts", so use
        # "contracts" as the default
        branch = "contracts"
        # newer contracts use bolded, centered paragraphs for the agency
        try:
            for par in pars:
                # contracting branches are styled (with centering)
                if 'style' in par.attrs.keys() or 'align' in par.attrs.keys():
                    branch = par.get_text().strip().lower()
                    contract_dict[branch] = []
                else:
                    text = par.get_text().strip()
                    if text:
                        contract_dict[branch].append(text)
        # older ones use h3 tags
        except UnboundLocalError as err:
            print err.message
            # we need to iterate over all the elements in the contracts div
            contracts_div = soup.find_all(class_="PressOpsContentBody")[1]
            for elem in contracts_div.descendants:
                if elem.name == 'h3':
                    branch = elem.get_text().strip().lower()
                    if branch:
                        contract_dict[branch] = []
                elif elem.name == 'p' and branch:
                    text = elem.get_text().strip()
                    if text:
                        contract_dict[branch].append(text)
    date = ','.join([s.strip() for s in
                     soup.title.get_text().strip().split(',')[-2:]])
    # key the entire dictionary on the date; we'll use it for building
    # time series
    contract_dict = {"date": date, "contracts": contract_dict}
    if save_file:
        with open(save_file, 'w+') as file_:
            file_.write(dumps(contract_dict))
    return contract_dict
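# The returned dict looks like (illustrative values):
#     {"date": "October 07, 1994",
#      "contracts": {"army": ["<announcement text>", ...],
#                    "navy": [...]}}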
def make_agency_dicts(html_dir="data/html", output_json_dir="data/json"):
    """
    Use ``make_agency_dict`` to clean the html and save the json format to
    file, keyed by defense agency.
    """
    if not os.path.isdir(output_json_dir):
        os.makedirs(output_json_dir)
    if html_dir[-1] != '/':
        html_dir += '/'
    input_files = [html_dir + f for f in os.listdir(html_dir)
                   if os.path.isfile(html_dir + f)
                   and not f.startswith('.')]
    if output_json_dir[-1] != '/':
        output_json_dir += '/'
    for file_ in input_files:
        print file_
        basename = os.path.basename(file_).split('.')[0] + '.json'
        make_agency_dict(file_, output_json_dir + basename)
def unzip_fgdc():
    """
    Unzip FGDC archives from ``data/archive/`` into per-year directories
    under ``data/fgdc/``, inferring the year from each file name.
    """
    to_dir = "data/fgdc/"
    from_dir = "data/archive/"
    for file_ in os.listdir(from_dir):
        full_to_dir = to_dir + findall("20[01][0-9]", file_)[0]
        if not os.path.isdir(full_to_dir):
            os.makedirs(full_to_dir)
        os.popen("unzip " + from_dir + "'" + file_ + "'" +
                 " -d " + full_to_dir)
def count_by_activity(contract_json):
    """
    Given a full JSON contracts dictionary, return a new dictionary of counts
    by activity.
    """
    by_activity = contract_json['contracts']
    activities = by_activity.keys()
    return dict(zip(activities, [len(by_activity[a]) for a in activities]))
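# For example (hypothetical input), given
#     {"date": "...", "contracts": {"army": [blob1, blob2], "navy": [blob3]}}
# count_by_activity returns {"army": 2, "navy": 1}.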
def make_contract_jsons(contract_blob, date, division):
    """
    Parse a day's worth of contracts in JSON format into a list of
    dictionaries, where every single contract announcement (blob) is parsed
    and made into a row of a dataframe.
    Returns: list (most will be len 1) of parsed-out JSON strings.
    """
    # Parse into sentences; the first sentence is the company announcement.
    # Split on ';' because we sometimes have multiple companies separated by
    # ';'. If multiple, create multiple contract_jsons. TODO: handle multiple
    # dollar amounts and the percentage per company; this minimal sketch just
    # attaches the total amount to every company found.
    from nltk import sent_tokenize
    blob_lines = sent_tokenize(contract_blob)
    company_roots = _extract_company_roots(blob_lines[0])
    total_dollars = _extract_amount(blob_lines)
    contract_jsons = []
    for roots in company_roots:
        contract_jsons.append(dumps({"date": date,
                                     "division": division,
                                     "company": ''.join(roots).strip(),
                                     "amount": total_dollars}))
    return contract_jsons
##########################################
# Helper functions for make_contract_jsons
#
# Parameters: blob_lines
#     A single announcement, a child of division, grandchild of date,
#     assumed split into lines by sentence with nltk sent_tokenize.
##########################################
def _extract_company_roots(blob):
    """
    Find all companies and return a list of lists of their 'roots'.
    For example, "Lockheed Martin Aerospace, LTD" would result in
    ["Lockheed ", "Martin ", "Aerospace"]. Other steps, like lowercasing and
    stripping, are done when we build the normalized list of companies; so if
    we had "Lockheed Martin Fire and Missile", we would match it to
    "Lockheed Martin Aerospace", giving both the normalized name
    "Lockheed Martin".
    """
    assert isinstance(blob, basestring), \
        "Error, must pass a string blob, not %s" % str(type(blob))
    semicol_split = blob.split(';')
    to_comma = [el.split(',')[0] for el in semicol_split]
    get_company_roots = lambda x: findall(r"[A-Z][a-zA-Z.]*\s*", x.strip())
    company_roots = map(get_company_roots, to_comma)
    return company_roots
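# e.g. _extract_company_roots(
#     "Lockheed Martin Corp., Bethesda; Raytheon Co., Waltham")
# returns [["Lockheed ", "Martin ", "Corp."], ["Raytheon ", "Co."]]
# (note the trailing spaces captured by the regex).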
#: Regex to extract, e.g., $343,444,400 from a string
find_dollars = \
    lambda x: findall(r"\$[0-9]{1,3},[0-9]{1,3},[0-9]{1,3}[,0-9]*", x)
#: Convert a find_dollars-found string to an integer
doll_to_int = lambda x: int(x.replace('$', '').replace(',', ''))
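# e.g. find_dollars("a $6,500,000 award") returns ["$6,500,000"], and
# doll_to_int("$6,500,000") returns 6500000.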
def _extract_amount(blob_lines):
    "Extract dollar amounts from blob lines. If multiple, sum them."
    assert type(blob_lines) is list
    string_dollars = reduce(list.__add__, map(find_dollars, blob_lines))
    total_dollars = reduce(int.__add__,
                           [doll_to_int(sd) for sd in string_dollars])
    return total_dollars
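# e.g. _extract_amount(["awarded $6,500,000.", "plus $1,000,000."])
# returns 7500000.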
class ContractRow(object):
    """
    A container for the information gleaned from a contract announcement;
    a contract row contains the fields set below.
    """
    def __init__(self, row_dict):
        # dict key order is not guaranteed, so compare keys as sets
        assert set(row_dict.keys()) == set(["date", "division", "company",
                                            "related_ids", "amount",
                                            "pct_shared"])
        assert type(row_dict["date"]) is datetime
        self.date = row_dict["date"]
        self.division = row_dict["division"]
        self.company = row_dict["company"]
        self.related_ids = row_dict["related_ids"]
        self.amount = row_dict["amount"]
        self.pct_shared = row_dict["pct_shared"]
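# Example construction (hypothetical values):
#     ContractRow({"date": datetime(1994, 10, 7), "division": "navy",
#                  "company": "lockheed martin", "related_ids": [],
#                  "amount": 6500000, "pct_shared": 1.0})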
def normalize_company_list(company_list):
    """
    Transform the list of companies to be normalized, meaning companies with
    important words in common are mapped to a common name when they match to
    a high enough degree.
    """
    strip_names = [n.strip().lower() for n in company_list]
    for i, cur_name in enumerate(strip_names):
        match_idxs = []
        for j, next_name in enumerate(strip_names):
            if fuzz.ratio(cur_name, next_name) > 75:
                match_idxs.append(j)
        # can improve make_normalized_name as necessary
        normalized = make_normalized_name([company_list[j]
                                           for j in match_idxs])
        for j in match_idxs:
            company_list[j] = normalized
    return company_list
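# For example (hypothetical input),
#     normalize_company_list(["Lockheed Martin Aerospace",
#                             "Lockheed Martin Aero"])
# gives both entries the same (for now crude, concatenated) normalized name
# via make_normalized_name below.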
def make_normalized_name(matches):
    """
    Create a single normalized name for all the names for which a match has
    been found.
    """
    # TODO make a smarter version of this (e.g. longest common word prefix);
    # for now fall back on the first match, or the first two joined
    if len(matches) == 1:
        return matches[0]
    else:
        return ' '.join(matches[:2])
def make_csv_contracts():
    """
    Make a csv string representing the fully-processed JSONs.
    """
    # TODO: implement once make_contract_jsons is complete
    pass
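# A minimal sketch of the serialization make_csv_contracts might do, assuming
# the processed rows come in as ContractRow objects with the six fields
# defined above. The helper name and column order are placeholders, not part
# of the original module.
def _csv_from_contract_rows(rows):
    """Serialize an iterable of ContractRow objects to a CSV string."""
    header = "date,division,company,related_ids,amount,pct_shared"
    lines = [header]
    for row in rows:
        lines.append(','.join([row.date.strftime("%Y-%m-%d"),
                               row.division,
                               row.company,
                               '|'.join(row.related_ids),
                               str(row.amount),
                               str(row.pct_shared)]))
    return '\n'.join(lines)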