-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetadata.py
More file actions
218 lines (194 loc) · 10 KB
/
metadata.py
File metadata and controls
218 lines (194 loc) · 10 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
from lxml import html
import requests
from selenium import webdriver
from datetime import datetime
import re
import csv
# Start URL Provided in documentation for assignment
START_URL = "https://www.occ.gov/topics/licensing/interpretations-and-actions/index-interpretations-and-actions.html"
# Each table type keeps two parallel flat lists:
#   *_topic_* lists interleave [letter_no, topic, letter_no, topic, ...]
#   date_href_* lists interleave [date, href, date, href, ...]
# get_common_table_data() appends to them; output_to_csv() reads them
# two entries at a time to build each CSV row.
# Interpretive letter data for the Letter No and the Topic
letter_no_topic_interpretive_letter = []
# Data for all the interpretive letters date and href links
date_href_interpretive_letter = []
# Corporate decisions data for the Letter No and the Topic
letter_no_topic_corporate_decisions = []
# Corporate decisions data letters date and href links
date_href_corporate_decisions = []
# Letter topic approvals data for the Letter No and the Topic
letter_no_topic_approvals_with_conditions_enforceable = []
# Letter topic approvals data for the Letter date and href links
date_href_approvals_with_conditions_enforceable = []
# Topic CRA decisions data for the Letter No and the Topic
letter_no_topic_cra_decision = []
# Topic CRA decisions data for the date and href links
date_href_cra_decision = []
# Letter No. Charters
letter_no_topic_charters = []
# Date and href links for charters
date_href_charters = []
def output_to_csv():
    '''
    Write every scraped section to 'filename.csv'.

    Each section is emitted as a title row, a header row, then one row per
    letter.  The module-level lists are flat and interleaved two-wide
    ([letter_no, topic, ...] and [date, href, ...]), so rows are built by
    stepping through both lists in strides of 2.
    '''
    # (title, letter/topic list, date/href list) for each section, in the
    # same order the original code wrote them.
    sections = (
        ("Interpretations and Actions",
         letter_no_topic_interpretive_letter, date_href_interpretive_letter),
        ("Corporate Decisions",
         letter_no_topic_corporate_decisions, date_href_corporate_decisions),
        ("Approvals with Conditions Enforceable under 12 U.S.C. 1818",
         letter_no_topic_approvals_with_conditions_enforceable,
         date_href_approvals_with_conditions_enforceable),
        ("CRA Decisions",
         letter_no_topic_cra_decision, date_href_cra_decision),
        ("Charters with standard conditions",
         letter_no_topic_charters, date_href_charters),
    )
    # newline='' per the csv module docs; the original 'wb' mode is
    # Python-2-only and makes csv.writer raise TypeError on Python 3.
    with open('filename.csv', 'w', newline='') as myfile:
        wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
        for title, letter_topic, date_href in sections:
            wr.writerow([title])
            wr.writerow(('Letter No.', 'Topic', 'Date', 'href'))
            for i in range(0, len(letter_topic), 2):
                wr.writerow((letter_topic[i], letter_topic[i + 1],
                             date_href[i], date_href[i + 1]))
def get_dates(dates, month_year, table_data):
    '''
    Extract date strings from scraped table text.

    Appends every date found to ``dates`` (mutated in place, re-formatted
    through datetime so the output shape is normalized) and returns the
    cells that should be kept as letter/topic data.

    :param dates: output list that receives the formatted date strings
    :param month_year: page identifier: 'march2010', 'may1996', or any
                       other value for the generic (August 2001) layout
    :param table_data: flat list of text fragments scraped from a table
    :return: ``table_data`` unchanged for 'march2010'/'may1996';
             otherwise ``table_data`` with stand-alone date cells removed
    '''
    if month_year == 'march2010':
        # Dates appear inline; collect them but keep every cell.
        for value in table_data:
            match = re.search(r'\d{2}/\d{2}/\d{4}', value)
            if match:
                date = datetime.strptime(match.group(), '%m/%d/%Y').date()
                dates.append(date.strftime('%m/%d/%Y'))
        return table_data
    elif month_year == 'may1996':
        # This page uses two-digit years.  (The original code or'ed two
        # identical patterns together; the second operand was dead code.)
        for value in table_data:
            match = re.search(r'\d{2}/\d{2}/\d{2}', value)
            if match:
                date = datetime.strptime(match.group(), '%m/%d/%y').date()
                dates.append(date.strftime('%m/%d/%y'))
        return table_data
    else:
        # Generic layout: short cells that are just a date move to `dates`;
        # longer cells keep their text and have the date extracted below.
        # \d{1,2} covers both the \d{2}/... and \d{1}/... patterns the
        # original filter accepted.
        date_pattern = re.compile(r'\d{1,2}/\d{2}/\d{4}')
        data_tables = []
        for value in table_data:
            if date_pattern.search(value):
                if len(value) > 11:
                    data_tables.append(value)
                else:
                    dates.append(value)
            else:
                data_tables.append(value)
        for value in data_tables:
            match = date_pattern.search(value)
            if match:
                # BUG FIX: the original second pass only matched two-digit
                # months, silently dropping dates like 1/02/2010 that the
                # filter above had accepted (desynchronizing dates/hrefs).
                date = datetime.strptime(match.group(), '%m/%d/%Y').date()
                dates.append(date.strftime('%m/%d/%Y'))
        return data_tables
def get_common_table_data(path, type, month_year):
    '''
    Scrape one letters table and append its rows to the module-level lists.

    Reads the module-global ``tree`` (set in __main__ before each call --
    the ``tree`` name here is NOT a parameter), extracts the text cells
    under ``path``, strips layout junk, splits the remainder into
    letter/topic cells plus parallel (date, href) pairs, and extends the
    pair of global lists selected by ``type``.

    :param path: xpath of the table to scrape
    :param type: 'interpretive', 'corporate', 'approvals', 'cra', or
                 'charters' -- selects which global lists receive the rows
    :param month_year: page identifier forwarded to get_dates()
    '''
    table_data = tree.xpath(path + '//text()')
    # Drop header labels and whitespace-only cells the xpath picks up.
    # One filtering pass replaces the original chain of O(n^2)
    # while/remove loops; the set of removed values is identical.
    junk = {'Topic', 'Letter No.', ' (PDF)', 'WORD',
            '\n', '\n\n', '\r\n', '\r\n\r\n', '\r\n\r\n\r\n'}
    table_data = [cell for cell in table_data if cell not in junk]
    # Replace non-ASCII characters with spaces so the CSV write is safe.
    table_data = [''.join(ch if ord(ch) < 128 else ' ' for ch in cell)
                  for cell in table_data]
    # get_dates() fills `dates` and returns the remaining letter/topic cells.
    dates = []
    table_data = get_dates(dates, month_year, table_data)
    # Collect the hyperlinks; some archive pages have none, in which case
    # a placeholder is paired with every date.
    ahref_interpretives = tree.xpath(path + '//a/@href')
    if not ahref_interpretives:
        # typo fix: placeholder used to read 'No link prvoided'
        ahref_interpretives = ['No link provided'] * len(dates)
    # Interleave as [date, href, date, href, ...] to mirror the
    # [letter, topic, ...] layout expected by output_to_csv().
    date_href = []
    for date, ahref_interpretive in zip(dates, ahref_interpretives):
        date_href.append(date)
        date_href.append(ahref_interpretive)
    # Route the scraped rows to the global lists for this table type.
    if type == 'interpretive':
        letter_no_topic_interpretive_letter.extend(table_data)
        date_href_interpretive_letter.extend(date_href)
    elif type == 'corporate':
        letter_no_topic_corporate_decisions.extend(table_data)
        date_href_corporate_decisions.extend(date_href)
    elif type == 'approvals':
        letter_no_topic_approvals_with_conditions_enforceable.extend(table_data)
        date_href_approvals_with_conditions_enforceable.extend(date_href)
    elif type == 'cra':
        letter_no_topic_cra_decision.extend(table_data)
        date_href_cra_decision.extend(date_href)
    elif type == 'charters':
        letter_no_topic_charters.extend(table_data)
        date_href_charters.extend(date_href)
def get_may_nineteen_ninetysix_data(tree):
    """Scrape the three letters tables on the May 1996 archive page.

    ``tree`` is accepted for symmetry with the other page scrapers, but
    get_common_table_data() reads the module-global ``tree`` directly.
    """
    tables = (
        ('//*[@id="maincontent"]/table[1]', 'interpretive'),
        ('//*[@id="maincontent"]/table[2]', 'corporate'),
        ('//*[@id="maincontent"]/table[3]', 'approvals'),
    )
    for xpath, table_type in tables:
        get_common_table_data(xpath, table_type, 'may1996')
def get_august_twenty_one_data(tree):
    """Scrape the five letters tables on the August 2001 archive page.

    ``tree`` is accepted for symmetry with the other page scrapers, but
    get_common_table_data() reads the module-global ``tree`` directly.
    """
    tables = (
        ('/html/body/table[2]/tr/td[2]/table/tr/td/table[1]', 'interpretive'),
        ('/html/body/table[2]/tr/td[2]/table/tr/td/table[2]', 'cra'),
        ('/html/body/table[2]/tr/td[2]/table/tr/td/table[3]', 'corporate'),
        ('/html/body/table[2]/tr/td[2]/table/tr/td/table[4]', 'approvals'),
        ('/html/body/table[2]/tr/td[2]/table/tr/td/table[5]', 'charters'),
    )
    for xpath, table_type in tables:
        get_common_table_data(xpath, table_type, 'august2001')
def get_march_twenty_ten_data(tree):
    """Scrape the three letters tables on the March 2010 page.

    ``tree`` is accepted for symmetry with the other page scrapers, but
    get_common_table_data() reads the module-global ``tree`` directly.
    """
    tables = (
        ('//*[@id="maincontent"]/table[1]', 'interpretive'),
        ('//*[@id="maincontent"]/table[2]', 'corporate'),
        ('//*[@id="maincontent"]/table[3]', 'approvals'),
    )
    for xpath, table_type in tables:
        get_common_table_data(xpath, table_type, 'march2010')
if __name__ == '__main__':
    '''
    Works as the main script and controls the actions taken and functions called to
    retrieve all of the data required
    '''
    # Drive a real browser to navigate the accordion UI on the index page.
    # NOTE(review): find_element_by_xpath was removed in Selenium 4; as
    # written this script needs Selenium 3.x or older.
    # Go to start url page given in the project assignment
    driver = webdriver.Firefox()
    driver.get(START_URL)
    # click on expand all a class
    expand_all = driver.find_element_by_xpath('//*[@id="maincontent"]/div[2]/p/a[1]')
    expand_all.click()
    # with expand all clicked we need to click on All Interpretations and Actions
    all_interpretations_actions = driver.find_element_by_xpath('//*[@id="ui-accordion-2-panel-0"]/p/a')
    all_interpretations_actions.click()
    # we will store this current url because it has all the links we need
    # NOTE(review): 'arhieve_url' looks like a typo for 'archive_url' and
    # the value is never read again -- confirm before removing.
    arhieve_url = driver.current_url
    # lets go to the 2010 link
    twenty_ten_link = driver.find_element_by_xpath('//*[@id="maincontent"]/div/table/tbody/tr/td[1]/ul/li[8]/a')
    twenty_ten_link.click()
    # lets click on the march link
    march_link = driver.find_element_by_xpath('//*[@id="maincontent"]/ul/li[10]/a')
    march_link.click()
    # get the html and request data content on the page
    # (re-fetched with requests so lxml can parse the static HTML; the
    # module-global `tree` set here is what get_common_table_data() reads)
    page = requests.get(driver.current_url)
    tree = html.fromstring(page.content)
    # get the required data needed
    get_march_twenty_ten_data(tree)
    # for faster results we'll stop using selenium now
    page = requests.get('https://www.occ.gov/static/interpretations-and-precedents/aug01/intaug01.html')
    tree = html.fromstring(page.content)
    # get all the august data
    get_august_twenty_one_data(tree)
    # get all the data for may
    page = requests.get('https://www.occ.gov/topics/licensing/interpretations-and-actions/1996/interpretations-and-actions-may-1996.html')
    tree = html.fromstring(page.content)
    get_may_nineteen_ninetysix_data(tree)
    # output the csv required
    output_to_csv()
    # close the firefox driver
    # NOTE(review): driver.quit() would also shut down the geckodriver
    # process; close() only closes the window -- confirm intent.
    driver.close()