PyTopCoder/TopCoderParser.py at master · codersasha/PyTopCoder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
from topcoder_common import *
from Problem import *

# the text that defines a problem is missing
MISSING_PROBLEM_TEXT = u"Problem Statement not available."

class TopCoderParser(object):
    """The class that performs all of the parsing for the TopCoder pages.
    Generates Problem objects from HTML pages."""

    ## init ##
    def __init__(self, html):
        """Creates a new parser object, using the HTML given."""
        # save as soup
        self.soup = BeautifulSoup(html)

    ## private object methods ##
    def _is_missing_problem(self):
        """Checks if this page contains a problem that is missing.
        Returns True if it contains the 'problem missing' text, False if not."""
        if self.soup.find("td", {"class": "problemText"}).getText() == MISSING_PROBLEM_TEXT:
            return True
        return False

    def _get_header(self, text):
        """Returns a reference to the soup header tag that contains the given text,
        or None if the header was not found.
        Looks for h3 tags, since TopCoder headers use this format."""
        return self.soup.find("h3", text=re.compile(text))

    def _scrape_piece(self, piece):
        """Scrape the HTML for a particular piece, returning the piece on success.
        The pieces correspond to the keys in the object dictionary."""

        if piece == P_PROBLEM_NUMBER:
            # cannot get the problem number from just the HTML
            return None

        elif piece == P_PROBLEM_NAME:
            # get problem name (without HTML)
            problem_name_text = self.soup.find("td", {"class": "statTextBig"}).getText()
            return re.findall("Problem statement for (.+)", problem_name_text, re.DOTALL | re.IGNORECASE | re.MULTILINE)[0]

        elif piece == P_PROBLEM_STATEMENT:
            # get problem statement (with HTML)
            problem_statement_tag = self.soup.find("td", {"class": "problemText"}).findAll("td", {"class": "statText"})[2]
            return extract_html(problem_statement_tag)

        elif piece == P_PROBLEM_DEFINITION:
            # get problem definition (without HTML)
            definitions_header = self._get_header('Definition')
            definition = dict(EMPTY_DEFINITIONS_DICT)
            if definitions_header:
                definitions_table = definitions_header.parent.parent.parent.nextSibling.find("table")
                class_row, method_row, params_row, returns_row, signature_row, ensure_public_row = definitions_table.findAll("tr")
                definition['class'] = class_row.findAll("td")[1].text
                definition['method'] = method_row.findAll("td")[1].text
                definition['types']['input'] = params_row.findAll("td")[1].text.split(', ')
                definition['types']['output'] = returns_row.findAll("td")[1].text

                # parse signature
                signature = signature_row.findAll("td")[1].text
                parts = re.findall("(.+?) (.+?)\((.+?)\)", signature)[0]
                definition['names']['input'] = [x.split()[-1] for x in parts[2].split(', ')]

            return definition

        elif piece == P_PROBLEM_CONSTRAINTS:
            # get constraints (with HTML)
            constraints_header = self._get_header('Constraints')
            if constraints_header:
                constraints = []
                constraint_bullets = constraints_header.parent.parent.parent.findAllNext("td", text="-")
                for bullet in constraint_bullets:
                    constraints.append(extract_html(bullet.parent.parent.findAll("td")[1]))
                return constraints
            else:
                return None

        elif piece == P_PROBLEM_EXAMPLES:
            examples_header = self._get_header("Examples")
            examples = []
            if examples_header:
                examples_numbers = examples_header.parent.parent.parent.findAllNext("td", text=re.compile("^\d+\)$"))
                for number in examples_numbers:
                    new_example = dict(EMPTY_EXAMPLE_DICT)
                    example_table = number.parent.parent.nextSibling.find("table")

                    # get input (without HTML)
                    params_table = example_table.findAll("tr")[0].find("table")
                    new_example['input'] = [eval_variable(x.getText().strip()) for x in params_table.findAll("td")]

                    # get output (without HTML)
                    returns_row = example_table.findAll("tr")[1 + len(new_example['input'])]
                    new_example['output'] = eval_variable(re.findall("Returns: (.+)", returns_row.getText(), re.DOTALL | re.IGNORECASE | re.MULTILINE)[0].strip())

                    # get comment (with HTML)
                    comments_row = example_table.findAll("tr")[2 + len(new_example['input'])]
                    new_example['comment'] = extract_html(comments_row.find("td"))

                    # save example
                    examples.append(new_example)
            return examples

        elif piece == P_SUBMISSION_LISTING_LINK:
            contest_link = self.soup.find("a", {"href": re.compile("/tc\?module=ProblemDetail&.+")})['href']
            return 'http://community.topcoder.com' + contest_link

        elif piece == P_SUBMISSION_LINK:
            submission_link = self.soup.find("a", {"href": re.compile("(/stat\?c=problem_solution&.+)|(/tc\?module=HSProblemSolution&.+)")})
            if submission_link:
                return 'http://community.topcoder.com' + submission_link['href']
            return None

        elif piece == P_PROBLEM_TESTS:
            # get the system tests (no HTML) - can only happen on a submission page
            tests = []
            test_inputs = self.soup.findAll("td", {"class": "statText", "align": "left"})
            for i in range(len(test_inputs)):
                new_test = {'input': [], 'output': None}

                # parse test input
                test_input_cell = test_inputs[i]
                new_test['input'] = [eval_variable(x.strip()) for x in test_input_cell.getText().split(',\n')]

                # extract test output
                test_output_cell = test_inputs[i].parent.findAll("td")[3]
                new_test['output'] = eval_variable(test_output_cell.getText().strip())

                # save test
                tests.append(new_test)
            return tests

        else:
            # not recognised
            return None

    def _scrape_pieces(self, pieces, problem = None):
        """Scrapes the given pieces and saves them to the given problem object,
        or, if none is given, creates a new problem object.

        Returns the updated problem object on success."""

        if problem == None:
            problem = Problem()

        # get each piece
        for piece in pieces:
            result = self._scrape_piece(piece)
            problem[piece] = result

        return problem

    ## public object methods ##
    def parse_problem_page(self, problem = None):
        """Parses the problem page, returning the new Problem object generated
        from the process (or updating a given one).

        The following pieces are updated:
        P_PROBLEM_NAME
        P_PROBLEM_STATEMENT
        P_PROBLEM_DEFINITION
        P_PROBLEM_CONSTRAINTS
        P_PROBLEM_EXAMPLES
        P_SUBMISSION_LISTING_LINK
        """

        # first check if the problem is OK
        if self._is_missing_problem():
            return None

        return self._scrape_pieces([P_PROBLEM_NAME,
            P_PROBLEM_STATEMENT,
            P_PROBLEM_DEFINITION,
            P_PROBLEM_CONSTRAINTS,
            P_PROBLEM_EXAMPLES,
            P_SUBMISSION_LISTING_LINK], problem)

    def parse_submission_listing_page(self, problem = None):
        """Parses the problem submission listing page, returning the new Problem
        object generated (or updating a given one).

        The following pieces are updated:
        P_SUBMISSION_LINK
        """

        return self._scrape_pieces([P_SUBMISSION_LINK], problem)

    def parse_submission_page(self, problem = None):
        """Parses the problem submission page, returning the new Problem object
        generated (or updating a given one).

        The following pieces are updated:
        P_PROBLEM_TESTS
        """

        return self._scrape_pieces([P_PROBLEM_TESTS], problem)

## helper functions ##
def scrape_problem(n, opener = None):
    """Attempts to scrape the TopCoder problem with ID n from the website.
    If given an opener, attempts to use it, otherwise connects to TopCoder.
    On success, returns a new Problem object."""

    if opener == None:
        print "Connecting to TopCoder...",
        opener = connect_to_topcoder()
        print "OK"

    # load problem page and scrape problem
    print "Loading problem page...",
    problem_page_html = get_topcoder_problem_page(opener, n)
    problem = TopCoderParser(problem_page_html).parse_problem_page()

    if problem == None:
        print "Problem does not exist."
        return None
    print "OK"

    # load submission listing page and scrape submission link
    print "Loading submission listing page...",
    if problem[P_SUBMISSION_LISTING_LINK]:
        submission_listing_page_html = open_page(opener, problem[P_SUBMISSION_LISTING_LINK])
        problem = TopCoderParser(submission_listing_page_html).parse_submission_listing_page(problem)
        print "OK"

        print "Loading submission page...",
        if problem[P_SUBMISSION_LINK]:
            # load submission page and scrape tests
            submission_page_html = open_page(opener, problem[P_SUBMISSION_LINK])
            problem = TopCoderParser(submission_page_html).parse_submission_page(problem)
            print "OK"
        else:
            print "WARNING: Problem has no submissions."
    else:
        print "WARNING: Problem was not used in any competitions."

    # remove the links
    del problem[P_SUBMISSION_LISTING_LINK]
    del problem[P_SUBMISSION_LINK]

    # save the problem number
    problem[P_PROBLEM_NUMBER] = n

    # done!
    return problem