#!/usr/bin/python
'''
This is a web crawler for Chain-API (https://github.com/ResEnv/chain-api).
It is based on ChainCrawler (see that file), but instead of being designed
to crawl continuously, it simply finds and returns the URI of a resource
without continuing to crawl. No ZMQ, no Queue/Async support.
There are three main find modes: find_first quits and returns the URI of the
first match. find_degrees_all does an exhaustive breadth-first search
of x degrees and, when complete, returns a list of all matches.
find_create_link does a similar exhaustive search and returns a create link
for a particular type of object related to the starting resource.
'''
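
# Example usage (a minimal sketch; the entry point URL, namespace, and
# resource names below are illustrative and depend on the Chain deployment):
#
#   searcher = ChainSearch('http://learnair.media.mit.edu:8000/')
#   rels = 'http://learnair.media.mit.edu:8000/rels/'
#   first = searcher.find_first(namespace=rels, resource_type='site')
#   nearby = searcher.find_degrees_all(namespace=rels,
#                                      resource_type='device', degrees=2)
#   create = searcher.find_create_link(namespace=rels,
#                                      resource_type='sensor', degrees=1)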
from timeDecaySet import TimeDecaySet
from globalConfig import log
import re
import time
import requests


class ChainSearch(object):
    def __init__(self, entry_point='http://learnair.media.mit.edu:8000/',
                 crawl_delay=1000, filter_keywords=['previous','next']):
        #entry_point = starting URL for the search
        #crawl_delay = how long, in ms, to wait before accessing/crawling
        #   a new resource
        #filter_keywords = link relations to skip while searching, added
        #   to the defaults below
self.entry_point = entry_point #entry point URI
#initialize crawl variables
self.current_uri = entry_point #keep track of current location
self.current_uri_type = 'entry_point'
self.crawl_delay = crawl_delay #in milliseconds
self.degrees = 0
self.return_if_found = False
self.createform_type = None
self.found_resources = TimeDecaySet(0)
#initialize filter word list for crawling
self.filter_keywords = ['edit','create','self','curies','websocket']
        self.filter_keywords.extend(filter_keywords)
log.debug( "filter keywords %s", self.filter_keywords)
log.info( "-----------------------------------------------" )
log.info( "Crawler Initialized." )
log.info( "Entry Point: %s", self.entry_point )
log.info( "-----------------------------------------------" )

    def reinit(self):
self.current_uri = self.entry_point #keep track of current location
self.current_uri_type = 'entry_point'
self.degrees = 0
self.return_if_found = False
self.createform_type = None
self.found_resources = TimeDecaySet(0)

    @staticmethod
    def apply_hal_curies(json, del_curies=True):
        '''Find and apply CURIES relationship shortcuts (namespace/rel
        definitions) to the other links in the json object. E.g., if we
        have a CURIES "http://learnair.media.mit.edu/rels/{rel}" with name
        "ch", and a link further in called 'ch:sites', remove the CURIES
        part of the object and apply it so that 'ch:sites' becomes
        "http://learnair.media.mit.edu/rels/sites". del_curies tells this
        function whether to remove the CURIES section of _links after
        applying it to the document (True), or whether to leave it in
        (False).'''
try:
curies = json['_links']['curies'] #find the curies.
for curie in curies: #compare each curies name...
                for key in json['_links'].keys(): #copy of keys; dict changes below
#if we find a link relation that uses the curies
if (key.startswith(curie['name'] + ':')):
#combine the curies & key to make the full resource link
newIndex = curie['href']
replaceString = key.split(curie['name'] + ':',1)[1]
newIndex = re.sub(r"\{.*\}", replaceString, newIndex)
#move the resource to the full resource link
json['_links'][newIndex] = json['_links'][key]
del json['_links'][key]
log.debug( 'CURIES: %s moved to %s', key, newIndex )
#delete curies section of json if desired
if del_curies:
del json['_links']['curies']
log.debug( 'CURIES: CURIES Resource applied fully & removed.' )
        except (KeyError, TypeError):
            #no CURIES section (or malformed _links); keep any real links,
            #but make sure _links is a dict so later code can iterate it
            log.warn( "CURIES: No CURIES found" )
            if not isinstance(json.get('_links'), dict):
                json['_links'] = {}
return json
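
    # For illustration (a hypothetical document, not taken from a live API):
    # apply_hal_curies({'_links': {
    #         'curies': [{'name': 'ch',
    #                     'href': 'http://learnair.media.mit.edu/rels/{rel}'}],
    #         'ch:sites': {'href': '/sites/'}}})
    # returns {'_links': {
    #         'http://learnair.media.mit.edu/rels/sites': {'href': '/sites/'}}}
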
@staticmethod
def pluralize_resource_name(resource_name, namespace=""):
return [namespace + resource_name + 's', namespace + resource_name + 'es']
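
    # e.g. pluralize_resource_name('device', 'ns/') -> ['ns/devices',
    # 'ns/devicees']; both naive candidates are generated, and candidates
    # that match nothing are simply never found
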
    def flatten_filter_link_array(self, req_links):
        ''' takes a JSON _links object (after CURIES have been applied, if
        desired) and handles HAL 'items' collections and other links by
        flattening them into a single list. Each list element is a dict
        with the fields 'href' (the actual crawlable link), 'type' (a link
        associated with the type at the other end of the link),
        'from_item_list' (True if the resource was part of an 'items'
        collection), and 'title' (a unique name for the resource on the
        other end of the link).
        'from_item_list' is required because collections inherit the type of
        the link above them, which is likely plural, even though the items
        themselves are singular. There is no generalizable way to go from a
        plural resource name to a singular one, so 'from_item_list' tells us
        to accept the pluralized version of the type as indicative of the
        found resource.
        '''
crawl_links=[]
#formulate and push link items to crawl_links array from json
for key, item in req_links.iteritems():
#first handle 'item' links
if key == 'items':
for items_item in item:
#inherit 'type' from previous crawl step
try:
items_item['type'] = self.current_uri_type
except:
log.error('Cannot inherit type information of list from previous crawl')
items_item['type'] = 'UNKNOWN'
items_item['from_item_list'] = True
crawl_links.append(items_item)
#now filter out links we don't want and push the rest
            elif not any(substring in key.lower()
                         for substring in self.filter_keywords):
if item is not None:
                    item['type'] = key
item['from_item_list'] = False
crawl_links.append(item)
else:
                    log.warn(' EXTRACT_LINK: nonetype link detected in'
                             ' resource %s', key)
return crawl_links
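
    # For illustration (hypothetical input, after CURIES expansion):
    # flatten_filter_link_array({
    #         'items': [{'href': '/devices/7', 'title': 'dev7'}],
    #         'site': {'href': '/sites/1', 'title': 'site 1'}})
    # -> [{'href': '/devices/7', 'title': 'dev7',
    #      'type': self.current_uri_type, 'from_item_list': True},
    #     {'href': '/sites/1', 'title': 'site 1', 'type': 'site',
    #      'from_item_list': False}]
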
    def query_link_array(self, crawl_links):
        '''takes a crawl_links array (which has links and types of objects)
        and decides which of those links match what was queried for. Returns
        a list of URIs for the matched resources (de-duplication against
        already-found resources happens in push_uris_to_queue).'''
if self.qry_resource_type is not None:
log.info('SEARCH_LIST: looking for singular: %s', self.qry_resource_type)
log.info('SEARCH_LIST: looking for plural as item_list: %s', self.qry_resource_plural)
if self.qry_resource_title is not None:
log.info('SEARCH_LIST: looking for title: %s', self.qry_resource_title)
matching_uris = []
#(1) if resource name exists, filter items to get only items that
#match the singular resource name, AND (things that match the plural
#resource name && are from_item_list)
#(2) if title exists, filter items remaining for those that match the title
for link_item in crawl_links:
log.debug('SEARCH_LIST: checking if %s matches query criteria', link_item['href'])
this_link_item_matches = True
#see if it matches resource_type, if queried for
if self.qry_resource_type is not None:
                if ((any(link_item['type'].lower() in x
                         for x in self.qry_resource_plural)
                     and link_item['from_item_list'])
                        or (link_item['type'].lower() == self.qry_resource_type)):
                    #it does!
                    #for createForms, double check that the parent is correct
if ('createform' == link_item['type'].lower() and self.createform_type is not None):
if (self.current_uri_type.lower() not in self.createform_type):
this_link_item_matches = False
else:
log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
else:
log.info('SEARCH_LIST: matched search_type %s', link_item['type'])
else:
#it doesn't, but we're searching on resource_type
this_link_item_matches = False
#see if it matches resource_title, if queried for
if self.qry_resource_title is not None:
                if (link_item.get('title', '').lower() == self.qry_resource_title):
#it does!
log.info('SEARCH_LIST: matched search_title %s', link_item['title'])
else:
#it doesn't, but we're searching on resource_title
this_link_item_matches = False
#if we made it to here and this_link_item_matches, it's a match!
if this_link_item_matches:
matching_uris.append(link_item['href'])
#return list of matching uris
return matching_uris

    def push_uris_to_queue(self, uris):
        '''check URIs against the found_resources set; any URI not already
        in the set is recorded and logged as newly found. Returns True if
        at least one new resource was found (despite the name, nothing is
        pushed to a queue here).'''
        found_one = False
for uri in uris:
#if 'add' returns true, it's not in our set yet
if self.found_resources.add(uri):
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('New Resource Found! %s', uri)
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
found_one = True
return found_one

    def search(self, namespace="", resource_type=None,
               plural_resource_type=None, resource_title=None):
        '''
        Crawl through Chain, recording resources that match the passed
        criteria. If nothing is passed, record all resources.
        Can match on resource_type. If you want a resource list (plural,
        i.e. a list of organization resources, NOT a single organization
        resource), you can specify the plural as the resource_type.
        The code assumes the word can be pluralized by adding an 's' or
        'es' to the end. If this is not true (e.g. Person -> People),
        please give the plural so the code can recognize when it has found
        a list of the singular resource of interest.
        If looking for a specific resource, this will cross check against
        the title of the resource. Title selection is ANDed with the other
        query criteria.
        '''
#store search criteria in lowercase form, with namespace appended
#add plural forms +'s', +'es' to list of plural cases to look for
if resource_type is not None and resource_type != 'createForm':
#append namespace
self.qry_resource_type = namespace + resource_type
#make all lowercase
self.qry_resource_type = self.qry_resource_type.lower()
#'pluralize' resource after adding namespace
self.qry_resource_plural = self.pluralize_resource_name(self.qry_resource_type)
#add special pluralization if given by user
if plural_resource_type is not None:
self.qry_resource_plural.append(namespace + plural_resource_type)
#make all plural list items lowercase
self.qry_resource_plural = [x.lower() for x in self.qry_resource_plural]
#check if we're searching for a createForm
elif resource_type == 'createForm':
#use this search criteria
self.qry_resource_type = 'createform'
            self.qry_resource_plural = ['createform']
else:
#not searching on resource_type, just define qry_resource_type as None
self.qry_resource_type = None
if resource_title is not None:
#make all lowercase
self.qry_resource_title = resource_title.lower()
else:
#not searching on title, just define qry_resource_title as None
self.qry_resource_title = None
#end initializing query variables
#initialize crawl variables
self.current_uri = self.entry_point #keep track of current location
self.current_uri_type = 'entry_point'
self.bfs()
return self.found_resources
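
    # e.g. search(namespace='ns/', resource_type='person',
    #             plural_resource_type='people') matches links typed
    # 'ns/person', plus item-list links typed 'ns/persons', 'ns/persones',
    # or 'ns/people' (hypothetical names, for illustration only)
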
def bfs(self):
current_depth = 0
visited = set()
link_tree = [[] for k in range(self.degrees)]
while True:
time.sleep(self.crawl_delay/1000.0)
#download the current resource
try:
req = requests.get(self.current_uri)
log.info( '%s downloaded.', self.current_uri )
#put request in JSON form, apply CURIES, get links
resource_json = req.json()
log.debug('HAL/JSON RAW RESOURCE: %s', resource_json)
#downloading the current resource failed
except requests.exceptions.ConnectionError:
                log.warn( 'URI "%s" unresponsive, ignoring',
                          self.current_uri )
                resource_json = {'_links': {}}
#if we failed to download the entry point, give up
if self.current_uri == self.entry_point:
                log.error( 'URI is entry point, no previous link. Try again'
                           ' when the entry point URI is available.' )
return
#end downloading resource
#get links from this resource
req_links = self.apply_hal_curies(resource_json)['_links']
crawl_links = self.flatten_filter_link_array(req_links)
#crawl_links is a 'flat' list list[:][fields]
#fields are href, type, title, in_cache, from_item_list
            log.debug('HAL/JSON LINKS CURIES APPLIED, FILTERED (for history, '
                      'self, create/edit, ws, itemlist flattened): %s', crawl_links)
#find the uris/resources that match search criteria!
matching_uris = self.query_link_array(crawl_links)
#... and send them out!!
if (self.push_uris_to_queue(matching_uris) and self.return_if_found):
return #return if we are using find_first and we found one
#push all uris that don't match visited to proper depth list
visited.add(self.current_uri)
if current_depth < self.degrees:
                link_tree[current_depth].extend(
                    x for x in crawl_links if x['href'] not in visited)
log.debug('BFS Array: %s', link_tree)
log.debug('VISITED: %s', visited)
#select next current_uri and current_uri_type by looking through
#link_tree, if empty return
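            #link_tree[i] holds unvisited links discovered i hops from the
            #entry point; scanning the shallowest non-empty bucket first
            #keeps the search breadth-first, and visiting a link taken from
            #bucket i puts us at depth i + 1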
finished = True
for index in range(len(link_tree)):
if len(link_tree[index]):
self.current_uri = link_tree[index][0]['href']
self.current_uri_type = link_tree[index][0]['type']
del link_tree[index][0]
current_depth = index + 1
finished = False
break
if finished:
return
log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
log.info('CRAWL: moving to %s', self.current_uri)
log.info('CRAWL: type: %s', self.current_uri_type)
log.info('CRAWL: depth: %s', current_depth)
log.debug('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')

    def find_degrees_all(self, namespace="", resource_type=None,
                         plural_resource_type=None, resource_title=None,
                         degrees=1):
        '''exhaustively searches resources up to 'degrees' degrees away,
        and returns the list of matches after examining all of those links'''
        self.reinit()
        self.degrees = degrees
        return self.search(namespace=namespace, resource_type=resource_type,
                           plural_resource_type=plural_resource_type,
                           resource_title=resource_title).asList()

    def find_first(self, namespace="", resource_type=None,
                   plural_resource_type=None, resource_title=None,
                   max_degrees=3):
        '''breadth-first search, returning the first matching resource.
        max_degrees specifies the maximum degrees of separation it will
        exhaustively search before giving up and returning an empty list'''
        self.reinit()
        self.degrees = max_degrees
        self.return_if_found = True
        return self.search(namespace=namespace, resource_type=resource_type,
                           plural_resource_type=plural_resource_type,
                           resource_title=resource_title).asList()

    def find_create_link(self, namespace="", resource_type=None,
                         plural_resource_type=None, degrees=1):
        ''' look for a createForm link of type resource_type, at most
        'degrees' degrees away from the entry point, and return it after an
        exhaustive search'''
self.reinit()
self.filter_keywords = [x for x in self.filter_keywords if x != 'create']
self.degrees = degrees
if resource_type is not None:
#append namespace
self.createform_type = namespace + resource_type
#make all lowercase
self.createform_type = [self.createform_type.lower()]
#'pluralize' resource after adding namespace
            self.createform_type.extend(
                self.pluralize_resource_name(self.createform_type[0]))
        found_link = self.search(namespace=namespace, resource_type='createForm',
                                 plural_resource_type=plural_resource_type).asList()
self.filter_keywords.append('create')
return found_link

    def reset_entrypoint(self, new_entrypoint='http://learnair.media.mit.edu:8000/'):
self.entry_point = new_entrypoint #entry point URI
self.current_uri = new_entrypoint #keep track of current location
self.current_uri_type = 'entry_point'


if __name__ == "__main__":
    #######SEARCH EXAMPLES######
searcher = ChainSearch('http://learnair.media.mit.edu:8000/devices/10')
#x = searcher.find_degrees_all(namespace='http://learnair.media.mit.edu:8000/rels/', \
# resource_title='a') #resource_type='site')
x = searcher.find_create_link(namespace='http://learnair.media.mit.edu:8000/rels/')#, \
# resource_type='sensor') #resource_type='site')
print '---------------------'
print x
#searcher = ChainSearch('http://learnair.media.mit.edu:8000/devices/?site_id=1')
    #searcher.find_degrees_all(namespace='http://learnair.media.mit.edu:8000/rels/',
    #                          resource_title='a')
    #searcher.find_degrees_all(namespace='http://learnair.media.mit.edu:8000/rels/',
    #                          resource_type='Device',
    #                          resource_title='test004')
    #searcher.find_degrees_all()