-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpipelines.py
More file actions
77 lines (64 loc) · 3.11 KB
/
pipelines.py
File metadata and controls
77 lines (64 loc) · 3.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import sys
import tldextract
sys.path.insert(0, "../")
from CMS import Format
from traversal_rule_identifier import TraversalRule
import json
import logging
from traversal_rule_identifier import TraversalRule
logging.basicConfig(level=logging.ERROR)
AUTHOR_CANDIDATES_FILE_PREFIX = "author-candidates-"
class ScraperPipeline(object):
    """Scrapy item pipeline that accumulates CMS-ADF parse results, author
    candidates, and per-domain traversal rules, and persists them as JSON
    files under ``../resources/``.
    """

    def open_spider(self, spider):
        """Open every output file and load the previously saved URL references.

        NOTE(review): raises if ``../resources/url_references.json`` is
        missing — preserved, since downstream logic relies on the references.
        """
        logging.debug("Opening Spider")
        self.adf_file = open("../resources/adf_results.json", "w")
        self.items_file = open("../resources/results.json", "w")
        self.author_candidates_file = open("../resources/url_author_candidates.json", "w")
        self.url_references = dict()
        self.results = dict()
        self.items = []
        self.author_candidates = dict()
        self.traversal = dict()
        self.traversal_rule_file = open("../resources/domain_traversal_rules-500.json", "w")
        with open("../resources/url_references.json", "r") as fp:
            self.url_references = json.load(fp)

    def close_spider(self, spider):
        """Dump all accumulated state and close every output file."""
        # logging.debug("Closing Spider and dumping {}".format(self.results))
        json.dump(self.results, self.adf_file, cls=Format.MyEncoder)
        # json.dump(self.items, self.items_file, cls=Format.MyEncoder)
        logging.info("author candidates {}".format(self.author_candidates))
        json.dump(self.author_candidates, self.author_candidates_file)
        self.items_file.close()
        self.adf_file.close()
        self.author_candidates_file.close()
        # BUG FIX: dumping into the possibly-checkpointed open handle appended
        # a second JSON document (invalid JSON), and the handle was never
        # closed so the final dump could be lost in OS buffers.  Close the
        # handle and rewrite the file from scratch instead.
        self.traversal_rule_file.close()
        with open("../resources/domain_traversal_rules-500.json", "w") as fp:
            json.dump(self.traversal, fp)

    def checkpoint_domain_traversal(self):
        """Persist the current traversal rules, truncating the rules file.

        BUG FIX: the previous handle was re-opened without being closed,
        leaking one file descriptor per processed item.
        """
        self.traversal_rule_file.close()
        self.traversal_rule_file = open("../resources/domain_traversal_rules-500.json", "w")
        json.dump(self.traversal, self.traversal_rule_file)

    def process_item(self, item, spider):
        """Record the item's CMS-ADF result and, on first sight of its
        domain, try to derive a traversal rule from the reference author
        name and the item's candidate authors.  Returns the item unchanged
        (Scrapy pipeline contract).
        """
        if item["CMS-ADF"]:
            adf = item["CMS-ADF"]
            self.results[adf.url] = adf
            if "CANDIDATE_AUTHORS" in item:
                logging.debug("Dumping Candidate Authors for {}".format(adf.url))
                self.author_candidates[adf.url] = item["CANDIDATE_AUTHORS"]
            # Hoisted: get_domain(adf.url) was previously computed twice.
            domain = get_domain(adf.url)
            if (domain not in self.traversal
                    and self.author_candidates
                    and self.url_references
                    and adf.url in self.url_references
                    and adf.url in self.author_candidates):
                reference = self.url_references[adf.url]
                if reference['author_name']:
                    tr = TraversalRule(None, reference['author_name'], None)
                    tr.candidates = self.author_candidates[adf.url]
                    tr.pick_traversal_from_author()
                    if tr.traversal_rule:
                        self.traversal[domain] = tr.traversal_rule
                        logging.debug("Traversal rule {}".format(self.traversal))
            self.items.append(item)
        self.checkpoint_domain_traversal()
        return item
def get_domain(url):
    """Return the registered domain (e.g. ``example.com``) of ``url``."""
    return tldextract.extract(url).registered_domain