-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_web_crawler_main_class.py
More file actions
102 lines (73 loc) · 2.61 KB
/
test_web_crawler_main_class.py
File metadata and controls
102 lines (73 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#
# Testing asynchronous web crawler
# Author: Emily Quinn Finney
#
from bs4 import BeautifulSoup
import os
import pytest
import requests
import web_crawler_main_class as websclass
@pytest.fixture
def link():
    """Canonical Numi product URL shared by the crawler tests."""
    product_url = ('http://shop.numitea.com/Mate-Lemon/p/'
                   'NUMIS-10250&c=NumiTeaStore@Teabag@Green')
    return product_url
@pytest.fixture
def structured_page(link):
    """Download the fixture URL and return it parsed as a BeautifulSoup tree."""
    raw_html = requests.get(link).text
    soup = BeautifulSoup(raw_html, 'lxml')
    return soup
@pytest.fixture
def crawler(link):
    """PageScraper aimed at the Numi store with the product-id regex."""
    store_sequence = 'NumiTeaStore'
    id_pattern = 'NUMIS-[0-9]*'
    return websclass.PageScraper(link, store_sequence, id_pattern)
@pytest.fixture
# NOTE(review): pytest ignores marks applied to fixtures, so this
# @pytest.mark.asyncio has no effect — confirm whether an async fixture
# (pytest-asyncio's async def fixture support) was intended instead.
@pytest.mark.asyncio
def loader(event_loop, link):
    """Yield a URLoader wired to an aiohttp session on the test event loop."""
    # Imported locally so only the async tests require aiohttp.
    import aiohttp
    # NOTE(review): ClientSession is entered with a synchronous `with` and
    # given a `loop=` kwarg; current aiohttp expects `async with` and has
    # removed `loop=` — verify against the project's pinned aiohttp version.
    with aiohttp.ClientSession(loop=event_loop) as client_session:
        # initialize the objects
        yield websclass.URLoader(link, client_session)
@pytest.fixture
def filename():
    """Provide a scratch file name; remove the file after the test if created."""
    scratch_name = 'test_file.txt'
    yield scratch_name
    # teardown: only delete when the test actually wrote the file
    print("commence filename teardown")
    if os.path.isfile(scratch_name):
        os.remove(scratch_name)
@pytest.fixture
def scraper(loader, crawler, filename):
    """Assemble the MainScraper under test from its collaborator fixtures."""
    main_scraper = websclass.MainScraper(loader, crawler, filename)
    return main_scraper
def test_locate_linked_pages(crawler, structured_page):
    """Every link discovered on the page must contain the crawler's sequence."""
    discovered = crawler.locate_linked_pages(structured_page)
    assert all(crawler.sequence in url for url in discovered)
def test_add_link_to_master(crawler, link):
    """Adding an unseen link succeeds and grows the master set by one."""
    size_before = len(crawler.master_set)
    added = crawler.add_link_to_master(link)
    assert added
    size_after = len(crawler.master_set)
    assert size_after - size_before == 1
def test_find_id(link, id_sequence='NUMIS-[0-9]*'):
    """find_id should extract the product id from the fixture URL."""
    extracted = websclass.find_id(link, id_sequence)
    assert extracted == 'NUMIS-10250'
def test_identify_duplicates(link, master_list=None, id_sequence='NUMIS-[0-9]*'):
    """identify_duplicates is falsy for an unseen id and truthy once recorded.

    ``master_list`` defaults to ``None`` rather than a mutable ``set()``
    literal: a mutable default is evaluated once and shared across calls,
    so any re-invocation of this test (rerun plugins, parametrization)
    would start with 'NUMIS-10250' already present and fail the first
    assertion.
    """
    if master_list is None:
        master_list = set()
    # unseen id: not a duplicate yet
    result = websclass.identify_duplicates(link, master_list, id_sequence)
    assert not result
    # once the id is recorded, the same link must be flagged as a duplicate
    master_list.add('NUMIS-10250')
    next_result = websclass.identify_duplicates(link, master_list, id_sequence)
    assert next_result
@pytest.mark.asyncio
async def test_fetch(loader):
    """Smoke test: loader.fetch() completes without raising.

    NOTE(review): there is no assertion, so this passes whenever fetch()
    does not raise — consider asserting on the response contents.
    """
    response = await loader.fetch()
    print(response)
@pytest.mark.asyncio
async def test_open_page(loader):
    """Smoke test: loader.open_page() completes without raising.

    NOTE(review): no assertion is made on the parsed page — this passes
    whenever open_page() does not raise; consider asserting on the result.
    """
    structured_page = await loader.open_page()
    print(structured_page)
def test_write_page_to_file(filename, structured_page):
    """write_page_to_file should materialize the page at the given path."""
    websclass.write_page_to_file(structured_page, filename)
    file_created = os.path.isfile(filename)
    assert file_created
@pytest.mark.asyncio
async def test_update_queue(scraper, link):
    """update_queue should re-point the url loader at the processed link."""
    # snapshot kept for the pending queue-removal check below
    occurrences_before = scraper.page_scraper.queue.count(link)
    await scraper.update_queue(link)
    # the url loader must now target the link that was taken off the queue
    assert scraper.url_loader.url == link
    # TODO: test that it removed link from queue