Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added before_scroll.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
157 changes: 157 additions & 0 deletions tests/unit/crawlers/_playwright/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
from playwright.async_api import async_playwright
from yarl import URL

from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll


async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None:
    """Verify that `infinite_scroll` reveals every item of a lazily loading page.

    The reference item list is first collected by scrolling by hand. A fresh page is then scrolled with
    the utility and its final item count is compared with the manual one.
    """
    scroll_page_url = str(server_url / 'infinite_scroll')
    scroll_to_bottom = 'window.scrollTo(0, document.body.scrollHeight)'

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)

        # Reference run: scroll to the bottom by hand, giving the page time to append items each round.
        tab = await chromium.new_page()
        await tab.goto(scroll_page_url)

        manual_items: list = []
        for _ in range(4):
            # Snapshot is taken before each scroll, so the last one reflects the three previous scrolls.
            manual_items = await tab.query_selector_all('.item')
            await tab.evaluate(scroll_to_bottom)
            await tab.wait_for_timeout(1000)

        # Start over on a brand-new page so the utility sees the initial, unscrolled state.
        await tab.close()
        tab = await chromium.new_page()
        await tab.goto(scroll_page_url)

        # Run under test: let the utility do the scrolling.
        initial_items = await tab.query_selector_all('.item')
        initial_count = len(initial_items)
        assert initial_count != len(manual_items)
        assert initial_count == 10

        await infinite_scroll(tab)

        final_count = len(await tab.query_selector_all('.item'))

        assert initial_count < final_count
        assert len(manual_items) == final_count

        await chromium.close()


async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None:
    """Verify that `infinite_scroll` does not raise on a page that has no infinite scrolling."""
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(str(server_url))

        await infinite_scroll(tab)

        # The page is still readable after the call and shows the expected title.
        page_title = await tab.title()
        assert page_title == 'Hello, world!'

        await chromium.close()


async def test_double_call_infinite_scroll(server_url: URL) -> None:
    """Verify that a second `infinite_scroll` call does not load any items beyond the first call."""
    async with async_playwright() as pw:
        chromium = await pw.chromium.launch(headless=True)
        tab = await chromium.new_page()
        await tab.goto(str(server_url / 'infinite_scroll'))

        # Scroll twice in a row, recording how many items are present after each pass.
        counts_per_pass = []
        for _ in range(2):
            await infinite_scroll(tab)
            counts_per_pass.append(len(await tab.query_selector_all('.item')))

        first_count, second_count = counts_per_pass
        assert first_count == second_count

        await chromium.close()


async def test_block_requests_default(server_url: URL) -> None:
    """Verify which resources `block_requests` lets through when called with its default patterns.

    The same page is loaded twice, once untouched and once after `block_requests`, and the last URL
    path segment of every finished request is recorded for comparison.
    """
    page_url = str(server_url / 'resource_loading_page')
    names_without_blocking: list[str] = []
    names_with_blocking: list[str] = []

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch()

        # Baseline load: nothing is blocked.
        tab = await chromium.new_page()
        tab.on('requestfinished', lambda request: names_without_blocking.append(request.url.rpartition('/')[2]))
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        # Second load on a fresh page with default blocking enabled before navigation.
        tab = await chromium.new_page()
        tab.on('requestfinished', lambda request: names_with_blocking.append(request.url.rpartition('/')[2]))
        await block_requests(tab)
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        await chromium.close()

    # Unblocked: the document, its script and its image all finish loading.
    assert set(names_without_blocking) == {'resource_loading_page', 'test.js', 'test.png'}

    # Default blocking: the image is gone, the document and the script remain.
    assert set(names_with_blocking) == {'resource_loading_page', 'test.js'}


async def test_block_requests_with_extra_patterns(server_url: URL) -> None:
    """Verify that `extra_url_patterns` are blocked in addition to the default patterns."""
    page_url = str(server_url / 'resource_loading_page')
    finished_names: list[str] = []

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch()
        tab = await chromium.new_page()

        # Record the last URL path segment of every request that finishes.
        tab.on('requestfinished', lambda request: finished_names.append(request.url.rpartition('/')[2]))
        await block_requests(tab, extra_url_patterns=['*.js'])
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        await chromium.close()

    # Both the image and the script are blocked, so only the HTML document finishes.
    assert set(finished_names) == {'resource_loading_page'}


async def test_block_requests_with_custom_patterns(server_url: URL) -> None:
    """Verify that passing `url_patterns` blocks only the resources matching those patterns."""
    page_url = str(server_url / 'resource_loading_page')
    finished_names: list[str] = []

    async with async_playwright() as pw:
        chromium = await pw.chromium.launch()
        tab = await chromium.new_page()

        # Record the last URL path segment of every request that finishes.
        tab.on('requestfinished', lambda request: finished_names.append(request.url.rpartition('/')[2]))
        await block_requests(tab, url_patterns=['*.js'])
        await tab.goto(page_url)
        await tab.wait_for_load_state('networkidle')
        await tab.close()

        await chromium.close()

    # Only the script is blocked, so the HTML document and the PNG finish.
    assert set(finished_names) == {'resource_loading_page', 'test.png'}
20 changes: 20 additions & 0 deletions tests/unit/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
GENERIC_RESPONSE,
HELLO_WORLD,
INCAPSULA,
INFINITE_SCROLL,
PROBLEMATIC_LINKS,
RESOURCE_LOADING_PAGE,
ROBOTS_TXT,
SECONDARY_INDEX,
START_ENQUEUE,
Expand Down Expand Up @@ -121,6 +123,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None:
'xml': hello_world_xml,
'robots.txt': robots_txt,
'get_compressed': get_compressed,
'infinite_scroll': infinite_scroll_endpoint,
'resource_loading_page': resource_loading_endpoint,
}
path = URL(scope['path']).parts[1]
# Route requests to appropriate handlers
Expand Down Expand Up @@ -411,6 +415,22 @@ async def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send)
await send({'type': 'http.response.body', 'body': gzip.compress(HELLO_WORLD * 1000)})


async def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the HTML of the infinite scroll test page.

    The ASGI scope and receive channel are accepted but not used.
    """
    page_html = INFINITE_SCROLL
    await send_html_response(send, page_html)


async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None:
    """Respond with the HTML of the resource loading test page.

    The ASGI scope and receive channel are accepted but not used.
    """
    page_html = RESOURCE_LOADING_PAGE
    await send_html_response(send, page_html)


class TestServer(Server):
"""A test HTTP server implementation based on Uvicorn Server."""

Expand Down
59 changes: 59 additions & 0 deletions tests/unit/server_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,62 @@

sitemap: http://not-exists.com/sitemap_1.xml
sitemap: http://not-exists.com/sitemap_2.xml"""


# HTML page used to exercise infinite scrolling.
# Its inline script renders 10 `.item` divs (200px tall each) into `#content` on load. A scroll listener
# calls `loadMore()` whenever the viewport is within 100px of the bottom of the body; each call waits
# 100 ms and then appends 10 more items. `loadMore()` is a no-op while a load is in progress or once
# 3 extra batches have been loaded, so the page tops out at 40 items.
INFINITE_SCROLL = b"""\
<!DOCTYPE html>
<html>
<body>
<div id="content"></div>

<script>
let page = 0;
let loading = false;

for (let i = 0; i < 10; i++) {
const div = document.createElement('div');
div.className = 'item';
div.style.height = '200px';
div.textContent = 'Item ' + (i + 1);
document.getElementById('content').appendChild(div);
}

async function loadMore() {
if (loading || page >= 3) return;
loading = true;
page++;

await new Promise(resolve => setTimeout(resolve, 100));

for (let i = 0; i < 10; i++) {
const div = document.createElement('div');
div.className = 'item';
div.style.height = '200px';
div.textContent = 'Item ' + (page * 10 + i + 1);
document.getElementById('content').appendChild(div);
}

loading = false;
}

window.addEventListener('scroll', () => {
if (window.innerHeight + window.scrollY >= document.body.offsetHeight - 100) {
loadMore();
}
});
</script>
</body>
</html>
"""

# HTML page used to exercise resource loading: it references exactly one script
# (`/server_static/test.js`) in the head and one image (`/server_static/test.png`) in the body.
RESOURCE_LOADING_PAGE = b"""\
<!DOCTYPE html>
<html>
<head>
<script src="/server_static/test.js"></script>
</head>
<body>
<img src="/server_static/test.png" />
</body>
</html>
"""
Empty file.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading