From bd239525ccc8b3aa07d1aece6d8912987af3d61f Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 25 Nov 2025 02:27:08 +0000 Subject: [PATCH] add tests for playwright utils --- before_scroll.png | Bin 0 -> 4253 bytes tests/unit/crawlers/_playwright/test_utils.py | 157 ++++++++++++++++++ tests/unit/server.py | 20 +++ tests/unit/server_endpoints.py | 59 +++++++ tests/unit/server_static/test.js | 0 tests/unit/server_static/test.png | 0 6 files changed, 236 insertions(+) create mode 100644 before_scroll.png create mode 100644 tests/unit/crawlers/_playwright/test_utils.py create mode 100644 tests/unit/server_static/test.js create mode 100644 tests/unit/server_static/test.png diff --git a/before_scroll.png b/before_scroll.png new file mode 100644 index 0000000000000000000000000000000000000000..6d360f6bba60307ddce12a4bda5ae0e2ff9278b8 GIT binary patch literal 4253 zcmeAS@N?(olHy`uVBq!ia0y~yUeX7 q@D_FkhX4QX9*X@7G?5KtA~VB;)qHl1Z#nXSA`G6celF{r5}E*b2*WS{ literal 0 HcmV?d00001 diff --git a/tests/unit/crawlers/_playwright/test_utils.py b/tests/unit/crawlers/_playwright/test_utils.py new file mode 100644 index 0000000000..1b81108e9e --- /dev/null +++ b/tests/unit/crawlers/_playwright/test_utils.py @@ -0,0 +1,157 @@ +from playwright.async_api import async_playwright +from yarl import URL + +from crawlee.crawlers._playwright._utils import block_requests, infinite_scroll + + +async def test_infinite_scroll_on_dynamic_page(server_url: URL) -> None: + """Checks that infinite_scroll loads all items on a page with infinite scrolling.""" + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + target_url = str(server_url / 'infinite_scroll') + + # Get data with manual scrolling + await page.goto(target_url) + + manual_items = [] + for _ in range(4): + items = await page.query_selector_all('.item') + manual_items = items + await page.evaluate('window.scrollTo(0, document.body.scrollHeight)') + await 
page.wait_for_timeout(1000) + + # Reset page + await page.close() + page = await browser.new_page() + await page.goto(target_url) + + # Get data with infinite_scroll utility + before_scroll = await page.query_selector_all('.item') + assert len(before_scroll) != len(manual_items) + assert len(before_scroll) == 10 + + await infinite_scroll(page) + + after_scroll = await page.query_selector_all('.item') + + assert len(before_scroll) < len(after_scroll) + assert len(manual_items) == len(after_scroll) + + await browser.close() + + +async def test_infinite_scroll_no_page_without_scroll(server_url: URL) -> None: + """Checks that infinite_scroll does not raise an error on a page without infinite scrolling.""" + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + await page.goto(str(server_url)) + + await infinite_scroll(page) + + title = await page.title() + + assert title == 'Hello, world!' + + await browser.close() + + +async def test_double_call_infinite_scroll(server_url: URL) -> None: + """Checks that calling infinite_scroll twice does not load more items the second time.""" + async with async_playwright() as p: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + + await page.goto(str(server_url / 'infinite_scroll')) + + await infinite_scroll(page) + first_count = len(await page.query_selector_all('.item')) + + await infinite_scroll(page) + second_count = len(await page.query_selector_all('.item')) + + assert first_count == second_count + + await browser.close() + + +async def test_block_requests_default(server_url: URL) -> None: + """Checks that block_requests blocks the correct resources by default.""" + async with async_playwright() as p: + browser = await p.chromium.launch() + + target_url = str(server_url / 'resource_loading_page') + + # Default behavior, all resources load + page = await browser.new_page() + loaded_urls_no_block = [] + + 
page.on('requestfinished', lambda req: loaded_urls_no_block.append(req.url.rsplit('/', 1)[-1])) + await page.goto(target_url) + await page.wait_for_load_state('networkidle') + await page.close() + + # With blocking — collect loaded resources + page = await browser.new_page() + loaded_urls_blocked = [] + + page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1])) + await block_requests(page) + await page.goto(target_url) + await page.wait_for_load_state('networkidle') + await page.close() + + await browser.close() + + # Without blocking, both resources should load + assert set(loaded_urls_no_block) == {'resource_loading_page', 'test.js', 'test.png'} + + # With blocking, only JS should load + assert set(loaded_urls_blocked) == {'resource_loading_page', 'test.js'} + + +async def test_block_requests_with_extra_patterns(server_url: URL) -> None: + """Checks that block_requests blocks the correct resources with extra patterns.""" + async with async_playwright() as p: + browser = await p.chromium.launch() + + target_url = str(server_url / 'resource_loading_page') + + page = await browser.new_page() + loaded_urls_blocked = [] + + page.on('requestfinished', lambda req: loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1])) + await block_requests(page, extra_url_patterns=['*.js']) + await page.goto(target_url) + await page.wait_for_load_state('networkidle') + await page.close() + + await browser.close() + + # With blocking, only HTML should load + assert set(loaded_urls_blocked) == {'resource_loading_page'} + + +async def test_block_requests_with_custom_patterns(server_url: URL) -> None: + """Checks that block_requests blocks the correct resources with custom patterns.""" + async with async_playwright() as p: + browser = await p.chromium.launch() + + target_url = str(server_url / 'resource_loading_page') + + page = await browser.new_page() + loaded_urls_blocked = [] + + page.on('requestfinished', lambda req: 
loaded_urls_blocked.append(req.url.rsplit('/', 1)[-1])) + await block_requests(page, url_patterns=['*.js']) + await page.goto(target_url) + await page.wait_for_load_state('networkidle') + await page.close() + + await browser.close() + + # With blocking, only PNG should load + assert set(loaded_urls_blocked) == {'resource_loading_page', 'test.png'} diff --git a/tests/unit/server.py b/tests/unit/server.py index 451800875f..879c9407fd 100644 --- a/tests/unit/server.py +++ b/tests/unit/server.py @@ -18,7 +18,9 @@ GENERIC_RESPONSE, HELLO_WORLD, INCAPSULA, + INFINITE_SCROLL, PROBLEMATIC_LINKS, + RESOURCE_LOADING_PAGE, ROBOTS_TXT, SECONDARY_INDEX, START_ENQUEUE, @@ -121,6 +123,8 @@ async def app(scope: dict[str, Any], receive: Receive, send: Send) -> None: 'xml': hello_world_xml, 'robots.txt': robots_txt, 'get_compressed': get_compressed, + 'infinite_scroll': infinite_scroll_endpoint, + 'resource_loading_page': resource_loading_endpoint, } path = URL(scope['path']).parts[1] # Route requests to appropriate handlers @@ -411,6 +415,22 @@ async def get_compressed(_scope: dict[str, Any], _receive: Receive, send: Send) await send({'type': 'http.response.body', 'body': gzip.compress(HELLO_WORLD * 1000)}) +async def infinite_scroll_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests for the infinite scroll page.""" + await send_html_response( + send, + INFINITE_SCROLL, + ) + + +async def resource_loading_endpoint(_scope: dict[str, Any], _receive: Receive, send: Send) -> None: + """Handle requests for the resource loading page.""" + await send_html_response( + send, + RESOURCE_LOADING_PAGE, + ) + + class TestServer(Server): """A test HTTP server implementation based on Uvicorn Server.""" diff --git a/tests/unit/server_endpoints.py b/tests/unit/server_endpoints.py index a42062d114..ba45b476a6 100644 --- a/tests/unit/server_endpoints.py +++ b/tests/unit/server_endpoints.py @@ -69,3 +69,62 @@ sitemap: http://not-exists.com/sitemap_1.xml 
sitemap: http://not-exists.com/sitemap_2.xml""" + + +INFINITE_SCROLL = b"""\ + + + +
+ + + + +""" + +RESOURCE_LOADING_PAGE = b"""\ + + + + + + + + + +""" diff --git a/tests/unit/server_static/test.js b/tests/unit/server_static/test.js new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/server_static/test.png b/tests/unit/server_static/test.png new file mode 100644 index 0000000000..e69de29bb2