From 31ef29a1f9a056f0ead08388f50f2735f949dbe3 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 29 Jul 2025 08:39:43 +0200 Subject: [PATCH] Fix retry count to not include the initial attempt --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- .../crawlers/_basic/test_basic_crawler.py | 43 ++++++------------- .../unit/crawlers/_http/test_http_crawler.py | 6 +-- .../_playwright/test_playwright_crawler.py | 4 +- 4 files changed, 19 insertions(+), 36 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 29f59086a4..440705c182 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -855,7 +855,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception) if max_request_retries is None: max_request_retries = self._max_request_retries - return (context.request.retry_count + 1) < max_request_retries + return context.request.retry_count < max_request_retries async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]: """Ensure that the `loaded_url` still matches the enqueue strategy after redirects. diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 23297a30b5..aa7eabf29e 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -17,7 +17,7 @@ from crawlee import ConcurrencySettings, Glob, service_locator from crawlee._request import Request -from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod +from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod from crawlee._utils.robots import RobotsTxtFile from crawlee.configuration import Configuration from crawlee.crawlers import BasicCrawler @@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None: 'https://c.placeholder.com', 'https://b.placeholder.com', 'https://b.placeholder.com', + 'https://b.placeholder.com', ] async def test_respects_no_retry() -> None: - crawler = BasicCrawler(max_request_retries=3) + crawler = BasicCrawler(max_request_retries=2) calls = list[str]() @crawler.router.default_handler @@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_respects_request_specific_max_retries() -> None: - crawler = BasicCrawler(max_request_retries=1) + crawler = BasicCrawler(max_request_retries=0) calls = list[str]() @crawler.router.default_handler @@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None: [ 'https://a.placeholder.com', 'https://b.placeholder.com', - Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}), + Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}), ] ) @@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None: 'https://b.placeholder.com', 'https://c.placeholder.com', 'https://c.placeholder.com', - 'https://c.placeholder.com', - 'https://c.placeholder.com', ] @@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None: class Call: url: str error: Exception - custom_retry_count: int # List to store the information of calls to the error handler. calls = list[Call]() - crawler = BasicCrawler(max_request_retries=3) + crawler = BasicCrawler(max_request_retries=2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: @@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None: @crawler.error_handler async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request: - # Retrieve or initialize the headers, and extract the current custom retry count. - headers = context.request.headers or HttpHeaders() - custom_retry_count = int(headers.get('custom_retry_count', '0')) - # Append the current call information. - calls.append(Call(context.request.url, error, custom_retry_count)) - - # Update the request to include an incremented custom retry count in the headers and return it. - request = context.request.model_dump() - request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) - return Request.model_validate(request) + calls.append(Call(context.request.url, error)) + return context.request await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) # Verify that the error handler was called twice assert len(calls) == 2 - # Check the first call... - first_call = calls[0] - assert first_call.url == 'https://b.placeholder.com' - assert isinstance(first_call.error, RuntimeError) - assert first_call.custom_retry_count == 0 - - # Check the second call... - second_call = calls[1] - assert second_call.url == 'https://b.placeholder.com' - assert isinstance(second_call.error, RuntimeError) - assert second_call.custom_retry_count == 1 + # Check calls + for error_call in calls: + assert error_call.url == 'https://b.placeholder.com' + assert isinstance(error_call.error, RuntimeError) async def test_calls_error_handler_for_sesion_errors() -> None: @@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_final_statistics() -> None: - crawler = BasicCrawler(max_request_retries=3) + crawler = BasicCrawler(max_request_retries=2) @crawler.router.default_handler async def handler(context: BasicCrawlingContext) -> None: diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index bed2b5dc99..15cc132b63 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -104,7 +104,7 @@ async def test_handles_client_errors( request_handler=mock_request_handler, additional_http_error_status_codes=additional_http_error_status_codes, ignore_http_error_status_codes=ignore_http_error_status_codes, - max_request_retries=3, + max_request_retries=2, ) await crawler.add_requests([str(server_url / 'status/402')]) @@ -230,7 +230,7 @@ async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) -> '200': 10, '403': 100, # block errors change session and retry '402': 10, # client errors are not retried by default - '500': 30, # server errors are retried by default + '500': 40, # server errors are retried by default } @@ -568,7 +568,7 @@ async def request_handler(context: HttpCrawlingContext) -> None: kvs_content[key_info.key] = await kvs.get_value(key_info.key) # One error, three time retried. - assert crawler.statistics.error_tracker.total == 3 + assert crawler.statistics.error_tracker.total == 4 assert crawler.statistics.error_tracker.unique_error_count == 1 assert len(kvs_content) == 1 assert key_info.key.endswith('.html') diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index fada38fe97..a170f968b0 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -150,7 +150,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: async def test_nonexistent_url_invokes_error_handler() -> None: - crawler = PlaywrightCrawler(max_request_retries=4, request_handler=mock.AsyncMock()) + crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock()) error_handler = mock.AsyncMock(return_value=None) crawler.error_handler(error_handler) @@ -617,7 +617,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8') # Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected. - assert crawler.statistics.error_tracker.total == 3 * max_retries + assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1) assert crawler.statistics.error_tracker.unique_error_count == 2 assert len(list(kvs_content.keys())) == 4