From 31ef29a1f9a056f0ead08388f50f2735f949dbe3 Mon Sep 17 00:00:00 2001
From: Josef Prochazka <josef.prochazka@apify.com>
Date: Tue, 29 Jul 2025 08:39:43 +0200
Subject: [PATCH] Fix retry count to not include the initial attempt

---
 src/crawlee/crawlers/_basic/_basic_crawler.py |  2 +-
 .../crawlers/_basic/test_basic_crawler.py     | 43 ++++++-------------
 .../unit/crawlers/_http/test_http_crawler.py  |  6 +--
 .../_playwright/test_playwright_crawler.py    |  4 +-
 4 files changed, 19 insertions(+), 36 deletions(-)

diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py
index 29f59086a4..440705c182 100644
--- a/src/crawlee/crawlers/_basic/_basic_crawler.py
+++ b/src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -855,7 +855,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
         if max_request_retries is None:
             max_request_retries = self._max_request_retries
 
-        return (context.request.retry_count + 1) < max_request_retries
+        return context.request.retry_count < max_request_retries
 
     async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
         """Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py
index 23297a30b5..aa7eabf29e 100644
--- a/tests/unit/crawlers/_basic/test_basic_crawler.py
+++ b/tests/unit/crawlers/_basic/test_basic_crawler.py
@@ -17,7 +17,7 @@
 
 from crawlee import ConcurrencySettings, Glob, service_locator
 from crawlee._request import Request
-from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod
+from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
 from crawlee._utils.robots import RobotsTxtFile
 from crawlee.configuration import Configuration
 from crawlee.crawlers import BasicCrawler
@@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://c.placeholder.com',
         'https://b.placeholder.com',
         'https://b.placeholder.com',
+        'https://b.placeholder.com',
     ]
 
 
 async def test_respects_no_retry() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_respects_request_specific_max_retries() -> None:
-    crawler = BasicCrawler(max_request_retries=1)
+    crawler = BasicCrawler(max_request_retries=0)
     calls = list[str]()
 
     @crawler.router.default_handler
@@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None:
         [
             'https://a.placeholder.com',
             'https://b.placeholder.com',
-            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}),
+            Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}),
         ]
     )
 
@@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None:
         'https://b.placeholder.com',
         'https://c.placeholder.com',
         'https://c.placeholder.com',
-        'https://c.placeholder.com',
-        'https://c.placeholder.com',
     ]
 
 
@@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None:
     class Call:
         url: str
         error: Exception
-        custom_retry_count: int
 
     # List to store the information of calls to the error handler.
     calls = list[Call]()
 
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
@@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None:
 
     @crawler.error_handler
     async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request:
-        # Retrieve or initialize the headers, and extract the current custom retry count.
-        headers = context.request.headers or HttpHeaders()
-        custom_retry_count = int(headers.get('custom_retry_count', '0'))
-
         # Append the current call information.
-        calls.append(Call(context.request.url, error, custom_retry_count))
-
-        # Update the request to include an incremented custom retry count in the headers and return it.
-        request = context.request.model_dump()
-        request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)})
-        return Request.model_validate(request)
+        calls.append(Call(context.request.url, error))
+        return context.request
 
     await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])
 
     # Verify that the error handler was called twice
     assert len(calls) == 2
 
-    # Check the first call...
-    first_call = calls[0]
-    assert first_call.url == 'https://b.placeholder.com'
-    assert isinstance(first_call.error, RuntimeError)
-    assert first_call.custom_retry_count == 0
-
-    # Check the second call...
-    second_call = calls[1]
-    assert second_call.url == 'https://b.placeholder.com'
-    assert isinstance(second_call.error, RuntimeError)
-    assert second_call.custom_retry_count == 1
+    # Check calls
+    for error_call in calls:
+        assert error_call.url == 'https://b.placeholder.com'
+        assert isinstance(error_call.error, RuntimeError)
 
 
 async def test_calls_error_handler_for_sesion_errors() -> None:
@@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:
 
 
 async def test_final_statistics() -> None:
-    crawler = BasicCrawler(max_request_retries=3)
+    crawler = BasicCrawler(max_request_retries=2)
 
     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:
diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py
index bed2b5dc99..15cc132b63 100644
--- a/tests/unit/crawlers/_http/test_http_crawler.py
+++ b/tests/unit/crawlers/_http/test_http_crawler.py
@@ -104,7 +104,7 @@ async def test_handles_client_errors(
         request_handler=mock_request_handler,
         additional_http_error_status_codes=additional_http_error_status_codes,
         ignore_http_error_status_codes=ignore_http_error_status_codes,
-        max_request_retries=3,
+        max_request_retries=2,
     )
 
     await crawler.add_requests([str(server_url / 'status/402')])
@@ -230,7 +230,7 @@ async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) ->
         '200': 10,
         '403': 100,  # block errors change session and retry
         '402': 10,  # client errors are not retried by default
-        '500': 30,  # server errors are retried by default
+        '500': 40,  # server errors are retried by default
     }
 
 
@@ -568,7 +568,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
         kvs_content[key_info.key] = await kvs.get_value(key_info.key)
 
     # One error, three time retried.
-    assert crawler.statistics.error_tracker.total == 3
+    assert crawler.statistics.error_tracker.total == 4
     assert crawler.statistics.error_tracker.unique_error_count == 1
     assert len(kvs_content) == 1
     assert key_info.key.endswith('.html')
diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
index fada38fe97..a170f968b0 100644
--- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py
+++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -150,7 +150,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
 
 
 async def test_nonexistent_url_invokes_error_handler() -> None:
-    crawler = PlaywrightCrawler(max_request_retries=4, request_handler=mock.AsyncMock())
+    crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock())
 
     error_handler = mock.AsyncMock(return_value=None)
     crawler.error_handler(error_handler)
@@ -617,7 +617,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
             assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')
 
     # Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected.
-    assert crawler.statistics.error_tracker.total == 3 * max_retries
+    assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1)
     assert crawler.statistics.error_tracker.unique_error_count == 2
     assert len(list(kvs_content.keys())) == 4