Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/crawlee/crawlers/_basic/_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,7 @@ def _should_retry_request(self, context: BasicCrawlingContext, error: Exception)
if max_request_retries is None:
max_request_retries = self._max_request_retries

return (context.request.retry_count + 1) < max_request_retries
return context.request.retry_count < max_request_retries

async def _check_url_after_redirects(self, context: TCrawlingContext) -> AsyncGenerator[TCrawlingContext, None]:
"""Ensure that the `loaded_url` still matches the enqueue strategy after redirects.
Expand Down
43 changes: 13 additions & 30 deletions tests/unit/crawlers/_basic/test_basic_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

from crawlee import ConcurrencySettings, Glob, service_locator
from crawlee._request import Request
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod
from crawlee._utils.robots import RobotsTxtFile
from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
Expand Down Expand Up @@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None:
'https://c.placeholder.com',
'https://b.placeholder.com',
'https://b.placeholder.com',
'https://b.placeholder.com',
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intenational?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, default retries are 3. So now with this PR it will make 4 calls (original request + 3 retries)

]


async def test_respects_no_retry() -> None:
crawler = BasicCrawler(max_request_retries=3)
crawler = BasicCrawler(max_request_retries=2)
calls = list[str]()

@crawler.router.default_handler
Expand Down Expand Up @@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None:


async def test_respects_request_specific_max_retries() -> None:
crawler = BasicCrawler(max_request_retries=1)
crawler = BasicCrawler(max_request_retries=0)
calls = list[str]()

@crawler.router.default_handler
Expand All @@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None:
[
'https://a.placeholder.com',
'https://b.placeholder.com',
Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}),
Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}),
]
)

Expand All @@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None:
'https://b.placeholder.com',
'https://c.placeholder.com',
'https://c.placeholder.com',
'https://c.placeholder.com',
'https://c.placeholder.com',
Comment on lines -191 to -192
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this intenational?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, 1 retry means 1 call + 1 retry.

]


Expand All @@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None:
class Call:
url: str
error: Exception
custom_retry_count: int

# List to store the information of calls to the error handler.
calls = list[Call]()

crawler = BasicCrawler(max_request_retries=3)
crawler = BasicCrawler(max_request_retries=2)

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
Expand All @@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None:

@crawler.error_handler
async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request:
# Retrieve or initialize the headers, and extract the current custom retry count.
headers = context.request.headers or HttpHeaders()
custom_retry_count = int(headers.get('custom_retry_count', '0'))

# Append the current call information.
calls.append(Call(context.request.url, error, custom_retry_count))

# Update the request to include an incremented custom retry count in the headers and return it.
request = context.request.model_dump()
request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)})
return Request.model_validate(request)
Comment on lines -216 to -226
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just some optimization?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I think this was somewhat complicated for the test's intention, and the same could be achieved with a simpler setup.

calls.append(Call(context.request.url, error))
return context.request

await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com'])

# Verify that the error handler was called twice
assert len(calls) == 2

# Check the first call...
first_call = calls[0]
assert first_call.url == 'https://b.placeholder.com'
assert isinstance(first_call.error, RuntimeError)
assert first_call.custom_retry_count == 0

# Check the second call...
second_call = calls[1]
assert second_call.url == 'https://b.placeholder.com'
assert isinstance(second_call.error, RuntimeError)
assert second_call.custom_retry_count == 1
# Check calls
for error_call in calls:
assert error_call.url == 'https://b.placeholder.com'
assert isinstance(error_call.error, RuntimeError)


async def test_calls_error_handler_for_sesion_errors() -> None:
Expand Down Expand Up @@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None:


async def test_final_statistics() -> None:
crawler = BasicCrawler(max_request_retries=3)
crawler = BasicCrawler(max_request_retries=2)

@crawler.router.default_handler
async def handler(context: BasicCrawlingContext) -> None:
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/crawlers/_http/test_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ async def test_handles_client_errors(
request_handler=mock_request_handler,
additional_http_error_status_codes=additional_http_error_status_codes,
ignore_http_error_status_codes=ignore_http_error_status_codes,
max_request_retries=3,
max_request_retries=2,
)

await crawler.add_requests([str(server_url / 'status/402')])
Expand Down Expand Up @@ -230,7 +230,7 @@ async def test_http_status_statistics(crawler: HttpCrawler, server_url: URL) ->
'200': 10,
'403': 100, # block errors change session and retry
'402': 10, # client errors are not retried by default
'500': 30, # server errors are retried by default
'500': 40, # server errors are retried by default
}


Expand Down Expand Up @@ -568,7 +568,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
kvs_content[key_info.key] = await kvs.get_value(key_info.key)

# One error, three time retried.
assert crawler.statistics.error_tracker.total == 3
assert crawler.statistics.error_tracker.total == 4
assert crawler.statistics.error_tracker.unique_error_count == 1
assert len(kvs_content) == 1
assert key_info.key.endswith('.html')
Expand Down
4 changes: 2 additions & 2 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:


async def test_nonexistent_url_invokes_error_handler() -> None:
crawler = PlaywrightCrawler(max_request_retries=4, request_handler=mock.AsyncMock())
crawler = PlaywrightCrawler(max_request_retries=3, request_handler=mock.AsyncMock())

error_handler = mock.AsyncMock(return_value=None)
crawler.error_handler(error_handler)
Expand Down Expand Up @@ -617,7 +617,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf-8')

# Three errors twice retried errors, but only 2 unique -> 4 (2 x (html and jpg)) artifacts expected.
assert crawler.statistics.error_tracker.total == 3 * max_retries
assert crawler.statistics.error_tracker.total == 3 * (max_retries + 1)
assert crawler.statistics.error_tracker.unique_error_count == 2
assert len(list(kvs_content.keys())) == 4

Expand Down
Loading