-
Notifications
You must be signed in to change notification settings - Fork 743
fix: Fix retry count to not count the original request #1328
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,7 +17,7 @@ | |
|
|
||
| from crawlee import ConcurrencySettings, Glob, service_locator | ||
| from crawlee._request import Request | ||
| from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpHeaders, HttpMethod | ||
| from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs, HttpMethod | ||
| from crawlee._utils.robots import RobotsTxtFile | ||
| from crawlee.configuration import Configuration | ||
| from crawlee.crawlers import BasicCrawler | ||
|
|
@@ -135,11 +135,12 @@ async def handler(context: BasicCrawlingContext) -> None: | |
| 'https://c.placeholder.com', | ||
| 'https://b.placeholder.com', | ||
| 'https://b.placeholder.com', | ||
| 'https://b.placeholder.com', | ||
| ] | ||
|
|
||
|
|
||
| async def test_respects_no_retry() -> None: | ||
| crawler = BasicCrawler(max_request_retries=3) | ||
| crawler = BasicCrawler(max_request_retries=2) | ||
| calls = list[str]() | ||
|
|
||
| @crawler.router.default_handler | ||
|
|
@@ -167,7 +168,7 @@ async def handler(context: BasicCrawlingContext) -> None: | |
|
|
||
|
|
||
| async def test_respects_request_specific_max_retries() -> None: | ||
| crawler = BasicCrawler(max_request_retries=1) | ||
| crawler = BasicCrawler(max_request_retries=0) | ||
| calls = list[str]() | ||
|
|
||
| @crawler.router.default_handler | ||
|
|
@@ -179,7 +180,7 @@ async def handler(context: BasicCrawlingContext) -> None: | |
| [ | ||
| 'https://a.placeholder.com', | ||
| 'https://b.placeholder.com', | ||
| Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 4}}), | ||
| Request.from_url(url='https://c.placeholder.com', user_data={'__crawlee': {'maxRetries': 1}}), | ||
| ] | ||
| ) | ||
|
|
||
|
|
@@ -188,8 +189,6 @@ async def handler(context: BasicCrawlingContext) -> None: | |
| 'https://b.placeholder.com', | ||
| 'https://c.placeholder.com', | ||
| 'https://c.placeholder.com', | ||
| 'https://c.placeholder.com', | ||
| 'https://c.placeholder.com', | ||
|
Comment on lines
-191
to
-192
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this intenational?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, 1 retry means 1 call + 1 retry. |
||
| ] | ||
|
|
||
|
|
||
|
|
@@ -199,12 +198,11 @@ async def test_calls_error_handler() -> None: | |
| class Call: | ||
| url: str | ||
| error: Exception | ||
| custom_retry_count: int | ||
|
|
||
| # List to store the information of calls to the error handler. | ||
| calls = list[Call]() | ||
|
|
||
| crawler = BasicCrawler(max_request_retries=3) | ||
| crawler = BasicCrawler(max_request_retries=2) | ||
|
|
||
| @crawler.router.default_handler | ||
| async def handler(context: BasicCrawlingContext) -> None: | ||
|
|
@@ -213,34 +211,19 @@ async def handler(context: BasicCrawlingContext) -> None: | |
|
|
||
| @crawler.error_handler | ||
| async def error_handler(context: BasicCrawlingContext, error: Exception) -> Request: | ||
| # Retrieve or initialize the headers, and extract the current custom retry count. | ||
| headers = context.request.headers or HttpHeaders() | ||
| custom_retry_count = int(headers.get('custom_retry_count', '0')) | ||
|
|
||
| # Append the current call information. | ||
| calls.append(Call(context.request.url, error, custom_retry_count)) | ||
|
|
||
| # Update the request to include an incremented custom retry count in the headers and return it. | ||
| request = context.request.model_dump() | ||
| request['headers'] = HttpHeaders({'custom_retry_count': str(custom_retry_count + 1)}) | ||
| return Request.model_validate(request) | ||
|
Comment on lines
-216
to
-226
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this just some optimization?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think this was somewhat complicated for the test's intention, and the same could be achieved with a simpler setup. |
||
| calls.append(Call(context.request.url, error)) | ||
| return context.request | ||
|
|
||
| await crawler.run(['https://a.placeholder.com', 'https://b.placeholder.com', 'https://c.placeholder.com']) | ||
|
|
||
| # Verify that the error handler was called twice | ||
| assert len(calls) == 2 | ||
|
|
||
| # Check the first call... | ||
| first_call = calls[0] | ||
| assert first_call.url == 'https://b.placeholder.com' | ||
| assert isinstance(first_call.error, RuntimeError) | ||
| assert first_call.custom_retry_count == 0 | ||
|
|
||
| # Check the second call... | ||
| second_call = calls[1] | ||
| assert second_call.url == 'https://b.placeholder.com' | ||
| assert isinstance(second_call.error, RuntimeError) | ||
| assert second_call.custom_retry_count == 1 | ||
| # Check calls | ||
| for error_call in calls: | ||
| assert error_call.url == 'https://b.placeholder.com' | ||
| assert isinstance(error_call.error, RuntimeError) | ||
|
|
||
|
|
||
| async def test_calls_error_handler_for_sesion_errors() -> None: | ||
|
|
@@ -578,7 +561,7 @@ async def handler(context: BasicCrawlingContext) -> None: | |
|
|
||
|
|
||
| async def test_final_statistics() -> None: | ||
| crawler = BasicCrawler(max_request_retries=3) | ||
| crawler = BasicCrawler(max_request_retries=2) | ||
|
|
||
| @crawler.router.default_handler | ||
| async def handler(context: BasicCrawlingContext) -> None: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this intenational?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, default retries are 3. So now with this PR it will make 4 calls (original request + 3 retries)