From 8711e0cbe4aa23891b0bc081a2777f45c5f85c93 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 18 Nov 2025 11:46:29 +0100 Subject: [PATCH 1/8] Draft. TODO: Refactor add_requests --- src/crawlee/_types.py | 22 +++++++++++- src/crawlee/crawlers/_basic/_basic_crawler.py | 7 +++- .../crawlers/_basic/test_basic_crawler.py | 34 +++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 008a7fcf6a..247e13fdc2 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -15,7 +15,7 @@ import re from collections.abc import Callable, Coroutine, Sequence - from typing_extensions import NotRequired, Required, Unpack + from typing_extensions import NotRequired, Required, Self, Unpack from crawlee import Glob, Request from crawlee._request import RequestOptions @@ -33,6 +33,7 @@ from pydantic import JsonValue as JsonSerializable T = TypeVar('T') +TCrawlingContext = TypeVar('TCrawlingContext') HttpMethod = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH'] @@ -643,6 +644,25 @@ def __hash__(self) -> int: """Return hash of the context. 
Each context is considered unique.""" return id(self) + def create_modified_copy( + self, + push_data: PushDataFunction | None = None, + add_requests: AddRequestsFunction | None = None, + get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None, + ) -> Self: + """Create a modified copy of the crawling context with specified changes.""" + original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} + modified_fields = { + key: value + for key, value in { + 'push_data': push_data, + 'add_requests': add_requests, + 'get_key_value_store': get_key_value_store, + }.items() + if value + } + return self.__class__(**{**original_fields, **modified_fields}) + class GetDataKwargs(TypedDict): """Keyword arguments for dataset's `get_data` method.""" diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 74d2aaff13..0c86d6adda 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1166,7 +1166,12 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling if self._failed_request_handler: try: - await self._failed_request_handler(context, error) + error_context = context.create_modified_copy( + push_data=self._push_data, + add_requests=self.add_requests, + get_key_value_store=self.get_key_value_store, + ) + await self._failed_request_handler(error_context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index c7dad2725c..d3cd2efa90 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -284,6 +284,40 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception assert isinstance(calls[0][1], RuntimeError) 
+async def test_failed_request_handler_uses_context_helpers(tmp_path: Path) -> None: + """Test that context helpers used in `failed_request_handler` have effect.""" + storage_client = FileSystemStorageClient() + crawler = BasicCrawler( + max_request_retries=1, storage_client=storage_client, configuration=Configuration(storage_dir=str(tmp_path)) + ) + test_data = {'some': 'data'} + test_key = 'key' + test_value = 'value' + test_request = Request.from_url('https://d.placeholder.com') + + @crawler.router.default_handler + async def handler(context: BasicCrawlingContext) -> None: + if context.request.url == 'https://b.placeholder.com': + raise RuntimeError('Arbitrary crash for testing purposes') + + @crawler.failed_request_handler + async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: + await context.push_data(test_data) + await context.add_requests([test_request]) + kvs = await context.get_key_value_store() + await kvs.set_value(test_key, test_value) + + await crawler.run(['https://b.placeholder.com']) + + dataset = await Dataset.open(storage_client=storage_client) + kvs = await KeyValueStore.open(storage_client=storage_client) + rq = await RequestQueue.open(storage_client=storage_client) + + assert test_value == await kvs.get_value(test_key) + assert test_request == await rq.fetch_next_request() + assert [test_data] == (await dataset.get_data()).items + + async def test_handles_error_in_failed_request_handler() -> None: crawler = BasicCrawler(max_request_retries=3) From df84ac1b061bd9ffb2169e805d8cb7da0a6cffa7 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 18 Nov 2025 15:58:52 +0100 Subject: [PATCH 2/8] Update add_requests handling --- src/crawlee/crawlers/_basic/_basic_crawler.py | 94 ++++++++++--------- .../crawlers/_basic/test_basic_crawler.py | 2 +- 2 files changed, 53 insertions(+), 43 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 
0c86d6adda..ee8d0ac62d 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -29,6 +29,7 @@ from crawlee._request import Request, RequestOptions, RequestState from crawlee._service_locator import ServiceLocator from crawlee._types import ( + AddRequestsFunction, BasicCrawlingContext, EnqueueLinksKwargs, GetKeyValueStoreFromRequestHandlerFunction, @@ -1168,8 +1169,8 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling try: error_context = context.create_modified_copy( push_data=self._push_data, - add_requests=self.add_requests, get_key_value_store=self.get_key_value_store, + add_requests=await self._create_context_aware_add_requests(context), ) await self._failed_request_handler(error_context, error) except Exception as e: @@ -1261,52 +1262,61 @@ def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_u else: yield Request.from_url(url) + async def _open_suitable_request_manager( + self, id: str | None, name: str | None, alias: str | None + ) -> RequestManager | RequestQueue: + if id or name or alias: + return await RequestQueue.open( + id=id, + name=name, + alias=alias, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + return await self.get_request_manager() + + def _get_context_aware_requests( + self, requests: Sequence[str | Request], context: BasicCrawlingContext, kwargs: EnqueueLinksKwargs + ) -> list[Request]: + context_aware_requests = list[Request]() + base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url + requests_iterator = self._convert_url_to_request_iterator(requests, base_url) + filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs) + for dst_request in filter_requests_iterator: + # Update the crawl depth of the request. 
+ dst_request.crawl_depth = context.request.crawl_depth + 1 + + if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: + context_aware_requests.append(dst_request) + return context_aware_requests + + async def _create_context_aware_add_requests(self, context: BasicCrawlingContext) -> AddRequestsFunction: + """Create add_requests function that adds requests aware of the crawling context.""" + + async def context_aware_add_requests( + requests: Sequence[str | Request], + rq_id: str | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, + **kwargs: Unpack[EnqueueLinksKwargs], + ) -> None: + request_manager = await self._open_suitable_request_manager( + id=rq_id, + name=rq_name, + alias=rq_alias, + ) + context_aware_requests = self._get_context_aware_requests(requests=requests, context=context, kwargs=kwargs) + return await request_manager.add_requests(context_aware_requests) + + return context_aware_add_requests + async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None: """Commit request handler result for the input `context`. 
Result is taken from `_context_result_map`.""" result = self._context_result_map[context] - base_request_manager = await self.get_request_manager() - - origin = context.request.loaded_url or context.request.url - for add_requests_call in result.add_requests_calls: - rq_id = add_requests_call.get('rq_id') - rq_name = add_requests_call.get('rq_name') - rq_alias = add_requests_call.get('rq_alias') - specified_params = sum(1 for param in [rq_id, rq_name, rq_alias] if param is not None) - if specified_params > 1: - raise ValueError('You can only provide one of `rq_id`, `rq_name` or `rq_alias` arguments.') - if rq_id or rq_name or rq_alias: - request_manager: RequestManager | RequestQueue = await RequestQueue.open( - id=rq_id, - name=rq_name, - alias=rq_alias, - storage_client=self._service_locator.get_storage_client(), - configuration=self._service_locator.get_configuration(), - ) - else: - request_manager = base_request_manager - - requests = list[Request]() - - base_url = url if (url := add_requests_call.get('base_url')) else origin - - requests_iterator = self._convert_url_to_request_iterator(add_requests_call['requests'], base_url) - - enqueue_links_kwargs: EnqueueLinksKwargs = {k: v for k, v in add_requests_call.items() if k != 'requests'} # type: ignore[assignment] - - filter_requests_iterator = self._enqueue_links_filter_iterator( - requests_iterator, context.request.url, **enqueue_links_kwargs - ) - - for dst_request in filter_requests_iterator: - # Update the crawl depth of the request. 
- dst_request.crawl_depth = context.request.crawl_depth + 1 - - if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: - requests.append(dst_request) - - await request_manager.add_requests(requests) + context_aware_add_requests = await self._create_context_aware_add_requests(context) + await context_aware_add_requests(**add_requests_call) for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index d3cd2efa90..02d6bb818a 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -314,8 +314,8 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception rq = await RequestQueue.open(storage_client=storage_client) assert test_value == await kvs.get_value(test_key) - assert test_request == await rq.fetch_next_request() assert [test_data] == (await dataset.get_data()).items + assert test_request == await rq.fetch_next_request() async def test_handles_error_in_failed_request_handler() -> None: From a92d346193956ffe86195366e400d8ddf26e8d1c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 19 Nov 2025 11:05:48 +0100 Subject: [PATCH 3/8] Use also for `error_handler` and review comments --- src/crawlee/crawlers/_basic/_basic_crawler.py | 64 ++++++++++--------- .../crawlers/_basic/test_basic_crawler.py | 24 ++++--- 2 files changed, 50 insertions(+), 38 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index ee8d0ac62d..31d9433eca 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -2,6 +2,7 @@ from __future__ import annotations import asyncio +import functools import logging import signal import sys @@ -14,7 +15,7 @@ from datetime import timedelta from functools import partial from 
pathlib import Path -from typing import TYPE_CHECKING, Any, Generic, Literal, cast +from typing import TYPE_CHECKING, Any, Generic, Literal, ParamSpec, cast from urllib.parse import ParseResult, urlparse from weakref import WeakKeyDictionary @@ -97,6 +98,9 @@ TCrawlingContext = TypeVar('TCrawlingContext', bound=BasicCrawlingContext, default=BasicCrawlingContext) TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) TRequestIterator = TypeVar('TRequestIterator', str, Request) +TParams = ParamSpec('TParams') +T = TypeVar('T') + ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Request | None]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]] @@ -521,6 +525,20 @@ def stop(self, reason: str = 'Stop was called externally.') -> None: self._logger.info(f'Crawler.stop() was called with following reason: {reason}.') self._unexpected_stop = True + def _wrap_handler_with_error_context( + self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]] + ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]: + @functools.wraps(handler) + async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T: + error_context = context.create_modified_copy( + push_data=self._push_data, + get_key_value_store=self.get_key_value_store, + add_requests=self._create_context_aware_add_requests(context), + ) + return await handler(error_context, exception) + + return wrapped_handler + def _stop_if_max_requests_count_exceeded(self) -> None: """Call `stop` when the maximum number of requests to crawl has been reached.""" if self._max_requests_per_crawl is None: @@ -619,7 +637,7 @@ def error_handler( The error handler is invoked after a request handler error occurs and before a retry attempt. 
""" - self._error_handler = handler + self._error_handler = self._wrap_handler_with_error_context(handler) return handler def failed_request_handler( @@ -629,7 +647,7 @@ def failed_request_handler( The failed request handler is invoked when a request has failed all retry attempts. """ - self._failed_request_handler = handler + self._failed_request_handler = self._wrap_handler_with_error_context(handler) return handler def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback: @@ -1167,12 +1185,7 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling if self._failed_request_handler: try: - error_context = context.create_modified_copy( - push_data=self._push_data, - get_key_value_store=self.get_key_value_store, - add_requests=await self._create_context_aware_add_requests(context), - ) - await self._failed_request_handler(error_context, error) + await self._failed_request_handler(context, error) except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e @@ -1262,19 +1275,6 @@ def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_u else: yield Request.from_url(url) - async def _open_suitable_request_manager( - self, id: str | None, name: str | None, alias: str | None - ) -> RequestManager | RequestQueue: - if id or name or alias: - return await RequestQueue.open( - id=id, - name=name, - alias=alias, - storage_client=self._service_locator.get_storage_client(), - configuration=self._service_locator.get_configuration(), - ) - return await self.get_request_manager() - def _get_context_aware_requests( self, requests: Sequence[str | Request], context: BasicCrawlingContext, kwargs: EnqueueLinksKwargs ) -> list[Request]: @@ -1290,7 +1290,7 @@ def _get_context_aware_requests( context_aware_requests.append(dst_request) return context_aware_requests - async def _create_context_aware_add_requests(self, context: BasicCrawlingContext) 
-> AddRequestsFunction: + def _create_context_aware_add_requests(self, context: BasicCrawlingContext) -> AddRequestsFunction: """Create add_requests function that adds requests aware of the crawling context.""" async def context_aware_add_requests( @@ -1300,11 +1300,17 @@ async def context_aware_add_requests( rq_alias: str | None = None, **kwargs: Unpack[EnqueueLinksKwargs], ) -> None: - request_manager = await self._open_suitable_request_manager( - id=rq_id, - name=rq_name, - alias=rq_alias, - ) + if rq_id or rq_name or rq_alias: + request_manager: RequestManager = await RequestQueue.open( + id=rq_id, + name=rq_name, + alias=rq_alias, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + else: + request_manager = await self.get_request_manager() + context_aware_requests = self._get_context_aware_requests(requests=requests, context=context, kwargs=kwargs) return await request_manager.add_requests(context_aware_requests) @@ -1315,7 +1321,7 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> result = self._context_result_map[context] for add_requests_call in result.add_requests_calls: - context_aware_add_requests = await self._create_context_aware_add_requests(context) + context_aware_add_requests = self._create_context_aware_add_requests(context) await context_aware_add_requests(**add_requests_call) for push_data_call in result.push_data_calls: diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 02d6bb818a..cdefba504b 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -284,34 +284,40 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception assert isinstance(calls[0][1], RuntimeError) -async def test_failed_request_handler_uses_context_helpers(tmp_path: Path) -> None: - """Test that context helpers used 
in `failed_request_handler` have effect.""" +@pytest.mark.parametrize('handler', ['failed_request_handler', 'error_handler']) +async def test_handlers_uses_context_helpers(tmp_path: Path, handler: str) -> None: + """Test that context helpers used in `failed_request_handler` and in `error_handler` have effect.""" + # Prepare crawler storage_client = FileSystemStorageClient() crawler = BasicCrawler( max_request_retries=1, storage_client=storage_client, configuration=Configuration(storage_dir=str(tmp_path)) ) + # Test data + rq_alias = 'other' test_data = {'some': 'data'} test_key = 'key' test_value = 'value' test_request = Request.from_url('https://d.placeholder.com') + # Request handler with injected error @crawler.router.default_handler - async def handler(context: BasicCrawlingContext) -> None: - if context.request.url == 'https://b.placeholder.com': - raise RuntimeError('Arbitrary crash for testing purposes') + async def request_handler(context: BasicCrawlingContext) -> None: + raise RuntimeError('Arbitrary crash for testing purposes') - @crawler.failed_request_handler - async def failed_request_handler(context: BasicCrawlingContext, error: Exception) -> None: + # Apply one of the handlers + @getattr(crawler, handler) # type:ignore[misc] # Untyped decorator is ok to make the test concise + async def handler_implementation(context: BasicCrawlingContext, error: Exception) -> None: await context.push_data(test_data) - await context.add_requests([test_request]) + await context.add_requests(requests=[test_request], rq_alias=rq_alias) kvs = await context.get_key_value_store() await kvs.set_value(test_key, test_value) await crawler.run(['https://b.placeholder.com']) + # Verify that the context helpers used in handlers had effect on used storages dataset = await Dataset.open(storage_client=storage_client) kvs = await KeyValueStore.open(storage_client=storage_client) - rq = await RequestQueue.open(storage_client=storage_client) + rq = await 
RequestQueue.open(alias=rq_alias, storage_client=storage_client) assert test_value == await kvs.get_value(test_key) assert [test_data] == (await dataset.get_data()).items From 00b2e9664bd20070c6dc8a30d59016c309c24320 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 19 Nov 2025 11:09:53 +0100 Subject: [PATCH 4/8] Fix typo --- tests/unit/crawlers/_basic/test_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index cdefba504b..be22d1f951 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -285,7 +285,7 @@ async def failed_request_handler(context: BasicCrawlingContext, error: Exception @pytest.mark.parametrize('handler', ['failed_request_handler', 'error_handler']) -async def test_handlers_uses_context_helpers(tmp_path: Path, handler: str) -> None: +async def test_handlers_use_context_helpers(tmp_path: Path, handler: str) -> None: """Test that context helpers used in `failed_request_handler` and in `error_handler` have effect.""" # Prepare crawler storage_client = FileSystemStorageClient() From eeb3a4486a77065db2c464c21e016b73f48189a8 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 19 Nov 2025 11:30:26 +0100 Subject: [PATCH 5/8] Polish and add comments --- src/crawlee/crawlers/_basic/_basic_crawler.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 31d9433eca..85032f62cf 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -30,7 +30,6 @@ from crawlee._request import Request, RequestOptions, RequestState from crawlee._service_locator import ServiceLocator from crawlee._types import ( - AddRequestsFunction, BasicCrawlingContext, EnqueueLinksKwargs, 
GetKeyValueStoreFromRequestHandlerFunction, @@ -528,12 +527,16 @@ def stop(self, reason: str = 'Stop was called externally.') -> None: def _wrap_handler_with_error_context( self, handler: Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]] ) -> Callable[[TCrawlingContext | BasicCrawlingContext, Exception], Awaitable[T]]: + """Decorate error handlers to make their context helpers usable.""" + @functools.wraps(handler) async def wrapped_handler(context: TCrawlingContext | BasicCrawlingContext, exception: Exception) -> T: + # Original context helpers that are from `RequestHandlerRunResult` will not be committed as the request + # failed. Modified context provides context helpers with direct access to the storages. error_context = context.create_modified_copy( push_data=self._push_data, get_key_value_store=self.get_key_value_store, - add_requests=self._create_context_aware_add_requests(context), + add_requests=functools.partial(self._add_requests, context), ) return await handler(error_context, exception) @@ -1290,39 +1293,36 @@ def _get_context_aware_requests( context_aware_requests.append(dst_request) return context_aware_requests - def _create_context_aware_add_requests(self, context: BasicCrawlingContext) -> AddRequestsFunction: - """Create add_requests function that adds requests aware of the crawling context.""" - - async def context_aware_add_requests( - requests: Sequence[str | Request], - rq_id: str | None = None, - rq_name: str | None = None, - rq_alias: str | None = None, - **kwargs: Unpack[EnqueueLinksKwargs], - ) -> None: - if rq_id or rq_name or rq_alias: - request_manager: RequestManager = await RequestQueue.open( - id=rq_id, - name=rq_name, - alias=rq_alias, - storage_client=self._service_locator.get_storage_client(), - configuration=self._service_locator.get_configuration(), - ) - else: - request_manager = await self.get_request_manager() - - context_aware_requests = self._get_context_aware_requests(requests=requests, 
context=context, kwargs=kwargs) - return await request_manager.add_requests(context_aware_requests) + async def _add_requests( + self, + context: BasicCrawlingContext, + requests: Sequence[str | Request], + rq_id: str | None = None, + rq_name: str | None = None, + rq_alias: str | None = None, + **kwargs: Unpack[EnqueueLinksKwargs], + ) -> None: + """Add requests method aware of the crawling context.""" + if rq_id or rq_name or rq_alias: + request_manager: RequestManager = await RequestQueue.open( + id=rq_id, + name=rq_name, + alias=rq_alias, + storage_client=self._service_locator.get_storage_client(), + configuration=self._service_locator.get_configuration(), + ) + else: + request_manager = await self.get_request_manager() - return context_aware_add_requests + context_aware_requests = self._get_context_aware_requests(requests=requests, context=context, kwargs=kwargs) + return await request_manager.add_requests(context_aware_requests) async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None: """Commit request handler result for the input `context`. 
Result is taken from `_context_result_map`.""" result = self._context_result_map[context] for add_requests_call in result.add_requests_calls: - context_aware_add_requests = self._create_context_aware_add_requests(context) - await context_aware_add_requests(**add_requests_call) + await functools.partial(self._add_requests, context)(**add_requests_call) for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) From b29f34d2a5c8c8060a8a77049f4c45ec3a86e2d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Wed, 19 Nov 2025 12:58:02 +0100 Subject: [PATCH 6/8] Update src/crawlee/crawlers/_basic/_basic_crawler.py Co-authored-by: Jan Buchar --- src/crawlee/crawlers/_basic/_basic_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 85032f62cf..8877dd520f 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -1322,7 +1322,7 @@ async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> result = self._context_result_map[context] for add_requests_call in result.add_requests_calls: - await functools.partial(self._add_requests, context)(**add_requests_call) + await self._add_requests(context, **add_requests_call) for push_data_call in result.push_data_calls: await self._push_data(**push_data_call) From 7dc6c7e2ad28e69538e0b0840dd865332b6fa80c Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 19 Nov 2025 13:00:53 +0100 Subject: [PATCH 7/8] Review comments --- src/crawlee/crawlers/_basic/_basic_crawler.py | 27 ++++++++----------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 8877dd520f..67363b6aa8 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py 
@@ -1278,21 +1278,6 @@ def _convert_url_to_request_iterator(self, urls: Sequence[str | Request], base_u else: yield Request.from_url(url) - def _get_context_aware_requests( - self, requests: Sequence[str | Request], context: BasicCrawlingContext, kwargs: EnqueueLinksKwargs - ) -> list[Request]: - context_aware_requests = list[Request]() - base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url - requests_iterator = self._convert_url_to_request_iterator(requests, base_url) - filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs) - for dst_request in filter_requests_iterator: - # Update the crawl depth of the request. - dst_request.crawl_depth = context.request.crawl_depth + 1 - - if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: - context_aware_requests.append(dst_request) - return context_aware_requests - async def _add_requests( self, context: BasicCrawlingContext, @@ -1314,7 +1299,17 @@ async def _add_requests( else: request_manager = await self.get_request_manager() - context_aware_requests = self._get_context_aware_requests(requests=requests, context=context, kwargs=kwargs) + context_aware_requests = list[Request]() + base_url = kwargs.get('base_url') or context.request.loaded_url or context.request.url + requests_iterator = self._convert_url_to_request_iterator(requests, base_url) + filter_requests_iterator = self._enqueue_links_filter_iterator(requests_iterator, context.request.url, **kwargs) + for dst_request in filter_requests_iterator: + # Update the crawl depth of the request. 
+ dst_request.crawl_depth = context.request.crawl_depth + 1 + + if self._max_crawl_depth is None or dst_request.crawl_depth <= self._max_crawl_depth: + context_aware_requests.append(dst_request) + return await request_manager.add_requests(context_aware_requests) async def _commit_request_handler_result(self, context: BasicCrawlingContext) -> None: From 58371c711a0f5d0d1df1b280b56b2587012767fe Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 19 Nov 2025 17:04:00 +0100 Subject: [PATCH 8/8] Remove leftover TypeVar --- src/crawlee/_types.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index 247e13fdc2..da11adae5d 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -33,7 +33,6 @@ from pydantic import JsonValue as JsonSerializable T = TypeVar('T') -TCrawlingContext = TypeVar('TCrawlingContext') HttpMethod = Literal['GET', 'HEAD', 'POST', 'PUT', 'DELETE', 'CONNECT', 'OPTIONS', 'TRACE', 'PATCH']