From 8cad430f57ad5699d03ef038ba8418ea2bb7474f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 16:12:59 +0200 Subject: [PATCH 01/63] feat: implement apifyclient wrapper --- langchain_apify/_client.py | 205 +++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 langchain_apify/_client.py diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py new file mode 100644 index 0000000..6ab49be --- /dev/null +++ b/langchain_apify/_client.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import os + +from apify_client import ApifyClient + +from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify.utils import create_apify_client + +_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' + + +class ApifyToolsClient: + """Internal helper that wraps ``ApifyClient`` for the tools layer. + + One convenience method per tool operation. All methods are synchronous and + block until the Actor run finishes. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Raises: + ValueError: If no token is provided and the env var is not set. + """ + + def __init__(self, apify_api_token: str | None = None) -> None: + token = apify_api_token or os.getenv('APIFY_API_TOKEN') + if not token: + msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = create_apify_client(ApifyClient, token) + + def run_actor( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + ) -> dict: + """Start an Actor and block until it finishes. + + Args: + actor_id: Actor ID or name (e.g. ``"apify/python-example"``). + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. 
+ + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + run = self._client.actor(actor_id).call(**call_kwargs) + self._check_run_status(run) + return run + + def get_dataset_items(self, dataset_id: str, limit: int = 100, offset: int = 0) -> list[dict]: + """Fetch items from an existing dataset. + + Args: + dataset_id: Apify dataset ID. + limit: Maximum number of items to return. + offset: Number of items to skip from the start. + + Returns: + List of dataset item dicts (may be empty). + """ + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + + def run_actor_and_get_items( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + ) -> tuple[dict, list[dict]]: + """Run an Actor, then fetch items from its default dataset. + + Args: + actor_id: Actor ID or name. + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. 
+ """ + run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId', '') + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + return run, items + + def run_task( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + ) -> dict: + """Start a saved Actor task and block until it finishes. + + Args: + task_id: Task ID or name (e.g. ``"user/my-task"``). + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + run = self._client.task(task_id).call(**call_kwargs) + self._check_run_status(run) + return run + + def run_task_and_get_items( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + ) -> tuple[dict, list[dict]]: + """Run a saved Actor task, then fetch items from its default dataset. + + Args: + task_id: Task ID or name. + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. 
+ """ + run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId', '') + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + return run, items + + def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + """Scrape a single URL and return its content as markdown. + + Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. + + Args: + url: The URL to scrape. + timeout_secs: Maximum time to wait for the crawl to finish. + + Returns: + Markdown (or plain-text fallback) content of the page. + + Raises: + RuntimeError: If the Actor run fails or no content is extracted. + """ + run_input = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': 1, + } + _, items = self.run_actor_and_get_items( + _SCRAPE_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + if not items: + msg = f'No content extracted from {url}.' + raise RuntimeError(msg) + + content = items[0].get('markdown') or items[0].get('text') or '' + if not content: + msg = f'No content extracted from {url}.' + raise RuntimeError(msg) + return content + + @staticmethod + def _check_run_status(run: dict) -> None: + """Raise if the run did not succeed.""" + status = run.get('status') + if status != 'SUCCEEDED': + run_id = run.get('id', 'unknown') + msg = f'Actor run {run_id} ended with status {status}.' 
+ raise RuntimeError(msg) From 2404b9cd73faaea8c3c904b4a34e58ee4cf96a17 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 16:45:59 +0200 Subject: [PATCH 02/63] feat: removed redundant const file --- langchain_apify/_client.py | 21 +++++++++++++-------- langchain_apify/const.py | 2 -- langchain_apify/tools.py | 7 +++---- langchain_apify/utils.py | 12 ++++++------ 4 files changed, 22 insertions(+), 20 deletions(-) delete mode 100644 langchain_apify/const.py diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 6ab49be..068835b 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -8,6 +8,11 @@ from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_DEFAULT_RUN_TIMEOUT_SECS = 300 +_DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_TASK_TIMEOUT_SECS = 300 +_DEFAULT_DATASET_ITEMS_LIMIT = 100 +_RUN_STATUS_SUCCEEDED = 'SUCCEEDED' class ApifyToolsClient: @@ -35,7 +40,7 @@ def run_actor( self, actor_id: str, run_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, ) -> dict: """Start an Actor and block until it finishes. @@ -77,9 +82,9 @@ def run_actor_and_get_items( self, actor_id: str, run_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: """Run an Actor, then fetch items from its default dataset. @@ -105,7 +110,7 @@ def run_task( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, ) -> dict: """Start a saved Actor task and block until it finishes. 
@@ -135,9 +140,9 @@ def run_task_and_get_items( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_TASK_TIMEOUT_SECS, memory_mbytes: int | None = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: """Run a saved Actor task, then fetch items from its default dataset. @@ -160,7 +165,7 @@ def run_task_and_get_items( items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items return run, items - def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: """Scrape a single URL and return its content as markdown. Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. @@ -199,7 +204,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: def _check_run_status(run: dict) -> None: """Raise if the run did not succeed.""" status = run.get('status') - if status != 'SUCCEEDED': + if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') msg = f'Actor run {run_id} ended with status {status}.' 
raise RuntimeError(msg) diff --git a/langchain_apify/const.py b/langchain_apify/const.py deleted file mode 100644 index 87e0d0e..0000000 --- a/langchain_apify/const.py +++ /dev/null @@ -1,2 +0,0 @@ -REQUESTS_TIMEOUT_SECS: float = 10.0 -MAX_DESCRIPTION_LEN: int = 350 diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 135314a..2afa413 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -10,14 +10,13 @@ from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import ( + _MAX_DESCRIPTION_LEN, actor_id_to_tool_name, create_apify_client, get_actor_latest_build, prune_actor_input_schema, ) -from .const import MAX_DESCRIPTION_LEN - if TYPE_CHECKING: from langchain_core.callbacks import ( CallbackManagerForToolRun, @@ -128,8 +127,8 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: """ build = get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') - if len(actor_description) > MAX_DESCRIPTION_LEN: - actor_description = actor_description[:MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' + if len(actor_description) > _MAX_DESCRIPTION_LEN: + actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' return actor_description @staticmethod diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 8cdc835..4f2e74f 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -7,14 +7,14 @@ from apify_client import ApifyClientAsync from apify_client.client import ApifyClient -from langchain_apify.const import MAX_DESCRIPTION_LEN, REQUESTS_TIMEOUT_SECS - -APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' +_MAX_DESCRIPTION_LEN: int = 350 +_REQUESTS_TIMEOUT_SECS: float = 10.0 +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' def prune_actor_input_schema( 
input_schema: dict, - max_description_len: int = MAX_DESCRIPTION_LEN, + max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. @@ -117,8 +117,8 @@ def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: msg = f'Failed to get the Actor object ID for {actor_id}.' raise ValueError(msg) - url = APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) - response = requests.request('GET', url, timeout=REQUESTS_TIMEOUT_SECS) + url = _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) + response = requests.request('GET', url, timeout=_REQUESTS_TIMEOUT_SECS) build = response.json() if not isinstance(build, dict): From b1a89a455602d3cd5941c5f5e1f05695899f5cd0 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 17:59:49 +0200 Subject: [PATCH 03/63] feat: add few more input schemas, helpers and tool classes --- langchain_apify/_client.py | 10 +- .../{error_messages.py => _error_messages.py} | 4 + langchain_apify/tools.py | 261 +++++++++++++++++- langchain_apify/utils.py | 3 +- tests/integration_tests/test_utils.py | 2 +- 5 files changed, 270 insertions(+), 10 deletions(-) rename langchain_apify/{error_messages.py => _error_messages.py} (75%) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 068835b..181c6ec 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -4,7 +4,7 @@ from apify_client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' @@ -65,7 +65,7 @@ def run_actor( self._check_run_status(run) return run - def get_dataset_items(self, dataset_id: str, limit: int = 100, offset: int = 0) -> list[dict]: + def 
get_dataset_items(self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0) -> list[dict]: """Fetch items from an existing dataset. Args: @@ -191,12 +191,12 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) dataset_items_limit=1, ) if not items: - msg = f'No content extracted from {url}.' + msg = ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) content = items[0].get('markdown') or items[0].get('text') or '' if not content: - msg = f'No content extracted from {url}.' + msg = ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) return content @@ -206,5 +206,5 @@ def _check_run_status(run: dict) -> None: status = run.get('status') if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') - msg = f'Actor run {run_id} ended with status {status}.' + msg = ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) raise RuntimeError(msg) diff --git a/langchain_apify/error_messages.py b/langchain_apify/_error_messages.py similarity index 75% rename from langchain_apify/error_messages.py rename to langchain_apify/_error_messages.py index 87462b8..a87c9cb 100644 --- a/langchain_apify/error_messages.py +++ b/langchain_apify/_error_messages.py @@ -5,3 +5,7 @@ ' To pass it as environment variable, you can use the following command:' ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) + +ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' + +ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' 
diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 2afa413..40aeeee 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -5,10 +5,11 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient -from langchain_core.tools import BaseTool +from langchain_core.tools import BaseTool, ToolException from pydantic import BaseModel, Field, create_model -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import ( _MAX_DESCRIPTION_LEN, actor_id_to_tool_name, @@ -191,3 +192,259 @@ def _run_actor(self, run_input: dict) -> list[dict]: run = self._apify_client.run(run_id=run_id) return run.dataset().list_items(clean=True).items + + +# --------------------------------------------------------------------------- +# Input schemas for the generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorInput(BaseModel): + """Input schema for :class:`ApifyRunActorTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. 
"apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + + +class ApifyGetDatasetItemsInput(BaseModel): + """Input schema for :class:`ApifyGetDatasetItemsTool`.""" + + dataset_id: str = Field(description='Apify dataset ID.') + limit: int = Field(default=100, description='Maximum number of items to return.') + offset: int = Field(default=0, description='Number of items to skip from the start.') + + +class ApifyRunActorAndGetItemsInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetItemsTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +class ApifyScrapeUrlInput(BaseModel): + """Input schema for :class:`ApifyScrapeUrlTool`.""" + + url: str = Field(description='The URL to scrape.') + timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _iso(value: str | None) -> str | None: + """Pass through an ISO timestamp or *None*.""" + return value + + +def _run_meta(run: dict) -> dict: + """Extract a compact metadata dict from an Apify run-details 
dict.""" + return { + 'run_id': run.get('id'), + 'status': run.get('status'), + 'dataset_id': run.get('defaultDatasetId'), + 'started_at': _iso(run.get('startedAt')), + 'finished_at': _iso(run.get('finishedAt')), + } + + +# --------------------------------------------------------------------------- +# Generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorTool(BaseTool): + """Run any Apify Actor by ID with an arbitrary JSON input. + + Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON + string. Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve the + results from the dataset. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor' + description: str = ( + 'Run an Apify Actor synchronously and return run metadata' + ' (run_id, status, dataset_id, timestamps) as a JSON string.' + ) + args_schema: type[BaseModel] = ApifyRunActorInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyGetDatasetItemsTool(BaseTool): + """Fetch items from an existing Apify dataset by ID. 
+ + Returns items as a JSON string. When the dataset is empty the tool returns + an informative JSON message instead of raising an error. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyGetDatasetItemsTool + + tool = ApifyGetDatasetItemsTool() + result = tool.invoke({"dataset_id": "abc123", "limit": 10}) + """ + + name: str = 'apify_get_dataset_items' + description: str = 'Fetch items from an Apify dataset by ID. Returns a JSON array of items.' + args_schema: type[BaseModel] = ApifyGetDatasetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + dataset_id: str, + limit: int = 100, + offset: int = 0, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + items = self._client.get_dataset_items(dataset_id, limit, offset) + if not items: + return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) + return json.dumps(items) + + +class ApifyRunActorAndGetItemsTool(BaseTool): + """Run any Apify Actor and return both run metadata and dataset items. + + Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorAndGetItemsTool + + tool = ApifyRunActorAndGetItemsTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor_and_get_items' + description: str = ( + 'Run an Apify Actor synchronously and return both run metadata and' + ' dataset items as a JSON string with "run" and "items" keys.' + ) + args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_actor_and_get_items( + actor_id, run_input, timeout_secs, memory_mbytes, dataset_items_limit + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyScrapeUrlTool(BaseTool): + """Scrape a single URL and return its content as markdown. + + Uses the ``apify/website-content-crawler`` Actor under the hood with + ``maxCrawlPages=1``. Returns the page content as a plain markdown string + (not JSON). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyScrapeUrlTool + + tool = ApifyScrapeUrlTool() + markdown = tool.invoke({"url": "https://apify.com"}) + """ + + name: str = 'apify_scrape_url' + description: str = ( + 'Scrape a single URL using Apify and return its content as markdown text.' + ) + args_schema: type[BaseModel] = ApifyScrapeUrlInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + url: str, + timeout_secs: int = 120, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + return self._client.scrape_url(url, timeout_secs) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 4f2e74f..6b9f9fd 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -9,8 +9,7 @@ _MAX_DESCRIPTION_LEN: int = 350 _REQUESTS_TIMEOUT_SECS: float = 10.0 -_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' - +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' def prune_actor_input_schema( input_schema: dict, diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 1107c7a..554cc2d 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,7 +2,7 @@ from apify_client.client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import create_apify_client, get_actor_latest_build From 0aa917582970bba0d0e50fd88cc17b8606397cfc Mon Sep 
17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 18:28:13 +0200 Subject: [PATCH 04/63] feat: export new tools from __init__ --- langchain_apify/__init__.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 66142be..cfedc69 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,19 +1,50 @@ from importlib import metadata from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.tools import ApifyActorsTool +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) from langchain_apify.wrappers import ApifyWrapper try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: - # Case where package metadata is not available. __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) +# --------------------------------------------------------------------------- +# Convenience tool-class lists for selective agent binding. 
+# +# Binding all tools at once overwhelms the LLM context window; pick the +# group(s) relevant to your use case: +# +# from langchain_apify import CORE_TOOLS +# agent = create_react_agent(model, [t() for t in CORE_TOOLS]) +# --------------------------------------------------------------------------- + +CORE_TOOLS: list[type] = [ + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, +] + __all__ = [ + # Existing components (backward-compatible) 'ApifyActorsTool', 'ApifyDatasetLoader', 'ApifyWrapper', + # Core generic tools + 'ApifyGetDatasetItemsTool', + 'ApifyRunActorAndGetItemsTool', + 'ApifyRunActorTool', + 'ApifyScrapeUrlTool', + # Tool group lists + 'CORE_TOOLS', + # Meta '__version__', ] From 4e46d3684048e15a4867a87c457c58d9f5e4ad2a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 18:35:25 +0200 Subject: [PATCH 05/63] feat: add unit tests --- tests/unit_tests/test_client.py | 232 ++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 tests/unit_tests/test_client.py diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py new file mode 100644 index 0000000..f30ed52 --- /dev/null +++ b/tests/unit_tests/test_client.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY + +_SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +_FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +_SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 
'https://example.com/2'}, +] + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client) as mock_create: + c = ApifyToolsClient(apify_api_token='my-token') + mock_create.assert_called_once() + assert c._client is mock_apify_client + + +def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyToolsClient() + + +# --------------------------------------------------------------------------- +# run_actor +# --------------------------------------------------------------------------- + + +def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + + result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) + + mock_apify_client.actor.assert_called_once_with('apify/test-actor') + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300 + ) + assert result == 
_SUCCEEDED_RUN + + +def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + + client.run_actor('apify/test-actor', memory_mbytes=512) + + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input=None, timeout_secs=300, memory_mbytes=512 + ) + + +def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_actor('apify/test-actor') + + +# --------------------------------------------------------------------------- +# get_dataset_items +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) + + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) + assert items == _SAMPLE_ITEMS + + +def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.get_dataset_items('dataset-empty') + assert items == [] + + +# --------------------------------------------------------------------------- +# run_actor_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + 
mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) + + assert run == _SUCCEEDED_RUN + assert items == _SAMPLE_ITEMS + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + + +# --------------------------------------------------------------------------- +# run_task +# --------------------------------------------------------------------------- + + +def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + + result = client.run_task('user/my-task', task_input={'key': 'val'}) + + mock_apify_client.task.assert_called_once_with('user/my-task') + mock_apify_client.task.return_value.call.assert_called_once_with( + task_input={'key': 'val'}, timeout_secs=300 + ) + assert result == _SUCCEEDED_RUN + + +def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_task('user/my-task') + + +# --------------------------------------------------------------------------- +# run_task_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + run, items = client.run_task_and_get_items('user/my-task') + + assert run == _SUCCEEDED_RUN + assert items == _SAMPLE_ITEMS + + +# --------------------------------------------------------------------------- +# scrape_url +# --------------------------------------------------------------------------- + + +def 
test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == '# Hello' + + +def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'text': 'Plain text content', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == 'Plain text content' + + +def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '', 'text': '', 'url': 'https://example.com'}, + ] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +# --------------------------------------------------------------------------- +# _check_run_status +# --------------------------------------------------------------------------- + + +def test_check_run_status_succeeded() -> None: + ApifyToolsClient._check_run_status({'id': 'run-ok', 'status': 'SUCCEEDED'}) + + +def test_check_run_status_failed() -> 
None: + expected_msg = ERROR_ACTOR_RUN_FAILED.format(run_id='run-bad', status='FAILED') + with pytest.raises(RuntimeError, match='run-bad'): + ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) From fc6ef1286297c1f8581b15fe475b150ee1fa6b58 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 09:45:06 +0200 Subject: [PATCH 06/63] feat: implement tests and introduce tools list --- langchain_apify/__init__.py | 8 +- tests/unit_tests/test_tools.py | 186 ++++++++++++++++++++++++++++++++- 2 files changed, 188 insertions(+), 6 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index cfedc69..21e5776 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -22,11 +22,11 @@ # Binding all tools at once overwhelms the LLM context window; pick the # group(s) relevant to your use case: # -# from langchain_apify import CORE_TOOLS -# agent = create_react_agent(model, [t() for t in CORE_TOOLS]) +# from langchain_apify import APIFY_CORE_TOOLS +# agent = create_react_agent(model, [t() for t in APIFY_CORE_TOOLS]) # --------------------------------------------------------------------------- -CORE_TOOLS: list[type] = [ +APIFY_CORE_TOOLS: list[type] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, @@ -44,7 +44,7 @@ 'ApifyRunActorTool', 'ApifyScrapeUrlTool', # Tool group lists - 'CORE_TOOLS', + 'APIFY_CORE_TOOLS', # Meta '__version__', ] diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index b10df2f..f17572f 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,12 +1,21 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from langchain_core.tools import ToolException from pydantic import BaseModel -from langchain_apify.tools import ApifyActorsTool +from langchain_apify._client import 
ApifyToolsClient +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) from langchain_apify.utils import actor_id_to_tool_name if TYPE_CHECKING: @@ -85,3 +94,176 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id='apify/python-example', apify_api_token='dummy-token') yield tool + + +# --------------------------------------------------------------------------- +# Shared test data for generic tools +# --------------------------------------------------------------------------- + +_SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +_SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +def _make_tool(tool_cls: type, mock_client: MagicMock) -> ApifyRunActorTool | ApifyGetDatasetItemsTool | ApifyRunActorAndGetItemsTool | ApifyScrapeUrlTool: + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token') + tool._client = mock_client + return tool + + +# --------------------------------------------------------------------------- +# ApifyRunActorTool +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = _SUCCEEDED_RUN + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 
'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_actor.assert_called_once_with('apify/test', {'key': 'val'}, 300, None) + + +def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(actor_id='apify/test') + + +def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorTool() + + +# --------------------------------------------------------------------------- +# ApifyGetDatasetItemsTool +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_tool_returns_json_array(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0]['text'] == 'item-1' + mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) + + +def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = [] + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-empty') + + parsed = json.loads(result) + assert parsed['items'] == [] + assert 'empty' in parsed['message'].lower() + + +# --------------------------------------------------------------------------- +# 
ApifyRunActorAndGetItemsTool +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) + tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_actor_and_get_items.assert_called_once_with( + 'apify/test', {'q': '1'}, 300, None, 50 + ) + + +def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(actor_id='apify/test') + + +# --------------------------------------------------------------------------- +# ApifyScrapeUrlTool +# --------------------------------------------------------------------------- + + +def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# Hello World' + tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert result == '# Hello World' + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 120) + + +def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') + tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + + with pytest.raises(ToolException, match='No 
content extracted'): + tool._run(url='https://example.com') + + +# --------------------------------------------------------------------------- +# Tool metadata assertions +# --------------------------------------------------------------------------- + + +def test_generic_tools_have_correct_metadata() -> None: + """Verify name, description, and args_schema are set on all 4 tools.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyRunActorTool(apify_api_token='dummy'), + ApifyGetDatasetItemsTool(apify_api_token='dummy'), + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), + ApifyScrapeUrlTool(apify_api_token='dummy'), + ] + + expected_names = [ + 'apify_run_actor', + 'apify_get_dataset_items', + 'apify_run_actor_and_get_items', + 'apify_scrape_url', + ] + + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True From cc5be9e887edc95719742b682e463e98b3d0ca36 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:01:27 +0200 Subject: [PATCH 07/63] fix: lint fix --- langchain_apify/__init__.py | 12 ++++-------- langchain_apify/_client.py | 16 ++++++++++++++-- langchain_apify/tools.py | 20 +++++++++----------- langchain_apify/utils.py | 1 + tests/unit_tests/test_client.py | 10 ++-------- tests/unit_tests/test_tools.py | 12 ++++++------ 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 21e5776..1b65eef 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from importlib import metadata from langchain_apify.document_loaders import ApifyDatasetLoader @@ -16,15 +18,9 @@ __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) -# 
--------------------------------------------------------------------------- # Convenience tool-class lists for selective agent binding. -# -# Binding all tools at once overwhelms the LLM context window; pick the -# group(s) relevant to your use case: -# -# from langchain_apify import APIFY_CORE_TOOLS -# agent = create_react_agent(model, [t() for t in APIFY_CORE_TOOLS]) -# --------------------------------------------------------------------------- +# Binding all tools at once overwhelms the LLM context window; +# pick the group(s) relevant to your use case. APIFY_CORE_TOOLS: list[type] = [ ApifyRunActorTool, diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 181c6ec..bf3e01a 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -4,7 +4,11 @@ from apify_client import ApifyClient -from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY +from langchain_apify._error_messages import ( + ERROR_ACTOR_RUN_FAILED, + ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + ERROR_SCRAPE_EMPTY, +) from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' @@ -62,10 +66,15 @@ def run_actor( call_kwargs['memory_mbytes'] = memory_mbytes run = self._client.actor(actor_id).call(**call_kwargs) + if run is None: + msg = f'Actor {actor_id} call returned no run details.' + raise RuntimeError(msg) self._check_run_status(run) return run - def get_dataset_items(self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0) -> list[dict]: + def get_dataset_items( + self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0 + ) -> list[dict]: """Fetch items from an existing dataset. Args: @@ -133,6 +142,9 @@ def run_task( call_kwargs['memory_mbytes'] = memory_mbytes run = self._client.task(task_id).call(**call_kwargs) + if run is None: + msg = f'Task {task_id} call returned no run details.' 
+ raise RuntimeError(msg) self._check_run_status(run) return run diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 40aeeee..a751dad 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -259,7 +259,7 @@ def _run_meta(run: dict) -> dict: # --------------------------------------------------------------------------- -class ApifyRunActorTool(BaseTool): +class ApifyRunActorTool(BaseTool): # type: ignore[override] """Run any Apify Actor by ID with an arbitrary JSON input. Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -301,7 +301,7 @@ def _run( run_input: dict | None = None, timeout_secs: int = 300, memory_mbytes: int | None = None, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) @@ -310,7 +310,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(BaseTool): +class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] """Fetch items from an existing Apify dataset by ID. Returns items as a JSON string. When the dataset is empty the tool returns @@ -344,7 +344,7 @@ def _run( dataset_id: str, limit: int = 100, offset: int = 0, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: items = self._client.get_dataset_items(dataset_id, limit, offset) if not items: @@ -352,7 +352,7 @@ def _run( return json.dumps(items) -class ApifyRunActorAndGetItemsTool(BaseTool): +class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. 
Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -395,7 +395,7 @@ def _run( timeout_secs: int = 300, memory_mbytes: int | None = None, dataset_items_limit: int = 100, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: run, items = self._client.run_actor_and_get_items( @@ -406,7 +406,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(BaseTool): +class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -426,9 +426,7 @@ class ApifyScrapeUrlTool(BaseTool): """ name: str = 'apify_scrape_url' - description: str = ( - 'Scrape a single URL using Apify and return its content as markdown text.' - ) + description: str = 'Scrape a single URL using Apify and return its content as markdown text.' args_schema: type[BaseModel] = ApifyScrapeUrlInput handle_tool_error: bool = True @@ -442,7 +440,7 @@ def _run( self, url: str, timeout_secs: int = 120, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: return self._client.scrape_url(url, timeout_secs) diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 6b9f9fd..d3a627f 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -11,6 +11,7 @@ _REQUESTS_TIMEOUT_SECS: float = 10.0 _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' + def prune_actor_input_schema( input_schema: dict, max_description_len: int = _MAX_DESCRIPTION_LEN, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index f30ed52..89862b1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -5,7 +5,6 @@ import pytest from langchain_apify._client import ApifyToolsClient 
-from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY _SUCCEEDED_RUN: dict = { 'id': 'run-abc', @@ -74,9 +73,7 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with( - run_input={'key': 'val'}, timeout_secs=300 - ) + mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300) assert result == _SUCCEEDED_RUN @@ -146,9 +143,7 @@ def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock result = client.run_task('user/my-task', task_input={'key': 'val'}) mock_apify_client.task.assert_called_once_with('user/my-task') - mock_apify_client.task.return_value.call.assert_called_once_with( - task_input={'key': 'val'}, timeout_secs=300 - ) + mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) assert result == _SUCCEEDED_RUN @@ -227,6 +222,5 @@ def test_check_run_status_succeeded() -> None: def test_check_run_status_failed() -> None: - expected_msg = ERROR_ACTOR_RUN_FAILED.format(run_id='run-bad', status='FAILED') with pytest.raises(RuntimeError, match='run-bad'): ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index f17572f..af43843 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from unittest.mock import MagicMock, patch import pytest @@ -119,7 +119,7 @@ def mock_tools_client() -> MagicMock: return MagicMock(spec=ApifyToolsClient) -def _make_tool(tool_cls: type, mock_client: 
MagicMock) -> ApifyRunActorTool | ApifyGetDatasetItemsTool | ApifyRunActorAndGetItemsTool | ApifyScrapeUrlTool: +def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 """Instantiate a generic tool with a mocked ApifyToolsClient.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tool = tool_cls(apify_api_token='dummy-token') @@ -204,13 +204,13 @@ def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) assert parsed['run']['run_id'] == 'run-abc' assert parsed['run']['status'] == 'SUCCEEDED' assert len(parsed['items']) == 2 - mock_tools_client.run_actor_and_get_items.assert_called_once_with( - 'apify/test', {'q': '1'}, 300, None, 50 - ) + mock_tools_client.run_actor_and_get_items.assert_called_once_with('apify/test', {'q': '1'}, 300, None, 50) def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' 
+ ) tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): From c2b9cb6c68a862fa9f602d9e669988d19611238d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:36:30 +0200 Subject: [PATCH 08/63] feat: enhance error handling and documentation for apify tools --- langchain_apify/_client.py | 30 +++++++++++++--- langchain_apify/tools.py | 63 ++++++++++++++++++++++++++++++---- tests/unit_tests/test_tools.py | 35 +++++++++++++++++++ 3 files changed, 117 insertions(+), 11 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index bf3e01a..8434428 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -65,7 +65,11 @@ def run_actor( if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes - run = self._client.actor(actor_id).call(**call_kwargs) + try: + run = self._client.actor(actor_id).call(**call_kwargs) + except Exception as exc: + msg = f'Network error calling Actor {actor_id}: {exc}' + raise RuntimeError(msg) from exc if run is None: msg = f'Actor {actor_id} call returned no run details.' raise RuntimeError(msg) @@ -85,7 +89,11 @@ def get_dataset_items( Returns: List of dataset item dicts (may be empty). 
""" - return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + try: + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc def run_actor_and_get_items( self, @@ -112,7 +120,11 @@ def run_actor_and_get_items( """ run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + try: + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc return run, items def run_task( @@ -141,7 +153,11 @@ def run_task( if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes - run = self._client.task(task_id).call(**call_kwargs) + try: + run = self._client.task(task_id).call(**call_kwargs) + except Exception as exc: + msg = f'Network error calling task {task_id}: {exc}' + raise RuntimeError(msg) from exc if run is None: msg = f'Task {task_id} call returned no run details.' 
raise RuntimeError(msg) @@ -174,7 +190,11 @@ def run_task_and_get_items( """ run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + try: + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc return run, items def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index a751dad..9b433f3 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -266,6 +266,14 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] string. Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve the results from the dataset. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + Example: .. code-block:: python @@ -283,8 +291,12 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] name: str = 'apify_run_actor' description: str = ( - 'Run an Apify Actor synchronously and return run metadata' - ' (run_id, status, dataset_id, timestamps) as a JSON string.' + 'Run an Apify Actor synchronously and return run metadata as a JSON string.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' 
) args_schema: type[BaseModel] = ApifyRunActorInput handle_tool_error: bool = True @@ -316,6 +328,14 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] Returns items as a JSON string. When the dataset is empty the tool returns an informative JSON message instead of raising an error. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON array of item dicts, or ``{"items": [], "message": "..."}`` when + the dataset is empty. + Example: .. code-block:: python @@ -329,7 +349,12 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] """ name: str = 'apify_get_dataset_items' - description: str = 'Fetch items from an Apify dataset by ID. Returns a JSON array of items.' + description: str = ( + 'Fetch items from an Apify dataset by ID. Returns a JSON array of item dicts.' + ' Required: dataset_id (str) — Apify dataset ID.' + ' Optional: limit (int, default 100), offset (int, default 0).' + ' Returns an empty JSON object with a message when the dataset is empty.' + ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput handle_tool_error: bool = True @@ -359,6 +384,15 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] into a single call. Returns a JSON string with ``run`` (metadata) and ``items`` (list of dicts) keys. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + Example: .. code-block:: python @@ -376,8 +410,12 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] name: str = 'apify_run_actor_and_get_items' description: str = ( - 'Run an Apify Actor synchronously and return both run metadata and' - ' dataset items as a JSON string with "run" and "items" keys.' 
+ 'Run an Apify Actor synchronously and return both run metadata and dataset items.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' ) args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput handle_tool_error: bool = True @@ -413,6 +451,14 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] ``maxCrawlPages=1``. Returns the page content as a plain markdown string (not JSON). + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + Markdown string with the full text content of the scraped page, or a + plain-text fallback when markdown is unavailable. + Example: .. code-block:: python @@ -426,7 +472,12 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] """ name: str = 'apify_scrape_url' - description: str = 'Scrape a single URL using Apify and return its content as markdown text.' + description: str = ( + 'Scrape a single URL using Apify and return its full content as a markdown string.' + ' Required: url (str) — the URL to scrape.' + ' Optional: timeout_secs (int, default 120).' + ' Returns the page content as markdown (or plain text if markdown is unavailable).' 
+ ) args_schema: type[BaseModel] = ApifyScrapeUrlInput handle_tool_error: bool = True diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index af43843..91c53a0 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -8,6 +8,7 @@ from langchain_core.tools import ToolException from pydantic import BaseModel +from langchain_apify import APIFY_CORE_TOOLS from langchain_apify._client import ApifyToolsClient from langchain_apify.tools import ( ApifyActorsTool, @@ -189,6 +190,12 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo assert 'empty' in parsed['message'].lower() +def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyGetDatasetItemsTool() + + # --------------------------------------------------------------------------- # ApifyRunActorAndGetItemsTool # --------------------------------------------------------------------------- @@ -217,6 +224,12 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c tool._run(actor_id='apify/test') +def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorAndGetItemsTool() + + # --------------------------------------------------------------------------- # ApifyScrapeUrlTool # --------------------------------------------------------------------------- @@ -240,6 +253,12 @@ def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMoc tool._run(url='https://example.com') +def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyScrapeUrlTool() + 
+ # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- @@ -267,3 +286,19 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.description assert tool.args_schema is not None assert tool.handle_tool_error is True + + +# --------------------------------------------------------------------------- +# APIFY_CORE_TOOLS list +# --------------------------------------------------------------------------- + + +def test_apify_core_tools_contains_all_four_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 4 generic tool classes.""" + assert set(APIFY_CORE_TOOLS) == { + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, + } + assert len(APIFY_CORE_TOOLS) == 4 From 3edf1265fcbc368494a734a910bfcc9015324d94 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:58:37 +0200 Subject: [PATCH 09/63] fix: iso format fix --- langchain_apify/tools.py | 4 +- tests/unit_tests/test_tools.py | 73 ++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 9b433f3..7e1f11d 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -3,6 +3,7 @@ import json import os from typing import TYPE_CHECKING, Any +from datetime import datetime from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException @@ -239,7 +240,8 @@ class ApifyScrapeUrlInput(BaseModel): def _iso(value: str | None) -> str | None: - """Pass through an ISO timestamp or *None*.""" + if isinstance(value, datetime): + return value.isoformat() return value diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 91c53a0..5afb962 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,6 +1,7 @@ from __future__ import 
annotations import json +from datetime import datetime, timezone from typing import TYPE_CHECKING, Any from unittest.mock import MagicMock, patch @@ -16,6 +17,8 @@ ApifyRunActorAndGetItemsTool, ApifyRunActorTool, ApifyScrapeUrlTool, + _iso, + _run_meta, ) from langchain_apify.utils import actor_id_to_tool_name @@ -128,6 +131,76 @@ def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 return tool +# --------------------------------------------------------------------------- +# _iso / _run_meta helpers +# --------------------------------------------------------------------------- + + +def test_iso_converts_datetime_to_string() -> None: + dt = datetime(2025, 6, 15, 12, 30, 45, tzinfo=timezone.utc) + assert _iso(dt) == '2025-06-15T12:30:45+00:00' + + +def test_iso_passes_through_string() -> None: + assert _iso('2025-01-01T00:00:00.000Z') == '2025-01-01T00:00:00.000Z' + + +def test_iso_passes_through_none() -> None: + assert _iso(None) is None + + +def test_run_meta_with_datetime_values_is_json_serializable() -> None: + run = { + 'id': 'run-dt', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-dt', + 'startedAt': datetime(2025, 3, 1, 10, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 3, 1, 10, 1, 0, tzinfo=timezone.utc), + } + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['run_id'] == 'run-dt' + assert parsed['started_at'] == '2025-03-01T10:00:00+00:00' + assert parsed['finished_at'] == '2025-03-01T10:01:00+00:00' + + +def test_run_meta_with_string_values_is_json_serializable() -> None: + meta = _run_meta(_SUCCEEDED_RUN) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + + +def test_run_meta_with_missing_timestamps() -> None: + run = {'id': 'run-none', 'status': 'RUNNING', 'defaultDatasetId': 'ds-none'} + meta = _run_meta(run) + serialized 
= json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] is None + assert parsed['finished_at'] is None + + +def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: + """End-to-end: ApifyRunActorTool returns valid JSON when the client returns datetime objects.""" + mock_tools_client.run_actor.return_value = { + 'id': 'run-real', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-real', + 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), + } + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test') + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-real' + assert parsed['started_at'] == '2025-06-01T08:00:00+00:00' + assert parsed['finished_at'] == '2025-06-01T08:05:00+00:00' + + # --------------------------------------------------------------------------- # ApifyRunActorTool # --------------------------------------------------------------------------- From 8c36edc824f9e866d7c463d539a5faa401a4f299 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 12:58:32 +0200 Subject: [PATCH 10/63] feat: add apify run task and apify run task and get items tools with input schemas --- langchain_apify/__init__.py | 6 ++ langchain_apify/_client.py | 2 +- langchain_apify/tools.py | 150 ++++++++++++++++++++++++++++++++ tests/unit_tests/test_client.py | 4 +- tests/unit_tests/test_tools.py | 84 +++++++++++++++++- 5 files changed, 239 insertions(+), 7 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 1b65eef..fa1f369 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -8,6 +8,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, ApifyScrapeUrlTool, ) from langchain_apify.wrappers import ApifyWrapper @@ -27,6 +29,8 @@ ApifyGetDatasetItemsTool, 
ApifyRunActorAndGetItemsTool, ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, ] __all__ = [ @@ -38,6 +42,8 @@ 'ApifyGetDatasetItemsTool', 'ApifyRunActorAndGetItemsTool', 'ApifyRunActorTool', + 'ApifyRunTaskAndGetItemsTool', + 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', # Tool group lists 'APIFY_CORE_TOOLS', diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 8434428..c3ed22e 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -61,7 +61,7 @@ def run_actor( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs} + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None} if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 7e1f11d..421aefd 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -234,6 +234,25 @@ class ApifyScrapeUrlInput(BaseModel): timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') +class ApifyRunTaskInput(BaseModel): + """Input schema for :class:`ApifyRunTaskTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + + +class ApifyRunTaskAndGetItemsInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. 
"user/my-task").') + task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -499,3 +518,134 @@ def _run( return self._client.scrape_url(url, timeout_secs) except RuntimeError as exc: raise ToolException(str(exc)) from exc + + +class ApifyRunTaskTool(BaseTool): # type: ignore[override] + """Run a saved Apify Actor task by ID and return run metadata. + + Actor tasks are pre-configured Actor runs saved in the Apify Console. + This tool starts a task with optional input overrides and returns run + metadata (run ID, status, dataset ID, timestamps) as a JSON string. + Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve results. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunTaskTool + + tool = ApifyRunTaskTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task' + description: str = ( + 'Run a saved Apify Actor task synchronously and return run metadata as a JSON string.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' 
+ ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' + ) + args_schema: type[BaseModel] = ApifyRunTaskInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_task(task_id, task_input, timeout_secs, memory_mbytes) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] + """Run a saved Apify Actor task and return both run metadata and dataset items. + + Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunTaskAndGetItemsTool + + tool = ApifyRunTaskAndGetItemsTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task_and_get_items' + description: str = ( + 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' + ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' + ) + args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_task_and_get_items( + task_id, task_input, timeout_secs, memory_mbytes, dataset_items_limit + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 89862b1..95193c4 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -73,7 +73,7 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) 
mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300) + mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) assert result == _SUCCEEDED_RUN @@ -83,7 +83,7 @@ def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: Magi client.run_actor('apify/test-actor', memory_mbytes=512) mock_apify_client.actor.return_value.call.assert_called_once_with( - run_input=None, timeout_secs=300, memory_mbytes=512 + run_input=None, timeout_secs=300, logger=None, memory_mbytes=512 ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 5afb962..1d61eb2 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -16,6 +16,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, ApifyScrapeUrlTool, _iso, _run_meta, @@ -332,19 +334,89 @@ def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: ApifyScrapeUrlTool() +# --------------------------------------------------------------------------- +# ApifyRunTaskTool +# --------------------------------------------------------------------------- + + +def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = _SUCCEEDED_RUN + tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_task.assert_called_once_with('user/my-task', {'key': 'val'}, 300, None) + + 
+def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(task_id='user/my-task') + + +def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskTool() + + +# --------------------------------------------------------------------------- +# ApifyRunTaskAndGetItemsTool +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) + tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_task_and_get_items.assert_called_once_with('user/my-task', {'q': '1'}, 300, None, 50) + + +def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' 
+ ) + tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(task_id='user/my-task') + + +def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskAndGetItemsTool() + + # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- def test_generic_tools_have_correct_metadata() -> None: - """Verify name, description, and args_schema are set on all 4 tools.""" + """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ ApifyRunActorTool(apify_api_token='dummy'), ApifyGetDatasetItemsTool(apify_api_token='dummy'), ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), ApifyScrapeUrlTool(apify_api_token='dummy'), + ApifyRunTaskTool(apify_api_token='dummy'), + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), ] expected_names = [ @@ -352,6 +424,8 @@ def test_generic_tools_have_correct_metadata() -> None: 'apify_get_dataset_items', 'apify_run_actor_and_get_items', 'apify_scrape_url', + 'apify_run_task', + 'apify_run_task_and_get_items', ] for tool, expected_name in zip(tools, expected_names): @@ -366,12 +440,14 @@ def test_generic_tools_have_correct_metadata() -> None: # --------------------------------------------------------------------------- -def test_apify_core_tools_contains_all_four_classes() -> None: - """APIFY_CORE_TOOLS must list exactly the 4 generic tool classes.""" +def test_apify_core_tools_contains_all_generic_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 6 generic tool classes.""" assert set(APIFY_CORE_TOOLS) == { ApifyRunActorTool, ApifyGetDatasetItemsTool, 
ApifyRunActorAndGetItemsTool, ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, } - assert len(APIFY_CORE_TOOLS) == 4 + assert len(APIFY_CORE_TOOLS) == 6 From 026175a49471d1bfa826ad5655dc6ec31696e47b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 13:50:44 +0200 Subject: [PATCH 11/63] feat: introduce _ApifyGenericTool base class for Apify tools to streamline client handling and error management --- langchain_apify/tools.py | 76 +++++++++++++--------------------- tests/unit_tests/test_tools.py | 24 +++++++++++ 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 421aefd..93d884a 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -275,12 +275,34 @@ def _run_meta(run: dict) -> dict: } +# --------------------------------------------------------------------------- +# Shared base for generic tools +# --------------------------------------------------------------------------- + + +class _ApifyGenericTool(BaseTool): # type: ignore[override] + """Shared base for all generic Apify tools. + + Handles ``ApifyToolsClient`` creation and sets ``handle_tool_error``. + Subclasses only need to declare ``name``, ``description``, + ``args_schema``, and ``_run()``. + """ + + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + # --------------------------------------------------------------------------- # Generic tools # --------------------------------------------------------------------------- -class ApifyRunActorTool(BaseTool): # type: ignore[override] +class ApifyRunActorTool(_ApifyGenericTool): """Run any Apify Actor by ID with an arbitrary JSON input. 
Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -320,13 +342,6 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' ) args_schema: type[BaseModel] = ApifyRunActorInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -343,7 +358,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] +class ApifyGetDatasetItemsTool(_ApifyGenericTool): """Fetch items from an existing Apify dataset by ID. Returns items as a JSON string. When the dataset is empty the tool returns @@ -377,13 +392,6 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] ' Returns an empty JSON object with a message when the dataset is empty.' ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -398,7 +406,7 @@ def _run( return json.dumps(items) -class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] +class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -439,13 +447,6 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] ' and items (list of dataset item dicts).' 
) args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -465,7 +466,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] +class ApifyScrapeUrlTool(_ApifyGenericTool): """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -500,13 +501,6 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] ' Returns the page content as markdown (or plain text if markdown is unavailable).' ) args_schema: type[BaseModel] = ApifyScrapeUrlInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -520,7 +514,7 @@ def _run( raise ToolException(str(exc)) from exc -class ApifyRunTaskTool(BaseTool): # type: ignore[override] +class ApifyRunTaskTool(_ApifyGenericTool): """Run a saved Apify Actor task by ID and return run metadata. Actor tasks are pre-configured Actor runs saved in the Apify Console. @@ -561,13 +555,6 @@ class ApifyRunTaskTool(BaseTool): # type: ignore[override] ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' 
) args_schema: type[BaseModel] = ApifyRunTaskInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -584,7 +571,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] +class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): """Run a saved Apify Actor task and return both run metadata and dataset items. Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` @@ -625,13 +612,6 @@ class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] ' and items (list of dataset item dicts).' ) args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 1d61eb2..025b486 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -19,6 +19,7 @@ ApifyRunTaskAndGetItemsTool, ApifyRunTaskTool, ApifyScrapeUrlTool, + _ApifyGenericTool, _iso, _run_meta, ) @@ -435,6 +436,29 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.handle_tool_error is True +# --------------------------------------------------------------------------- +# _ApifyGenericTool inheritance +# --------------------------------------------------------------------------- + + +def test_all_generic_tools_inherit_from_base() -> None: + """Every generic tool must be a subclass of _ApifyGenericTool.""" + for tool_cls in ( + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, 
+ ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, + ): + assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_legacy_tool_does_not_inherit_from_generic_base() -> None: + """ApifyActorsTool is legacy and must NOT inherit from _ApifyGenericTool.""" + assert not issubclass(ApifyActorsTool, _ApifyGenericTool) + + # --------------------------------------------------------------------------- # APIFY_CORE_TOOLS list # --------------------------------------------------------------------------- From 110c971c539509827b426e5bcb60c43b72d0d935 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 14:19:14 +0200 Subject: [PATCH 12/63] feat: add _actor_tools.py file to define upcomming search and social media tools for apify integration --- langchain_apify/_actor_tools.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 langchain_apify/_actor_tools.py diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py new file mode 100644 index 0000000..d7bd850 --- /dev/null +++ b/langchain_apify/_actor_tools.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from langchain_core.tools import ToolException +from pydantic import BaseModel, Field + +from langchain_apify._client import ApifyToolsClient +from langchain_apify.tools import _ApifyGenericTool, _run_meta + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + + +# --------------------------------------------------------------------------- +# Search & Crawling tools +# --------------------------------------------------------------------------- + + +# --------------------------------------------------------------------------- +# Social-media tools +# --------------------------------------------------------------------------- \ No newline at end of file From a08f63ec458179798ca337d1a380cea332b629ef Mon Sep 17 00:00:00 
2001 From: David Omrai Date: Tue, 21 Apr 2026 14:46:35 +0200 Subject: [PATCH 13/63] fix: add try/except to match others --- langchain_apify/tools.py | 5 ++++- tests/unit_tests/test_tools.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 93d884a..8315bdc 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -400,7 +400,10 @@ def _run( offset: int = 0, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: - items = self._client.get_dataset_items(dataset_id, limit, offset) + try: + items = self._client.get_dataset_items(dataset_id, limit, offset) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc if not items: return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) return json.dumps(items) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 025b486..331054d 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -266,6 +266,14 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo assert 'empty' in parsed['message'].lower() +def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='Network error fetching dataset'): + tool._run(dataset_id='ds-bad') + + def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): From d028531588602a1cf1249803b18b41c8f13b3b6a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 14:57:32 +0200 Subject: [PATCH 14/63] fix: update timeout constants and improve input schema 
descripiton in Apify tools --- langchain_apify/_client.py | 7 ++++--- langchain_apify/tools.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index c3ed22e..0409fe0 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -14,7 +14,6 @@ _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 -_DEFAULT_TASK_TIMEOUT_SECS = 300 _DEFAULT_DATASET_ITEMS_LIMIT = 100 _RUN_STATUS_SUCCEEDED = 'SUCCEEDED' @@ -23,7 +22,7 @@ class ApifyToolsClient: """Internal helper that wraps ``ApifyClient`` for the tools layer. One convenience method per tool operation. All methods are synchronous and - block until the Actor run finishes., + block until the Actor run finishes. Args: apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` @@ -118,6 +117,7 @@ def run_actor_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ + # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') try: @@ -168,7 +168,7 @@ def run_task_and_get_items( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = _DEFAULT_TASK_TIMEOUT_SECS, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: @@ -188,6 +188,7 @@ def run_task_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ + # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. 
run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') try: diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 8315bdc..b870f0f 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -2,8 +2,8 @@ import json import os -from typing import TYPE_CHECKING, Any from datetime import datetime +from typing import TYPE_CHECKING, Any from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException @@ -238,18 +238,26 @@ class ApifyRunTaskInput(BaseModel): """Input schema for :class:`ApifyRunTaskTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') - task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') - memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) class ApifyRunTaskAndGetItemsInput(BaseModel): """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') - task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." 
+ ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') - memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') @@ -258,7 +266,7 @@ class ApifyRunTaskAndGetItemsInput(BaseModel): # --------------------------------------------------------------------------- -def _iso(value: str | None) -> str | None: +def _iso(value: str | datetime | None) -> str | None: if isinstance(value, datetime): return value.isoformat() return value @@ -294,6 +302,7 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) + # Token validation (missing env var, empty string) is handled inside ApifyToolsClient.__init__. self._client = ApifyToolsClient(apify_api_token=apify_api_token) From 429a3ed6027b2e79d5b123d43dbdbf5ec3a621d6 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:00:47 +0200 Subject: [PATCH 15/63] fix: enhance error handling for missing dataset id in run_actor and run_task methods --- langchain_apify/_client.py | 10 ++++++++-- tests/unit_tests/test_client.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 0409fe0..b131484 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -119,7 +119,10 @@ def run_actor_and_get_items( """ # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. 
run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) - dataset_id = run.get('defaultDatasetId', '') + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) try: items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items except Exception as exc: @@ -190,7 +193,10 @@ def run_task_and_get_items( """ # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) - dataset_id = run.get('defaultDatasetId', '') + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Task {task_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) try: items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items except Exception as exc: diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 95193c4..5485d8c 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -132,6 +132,14 @@ def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_cl mock_apify_client.dataset.assert_called_once_with('dataset-xyz') +def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_actor_and_get_items('apify/test-actor') + + # --------------------------------------------------------------------------- # run_task # --------------------------------------------------------------------------- @@ -169,6 +177,14 @@ def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_cli assert items == 
_SAMPLE_ITEMS +def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_task_and_get_items('user/my-task') + + # --------------------------------------------------------------------------- # scrape_url # --------------------------------------------------------------------------- From b914e47dfbefb8f13eeddf1ce6512efaa3d31b64 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:15:50 +0200 Subject: [PATCH 16/63] fix: update apifygetdatasetitemstool to return a json object with items and message for empty dataset --- langchain_apify/tools.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index b870f0f..af0d84b 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -370,16 +370,16 @@ def _run( class ApifyGetDatasetItemsTool(_ApifyGenericTool): """Fetch items from an existing Apify dataset by ID. - Returns items as a JSON string. When the dataset is empty the tool returns - an informative JSON message instead of raising an error. + Returns a JSON object with an ``"items"`` key containing the list of item + dicts. When the dataset is empty an additional ``"message"`` key is + included. Args: apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` environment variable when *None*. Returns: - JSON array of item dicts, or ``{"items": [], "message": "..."}`` when - the dataset is empty. + JSON object ``{"items": [...]}``; includes ``"message"`` when empty. Example: .. code-block:: python @@ -395,10 +395,9 @@ class ApifyGetDatasetItemsTool(_ApifyGenericTool): name: str = 'apify_get_dataset_items' description: str = ( - 'Fetch items from an Apify dataset by ID. 
Returns a JSON array of item dicts.' + 'Fetch items from an Apify dataset by ID. Returns a JSON object with an "items" array.' ' Required: dataset_id (str) — Apify dataset ID.' ' Optional: limit (int, default 100), offset (int, default 0).' - ' Returns an empty JSON object with a message when the dataset is empty.' ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput @@ -415,7 +414,7 @@ def _run( raise ToolException(str(exc)) from exc if not items: return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) - return json.dumps(items) + return json.dumps({'items': items}) class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): From 0f7118180ddd8aa583b7ac636d2aa33aefee5e68 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:16:28 +0200 Subject: [PATCH 17/63] feat: add integration smoke tests for generic Apify tools to validate api interaction --- tests/integration_tests/test_generic_tools.py | 68 +++++++++++++++++++ tests/unit_tests/test_tools.py | 6 +- 2 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 tests/integration_tests/test_generic_tools.py diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py new file mode 100644 index 0000000..58c5cbf --- /dev/null +++ b/tests/integration_tests/test_generic_tools.py @@ -0,0 +1,68 @@ +"""Integration smoke tests for the generic Apify tools. + +These tests hit the real Apify API and require the ``APIFY_API_TOKEN`` +environment variable to be set. They use ``apify/python-example`` (a +trivial Actor that adds two numbers) to keep execution fast and cheap. 
+""" + +from __future__ import annotations + +import json +import os + +import pytest + +from langchain_apify import ( + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) + +_ACTOR_ID = 'apify/python-example' +_RUN_INPUT = {'first_number': 2, 'second_number': 3} + +pytestmark = pytest.mark.skipif( + not os.getenv('APIFY_API_TOKEN'), + reason='APIFY_API_TOKEN not set', +) + + +def test_run_actor_tool_smoke() -> None: + tool = ApifyRunActorTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +def test_get_dataset_items_tool_smoke() -> None: + run_tool = ApifyRunActorTool() + run_result = json.loads(run_tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT})) + dataset_id = run_result['dataset_id'] + + items_tool = ApifyGetDatasetItemsTool() + result = items_tool.invoke({'dataset_id': dataset_id, 'limit': 10}) + + parsed = json.loads(result) + assert 'items' in parsed + assert isinstance(parsed['items'], list) + + +def test_run_actor_and_get_items_tool_smoke() -> None: + tool = ApifyRunActorAndGetItemsTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) + + +def test_scrape_url_tool_smoke() -> None: + tool = ApifyScrapeUrlTool() + result = tool.invoke({'url': 'https://crawlee.dev'}) + + assert isinstance(result, str) + assert len(result) > 0 diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 331054d..21e1009 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -243,15 +243,15 @@ def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: # --------------------------------------------------------------------------- -def 
test_get_dataset_items_tool_returns_json_array(mock_tools_client: MagicMock) -> None: +def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) parsed = json.loads(result) - assert len(parsed) == 2 - assert parsed[0]['text'] == 'item-1' + assert len(parsed['items']) == 2 + assert parsed['items'][0]['text'] == 'item-1' mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) From 50c52f2cda5b3007c63a85dd52c5f7e82b8321e9 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 16:01:30 +0200 Subject: [PATCH 18/63] feat: implement clamping for timeout, memory, and item limits in apify tools to enforce safety constraints --- langchain_apify/tools.py | 46 ++++- tests/integration_tests/test_generic_tools.py | 26 +++ tests/unit_tests/conftest.py | 51 ++++++ tests/unit_tests/test_client.py | 77 +++----- tests/unit_tests/test_tools.py | 173 ++++++++++++------ 5 files changed, 260 insertions(+), 113 deletions(-) create mode 100644 tests/unit_tests/conftest.py diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index af0d84b..31f55d5 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -267,6 +267,7 @@ class ApifyRunTaskAndGetItemsInput(BaseModel): def _iso(value: str | datetime | None) -> str | None: + """Coerce a possible ``datetime`` to an ISO-8601 string.""" if isinstance(value, datetime): return value.isoformat() return value @@ -291,20 +292,37 @@ def _run_meta(run: dict) -> dict: class _ApifyGenericTool(BaseTool): # type: ignore[override] """Shared base for all generic Apify tools. - Handles ``ApifyToolsClient`` creation and sets ``handle_tool_error``. 
+ Handles ``ApifyToolsClient`` creation, sets ``handle_tool_error``, + and defines developer-controlled safety limits that clamp values the + LLM may provide at invocation time. + Subclasses only need to declare ``name``, ``description``, ``args_schema``, and ``_run()``. """ handle_tool_error: bool = True + max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') + max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') + max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') + _client: ApifyToolsClient def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) - # Token validation (missing env var, empty string) is handled inside ApifyToolsClient.__init__. self._client = ApifyToolsClient(apify_api_token=apify_api_token) + def _clamp_timeout(self, value: int) -> int: + return min(value, self.max_timeout_secs) + + def _clamp_memory(self, value: int | None) -> int | None: + if value is None: + return None + return min(value, self.max_memory_mbytes) + + def _clamp_items(self, value: int) -> int: + return min(value, self.max_items) + # --------------------------------------------------------------------------- # Generic tools @@ -361,7 +379,9 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + run = self._client.run_actor( + actor_id, run_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps(_run_meta(run)) @@ -409,7 +429,7 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - items = self._client.get_dataset_items(dataset_id, limit, offset) + items = 
self._client.get_dataset_items(dataset_id, self._clamp_items(limit), offset) except RuntimeError as exc: raise ToolException(str(exc)) from exc if not items: @@ -470,7 +490,11 @@ def _run( ) -> str: try: run, items = self._client.run_actor_and_get_items( - actor_id, run_input, timeout_secs, memory_mbytes, dataset_items_limit + actor_id, + run_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), ) except RuntimeError as exc: raise ToolException(str(exc)) from exc @@ -520,7 +544,7 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - return self._client.scrape_url(url, timeout_secs) + return self._client.scrape_url(url, self._clamp_timeout(timeout_secs)) except RuntimeError as exc: raise ToolException(str(exc)) from exc @@ -576,7 +600,9 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - run = self._client.run_task(task_id, task_input, timeout_secs, memory_mbytes) + run = self._client.run_task( + task_id, task_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps(_run_meta(run)) @@ -635,7 +661,11 @@ def _run( ) -> str: try: run, items = self._client.run_task_and_get_items( - task_id, task_input, timeout_secs, memory_mbytes, dataset_items_limit + task_id, + task_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), ) except RuntimeError as exc: raise ToolException(str(exc)) from exc diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py index 58c5cbf..863efb1 100644 --- a/tests/integration_tests/test_generic_tools.py +++ b/tests/integration_tests/test_generic_tools.py @@ -16,6 +16,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, 
ApifyScrapeUrlTool, ) @@ -66,3 +68,27 @@ def test_scrape_url_tool_smoke() -> None: assert isinstance(result, str) assert len(result) > 0 + + +_TASK_ID = os.getenv('APIFY_TASK_ID', '') + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_tool_smoke() -> None: + tool = ApifyRunTaskTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_and_get_items_tool_smoke() -> None: + tool = ApifyRunTaskAndGetItemsTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..eedadb9 --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient + +SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client.create_apify_client', 
return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +def make_tool(tool_cls: type, mock_client: MagicMock, **kwargs: Any) -> Any: # noqa: ANN401 + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token', **kwargs) + tool._client = mock_client + return tool diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 5485d8c..c35f495 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -5,36 +5,7 @@ import pytest from langchain_apify._client import ApifyToolsClient - -_SUCCEEDED_RUN: dict = { - 'id': 'run-abc', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'dataset-xyz', - 'startedAt': '2025-01-01T00:00:00.000Z', - 'finishedAt': '2025-01-01T00:01:00.000Z', -} - -_FAILED_RUN: dict = { - 'id': 'run-fail', - 'status': 'FAILED', - 'defaultDatasetId': 'dataset-xyz', -} - -_SAMPLE_ITEMS: list[dict] = [ - {'text': 'item-1', 'url': 'https://example.com/1'}, - {'text': 'item-2', 'url': 'https://example.com/2'}, -] - - -@pytest.fixture -def mock_apify_client() -> MagicMock: - return MagicMock() - - -@pytest.fixture -def client(mock_apify_client: MagicMock) -> ApifyToolsClient: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): - return ApifyToolsClient(apify_api_token='dummy-token') +from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN # --------------------------------------------------------------------------- @@ -68,17 +39,17 @@ def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN result = client.run_actor('apify/test-actor', 
run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) - assert result == _SUCCEEDED_RUN + assert result == SUCCEEDED_RUN def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN client.run_actor('apify/test-actor', memory_mbytes=512) @@ -88,7 +59,7 @@ def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: Magi def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _FAILED_RUN + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN with pytest.raises(RuntimeError, match='run-fail'): client.run_actor('apify/test-actor') @@ -100,13 +71,13 @@ def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_cli def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) mock_apify_client.dataset.assert_called_once_with('dataset-xyz') mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) - assert items == _SAMPLE_ITEMS + assert items == SAMPLE_ITEMS def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: @@ -122,18 +93,18 @@ def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: Ma def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> 
None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) - assert run == _SUCCEEDED_RUN - assert items == _SAMPLE_ITEMS + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS mock_apify_client.dataset.assert_called_once_with('dataset-xyz') def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.actor.return_value.call.return_value = run_no_dataset with pytest.raises(RuntimeError, match='no default dataset ID'): @@ -146,17 +117,17 @@ def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsCli def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN result = client.run_task('user/my-task', task_input={'key': 'val'}) mock_apify_client.task.assert_called_once_with('user/my-task') mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) - assert result == _SUCCEEDED_RUN + assert result == SUCCEEDED_RUN def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _FAILED_RUN + mock_apify_client.task.return_value.call.return_value = FAILED_RUN with pytest.raises(RuntimeError, match='run-fail'): client.run_task('user/my-task') @@ -168,17 +139,17 @@ def 
test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_clie def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS run, items = client.run_task_and_get_items('user/my-task') - assert run == _SUCCEEDED_RUN - assert items == _SAMPLE_ITEMS + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.task.return_value.call.return_value = run_no_dataset with pytest.raises(RuntimeError, match='no default dataset ID'): @@ -191,7 +162,7 @@ def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClie def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, ] @@ -201,7 +172,7 @@ def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'text': 
'Plain text content', 'url': 'https://example.com'}, ] @@ -211,7 +182,7 @@ def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_clie def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [] with pytest.raises(RuntimeError, match='No content extracted'): @@ -219,7 +190,7 @@ def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_clie def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'markdown': '', 'text': '', 'url': 'https://example.com'}, ] diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 21e1009..61e4c8b 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -2,7 +2,7 @@ import json from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch import pytest @@ -24,6 +24,7 @@ _run_meta, ) from langchain_apify.utils import actor_id_to_tool_name +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: from collections.abc import Generator @@ -103,37 +104,6 @@ class DummyModel(BaseModel): yield tool -# --------------------------------------------------------------------------- -# Shared test data for generic tools -# --------------------------------------------------------------------------- - -_SUCCEEDED_RUN: dict = { - 'id': 'run-abc', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'dataset-xyz', 
- 'startedAt': '2025-01-01T00:00:00.000Z', - 'finishedAt': '2025-01-01T00:01:00.000Z', -} - -_SAMPLE_ITEMS: list[dict] = [ - {'text': 'item-1', 'url': 'https://example.com/1'}, - {'text': 'item-2', 'url': 'https://example.com/2'}, -] - - -@pytest.fixture -def mock_tools_client() -> MagicMock: - return MagicMock(spec=ApifyToolsClient) - - -def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 - """Instantiate a generic tool with a mocked ApifyToolsClient.""" - with patch.object(ApifyToolsClient, '__init__', return_value=None): - tool = tool_cls(apify_api_token='dummy-token') - tool._client = mock_client - return tool - - # --------------------------------------------------------------------------- # _iso / _run_meta helpers # --------------------------------------------------------------------------- @@ -169,7 +139,7 @@ def test_run_meta_with_datetime_values_is_json_serializable() -> None: def test_run_meta_with_string_values_is_json_serializable() -> None: - meta = _run_meta(_SUCCEEDED_RUN) + meta = _run_meta(SUCCEEDED_RUN) serialized = json.dumps(meta) parsed = json.loads(serialized) assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' @@ -194,7 +164,7 @@ def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), } - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + tool = make_tool(ApifyRunActorTool, mock_tools_client) result = tool._run(actor_id='apify/test') @@ -210,8 +180,8 @@ def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor.return_value = _SUCCEEDED_RUN - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client) result = 
tool._run(actor_id='apify/test', run_input={'key': 'val'}) @@ -226,7 +196,7 @@ def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + tool = make_tool(ApifyRunActorTool, mock_tools_client) with pytest.raises(ToolException, match='FAILED'): tool._run(actor_id='apify/test') @@ -244,8 +214,8 @@ def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: - mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) @@ -257,7 +227,7 @@ def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.return_value = [] - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-empty') @@ -268,7 +238,7 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='Network 
error fetching dataset'): tool._run(dataset_id='ds-bad') @@ -286,8 +256,8 @@ def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) - def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) - tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) @@ -302,7 +272,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(actor_id='apify/test') @@ -321,7 +291,7 @@ def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPa def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: mock_tools_client.scrape_url.return_value = '# Hello World' - tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) result = tool._run(url='https://example.com') @@ -331,7 +301,7 @@ def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') - tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) with pytest.raises(ToolException, match='No content extracted'): 
tool._run(url='https://example.com') @@ -349,8 +319,8 @@ def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_task.return_value = _SUCCEEDED_RUN - tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) @@ -365,7 +335,7 @@ def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') - tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + tool = make_tool(ApifyRunTaskTool, mock_tools_client) with pytest.raises(ToolException, match='FAILED'): tool._run(task_id='user/my-task') @@ -383,8 +353,8 @@ def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_task_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) - tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) @@ -399,7 +369,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' 
) - tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(task_id='user/my-task') @@ -411,6 +381,105 @@ def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPat ApifyRunTaskAndGetItemsTool() +# --------------------------------------------------------------------------- +# Value clamping (developer safety limits) +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=60) + + tool._run(actor_id='apify/test', timeout_secs=9999) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 60, None) + + +def test_run_actor_tool_clamps_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=8192) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 512) + + +def test_run_actor_tool_passes_none_memory_through(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=None) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=10) + + tool._run(dataset_id='ds-1', limit=50000) + + 
mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 10, 0) + + +def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunActorAndGetItemsTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(actor_id='a', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_actor_and_get_items.assert_called_once_with('a', None, 30, 256, 5) + + +def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# content' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30) + + tool._run(url='https://example.com', timeout_secs=9999) + + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 30) + + +def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client, max_timeout_secs=60, max_memory_mbytes=512) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999) + + mock_tools_client.run_task.assert_called_once_with('t/1', None, 60, 512) + + +def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunTaskAndGetItemsTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) + + +def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: + """When LLM values are within limits they should pass through 
unchanged.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', timeout_secs=120, memory_mbytes=1024) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 120, 1024) + + # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- From ba179a6c043ee12cd4e387d48ea80f9112e0999b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 07:37:35 +0200 Subject: [PATCH 19/63] feat: clean up _actor_tools.py and tools.py for improved readability and maintainability; update test cases for better formatting and error handling --- langchain_apify/_actor_tools.py | 20 +++------ langchain_apify/tools.py | 16 +++---- tests/unit_tests/test_client.py | 78 +++++++++++++++++++++++++++++++-- tests/unit_tests/test_tools.py | 16 ++++--- 4 files changed, 98 insertions(+), 32 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index d7bd850..a989b11 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -1,17 +1,11 @@ -from __future__ import annotations - -import json -from typing import TYPE_CHECKING, Any - -from langchain_core.tools import ToolException -from pydantic import BaseModel, Field +"""Actor-specific tool subclasses (search, social-media, etc.). -from langchain_apify._client import ApifyToolsClient -from langchain_apify.tools import _ApifyGenericTool, _run_meta - -if TYPE_CHECKING: - from langchain_core.callbacks import CallbackManagerForToolRun +Downstream feature branches add concrete tools here. They inherit from +:class:`~langchain_apify.tools._ApifyGenericTool` and use +:func:`~langchain_apify.tools._run_meta` to format run metadata. 
+""" +from __future__ import annotations # --------------------------------------------------------------------------- # Search & Crawling tools @@ -20,4 +14,4 @@ # --------------------------------------------------------------------------- # Social-media tools -# --------------------------------------------------------------------------- \ No newline at end of file +# --------------------------------------------------------------------------- diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 31f55d5..f771d35 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -7,7 +7,7 @@ from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException -from pydantic import BaseModel, Field, create_model +from pydantic import BaseModel, Field, PrivateAttr, create_model from langchain_apify._client import ApifyToolsClient from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -306,7 +306,7 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') - _client: ApifyToolsClient + _client: ApifyToolsClient = PrivateAttr() def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) @@ -329,7 +329,7 @@ def _clamp_items(self, value: int) -> int: # --------------------------------------------------------------------------- -class ApifyRunActorTool(_ApifyGenericTool): +class ApifyRunActorTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor by ID with an arbitrary JSON input. 
Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -387,7 +387,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(_ApifyGenericTool): +class ApifyGetDatasetItemsTool(_ApifyGenericTool): # type: ignore[override] """Fetch items from an existing Apify dataset by ID. Returns a JSON object with an ``"items"`` key containing the list of item @@ -437,7 +437,7 @@ def _run( return json.dumps({'items': items}) -class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): +class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -501,7 +501,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(_ApifyGenericTool): +class ApifyScrapeUrlTool(_ApifyGenericTool): # type: ignore[override] """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -549,7 +549,7 @@ def _run( raise ToolException(str(exc)) from exc -class ApifyRunTaskTool(_ApifyGenericTool): +class ApifyRunTaskTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task by ID and return run metadata. Actor tasks are pre-configured Actor runs saved in the Apify Console. @@ -608,7 +608,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): +class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task and return both run metadata and dataset items. 
Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index c35f495..1c93f84 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -7,7 +7,6 @@ from langchain_apify._client import ApifyToolsClient from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN - # --------------------------------------------------------------------------- # __init__ # --------------------------------------------------------------------------- @@ -44,7 +43,9 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300, logger=None + ) assert result == SUCCEEDED_RUN @@ -103,7 +104,9 @@ def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_cl mock_apify_client.dataset.assert_called_once_with('dataset-xyz') -def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: +def test_run_actor_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.actor.return_value.call.return_value = run_no_dataset @@ -148,7 +151,9 @@ def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_cli assert items == SAMPLE_ITEMS -def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: +def test_run_task_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, 
mock_apify_client: MagicMock +) -> None: run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.task.return_value.call.return_value = run_no_dataset @@ -211,3 +216,68 @@ def test_check_run_status_succeeded() -> None: def test_check_run_status_failed() -> None: with pytest.raises(RuntimeError, match='run-bad'): ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) + + +# --------------------------------------------------------------------------- +# None returns from actor/task .call() +# --------------------------------------------------------------------------- + + +def test_run_actor_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_actor('apify/broken-actor') + + +def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_task('user/broken-task') + + +# --------------------------------------------------------------------------- +# Network error wrapping (transport exception -> RuntimeError) +# --------------------------------------------------------------------------- + + +def test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.side_effect = ConnectionError('conn refused') + + with pytest.raises(RuntimeError, match='Network error calling Actor'): + client.run_actor('apify/test-actor') + + +def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('timeout') + + with pytest.raises(RuntimeError, match='Network error fetching 
dataset'): + client.get_dataset_items('dataset-xyz') + + +def test_run_actor_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + + with pytest.raises(RuntimeError, match='Network error fetching dataset'): + client.run_actor_and_get_items('apify/test-actor') + + +def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.side_effect = ConnectionError('conn refused') + + with pytest.raises(RuntimeError, match='Network error calling task'): + client.run_task('user/my-task') + + +def test_run_task_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + + with pytest.raises(RuntimeError, match='Network error fetching dataset'): + client.run_task_and_get_items('user/my-task') diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 61e4c8b..6698589 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -237,7 +237,9 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: - mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') + mock_tools_client.get_dataset_items.side_effect = RuntimeError( + 'Network error fetching dataset ds-bad: connection reset' + ) tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='Network error fetching dataset'): @@ 
-489,12 +491,12 @@ def test_generic_tools_have_correct_metadata() -> None: """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ - ApifyRunActorTool(apify_api_token='dummy'), - ApifyGetDatasetItemsTool(apify_api_token='dummy'), - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), - ApifyScrapeUrlTool(apify_api_token='dummy'), - ApifyRunTaskTool(apify_api_token='dummy'), - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] ] expected_names = [ From da900ce0cdcdad33853b2bea33774fa042addd8f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 09:52:52 +0200 Subject: [PATCH 20/63] feat: add three new tools to _client.py --- langchain_apify/_client.py | 130 +++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index b131484..e6b4468 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -12,8 +12,12 @@ from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_CRAWL_ACTOR_ID = 'apify/website-content-crawler' +_GOOGLE_SEARCH_ACTOR_ID = 'apify/google-search-scraper' +_RAG_WEB_BROWSER_ACTOR_ID = 'apify/rag-web-browser' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_CRAWL_TIMEOUT_SECS = 300 _DEFAULT_DATASET_ITEMS_LIMIT = 100 _RUN_STATUS_SUCCEEDED = 'SUCCEEDED' @@ -239,6 +243,132 @@ def scrape_url(self, url: str, timeout_secs: 
int = _DEFAULT_SCRAPE_TIMEOUT_SECS) raise RuntimeError(msg) return content + def google_search( + self, + query: str, + max_results: int = 10, + country_code: str | None = None, + language_code: str | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> list[dict]: + """Run a Google search and return structured results. + + Uses ``apify/google-search-scraper`` with a single query. + + Args: + query: Search query string. + max_results: Maximum number of results to return. + country_code: Two-letter country code for localised results. + language_code: Two-letter language code. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + List of result dicts, each with ``title``, ``url``, and + ``description`` keys. + + Raises: + RuntimeError: If the Actor run fails. + """ + run_input: dict = { + 'queries': query, + 'maxPagesPerQuery': 1, + 'resultsPerPage': max_results, + } + if country_code is not None: + run_input['countryCode'] = country_code + if language_code is not None: + run_input['languageCode'] = language_code + + _, items = self.run_actor_and_get_items( + _GOOGLE_SEARCH_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + results: list[dict] = [] + for item in items: + for organic in item.get('organicResults', []): + results.append({ + 'title': organic.get('title', ''), + 'url': organic.get('url', ''), + 'description': organic.get('description', ''), + }) + return results[:max_results] + + def rag_web_search( + self, + query: str, + max_results: int = 5, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> list[dict]: + """Search the web and return crawled page content for RAG. + + Uses ``apify/rag-web-browser``. + + Args: + query: Search query string. + max_results: Maximum number of results to return. + timeout_secs: Maximum time to wait for the run to finish. 
+ + Returns: + List of result dicts with ``crawledUrl``, ``title``, and + ``text`` keys (among others from the Actor). + + Raises: + RuntimeError: If the Actor run fails. + """ + run_input: dict = { + 'query': query, + 'maxResults': max_results, + } + _, items = self.run_actor_and_get_items( + _RAG_WEB_BROWSER_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + return items + + def crawl_website( + self, + url: str, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: str = 'cheerio', + timeout_secs: int = _DEFAULT_CRAWL_TIMEOUT_SECS, + ) -> list[dict]: + """Crawl a website and return page content. + + Uses ``apify/website-content-crawler``. + + Args: + url: Seed URL to start crawling from. + max_crawl_pages: Maximum number of pages to crawl. + max_crawl_depth: Maximum link-follow depth from the seed URL. + crawler_type: Crawler engine (e.g. ``"cheerio"``, ``"playwright"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + List of page dicts, each with at least ``url``, ``title``, and + ``markdown`` (or ``text``) keys. + + Raises: + RuntimeError: If the Actor run fails. 
+ """ + run_input: dict = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': max_crawl_pages, + 'maxCrawlDepth': max_crawl_depth, + 'crawlerType': crawler_type, + } + _, items = self.run_actor_and_get_items( + _CRAWL_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_crawl_pages, + ) + return items + @staticmethod def _check_run_status(run: dict) -> None: """Raise if the run did not succeed.""" From ff6ffebad79eac660a42eb7fcf0828932fffeedf Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 09:57:48 +0200 Subject: [PATCH 21/63] feat: implement apifygooglesearchtool and apifywebcrawlertool --- langchain_apify/_actor_tools.py | 143 ++++++++++++++++++++++++++++++++ langchain_apify/tools.py | 19 +++++ 2 files changed, 162 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index a989b11..c62d912 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -7,11 +7,154 @@ from __future__ import annotations +import json +from typing import TYPE_CHECKING + +from langchain_core.tools import ToolException +from pydantic import BaseModel + +from langchain_apify.tools import ( + ApifyGoogleSearchInput, + ApifyWebCrawlerInput, + _ApifyGenericTool, +) + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + # --------------------------------------------------------------------------- # Search & Crawling tools # --------------------------------------------------------------------------- +class ApifyGoogleSearchTool(_ApifyGenericTool): # type: ignore[override] + """Search Google and return structured results via Apify. + + Wraps the ``apify/google-search-scraper`` Actor behind a simplified, + LLM-friendly interface. Returns a JSON string containing an array of + result objects, each with ``title``, ``url``, and ``description`` keys. + + Args: + apify_api_token: Apify API token. 
Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string — an array of ``{"title", "url", "description"}`` objects. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyGoogleSearchTool + + tool = ApifyGoogleSearchTool() + results = tool.invoke({"query": "LangChain framework"}) + """ + + name: str = 'apify_google_search' + description: str = ( + 'Search Google using Apify and return structured results as a JSON array.' + ' Each result has keys: title, url, description.' + ' Required: query (str) — the search query.' + ' Optional: max_results (int, default 10),' + ' country_code (str|null), language_code (str|null).' + ) + args_schema: type[BaseModel] = ApifyGoogleSearchInput + + def _run( + self, + query: str, + max_results: int = 10, + country_code: str | None = None, + language_code: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + results = self._client.google_search( + query, + max_results=self._clamp_items(max_results), + country_code=country_code, + language_code=language_code, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(results) + + +class ApifyWebCrawlerTool(_ApifyGenericTool): # type: ignore[override] + """Crawl a website and return page content as JSON via Apify. + + Wraps the ``apify/website-content-crawler`` Actor. Returns a JSON string + containing an array of page objects, each with ``url``, ``title``, and + ``content`` (markdown) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string — an array of ``{"url", "title", "content"}`` objects. + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyWebCrawlerTool + + tool = ApifyWebCrawlerTool() + pages = tool.invoke({ + "url": "https://docs.apify.com", + "max_crawl_pages": 5, + }) + """ + + name: str = 'apify_web_crawler' + description: str = ( + 'Crawl a website using Apify and return page content as a JSON array.' + ' Each page object has keys: url, title, content (markdown).' + ' Required: url (str) — seed URL to crawl.' + ' Optional: max_crawl_pages (int, default 10),' + ' max_crawl_depth (int, default 1),' + ' crawler_type (str, default "cheerio"),' + ' timeout_secs (int, default 300).' + ) + args_schema: type[BaseModel] = ApifyWebCrawlerInput + + def _run( + self, + url: str, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: str = 'cheerio', + timeout_secs: int = 300, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + items = self._client.crawl_website( + url, + max_crawl_pages=self._clamp_items(max_crawl_pages), + max_crawl_depth=max_crawl_depth, + crawler_type=crawler_type, + timeout_secs=self._clamp_timeout(timeout_secs), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + pages = [ + { + 'url': item.get('url', ''), + 'title': item.get('metadata', {}).get('title', ''), + 'content': item.get('markdown') or item.get('text', ''), + } + for item in items + ] + return json.dumps(pages) + + # --------------------------------------------------------------------------- # Social-media tools # --------------------------------------------------------------------------- diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index f771d35..1db4c55 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -234,6 +234,25 @@ class ApifyScrapeUrlInput(BaseModel): timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') +class 
ApifyGoogleSearchInput(BaseModel): + """Input schema for :class:`ApifyGoogleSearchTool`.""" + + query: str = Field(description='Search query string.') + max_results: int = Field(default=10, description='Maximum number of search results to return.') + country_code: str | None = Field(default=None, description='Two-letter country code for localised results.') + language_code: str | None = Field(default=None, description='Two-letter language code.') + + +class ApifyWebCrawlerInput(BaseModel): + """Input schema for :class:`ApifyWebCrawlerTool`.""" + + url: str = Field(description='Seed URL to start crawling from.') + max_crawl_pages: int = Field(default=10, description='Maximum number of pages to crawl.') + max_crawl_depth: int = Field(default=1, description='Maximum link-follow depth from the seed URL.') + crawler_type: str = Field(default='cheerio', description='Crawler engine (e.g. "cheerio", "playwright").') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the crawl to finish.') + + class ApifyRunTaskInput(BaseModel): """Input schema for :class:`ApifyRunTaskTool`.""" From 6e8888cc1663fa18214b09c4807f59ac7970fa69 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:02:45 +0200 Subject: [PATCH 22/63] feat: implement an apify search retriever --- langchain_apify/retrievers.py | 135 ++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 langchain_apify/retrievers.py diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py new file mode 100644 index 0000000..0c990d3 --- /dev/null +++ b/langchain_apify/retrievers.py @@ -0,0 +1,135 @@ +"""LangChain retrievers backed by Apify Actors.""" + +from __future__ import annotations + +import os +from typing import TYPE_CHECKING, Any + +from apify_client import ApifyClient, ApifyClientAsync +from langchain_core.documents import Document +from langchain_core.retrievers import BaseRetriever +from pydantic import Field, 
PrivateAttr + +from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify.utils import create_apify_client + +if TYPE_CHECKING: + from langchain_core.callbacks import ( + AsyncCallbackManagerForRetrieverRun, + CallbackManagerForRetrieverRun, + ) + +_RAG_WEB_BROWSER_ACTOR_ID = 'apify/rag-web-browser' +_DEFAULT_TIMEOUT_SECS = 300 + + +class ApifySearchRetriever(BaseRetriever): + """Retrieve documents from the web for RAG using Apify. + + Wraps the ``apify/rag-web-browser`` Actor. Each invocation runs a web + search, crawls the top results, and returns their content as LangChain + ``Document`` objects ready for a RAG pipeline. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + max_results: Maximum number of ``Document`` objects to return per query. + timeout_secs: Maximum time in seconds to wait for the Actor run. + + Returns: + List of ``Document`` objects. ``page_content`` contains the crawled + text; ``metadata`` includes ``source`` (URL) and ``title``. + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifySearchRetriever + + retriever = ApifySearchRetriever(max_results=3) + docs = retriever.invoke("What is LangChain?") + """ + + max_results: int = Field(default=5, description='Maximum number of documents to return.') + timeout_secs: int = Field(default=_DEFAULT_TIMEOUT_SECS, description='Maximum Actor run time in seconds.') + + _sync_client: ApifyClient = PrivateAttr() + _async_client: ApifyClientAsync = PrivateAttr() + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + token = apify_api_token or os.getenv('APIFY_API_TOKEN') + if not token: + msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._sync_client = create_apify_client(ApifyClient, token) + self._async_client = create_apify_client(ApifyClientAsync, token) + + def _get_relevant_documents( + self, + query: str, + *, + run_manager: CallbackManagerForRetrieverRun | None = None, + ) -> list[Document]: + run_input = { + 'query': query, + 'maxResults': self.max_results, + } + run = self._sync_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( + run_input=run_input, + timeout_secs=self.timeout_secs, + ) + if run is None: + return [] + + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + return [] + + items = self._sync_client.dataset(dataset_id).list_items( + limit=self.max_results, clean=True, + ).items + return self._items_to_documents(items) + + async def _aget_relevant_documents( + self, + query: str, + *, + run_manager: AsyncCallbackManagerForRetrieverRun | None = None, + ) -> list[Document]: + run_input = { + 'query': query, + 'maxResults': self.max_results, + } + run = await self._async_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( + run_input=run_input, + timeout_secs=self.timeout_secs, + ) + if run is None: + return [] + + dataset_id = run.get('defaultDatasetId') + if not 
dataset_id: + return [] + + items = ( + await self._async_client.dataset(dataset_id).list_items( + limit=self.max_results, clean=True, + ) + ).items + return self._items_to_documents(items) + + @staticmethod + def _items_to_documents(items: list[dict]) -> list[Document]: + """Convert Actor dataset items to LangChain Documents.""" + docs: list[Document] = [] + for item in items: + page_content = item.get('text') or item.get('markdown') or '' + metadata: dict[str, Any] = { + 'source': item.get('crawledUrl') or item.get('url', ''), + 'title': item.get('metadata', {}).get('title', '') if isinstance(item.get('metadata'), dict) else '', + } + docs.append(Document(page_content=page_content, metadata=metadata)) + return docs From b124ce154ca678356ea4901143f4f8825864cfcf Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:05:57 +0200 Subject: [PATCH 23/63] feat: add apify crawl loader to document_loaders.py --- langchain_apify/document_loaders.py | 89 ++++++++++++++++++++++++++++- 1 file changed, 88 insertions(+), 1 deletion(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 49befb6..d8064a8 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -8,8 +8,9 @@ from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator +from langchain_apify._client import ApifyToolsClient from langchain_apify.utils import create_apify_client if TYPE_CHECKING: @@ -112,3 +113,89 @@ def lazy_load(self) -> Iterator[Document]: ) for item in dataset_items: yield self.dataset_mapping_function(item) + + +class ApifyCrawlLoader(BaseLoader): + """Crawl a website and load pages as LangChain Documents. 
+ + Wraps the ``apify/website-content-crawler`` Actor. Runs a crawl starting + from the seed URL and converts each crawled page into a ``Document`` with + markdown content and metadata (source URL, title, crawl depth). + + Args: + url: Seed URL to start crawling from. + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + max_crawl_pages: Maximum number of pages to crawl. + max_crawl_depth: Maximum link-follow depth from the seed URL. + crawler_type: Crawler engine (e.g. ``"cheerio"``, ``"playwright"``). + timeout_secs: Maximum time in seconds to wait for the crawl. + + Returns: + Iterator (or list) of ``Document`` objects. ``page_content`` contains + the page markdown; ``metadata`` includes ``source``, ``title``, and + ``crawl_depth``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyCrawlLoader + + loader = ApifyCrawlLoader( + url="https://docs.apify.com", + max_crawl_pages=5, + ) + documents = loader.load() + """ + + url: str + max_crawl_pages: int = Field(default=10) + max_crawl_depth: int = Field(default=1) + crawler_type: str = Field(default='cheerio') + timeout_secs: int = Field(default=300) + + _client: ApifyToolsClient = PrivateAttr() + + def __init__( + self, + url: str, + apify_api_token: str | None = None, + *, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: str = 'cheerio', + timeout_secs: int = 300, + ) -> None: + super().__init__( + url=url, + max_crawl_pages=max_crawl_pages, + max_crawl_depth=max_crawl_depth, + crawler_type=crawler_type, + timeout_secs=timeout_secs, + ) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def lazy_load(self) -> Iterator[Document]: + """Crawl the website and yield Documents lazily. + + Yields: + Document: One document per crawled page. 
+ """ + items = self._client.crawl_website( + self.url, + max_crawl_pages=self.max_crawl_pages, + max_crawl_depth=self.max_crawl_depth, + crawler_type=self.crawler_type, + timeout_secs=self.timeout_secs, + ) + for item in items: + page_content = item.get('markdown') or item.get('text') or '' + metadata: dict[str, Any] = { + 'source': item.get('url', ''), + 'title': item.get('metadata', {}).get('title', '') if isinstance(item.get('metadata'), dict) else '', + 'crawl_depth': item.get('crawlDepth', 0), + } + yield Document(page_content=page_content, metadata=metadata) From 029b9e125fbc5ea8aa79988148477c0bad89e8bd Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:08:08 +0200 Subject: [PATCH 24/63] feat: update __init__ --- langchain_apify/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index fa1f369..a57c52a 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -2,7 +2,9 @@ from importlib import metadata -from langchain_apify.document_loaders import ApifyDatasetLoader +from langchain_apify._actor_tools import ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify.document_loaders import ApifyCrawlLoader, ApifyDatasetLoader +from langchain_apify.retrievers import ApifySearchRetriever from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, @@ -33,6 +35,11 @@ ApifyRunTaskAndGetItemsTool, ] +APIFY_ACTOR_TOOLS: list[type] = [ + ApifyGoogleSearchTool, + ApifyWebCrawlerTool, +] + __all__ = [ # Existing components (backward-compatible) 'ApifyActorsTool', @@ -45,7 +52,15 @@ 'ApifyRunTaskAndGetItemsTool', 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', + # Actor-specific tools + 'ApifyGoogleSearchTool', + 'ApifyWebCrawlerTool', + # Retriever + 'ApifySearchRetriever', + # Loaders + 'ApifyCrawlLoader', # Tool group lists + 'APIFY_ACTOR_TOOLS', 'APIFY_CORE_TOOLS', # Meta '__version__', From 
c7ee287c2f992f343840cbf84ee2b3f48999f26b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:19:11 +0200 Subject: [PATCH 25/63] feat: add unit tests --- tests/unit_tests/test_client.py | 151 ++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 1c93f84..ca85138 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -281,3 +281,154 @@ def test_run_task_and_get_items_dataset_fetch_network_error( with pytest.raises(RuntimeError, match='Network error fetching dataset'): client.run_task_and_get_items('user/my-task') + + +# --------------------------------------------------------------------------- +# google_search +# --------------------------------------------------------------------------- + +GOOGLE_SEARCH_ITEMS: list[dict] = [ + { + 'organicResults': [ + {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'}, + {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'}, + ], + }, +] + + +def test_google_search_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = GOOGLE_SEARCH_ITEMS + + results = client.google_search('test query', max_results=5) + + assert len(results) == 2 + assert results[0] == {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'} + assert results[1] == {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'} + + +def test_google_search_with_locale(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = GOOGLE_SEARCH_ITEMS + + client.google_search('test', country_code='us', language_code='en') + + 
call_args = mock_apify_client.actor.return_value.call.call_args + run_input = call_args.kwargs['run_input'] + assert run_input['countryCode'] == 'us' + assert run_input['languageCode'] == 'en' + + +def test_google_search_caps_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + many_results = [{'title': f'R{i}', 'url': f'https://example.com/{i}', 'description': f'D{i}'} for i in range(20)] + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [{'organicResults': many_results}] + + results = client.google_search('test', max_results=3) + + assert len(results) == 3 + + +def test_google_search_empty_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [{'organicResults': []}] + + results = client.google_search('test') + + assert results == [] + + +def test_google_search_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.google_search('test') + + +# --------------------------------------------------------------------------- +# rag_web_search +# --------------------------------------------------------------------------- + +RAG_SEARCH_ITEMS: list[dict] = [ + {'crawledUrl': 'https://example.com/1', 'text': 'Page 1 content', 'metadata': {'title': 'Page 1'}}, + {'crawledUrl': 'https://example.com/2', 'text': 'Page 2 content', 'metadata': {'title': 'Page 2'}}, +] + + +def test_rag_web_search_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = RAG_SEARCH_ITEMS + + items = 
client.rag_web_search('test query', max_results=5) + + assert len(items) == 2 + assert items[0]['crawledUrl'] == 'https://example.com/1' + assert items[1]['text'] == 'Page 2 content' + + +def test_rag_web_search_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.rag_web_search('test') + + assert items == [] + + +def test_rag_web_search_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.rag_web_search('test') + + +# --------------------------------------------------------------------------- +# crawl_website +# --------------------------------------------------------------------------- + +CRAWL_ITEMS: list[dict] = [ + {'url': 'https://example.com/', 'markdown': '# Home', 'text': 'Home', 'metadata': {'title': 'Home'}}, + {'url': 'https://example.com/about', 'markdown': '# About', 'text': 'About', 'metadata': {'title': 'About'}}, +] + + +def test_crawl_website_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = CRAWL_ITEMS + + items = client.crawl_website('https://example.com') + + assert len(items) == 2 + assert items[0]['url'] == 'https://example.com/' + assert items[1]['markdown'] == '# About' + + +def test_crawl_website_passes_params(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.crawl_website('https://example.com', max_crawl_pages=5, max_crawl_depth=2, 
crawler_type='playwright') + + call_args = mock_apify_client.actor.return_value.call.call_args + run_input = call_args.kwargs['run_input'] + assert run_input['startUrls'] == [{'url': 'https://example.com'}] + assert run_input['maxCrawlPages'] == 5 + assert run_input['maxCrawlDepth'] == 2 + assert run_input['crawlerType'] == 'playwright' + + +def test_crawl_website_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.crawl_website('https://example.com') + + assert items == [] + + +def test_crawl_website_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.crawl_website('https://example.com') From ec60765064eed63d6b8ac81f88a410531e5c0949 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:22:29 +0200 Subject: [PATCH 26/63] feat: add actor tools unit tests --- tests/unit_tests/test_actor_tools.py | 184 +++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 tests/unit_tests/test_actor_tools.py diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py new file mode 100644 index 0000000..2e5fbe4 --- /dev/null +++ b/tests/unit_tests/test_actor_tools.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import json +from unittest.mock import MagicMock, patch + +import pytest +from langchain_core.tools import ToolException + +from langchain_apify import APIFY_ACTOR_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify._client import ApifyToolsClient +from langchain_apify.tools import _ApifyGenericTool +from tests.unit_tests.conftest import make_tool + +# --------------------------------------------------------------------------- +# 
ApifyGoogleSearchTool +# --------------------------------------------------------------------------- + + +def test_google_search_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [ + {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'}, + {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'}, + ] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + result = tool._run(query='test query') + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0]['title'] == 'Result 1' + assert parsed[1]['url'] == 'https://example.com/2' + + +def test_google_search_tool_passes_params(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool._run(query='test', max_results=5, country_code='us', language_code='en') + + mock_tools_client.google_search.assert_called_once_with( + 'test', + max_results=5, + country_code='us', + language_code='en', + timeout_secs=600, + ) + + +def test_google_search_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client, max_items=3) + + tool._run(query='test', max_results=100) + + call_kwargs = mock_tools_client.google_search.call_args + assert call_kwargs.kwargs['max_results'] == 3 + + +def test_google_search_tool_empty_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + result = tool._run(query='nothing') + + assert json.loads(result) == [] + + +def test_google_search_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyGoogleSearchTool, 
mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(query='test') + + +def test_google_search_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyGoogleSearchTool() + + +# --------------------------------------------------------------------------- +# ApifyWebCrawlerTool +# --------------------------------------------------------------------------- + + +def test_web_crawler_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'markdown': '# Home', 'text': 'Home', 'metadata': {'title': 'Home'}}, + {'url': 'https://example.com/about', 'markdown': '', 'text': 'About us', 'metadata': {'title': 'About'}}, + ] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0] == {'url': 'https://example.com/', 'title': 'Home', 'content': '# Home'} + assert parsed[1] == {'url': 'https://example.com/about', 'title': 'About', 'content': 'About us'} + + +def test_web_crawler_tool_passes_params(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + tool._run( + url='https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright', + timeout_secs=120, + ) + + mock_tools_client.crawl_website.assert_called_once_with( + 'https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright', + timeout_secs=120, + ) + + +def test_web_crawler_tool_clamps_pages_and_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client, max_items=3, max_timeout_secs=60) + + 
tool._run(url='https://example.com', max_crawl_pages=100, timeout_secs=9999) + + call_kwargs = mock_tools_client.crawl_website.call_args + assert call_kwargs.kwargs['max_crawl_pages'] == 3 + assert call_kwargs.kwargs['timeout_secs'] == 60 + + +def test_web_crawler_tool_empty_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert json.loads(result) == [] + + +def test_web_crawler_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(url='https://example.com') + + +def test_web_crawler_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyWebCrawlerTool() + + +# --------------------------------------------------------------------------- +# Metadata & inheritance +# --------------------------------------------------------------------------- + + +def test_actor_tools_inherit_from_generic_base() -> None: + for tool_cls in (ApifyGoogleSearchTool, ApifyWebCrawlerTool): + assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_actor_tools_have_correct_metadata() -> None: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyGoogleSearchTool(apify_api_token='dummy'), + ApifyWebCrawlerTool(apify_api_token='dummy'), + ] + + expected_names = ['apify_google_search', 'apify_web_crawler'] + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + 
assert tool.handle_tool_error is True + + +def test_apify_actor_tools_list() -> None: + assert set(APIFY_ACTOR_TOOLS) == {ApifyGoogleSearchTool, ApifyWebCrawlerTool} + assert len(APIFY_ACTOR_TOOLS) == 2 From c07718663969ec1fdc9726a8ab767b0a6978ca51 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:22:50 +0200 Subject: [PATCH 27/63] feat: add retrievers unit tests --- tests/unit_tests/test_retrievers.py | 224 ++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 tests/unit_tests/test_retrievers.py diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py new file mode 100644 index 0000000..779a9c8 --- /dev/null +++ b/tests/unit_tests/test_retrievers.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from langchain_core.documents import Document + +from langchain_apify.retrievers import ApifySearchRetriever + +RAG_ITEMS: list[dict] = [ + { + 'crawledUrl': 'https://example.com/1', + 'text': 'Page 1 content', + 'metadata': {'title': 'Page 1'}, + }, + { + 'crawledUrl': 'https://example.com/2', + 'text': 'Page 2 content', + 'metadata': {'title': 'Page 2'}, + }, +] + + +def _make_retriever( + mock_sync_client: MagicMock, + mock_async_client: MagicMock | None = None, + **kwargs: object, +) -> ApifySearchRetriever: + """Create a retriever with mocked Apify clients.""" + with ( + patch('langchain_apify.retrievers.create_apify_client') as mock_create, + ): + mock_create.side_effect = [mock_sync_client, mock_async_client or MagicMock()] + return ApifySearchRetriever(apify_api_token='dummy-token', **kwargs) + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with 
pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifySearchRetriever() + + +def test_init_with_explicit_token() -> None: + with patch('langchain_apify.retrievers.create_apify_client'): + retriever = ApifySearchRetriever(apify_api_token='my-token') + assert retriever.max_results == 5 + assert retriever.timeout_secs == 300 + + +def test_init_custom_params() -> None: + with patch('langchain_apify.retrievers.create_apify_client'): + retriever = ApifySearchRetriever(apify_api_token='t', max_results=3, timeout_secs=60) + assert retriever.max_results == 3 + assert retriever.timeout_secs == 60 + + +# --------------------------------------------------------------------------- +# _get_relevant_documents (sync) +# --------------------------------------------------------------------------- + + +def test_sync_returns_documents() -> None: + mock_client = MagicMock() + mock_client.actor.return_value.call.return_value = { + 'id': 'run-1', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-1', + } + mock_client.dataset.return_value.list_items.return_value.items = RAG_ITEMS + retriever = _make_retriever(mock_client, max_results=5) + + docs = retriever._get_relevant_documents('test query') + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == 'Page 1 content' + assert docs[0].metadata['source'] == 'https://example.com/1' + assert docs[0].metadata['title'] == 'Page 1' + assert docs[1].page_content == 'Page 2 content' + assert docs[1].metadata['source'] == 'https://example.com/2' + + +def test_sync_passes_correct_input() -> None: + mock_client = MagicMock() + mock_client.actor.return_value.call.return_value = { + 'defaultDatasetId': 'ds-1', + } + mock_client.dataset.return_value.list_items.return_value.items = [] + retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) + + retriever._get_relevant_documents('my search') + + mock_client.actor.return_value.call.assert_called_once_with( + run_input={'query': 
'my search', 'maxResults': 3}, + timeout_secs=60, + ) + mock_client.dataset.return_value.list_items.assert_called_once_with( + limit=3, clean=True, + ) + + +def test_sync_empty_results() -> None: + mock_client = MagicMock() + mock_client.actor.return_value.call.return_value = { + 'defaultDatasetId': 'ds-1', + } + mock_client.dataset.return_value.list_items.return_value.items = [] + retriever = _make_retriever(mock_client) + + docs = retriever._get_relevant_documents('test') + + assert docs == [] + + +def test_sync_none_run_returns_empty() -> None: + mock_client = MagicMock() + mock_client.actor.return_value.call.return_value = None + retriever = _make_retriever(mock_client) + + docs = retriever._get_relevant_documents('test') + + assert docs == [] + + +def test_sync_no_dataset_id_returns_empty() -> None: + mock_client = MagicMock() + mock_client.actor.return_value.call.return_value = {'id': 'run-1', 'defaultDatasetId': None} + retriever = _make_retriever(mock_client) + + docs = retriever._get_relevant_documents('test') + + assert docs == [] + + +# --------------------------------------------------------------------------- +# _aget_relevant_documents (async) +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_async_returns_documents() -> None: + mock_async = MagicMock() + mock_async.actor.return_value.call = AsyncMock(return_value={ + 'id': 'run-1', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-1', + }) + mock_list_items = AsyncMock() + mock_list_items.return_value.items = RAG_ITEMS + mock_async.dataset.return_value.list_items = mock_list_items + + retriever = _make_retriever(MagicMock(), mock_async, max_results=5) + + docs = await retriever._aget_relevant_documents('test query') + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == 'Page 1 content' + assert docs[0].metadata['source'] == 'https://example.com/1' + + +@pytest.mark.asyncio 
+async def test_async_none_run_returns_empty() -> None: + mock_async = MagicMock() + mock_async.actor.return_value.call = AsyncMock(return_value=None) + retriever = _make_retriever(MagicMock(), mock_async) + + docs = await retriever._aget_relevant_documents('test') + + assert docs == [] + + +@pytest.mark.asyncio +async def test_async_no_dataset_id_returns_empty() -> None: + mock_async = MagicMock() + mock_async.actor.return_value.call = AsyncMock(return_value={'defaultDatasetId': None}) + retriever = _make_retriever(MagicMock(), mock_async) + + docs = await retriever._aget_relevant_documents('test') + + assert docs == [] + + +# --------------------------------------------------------------------------- +# _items_to_documents edge cases +# --------------------------------------------------------------------------- + + +def test_items_to_documents_uses_url_fallback() -> None: + items = [{'url': 'https://fallback.com', 'text': 'content', 'metadata': {'title': 'T'}}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['source'] == 'https://fallback.com' + + +def test_items_to_documents_uses_markdown_fallback() -> None: + items = [{'crawledUrl': 'https://example.com', 'markdown': '# MD content', 'metadata': {'title': 'T'}}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].page_content == '# MD content' + + +def test_items_to_documents_missing_metadata() -> None: + items = [{'crawledUrl': 'https://example.com', 'text': 'content'}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['title'] == '' + assert docs[0].metadata['source'] == 'https://example.com' + + +def test_items_to_documents_non_dict_metadata() -> None: + items = [{'crawledUrl': 'https://example.com', 'text': 'content', 'metadata': 'not-a-dict'}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['title'] == '' From 0b4ecbb3c3323230465994a3dd2925834b524bff Mon Sep 17 
00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 10:50:17 +0200 Subject: [PATCH 28/63] feat: simplify apify crawl loader init and enhance unit tests --- langchain_apify/document_loaders.py | 22 +--- tests/unit_tests/test_document_loaders.py | 140 +++++++++++++++++++++- 2 files changed, 144 insertions(+), 18 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index d8064a8..3a48329 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -8,7 +8,7 @@ from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, Field, PrivateAttr, model_validator +from pydantic import BaseModel, ConfigDict, model_validator from langchain_apify._client import ApifyToolsClient from langchain_apify.utils import create_apify_client @@ -151,14 +151,6 @@ class ApifyCrawlLoader(BaseLoader): documents = loader.load() """ - url: str - max_crawl_pages: int = Field(default=10) - max_crawl_depth: int = Field(default=1) - crawler_type: str = Field(default='cheerio') - timeout_secs: int = Field(default=300) - - _client: ApifyToolsClient = PrivateAttr() - def __init__( self, url: str, @@ -169,13 +161,11 @@ def __init__( crawler_type: str = 'cheerio', timeout_secs: int = 300, ) -> None: - super().__init__( - url=url, - max_crawl_pages=max_crawl_pages, - max_crawl_depth=max_crawl_depth, - crawler_type=crawler_type, - timeout_secs=timeout_secs, - ) + self.url = url + self.max_crawl_pages = max_crawl_pages + self.max_crawl_depth = max_crawl_depth + self.crawler_type = crawler_type + self.timeout_secs = timeout_secs self._client = ApifyToolsClient(apify_api_token=apify_api_token) def lazy_load(self) -> Iterator[Document]: diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..4b3d493 100644 --- 
a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,10 +1,14 @@ -from unittest.mock import patch +from __future__ import annotations +from unittest.mock import MagicMock, patch + +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document -from langchain_apify import ApifyDatasetLoader +from langchain_apify import ApifyCrawlLoader, ApifyDatasetLoader +from langchain_apify._client import ApifyToolsClient def test_apify_dataset_loader_load() -> None: @@ -55,3 +59,135 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' assert documents[0].metadata['source'] == 'https://apify.com' + + +# --------------------------------------------------------------------------- +# ApifyCrawlLoader +# --------------------------------------------------------------------------- + +CRAWL_ITEMS: list[dict] = [ + { + 'url': 'https://example.com/', + 'markdown': '# Home', + 'text': 'Home', + 'metadata': {'title': 'Home Page'}, + 'crawlDepth': 0, + }, + { + 'url': 'https://example.com/about', + 'markdown': '# About', + 'text': 'About', + 'metadata': {'title': 'About Page'}, + 'crawlDepth': 1, + }, +] + + +def _make_crawl_loader( + mock_client: MagicMock, + **kwargs: object, +) -> ApifyCrawlLoader: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + loader = ApifyCrawlLoader(url='https://example.com', apify_api_token='dummy', **kwargs) + loader._client = mock_client + return loader + + +def test_crawl_loader_lazy_load() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = CRAWL_ITEMS + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == '# Home' + assert 
docs[0].metadata['source'] == 'https://example.com/' + assert docs[0].metadata['title'] == 'Home Page' + assert docs[0].metadata['crawl_depth'] == 0 + assert docs[1].page_content == '# About' + assert docs[1].metadata['crawl_depth'] == 1 + + +def test_crawl_loader_load_delegates_to_lazy_load() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = CRAWL_ITEMS + loader = _make_crawl_loader(mock_client) + + docs = loader.load() + + assert len(docs) == 2 + assert docs[0].page_content == '# Home' + + +def test_crawl_loader_passes_params() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [] + loader = _make_crawl_loader( + mock_client, + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright', + timeout_secs=120, + ) + + list(loader.lazy_load()) + + mock_client.crawl_website.assert_called_once_with( + 'https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright', + timeout_secs=120, + ) + + +def test_crawl_loader_empty_results() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [] + loader = _make_crawl_loader(mock_client) + + docs = loader.load() + + assert docs == [] + + +def test_crawl_loader_text_fallback() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'text': 'Plain text', 'metadata': {'title': 'T'}}, + ] + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert docs[0].page_content == 'Plain text' + + +def test_crawl_loader_missing_metadata() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'markdown': '# Content'}, + ] + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert docs[0].metadata['title'] == '' + assert 
docs[0].metadata['crawl_depth'] == 0 + + +def test_crawl_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyCrawlLoader(url='https://example.com') + + +def test_crawl_loader_failure_raises(mock_tools_client: MagicMock) -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + loader = _make_crawl_loader(mock_client) + + with pytest.raises(RuntimeError, match='FAILED'): + loader.load() From 005294b8cb0c48b4e6a95d926a7a7401c88343b4 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 14:08:24 +0200 Subject: [PATCH 29/63] ref: align private scope conventions with langchain partner package standards --- langchain_apify/_client.py | 18 ++++++++--------- langchain_apify/_error_messages.py | 6 +++--- langchain_apify/{utils.py => _utils.py} | 8 ++++---- langchain_apify/document_loaders.py | 9 ++++----- langchain_apify/tools.py | 27 ++++++++++++++----------- langchain_apify/wrappers.py | 12 +++++------ tests/integration_tests/test_utils.py | 10 ++++----- tests/unit_tests/conftest.py | 2 +- tests/unit_tests/test_client.py | 4 ++-- tests/unit_tests/test_tools.py | 4 ++-- 10 files changed, 51 insertions(+), 49 deletions(-) rename langchain_apify/{utils.py => _utils.py} (94%) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index b131484..84e840a 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -5,11 +5,11 @@ from apify_client import ApifyClient from langchain_apify._error_messages import ( - ERROR_ACTOR_RUN_FAILED, - ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, - ERROR_SCRAPE_EMPTY, + _ERROR_ACTOR_RUN_FAILED, + _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + _ERROR_SCRAPE_EMPTY, ) -from langchain_apify.utils import create_apify_client +from langchain_apify._utils import _create_apify_client 
_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 @@ -35,9 +35,9 @@ class ApifyToolsClient: def __init__(self, apify_api_token: str | None = None) -> None: token = apify_api_token or os.getenv('APIFY_API_TOKEN') if not token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = create_apify_client(ApifyClient, token) + self._client = _create_apify_client(ApifyClient, token) def run_actor( self, @@ -230,12 +230,12 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) dataset_items_limit=1, ) if not items: - msg = ERROR_SCRAPE_EMPTY.format(url=url) + msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) content = items[0].get('markdown') or items[0].get('text') or '' if not content: - msg = ERROR_SCRAPE_EMPTY.format(url=url) + msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) return content @@ -245,5 +245,5 @@ def _check_run_status(run: dict) -> None: status = run.get('status') if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') - msg = ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) + msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) raise RuntimeError(msg) diff --git a/langchain_apify/_error_messages.py b/langchain_apify/_error_messages.py index a87c9cb..0a8c612 100644 --- a/langchain_apify/_error_messages.py +++ b/langchain_apify/_error_messages.py @@ -1,4 +1,4 @@ -ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( +_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( 'APIFY_API_TOKEN environment variable is not set.' ' Please set it to your Apify API token by using `os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"' ' in your code or pass it as environment variable.' @@ -6,6 +6,6 @@ ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) -ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' 
+_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' -ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' +_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' diff --git a/langchain_apify/utils.py b/langchain_apify/_utils.py similarity index 94% rename from langchain_apify/utils.py rename to langchain_apify/_utils.py index d3a627f..b19bcbf 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/_utils.py @@ -12,7 +12,7 @@ _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' -def prune_actor_input_schema( +def _prune_actor_input_schema( input_schema: dict, max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: @@ -48,7 +48,7 @@ def prune_actor_input_schema( T = TypeVar('T', ApifyClient, ApifyClientAsync) -def create_apify_client(client_cls: type[T], token: str) -> T: +def _create_apify_client(client_cls: type[T], token: str) -> T: """Create an Apify client instance with a custom user-agent. Args: @@ -79,7 +79,7 @@ def create_apify_client(client_cls: type[T], token: str) -> T: return client -def actor_id_to_tool_name(actor_id: str) -> str: +def _actor_id_to_tool_name(actor_id: str) -> str: """Turn actor_id into a valid tool name. Tool name must only contain letters, numbers, underscores, dashes, @@ -95,7 +95,7 @@ def actor_id_to_tool_name(actor_id: str) -> str: return 'apify_actor_' + ''.join(char if char in valid_chars else '_' for char in actor_id) -def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: +def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: """Get the latest build of an Actor from the default build tag. 
Args: diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 49befb6..8554872 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -8,9 +8,9 @@ from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator -from langchain_apify.utils import create_apify_client +from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: from collections.abc import Iterator @@ -42,8 +42,7 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - apify_client: ApifyClient - """An instance of the ApifyClient class from the apify-client Python package.""" + apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -86,7 +85,7 @@ def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 # when running at Apify platform, use APIFY_TOKEN environment variable apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - client = create_apify_client(ApifyClient, apify_api_token) + client = _create_apify_client(ApifyClient, apify_api_token) values['apify_client'] = client diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index f771d35..81d9166 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -10,13 +10,13 @@ from pydantic import BaseModel, Field, PrivateAttr, create_model from langchain_apify._client import ApifyToolsClient -from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import ( +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from 
langchain_apify._utils import ( _MAX_DESCRIPTION_LEN, - actor_id_to_tool_name, - create_apify_client, - get_actor_latest_build, - prune_actor_input_schema, + _actor_id_to_tool_name, + _create_apify_client, + _get_actor_latest_build, + _prune_actor_input_schema, ) if TYPE_CHECKING: @@ -57,6 +57,9 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] chunk["messages"][-1].pretty_print() """ + _apify_client: ApifyClient = PrivateAttr() + _actor_id: str = PrivateAttr() + def __init__( self, actor_id: str, @@ -77,14 +80,14 @@ def __init__( """ apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') if not apify_api_token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, apify_api_token) kwargs.update( { - 'name': actor_id_to_tool_name(actor_id), + 'name': _actor_id_to_tool_name(actor_id), 'description': self._create_description(apify_client, actor_id), 'args_schema': self._build_tool_args_schema_model( apify_client, @@ -127,7 +130,7 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: Returns: str: The description. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') if len(actor_description) > _MAX_DESCRIPTION_LEN: actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' @@ -150,12 +153,12 @@ def _build_tool_args_schema_model( Raises: ValueError: If the input schema is not found in the Actor build. 
""" - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) if not (actor_input := build.get('actorDefinition', {}).get('input')): msg = f'Input schema not found in the Actor build for Actor: {actor_id}' raise ValueError(msg) - properties, required = prune_actor_input_schema(actor_input) + properties, required = _prune_actor_input_schema(actor_input) properties = {'run_input': properties} description = ( diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index ef17873..34370fe 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -5,10 +5,10 @@ from apify_client import ApifyClient, ApifyClientAsync from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.utils import create_apify_client if TYPE_CHECKING: from collections.abc import Callable @@ -53,8 +53,8 @@ class ApifyWrapper(BaseModel): # allow arbitrary types in the model config for the apify client fields model_config = ConfigDict(arbitrary_types_allowed=True) - apify_client: ApifyClient - apify_client_async: ApifyClientAsync + apify_client: ApifyClient = Field(default=None, exclude=True) + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) apify_api_token: str | None = None def __init__( @@ -90,8 +90,8 @@ def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 """ apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - values['apify_client'] = create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = create_apify_client(ApifyClientAsync, apify_api_token) + values['apify_client'] = _create_apify_client(ApifyClient, apify_api_token) + 
values['apify_client_async'] = _create_apify_client(ApifyClientAsync, apify_api_token) return values diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 554cc2d..c92c038 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,8 +2,8 @@ from apify_client.client import ApifyClient -from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import create_apify_client, get_actor_latest_build +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client, _get_actor_latest_build def test_get_actor_latest_build() -> None: @@ -13,12 +13,12 @@ def test_get_actor_latest_build() -> None: ValueError: If the APIFY_API_TOKEN environment variable is not set. """ if (token := os.getenv('APIFY_API_TOKEN')) is None: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, token) + apify_client = _create_apify_client(ApifyClient, token) - build = get_actor_latest_build(apify_client, 'apify/rag-web-browser') + build = _get_actor_latest_build(apify_client, 'apify/rag-web-browser') assert isinstance(build, dict) assert 'id' in build diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index eedadb9..3384e79 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -39,7 +39,7 @@ def mock_apify_client() -> MagicMock: @pytest.fixture def client(mock_apify_client: MagicMock) -> ApifyToolsClient: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): return ApifyToolsClient(apify_api_token='dummy-token') diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 1c93f84..40c73dc 100644 
--- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -13,7 +13,7 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client) as mock_create: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create: c = ApifyToolsClient(apify_api_token='my-token') mock_create.assert_called_once() assert c._client is mock_apify_client @@ -21,7 +21,7 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): c = ApifyToolsClient() assert c._client is mock_apify_client diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 6698589..542ec4e 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -11,6 +11,7 @@ from langchain_apify import APIFY_CORE_TOOLS from langchain_apify._client import ApifyToolsClient +from langchain_apify._utils import _actor_id_to_tool_name from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, @@ -23,7 +24,6 @@ _iso, _run_meta, ) -from langchain_apify.utils import actor_id_to_tool_name from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: @@ -57,7 +57,7 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id=actor_id, apify_api_token='dummy-token') assert isinstance(tool, ApifyActorsTool) assert tool.description == 'Mocked description' - assert tool.name == actor_id_to_tool_name(actor_id) + assert tool.name == _actor_id_to_tool_name(actor_id) assert tool.args_schema == DummyModel From 
2f74c292ccf9422480484ce4921a4d3919c4c672 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:08:13 +0200 Subject: [PATCH 30/63] ref: migrate auth to SecretStr + secret_from_env pattern --- langchain_apify/document_loaders.py | 50 ++++++++++++++++----------- langchain_apify/tools.py | 26 ++++++++++---- langchain_apify/wrappers.py | 53 +++++++++++++++-------------- 3 files changed, 76 insertions(+), 53 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 8554872..3a777f3 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -7,9 +7,10 @@ from apify_client import ApifyClient from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: @@ -40,8 +41,12 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): documents = loader.load() """ - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + ) apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str """The ID of the dataset on the Apify platform.""" @@ -62,7 +67,8 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. 
- apify_api_token (str): Apify API token. + apify_api_token (str): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ super().__init__( dataset_id=dataset_id, @@ -70,26 +76,30 @@ def __init__( apify_api_token=apify_api_token, ) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. + @model_validator(mode='after') + def _init_client(self) -> 'ApifyDatasetLoader': + """Resolve the Apify API token and initialise the client. - Args: - values (dict): The values to validate. + Checks ``APIFY_TOKEN`` as a secondary fallback for code running on the + Apify platform where only that variable is set. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - # when running at Apify platform, use APIFY_TOKEN environment variable - apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - - client = _create_apify_client(ApifyClient, apify_api_token) + ApifyDatasetLoader: The validated loader instance. - values['apify_client'] = client - - return values + Raises: + ValueError: If no token is available from any source. + """ + token = self.apify_api_token + if token is None: + # Secondary fallback for code running on the Apify platform. + raw = os.getenv('APIFY_TOKEN') + if raw: + token = SecretStr(raw) + if token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self.apify_client = _create_apify_client(ApifyClient, token.get_secret_value()) + return self def load(self) -> list[Document]: """Load documents. 
diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 81d9166..0097f4c 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -7,7 +7,8 @@ from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException -from pydantic import BaseModel, Field, PrivateAttr, create_model +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model from langchain_apify._client import ApifyToolsClient from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -78,12 +79,16 @@ def __init__( Raises: ValueError: If the `APIFY_API_TOKEN` environment variable is not set """ - apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not apify_api_token: + _raw_token: str | None = ( + apify_api_token.get_secret_value() + if isinstance(apify_api_token, SecretStr) + else apify_api_token or os.getenv('APIFY_API_TOKEN') + ) + if not _raw_token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = _create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, _raw_token) kwargs.update( { @@ -305,15 +310,22 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] handle_tool_error: bool = True + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + ) max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') _client: ApifyToolsClient = PrivateAttr() - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) + def model_post_init(self, __context: Any) -> None: # noqa: ANN401 + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) + super().model_post_init(__context) def _clamp_timeout(self, value: int) -> int: return min(value, self.max_timeout_secs) diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index 34370fe..9af591a 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -4,9 +4,10 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient, ApifyClientAsync -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader @@ -51,11 +52,14 @@ class ApifyWrapper(BaseModel): """ # allow arbitrary types in the model config for the apify client fields - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = 
ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + ) apify_client: ApifyClient = Field(default=None, exclude=True) apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) - apify_api_token: str | None = None def __init__( self, @@ -63,37 +67,34 @@ def __init__( *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: - """Initialize the loader with an Apify dataset ID and a mapping function. + """Initialise the wrapper. Args: - dataset_id (str): The ID of the dataset on the Apify platform. - dataset_mapping_function (Callable): A function that takes a single - dictionary (an Apify dataset item) and converts it to an instance - of the Document class. - apify_api_token (Optional[str]): Apify API token. - *args: Any: Additional positional arguments. - **kwargs: Any: Additional keyword arguments. + apify_api_token (Optional[str]): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` environment variable when *None*. + *args: Any: Additional positional arguments forwarded to Pydantic. + **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ kwargs.update({'apify_api_token': apify_api_token}) super().__init__(*args, **kwargs) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Validate that an Apify API token is set and the apify-client - Python package exists in the current environment. + @model_validator(mode='after') + def _init_clients(self) -> 'ApifyWrapper': + """Validate the token and initialise both sync and async Apify clients. Returns: - Any: The validated values. 
- """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') + ApifyWrapper: The validated wrapper instance. - values['apify_client'] = _create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = _create_apify_client(ApifyClientAsync, apify_api_token) - - return values + Raises: + ValueError: If no token is provided and APIFY_API_TOKEN is not set. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + token = self.apify_api_token.get_secret_value() + self.apify_client = _create_apify_client(ApifyClient, token) + self.apify_client_async = _create_apify_client(ApifyClientAsync, token) + return self def call_actor( # noqa: PLR0913 self, From 6258b2b9ad8ed2dffd09918929ef9ec7d7893f4c Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:38:37 +0200 Subject: [PATCH 31/63] fix: backward-compat fix --- langchain_apify/document_loaders.py | 14 +++++++++----- langchain_apify/wrappers.py | 5 ++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 3a777f3..4e286af 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -70,11 +70,15 @@ def __init__( apify_api_token (str): Apify API token. Falls back to the ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ - super().__init__( - dataset_id=dataset_id, - dataset_mapping_function=dataset_mapping_function, - apify_api_token=apify_api_token, - ) + init_kwargs: dict[str, Any] = { + 'dataset_id': dataset_id, + 'dataset_mapping_function': dataset_mapping_function, + } + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. 
+ if apify_api_token is not None: + init_kwargs['apify_api_token'] = apify_api_token + super().__init__(**init_kwargs) @model_validator(mode='after') def _init_client(self) -> 'ApifyDatasetLoader': diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index 9af591a..a1e0ab6 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -75,7 +75,10 @@ def __init__( *args: Any: Additional positional arguments forwarded to Pydantic. **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ - kwargs.update({'apify_api_token': apify_api_token}) + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + kwargs['apify_api_token'] = apify_api_token super().__init__(*args, **kwargs) @model_validator(mode='after') From 2905b679a7240b6286229a86f182ea0eddd3ac37 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:58:21 +0200 Subject: [PATCH 32/63] fix: update stale doc string --- langchain_apify/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/_utils.py b/langchain_apify/_utils.py index b19bcbf..9d74487 100644 --- a/langchain_apify/_utils.py +++ b/langchain_apify/_utils.py @@ -18,7 +18,7 @@ def _prune_actor_input_schema( ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. - Trim the description to 250 characters. + Trim descriptions to ``_MAX_DESCRIPTION_LEN`` characters. Args: input_schema (dict): The input schema from the Actor build. 
From 3238c0203e9965ad1c34dc004c9bd729714b0b1f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:59:06 +0200 Subject: [PATCH 33/63] chore: removed redundant file --- langchain_apify/_actor_tools.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 langchain_apify/_actor_tools.py diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py deleted file mode 100644 index a989b11..0000000 --- a/langchain_apify/_actor_tools.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Actor-specific tool subclasses (search, social-media, etc.). - -Downstream feature branches add concrete tools here. They inherit from -:class:`~langchain_apify.tools._ApifyGenericTool` and use -:func:`~langchain_apify.tools._run_meta` to format run metadata. -""" - -from __future__ import annotations - -# --------------------------------------------------------------------------- -# Search & Crawling tools -# --------------------------------------------------------------------------- - - -# --------------------------------------------------------------------------- -# Social-media tools -# --------------------------------------------------------------------------- From 92df406a8fde1996c1fe71713f16e1d2533d36dc Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 14:19:12 +0200 Subject: [PATCH 34/63] fix: extracted repeated code, fixed secretstr compatibility to apifytoolsclient --- langchain_apify/_client.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 84e840a..a828be2 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -3,6 +3,7 @@ import os from apify_client import ApifyClient +from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -32,12 +33,17 @@ class ApifyToolsClient: ValueError: If no token is provided and the env var is not set. 
""" - def __init__(self, apify_api_token: str | None = None) -> None: - token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not token: + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + _token: str | None = None + if isinstance(apify_api_token, SecretStr): + _token = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, token) + self._client = _create_apify_client(ApifyClient, _token) def run_actor( self, @@ -117,17 +123,12 @@ def run_actor_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId') if not dataset_id: msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' raise RuntimeError(msg) - try: - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' - raise RuntimeError(msg) from exc + items = self._list_items_or_raise(dataset_id, dataset_items_limit) return run, items def run_task( @@ -191,17 +192,12 @@ def run_task_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId') if not dataset_id: msg = f'Task {task_id} run succeeded but returned no default dataset ID.' 
raise RuntimeError(msg) - try: - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' - raise RuntimeError(msg) from exc + items = self._list_items_or_raise(dataset_id, dataset_items_limit) return run, items def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: @@ -239,6 +235,14 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) raise RuntimeError(msg) return content + def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: + """Fetch dataset items, wrapping any network error in a RuntimeError.""" + try: + return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + @staticmethod def _check_run_status(run: dict) -> None: """Raise if the run did not succeed.""" From 3a0f666d08f4f6e05ec382699fe03a4d3e1e9414 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:02:43 +0200 Subject: [PATCH 35/63] fix: set min value to timeout, memory and items, add exlude and repr to apify_api_token --- langchain_apify/tools.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 0097f4c..3d7af3c 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -313,6 +313,8 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, ) max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') @@ -328,15 +330,15 @@ def model_post_init(self, __context: Any) -> None: # noqa: ANN401 super().model_post_init(__context) def _clamp_timeout(self, value: int) -> int: - return min(value, self.max_timeout_secs) + return max(1, min(value, self.max_timeout_secs)) def _clamp_memory(self, value: int | None) -> int | None: if value is None: return None - return min(value, self.max_memory_mbytes) + return max(1, min(value, self.max_memory_mbytes)) def _clamp_items(self, value: int) -> int: - return min(value, self.max_items) + return max(1, min(value, self.max_items)) # --------------------------------------------------------------------------- From 8614cfdbb54d4eff228b80c20668ed81e21cffb0 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:03:29 +0200 Subject: [PATCH 36/63] feat: added repr and exclude to apify api token --- langchain_apify/document_loaders.py | 2 ++ langchain_apify/wrappers.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 4e286af..400476e 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -46,6 +46,8 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. 
Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + exclude=True, + repr=False, ) apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index a1e0ab6..e4cafb6 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -57,6 +57,8 @@ class ApifyWrapper(BaseModel): apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, ) apify_client: ApifyClient = Field(default=None, exclude=True) apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) From 2bf130a9c98a8d3c7436cf8e4daf14d9d5fc20c4 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:03:59 +0200 Subject: [PATCH 37/63] feat: add type checking to apify core tools list --- langchain_apify/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index fa1f369..bca8081 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from importlib import metadata +from typing import TYPE_CHECKING from langchain_apify.document_loaders import ApifyDatasetLoader from langchain_apify.tools import ( @@ -14,6 +15,9 @@ ) from langchain_apify.wrappers import ApifyWrapper +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: @@ -24,7 +28,7 @@ # Binding all tools at once overwhelms the LLM context window; # pick the group(s) relevant to your use case. 
-APIFY_CORE_TOOLS: list[type] = [ +APIFY_CORE_TOOLS: list[type[BaseTool]] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, From 98293d4cfa1e0ca6c488380ff93b5033b2d96314 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:05:05 +0200 Subject: [PATCH 38/63] feat: add tests for clamped values and apify api token --- tests/unit_tests/test_document_loaders.py | 25 +++++++++++++ tests/unit_tests/test_tools.py | 44 +++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..49ee9db 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,5 +1,6 @@ from unittest.mock import patch +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document @@ -55,3 +56,27 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' 
assert documents[0].metadata['source'] == 'https://apify.com' + + +def test_apify_dataset_loader_apify_token_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Loader should accept APIFY_TOKEN as a secondary env-var fallback.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'platform-token') + + with patch.object(DatasetClient, 'list_items') as mock_list_items: + mock_list_items.return_value = ListPage(data={'items': []}) + loader = ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) + assert loader.load() == [] + + +def test_apify_dataset_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 542ec4e..3c99a71 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -472,6 +472,42 @@ def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) +def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600) + + tool._run(actor_id='apify/test', timeout_secs=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', timeout_secs=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + +def test_clamp_memory_floor_is_one(mock_tools_client: MagicMock) -> None: + 
mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + + +def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=100) + + tool._run(dataset_id='ds-1', limit=-1) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + mock_tools_client.get_dataset_items.reset_mock() + tool._run(dataset_id='ds-1', limit=0) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: """When LLM values are within limits they should pass through unchanged.""" mock_tools_client.run_actor.return_value = SUCCEEDED_RUN @@ -515,6 +551,14 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.handle_tool_error is True +def test_apify_api_token_excluded_from_model_dump() -> None: + """The apify_api_token field must not appear in model_dump() output.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg] + dumped = tool.model_dump() + assert 'apify_api_token' not in dumped + + # --------------------------------------------------------------------------- # _ApifyGenericTool inheritance # --------------------------------------------------------------------------- From 863ed8d31b64457635a52abc0402918a37e1bc4a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:20:48 +0200 Subject: [PATCH 39/63] fix: lint fix 
--- langchain_apify/_client.py | 2 +- langchain_apify/document_loaders.py | 4 ++-- langchain_apify/tools.py | 2 +- langchain_apify/wrappers.py | 6 +++--- tests/unit_tests/test_tools.py | 14 +++++++------- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index a828be2..9a87d46 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -39,7 +39,7 @@ def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: _token = apify_api_token.get_secret_value() else: _token = apify_api_token or os.getenv('APIFY_API_TOKEN') - + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 400476e..131950d 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -49,7 +49,7 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): exclude=True, repr=False, ) - apify_client: ApifyClient = Field(default=None, exclude=True) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -83,7 +83,7 @@ def __init__( super().__init__(**init_kwargs) @model_validator(mode='after') - def _init_client(self) -> 'ApifyDatasetLoader': + def _init_client(self) -> ApifyDatasetLoader: """Resolve the Apify API token and initialise the client. 
Checks ``APIFY_TOKEN`` as a secondary fallback for code running on the diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 3d7af3c..e7721b7 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -64,7 +64,7 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] def __init__( self, actor_id: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index e4cafb6..d5fd25c 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -60,8 +60,8 @@ class ApifyWrapper(BaseModel): exclude=True, repr=False, ) - apify_client: ApifyClient = Field(default=None, exclude=True) - apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) # type: ignore[assignment] def __init__( self, @@ -84,7 +84,7 @@ def __init__( super().__init__(*args, **kwargs) @model_validator(mode='after') - def _init_clients(self) -> 'ApifyWrapper': + def _init_clients(self) -> ApifyWrapper: """Validate the token and initialise both sync and async Apify clients. 
Returns: diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 3c99a71..67fa1a7 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -527,12 +527,12 @@ def test_generic_tools_have_correct_metadata() -> None: """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ - ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ] expected_names = [ @@ -554,7 +554,7 @@ def test_generic_tools_have_correct_metadata() -> None: def test_apify_api_token_excluded_from_model_dump() -> None: """The apify_api_token field must not appear in model_dump() output.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): - tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg] + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg,arg-type] dumped = tool.model_dump() assert 'apify_api_token' not in dumped From 70527e0d839b02c1399620d76c6c599aa55434a3 Mon Sep 17 00:00:00 2001 From: David Omrai Date: 
Fri, 24 Apr 2026 09:56:43 +0200 Subject: [PATCH 40/63] ref: update apify_api_token type to support SecretStr in document loaders --- langchain_apify/document_loaders.py | 4 ++-- langchain_apify/wrappers.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 131950d..6439740 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -60,7 +60,7 @@ def __init__( self, dataset_id: str, dataset_mapping_function: Callable[[dict], Document], - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, ) -> None: """Initialize the loader with an Apify dataset ID and a mapping function. @@ -69,7 +69,7 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. Falls back to the + apify_api_token (str | SecretStr): Apify API token. Falls back to the ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ init_kwargs: dict[str, Any] = { diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index d5fd25c..57a9eeb 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -65,15 +65,15 @@ class ApifyWrapper(BaseModel): def __init__( self, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: """Initialise the wrapper. Args: - apify_api_token (Optional[str]): Apify API token. Falls back to the - ``APIFY_API_TOKEN`` environment variable when *None*. + apify_api_token (Optional[str | SecretStr]): Apify API token. Falls + back to the ``APIFY_API_TOKEN`` environment variable when *None*. *args: Any: Additional positional arguments forwarded to Pydantic. **kwargs: Any: Additional keyword arguments forwarded to Pydantic. 
""" From f005bc557f21b86f5bba7e53b1625f96ffb45783 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 24 Apr 2026 11:35:00 +0200 Subject: [PATCH 41/63] fix: turn off logger for ApifySearchRetrieval --- langchain_apify/retrievers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py index 1dc8aa7..709fd5e 100644 --- a/langchain_apify/retrievers.py +++ b/langchain_apify/retrievers.py @@ -80,6 +80,7 @@ def _get_relevant_documents( run = self._sync_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( run_input=run_input, timeout_secs=self.timeout_secs, + logger=None, ) if run is None: return [] @@ -106,6 +107,7 @@ async def _aget_relevant_documents( run = await self._async_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( run_input=run_input, timeout_secs=self.timeout_secs, + logger=None, ) if run is None: return [] From dd08098667a5b058f306adb467f59e22fe04e7f6 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 24 Apr 2026 12:03:23 +0200 Subject: [PATCH 42/63] fix: fix lint errors --- langchain_apify/_actor_tools.py | 2 +- langchain_apify/_client.py | 17 +++++++++-------- langchain_apify/document_loaders.py | 4 ++-- langchain_apify/retrievers.py | 18 ++++++++++++------ tests/unit_tests/test_actor_tools.py | 5 +++-- tests/unit_tests/test_document_loaders.py | 5 +++-- tests/unit_tests/test_retrievers.py | 17 ++++++++++------- 7 files changed, 40 insertions(+), 28 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index c62d912..43a73bc 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -11,7 +11,6 @@ from typing import TYPE_CHECKING from langchain_core.tools import ToolException -from pydantic import BaseModel from langchain_apify.tools import ( ApifyGoogleSearchInput, @@ -21,6 +20,7 @@ if TYPE_CHECKING: from langchain_core.callbacks import CallbackManagerForToolRun + from pydantic import BaseModel # 
--------------------------------------------------------------------------- # Search & Crawling tools diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index da79357..0251fb3 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -281,14 +281,15 @@ def google_search( timeout_secs=timeout_secs, dataset_items_limit=max_results, ) - results: list[dict] = [] - for item in items: - for organic in item.get('organicResults', []): - results.append({ - 'title': organic.get('title', ''), - 'url': organic.get('url', ''), - 'description': organic.get('description', ''), - }) + results: list[dict] = [ + { + 'title': organic.get('title', ''), + 'url': organic.get('url', ''), + 'description': organic.get('description', ''), + } + for item in items + for organic in item.get('organicResults', []) + ] return results[:max_results] def rag_web_search( diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index cba017c..014998e 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -6,7 +6,7 @@ from apify_client import ApifyClient from langchain_core.document_loaders.base import BaseLoader -from langchain_core.documents import Document # noqa: TCH002 +from langchain_core.documents import Document from langchain_core.utils import secret_from_env from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator @@ -166,7 +166,7 @@ class ApifyCrawlLoader(BaseLoader): documents = loader.load() """ - def __init__( + def __init__( # noqa: PLR0913 self, url: str, apify_api_token: str | None = None, diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py index 709fd5e..9d4c30e 100644 --- a/langchain_apify/retrievers.py +++ b/langchain_apify/retrievers.py @@ -71,7 +71,7 @@ def _get_relevant_documents( self, query: str, *, - run_manager: CallbackManagerForRetrieverRun | None = None, + run_manager: CallbackManagerForRetrieverRun | None = None, # noqa: 
ARG002 ) -> list[Document]: run_input = { 'query': query, @@ -89,16 +89,21 @@ def _get_relevant_documents( if not dataset_id: return [] - items = self._sync_client.dataset(dataset_id).list_items( - limit=self.max_results, clean=True, - ).items + items = ( + self._sync_client.dataset(dataset_id) + .list_items( + limit=self.max_results, + clean=True, + ) + .items + ) return self._items_to_documents(items) async def _aget_relevant_documents( self, query: str, *, - run_manager: AsyncCallbackManagerForRetrieverRun | None = None, + run_manager: AsyncCallbackManagerForRetrieverRun | None = None, # noqa: ARG002 ) -> list[Document]: run_input = { 'query': query, @@ -118,7 +123,8 @@ async def _aget_relevant_documents( items = ( await self._async_client.dataset(dataset_id).list_items( - limit=self.max_results, clean=True, + limit=self.max_results, + clean=True, ) ).items return self._items_to_documents(items) diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 2e5fbe4..6448d89 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -5,6 +5,7 @@ import pytest from langchain_core.tools import ToolException +from pydantic import SecretStr from langchain_apify import APIFY_ACTOR_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool from langchain_apify._client import ApifyToolsClient @@ -167,8 +168,8 @@ def test_actor_tools_inherit_from_generic_base() -> None: def test_actor_tools_have_correct_metadata() -> None: with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ - ApifyGoogleSearchTool(apify_api_token='dummy'), - ApifyWebCrawlerTool(apify_api_token='dummy'), + ApifyGoogleSearchTool(apify_api_token=SecretStr('dummy')), + ApifyWebCrawlerTool(apify_api_token=SecretStr('dummy')), ] expected_names = ['apify_google_search', 'apify_web_crawler'] diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index 00ecc0f..2bdcc8f 100644 --- 
a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import Any from unittest.mock import MagicMock, patch import pytest @@ -85,7 +86,7 @@ def test_apify_dataset_loader_lazy_load() -> None: def _make_crawl_loader( mock_client: MagicMock, - **kwargs: object, + **kwargs: Any, # noqa: ANN401 ) -> ApifyCrawlLoader: with patch.object(ApifyToolsClient, '__init__', return_value=None): loader = ApifyCrawlLoader(url='https://example.com', apify_api_token='dummy', **kwargs) @@ -184,7 +185,7 @@ def test_crawl_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: ApifyCrawlLoader(url='https://example.com') -def test_crawl_loader_failure_raises(mock_tools_client: MagicMock) -> None: +def test_crawl_loader_failure_raises() -> None: mock_client = MagicMock(spec=ApifyToolsClient) mock_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') loader = _make_crawl_loader(mock_client) diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py index 266ded7..8eede95 100644 --- a/tests/unit_tests/test_retrievers.py +++ b/tests/unit_tests/test_retrievers.py @@ -60,7 +60,7 @@ def test_init_custom_params() -> None: # --------------------------------------------------------------------------- -# _get_relevant_documents (sync) +# Sync retrieval # --------------------------------------------------------------------------- @@ -100,7 +100,8 @@ def test_sync_passes_correct_input() -> None: timeout_secs=60, ) mock_client.dataset.return_value.list_items.assert_called_once_with( - limit=3, clean=True, + limit=3, + clean=True, ) @@ -145,11 +146,13 @@ def test_sync_no_dataset_id_returns_empty() -> None: @pytest.mark.asyncio async def test_async_returns_documents() -> None: mock_async = MagicMock() - mock_async.actor.return_value.call = AsyncMock(return_value={ - 'id': 'run-1', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 
'ds-1', - }) + mock_async.actor.return_value.call = AsyncMock( + return_value={ + 'id': 'run-1', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-1', + } + ) mock_list_items = AsyncMock() mock_list_items.return_value.items = RAG_ITEMS mock_async.dataset.return_value.list_items = mock_list_items From 2804a5c1832590a84574d9cf5ea4f758354c17e2 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Fri, 24 Apr 2026 12:08:50 +0200 Subject: [PATCH 43/63] fix: tests fix --- langchain_apify/_actor_tools.py | 2 +- tests/unit_tests/test_retrievers.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 43a73bc..0746f4c 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -11,6 +11,7 @@ from typing import TYPE_CHECKING from langchain_core.tools import ToolException +from pydantic import BaseModel # noqa: TCH002 from langchain_apify.tools import ( ApifyGoogleSearchInput, @@ -20,7 +21,6 @@ if TYPE_CHECKING: from langchain_core.callbacks import CallbackManagerForToolRun - from pydantic import BaseModel # --------------------------------------------------------------------------- # Search & Crawling tools diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py index 8eede95..ee02dba 100644 --- a/tests/unit_tests/test_retrievers.py +++ b/tests/unit_tests/test_retrievers.py @@ -98,6 +98,7 @@ def test_sync_passes_correct_input() -> None: mock_client.actor.return_value.call.assert_called_once_with( run_input={'query': 'my search', 'maxResults': 3}, timeout_secs=60, + logger=None, ) mock_client.dataset.return_value.list_items.assert_called_once_with( limit=3, From ea8b16edf8ba33d29e0f7b3b79cd299825808703 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:07:19 +0200 Subject: [PATCH 44/63] chore: rename tools to match the task description --- langchain_apify/__init__.py | 12 +++--- langchain_apify/tools.py | 28 ++++++------- 
tests/integration_tests/test_generic_tools.py | 8 ++-- tests/unit_tests/test_tools.py | 40 +++++++++---------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index bca8081..7d0dfa9 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -7,9 +7,9 @@ from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, ) @@ -31,10 +31,10 @@ APIFY_CORE_TOOLS: list[type[BaseTool]] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ] __all__ = [ @@ -44,9 +44,9 @@ 'ApifyWrapper', # Core generic tools 'ApifyGetDatasetItemsTool', - 'ApifyRunActorAndGetItemsTool', + 'ApifyRunActorAndGetDatasetTool', 'ApifyRunActorTool', - 'ApifyRunTaskAndGetItemsTool', + 'ApifyRunTaskAndGetDatasetTool', 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', # Tool group lists diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index e7721b7..2b7cedf 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -225,8 +225,8 @@ class ApifyGetDatasetItemsInput(BaseModel): offset: int = Field(default=0, description='Number of items to skip from the start.') -class ApifyRunActorAndGetItemsInput(BaseModel): - """Input schema for :class:`ApifyRunActorAndGetItemsTool`.""" +class ApifyRunActorAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetDatasetTool`.""" actor_id: str = Field(description='Actor ID or name (e.g. 
"apify/python-example").') run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') @@ -255,8 +255,8 @@ class ApifyRunTaskInput(BaseModel): ) -class ApifyRunTaskAndGetItemsInput(BaseModel): - """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" +class ApifyRunTaskAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetDatasetTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') task_input: dict | None = Field( @@ -454,7 +454,7 @@ def _run( return json.dumps({'items': items}) -class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] +class ApifyRunActorAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -476,16 +476,16 @@ class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] import os os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" - from langchain_apify import ApifyRunActorAndGetItemsTool + from langchain_apify import ApifyRunActorAndGetDatasetTool - tool = ApifyRunActorAndGetItemsTool() + tool = ApifyRunActorAndGetDatasetTool() result = tool.invoke({ "actor_id": "apify/python-example", "run_input": {"first_number": 2, "second_number": 3}, }) """ - name: str = 'apify_run_actor_and_get_items' + name: str = 'apify_run_actor_and_get_dataset' description: str = ( 'Run an Apify Actor synchronously and return both run metadata and dataset items.' ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' @@ -494,7 +494,7 @@ class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' ' and items (list of dataset item dicts).' 
) - args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput + args_schema: type[BaseModel] = ApifyRunActorAndGetDatasetInput def _run( self, @@ -625,7 +625,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] +class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task and return both run metadata and dataset items. Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` @@ -647,16 +647,16 @@ class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] import os os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" - from langchain_apify import ApifyRunTaskAndGetItemsTool + from langchain_apify import ApifyRunTaskAndGetDatasetTool - tool = ApifyRunTaskAndGetItemsTool() + tool = ApifyRunTaskAndGetDatasetTool() result = tool.invoke({ "task_id": "user/my-task", "task_input": {"key": "value"}, }) """ - name: str = 'apify_run_task_and_get_items' + name: str = 'apify_run_task_and_get_dataset' description: str = ( 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' @@ -665,7 +665,7 @@ class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' ' and items (list of dataset item dicts).' 
) - args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput + args_schema: type[BaseModel] = ApifyRunTaskAndGetDatasetInput def _run( self, diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py index 863efb1..3f2a7c8 100644 --- a/tests/integration_tests/test_generic_tools.py +++ b/tests/integration_tests/test_generic_tools.py @@ -14,9 +14,9 @@ from langchain_apify import ( ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, ) @@ -54,7 +54,7 @@ def test_get_dataset_items_tool_smoke() -> None: def test_run_actor_and_get_items_tool_smoke() -> None: - tool = ApifyRunActorAndGetItemsTool() + tool = ApifyRunActorAndGetDatasetTool() result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) parsed = json.loads(result) @@ -86,7 +86,7 @@ def test_run_task_tool_smoke() -> None: @pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') def test_run_task_and_get_items_tool_smoke() -> None: - tool = ApifyRunTaskAndGetItemsTool() + tool = ApifyRunTaskAndGetDatasetTool() result = tool.invoke({'task_id': _TASK_ID}) parsed = json.loads(result) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 67fa1a7..9abe9dc 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -15,9 +15,9 @@ from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, _ApifyGenericTool, @@ -253,13 +253,13 @@ def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) - # --------------------------------------------------------------------------- -# ApifyRunActorAndGetItemsTool +# 
ApifyRunActorAndGetDatasetTool # --------------------------------------------------------------------------- def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) - tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) @@ -274,7 +274,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(actor_id='apify/test') @@ -283,7 +283,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyRunActorAndGetItemsTool() + ApifyRunActorAndGetDatasetTool() # --------------------------------------------------------------------------- @@ -350,13 +350,13 @@ def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: # --------------------------------------------------------------------------- -# ApifyRunTaskAndGetItemsTool +# ApifyRunTaskAndGetDatasetTool # --------------------------------------------------------------------------- def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) - tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = 
make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) @@ -371,7 +371,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(task_id='user/my-task') @@ -380,7 +380,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyRunTaskAndGetItemsTool() + ApifyRunTaskAndGetDatasetTool() # --------------------------------------------------------------------------- @@ -427,7 +427,7 @@ def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> No def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) tool = make_tool( - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, mock_tools_client, max_timeout_secs=30, max_memory_mbytes=256, @@ -460,7 +460,7 @@ def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) - def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) tool = make_tool( - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, mock_tools_client, max_timeout_secs=30, max_memory_mbytes=256, @@ -529,19 +529,19 @@ def test_generic_tools_have_correct_metadata() -> None: tools = [ 
ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ] expected_names = [ 'apify_run_actor', 'apify_get_dataset_items', - 'apify_run_actor_and_get_items', + 'apify_run_actor_and_get_dataset', 'apify_scrape_url', 'apify_run_task', - 'apify_run_task_and_get_items', + 'apify_run_task_and_get_dataset', ] for tool, expected_name in zip(tools, expected_names): @@ -569,10 +569,10 @@ def test_all_generic_tools_inherit_from_base() -> None: for tool_cls in ( ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ): assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' @@ -592,9 +592,9 @@ def test_apify_core_tools_contains_all_generic_classes() -> None: assert set(APIFY_CORE_TOOLS) == { ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, } assert len(APIFY_CORE_TOOLS) == 6 From cd1eea1fc4a001296f941954decac2b4e996693d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:25:04 +0200 Subject: [PATCH 45/63] fix: narrow except blocks in _client.py to SDK/transport errors --- 
langchain_apify/_client.py | 21 +++++++++++++-------- tests/unit_tests/test_client.py | 31 ++++++++++++++++++++----------- tests/unit_tests/test_tools.py | 4 ++-- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 9a87d46..618d007 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -2,7 +2,9 @@ import os +import httpx from apify_client import ApifyClient +from apify_client.errors import ApifyClientError from pydantic import SecretStr from langchain_apify._error_messages import ( @@ -12,6 +14,9 @@ ) from langchain_apify._utils import _create_apify_client +# Only catches ApifyClientError and httpx.HTTPError. Other errors propagate. +_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError) + _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 @@ -72,8 +77,8 @@ def run_actor( try: run = self._client.actor(actor_id).call(**call_kwargs) - except Exception as exc: - msg = f'Network error calling Actor {actor_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify Actor call failed for {actor_id}: {exc}' raise RuntimeError(msg) from exc if run is None: msg = f'Actor {actor_id} call returned no run details.' 
@@ -96,8 +101,8 @@ def get_dataset_items( """ try: return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' raise RuntimeError(msg) from exc def run_actor_and_get_items( @@ -159,8 +164,8 @@ def run_task( try: run = self._client.task(task_id).call(**call_kwargs) - except Exception as exc: - msg = f'Network error calling task {task_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify task call failed for {task_id}: {exc}' raise RuntimeError(msg) from exc if run is None: msg = f'Task {task_id} call returned no run details.' @@ -239,8 +244,8 @@ def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: """Fetch dataset items, wrapping any network error in a RuntimeError.""" try: return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' raise RuntimeError(msg) from exc @staticmethod diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 40c73dc..c43e4d1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -2,6 +2,7 @@ from unittest.mock import MagicMock, patch +import httpx import pytest from langchain_apify._client import ApifyToolsClient @@ -238,21 +239,21 @@ def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client # --------------------------------------------------------------------------- -# Network error wrapping (transport exception -> RuntimeError) +# Transport-error wrapping (httpx / ApifyClientError -> RuntimeError) # --------------------------------------------------------------------------- def 
test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.side_effect = ConnectionError('conn refused') + mock_apify_client.actor.return_value.call.side_effect = httpx.ConnectError('conn refused') - with pytest.raises(RuntimeError, match='Network error calling Actor'): + with pytest.raises(RuntimeError, match='Apify Actor call failed'): client.run_actor('apify/test-actor') def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('timeout') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('timeout') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.get_dataset_items('dataset-xyz') @@ -260,16 +261,16 @@ def test_run_actor_and_get_items_dataset_fetch_network_error( client: ApifyToolsClient, mock_apify_client: MagicMock ) -> None: mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.run_actor_and_get_items('apify/test-actor') def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.side_effect = ConnectionError('conn refused') + mock_apify_client.task.return_value.call.side_effect = httpx.ConnectError('conn refused') - with pytest.raises(RuntimeError, match='Network error calling task'): + with pytest.raises(RuntimeError, match='Apify task call failed'): 
client.run_task('user/my-task') @@ -277,7 +278,15 @@ def test_run_task_and_get_items_dataset_fetch_network_error( client: ApifyToolsClient, mock_apify_client: MagicMock ) -> None: mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.run_task_and_get_items('user/my-task') + + +def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError.""" + mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK') + + with pytest.raises(AttributeError, match='bug in SDK'): + client.run_actor('apify/test-actor') diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 9abe9dc..4a5dbdd 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -238,11 +238,11 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.side_effect = RuntimeError( - 'Network error fetching dataset ds-bad: connection reset' + 'Apify dataset fetch failed for ds-bad: connection reset' ) tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) - with pytest.raises(ToolException, match='Network error fetching dataset'): + with pytest.raises(ToolException, match='Apify dataset fetch failed'): tool._run(dataset_id='ds-bad') From 50c3583243919d50f240a4f1a0963822a6ec2c33 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:33:57 +0200 Subject: [PATCH 46/63] fix: 
clamp memory_mbytes to Apify platform minimum (128 MB) --- langchain_apify/tools.py | 7 +++++-- tests/unit_tests/test_tools.py | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 2b7cedf..fafc858 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -333,9 +333,12 @@ def _clamp_timeout(self, value: int) -> int: return max(1, min(value, self.max_timeout_secs)) def _clamp_memory(self, value: int | None) -> int | None: - if value is None: + # Non-positive values fall through to the platform default. Positive + # values are floored at 128 MB (the Apify platform minimum) so the LLM + # cannot drive into an API rejection by requesting too little memory. + if value is None or value <= 0: return None - return max(1, min(value, self.max_memory_mbytes)) + return max(128, min(value, self.max_memory_mbytes)) def _clamp_items(self, value: int) -> int: return max(1, min(value, self.max_items)) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 4a5dbdd..108c695 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -484,16 +484,30 @@ def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) -def test_clamp_memory_floor_is_one(mock_tools_client: MagicMock) -> None: +def test_clamp_memory_non_positive_is_treated_as_none(mock_tools_client: MagicMock) -> None: + """memory_mbytes <= 0 maps to None so the Apify platform default is used.""" mock_tools_client.run_actor.return_value = SUCCEEDED_RUN tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) tool._run(actor_id='apify/test', memory_mbytes=-1) - mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) 
mock_tools_client.run_actor.reset_mock() tool._run(actor_id='apify/test', memory_mbytes=0) - mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_clamp_memory_floors_positive_below_platform_minimum(mock_tools_client: MagicMock) -> None: + """A positive memory_mbytes below the Apify platform minimum (128 MB) is floored to 128.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=64) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: From 450728cb10ac6ed9dac16886fab48dc8586b9009 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:36:08 +0200 Subject: [PATCH 47/63] fix: narrow empty-dataset message in ApifyGetDatasetItemsTool --- langchain_apify/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index fafc858..4cd182d 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -453,7 +453,7 @@ def _run( except RuntimeError as exc: raise ToolException(str(exc)) from exc if not items: - return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) + return json.dumps({'items': [], 'message': f'Dataset {dataset_id} is empty.'}) return json.dumps({'items': items}) From 1360e9228a3be72c6dd6105b3f725d1c16dc599d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:40:51 +0200 Subject: [PATCH 48/63] ref: simplify ApifyToolsClient.__init__ to require explicit token --- langchain_apify/_client.py | 
20 +++++--------------- tests/unit_tests/test_client.py | 12 ++---------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 618d007..cc1e4b8 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -1,11 +1,8 @@ from __future__ import annotations -import os - import httpx from apify_client import ApifyClient from apify_client.errors import ApifyClientError -from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -31,24 +28,17 @@ class ApifyToolsClient: block until the Actor run finishes. Args: - apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` - environment variable when *None*. + apify_api_token: Apify API token. Raises: - ValueError: If no token is provided and the env var is not set. + ValueError: If the token is empty. """ - def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: - _token: str | None = None - if isinstance(apify_api_token, SecretStr): - _token = apify_api_token.get_secret_value() - else: - _token = apify_api_token or os.getenv('APIFY_API_TOKEN') - - if not _token: + def __init__(self, apify_api_token: str) -> None: + if not apify_api_token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, _token) + self._client = _create_apify_client(ApifyClient, apify_api_token) def run_actor( self, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index c43e4d1..43f6f83 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -20,17 +20,9 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: assert c._client is mock_apify_client -def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: - monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') - with 
patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): - c = ApifyToolsClient() - assert c._client is mock_apify_client - - -def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv('APIFY_API_TOKEN', raising=False) +def test_init_empty_token_raises() -> None: with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyToolsClient() + ApifyToolsClient(apify_api_token='') # --------------------------------------------------------------------------- From 09b6c6e045b9b2815bfbc0be28527635e85c4d26 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:45:55 +0200 Subject: [PATCH 49/63] docs: add module-level docstring to tools.py --- langchain_apify/tools.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 4cd182d..385fc57 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -1,3 +1,21 @@ +"""LangChain tools for the Apify platform. + +All tools require an Apify API token. Set it via the ``APIFY_API_TOKEN`` +environment variable, or pass ``apify_api_token`` to the tool constructor: + +.. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({"actor_id": "apify/python-example"}) + +For details, see https://docs.apify.com/platform/integrations/langchain +""" + from __future__ import annotations import json From a5bd7cce8a178da607392b651f361692af5fb682 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:48:52 +0200 Subject: [PATCH 50/63] ref: rename model_post_init parameter to context --- langchain_apify/tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 385fc57..46846f2 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -340,12 +340,12 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] _client: ApifyToolsClient = PrivateAttr() - def model_post_init(self, __context: Any) -> None: # noqa: ANN401 + def model_post_init(self, context: Any) -> None: # noqa: ANN401 if self.apify_api_token is None: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) - super().model_post_init(__context) + super().model_post_init(context) def _clamp_timeout(self, value: int) -> int: return max(1, min(value, self.max_timeout_secs)) From 23242c1a490b8ec64f03838cf97ac33412bd1a59 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 11:18:51 +0200 Subject: [PATCH 51/63] revert: restore env-fallback --- langchain_apify/_client.py | 19 ++++++++++++++----- tests/unit_tests/test_client.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index cc1e4b8..77fe0dd 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -1,8 +1,11 @@ from __future__ import annotations +import os + import httpx from 
apify_client import ApifyClient from apify_client.errors import ApifyClientError +from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -28,17 +31,23 @@ class ApifyToolsClient: block until the Actor run finishes. Args: - apify_api_token: Apify API token. + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. Raises: - ValueError: If the token is empty. + ValueError: If no token is provided and the env var is not set. """ - def __init__(self, apify_api_token: str) -> None: - if not apify_api_token: + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + if isinstance(apify_api_token, SecretStr): + _token: str | None = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, apify_api_token) + self._client = _create_apify_client(ApifyClient, _token) def run_actor( self, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 43f6f83..c43e4d1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -20,9 +20,17 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: assert c._client is mock_apify_client -def test_init_empty_token_raises() -> None: +def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyToolsClient(apify_api_token='') + 
ApifyToolsClient() # --------------------------------------------------------------------------- From 7ea3e8ccfbbb6f1af76bfaa4ac75f4dea451249c Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:12:54 +0200 Subject: [PATCH 52/63] chore: drop placeholder section in _actor_tools.py --- langchain_apify/_actor_tools.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 0746f4c..bc06136 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -1,8 +1,8 @@ -"""Actor-specific tool subclasses (search, social-media, etc.). +"""Actor-specific tool subclasses. -Downstream feature branches add concrete tools here. They inherit from -:class:`~langchain_apify.tools._ApifyGenericTool` and use -:func:`~langchain_apify.tools._run_meta` to format run metadata. +Tools in this module wrap a single Apify Actor behind a simplified, +LLM-friendly interface. They inherit from +:class:`~langchain_apify.tools._ApifyGenericTool`. 
""" from __future__ import annotations @@ -153,8 +153,3 @@ def _run( for item in items ] return json.dumps(pages) - - -# --------------------------------------------------------------------------- -# Social-media tools -# --------------------------------------------------------------------------- From 700e5ab8aeb76a1da09c484ae7dd042b7e197ba7 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:15:57 +0200 Subject: [PATCH 53/63] chore: align APIFY_ACTOR_TOOLS type hint with APIFY_CORE_TOOLS --- langchain_apify/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 5030d2a..287e2ab 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -39,7 +39,7 @@ ApifyRunTaskAndGetDatasetTool, ] -APIFY_ACTOR_TOOLS: list[type] = [ +APIFY_ACTOR_TOOLS: list[type[BaseTool]] = [ ApifyGoogleSearchTool, ApifyWebCrawlerTool, ] From c0dd11eaf77a96f7a7aa68a789bee79b54a06fdc Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:30:39 +0200 Subject: [PATCH 54/63] feat: constrain crawler_type to a Literal of valid Apify values --- langchain_apify/_actor_tools.py | 3 ++- langchain_apify/document_loaders.py | 4 +++- langchain_apify/tools.py | 10 ++++++++-- tests/unit_tests/test_actor_tools.py | 4 ++-- tests/unit_tests/test_document_loaders.py | 4 ++-- 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index bc06136..ec780f0 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -16,6 +16,7 @@ from langchain_apify.tools import ( ApifyGoogleSearchInput, ApifyWebCrawlerInput, + CrawlerType, _ApifyGenericTool, ) @@ -130,7 +131,7 @@ def _run( url: str, max_crawl_pages: int = 10, max_crawl_depth: int = 1, - crawler_type: str = 'cheerio', + crawler_type: CrawlerType = 'cheerio', timeout_secs: int = 300, _run_manager: CallbackManagerForToolRun | None = 
None, ) -> str: diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 014998e..6177a52 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -17,6 +17,8 @@ if TYPE_CHECKING: from collections.abc import Iterator + from langchain_apify.tools import CrawlerType + class ApifyDatasetLoader(BaseLoader, BaseModel): """Load datasets from Apify web scraping, crawling, and data extraction platform. @@ -173,7 +175,7 @@ def __init__( # noqa: PLR0913 *, max_crawl_pages: int = 10, max_crawl_depth: int = 1, - crawler_type: str = 'cheerio', + crawler_type: CrawlerType = 'cheerio', timeout_secs: int = 300, ) -> None: self.url = url diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 4fad7f2..f8af0e5 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -21,7 +21,7 @@ import json import os from datetime import datetime -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException @@ -42,6 +42,9 @@ from langchain_core.callbacks import ( CallbackManagerForToolRun, ) + +CrawlerType = Literal['cheerio', 'playwright:adaptive', 'playwright:firefox'] + class ApifyActorsTool(BaseTool): # type: ignore[override, override] @@ -275,7 +278,10 @@ class ApifyWebCrawlerInput(BaseModel): url: str = Field(description='Seed URL to start crawling from.') max_crawl_pages: int = Field(default=10, description='Maximum number of pages to crawl.') max_crawl_depth: int = Field(default=1, description='Maximum link-follow depth from the seed URL.') - crawler_type: str = Field(default='cheerio', description='Crawler engine (e.g. 
"cheerio", "playwright").') + crawler_type: CrawlerType = Field( + default='cheerio', + description='Crawler engine: "cheerio" (fast, static HTML), "playwright:adaptive" or "playwright:firefox".', + ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the crawl to finish.') diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 6448d89..df6e391 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -108,7 +108,7 @@ def test_web_crawler_tool_passes_params(mock_tools_client: MagicMock) -> None: url='https://example.com', max_crawl_pages=5, max_crawl_depth=2, - crawler_type='playwright', + crawler_type='playwright:firefox', timeout_secs=120, ) @@ -116,7 +116,7 @@ def test_web_crawler_tool_passes_params(mock_tools_client: MagicMock) -> None: 'https://example.com', max_crawl_pages=5, max_crawl_depth=2, - crawler_type='playwright', + crawler_type='playwright:firefox', timeout_secs=120, ) diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index 2bdcc8f..a9d7ca1 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -129,7 +129,7 @@ def test_crawl_loader_passes_params() -> None: mock_client, max_crawl_pages=5, max_crawl_depth=2, - crawler_type='playwright', + crawler_type='playwright:firefox', timeout_secs=120, ) @@ -139,7 +139,7 @@ def test_crawl_loader_passes_params() -> None: 'https://example.com', max_crawl_pages=5, max_crawl_depth=2, - crawler_type='playwright', + crawler_type='playwright:firefox', timeout_secs=120, ) From 01899433c694415133f14bb3ad767047a2949993 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:34:46 +0200 Subject: [PATCH 55/63] feat: clamp max_crawl_depth in ApifyWebCrawlerTool --- langchain_apify/_actor_tools.py | 2 +- langchain_apify/tools.py | 7 ++++++- tests/unit_tests/test_actor_tools.py | 12 ++++++++++++ 3 files 
changed, 19 insertions(+), 2 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index ec780f0..5301a17 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -139,7 +139,7 @@ def _run( items = self._client.crawl_website( url, max_crawl_pages=self._clamp_items(max_crawl_pages), - max_crawl_depth=max_crawl_depth, + max_crawl_depth=self._clamp_depth(max_crawl_depth), crawler_type=crawler_type, timeout_secs=self._clamp_timeout(timeout_secs), ) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index f8af0e5..b14bf3e 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -42,7 +42,7 @@ from langchain_core.callbacks import ( CallbackManagerForToolRun, ) - + CrawlerType = Literal['cheerio', 'playwright:adaptive', 'playwright:firefox'] @@ -362,6 +362,7 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') + max_crawl_depth: int = Field(default=5, description='Upper bound for max_crawl_depth the LLM may request.') _client: ApifyToolsClient = PrivateAttr() @@ -386,6 +387,10 @@ def _clamp_memory(self, value: int | None) -> int | None: def _clamp_items(self, value: int) -> int: return max(1, min(value, self.max_items)) + def _clamp_depth(self, value: int) -> int: + # Floor at 0 (a depth of 0 means "only crawl the seed URL"). 
+ return max(0, min(value, self.max_crawl_depth)) + # --------------------------------------------------------------------------- # Generic tools diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index df6e391..227c3f7 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -132,6 +132,18 @@ def test_web_crawler_tool_clamps_pages_and_timeout(mock_tools_client: MagicMock) assert call_kwargs.kwargs['timeout_secs'] == 60 +def test_web_crawler_tool_clamps_depth(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client, max_crawl_depth=2) + + tool._run(url='https://example.com', max_crawl_depth=999) + assert mock_tools_client.crawl_website.call_args.kwargs['max_crawl_depth'] == 2 + + mock_tools_client.crawl_website.reset_mock() + tool._run(url='https://example.com', max_crawl_depth=-1) + assert mock_tools_client.crawl_website.call_args.kwargs['max_crawl_depth'] == 0 + + def test_web_crawler_tool_empty_results(mock_tools_client: MagicMock) -> None: mock_tools_client.crawl_website.return_value = [] tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) From 6d2422d8c033832fc242679870553a6855ea0b0a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:40:57 +0200 Subject: [PATCH 56/63] feat: expose timeout_secs in ApifyGoogleSearchInput --- langchain_apify/_actor_tools.py | 6 ++++-- langchain_apify/tools.py | 1 + tests/unit_tests/test_actor_tools.py | 13 +++++++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 5301a17..ab9c46d 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -60,7 +60,8 @@ class ApifyGoogleSearchTool(_ApifyGenericTool): # type: ignore[override] ' Each result has keys: title, url, description.' ' Required: query (str) — the search query.' 
' Optional: max_results (int, default 10),' - ' country_code (str|null), language_code (str|null).' + ' country_code (str|null), language_code (str|null),' + ' timeout_secs (int, default 300).' ) args_schema: type[BaseModel] = ApifyGoogleSearchInput @@ -70,6 +71,7 @@ def _run( max_results: int = 10, country_code: str | None = None, language_code: str | None = None, + timeout_secs: int = 300, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: @@ -78,7 +80,7 @@ def _run( max_results=self._clamp_items(max_results), country_code=country_code, language_code=language_code, - timeout_secs=self.max_timeout_secs, + timeout_secs=self._clamp_timeout(timeout_secs), ) except RuntimeError as exc: raise ToolException(str(exc)) from exc diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index b14bf3e..a012a1c 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -270,6 +270,7 @@ class ApifyGoogleSearchInput(BaseModel): max_results: int = Field(default=10, description='Maximum number of search results to return.') country_code: str | None = Field(default=None, description='Two-letter country code for localised results.') language_code: str | None = Field(default=None, description='Two-letter language code.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the search to finish.') class ApifyWebCrawlerInput(BaseModel): diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 227c3f7..fc03026 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -36,17 +36,26 @@ def test_google_search_tool_passes_params(mock_tools_client: MagicMock) -> None: mock_tools_client.google_search.return_value = [] tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) - tool._run(query='test', max_results=5, country_code='us', language_code='en') + tool._run(query='test', max_results=5, country_code='us', language_code='en', 
timeout_secs=120) mock_tools_client.google_search.assert_called_once_with( 'test', max_results=5, country_code='us', language_code='en', - timeout_secs=600, + timeout_secs=120, ) +def test_google_search_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client, max_timeout_secs=60) + + tool._run(query='test', timeout_secs=9999) + + assert mock_tools_client.google_search.call_args.kwargs['timeout_secs'] == 60 + + def test_google_search_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: mock_tools_client.google_search.return_value = [] tool = make_tool(ApifyGoogleSearchTool, mock_tools_client, max_items=3) From 2dfecd7e971adc9ea211d800e972969f75d6701b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:43:08 +0200 Subject: [PATCH 57/63] ref: accept SecretStr token in ApifyCrawlLoader --- langchain_apify/document_loaders.py | 2 +- tests/unit_tests/test_document_loaders.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 6177a52..bfaa761 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -171,7 +171,7 @@ class ApifyCrawlLoader(BaseLoader): def __init__( # noqa: PLR0913 self, url: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *, max_crawl_pages: int = 10, max_crawl_depth: int = 1, diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a9d7ca1..5c71704 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -7,6 +7,7 @@ from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document +from pydantic import SecretStr from langchain_apify import ApifyCrawlLoader, 
ApifyDatasetLoader from langchain_apify._client import ApifyToolsClient @@ -185,6 +186,12 @@ def test_crawl_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: ApifyCrawlLoader(url='https://example.com') +def test_crawl_loader_accepts_secretstr_token() -> None: + with patch('langchain_apify._client._create_apify_client'): + loader = ApifyCrawlLoader(url='https://example.com', apify_api_token=SecretStr('s')) + assert loader.url == 'https://example.com' + + def test_crawl_loader_failure_raises() -> None: mock_client = MagicMock(spec=ApifyToolsClient) mock_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') From 9c8178564eb503cf2b6cfb4f50e99d0655120ffa Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 12:49:13 +0200 Subject: [PATCH 58/63] docs: clarify ApifyCrawlLoader.lazy_load is not truly lazy --- langchain_apify/document_loaders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index bfaa761..e1f0e6a 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -186,7 +186,7 @@ def __init__( # noqa: PLR0913 self._client = ApifyToolsClient(apify_api_token=apify_api_token) def lazy_load(self) -> Iterator[Document]: - """Crawl the website and yield Documents lazily. + """Crawl the website and yield Documents. Yields: Document: One document per crawled page. 
From 49dd4f0651877163eb115ea208a7144eab4531b0 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 13:12:50 +0200 Subject: [PATCH 59/63] ref: rewrite ApifySearchRetriever to use ApifyToolsClient --- langchain_apify/retrievers.py | 85 ++++++----------- langchain_apify/tools.py | 1 - tests/unit_tests/test_retrievers.py | 143 ++++++++++++---------------- 3 files changed, 89 insertions(+), 140 deletions(-) diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py index 9d4c30e..67a5d0a 100644 --- a/langchain_apify/retrievers.py +++ b/langchain_apify/retrievers.py @@ -2,16 +2,15 @@ from __future__ import annotations -import os +import asyncio from typing import TYPE_CHECKING, Any -from apify_client import ApifyClient, ApifyClientAsync from langchain_core.documents import Document from langchain_core.retrievers import BaseRetriever -from pydantic import Field, PrivateAttr +from langchain_core.utils import secret_from_env +from pydantic import Field, PrivateAttr, SecretStr -from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify._utils import _create_apify_client +from langchain_apify._client import ApifyToolsClient if TYPE_CHECKING: from langchain_core.callbacks import ( @@ -19,7 +18,6 @@ CallbackManagerForRetrieverRun, ) -_RAG_WEB_BROWSER_ACTOR_ID = 'apify/rag-web-browser' _DEFAULT_TIMEOUT_SECS = 300 @@ -52,20 +50,25 @@ class ApifySearchRetriever(BaseRetriever): docs = retriever.invoke("What is LangChain?") """ + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) max_results: int = Field(default=5, description='Maximum number of documents to return.') timeout_secs: int = Field(default=_DEFAULT_TIMEOUT_SECS, description='Maximum Actor run time in seconds.') - _sync_client: ApifyClient = PrivateAttr() - _async_client: ApifyClientAsync = PrivateAttr() + _client: ApifyToolsClient = PrivateAttr() + + def model_post_init(self, context: Any) -> None: # noqa: ANN401 + """Construct the underlying ``ApifyToolsClient``. - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not token: - msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET - raise ValueError(msg) - self._sync_client = _create_apify_client(ApifyClient, token) - self._async_client = _create_apify_client(ApifyClientAsync, token) + The helper handles ``None`` / ``SecretStr`` / env-fallback and raises + ``ValueError`` if no token is available. 
+ """ + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token) + super().model_post_init(context) def _get_relevant_documents( self, @@ -73,29 +76,10 @@ def _get_relevant_documents( *, run_manager: CallbackManagerForRetrieverRun | None = None, # noqa: ARG002 ) -> list[Document]: - run_input = { - 'query': query, - 'maxResults': self.max_results, - } - run = self._sync_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( - run_input=run_input, + items = self._client.rag_web_search( + query, + max_results=self.max_results, timeout_secs=self.timeout_secs, - logger=None, - ) - if run is None: - return [] - - dataset_id = run.get('defaultDatasetId') - if not dataset_id: - return [] - - items = ( - self._sync_client.dataset(dataset_id) - .list_items( - limit=self.max_results, - clean=True, - ) - .items ) return self._items_to_documents(items) @@ -105,28 +89,13 @@ async def _aget_relevant_documents( *, run_manager: AsyncCallbackManagerForRetrieverRun | None = None, # noqa: ARG002 ) -> list[Document]: - run_input = { - 'query': query, - 'maxResults': self.max_results, - } - run = await self._async_client.actor(_RAG_WEB_BROWSER_ACTOR_ID).call( - run_input=run_input, + # ApifyToolsClient is sync-only. 
+ items = await asyncio.to_thread( + self._client.rag_web_search, + query, + max_results=self.max_results, timeout_secs=self.timeout_secs, - logger=None, ) - if run is None: - return [] - - dataset_id = run.get('defaultDatasetId') - if not dataset_id: - return [] - - items = ( - await self._async_client.dataset(dataset_id).list_items( - limit=self.max_results, - clean=True, - ) - ).items return self._items_to_documents(items) @staticmethod diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index a012a1c..ec4359d 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -46,7 +46,6 @@ CrawlerType = Literal['cheerio', 'playwright:adaptive', 'playwright:firefox'] - class ApifyActorsTool(BaseTool): # type: ignore[override, override] """Tool that runs Apify Actors. diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py index ee02dba..0106645 100644 --- a/tests/unit_tests/test_retrievers.py +++ b/tests/unit_tests/test_retrievers.py @@ -1,10 +1,13 @@ from __future__ import annotations -from unittest.mock import AsyncMock, MagicMock, patch +from typing import Any +from unittest.mock import MagicMock, patch import pytest from langchain_core.documents import Document +from pydantic import SecretStr +from langchain_apify._client import ApifyToolsClient from langchain_apify.retrievers import ApifySearchRetriever RAG_ITEMS: list[dict] = [ @@ -21,17 +24,12 @@ ] -def _make_retriever( - mock_sync_client: MagicMock, - mock_async_client: MagicMock | None = None, - **kwargs: object, -) -> ApifySearchRetriever: - """Create a retriever with mocked Apify clients.""" - with ( - patch('langchain_apify.retrievers._create_apify_client') as mock_create, - ): - mock_create.side_effect = [mock_sync_client, mock_async_client or MagicMock()] - return ApifySearchRetriever(apify_api_token='dummy-token', **kwargs) +def _make_retriever(mock_client: MagicMock, **kwargs: Any) -> ApifySearchRetriever: # noqa: ANN401 + """Instantiate a 
retriever with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('dummy-token'), **kwargs) + retriever._client = mock_client + return retriever # --------------------------------------------------------------------------- @@ -46,15 +44,15 @@ def test_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: def test_init_with_explicit_token() -> None: - with patch('langchain_apify.retrievers._create_apify_client'): - retriever = ApifySearchRetriever(apify_api_token='my-token') + with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('my-token')) assert retriever.max_results == 5 assert retriever.timeout_secs == 300 def test_init_custom_params() -> None: - with patch('langchain_apify.retrievers._create_apify_client'): - retriever = ApifySearchRetriever(apify_api_token='t', max_results=3, timeout_secs=60) + with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('t'), max_results=3, timeout_secs=60) assert retriever.max_results == 3 assert retriever.timeout_secs == 60 @@ -65,13 +63,8 @@ def test_init_custom_params() -> None: def test_sync_returns_documents() -> None: - mock_client = MagicMock() - mock_client.actor.return_value.call.return_value = { - 'id': 'run-1', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'ds-1', - } - mock_client.dataset.return_value.list_items.return_value.items = RAG_ITEMS + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = RAG_ITEMS retriever = _make_retriever(mock_client, max_results=5) docs = retriever._get_relevant_documents('test query') @@ -85,43 +78,23 @@ def test_sync_returns_documents() -> None: assert docs[1].metadata['source'] == 'https://example.com/2' -def test_sync_passes_correct_input() -> None: - mock_client = MagicMock() - 
mock_client.actor.return_value.call.return_value = { - 'defaultDatasetId': 'ds-1', - } - mock_client.dataset.return_value.list_items.return_value.items = [] +def test_sync_calls_helper_with_correct_args() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) retriever._get_relevant_documents('my search') - mock_client.actor.return_value.call.assert_called_once_with( - run_input={'query': 'my search', 'maxResults': 3}, + mock_client.rag_web_search.assert_called_once_with( + 'my search', + max_results=3, timeout_secs=60, - logger=None, - ) - mock_client.dataset.return_value.list_items.assert_called_once_with( - limit=3, - clean=True, ) def test_sync_empty_results() -> None: - mock_client = MagicMock() - mock_client.actor.return_value.call.return_value = { - 'defaultDatasetId': 'ds-1', - } - mock_client.dataset.return_value.list_items.return_value.items = [] - retriever = _make_retriever(mock_client) - - docs = retriever._get_relevant_documents('test') - - assert docs == [] - - -def test_sync_none_run_returns_empty() -> None: - mock_client = MagicMock() - mock_client.actor.return_value.call.return_value = None + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] retriever = _make_retriever(mock_client) docs = retriever._get_relevant_documents('test') @@ -129,36 +102,28 @@ def test_sync_none_run_returns_empty() -> None: assert docs == [] -def test_sync_no_dataset_id_returns_empty() -> None: - mock_client = MagicMock() - mock_client.actor.return_value.call.return_value = {'id': 'run-1', 'defaultDatasetId': None} +def test_sync_helper_failure_propagates() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.side_effect = RuntimeError( + 'Actor run run-bad ended with status FAILED.', + ) retriever = _make_retriever(mock_client) - docs = 
retriever._get_relevant_documents('test') - - assert docs == [] + with pytest.raises(RuntimeError, match='FAILED'): + retriever._get_relevant_documents('test') # --------------------------------------------------------------------------- -# _aget_relevant_documents (async) +# Async retrieval # --------------------------------------------------------------------------- @pytest.mark.asyncio async def test_async_returns_documents() -> None: - mock_async = MagicMock() - mock_async.actor.return_value.call = AsyncMock( - return_value={ - 'id': 'run-1', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'ds-1', - } - ) - mock_list_items = AsyncMock() - mock_list_items.return_value.items = RAG_ITEMS - mock_async.dataset.return_value.list_items = mock_list_items - - retriever = _make_retriever(MagicMock(), mock_async, max_results=5) + """Async path wraps the sync helper via asyncio.to_thread.""" + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = RAG_ITEMS + retriever = _make_retriever(mock_client, max_results=5) docs = await retriever._aget_relevant_documents('test query') @@ -169,27 +134,43 @@ async def test_async_returns_documents() -> None: @pytest.mark.asyncio -async def test_async_none_run_returns_empty() -> None: - mock_async = MagicMock() - mock_async.actor.return_value.call = AsyncMock(return_value=None) - retriever = _make_retriever(MagicMock(), mock_async) +async def test_async_calls_helper_with_correct_args() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) - docs = await retriever._aget_relevant_documents('test') + await retriever._aget_relevant_documents('my search') - assert docs == [] + mock_client.rag_web_search.assert_called_once_with( + 'my search', + max_results=3, + timeout_secs=60, + ) @pytest.mark.asyncio -async def test_async_no_dataset_id_returns_empty() -> None: - mock_async = 
MagicMock() - mock_async.actor.return_value.call = AsyncMock(return_value={'defaultDatasetId': None}) - retriever = _make_retriever(MagicMock(), mock_async) +async def test_async_empty_results() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client) docs = await retriever._aget_relevant_documents('test') assert docs == [] +@pytest.mark.asyncio +async def test_async_helper_failure_propagates() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.side_effect = RuntimeError( + 'Actor run run-bad ended with status FAILED.', + ) + retriever = _make_retriever(mock_client) + + with pytest.raises(RuntimeError, match='FAILED'): + await retriever._aget_relevant_documents('test') + + # --------------------------------------------------------------------------- # _items_to_documents edge cases # --------------------------------------------------------------------------- From a060c146e649855d97d6ea0c9a2ee135f8902241 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 14:06:40 +0200 Subject: [PATCH 60/63] fix: normalise locale codes to lowercase to match Apify Actor schema --- langchain_apify/tools.py | 19 ++++++++++++++--- tests/unit_tests/test_actor_tools.py | 32 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index ec4359d..ff5c5b9 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -26,7 +26,7 @@ from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException from langchain_core.utils import secret_from_env -from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model, field_validator from langchain_apify._client import ApifyToolsClient from langchain_apify._error_messages import 
_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -267,10 +267,23 @@ class ApifyGoogleSearchInput(BaseModel): query: str = Field(description='Search query string.') max_results: int = Field(default=10, description='Maximum number of search results to return.') - country_code: str | None = Field(default=None, description='Two-letter country code for localised results.') - language_code: str | None = Field(default=None, description='Two-letter language code.') + country_code: str | None = Field( + default=None, + description='Two-letter country code (case-insensitive; normalised to lowercase, e.g. "us", "gb").', + pattern=r'^[a-zA-Z]{2}$', + ) + language_code: str | None = Field( + default=None, + description='Two-letter language code (case-insensitive; normalised to lowercase, e.g. "en", "fr").', + pattern=r'^[a-zA-Z]{2}$', + ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the search to finish.') + @field_validator('country_code', 'language_code') + @classmethod + def _normalise_locale_code(cls, value: str | None) -> str | None: + return value.lower() if value else value + class ApifyWebCrawlerInput(BaseModel): """Input schema for :class:`ApifyWebCrawlerTool`.""" diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index fc03026..e7fcf58 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -89,6 +89,38 @@ def test_google_search_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> No ApifyGoogleSearchTool() +@pytest.mark.parametrize('bad_code', ['USA', 'english', 'u', 'us1', '']) +def test_google_search_tool_rejects_malformed_locale(mock_tools_client: MagicMock, bad_code: str) -> None: + """country_code and language_code must be exactly two letters.""" + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + with pytest.raises(ValueError, match='string_pattern_mismatch|String should match pattern'): + tool.invoke({'query': 'test', 'country_code': 
bad_code}) + + with pytest.raises(ValueError, match='string_pattern_mismatch|String should match pattern'): + tool.invoke({'query': 'test', 'language_code': bad_code}) + + +@pytest.mark.parametrize('raw_country', ['us', 'US', 'Us', 'uS']) +def test_google_search_tool_normalises_country_code_to_lower(mock_tools_client: MagicMock, raw_country: str) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool.invoke({'query': 'test', 'country_code': raw_country}) + + assert mock_tools_client.google_search.call_args.kwargs['country_code'] == 'us' + + +@pytest.mark.parametrize('raw_language', ['en', 'EN', 'En', 'eN']) +def test_google_search_tool_normalises_language_code_to_lower(mock_tools_client: MagicMock, raw_language: str) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool.invoke({'query': 'test', 'language_code': raw_language}) + + assert mock_tools_client.google_search.call_args.kwargs['language_code'] == 'en' + + # --------------------------------------------------------------------------- # ApifyWebCrawlerTool # --------------------------------------------------------------------------- From a908467795e5f9d7fad61d9463f89f2a93080080 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:24:07 +0200 Subject: [PATCH 61/63] fix: extract source URL from metadata.url for apify/rag-web-browser --- langchain_apify/retrievers.py | 8 ++++++-- tests/unit_tests/test_retrievers.py | 15 +++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py index 67a5d0a..80a2099 100644 --- a/langchain_apify/retrievers.py +++ b/langchain_apify/retrievers.py @@ -104,9 +104,13 @@ def _items_to_documents(items: list[dict]) -> list[Document]: docs: list[Document] = [] for item in items: page_content = item.get('text') or item.get('markdown') or '' + 
raw_meta = item.get('metadata') + item_metadata: dict = raw_meta if isinstance(raw_meta, dict) else {} metadata: dict[str, Any] = { - 'source': item.get('crawledUrl') or item.get('url', ''), - 'title': item.get('metadata', {}).get('title', '') if isinstance(item.get('metadata'), dict) else '', + # apify/rag-web-browser nests url/title under "metadata"; older + # Actors and tests use top-level keys. Both are supported. + 'source': item.get('crawledUrl') or item.get('url') or item_metadata.get('url', ''), + 'title': item_metadata.get('title', ''), } docs.append(Document(page_content=page_content, metadata=metadata)) return docs diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py index 0106645..17dfba9 100644 --- a/tests/unit_tests/test_retrievers.py +++ b/tests/unit_tests/test_retrievers.py @@ -184,6 +184,21 @@ def test_items_to_documents_uses_url_fallback() -> None: assert docs[0].metadata['source'] == 'https://fallback.com' +def test_items_to_documents_uses_metadata_url_fallback() -> None: + """apify/rag-web-browser nests the page URL under metadata.url.""" + items = [ + { + 'metadata': {'url': 'https://nested.example.com', 'title': 'Nested'}, + 'text': 'content', + }, + ] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['source'] == 'https://nested.example.com' + assert docs[0].metadata['title'] == 'Nested' + + def test_items_to_documents_uses_markdown_fallback() -> None: items = [{'crawledUrl': 'https://example.com', 'markdown': '# MD content', 'metadata': {'title': 'T'}}] From 250e1ac81feeb1c654212317eeda9194fc50d073 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 5 May 2026 10:53:41 +0200 Subject: [PATCH 62/63] fix: rename actor search group --- langchain_apify/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 287e2ab..573365b 100644 --- a/langchain_apify/__init__.py +++ 
b/langchain_apify/__init__.py @@ -39,7 +39,7 @@ ApifyRunTaskAndGetDatasetTool, ] -APIFY_ACTOR_TOOLS: list[type[BaseTool]] = [ +APIFY_SEARCH_TOOLS: list[type[BaseTool]] = [ ApifyGoogleSearchTool, ApifyWebCrawlerTool, ] @@ -64,7 +64,7 @@ # Loaders 'ApifyCrawlLoader', # Tool group lists - 'APIFY_ACTOR_TOOLS', + 'APIFY_SEARCH_TOOLS', 'APIFY_CORE_TOOLS', # Meta '__version__', From f4cf20e7bad5ee627157cd8da95fa1f88c557e65 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 5 May 2026 10:55:16 +0200 Subject: [PATCH 63/63] fix: test fix --- tests/unit_tests/test_actor_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index e7fcf58..2b14809 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -7,7 +7,7 @@ from langchain_core.tools import ToolException from pydantic import SecretStr -from langchain_apify import APIFY_ACTOR_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify import APIFY_SEARCH_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool from langchain_apify._client import ApifyToolsClient from langchain_apify.tools import _ApifyGenericTool from tests.unit_tests.conftest import make_tool @@ -233,6 +233,6 @@ def test_actor_tools_have_correct_metadata() -> None: assert tool.handle_tool_error is True -def test_apify_actor_tools_list() -> None: - assert set(APIFY_ACTOR_TOOLS) == {ApifyGoogleSearchTool, ApifyWebCrawlerTool} - assert len(APIFY_ACTOR_TOOLS) == 2 +def test_apify_search_tools_list() -> None: + assert set(APIFY_SEARCH_TOOLS) == {ApifyGoogleSearchTool, ApifyWebCrawlerTool} + assert len(APIFY_SEARCH_TOOLS) == 2