From 8cad430f57ad5699d03ef038ba8418ea2bb7474f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 16:12:59 +0200 Subject: [PATCH 01/62] feat: implement apifyclient wrapper --- langchain_apify/_client.py | 205 +++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 langchain_apify/_client.py diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py new file mode 100644 index 0000000..6ab49be --- /dev/null +++ b/langchain_apify/_client.py @@ -0,0 +1,205 @@ +from __future__ import annotations + +import os + +from apify_client import ApifyClient + +from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify.utils import create_apify_client + +_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' + + +class ApifyToolsClient: + """Internal helper that wraps ``ApifyClient`` for the tools layer. + + One convenience method per tool operation. All methods are synchronous and + block until the Actor run finishes., + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Raises: + ValueError: If no token is provided and the env var is not set. + """ + + def __init__(self, apify_api_token: str | None = None) -> None: + token = apify_api_token or os.getenv('APIFY_API_TOKEN') + if not token: + msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = create_apify_client(ApifyClient, token) + + def run_actor( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + ) -> dict: + """Start an Actor and block until it finishes. + + Args: + actor_id: Actor ID or name (e.g. ``"apify/python-example"``). + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. 
+ + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + run = self._client.actor(actor_id).call(**call_kwargs) + self._check_run_status(run) + return run + + def get_dataset_items(self, dataset_id: str, limit: int = 100, offset: int = 0) -> list[dict]: + """Fetch items from an existing dataset. + + Args: + dataset_id: Apify dataset ID. + limit: Maximum number of items to return. + offset: Number of items to skip from the start. + + Returns: + List of dataset item dicts (may be empty). + """ + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + + def run_actor_and_get_items( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + ) -> tuple[dict, list[dict]]: + """Run an Actor, then fetch items from its default dataset. + + Args: + actor_id: Actor ID or name. + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. 
+ """ + run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId', '') + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + return run, items + + def run_task( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + ) -> dict: + """Start a saved Actor task and block until it finishes. + + Args: + task_id: Task ID or name (e.g. ``"user/my-task"``). + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + run = self._client.task(task_id).call(**call_kwargs) + self._check_run_status(run) + return run + + def run_task_and_get_items( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + ) -> tuple[dict, list[dict]]: + """Run a saved Actor task, then fetch items from its default dataset. + + Args: + task_id: Task ID or name. + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. 
+ """ + run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId', '') + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + return run, items + + def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + """Scrape a single URL and return its content as markdown. + + Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. + + Args: + url: The URL to scrape. + timeout_secs: Maximum time to wait for the crawl to finish. + + Returns: + Markdown (or plain-text fallback) content of the page. + + Raises: + RuntimeError: If the Actor run fails or no content is extracted. + """ + run_input = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': 1, + } + _, items = self.run_actor_and_get_items( + _SCRAPE_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + if not items: + msg = f'No content extracted from {url}.' + raise RuntimeError(msg) + + content = items[0].get('markdown') or items[0].get('text') or '' + if not content: + msg = f'No content extracted from {url}.' + raise RuntimeError(msg) + return content + + @staticmethod + def _check_run_status(run: dict) -> None: + """Raise if the run did not succeed.""" + status = run.get('status') + if status != 'SUCCEEDED': + run_id = run.get('id', 'unknown') + msg = f'Actor run {run_id} ended with status {status}.' 
+ raise RuntimeError(msg) From 2404b9cd73faaea8c3c904b4a34e58ee4cf96a17 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 16:45:59 +0200 Subject: [PATCH 02/62] feat: removed redundant const file --- langchain_apify/_client.py | 21 +++++++++++++-------- langchain_apify/const.py | 2 -- langchain_apify/tools.py | 7 +++---- langchain_apify/utils.py | 12 ++++++------ 4 files changed, 22 insertions(+), 20 deletions(-) delete mode 100644 langchain_apify/const.py diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 6ab49be..068835b 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -8,6 +8,11 @@ from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_DEFAULT_RUN_TIMEOUT_SECS = 300 +_DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_TASK_TIMEOUT_SECS = 300 +_DEFAULT_DATASET_ITEMS_LIMIT = 100 +_RUN_STATUS_SUCCEEDED = 'SUCCEEDED' class ApifyToolsClient: @@ -35,7 +40,7 @@ def run_actor( self, actor_id: str, run_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, ) -> dict: """Start an Actor and block until it finishes. @@ -77,9 +82,9 @@ def run_actor_and_get_items( self, actor_id: str, run_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: """Run an Actor, then fetch items from its default dataset. @@ -105,7 +110,7 @@ def run_task( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, ) -> dict: """Start a saved Actor task and block until it finishes. 
@@ -135,9 +140,9 @@ def run_task_and_get_items( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = 300, + timeout_secs: int = _DEFAULT_TASK_TIMEOUT_SECS, memory_mbytes: int | None = None, - dataset_items_limit: int = 100, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: """Run a saved Actor task, then fetch items from its default dataset. @@ -160,7 +165,7 @@ def run_task_and_get_items( items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items return run, items - def scrape_url(self, url: str, timeout_secs: int = 120) -> str: + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: """Scrape a single URL and return its content as markdown. Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. @@ -199,7 +204,7 @@ def scrape_url(self, url: str, timeout_secs: int = 120) -> str: def _check_run_status(run: dict) -> None: """Raise if the run did not succeed.""" status = run.get('status') - if status != 'SUCCEEDED': + if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') msg = f'Actor run {run_id} ended with status {status}.' 
raise RuntimeError(msg) diff --git a/langchain_apify/const.py b/langchain_apify/const.py deleted file mode 100644 index 87e0d0e..0000000 --- a/langchain_apify/const.py +++ /dev/null @@ -1,2 +0,0 @@ -REQUESTS_TIMEOUT_SECS: float = 10.0 -MAX_DESCRIPTION_LEN: int = 350 diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 135314a..2afa413 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -10,14 +10,13 @@ from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import ( + _MAX_DESCRIPTION_LEN, actor_id_to_tool_name, create_apify_client, get_actor_latest_build, prune_actor_input_schema, ) -from .const import MAX_DESCRIPTION_LEN - if TYPE_CHECKING: from langchain_core.callbacks import ( CallbackManagerForToolRun, @@ -128,8 +127,8 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: """ build = get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') - if len(actor_description) > MAX_DESCRIPTION_LEN: - actor_description = actor_description[:MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' + if len(actor_description) > _MAX_DESCRIPTION_LEN: + actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' return actor_description @staticmethod diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 8cdc835..4f2e74f 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -7,14 +7,14 @@ from apify_client import ApifyClientAsync from apify_client.client import ApifyClient -from langchain_apify.const import MAX_DESCRIPTION_LEN, REQUESTS_TIMEOUT_SECS - -APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' +_MAX_DESCRIPTION_LEN: int = 350 +_REQUESTS_TIMEOUT_SECS: float = 10.0 +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' def prune_actor_input_schema( 
input_schema: dict, - max_description_len: int = MAX_DESCRIPTION_LEN, + max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. @@ -117,8 +117,8 @@ def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: msg = f'Failed to get the Actor object ID for {actor_id}.' raise ValueError(msg) - url = APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) - response = requests.request('GET', url, timeout=REQUESTS_TIMEOUT_SECS) + url = _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) + response = requests.request('GET', url, timeout=_REQUESTS_TIMEOUT_SECS) build = response.json() if not isinstance(build, dict): From b1a89a455602d3cd5941c5f5e1f05695899f5cd0 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 17:59:49 +0200 Subject: [PATCH 03/62] feat: add few more input schemas, helpers and tool classes --- langchain_apify/_client.py | 10 +- .../{error_messages.py => _error_messages.py} | 4 + langchain_apify/tools.py | 261 +++++++++++++++++- langchain_apify/utils.py | 3 +- tests/integration_tests/test_utils.py | 2 +- 5 files changed, 270 insertions(+), 10 deletions(-) rename langchain_apify/{error_messages.py => _error_messages.py} (75%) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 068835b..181c6ec 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -4,7 +4,7 @@ from apify_client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' @@ -65,7 +65,7 @@ def run_actor( self._check_run_status(run) return run - def get_dataset_items(self, dataset_id: str, limit: int = 100, offset: int = 0) -> list[dict]: + def 
get_dataset_items(self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0) -> list[dict]: """Fetch items from an existing dataset. Args: @@ -191,12 +191,12 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) dataset_items_limit=1, ) if not items: - msg = f'No content extracted from {url}.' + msg = ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) content = items[0].get('markdown') or items[0].get('text') or '' if not content: - msg = f'No content extracted from {url}.' + msg = ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) return content @@ -206,5 +206,5 @@ def _check_run_status(run: dict) -> None: status = run.get('status') if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') - msg = f'Actor run {run_id} ended with status {status}.' + msg = ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) raise RuntimeError(msg) diff --git a/langchain_apify/error_messages.py b/langchain_apify/_error_messages.py similarity index 75% rename from langchain_apify/error_messages.py rename to langchain_apify/_error_messages.py index 87462b8..a87c9cb 100644 --- a/langchain_apify/error_messages.py +++ b/langchain_apify/_error_messages.py @@ -5,3 +5,7 @@ ' To pass it as environment variable, you can use the following command:' ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) + +ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' + +ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' 
diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 2afa413..40aeeee 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -5,10 +5,11 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient -from langchain_core.tools import BaseTool +from langchain_core.tools import BaseTool, ToolException from pydantic import BaseModel, Field, create_model -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import ( _MAX_DESCRIPTION_LEN, actor_id_to_tool_name, @@ -191,3 +192,259 @@ def _run_actor(self, run_input: dict) -> list[dict]: run = self._apify_client.run(run_id=run_id) return run.dataset().list_items(clean=True).items + + +# --------------------------------------------------------------------------- +# Input schemas for the generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorInput(BaseModel): + """Input schema for :class:`ApifyRunActorTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. 
"apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + + +class ApifyGetDatasetItemsInput(BaseModel): + """Input schema for :class:`ApifyGetDatasetItemsTool`.""" + + dataset_id: str = Field(description='Apify dataset ID.') + limit: int = Field(default=100, description='Maximum number of items to return.') + offset: int = Field(default=0, description='Number of items to skip from the start.') + + +class ApifyRunActorAndGetItemsInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetItemsTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +class ApifyScrapeUrlInput(BaseModel): + """Input schema for :class:`ApifyScrapeUrlTool`.""" + + url: str = Field(description='The URL to scrape.') + timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _iso(value: str | None) -> str | None: + """Pass through an ISO timestamp or *None*.""" + return value + + +def _run_meta(run: dict) -> dict: + """Extract a compact metadata dict from an Apify run-details 
dict.""" + return { + 'run_id': run.get('id'), + 'status': run.get('status'), + 'dataset_id': run.get('defaultDatasetId'), + 'started_at': _iso(run.get('startedAt')), + 'finished_at': _iso(run.get('finishedAt')), + } + + +# --------------------------------------------------------------------------- +# Generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorTool(BaseTool): + """Run any Apify Actor by ID with an arbitrary JSON input. + + Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON + string. Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve the + results from the dataset. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor' + description: str = ( + 'Run an Apify Actor synchronously and return run metadata' + ' (run_id, status, dataset_id, timestamps) as a JSON string.' + ) + args_schema: type[BaseModel] = ApifyRunActorInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyGetDatasetItemsTool(BaseTool): + """Fetch items from an existing Apify dataset by ID. 
+ + Returns items as a JSON string. When the dataset is empty the tool returns + an informative JSON message instead of raising an error. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyGetDatasetItemsTool + + tool = ApifyGetDatasetItemsTool() + result = tool.invoke({"dataset_id": "abc123", "limit": 10}) + """ + + name: str = 'apify_get_dataset_items' + description: str = 'Fetch items from an Apify dataset by ID. Returns a JSON array of items.' + args_schema: type[BaseModel] = ApifyGetDatasetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + dataset_id: str, + limit: int = 100, + offset: int = 0, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + items = self._client.get_dataset_items(dataset_id, limit, offset) + if not items: + return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) + return json.dumps(items) + + +class ApifyRunActorAndGetItemsTool(BaseTool): + """Run any Apify Actor and return both run metadata and dataset items. + + Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorAndGetItemsTool + + tool = ApifyRunActorAndGetItemsTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor_and_get_items' + description: str = ( + 'Run an Apify Actor synchronously and return both run metadata and' + ' dataset items as a JSON string with "run" and "items" keys.' + ) + args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_actor_and_get_items( + actor_id, run_input, timeout_secs, memory_mbytes, dataset_items_limit + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyScrapeUrlTool(BaseTool): + """Scrape a single URL and return its content as markdown. + + Uses the ``apify/website-content-crawler`` Actor under the hood with + ``maxCrawlPages=1``. Returns the page content as a plain markdown string + (not JSON). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyScrapeUrlTool + + tool = ApifyScrapeUrlTool() + markdown = tool.invoke({"url": "https://apify.com"}) + """ + + name: str = 'apify_scrape_url' + description: str = ( + 'Scrape a single URL using Apify and return its content as markdown text.' + ) + args_schema: type[BaseModel] = ApifyScrapeUrlInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + url: str, + timeout_secs: int = 120, + run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + return self._client.scrape_url(url, timeout_secs) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 4f2e74f..6b9f9fd 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -9,8 +9,7 @@ _MAX_DESCRIPTION_LEN: int = 350 _REQUESTS_TIMEOUT_SECS: float = 10.0 -_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' - +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' def prune_actor_input_schema( input_schema: dict, diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 1107c7a..554cc2d 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,7 +2,7 @@ from apify_client.client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify.utils import create_apify_client, get_actor_latest_build From 0aa917582970bba0d0e50fd88cc17b8606397cfc Mon Sep 
17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 18:28:13 +0200 Subject: [PATCH 04/62] feat: export new tools from __init__ --- langchain_apify/__init__.py | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 66142be..cfedc69 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,19 +1,50 @@ from importlib import metadata from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.tools import ApifyActorsTool +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) from langchain_apify.wrappers import ApifyWrapper try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: - # Case where package metadata is not available. __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) +# --------------------------------------------------------------------------- +# Convenience tool-class lists for selective agent binding. 
+# +# Binding all tools at once overwhelms the LLM context window; pick the +# group(s) relevant to your use case: +# +# from langchain_apify import CORE_TOOLS +# agent = create_react_agent(model, [t() for t in CORE_TOOLS]) +# --------------------------------------------------------------------------- + +CORE_TOOLS: list[type] = [ + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, +] + __all__ = [ + # Existing components (backward-compatible) 'ApifyActorsTool', 'ApifyDatasetLoader', 'ApifyWrapper', + # Core generic tools + 'ApifyGetDatasetItemsTool', + 'ApifyRunActorAndGetItemsTool', + 'ApifyRunActorTool', + 'ApifyScrapeUrlTool', + # Tool group lists + 'CORE_TOOLS', + # Meta '__version__', ] From 4e46d3684048e15a4867a87c457c58d9f5e4ad2a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 20 Apr 2026 18:35:25 +0200 Subject: [PATCH 05/62] feat: add unit tests --- tests/unit_tests/test_client.py | 232 ++++++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 tests/unit_tests/test_client.py diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py new file mode 100644 index 0000000..f30ed52 --- /dev/null +++ b/tests/unit_tests/test_client.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY + +_SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +_FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +_SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 
'https://example.com/2'}, +] + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client) as mock_create: + c = ApifyToolsClient(apify_api_token='my-token') + mock_create.assert_called_once() + assert c._client is mock_apify_client + + +def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') + with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyToolsClient() + + +# --------------------------------------------------------------------------- +# run_actor +# --------------------------------------------------------------------------- + + +def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + + result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) + + mock_apify_client.actor.assert_called_once_with('apify/test-actor') + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300 + ) + assert result == 
_SUCCEEDED_RUN + + +def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + + client.run_actor('apify/test-actor', memory_mbytes=512) + + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input=None, timeout_secs=300, memory_mbytes=512 + ) + + +def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_actor('apify/test-actor') + + +# --------------------------------------------------------------------------- +# get_dataset_items +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) + + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) + assert items == _SAMPLE_ITEMS + + +def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.get_dataset_items('dataset-empty') + assert items == [] + + +# --------------------------------------------------------------------------- +# run_actor_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + 
mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) + + assert run == _SUCCEEDED_RUN + assert items == _SAMPLE_ITEMS + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + + +# --------------------------------------------------------------------------- +# run_task +# --------------------------------------------------------------------------- + + +def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + + result = client.run_task('user/my-task', task_input={'key': 'val'}) + + mock_apify_client.task.assert_called_once_with('user/my-task') + mock_apify_client.task.return_value.call.assert_called_once_with( + task_input={'key': 'val'}, timeout_secs=300 + ) + assert result == _SUCCEEDED_RUN + + +def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_task('user/my-task') + + +# --------------------------------------------------------------------------- +# run_task_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + + run, items = client.run_task_and_get_items('user/my-task') + + assert run == _SUCCEEDED_RUN + assert items == _SAMPLE_ITEMS + + +# --------------------------------------------------------------------------- +# scrape_url +# --------------------------------------------------------------------------- + + +def 
test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == '# Hello' + + +def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'text': 'Plain text content', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == 'Plain text content' + + +def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '', 'text': '', 'url': 'https://example.com'}, + ] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +# --------------------------------------------------------------------------- +# _check_run_status +# --------------------------------------------------------------------------- + + +def test_check_run_status_succeeded() -> None: + ApifyToolsClient._check_run_status({'id': 'run-ok', 'status': 'SUCCEEDED'}) + + +def test_check_run_status_failed() -> 
None: + expected_msg = ERROR_ACTOR_RUN_FAILED.format(run_id='run-bad', status='FAILED') + with pytest.raises(RuntimeError, match='run-bad'): + ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) From fc6ef1286297c1f8581b15fe475b150ee1fa6b58 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 09:45:06 +0200 Subject: [PATCH 06/62] feat: implement tests and introduce tools list --- langchain_apify/__init__.py | 8 +- tests/unit_tests/test_tools.py | 186 ++++++++++++++++++++++++++++++++- 2 files changed, 188 insertions(+), 6 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index cfedc69..21e5776 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -22,11 +22,11 @@ # Binding all tools at once overwhelms the LLM context window; pick the # group(s) relevant to your use case: # -# from langchain_apify import CORE_TOOLS -# agent = create_react_agent(model, [t() for t in CORE_TOOLS]) +# from langchain_apify import APIFY_CORE_TOOLS +# agent = create_react_agent(model, [t() for t in APIFY_CORE_TOOLS]) # --------------------------------------------------------------------------- -CORE_TOOLS: list[type] = [ +APIFY_CORE_TOOLS: list[type] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, @@ -44,7 +44,7 @@ 'ApifyRunActorTool', 'ApifyScrapeUrlTool', # Tool group lists - 'CORE_TOOLS', + 'APIFY_CORE_TOOLS', # Meta '__version__', ] diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index b10df2f..f17572f 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,12 +1,21 @@ from __future__ import annotations +import json from typing import TYPE_CHECKING -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from langchain_core.tools import ToolException from pydantic import BaseModel -from langchain_apify.tools import ApifyActorsTool +from langchain_apify._client import 
ApifyToolsClient +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) from langchain_apify.utils import actor_id_to_tool_name if TYPE_CHECKING: @@ -85,3 +94,176 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id='apify/python-example', apify_api_token='dummy-token') yield tool + + +# --------------------------------------------------------------------------- +# Shared test data for generic tools +# --------------------------------------------------------------------------- + +_SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +_SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +def _make_tool(tool_cls: type, mock_client: MagicMock) -> ApifyRunActorTool | ApifyGetDatasetItemsTool | ApifyRunActorAndGetItemsTool | ApifyScrapeUrlTool: + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token') + tool._client = mock_client + return tool + + +# --------------------------------------------------------------------------- +# ApifyRunActorTool +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = _SUCCEEDED_RUN + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 
'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_actor.assert_called_once_with('apify/test', {'key': 'val'}, 300, None) + + +def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(actor_id='apify/test') + + +def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorTool() + + +# --------------------------------------------------------------------------- +# ApifyGetDatasetItemsTool +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_tool_returns_json_array(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0]['text'] == 'item-1' + mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) + + +def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = [] + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-empty') + + parsed = json.loads(result) + assert parsed['items'] == [] + assert 'empty' in parsed['message'].lower() + + +# --------------------------------------------------------------------------- +# 
ApifyRunActorAndGetItemsTool +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) + tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_actor_and_get_items.assert_called_once_with( + 'apify/test', {'q': '1'}, 300, None, 50 + ) + + +def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(actor_id='apify/test') + + +# --------------------------------------------------------------------------- +# ApifyScrapeUrlTool +# --------------------------------------------------------------------------- + + +def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# Hello World' + tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert result == '# Hello World' + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 120) + + +def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') + tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + + with pytest.raises(ToolException, match='No 
content extracted'): + tool._run(url='https://example.com') + + +# --------------------------------------------------------------------------- +# Tool metadata assertions +# --------------------------------------------------------------------------- + + +def test_generic_tools_have_correct_metadata() -> None: + """Verify name, description, and args_schema are set on all 4 tools.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyRunActorTool(apify_api_token='dummy'), + ApifyGetDatasetItemsTool(apify_api_token='dummy'), + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), + ApifyScrapeUrlTool(apify_api_token='dummy'), + ] + + expected_names = [ + 'apify_run_actor', + 'apify_get_dataset_items', + 'apify_run_actor_and_get_items', + 'apify_scrape_url', + ] + + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True From cc5be9e887edc95719742b682e463e98b3d0ca36 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:01:27 +0200 Subject: [PATCH 07/62] fix: lint fix --- langchain_apify/__init__.py | 12 ++++-------- langchain_apify/_client.py | 16 ++++++++++++++-- langchain_apify/tools.py | 20 +++++++++----------- langchain_apify/utils.py | 1 + tests/unit_tests/test_client.py | 10 ++-------- tests/unit_tests/test_tools.py | 12 ++++++------ 6 files changed, 36 insertions(+), 35 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 21e5776..1b65eef 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from importlib import metadata from langchain_apify.document_loaders import ApifyDatasetLoader @@ -16,15 +18,9 @@ __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) -# 
--------------------------------------------------------------------------- # Convenience tool-class lists for selective agent binding. -# -# Binding all tools at once overwhelms the LLM context window; pick the -# group(s) relevant to your use case: -# -# from langchain_apify import APIFY_CORE_TOOLS -# agent = create_react_agent(model, [t() for t in APIFY_CORE_TOOLS]) -# --------------------------------------------------------------------------- +# Binding all tools at once overwhelms the LLM context window; +# pick the group(s) relevant to your use case. APIFY_CORE_TOOLS: list[type] = [ ApifyRunActorTool, diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 181c6ec..bf3e01a 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -4,7 +4,11 @@ from apify_client import ApifyClient -from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY +from langchain_apify._error_messages import ( + ERROR_ACTOR_RUN_FAILED, + ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + ERROR_SCRAPE_EMPTY, +) from langchain_apify.utils import create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' @@ -62,10 +66,15 @@ def run_actor( call_kwargs['memory_mbytes'] = memory_mbytes run = self._client.actor(actor_id).call(**call_kwargs) + if run is None: + msg = f'Actor {actor_id} call returned no run details.' + raise RuntimeError(msg) self._check_run_status(run) return run - def get_dataset_items(self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0) -> list[dict]: + def get_dataset_items( + self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0 + ) -> list[dict]: """Fetch items from an existing dataset. Args: @@ -133,6 +142,9 @@ def run_task( call_kwargs['memory_mbytes'] = memory_mbytes run = self._client.task(task_id).call(**call_kwargs) + if run is None: + msg = f'Task {task_id} call returned no run details.' 
+ raise RuntimeError(msg) self._check_run_status(run) return run diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 40aeeee..a751dad 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -259,7 +259,7 @@ def _run_meta(run: dict) -> dict: # --------------------------------------------------------------------------- -class ApifyRunActorTool(BaseTool): +class ApifyRunActorTool(BaseTool): # type: ignore[override] """Run any Apify Actor by ID with an arbitrary JSON input. Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -301,7 +301,7 @@ def _run( run_input: dict | None = None, timeout_secs: int = 300, memory_mbytes: int | None = None, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) @@ -310,7 +310,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(BaseTool): +class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] """Fetch items from an existing Apify dataset by ID. Returns items as a JSON string. When the dataset is empty the tool returns @@ -344,7 +344,7 @@ def _run( dataset_id: str, limit: int = 100, offset: int = 0, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: items = self._client.get_dataset_items(dataset_id, limit, offset) if not items: @@ -352,7 +352,7 @@ def _run( return json.dumps(items) -class ApifyRunActorAndGetItemsTool(BaseTool): +class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. 
Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -395,7 +395,7 @@ def _run( timeout_secs: int = 300, memory_mbytes: int | None = None, dataset_items_limit: int = 100, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: run, items = self._client.run_actor_and_get_items( @@ -406,7 +406,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(BaseTool): +class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -426,9 +426,7 @@ class ApifyScrapeUrlTool(BaseTool): """ name: str = 'apify_scrape_url' - description: str = ( - 'Scrape a single URL using Apify and return its content as markdown text.' - ) + description: str = 'Scrape a single URL using Apify and return its content as markdown text.' args_schema: type[BaseModel] = ApifyScrapeUrlInput handle_tool_error: bool = True @@ -442,7 +440,7 @@ def _run( self, url: str, timeout_secs: int = 120, - run_manager: CallbackManagerForToolRun | None = None, + _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: return self._client.scrape_url(url, timeout_secs) diff --git a/langchain_apify/utils.py b/langchain_apify/utils.py index 6b9f9fd..d3a627f 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/utils.py @@ -11,6 +11,7 @@ _REQUESTS_TIMEOUT_SECS: float = 10.0 _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' + def prune_actor_input_schema( input_schema: dict, max_description_len: int = _MAX_DESCRIPTION_LEN, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index f30ed52..89862b1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -5,7 +5,6 @@ import pytest from langchain_apify._client import ApifyToolsClient 
-from langchain_apify._error_messages import ERROR_ACTOR_RUN_FAILED, ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, ERROR_SCRAPE_EMPTY _SUCCEEDED_RUN: dict = { 'id': 'run-abc', @@ -74,9 +73,7 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with( - run_input={'key': 'val'}, timeout_secs=300 - ) + mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300) assert result == _SUCCEEDED_RUN @@ -146,9 +143,7 @@ def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock result = client.run_task('user/my-task', task_input={'key': 'val'}) mock_apify_client.task.assert_called_once_with('user/my-task') - mock_apify_client.task.return_value.call.assert_called_once_with( - task_input={'key': 'val'}, timeout_secs=300 - ) + mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) assert result == _SUCCEEDED_RUN @@ -227,6 +222,5 @@ def test_check_run_status_succeeded() -> None: def test_check_run_status_failed() -> None: - expected_msg = ERROR_ACTOR_RUN_FAILED.format(run_id='run-bad', status='FAILED') with pytest.raises(RuntimeError, match='run-bad'): ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index f17572f..af43843 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,7 +1,7 @@ from __future__ import annotations import json -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any from unittest.mock import MagicMock, patch import pytest @@ -119,7 +119,7 @@ def mock_tools_client() -> MagicMock: return MagicMock(spec=ApifyToolsClient) -def _make_tool(tool_cls: type, mock_client: 
MagicMock) -> ApifyRunActorTool | ApifyGetDatasetItemsTool | ApifyRunActorAndGetItemsTool | ApifyScrapeUrlTool: +def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 """Instantiate a generic tool with a mocked ApifyToolsClient.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tool = tool_cls(apify_api_token='dummy-token') @@ -204,13 +204,13 @@ def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) assert parsed['run']['run_id'] == 'run-abc' assert parsed['run']['status'] == 'SUCCEEDED' assert len(parsed['items']) == 2 - mock_tools_client.run_actor_and_get_items.assert_called_once_with( - 'apify/test', {'q': '1'}, 300, None, 50 - ) + mock_tools_client.run_actor_and_get_items.assert_called_once_with('apify/test', {'q': '1'}, 300, None, 50) def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' 
+ ) tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): From c2b9cb6c68a862fa9f602d9e669988d19611238d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:36:30 +0200 Subject: [PATCH 08/62] feat: enhance error handling and documentation for apify tools --- langchain_apify/_client.py | 30 +++++++++++++--- langchain_apify/tools.py | 63 ++++++++++++++++++++++++++++++---- tests/unit_tests/test_tools.py | 35 +++++++++++++++++++ 3 files changed, 117 insertions(+), 11 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index bf3e01a..8434428 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -65,7 +65,11 @@ def run_actor( if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes - run = self._client.actor(actor_id).call(**call_kwargs) + try: + run = self._client.actor(actor_id).call(**call_kwargs) + except Exception as exc: + msg = f'Network error calling Actor {actor_id}: {exc}' + raise RuntimeError(msg) from exc if run is None: msg = f'Actor {actor_id} call returned no run details.' raise RuntimeError(msg) @@ -85,7 +89,11 @@ def get_dataset_items( Returns: List of dataset item dicts (may be empty). 
""" - return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + try: + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc def run_actor_and_get_items( self, @@ -112,7 +120,11 @@ def run_actor_and_get_items( """ run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + try: + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc return run, items def run_task( @@ -141,7 +153,11 @@ def run_task( if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes - run = self._client.task(task_id).call(**call_kwargs) + try: + run = self._client.task(task_id).call(**call_kwargs) + except Exception as exc: + msg = f'Network error calling task {task_id}: {exc}' + raise RuntimeError(msg) from exc if run is None: msg = f'Task {task_id} call returned no run details.' 
raise RuntimeError(msg) @@ -174,7 +190,11 @@ def run_task_and_get_items( """ run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + try: + items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc return run, items def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index a751dad..9b433f3 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -266,6 +266,14 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] string. Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve the results from the dataset. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + Example: .. code-block:: python @@ -283,8 +291,12 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] name: str = 'apify_run_actor' description: str = ( - 'Run an Apify Actor synchronously and return run metadata' - ' (run_id, status, dataset_id, timestamps) as a JSON string.' + 'Run an Apify Actor synchronously and return run metadata as a JSON string.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' 
) args_schema: type[BaseModel] = ApifyRunActorInput handle_tool_error: bool = True @@ -316,6 +328,14 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] Returns items as a JSON string. When the dataset is empty the tool returns an informative JSON message instead of raising an error. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON array of item dicts, or ``{"items": [], "message": "..."}`` when + the dataset is empty. + Example: .. code-block:: python @@ -329,7 +349,12 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] """ name: str = 'apify_get_dataset_items' - description: str = 'Fetch items from an Apify dataset by ID. Returns a JSON array of items.' + description: str = ( + 'Fetch items from an Apify dataset by ID. Returns a JSON array of item dicts.' + ' Required: dataset_id (str) — Apify dataset ID.' + ' Optional: limit (int, default 100), offset (int, default 0).' + ' Returns an empty JSON object with a message when the dataset is empty.' + ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput handle_tool_error: bool = True @@ -359,6 +384,15 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] into a single call. Returns a JSON string with ``run`` (metadata) and ``items`` (list of dicts) keys. + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + Example: .. code-block:: python @@ -376,8 +410,12 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] name: str = 'apify_run_actor_and_get_items' description: str = ( - 'Run an Apify Actor synchronously and return both run metadata and' - ' dataset items as a JSON string with "run" and "items" keys.' 
+ 'Run an Apify Actor synchronously and return both run metadata and dataset items.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' ) args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput handle_tool_error: bool = True @@ -413,6 +451,14 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] ``maxCrawlPages=1``. Returns the page content as a plain markdown string (not JSON). + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + Markdown string with the full text content of the scraped page, or a + plain-text fallback when markdown is unavailable. + Example: .. code-block:: python @@ -426,7 +472,12 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] """ name: str = 'apify_scrape_url' - description: str = 'Scrape a single URL using Apify and return its content as markdown text.' + description: str = ( + 'Scrape a single URL using Apify and return its full content as a markdown string.' + ' Required: url (str) — the URL to scrape.' + ' Optional: timeout_secs (int, default 120).' + ' Returns the page content as markdown (or plain text if markdown is unavailable).' 
+ ) args_schema: type[BaseModel] = ApifyScrapeUrlInput handle_tool_error: bool = True diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index af43843..91c53a0 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -8,6 +8,7 @@ from langchain_core.tools import ToolException from pydantic import BaseModel +from langchain_apify import APIFY_CORE_TOOLS from langchain_apify._client import ApifyToolsClient from langchain_apify.tools import ( ApifyActorsTool, @@ -189,6 +190,12 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo assert 'empty' in parsed['message'].lower() +def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyGetDatasetItemsTool() + + # --------------------------------------------------------------------------- # ApifyRunActorAndGetItemsTool # --------------------------------------------------------------------------- @@ -217,6 +224,12 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c tool._run(actor_id='apify/test') +def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorAndGetItemsTool() + + # --------------------------------------------------------------------------- # ApifyScrapeUrlTool # --------------------------------------------------------------------------- @@ -240,6 +253,12 @@ def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMoc tool._run(url='https://example.com') +def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyScrapeUrlTool() + 
+ # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- @@ -267,3 +286,19 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.description assert tool.args_schema is not None assert tool.handle_tool_error is True + + +# --------------------------------------------------------------------------- +# APIFY_CORE_TOOLS list +# --------------------------------------------------------------------------- + + +def test_apify_core_tools_contains_all_four_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 4 generic tool classes.""" + assert set(APIFY_CORE_TOOLS) == { + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, + } + assert len(APIFY_CORE_TOOLS) == 4 From 3edf1265fcbc368494a734a910bfcc9015324d94 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 10:58:37 +0200 Subject: [PATCH 09/62] fix: iso format fix --- langchain_apify/tools.py | 4 +- tests/unit_tests/test_tools.py | 73 ++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 9b433f3..7e1f11d 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -3,6 +3,7 @@ import json import os from typing import TYPE_CHECKING, Any +from datetime import datetime from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException @@ -239,7 +240,8 @@ class ApifyScrapeUrlInput(BaseModel): def _iso(value: str | None) -> str | None: - """Pass through an ISO timestamp or *None*.""" + if isinstance(value, datetime): + return value.isoformat() return value diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 91c53a0..5afb962 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,6 +1,7 @@ from __future__ import 
annotations import json +from datetime import datetime, timezone from typing import TYPE_CHECKING, Any from unittest.mock import MagicMock, patch @@ -16,6 +17,8 @@ ApifyRunActorAndGetItemsTool, ApifyRunActorTool, ApifyScrapeUrlTool, + _iso, + _run_meta, ) from langchain_apify.utils import actor_id_to_tool_name @@ -128,6 +131,76 @@ def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 return tool +# --------------------------------------------------------------------------- +# _iso / _run_meta helpers +# --------------------------------------------------------------------------- + + +def test_iso_converts_datetime_to_string() -> None: + dt = datetime(2025, 6, 15, 12, 30, 45, tzinfo=timezone.utc) + assert _iso(dt) == '2025-06-15T12:30:45+00:00' + + +def test_iso_passes_through_string() -> None: + assert _iso('2025-01-01T00:00:00.000Z') == '2025-01-01T00:00:00.000Z' + + +def test_iso_passes_through_none() -> None: + assert _iso(None) is None + + +def test_run_meta_with_datetime_values_is_json_serializable() -> None: + run = { + 'id': 'run-dt', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-dt', + 'startedAt': datetime(2025, 3, 1, 10, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 3, 1, 10, 1, 0, tzinfo=timezone.utc), + } + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['run_id'] == 'run-dt' + assert parsed['started_at'] == '2025-03-01T10:00:00+00:00' + assert parsed['finished_at'] == '2025-03-01T10:01:00+00:00' + + +def test_run_meta_with_string_values_is_json_serializable() -> None: + meta = _run_meta(_SUCCEEDED_RUN) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + + +def test_run_meta_with_missing_timestamps() -> None: + run = {'id': 'run-none', 'status': 'RUNNING', 'defaultDatasetId': 'ds-none'} + meta = _run_meta(run) + serialized 
= json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] is None + assert parsed['finished_at'] is None + + +def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: + """End-to-end: ApifyRunActorTool returns valid JSON when the client returns datetime objects.""" + mock_tools_client.run_actor.return_value = { + 'id': 'run-real', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-real', + 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), + } + tool = _make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test') + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-real' + assert parsed['started_at'] == '2025-06-01T08:00:00+00:00' + assert parsed['finished_at'] == '2025-06-01T08:05:00+00:00' + + # --------------------------------------------------------------------------- # ApifyRunActorTool # --------------------------------------------------------------------------- From 8c36edc824f9e866d7c463d539a5faa401a4f299 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 12:58:32 +0200 Subject: [PATCH 10/62] feat: add apify run task and apify run task and get items tools with input schemas --- langchain_apify/__init__.py | 6 ++ langchain_apify/_client.py | 2 +- langchain_apify/tools.py | 150 ++++++++++++++++++++++++++++++++ tests/unit_tests/test_client.py | 4 +- tests/unit_tests/test_tools.py | 84 +++++++++++++++++- 5 files changed, 239 insertions(+), 7 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 1b65eef..fa1f369 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -8,6 +8,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, ApifyScrapeUrlTool, ) from langchain_apify.wrappers import ApifyWrapper @@ -27,6 +29,8 @@ ApifyGetDatasetItemsTool, 
ApifyRunActorAndGetItemsTool, ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, ] __all__ = [ @@ -38,6 +42,8 @@ 'ApifyGetDatasetItemsTool', 'ApifyRunActorAndGetItemsTool', 'ApifyRunActorTool', + 'ApifyRunTaskAndGetItemsTool', + 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', # Tool group lists 'APIFY_CORE_TOOLS', diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 8434428..c3ed22e 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -61,7 +61,7 @@ def run_actor( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs} + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None} if memory_mbytes is not None: call_kwargs['memory_mbytes'] = memory_mbytes diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 7e1f11d..421aefd 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -234,6 +234,25 @@ class ApifyScrapeUrlInput(BaseModel): timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') +class ApifyRunTaskInput(BaseModel): + """Input schema for :class:`ApifyRunTaskTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + + +class ApifyRunTaskAndGetItemsInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. 
"user/my-task").') + task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- @@ -499,3 +518,134 @@ def _run( return self._client.scrape_url(url, timeout_secs) except RuntimeError as exc: raise ToolException(str(exc)) from exc + + +class ApifyRunTaskTool(BaseTool): # type: ignore[override] + """Run a saved Apify Actor task by ID and return run metadata. + + Actor tasks are pre-configured Actor runs saved in the Apify Console. + This tool starts a task with optional input overrides and returns run + metadata (run ID, status, dataset ID, timestamps) as a JSON string. + Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve results. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunTaskTool + + tool = ApifyRunTaskTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task' + description: str = ( + 'Run a saved Apify Actor task synchronously and return run metadata as a JSON string.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' 
+ ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' + ) + args_schema: type[BaseModel] = ApifyRunTaskInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_task(task_id, task_input, timeout_secs, memory_mbytes) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] + """Run a saved Apify Actor task and return both run metadata and dataset items. + + Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunTaskAndGetItemsTool + + tool = ApifyRunTaskAndGetItemsTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task_and_get_items' + description: str = ( + 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' + ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' + ) + args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_task_and_get_items( + task_id, task_input, timeout_secs, memory_mbytes, dataset_items_limit + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 89862b1..95193c4 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -73,7 +73,7 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) 
mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300) + mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) assert result == _SUCCEEDED_RUN @@ -83,7 +83,7 @@ def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: Magi client.run_actor('apify/test-actor', memory_mbytes=512) mock_apify_client.actor.return_value.call.assert_called_once_with( - run_input=None, timeout_secs=300, memory_mbytes=512 + run_input=None, timeout_secs=300, logger=None, memory_mbytes=512 ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 5afb962..1d61eb2 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -16,6 +16,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, ApifyScrapeUrlTool, _iso, _run_meta, @@ -332,19 +334,89 @@ def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: ApifyScrapeUrlTool() +# --------------------------------------------------------------------------- +# ApifyRunTaskTool +# --------------------------------------------------------------------------- + + +def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = _SUCCEEDED_RUN + tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_task.assert_called_once_with('user/my-task', {'key': 'val'}, 300, None) + + 
+def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(task_id='user/my-task') + + +def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskTool() + + +# --------------------------------------------------------------------------- +# ApifyRunTaskAndGetItemsTool +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) + tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_task_and_get_items.assert_called_once_with('user/my-task', {'q': '1'}, 300, None, 50) + + +def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' 
+ ) + tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(task_id='user/my-task') + + +def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskAndGetItemsTool() + + # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- def test_generic_tools_have_correct_metadata() -> None: - """Verify name, description, and args_schema are set on all 4 tools.""" + """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ ApifyRunActorTool(apify_api_token='dummy'), ApifyGetDatasetItemsTool(apify_api_token='dummy'), ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), ApifyScrapeUrlTool(apify_api_token='dummy'), + ApifyRunTaskTool(apify_api_token='dummy'), + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), ] expected_names = [ @@ -352,6 +424,8 @@ def test_generic_tools_have_correct_metadata() -> None: 'apify_get_dataset_items', 'apify_run_actor_and_get_items', 'apify_scrape_url', + 'apify_run_task', + 'apify_run_task_and_get_items', ] for tool, expected_name in zip(tools, expected_names): @@ -366,12 +440,14 @@ def test_generic_tools_have_correct_metadata() -> None: # --------------------------------------------------------------------------- -def test_apify_core_tools_contains_all_four_classes() -> None: - """APIFY_CORE_TOOLS must list exactly the 4 generic tool classes.""" +def test_apify_core_tools_contains_all_generic_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 6 generic tool classes.""" assert set(APIFY_CORE_TOOLS) == { ApifyRunActorTool, ApifyGetDatasetItemsTool, 
ApifyRunActorAndGetItemsTool, ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, } - assert len(APIFY_CORE_TOOLS) == 4 + assert len(APIFY_CORE_TOOLS) == 6 From 026175a49471d1bfa826ad5655dc6ec31696e47b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 13:50:44 +0200 Subject: [PATCH 11/62] feat: introduce _ApifyGenericTool base class for Apify tools to streamline client handling and error management --- langchain_apify/tools.py | 76 +++++++++++++--------------------- tests/unit_tests/test_tools.py | 24 +++++++++++ 2 files changed, 52 insertions(+), 48 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 421aefd..93d884a 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -275,12 +275,34 @@ def _run_meta(run: dict) -> dict: } +# --------------------------------------------------------------------------- +# Shared base for generic tools +# --------------------------------------------------------------------------- + + +class _ApifyGenericTool(BaseTool): # type: ignore[override] + """Shared base for all generic Apify tools. + + Handles ``ApifyToolsClient`` creation and sets ``handle_tool_error``. + Subclasses only need to declare ``name``, ``description``, + ``args_schema``, and ``_run()``. + """ + + handle_tool_error: bool = True + + _client: ApifyToolsClient + + def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 + super().__init__(**kwargs) + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + # --------------------------------------------------------------------------- # Generic tools # --------------------------------------------------------------------------- -class ApifyRunActorTool(BaseTool): # type: ignore[override] +class ApifyRunActorTool(_ApifyGenericTool): """Run any Apify Actor by ID with an arbitrary JSON input. 
Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -320,13 +342,6 @@ class ApifyRunActorTool(BaseTool): # type: ignore[override] ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' ) args_schema: type[BaseModel] = ApifyRunActorInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -343,7 +358,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] +class ApifyGetDatasetItemsTool(_ApifyGenericTool): """Fetch items from an existing Apify dataset by ID. Returns items as a JSON string. When the dataset is empty the tool returns @@ -377,13 +392,6 @@ class ApifyGetDatasetItemsTool(BaseTool): # type: ignore[override] ' Returns an empty JSON object with a message when the dataset is empty.' ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -398,7 +406,7 @@ def _run( return json.dumps(items) -class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] +class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -439,13 +447,6 @@ class ApifyRunActorAndGetItemsTool(BaseTool): # type: ignore[override] ' and items (list of dataset item dicts).' 
) args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -465,7 +466,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] +class ApifyScrapeUrlTool(_ApifyGenericTool): """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -500,13 +501,6 @@ class ApifyScrapeUrlTool(BaseTool): # type: ignore[override] ' Returns the page content as markdown (or plain text if markdown is unavailable).' ) args_schema: type[BaseModel] = ApifyScrapeUrlInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -520,7 +514,7 @@ def _run( raise ToolException(str(exc)) from exc -class ApifyRunTaskTool(BaseTool): # type: ignore[override] +class ApifyRunTaskTool(_ApifyGenericTool): """Run a saved Apify Actor task by ID and return run metadata. Actor tasks are pre-configured Actor runs saved in the Apify Console. @@ -561,13 +555,6 @@ class ApifyRunTaskTool(BaseTool): # type: ignore[override] ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' 
) args_schema: type[BaseModel] = ApifyRunTaskInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, @@ -584,7 +571,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] +class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): """Run a saved Apify Actor task and return both run metadata and dataset items. Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` @@ -625,13 +612,6 @@ class ApifyRunTaskAndGetItemsTool(BaseTool): # type: ignore[override] ' and items (list of dataset item dicts).' ) args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput - handle_tool_error: bool = True - - _client: ApifyToolsClient - - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) def _run( self, diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 1d61eb2..025b486 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -19,6 +19,7 @@ ApifyRunTaskAndGetItemsTool, ApifyRunTaskTool, ApifyScrapeUrlTool, + _ApifyGenericTool, _iso, _run_meta, ) @@ -435,6 +436,29 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.handle_tool_error is True +# --------------------------------------------------------------------------- +# _ApifyGenericTool inheritance +# --------------------------------------------------------------------------- + + +def test_all_generic_tools_inherit_from_base() -> None: + """Every generic tool must be a subclass of _ApifyGenericTool.""" + for tool_cls in ( + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyScrapeUrlTool, 
+ ApifyRunTaskTool, + ApifyRunTaskAndGetItemsTool, + ): + assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_legacy_tool_does_not_inherit_from_generic_base() -> None: + """ApifyActorsTool is legacy and must NOT inherit from _ApifyGenericTool.""" + assert not issubclass(ApifyActorsTool, _ApifyGenericTool) + + # --------------------------------------------------------------------------- # APIFY_CORE_TOOLS list # --------------------------------------------------------------------------- From 110c971c539509827b426e5bcb60c43b72d0d935 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 14:19:14 +0200 Subject: [PATCH 12/62] feat: add _actor_tools.py file to define upcoming search and social media tools for apify integration --- langchain_apify/_actor_tools.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 langchain_apify/_actor_tools.py diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py new file mode 100644 index 0000000..d7bd850 --- /dev/null +++ b/langchain_apify/_actor_tools.py @@ -0,0 +1,23 @@ +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Any + +from langchain_core.tools import ToolException +from pydantic import BaseModel, Field + +from langchain_apify._client import ApifyToolsClient +from langchain_apify.tools import _ApifyGenericTool, _run_meta + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + + +# --------------------------------------------------------------------------- +# Search & Crawling tools +# --------------------------------------------------------------------------- + + +# --------------------------------------------------------------------------- +# Social-media tools +# --------------------------------------------------------------------------- \ No newline at end of file From a08f63ec458179798ca337d1a380cea332b629ef Mon Sep 17 00:00:00 
2001 From: David Omrai Date: Tue, 21 Apr 2026 14:46:35 +0200 Subject: [PATCH 13/62] fix: add try/except to match others --- langchain_apify/tools.py | 5 ++++- tests/unit_tests/test_tools.py | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 93d884a..8315bdc 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -400,7 +400,10 @@ def _run( offset: int = 0, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: - items = self._client.get_dataset_items(dataset_id, limit, offset) + try: + items = self._client.get_dataset_items(dataset_id, limit, offset) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc if not items: return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) return json.dumps(items) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 025b486..331054d 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -266,6 +266,14 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo assert 'empty' in parsed['message'].lower() +def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') + tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='Network error fetching dataset'): + tool._run(dataset_id='ds-bad') + + def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): From d028531588602a1cf1249803b18b41c8f13b3b6a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 14:57:32 +0200 Subject: [PATCH 14/62] fix: update timeout constants and improve input schema 
description in Apify tools --- langchain_apify/_client.py | 7 ++++--- langchain_apify/tools.py | 21 +++++++++++++++------ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index c3ed22e..0409fe0 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -14,7 +14,6 @@ _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 -_DEFAULT_TASK_TIMEOUT_SECS = 300 _DEFAULT_DATASET_ITEMS_LIMIT = 100 _RUN_STATUS_SUCCEEDED = 'SUCCEEDED' @@ -23,7 +22,7 @@ class ApifyToolsClient: """Internal helper that wraps ``ApifyClient`` for the tools layer. One convenience method per tool operation. All methods are synchronous and - block until the Actor run finishes., + block until the Actor run finishes. Args: apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` @@ -118,6 +117,7 @@ def run_actor_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ + # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') try: @@ -168,7 +168,7 @@ def run_task_and_get_items( self, task_id: str, task_input: dict | None = None, - timeout_secs: int = _DEFAULT_TASK_TIMEOUT_SECS, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, memory_mbytes: int | None = None, dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, ) -> tuple[dict, list[dict]]: @@ -188,6 +188,7 @@ def run_task_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ + # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. 
run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId', '') try: diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 8315bdc..b870f0f 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -2,8 +2,8 @@ import json import os -from typing import TYPE_CHECKING, Any from datetime import datetime +from typing import TYPE_CHECKING, Any from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException @@ -238,18 +238,26 @@ class ApifyRunTaskInput(BaseModel): """Input schema for :class:`ApifyRunTaskTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') - task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') - memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) class ApifyRunTaskAndGetItemsInput(BaseModel): """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') - task_input: dict | None = Field(default=None, description='JSON-serialisable input that overrides the task\'s pre-saved input.') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." 
+ ) timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') - memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for task default.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') @@ -258,7 +266,7 @@ class ApifyRunTaskAndGetItemsInput(BaseModel): # --------------------------------------------------------------------------- -def _iso(value: str | None) -> str | None: +def _iso(value: str | datetime | None) -> str | None: if isinstance(value, datetime): return value.isoformat() return value @@ -294,6 +302,7 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) + # Token validation (missing env var, empty string) is handled inside ApifyToolsClient.__init__. self._client = ApifyToolsClient(apify_api_token=apify_api_token) From 429a3ed6027b2e79d5b123d43dbdbf5ec3a621d6 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:00:47 +0200 Subject: [PATCH 15/62] fix: enhance error handling for missing dataset id in run_actor and run_task methods --- langchain_apify/_client.py | 10 ++++++++-- tests/unit_tests/test_client.py | 16 ++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 0409fe0..b131484 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -119,7 +119,10 @@ def run_actor_and_get_items( """ # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. 
run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) - dataset_id = run.get('defaultDatasetId', '') + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) try: items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items except Exception as exc: @@ -190,7 +193,10 @@ def run_task_and_get_items( """ # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) - dataset_id = run.get('defaultDatasetId', '') + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Task {task_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) try: items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items except Exception as exc: diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 95193c4..5485d8c 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -132,6 +132,14 @@ def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_cl mock_apify_client.dataset.assert_called_once_with('dataset-xyz') +def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_actor_and_get_items('apify/test-actor') + + # --------------------------------------------------------------------------- # run_task # --------------------------------------------------------------------------- @@ -169,6 +177,14 @@ def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_cli assert items == 
_SAMPLE_ITEMS +def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_task_and_get_items('user/my-task') + + # --------------------------------------------------------------------------- # scrape_url # --------------------------------------------------------------------------- From b914e47dfbefb8f13eeddf1ce6512efaa3d31b64 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:15:50 +0200 Subject: [PATCH 16/62] fix: update apifygetdatasetitemstool to return a json object with items and message for empty dataset --- langchain_apify/tools.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index b870f0f..af0d84b 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -370,16 +370,16 @@ def _run( class ApifyGetDatasetItemsTool(_ApifyGenericTool): """Fetch items from an existing Apify dataset by ID. - Returns items as a JSON string. When the dataset is empty the tool returns - an informative JSON message instead of raising an error. + Returns a JSON object with an ``"items"`` key containing the list of item + dicts. When the dataset is empty an additional ``"message"`` key is + included. Args: apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` environment variable when *None*. Returns: - JSON array of item dicts, or ``{"items": [], "message": "..."}`` when - the dataset is empty. + JSON object ``{"items": [...]}``; includes ``"message"`` when empty. Example: .. code-block:: python @@ -395,10 +395,9 @@ class ApifyGetDatasetItemsTool(_ApifyGenericTool): name: str = 'apify_get_dataset_items' description: str = ( - 'Fetch items from an Apify dataset by ID. 
Returns a JSON array of item dicts.' + 'Fetch items from an Apify dataset by ID. Returns a JSON object with an "items" array.' ' Required: dataset_id (str) — Apify dataset ID.' ' Optional: limit (int, default 100), offset (int, default 0).' - ' Returns an empty JSON object with a message when the dataset is empty.' ) args_schema: type[BaseModel] = ApifyGetDatasetItemsInput @@ -415,7 +414,7 @@ def _run( raise ToolException(str(exc)) from exc if not items: return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) - return json.dumps(items) + return json.dumps({'items': items}) class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): From 0f7118180ddd8aa583b7ac636d2aa33aefee5e68 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 15:16:28 +0200 Subject: [PATCH 17/62] feat: add integration smoke tests for generic Apify tools to validate api interaction --- tests/integration_tests/test_generic_tools.py | 68 +++++++++++++++++++ tests/unit_tests/test_tools.py | 6 +- 2 files changed, 71 insertions(+), 3 deletions(-) create mode 100644 tests/integration_tests/test_generic_tools.py diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py new file mode 100644 index 0000000..58c5cbf --- /dev/null +++ b/tests/integration_tests/test_generic_tools.py @@ -0,0 +1,68 @@ +"""Integration smoke tests for the generic Apify tools. + +These tests hit the real Apify API and require the ``APIFY_API_TOKEN`` +environment variable to be set. They use ``apify/python-example`` (a +trivial Actor that adds two numbers) to keep execution fast and cheap. 
+""" + +from __future__ import annotations + +import json +import os + +import pytest + +from langchain_apify import ( + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetItemsTool, + ApifyRunActorTool, + ApifyScrapeUrlTool, +) + +_ACTOR_ID = 'apify/python-example' +_RUN_INPUT = {'first_number': 2, 'second_number': 3} + +pytestmark = pytest.mark.skipif( + not os.getenv('APIFY_API_TOKEN'), + reason='APIFY_API_TOKEN not set', +) + + +def test_run_actor_tool_smoke() -> None: + tool = ApifyRunActorTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +def test_get_dataset_items_tool_smoke() -> None: + run_tool = ApifyRunActorTool() + run_result = json.loads(run_tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT})) + dataset_id = run_result['dataset_id'] + + items_tool = ApifyGetDatasetItemsTool() + result = items_tool.invoke({'dataset_id': dataset_id, 'limit': 10}) + + parsed = json.loads(result) + assert 'items' in parsed + assert isinstance(parsed['items'], list) + + +def test_run_actor_and_get_items_tool_smoke() -> None: + tool = ApifyRunActorAndGetItemsTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) + + +def test_scrape_url_tool_smoke() -> None: + tool = ApifyScrapeUrlTool() + result = tool.invoke({'url': 'https://crawlee.dev'}) + + assert isinstance(result, str) + assert len(result) > 0 diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 331054d..21e1009 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -243,15 +243,15 @@ def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: # --------------------------------------------------------------------------- -def 
test_get_dataset_items_tool_returns_json_array(mock_tools_client: MagicMock) -> None: +def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) parsed = json.loads(result) - assert len(parsed) == 2 - assert parsed[0]['text'] == 'item-1' + assert len(parsed['items']) == 2 + assert parsed['items'][0]['text'] == 'item-1' mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) From 50c52f2cda5b3007c63a85dd52c5f7e82b8321e9 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 21 Apr 2026 16:01:30 +0200 Subject: [PATCH 18/62] feat: implement clamping for timeout, memory, and item limits in apify tools to enforce safety constraints --- langchain_apify/tools.py | 46 ++++- tests/integration_tests/test_generic_tools.py | 26 +++ tests/unit_tests/conftest.py | 51 ++++++ tests/unit_tests/test_client.py | 77 +++----- tests/unit_tests/test_tools.py | 173 ++++++++++++------ 5 files changed, 260 insertions(+), 113 deletions(-) create mode 100644 tests/unit_tests/conftest.py diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index af0d84b..31f55d5 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -267,6 +267,7 @@ class ApifyRunTaskAndGetItemsInput(BaseModel): def _iso(value: str | datetime | None) -> str | None: + """Coerce a possible ``datetime`` to an ISO-8601 string.""" if isinstance(value, datetime): return value.isoformat() return value @@ -291,20 +292,37 @@ def _run_meta(run: dict) -> dict: class _ApifyGenericTool(BaseTool): # type: ignore[override] """Shared base for all generic Apify tools. - Handles ``ApifyToolsClient`` creation and sets ``handle_tool_error``. 
+ Handles ``ApifyToolsClient`` creation, sets ``handle_tool_error``, + and defines developer-controlled safety limits that clamp values the + LLM may provide at invocation time. + Subclasses only need to declare ``name``, ``description``, ``args_schema``, and ``_run()``. """ handle_tool_error: bool = True + max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') + max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') + max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') + _client: ApifyToolsClient def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) - # Token validation (missing env var, empty string) is handled inside ApifyToolsClient.__init__. self._client = ApifyToolsClient(apify_api_token=apify_api_token) + def _clamp_timeout(self, value: int) -> int: + return min(value, self.max_timeout_secs) + + def _clamp_memory(self, value: int | None) -> int | None: + if value is None: + return None + return min(value, self.max_memory_mbytes) + + def _clamp_items(self, value: int) -> int: + return min(value, self.max_items) + # --------------------------------------------------------------------------- # Generic tools @@ -361,7 +379,9 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - run = self._client.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + run = self._client.run_actor( + actor_id, run_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps(_run_meta(run)) @@ -409,7 +429,7 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - items = self._client.get_dataset_items(dataset_id, limit, offset) + items = 
self._client.get_dataset_items(dataset_id, self._clamp_items(limit), offset) except RuntimeError as exc: raise ToolException(str(exc)) from exc if not items: @@ -470,7 +490,11 @@ def _run( ) -> str: try: run, items = self._client.run_actor_and_get_items( - actor_id, run_input, timeout_secs, memory_mbytes, dataset_items_limit + actor_id, + run_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), ) except RuntimeError as exc: raise ToolException(str(exc)) from exc @@ -520,7 +544,7 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - return self._client.scrape_url(url, timeout_secs) + return self._client.scrape_url(url, self._clamp_timeout(timeout_secs)) except RuntimeError as exc: raise ToolException(str(exc)) from exc @@ -576,7 +600,9 @@ def _run( _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: - run = self._client.run_task(task_id, task_input, timeout_secs, memory_mbytes) + run = self._client.run_task( + task_id, task_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps(_run_meta(run)) @@ -635,7 +661,11 @@ def _run( ) -> str: try: run, items = self._client.run_task_and_get_items( - task_id, task_input, timeout_secs, memory_mbytes, dataset_items_limit + task_id, + task_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), ) except RuntimeError as exc: raise ToolException(str(exc)) from exc diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py index 58c5cbf..863efb1 100644 --- a/tests/integration_tests/test_generic_tools.py +++ b/tests/integration_tests/test_generic_tools.py @@ -16,6 +16,8 @@ ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, ApifyRunActorTool, + ApifyRunTaskAndGetItemsTool, + ApifyRunTaskTool, 
ApifyScrapeUrlTool, ) @@ -66,3 +68,27 @@ def test_scrape_url_tool_smoke() -> None: assert isinstance(result, str) assert len(result) > 0 + + +_TASK_ID = os.getenv('APIFY_TASK_ID', '') + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_tool_smoke() -> None: + tool = ApifyRunTaskTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_and_get_items_tool_smoke() -> None: + tool = ApifyRunTaskAndGetItemsTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..eedadb9 --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient + +SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client.create_apify_client', 
return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +def make_tool(tool_cls: type, mock_client: MagicMock, **kwargs: Any) -> Any: # noqa: ANN401 + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token', **kwargs) + tool._client = mock_client + return tool diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 5485d8c..c35f495 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -5,36 +5,7 @@ import pytest from langchain_apify._client import ApifyToolsClient - -_SUCCEEDED_RUN: dict = { - 'id': 'run-abc', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'dataset-xyz', - 'startedAt': '2025-01-01T00:00:00.000Z', - 'finishedAt': '2025-01-01T00:01:00.000Z', -} - -_FAILED_RUN: dict = { - 'id': 'run-fail', - 'status': 'FAILED', - 'defaultDatasetId': 'dataset-xyz', -} - -_SAMPLE_ITEMS: list[dict] = [ - {'text': 'item-1', 'url': 'https://example.com/1'}, - {'text': 'item-2', 'url': 'https://example.com/2'}, -] - - -@pytest.fixture -def mock_apify_client() -> MagicMock: - return MagicMock() - - -@pytest.fixture -def client(mock_apify_client: MagicMock) -> ApifyToolsClient: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): - return ApifyToolsClient(apify_api_token='dummy-token') +from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN # --------------------------------------------------------------------------- @@ -68,17 +39,17 @@ def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN result = client.run_actor('apify/test-actor', 
run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) - assert result == _SUCCEEDED_RUN + assert result == SUCCEEDED_RUN def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN client.run_actor('apify/test-actor', memory_mbytes=512) @@ -88,7 +59,7 @@ def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: Magi def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _FAILED_RUN + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN with pytest.raises(RuntimeError, match='run-fail'): client.run_actor('apify/test-actor') @@ -100,13 +71,13 @@ def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_cli def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) mock_apify_client.dataset.assert_called_once_with('dataset-xyz') mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) - assert items == _SAMPLE_ITEMS + assert items == SAMPLE_ITEMS def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: @@ -122,18 +93,18 @@ def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: Ma def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> 
None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) - assert run == _SUCCEEDED_RUN - assert items == _SAMPLE_ITEMS + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS mock_apify_client.dataset.assert_called_once_with('dataset-xyz') def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.actor.return_value.call.return_value = run_no_dataset with pytest.raises(RuntimeError, match='no default dataset ID'): @@ -146,17 +117,17 @@ def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsCli def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN result = client.run_task('user/my-task', task_input={'key': 'val'}) mock_apify_client.task.assert_called_once_with('user/my-task') mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) - assert result == _SUCCEEDED_RUN + assert result == SUCCEEDED_RUN def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _FAILED_RUN + mock_apify_client.task.return_value.call.return_value = FAILED_RUN with pytest.raises(RuntimeError, match='run-fail'): client.run_task('user/my-task') @@ -168,17 +139,17 @@ def 
test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_clie def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.return_value = _SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.return_value.items = _SAMPLE_ITEMS + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS run, items = client.run_task_and_get_items('user/my-task') - assert run == _SUCCEEDED_RUN - assert items == _SAMPLE_ITEMS + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - run_no_dataset = {**_SUCCEEDED_RUN, 'defaultDatasetId': None} + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.task.return_value.call.return_value = run_no_dataset with pytest.raises(RuntimeError, match='no default dataset ID'): @@ -191,7 +162,7 @@ def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClie def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, ] @@ -201,7 +172,7 @@ def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'text': 
'Plain text content', 'url': 'https://example.com'}, ] @@ -211,7 +182,7 @@ def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_clie def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [] with pytest.raises(RuntimeError, match='No content extracted'): @@ -219,7 +190,7 @@ def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_clie def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.return_value = _SUCCEEDED_RUN + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN mock_apify_client.dataset.return_value.list_items.return_value.items = [ {'markdown': '', 'text': '', 'url': 'https://example.com'}, ] diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 21e1009..61e4c8b 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -2,7 +2,7 @@ import json from datetime import datetime, timezone -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from unittest.mock import MagicMock, patch import pytest @@ -24,6 +24,7 @@ _run_meta, ) from langchain_apify.utils import actor_id_to_tool_name +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: from collections.abc import Generator @@ -103,37 +104,6 @@ class DummyModel(BaseModel): yield tool -# --------------------------------------------------------------------------- -# Shared test data for generic tools -# --------------------------------------------------------------------------- - -_SUCCEEDED_RUN: dict = { - 'id': 'run-abc', - 'status': 'SUCCEEDED', - 'defaultDatasetId': 'dataset-xyz', 
- 'startedAt': '2025-01-01T00:00:00.000Z', - 'finishedAt': '2025-01-01T00:01:00.000Z', -} - -_SAMPLE_ITEMS: list[dict] = [ - {'text': 'item-1', 'url': 'https://example.com/1'}, - {'text': 'item-2', 'url': 'https://example.com/2'}, -] - - -@pytest.fixture -def mock_tools_client() -> MagicMock: - return MagicMock(spec=ApifyToolsClient) - - -def _make_tool(tool_cls: type, mock_client: MagicMock) -> Any: # noqa: ANN401 - """Instantiate a generic tool with a mocked ApifyToolsClient.""" - with patch.object(ApifyToolsClient, '__init__', return_value=None): - tool = tool_cls(apify_api_token='dummy-token') - tool._client = mock_client - return tool - - # --------------------------------------------------------------------------- # _iso / _run_meta helpers # --------------------------------------------------------------------------- @@ -169,7 +139,7 @@ def test_run_meta_with_datetime_values_is_json_serializable() -> None: def test_run_meta_with_string_values_is_json_serializable() -> None: - meta = _run_meta(_SUCCEEDED_RUN) + meta = _run_meta(SUCCEEDED_RUN) serialized = json.dumps(meta) parsed = json.loads(serialized) assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' @@ -194,7 +164,7 @@ def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), } - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + tool = make_tool(ApifyRunActorTool, mock_tools_client) result = tool._run(actor_id='apify/test') @@ -210,8 +180,8 @@ def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor.return_value = _SUCCEEDED_RUN - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client) result = 
tool._run(actor_id='apify/test', run_input={'key': 'val'}) @@ -226,7 +196,7 @@ def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') - tool = _make_tool(ApifyRunActorTool, mock_tools_client) + tool = make_tool(ApifyRunActorTool, mock_tools_client) with pytest.raises(ToolException, match='FAILED'): tool._run(actor_id='apify/test') @@ -244,8 +214,8 @@ def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: - mock_tools_client.get_dataset_items.return_value = _SAMPLE_ITEMS - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) @@ -257,7 +227,7 @@ def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.return_value = [] - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) result = tool._run(dataset_id='dataset-empty') @@ -268,7 +238,7 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') - tool = _make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='Network 
error fetching dataset'): tool._run(dataset_id='ds-bad') @@ -286,8 +256,8 @@ def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) - def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_actor_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) - tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) @@ -302,7 +272,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = _make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(actor_id='apify/test') @@ -321,7 +291,7 @@ def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPa def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: mock_tools_client.scrape_url.return_value = '# Hello World' - tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) result = tool._run(url='https://example.com') @@ -331,7 +301,7 @@ def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') - tool = _make_tool(ApifyScrapeUrlTool, mock_tools_client) + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) with pytest.raises(ToolException, match='No content extracted'): 
tool._run(url='https://example.com') @@ -349,8 +319,8 @@ def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_task.return_value = _SUCCEEDED_RUN - tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) @@ -365,7 +335,7 @@ def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') - tool = _make_tool(ApifyRunTaskTool, mock_tools_client) + tool = make_tool(ApifyRunTaskTool, mock_tools_client) with pytest.raises(ToolException, match='FAILED'): tool._run(task_id='user/my-task') @@ -383,8 +353,8 @@ def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: - mock_tools_client.run_task_and_get_items.return_value = (_SUCCEEDED_RUN, _SAMPLE_ITEMS) - tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) @@ -399,7 +369,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' 
) - tool = _make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(task_id='user/my-task') @@ -411,6 +381,105 @@ def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPat ApifyRunTaskAndGetItemsTool() +# --------------------------------------------------------------------------- +# Value clamping (developer safety limits) +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=60) + + tool._run(actor_id='apify/test', timeout_secs=9999) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 60, None) + + +def test_run_actor_tool_clamps_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=8192) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 512) + + +def test_run_actor_tool_passes_none_memory_through(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=None) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=10) + + tool._run(dataset_id='ds-1', limit=50000) + + 
mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 10, 0) + + +def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunActorAndGetItemsTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(actor_id='a', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_actor_and_get_items.assert_called_once_with('a', None, 30, 256, 5) + + +def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# content' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30) + + tool._run(url='https://example.com', timeout_secs=9999) + + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 30) + + +def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client, max_timeout_secs=60, max_memory_mbytes=512) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999) + + mock_tools_client.run_task.assert_called_once_with('t/1', None, 60, 512) + + +def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunTaskAndGetItemsTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) + + +def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: + """When LLM values are within limits they should pass through 
unchanged.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', timeout_secs=120, memory_mbytes=1024) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 120, 1024) + + # --------------------------------------------------------------------------- # Tool metadata assertions # --------------------------------------------------------------------------- From ba179a6c043ee12cd4e387d48ea80f9112e0999b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 07:37:35 +0200 Subject: [PATCH 19/62] feat: clean up _actor_tools.py and tools.py for improved readability and maintainability; update test cases for better formatting and error handling --- langchain_apify/_actor_tools.py | 20 +++------ langchain_apify/tools.py | 16 +++---- tests/unit_tests/test_client.py | 78 +++++++++++++++++++++++++++++++-- tests/unit_tests/test_tools.py | 16 ++++--- 4 files changed, 98 insertions(+), 32 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index d7bd850..a989b11 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -1,17 +1,11 @@ -from __future__ import annotations - -import json -from typing import TYPE_CHECKING, Any - -from langchain_core.tools import ToolException -from pydantic import BaseModel, Field +"""Actor-specific tool subclasses (search, social-media, etc.). -from langchain_apify._client import ApifyToolsClient -from langchain_apify.tools import _ApifyGenericTool, _run_meta - -if TYPE_CHECKING: - from langchain_core.callbacks import CallbackManagerForToolRun +Downstream feature branches add concrete tools here. They inherit from +:class:`~langchain_apify.tools._ApifyGenericTool` and use +:func:`~langchain_apify.tools._run_meta` to format run metadata. 
+""" +from __future__ import annotations # --------------------------------------------------------------------------- # Search & Crawling tools @@ -20,4 +14,4 @@ # --------------------------------------------------------------------------- # Social-media tools -# --------------------------------------------------------------------------- \ No newline at end of file +# --------------------------------------------------------------------------- diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 31f55d5..f771d35 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -7,7 +7,7 @@ from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException -from pydantic import BaseModel, Field, create_model +from pydantic import BaseModel, Field, PrivateAttr, create_model from langchain_apify._client import ApifyToolsClient from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -306,7 +306,7 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') - _client: ApifyToolsClient + _client: ApifyToolsClient = PrivateAttr() def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 super().__init__(**kwargs) @@ -329,7 +329,7 @@ def _clamp_items(self, value: int) -> int: # --------------------------------------------------------------------------- -class ApifyRunActorTool(_ApifyGenericTool): +class ApifyRunActorTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor by ID with an arbitrary JSON input. 
Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON @@ -387,7 +387,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyGetDatasetItemsTool(_ApifyGenericTool): +class ApifyGetDatasetItemsTool(_ApifyGenericTool): # type: ignore[override] """Fetch items from an existing Apify dataset by ID. Returns a JSON object with an ``"items"`` key containing the list of item @@ -437,7 +437,7 @@ def _run( return json.dumps({'items': items}) -class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): +class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -501,7 +501,7 @@ def _run( return json.dumps({'run': _run_meta(run), 'items': items}) -class ApifyScrapeUrlTool(_ApifyGenericTool): +class ApifyScrapeUrlTool(_ApifyGenericTool): # type: ignore[override] """Scrape a single URL and return its content as markdown. Uses the ``apify/website-content-crawler`` Actor under the hood with @@ -549,7 +549,7 @@ def _run( raise ToolException(str(exc)) from exc -class ApifyRunTaskTool(_ApifyGenericTool): +class ApifyRunTaskTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task by ID and return run metadata. Actor tasks are pre-configured Actor runs saved in the Apify Console. @@ -608,7 +608,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): +class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task and return both run metadata and dataset items. 
Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index c35f495..1c93f84 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -7,7 +7,6 @@ from langchain_apify._client import ApifyToolsClient from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN - # --------------------------------------------------------------------------- # __init__ # --------------------------------------------------------------------------- @@ -44,7 +43,9 @@ def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMoc result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) mock_apify_client.actor.assert_called_once_with('apify/test-actor') - mock_apify_client.actor.return_value.call.assert_called_once_with(run_input={'key': 'val'}, timeout_secs=300, logger=None) + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300, logger=None + ) assert result == SUCCEEDED_RUN @@ -103,7 +104,9 @@ def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_cl mock_apify_client.dataset.assert_called_once_with('dataset-xyz') -def test_run_actor_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: +def test_run_actor_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.actor.return_value.call.return_value = run_no_dataset @@ -148,7 +151,9 @@ def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_cli assert items == SAMPLE_ITEMS -def test_run_task_and_get_items_missing_dataset_id_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: +def test_run_task_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, 
mock_apify_client: MagicMock +) -> None: run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} mock_apify_client.task.return_value.call.return_value = run_no_dataset @@ -211,3 +216,68 @@ def test_check_run_status_succeeded() -> None: def test_check_run_status_failed() -> None: with pytest.raises(RuntimeError, match='run-bad'): ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) + + +# --------------------------------------------------------------------------- +# None returns from actor/task .call() +# --------------------------------------------------------------------------- + + +def test_run_actor_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_actor('apify/broken-actor') + + +def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_task('user/broken-task') + + +# --------------------------------------------------------------------------- +# Network error wrapping (transport exception -> RuntimeError) +# --------------------------------------------------------------------------- + + +def test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.side_effect = ConnectionError('conn refused') + + with pytest.raises(RuntimeError, match='Network error calling Actor'): + client.run_actor('apify/test-actor') + + +def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('timeout') + + with pytest.raises(RuntimeError, match='Network error fetching 
dataset'): + client.get_dataset_items('dataset-xyz') + + +def test_run_actor_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + + with pytest.raises(RuntimeError, match='Network error fetching dataset'): + client.run_actor_and_get_items('apify/test-actor') + + +def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.side_effect = ConnectionError('conn refused') + + with pytest.raises(RuntimeError, match='Network error calling task'): + client.run_task('user/my-task') + + +def test_run_task_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + + with pytest.raises(RuntimeError, match='Network error fetching dataset'): + client.run_task_and_get_items('user/my-task') diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 61e4c8b..6698589 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -237,7 +237,9 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: - mock_tools_client.get_dataset_items.side_effect = RuntimeError('Network error fetching dataset ds-bad: connection reset') + mock_tools_client.get_dataset_items.side_effect = RuntimeError( + 'Network error fetching dataset ds-bad: connection reset' + ) tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) with pytest.raises(ToolException, match='Network error fetching dataset'): @@ 
-489,12 +491,12 @@ def test_generic_tools_have_correct_metadata() -> None: """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ - ApifyRunActorTool(apify_api_token='dummy'), - ApifyGetDatasetItemsTool(apify_api_token='dummy'), - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), - ApifyScrapeUrlTool(apify_api_token='dummy'), - ApifyRunTaskTool(apify_api_token='dummy'), - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] ] expected_names = [ From 005294b8cb0c48b4e6a95d926a7a7401c88343b4 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Wed, 22 Apr 2026 14:08:24 +0200 Subject: [PATCH 20/62] ref: align private scope conventions with langchain partner package standards --- langchain_apify/_client.py | 18 ++++++++--------- langchain_apify/_error_messages.py | 6 +++--- langchain_apify/{utils.py => _utils.py} | 8 ++++---- langchain_apify/document_loaders.py | 9 ++++----- langchain_apify/tools.py | 27 ++++++++++++++----------- langchain_apify/wrappers.py | 12 +++++------ tests/integration_tests/test_utils.py | 10 ++++----- tests/unit_tests/conftest.py | 2 +- tests/unit_tests/test_client.py | 4 ++-- tests/unit_tests/test_tools.py | 4 ++-- 10 files changed, 51 insertions(+), 49 deletions(-) rename langchain_apify/{utils.py => _utils.py} (94%) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index b131484..84e840a 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -5,11 +5,11 @@ 
from apify_client import ApifyClient from langchain_apify._error_messages import ( - ERROR_ACTOR_RUN_FAILED, - ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, - ERROR_SCRAPE_EMPTY, + _ERROR_ACTOR_RUN_FAILED, + _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + _ERROR_SCRAPE_EMPTY, ) -from langchain_apify.utils import create_apify_client +from langchain_apify._utils import _create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 @@ -35,9 +35,9 @@ class ApifyToolsClient: def __init__(self, apify_api_token: str | None = None) -> None: token = apify_api_token or os.getenv('APIFY_API_TOKEN') if not token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = create_apify_client(ApifyClient, token) + self._client = _create_apify_client(ApifyClient, token) def run_actor( self, @@ -230,12 +230,12 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) dataset_items_limit=1, ) if not items: - msg = ERROR_SCRAPE_EMPTY.format(url=url) + msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) content = items[0].get('markdown') or items[0].get('text') or '' if not content: - msg = ERROR_SCRAPE_EMPTY.format(url=url) + msg = _ERROR_SCRAPE_EMPTY.format(url=url) raise RuntimeError(msg) return content @@ -245,5 +245,5 @@ def _check_run_status(run: dict) -> None: status = run.get('status') if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') - msg = ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) + msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) raise RuntimeError(msg) diff --git a/langchain_apify/_error_messages.py b/langchain_apify/_error_messages.py index a87c9cb..0a8c612 100644 --- a/langchain_apify/_error_messages.py +++ b/langchain_apify/_error_messages.py @@ -1,4 +1,4 @@ -ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( +_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( 'APIFY_API_TOKEN environment variable is not set.' 
' Please set it to your Apify API token by using `os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"' ' in your code or pass it as environment variable.' @@ -6,6 +6,6 @@ ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) -ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' +_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' -ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' +_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' diff --git a/langchain_apify/utils.py b/langchain_apify/_utils.py similarity index 94% rename from langchain_apify/utils.py rename to langchain_apify/_utils.py index d3a627f..b19bcbf 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/_utils.py @@ -12,7 +12,7 @@ _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' -def prune_actor_input_schema( +def _prune_actor_input_schema( input_schema: dict, max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: @@ -48,7 +48,7 @@ def prune_actor_input_schema( T = TypeVar('T', ApifyClient, ApifyClientAsync) -def create_apify_client(client_cls: type[T], token: str) -> T: +def _create_apify_client(client_cls: type[T], token: str) -> T: """Create an Apify client instance with a custom user-agent. Args: @@ -79,7 +79,7 @@ def create_apify_client(client_cls: type[T], token: str) -> T: return client -def actor_id_to_tool_name(actor_id: str) -> str: +def _actor_id_to_tool_name(actor_id: str) -> str: """Turn actor_id into a valid tool name. 
Tool name must only contain letters, numbers, underscores, dashes, @@ -95,7 +95,7 @@ def actor_id_to_tool_name(actor_id: str) -> str: return 'apify_actor_' + ''.join(char if char in valid_chars else '_' for char in actor_id) -def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: +def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: """Get the latest build of an Actor from the default build tag. Args: diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 49befb6..8554872 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -8,9 +8,9 @@ from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator -from langchain_apify.utils import create_apify_client +from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: from collections.abc import Iterator @@ -42,8 +42,7 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - apify_client: ApifyClient - """An instance of the ApifyClient class from the apify-client Python package.""" + apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -86,7 +85,7 @@ def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 # when running at Apify platform, use APIFY_TOKEN environment variable apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - client = create_apify_client(ApifyClient, apify_api_token) + client = _create_apify_client(ApifyClient, apify_api_token) values['apify_client'] = client diff --git a/langchain_apify/tools.py 
b/langchain_apify/tools.py index f771d35..81d9166 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -10,13 +10,13 @@ from pydantic import BaseModel, Field, PrivateAttr, create_model from langchain_apify._client import ApifyToolsClient -from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import ( +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import ( _MAX_DESCRIPTION_LEN, - actor_id_to_tool_name, - create_apify_client, - get_actor_latest_build, - prune_actor_input_schema, + _actor_id_to_tool_name, + _create_apify_client, + _get_actor_latest_build, + _prune_actor_input_schema, ) if TYPE_CHECKING: @@ -57,6 +57,9 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] chunk["messages"][-1].pretty_print() """ + _apify_client: ApifyClient = PrivateAttr() + _actor_id: str = PrivateAttr() + def __init__( self, actor_id: str, @@ -77,14 +80,14 @@ def __init__( """ apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') if not apify_api_token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, apify_api_token) kwargs.update( { - 'name': actor_id_to_tool_name(actor_id), + 'name': _actor_id_to_tool_name(actor_id), 'description': self._create_description(apify_client, actor_id), 'args_schema': self._build_tool_args_schema_model( apify_client, @@ -127,7 +130,7 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: Returns: str: The description. 
""" - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') if len(actor_description) > _MAX_DESCRIPTION_LEN: actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' @@ -150,12 +153,12 @@ def _build_tool_args_schema_model( Raises: ValueError: If the input schema is not found in the Actor build. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) if not (actor_input := build.get('actorDefinition', {}).get('input')): msg = f'Input schema not found in the Actor build for Actor: {actor_id}' raise ValueError(msg) - properties, required = prune_actor_input_schema(actor_input) + properties, required = _prune_actor_input_schema(actor_input) properties = {'run_input': properties} description = ( diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index ef17873..34370fe 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -5,10 +5,10 @@ from apify_client import ApifyClient, ApifyClientAsync from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.utils import create_apify_client if TYPE_CHECKING: from collections.abc import Callable @@ -53,8 +53,8 @@ class ApifyWrapper(BaseModel): # allow arbitrary types in the model config for the apify client fields model_config = ConfigDict(arbitrary_types_allowed=True) - apify_client: ApifyClient - apify_client_async: ApifyClientAsync + apify_client: ApifyClient = Field(default=None, exclude=True) + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) apify_api_token: str | None 
= None def __init__( @@ -90,8 +90,8 @@ def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 """ apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - values['apify_client'] = create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = create_apify_client(ApifyClientAsync, apify_api_token) + values['apify_client'] = _create_apify_client(ApifyClient, apify_api_token) + values['apify_client_async'] = _create_apify_client(ApifyClientAsync, apify_api_token) return values diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 554cc2d..c92c038 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,8 +2,8 @@ from apify_client.client import ApifyClient -from langchain_apify._error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import create_apify_client, get_actor_latest_build +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client, _get_actor_latest_build def test_get_actor_latest_build() -> None: @@ -13,12 +13,12 @@ def test_get_actor_latest_build() -> None: ValueError: If the APIFY_API_TOKEN environment variable is not set. 
""" if (token := os.getenv('APIFY_API_TOKEN')) is None: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, token) + apify_client = _create_apify_client(ApifyClient, token) - build = get_actor_latest_build(apify_client, 'apify/rag-web-browser') + build = _get_actor_latest_build(apify_client, 'apify/rag-web-browser') assert isinstance(build, dict) assert 'id' in build diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py index eedadb9..3384e79 100644 --- a/tests/unit_tests/conftest.py +++ b/tests/unit_tests/conftest.py @@ -39,7 +39,7 @@ def mock_apify_client() -> MagicMock: @pytest.fixture def client(mock_apify_client: MagicMock) -> ApifyToolsClient: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): return ApifyToolsClient(apify_api_token='dummy-token') diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 1c93f84..40c73dc 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -13,7 +13,7 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client) as mock_create: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create: c = ApifyToolsClient(apify_api_token='my-token') mock_create.assert_called_once() assert c._client is mock_apify_client @@ -21,7 +21,7 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') - with patch('langchain_apify._client.create_apify_client', return_value=mock_apify_client): + with 
patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): c = ApifyToolsClient() assert c._client is mock_apify_client diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 6698589..542ec4e 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -11,6 +11,7 @@ from langchain_apify import APIFY_CORE_TOOLS from langchain_apify._client import ApifyToolsClient +from langchain_apify._utils import _actor_id_to_tool_name from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, @@ -23,7 +24,6 @@ _iso, _run_meta, ) -from langchain_apify.utils import actor_id_to_tool_name from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: @@ -57,7 +57,7 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id=actor_id, apify_api_token='dummy-token') assert isinstance(tool, ApifyActorsTool) assert tool.description == 'Mocked description' - assert tool.name == actor_id_to_tool_name(actor_id) + assert tool.name == _actor_id_to_tool_name(actor_id) assert tool.args_schema == DummyModel From 2f74c292ccf9422480484ce4921a4d3919c4c672 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:08:13 +0200 Subject: [PATCH 21/62] ref: migrate auth to SecretStr + secret_from_env pattern --- langchain_apify/document_loaders.py | 50 ++++++++++++++++----------- langchain_apify/tools.py | 26 ++++++++++---- langchain_apify/wrappers.py | 53 +++++++++++++++-------------- 3 files changed, 76 insertions(+), 53 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 8554872..3a777f3 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -7,9 +7,10 @@ from apify_client import ApifyClient from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 -from langchain_core.utils import get_from_dict_or_env 
-from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: @@ -40,8 +41,12 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): documents = loader.load() """ - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + ) apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str """The ID of the dataset on the Apify platform.""" @@ -62,7 +67,8 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. + apify_api_token (str): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ super().__init__( dataset_id=dataset_id, @@ -70,26 +76,30 @@ def __init__( apify_api_token=apify_api_token, ) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. + @model_validator(mode='after') + def _init_client(self) -> 'ApifyDatasetLoader': + """Resolve the Apify API token and initialise the client. - Args: - values (dict): The values to validate. + Checks ``APIFY_TOKEN`` as a secondary fallback for code running on the + Apify platform where only that variable is set. Returns: - Any: The validated values. 
- """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - # when running at Apify platform, use APIFY_TOKEN environment variable - apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - - client = _create_apify_client(ApifyClient, apify_api_token) + ApifyDatasetLoader: The validated loader instance. - values['apify_client'] = client - - return values + Raises: + ValueError: If no token is available from any source. + """ + token = self.apify_api_token + if token is None: + # Secondary fallback for code running on the Apify platform. + raw = os.getenv('APIFY_TOKEN') + if raw: + token = SecretStr(raw) + if token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self.apify_client = _create_apify_client(ApifyClient, token.get_secret_value()) + return self def load(self) -> list[Document]: """Load documents. diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 81d9166..0097f4c 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -7,7 +7,8 @@ from apify_client import ApifyClient from langchain_core.tools import BaseTool, ToolException -from pydantic import BaseModel, Field, PrivateAttr, create_model +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model from langchain_apify._client import ApifyToolsClient from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET @@ -78,12 +79,16 @@ def __init__( Raises: ValueError: If the `APIFY_API_TOKEN` environment variable is not set """ - apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not apify_api_token: + _raw_token: str | None = ( + apify_api_token.get_secret_value() + if isinstance(apify_api_token, SecretStr) + else apify_api_token or os.getenv('APIFY_API_TOKEN') + ) + if not _raw_token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = _create_apify_client(ApifyClient, 
apify_api_token) + apify_client = _create_apify_client(ApifyClient, _raw_token) kwargs.update( { @@ -305,15 +310,22 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] handle_tool_error: bool = True + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + ) max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') _client: ApifyToolsClient = PrivateAttr() - def __init__(self, apify_api_token: str | None = None, **kwargs: Any) -> None: # noqa: ANN401 - super().__init__(**kwargs) - self._client = ApifyToolsClient(apify_api_token=apify_api_token) + def model_post_init(self, __context: Any) -> None: # noqa: ANN401 + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) + super().model_post_init(__context) def _clamp_timeout(self, value: int) -> int: return min(value, self.max_timeout_secs) diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index 34370fe..9af591a 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -4,9 +4,10 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient, ApifyClientAsync -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, Field, model_validator +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET from 
langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader @@ -51,11 +52,14 @@ class ApifyWrapper(BaseModel): """ # allow arbitrary types in the model config for the apify client fields - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + ) apify_client: ApifyClient = Field(default=None, exclude=True) apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) - apify_api_token: str | None = None def __init__( self, @@ -63,37 +67,34 @@ def __init__( *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: - """Initialize the loader with an Apify dataset ID and a mapping function. + """Initialise the wrapper. Args: - dataset_id (str): The ID of the dataset on the Apify platform. - dataset_mapping_function (Callable): A function that takes a single - dictionary (an Apify dataset item) and converts it to an instance - of the Document class. - apify_api_token (Optional[str]): Apify API token. - *args: Any: Additional positional arguments. - **kwargs: Any: Additional keyword arguments. + apify_api_token (Optional[str]): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` environment variable when *None*. + *args: Any: Additional positional arguments forwarded to Pydantic. + **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ kwargs.update({'apify_api_token': apify_api_token}) super().__init__(*args, **kwargs) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Validate that an Apify API token is set and the apify-client - Python package exists in the current environment. 
+ @model_validator(mode='after') + def _init_clients(self) -> 'ApifyWrapper': + """Validate the token and initialise both sync and async Apify clients. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') + ApifyWrapper: The validated wrapper instance. - values['apify_client'] = _create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = _create_apify_client(ApifyClientAsync, apify_api_token) - - return values + Raises: + ValueError: If no token is provided and APIFY_API_TOKEN is not set. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + token = self.apify_api_token.get_secret_value() + self.apify_client = _create_apify_client(ApifyClient, token) + self.apify_client_async = _create_apify_client(ApifyClientAsync, token) + return self def call_actor( # noqa: PLR0913 self, From 6258b2b9ad8ed2dffd09918929ef9ec7d7893f4c Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:38:37 +0200 Subject: [PATCH 22/62] fix: backward-compat fix --- langchain_apify/document_loaders.py | 14 +++++++++----- langchain_apify/wrappers.py | 5 ++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 3a777f3..4e286af 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -70,11 +70,15 @@ def __init__( apify_api_token (str): Apify API token. Falls back to the ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. 
""" - super().__init__( - dataset_id=dataset_id, - dataset_mapping_function=dataset_mapping_function, - apify_api_token=apify_api_token, - ) + init_kwargs: dict[str, Any] = { + 'dataset_id': dataset_id, + 'dataset_mapping_function': dataset_mapping_function, + } + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + init_kwargs['apify_api_token'] = apify_api_token + super().__init__(**init_kwargs) @model_validator(mode='after') def _init_client(self) -> 'ApifyDatasetLoader': diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index 9af591a..a1e0ab6 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -75,7 +75,10 @@ def __init__( *args: Any: Additional positional arguments forwarded to Pydantic. **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ - kwargs.update({'apify_api_token': apify_api_token}) + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + kwargs['apify_api_token'] = apify_api_token super().__init__(*args, **kwargs) @model_validator(mode='after') From 2905b679a7240b6286229a86f182ea0eddd3ac37 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:58:21 +0200 Subject: [PATCH 23/62] fix: update stale doc string --- langchain_apify/_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/_utils.py b/langchain_apify/_utils.py index b19bcbf..9d74487 100644 --- a/langchain_apify/_utils.py +++ b/langchain_apify/_utils.py @@ -18,7 +18,7 @@ def _prune_actor_input_schema( ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. - Trim the description to 250 characters. + Trim descriptions to ``_MAX_DESCRIPTION_LEN`` characters. Args: input_schema (dict): The input schema from the Actor build. 
From 3238c0203e9965ad1c34dc004c9bd729714b0b1f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 13:59:06 +0200 Subject: [PATCH 24/62] chore: removed redundant file --- langchain_apify/_actor_tools.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 langchain_apify/_actor_tools.py diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py deleted file mode 100644 index a989b11..0000000 --- a/langchain_apify/_actor_tools.py +++ /dev/null @@ -1,17 +0,0 @@ -"""Actor-specific tool subclasses (search, social-media, etc.). - -Downstream feature branches add concrete tools here. They inherit from -:class:`~langchain_apify.tools._ApifyGenericTool` and use -:func:`~langchain_apify.tools._run_meta` to format run metadata. -""" - -from __future__ import annotations - -# --------------------------------------------------------------------------- -# Search & Crawling tools -# --------------------------------------------------------------------------- - - -# --------------------------------------------------------------------------- -# Social-media tools -# --------------------------------------------------------------------------- From 92df406a8fde1996c1fe71713f16e1d2533d36dc Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 14:19:12 +0200 Subject: [PATCH 25/62] fix: extracted repeated code, fixed secretstr compatibility to apifytoolsclient --- langchain_apify/_client.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 84e840a..a828be2 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -3,6 +3,7 @@ import os from apify_client import ApifyClient +from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -32,12 +33,17 @@ class ApifyToolsClient: ValueError: If no token is provided and the env var is not set. 
""" - def __init__(self, apify_api_token: str | None = None) -> None: - token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not token: + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + _token: str | None = None + if isinstance(apify_api_token, SecretStr): + _token = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, token) + self._client = _create_apify_client(ApifyClient, _token) def run_actor( self, @@ -117,17 +123,12 @@ def run_actor_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - # run_actor() raises RuntimeError on Actor failure; the except below only covers the dataset fetch. run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId') if not dataset_id: msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' raise RuntimeError(msg) - try: - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' - raise RuntimeError(msg) from exc + items = self._list_items_or_raise(dataset_id, dataset_items_limit) return run, items def run_task( @@ -191,17 +192,12 @@ def run_task_and_get_items( Raises: RuntimeError: If the run does not finish with status ``SUCCEEDED``. """ - # run_task() raises RuntimeError on task failure; the except below only covers the dataset fetch. run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) dataset_id = run.get('defaultDatasetId') if not dataset_id: msg = f'Task {task_id} run succeeded but returned no default dataset ID.' 
raise RuntimeError(msg) - try: - items = self._client.dataset(dataset_id).list_items(limit=dataset_items_limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' - raise RuntimeError(msg) from exc + items = self._list_items_or_raise(dataset_id, dataset_items_limit) return run, items def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: @@ -239,6 +235,14 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) raise RuntimeError(msg) return content + def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: + """Fetch dataset items, wrapping any network error in a RuntimeError.""" + try: + return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items + except Exception as exc: + msg = f'Network error fetching dataset {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + @staticmethod def _check_run_status(run: dict) -> None: """Raise if the run did not succeed.""" From 3a0f666d08f4f6e05ec382699fe03a4d3e1e9414 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:02:43 +0200 Subject: [PATCH 26/62] fix: set min value to timeout, memory and items, add exlude and repr to apify_api_token --- langchain_apify/tools.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 0097f4c..3d7af3c 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -313,6 +313,8 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, ) max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') @@ -328,15 +330,15 @@ def model_post_init(self, __context: Any) -> None: # noqa: ANN401 super().model_post_init(__context) def _clamp_timeout(self, value: int) -> int: - return min(value, self.max_timeout_secs) + return max(1, min(value, self.max_timeout_secs)) def _clamp_memory(self, value: int | None) -> int | None: if value is None: return None - return min(value, self.max_memory_mbytes) + return max(1, min(value, self.max_memory_mbytes)) def _clamp_items(self, value: int) -> int: - return min(value, self.max_items) + return max(1, min(value, self.max_items)) # --------------------------------------------------------------------------- From 8614cfdbb54d4eff228b80c20668ed81e21cffb0 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:03:29 +0200 Subject: [PATCH 27/62] feat: added repr and exclude to apify api token --- langchain_apify/document_loaders.py | 2 ++ langchain_apify/wrappers.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 4e286af..400476e 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -46,6 +46,8 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. 
Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + exclude=True, + repr=False, ) apify_client: ApifyClient = Field(default=None, exclude=True) dataset_id: str diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index a1e0ab6..e4cafb6 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -57,6 +57,8 @@ class ApifyWrapper(BaseModel): apify_api_token: SecretStr | None = Field( default_factory=secret_from_env('APIFY_API_TOKEN', default=None), description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, ) apify_client: ApifyClient = Field(default=None, exclude=True) apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) From 2bf130a9c98a8d3c7436cf8e4daf14d9d5fc20c4 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:03:59 +0200 Subject: [PATCH 28/62] feat: add type checking to apify core tools list --- langchain_apify/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index fa1f369..bca8081 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,6 +1,7 @@ from __future__ import annotations from importlib import metadata +from typing import TYPE_CHECKING from langchain_apify.document_loaders import ApifyDatasetLoader from langchain_apify.tools import ( @@ -14,6 +15,9 @@ ) from langchain_apify.wrappers import ApifyWrapper +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: @@ -24,7 +28,7 @@ # Binding all tools at once overwhelms the LLM context window; # pick the group(s) relevant to your use case. 
-APIFY_CORE_TOOLS: list[type] = [ +APIFY_CORE_TOOLS: list[type[BaseTool]] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, ApifyRunActorAndGetItemsTool, From 98293d4cfa1e0ca6c488380ff93b5033b2d96314 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:05:05 +0200 Subject: [PATCH 29/62] feat: add tests for clamped values and apify api token --- tests/unit_tests/test_document_loaders.py | 25 +++++++++++++ tests/unit_tests/test_tools.py | 44 +++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..49ee9db 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,5 +1,6 @@ from unittest.mock import patch +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document @@ -55,3 +56,27 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' 
assert documents[0].metadata['source'] == 'https://apify.com' + + +def test_apify_dataset_loader_apify_token_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Loader should accept APIFY_TOKEN as a secondary env-var fallback.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'platform-token') + + with patch.object(DatasetClient, 'list_items') as mock_list_items: + mock_list_items.return_value = ListPage(data={'items': []}) + loader = ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) + assert loader.load() == [] + + +def test_apify_dataset_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 542ec4e..3c99a71 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -472,6 +472,42 @@ def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) +def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600) + + tool._run(actor_id='apify/test', timeout_secs=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', timeout_secs=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + +def test_clamp_memory_floor_is_one(mock_tools_client: MagicMock) -> None: + 
mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + + +def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=100) + + tool._run(dataset_id='ds-1', limit=-1) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + mock_tools_client.get_dataset_items.reset_mock() + tool._run(dataset_id='ds-1', limit=0) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: """When LLM values are within limits they should pass through unchanged.""" mock_tools_client.run_actor.return_value = SUCCEEDED_RUN @@ -515,6 +551,14 @@ def test_generic_tools_have_correct_metadata() -> None: assert tool.handle_tool_error is True +def test_apify_api_token_excluded_from_model_dump() -> None: + """The apify_api_token field must not appear in model_dump() output.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg] + dumped = tool.model_dump() + assert 'apify_api_token' not in dumped + + # --------------------------------------------------------------------------- # _ApifyGenericTool inheritance # --------------------------------------------------------------------------- From 863ed8d31b64457635a52abc0402918a37e1bc4a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 23 Apr 2026 15:20:48 +0200 Subject: [PATCH 30/62] fix: lint fix 
--- langchain_apify/_client.py | 2 +- langchain_apify/document_loaders.py | 4 ++-- langchain_apify/tools.py | 2 +- langchain_apify/wrappers.py | 6 +++--- tests/unit_tests/test_tools.py | 14 +++++++------- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index a828be2..9a87d46 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -39,7 +39,7 @@ def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: _token = apify_api_token.get_secret_value() else: _token = apify_api_token or os.getenv('APIFY_API_TOKEN') - + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 400476e..131950d 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -49,7 +49,7 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): exclude=True, repr=False, ) - apify_client: ApifyClient = Field(default=None, exclude=True) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -83,7 +83,7 @@ def __init__( super().__init__(**init_kwargs) @model_validator(mode='after') - def _init_client(self) -> 'ApifyDatasetLoader': + def _init_client(self) -> ApifyDatasetLoader: """Resolve the Apify API token and initialise the client. 
Checks ``APIFY_TOKEN`` as a secondary fallback for code running on the diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 3d7af3c..e7721b7 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -64,7 +64,7 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] def __init__( self, actor_id: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index e4cafb6..d5fd25c 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -60,8 +60,8 @@ class ApifyWrapper(BaseModel): exclude=True, repr=False, ) - apify_client: ApifyClient = Field(default=None, exclude=True) - apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) # type: ignore[assignment] def __init__( self, @@ -84,7 +84,7 @@ def __init__( super().__init__(*args, **kwargs) @model_validator(mode='after') - def _init_clients(self) -> 'ApifyWrapper': + def _init_clients(self) -> ApifyWrapper: """Validate the token and initialise both sync and async Apify clients. 
Returns: diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 3c99a71..67fa1a7 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -527,12 +527,12 @@ def test_generic_tools_have_correct_metadata() -> None: """Verify name, description, and args_schema are set on all generic tools.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): tools = [ - ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg] - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg] + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ] expected_names = [ @@ -554,7 +554,7 @@ def test_generic_tools_have_correct_metadata() -> None: def test_apify_api_token_excluded_from_model_dump() -> None: """The apify_api_token field must not appear in model_dump() output.""" with patch.object(ApifyToolsClient, '__init__', return_value=None): - tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg] + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg,arg-type] dumped = tool.model_dump() assert 'apify_api_token' not in dumped From 70527e0d839b02c1399620d76c6c599aa55434a3 Mon Sep 17 00:00:00 2001 From: David Omrai Date: 
Fri, 24 Apr 2026 09:56:43 +0200 Subject: [PATCH 31/62] ref: update apify_api_token type to support SecretStr in document loaders --- langchain_apify/document_loaders.py | 4 ++-- langchain_apify/wrappers.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 131950d..6439740 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -60,7 +60,7 @@ def __init__( self, dataset_id: str, dataset_mapping_function: Callable[[dict], Document], - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, ) -> None: """Initialize the loader with an Apify dataset ID and a mapping function. @@ -69,7 +69,7 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. Falls back to the + apify_api_token (str | SecretStr): Apify API token. Falls back to the ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ init_kwargs: dict[str, Any] = { diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index d5fd25c..57a9eeb 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -65,15 +65,15 @@ class ApifyWrapper(BaseModel): def __init__( self, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: """Initialise the wrapper. Args: - apify_api_token (Optional[str]): Apify API token. Falls back to the - ``APIFY_API_TOKEN`` environment variable when *None*. + apify_api_token (Optional[str | SecretStr]): Apify API token. Falls + back to the ``APIFY_API_TOKEN`` environment variable when *None*. *args: Any: Additional positional arguments forwarded to Pydantic. **kwargs: Any: Additional keyword arguments forwarded to Pydantic. 
""" From 06bf2f6e109f7f31cbc2f0200206f498a33254c2 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:03:30 +0200 Subject: [PATCH 32/62] feat: implement social actors tools methods in _client.py --- langchain_apify/_client.py | 301 +++++++++++++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 9a87d46..bcf78cc 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -13,11 +13,28 @@ from langchain_apify._utils import _create_apify_client _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_INSTAGRAM_ACTOR_ID = 'apify/instagram-scraper' +_LINKEDIN_POSTS_ACTOR_ID = 'apimaestro/linkedin-profile-posts' +_LINKEDIN_SEARCH_ACTOR_ID = 'harvestapi/linkedin-profile-search' +_LINKEDIN_DETAIL_ACTOR_ID = 'apimaestro/linkedin-profile-detail' +_TWITTER_ACTOR_ID = 'apidojo/twitter-scraper-lite' +_TIKTOK_ACTOR_ID = 'clockworks/tiktok-scraper' +_FACEBOOK_ACTOR_ID = 'apify/facebook-posts-scraper' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_SOCIAL_TIMEOUT_SECS = 600 _DEFAULT_DATASET_ITEMS_LIMIT = 100 +_DEFAULT_SOCIAL_RESULTS_LIMIT = 20 _RUN_STATUS_SUCCEEDED = 'SUCCEEDED' +# Instagram-specific mappings +_INSTAGRAM_RESULTS_TYPE_MAP = { + 'user': 'posts', + 'hashtag': 'posts', + 'post': 'posts', + 'comments': 'comments', +} + class ApifyToolsClient: """Internal helper that wraps ``ApifyClient`` for the tools layer. @@ -235,6 +252,290 @@ def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) raise RuntimeError(msg) return content + def instagram_scrape( + self, + search_type: str, + search_query: str, + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + only_posts_newer_than: str | None = None, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape Instagram via ``apify/instagram-scraper``. 
+ + Args: + search_type: One of ``"user"``, ``"hashtag"``, ``"post"``, ``"comments"``. + search_query: Username, hashtag, or Instagram URL depending on + ``search_type``. + max_results: Maximum number of items to return. + only_posts_newer_than: Optional date filter. Accepts ``YYYY-MM-DD``, + ISO-8601, or relative (e.g. ``"1 day"``, ``"2 months"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``search_type`` is not recognised. + RuntimeError: If the Actor run does not succeed. + """ + results_type = _INSTAGRAM_RESULTS_TYPE_MAP.get(search_type) + if results_type is None: + msg = ( + f'Unsupported Instagram search_type {search_type!r}. ' + f'Expected one of: {sorted(_INSTAGRAM_RESULTS_TYPE_MAP)}.' + ) + raise ValueError(msg) + + direct_url = self._build_instagram_url(search_type, search_query) + run_input: dict = { + 'directUrls': [direct_url], + 'resultsType': results_type, + 'resultsLimit': max_results, + } + if only_posts_newer_than is not None: + run_input['onlyPostsNewerThan'] = only_posts_newer_than + return self.run_actor_and_get_items( + _INSTAGRAM_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def linkedin_profile_posts( + self, + profile_url: str, + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape LinkedIn profile posts via ``apimaestro/linkedin-profile-posts``. + + Args: + profile_url: LinkedIn profile URL or username. + max_results: Maximum number of posts to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the Actor run does not succeed. 
+ """ + run_input: dict = { + 'username': profile_url, + 'total_posts': max_results, + } + return self.run_actor_and_get_items( + _LINKEDIN_POSTS_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def linkedin_profile_search( + self, + query: str, + max_results: int = 10, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Search LinkedIn profiles via ``harvestapi/linkedin-profile-search``. + + Args: + query: Search keywords (e.g., name, title, company). + max_results: Maximum number of profiles to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the Actor run does not succeed. + """ + run_input: dict = { + 'searchQuery': query, + 'maxItems': max_results, + } + return self.run_actor_and_get_items( + _LINKEDIN_SEARCH_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def linkedin_profile_detail( + self, + profile_url: str, + include_email: bool = False, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Fetch a LinkedIn profile via ``apimaestro/linkedin-profile-detail``. + + Args: + profile_url: LinkedIn profile URL or username. + include_email: If True, attempt to include the profile email when + available. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. ``items`` typically contains a + single profile dict. + + Raises: + RuntimeError: If the Actor run does not succeed. 
+ """ + run_input: dict = { + 'username': profile_url, + 'includeEmail': include_email, + } + return self.run_actor_and_get_items( + _LINKEDIN_DETAIL_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + + def twitter_scrape( + self, + search_query: str, + search_mode: str = 'search', + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + start: str | None = None, + end: str | None = None, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape Twitter/X via ``apidojo/twitter-scraper-lite``. + + Args: + search_query: Search term, username, or tweet URL. + search_mode: One of ``"search"``, ``"user"``, ``"replies"``. + max_results: Maximum number of tweets to return. + start: Optional ISO-8601 start date — only return tweets newer + than this date. + end: Optional ISO-8601 end date — only return tweets older than + this date. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``search_mode`` is not recognised. + RuntimeError: If the Actor run does not succeed. + """ + run_input: dict = {'maxItems': max_results} + if search_mode == 'search': + run_input['searchTerms'] = [search_query] + elif search_mode == 'user': + run_input['twitterHandles'] = [search_query.lstrip('@')] + elif search_mode == 'replies': + run_input['startUrls'] = [search_query] + else: + msg = ( + f'Unsupported Twitter search_mode {search_mode!r}. ' + "Expected one of: ['search', 'user', 'replies']." 
+ ) + raise ValueError(msg) + if start is not None: + run_input['start'] = start + if end is not None: + run_input['end'] = end + return self.run_actor_and_get_items( + _TWITTER_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def tiktok_scrape( + self, + search_query: str, + search_type: str = 'search', + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape TikTok via ``clockworks/tiktok-scraper``. + + Args: + search_query: Username, hashtag, or search keyword. + search_type: One of ``"search"``, ``"user"``, ``"hashtag"``. + max_results: Maximum number of items to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``search_type`` is not recognised. + RuntimeError: If the Actor run does not succeed. + """ + run_input: dict = {'resultsPerPage': max_results} + if search_type == 'search': + run_input['searchQueries'] = [search_query] + elif search_type == 'user': + run_input['profiles'] = [search_query.lstrip('@')] + elif search_type == 'hashtag': + run_input['hashtags'] = [search_query.lstrip('#')] + else: + msg = ( + f'Unsupported TikTok search_type {search_type!r}. ' + "Expected one of: ['search', 'user', 'hashtag']." + ) + raise ValueError(msg) + return self.run_actor_and_get_items( + _TIKTOK_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def facebook_posts_scrape( + self, + page_url: str, + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + only_posts_newer_than: str | None = None, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape Facebook page posts via ``apify/facebook-posts-scraper``. + + Args: + page_url: Facebook page URL. + max_results: Maximum number of posts to return. 
+ only_posts_newer_than: Optional date filter. Accepts ``YYYY-MM-DD``, + ISO-8601, or relative (e.g. ``"1 day"``, ``"2 months"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the Actor run does not succeed. + """ + run_input: dict = { + 'startUrls': [{'url': page_url}], + 'resultsLimit': max_results, + } + if only_posts_newer_than is not None: + run_input['onlyPostsNewerThan'] = only_posts_newer_than + return self.run_actor_and_get_items( + _FACEBOOK_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + @staticmethod + def _build_instagram_url(search_type: str, search_query: str) -> str: + """Build an Instagram URL from a username/hashtag/URL based on search type.""" + if search_query.startswith(('http://', 'https://')): + return search_query + if search_type == 'hashtag': + tag = search_query.lstrip('#') + return f'https://www.instagram.com/explore/tags/{tag}/' + if search_type == 'user': + handle = search_query.lstrip('@') + return f'https://www.instagram.com/{handle}/' + # post/comments expect a URL; if a bare ID is given, build a /p/ URL + return f'https://www.instagram.com/p/{search_query}/' + def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: """Fetch dataset items, wrapping any network error in a RuntimeError.""" try: From 15863e5936140f01706df6e6bc1976fbb7b9a041 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:08:02 +0200 Subject: [PATCH 33/62] feat: implement unit tests --- tests/unit_tests/test_client.py | 296 ++++++++++++++++++++++++++++++++ 1 file changed, 296 insertions(+) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 40c73dc..a199700 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -281,3 +281,299 @@ def test_run_task_and_get_items_dataset_fetch_network_error( with 
pytest.raises(RuntimeError, match='Network error fetching dataset'): client.run_task_and_get_items('user/my-task') + + +# --------------------------------------------------------------------------- +# instagram_scrape +# --------------------------------------------------------------------------- + + +def _setup_run_and_items(mock_apify_client: MagicMock, items: list[dict] | None = None) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = items or SAMPLE_ITEMS + + +def test_instagram_scrape_user_builds_profile_url(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.instagram_scrape('user', 'apify', max_results=5) + + mock_apify_client.actor.assert_called_once_with('apify/instagram-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == { + 'directUrls': ['https://www.instagram.com/apify/'], + 'resultsType': 'posts', + 'resultsLimit': 5, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_instagram_scrape_hashtag_builds_tag_url(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('hashtag', '#travel', max_results=10) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['directUrls'] == ['https://www.instagram.com/explore/tags/travel/'] + assert call_kwargs['run_input']['resultsType'] == 'posts' + + +def test_instagram_scrape_comments_uses_comments_results_type( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('comments', 'https://www.instagram.com/p/ABC123/', max_results=15) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert 
call_kwargs['run_input']['resultsType'] == 'comments' + assert call_kwargs['run_input']['directUrls'] == ['https://www.instagram.com/p/ABC123/'] + + +def test_instagram_scrape_passes_only_posts_newer_than( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('user', 'apify', only_posts_newer_than='1 week') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsNewerThan'] == '1 week' + + +def test_instagram_scrape_invalid_search_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported Instagram search_type'): + client.instagram_scrape('reels', 'apify') + + +# --------------------------------------------------------------------------- +# linkedin_profile_posts +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_posts_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.linkedin_profile_posts('https://www.linkedin.com/in/satyanadella', max_results=30) + + mock_apify_client.actor.assert_called_once_with('apimaestro/linkedin-profile-posts') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == { + 'username': 'https://www.linkedin.com/in/satyanadella', + 'total_posts': 30, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +# --------------------------------------------------------------------------- +# linkedin_profile_search +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_search_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_search('Founder', max_results=25) + + 
mock_apify_client.actor.assert_called_once_with('harvestapi/linkedin-profile-search') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'searchQuery': 'Founder', 'maxItems': 25} + + +def test_linkedin_profile_search_default_max_results( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_search('CTO') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['maxItems'] == 10 + + +# --------------------------------------------------------------------------- +# linkedin_profile_detail +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_detail_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client, items=[{'firstName': 'Neal'}]) + + run, items = client.linkedin_profile_detail('neal-mohan', include_email=True) + + mock_apify_client.actor.assert_called_once_with('apimaestro/linkedin-profile-detail') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'username': 'neal-mohan', 'includeEmail': True} + assert run == SUCCEEDED_RUN + assert items == [{'firstName': 'Neal'}] + + +def test_linkedin_profile_detail_default_include_email_false( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_detail('neal-mohan') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['includeEmail'] is False + + +# --------------------------------------------------------------------------- +# twitter_scrape +# --------------------------------------------------------------------------- + + +def test_twitter_scrape_search_mode(client: ApifyToolsClient, mock_apify_client: 
MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', max_results=50) + + mock_apify_client.actor.assert_called_once_with('apidojo/twitter-scraper-lite') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'maxItems': 50, 'searchTerms': ['apify']} + + +def test_twitter_scrape_user_mode_strips_at(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('@apify', search_mode='user', max_results=10) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'maxItems': 10, 'twitterHandles': ['apify']} + + +def test_twitter_scrape_replies_mode_uses_start_urls( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('https://x.com/apify/status/123', search_mode='replies') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['startUrls'] == ['https://x.com/apify/status/123'] + + +def test_twitter_scrape_passes_date_range(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', start='2025-01-01', end='2025-02-01') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['start'] == '2025-01-01' + assert call_kwargs['run_input']['end'] == '2025-02-01' + + +def test_twitter_scrape_invalid_mode_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported Twitter search_mode'): + client.twitter_scrape('apify', search_mode='followers') + + +# --------------------------------------------------------------------------- +# tiktok_scrape +# --------------------------------------------------------------------------- + + +def 
test_tiktok_scrape_search_mode(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('cooking', max_results=12) + + mock_apify_client.actor.assert_called_once_with('clockworks/tiktok-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'resultsPerPage': 12, 'searchQueries': ['cooking']} + + +def test_tiktok_scrape_user_mode_strips_at(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('@charlidamelio', search_type='user') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['profiles'] == ['charlidamelio'] + + +def test_tiktok_scrape_hashtag_mode_strips_hash(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('#fyp', search_type='hashtag') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['hashtags'] == ['fyp'] + + +def test_tiktok_scrape_invalid_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported TikTok search_type'): + client.tiktok_scrape('cooking', search_type='trending') + + +# --------------------------------------------------------------------------- +# facebook_posts_scrape +# --------------------------------------------------------------------------- + + +def test_facebook_posts_scrape_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', max_results=15) + + mock_apify_client.actor.assert_called_once_with('apify/facebook-posts-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert 
call_kwargs['run_input'] == { + 'startUrls': [{'url': 'https://www.facebook.com/humansofnewyork/'}], + 'resultsLimit': 15, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_facebook_posts_scrape_passes_only_posts_newer_than( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', only_posts_newer_than='2025-01-01') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsNewerThan'] == '2025-01-01' + + +# --------------------------------------------------------------------------- +# Failed run propagates from social helpers +# --------------------------------------------------------------------------- + + +def test_social_helper_propagates_failed_run(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.instagram_scrape('user', 'apify') + + +# --------------------------------------------------------------------------- +# _build_instagram_url +# --------------------------------------------------------------------------- + + +def test_build_instagram_url_passthrough_for_full_url() -> None: + assert ( + ApifyToolsClient._build_instagram_url('post', 'https://www.instagram.com/p/abc/') + == 'https://www.instagram.com/p/abc/' + ) + + +def test_build_instagram_url_user() -> None: + assert ApifyToolsClient._build_instagram_url('user', '@apify') == 'https://www.instagram.com/apify/' + + +def test_build_instagram_url_hashtag() -> None: + assert ( + ApifyToolsClient._build_instagram_url('hashtag', '#travel') + == 'https://www.instagram.com/explore/tags/travel/' + ) + + +def test_build_instagram_url_post_from_id() -> None: + assert ApifyToolsClient._build_instagram_url('post', 'ABC123') == 
'https://www.instagram.com/p/ABC123/' From cde433b6ff6348d7c021f273f3737a502b8936f4 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:19:31 +0200 Subject: [PATCH 34/62] feat: implement input schemas for new tools --- langchain_apify/_actor_tools.py | 120 ++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 langchain_apify/_actor_tools.py diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py new file mode 100644 index 0000000..981cfcd --- /dev/null +++ b/langchain_apify/_actor_tools.py @@ -0,0 +1,120 @@ +"""Apify Actor-specific LangChain tools for social media platforms. + +Each tool wraps a single Apify Actor behind a simplified, LLM-friendly +interface so that LangChain agents can scrape social media data without +needing to know Actor IDs or raw input schemas. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field + +# --------------------------------------------------------------------------- +# Input schemas +# --------------------------------------------------------------------------- + + +class ApifyInstagramScraperInput(BaseModel): + """Input schema for :class:`ApifyInstagramScraperTool`.""" + + search_type: Literal['user', 'hashtag', 'post', 'comments'] = Field( + description=( + 'Type of data to scrape: "user" for a profile\'s posts, "hashtag" ' + 'for posts under a tag, "post" for a single post, "comments" for ' + 'comments on a post.' + ), + ) + search_query: str = Field( + description=( + 'Username, hashtag, or full Instagram URL depending on search_type. ' + 'For "comments" you must pass a post URL (e.g. instagram.com/p/...).' + ), + ) + max_results: int = Field(default=20, description='Maximum number of items to return.') + only_posts_newer_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' 
+ ), + ) + + +class ApifyLinkedInProfilePostsInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfilePostsTool`.""" + + profile_url: str = Field( + description='LinkedIn profile URL or username (e.g. "satyanadella" or "linkedin.com/in/satyanadella").', + ) + max_results: int = Field(default=20, description='Maximum number of posts to return.') + + +class ApifyLinkedInProfileSearchInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfileSearchTool`.""" + + query: str = Field(description='Search keywords (e.g. name, title, company).') + max_results: int = Field(default=10, description='Maximum number of profiles to return.') + + +class ApifyLinkedInProfileDetailInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfileDetailTool`.""" + + profile_url: str = Field( + description='LinkedIn profile URL, username, or URN (e.g. "neal-mohan").', + ) + include_email: bool = Field( + default=False, + description='If True, attempt to include the profile email when available.', + ) + + +class ApifyTwitterScraperInput(BaseModel): + """Input schema for :class:`ApifyTwitterScraperTool`.""" + + search_query: str = Field(description='Search term, Twitter handle, or tweet URL.') + search_mode: Literal['search', 'user', 'replies'] = Field( + default='search', + description=( + 'Scraping mode: "search" for keyword search, "user" for a handle\'s ' + 'tweets, "replies" for a tweet URL\'s replies.' 
+ ), + ) + max_results: int = Field(default=20, description='Maximum number of tweets to return.') + start: str | None = Field( + default=None, + description='Optional start date — only return tweets newer than this date.', + ) + end: str | None = Field( + default=None, + description='Optional end date — only return tweets older than this date.', + ) + + +class ApifyTikTokScraperInput(BaseModel): + """Input schema for :class:`ApifyTikTokScraperTool`.""" + + search_query: str = Field(description='Username, hashtag, or search keyword.') + search_type: Literal['search', 'user', 'hashtag'] = Field( + default='search', + description=( + 'Type of content to scrape: "search" for keyword search, "user" for ' + "a profile's videos, \"hashtag\" for videos under a tag." + ), + ) + max_results: int = Field(default=20, description='Maximum number of items to return.') + + +class ApifyFacebookPostsScraperInput(BaseModel): + """Input schema for :class:`ApifyFacebookPostsScraperTool`.""" + + page_url: str = Field(description='Facebook page URL to scrape (public pages only).') + max_results: int = Field(default=20, description='Maximum number of posts to return.') + only_posts_newer_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' 
+ ), + ) From 02bd034c7582d7826ba93b68b99c257378320656 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:22:35 +0200 Subject: [PATCH 35/62] feat: implement instagram scraper tool --- langchain_apify/_actor_tools.py | 76 ++++++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 981cfcd..ea4e240 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -7,10 +7,17 @@ from __future__ import annotations -from typing import Literal +import json +from typing import TYPE_CHECKING, Literal +from langchain_core.tools import ToolException from pydantic import BaseModel, Field +from langchain_apify.tools import _ApifyGenericTool, _run_meta + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + # --------------------------------------------------------------------------- # Input schemas # --------------------------------------------------------------------------- @@ -118,3 +125,70 @@ class ApifyFacebookPostsScraperInput(BaseModel): 'values like "1 day", "2 months", "3 years".' ), ) + + +# --------------------------------------------------------------------------- +# Tools +# --------------------------------------------------------------------------- + + +class ApifyInstagramScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape Instagram profiles, hashtags, posts, or comments. + + Uses the ``apify/instagram-scraper`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of scraped item dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyInstagramScraperTool + + tool = ApifyInstagramScraperTool() + result = tool.invoke({ + "search_type": "user", + "search_query": "apify", + "max_results": 10, + }) + """ + + name: str = 'apify_instagram_scraper' + description: str = ( + 'Scrape Instagram profiles, hashtags, posts, or comments and return the results as JSON.' + ' Required: search_type (one of "user", "hashtag", "post", "comments"),' + ' search_query (str — username, hashtag, or post URL).' + ' Optional: max_results (int, default 20),' + ' only_posts_newer_than (str — date filter, e.g. "2025-01-01" or "1 week").' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ) + args_schema: type[BaseModel] = ApifyInstagramScraperInput + + def _run( + self, + search_type: Literal['user', 'hashtag', 'post', 'comments'], + search_query: str, + max_results: int = 20, + only_posts_newer_than: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.instagram_scrape( + search_type=search_type, + search_query=search_query, + max_results=self._clamp_items(max_results), + only_posts_newer_than=only_posts_newer_than, + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From 5fb5530bb617cc5624c5bd3d7f185b457b15f70c Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:32:30 +0200 Subject: [PATCH 36/62] feat: implement linkedin profile posts tool --- langchain_apify/_actor_tools.py | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index ea4e240..b320305 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ 
-192,3 +192,58 @@ def _run( except (RuntimeError, ValueError) as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfilePostsTool(_ApifyGenericTool): # type: ignore[override] + """Extract posts from a LinkedIn profile. + + Uses the ``apimaestro/linkedin-profile-posts`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of post dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfilePostsTool + + tool = ApifyLinkedInProfilePostsTool() + result = tool.invoke({ + "profile_url": "https://www.linkedin.com/in/satyanadella", + "max_results": 10, + }) + """ + + name: str = 'apify_linkedin_profile_posts' + description: str = ( + 'Extract posts from a LinkedIn profile and return them as JSON.' + ' Required: profile_url (str — LinkedIn profile URL or username, e.g. "satyanadella").' + ' Optional: max_results (int, default 20).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' 
+ ) + args_schema: type[BaseModel] = ApifyLinkedInProfilePostsInput + + def _run( + self, + profile_url: str, + max_results: int = 20, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_posts( + profile_url=profile_url, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From d860e22d4ffb81066300904171f2dd316a5a7da1 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:35:37 +0200 Subject: [PATCH 37/62] feat: implement linkedin profile search tool --- langchain_apify/_actor_tools.py | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index b320305..62f290b 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -247,3 +247,58 @@ def _run( except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfileSearchTool(_ApifyGenericTool): # type: ignore[override] + """Search for LinkedIn profiles by keyword or criteria. + + Uses the ``harvestapi/linkedin-profile-search`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of profile dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfileSearchTool + + tool = ApifyLinkedInProfileSearchTool() + result = tool.invoke({ + "query": "Founder", + "max_results": 10, + }) + """ + + name: str = 'apify_linkedin_profile_search' + description: str = ( + 'Search for LinkedIn profiles by keyword (name, title, company) and return matching profiles as JSON.' + ' Required: query (str — search keywords).' + ' Optional: max_results (int, default 10).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ) + args_schema: type[BaseModel] = ApifyLinkedInProfileSearchInput + + def _run( + self, + query: str, + max_results: int = 10, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_search( + query=query, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From d5ab9ef71cc4f428ef8ddd2092c30b2c80ce251a Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:38:09 +0200 Subject: [PATCH 38/62] feat: implement linkedin profile detail tool --- langchain_apify/_actor_tools.py | 54 +++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 62f290b..028ae44 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -302,3 +302,57 @@ def _run( except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[override] + """Retrieve detailed information from a specific LinkedIn profile. 
+ + Uses the ``apimaestro/linkedin-profile-detail`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (typically + a single-element list with the profile dict). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfileDetailTool + + tool = ApifyLinkedInProfileDetailTool() + result = tool.invoke({ + "profile_url": "https://www.linkedin.com/in/neal-mohan", + }) + """ + + name: str = 'apify_linkedin_profile_detail' + description: str = ( + 'Retrieve detailed information from a specific LinkedIn profile and return it as JSON.' + ' Required: profile_url (str — LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' + ' Optional: include_email (bool, default False — include profile email if available).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' 
+ ) + args_schema: type[BaseModel] = ApifyLinkedInProfileDetailInput + + def _run( + self, + profile_url: str, + include_email: bool = False, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_detail( + profile_url=profile_url, + include_email=include_email, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From f55a24d498daab963ba631d73b08a711d1093f8e Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:42:29 +0200 Subject: [PATCH 39/62] feat: implement twitter scraper tool --- langchain_apify/_actor_tools.py | 81 +++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 8 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 028ae44..9d3253c 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -91,11 +91,11 @@ class ApifyTwitterScraperInput(BaseModel): max_results: int = Field(default=20, description='Maximum number of tweets to return.') start: str | None = Field( default=None, - description='Optional start date — only return tweets newer than this date.', + description='Optional start date - only return tweets newer than this date.', ) end: str | None = Field( default=None, - description='Optional end date — only return tweets older than this date.', + description='Optional end date - only return tweets older than this date.', ) @@ -166,9 +166,9 @@ class ApifyInstagramScraperTool(_ApifyGenericTool): # type: ignore[override] description: str = ( 'Scrape Instagram profiles, hashtags, posts, or comments and return the results as JSON.' ' Required: search_type (one of "user", "hashtag", "post", "comments"),' - ' search_query (str — username, hashtag, or post URL).' + ' search_query (str - username, hashtag, or post URL).' 
' Optional: max_results (int, default 20),' - ' only_posts_newer_than (str — date filter, e.g. "2025-01-01" or "1 week").' + ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ) args_schema: type[BaseModel] = ApifyInstagramScraperInput @@ -226,7 +226,7 @@ class ApifyLinkedInProfilePostsTool(_ApifyGenericTool): # type: ignore[override name: str = 'apify_linkedin_profile_posts' description: str = ( 'Extract posts from a LinkedIn profile and return them as JSON.' - ' Required: profile_url (str — LinkedIn profile URL or username, e.g. "satyanadella").' + ' Required: profile_url (str - LinkedIn profile URL or username, e.g. "satyanadella").' ' Optional: max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ) @@ -281,7 +281,7 @@ class ApifyLinkedInProfileSearchTool(_ApifyGenericTool): # type: ignore[overrid name: str = 'apify_linkedin_profile_search' description: str = ( 'Search for LinkedIn profiles by keyword (name, title, company) and return matching profiles as JSON.' - ' Required: query (str — search keywords).' + ' Required: query (str - search keywords).' ' Optional: max_results (int, default 10).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ) @@ -335,8 +335,8 @@ class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[overrid name: str = 'apify_linkedin_profile_detail' description: str = ( 'Retrieve detailed information from a specific LinkedIn profile and return it as JSON.' - ' Required: profile_url (str — LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' - ' Optional: include_email (bool, default False — include profile email if available).' + ' Required: profile_url (str - LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' 
+ ' Optional: include_email (bool, default False - include profile email if available).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ) args_schema: type[BaseModel] = ApifyLinkedInProfileDetailInput @@ -356,3 +356,68 @@ def _run( except RuntimeError as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyTwitterScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape tweets, profiles, or replies from Twitter/X. + + Uses the ``apidojo/twitter-scraper-lite`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of tweet dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyTwitterScraperTool + + tool = ApifyTwitterScraperTool() + result = tool.invoke({ + "search_query": "apify", + "search_mode": "search", + "max_results": 20, + }) + """ + + name: str = 'apify_twitter_scraper' + description: str = ( + 'Scrape tweets from Twitter/X by search term, user handle, or tweet URL and return them as JSON.' + ' Required: search_query (str - search term, handle, or tweet URL).' + ' Optional: search_mode (one of "search", "user", "replies"; default "search"),' + ' max_results (int, default 20),' + ' start (str - ISO date, only return tweets newer than this date),' + ' end (str - ISO date, only return tweets older than this date).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' 
+ ) + args_schema: type[BaseModel] = ApifyTwitterScraperInput + + def _run( + self, + search_query: str, + search_mode: Literal['search', 'user', 'replies'] = 'search', + max_results: int = 20, + start: str | None = None, + end: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.twitter_scrape( + search_query=search_query, + search_mode=search_mode, + max_results=self._clamp_items(max_results), + start=start, + end=end, + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From e638c44b590bbb7508424ed5de48ac3a804b60d3 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:50:55 +0200 Subject: [PATCH 40/62] feat: implement tiktok scraper tool --- langchain_apify/_actor_tools.py | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 9d3253c..96d2005 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -421,3 +421,62 @@ def _run( except (RuntimeError, ValueError) as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape TikTok videos, profiles, or hashtag content. + + Uses the ``clockworks/tiktok-scraper`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of TikTok item dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyTikTokScraperTool + + tool = ApifyTikTokScraperTool() + result = tool.invoke({ + "search_query": "cooking", + "search_type": "search", + "max_results": 20, + }) + """ + + name: str = 'apify_tiktok_scraper' + description: str = ( + 'Scrape TikTok by search keyword, profile, or hashtag and return the results as JSON.' + ' Required: search_query (str - keyword, username, or hashtag).' + ' Optional: search_type (one of "search", "user", "hashtag"; default "search"),' + ' max_results (int, default 20).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ) + args_schema: type[BaseModel] = ApifyTikTokScraperInput + + def _run( + self, + search_query: str, + search_type: Literal['search', 'user', 'hashtag'] = 'search', + max_results: int = 20, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.tiktok_scrape( + search_query=search_query, + search_type=search_type, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From f1c608e150d39e8ba92ef5f7a8c538552f80a6f2 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:53:29 +0200 Subject: [PATCH 41/62] feat: implement facebook posts scraper tool --- langchain_apify/_actor_tools.py | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 96d2005..c4c7e86 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -480,3 +480,63 @@ def _run( except (RuntimeError, ValueError) as exc: raise ToolException(str(exc)) from exc return json.dumps({'run': _run_meta(run), 'items': items}) + + 
+class ApifyFacebookPostsScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape public Facebook page posts. + + Uses the ``apify/facebook-posts-scraper`` Actor under the hood. + Only public Facebook pages are supported - personal profiles cannot + be scraped. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of post dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyFacebookPostsScraperTool + + tool = ApifyFacebookPostsScraperTool() + result = tool.invoke({ + "page_url": "https://www.facebook.com/humansofnewyork/", + "max_results": 20, + }) + """ + + name: str = 'apify_facebook_posts_scraper' + description: str = ( + 'Scrape posts from a public Facebook page and return them as JSON.' + ' Required: page_url (str - Facebook page URL; personal profiles are not supported).' + ' Optional: max_results (int, default 20),' + ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' 
+ ) + args_schema: type[BaseModel] = ApifyFacebookPostsScraperInput + + def _run( + self, + page_url: str, + max_results: int = 20, + only_posts_newer_than: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.facebook_posts_scrape( + page_url=page_url, + max_results=self._clamp_items(max_results), + only_posts_newer_than=only_posts_newer_than, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) From 960e9dea2005ed662bf046c54cafb368c575fc3f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 13:56:23 +0200 Subject: [PATCH 42/62] feat: implement unit tests for new tools --- tests/unit_tests/test_actor_tools.py | 330 +++++++++++++++++++++++++++ 1 file changed, 330 insertions(+) create mode 100644 tests/unit_tests/test_actor_tools.py diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py new file mode 100644 index 0000000..580b1aa --- /dev/null +++ b/tests/unit_tests/test_actor_tools.py @@ -0,0 +1,330 @@ +from __future__ import annotations + +import json +from unittest.mock import MagicMock + +import pytest +from langchain_core.tools import ToolException + +from langchain_apify._actor_tools import ( + ApifyFacebookPostsScraperTool, + ApifyInstagramScraperTool, + ApifyLinkedInProfileDetailTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyTikTokScraperTool, + ApifyTwitterScraperTool, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool + +EXPECTED_RUN_META: dict = { + 'run_id': 'run-abc', + 'status': 'SUCCEEDED', + 'dataset_id': 'dataset-xyz', + 'started_at': '2025-01-01T00:00:00.000Z', + 'finished_at': '2025-01-01T00:01:00.000Z', +} + + +# --------------------------------------------------------------------------- +# Missing token (shared base behavior) +# 
--------------------------------------------------------------------------- + + +def test_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyInstagramScraperTool() + + +# --------------------------------------------------------------------------- +# ApifyInstagramScraperTool +# --------------------------------------------------------------------------- + + +def test_instagram_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + result = tool._run(search_type='user', search_query='apify', max_results=10) + + parsed = json.loads(result) + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + + +def test_instagram_tool_passes_params(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + tool._run( + search_type='hashtag', + search_query='#travel', + max_results=5, + only_posts_newer_than='1 week', + ) + + mock_tools_client.instagram_scrape.assert_called_once_with( + search_type='hashtag', + search_query='#travel', + max_results=5, + only_posts_newer_than='1 week', + timeout_secs=600, + ) + + +def test_instagram_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client, max_items=3) + + tool._run(search_type='user', search_query='apify', max_results=100) + + assert mock_tools_client.instagram_scrape.call_args.kwargs['max_results'] == 3 + + +def test_instagram_tool_runtime_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.side_effect = 
RuntimeError('Actor run run-X ended with status FAILED.') + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='run-X'): + tool._run(search_type='user', search_query='apify') + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfilePostsTool +# --------------------------------------------------------------------------- + + +def test_linkedin_posts_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_posts.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyLinkedInProfilePostsTool, mock_tools_client) + + result = tool._run(profile_url='satyanadella', max_results=10) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.linkedin_profile_posts.assert_called_once_with( + profile_url='satyanadella', + max_results=10, + timeout_secs=600, + ) + + +def test_linkedin_posts_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_posts.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfilePostsTool, mock_tools_client, max_items=5) + + tool._run(profile_url='satyanadella', max_results=999) + + assert mock_tools_client.linkedin_profile_posts.call_args.kwargs['max_results'] == 5 + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfileSearchTool +# --------------------------------------------------------------------------- + + +def test_linkedin_search_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + result = tool._run(query='Founder', max_results=10) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + 
mock_tools_client.linkedin_profile_search.assert_called_once_with( + query='Founder', + max_results=10, + timeout_secs=600, + ) + + +def test_linkedin_search_tool_default_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + tool._run(query='CTO') + + assert mock_tools_client.linkedin_profile_search.call_args.kwargs['max_results'] == 10 + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfileDetailTool +# --------------------------------------------------------------------------- + + +def test_linkedin_detail_tool_happy_path(mock_tools_client: MagicMock) -> None: + profile_item = [{'firstName': 'Neal', 'lastName': 'Mohan'}] + mock_tools_client.linkedin_profile_detail.return_value = (SUCCEEDED_RUN, profile_item) + tool = make_tool(ApifyLinkedInProfileDetailTool, mock_tools_client) + + result = tool._run(profile_url='neal-mohan', include_email=True) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == profile_item + mock_tools_client.linkedin_profile_detail.assert_called_once_with( + profile_url='neal-mohan', + include_email=True, + timeout_secs=600, + ) + + +def test_linkedin_detail_tool_default_include_email_false(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_detail.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileDetailTool, mock_tools_client) + + tool._run(profile_url='neal-mohan') + + assert mock_tools_client.linkedin_profile_detail.call_args.kwargs['include_email'] is False + + +# --------------------------------------------------------------------------- +# ApifyTwitterScraperTool +# --------------------------------------------------------------------------- + + +def test_twitter_tool_happy_path(mock_tools_client: MagicMock) -> None: + 
mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + result = tool._run(search_query='apify', max_results=20) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.twitter_scrape.assert_called_once_with( + search_query='apify', + search_mode='search', + max_results=20, + start=None, + end=None, + timeout_secs=600, + ) + + +def test_twitter_tool_passes_date_range(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + tool._run(search_query='apify', search_mode='user', start='2025-01-01', end='2025-02-01') + + kwargs = mock_tools_client.twitter_scrape.call_args.kwargs + assert kwargs['search_mode'] == 'user' + assert kwargs['start'] == '2025-01-01' + assert kwargs['end'] == '2025-02-01' + + +def test_twitter_tool_value_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.side_effect = ValueError('Unsupported Twitter search_mode') + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='Unsupported Twitter search_mode'): + tool._run(search_query='apify', search_mode='replies') # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# ApifyTikTokScraperTool +# --------------------------------------------------------------------------- + + +def test_tiktok_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client) + + result = tool._run(search_query='cooking', search_type='search', max_results=12) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.tiktok_scrape.assert_called_once_with( + 
search_query='cooking', + search_type='search', + max_results=12, + timeout_secs=600, + ) + + +def test_tiktok_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client, max_items=4) + + tool._run(search_query='cooking', max_results=500) + + assert mock_tools_client.tiktok_scrape.call_args.kwargs['max_results'] == 4 + + +# --------------------------------------------------------------------------- +# ApifyFacebookPostsScraperTool +# --------------------------------------------------------------------------- + + +def test_facebook_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + result = tool._run(page_url='https://www.facebook.com/humansofnewyork/', max_results=15) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.facebook_posts_scrape.assert_called_once_with( + page_url='https://www.facebook.com/humansofnewyork/', + max_results=15, + only_posts_newer_than=None, + timeout_secs=600, + ) + + +def test_facebook_tool_passes_only_posts_newer_than(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + tool._run(page_url='https://www.facebook.com/humansofnewyork/', only_posts_newer_than='2025-01-01') + + assert mock_tools_client.facebook_posts_scrape.call_args.kwargs['only_posts_newer_than'] == '2025-01-01' + + +def test_facebook_tool_runtime_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.side_effect = RuntimeError('Network error') + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) 
+ + with pytest.raises(ToolException, match='Network error'): + tool._run(page_url='https://www.facebook.com/humansofnewyork/') + + +# --------------------------------------------------------------------------- +# Empty results - tools should still return valid JSON +# --------------------------------------------------------------------------- + + +def test_tool_returns_valid_json_for_empty_items(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + result = tool._run(query='nonexistent') + parsed = json.loads(result) + + assert parsed['items'] == [] + assert parsed['run']['status'] == 'SUCCEEDED' + + +# --------------------------------------------------------------------------- +# handle_tool_error is True on every social tool (existing base behavior) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + 'tool_cls', + [ + ApifyInstagramScraperTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyLinkedInProfileDetailTool, + ApifyTwitterScraperTool, + ApifyTikTokScraperTool, + ApifyFacebookPostsScraperTool, + ], +) +def test_social_tool_handle_tool_error_enabled(tool_cls: type, mock_tools_client: MagicMock) -> None: + tool = make_tool(tool_cls, mock_tools_client) + assert tool.handle_tool_error is True From a34c619de78bbf947503d1236ec7a2dc4d30d2be Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 14:04:23 +0200 Subject: [PATCH 43/62] feat: implement init to import new social tools --- langchain_apify/__init__.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index bca8081..21459af 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -3,6 +3,15 @@ from importlib import metadata from typing import TYPE_CHECKING +from 
langchain_apify._actor_tools import ( + ApifyFacebookPostsScraperTool, + ApifyInstagramScraperTool, + ApifyLinkedInProfileDetailTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyTikTokScraperTool, + ApifyTwitterScraperTool, +) from langchain_apify.document_loaders import ApifyDatasetLoader from langchain_apify.tools import ( ApifyActorsTool, @@ -37,6 +46,16 @@ ApifyRunTaskAndGetItemsTool, ] +APIFY_SOCIAL_TOOLS: list[type[BaseTool]] = [ + ApifyInstagramScraperTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyLinkedInProfileDetailTool, + ApifyTwitterScraperTool, + ApifyTikTokScraperTool, + ApifyFacebookPostsScraperTool, +] + __all__ = [ # Existing components (backward-compatible) 'ApifyActorsTool', @@ -49,8 +68,17 @@ 'ApifyRunTaskAndGetItemsTool', 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', + # Social media Actor tools + 'ApifyFacebookPostsScraperTool', + 'ApifyInstagramScraperTool', + 'ApifyLinkedInProfileDetailTool', + 'ApifyLinkedInProfilePostsTool', + 'ApifyLinkedInProfileSearchTool', + 'ApifyTikTokScraperTool', + 'ApifyTwitterScraperTool', # Tool group lists 'APIFY_CORE_TOOLS', + 'APIFY_SOCIAL_TOOLS', # Meta '__version__', ] From 6e8c693c5c3163f19e9a499ded21853b924fb504 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 14:06:32 +0200 Subject: [PATCH 44/62] feat: fix circular reference issue --- langchain_apify/_actor_tools.py | 3 ++- langchain_apify/_utils.py | 19 +++++++++++++++++++ langchain_apify/tools.py | 25 +------------------------ 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index c4c7e86..cd20f26 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -13,7 +13,8 @@ from langchain_core.tools import ToolException from pydantic import BaseModel, Field -from langchain_apify.tools import _ApifyGenericTool, _run_meta +from langchain_apify._utils import _run_meta +from 
langchain_apify.tools import _ApifyGenericTool if TYPE_CHECKING: from langchain_core.callbacks import CallbackManagerForToolRun diff --git a/langchain_apify/_utils.py b/langchain_apify/_utils.py index 9d74487..6d7c2c0 100644 --- a/langchain_apify/_utils.py +++ b/langchain_apify/_utils.py @@ -1,6 +1,7 @@ from __future__ import annotations import string +from datetime import datetime from typing import TypeVar import requests @@ -130,3 +131,21 @@ def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: raise ValueError(msg) return data + + +def _iso(value: str | datetime | None) -> str | None: + """Coerce a possible ``datetime`` to an ISO-8601 string.""" + if isinstance(value, datetime): + return value.isoformat() + return value + + +def _run_meta(run: dict) -> dict: + """Extract a compact metadata dict from an Apify run-details dict.""" + return { + 'run_id': run.get('id'), + 'status': run.get('status'), + 'dataset_id': run.get('defaultDatasetId'), + 'started_at': _iso(run.get('startedAt')), + 'finished_at': _iso(run.get('finishedAt')), + } diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index e7721b7..ec41a93 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -2,7 +2,6 @@ import json import os -from datetime import datetime from typing import TYPE_CHECKING, Any from apify_client import ApifyClient @@ -18,6 +17,7 @@ _create_apify_client, _get_actor_latest_build, _prune_actor_input_schema, + _run_meta, ) if TYPE_CHECKING: @@ -269,29 +269,6 @@ class ApifyRunTaskAndGetItemsInput(BaseModel): dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') -# --------------------------------------------------------------------------- -# Helpers -# --------------------------------------------------------------------------- - - -def _iso(value: str | datetime | None) -> str | None: - """Coerce a possible ``datetime`` to an ISO-8601 string.""" - if isinstance(value, datetime): 
- return value.isoformat() - return value - - -def _run_meta(run: dict) -> dict: - """Extract a compact metadata dict from an Apify run-details dict.""" - return { - 'run_id': run.get('id'), - 'status': run.get('status'), - 'dataset_id': run.get('defaultDatasetId'), - 'started_at': _iso(run.get('startedAt')), - 'finished_at': _iso(run.get('finishedAt')), - } - - # --------------------------------------------------------------------------- # Shared base for generic tools # --------------------------------------------------------------------------- From ff4cca28b28ce103898bc507f2819b7052329c68 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 14:13:46 +0200 Subject: [PATCH 45/62] fix: lint fixes --- langchain_apify/_actor_tools.py | 3 ++- langchain_apify/_client.py | 13 ++++--------- tests/unit_tests/test_client.py | 15 ++++----------- tests/unit_tests/test_tools.py | 4 +--- 4 files changed, 11 insertions(+), 24 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index cd20f26..6cf4e8d 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -108,7 +108,7 @@ class ApifyTikTokScraperInput(BaseModel): default='search', description=( 'Type of content to scrape: "search" for keyword search, "user" for ' - "a profile's videos, \"hashtag\" for videos under a tag." + 'a profile\'s videos, "hashtag" for videos under a tag.' 
), ) max_results: int = Field(default=20, description='Maximum number of items to return.') @@ -345,6 +345,7 @@ class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[overrid def _run( self, profile_url: str, + *, include_email: bool = False, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index bcf78cc..48a29ed 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -364,6 +364,7 @@ def linkedin_profile_search( def linkedin_profile_detail( self, profile_url: str, + *, include_email: bool = False, timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, ) -> tuple[dict, list[dict]]: @@ -393,7 +394,7 @@ def linkedin_profile_detail( dataset_items_limit=1, ) - def twitter_scrape( + def twitter_scrape( # noqa: PLR0913 self, search_query: str, search_mode: str = 'search', @@ -429,10 +430,7 @@ def twitter_scrape( elif search_mode == 'replies': run_input['startUrls'] = [search_query] else: - msg = ( - f'Unsupported Twitter search_mode {search_mode!r}. ' - "Expected one of: ['search', 'user', 'replies']." - ) + msg = f"Unsupported Twitter search_mode {search_mode!r}. Expected one of: ['search', 'user', 'replies']." raise ValueError(msg) if start is not None: run_input['start'] = start @@ -475,10 +473,7 @@ def tiktok_scrape( elif search_type == 'hashtag': run_input['hashtags'] = [search_query.lstrip('#')] else: - msg = ( - f'Unsupported TikTok search_type {search_type!r}. ' - "Expected one of: ['search', 'user', 'hashtag']." - ) + msg = f"Unsupported TikTok search_type {search_type!r}. Expected one of: ['search', 'user', 'hashtag']." 
raise ValueError(msg) return self.run_actor_and_get_items( _TIKTOK_ACTOR_ID, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index a199700..147097d 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -331,9 +331,7 @@ def test_instagram_scrape_comments_uses_comments_results_type( assert call_kwargs['run_input']['directUrls'] == ['https://www.instagram.com/p/ABC123/'] -def test_instagram_scrape_passes_only_posts_newer_than( - client: ApifyToolsClient, mock_apify_client: MagicMock -) -> None: +def test_instagram_scrape_passes_only_posts_newer_than(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: _setup_run_and_items(mock_apify_client) client.instagram_scrape('user', 'apify', only_posts_newer_than='1 week') @@ -382,9 +380,7 @@ def test_linkedin_profile_search_maps_input(client: ApifyToolsClient, mock_apify assert call_kwargs['run_input'] == {'searchQuery': 'Founder', 'maxItems': 25} -def test_linkedin_profile_search_default_max_results( - client: ApifyToolsClient, mock_apify_client: MagicMock -) -> None: +def test_linkedin_profile_search_default_max_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: _setup_run_and_items(mock_apify_client) client.linkedin_profile_search('CTO') @@ -445,9 +441,7 @@ def test_twitter_scrape_user_mode_strips_at(client: ApifyToolsClient, mock_apify assert call_kwargs['run_input'] == {'maxItems': 10, 'twitterHandles': ['apify']} -def test_twitter_scrape_replies_mode_uses_start_urls( - client: ApifyToolsClient, mock_apify_client: MagicMock -) -> None: +def test_twitter_scrape_replies_mode_uses_start_urls(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: _setup_run_and_items(mock_apify_client) client.twitter_scrape('https://x.com/apify/status/123', search_mode='replies') @@ -570,8 +564,7 @@ def test_build_instagram_url_user() -> None: def test_build_instagram_url_hashtag() -> None: assert ( - 
ApifyToolsClient._build_instagram_url('hashtag', '#travel') - == 'https://www.instagram.com/explore/tags/travel/' + ApifyToolsClient._build_instagram_url('hashtag', '#travel') == 'https://www.instagram.com/explore/tags/travel/' ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 67fa1a7..960f05b 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -11,7 +11,7 @@ from langchain_apify import APIFY_CORE_TOOLS from langchain_apify._client import ApifyToolsClient -from langchain_apify._utils import _actor_id_to_tool_name +from langchain_apify._utils import _actor_id_to_tool_name, _iso, _run_meta from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, @@ -21,8 +21,6 @@ ApifyRunTaskTool, ApifyScrapeUrlTool, _ApifyGenericTool, - _iso, - _run_meta, ) from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool From 35de3d15943d8f46b1f0e1d35c02e8cc2708f715 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Mon, 27 Apr 2026 16:10:01 +0200 Subject: [PATCH 46/62] docs: add line for llm not to halucinate missing data --- langchain_apify/_actor_tools.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 6cf4e8d..463bbf0 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -171,6 +171,7 @@ class ApifyInstagramScraperTool(_ApifyGenericTool): # type: ignore[override] ' Optional: max_results (int, default 20),' ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyInstagramScraperInput @@ -230,6 +231,7 @@ class ApifyLinkedInProfilePostsTool(_ApifyGenericTool): # type: ignore[override ' Required: profile_url (str - LinkedIn profile URL or username, e.g. 
"satyanadella").' ' Optional: max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfilePostsInput @@ -285,6 +287,7 @@ class ApifyLinkedInProfileSearchTool(_ApifyGenericTool): # type: ignore[overrid ' Required: query (str - search keywords).' ' Optional: max_results (int, default 10).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfileSearchInput @@ -339,6 +342,7 @@ class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[overrid ' Required: profile_url (str - LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' ' Optional: include_email (bool, default False - include profile email if available).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfileDetailInput @@ -399,6 +403,7 @@ class ApifyTwitterScraperTool(_ApifyGenericTool): # type: ignore[override] ' start (str - ISO date, only return tweets newer than this date),' ' end (str - ISO date, only return tweets older than this date).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyTwitterScraperInput @@ -462,6 +467,7 @@ class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] ' Optional: search_type (one of "search", "user", "hashtag"; default "search"),' ' max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' 
+ ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyTikTokScraperInput @@ -522,6 +528,7 @@ class ApifyFacebookPostsScraperTool(_ApifyGenericTool): # type: ignore[override ' Optional: max_results (int, default 20),' ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not halucinate missing fields.' ) args_schema: type[BaseModel] = ApifyFacebookPostsScraperInput From ea8b16edf8ba33d29e0f7b3b79cd299825808703 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:07:19 +0200 Subject: [PATCH 47/62] chore: rename tools to match the task description --- langchain_apify/__init__.py | 12 +++--- langchain_apify/tools.py | 28 ++++++------- tests/integration_tests/test_generic_tools.py | 8 ++-- tests/unit_tests/test_tools.py | 40 +++++++++---------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index bca8081..7d0dfa9 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -7,9 +7,9 @@ from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, ) @@ -31,10 +31,10 @@ APIFY_CORE_TOOLS: list[type[BaseTool]] = [ ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ] __all__ = [ @@ -44,9 +44,9 @@ 'ApifyWrapper', # Core generic tools 'ApifyGetDatasetItemsTool', - 'ApifyRunActorAndGetItemsTool', + 'ApifyRunActorAndGetDatasetTool', 'ApifyRunActorTool', - 'ApifyRunTaskAndGetItemsTool', + 
'ApifyRunTaskAndGetDatasetTool', 'ApifyRunTaskTool', 'ApifyScrapeUrlTool', # Tool group lists diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index e7721b7..2b7cedf 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -225,8 +225,8 @@ class ApifyGetDatasetItemsInput(BaseModel): offset: int = Field(default=0, description='Number of items to skip from the start.') -class ApifyRunActorAndGetItemsInput(BaseModel): - """Input schema for :class:`ApifyRunActorAndGetItemsTool`.""" +class ApifyRunActorAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetDatasetTool`.""" actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') @@ -255,8 +255,8 @@ class ApifyRunTaskInput(BaseModel): ) -class ApifyRunTaskAndGetItemsInput(BaseModel): - """Input schema for :class:`ApifyRunTaskAndGetItemsTool`.""" +class ApifyRunTaskAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetDatasetTool`.""" task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') task_input: dict | None = Field( @@ -454,7 +454,7 @@ def _run( return json.dumps({'items': items}) -class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] +class ApifyRunActorAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] """Run any Apify Actor and return both run metadata and dataset items. 
Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` @@ -476,16 +476,16 @@ class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] import os os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" - from langchain_apify import ApifyRunActorAndGetItemsTool + from langchain_apify import ApifyRunActorAndGetDatasetTool - tool = ApifyRunActorAndGetItemsTool() + tool = ApifyRunActorAndGetDatasetTool() result = tool.invoke({ "actor_id": "apify/python-example", "run_input": {"first_number": 2, "second_number": 3}, }) """ - name: str = 'apify_run_actor_and_get_items' + name: str = 'apify_run_actor_and_get_dataset' description: str = ( 'Run an Apify Actor synchronously and return both run metadata and dataset items.' ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' @@ -494,7 +494,7 @@ class ApifyRunActorAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' ' and items (list of dataset item dicts).' ) - args_schema: type[BaseModel] = ApifyRunActorAndGetItemsInput + args_schema: type[BaseModel] = ApifyRunActorAndGetDatasetInput def _run( self, @@ -625,7 +625,7 @@ def _run( return json.dumps(_run_meta(run)) -class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] +class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] """Run a saved Apify Actor task and return both run metadata and dataset items. 
Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` @@ -647,16 +647,16 @@ class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] import os os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" - from langchain_apify import ApifyRunTaskAndGetItemsTool + from langchain_apify import ApifyRunTaskAndGetDatasetTool - tool = ApifyRunTaskAndGetItemsTool() + tool = ApifyRunTaskAndGetDatasetTool() result = tool.invoke({ "task_id": "user/my-task", "task_input": {"key": "value"}, }) """ - name: str = 'apify_run_task_and_get_items' + name: str = 'apify_run_task_and_get_dataset' description: str = ( 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' @@ -665,7 +665,7 @@ class ApifyRunTaskAndGetItemsTool(_ApifyGenericTool): # type: ignore[override] ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' ' and items (list of dataset item dicts).' 
) - args_schema: type[BaseModel] = ApifyRunTaskAndGetItemsInput + args_schema: type[BaseModel] = ApifyRunTaskAndGetDatasetInput def _run( self, diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py index 863efb1..3f2a7c8 100644 --- a/tests/integration_tests/test_generic_tools.py +++ b/tests/integration_tests/test_generic_tools.py @@ -14,9 +14,9 @@ from langchain_apify import ( ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, ) @@ -54,7 +54,7 @@ def test_get_dataset_items_tool_smoke() -> None: def test_run_actor_and_get_items_tool_smoke() -> None: - tool = ApifyRunActorAndGetItemsTool() + tool = ApifyRunActorAndGetDatasetTool() result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) parsed = json.loads(result) @@ -86,7 +86,7 @@ def test_run_task_tool_smoke() -> None: @pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') def test_run_task_and_get_items_tool_smoke() -> None: - tool = ApifyRunTaskAndGetItemsTool() + tool = ApifyRunTaskAndGetDatasetTool() result = tool.invoke({'task_id': _TASK_ID}) parsed = json.loads(result) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 67fa1a7..9abe9dc 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -15,9 +15,9 @@ from langchain_apify.tools import ( ApifyActorsTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyRunActorTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ApifyRunTaskTool, ApifyScrapeUrlTool, _ApifyGenericTool, @@ -253,13 +253,13 @@ def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) - # --------------------------------------------------------------------------- -# ApifyRunActorAndGetItemsTool +# 
ApifyRunActorAndGetDatasetTool # --------------------------------------------------------------------------- def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) - tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) @@ -274,7 +274,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = make_tool(ApifyRunActorAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(actor_id='apify/test') @@ -283,7 +283,7 @@ def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_c def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyRunActorAndGetItemsTool() + ApifyRunActorAndGetDatasetTool() # --------------------------------------------------------------------------- @@ -350,13 +350,13 @@ def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: # --------------------------------------------------------------------------- -# ApifyRunTaskAndGetItemsTool +# ApifyRunTaskAndGetDatasetTool # --------------------------------------------------------------------------- def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) - tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = 
make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) @@ -371,7 +371,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( 'Actor run run-bad ended with status TIMED-OUT.' ) - tool = make_tool(ApifyRunTaskAndGetItemsTool, mock_tools_client) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) with pytest.raises(ToolException, match='TIMED-OUT'): tool._run(task_id='user/my-task') @@ -380,7 +380,7 @@ def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_cl def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyRunTaskAndGetItemsTool() + ApifyRunTaskAndGetDatasetTool() # --------------------------------------------------------------------------- @@ -427,7 +427,7 @@ def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> No def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) tool = make_tool( - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, mock_tools_client, max_timeout_secs=30, max_memory_mbytes=256, @@ -460,7 +460,7 @@ def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) - def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) tool = make_tool( - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, mock_tools_client, max_timeout_secs=30, max_memory_mbytes=256, @@ -529,19 +529,19 @@ def test_generic_tools_have_correct_metadata() -> None: tools = [ 
ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] - ApifyRunActorAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] - ApifyRunTaskAndGetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] ] expected_names = [ 'apify_run_actor', 'apify_get_dataset_items', - 'apify_run_actor_and_get_items', + 'apify_run_actor_and_get_dataset', 'apify_scrape_url', 'apify_run_task', - 'apify_run_task_and_get_items', + 'apify_run_task_and_get_dataset', ] for tool, expected_name in zip(tools, expected_names): @@ -569,10 +569,10 @@ def test_all_generic_tools_inherit_from_base() -> None: for tool_cls in ( ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, ): assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' @@ -592,9 +592,9 @@ def test_apify_core_tools_contains_all_generic_classes() -> None: assert set(APIFY_CORE_TOOLS) == { ApifyRunActorTool, ApifyGetDatasetItemsTool, - ApifyRunActorAndGetItemsTool, + ApifyRunActorAndGetDatasetTool, ApifyScrapeUrlTool, ApifyRunTaskTool, - ApifyRunTaskAndGetItemsTool, + ApifyRunTaskAndGetDatasetTool, } assert len(APIFY_CORE_TOOLS) == 6 From cd1eea1fc4a001296f941954decac2b4e996693d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:25:04 +0200 Subject: [PATCH 48/62] fix: narrow except blocks in _client.py to SDK/transport errors --- 
langchain_apify/_client.py | 21 +++++++++++++-------- tests/unit_tests/test_client.py | 31 ++++++++++++++++++++----------- tests/unit_tests/test_tools.py | 4 ++-- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 9a87d46..618d007 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -2,7 +2,9 @@ import os +import httpx from apify_client import ApifyClient +from apify_client.errors import ApifyClientError from pydantic import SecretStr from langchain_apify._error_messages import ( @@ -12,6 +14,9 @@ ) from langchain_apify._utils import _create_apify_client +# Only catches ApifyClientError and httpx.HTTPError. Other errors propagate. +_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError) + _SCRAPE_ACTOR_ID = 'apify/website-content-crawler' _DEFAULT_RUN_TIMEOUT_SECS = 300 _DEFAULT_SCRAPE_TIMEOUT_SECS = 120 @@ -72,8 +77,8 @@ def run_actor( try: run = self._client.actor(actor_id).call(**call_kwargs) - except Exception as exc: - msg = f'Network error calling Actor {actor_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify Actor call failed for {actor_id}: {exc}' raise RuntimeError(msg) from exc if run is None: msg = f'Actor {actor_id} call returned no run details.' 
@@ -96,8 +101,8 @@ def get_dataset_items( """ try: return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' raise RuntimeError(msg) from exc def run_actor_and_get_items( @@ -159,8 +164,8 @@ def run_task( try: run = self._client.task(task_id).call(**call_kwargs) - except Exception as exc: - msg = f'Network error calling task {task_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify task call failed for {task_id}: {exc}' raise RuntimeError(msg) from exc if run is None: msg = f'Task {task_id} call returned no run details.' @@ -239,8 +244,8 @@ def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: """Fetch dataset items, wrapping any network error in a RuntimeError.""" try: return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items - except Exception as exc: - msg = f'Network error fetching dataset {dataset_id}: {exc}' + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' raise RuntimeError(msg) from exc @staticmethod diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 40c73dc..c43e4d1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -2,6 +2,7 @@ from unittest.mock import MagicMock, patch +import httpx import pytest from langchain_apify._client import ApifyToolsClient @@ -238,21 +239,21 @@ def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client # --------------------------------------------------------------------------- -# Network error wrapping (transport exception -> RuntimeError) +# Transport-error wrapping (httpx / ApifyClientError -> RuntimeError) # --------------------------------------------------------------------------- def 
test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.actor.return_value.call.side_effect = ConnectionError('conn refused') + mock_apify_client.actor.return_value.call.side_effect = httpx.ConnectError('conn refused') - with pytest.raises(RuntimeError, match='Network error calling Actor'): + with pytest.raises(RuntimeError, match='Apify Actor call failed'): client.run_actor('apify/test-actor') def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('timeout') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('timeout') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.get_dataset_items('dataset-xyz') @@ -260,16 +261,16 @@ def test_run_actor_and_get_items_dataset_fetch_network_error( client: ApifyToolsClient, mock_apify_client: MagicMock ) -> None: mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.run_actor_and_get_items('apify/test-actor') def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: - mock_apify_client.task.return_value.call.side_effect = ConnectionError('conn refused') + mock_apify_client.task.return_value.call.side_effect = httpx.ConnectError('conn refused') - with pytest.raises(RuntimeError, match='Network error calling task'): + with pytest.raises(RuntimeError, match='Apify task call failed'): 
client.run_task('user/my-task') @@ -277,7 +278,15 @@ def test_run_task_and_get_items_dataset_fetch_network_error( client: ApifyToolsClient, mock_apify_client: MagicMock ) -> None: mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN - mock_apify_client.dataset.return_value.list_items.side_effect = ConnectionError('reset') + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') - with pytest.raises(RuntimeError, match='Network error fetching dataset'): + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.run_task_and_get_items('user/my-task') + + +def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError.""" + mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK') + + with pytest.raises(AttributeError, match='bug in SDK'): + client.run_actor('apify/test-actor') diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 9abe9dc..4a5dbdd 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -238,11 +238,11 @@ def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMo def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.get_dataset_items.side_effect = RuntimeError( - 'Network error fetching dataset ds-bad: connection reset' + 'Apify dataset fetch failed for ds-bad: connection reset' ) tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) - with pytest.raises(ToolException, match='Network error fetching dataset'): + with pytest.raises(ToolException, match='Apify dataset fetch failed'): tool._run(dataset_id='ds-bad') From 50c3583243919d50f240a4f1a0963822a6ec2c33 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:33:57 +0200 Subject: [PATCH 49/62] fix: 
clamp memory_mbytes to Apify platform minimum (128 MB) --- langchain_apify/tools.py | 7 +++++-- tests/unit_tests/test_tools.py | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 2b7cedf..fafc858 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -333,9 +333,12 @@ def _clamp_timeout(self, value: int) -> int: return max(1, min(value, self.max_timeout_secs)) def _clamp_memory(self, value: int | None) -> int | None: - if value is None: + # Non-positive values fall through to the platform default. Positive + # values are floored at 128 MB (the Apify platform minimum) so the LLM + # cannot drive into an API rejection by requesting too little memory. + if value is None or value <= 0: return None - return max(1, min(value, self.max_memory_mbytes)) + return max(128, min(value, self.max_memory_mbytes)) def _clamp_items(self, value: int) -> int: return max(1, min(value, self.max_items)) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index 4a5dbdd..108c695 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -484,16 +484,30 @@ def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) -def test_clamp_memory_floor_is_one(mock_tools_client: MagicMock) -> None: +def test_clamp_memory_non_positive_is_treated_as_none(mock_tools_client: MagicMock) -> None: + """memory_mbytes <= 0 maps to None so the Apify platform default is used.""" mock_tools_client.run_actor.return_value = SUCCEEDED_RUN tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) tool._run(actor_id='apify/test', memory_mbytes=-1) - mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) 
mock_tools_client.run_actor.reset_mock() tool._run(actor_id='apify/test', memory_mbytes=0) - mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_clamp_memory_floors_positive_below_platform_minimum(mock_tools_client: MagicMock) -> None: + """A positive memory_mbytes below the Apify platform minimum (128 MB) is floored to 128.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=64) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: From 450728cb10ac6ed9dac16886fab48dc8586b9009 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:36:08 +0200 Subject: [PATCH 50/62] fix: narrow empty-dataset message in ApifyGetDatasetItemsTool --- langchain_apify/tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index fafc858..4cd182d 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -453,7 +453,7 @@ def _run( except RuntimeError as exc: raise ToolException(str(exc)) from exc if not items: - return json.dumps({'items': [], 'message': 'Dataset is empty or not found.'}) + return json.dumps({'items': [], 'message': f'Dataset {dataset_id} is empty.'}) return json.dumps({'items': items}) From 1360e9228a3be72c6dd6105b3f725d1c16dc599d Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:40:51 +0200 Subject: [PATCH 51/62] ref: simplify ApifyToolsClient.__init__ to require explicit token --- langchain_apify/_client.py | 
20 +++++--------------- tests/unit_tests/test_client.py | 12 ++---------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 618d007..cc1e4b8 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -1,11 +1,8 @@ from __future__ import annotations -import os - import httpx from apify_client import ApifyClient from apify_client.errors import ApifyClientError -from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -31,24 +28,17 @@ class ApifyToolsClient: block until the Actor run finishes. Args: - apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` - environment variable when *None*. + apify_api_token: Apify API token. Raises: - ValueError: If no token is provided and the env var is not set. + ValueError: If the token is empty. """ - def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: - _token: str | None = None - if isinstance(apify_api_token, SecretStr): - _token = apify_api_token.get_secret_value() - else: - _token = apify_api_token or os.getenv('APIFY_API_TOKEN') - - if not _token: + def __init__(self, apify_api_token: str) -> None: + if not apify_api_token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, _token) + self._client = _create_apify_client(ApifyClient, apify_api_token) def run_actor( self, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index c43e4d1..43f6f83 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -20,17 +20,9 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: assert c._client is mock_apify_client -def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: - monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') - with 
patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): - c = ApifyToolsClient() - assert c._client is mock_apify_client - - -def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: - monkeypatch.delenv('APIFY_API_TOKEN', raising=False) +def test_init_empty_token_raises() -> None: with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyToolsClient() + ApifyToolsClient(apify_api_token='') # --------------------------------------------------------------------------- From 09b6c6e045b9b2815bfbc0be28527635e85c4d26 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:45:55 +0200 Subject: [PATCH 52/62] docs: add module-level docstring to tools.py --- langchain_apify/tools.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 4cd182d..385fc57 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -1,3 +1,21 @@ +"""LangChain tools for the Apify platform. + +All tools require an Apify API token. Set it via the ``APIFY_API_TOKEN`` +environment variable, or pass ``apify_api_token`` to the tool constructor: + +.. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({"actor_id": "apify/python-example"}) + +For details, see https://docs.apify.com/platform/integrations/langchain +""" + from __future__ import annotations import json From a5bd7cce8a178da607392b651f361692af5fb682 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 10:48:52 +0200 Subject: [PATCH 53/62] ref: rename model_post_init parameter to context --- langchain_apify/tools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 385fc57..46846f2 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -340,12 +340,12 @@ class _ApifyGenericTool(BaseTool): # type: ignore[override] _client: ApifyToolsClient = PrivateAttr() - def model_post_init(self, __context: Any) -> None: # noqa: ANN401 + def model_post_init(self, context: Any) -> None: # noqa: ANN401 if self.apify_api_token is None: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) - super().model_post_init(__context) + super().model_post_init(context) def _clamp_timeout(self, value: int) -> int: return max(1, min(value, self.max_timeout_secs)) From 23242c1a490b8ec64f03838cf97ac33412bd1a59 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 11:18:51 +0200 Subject: [PATCH 54/62] revert: restore env-fallback --- langchain_apify/_client.py | 19 ++++++++++++++----- tests/unit_tests/test_client.py | 12 ++++++++++-- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index cc1e4b8..77fe0dd 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -1,8 +1,11 @@ from __future__ import annotations +import os + import httpx from 
apify_client import ApifyClient from apify_client.errors import ApifyClientError +from pydantic import SecretStr from langchain_apify._error_messages import ( _ERROR_ACTOR_RUN_FAILED, @@ -28,17 +31,23 @@ class ApifyToolsClient: block until the Actor run finishes. Args: - apify_api_token: Apify API token. + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. Raises: - ValueError: If the token is empty. + ValueError: If no token is provided and the env var is not set. """ - def __init__(self, apify_api_token: str) -> None: - if not apify_api_token: + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + if isinstance(apify_api_token, SecretStr): + _token: str | None = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - self._client = _create_apify_client(ApifyClient, apify_api_token) + self._client = _create_apify_client(ApifyClient, _token) def run_actor( self, diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 43f6f83..c43e4d1 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -20,9 +20,17 @@ def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: assert c._client is mock_apify_client -def test_init_empty_token_raises() -> None: +def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) with pytest.raises(ValueError, match='APIFY_API_TOKEN'): - ApifyToolsClient(apify_api_token='') + 
ApifyToolsClient() # --------------------------------------------------------------------------- From df2a4123618c48d4e967fed81021cc0422077f83 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:41:32 +0200 Subject: [PATCH 55/62] fix: typo in social tool descriptions (halucinate to hallucinate) --- langchain_apify/_actor_tools.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 463bbf0..1ff0793 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -171,7 +171,7 @@ class ApifyInstagramScraperTool(_ApifyGenericTool): # type: ignore[override] ' Optional: max_results (int, default 20),' ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyInstagramScraperInput @@ -231,7 +231,7 @@ class ApifyLinkedInProfilePostsTool(_ApifyGenericTool): # type: ignore[override ' Required: profile_url (str - LinkedIn profile URL or username, e.g. "satyanadella").' ' Optional: max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfilePostsInput @@ -287,7 +287,7 @@ class ApifyLinkedInProfileSearchTool(_ApifyGenericTool): # type: ignore[overrid ' Required: query (str - search keywords).' ' Optional: max_results (int, default 10).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' 
+ ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfileSearchInput @@ -342,7 +342,7 @@ class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[overrid ' Required: profile_url (str - LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' ' Optional: include_email (bool, default False - include profile email if available).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyLinkedInProfileDetailInput @@ -403,7 +403,7 @@ class ApifyTwitterScraperTool(_ApifyGenericTool): # type: ignore[override] ' start (str - ISO date, only return tweets newer than this date),' ' end (str - ISO date, only return tweets older than this date).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyTwitterScraperInput @@ -467,7 +467,7 @@ class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] ' Optional: search_type (one of "search", "user", "hashtag"; default "search"),' ' max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyTikTokScraperInput @@ -528,7 +528,7 @@ class ApifyFacebookPostsScraperTool(_ApifyGenericTool): # type: ignore[override ' Optional: max_results (int, default 20),' ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' 
' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' - ' Use only the data returned; do not halucinate missing fields.' + ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyFacebookPostsScraperInput From 5ce667408f86ae324beb4e62bf22e9151210589b Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:44:51 +0200 Subject: [PATCH 56/62] test: parametrize RuntimeError to ToolException across all 7 social tools --- tests/unit_tests/test_actor_tools.py | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 580b1aa..08a42ae 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -328,3 +328,35 @@ def test_tool_returns_valid_json_for_empty_items(mock_tools_client: MagicMock) - def test_social_tool_handle_tool_error_enabled(tool_cls: type, mock_tools_client: MagicMock) -> None: tool = make_tool(tool_cls, mock_tools_client) assert tool.handle_tool_error is True + + +# --------------------------------------------------------------------------- +# Per-tool RuntimeError -> ToolException coverage +# --------------------------------------------------------------------------- + +# (tool_cls, client_method_name, _run kwargs) +_TOOL_INVOCATIONS: list[tuple[type, str, dict]] = [ + (ApifyInstagramScraperTool, 'instagram_scrape', {'search_type': 'user', 'search_query': 'apify'}), + (ApifyLinkedInProfilePostsTool, 'linkedin_profile_posts', {'profile_url': 'satyanadella'}), + (ApifyLinkedInProfileSearchTool, 'linkedin_profile_search', {'query': 'Founder'}), + (ApifyLinkedInProfileDetailTool, 'linkedin_profile_detail', {'profile_url': 'neal-mohan'}), + (ApifyTwitterScraperTool, 'twitter_scrape', {'search_query': 'apify'}), + (ApifyTikTokScraperTool, 'tiktok_scrape', {'search_query': 'cooking'}), + (ApifyFacebookPostsScraperTool, 
'facebook_posts_scrape', {'page_url': 'https://www.facebook.com/x/'}), +] + + +@pytest.mark.parametrize(('tool_cls', 'method_name', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_social_tool_runtime_error_raises_tool_exception( + tool_cls: type, + method_name: str, + run_kwargs: dict, + mock_tools_client: MagicMock, +) -> None: + getattr(mock_tools_client, method_name).side_effect = RuntimeError( + 'Actor run run-XYZ ended with status FAILED.', + ) + tool = make_tool(tool_cls, mock_tools_client) + + with pytest.raises(ToolException, match='run-XYZ'): + tool._run(**run_kwargs) From 832c4dff3465aed7e19d44bd76bf4515f5dc2cff Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:46:03 +0200 Subject: [PATCH 57/62] test: parametrize empty-dataset coverage across all 7 social tools --- tests/unit_tests/test_actor_tools.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 08a42ae..4db57c8 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -360,3 +360,25 @@ def test_social_tool_runtime_error_raises_tool_exception( with pytest.raises(ToolException, match='run-XYZ'): tool._run(**run_kwargs) + + +# --------------------------------------------------------------------------- +# Per-tool empty-dataset coverage +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize(('tool_cls', 'method_name', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_social_tool_returns_valid_json_for_empty_items( + tool_cls: type, + method_name: str, + run_kwargs: dict, + mock_tools_client: MagicMock, +) -> None: + getattr(mock_tools_client, method_name).return_value = (SUCCEEDED_RUN, []) + tool = make_tool(tool_cls, mock_tools_client) + + result = tool._run(**run_kwargs) + parsed = json.loads(result) + + assert parsed['items'] == [] + assert parsed['run'] == EXPECTED_RUN_META From 
928202242acf8f4c0eb45b8972c2c6caeab3d9da Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:48:50 +0200 Subject: [PATCH 58/62] feat: include statusMessage in actor-run-failed RuntimeError --- langchain_apify/_client.py | 2 ++ tests/unit_tests/test_client.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 65e6a8d..62f3595 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -550,4 +550,6 @@ def _check_run_status(run: dict) -> None: if status != _RUN_STATUS_SUCCEEDED: run_id = run.get('id', 'unknown') msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) + if status_message := run.get('statusMessage'): + msg = f'{msg} {status_message}' raise RuntimeError(msg) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index e5322c1..0fae0cb 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -219,6 +219,13 @@ def test_check_run_status_failed() -> None: ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) +def test_check_run_status_failed_includes_status_message() -> None: + with pytest.raises(RuntimeError, match='Actor exited out of memory'): + ApifyToolsClient._check_run_status( + {'id': 'run-oom', 'status': 'FAILED', 'statusMessage': 'Actor exited out of memory'}, + ) + + # --------------------------------------------------------------------------- # None returns from actor/task .call() # --------------------------------------------------------------------------- From 6f63431b4f14e5b0f9f436650f9f300055c8ad48 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Tue, 28 Apr 2026 15:49:51 +0200 Subject: [PATCH 59/62] fix: lint fix --- tests/unit_tests/test_client.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 0fae0cb..f1706a7 100644 --- a/tests/unit_tests/test_client.py +++ 
b/tests/unit_tests/test_client.py @@ -290,6 +290,7 @@ def test_run_task_and_get_items_dataset_fetch_network_error( with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): client.run_task_and_get_items('user/my-task') + def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError.""" mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK') @@ -297,6 +298,7 @@ def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_a with pytest.raises(AttributeError, match='bug in SDK'): client.run_actor('apify/test-actor') + # --------------------------------------------------------------------------- # instagram_scrape # --------------------------------------------------------------------------- From 238d803a44fd90315f96b7f62324904cd8f9ae34 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 30 Apr 2026 12:02:00 +0200 Subject: [PATCH 60/62] feat: add sort param to twitter tool --- langchain_apify/_actor_tools.py | 11 +++++++++-- langchain_apify/_client.py | 4 ++++ tests/unit_tests/test_actor_tools.py | 11 +++++++++++ tests/unit_tests/test_client.py | 9 +++++++++ 4 files changed, 33 insertions(+), 2 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 1ff0793..7c6d2f2 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -98,6 +98,10 @@ class ApifyTwitterScraperInput(BaseModel): default=None, description='Optional end date - only return tweets older than this date.', ) + sort: Literal['Latest', 'Top'] | None = Field( + default=None, + description='Optional sort order: "Latest" for most recent first, "Top" for most popular.', + ) class ApifyTikTokScraperInput(BaseModel): @@ -401,19 +405,21 @@ class ApifyTwitterScraperTool(_ApifyGenericTool): # type: ignore[override] ' Optional: search_mode (one of "search", 
"user", "replies"; default "search"),' ' max_results (int, default 20),' ' start (str - ISO date, only return tweets newer than this date),' - ' end (str - ISO date, only return tweets older than this date).' + ' end (str - ISO date, only return tweets older than this date),' + ' sort (one of "Latest", "Top" - sort order for results).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ' Use only the data returned; do not hallucinate missing fields.' ) args_schema: type[BaseModel] = ApifyTwitterScraperInput - def _run( + def _run( # noqa: PLR0913 self, search_query: str, search_mode: Literal['search', 'user', 'replies'] = 'search', max_results: int = 20, start: str | None = None, end: str | None = None, + sort: Literal['Latest', 'Top'] | None = None, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: @@ -423,6 +429,7 @@ def _run( max_results=self._clamp_items(max_results), start=start, end=end, + sort=sort, timeout_secs=self.max_timeout_secs, ) except (RuntimeError, ValueError) as exc: diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 62f3595..f076dea 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -405,6 +405,7 @@ def twitter_scrape( # noqa: PLR0913 max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, start: str | None = None, end: str | None = None, + sort: str | None = None, timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, ) -> tuple[dict, list[dict]]: """Scrape Twitter/X via ``apidojo/twitter-scraper-lite``. @@ -417,6 +418,7 @@ def twitter_scrape( # noqa: PLR0913 than this date. end: Optional ISO-8601 end date — only return tweets older than this date. + sort: Optional sort order. One of ``"Latest"`` or ``"Top"``. timeout_secs: Maximum time to wait for the run to finish. 
Returns: @@ -440,6 +442,8 @@ def twitter_scrape( # noqa: PLR0913 run_input['start'] = start if end is not None: run_input['end'] = end + if sort is not None: + run_input['sort'] = sort return self.run_actor_and_get_items( _TWITTER_ACTOR_ID, run_input=run_input, diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 4db57c8..9f0bb67 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -199,10 +199,21 @@ def test_twitter_tool_happy_path(mock_tools_client: MagicMock) -> None: max_results=20, start=None, end=None, + sort=None, timeout_secs=600, ) +def test_twitter_tool_passes_sort(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + tool._run(search_query='apify', sort='Top') + + kwargs = mock_tools_client.twitter_scrape.call_args.kwargs + assert kwargs['sort'] == 'Top' + + def test_twitter_tool_passes_date_range(mock_tools_client: MagicMock) -> None: mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, []) tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index f1706a7..8f3af8c 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -476,6 +476,15 @@ def test_twitter_scrape_passes_date_range(client: ApifyToolsClient, mock_apify_c assert call_kwargs['run_input']['end'] == '2025-02-01' +def test_twitter_scrape_passes_sort(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', sort='Top') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['sort'] == 'Top' + + def test_twitter_scrape_invalid_mode_raises(client: ApifyToolsClient) -> None: with pytest.raises(ValueError, match='Unsupported Twitter 
search_mode'): client.twitter_scrape('apify', search_mode='followers') From 350faa8db24ebe702aec2f8c3ddac3d16c175d40 Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 30 Apr 2026 12:04:01 +0200 Subject: [PATCH 61/62] feat: add older than param to facebook tool --- langchain_apify/_actor_tools.py | 12 +++++++++++- langchain_apify/_client.py | 5 +++++ tests/unit_tests/test_actor_tools.py | 10 ++++++++++ tests/unit_tests/test_client.py | 11 +++++++++++ 4 files changed, 37 insertions(+), 1 deletion(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index 7c6d2f2..da3bfa0 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -130,6 +130,13 @@ class ApifyFacebookPostsScraperInput(BaseModel): 'values like "1 day", "2 months", "3 years".' ), ) + only_posts_older_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' + ), + ) # --------------------------------------------------------------------------- @@ -533,7 +540,8 @@ class ApifyFacebookPostsScraperTool(_ApifyGenericTool): # type: ignore[override 'Scrape posts from a public Facebook page and return them as JSON.' ' Required: page_url (str - Facebook page URL; personal profiles are not supported).' ' Optional: max_results (int, default 20),' - ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' + ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week"),' + ' only_posts_older_than (str - date filter, e.g. "2025-01-01" or "1 week").' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ' Use only the data returned; do not hallucinate missing fields.' 
) @@ -544,6 +552,7 @@ def _run( page_url: str, max_results: int = 20, only_posts_newer_than: str | None = None, + only_posts_older_than: str | None = None, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: try: @@ -551,6 +560,7 @@ def _run( page_url=page_url, max_results=self._clamp_items(max_results), only_posts_newer_than=only_posts_newer_than, + only_posts_older_than=only_posts_older_than, timeout_secs=self.max_timeout_secs, ) except RuntimeError as exc: diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index f076dea..8a34fc6 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -495,6 +495,7 @@ def facebook_posts_scrape( page_url: str, max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, only_posts_newer_than: str | None = None, + only_posts_older_than: str | None = None, timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, ) -> tuple[dict, list[dict]]: """Scrape Facebook page posts via ``apify/facebook-posts-scraper``. @@ -504,6 +505,8 @@ def facebook_posts_scrape( max_results: Maximum number of posts to return. only_posts_newer_than: Optional date filter. Accepts ``YYYY-MM-DD``, ISO-8601, or relative (e.g. ``"1 day"``, ``"2 months"``). + only_posts_older_than: Optional date filter. Accepts ``YYYY-MM-DD``, + ISO-8601, or relative (e.g. ``"1 day"``, ``"2 months"``). timeout_secs: Maximum time to wait for the run to finish. 
Returns: @@ -518,6 +521,8 @@ def facebook_posts_scrape( } if only_posts_newer_than is not None: run_input['onlyPostsNewerThan'] = only_posts_newer_than + if only_posts_older_than is not None: + run_input['onlyPostsOlderThan'] = only_posts_older_than return self.run_actor_and_get_items( _FACEBOOK_ACTOR_ID, run_input=run_input, diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index 9f0bb67..cf6fc78 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -282,6 +282,7 @@ def test_facebook_tool_happy_path(mock_tools_client: MagicMock) -> None: page_url='https://www.facebook.com/humansofnewyork/', max_results=15, only_posts_newer_than=None, + only_posts_older_than=None, timeout_secs=600, ) @@ -295,6 +296,15 @@ def test_facebook_tool_passes_only_posts_newer_than(mock_tools_client: MagicMock assert mock_tools_client.facebook_posts_scrape.call_args.kwargs['only_posts_newer_than'] == '2025-01-01' +def test_facebook_tool_passes_only_posts_older_than(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + tool._run(page_url='https://www.facebook.com/humansofnewyork/', only_posts_older_than='2025-12-31') + + assert mock_tools_client.facebook_posts_scrape.call_args.kwargs['only_posts_older_than'] == '2025-12-31' + + def test_facebook_tool_runtime_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: mock_tools_client.facebook_posts_scrape.side_effect = RuntimeError('Network error') tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 8f3af8c..64a872b 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -559,6 +559,17 @@ def test_facebook_posts_scrape_passes_only_posts_newer_than( assert 
call_kwargs['run_input']['onlyPostsNewerThan'] == '2025-01-01' +def test_facebook_posts_scrape_passes_only_posts_older_than( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', only_posts_older_than='2025-12-31') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsOlderThan'] == '2025-12-31' + + # --------------------------------------------------------------------------- # Failed run propagates from social helpers # --------------------------------------------------------------------------- From 702891db5247212e127e68d8ebbbb0e86501369f Mon Sep 17 00:00:00 2001 From: David Omrai Date: Thu, 30 Apr 2026 12:10:27 +0200 Subject: [PATCH 62/62] feat: add search_type param to tiktok tool --- langchain_apify/_actor_tools.py | 15 ++++++++------- langchain_apify/_client.py | 11 ++++++++--- tests/unit_tests/test_actor_tools.py | 9 +++++++++ tests/unit_tests/test_client.py | 9 +++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py index da3bfa0..f7e7721 100644 --- a/langchain_apify/_actor_tools.py +++ b/langchain_apify/_actor_tools.py @@ -107,12 +107,13 @@ class ApifyTwitterScraperInput(BaseModel): class ApifyTikTokScraperInput(BaseModel): """Input schema for :class:`ApifyTikTokScraperTool`.""" - search_query: str = Field(description='Username, hashtag, or search keyword.') - search_type: Literal['search', 'user', 'hashtag'] = Field( + search_query: str = Field(description='Username, hashtag, search keyword, or TikTok post URL.') + search_type: Literal['search', 'user', 'hashtag', 'post'] = Field( default='search', description=( 'Type of content to scrape: "search" for keyword search, "user" for ' - 'a profile\'s videos, "hashtag" for videos under a tag.' 
+ 'a profile\'s videos, "hashtag" for videos under a tag, "post" for a ' + 'specific TikTok post URL.' ), ) max_results: int = Field(default=20, description='Maximum number of items to return.') @@ -476,9 +477,9 @@ class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] name: str = 'apify_tiktok_scraper' description: str = ( - 'Scrape TikTok by search keyword, profile, or hashtag and return the results as JSON.' - ' Required: search_query (str - keyword, username, or hashtag).' - ' Optional: search_type (one of "search", "user", "hashtag"; default "search"),' + 'Scrape TikTok by search keyword, profile, hashtag, or post URL and return the results as JSON.' + ' Required: search_query (str - keyword, username, hashtag, or TikTok post URL).' + ' Optional: search_type (one of "search", "user", "hashtag", "post"; default "search"),' ' max_results (int, default 20).' ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' ' Use only the data returned; do not hallucinate missing fields.' @@ -488,7 +489,7 @@ class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] def _run( self, search_query: str, - search_type: Literal['search', 'user', 'hashtag'] = 'search', + search_type: Literal['search', 'user', 'hashtag', 'post'] = 'search', max_results: int = 20, _run_manager: CallbackManagerForToolRun | None = None, ) -> str: diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py index 8a34fc6..54de8d6 100644 --- a/langchain_apify/_client.py +++ b/langchain_apify/_client.py @@ -461,8 +461,8 @@ def tiktok_scrape( """Scrape TikTok via ``clockworks/tiktok-scraper``. Args: - search_query: Username, hashtag, or search keyword. - search_type: One of ``"search"``, ``"user"``, ``"hashtag"``. + search_query: Username, hashtag, search keyword, or TikTok post URL. + search_type: One of ``"search"``, ``"user"``, ``"hashtag"``, ``"post"``. max_results: Maximum number of items to return. 
timeout_secs: Maximum time to wait for the run to finish. @@ -480,8 +480,13 @@ def tiktok_scrape( run_input['profiles'] = [search_query.lstrip('@')] elif search_type == 'hashtag': run_input['hashtags'] = [search_query.lstrip('#')] + elif search_type == 'post': + run_input['postURLs'] = [search_query] else: - msg = f"Unsupported TikTok search_type {search_type!r}. Expected one of: ['search', 'user', 'hashtag']." + msg = ( + f'Unsupported TikTok search_type {search_type!r}. ' + "Expected one of: ['search', 'user', 'hashtag', 'post']." + ) raise ValueError(msg) return self.run_actor_and_get_items( _TIKTOK_ACTOR_ID, diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py index cf6fc78..cf622fc 100644 --- a/tests/unit_tests/test_actor_tools.py +++ b/tests/unit_tests/test_actor_tools.py @@ -264,6 +264,15 @@ def test_tiktok_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: assert mock_tools_client.tiktok_scrape.call_args.kwargs['max_results'] == 4 +def test_tiktok_tool_passes_post_search_type(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client) + + tool._run(search_query='https://www.tiktok.com/@charlidamelio/video/123', search_type='post') + + assert mock_tools_client.tiktok_scrape.call_args.kwargs['search_type'] == 'post' + + # --------------------------------------------------------------------------- # ApifyFacebookPostsScraperTool # --------------------------------------------------------------------------- diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py index 64a872b..f0c0919 100644 --- a/tests/unit_tests/test_client.py +++ b/tests/unit_tests/test_client.py @@ -523,6 +523,15 @@ def test_tiktok_scrape_hashtag_mode_strips_hash(client: ApifyToolsClient, mock_a assert call_kwargs['run_input']['hashtags'] == ['fyp'] +def test_tiktok_scrape_post_mode_uses_post_urls(client: 
ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('https://www.tiktok.com/@charlidamelio/video/123', search_type='post') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['postURLs'] == ['https://www.tiktok.com/@charlidamelio/video/123'] + + def test_tiktok_scrape_invalid_type_raises(client: ApifyToolsClient) -> None: with pytest.raises(ValueError, match='Unsupported TikTok search_type'): client.tiktok_scrape('cooking', search_type='trending')