diff --git a/README.md b/README.md index e95b86a..25e54ee 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ pip install langchain-apify ## Prerequisites You should configure credentials by setting the following environment variables: -- `APIFY_API_TOKEN` - Apify API token +- `APIFY_TOKEN` - Apify API token Register your free Apify account [here](https://console.apify.com/sign-up) and learn how to get your API token in the [Apify documentation](https://docs.apify.com/platform/integrations/api). @@ -57,7 +57,7 @@ import json from langchain_apify import ApifyActorsTool os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY" -os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN" +os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN" browser = ApifyActorsTool('apify/rag-web-browser') search_results = browser.invoke(input={ @@ -92,7 +92,7 @@ Example usage for `ApifyDatasetLoader` with a custom dataset mapping function fo import os from langchain_apify import ApifyDatasetLoader -os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN" +os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN" # Example dataset structure # [ @@ -129,7 +129,7 @@ import os from langchain_apify import ApifyWrapper from langchain_core.documents import Document -os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN" +os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN" apify = ApifyWrapper() diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 66142be..7d0dfa9 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,19 +1,56 @@ +from __future__ import annotations + from importlib import metadata +from typing import TYPE_CHECKING from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.tools import ApifyActorsTool +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) from langchain_apify.wrappers import ApifyWrapper +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: - # Case where package metadata is not available. __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) +# Convenience tool-class lists for selective agent binding. +# Binding all tools at once overwhelms the LLM context window; +# pick the group(s) relevant to your use case. + +APIFY_CORE_TOOLS: list[type[BaseTool]] = [ + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, +] + __all__ = [ + # Existing components (backward-compatible) 'ApifyActorsTool', 'ApifyDatasetLoader', 'ApifyWrapper', + # Core generic tools + 'ApifyGetDatasetItemsTool', + 'ApifyRunActorAndGetDatasetTool', + 'ApifyRunActorTool', + 'ApifyRunTaskAndGetDatasetTool', + 'ApifyRunTaskTool', + 'ApifyScrapeUrlTool', + # Tool group lists + 'APIFY_CORE_TOOLS', + # Meta '__version__', ] diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py new file mode 100644 index 0000000..e8d6b2b --- /dev/null +++ b/langchain_apify/_client.py @@ -0,0 +1,256 @@ +from __future__ import annotations + +import httpx +from apify_client import ApifyClient +from apify_client.errors import ApifyClientError +from pydantic import SecretStr + +from langchain_apify._error_messages import ( + _ERROR_ACTOR_RUN_FAILED, + _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + _ERROR_SCRAPE_EMPTY, +) +from langchain_apify._utils import _create_apify_client, _resolve_apify_token + +# Only catches ApifyClientError and httpx.HTTPError. Other errors propagate. +_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError) + +_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_DEFAULT_RUN_TIMEOUT_SECS = 300 +_DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_DATASET_ITEMS_LIMIT = 100 +_RUN_STATUS_SUCCEEDED = 'SUCCEEDED' + + +class ApifyToolsClient: + """Internal helper that wraps ``ApifyClient`` for the tools layer. + + One convenience method per tool operation. All methods are synchronous and + block until the Actor run finishes. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable (or ``APIFY_API_TOKEN`` for backwards + compatibility) when *None*. + + Raises: + ValueError: If no token is provided and the env var is not set. + """ + + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + if isinstance(apify_api_token, SecretStr): + _token: str | None = apify_api_token.get_secret_value() + else: + _token = apify_api_token or _resolve_apify_token() + + if not _token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = _create_apify_client(ApifyClient, _token) + + def run_actor( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start an Actor and block until it finishes. + + Args: + actor_id: Actor ID or name (e.g. ``"apify/python-example"``). + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.actor(actor_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify Actor call failed for {actor_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Actor {actor_id} call returned no run details.' + raise RuntimeError(msg) + self._check_run_status(run) + return run + + def get_dataset_items( + self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0 + ) -> list[dict]: + """Fetch items from an existing dataset. + + Args: + dataset_id: Apify dataset ID. + limit: Maximum number of items to return. + offset: Number of items to skip from the start. + + Returns: + List of dataset item dicts (may be empty). + """ + try: + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + + def run_actor_and_get_items( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run an Actor, then fetch items from its default dataset. + + Args: + actor_id: Actor ID or name. + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def run_task( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start a saved Actor task and block until it finishes. + + Args: + task_id: Task ID or name (e.g. ``"user/my-task"``). + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.task(task_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify task call failed for {task_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Task {task_id} call returned no run details.' + raise RuntimeError(msg) + self._check_run_status(run) + return run + + def run_task_and_get_items( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run a saved Actor task, then fetch items from its default dataset. + + Args: + task_id: Task ID or name. + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Task {task_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: + """Scrape a single URL and return its content as markdown. + + Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. + + Args: + url: The URL to scrape. + timeout_secs: Maximum time to wait for the crawl to finish. + + Returns: + Markdown (or plain-text fallback) content of the page. + + Raises: + RuntimeError: If the Actor run fails or no content is extracted. + """ + run_input = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': 1, + } + _, items = self.run_actor_and_get_items( + _SCRAPE_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + if not items: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + + content = items[0].get('markdown') or items[0].get('text') or '' + if not content: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + return content + + def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: + """Fetch dataset items, wrapping any network error in a RuntimeError.""" + try: + return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + + @staticmethod + def _check_run_status(run: dict) -> None: + """Raise if the run did not succeed.""" + status = run.get('status') + if status != _RUN_STATUS_SUCCEEDED: + run_id = run.get('id', 'unknown') + msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) + raise RuntimeError(msg) diff --git a/langchain_apify/_error_messages.py b/langchain_apify/_error_messages.py new file mode 100644 index 0000000..eb2e605 --- /dev/null +++ b/langchain_apify/_error_messages.py @@ -0,0 +1,11 @@ +_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( + 'APIFY_TOKEN environment variable is not set.' + ' Please set it to your Apify API token by using `os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN"`' + ' in your code or pass it as environment variable.' + ' To pass it as environment variable, you can use the following command:' + ' `APIFY_TOKEN="YOUR_APIFY_TOKEN" python your_script.py`' +) + +_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' + +_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' diff --git a/langchain_apify/utils.py b/langchain_apify/_utils.py similarity index 74% rename from langchain_apify/utils.py rename to langchain_apify/_utils.py index 8cdc835..7e18931 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/_utils.py @@ -1,24 +1,42 @@ from __future__ import annotations +import os import string from typing import TypeVar import requests from apify_client import ApifyClientAsync from apify_client.client import ApifyClient +from pydantic import SecretStr -from langchain_apify.const import MAX_DESCRIPTION_LEN, REQUESTS_TIMEOUT_SECS +_MAX_DESCRIPTION_LEN: int = 350 +_REQUESTS_TIMEOUT_SECS: float = 10.0 +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' -APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' +def _resolve_apify_token() -> str | None: + """Resolve the Apify API token from environment variables. -def prune_actor_input_schema( + ``APIFY_TOKEN`` (SDK-standard) takes precedence; ``APIFY_API_TOKEN`` is + kept as a fallback for backwards compatibility with this package's + historical naming. + """ + return os.getenv('APIFY_TOKEN') or os.getenv('APIFY_API_TOKEN') + + +def _apify_token_secret_factory() -> SecretStr | None: + """Pydantic ``default_factory`` returning the resolved token as ``SecretStr``.""" + token = _resolve_apify_token() + return SecretStr(token) if token else None + + +def _prune_actor_input_schema( input_schema: dict, - max_description_len: int = MAX_DESCRIPTION_LEN, + max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. - Trim the description to 250 characters. + Trim descriptions to ``_MAX_DESCRIPTION_LEN`` characters. Args: input_schema (dict): The input schema from the Actor build. @@ -48,7 +66,7 @@ def prune_actor_input_schema( T = TypeVar('T', ApifyClient, ApifyClientAsync) -def create_apify_client(client_cls: type[T], token: str) -> T: +def _create_apify_client(client_cls: type[T], token: str) -> T: """Create an Apify client instance with a custom user-agent. Args: @@ -79,7 +97,7 @@ def create_apify_client(client_cls: type[T], token: str) -> T: return client -def actor_id_to_tool_name(actor_id: str) -> str: +def _actor_id_to_tool_name(actor_id: str) -> str: """Turn actor_id into a valid tool name. Tool name must only contain letters, numbers, underscores, dashes, @@ -95,7 +113,7 @@ def actor_id_to_tool_name(actor_id: str) -> str: return 'apify_actor_' + ''.join(char if char in valid_chars else '_' for char in actor_id) -def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: +def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: """Get the latest build of an Actor from the default build tag. Args: @@ -117,8 +135,8 @@ def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: msg = f'Failed to get the Actor object ID for {actor_id}.' raise ValueError(msg) - url = APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) - response = requests.request('GET', url, timeout=REQUESTS_TIMEOUT_SECS) + url = _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) + response = requests.request('GET', url, timeout=_REQUESTS_TIMEOUT_SECS) build = response.json() if not isinstance(build, dict): diff --git a/langchain_apify/const.py b/langchain_apify/const.py deleted file mode 100644 index 87e0d0e..0000000 --- a/langchain_apify/const.py +++ /dev/null @@ -1,2 +0,0 @@ -REQUESTS_TIMEOUT_SECS: float = 10.0 -MAX_DESCRIPTION_LEN: int = 350 diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 49befb6..658e21a 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -1,16 +1,15 @@ from __future__ import annotations -import os from collections.abc import Callable from typing import TYPE_CHECKING, Any from apify_client import ApifyClient from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document # noqa: TCH002 -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator -from langchain_apify.utils import create_apify_client +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _apify_token_secret_factory, _create_apify_client if TYPE_CHECKING: from collections.abc import Iterator @@ -19,7 +18,7 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): """Load datasets from Apify web scraping, crawling, and data extraction platform. - To use, you should have the environment variable `APIFY_API_TOKEN` set + To use, you should have the environment variable `APIFY_TOKEN` set with your API key, or pass `apify_api_token` as a named parameter to the constructor. @@ -40,10 +39,15 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): documents = loader.load() """ - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - """An instance of the ApifyClient class from the apify-client Python package.""" + apify_api_token: SecretStr | None = Field( + default_factory=_apify_token_secret_factory, + description='Apify API token. Falls back to the APIFY_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -54,7 +58,7 @@ def __init__( self, dataset_id: str, dataset_mapping_function: Callable[[dict], Document], - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, ) -> None: """Initialize the loader with an Apify dataset ID and a mapping function. @@ -63,34 +67,34 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. + apify_api_token (str | SecretStr): Apify API token. Falls back to the + ``APIFY_TOKEN`` environment variable when *None*. """ - super().__init__( - dataset_id=dataset_id, - dataset_mapping_function=dataset_mapping_function, - apify_api_token=apify_api_token, - ) - - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Args: - values (dict): The values to validate. + init_kwargs: dict[str, Any] = { + 'dataset_id': dataset_id, + 'dataset_mapping_function': dataset_mapping_function, + } + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + init_kwargs['apify_api_token'] = apify_api_token + super().__init__(**init_kwargs) + + @model_validator(mode='after') + def _init_client(self) -> ApifyDatasetLoader: + """Validate the resolved Apify token and initialise the client. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - # when running at Apify platform, use APIFY_TOKEN environment variable - apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - - client = create_apify_client(ApifyClient, apify_api_token) + ApifyDatasetLoader: The validated loader instance. - values['apify_client'] = client - - return values + Raises: + ValueError: If no token is available from any source. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self.apify_client = _create_apify_client(ApifyClient, self.apify_api_token.get_secret_value()) + return self def load(self) -> list[Document]: """Load documents. diff --git a/langchain_apify/error_messages.py b/langchain_apify/error_messages.py deleted file mode 100644 index 87462b8..0000000 --- a/langchain_apify/error_messages.py +++ /dev/null @@ -1,7 +0,0 @@ -ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( - 'APIFY_API_TOKEN environment variable is not set.' - ' Please set it to your Apify API token by using `os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"' - ' in your code or pass it as environment variable.' - ' To pass it as environment variable, you can use the following command:' - ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' -) diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 135314a..710ee45 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -1,23 +1,44 @@ +"""LangChain tools for the Apify platform. + +All tools require an Apify API token. Set it via the ``APIFY_TOKEN`` +environment variable, or pass ``apify_api_token`` to the tool constructor: + +.. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({"actor_id": "apify/python-example"}) + +For details, see https://docs.apify.com/platform/integrations/langchain +""" + from __future__ import annotations +import bisect import json -import os +from datetime import datetime from typing import TYPE_CHECKING, Any from apify_client import ApifyClient -from langchain_core.tools import BaseTool -from pydantic import BaseModel, Field, create_model - -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import ( - actor_id_to_tool_name, - create_apify_client, - get_actor_latest_build, - prune_actor_input_schema, +from langchain_core.tools import BaseTool, ToolException +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model + +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import ( + _MAX_DESCRIPTION_LEN, + _actor_id_to_tool_name, + _apify_token_secret_factory, + _create_apify_client, + _get_actor_latest_build, + _prune_actor_input_schema, + _resolve_apify_token, ) -from .const import MAX_DESCRIPTION_LEN - if TYPE_CHECKING: from langchain_core.callbacks import ( CallbackManagerForToolRun, @@ -27,7 +48,7 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] """Tool that runs Apify Actors. - To use, you should have the environment variable `APIFY_API_TOKEN` set + To use, you should have the environment variable `APIFY_TOKEN` set with your API key, or pass `apify_api_token` as a named parameter to the constructor. @@ -56,10 +77,13 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] chunk["messages"][-1].pretty_print() """ + _apify_client: ApifyClient = PrivateAttr() + _actor_id: str = PrivateAttr() + def __init__( self, actor_id: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: @@ -72,18 +96,22 @@ def __init__( **kwargs: Additional keyword arguments. Raises: - ValueError: If the `APIFY_API_TOKEN` environment variable is not set + ValueError: If the `APIFY_TOKEN` environment variable is not set """ - apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not apify_api_token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + _raw_token: str | None = ( + apify_api_token.get_secret_value() + if isinstance(apify_api_token, SecretStr) + else apify_api_token or _resolve_apify_token() + ) + if not _raw_token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, _raw_token) kwargs.update( { - 'name': actor_id_to_tool_name(actor_id), + 'name': _actor_id_to_tool_name(actor_id), 'description': self._create_description(apify_client, actor_id), 'args_schema': self._build_tool_args_schema_model( apify_client, @@ -126,10 +154,10 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: Returns: str: The description. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') - if len(actor_description) > MAX_DESCRIPTION_LEN: - actor_description = actor_description[:MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' + if len(actor_description) > _MAX_DESCRIPTION_LEN: + actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' return actor_description @staticmethod @@ -149,12 +177,12 @@ def _build_tool_args_schema_model( Raises: ValueError: If the input schema is not found in the Actor build. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) if not (actor_input := build.get('actorDefinition', {}).get('input')): msg = f'Input schema not found in the Actor build for Actor: {actor_id}' raise ValueError(msg) - properties, required = prune_actor_input_schema(actor_input) + properties, required = _prune_actor_input_schema(actor_input) properties = {'run_input': properties} description = ( @@ -192,3 +220,502 @@ def _run_actor(self, run_input: dict) -> list[dict]: run = self._apify_client.run(run_id=run_id) return run.dataset().list_items(clean=True).items + + +# --------------------------------------------------------------------------- +# Input schemas for the generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorInput(BaseModel): + """Input schema for :class:`ApifyRunActorTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + + +class ApifyGetDatasetItemsInput(BaseModel): + """Input schema for :class:`ApifyGetDatasetItemsTool`.""" + + dataset_id: str = Field(description='Apify dataset ID.') + limit: int = Field(default=100, description='Maximum number of items to return.') + offset: int = Field(default=0, description='Number of items to skip from the start.') + + +class ApifyRunActorAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetDatasetTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +class ApifyScrapeUrlInput(BaseModel): + """Input schema for :class:`ApifyScrapeUrlTool`.""" + + url: str = Field(description='The URL to scrape.') + timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') + + +class ApifyRunTaskInput(BaseModel): + """Input schema for :class:`ApifyRunTaskTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) + + +class ApifyRunTaskAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetDatasetTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _iso(value: str | datetime | None) -> str | None: + """Coerce a possible ``datetime`` to an ISO-8601 string.""" + if isinstance(value, datetime): + return value.isoformat() + return value + + +def _run_meta(run: dict) -> dict: + """Extract a compact metadata dict from an Apify run-details dict.""" + return { + 'run_id': run.get('id'), + 'status': run.get('status'), + 'dataset_id': run.get('defaultDatasetId'), + 'started_at': _iso(run.get('startedAt')), + 'finished_at': _iso(run.get('finishedAt')), + } + + +# Apify accepts memory_mbytes only as one of these power-of-2 values. +# https://docs.apify.com/api/v2/act-runs-post +_VALID_MEMORY_MBYTES: tuple[int, ...] = (128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768) + + +# --------------------------------------------------------------------------- +# Shared base for generic tools +# --------------------------------------------------------------------------- + + +class _ApifyGenericTool(BaseTool): # type: ignore[override] + """Shared base for all generic Apify tools. + + Handles ``ApifyToolsClient`` creation, sets ``handle_tool_error``, + and defines developer-controlled safety limits that clamp values the + LLM may provide at invocation time. + + Subclasses only need to declare ``name``, ``description``, + ``args_schema``, and ``_run()``. + """ + + handle_tool_error: bool = True + + apify_api_token: SecretStr | None = Field( + default_factory=_apify_token_secret_factory, + description='Apify API token. Falls back to the APIFY_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') + max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') + max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') + + _client: ApifyToolsClient = PrivateAttr() + + def model_post_init(self, context: Any) -> None: # noqa: ANN401 + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) + super().model_post_init(context) + + def _clamp_timeout(self, value: int) -> int: + return max(1, min(value, self.max_timeout_secs)) + + def _clamp_memory(self, value: int | None) -> int | None: + # Clamp positive values to [128, max_memory_mbytes] and snap up to next valid Apify power-of-2. Non-positive uses default. + + if value is None or value <= 0: + return None + clamped = max(128, min(value, self.max_memory_mbytes)) + idx = bisect.bisect_left(_VALID_MEMORY_MBYTES, clamped) + # If snap-up exceeds cap, use largest valid at-or-below cap + if idx >= len(_VALID_MEMORY_MBYTES) or _VALID_MEMORY_MBYTES[idx] > self.max_memory_mbytes: + idx = bisect.bisect_right(_VALID_MEMORY_MBYTES, self.max_memory_mbytes) - 1 + # Misconfigured cap below the platform minimum, return the minimum. + return _VALID_MEMORY_MBYTES[max(idx, 0)] + + def _clamp_items(self, value: int) -> int: + return max(1, min(value, self.max_items)) + + +# --------------------------------------------------------------------------- +# Generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorTool(_ApifyGenericTool): # type: ignore[override] + """Run any Apify Actor by ID with an arbitrary JSON input. + + Returns run metadata (run ID, status, dataset ID, timestamps) as a JSON + string. Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve the + results from the dataset. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor' + description: str = ( + 'Run an Apify Actor synchronously and return run metadata as a JSON string.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' + ) + args_schema: type[BaseModel] = ApifyRunActorInput + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_actor( + actor_id, run_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyGetDatasetItemsTool(_ApifyGenericTool): # type: ignore[override] + """Fetch items from an existing Apify dataset by ID. + + Returns a JSON object with an ``"items"`` key containing the list of item + dicts. When the dataset is empty an additional ``"message"`` key is + included. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON object ``{"items": [...]}``; includes ``"message"`` when empty. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyGetDatasetItemsTool + + tool = ApifyGetDatasetItemsTool() + result = tool.invoke({"dataset_id": "abc123", "limit": 10}) + """ + + name: str = 'apify_get_dataset_items' + description: str = ( + 'Fetch items from an Apify dataset by ID. Returns a JSON object with an "items" array.' + ' Required: dataset_id (str) — Apify dataset ID.' + ' Optional: limit (int, default 100), offset (int, default 0).' + ) + args_schema: type[BaseModel] = ApifyGetDatasetItemsInput + + def _run( + self, + dataset_id: str, + limit: int = 100, + offset: int = 0, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + items = self._client.get_dataset_items(dataset_id, self._clamp_items(limit), offset) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + if not items: + return json.dumps({'items': [], 'message': f'Dataset {dataset_id} is empty.'}) + return json.dumps({'items': items}) + + +class ApifyRunActorAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] + """Run any Apify Actor and return both run metadata and dataset items. + + Combines :class:`ApifyRunActorTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRunActorAndGetDatasetTool + + tool = ApifyRunActorAndGetDatasetTool() + result = tool.invoke({ + "actor_id": "apify/python-example", + "run_input": {"first_number": 2, "second_number": 3}, + }) + """ + + name: str = 'apify_run_actor_and_get_dataset' + description: str = ( + 'Run an Apify Actor synchronously and return both run metadata and dataset items.' + ' Required: actor_id (str) — Actor ID or name (e.g. "apify/python-example").' + ' Optional: run_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' + ) + args_schema: type[BaseModel] = ApifyRunActorAndGetDatasetInput + + def _run( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_actor_and_get_items( + actor_id, + run_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyScrapeUrlTool(_ApifyGenericTool): # type: ignore[override] + """Scrape a single URL and return its content as markdown. + + Uses the ``apify/website-content-crawler`` Actor under the hood with + ``maxCrawlPages=1``. Returns the page content as a plain markdown string + (not JSON). + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + Markdown string with the full text content of the scraped page, or a + plain-text fallback when markdown is unavailable. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyScrapeUrlTool + + tool = ApifyScrapeUrlTool() + markdown = tool.invoke({"url": "https://apify.com"}) + """ + + name: str = 'apify_scrape_url' + description: str = ( + 'Scrape a single URL using Apify and return its full content as a markdown string.' + ' Required: url (str) — the URL to scrape.' + ' Optional: timeout_secs (int, default 120).' + ' Returns the page content as markdown (or plain text if markdown is unavailable).' + ) + args_schema: type[BaseModel] = ApifyScrapeUrlInput + + def _run( + self, + url: str, + timeout_secs: int = 120, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + return self._client.scrape_url(url, self._clamp_timeout(timeout_secs)) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + + +class ApifyRunTaskTool(_ApifyGenericTool): # type: ignore[override] + """Run a saved Apify Actor task by ID and return run metadata. + + Actor tasks are pre-configured Actor runs saved in the Apify Console. + This tool starts a task with optional input overrides and returns run + metadata (run ID, status, dataset ID, timestamps) as a JSON string. + Use :class:`ApifyGetDatasetItemsTool` afterwards to retrieve results. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with keys ``run_id``, ``status``, ``dataset_id``, + ``started_at``, and ``finished_at``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRunTaskTool + + tool = ApifyRunTaskTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task' + description: str = ( + 'Run a saved Apify Actor task synchronously and return run metadata as a JSON string.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' + ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null).' + ' Returns JSON with keys: run_id, status, dataset_id, started_at, finished_at.' + ' Use apify_get_dataset_items with the returned dataset_id to fetch results.' + ) + args_schema: type[BaseModel] = ApifyRunTaskInput + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_task( + task_id, task_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] + """Run a saved Apify Actor task and return both run metadata and dataset items. + + Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_TOKEN"] = "your-apify-token" + + from langchain_apify import ApifyRunTaskAndGetDatasetTool + + tool = ApifyRunTaskAndGetDatasetTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task_and_get_dataset' + description: str = ( + 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' + ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' + ) + args_schema: type[BaseModel] = ApifyRunTaskAndGetDatasetInput + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_task_and_get_items( + task_id, + task_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index ef17873..b78f330 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -4,11 +4,11 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient, ApifyClientAsync -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _apify_token_secret_factory, _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.utils import create_apify_client if TYPE_CHECKING: from collections.abc import Callable @@ -19,7 +19,7 @@ class ApifyWrapper(BaseModel): """Wrapper around Apify client for LangChain. - To use, you should have the environment variable `APIFY_API_TOKEN` set + To use, you should have the environment variable `APIFY_TOKEN` set with your API key, or pass `apify_api_token` as a named parameter to the constructor. @@ -51,49 +51,54 @@ class ApifyWrapper(BaseModel): """ # allow arbitrary types in the model config for the apify client fields - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - apify_client_async: ApifyClientAsync - apify_api_token: str | None = None + apify_api_token: SecretStr | None = Field( + default_factory=_apify_token_secret_factory, + description='Apify API token. Falls back to the APIFY_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) # type: ignore[assignment] def __init__( self, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: - """Initialize the loader with an Apify dataset ID and a mapping function. + """Initialise the wrapper. Args: - dataset_id (str): The ID of the dataset on the Apify platform. - dataset_mapping_function (Callable): A function that takes a single - dictionary (an Apify dataset item) and converts it to an instance - of the Document class. - apify_api_token (Optional[str]): Apify API token. - *args: Any: Additional positional arguments. - **kwargs: Any: Additional keyword arguments. + apify_api_token (Optional[str | SecretStr]): Apify API token. Falls + back to the ``APIFY_TOKEN`` environment variable when *None*. + *args: Any: Additional positional arguments forwarded to Pydantic. + **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ - kwargs.update({'apify_api_token': apify_api_token}) + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + kwargs['apify_api_token'] = apify_api_token super().__init__(*args, **kwargs) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Validate that an Apify API token is set and the apify-client - Python package exists in the current environment. + @model_validator(mode='after') + def _init_clients(self) -> ApifyWrapper: + """Validate the token and initialise both sync and async Apify clients. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') + ApifyWrapper: The validated wrapper instance. - values['apify_client'] = create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = create_apify_client(ApifyClientAsync, apify_api_token) - - return values + Raises: + ValueError: If no token is provided and APIFY_TOKEN is not set. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + token = self.apify_api_token.get_secret_value() + self.apify_client = _create_apify_client(ApifyClient, token) + self.apify_client_async = _create_apify_client(ApifyClientAsync, token) + return self def call_actor( # noqa: PLR0913 self, diff --git a/llms.txt b/llms.txt index a011fa6..a39a99a 100644 --- a/llms.txt +++ b/llms.txt @@ -10,11 +10,11 @@ To install the package, use pip: pip install langchain-apify ``` -Ensure you have set the `APIFY_API_TOKEN` environment variable with your Apify API token. +Ensure you have set the `APIFY_TOKEN` environment variable with your Apify API token. ```python import os -os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN" +os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN" ``` ## Key Imports @@ -110,5 +110,5 @@ documents = loader.load() **Note:** - This document assumes you're familiar with Python and LangChain basics. -- Adjust the `YOUR_APIFY_API_TOKEN` placeholder with your actual token or follow the instructions for setting environment variables. +- Adjust the `YOUR_APIFY_TOKEN` placeholder with your actual token or follow the instructions for setting environment variables. - The `dataset_mapping_function` is crucial for shaping the data into `Document` objects that LangChain can understand. Adjust based on the structure of your data. diff --git a/tests/integration_tests/test_document_loaders.py b/tests/integration_tests/test_document_loaders.py index 2fa2b2c..674eeba 100644 --- a/tests/integration_tests/test_document_loaders.py +++ b/tests/integration_tests/test_document_loaders.py @@ -1,10 +1,10 @@ -import os from collections.abc import Iterator from apify_client import ApifyClient from langchain_core.documents import Document from langchain_apify import ApifyDatasetLoader +from langchain_apify._utils import _resolve_apify_token def test_apify_dataset_loader_load() -> None: @@ -13,7 +13,7 @@ def test_apify_dataset_loader_load() -> None: Creates a new dataset, pushes items to it, and then loads the items using the loader. """ - token = os.getenv('APIFY_API_TOKEN') + token = _resolve_apify_token() client = ApifyClient(token=token) dataset_name = 'langchain-test-apify-dataset-loader-load' @@ -53,7 +53,7 @@ def test_apify_dataset_loader_lazy_load() -> None: Creates a new dataset, pushes items to it, and then loads the items using the loader. """ - token = os.getenv('APIFY_API_TOKEN') + token = _resolve_apify_token() client = ApifyClient(token=token) dataset_name = 'langchain-test-apify-dataset-loader-lazy-load' diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py new file mode 100644 index 0000000..60a9dde --- /dev/null +++ b/tests/integration_tests/test_generic_tools.py @@ -0,0 +1,96 @@ +"""Integration smoke tests for the generic Apify tools. + +These tests hit the real Apify API and require the ``APIFY_TOKEN`` +environment variable to be set (``APIFY_API_TOKEN`` is also accepted for +backwards compatibility). They use ``apify/python-example`` (a trivial +Actor that adds two numbers) to keep execution fast and cheap. +""" + +from __future__ import annotations + +import json +import os + +import pytest + +from langchain_apify import ( + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) +from langchain_apify._utils import _resolve_apify_token + +_ACTOR_ID = 'apify/python-example' +_RUN_INPUT = {'first_number': 2, 'second_number': 3} + +pytestmark = pytest.mark.skipif( + not _resolve_apify_token(), + reason='APIFY_TOKEN not set', +) + + +def test_run_actor_tool_smoke() -> None: + tool = ApifyRunActorTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +def test_get_dataset_items_tool_smoke() -> None: + run_tool = ApifyRunActorTool() + run_result = json.loads(run_tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT})) + dataset_id = run_result['dataset_id'] + + items_tool = ApifyGetDatasetItemsTool() + result = items_tool.invoke({'dataset_id': dataset_id, 'limit': 10}) + + parsed = json.loads(result) + assert 'items' in parsed + assert isinstance(parsed['items'], list) + + +def test_run_actor_and_get_items_tool_smoke() -> None: + tool = ApifyRunActorAndGetDatasetTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) + + +def test_scrape_url_tool_smoke() -> None: + tool = ApifyScrapeUrlTool() + result = tool.invoke({'url': 'https://crawlee.dev'}) + + assert isinstance(result, str) + assert len(result) > 0 + + +_TASK_ID = os.getenv('APIFY_TASK_ID', '') + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_tool_smoke() -> None: + tool = ApifyRunTaskTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_and_get_items_tool_smoke() -> None: + tool = ApifyRunTaskAndGetDatasetTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 1107c7a..28ae02c 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -1,24 +1,22 @@ -import os - from apify_client.client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import create_apify_client, get_actor_latest_build +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client, _get_actor_latest_build, _resolve_apify_token def test_get_actor_latest_build() -> None: """Tests the get_actor_latest_build function. Raises: - ValueError: If the APIFY_API_TOKEN environment variable is not set. + ValueError: If the APIFY_TOKEN environment variable is not set. """ - if (token := os.getenv('APIFY_API_TOKEN')) is None: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + if (token := _resolve_apify_token()) is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, token) + apify_client = _create_apify_client(ApifyClient, token) - build = get_actor_latest_build(apify_client, 'apify/rag-web-browser') + build = _get_actor_latest_build(apify_client, 'apify/rag-web-browser') assert isinstance(build, dict) assert 'id' in build diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..3384e79 --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient + +SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +def make_tool(tool_cls: type, mock_client: MagicMock, **kwargs: Any) -> Any: # noqa: ANN401 + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token', **kwargs) + tool._client = mock_client + return tool diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py new file mode 100644 index 0000000..0f1d16b --- /dev/null +++ b/tests/unit_tests/test_client.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import httpx +import pytest +from apify_client import ApifyClient + +from langchain_apify._client import ApifyToolsClient +from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create: + c = ApifyToolsClient(apify_api_token='my-token') + mock_create.assert_called_once() + assert c._client is mock_apify_client + + +def test_init_with_apify_token_env(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + """``APIFY_TOKEN`` (SDK-standard) should be picked up when set.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'sdk-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_with_legacy_apify_api_token_env( + monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock +) -> None: + """``APIFY_API_TOKEN`` is still honoured for backwards compatibility.""" + monkeypatch.delenv('APIFY_TOKEN', raising=False) + monkeypatch.setenv('APIFY_API_TOKEN', 'legacy-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_apify_token_takes_precedence( + monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock +) -> None: + """When both env vars are set, ``APIFY_TOKEN`` wins over ``APIFY_API_TOKEN``.""" + monkeypatch.setenv('APIFY_API_TOKEN', 'legacy-token') + monkeypatch.setenv('APIFY_TOKEN', 'sdk-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create: + ApifyToolsClient() + mock_create.assert_called_once_with(ApifyClient, 'sdk-token') + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyToolsClient() + + +# --------------------------------------------------------------------------- +# run_actor +# --------------------------------------------------------------------------- + + +def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + + result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) + + mock_apify_client.actor.assert_called_once_with('apify/test-actor') + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300, logger=None + ) + assert result == SUCCEEDED_RUN + + +def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + + client.run_actor('apify/test-actor', memory_mbytes=512) + + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input=None, timeout_secs=300, logger=None, memory_mbytes=512 + ) + + +def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_actor('apify/test-actor') + + +# --------------------------------------------------------------------------- +# get_dataset_items +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) + + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) + assert items == SAMPLE_ITEMS + + +def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.get_dataset_items('dataset-empty') + assert items == [] + + +# --------------------------------------------------------------------------- +# run_actor_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) + + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + + +def test_run_actor_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_actor_and_get_items('apify/test-actor') + + +# --------------------------------------------------------------------------- +# run_task +# --------------------------------------------------------------------------- + + +def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + + result = client.run_task('user/my-task', task_input={'key': 'val'}) + + mock_apify_client.task.assert_called_once_with('user/my-task') + mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) + assert result == SUCCEEDED_RUN + + +def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_task('user/my-task') + + +# --------------------------------------------------------------------------- +# run_task_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.run_task_and_get_items('user/my-task') + + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_run_task_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_task_and_get_items('user/my-task') + + +# --------------------------------------------------------------------------- +# scrape_url +# --------------------------------------------------------------------------- + + +def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == '# Hello' + + +def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'text': 'Plain text content', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == 'Plain text content' + + +def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '', 'text': '', 'url': 'https://example.com'}, + ] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +# --------------------------------------------------------------------------- +# _check_run_status +# --------------------------------------------------------------------------- + + +def test_check_run_status_succeeded() -> None: + ApifyToolsClient._check_run_status({'id': 'run-ok', 'status': 'SUCCEEDED'}) + + +def test_check_run_status_failed() -> None: + with pytest.raises(RuntimeError, match='run-bad'): + ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) + + +# --------------------------------------------------------------------------- +# None returns from actor/task .call() +# --------------------------------------------------------------------------- + + +def test_run_actor_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_actor('apify/broken-actor') + + +def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_task('user/broken-task') + + +# --------------------------------------------------------------------------- +# Transport-error wrapping (httpx / ApifyClientError -> RuntimeError) +# --------------------------------------------------------------------------- + + +def test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.side_effect = httpx.ConnectError('conn refused') + + with pytest.raises(RuntimeError, match='Apify Actor call failed'): + client.run_actor('apify/test-actor') + + +def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('timeout') + + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): + client.get_dataset_items('dataset-xyz') + + +def test_run_actor_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') + + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): + client.run_actor_and_get_items('apify/test-actor') + + +def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.side_effect = httpx.ConnectError('conn refused') + + with pytest.raises(RuntimeError, match='Apify task call failed'): + client.run_task('user/my-task') + + +def test_run_task_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') + + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): + client.run_task_and_get_items('user/my-task') + + +def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError.""" + mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK') + + with pytest.raises(AttributeError, match='bug in SDK'): + client.run_actor('apify/test-actor') diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..a345f1f 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,5 +1,6 @@ from unittest.mock import patch +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document @@ -55,3 +56,27 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' assert documents[0].metadata['source'] == 'https://apify.com' + + +def test_apify_dataset_loader_apify_token_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Loader should accept APIFY_TOKEN as a secondary env-var fallback.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'platform-token') + + with patch.object(DatasetClient, 'list_items') as mock_list_items: + mock_list_items.return_value = ListPage(data={'items': []}) + loader = ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) + assert loader.load() == [] + + +def test_apify_dataset_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index b10df2f..4702f24 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,13 +1,30 @@ from __future__ import annotations +import json +from datetime import datetime, timezone from typing import TYPE_CHECKING -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from langchain_core.tools import ToolException from pydantic import BaseModel -from langchain_apify.tools import ApifyActorsTool -from langchain_apify.utils import actor_id_to_tool_name +from langchain_apify import APIFY_CORE_TOOLS +from langchain_apify._client import ApifyToolsClient +from langchain_apify._utils import _actor_id_to_tool_name +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, + _ApifyGenericTool, + _iso, + _run_meta, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: from collections.abc import Generator @@ -40,7 +57,7 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id=actor_id, apify_api_token='dummy-token') assert isinstance(tool, ApifyActorsTool) assert tool.description == 'Mocked description' - assert tool.name == actor_id_to_tool_name(actor_id) + assert tool.name == _actor_id_to_tool_name(actor_id) assert tool.args_schema == DummyModel @@ -52,8 +69,8 @@ def test_run_actor_method(apify_actors_tool_fixture: ApifyActorsTool) -> None: with patch.object(ApifyActorsTool, '_run_actor') as mock_run_actor: mock_run_actor.return_value = [{'text': 'Apify is great!'}] - result = apify_actors_tool_fixture.invoke( - input={'run_input': {'query': 'what is Apify?', 'maxResults': 3}}, + result = apify_actors_tool_fixture._run( + run_input={'query': 'what is Apify?', 'maxResults': 3}, ) mock_run_actor.assert_called_once() assert result[0]['text'] == 'Apify is great!' @@ -85,3 +102,563 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id='apify/python-example', apify_api_token='dummy-token') yield tool + + +# --------------------------------------------------------------------------- +# _iso / _run_meta helpers +# --------------------------------------------------------------------------- + + +def test_iso_converts_datetime_to_string() -> None: + dt = datetime(2025, 6, 15, 12, 30, 45, tzinfo=timezone.utc) + assert _iso(dt) == '2025-06-15T12:30:45+00:00' + + +def test_iso_passes_through_string() -> None: + assert _iso('2025-01-01T00:00:00.000Z') == '2025-01-01T00:00:00.000Z' + + +def test_iso_passes_through_none() -> None: + assert _iso(None) is None + + +def test_run_meta_with_datetime_values_is_json_serializable() -> None: + run = { + 'id': 'run-dt', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-dt', + 'startedAt': datetime(2025, 3, 1, 10, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 3, 1, 10, 1, 0, tzinfo=timezone.utc), + } + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['run_id'] == 'run-dt' + assert parsed['started_at'] == '2025-03-01T10:00:00+00:00' + assert parsed['finished_at'] == '2025-03-01T10:01:00+00:00' + + +def test_run_meta_with_string_values_is_json_serializable() -> None: + meta = _run_meta(SUCCEEDED_RUN) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + + +def test_run_meta_with_missing_timestamps() -> None: + run = {'id': 'run-none', 'status': 'RUNNING', 'defaultDatasetId': 'ds-none'} + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] is None + assert parsed['finished_at'] is None + + +def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: + """End-to-end: ApifyRunActorTool returns valid JSON when the client returns datetime objects.""" + mock_tools_client.run_actor.return_value = { + 'id': 'run-real', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-real', + 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), + } + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test') + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-real' + assert parsed['started_at'] == '2025-06-01T08:00:00+00:00' + assert parsed['finished_at'] == '2025-06-01T08:05:00+00:00' + + +# --------------------------------------------------------------------------- +# ApifyRunActorTool +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_actor.assert_called_once_with('apify/test', {'key': 'val'}, 300, None) + + +def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(actor_id='apify/test') + + +def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyRunActorTool() + + +# --------------------------------------------------------------------------- +# ApifyGetDatasetItemsTool +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) + + parsed = json.loads(result) + assert len(parsed['items']) == 2 + assert parsed['items'][0]['text'] == 'item-1' + mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) + + +def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = [] + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-empty') + + parsed = json.loads(result) + assert parsed['items'] == [] + assert 'empty' in parsed['message'].lower() + + +def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.side_effect = RuntimeError( + 'Apify dataset fetch failed for ds-bad: connection reset' + ) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='Apify dataset fetch failed'): + tool._run(dataset_id='ds-bad') + + +def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyGetDatasetItemsTool() + + +# --------------------------------------------------------------------------- +# ApifyRunActorAndGetDatasetTool +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_actor_and_get_items.assert_called_once_with('apify/test', {'q': '1'}, 300, None, 50) + + +def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' + ) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(actor_id='apify/test') + + +def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyRunActorAndGetDatasetTool() + + +# --------------------------------------------------------------------------- +# ApifyScrapeUrlTool +# --------------------------------------------------------------------------- + + +def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# Hello World' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert result == '# Hello World' + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 120) + + +def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) + + with pytest.raises(ToolException, match='No content extracted'): + tool._run(url='https://example.com') + + +def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyScrapeUrlTool() + + +# --------------------------------------------------------------------------- +# ApifyRunTaskTool +# --------------------------------------------------------------------------- + + +def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_task.assert_called_once_with('user/my-task', {'key': 'val'}, 300, None) + + +def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyRunTaskTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(task_id='user/my-task') + + +def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyRunTaskTool() + + +# --------------------------------------------------------------------------- +# ApifyRunTaskAndGetDatasetTool +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_task_and_get_items.assert_called_once_with('user/my-task', {'q': '1'}, 300, None, 50) + + +def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' + ) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(task_id='user/my-task') + + +def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_TOKEN'): + ApifyRunTaskAndGetDatasetTool() + + +# --------------------------------------------------------------------------- +# Value clamping (developer safety limits) +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=60) + + tool._run(actor_id='apify/test', timeout_secs=9999) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 60, None) + + +def test_run_actor_tool_clamps_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=8192) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 512) + + +def test_run_actor_tool_passes_none_memory_through(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=None) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=10) + + tool._run(dataset_id='ds-1', limit=50000) + + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 10, 0) + + +def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunActorAndGetDatasetTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(actor_id='a', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_actor_and_get_items.assert_called_once_with('a', None, 30, 256, 5) + + +def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# content' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30) + + tool._run(url='https://example.com', timeout_secs=9999) + + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 30) + + +def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client, max_timeout_secs=60, max_memory_mbytes=512) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999) + + mock_tools_client.run_task.assert_called_once_with('t/1', None, 60, 512) + + +def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunTaskAndGetDatasetTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) + + +def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600) + + tool._run(actor_id='apify/test', timeout_secs=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', timeout_secs=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + +def test_clamp_memory_non_positive_is_treated_as_none(mock_tools_client: MagicMock) -> None: + """memory_mbytes <= 0 maps to None so the Apify platform default is used.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_clamp_memory_floors_positive_below_platform_minimum(mock_tools_client: MagicMock) -> None: + """A positive memory_mbytes below the Apify platform minimum (128 MB) is floored to 128.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=64) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + +@pytest.mark.parametrize( + ('input_mb', 'expected_mb'), + [ + (128, 128), # already valid + (200, 256), # snap up + (500, 512), # snap up + (1024, 1024), # already valid + (1500, 2048), # snap up + (2048, 2048), # already valid + (3000, 4096), # snap up + (16384, 16384), # already valid + (32768, 32768), # already valid (top of range) + ], +) +def test_clamp_memory_snaps_up_to_power_of_two( + mock_tools_client: MagicMock, input_mb: int, expected_mb: int +) -> None: + """``memory_mbytes`` is snapped UP to the next valid Apify power-of-2 value.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=32768) + + tool._run(actor_id='apify/test', memory_mbytes=input_mb) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, expected_mb) + + +def test_clamp_memory_snap_up_capped_to_max(mock_tools_client: MagicMock) -> None: + """When snap-up would exceed ``max_memory_mbytes``, the largest valid value at-or-below the cap is used.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + # cap is not itself a power of 2; clamped value (500) snaps up to 512 which exceeds cap → fall back to 256. + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=500) + + tool._run(actor_id='apify/test', memory_mbytes=500) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 256) + + +def test_clamp_memory_misconfigured_cap_below_platform_minimum(mock_tools_client: MagicMock) -> None: + """If the developer-set cap is below 128 (the Apify minimum), fall back to 128 rather than overshooting.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=100) + + tool._run(actor_id='apify/test', memory_mbytes=100) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + +def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=100) + + tool._run(dataset_id='ds-1', limit=-1) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + mock_tools_client.get_dataset_items.reset_mock() + tool._run(dataset_id='ds-1', limit=0) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + +def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: + """When LLM values are within limits they should pass through unchanged.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', timeout_secs=120, memory_mbytes=1024) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 120, 1024) + + +# --------------------------------------------------------------------------- +# Tool metadata assertions +# --------------------------------------------------------------------------- + + +def test_generic_tools_have_correct_metadata() -> None: + """Verify name, description, and args_schema are set on all generic tools.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ] + + expected_names = [ + 'apify_run_actor', + 'apify_get_dataset_items', + 'apify_run_actor_and_get_dataset', + 'apify_scrape_url', + 'apify_run_task', + 'apify_run_task_and_get_dataset', + ] + + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True + + +def test_apify_api_token_excluded_from_model_dump() -> None: + """The apify_api_token field must not appear in model_dump() output.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg,arg-type] + dumped = tool.model_dump() + assert 'apify_api_token' not in dumped + + +# --------------------------------------------------------------------------- +# _ApifyGenericTool inheritance +# --------------------------------------------------------------------------- + + +def test_all_generic_tools_inherit_from_base() -> None: + """Every generic tool must be a subclass of _ApifyGenericTool.""" + for tool_cls in ( + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, + ): + assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_legacy_tool_does_not_inherit_from_generic_base() -> None: + """ApifyActorsTool is legacy and must NOT inherit from _ApifyGenericTool.""" + assert not issubclass(ApifyActorsTool, _ApifyGenericTool) + + +# --------------------------------------------------------------------------- +# APIFY_CORE_TOOLS list +# --------------------------------------------------------------------------- + + +def test_apify_core_tools_contains_all_generic_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 6 generic tool classes.""" + assert set(APIFY_CORE_TOOLS) == { + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, + } + assert len(APIFY_CORE_TOOLS) == 6