diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 66142be..573365b 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,19 +1,71 @@ +from __future__ import annotations + from importlib import metadata +from typing import TYPE_CHECKING -from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.tools import ApifyActorsTool +from langchain_apify._actor_tools import ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify.document_loaders import ApifyCrawlLoader, ApifyDatasetLoader +from langchain_apify.retrievers import ApifySearchRetriever +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) from langchain_apify.wrappers import ApifyWrapper +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: - # Case where package metadata is not available. __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) +# Convenience tool-class lists for selective agent binding. +# Binding all tools at once overwhelms the LLM context window; +# pick the group(s) relevant to your use case. 
+ +APIFY_CORE_TOOLS: list[type[BaseTool]] = [ + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, +] + +APIFY_SEARCH_TOOLS: list[type[BaseTool]] = [ + ApifyGoogleSearchTool, + ApifyWebCrawlerTool, +] + __all__ = [ + # Existing components (backward-compatible) 'ApifyActorsTool', 'ApifyDatasetLoader', 'ApifyWrapper', + # Core generic tools + 'ApifyGetDatasetItemsTool', + 'ApifyRunActorAndGetDatasetTool', + 'ApifyRunActorTool', + 'ApifyRunTaskAndGetDatasetTool', + 'ApifyRunTaskTool', + 'ApifyScrapeUrlTool', + # Actor-specific tools + 'ApifyGoogleSearchTool', + 'ApifyWebCrawlerTool', + # Retriever + 'ApifySearchRetriever', + # Loaders + 'ApifyCrawlLoader', + # Tool group lists + 'APIFY_SEARCH_TOOLS', + 'APIFY_CORE_TOOLS', + # Meta '__version__', ] diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py new file mode 100644 index 0000000..ab9c46d --- /dev/null +++ b/langchain_apify/_actor_tools.py @@ -0,0 +1,158 @@ +"""Actor-specific tool subclasses. + +Tools in this module wrap a single Apify Actor behind a simplified, +LLM-friendly interface. They inherit from +:class:`~langchain_apify.tools._ApifyGenericTool`. +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING + +from langchain_core.tools import ToolException +from pydantic import BaseModel # noqa: TCH002 + +from langchain_apify.tools import ( + ApifyGoogleSearchInput, + ApifyWebCrawlerInput, + CrawlerType, + _ApifyGenericTool, +) + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + +# --------------------------------------------------------------------------- +# Search & Crawling tools +# --------------------------------------------------------------------------- + + +class ApifyGoogleSearchTool(_ApifyGenericTool): # type: ignore[override] + """Search Google and return structured results via Apify. 
+ + Wraps the ``apify/google-search-scraper`` Actor behind a simplified, + LLM-friendly interface. Returns a JSON string containing an array of + result objects, each with ``title``, ``url``, and ``description`` keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string — an array of ``{"title", "url", "description"}`` objects. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyGoogleSearchTool + + tool = ApifyGoogleSearchTool() + results = tool.invoke({"query": "LangChain framework"}) + """ + + name: str = 'apify_google_search' + description: str = ( + 'Search Google using Apify and return structured results as a JSON array.' + ' Each result has keys: title, url, description.' + ' Required: query (str) — the search query.' + ' Optional: max_results (int, default 10),' + ' country_code (str|null), language_code (str|null),' + ' timeout_secs (int, default 300).' + ) + args_schema: type[BaseModel] = ApifyGoogleSearchInput + + def _run( + self, + query: str, + max_results: int = 10, + country_code: str | None = None, + language_code: str | None = None, + timeout_secs: int = 300, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + results = self._client.google_search( + query, + max_results=self._clamp_items(max_results), + country_code=country_code, + language_code=language_code, + timeout_secs=self._clamp_timeout(timeout_secs), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(results) + + +class ApifyWebCrawlerTool(_ApifyGenericTool): # type: ignore[override] + """Crawl a website and return page content as JSON via Apify. + + Wraps the ``apify/website-content-crawler`` Actor. Returns a JSON string + containing an array of page objects, each with ``url``, ``title``, and + ``content`` (markdown) keys. 
+ + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string — an array of ``{"url", "title", "content"}`` objects. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyWebCrawlerTool + + tool = ApifyWebCrawlerTool() + pages = tool.invoke({ + "url": "https://docs.apify.com", + "max_crawl_pages": 5, + }) + """ + + name: str = 'apify_web_crawler' + description: str = ( + 'Crawl a website using Apify and return page content as a JSON array.' + ' Each page object has keys: url, title, content (markdown).' + ' Required: url (str) — seed URL to crawl.' + ' Optional: max_crawl_pages (int, default 10),' + ' max_crawl_depth (int, default 1),' + ' crawler_type (str, default "cheerio"),' + ' timeout_secs (int, default 300).' + ) + args_schema: type[BaseModel] = ApifyWebCrawlerInput + + def _run( + self, + url: str, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: CrawlerType = 'cheerio', + timeout_secs: int = 300, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + items = self._client.crawl_website( + url, + max_crawl_pages=self._clamp_items(max_crawl_pages), + max_crawl_depth=self._clamp_depth(max_crawl_depth), + crawler_type=crawler_type, + timeout_secs=self._clamp_timeout(timeout_secs), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + pages = [ + { + 'url': item.get('url', ''), + 'title': item.get('metadata', {}).get('title', ''), + 'content': item.get('markdown') or item.get('text', ''), + } + for item in items + ] + return json.dumps(pages) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py new file mode 100644 index 0000000..39de22a --- /dev/null +++ b/langchain_apify/_client.py @@ -0,0 +1,388 @@ +from __future__ import annotations + +import os + +import httpx +from apify_client import ApifyClient 
+from apify_client.errors import ApifyClientError +from pydantic import SecretStr + +from langchain_apify._error_messages import ( + _ERROR_ACTOR_RUN_FAILED, + _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + _ERROR_SCRAPE_EMPTY, +) +from langchain_apify._utils import _create_apify_client + +# Only catches ApifyClientError and httpx.HTTPError. Other errors propagate. +_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError) + +_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_CRAWL_ACTOR_ID = 'apify/website-content-crawler' +_GOOGLE_SEARCH_ACTOR_ID = 'apify/google-search-scraper' +_RAG_WEB_BROWSER_ACTOR_ID = 'apify/rag-web-browser' +_DEFAULT_RUN_TIMEOUT_SECS = 300 +_DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_CRAWL_TIMEOUT_SECS = 300 +_DEFAULT_DATASET_ITEMS_LIMIT = 100 +_RUN_STATUS_SUCCEEDED = 'SUCCEEDED' + + +class ApifyToolsClient: + """Internal helper that wraps ``ApifyClient`` for the tools layer. + + One convenience method per tool operation. All methods are synchronous and + block until the Actor run finishes. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Raises: + ValueError: If no token is provided and the env var is not set. + """ + + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + if isinstance(apify_api_token, SecretStr): + _token: str | None = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = _create_apify_client(ApifyClient, _token) + + def run_actor( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start an Actor and block until it finishes. + + Args: + actor_id: Actor ID or name (e.g. ``"apify/python-example"``). + run_input: JSON-serialisable input for the Actor. 
+ timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.actor(actor_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify Actor call failed for {actor_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Actor {actor_id} call returned no run details.' + raise RuntimeError(msg) + self._check_run_status(run) + return run + + def get_dataset_items( + self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0 + ) -> list[dict]: + """Fetch items from an existing dataset. + + Args: + dataset_id: Apify dataset ID. + limit: Maximum number of items to return. + offset: Number of items to skip from the start. + + Returns: + List of dataset item dicts (may be empty). + """ + try: + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + + def run_actor_and_get_items( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run an Actor, then fetch items from its default dataset. + + Args: + actor_id: Actor ID or name. + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. 
+ dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def run_task( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start a saved Actor task and block until it finishes. + + Args: + task_id: Task ID or name (e.g. ``"user/my-task"``). + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.task(task_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify task call failed for {task_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Task {task_id} call returned no run details.' 
+ raise RuntimeError(msg) + self._check_run_status(run) + return run + + def run_task_and_get_items( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run a saved Actor task, then fetch items from its default dataset. + + Args: + task_id: Task ID or name. + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Task {task_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: + """Scrape a single URL and return its content as markdown. + + Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. + + Args: + url: The URL to scrape. + timeout_secs: Maximum time to wait for the crawl to finish. + + Returns: + Markdown (or plain-text fallback) content of the page. + + Raises: + RuntimeError: If the Actor run fails or no content is extracted. 
+ """ + run_input = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': 1, + } + _, items = self.run_actor_and_get_items( + _SCRAPE_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + if not items: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + + content = items[0].get('markdown') or items[0].get('text') or '' + if not content: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + return content + + def google_search( + self, + query: str, + max_results: int = 10, + country_code: str | None = None, + language_code: str | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> list[dict]: + """Run a Google search and return structured results. + + Uses ``apify/google-search-scraper`` with a single query. + + Args: + query: Search query string. + max_results: Maximum number of results to return. + country_code: Two-letter country code for localised results. + language_code: Two-letter language code. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + List of result dicts, each with ``title``, ``url``, and + ``description`` keys. + + Raises: + RuntimeError: If the Actor run fails. 
+ """ + run_input: dict = { + 'queries': query, + 'maxPagesPerQuery': 1, + 'resultsPerPage': max_results, + } + if country_code is not None: + run_input['countryCode'] = country_code + if language_code is not None: + run_input['languageCode'] = language_code + + _, items = self.run_actor_and_get_items( + _GOOGLE_SEARCH_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + results: list[dict] = [ + { + 'title': organic.get('title', ''), + 'url': organic.get('url', ''), + 'description': organic.get('description', ''), + } + for item in items + for organic in item.get('organicResults', []) + ] + return results[:max_results] + + def rag_web_search( + self, + query: str, + max_results: int = 5, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + ) -> list[dict]: + """Search the web and return crawled page content for RAG. + + Uses ``apify/rag-web-browser``. + + Args: + query: Search query string. + max_results: Maximum number of results to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + List of result dicts with ``crawledUrl``, ``title``, and + ``text`` keys (among others from the Actor). + + Raises: + RuntimeError: If the Actor run fails. + """ + run_input: dict = { + 'query': query, + 'maxResults': max_results, + } + _, items = self.run_actor_and_get_items( + _RAG_WEB_BROWSER_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + return items + + def crawl_website( + self, + url: str, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: str = 'cheerio', + timeout_secs: int = _DEFAULT_CRAWL_TIMEOUT_SECS, + ) -> list[dict]: + """Crawl a website and return page content. + + Uses ``apify/website-content-crawler``. + + Args: + url: Seed URL to start crawling from. + max_crawl_pages: Maximum number of pages to crawl. + max_crawl_depth: Maximum link-follow depth from the seed URL. + crawler_type: Crawler engine (e.g. 
``"cheerio"``, ``"playwright"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + List of raw page dicts from the Actor. Consumers in this package read + ``url``, ``markdown`` (or ``text``), and ``metadata`` (a dict holding ``title``). + + Raises: + RuntimeError: If the Actor run fails. + """ + run_input: dict = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': max_crawl_pages, + 'maxCrawlDepth': max_crawl_depth, + 'crawlerType': crawler_type, + } + _, items = self.run_actor_and_get_items( + _CRAWL_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_crawl_pages, + ) + return items + + def _list_items_or_raise(self, dataset_id: str, limit: int) -> list[dict]: + """Fetch dataset items, wrapping any network error in a RuntimeError.""" + try: + return self._client.dataset(dataset_id).list_items(limit=limit, clean=True).items + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + + @staticmethod + def _check_run_status(run: dict) -> None: + """Raise if the run did not succeed.""" + status = run.get('status') + if status != _RUN_STATUS_SUCCEEDED: + run_id = run.get('id', 'unknown') + msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status) + raise RuntimeError(msg) diff --git a/langchain_apify/error_messages.py b/langchain_apify/_error_messages.py similarity index 68% rename from langchain_apify/error_messages.py rename to langchain_apify/_error_messages.py index 87462b8..0a8c612 100644 --- a/langchain_apify/error_messages.py +++ b/langchain_apify/_error_messages.py @@ -1,7 +1,11 @@ -ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( +_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( 'APIFY_API_TOKEN environment variable is not set.' ' Please set it to your Apify API token by using `os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"' ' in your code or pass it as environment variable.' 
' To pass it as environment variable, you can use the following command:' ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) + +_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' + +_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' diff --git a/langchain_apify/utils.py b/langchain_apify/_utils.py similarity index 84% rename from langchain_apify/utils.py rename to langchain_apify/_utils.py index 8cdc835..9d74487 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/_utils.py @@ -7,18 +7,18 @@ from apify_client import ApifyClientAsync from apify_client.client import ApifyClient -from langchain_apify.const import MAX_DESCRIPTION_LEN, REQUESTS_TIMEOUT_SECS +_MAX_DESCRIPTION_LEN: int = 350 +_REQUESTS_TIMEOUT_SECS: float = 10.0 +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' -APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' - -def prune_actor_input_schema( +def _prune_actor_input_schema( input_schema: dict, - max_description_len: int = MAX_DESCRIPTION_LEN, + max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. - Trim the description to 250 characters. + Trim descriptions to ``_MAX_DESCRIPTION_LEN`` characters. Args: input_schema (dict): The input schema from the Actor build. @@ -48,7 +48,7 @@ def prune_actor_input_schema( T = TypeVar('T', ApifyClient, ApifyClientAsync) -def create_apify_client(client_cls: type[T], token: str) -> T: +def _create_apify_client(client_cls: type[T], token: str) -> T: """Create an Apify client instance with a custom user-agent. Args: @@ -79,7 +79,7 @@ def create_apify_client(client_cls: type[T], token: str) -> T: return client -def actor_id_to_tool_name(actor_id: str) -> str: +def _actor_id_to_tool_name(actor_id: str) -> str: """Turn actor_id into a valid tool name. 
Tool name must only contain letters, numbers, underscores, dashes, @@ -95,7 +95,7 @@ def actor_id_to_tool_name(actor_id: str) -> str: return 'apify_actor_' + ''.join(char if char in valid_chars else '_' for char in actor_id) -def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: +def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: """Get the latest build of an Actor from the default build tag. Args: @@ -117,8 +117,8 @@ def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: msg = f'Failed to get the Actor object ID for {actor_id}.' raise ValueError(msg) - url = APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) - response = requests.request('GET', url, timeout=REQUESTS_TIMEOUT_SECS) + url = _APIFY_API_ENDPOINT_GET_DEFAULT_BUILD.format(actor_id=actor_obj_id) + response = requests.request('GET', url, timeout=_REQUESTS_TIMEOUT_SECS) build = response.json() if not isinstance(build, dict): diff --git a/langchain_apify/const.py b/langchain_apify/const.py deleted file mode 100644 index 87e0d0e..0000000 --- a/langchain_apify/const.py +++ /dev/null @@ -1,2 +0,0 @@ -REQUESTS_TIMEOUT_SECS: float = 10.0 -MAX_DESCRIPTION_LEN: int = 350 diff --git a/langchain_apify/document_loaders.py b/langchain_apify/document_loaders.py index 49befb6..e1f0e6a 100644 --- a/langchain_apify/document_loaders.py +++ b/langchain_apify/document_loaders.py @@ -6,15 +6,19 @@ from apify_client import ApifyClient from langchain_core.document_loaders.base import BaseLoader -from langchain_core.documents import Document # noqa: TCH002 -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from langchain_core.documents import Document +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator -from langchain_apify.utils import create_apify_client +from langchain_apify._client import 
ApifyToolsClient +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: from collections.abc import Iterator + from langchain_apify.tools import CrawlerType + class ApifyDatasetLoader(BaseLoader, BaseModel): """Load datasets from Apify web scraping, crawling, and data extraction platform. @@ -40,10 +44,15 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): documents = loader.load() """ - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - """An instance of the ApifyClient class from the apify-client Python package.""" + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -54,7 +63,7 @@ def __init__( self, dataset_id: str, dataset_mapping_function: Callable[[dict], Document], - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, ) -> None: """Initialize the loader with an Apify dataset ID and a mapping function. @@ -63,34 +72,43 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. + apify_api_token (str | SecretStr): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. 
""" - super().__init__( - dataset_id=dataset_id, - dataset_mapping_function=dataset_mapping_function, - apify_api_token=apify_api_token, - ) - - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Args: - values (dict): The values to validate. + init_kwargs: dict[str, Any] = { + 'dataset_id': dataset_id, + 'dataset_mapping_function': dataset_mapping_function, + } + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + init_kwargs['apify_api_token'] = apify_api_token + super().__init__(**init_kwargs) + + @model_validator(mode='after') + def _init_client(self) -> ApifyDatasetLoader: + """Resolve the Apify API token and initialise the client. + + Checks ``APIFY_TOKEN`` as a secondary fallback for code running on the + Apify platform where only that variable is set. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') - # when running at Apify platform, use APIFY_TOKEN environment variable - apify_api_token = apify_api_token or os.getenv('APIFY_TOKEN', '') - - client = create_apify_client(ApifyClient, apify_api_token) - - values['apify_client'] = client + ApifyDatasetLoader: The validated loader instance. - return values + Raises: + ValueError: If no token is available from any source. + """ + token = self.apify_api_token + if token is None: + # Secondary fallback for code running on the Apify platform. + raw = os.getenv('APIFY_TOKEN') + if raw: + token = SecretStr(raw) + if token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self.apify_client = _create_apify_client(ApifyClient, token.get_secret_value()) + return self def load(self) -> list[Document]: """Load documents. 
@@ -112,3 +130,79 @@ def lazy_load(self) -> Iterator[Document]: ) for item in dataset_items: yield self.dataset_mapping_function(item) + + +class ApifyCrawlLoader(BaseLoader): + """Crawl a website and load pages as LangChain Documents. + + Wraps the ``apify/website-content-crawler`` Actor. Runs a crawl starting + from the seed URL and converts each crawled page into a ``Document`` with + markdown content and metadata (source URL, title, crawl depth). + + Args: + url: Seed URL to start crawling from. + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + max_crawl_pages: Maximum number of pages to crawl. + max_crawl_depth: Maximum link-follow depth from the seed URL. + crawler_type: Crawler engine (e.g. ``"cheerio"``, ``"playwright"``). + timeout_secs: Maximum time in seconds to wait for the crawl. + + Returns: + Iterator (or list) of ``Document`` objects. ``page_content`` contains + the page markdown; ``metadata`` includes ``source``, ``title``, and + ``crawl_depth``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyCrawlLoader + + loader = ApifyCrawlLoader( + url="https://docs.apify.com", + max_crawl_pages=5, + ) + documents = loader.load() + """ + + def __init__( # noqa: PLR0913 + self, + url: str, + apify_api_token: str | SecretStr | None = None, + *, + max_crawl_pages: int = 10, + max_crawl_depth: int = 1, + crawler_type: CrawlerType = 'cheerio', + timeout_secs: int = 300, + ) -> None: + self.url = url + self.max_crawl_pages = max_crawl_pages + self.max_crawl_depth = max_crawl_depth + self.crawler_type = crawler_type + self.timeout_secs = timeout_secs + self._client = ApifyToolsClient(apify_api_token=apify_api_token) + + def lazy_load(self) -> Iterator[Document]: + """Crawl the website and yield Documents. + + Yields: + Document: One document per crawled page. 
+ """ + items = self._client.crawl_website( + self.url, + max_crawl_pages=self.max_crawl_pages, + max_crawl_depth=self.max_crawl_depth, + crawler_type=self.crawler_type, + timeout_secs=self.timeout_secs, + ) + for item in items: + page_content = item.get('markdown') or item.get('text') or '' + metadata: dict[str, Any] = { + 'source': item.get('url', ''), + 'title': item.get('metadata', {}).get('title', '') if isinstance(item.get('metadata'), dict) else '', + 'crawl_depth': item.get('crawlDepth', 0), + } + yield Document(page_content=page_content, metadata=metadata) diff --git a/langchain_apify/retrievers.py b/langchain_apify/retrievers.py new file mode 100644 index 0000000..80a2099 --- /dev/null +++ b/langchain_apify/retrievers.py @@ -0,0 +1,116 @@ +"""LangChain retrievers backed by Apify Actors.""" + +from __future__ import annotations + +import asyncio +from typing import TYPE_CHECKING, Any + +from langchain_core.documents import Document +from langchain_core.retrievers import BaseRetriever +from langchain_core.utils import secret_from_env +from pydantic import Field, PrivateAttr, SecretStr + +from langchain_apify._client import ApifyToolsClient + +if TYPE_CHECKING: + from langchain_core.callbacks import ( + AsyncCallbackManagerForRetrieverRun, + CallbackManagerForRetrieverRun, + ) + +_DEFAULT_TIMEOUT_SECS = 300 + + +class ApifySearchRetriever(BaseRetriever): + """Retrieve documents from the web for RAG using Apify. + + Wraps the ``apify/rag-web-browser`` Actor. Each invocation runs a web + search, crawls the top results, and returns their content as LangChain + ``Document`` objects ready for a RAG pipeline. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + max_results: Maximum number of ``Document`` objects to return per query. + timeout_secs: Maximum time in seconds to wait for the Actor run. + + Returns: + List of ``Document`` objects. 
``page_content`` contains the crawled + text; ``metadata`` includes ``source`` (URL) and ``title``. + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifySearchRetriever + + retriever = ApifySearchRetriever(max_results=3) + docs = retriever.invoke("What is LangChain?") + """ + + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + max_results: int = Field(default=5, description='Maximum number of documents to return.') + timeout_secs: int = Field(default=_DEFAULT_TIMEOUT_SECS, description='Maximum Actor run time in seconds.') + + _client: ApifyToolsClient = PrivateAttr() + + def model_post_init(self, context: Any) -> None: # noqa: ANN401 + """Construct the underlying ``ApifyToolsClient``. + + The helper handles ``None`` / ``SecretStr`` / env-fallback and raises + ``ValueError`` if no token is available. + """ + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token) + super().model_post_init(context) + + def _get_relevant_documents( + self, + query: str, + *, + run_manager: CallbackManagerForRetrieverRun | None = None, # noqa: ARG002 + ) -> list[Document]: + items = self._client.rag_web_search( + query, + max_results=self.max_results, + timeout_secs=self.timeout_secs, + ) + return self._items_to_documents(items) + + async def _aget_relevant_documents( + self, + query: str, + *, + run_manager: AsyncCallbackManagerForRetrieverRun | None = None, # noqa: ARG002 + ) -> list[Document]: + # ApifyToolsClient is sync-only. 
+ items = await asyncio.to_thread( + self._client.rag_web_search, + query, + max_results=self.max_results, + timeout_secs=self.timeout_secs, + ) + return self._items_to_documents(items) + + @staticmethod + def _items_to_documents(items: list[dict]) -> list[Document]: + """Convert Actor dataset items to LangChain Documents.""" + docs: list[Document] = [] + for item in items: + page_content = item.get('text') or item.get('markdown') or '' + raw_meta = item.get('metadata') + item_metadata: dict = raw_meta if isinstance(raw_meta, dict) else {} + metadata: dict[str, Any] = { + # apify/rag-web-browser nests url/title under "metadata". The source URL + # also accepts top-level crawledUrl/url keys; title is read from "metadata" only. + 'source': item.get('crawledUrl') or item.get('url') or item_metadata.get('url', ''), + 'title': item_metadata.get('title', ''), + } + docs.append(Document(page_content=page_content, metadata=metadata)) + return docs diff --git a/langchain_apify/tools.py b/langchain_apify/tools.py index 135314a..ff5c5b9 100644 --- a/langchain_apify/tools.py +++ b/langchain_apify/tools.py @@ -1,28 +1,50 @@ +"""LangChain tools for the Apify platform. + +All tools require an Apify API token. Set it via the ``APIFY_API_TOKEN`` +environment variable, or pass ``apify_api_token`` to the tool constructor: + +.. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({"actor_id": "apify/python-example"}) + +For details, see https://docs.apify.com/platform/integrations/langchain +""" + from __future__ import annotations import json import os -from typing import TYPE_CHECKING, Any +from datetime import datetime +from typing import TYPE_CHECKING, Any, Literal from apify_client import ApifyClient -from langchain_core.tools import BaseTool -from pydantic import BaseModel, Field, create_model - -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import ( - actor_id_to_tool_name, - create_apify_client, - get_actor_latest_build, - prune_actor_input_schema, +from langchain_core.tools import BaseTool, ToolException +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model, field_validator + +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import ( + _MAX_DESCRIPTION_LEN, + _actor_id_to_tool_name, + _create_apify_client, + _get_actor_latest_build, + _prune_actor_input_schema, ) -from .const import MAX_DESCRIPTION_LEN - if TYPE_CHECKING: from langchain_core.callbacks import ( CallbackManagerForToolRun, ) +CrawlerType = Literal['cheerio', 'playwright:adaptive', 'playwright:firefox'] + class ApifyActorsTool(BaseTool): # type: ignore[override, override] """Tool that runs Apify Actors. 
@@ -56,10 +78,13 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] chunk["messages"][-1].pretty_print() """ + _apify_client: ApifyClient = PrivateAttr() + _actor_id: str = PrivateAttr() + def __init__( self, actor_id: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: @@ -74,16 +99,20 @@ def __init__( Raises: ValueError: If the `APIFY_API_TOKEN` environment variable is not set """ - apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not apify_api_token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + _raw_token: str | None = ( + apify_api_token.get_secret_value() + if isinstance(apify_api_token, SecretStr) + else apify_api_token or os.getenv('APIFY_API_TOKEN') + ) + if not _raw_token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, _raw_token) kwargs.update( { - 'name': actor_id_to_tool_name(actor_id), + 'name': _actor_id_to_tool_name(actor_id), 'description': self._create_description(apify_client, actor_id), 'args_schema': self._build_tool_args_schema_model( apify_client, @@ -126,10 +155,10 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: Returns: str: The description. 
""" - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') - if len(actor_description) > MAX_DESCRIPTION_LEN: - actor_description = actor_description[:MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' + if len(actor_description) > _MAX_DESCRIPTION_LEN: + actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' return actor_description @staticmethod @@ -149,12 +178,12 @@ def _build_tool_args_schema_model( Raises: ValueError: If the input schema is not found in the Actor build. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) if not (actor_input := build.get('actorDefinition', {}).get('input')): msg = f'Input schema not found in the Actor build for Actor: {actor_id}' raise ValueError(msg) - properties, required = prune_actor_input_schema(actor_input) + properties, required = _prune_actor_input_schema(actor_input) properties = {'run_input': properties} description = ( @@ -192,3 +221,533 @@ def _run_actor(self, run_input: dict) -> list[dict]: run = self._apify_client.run(run_id=run_id) return run.dataset().list_items(clean=True).items + + +# --------------------------------------------------------------------------- +# Input schemas for the generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorInput(BaseModel): + """Input schema for :class:`ApifyRunActorTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. 
class ApifyGoogleSearchInput(BaseModel):
    """Arguments accepted by :class:`ApifyGoogleSearchTool`.

    ``country_code`` and ``language_code`` must be exactly two ASCII letters
    and are lower-cased by the validator below.
    """

    query: str = Field(description='Search query string.')
    max_results: int = Field(
        default=10,
        description='Maximum number of search results to return.',
    )
    country_code: str | None = Field(
        default=None,
        pattern=r'^[a-zA-Z]{2}$',
        description='Two-letter country code (case-insensitive; normalised to lowercase, e.g. "us", "gb").',
    )
    language_code: str | None = Field(
        default=None,
        pattern=r'^[a-zA-Z]{2}$',
        description='Two-letter language code (case-insensitive; normalised to lowercase, e.g. "en", "fr").',
    )
    timeout_secs: int = Field(
        default=300,
        description='Maximum time in seconds to wait for the search to finish.',
    )

    @field_validator('country_code', 'language_code')
    @classmethod
    def _normalise_locale_code(cls, value: str | None) -> str | None:
        """Lower-case a locale code; pass ``None`` / empty values through untouched."""
        if not value:
            return value
        return value.lower()
"user/my-task").') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _iso(value: str | datetime | None) -> str | None: + """Coerce a possible ``datetime`` to an ISO-8601 string.""" + if isinstance(value, datetime): + return value.isoformat() + return value + + +def _run_meta(run: dict) -> dict: + """Extract a compact metadata dict from an Apify run-details dict.""" + return { + 'run_id': run.get('id'), + 'status': run.get('status'), + 'dataset_id': run.get('defaultDatasetId'), + 'started_at': _iso(run.get('startedAt')), + 'finished_at': _iso(run.get('finishedAt')), + } + + +# --------------------------------------------------------------------------- +# Shared base for generic tools +# --------------------------------------------------------------------------- + + +class _ApifyGenericTool(BaseTool): # type: ignore[override] + """Shared base for all generic Apify tools. + + Handles ``ApifyToolsClient`` creation, sets ``handle_tool_error``, + and defines developer-controlled safety limits that clamp values the + LLM may provide at invocation time. + + Subclasses only need to declare ``name``, ``description``, + ``args_schema``, and ``_run()``. + """ + + handle_tool_error: bool = True + + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.') + max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.') + max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.') + max_crawl_depth: int = Field(default=5, description='Upper bound for max_crawl_depth the LLM may request.') + + _client: ApifyToolsClient = PrivateAttr() + + def model_post_init(self, context: Any) -> None: # noqa: ANN401 + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = ApifyToolsClient(apify_api_token=self.apify_api_token.get_secret_value()) + super().model_post_init(context) + + def _clamp_timeout(self, value: int) -> int: + return max(1, min(value, self.max_timeout_secs)) + + def _clamp_memory(self, value: int | None) -> int | None: + # Non-positive values fall through to the platform default. Positive + # values are floored at 128 MB (the Apify platform minimum) so the LLM + # cannot drive into an API rejection by requesting too little memory. + if value is None or value <= 0: + return None + return max(128, min(value, self.max_memory_mbytes)) + + def _clamp_items(self, value: int) -> int: + return max(1, min(value, self.max_items)) + + def _clamp_depth(self, value: int) -> int: + # Floor at 0 (a depth of 0 means "only crawl the seed URL"). + return max(0, min(value, self.max_crawl_depth)) + + +# --------------------------------------------------------------------------- +# Generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorTool(_ApifyGenericTool): # type: ignore[override] + """Run any Apify Actor by ID with an arbitrary JSON input. 
class ApifyGetDatasetItemsTool(_ApifyGenericTool):  # type: ignore[override]
    """Fetch items from an existing Apify dataset by ID.

    Returns a JSON object with an ``"items"`` key containing the list of item
    dicts. When no items are returned an additional ``"message"`` key is
    included, which distinguishes an empty dataset from a page requested
    past the end of the data.

    Args:
        apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN``
            environment variable when *None*.

    Returns:
        JSON object ``{"items": [...]}``; includes ``"message"`` when the
        result is empty.

    Example:
        .. code-block:: python

            import os
            os.environ["APIFY_API_TOKEN"] = "your-apify-api-token"

            from langchain_apify import ApifyGetDatasetItemsTool

            tool = ApifyGetDatasetItemsTool()
            result = tool.invoke({"dataset_id": "abc123", "limit": 10})
    """

    name: str = 'apify_get_dataset_items'
    description: str = (
        'Fetch items from an Apify dataset by ID. Returns a JSON object with an "items" array.'
        ' Required: dataset_id (str) — Apify dataset ID.'
        ' Optional: limit (int, default 100), offset (int, default 0).'
    )
    args_schema: type[BaseModel] = ApifyGetDatasetItemsInput

    def _run(
        self,
        dataset_id: str,
        limit: int = 100,
        offset: int = 0,
        _run_manager: CallbackManagerForToolRun | None = None,
    ) -> str:
        """Fetch one page of dataset items and serialise it to JSON.

        Args:
            dataset_id: Apify dataset ID.
            limit: Requested page size; clamped via ``_clamp_items``.
            offset: Number of leading items to skip; negative values are
                treated as ``0``.

        Returns:
            JSON string ``{"items": [...]}``, plus ``"message"`` when empty.

        Raises:
            ToolException: If the client raises ``RuntimeError``.
        """
        # Like the other LLM-supplied numbers, offset is sanitised before it
        # reaches the client; a negative skip count has no meaning.
        safe_offset = max(0, offset)
        try:
            items = self._client.get_dataset_items(dataset_id, self._clamp_items(limit), safe_offset)
        except RuntimeError as exc:
            raise ToolException(str(exc)) from exc
        if items:
            return json.dumps({'items': items})
        # An empty page at a non-zero offset does not imply an empty dataset,
        # so only claim "empty" when reading from the start.
        if safe_offset:
            message = f'No items found in dataset {dataset_id} at offset {safe_offset}.'
        else:
            message = f'Dataset {dataset_id} is empty.'
        return json.dumps({'items': [], 'message': message})
class ApifyScrapeUrlTool(_ApifyGenericTool):  # type: ignore[override]
    """Fetch one web page through Apify and hand back its content as markdown.

    Per the package documentation this is backed by the
    ``apify/website-content-crawler`` Actor limited to a single page
    (``maxCrawlPages=1``). The result is a plain string, not JSON.

    Args:
        apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN``
            environment variable when *None*.

    Returns:
        The page content as markdown, or plain text when markdown is
        unavailable.

    Example:
        .. code-block:: python

            import os
            os.environ["APIFY_API_TOKEN"] = "your-apify-api-token"

            from langchain_apify import ApifyScrapeUrlTool

            tool = ApifyScrapeUrlTool()
            markdown = tool.invoke({"url": "https://apify.com"})
    """

    name: str = 'apify_scrape_url'
    description: str = (
        'Scrape a single URL using Apify and return its full content as a markdown string.'
        ' Required: url (str) — the URL to scrape.'
        ' Optional: timeout_secs (int, default 120).'
        ' Returns the page content as markdown (or plain text if markdown is unavailable).'
    )
    args_schema: type[BaseModel] = ApifyScrapeUrlInput

    def _run(
        self,
        url: str,
        timeout_secs: int = 120,
        _run_manager: CallbackManagerForToolRun | None = None,
    ) -> str:
        """Scrape *url*, converting client ``RuntimeError`` into ``ToolException``."""
        try:
            # The requested timeout is capped by the developer-set maximum.
            page_content = self._client.scrape_url(url, self._clamp_timeout(timeout_secs))
        except RuntimeError as exc:
            raise ToolException(str(exc)) from exc
        else:
            return page_content
class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool):  # type: ignore[override]
    """Execute a saved Apify Actor task and return its metadata plus results.

    One-call equivalent of :class:`ApifyRunTaskTool` followed by
    :class:`ApifyGetDatasetItemsTool`. The output is a JSON string holding a
    ``run`` object (metadata) and an ``items`` list (dataset item dicts).

    Args:
        apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN``
            environment variable when *None*.

    Returns:
        JSON string with two keys: ``run`` (dict with ``run_id``, ``status``,
        ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list
        of dataset item dicts).

    Example:
        .. code-block:: python

            import os
            os.environ["APIFY_API_TOKEN"] = "your-apify-api-token"

            from langchain_apify import ApifyRunTaskAndGetDatasetTool

            tool = ApifyRunTaskAndGetDatasetTool()
            result = tool.invoke({
                "task_id": "user/my-task",
                "task_input": {"key": "value"},
            })
    """

    name: str = 'apify_run_task_and_get_dataset'
    description: str = (
        'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.'
        ' Required: task_id (str) — task ID or name (e.g. "user/my-task").'
        ' Optional: task_input (dict), timeout_secs (int, default 300),'
        ' memory_mbytes (int|null), dataset_items_limit (int, default 100).'
        ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)'
        ' and items (list of dataset item dicts).'
    )
    args_schema: type[BaseModel] = ApifyRunTaskAndGetDatasetInput

    def _run(
        self,
        task_id: str,
        task_input: dict | None = None,
        timeout_secs: int = 300,
        memory_mbytes: int | None = None,
        dataset_items_limit: int = 100,
        _run_manager: CallbackManagerForToolRun | None = None,
    ) -> str:
        """Run the task with clamped limits and serialise the outcome to JSON.

        Raises:
            ToolException: If the client raises ``RuntimeError``.
        """
        try:
            # Every LLM-supplied limit goes through its clamp before the call.
            run_details, dataset_items = self._client.run_task_and_get_items(
                task_id,
                task_input,
                self._clamp_timeout(timeout_secs),
                self._clamp_memory(memory_mbytes),
                self._clamp_items(dataset_items_limit),
            )
        except RuntimeError as exc:
            raise ToolException(str(exc)) from exc
        payload = {'run': _run_meta(run_details), 'items': dataset_items}
        return json.dumps(payload)
langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.utils import create_apify_client if TYPE_CHECKING: from collections.abc import Callable @@ -51,49 +52,54 @@ class ApifyWrapper(BaseModel): """ # allow arbitrary types in the model config for the apify client fields - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - apify_client_async: ApifyClientAsync - apify_api_token: str | None = None + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) # type: ignore[assignment] def __init__( self, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: - """Initialize the loader with an Apify dataset ID and a mapping function. + """Initialise the wrapper. Args: - dataset_id (str): The ID of the dataset on the Apify platform. - dataset_mapping_function (Callable): A function that takes a single - dictionary (an Apify dataset item) and converts it to an instance - of the Document class. - apify_api_token (Optional[str]): Apify API token. - *args: Any: Additional positional arguments. - **kwargs: Any: Additional keyword arguments. + apify_api_token (Optional[str | SecretStr]): Apify API token. Falls + back to the ``APIFY_API_TOKEN`` environment variable when *None*. + *args: Any: Additional positional arguments forwarded to Pydantic. 
+ **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ - kwargs.update({'apify_api_token': apify_api_token}) + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + kwargs['apify_api_token'] = apify_api_token super().__init__(*args, **kwargs) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Validate that an Apify API token is set and the apify-client - Python package exists in the current environment. + @model_validator(mode='after') + def _init_clients(self) -> ApifyWrapper: + """Validate the token and initialise both sync and async Apify clients. Returns: - Any: The validated values. - """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') + ApifyWrapper: The validated wrapper instance. - values['apify_client'] = create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = create_apify_client(ApifyClientAsync, apify_api_token) - - return values + Raises: + ValueError: If no token is provided and APIFY_API_TOKEN is not set. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + token = self.apify_api_token.get_secret_value() + self.apify_client = _create_apify_client(ApifyClient, token) + self.apify_client_async = _create_apify_client(ApifyClientAsync, token) + return self def call_actor( # noqa: PLR0913 self, diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py new file mode 100644 index 0000000..3f2a7c8 --- /dev/null +++ b/tests/integration_tests/test_generic_tools.py @@ -0,0 +1,94 @@ +"""Integration smoke tests for the generic Apify tools. + +These tests hit the real Apify API and require the ``APIFY_API_TOKEN`` +environment variable to be set. 
They use ``apify/python-example`` (a +trivial Actor that adds two numbers) to keep execution fast and cheap. +""" + +from __future__ import annotations + +import json +import os + +import pytest + +from langchain_apify import ( + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) + +_ACTOR_ID = 'apify/python-example' +_RUN_INPUT = {'first_number': 2, 'second_number': 3} + +pytestmark = pytest.mark.skipif( + not os.getenv('APIFY_API_TOKEN'), + reason='APIFY_API_TOKEN not set', +) + + +def test_run_actor_tool_smoke() -> None: + tool = ApifyRunActorTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +def test_get_dataset_items_tool_smoke() -> None: + run_tool = ApifyRunActorTool() + run_result = json.loads(run_tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT})) + dataset_id = run_result['dataset_id'] + + items_tool = ApifyGetDatasetItemsTool() + result = items_tool.invoke({'dataset_id': dataset_id, 'limit': 10}) + + parsed = json.loads(result) + assert 'items' in parsed + assert isinstance(parsed['items'], list) + + +def test_run_actor_and_get_items_tool_smoke() -> None: + tool = ApifyRunActorAndGetDatasetTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) + + +def test_scrape_url_tool_smoke() -> None: + tool = ApifyScrapeUrlTool() + result = tool.invoke({'url': 'https://crawlee.dev'}) + + assert isinstance(result, str) + assert len(result) > 0 + + +_TASK_ID = os.getenv('APIFY_TASK_ID', '') + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_tool_smoke() -> None: + tool = ApifyRunTaskTool() + result = 
tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_and_get_items_tool_smoke() -> None: + tool = ApifyRunTaskAndGetDatasetTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 1107c7a..c92c038 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,8 +2,8 @@ from apify_client.client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import create_apify_client, get_actor_latest_build +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client, _get_actor_latest_build def test_get_actor_latest_build() -> None: @@ -13,12 +13,12 @@ def test_get_actor_latest_build() -> None: ValueError: If the APIFY_API_TOKEN environment variable is not set. 
""" if (token := os.getenv('APIFY_API_TOKEN')) is None: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, token) + apify_client = _create_apify_client(ApifyClient, token) - build = get_actor_latest_build(apify_client, 'apify/rag-web-browser') + build = _get_actor_latest_build(apify_client, 'apify/rag-web-browser') assert isinstance(build, dict) assert 'id' in build diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..3384e79 --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient + +SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +def make_tool(tool_cls: type, mock_client: MagicMock, **kwargs: Any) -> Any: # noqa: ANN401 + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token', **kwargs) + 
tool._client = mock_client + return tool diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py new file mode 100644 index 0000000..2b14809 --- /dev/null +++ b/tests/unit_tests/test_actor_tools.py @@ -0,0 +1,238 @@ +from __future__ import annotations + +import json +from unittest.mock import MagicMock, patch + +import pytest +from langchain_core.tools import ToolException +from pydantic import SecretStr + +from langchain_apify import APIFY_SEARCH_TOOLS, ApifyGoogleSearchTool, ApifyWebCrawlerTool +from langchain_apify._client import ApifyToolsClient +from langchain_apify.tools import _ApifyGenericTool +from tests.unit_tests.conftest import make_tool + +# --------------------------------------------------------------------------- +# ApifyGoogleSearchTool +# --------------------------------------------------------------------------- + + +def test_google_search_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [ + {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'}, + {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'}, + ] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + result = tool._run(query='test query') + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0]['title'] == 'Result 1' + assert parsed[1]['url'] == 'https://example.com/2' + + +def test_google_search_tool_passes_params(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool._run(query='test', max_results=5, country_code='us', language_code='en', timeout_secs=120) + + mock_tools_client.google_search.assert_called_once_with( + 'test', + max_results=5, + country_code='us', + language_code='en', + timeout_secs=120, + ) + + +def test_google_search_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + 
mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client, max_timeout_secs=60) + + tool._run(query='test', timeout_secs=9999) + + assert mock_tools_client.google_search.call_args.kwargs['timeout_secs'] == 60 + + +def test_google_search_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client, max_items=3) + + tool._run(query='test', max_results=100) + + call_kwargs = mock_tools_client.google_search.call_args + assert call_kwargs.kwargs['max_results'] == 3 + + +def test_google_search_tool_empty_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + result = tool._run(query='nothing') + + assert json.loads(result) == [] + + +def test_google_search_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.google_search.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(query='test') + + +def test_google_search_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyGoogleSearchTool() + + +@pytest.mark.parametrize('bad_code', ['USA', 'english', 'u', 'us1', '']) +def test_google_search_tool_rejects_malformed_locale(mock_tools_client: MagicMock, bad_code: str) -> None: + """country_code and language_code must be exactly two letters.""" + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + with pytest.raises(ValueError, match='string_pattern_mismatch|String should match pattern'): + tool.invoke({'query': 'test', 'country_code': bad_code}) + + with pytest.raises(ValueError, 
match='string_pattern_mismatch|String should match pattern'): + tool.invoke({'query': 'test', 'language_code': bad_code}) + + +@pytest.mark.parametrize('raw_country', ['us', 'US', 'Us', 'uS']) +def test_google_search_tool_normalises_country_code_to_lower(mock_tools_client: MagicMock, raw_country: str) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool.invoke({'query': 'test', 'country_code': raw_country}) + + assert mock_tools_client.google_search.call_args.kwargs['country_code'] == 'us' + + +@pytest.mark.parametrize('raw_language', ['en', 'EN', 'En', 'eN']) +def test_google_search_tool_normalises_language_code_to_lower(mock_tools_client: MagicMock, raw_language: str) -> None: + mock_tools_client.google_search.return_value = [] + tool = make_tool(ApifyGoogleSearchTool, mock_tools_client) + + tool.invoke({'query': 'test', 'language_code': raw_language}) + + assert mock_tools_client.google_search.call_args.kwargs['language_code'] == 'en' + + +# --------------------------------------------------------------------------- +# ApifyWebCrawlerTool +# --------------------------------------------------------------------------- + + +def test_web_crawler_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'markdown': '# Home', 'text': 'Home', 'metadata': {'title': 'Home'}}, + {'url': 'https://example.com/about', 'markdown': '', 'text': 'About us', 'metadata': {'title': 'About'}}, + ] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + parsed = json.loads(result) + assert len(parsed) == 2 + assert parsed[0] == {'url': 'https://example.com/', 'title': 'Home', 'content': '# Home'} + assert parsed[1] == {'url': 'https://example.com/about', 'title': 'About', 'content': 'About us'} + + +def test_web_crawler_tool_passes_params(mock_tools_client: MagicMock) 
-> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + tool._run( + url='https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright:firefox', + timeout_secs=120, + ) + + mock_tools_client.crawl_website.assert_called_once_with( + 'https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright:firefox', + timeout_secs=120, + ) + + +def test_web_crawler_tool_clamps_pages_and_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client, max_items=3, max_timeout_secs=60) + + tool._run(url='https://example.com', max_crawl_pages=100, timeout_secs=9999) + + call_kwargs = mock_tools_client.crawl_website.call_args + assert call_kwargs.kwargs['max_crawl_pages'] == 3 + assert call_kwargs.kwargs['timeout_secs'] == 60 + + +def test_web_crawler_tool_clamps_depth(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client, max_crawl_depth=2) + + tool._run(url='https://example.com', max_crawl_depth=999) + assert mock_tools_client.crawl_website.call_args.kwargs['max_crawl_depth'] == 2 + + mock_tools_client.crawl_website.reset_mock() + tool._run(url='https://example.com', max_crawl_depth=-1) + assert mock_tools_client.crawl_website.call_args.kwargs['max_crawl_depth'] == 0 + + +def test_web_crawler_tool_empty_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.return_value = [] + tool = make_tool(ApifyWebCrawlerTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert json.loads(result) == [] + + +def test_web_crawler_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status TIMED-OUT.') + tool = 
make_tool(ApifyWebCrawlerTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(url='https://example.com') + + +def test_web_crawler_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyWebCrawlerTool() + + +# --------------------------------------------------------------------------- +# Metadata & inheritance +# --------------------------------------------------------------------------- + + +def test_actor_tools_inherit_from_generic_base() -> None: + for tool_cls in (ApifyGoogleSearchTool, ApifyWebCrawlerTool): + assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_actor_tools_have_correct_metadata() -> None: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyGoogleSearchTool(apify_api_token=SecretStr('dummy')), + ApifyWebCrawlerTool(apify_api_token=SecretStr('dummy')), + ] + + expected_names = ['apify_google_search', 'apify_web_crawler'] + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True + + +def test_apify_search_tools_list() -> None: + assert set(APIFY_SEARCH_TOOLS) == {ApifyGoogleSearchTool, ApifyWebCrawlerTool} + assert len(APIFY_SEARCH_TOOLS) == 2 diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py new file mode 100644 index 0000000..13ccd59 --- /dev/null +++ b/tests/unit_tests/test_client.py @@ -0,0 +1,443 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import httpx +import pytest + +from langchain_apify._client import ApifyToolsClient +from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN + +# 
# ---------------------------------------------------------------------------
# __init__
# ---------------------------------------------------------------------------


def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None:
    """An explicit token creates the Apify client once and stores it on ``_client``."""
    with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create:
        c = ApifyToolsClient(apify_api_token='my-token')
    mock_create.assert_called_once()
    assert c._client is mock_apify_client


def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None:
    """With no token argument, construction succeeds when APIFY_API_TOKEN is set."""
    monkeypatch.setenv('APIFY_API_TOKEN', 'env-token')
    with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client):
        c = ApifyToolsClient()
    assert c._client is mock_apify_client


def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None:
    """Construction without any token raises ValueError mentioning APIFY_API_TOKEN."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    # Also clear APIFY_TOKEN so the test does not depend on the developer's
    # environment. NOTE(review): presumably the client may fall back to this
    # variable the way the dataset loader does -- confirm; clearing it is
    # harmless either way.
    monkeypatch.delenv('APIFY_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyToolsClient()


# ---------------------------------------------------------------------------
# run_actor
# ---------------------------------------------------------------------------


def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """run_actor calls the Actor with the given input and a 300 s timeout, returning the run dict."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN

    result = client.run_actor('apify/test-actor', run_input={'key': 'val'})

    mock_apify_client.actor.assert_called_once_with('apify/test-actor')
    mock_apify_client.actor.return_value.call.assert_called_once_with(
        run_input={'key': 'val'}, timeout_secs=300, logger=None
    )
    assert result == SUCCEEDED_RUN


def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """memory_mbytes is forwarded to the Actor call when provided."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN

    client.run_actor('apify/test-actor', memory_mbytes=512)

    mock_apify_client.actor.return_value.call.assert_called_once_with(
        run_input=None, timeout_secs=300, logger=None, memory_mbytes=512
    )


def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A run that ended with status FAILED raises RuntimeError naming the run ID."""
    mock_apify_client.actor.return_value.call.return_value = FAILED_RUN

    with pytest.raises(RuntimeError, match='run-fail'):
        client.run_actor('apify/test-actor')


# ---------------------------------------------------------------------------
# get_dataset_items
# ---------------------------------------------------------------------------


def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """limit/offset are forwarded (with clean=True) and the page's items are returned."""
    mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS

    items = client.get_dataset_items('dataset-xyz', limit=50, offset=10)

    mock_apify_client.dataset.assert_called_once_with('dataset-xyz')
    mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True)
    assert items == SAMPLE_ITEMS


def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An empty dataset page yields an empty list."""
    mock_apify_client.dataset.return_value.list_items.return_value.items = []

    items = client.get_dataset_items('dataset-empty')
    assert items == []


# ---------------------------------------------------------------------------
# run_actor_and_get_items
# ---------------------------------------------------------------------------


def test_run_actor_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """Returns the run together with the items of the run's default dataset."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS

    run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'})

    assert run == SUCCEEDED_RUN
    assert items == SAMPLE_ITEMS
    mock_apify_client.dataset.assert_called_once_with('dataset-xyz')


def test_run_actor_and_get_items_missing_dataset_id_raises(
    client: ApifyToolsClient, mock_apify_client: MagicMock
) -> None:
    """A succeeded run whose defaultDatasetId is None raises RuntimeError."""
    run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None}
    mock_apify_client.actor.return_value.call.return_value = run_no_dataset

    with pytest.raises(RuntimeError, match='no default dataset ID'):
        client.run_actor_and_get_items('apify/test-actor')


# ---------------------------------------------------------------------------
# run_task
# ---------------------------------------------------------------------------


def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """run_task calls the task with the given input and a 300 s timeout, returning the run dict."""
    mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN

    result = client.run_task('user/my-task', task_input={'key': 'val'})

    mock_apify_client.task.assert_called_once_with('user/my-task')
    mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300)
    assert result == SUCCEEDED_RUN


def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A task run that ended with status FAILED raises RuntimeError naming the run ID."""
    mock_apify_client.task.return_value.call.return_value = FAILED_RUN

    with pytest.raises(RuntimeError, match='run-fail'):
        client.run_task('user/my-task')


# ---------------------------------------------------------------------------
# run_task_and_get_items
# ---------------------------------------------------------------------------


def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """Returns the task run together with the items of its default dataset."""
    mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS

    run, items = client.run_task_and_get_items('user/my-task')

    assert run == SUCCEEDED_RUN
    assert items == SAMPLE_ITEMS


def test_run_task_and_get_items_missing_dataset_id_raises(
    client: ApifyToolsClient, mock_apify_client: MagicMock
) -> None:
    """A succeeded task run whose defaultDatasetId is None raises RuntimeError."""
    run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None}
    mock_apify_client.task.return_value.call.return_value = run_no_dataset

    with pytest.raises(RuntimeError, match='no default dataset ID'):
        client.run_task_and_get_items('user/my-task')


# ---------------------------------------------------------------------------
# scrape_url
# ---------------------------------------------------------------------------


def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """When the item has both markdown and text, the markdown is returned."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = [
        {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'},
    ]

    content = client.scrape_url('https://example.com')
    assert content == '# Hello'


def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """When the item has no markdown key, the plain text is returned."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = [
        {'text': 'Plain text content', 'url': 'https://example.com'},
    ]

    content = client.scrape_url('https://example.com')
    assert content == 'Plain text content'


def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A run that produced no dataset items raises RuntimeError."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = []

    with pytest.raises(RuntimeError, match='No content extracted'):
        client.scrape_url('https://example.com')


def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An item whose markdown and text are both empty strings raises RuntimeError."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = [
        {'markdown': '', 'text': '', 'url': 'https://example.com'},
    ]

    with pytest.raises(RuntimeError, match='No content extracted'):
        client.scrape_url('https://example.com')


# ---------------------------------------------------------------------------
# _check_run_status
# ---------------------------------------------------------------------------


def test_check_run_status_succeeded() -> None:
    """A SUCCEEDED run passes the status check; the test passes if nothing is raised."""
    ApifyToolsClient._check_run_status({'id': 'run-ok', 'status': 'SUCCEEDED'})


def test_check_run_status_failed() -> None:
    """A FAILED run raises RuntimeError naming the run ID."""
    with pytest.raises(RuntimeError, match='run-bad'):
        ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'})


# ---------------------------------------------------------------------------
# None returns from actor/task .call()
# ---------------------------------------------------------------------------


def test_run_actor_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An Actor call that returns None raises RuntimeError instead of returning None."""
    mock_apify_client.actor.return_value.call.return_value = None

    with pytest.raises(RuntimeError, match='returned no run details'):
        client.run_actor('apify/broken-actor')


def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A task call that returns None raises RuntimeError instead of returning None."""
    mock_apify_client.task.return_value.call.return_value = None

    with pytest.raises(RuntimeError, match='returned no run details'):
        client.run_task('user/broken-task')


# ---------------------------------------------------------------------------
# Transport-error wrapping (httpx / ApifyClientError -> RuntimeError)
# ---------------------------------------------------------------------------


def test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An httpx.ConnectError from the Actor call surfaces as RuntimeError."""
    mock_apify_client.actor.return_value.call.side_effect = httpx.ConnectError('conn refused')

    with pytest.raises(RuntimeError, match='Apify Actor call failed'):
        client.run_actor('apify/test-actor')


def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An httpx.ConnectError from list_items surfaces as RuntimeError."""
    mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('timeout')

    with pytest.raises(RuntimeError, match='Apify dataset fetch failed'):
        client.get_dataset_items('dataset-xyz')


def test_run_actor_and_get_items_dataset_fetch_network_error(
    client: ApifyToolsClient, mock_apify_client: MagicMock
) -> None:
    """The run succeeds but the dataset fetch fails: the fetch error is what gets raised."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset')

    with pytest.raises(RuntimeError, match='Apify dataset fetch failed'):
        client.run_actor_and_get_items('apify/test-actor')


def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An httpx.ConnectError from the task call surfaces as RuntimeError."""
    mock_apify_client.task.return_value.call.side_effect = httpx.ConnectError('conn refused')

    with pytest.raises(RuntimeError, match='Apify task call failed'):
        client.run_task('user/my-task')


def test_run_task_and_get_items_dataset_fetch_network_error(
    client: ApifyToolsClient, mock_apify_client: MagicMock
) -> None:
    """The task run succeeds but the dataset fetch fails: the fetch error is what gets raised."""
    mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset')

    with pytest.raises(RuntimeError, match='Apify dataset fetch failed'):
        client.run_task_and_get_items('user/my-task')


def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError."""
    mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK')

    with pytest.raises(AttributeError, match='bug in SDK'):
        client.run_actor('apify/test-actor')


# ---------------------------------------------------------------------------
# google_search
# ---------------------------------------------------------------------------

# Dataset shape used by the google_search tests: a single item holding the
# hits under 'organicResults'.
GOOGLE_SEARCH_ITEMS: list[dict] = [
    {
        'organicResults': [
            {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'},
            {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'},
        ],
    },
]


def test_google_search_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """The 'organicResults' entries come back as a flat list of title/url/description dicts."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = GOOGLE_SEARCH_ITEMS

    results = client.google_search('test query', max_results=5)

    assert len(results) == 2
    assert results[0] == {'title': 'Result 1', 'url': 'https://example.com/1', 'description': 'Desc 1'}
    assert results[1] == {'title': 'Result 2', 'url': 'https://example.com/2', 'description': 'Desc 2'}


def test_google_search_with_locale(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """country_code / language_code are sent as countryCode / languageCode in run_input."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = GOOGLE_SEARCH_ITEMS

    client.google_search('test', country_code='us', language_code='en')

    call_args = mock_apify_client.actor.return_value.call.call_args
    run_input = call_args.kwargs['run_input']
    assert run_input['countryCode'] == 'us'
    assert run_input['languageCode'] == 'en'


def test_google_search_caps_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """No more than max_results entries are returned even when the dataset holds more."""
    many_results = [{'title': f'R{i}', 'url': f'https://example.com/{i}', 'description': f'D{i}'} for i in range(20)]
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = [{'organicResults': many_results}]

    results = client.google_search('test', max_results=3)

    assert len(results) == 3


def test_google_search_empty_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An item with an empty 'organicResults' list yields an empty result list."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = [{'organicResults': []}]

    results = client.google_search('test')

    assert results == []


def test_google_search_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A FAILED search run raises RuntimeError naming the run ID."""
    mock_apify_client.actor.return_value.call.return_value = FAILED_RUN

    with pytest.raises(RuntimeError, match='run-fail'):
        client.google_search('test')


# ---------------------------------------------------------------------------
# rag_web_search
# ---------------------------------------------------------------------------

# Dataset shape used by the rag_web_search tests: one item per crawled page.
RAG_SEARCH_ITEMS: list[dict] = [
    {'crawledUrl': 'https://example.com/1', 'text': 'Page 1 content', 'metadata': {'title': 'Page 1'}},
    {'crawledUrl': 'https://example.com/2', 'text': 'Page 2 content', 'metadata': {'title': 'Page 2'}},
]


def test_rag_web_search_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """Dataset items come back with their crawledUrl / text fields intact."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = RAG_SEARCH_ITEMS

    items = client.rag_web_search('test query', max_results=5)

    assert len(items) == 2
    assert items[0]['crawledUrl'] == 'https://example.com/1'
    assert items[1]['text'] == 'Page 2 content'


def test_rag_web_search_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """An empty dataset yields an empty list."""
    mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN
    mock_apify_client.dataset.return_value.list_items.return_value.items = []

    items = client.rag_web_search('test')

    assert items == []


def test_rag_web_search_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None:
    """A FAILED search run raises RuntimeError naming the run ID."""
    mock_apify_client.actor.return_value.call.return_value = FAILED_RUN

    with pytest.raises(RuntimeError, match='run-fail'):
        client.rag_web_search('test')


# ---------------------------------------------------------------------------
# crawl_website
+# --------------------------------------------------------------------------- + +CRAWL_ITEMS: list[dict] = [ + {'url': 'https://example.com/', 'markdown': '# Home', 'text': 'Home', 'metadata': {'title': 'Home'}}, + {'url': 'https://example.com/about', 'markdown': '# About', 'text': 'About', 'metadata': {'title': 'About'}}, +] + + +def test_crawl_website_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = CRAWL_ITEMS + + items = client.crawl_website('https://example.com') + + assert len(items) == 2 + assert items[0]['url'] == 'https://example.com/' + assert items[1]['markdown'] == '# About' + + +def test_crawl_website_passes_params(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + client.crawl_website('https://example.com', max_crawl_pages=5, max_crawl_depth=2, crawler_type='playwright') + + call_args = mock_apify_client.actor.return_value.call.call_args + run_input = call_args.kwargs['run_input'] + assert run_input['startUrls'] == [{'url': 'https://example.com'}] + assert run_input['maxCrawlPages'] == 5 + assert run_input['maxCrawlDepth'] == 2 + assert run_input['crawlerType'] == 'playwright' + + +def test_crawl_website_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.crawl_website('https://example.com') + + assert items == [] + + +def test_crawl_website_failed_run_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, 
match='run-fail'): + client.crawl_website('https://example.com') diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..5c71704 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,10 +1,16 @@ -from unittest.mock import patch +from __future__ import annotations +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document +from pydantic import SecretStr -from langchain_apify import ApifyDatasetLoader +from langchain_apify import ApifyCrawlLoader, ApifyDatasetLoader +from langchain_apify._client import ApifyToolsClient def test_apify_dataset_loader_load() -> None: @@ -55,3 +61,165 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' 
assert documents[0].metadata['source'] == 'https://apify.com' + + +# --------------------------------------------------------------------------- +# ApifyCrawlLoader +# --------------------------------------------------------------------------- + +CRAWL_ITEMS: list[dict] = [ + { + 'url': 'https://example.com/', + 'markdown': '# Home', + 'text': 'Home', + 'metadata': {'title': 'Home Page'}, + 'crawlDepth': 0, + }, + { + 'url': 'https://example.com/about', + 'markdown': '# About', + 'text': 'About', + 'metadata': {'title': 'About Page'}, + 'crawlDepth': 1, + }, +] + + +def _make_crawl_loader( + mock_client: MagicMock, + **kwargs: Any, # noqa: ANN401 +) -> ApifyCrawlLoader: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + loader = ApifyCrawlLoader(url='https://example.com', apify_api_token='dummy', **kwargs) + loader._client = mock_client + return loader + + +def test_crawl_loader_lazy_load() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = CRAWL_ITEMS + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == '# Home' + assert docs[0].metadata['source'] == 'https://example.com/' + assert docs[0].metadata['title'] == 'Home Page' + assert docs[0].metadata['crawl_depth'] == 0 + assert docs[1].page_content == '# About' + assert docs[1].metadata['crawl_depth'] == 1 + + +def test_crawl_loader_load_delegates_to_lazy_load() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = CRAWL_ITEMS + loader = _make_crawl_loader(mock_client) + + docs = loader.load() + + assert len(docs) == 2 + assert docs[0].page_content == '# Home' + + +def test_crawl_loader_passes_params() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [] + loader = _make_crawl_loader( + mock_client, + 
max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright:firefox', + timeout_secs=120, + ) + + list(loader.lazy_load()) + + mock_client.crawl_website.assert_called_once_with( + 'https://example.com', + max_crawl_pages=5, + max_crawl_depth=2, + crawler_type='playwright:firefox', + timeout_secs=120, + ) + + +def test_crawl_loader_empty_results() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [] + loader = _make_crawl_loader(mock_client) + + docs = loader.load() + + assert docs == [] + + +def test_crawl_loader_text_fallback() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'text': 'Plain text', 'metadata': {'title': 'T'}}, + ] + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert docs[0].page_content == 'Plain text' + + +def test_crawl_loader_missing_metadata() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.return_value = [ + {'url': 'https://example.com/', 'markdown': '# Content'}, + ] + loader = _make_crawl_loader(mock_client) + + docs = list(loader.lazy_load()) + + assert docs[0].metadata['title'] == '' + assert docs[0].metadata['crawl_depth'] == 0 + + +def test_crawl_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyCrawlLoader(url='https://example.com') + + +def test_crawl_loader_accepts_secretstr_token() -> None: + with patch('langchain_apify._client._create_apify_client'): + loader = ApifyCrawlLoader(url='https://example.com', apify_api_token=SecretStr('s')) + assert loader.url == 'https://example.com' + + +def test_crawl_loader_failure_raises() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.crawl_website.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + 
loader = _make_crawl_loader(mock_client) + + with pytest.raises(RuntimeError, match='FAILED'): + loader.load() + + +def test_apify_dataset_loader_apify_token_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Loader should accept APIFY_TOKEN as a secondary env-var fallback.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'platform-token') + + with patch.object(DatasetClient, 'list_items') as mock_list_items: + mock_list_items.return_value = ListPage(data={'items': []}) + loader = ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) + assert loader.load() == [] + + +def test_apify_dataset_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) diff --git a/tests/unit_tests/test_retrievers.py b/tests/unit_tests/test_retrievers.py new file mode 100644 index 0000000..17dfba9 --- /dev/null +++ b/tests/unit_tests/test_retrievers.py @@ -0,0 +1,224 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest +from langchain_core.documents import Document +from pydantic import SecretStr + +from langchain_apify._client import ApifyToolsClient +from langchain_apify.retrievers import ApifySearchRetriever + +RAG_ITEMS: list[dict] = [ + { + 'crawledUrl': 'https://example.com/1', + 'text': 'Page 1 content', + 'metadata': {'title': 'Page 1'}, + }, + { + 'crawledUrl': 'https://example.com/2', + 'text': 'Page 2 content', + 'metadata': {'title': 'Page 2'}, + }, +] + + +def _make_retriever(mock_client: MagicMock, **kwargs: Any) -> ApifySearchRetriever: # noqa: ANN401 + """Instantiate a retriever with a mocked ApifyToolsClient.""" + 
with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('dummy-token'), **kwargs) + retriever._client = mock_client + return retriever + + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifySearchRetriever() + + +def test_init_with_explicit_token() -> None: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('my-token')) + assert retriever.max_results == 5 + assert retriever.timeout_secs == 300 + + +def test_init_custom_params() -> None: + with patch.object(ApifyToolsClient, '__init__', return_value=None): + retriever = ApifySearchRetriever(apify_api_token=SecretStr('t'), max_results=3, timeout_secs=60) + assert retriever.max_results == 3 + assert retriever.timeout_secs == 60 + + +# --------------------------------------------------------------------------- +# Sync retrieval +# --------------------------------------------------------------------------- + + +def test_sync_returns_documents() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = RAG_ITEMS + retriever = _make_retriever(mock_client, max_results=5) + + docs = retriever._get_relevant_documents('test query') + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == 'Page 1 content' + assert docs[0].metadata['source'] == 'https://example.com/1' + assert docs[0].metadata['title'] == 'Page 1' + assert docs[1].page_content == 'Page 2 content' + assert docs[1].metadata['source'] == 'https://example.com/2' + + +def 
test_sync_calls_helper_with_correct_args() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) + + retriever._get_relevant_documents('my search') + + mock_client.rag_web_search.assert_called_once_with( + 'my search', + max_results=3, + timeout_secs=60, + ) + + +def test_sync_empty_results() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client) + + docs = retriever._get_relevant_documents('test') + + assert docs == [] + + +def test_sync_helper_failure_propagates() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.side_effect = RuntimeError( + 'Actor run run-bad ended with status FAILED.', + ) + retriever = _make_retriever(mock_client) + + with pytest.raises(RuntimeError, match='FAILED'): + retriever._get_relevant_documents('test') + + +# --------------------------------------------------------------------------- +# Async retrieval +# --------------------------------------------------------------------------- + + +@pytest.mark.asyncio +async def test_async_returns_documents() -> None: + """Async path wraps the sync helper via asyncio.to_thread.""" + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = RAG_ITEMS + retriever = _make_retriever(mock_client, max_results=5) + + docs = await retriever._aget_relevant_documents('test query') + + assert len(docs) == 2 + assert all(isinstance(d, Document) for d in docs) + assert docs[0].page_content == 'Page 1 content' + assert docs[0].metadata['source'] == 'https://example.com/1' + + +@pytest.mark.asyncio +async def test_async_calls_helper_with_correct_args() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client, max_results=3, timeout_secs=60) 
+ + await retriever._aget_relevant_documents('my search') + + mock_client.rag_web_search.assert_called_once_with( + 'my search', + max_results=3, + timeout_secs=60, + ) + + +@pytest.mark.asyncio +async def test_async_empty_results() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.return_value = [] + retriever = _make_retriever(mock_client) + + docs = await retriever._aget_relevant_documents('test') + + assert docs == [] + + +@pytest.mark.asyncio +async def test_async_helper_failure_propagates() -> None: + mock_client = MagicMock(spec=ApifyToolsClient) + mock_client.rag_web_search.side_effect = RuntimeError( + 'Actor run run-bad ended with status FAILED.', + ) + retriever = _make_retriever(mock_client) + + with pytest.raises(RuntimeError, match='FAILED'): + await retriever._aget_relevant_documents('test') + + +# --------------------------------------------------------------------------- +# _items_to_documents edge cases +# --------------------------------------------------------------------------- + + +def test_items_to_documents_uses_url_fallback() -> None: + items = [{'url': 'https://fallback.com', 'text': 'content', 'metadata': {'title': 'T'}}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['source'] == 'https://fallback.com' + + +def test_items_to_documents_uses_metadata_url_fallback() -> None: + """apify/rag-web-browser nests the page URL under metadata.url.""" + items = [ + { + 'metadata': {'url': 'https://nested.example.com', 'title': 'Nested'}, + 'text': 'content', + }, + ] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['source'] == 'https://nested.example.com' + assert docs[0].metadata['title'] == 'Nested' + + +def test_items_to_documents_uses_markdown_fallback() -> None: + items = [{'crawledUrl': 'https://example.com', 'markdown': '# MD content', 'metadata': {'title': 'T'}}] + + docs = ApifySearchRetriever._items_to_documents(items) + + 
assert docs[0].page_content == '# MD content' + + +def test_items_to_documents_missing_metadata() -> None: + items = [{'crawledUrl': 'https://example.com', 'text': 'content'}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['title'] == '' + assert docs[0].metadata['source'] == 'https://example.com' + + +def test_items_to_documents_non_dict_metadata() -> None: + items = [{'crawledUrl': 'https://example.com', 'text': 'content', 'metadata': 'not-a-dict'}] + + docs = ApifySearchRetriever._items_to_documents(items) + + assert docs[0].metadata['title'] == '' diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index b10df2f..108c695 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,13 +1,30 @@ from __future__ import annotations +import json +from datetime import datetime, timezone from typing import TYPE_CHECKING -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from langchain_core.tools import ToolException from pydantic import BaseModel -from langchain_apify.tools import ApifyActorsTool -from langchain_apify.utils import actor_id_to_tool_name +from langchain_apify import APIFY_CORE_TOOLS +from langchain_apify._client import ApifyToolsClient +from langchain_apify._utils import _actor_id_to_tool_name +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, + _ApifyGenericTool, + _iso, + _run_meta, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: from collections.abc import Generator @@ -40,7 +57,7 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id=actor_id, apify_api_token='dummy-token') assert isinstance(tool, ApifyActorsTool) assert tool.description == 'Mocked description' - assert tool.name == 
actor_id_to_tool_name(actor_id) + assert tool.name == _actor_id_to_tool_name(actor_id) assert tool.args_schema == DummyModel @@ -85,3 +102,513 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id='apify/python-example', apify_api_token='dummy-token') yield tool + + +# --------------------------------------------------------------------------- +# _iso / _run_meta helpers +# --------------------------------------------------------------------------- + + +def test_iso_converts_datetime_to_string() -> None: + dt = datetime(2025, 6, 15, 12, 30, 45, tzinfo=timezone.utc) + assert _iso(dt) == '2025-06-15T12:30:45+00:00' + + +def test_iso_passes_through_string() -> None: + assert _iso('2025-01-01T00:00:00.000Z') == '2025-01-01T00:00:00.000Z' + + +def test_iso_passes_through_none() -> None: + assert _iso(None) is None + + +def test_run_meta_with_datetime_values_is_json_serializable() -> None: + run = { + 'id': 'run-dt', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-dt', + 'startedAt': datetime(2025, 3, 1, 10, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 3, 1, 10, 1, 0, tzinfo=timezone.utc), + } + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['run_id'] == 'run-dt' + assert parsed['started_at'] == '2025-03-01T10:00:00+00:00' + assert parsed['finished_at'] == '2025-03-01T10:01:00+00:00' + + +def test_run_meta_with_string_values_is_json_serializable() -> None: + meta = _run_meta(SUCCEEDED_RUN) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + + +def test_run_meta_with_missing_timestamps() -> None: + run = {'id': 'run-none', 'status': 'RUNNING', 'defaultDatasetId': 'ds-none'} + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] is None + assert parsed['finished_at'] is None + + +def 
test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: + """End-to-end: ApifyRunActorTool returns valid JSON when the client returns datetime objects.""" + mock_tools_client.run_actor.return_value = { + 'id': 'run-real', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-real', + 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), + } + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test') + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-real' + assert parsed['started_at'] == '2025-06-01T08:00:00+00:00' + assert parsed['finished_at'] == '2025-06-01T08:05:00+00:00' + + +# --------------------------------------------------------------------------- +# ApifyRunActorTool +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_actor.assert_called_once_with('apify/test', {'key': 'val'}, 300, None) + + +def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(actor_id='apify/test') + + +def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: 
+ monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorTool() + + +# --------------------------------------------------------------------------- +# ApifyGetDatasetItemsTool +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) + + parsed = json.loads(result) + assert len(parsed['items']) == 2 + assert parsed['items'][0]['text'] == 'item-1' + mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) + + +def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = [] + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-empty') + + parsed = json.loads(result) + assert parsed['items'] == [] + assert 'empty' in parsed['message'].lower() + + +def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.side_effect = RuntimeError( + 'Apify dataset fetch failed for ds-bad: connection reset' + ) + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + with pytest.raises(ToolException, match='Apify dataset fetch failed'): + tool._run(dataset_id='ds-bad') + + +def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyGetDatasetItemsTool() + + +# --------------------------------------------------------------------------- +# ApifyRunActorAndGetDatasetTool +# 
--------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_actor_and_get_items.assert_called_once_with('apify/test', {'q': '1'}, 300, None, 50) + + +def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' + ) + tool = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(actor_id='apify/test') + + +def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorAndGetDatasetTool() + + +# --------------------------------------------------------------------------- +# ApifyScrapeUrlTool +# --------------------------------------------------------------------------- + + +def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# Hello World' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) + + result = tool._run(url='https://example.com') + + assert result == '# Hello World' + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 120) + + +def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None: 
+ mock_tools_client.scrape_url.side_effect = RuntimeError('No content extracted from https://example.com.') + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client) + + with pytest.raises(ToolException, match='No content extracted'): + tool._run(url='https://example.com') + + +def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyScrapeUrlTool() + + +# --------------------------------------------------------------------------- +# ApifyRunTaskTool +# --------------------------------------------------------------------------- + + +def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + mock_tools_client.run_task.assert_called_once_with('user/my-task', {'key': 'val'}, 300, None) + + +def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyRunTaskTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(task_id='user/my-task') + + +def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskTool() + + +# --------------------------------------------------------------------------- +# 
ApifyRunTaskAndGetDatasetTool +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) + + result = tool._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50) + + parsed = json.loads(result) + assert parsed['run']['run_id'] == 'run-abc' + assert parsed['run']['status'] == 'SUCCEEDED' + assert len(parsed['items']) == 2 + mock_tools_client.run_task_and_get_items.assert_called_once_with('user/my-task', {'q': '1'}, 300, None, 50) + + +def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.side_effect = RuntimeError( + 'Actor run run-bad ended with status TIMED-OUT.' + ) + tool = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client) + + with pytest.raises(ToolException, match='TIMED-OUT'): + tool._run(task_id='user/my-task') + + +def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunTaskAndGetDatasetTool() + + +# --------------------------------------------------------------------------- +# Value clamping (developer safety limits) +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=60) + + tool._run(actor_id='apify/test', timeout_secs=9999) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 60, None) + + +def 
test_run_actor_tool_clamps_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=8192) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 512) + + +def test_run_actor_tool_passes_none_memory_through(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512) + + tool._run(actor_id='apify/test', memory_mbytes=None) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=10) + + tool._run(dataset_id='ds-1', limit=50000) + + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 10, 0) + + +def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunActorAndGetDatasetTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(actor_id='a', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_actor_and_get_items.assert_called_once_with('a', None, 30, 256, 5) + + +def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None: + mock_tools_client.scrape_url.return_value = '# content' + tool = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30) + + tool._run(url='https://example.com', timeout_secs=9999) + + mock_tools_client.scrape_url.assert_called_once_with('https://example.com', 30) + + +def 
test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunTaskTool, mock_tools_client, max_timeout_secs=60, max_memory_mbytes=512) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999) + + mock_tools_client.run_task.assert_called_once_with('t/1', None, 60, 512) + + +def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_task_and_get_items.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool( + ApifyRunTaskAndGetDatasetTool, + mock_tools_client, + max_timeout_secs=30, + max_memory_mbytes=256, + max_items=5, + ) + + tool._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999) + + mock_tools_client.run_task_and_get_items.assert_called_once_with('t/1', None, 30, 256, 5) + + +def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600) + + tool._run(actor_id='apify/test', timeout_secs=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', timeout_secs=0) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 1, None) + + +def test_clamp_memory_non_positive_is_treated_as_none(mock_tools_client: MagicMock) -> None: + """memory_mbytes <= 0 maps to None so the Apify platform default is used.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=-1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=0) + 
mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, None) + + +def test_clamp_memory_floors_positive_below_platform_minimum(mock_tools_client: MagicMock) -> None: + """A positive memory_mbytes below the Apify platform minimum (128 MB) is floored to 128.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', memory_mbytes=64) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + mock_tools_client.run_actor.reset_mock() + tool._run(actor_id='apify/test', memory_mbytes=1) + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 300, 128) + + +def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=100) + + tool._run(dataset_id='ds-1', limit=-1) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + mock_tools_client.get_dataset_items.reset_mock() + tool._run(dataset_id='ds-1', limit=0) + mock_tools_client.get_dataset_items.assert_called_once_with('ds-1', 1, 0) + + +def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None: + """When LLM values are within limits they should pass through unchanged.""" + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600, max_memory_mbytes=4096) + + tool._run(actor_id='apify/test', timeout_secs=120, memory_mbytes=1024) + + mock_tools_client.run_actor.assert_called_once_with('apify/test', None, 120, 1024) + + +# --------------------------------------------------------------------------- +# Tool metadata assertions +# --------------------------------------------------------------------------- + + +def test_generic_tools_have_correct_metadata() -> None: + 
"""Verify name, description, and args_schema are set on all generic tools.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tools = [ + ApifyRunActorTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyGetDatasetItemsTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunActorAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyScrapeUrlTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ApifyRunTaskAndGetDatasetTool(apify_api_token='dummy'), # type: ignore[call-arg,arg-type] + ] + + expected_names = [ + 'apify_run_actor', + 'apify_get_dataset_items', + 'apify_run_actor_and_get_dataset', + 'apify_scrape_url', + 'apify_run_task', + 'apify_run_task_and_get_dataset', + ] + + for tool, expected_name in zip(tools, expected_names): + assert tool.name == expected_name + assert tool.description + assert tool.args_schema is not None + assert tool.handle_tool_error is True + + +def test_apify_api_token_excluded_from_model_dump() -> None: + """The apify_api_token field must not appear in model_dump() output.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = ApifyRunActorTool(apify_api_token='x') # type: ignore[call-arg,arg-type] + dumped = tool.model_dump() + assert 'apify_api_token' not in dumped + + +# --------------------------------------------------------------------------- +# _ApifyGenericTool inheritance +# --------------------------------------------------------------------------- + + +def test_all_generic_tools_inherit_from_base() -> None: + """Every generic tool must be a subclass of _ApifyGenericTool.""" + for tool_cls in ( + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, + ): + assert issubclass(tool_cls, _ApifyGenericTool), 
f'{tool_cls.__name__} must extend _ApifyGenericTool' + + +def test_legacy_tool_does_not_inherit_from_generic_base() -> None: + """ApifyActorsTool is legacy and must NOT inherit from _ApifyGenericTool.""" + assert not issubclass(ApifyActorsTool, _ApifyGenericTool) + + +# --------------------------------------------------------------------------- +# APIFY_CORE_TOOLS list +# --------------------------------------------------------------------------- + + +def test_apify_core_tools_contains_all_generic_classes() -> None: + """APIFY_CORE_TOOLS must list exactly the 6 generic tool classes.""" + assert set(APIFY_CORE_TOOLS) == { + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, + } + assert len(APIFY_CORE_TOOLS) == 6