diff --git a/langchain_apify/__init__.py b/langchain_apify/__init__.py index 66142be..21fc291 100644 --- a/langchain_apify/__init__.py +++ b/langchain_apify/__init__.py @@ -1,19 +1,84 @@ +from __future__ import annotations + from importlib import metadata +from typing import TYPE_CHECKING +from langchain_apify._actor_tools import ( + ApifyFacebookPostsScraperTool, + ApifyInstagramScraperTool, + ApifyLinkedInProfileDetailTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyTikTokScraperTool, + ApifyTwitterScraperTool, +) from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.tools import ApifyActorsTool +from langchain_apify.tools import ( + ApifyActorsTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) from langchain_apify.wrappers import ApifyWrapper +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + try: __version__ = metadata.version(__package__) except metadata.PackageNotFoundError: - # Case where package metadata is not available. __version__ = '' del metadata # optional, avoids polluting the results of dir(__package__) +# Convenience tool-class lists for selective agent binding. +# Binding all tools at once overwhelms the LLM context window; +# pick the group(s) relevant to your use case. 
+ +APIFY_CORE_TOOLS: list[type[BaseTool]] = [ + ApifyRunActorTool, + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyScrapeUrlTool, + ApifyRunTaskTool, + ApifyRunTaskAndGetDatasetTool, +] + +APIFY_SOCIAL_TOOLS: list[type[BaseTool]] = [ + ApifyInstagramScraperTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyLinkedInProfileDetailTool, + ApifyTwitterScraperTool, + ApifyTikTokScraperTool, + ApifyFacebookPostsScraperTool, +] + __all__ = [ + # Existing components (backward-compatible) 'ApifyActorsTool', 'ApifyDatasetLoader', 'ApifyWrapper', + # Core generic tools + 'ApifyGetDatasetItemsTool', + 'ApifyRunActorAndGetDatasetTool', + 'ApifyRunActorTool', + 'ApifyRunTaskAndGetDatasetTool', + 'ApifyRunTaskTool', + 'ApifyScrapeUrlTool', + # Social media Actor tools + 'ApifyFacebookPostsScraperTool', + 'ApifyInstagramScraperTool', + 'ApifyLinkedInProfileDetailTool', + 'ApifyLinkedInProfilePostsTool', + 'ApifyLinkedInProfileSearchTool', + 'ApifyTikTokScraperTool', + 'ApifyTwitterScraperTool', + # Tool group lists + 'APIFY_CORE_TOOLS', + 'APIFY_SOCIAL_TOOLS', + # Meta '__version__', ] diff --git a/langchain_apify/_actor_tools.py b/langchain_apify/_actor_tools.py new file mode 100644 index 0000000..f7e7721 --- /dev/null +++ b/langchain_apify/_actor_tools.py @@ -0,0 +1,569 @@ +"""Apify Actor-specific LangChain tools for social media platforms. + +Each tool wraps a single Apify Actor behind a simplified, LLM-friendly +interface so that LangChain agents can scrape social media data without +needing to know Actor IDs or raw input schemas. 
+""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Literal + +from langchain_core.tools import ToolException +from pydantic import BaseModel, Field + +from langchain_apify._utils import _run_meta +from langchain_apify.tools import _ApifyGenericTool + +if TYPE_CHECKING: + from langchain_core.callbacks import CallbackManagerForToolRun + +# --------------------------------------------------------------------------- +# Input schemas +# --------------------------------------------------------------------------- + + +class ApifyInstagramScraperInput(BaseModel): + """Input schema for :class:`ApifyInstagramScraperTool`.""" + + search_type: Literal['user', 'hashtag', 'post', 'comments'] = Field( + description=( + 'Type of data to scrape: "user" for a profile\'s posts, "hashtag" ' + 'for posts under a tag, "post" for a single post, "comments" for ' + 'comments on a post.' + ), + ) + search_query: str = Field( + description=( + 'Username, hashtag, or full Instagram URL depending on search_type. ' + 'For "comments" you must pass a post URL (e.g. instagram.com/p/...).' + ), + ) + max_results: int = Field(default=20, description='Maximum number of items to return.') + only_posts_newer_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' + ), + ) + + +class ApifyLinkedInProfilePostsInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfilePostsTool`.""" + + profile_url: str = Field( + description='LinkedIn profile URL or username (e.g. "satyanadella" or "linkedin.com/in/satyanadella").', + ) + max_results: int = Field(default=20, description='Maximum number of posts to return.') + + +class ApifyLinkedInProfileSearchInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfileSearchTool`.""" + + query: str = Field(description='Search keywords (e.g. 
name, title, company).') + max_results: int = Field(default=10, description='Maximum number of profiles to return.') + + +class ApifyLinkedInProfileDetailInput(BaseModel): + """Input schema for :class:`ApifyLinkedInProfileDetailTool`.""" + + profile_url: str = Field( + description='LinkedIn profile URL, username, or URN (e.g. "neal-mohan").', + ) + include_email: bool = Field( + default=False, + description='If True, attempt to include the profile email when available.', + ) + + +class ApifyTwitterScraperInput(BaseModel): + """Input schema for :class:`ApifyTwitterScraperTool`.""" + + search_query: str = Field(description='Search term, Twitter handle, or tweet URL.') + search_mode: Literal['search', 'user', 'replies'] = Field( + default='search', + description=( + 'Scraping mode: "search" for keyword search, "user" for a handle\'s ' + 'tweets, "replies" for a tweet URL\'s replies.' + ), + ) + max_results: int = Field(default=20, description='Maximum number of tweets to return.') + start: str | None = Field( + default=None, + description='Optional start date - only return tweets newer than this date.', + ) + end: str | None = Field( + default=None, + description='Optional end date - only return tweets older than this date.', + ) + sort: Literal['Latest', 'Top'] | None = Field( + default=None, + description='Optional sort order: "Latest" for most recent first, "Top" for most popular.', + ) + + +class ApifyTikTokScraperInput(BaseModel): + """Input schema for :class:`ApifyTikTokScraperTool`.""" + + search_query: str = Field(description='Username, hashtag, search keyword, or TikTok post URL.') + search_type: Literal['search', 'user', 'hashtag', 'post'] = Field( + default='search', + description=( + 'Type of content to scrape: "search" for keyword search, "user" for ' + 'a profile\'s videos, "hashtag" for videos under a tag, "post" for a ' + 'specific TikTok post URL.' 
+ ), + ) + max_results: int = Field(default=20, description='Maximum number of items to return.') + + +class ApifyFacebookPostsScraperInput(BaseModel): + """Input schema for :class:`ApifyFacebookPostsScraperTool`.""" + + page_url: str = Field(description='Facebook page URL to scrape (public pages only).') + max_results: int = Field(default=20, description='Maximum number of posts to return.') + only_posts_newer_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' + ), + ) + only_posts_older_than: str | None = Field( + default=None, + description=( + 'Optional date filter. Accepts YYYY-MM-DD, ISO-8601, or relative ' + 'values like "1 day", "2 months", "3 years".' + ), + ) + + +# --------------------------------------------------------------------------- +# Tools +# --------------------------------------------------------------------------- + + +class ApifyInstagramScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape Instagram profiles, hashtags, posts, or comments. + + Uses the ``apify/instagram-scraper`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of scraped item dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyInstagramScraperTool + + tool = ApifyInstagramScraperTool() + result = tool.invoke({ + "search_type": "user", + "search_query": "apify", + "max_results": 10, + }) + """ + + name: str = 'apify_instagram_scraper' + description: str = ( + 'Scrape Instagram profiles, hashtags, posts, or comments and return the results as JSON.' 
+ ' Required: search_type (one of "user", "hashtag", "post", "comments"),' + ' search_query (str - username, hashtag, or post URL).' + ' Optional: max_results (int, default 20),' + ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week").' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyInstagramScraperInput + + def _run( + self, + search_type: Literal['user', 'hashtag', 'post', 'comments'], + search_query: str, + max_results: int = 20, + only_posts_newer_than: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.instagram_scrape( + search_type=search_type, + search_query=search_query, + max_results=self._clamp_items(max_results), + only_posts_newer_than=only_posts_newer_than, + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfilePostsTool(_ApifyGenericTool): # type: ignore[override] + """Extract posts from a LinkedIn profile. + + Uses the ``apimaestro/linkedin-profile-posts`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of post dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfilePostsTool + + tool = ApifyLinkedInProfilePostsTool() + result = tool.invoke({ + "profile_url": "https://www.linkedin.com/in/satyanadella", + "max_results": 10, + }) + """ + + name: str = 'apify_linkedin_profile_posts' + description: str = ( + 'Extract posts from a LinkedIn profile and return them as JSON.' + ' Required: profile_url (str - LinkedIn profile URL or username, e.g. "satyanadella").' + ' Optional: max_results (int, default 20).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyLinkedInProfilePostsInput + + def _run( + self, + profile_url: str, + max_results: int = 20, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_posts( + profile_url=profile_url, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfileSearchTool(_ApifyGenericTool): # type: ignore[override] + """Search for LinkedIn profiles by keyword or criteria. + + Uses the ``harvestapi/linkedin-profile-search`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of profile dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfileSearchTool + + tool = ApifyLinkedInProfileSearchTool() + result = tool.invoke({ + "query": "Founder", + "max_results": 10, + }) + """ + + name: str = 'apify_linkedin_profile_search' + description: str = ( + 'Search for LinkedIn profiles by keyword (name, title, company) and return matching profiles as JSON.' + ' Required: query (str - search keywords).' + ' Optional: max_results (int, default 10).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyLinkedInProfileSearchInput + + def _run( + self, + query: str, + max_results: int = 10, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_search( + query=query, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyLinkedInProfileDetailTool(_ApifyGenericTool): # type: ignore[override] + """Retrieve detailed information from a specific LinkedIn profile. + + Uses the ``apimaestro/linkedin-profile-detail`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (typically + a single-element list with the profile dict). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyLinkedInProfileDetailTool + + tool = ApifyLinkedInProfileDetailTool() + result = tool.invoke({ + "profile_url": "https://www.linkedin.com/in/neal-mohan", + }) + """ + + name: str = 'apify_linkedin_profile_detail' + description: str = ( + 'Retrieve detailed information from a specific LinkedIn profile and return it as JSON.' + ' Required: profile_url (str - LinkedIn profile URL, username, or URN, e.g. "neal-mohan").' + ' Optional: include_email (bool, default False - include profile email if available).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyLinkedInProfileDetailInput + + def _run( + self, + profile_url: str, + *, + include_email: bool = False, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.linkedin_profile_detail( + profile_url=profile_url, + include_email=include_email, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyTwitterScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape tweets, profiles, or replies from Twitter/X. + + Uses the ``apidojo/twitter-scraper-lite`` Actor under the hood. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of tweet dicts). + + Example: + .. 
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyTwitterScraperTool + + tool = ApifyTwitterScraperTool() + result = tool.invoke({ + "search_query": "apify", + "search_mode": "search", + "max_results": 20, + }) + """ + + name: str = 'apify_twitter_scraper' + description: str = ( + 'Scrape tweets from Twitter/X by search term, user handle, or tweet URL and return them as JSON.' + ' Required: search_query (str - search term, handle, or tweet URL).' + ' Optional: search_mode (one of "search", "user", "replies"; default "search"),' + ' max_results (int, default 20),' + ' start (str - ISO date, only return tweets newer than this date),' + ' end (str - ISO date, only return tweets older than this date),' + ' sort (one of "Latest", "Top" - sort order for results).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyTwitterScraperInput + + def _run( # noqa: PLR0913 + self, + search_query: str, + search_mode: Literal['search', 'user', 'replies'] = 'search', + max_results: int = 20, + start: str | None = None, + end: str | None = None, + sort: Literal['Latest', 'Top'] | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.twitter_scrape( + search_query=search_query, + search_mode=search_mode, + max_results=self._clamp_items(max_results), + start=start, + end=end, + sort=sort, + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyTikTokScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape TikTok videos, profiles, or hashtag content. + + Uses the ``clockworks/tiktok-scraper`` Actor under the hood. 
+ + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of TikTok item dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyTikTokScraperTool + + tool = ApifyTikTokScraperTool() + result = tool.invoke({ + "search_query": "cooking", + "search_type": "search", + "max_results": 20, + }) + """ + + name: str = 'apify_tiktok_scraper' + description: str = ( + 'Scrape TikTok by search keyword, profile, hashtag, or post URL and return the results as JSON.' + ' Required: search_query (str - keyword, username, hashtag, or TikTok post URL).' + ' Optional: search_type (one of "search", "user", "hashtag", "post"; default "search"),' + ' max_results (int, default 20).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' + ) + args_schema: type[BaseModel] = ApifyTikTokScraperInput + + def _run( + self, + search_query: str, + search_type: Literal['search', 'user', 'hashtag', 'post'] = 'search', + max_results: int = 20, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.tiktok_scrape( + search_query=search_query, + search_type=search_type, + max_results=self._clamp_items(max_results), + timeout_secs=self.max_timeout_secs, + ) + except (RuntimeError, ValueError) as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) + + +class ApifyFacebookPostsScraperTool(_ApifyGenericTool): # type: ignore[override] + """Scrape public Facebook page posts. + + Uses the ``apify/facebook-posts-scraper`` Actor under the hood. 
+ Only public Facebook pages are supported - personal profiles cannot + be scraped. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of post dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyFacebookPostsScraperTool + + tool = ApifyFacebookPostsScraperTool() + result = tool.invoke({ + "page_url": "https://www.facebook.com/humansofnewyork/", + "max_results": 20, + }) + """ + + name: str = 'apify_facebook_posts_scraper' + description: str = ( + 'Scrape posts from a public Facebook page and return them as JSON.' + ' Required: page_url (str - Facebook page URL; personal profiles are not supported).' + ' Optional: max_results (int, default 20),' + ' only_posts_newer_than (str - date filter, e.g. "2025-01-01" or "1 week"),' + ' only_posts_older_than (str - date filter, e.g. "2025-01-01" or "1 week").' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at) and items.' + ' Use only the data returned; do not hallucinate missing fields.' 
+ ) + args_schema: type[BaseModel] = ApifyFacebookPostsScraperInput + + def _run( + self, + page_url: str, + max_results: int = 20, + only_posts_newer_than: str | None = None, + only_posts_older_than: str | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.facebook_posts_scrape( + page_url=page_url, + max_results=self._clamp_items(max_results), + only_posts_newer_than=only_posts_newer_than, + only_posts_older_than=only_posts_older_than, + timeout_secs=self.max_timeout_secs, + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) diff --git a/langchain_apify/_client.py b/langchain_apify/_client.py new file mode 100644 index 0000000..54de8d6 --- /dev/null +++ b/langchain_apify/_client.py @@ -0,0 +1,569 @@ +from __future__ import annotations + +import os + +import httpx +from apify_client import ApifyClient +from apify_client.errors import ApifyClientError +from pydantic import SecretStr + +from langchain_apify._error_messages import ( + _ERROR_ACTOR_RUN_FAILED, + _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET, + _ERROR_SCRAPE_EMPTY, +) +from langchain_apify._utils import _create_apify_client + +# Transport-level failures (ApifyClientError, httpx.HTTPError) are caught and re-raised as RuntimeError. Other errors propagate. 
+_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError) + +_SCRAPE_ACTOR_ID = 'apify/website-content-crawler' +_INSTAGRAM_ACTOR_ID = 'apify/instagram-scraper' +_LINKEDIN_POSTS_ACTOR_ID = 'apimaestro/linkedin-profile-posts' +_LINKEDIN_SEARCH_ACTOR_ID = 'harvestapi/linkedin-profile-search' +_LINKEDIN_DETAIL_ACTOR_ID = 'apimaestro/linkedin-profile-detail' +_TWITTER_ACTOR_ID = 'apidojo/twitter-scraper-lite' +_TIKTOK_ACTOR_ID = 'clockworks/tiktok-scraper' +_FACEBOOK_ACTOR_ID = 'apify/facebook-posts-scraper' +_DEFAULT_RUN_TIMEOUT_SECS = 300 +_DEFAULT_SCRAPE_TIMEOUT_SECS = 120 +_DEFAULT_SOCIAL_TIMEOUT_SECS = 600 +_DEFAULT_DATASET_ITEMS_LIMIT = 100 +_DEFAULT_SOCIAL_RESULTS_LIMIT = 20 +_RUN_STATUS_SUCCEEDED = 'SUCCEEDED' + +# Instagram-specific mappings +_INSTAGRAM_RESULTS_TYPE_MAP = { + 'user': 'posts', + 'hashtag': 'posts', + 'post': 'posts', + 'comments': 'comments', +} + + +class ApifyToolsClient: + """Internal helper that wraps ``ApifyClient`` for the tools layer. + + One convenience method per tool operation. All methods are synchronous and + block until the Actor run finishes. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Raises: + ValueError: If no token is provided and the env var is not set. + """ + + def __init__(self, apify_api_token: SecretStr | str | None = None) -> None: + if isinstance(apify_api_token, SecretStr): + _token: str | None = apify_api_token.get_secret_value() + else: + _token = apify_api_token or os.getenv('APIFY_API_TOKEN') + + if not _token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + self._client = _create_apify_client(ApifyClient, _token) + + def run_actor( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start an Actor and block until it finishes. + + Args: + actor_id: Actor ID or name (e.g. ``"apify/python-example"``). 
+ run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for Actor default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the API call fails, returns no run details, or the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.actor(actor_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify Actor call failed for {actor_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Actor {actor_id} call returned no run details.' + raise RuntimeError(msg) + self._check_run_status(run) + return run + + def get_dataset_items( + self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0 + ) -> list[dict]: + """Fetch items from an existing dataset. + + Args: + dataset_id: Apify dataset ID. + limit: Maximum number of items to return. + offset: Number of items to skip from the start. + + Returns: + List of dataset item dicts (may be empty). + """ + try: + return self._client.dataset(dataset_id).list_items(limit=limit, offset=offset, clean=True).items + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify dataset fetch failed for {dataset_id}: {exc}' + raise RuntimeError(msg) from exc + + def run_actor_and_get_items( + self, + actor_id: str, + run_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run an Actor, then fetch items from its default dataset. + + Args: + actor_id: Actor ID or name. + run_input: JSON-serialisable input for the Actor. + timeout_secs: Maximum time to wait for the run to finish. 
+ memory_mbytes: Memory limit for the run, or *None* for Actor default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Actor {actor_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def run_task( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + ) -> dict: + """Start a saved Actor task and block until it finishes. + + Args: + task_id: Task ID or name (e.g. ``"user/my-task"``). + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + + Returns: + Full run-details dict returned by the Apify API. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs} + if memory_mbytes is not None: + call_kwargs['memory_mbytes'] = memory_mbytes + + try: + run = self._client.task(task_id).call(**call_kwargs) + except _TRANSPORT_EXCEPTIONS as exc: + msg = f'Apify task call failed for {task_id}: {exc}' + raise RuntimeError(msg) from exc + if run is None: + msg = f'Task {task_id} call returned no run details.' 
+ raise RuntimeError(msg) + self._check_run_status(run) + return run + + def run_task_and_get_items( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS, + memory_mbytes: int | None = None, + dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, + ) -> tuple[dict, list[dict]]: + """Run a saved Actor task, then fetch items from its default dataset. + + Args: + task_id: Task ID or name. + task_input: JSON-serialisable input that overrides the task's + pre-saved input. + timeout_secs: Maximum time to wait for the run to finish. + memory_mbytes: Memory limit for the run, or *None* for task default. + dataset_items_limit: Maximum number of dataset items to return. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the run does not finish with status ``SUCCEEDED``. + """ + run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes) + dataset_id = run.get('defaultDatasetId') + if not dataset_id: + msg = f'Task {task_id} run succeeded but returned no default dataset ID.' + raise RuntimeError(msg) + items = self._list_items_or_raise(dataset_id, dataset_items_limit) + return run, items + + def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str: + """Scrape a single URL and return its content as markdown. + + Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``. + + Args: + url: The URL to scrape. + timeout_secs: Maximum time to wait for the crawl to finish. + + Returns: + Markdown (or plain-text fallback) content of the page. + + Raises: + RuntimeError: If the Actor run fails or no content is extracted. 
+ """ + run_input = { + 'startUrls': [{'url': url}], + 'maxCrawlPages': 1, + } + _, items = self.run_actor_and_get_items( + _SCRAPE_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=1, + ) + if not items: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + + content = items[0].get('markdown') or items[0].get('text') or '' + if not content: + msg = _ERROR_SCRAPE_EMPTY.format(url=url) + raise RuntimeError(msg) + return content + + def instagram_scrape( + self, + search_type: str, + search_query: str, + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + only_posts_newer_than: str | None = None, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape Instagram via ``apify/instagram-scraper``. + + Args: + search_type: One of ``"user"``, ``"hashtag"``, ``"post"``, ``"comments"``. + search_query: Username, hashtag, or Instagram URL depending on + ``search_type``. + max_results: Maximum number of items to return. + only_posts_newer_than: Optional date filter. Accepts ``YYYY-MM-DD``, + ISO-8601, or relative (e.g. ``"1 day"``, ``"2 months"``). + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + ValueError: If ``search_type`` is not recognised. + RuntimeError: If the Actor run does not succeed. + """ + results_type = _INSTAGRAM_RESULTS_TYPE_MAP.get(search_type) + if results_type is None: + msg = ( + f'Unsupported Instagram search_type {search_type!r}. ' + f'Expected one of: {sorted(_INSTAGRAM_RESULTS_TYPE_MAP)}.' 
+ ) + raise ValueError(msg) + + direct_url = self._build_instagram_url(search_type, search_query) + run_input: dict = { + 'directUrls': [direct_url], + 'resultsType': results_type, + 'resultsLimit': max_results, + } + if only_posts_newer_than is not None: + run_input['onlyPostsNewerThan'] = only_posts_newer_than + return self.run_actor_and_get_items( + _INSTAGRAM_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def linkedin_profile_posts( + self, + profile_url: str, + max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Scrape LinkedIn profile posts via ``apimaestro/linkedin-profile-posts``. + + Args: + profile_url: LinkedIn profile URL or username. + max_results: Maximum number of posts to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the Actor run does not succeed. + """ + run_input: dict = { + 'username': profile_url, + 'total_posts': max_results, + } + return self.run_actor_and_get_items( + _LINKEDIN_POSTS_ACTOR_ID, + run_input=run_input, + timeout_secs=timeout_secs, + dataset_items_limit=max_results, + ) + + def linkedin_profile_search( + self, + query: str, + max_results: int = 10, + timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS, + ) -> tuple[dict, list[dict]]: + """Search LinkedIn profiles via ``harvestapi/linkedin-profile-search``. + + Args: + query: Search keywords (e.g., name, title, company). + max_results: Maximum number of profiles to return. + timeout_secs: Maximum time to wait for the run to finish. + + Returns: + A ``(run_details, items)`` tuple. + + Raises: + RuntimeError: If the Actor run does not succeed. 
def linkedin_profile_detail(
    self,
    profile_url: str,
    *,
    include_email: bool = False,
    timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS,
) -> tuple[dict, list[dict]]:
    """Look up one LinkedIn profile via ``apimaestro/linkedin-profile-detail``.

    Args:
        profile_url: LinkedIn profile URL or username; sent to the Actor as
            its ``username`` input.
        include_email: Forwarded as ``includeEmail``; when True the Actor is
            asked to include the profile email if available.
        timeout_secs: Maximum time to wait for the run to finish.

    Returns:
        A ``(run_details, items)`` tuple. At most one dataset item is
        requested, so ``items`` typically holds a single profile dict.

    Raises:
        RuntimeError: If the Actor run does not succeed.
    """
    call_options: dict = {
        'run_input': {'username': profile_url, 'includeEmail': include_email},
        'timeout_secs': timeout_secs,
        # A profile lookup yields one record, so only one item is fetched.
        'dataset_items_limit': 1,
    }
    return self.run_actor_and_get_items(_LINKEDIN_DETAIL_ACTOR_ID, **call_options)
def tiktok_scrape(
    self,
    search_query: str,
    search_type: str = 'search',
    max_results: int = _DEFAULT_SOCIAL_RESULTS_LIMIT,
    timeout_secs: int = _DEFAULT_SOCIAL_TIMEOUT_SECS,
) -> tuple[dict, list[dict]]:
    """Run the TikTok scraper Actor (``clockworks/tiktok-scraper``).

    Args:
        search_query: Username, hashtag, search keyword, or TikTok post URL.
        search_type: One of ``"search"``, ``"user"``, ``"hashtag"``, ``"post"``;
            selects which Actor input field receives ``search_query``.
        max_results: Sent as ``resultsPerPage`` and used as the cap on
            dataset items fetched afterwards.
        timeout_secs: Maximum time to wait for the run to finish.

    Returns:
        A ``(run_details, items)`` tuple.

    Raises:
        ValueError: If ``search_type`` is not recognised.
        RuntimeError: If the Actor run does not succeed.
    """
    # (search_type, Actor input field, leading characters dropped from the query)
    routes = (
        ('search', 'searchQueries', None),
        ('user', 'profiles', '@'),
        ('hashtag', 'hashtags', '#'),
        ('post', 'postURLs', None),
    )
    actor_input: dict = {'resultsPerPage': max_results}
    for mode, input_field, prefix in routes:
        if search_type == mode:
            target = search_query if prefix is None else search_query.lstrip(prefix)
            actor_input[input_field] = [target]
            break
    else:
        # No route matched: reject before any Actor run is started.
        msg = (
            f'Unsupported TikTok search_type {search_type!r}. '
            "Expected one of: ['search', 'user', 'hashtag', 'post']."
        )
        raise ValueError(msg)
    return self.run_actor_and_get_items(
        _TIKTOK_ACTOR_ID,
        run_input=actor_input,
        timeout_secs=timeout_secs,
        dataset_items_limit=max_results,
    )
@staticmethod
def _build_instagram_url(search_type: str, search_query: str) -> str:
    """Build an Instagram URL from a username, hashtag, post ID, or URL.

    Args:
        search_type: ``"hashtag"`` and ``"user"`` select the tag and profile
            URL layouts; any other value is treated as a post reference.
        search_query: A full URL, a scheme-less ``instagram.com/...`` URL, or
            a bare handle / hashtag / post ID. Surrounding whitespace is
            ignored.

    Returns:
        An absolute Instagram URL. Full ``http(s)://`` URLs are returned as
        given (minus surrounding whitespace) regardless of ``search_type``.
    """
    # Whitespace around a pasted value would otherwise end up inside the URL.
    query = search_query.strip()
    if query.startswith(('http://', 'https://')):
        return query
    # A URL given without its scheme must not be mistaken for a bare
    # handle/ID, which would yield e.g. ``.../p/instagram.com/p/<id>//``.
    if query.lower().startswith(('instagram.com/', 'www.instagram.com/')):
        return f'https://{query}'
    if search_type == 'hashtag':
        tag = query.lstrip('#')
        return f'https://www.instagram.com/explore/tags/{tag}/'
    if search_type == 'user':
        handle = query.lstrip('@')
        return f'https://www.instagram.com/{handle}/'
    # post/comments expect a URL; if a bare ID is given, build a /p/ URL
    return f'https://www.instagram.com/p/{query}/'
langchain_apify/_error_messages.py index 87462b8..0a8c612 100644 --- a/langchain_apify/error_messages.py +++ b/langchain_apify/_error_messages.py @@ -1,7 +1,11 @@ -ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( +_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = ( 'APIFY_API_TOKEN environment variable is not set.' ' Please set it to your Apify API token by using `os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"' ' in your code or pass it as environment variable.' ' To pass it as environment variable, you can use the following command:' ' `APIFY_API_TOKEN="YOUR_APIFY_API_TOKEN" python your_script.py`' ) + +_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.' + +_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.' diff --git a/langchain_apify/utils.py b/langchain_apify/_utils.py similarity index 74% rename from langchain_apify/utils.py rename to langchain_apify/_utils.py index 8cdc835..6d7c2c0 100644 --- a/langchain_apify/utils.py +++ b/langchain_apify/_utils.py @@ -1,24 +1,25 @@ from __future__ import annotations import string +from datetime import datetime from typing import TypeVar import requests from apify_client import ApifyClientAsync from apify_client.client import ApifyClient -from langchain_apify.const import MAX_DESCRIPTION_LEN, REQUESTS_TIMEOUT_SECS +_MAX_DESCRIPTION_LEN: int = 350 +_REQUESTS_TIMEOUT_SECS: float = 10.0 +_APIFY_API_ENDPOINT_GET_DEFAULT_BUILD: str = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' -APIFY_API_ENDPOINT_GET_DEFAULT_BUILD = 'https://api.apify.com/v2/acts/{actor_id}/builds/default' - -def prune_actor_input_schema( +def _prune_actor_input_schema( input_schema: dict, - max_description_len: int = MAX_DESCRIPTION_LEN, + max_description_len: int = _MAX_DESCRIPTION_LEN, ) -> tuple[dict, list[str]]: """Get the input schema from the Actor build. - Trim the description to 250 characters. + Trim descriptions to ``_MAX_DESCRIPTION_LEN`` characters. Args: input_schema (dict): The input schema from the Actor build. 
@@ -48,7 +49,7 @@ def prune_actor_input_schema( T = TypeVar('T', ApifyClient, ApifyClientAsync) -def create_apify_client(client_cls: type[T], token: str) -> T: +def _create_apify_client(client_cls: type[T], token: str) -> T: """Create an Apify client instance with a custom user-agent. Args: @@ -79,7 +80,7 @@ def create_apify_client(client_cls: type[T], token: str) -> T: return client -def actor_id_to_tool_name(actor_id: str) -> str: +def _actor_id_to_tool_name(actor_id: str) -> str: """Turn actor_id into a valid tool name. Tool name must only contain letters, numbers, underscores, dashes, @@ -95,7 +96,7 @@ def actor_id_to_tool_name(actor_id: str) -> str: return 'apify_actor_' + ''.join(char if char in valid_chars else '_' for char in actor_id) -def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: +def _get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: """Get the latest build of an Actor from the default build tag. Args: @@ -117,8 +118,8 @@ def get_actor_latest_build(apify_client: ApifyClient, actor_id: str) -> dict: msg = f'Failed to get the Actor object ID for {actor_id}.' 
def _iso(value: str | datetime | None) -> str | None:
    """Return ``value.isoformat()`` for a ``datetime``; pass anything else through unchanged."""
    return value.isoformat() if isinstance(value, datetime) else value


def _run_meta(run: dict) -> dict:
    """Reduce an Apify run-details dict to a compact metadata dict.

    Args:
        run: Run details; the keys ``id``, ``status``, ``defaultDatasetId``,
            ``startedAt`` and ``finishedAt`` are read, and any that are
            missing come back as ``None``.

    Returns:
        A dict with keys ``run_id``, ``status``, ``dataset_id``,
        ``started_at`` and ``finished_at`` (in that order). The two
        timestamps are passed through ``_iso`` so ``datetime`` values become
        ISO-8601 strings.
    """
    # (output key, key in the run-details dict)
    plain_fields = (
        ('run_id', 'id'),
        ('status', 'status'),
        ('dataset_id', 'defaultDatasetId'),
    )
    timestamp_fields = (
        ('started_at', 'startedAt'),
        ('finished_at', 'finishedAt'),
    )
    meta = {out_key: run.get(src_key) for out_key, src_key in plain_fields}
    meta.update((out_key, _iso(run.get(src_key))) for out_key, src_key in timestamp_fields)
    return meta
_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client if TYPE_CHECKING: from collections.abc import Iterator @@ -40,10 +41,15 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): documents = loader.load() """ - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - """An instance of the ApifyClient class from the apify-client Python package.""" + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. Falls back to APIFY_API_TOKEN / APIFY_TOKEN environment variables.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[dict], Document] @@ -54,7 +60,7 @@ def __init__( self, dataset_id: str, dataset_mapping_function: Callable[[dict], Document], - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, ) -> None: """Initialize the loader with an Apify dataset ID and a mapping function. @@ -63,34 +69,43 @@ def __init__( dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class. - apify_api_token (str): Apify API token. + apify_api_token (str | SecretStr): Apify API token. Falls back to the + ``APIFY_API_TOKEN`` / ``APIFY_TOKEN`` environment variables. """ - super().__init__( - dataset_id=dataset_id, - dataset_mapping_function=dataset_mapping_function, - apify_api_token=apify_api_token, - ) - - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Args: - values (dict): The values to validate. 
@model_validator(mode='after')
def _init_client(self) -> ApifyDatasetLoader:
    """Resolve the Apify API token and initialise the client.

    The token is taken from ``apify_api_token`` first. When that is unset
    *or empty*, ``APIFY_TOKEN`` is checked as a secondary fallback for code
    running on the Apify platform where only that variable is set.

    Returns:
        ApifyDatasetLoader: The validated loader instance, with
        ``apify_client`` populated.

    Raises:
        ValueError: If no non-empty token is available from any source.
    """
    # An empty secret (blank ``APIFY_API_TOKEN`` or ``apify_api_token=''``)
    # is as unusable as a missing one, so it must not block the fallback or
    # reach the client; the other tools in this package reject it the same way.
    raw_token = '' if self.apify_api_token is None else self.apify_api_token.get_secret_value()
    if not raw_token:
        # Secondary fallback for code running on the Apify platform.
        raw_token = os.getenv('APIFY_TOKEN', '')
    if not raw_token:
        msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET
        raise ValueError(msg)
    self.apify_client = _create_apify_client(ApifyClient, raw_token)
    return self
code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunActorTool + + tool = ApifyRunActorTool() + result = tool.invoke({"actor_id": "apify/python-example"}) + +For details, see https://docs.apify.com/platform/integrations/langchain +""" + from __future__ import annotations import json @@ -5,19 +23,21 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient -from langchain_core.tools import BaseTool -from pydantic import BaseModel, Field, create_model - -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import ( - actor_id_to_tool_name, - create_apify_client, - get_actor_latest_build, - prune_actor_input_schema, +from langchain_core.tools import BaseTool, ToolException +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, Field, PrivateAttr, SecretStr, create_model + +from langchain_apify._client import ApifyToolsClient +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import ( + _MAX_DESCRIPTION_LEN, + _actor_id_to_tool_name, + _create_apify_client, + _get_actor_latest_build, + _prune_actor_input_schema, + _run_meta, ) -from .const import MAX_DESCRIPTION_LEN - if TYPE_CHECKING: from langchain_core.callbacks import ( CallbackManagerForToolRun, @@ -56,10 +76,13 @@ class ApifyActorsTool(BaseTool): # type: ignore[override, override] chunk["messages"][-1].pretty_print() """ + _apify_client: ApifyClient = PrivateAttr() + _actor_id: str = PrivateAttr() + def __init__( self, actor_id: str, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: @@ -74,16 +97,20 @@ def __init__( Raises: ValueError: If the `APIFY_API_TOKEN` environment variable is not set """ - apify_api_token = apify_api_token or os.getenv('APIFY_API_TOKEN') - if not 
apify_api_token: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + _raw_token: str | None = ( + apify_api_token.get_secret_value() + if isinstance(apify_api_token, SecretStr) + else apify_api_token or os.getenv('APIFY_API_TOKEN') + ) + if not _raw_token: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, apify_api_token) + apify_client = _create_apify_client(ApifyClient, _raw_token) kwargs.update( { - 'name': actor_id_to_tool_name(actor_id), + 'name': _actor_id_to_tool_name(actor_id), 'description': self._create_description(apify_client, actor_id), 'args_schema': self._build_tool_args_schema_model( apify_client, @@ -126,10 +153,10 @@ def _create_description(apify_client: ApifyClient, actor_id: str) -> str: Returns: str: The description. """ - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) actor_description = build.get('actorDefinition', {}).get('description', '') - if len(actor_description) > MAX_DESCRIPTION_LEN: - actor_description = actor_description[:MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' + if len(actor_description) > _MAX_DESCRIPTION_LEN: + actor_description = actor_description[:_MAX_DESCRIPTION_LEN] + '...(TRUNCATED, TOO LONG)' return actor_description @staticmethod @@ -149,12 +176,12 @@ def _build_tool_args_schema_model( Raises: ValueError: If the input schema is not found in the Actor build. 
""" - build = get_actor_latest_build(apify_client, actor_id) + build = _get_actor_latest_build(apify_client, actor_id) if not (actor_input := build.get('actorDefinition', {}).get('input')): msg = f'Input schema not found in the Actor build for Actor: {actor_id}' raise ValueError(msg) - properties, required = prune_actor_input_schema(actor_input) + properties, required = _prune_actor_input_schema(actor_input) properties = {'run_input': properties} description = ( @@ -192,3 +219,469 @@ def _run_actor(self, run_input: dict) -> list[dict]: run = self._apify_client.run(run_id=run_id) return run.dataset().list_items(clean=True).items + + +# --------------------------------------------------------------------------- +# Input schemas for the generic tools +# --------------------------------------------------------------------------- + + +class ApifyRunActorInput(BaseModel): + """Input schema for :class:`ApifyRunActorTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. "apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + + +class ApifyGetDatasetItemsInput(BaseModel): + """Input schema for :class:`ApifyGetDatasetItemsTool`.""" + + dataset_id: str = Field(description='Apify dataset ID.') + limit: int = Field(default=100, description='Maximum number of items to return.') + offset: int = Field(default=0, description='Number of items to skip from the start.') + + +class ApifyRunActorAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunActorAndGetDatasetTool`.""" + + actor_id: str = Field(description='Actor ID or name (e.g. 
"apify/python-example").') + run_input: dict | None = Field(default=None, description='JSON-serialisable input for the Actor.') + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field(default=None, description='Memory limit in MB for the run, or null for default.') + dataset_items_limit: int = Field(default=100, description='Maximum number of dataset items to return.') + + +class ApifyScrapeUrlInput(BaseModel): + """Input schema for :class:`ApifyScrapeUrlTool`.""" + + url: str = Field(description='The URL to scrape.') + timeout_secs: int = Field(default=120, description='Maximum time in seconds to wait for the crawl to finish.') + + +class ApifyRunTaskInput(BaseModel): + """Input schema for :class:`ApifyRunTaskTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' + ) + + +class ApifyRunTaskAndGetDatasetInput(BaseModel): + """Input schema for :class:`ApifyRunTaskAndGetDatasetTool`.""" + + task_id: str = Field(description='Task ID or name (e.g. "user/my-task").') + task_input: dict | None = Field( + default=None, description="JSON-serialisable input that overrides the task's pre-saved input." + ) + timeout_secs: int = Field(default=300, description='Maximum time in seconds to wait for the run to finish.') + memory_mbytes: int | None = Field( + default=None, description='Memory limit in MB for the run, or null for task default.' 
class _ApifyGenericTool(BaseTool):  # type: ignore[override]
    """Shared base for all generic Apify tools.

    Handles ``ApifyToolsClient`` creation, sets ``handle_tool_error``,
    and defines developer-controlled safety limits that clamp values the
    LLM may provide at invocation time.

    Subclasses only need to declare ``name``, ``description``,
    ``args_schema``, and ``_run()``.

    Raises:
        ValueError: On construction, when no non-empty Apify API token is
            available.
    """

    handle_tool_error: bool = True

    apify_api_token: SecretStr | None = Field(
        default_factory=secret_from_env('APIFY_API_TOKEN', default=None),
        description='Apify API token. Falls back to the APIFY_API_TOKEN environment variable when None.',
        exclude=True,
        repr=False,
    )
    max_timeout_secs: int = Field(default=600, description='Upper bound for timeout_secs the LLM may request.')
    max_memory_mbytes: int = Field(default=32768, description='Upper bound for memory_mbytes the LLM may request.')
    max_items: int = Field(default=1000, description='Upper bound for limit / dataset_items_limit the LLM may request.')

    _client: ApifyToolsClient = PrivateAttr()

    def model_post_init(self, context: Any) -> None:  # noqa: ANN401
        """Validate the token and build the shared client once the fields are set."""
        # The token can be present but empty (``apify_api_token=''`` or a
        # blank environment variable). Treat that like a missing token so
        # the caller gets the explanatory error instead of a client that
        # fails later with an authentication error.
        raw_token = '' if self.apify_api_token is None else self.apify_api_token.get_secret_value()
        if not raw_token:
            msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET
            raise ValueError(msg)
        self._client = ApifyToolsClient(apify_api_token=raw_token)
        super().model_post_init(context)

    def _clamp_timeout(self, value: int) -> int:
        """Bound a requested timeout to ``[1, max_timeout_secs]``."""
        return max(1, min(value, self.max_timeout_secs))

    def _clamp_memory(self, value: int | None) -> int | None:
        """Bound a requested memory size, or return ``None`` for the platform default."""
        # Non-positive values fall through to the platform default. Positive
        # values are floored at 128 MB (the Apify platform minimum) so the LLM
        # cannot drive into an API rejection by requesting too little memory.
        if value is None or value <= 0:
            return None
        return max(128, min(value, self.max_memory_mbytes))

    def _clamp_items(self, value: int) -> int:
        """Bound a requested item count to ``[1, max_items]``."""
        return max(1, min(value, self.max_items))
class ApifyGetDatasetItemsTool(_ApifyGenericTool):  # type: ignore[override]
    """Fetch items from an existing Apify dataset by ID.

    Returns a JSON object with an ``"items"`` key containing the list of item
    dicts. When no items come back an additional ``"message"`` key is
    included; it distinguishes an empty dataset (``offset`` 0) from a page
    that simply lies past the available items (``offset`` > 0).

    Args:
        apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN``
            environment variable when *None*.

    Returns:
        JSON object ``{"items": [...]}``; includes ``"message"`` when the
        requested page is empty.

    Example:
        .. code-block:: python

            import os
            os.environ["APIFY_API_TOKEN"] = "your-apify-api-token"

            from langchain_apify import ApifyGetDatasetItemsTool

            tool = ApifyGetDatasetItemsTool()
            result = tool.invoke({"dataset_id": "abc123", "limit": 10})
    """

    name: str = 'apify_get_dataset_items'
    description: str = (
        'Fetch items from an Apify dataset by ID. Returns a JSON object with an "items" array.'
        ' Required: dataset_id (str) — Apify dataset ID.'
        ' Optional: limit (int, default 100), offset (int, default 0).'
    )
    args_schema: type[BaseModel] = ApifyGetDatasetItemsInput

    def _run(
        self,
        dataset_id: str,
        limit: int = 100,
        offset: int = 0,
        _run_manager: CallbackManagerForToolRun | None = None,
    ) -> str:
        """Fetch one page of dataset items and serialise it to JSON.

        Args:
            dataset_id: Apify dataset ID.
            limit: Requested page size; clamped to ``[1, max_items]``.
            offset: Number of items to skip; negative values are treated as 0.

        Returns:
            JSON string ``{"items": [...]}``, plus a ``"message"`` when the
            page is empty.

        Raises:
            ToolException: If the client raises ``RuntimeError``.
        """
        # Like the other LLM-supplied numbers, the offset is sanitised so a
        # negative value does not turn into a failed request.
        safe_offset = max(0, offset)
        try:
            items = self._client.get_dataset_items(dataset_id, self._clamp_items(limit), safe_offset)
        except RuntimeError as exc:
            raise ToolException(str(exc)) from exc
        if not items:
            # Only an empty first page shows the dataset itself is empty; an
            # empty later page just means the offset is past the last item.
            if safe_offset == 0:
                message = f'Dataset {dataset_id} is empty.'
            else:
                message = f'Dataset {dataset_id} has no items at offset {safe_offset}.'
            return json.dumps({'items': [], 'message': message})
        return json.dumps({'items': items})
class ApifyScrapeUrlTool(_ApifyGenericTool):  # type: ignore[override]
    """Fetch the content of one web page as markdown.

    Delegates to the client's ``scrape_url`` helper, which runs the
    ``apify/website-content-crawler`` Actor with ``maxCrawlPages=1``. The
    result is a plain string (markdown, not JSON).

    Args:
        apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN``
            environment variable when *None*.

    Returns:
        Markdown string with the content of the scraped page, or a
        plain-text fallback when markdown is unavailable.

    Example:
        .. code-block:: python

            import os
            os.environ["APIFY_API_TOKEN"] = "your-apify-api-token"

            from langchain_apify import ApifyScrapeUrlTool

            tool = ApifyScrapeUrlTool()
            markdown = tool.invoke({"url": "https://apify.com"})
    """

    name: str = 'apify_scrape_url'
    description: str = (
        'Scrape a single URL using Apify and return its full content as a markdown string.'
        ' Required: url (str) — the URL to scrape.'
        ' Optional: timeout_secs (int, default 120).'
        ' Returns the page content as markdown (or plain text if markdown is unavailable).'
    )
    args_schema: type[BaseModel] = ApifyScrapeUrlInput

    def _run(
        self,
        url: str,
        timeout_secs: int = 120,
        _run_manager: CallbackManagerForToolRun | None = None,
    ) -> str:
        """Scrape ``url`` with the timeout clamped to the developer-set bound.

        Raises:
            ToolException: If the client raises ``RuntimeError``.
        """
        try:
            page_content = self._client.scrape_url(url, self._clamp_timeout(timeout_secs))
        except RuntimeError as scrape_error:
            # Re-raised as ToolException so ``handle_tool_error`` can report it.
            raise ToolException(str(scrape_error)) from scrape_error
        else:
            return page_content
+ ) + args_schema: type[BaseModel] = ApifyRunTaskInput + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run = self._client.run_task( + task_id, task_input, self._clamp_timeout(timeout_secs), self._clamp_memory(memory_mbytes) + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps(_run_meta(run)) + + +class ApifyRunTaskAndGetDatasetTool(_ApifyGenericTool): # type: ignore[override] + """Run a saved Apify Actor task and return both run metadata and dataset items. + + Combines :class:`ApifyRunTaskTool` and :class:`ApifyGetDatasetItemsTool` + into a single call. Returns a JSON string with ``run`` (metadata) and + ``items`` (list of dicts) keys. + + Args: + apify_api_token: Apify API token. Falls back to the ``APIFY_API_TOKEN`` + environment variable when *None*. + + Returns: + JSON string with two keys: ``run`` (dict with ``run_id``, ``status``, + ``dataset_id``, ``started_at``, ``finished_at``) and ``items`` (list + of dataset item dicts). + + Example: + .. code-block:: python + + import os + os.environ["APIFY_API_TOKEN"] = "your-apify-api-token" + + from langchain_apify import ApifyRunTaskAndGetDatasetTool + + tool = ApifyRunTaskAndGetDatasetTool() + result = tool.invoke({ + "task_id": "user/my-task", + "task_input": {"key": "value"}, + }) + """ + + name: str = 'apify_run_task_and_get_dataset' + description: str = ( + 'Run a saved Apify Actor task synchronously and return both run metadata and dataset items.' + ' Required: task_id (str) — task ID or name (e.g. "user/my-task").' + ' Optional: task_input (dict), timeout_secs (int, default 300),' + ' memory_mbytes (int|null), dataset_items_limit (int, default 100).' + ' Returns JSON with keys: run (run_id, status, dataset_id, started_at, finished_at)' + ' and items (list of dataset item dicts).' 
+ ) + args_schema: type[BaseModel] = ApifyRunTaskAndGetDatasetInput + + def _run( + self, + task_id: str, + task_input: dict | None = None, + timeout_secs: int = 300, + memory_mbytes: int | None = None, + dataset_items_limit: int = 100, + _run_manager: CallbackManagerForToolRun | None = None, + ) -> str: + try: + run, items = self._client.run_task_and_get_items( + task_id, + task_input, + self._clamp_timeout(timeout_secs), + self._clamp_memory(memory_mbytes), + self._clamp_items(dataset_items_limit), + ) + except RuntimeError as exc: + raise ToolException(str(exc)) from exc + return json.dumps({'run': _run_meta(run), 'items': items}) diff --git a/langchain_apify/wrappers.py b/langchain_apify/wrappers.py index ef17873..57a9eeb 100644 --- a/langchain_apify/wrappers.py +++ b/langchain_apify/wrappers.py @@ -4,11 +4,12 @@ from typing import TYPE_CHECKING, Any from apify_client import ApifyClient, ApifyClientAsync -from langchain_core.utils import get_from_dict_or_env -from pydantic import BaseModel, ConfigDict, model_validator +from langchain_core.utils import secret_from_env +from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client from langchain_apify.document_loaders import ApifyDatasetLoader -from langchain_apify.utils import create_apify_client if TYPE_CHECKING: from collections.abc import Callable @@ -51,49 +52,54 @@ class ApifyWrapper(BaseModel): """ # allow arbitrary types in the model config for the apify client fields - model_config = ConfigDict(arbitrary_types_allowed=True) + model_config = ConfigDict(arbitrary_types_allowed=True, populate_by_name=True) - apify_client: ApifyClient - apify_client_async: ApifyClientAsync - apify_api_token: str | None = None + apify_api_token: SecretStr | None = Field( + default_factory=secret_from_env('APIFY_API_TOKEN', default=None), + description='Apify API token. 
Falls back to the APIFY_API_TOKEN environment variable when None.', + exclude=True, + repr=False, + ) + apify_client: ApifyClient = Field(default=None, exclude=True) # type: ignore[assignment] + apify_client_async: ApifyClientAsync = Field(default=None, exclude=True) # type: ignore[assignment] def __init__( self, - apify_api_token: str | None = None, + apify_api_token: str | SecretStr | None = None, *args: Any, # noqa: ANN401 **kwargs: Any, # noqa: ANN401 ) -> None: - """Initialize the loader with an Apify dataset ID and a mapping function. + """Initialise the wrapper. Args: - dataset_id (str): The ID of the dataset on the Apify platform. - dataset_mapping_function (Callable): A function that takes a single - dictionary (an Apify dataset item) and converts it to an instance - of the Document class. - apify_api_token (Optional[str]): Apify API token. - *args: Any: Additional positional arguments. - **kwargs: Any: Additional keyword arguments. + apify_api_token (Optional[str | SecretStr]): Apify API token. Falls + back to the ``APIFY_API_TOKEN`` environment variable when *None*. + *args: Any: Additional positional arguments forwarded to Pydantic. + **kwargs: Any: Additional keyword arguments forwarded to Pydantic. """ - kwargs.update({'apify_api_token': apify_api_token}) + # Only forward the token when explicitly provided; otherwise let the + # Pydantic ``default_factory`` read it from the environment. + if apify_api_token is not None: + kwargs['apify_api_token'] = apify_api_token super().__init__(*args, **kwargs) - @model_validator(mode='before') - @classmethod - def validate_environment(cls, values: dict) -> Any: # noqa: ANN401 - """Validate environment. - - Validate that an Apify API token is set and the apify-client - Python package exists in the current environment. + @model_validator(mode='after') + def _init_clients(self) -> ApifyWrapper: + """Validate the token and initialise both sync and async Apify clients. Returns: - Any: The validated values. 
- """ - apify_api_token = get_from_dict_or_env(values, 'apify_api_token', 'APIFY_API_TOKEN') + ApifyWrapper: The validated wrapper instance. - values['apify_client'] = create_apify_client(ApifyClient, apify_api_token) - values['apify_client_async'] = create_apify_client(ApifyClientAsync, apify_api_token) - - return values + Raises: + ValueError: If no token is provided and APIFY_API_TOKEN is not set. + """ + if self.apify_api_token is None: + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + raise ValueError(msg) + token = self.apify_api_token.get_secret_value() + self.apify_client = _create_apify_client(ApifyClient, token) + self.apify_client_async = _create_apify_client(ApifyClientAsync, token) + return self def call_actor( # noqa: PLR0913 self, diff --git a/tests/integration_tests/test_generic_tools.py b/tests/integration_tests/test_generic_tools.py new file mode 100644 index 0000000..3f2a7c8 --- /dev/null +++ b/tests/integration_tests/test_generic_tools.py @@ -0,0 +1,94 @@ +"""Integration smoke tests for the generic Apify tools. + +These tests hit the real Apify API and require the ``APIFY_API_TOKEN`` +environment variable to be set. They use ``apify/python-example`` (a +trivial Actor that adds two numbers) to keep execution fast and cheap. 
+""" + +from __future__ import annotations + +import json +import os + +import pytest + +from langchain_apify import ( + ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, +) + +_ACTOR_ID = 'apify/python-example' +_RUN_INPUT = {'first_number': 2, 'second_number': 3} + +pytestmark = pytest.mark.skipif( + not os.getenv('APIFY_API_TOKEN'), + reason='APIFY_API_TOKEN not set', +) + + +def test_run_actor_tool_smoke() -> None: + tool = ApifyRunActorTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + assert parsed['run_id'] + assert parsed['dataset_id'] + + +def test_get_dataset_items_tool_smoke() -> None: + run_tool = ApifyRunActorTool() + run_result = json.loads(run_tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT})) + dataset_id = run_result['dataset_id'] + + items_tool = ApifyGetDatasetItemsTool() + result = items_tool.invoke({'dataset_id': dataset_id, 'limit': 10}) + + parsed = json.loads(result) + assert 'items' in parsed + assert isinstance(parsed['items'], list) + + +def test_run_actor_and_get_items_tool_smoke() -> None: + tool = ApifyRunActorAndGetDatasetTool() + result = tool.invoke({'actor_id': _ACTOR_ID, 'run_input': _RUN_INPUT}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) + + +def test_scrape_url_tool_smoke() -> None: + tool = ApifyScrapeUrlTool() + result = tool.invoke({'url': 'https://crawlee.dev'}) + + assert isinstance(result, str) + assert len(result) > 0 + + +_TASK_ID = os.getenv('APIFY_TASK_ID', '') + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_tool_smoke() -> None: + tool = ApifyRunTaskTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['status'] == 'SUCCEEDED' + 
assert parsed['run_id'] + assert parsed['dataset_id'] + + +@pytest.mark.skipif(not _TASK_ID, reason='APIFY_TASK_ID not set') +def test_run_task_and_get_items_tool_smoke() -> None: + tool = ApifyRunTaskAndGetDatasetTool() + result = tool.invoke({'task_id': _TASK_ID}) + + parsed = json.loads(result) + assert parsed['run']['status'] == 'SUCCEEDED' + assert isinstance(parsed['items'], list) diff --git a/tests/integration_tests/test_utils.py b/tests/integration_tests/test_utils.py index 1107c7a..c92c038 100644 --- a/tests/integration_tests/test_utils.py +++ b/tests/integration_tests/test_utils.py @@ -2,8 +2,8 @@ from apify_client.client import ApifyClient -from langchain_apify.error_messages import ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET -from langchain_apify.utils import create_apify_client, get_actor_latest_build +from langchain_apify._error_messages import _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET +from langchain_apify._utils import _create_apify_client, _get_actor_latest_build def test_get_actor_latest_build() -> None: @@ -13,12 +13,12 @@ def test_get_actor_latest_build() -> None: ValueError: If the APIFY_API_TOKEN environment variable is not set. 
""" if (token := os.getenv('APIFY_API_TOKEN')) is None: - msg = ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET + msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET raise ValueError(msg) - apify_client = create_apify_client(ApifyClient, token) + apify_client = _create_apify_client(ApifyClient, token) - build = get_actor_latest_build(apify_client, 'apify/rag-web-browser') + build = _get_actor_latest_build(apify_client, 'apify/rag-web-browser') assert isinstance(build, dict) assert 'id' in build diff --git a/tests/unit_tests/conftest.py b/tests/unit_tests/conftest.py new file mode 100644 index 0000000..3384e79 --- /dev/null +++ b/tests/unit_tests/conftest.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from langchain_apify._client import ApifyToolsClient + +SUCCEEDED_RUN: dict = { + 'id': 'run-abc', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'dataset-xyz', + 'startedAt': '2025-01-01T00:00:00.000Z', + 'finishedAt': '2025-01-01T00:01:00.000Z', +} + +FAILED_RUN: dict = { + 'id': 'run-fail', + 'status': 'FAILED', + 'defaultDatasetId': 'dataset-xyz', +} + +SAMPLE_ITEMS: list[dict] = [ + {'text': 'item-1', 'url': 'https://example.com/1'}, + {'text': 'item-2', 'url': 'https://example.com/2'}, +] + + +@pytest.fixture +def mock_tools_client() -> MagicMock: + return MagicMock(spec=ApifyToolsClient) + + +@pytest.fixture +def mock_apify_client() -> MagicMock: + return MagicMock() + + +@pytest.fixture +def client(mock_apify_client: MagicMock) -> ApifyToolsClient: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + return ApifyToolsClient(apify_api_token='dummy-token') + + +def make_tool(tool_cls: type, mock_client: MagicMock, **kwargs: Any) -> Any: # noqa: ANN401 + """Instantiate a generic tool with a mocked ApifyToolsClient.""" + with patch.object(ApifyToolsClient, '__init__', return_value=None): + tool = tool_cls(apify_api_token='dummy-token', **kwargs) + 
tool._client = mock_client + return tool diff --git a/tests/unit_tests/test_actor_tools.py b/tests/unit_tests/test_actor_tools.py new file mode 100644 index 0000000..cf622fc --- /dev/null +++ b/tests/unit_tests/test_actor_tools.py @@ -0,0 +1,414 @@ +from __future__ import annotations + +import json +from unittest.mock import MagicMock + +import pytest +from langchain_core.tools import ToolException + +from langchain_apify._actor_tools import ( + ApifyFacebookPostsScraperTool, + ApifyInstagramScraperTool, + ApifyLinkedInProfileDetailTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyTikTokScraperTool, + ApifyTwitterScraperTool, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool + +EXPECTED_RUN_META: dict = { + 'run_id': 'run-abc', + 'status': 'SUCCEEDED', + 'dataset_id': 'dataset-xyz', + 'started_at': '2025-01-01T00:00:00.000Z', + 'finished_at': '2025-01-01T00:01:00.000Z', +} + + +# --------------------------------------------------------------------------- +# Missing token (shared base behavior) +# --------------------------------------------------------------------------- + + +def test_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyInstagramScraperTool() + + +# --------------------------------------------------------------------------- +# ApifyInstagramScraperTool +# --------------------------------------------------------------------------- + + +def test_instagram_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + result = tool._run(search_type='user', search_query='apify', max_results=10) + + parsed = json.loads(result) + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + + +def 
test_instagram_tool_passes_params(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + tool._run( + search_type='hashtag', + search_query='#travel', + max_results=5, + only_posts_newer_than='1 week', + ) + + mock_tools_client.instagram_scrape.assert_called_once_with( + search_type='hashtag', + search_query='#travel', + max_results=5, + only_posts_newer_than='1 week', + timeout_secs=600, + ) + + +def test_instagram_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client, max_items=3) + + tool._run(search_type='user', search_query='apify', max_results=100) + + assert mock_tools_client.instagram_scrape.call_args.kwargs['max_results'] == 3 + + +def test_instagram_tool_runtime_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.instagram_scrape.side_effect = RuntimeError('Actor run run-X ended with status FAILED.') + tool = make_tool(ApifyInstagramScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='run-X'): + tool._run(search_type='user', search_query='apify') + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfilePostsTool +# --------------------------------------------------------------------------- + + +def test_linkedin_posts_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_posts.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyLinkedInProfilePostsTool, mock_tools_client) + + result = tool._run(profile_url='satyanadella', max_results=10) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.linkedin_profile_posts.assert_called_once_with( + 
profile_url='satyanadella', + max_results=10, + timeout_secs=600, + ) + + +def test_linkedin_posts_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_posts.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfilePostsTool, mock_tools_client, max_items=5) + + tool._run(profile_url='satyanadella', max_results=999) + + assert mock_tools_client.linkedin_profile_posts.call_args.kwargs['max_results'] == 5 + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfileSearchTool +# --------------------------------------------------------------------------- + + +def test_linkedin_search_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + result = tool._run(query='Founder', max_results=10) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.linkedin_profile_search.assert_called_once_with( + query='Founder', + max_results=10, + timeout_secs=600, + ) + + +def test_linkedin_search_tool_default_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + tool._run(query='CTO') + + assert mock_tools_client.linkedin_profile_search.call_args.kwargs['max_results'] == 10 + + +# --------------------------------------------------------------------------- +# ApifyLinkedInProfileDetailTool +# --------------------------------------------------------------------------- + + +def test_linkedin_detail_tool_happy_path(mock_tools_client: MagicMock) -> None: + profile_item = [{'firstName': 'Neal', 'lastName': 'Mohan'}] + mock_tools_client.linkedin_profile_detail.return_value = (SUCCEEDED_RUN, profile_item) + tool = 
make_tool(ApifyLinkedInProfileDetailTool, mock_tools_client) + + result = tool._run(profile_url='neal-mohan', include_email=True) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == profile_item + mock_tools_client.linkedin_profile_detail.assert_called_once_with( + profile_url='neal-mohan', + include_email=True, + timeout_secs=600, + ) + + +def test_linkedin_detail_tool_default_include_email_false(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_detail.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileDetailTool, mock_tools_client) + + tool._run(profile_url='neal-mohan') + + assert mock_tools_client.linkedin_profile_detail.call_args.kwargs['include_email'] is False + + +# --------------------------------------------------------------------------- +# ApifyTwitterScraperTool +# --------------------------------------------------------------------------- + + +def test_twitter_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + result = tool._run(search_query='apify', max_results=20) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.twitter_scrape.assert_called_once_with( + search_query='apify', + search_mode='search', + max_results=20, + start=None, + end=None, + sort=None, + timeout_secs=600, + ) + + +def test_twitter_tool_passes_sort(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + tool._run(search_query='apify', sort='Top') + + kwargs = mock_tools_client.twitter_scrape.call_args.kwargs + assert kwargs['sort'] == 'Top' + + +def test_twitter_tool_passes_date_range(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.return_value = 
(SUCCEEDED_RUN, []) + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + tool._run(search_query='apify', search_mode='user', start='2025-01-01', end='2025-02-01') + + kwargs = mock_tools_client.twitter_scrape.call_args.kwargs + assert kwargs['search_mode'] == 'user' + assert kwargs['start'] == '2025-01-01' + assert kwargs['end'] == '2025-02-01' + + +def test_twitter_tool_value_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.twitter_scrape.side_effect = ValueError('Unsupported Twitter search_mode') + tool = make_tool(ApifyTwitterScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='Unsupported Twitter search_mode'): + tool._run(search_query='apify', search_mode='replies') # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# ApifyTikTokScraperTool +# --------------------------------------------------------------------------- + + +def test_tiktok_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client) + + result = tool._run(search_query='cooking', search_type='search', max_results=12) + parsed = json.loads(result) + + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.tiktok_scrape.assert_called_once_with( + search_query='cooking', + search_type='search', + max_results=12, + timeout_secs=600, + ) + + +def test_tiktok_tool_clamps_max_results(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client, max_items=4) + + tool._run(search_query='cooking', max_results=500) + + assert mock_tools_client.tiktok_scrape.call_args.kwargs['max_results'] == 4 + + +def test_tiktok_tool_passes_post_search_type(mock_tools_client: MagicMock) -> None: + mock_tools_client.tiktok_scrape.return_value = 
(SUCCEEDED_RUN, []) + tool = make_tool(ApifyTikTokScraperTool, mock_tools_client) + + tool._run(search_query='https://www.tiktok.com/@charlidamelio/video/123', search_type='post') + + assert mock_tools_client.tiktok_scrape.call_args.kwargs['search_type'] == 'post' + + +# --------------------------------------------------------------------------- +# ApifyFacebookPostsScraperTool +# --------------------------------------------------------------------------- + + +def test_facebook_tool_happy_path(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + result = tool._run(page_url='https://www.facebook.com/humansofnewyork/', max_results=15) + parsed = json.loads(result) + + assert parsed['run'] == EXPECTED_RUN_META + assert parsed['items'] == SAMPLE_ITEMS + mock_tools_client.facebook_posts_scrape.assert_called_once_with( + page_url='https://www.facebook.com/humansofnewyork/', + max_results=15, + only_posts_newer_than=None, + only_posts_older_than=None, + timeout_secs=600, + ) + + +def test_facebook_tool_passes_only_posts_newer_than(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + tool._run(page_url='https://www.facebook.com/humansofnewyork/', only_posts_newer_than='2025-01-01') + + assert mock_tools_client.facebook_posts_scrape.call_args.kwargs['only_posts_newer_than'] == '2025-01-01' + + +def test_facebook_tool_passes_only_posts_older_than(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + tool._run(page_url='https://www.facebook.com/humansofnewyork/', only_posts_older_than='2025-12-31') + + assert 
mock_tools_client.facebook_posts_scrape.call_args.kwargs['only_posts_older_than'] == '2025-12-31' + + +def test_facebook_tool_runtime_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.facebook_posts_scrape.side_effect = RuntimeError('Network error') + tool = make_tool(ApifyFacebookPostsScraperTool, mock_tools_client) + + with pytest.raises(ToolException, match='Network error'): + tool._run(page_url='https://www.facebook.com/humansofnewyork/') + + +# --------------------------------------------------------------------------- +# Empty results - tools should still return valid JSON +# --------------------------------------------------------------------------- + + +def test_tool_returns_valid_json_for_empty_items(mock_tools_client: MagicMock) -> None: + mock_tools_client.linkedin_profile_search.return_value = (SUCCEEDED_RUN, []) + tool = make_tool(ApifyLinkedInProfileSearchTool, mock_tools_client) + + result = tool._run(query='nonexistent') + parsed = json.loads(result) + + assert parsed['items'] == [] + assert parsed['run']['status'] == 'SUCCEEDED' + + +# --------------------------------------------------------------------------- +# handle_tool_error is True on every social tool (existing base behavior) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + 'tool_cls', + [ + ApifyInstagramScraperTool, + ApifyLinkedInProfilePostsTool, + ApifyLinkedInProfileSearchTool, + ApifyLinkedInProfileDetailTool, + ApifyTwitterScraperTool, + ApifyTikTokScraperTool, + ApifyFacebookPostsScraperTool, + ], +) +def test_social_tool_handle_tool_error_enabled(tool_cls: type, mock_tools_client: MagicMock) -> None: + tool = make_tool(tool_cls, mock_tools_client) + assert tool.handle_tool_error is True + + +# --------------------------------------------------------------------------- +# Per-tool RuntimeError -> ToolException coverage +# 
--------------------------------------------------------------------------- + +# (tool_cls, client_method_name, _run kwargs) +_TOOL_INVOCATIONS: list[tuple[type, str, dict]] = [ + (ApifyInstagramScraperTool, 'instagram_scrape', {'search_type': 'user', 'search_query': 'apify'}), + (ApifyLinkedInProfilePostsTool, 'linkedin_profile_posts', {'profile_url': 'satyanadella'}), + (ApifyLinkedInProfileSearchTool, 'linkedin_profile_search', {'query': 'Founder'}), + (ApifyLinkedInProfileDetailTool, 'linkedin_profile_detail', {'profile_url': 'neal-mohan'}), + (ApifyTwitterScraperTool, 'twitter_scrape', {'search_query': 'apify'}), + (ApifyTikTokScraperTool, 'tiktok_scrape', {'search_query': 'cooking'}), + (ApifyFacebookPostsScraperTool, 'facebook_posts_scrape', {'page_url': 'https://www.facebook.com/x/'}), +] + + +@pytest.mark.parametrize(('tool_cls', 'method_name', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_social_tool_runtime_error_raises_tool_exception( + tool_cls: type, + method_name: str, + run_kwargs: dict, + mock_tools_client: MagicMock, +) -> None: + getattr(mock_tools_client, method_name).side_effect = RuntimeError( + 'Actor run run-XYZ ended with status FAILED.', + ) + tool = make_tool(tool_cls, mock_tools_client) + + with pytest.raises(ToolException, match='run-XYZ'): + tool._run(**run_kwargs) + + +# --------------------------------------------------------------------------- +# Per-tool empty-dataset coverage +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize(('tool_cls', 'method_name', 'run_kwargs'), _TOOL_INVOCATIONS) +def test_social_tool_returns_valid_json_for_empty_items( + tool_cls: type, + method_name: str, + run_kwargs: dict, + mock_tools_client: MagicMock, +) -> None: + getattr(mock_tools_client, method_name).return_value = (SUCCEEDED_RUN, []) + tool = make_tool(tool_cls, mock_tools_client) + + result = tool._run(**run_kwargs) + parsed = json.loads(result) + + assert parsed['items'] == [] + assert 
parsed['run'] == EXPECTED_RUN_META diff --git a/tests/unit_tests/test_client.py b/tests/unit_tests/test_client.py new file mode 100644 index 0000000..f0c0919 --- /dev/null +++ b/tests/unit_tests/test_client.py @@ -0,0 +1,617 @@ +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +import httpx +import pytest + +from langchain_apify._client import ApifyToolsClient +from tests.unit_tests.conftest import FAILED_RUN, SAMPLE_ITEMS, SUCCEEDED_RUN + +# --------------------------------------------------------------------------- +# __init__ +# --------------------------------------------------------------------------- + + +def test_init_with_explicit_token(mock_apify_client: MagicMock) -> None: + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client) as mock_create: + c = ApifyToolsClient(apify_api_token='my-token') + mock_create.assert_called_once() + assert c._client is mock_apify_client + + +def test_init_with_env_token(monkeypatch: pytest.MonkeyPatch, mock_apify_client: MagicMock) -> None: + monkeypatch.setenv('APIFY_API_TOKEN', 'env-token') + with patch('langchain_apify._client._create_apify_client', return_value=mock_apify_client): + c = ApifyToolsClient() + assert c._client is mock_apify_client + + +def test_init_missing_token_raises(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyToolsClient() + + +# --------------------------------------------------------------------------- +# run_actor +# --------------------------------------------------------------------------- + + +def test_run_actor_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + + result = client.run_actor('apify/test-actor', run_input={'key': 'val'}) + + mock_apify_client.actor.assert_called_once_with('apify/test-actor') + 
mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input={'key': 'val'}, timeout_secs=300, logger=None + ) + assert result == SUCCEEDED_RUN + + +def test_run_actor_with_memory(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + + client.run_actor('apify/test-actor', memory_mbytes=512) + + mock_apify_client.actor.return_value.call.assert_called_once_with( + run_input=None, timeout_secs=300, logger=None, memory_mbytes=512 + ) + + +def test_run_actor_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_actor('apify/test-actor') + + +# --------------------------------------------------------------------------- +# get_dataset_items +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + items = client.get_dataset_items('dataset-xyz', limit=50, offset=10) + + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + mock_apify_client.dataset.return_value.list_items.assert_called_once_with(limit=50, offset=10, clean=True) + assert items == SAMPLE_ITEMS + + +def test_get_dataset_items_empty(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + items = client.get_dataset_items('dataset-empty') + assert items == [] + + +# --------------------------------------------------------------------------- +# run_actor_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_actor_and_get_items_success(client: ApifyToolsClient, 
mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.run_actor_and_get_items('apify/test-actor', run_input={'q': '1'}) + + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + mock_apify_client.dataset.assert_called_once_with('dataset-xyz') + + +def test_run_actor_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.actor.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_actor_and_get_items('apify/test-actor') + + +# --------------------------------------------------------------------------- +# run_task +# --------------------------------------------------------------------------- + + +def test_run_task_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + + result = client.run_task('user/my-task', task_input={'key': 'val'}) + + mock_apify_client.task.assert_called_once_with('user/my-task') + mock_apify_client.task.return_value.call.assert_called_once_with(task_input={'key': 'val'}, timeout_secs=300) + assert result == SUCCEEDED_RUN + + +def test_run_task_failed_status_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.run_task('user/my-task') + + +# --------------------------------------------------------------------------- +# run_task_and_get_items +# --------------------------------------------------------------------------- + + +def test_run_task_and_get_items_success(client: ApifyToolsClient, mock_apify_client: MagicMock) -> 
None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = SAMPLE_ITEMS + + run, items = client.run_task_and_get_items('user/my-task') + + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_run_task_and_get_items_missing_dataset_id_raises( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + run_no_dataset = {**SUCCEEDED_RUN, 'defaultDatasetId': None} + mock_apify_client.task.return_value.call.return_value = run_no_dataset + + with pytest.raises(RuntimeError, match='no default dataset ID'): + client.run_task_and_get_items('user/my-task') + + +# --------------------------------------------------------------------------- +# scrape_url +# --------------------------------------------------------------------------- + + +def test_scrape_url_returns_markdown(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '# Hello', 'text': 'Hello', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == '# Hello' + + +def test_scrape_url_falls_back_to_text(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'text': 'Plain text content', 'url': 'https://example.com'}, + ] + + content = client.scrape_url('https://example.com') + assert content == 'Plain text content' + + +def test_scrape_url_empty_items_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [] + + with pytest.raises(RuntimeError, match='No 
content extracted'): + client.scrape_url('https://example.com') + + +def test_scrape_url_empty_content_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = [ + {'markdown': '', 'text': '', 'url': 'https://example.com'}, + ] + + with pytest.raises(RuntimeError, match='No content extracted'): + client.scrape_url('https://example.com') + + +# --------------------------------------------------------------------------- +# _check_run_status +# --------------------------------------------------------------------------- + + +def test_check_run_status_succeeded() -> None: + ApifyToolsClient._check_run_status({'id': 'run-ok', 'status': 'SUCCEEDED'}) + + +def test_check_run_status_failed() -> None: + with pytest.raises(RuntimeError, match='run-bad'): + ApifyToolsClient._check_run_status({'id': 'run-bad', 'status': 'FAILED'}) + + +def test_check_run_status_failed_includes_status_message() -> None: + with pytest.raises(RuntimeError, match='Actor exited out of memory'): + ApifyToolsClient._check_run_status( + {'id': 'run-oom', 'status': 'FAILED', 'statusMessage': 'Actor exited out of memory'}, + ) + + +# --------------------------------------------------------------------------- +# None returns from actor/task .call() +# --------------------------------------------------------------------------- + + +def test_run_actor_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + client.run_actor('apify/broken-actor') + + +def test_run_task_none_return_raises(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.return_value = None + + with pytest.raises(RuntimeError, match='returned no run details'): + 
client.run_task('user/broken-task') + + +# --------------------------------------------------------------------------- +# Transport-error wrapping (httpx / ApifyClientError -> RuntimeError) +# --------------------------------------------------------------------------- + + +def test_run_actor_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.side_effect = httpx.ConnectError('conn refused') + + with pytest.raises(RuntimeError, match='Apify Actor call failed'): + client.run_actor('apify/test-actor') + + +def test_get_dataset_items_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('timeout') + + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): + client.get_dataset_items('dataset-xyz') + + +def test_run_actor_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') + + with pytest.raises(RuntimeError, match='Apify dataset fetch failed'): + client.run_actor_and_get_items('apify/test-actor') + + +def test_run_task_network_error_wraps(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.task.return_value.call.side_effect = httpx.ConnectError('conn refused') + + with pytest.raises(RuntimeError, match='Apify task call failed'): + client.run_task('user/my-task') + + +def test_run_task_and_get_items_dataset_fetch_network_error( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + mock_apify_client.task.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.side_effect = httpx.ConnectError('reset') + + with pytest.raises(RuntimeError, match='Apify 
dataset fetch failed'): + client.run_task_and_get_items('user/my-task') + + +def test_run_actor_programming_error_propagates(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + """Non-transport exceptions (programming errors) must NOT be wrapped as RuntimeError.""" + mock_apify_client.actor.return_value.call.side_effect = AttributeError('bug in SDK') + + with pytest.raises(AttributeError, match='bug in SDK'): + client.run_actor('apify/test-actor') + + +# --------------------------------------------------------------------------- +# instagram_scrape +# --------------------------------------------------------------------------- + + +def _setup_run_and_items(mock_apify_client: MagicMock, items: list[dict] | None = None) -> None: + mock_apify_client.actor.return_value.call.return_value = SUCCEEDED_RUN + mock_apify_client.dataset.return_value.list_items.return_value.items = items or SAMPLE_ITEMS + + +def test_instagram_scrape_user_builds_profile_url(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.instagram_scrape('user', 'apify', max_results=5) + + mock_apify_client.actor.assert_called_once_with('apify/instagram-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == { + 'directUrls': ['https://www.instagram.com/apify/'], + 'resultsType': 'posts', + 'resultsLimit': 5, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_instagram_scrape_hashtag_builds_tag_url(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('hashtag', '#travel', max_results=10) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['directUrls'] == ['https://www.instagram.com/explore/tags/travel/'] + assert call_kwargs['run_input']['resultsType'] == 'posts' + + +def 
test_instagram_scrape_comments_uses_comments_results_type( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('comments', 'https://www.instagram.com/p/ABC123/', max_results=15) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['resultsType'] == 'comments' + assert call_kwargs['run_input']['directUrls'] == ['https://www.instagram.com/p/ABC123/'] + + +def test_instagram_scrape_passes_only_posts_newer_than(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.instagram_scrape('user', 'apify', only_posts_newer_than='1 week') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsNewerThan'] == '1 week' + + +def test_instagram_scrape_invalid_search_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported Instagram search_type'): + client.instagram_scrape('reels', 'apify') + + +# --------------------------------------------------------------------------- +# linkedin_profile_posts +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_posts_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.linkedin_profile_posts('https://www.linkedin.com/in/satyanadella', max_results=30) + + mock_apify_client.actor.assert_called_once_with('apimaestro/linkedin-profile-posts') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == { + 'username': 'https://www.linkedin.com/in/satyanadella', + 'total_posts': 30, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +# --------------------------------------------------------------------------- +# 
linkedin_profile_search +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_search_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_search('Founder', max_results=25) + + mock_apify_client.actor.assert_called_once_with('harvestapi/linkedin-profile-search') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'searchQuery': 'Founder', 'maxItems': 25} + + +def test_linkedin_profile_search_default_max_results(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_search('CTO') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['maxItems'] == 10 + + +# --------------------------------------------------------------------------- +# linkedin_profile_detail +# --------------------------------------------------------------------------- + + +def test_linkedin_profile_detail_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client, items=[{'firstName': 'Neal'}]) + + run, items = client.linkedin_profile_detail('neal-mohan', include_email=True) + + mock_apify_client.actor.assert_called_once_with('apimaestro/linkedin-profile-detail') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'username': 'neal-mohan', 'includeEmail': True} + assert run == SUCCEEDED_RUN + assert items == [{'firstName': 'Neal'}] + + +def test_linkedin_profile_detail_default_include_email_false( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.linkedin_profile_detail('neal-mohan') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert 
call_kwargs['run_input']['includeEmail'] is False + + +# --------------------------------------------------------------------------- +# twitter_scrape +# --------------------------------------------------------------------------- + + +def test_twitter_scrape_search_mode(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', max_results=50) + + mock_apify_client.actor.assert_called_once_with('apidojo/twitter-scraper-lite') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'maxItems': 50, 'searchTerms': ['apify']} + + +def test_twitter_scrape_user_mode_strips_at(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('@apify', search_mode='user', max_results=10) + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'maxItems': 10, 'twitterHandles': ['apify']} + + +def test_twitter_scrape_replies_mode_uses_start_urls(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('https://x.com/apify/status/123', search_mode='replies') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['startUrls'] == ['https://x.com/apify/status/123'] + + +def test_twitter_scrape_passes_date_range(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', start='2025-01-01', end='2025-02-01') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['start'] == '2025-01-01' + assert call_kwargs['run_input']['end'] == '2025-02-01' + + +def test_twitter_scrape_passes_sort(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + 
_setup_run_and_items(mock_apify_client) + + client.twitter_scrape('apify', sort='Top') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['sort'] == 'Top' + + +def test_twitter_scrape_invalid_mode_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported Twitter search_mode'): + client.twitter_scrape('apify', search_mode='followers') + + +# --------------------------------------------------------------------------- +# tiktok_scrape +# --------------------------------------------------------------------------- + + +def test_tiktok_scrape_search_mode(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('cooking', max_results=12) + + mock_apify_client.actor.assert_called_once_with('clockworks/tiktok-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == {'resultsPerPage': 12, 'searchQueries': ['cooking']} + + +def test_tiktok_scrape_user_mode_strips_at(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('@charlidamelio', search_type='user') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['profiles'] == ['charlidamelio'] + + +def test_tiktok_scrape_hashtag_mode_strips_hash(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + client.tiktok_scrape('#fyp', search_type='hashtag') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['hashtags'] == ['fyp'] + + +def test_tiktok_scrape_post_mode_uses_post_urls(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + 
client.tiktok_scrape('https://www.tiktok.com/@charlidamelio/video/123', search_type='post') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['postURLs'] == ['https://www.tiktok.com/@charlidamelio/video/123'] + + +def test_tiktok_scrape_invalid_type_raises(client: ApifyToolsClient) -> None: + with pytest.raises(ValueError, match='Unsupported TikTok search_type'): + client.tiktok_scrape('cooking', search_type='trending') + + +# --------------------------------------------------------------------------- +# facebook_posts_scrape +# --------------------------------------------------------------------------- + + +def test_facebook_posts_scrape_maps_input(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + _setup_run_and_items(mock_apify_client) + + run, items = client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', max_results=15) + + mock_apify_client.actor.assert_called_once_with('apify/facebook-posts-scraper') + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input'] == { + 'startUrls': [{'url': 'https://www.facebook.com/humansofnewyork/'}], + 'resultsLimit': 15, + } + assert run == SUCCEEDED_RUN + assert items == SAMPLE_ITEMS + + +def test_facebook_posts_scrape_passes_only_posts_newer_than( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', only_posts_newer_than='2025-01-01') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsNewerThan'] == '2025-01-01' + + +def test_facebook_posts_scrape_passes_only_posts_older_than( + client: ApifyToolsClient, mock_apify_client: MagicMock +) -> None: + _setup_run_and_items(mock_apify_client) + + client.facebook_posts_scrape('https://www.facebook.com/humansofnewyork/', 
only_posts_older_than='2025-12-31') + + call_kwargs = mock_apify_client.actor.return_value.call.call_args.kwargs + assert call_kwargs['run_input']['onlyPostsOlderThan'] == '2025-12-31' + + +# --------------------------------------------------------------------------- +# Failed run propagates from social helpers +# --------------------------------------------------------------------------- + + +def test_social_helper_propagates_failed_run(client: ApifyToolsClient, mock_apify_client: MagicMock) -> None: + mock_apify_client.actor.return_value.call.return_value = FAILED_RUN + + with pytest.raises(RuntimeError, match='run-fail'): + client.instagram_scrape('user', 'apify') + + +# --------------------------------------------------------------------------- +# _build_instagram_url +# --------------------------------------------------------------------------- + + +def test_build_instagram_url_passthrough_for_full_url() -> None: + assert ( + ApifyToolsClient._build_instagram_url('post', 'https://www.instagram.com/p/abc/') + == 'https://www.instagram.com/p/abc/' + ) + + +def test_build_instagram_url_user() -> None: + assert ApifyToolsClient._build_instagram_url('user', '@apify') == 'https://www.instagram.com/apify/' + + +def test_build_instagram_url_hashtag() -> None: + assert ( + ApifyToolsClient._build_instagram_url('hashtag', '#travel') == 'https://www.instagram.com/explore/tags/travel/' + ) + + +def test_build_instagram_url_post_from_id() -> None: + assert ApifyToolsClient._build_instagram_url('post', 'ABC123') == 'https://www.instagram.com/p/ABC123/' diff --git a/tests/unit_tests/test_document_loaders.py b/tests/unit_tests/test_document_loaders.py index a6c7a61..49ee9db 100644 --- a/tests/unit_tests/test_document_loaders.py +++ b/tests/unit_tests/test_document_loaders.py @@ -1,5 +1,6 @@ from unittest.mock import patch +import pytest from apify_client._types import ListPage from apify_client.clients import DatasetClient from langchain_core.documents import Document @@ 
-55,3 +56,27 @@ def test_apify_dataset_loader_lazy_load() -> None: mock_list_items.assert_called_once() assert documents[0].page_content == 'Apify is great!' assert documents[0].metadata['source'] == 'https://apify.com' + + +def test_apify_dataset_loader_apify_token_fallback(monkeypatch: pytest.MonkeyPatch) -> None: + """Loader should accept APIFY_TOKEN as a secondary env-var fallback.""" + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.setenv('APIFY_TOKEN', 'platform-token') + + with patch.object(DatasetClient, 'list_items') as mock_list_items: + mock_list_items.return_value = ListPage(data={'items': []}) + loader = ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) + assert loader.load() == [] + + +def test_apify_dataset_loader_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + monkeypatch.delenv('APIFY_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyDatasetLoader( + dataset_id='d', + dataset_mapping_function=lambda _item: Document(page_content='x'), + ) diff --git a/tests/unit_tests/test_tools.py b/tests/unit_tests/test_tools.py index b10df2f..372f9a6 100644 --- a/tests/unit_tests/test_tools.py +++ b/tests/unit_tests/test_tools.py @@ -1,13 +1,28 @@ from __future__ import annotations +import json +from datetime import datetime, timezone from typing import TYPE_CHECKING -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest +from langchain_core.tools import ToolException from pydantic import BaseModel -from langchain_apify.tools import ApifyActorsTool -from langchain_apify.utils import actor_id_to_tool_name +from langchain_apify import APIFY_CORE_TOOLS +from langchain_apify._client import ApifyToolsClient +from langchain_apify._utils import _actor_id_to_tool_name, _iso, _run_meta +from langchain_apify.tools import ( + ApifyActorsTool, + 
ApifyGetDatasetItemsTool, + ApifyRunActorAndGetDatasetTool, + ApifyRunActorTool, + ApifyRunTaskAndGetDatasetTool, + ApifyRunTaskTool, + ApifyScrapeUrlTool, + _ApifyGenericTool, +) +from tests.unit_tests.conftest import SAMPLE_ITEMS, SUCCEEDED_RUN, make_tool if TYPE_CHECKING: from collections.abc import Generator @@ -40,7 +55,7 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id=actor_id, apify_api_token='dummy-token') assert isinstance(tool, ApifyActorsTool) assert tool.description == 'Mocked description' - assert tool.name == actor_id_to_tool_name(actor_id) + assert tool.name == _actor_id_to_tool_name(actor_id) assert tool.args_schema == DummyModel @@ -85,3 +100,513 @@ class DummyModel(BaseModel): tool = ApifyActorsTool(actor_id='apify/python-example', apify_api_token='dummy-token') yield tool + + +# --------------------------------------------------------------------------- +# _iso / _run_meta helpers +# --------------------------------------------------------------------------- + + +def test_iso_converts_datetime_to_string() -> None: + dt = datetime(2025, 6, 15, 12, 30, 45, tzinfo=timezone.utc) + assert _iso(dt) == '2025-06-15T12:30:45+00:00' + + +def test_iso_passes_through_string() -> None: + assert _iso('2025-01-01T00:00:00.000Z') == '2025-01-01T00:00:00.000Z' + + +def test_iso_passes_through_none() -> None: + assert _iso(None) is None + + +def test_run_meta_with_datetime_values_is_json_serializable() -> None: + run = { + 'id': 'run-dt', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-dt', + 'startedAt': datetime(2025, 3, 1, 10, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 3, 1, 10, 1, 0, tzinfo=timezone.utc), + } + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['run_id'] == 'run-dt' + assert parsed['started_at'] == '2025-03-01T10:00:00+00:00' + assert parsed['finished_at'] == '2025-03-01T10:01:00+00:00' + + +def test_run_meta_with_string_values_is_json_serializable() -> 
None: + meta = _run_meta(SUCCEEDED_RUN) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == '2025-01-01T00:01:00.000Z' + + +def test_run_meta_with_missing_timestamps() -> None: + run = {'id': 'run-none', 'status': 'RUNNING', 'defaultDatasetId': 'ds-none'} + meta = _run_meta(run) + serialized = json.dumps(meta) + parsed = json.loads(serialized) + assert parsed['started_at'] is None + assert parsed['finished_at'] is None + + +def test_run_actor_tool_with_datetime_run(mock_tools_client: MagicMock) -> None: + """End-to-end: ApifyRunActorTool returns valid JSON when the client returns datetime objects.""" + mock_tools_client.run_actor.return_value = { + 'id': 'run-real', + 'status': 'SUCCEEDED', + 'defaultDatasetId': 'ds-real', + 'startedAt': datetime(2025, 6, 1, 8, 0, 0, tzinfo=timezone.utc), + 'finishedAt': datetime(2025, 6, 1, 8, 5, 0, tzinfo=timezone.utc), + } + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test') + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-real' + assert parsed['started_at'] == '2025-06-01T08:00:00+00:00' + assert parsed['finished_at'] == '2025-06-01T08:05:00+00:00' + + +# --------------------------------------------------------------------------- +# ApifyRunActorTool +# --------------------------------------------------------------------------- + + +def test_run_actor_tool_returns_json(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.return_value = SUCCEEDED_RUN + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + result = tool._run(actor_id='apify/test', run_input={'key': 'val'}) + + parsed = json.loads(result) + assert parsed['run_id'] == 'run-abc' + assert parsed['status'] == 'SUCCEEDED' + assert parsed['dataset_id'] == 'dataset-xyz' + assert parsed['started_at'] == '2025-01-01T00:00:00.000Z' + assert parsed['finished_at'] == 
'2025-01-01T00:01:00.000Z' + mock_tools_client.run_actor.assert_called_once_with('apify/test', {'key': 'val'}, 300, None) + + +def test_run_actor_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.run_actor.side_effect = RuntimeError('Actor run run-bad ended with status FAILED.') + tool = make_tool(ApifyRunActorTool, mock_tools_client) + + with pytest.raises(ToolException, match='FAILED'): + tool._run(actor_id='apify/test') + + +def test_run_actor_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv('APIFY_API_TOKEN', raising=False) + with pytest.raises(ValueError, match='APIFY_API_TOKEN'): + ApifyRunActorTool() + + +# --------------------------------------------------------------------------- +# ApifyGetDatasetItemsTool +# --------------------------------------------------------------------------- + + +def test_get_dataset_items_tool_returns_json_object(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = SAMPLE_ITEMS + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-xyz', limit=50, offset=5) + + parsed = json.loads(result) + assert len(parsed['items']) == 2 + assert parsed['items'][0]['text'] == 'item-1' + mock_tools_client.get_dataset_items.assert_called_once_with('dataset-xyz', 50, 5) + + +def test_get_dataset_items_tool_empty_returns_message(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.return_value = [] + tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client) + + result = tool._run(dataset_id='dataset-empty') + + parsed = json.loads(result) + assert parsed['items'] == [] + assert 'empty' in parsed['message'].lower() + + +def test_get_dataset_items_tool_network_error_raises_tool_exception(mock_tools_client: MagicMock) -> None: + mock_tools_client.get_dataset_items.side_effect = RuntimeError( + 'Apify dataset fetch failed for ds-bad: connection reset' + ) + 
# NOTE(review): the statements below, up to the first `def`, are the tail of a
# test whose header precedes this chunk; they belong inside that function's body.
tool = make_tool(ApifyGetDatasetItemsTool, mock_tools_client)

with pytest.raises(ToolException, match='Apify dataset fetch failed'):
    tool._run(dataset_id='ds-bad')


def test_get_dataset_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None:
    """With APIFY_API_TOKEN removed from the environment, construction raises ValueError."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyGetDatasetItemsTool()


# ---------------------------------------------------------------------------
# ApifyRunActorAndGetDatasetTool
# ---------------------------------------------------------------------------


def test_run_actor_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None:
    """The tool output parses as JSON holding a ``run`` summary and the ``items`` list."""
    run_and_fetch = mock_tools_client.run_actor_and_get_items
    run_and_fetch.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS)
    runner = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client)

    raw_output = runner._run(actor_id='apify/test', run_input={'q': '1'}, dataset_items_limit=50)

    payload = json.loads(raw_output)
    run_summary = payload['run']
    assert run_summary['run_id'] == 'run-abc'
    assert run_summary['status'] == 'SUCCEEDED'
    assert len(payload['items']) == 2
    run_and_fetch.assert_called_once_with('apify/test', {'q': '1'}, 300, None, 50)


def test_run_actor_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None:
    """A RuntimeError raised by the client surfaces from ``_run`` as a ToolException."""
    client_error = RuntimeError('Actor run run-bad ended with status TIMED-OUT.')
    mock_tools_client.run_actor_and_get_items.side_effect = client_error
    runner = make_tool(ApifyRunActorAndGetDatasetTool, mock_tools_client)

    with pytest.raises(ToolException, match='TIMED-OUT'):
        runner._run(actor_id='apify/test')


def test_run_actor_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None:
    """With APIFY_API_TOKEN removed from the environment, construction raises ValueError."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyRunActorAndGetDatasetTool()


# ---------------------------------------------------------------------------
# ApifyScrapeUrlTool
# ---------------------------------------------------------------------------


def test_scrape_url_tool_returns_markdown(mock_tools_client: MagicMock) -> None:
    """The client's scrape result is returned unchanged; the client receives the URL and 120."""
    scrape = mock_tools_client.scrape_url
    scrape.return_value = '# Hello World'
    scraper = make_tool(ApifyScrapeUrlTool, mock_tools_client)

    markdown = scraper._run(url='https://example.com')

    assert markdown == '# Hello World'
    scrape.assert_called_once_with('https://example.com', 120)


def test_scrape_url_tool_empty_raises_tool_exception(mock_tools_client: MagicMock) -> None:
    """A RuntimeError raised by the client's scrape call surfaces as a ToolException."""
    client_error = RuntimeError('No content extracted from https://example.com.')
    mock_tools_client.scrape_url.side_effect = client_error
    scraper = make_tool(ApifyScrapeUrlTool, mock_tools_client)

    with pytest.raises(ToolException, match='No content extracted'):
        scraper._run(url='https://example.com')


def test_scrape_url_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None:
    """With APIFY_API_TOKEN removed from the environment, construction raises ValueError."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyScrapeUrlTool()


# ---------------------------------------------------------------------------
# ApifyRunTaskTool
# ---------------------------------------------------------------------------


def test_run_task_tool_returns_json(mock_tools_client: MagicMock) -> None:
    """The tool output parses as JSON exposing the run's id, status, dataset id and timestamps."""
    run_task = mock_tools_client.run_task
    run_task.return_value = SUCCEEDED_RUN
    task_runner = make_tool(ApifyRunTaskTool, mock_tools_client)

    raw_output = task_runner._run(task_id='user/my-task', task_input={'key': 'val'})

    payload = json.loads(raw_output)
    expected_fields = {
        'run_id': 'run-abc',
        'status': 'SUCCEEDED',
        'dataset_id': 'dataset-xyz',
        'started_at': '2025-01-01T00:00:00.000Z',
        'finished_at': '2025-01-01T00:01:00.000Z',
    }
    # Checked one key at a time, in the listed order.
    for field, expected_value in expected_fields.items():
        assert payload[field] == expected_value
    run_task.assert_called_once_with('user/my-task', {'key': 'val'}, 300, None)


def test_run_task_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None:
    """A RuntimeError raised by the client's task run surfaces as a ToolException."""
    client_error = RuntimeError('Actor run run-bad ended with status FAILED.')
    mock_tools_client.run_task.side_effect = client_error
    task_runner = make_tool(ApifyRunTaskTool, mock_tools_client)

    with pytest.raises(ToolException, match='FAILED'):
        task_runner._run(task_id='user/my-task')


def test_run_task_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None:
    """With APIFY_API_TOKEN removed from the environment, construction raises ValueError."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyRunTaskTool()


# ---------------------------------------------------------------------------
# ApifyRunTaskAndGetDatasetTool
# ---------------------------------------------------------------------------


def test_run_task_and_get_items_tool_returns_json(mock_tools_client: MagicMock) -> None:
    """The tool output parses as JSON holding a ``run`` summary and the ``items`` list."""
    run_and_fetch = mock_tools_client.run_task_and_get_items
    run_and_fetch.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS)
    task_runner = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client)

    raw_output = task_runner._run(task_id='user/my-task', task_input={'q': '1'}, dataset_items_limit=50)

    payload = json.loads(raw_output)
    run_summary = payload['run']
    assert run_summary['run_id'] == 'run-abc'
    assert run_summary['status'] == 'SUCCEEDED'
    assert len(payload['items']) == 2
    run_and_fetch.assert_called_once_with('user/my-task', {'q': '1'}, 300, None, 50)


def test_run_task_and_get_items_tool_failure_raises_tool_exception(mock_tools_client: MagicMock) -> None:
    """A RuntimeError raised by the client surfaces from ``_run`` as a ToolException."""
    client_error = RuntimeError('Actor run run-bad ended with status TIMED-OUT.')
    mock_tools_client.run_task_and_get_items.side_effect = client_error
    task_runner = make_tool(ApifyRunTaskAndGetDatasetTool, mock_tools_client)

    with pytest.raises(ToolException, match='TIMED-OUT'):
        task_runner._run(task_id='user/my-task')


def test_run_task_and_get_items_tool_missing_token(monkeypatch: pytest.MonkeyPatch) -> None:
    """With APIFY_API_TOKEN removed from the environment, construction raises ValueError."""
    monkeypatch.delenv('APIFY_API_TOKEN', raising=False)
    with pytest.raises(ValueError, match='APIFY_API_TOKEN'):
        ApifyRunTaskAndGetDatasetTool()


# ---------------------------------------------------------------------------
# Value clamping (developer safety limits)
# ---------------------------------------------------------------------------


def test_run_actor_tool_clamps_timeout(mock_tools_client: MagicMock) -> None:
    """A requested timeout above ``max_timeout_secs`` reaches the client as the configured maximum."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=60)

    runner._run(actor_id='apify/test', timeout_secs=9999)

    run_actor.assert_called_once_with('apify/test', None, 60, None)


def test_run_actor_tool_clamps_memory(mock_tools_client: MagicMock) -> None:
    """A requested memory above ``max_memory_mbytes`` reaches the client as the configured maximum."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512)

    runner._run(actor_id='apify/test', memory_mbytes=8192)

    run_actor.assert_called_once_with('apify/test', None, 300, 512)


def test_run_actor_tool_passes_none_memory_through(mock_tools_client: MagicMock) -> None:
    """``memory_mbytes=None`` is forwarded as ``None`` even when a memory cap is configured."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=512)

    runner._run(actor_id='apify/test', memory_mbytes=None)

    run_actor.assert_called_once_with('apify/test', None, 300, None)


def test_get_dataset_items_tool_clamps_limit(mock_tools_client: MagicMock) -> None:
    """A requested ``limit`` above ``max_items`` reaches the client as the configured maximum."""
    get_items = mock_tools_client.get_dataset_items
    get_items.return_value = SAMPLE_ITEMS
    fetcher = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=10)

    fetcher._run(dataset_id='ds-1', limit=50000)

    get_items.assert_called_once_with('ds-1', 10, 0)


def test_run_actor_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None:
    """Timeout, memory and item limit are each capped at their configured maximum."""
    run_and_fetch = mock_tools_client.run_actor_and_get_items
    run_and_fetch.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS)
    runner = make_tool(
        ApifyRunActorAndGetDatasetTool,
        mock_tools_client,
        max_timeout_secs=30,
        max_memory_mbytes=256,
        max_items=5,
    )

    runner._run(actor_id='a', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999)

    run_and_fetch.assert_called_once_with('a', None, 30, 256, 5)


def test_scrape_url_tool_clamps_timeout(mock_tools_client: MagicMock) -> None:
    """A requested timeout above ``max_timeout_secs`` reaches the client as the configured maximum."""
    scrape = mock_tools_client.scrape_url
    scrape.return_value = '# content'
    scraper = make_tool(ApifyScrapeUrlTool, mock_tools_client, max_timeout_secs=30)

    scraper._run(url='https://example.com', timeout_secs=9999)

    scrape.assert_called_once_with('https://example.com', 30)


def test_run_task_tool_clamps_timeout_and_memory(mock_tools_client: MagicMock) -> None:
    """Timeout and memory are each capped at their configured maximum for task runs."""
    run_task = mock_tools_client.run_task
    run_task.return_value = SUCCEEDED_RUN
    task_runner = make_tool(ApifyRunTaskTool, mock_tools_client, max_timeout_secs=60, max_memory_mbytes=512)

    task_runner._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999)

    run_task.assert_called_once_with('t/1', None, 60, 512)


def test_run_task_and_get_items_tool_clamps_all(mock_tools_client: MagicMock) -> None:
    """Timeout, memory and item limit are each capped at their configured maximum."""
    run_and_fetch = mock_tools_client.run_task_and_get_items
    run_and_fetch.return_value = (SUCCEEDED_RUN, SAMPLE_ITEMS)
    task_runner = make_tool(
        ApifyRunTaskAndGetDatasetTool,
        mock_tools_client,
        max_timeout_secs=30,
        max_memory_mbytes=256,
        max_items=5,
    )

    task_runner._run(task_id='t/1', timeout_secs=9999, memory_mbytes=9999, dataset_items_limit=9999)

    run_and_fetch.assert_called_once_with('t/1', None, 30, 256, 5)


def test_clamp_timeout_floor_is_one(mock_tools_client: MagicMock) -> None:
    """Negative and zero timeouts both reach the client as 1."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600)

    # Negative request.
    runner._run(actor_id='apify/test', timeout_secs=-1)
    run_actor.assert_called_once_with('apify/test', None, 1, None)

    # Zero request, checked against a fresh call record.
    run_actor.reset_mock()
    runner._run(actor_id='apify/test', timeout_secs=0)
    run_actor.assert_called_once_with('apify/test', None, 1, None)


def test_clamp_memory_non_positive_is_treated_as_none(mock_tools_client: MagicMock) -> None:
    """A ``memory_mbytes`` of zero or less becomes ``None``, leaving the Apify platform default in effect."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096)

    # Negative request.
    runner._run(actor_id='apify/test', memory_mbytes=-1)
    run_actor.assert_called_once_with('apify/test', None, 300, None)

    # Zero request, checked against a fresh call record.
    run_actor.reset_mock()
    runner._run(actor_id='apify/test', memory_mbytes=0)
    run_actor.assert_called_once_with('apify/test', None, 300, None)


def test_clamp_memory_floors_positive_below_platform_minimum(mock_tools_client: MagicMock) -> None:
    """Positive ``memory_mbytes`` under the Apify platform minimum (128 MB) is raised to 128."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_memory_mbytes=4096)

    # Half of the minimum.
    runner._run(actor_id='apify/test', memory_mbytes=64)
    run_actor.assert_called_once_with('apify/test', None, 300, 128)

    # Smallest positive value, checked against a fresh call record.
    run_actor.reset_mock()
    runner._run(actor_id='apify/test', memory_mbytes=1)
    run_actor.assert_called_once_with('apify/test', None, 300, 128)


def test_clamp_items_floor_is_one(mock_tools_client: MagicMock) -> None:
    """Negative and zero ``limit`` values both reach the client as 1."""
    get_items = mock_tools_client.get_dataset_items
    get_items.return_value = SAMPLE_ITEMS
    fetcher = make_tool(ApifyGetDatasetItemsTool, mock_tools_client, max_items=100)

    # Negative request.
    fetcher._run(dataset_id='ds-1', limit=-1)
    get_items.assert_called_once_with('ds-1', 1, 0)

    # Zero request, checked against a fresh call record.
    get_items.reset_mock()
    fetcher._run(dataset_id='ds-1', limit=0)
    get_items.assert_called_once_with('ds-1', 1, 0)


def test_values_below_max_pass_through(mock_tools_client: MagicMock) -> None:
    """Values supplied by the LLM that sit within the configured limits are forwarded unchanged."""
    run_actor = mock_tools_client.run_actor
    run_actor.return_value = SUCCEEDED_RUN
    runner = make_tool(ApifyRunActorTool, mock_tools_client, max_timeout_secs=600, max_memory_mbytes=4096)

    runner._run(actor_id='apify/test', timeout_secs=120, memory_mbytes=1024)

    run_actor.assert_called_once_with('apify/test', None, 120, 1024)


# ---------------------------------------------------------------------------
# Tool metadata assertions
# ---------------------------------------------------------------------------


def test_generic_tools_have_correct_metadata() -> None:
    """Check name, description, args_schema and handle_tool_error on every generic tool."""
    # Listed in the same order as the instances built below.
    expected_names = [
        'apify_run_actor',
        'apify_get_dataset_items',
        'apify_run_actor_and_get_dataset',
        'apify_scrape_url',
        'apify_run_task',
        'apify_run_task_and_get_dataset',
    ]

    # The client constructor is stubbed out while the tools are built and inspected.
    with patch.object(ApifyToolsClient, '__init__', return_value=None):
        tools = [
            ApifyRunActorTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
            ApifyGetDatasetItemsTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
            ApifyRunActorAndGetDatasetTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
            ApifyScrapeUrlTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
            ApifyRunTaskTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
            ApifyRunTaskAndGetDatasetTool(apify_api_token='dummy'),  # type: ignore[call-arg,arg-type]
        ]

        for built_tool, expected_name in zip(tools, expected_names):
            assert built_tool.name == expected_name
            assert built_tool.description
            assert built_tool.args_schema is not None
            assert built_tool.handle_tool_error is True


def test_apify_api_token_excluded_from_model_dump() -> None:
    """``model_dump()`` output must not contain the ``apify_api_token`` field."""
    with patch.object(ApifyToolsClient, '__init__', return_value=None):
        runner = ApifyRunActorTool(apify_api_token='x')  # type: ignore[call-arg,arg-type]
        dumped = runner.model_dump()
        assert 'apify_api_token' not in dumped


# ---------------------------------------------------------------------------
# _ApifyGenericTool inheritance
# ---------------------------------------------------------------------------


def test_all_generic_tools_inherit_from_base() -> None:
    """Each of the six generic tool classes is a subclass of _ApifyGenericTool."""
    generic_tool_classes = (
        ApifyRunActorTool,
        ApifyGetDatasetItemsTool,
        ApifyRunActorAndGetDatasetTool,
        ApifyScrapeUrlTool,
        ApifyRunTaskTool,
        ApifyRunTaskAndGetDatasetTool,
    )
    for tool_cls in generic_tool_classes:
        assert issubclass(tool_cls, _ApifyGenericTool), f'{tool_cls.__name__} must extend _ApifyGenericTool'


def test_legacy_tool_does_not_inherit_from_generic_base() -> None:
    """The legacy ApifyActorsTool must stay outside the _ApifyGenericTool hierarchy."""
    assert not issubclass(ApifyActorsTool, _ApifyGenericTool)


# ---------------------------------------------------------------------------
# APIFY_CORE_TOOLS list
# ---------------------------------------------------------------------------


def test_apify_core_tools_contains_all_generic_classes() -> None:
    """APIFY_CORE_TOOLS holds exactly the six generic tool classes, with no duplicates."""
    expected_classes = {
        ApifyRunActorTool,
        ApifyGetDatasetItemsTool,
        ApifyRunActorAndGetDatasetTool,
        ApifyScrapeUrlTool,
        ApifyRunTaskTool,
        ApifyRunTaskAndGetDatasetTool,
    }
    assert set(APIFY_CORE_TOOLS) == expected_classes
    # The length check catches duplicates that the set comparison would hide.
    assert len(APIFY_CORE_TOOLS) == 6