Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
8cad430
feat: implement apifyclient wrapper
daveomri Apr 20, 2026
2404b9c
feat: removed redundant const file
daveomri Apr 20, 2026
b1a89a4
feat: add few more input schemas, helpers and tool classes
daveomri Apr 20, 2026
0aa9175
feat: export new tools from __init__
daveomri Apr 20, 2026
4e46d36
feat: add unit tests
daveomri Apr 20, 2026
fc6ef12
feat: implement tests and introduce tools list
daveomri Apr 21, 2026
cc5be9e
fix: lint fix
daveomri Apr 21, 2026
c2b9cb6
feat: enhance error handling and documentation for apify tools
daveomri Apr 21, 2026
3edf126
fix: iso format fix
daveomri Apr 21, 2026
8c36edc
feat: add apify run task and apify run task and get items tools with …
daveomri Apr 21, 2026
026175a
feat: introduce _ApifyGenericTool base class for Apify tools to strea…
daveomri Apr 21, 2026
110c971
feat: add _actor_tools.py file to define upcoming search and social …
daveomri Apr 21, 2026
a08f63e
fix: add try/except to match others
daveomri Apr 21, 2026
d028531
fix: update timeout constants and improve input schema description in…
daveomri Apr 21, 2026
429a3ed
fix: enhance error handling for missing dataset id in run_actor and r…
daveomri Apr 21, 2026
b914e47
fix: update apifygetdatasetitemstool to return a json object with ite…
daveomri Apr 21, 2026
0f71181
feat: add integration smoke tests for generic Apify tools to validate…
daveomri Apr 21, 2026
50c52f2
feat: implement clamping for timeout, memory, and item limits in apif…
daveomri Apr 21, 2026
ba179a6
feat: clean up _actor_tools.py and tools.py for improved readability …
daveomri Apr 22, 2026
005294b
ref: align private scope conventions with langchain partner package s…
daveomri Apr 22, 2026
2f74c29
ref: migrate auth to SecretStr + secret_from_env pattern
daveomri Apr 23, 2026
6258b2b
fix: backward-compat fix
daveomri Apr 23, 2026
2905b67
fix: update stale doc string
daveomri Apr 23, 2026
3238c02
chore: removed redundant file
daveomri Apr 23, 2026
92df406
fix: extracted repeated code, fixed secretstr compatibility to apifyt…
daveomri Apr 23, 2026
3a0f666
fix: set min value to timeout, memory and items, add exclude and repr …
daveomri Apr 23, 2026
8614cfd
feat: added repr and exclude to apify api token
daveomri Apr 23, 2026
2bf130a
feat: add type checking to apify core tools list
daveomri Apr 23, 2026
98293d4
feat: add tests for clamped values and apify api token
daveomri Apr 23, 2026
863ed8d
fix: lint fix
daveomri Apr 23, 2026
70527e0
ref: update apify_api_token type to support SecretStr in document loa…
daveomri Apr 24, 2026
ea8b16e
chore: rename tools to match the task description
daveomri Apr 28, 2026
cd1eea1
fix: narrow except blocks in _client.py to SDK/transport errors
daveomri Apr 28, 2026
50c3583
fix: clamp memory_mbytes to Apify platform minimum (128 MB)
daveomri Apr 28, 2026
450728c
fix: narrow empty-dataset message in ApifyGetDatasetItemsTool
daveomri Apr 28, 2026
1360e92
ref: simplify ApifyToolsClient.__init__ to require explicit token
daveomri Apr 28, 2026
09b6c6e
docs: add module-level docstring to tools.py
daveomri Apr 28, 2026
a5bd7cc
ref: rename model_post_init parameter to
daveomri Apr 28, 2026
23242c1
revert: restore env-fallback
daveomri Apr 28, 2026
660ae46
ref: support APIFY_TOKEN env var via shared resolver
daveomri May 7, 2026
78c3f23
fix: snap memory_mbytes to nearest valid Apify power-of-2
daveomri May 7, 2026
7ff1e6a
docs: switch README and llms.txt to APIFY_TOKEN
daveomri May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ pip install langchain-apify
## Prerequisites

You should configure credentials by setting the following environment variables:
- `APIFY_API_TOKEN` - Apify API token
- `APIFY_TOKEN` - Apify API token

Register your free Apify account [here](https://console.apify.com/sign-up) and learn how to get your API token in the [Apify documentation](https://docs.apify.com/platform/integrations/api).

Expand All @@ -57,7 +57,7 @@ import json
from langchain_apify import ApifyActorsTool

os.environ["OPENAI_API_KEY"] = "YOUR_OPENAI_API_KEY"
os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"
os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN"

browser = ApifyActorsTool('apify/rag-web-browser')
search_results = browser.invoke(input={
Expand Down Expand Up @@ -92,7 +92,7 @@ Example usage for `ApifyDatasetLoader` with a custom dataset mapping function fo
import os
from langchain_apify import ApifyDatasetLoader

os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"
os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN"

# Example dataset structure
# [
Expand Down Expand Up @@ -129,7 +129,7 @@ import os
from langchain_apify import ApifyWrapper
from langchain_core.documents import Document

os.environ["APIFY_API_TOKEN"] = "YOUR_APIFY_API_TOKEN"
os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN"

apify = ApifyWrapper()

Expand Down
41 changes: 39 additions & 2 deletions langchain_apify/__init__.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,56 @@
from __future__ import annotations

from importlib import metadata
from typing import TYPE_CHECKING

from langchain_apify.document_loaders import ApifyDatasetLoader
from langchain_apify.tools import ApifyActorsTool
from langchain_apify.tools import (
ApifyActorsTool,
ApifyGetDatasetItemsTool,
ApifyRunActorAndGetDatasetTool,
ApifyRunActorTool,
ApifyRunTaskAndGetDatasetTool,
ApifyRunTaskTool,
ApifyScrapeUrlTool,
)
from langchain_apify.wrappers import ApifyWrapper

if TYPE_CHECKING:
from langchain_core.tools import BaseTool

# Look up the installed distribution's version. When the package metadata is
# not available, expose an empty string instead of failing the import.
try:
    _installed_version = metadata.version(__package__)
except metadata.PackageNotFoundError:
    _installed_version = ''
__version__ = _installed_version
# Optional clean-up: keeps helper names out of dir(__package__).
del metadata, _installed_version

# Convenience tool-class lists for selective agent binding.
# Binding all tools at once overwhelms the LLM context window;
# pick the group(s) relevant to your use case.
#
# Entries are tool *classes* (``type[BaseTool]``), not instances.
# ``BaseTool`` is imported only under ``TYPE_CHECKING``; the annotation below
# is never evaluated at runtime thanks to ``from __future__ import annotations``.
# The pre-existing ``ApifyActorsTool`` is intentionally not part of this list.

APIFY_CORE_TOOLS: list[type[BaseTool]] = [
ApifyRunActorTool,
ApifyGetDatasetItemsTool,
ApifyRunActorAndGetDatasetTool,
ApifyScrapeUrlTool,
ApifyRunTaskTool,
ApifyRunTaskAndGetDatasetTool,
]

# Public API of the package: every name listed here is imported or defined
# above in this module.
__all__ = [
# Existing components (backward-compatible)
'ApifyActorsTool',
'ApifyDatasetLoader',
'ApifyWrapper',
# Core generic tools
'ApifyGetDatasetItemsTool',
'ApifyRunActorAndGetDatasetTool',
'ApifyRunActorTool',
'ApifyRunTaskAndGetDatasetTool',
'ApifyRunTaskTool',
'ApifyScrapeUrlTool',
# Tool group lists
'APIFY_CORE_TOOLS',
# Meta
'__version__',
]
256 changes: 256 additions & 0 deletions langchain_apify/_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
from __future__ import annotations

import httpx
from apify_client import ApifyClient
from apify_client.errors import ApifyClientError
from pydantic import SecretStr

from langchain_apify._error_messages import (
_ERROR_ACTOR_RUN_FAILED,
_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET,
_ERROR_SCRAPE_EMPTY,
)
from langchain_apify._utils import _create_apify_client, _resolve_apify_token

# Exception types treated as SDK/transport failures. ``ApifyToolsClient``
# catches exactly these and re-raises them as ``RuntimeError``; any other
# exception propagates unchanged.
_TRANSPORT_EXCEPTIONS = (ApifyClientError, httpx.HTTPError)

# Actor that ``ApifyToolsClient.scrape_url`` runs to fetch a single page.
_SCRAPE_ACTOR_ID = 'apify/website-content-crawler'
# Default ``timeout_secs`` for the run_actor / run_task family of methods.
_DEFAULT_RUN_TIMEOUT_SECS = 300
# Default ``timeout_secs`` for ``scrape_url``.
_DEFAULT_SCRAPE_TIMEOUT_SECS = 120
# Default maximum number of dataset items fetched per call.
_DEFAULT_DATASET_ITEMS_LIMIT = 100
# The only run status that ``_check_run_status`` accepts as success.
_RUN_STATUS_SUCCEEDED = 'SUCCEEDED'


class ApifyToolsClient:
    """Internal helper that wraps ``ApifyClient`` for the tools layer.

    One convenience method per tool operation. All methods are synchronous and
    block until the underlying SDK call returns.

    SDK and transport failures (``ApifyClientError``, ``httpx.HTTPError``) are
    re-raised as ``RuntimeError``; any other exception propagates unchanged.

    Args:
        apify_api_token: Apify API token, either a plain string or a
            ``SecretStr``. When it is *None* or empty, the token is obtained
            from ``_resolve_apify_token()`` (the ``APIFY_TOKEN`` environment
            variable, or ``APIFY_API_TOKEN`` for backwards compatibility).

    Raises:
        ValueError: If no token is provided and none can be resolved.
    """

    def __init__(self, apify_api_token: SecretStr | str | None = None) -> None:
        # Unwrap the secret *before* applying the fallback so that an empty
        # ``SecretStr`` behaves exactly like an empty plain string and still
        # falls back to the environment.
        if isinstance(apify_api_token, SecretStr):
            explicit_token: str | None = apify_api_token.get_secret_value()
        else:
            explicit_token = apify_api_token
        token = explicit_token or _resolve_apify_token()

        if not token:
            msg = _ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET
            raise ValueError(msg)
        self._client = _create_apify_client(ApifyClient, token)

    def run_actor(
        self,
        actor_id: str,
        run_input: dict | None = None,
        timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS,
        memory_mbytes: int | None = None,
    ) -> dict:
        """Start an Actor and block until the SDK call returns.

        Args:
            actor_id: Actor ID or name (e.g. ``"apify/python-example"``).
            run_input: JSON-serialisable input for the Actor.
            timeout_secs: Forwarded as ``timeout_secs`` to the SDK's ``call``.
                NOTE(review): the SDK appears to treat this as the run's own
                timeout rather than a client-side wait limit; confirm against
                the apify-client documentation.
            memory_mbytes: Memory limit for the run, or *None* for Actor default.

        Returns:
            Full run-details dict returned by the Apify API.

        Raises:
            RuntimeError: If the SDK call fails, returns no run details, or
                the run does not finish with status ``SUCCEEDED``.
        """
        # ``logger=None`` presumably disables the SDK's run-log redirection;
        # TODO confirm against the apify-client version in use.
        call_kwargs: dict = {'run_input': run_input, 'timeout_secs': timeout_secs, 'logger': None}
        # Only send the memory override when the caller asked for one.
        if memory_mbytes is not None:
            call_kwargs['memory_mbytes'] = memory_mbytes

        try:
            run = self._client.actor(actor_id).call(**call_kwargs)
        except _TRANSPORT_EXCEPTIONS as exc:
            msg = f'Apify Actor call failed for {actor_id}: {exc}'
            raise RuntimeError(msg) from exc
        if run is None:
            msg = f'Actor {actor_id} call returned no run details.'
            raise RuntimeError(msg)
        self._check_run_status(run)
        return run

    def get_dataset_items(
        self, dataset_id: str, limit: int = _DEFAULT_DATASET_ITEMS_LIMIT, offset: int = 0
    ) -> list[dict]:
        """Fetch items from an existing dataset.

        Args:
            dataset_id: Apify dataset ID.
            limit: Maximum number of items to return.
            offset: Number of items to skip from the start.

        Returns:
            List of dataset item dicts (may be empty).

        Raises:
            RuntimeError: If the SDK or transport layer fails.
        """
        return self._list_items_or_raise(dataset_id, limit, offset)

    def run_actor_and_get_items(
        self,
        actor_id: str,
        run_input: dict | None = None,
        timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS,
        memory_mbytes: int | None = None,
        dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT,
    ) -> tuple[dict, list[dict]]:
        """Run an Actor, then fetch items from its default dataset.

        Args:
            actor_id: Actor ID or name.
            run_input: JSON-serialisable input for the Actor.
            timeout_secs: Forwarded to :meth:`run_actor`.
            memory_mbytes: Memory limit for the run, or *None* for Actor default.
            dataset_items_limit: Maximum number of dataset items to return.

        Returns:
            A ``(run_details, items)`` tuple.

        Raises:
            RuntimeError: If the run fails, does not finish with status
                ``SUCCEEDED``, reports no default dataset ID, or the dataset
                fetch fails.
        """
        run = self.run_actor(actor_id, run_input, timeout_secs, memory_mbytes)
        items = self._default_dataset_items(run, f'Actor {actor_id}', dataset_items_limit)
        return run, items

    def run_task(
        self,
        task_id: str,
        task_input: dict | None = None,
        timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS,
        memory_mbytes: int | None = None,
    ) -> dict:
        """Start a saved Actor task and block until the SDK call returns.

        Args:
            task_id: Task ID or name (e.g. ``"user/my-task"``).
            task_input: JSON-serialisable input that overrides the task's
                pre-saved input.
            timeout_secs: Forwarded as ``timeout_secs`` to the SDK's ``call``.
            memory_mbytes: Memory limit for the run, or *None* for task default.

        Returns:
            Full run-details dict returned by the Apify API.

        Raises:
            RuntimeError: If the SDK call fails, returns no run details, or
                the run does not finish with status ``SUCCEEDED``.
        """
        # NOTE(review): unlike ``run_actor`` no ``logger`` kwarg is passed here;
        # confirm whether the task client's ``call`` accepts one.
        call_kwargs: dict = {'task_input': task_input, 'timeout_secs': timeout_secs}
        if memory_mbytes is not None:
            call_kwargs['memory_mbytes'] = memory_mbytes

        try:
            run = self._client.task(task_id).call(**call_kwargs)
        except _TRANSPORT_EXCEPTIONS as exc:
            msg = f'Apify task call failed for {task_id}: {exc}'
            raise RuntimeError(msg) from exc
        if run is None:
            msg = f'Task {task_id} call returned no run details.'
            raise RuntimeError(msg)
        self._check_run_status(run)
        return run

    def run_task_and_get_items(
        self,
        task_id: str,
        task_input: dict | None = None,
        timeout_secs: int = _DEFAULT_RUN_TIMEOUT_SECS,
        memory_mbytes: int | None = None,
        dataset_items_limit: int = _DEFAULT_DATASET_ITEMS_LIMIT,
    ) -> tuple[dict, list[dict]]:
        """Run a saved Actor task, then fetch items from its default dataset.

        Args:
            task_id: Task ID or name.
            task_input: JSON-serialisable input that overrides the task's
                pre-saved input.
            timeout_secs: Forwarded to :meth:`run_task`.
            memory_mbytes: Memory limit for the run, or *None* for task default.
            dataset_items_limit: Maximum number of dataset items to return.

        Returns:
            A ``(run_details, items)`` tuple.

        Raises:
            RuntimeError: If the run fails, does not finish with status
                ``SUCCEEDED``, reports no default dataset ID, or the dataset
                fetch fails.
        """
        run = self.run_task(task_id, task_input, timeout_secs, memory_mbytes)
        items = self._default_dataset_items(run, f'Task {task_id}', dataset_items_limit)
        return run, items

    def scrape_url(self, url: str, timeout_secs: int = _DEFAULT_SCRAPE_TIMEOUT_SECS) -> str:
        """Scrape a single URL and return its content as markdown.

        Uses ``apify/website-content-crawler`` with ``maxCrawlPages=1``.

        Args:
            url: The URL to scrape.
            timeout_secs: Forwarded to :meth:`run_actor_and_get_items`.

        Returns:
            The first item's ``markdown`` field, or its ``text`` field when
            ``markdown`` is missing or empty.

        Raises:
            RuntimeError: If the Actor run fails or no content is extracted.
        """
        run_input = {
            'startUrls': [{'url': url}],
            'maxCrawlPages': 1,
        }
        _, items = self.run_actor_and_get_items(
            _SCRAPE_ACTOR_ID,
            run_input=run_input,
            timeout_secs=timeout_secs,
            dataset_items_limit=1,
        )

        # An empty dataset and an item without usable content are reported
        # with the same error, so both cases share one check.
        content = ''
        if items:
            page = items[0]
            content = page.get('markdown') or page.get('text') or ''
        if not content:
            msg = _ERROR_SCRAPE_EMPTY.format(url=url)
            raise RuntimeError(msg)
        return content

    def _default_dataset_items(self, run: dict, owner: str, limit: int) -> list[dict]:
        """Fetch up to ``limit`` items from the default dataset of ``run``.

        ``owner`` is the prefix used in the error message when the run carries
        no ``defaultDatasetId`` (e.g. ``'Actor apify/foo'`` or ``'Task me/bar'``).
        """
        dataset_id = run.get('defaultDatasetId')
        if not dataset_id:
            msg = f'{owner} run succeeded but returned no default dataset ID.'
            raise RuntimeError(msg)
        return self._list_items_or_raise(dataset_id, limit)

    def _list_items_or_raise(self, dataset_id: str, limit: int, offset: int | None = None) -> list[dict]:
        """Fetch dataset items, wrapping SDK/transport errors in a RuntimeError.

        ``offset`` is forwarded to the SDK only when it is not *None*, so
        callers that never specified an offset issue the same request as before.
        """
        query: dict = {'limit': limit, 'clean': True}
        if offset is not None:
            query['offset'] = offset
        try:
            return self._client.dataset(dataset_id).list_items(**query).items
        except _TRANSPORT_EXCEPTIONS as exc:
            msg = f'Apify dataset fetch failed for {dataset_id}: {exc}'
            raise RuntimeError(msg) from exc

    @staticmethod
    def _check_run_status(run: dict) -> None:
        """Raise ``RuntimeError`` unless ``run['status']`` is ``SUCCEEDED``."""
        status = run.get('status')
        if status != _RUN_STATUS_SUCCEEDED:
            # Fall back to a placeholder so the message is still readable
            # when the run dict carries no ``id``.
            run_id = run.get('id', 'unknown')
            msg = _ERROR_ACTOR_RUN_FAILED.format(run_id=run_id, status=status)
            raise RuntimeError(msg)
11 changes: 11 additions & 0 deletions langchain_apify/_error_messages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Message templates shared across the package. Adjacent string literals inside
# the parentheses are concatenated into a single message.

# Used for the ValueError raised by ``ApifyToolsClient.__init__`` when no token
# is passed and none can be resolved.
_ERROR_APIFY_TOKEN_ENV_VAR_NOT_SET = (
'APIFY_TOKEN environment variable is not set.'
' Please set it to your Apify API token by using `os.environ["APIFY_TOKEN"] = "YOUR_APIFY_TOKEN"`'
' in your code or pass it as environment variable.'
' To pass it as environment variable, you can use the following command:'
' `APIFY_TOKEN="YOUR_APIFY_TOKEN" python your_script.py`'
)

# ``str.format`` template; expects ``run_id`` and ``status`` keyword arguments.
_ERROR_ACTOR_RUN_FAILED = 'Actor run {run_id} ended with status {status}.'

# ``str.format`` template; expects a ``url`` keyword argument.
_ERROR_SCRAPE_EMPTY = 'No content extracted from {url}.'
Loading
Loading