From 2dd49c800524ec05b58a8c09bae0e3b2ad3586b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Ad=C3=A1mek?= Date: Fri, 6 Feb 2026 16:25:49 +0100 Subject: [PATCH] feat: Add Crawlee + Stagehand actor templates (JS + TS) Add two new actor templates using StagehandCrawler from @crawlee/stagehand, which combines Crawlee's crawling infrastructure with Stagehand's AI-powered browser automation (act(), extract(), observe()). Templates: - js-crawlee-stagehand-chrome: JavaScript version - ts-crawlee-stagehand-chrome: TypeScript version Both demonstrate page.act() for natural language actions and page.extract() for structured data extraction with Zod schemas. Default LLM provider is OpenAI (gpt-4o-mini). Co-Authored-By: Claude Opus 4.6 --- .../.actor/actor.json | 20 +++++++ .../.actor/dataset_schema.json | 33 +++++++++++ .../.actor/input_schema.json | 18 ++++++ .../.actor/output_schema.json | 11 ++++ .../js-crawlee-stagehand-chrome/.dockerignore | 15 +++++ .../js-crawlee-stagehand-chrome/.editorconfig | 10 ++++ .../js-crawlee-stagehand-chrome/.gitignore | 10 ++++ .../js-crawlee-stagehand-chrome/.prettierrc | 5 ++ .../js-crawlee-stagehand-chrome/Dockerfile | 34 +++++++++++ .../js-crawlee-stagehand-chrome/README.md | 23 ++++++++ .../eslint.config.mjs | 6 ++ .../js-crawlee-stagehand-chrome/package.json | 30 ++++++++++ .../js-crawlee-stagehand-chrome/src/main.js | 44 ++++++++++++++ .../js-crawlee-stagehand-chrome/src/routes.js | 35 +++++++++++ templates/manifest.json | 32 ++++++++++ .../.actor/actor.json | 20 +++++++ .../.actor/dataset_schema.json | 33 +++++++++++ .../.actor/input_schema.json | 24 ++++++++ .../.actor/output_schema.json | 11 ++++ .../ts-crawlee-stagehand-chrome/.dockerignore | 15 +++++ .../ts-crawlee-stagehand-chrome/.editorconfig | 9 +++ .../ts-crawlee-stagehand-chrome/.gitignore | 10 ++++ .../.prettierignore | 1 + .../ts-crawlee-stagehand-chrome/.prettierrc | 5 ++ .../ts-crawlee-stagehand-chrome/Dockerfile | 59 +++++++++++++++++++ 
.../ts-crawlee-stagehand-chrome/README.md | 23 ++++++++ .../eslint.config.mjs | 30 ++++++++++ .../ts-crawlee-stagehand-chrome/package.json | 42 +++++++++++++ .../ts-crawlee-stagehand-chrome/src/main.ts | 58 ++++++++++++++++++ .../ts-crawlee-stagehand-chrome/src/routes.ts | 35 +++++++++++ .../ts-crawlee-stagehand-chrome/tsconfig.json | 13 ++++ 31 files changed, 714 insertions(+) create mode 100644 templates/js-crawlee-stagehand-chrome/.actor/actor.json create mode 100644 templates/js-crawlee-stagehand-chrome/.actor/dataset_schema.json create mode 100644 templates/js-crawlee-stagehand-chrome/.actor/input_schema.json create mode 100644 templates/js-crawlee-stagehand-chrome/.actor/output_schema.json create mode 100644 templates/js-crawlee-stagehand-chrome/.dockerignore create mode 100644 templates/js-crawlee-stagehand-chrome/.editorconfig create mode 100644 templates/js-crawlee-stagehand-chrome/.gitignore create mode 100644 templates/js-crawlee-stagehand-chrome/.prettierrc create mode 100644 templates/js-crawlee-stagehand-chrome/Dockerfile create mode 100644 templates/js-crawlee-stagehand-chrome/README.md create mode 100644 templates/js-crawlee-stagehand-chrome/eslint.config.mjs create mode 100644 templates/js-crawlee-stagehand-chrome/package.json create mode 100644 templates/js-crawlee-stagehand-chrome/src/main.js create mode 100644 templates/js-crawlee-stagehand-chrome/src/routes.js create mode 100644 templates/ts-crawlee-stagehand-chrome/.actor/actor.json create mode 100644 templates/ts-crawlee-stagehand-chrome/.actor/dataset_schema.json create mode 100644 templates/ts-crawlee-stagehand-chrome/.actor/input_schema.json create mode 100644 templates/ts-crawlee-stagehand-chrome/.actor/output_schema.json create mode 100644 templates/ts-crawlee-stagehand-chrome/.dockerignore create mode 100644 templates/ts-crawlee-stagehand-chrome/.editorconfig create mode 100644 templates/ts-crawlee-stagehand-chrome/.gitignore create mode 100644 
templates/ts-crawlee-stagehand-chrome/.prettierignore create mode 100644 templates/ts-crawlee-stagehand-chrome/.prettierrc create mode 100644 templates/ts-crawlee-stagehand-chrome/Dockerfile create mode 100644 templates/ts-crawlee-stagehand-chrome/README.md create mode 100644 templates/ts-crawlee-stagehand-chrome/eslint.config.mjs create mode 100644 templates/ts-crawlee-stagehand-chrome/package.json create mode 100644 templates/ts-crawlee-stagehand-chrome/src/main.ts create mode 100644 templates/ts-crawlee-stagehand-chrome/src/routes.ts create mode 100644 templates/ts-crawlee-stagehand-chrome/tsconfig.json diff --git a/templates/js-crawlee-stagehand-chrome/.actor/actor.json b/templates/js-crawlee-stagehand-chrome/.actor/actor.json new file mode 100644 index 00000000..3e1da3de --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.actor/actor.json @@ -0,0 +1,20 @@ +{ + "actorSpecification": 1, + "name": "project-stagehand-crawler-javascript", + "title": "Project Stagehand Crawler JavaScript", + "description": "Crawlee and Stagehand project in JavaScript.", + "version": "0.0", + "meta": { + "templateId": "js-crawlee-stagehand-chrome", + "generatedBy": "" + }, + "input": "./input_schema.json", + "output": "./output_schema.json", + "storages": { + "dataset": "./dataset_schema.json" + }, + "dockerfile": "../Dockerfile", + "environmentVariables": { + "OPENAI_API_KEY": "@OPENAI_API_KEY" + } +} diff --git a/templates/js-crawlee-stagehand-chrome/.actor/dataset_schema.json b/templates/js-crawlee-stagehand-chrome/.actor/dataset_schema.json new file mode 100644 index 00000000..99c05ae8 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.actor/dataset_schema.json @@ -0,0 +1,33 @@ +{ + "actorSpecification": 1, + "fields": {}, + "views": { + "overview": { + "title": "Overview", + "transformation": { + "fields": ["title", "heading", "description", "url"] + }, + "display": { + "component": "table", + "properties": { + "title": { + "label": "Page Title", + "format": 
"text" + }, + "heading": { + "label": "Heading", + "format": "text" + }, + "description": { + "label": "Description", + "format": "text" + }, + "url": { + "label": "URL", + "format": "link" + } + } + } + } + } +} diff --git a/templates/js-crawlee-stagehand-chrome/.actor/input_schema.json b/templates/js-crawlee-stagehand-chrome/.actor/input_schema.json new file mode 100644 index 00000000..9a77fc01 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.actor/input_schema.json @@ -0,0 +1,18 @@ +{ + "title": "StagehandCrawler Template", + "type": "object", + "schemaVersion": 1, + "properties": { + "startUrls": { + "title": "Start URLs", + "type": "array", + "description": "URLs to start with.", + "editor": "requestListSources", + "prefill": [ + { + "url": "https://apify.com" + } + ] + } + } +} diff --git a/templates/js-crawlee-stagehand-chrome/.actor/output_schema.json b/templates/js-crawlee-stagehand-chrome/.actor/output_schema.json new file mode 100644 index 00000000..666b1e6a --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.actor/output_schema.json @@ -0,0 +1,11 @@ +{ + "actorOutputSchemaVersion": 1, + "title": "Output schema", + "properties": { + "overview": { + "type": "string", + "title": "Overview", + "template": "{{links.apiDefaultDatasetUrl}}/items?view=overview" + } + } +} diff --git a/templates/js-crawlee-stagehand-chrome/.dockerignore b/templates/js-crawlee-stagehand-chrome/.dockerignore new file mode 100644 index 00000000..113a8a8e --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.dockerignore @@ -0,0 +1,15 @@ +# configurations +.idea +.vscode +.zed + +# crawlee and apify storage folders +apify_storage +crawlee_storage +storage + +# installed files +node_modules + +# git folder +.git diff --git a/templates/js-crawlee-stagehand-chrome/.editorconfig b/templates/js-crawlee-stagehand-chrome/.editorconfig new file mode 100644 index 00000000..a2046ffd --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.editorconfig @@ -0,0 +1,10 @@ 
+root = true + +[*] +indent_style = space +indent_size = 4 +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true +end_of_line = lf +quote_type = single diff --git a/templates/js-crawlee-stagehand-chrome/.gitignore b/templates/js-crawlee-stagehand-chrome/.gitignore new file mode 100644 index 00000000..b0007f8c --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.gitignore @@ -0,0 +1,10 @@ +# This file tells Git which files shouldn't be added to source control + +.DS_Store +.idea +.vscode +.zed +dist +node_modules +apify_storage +storage diff --git a/templates/js-crawlee-stagehand-chrome/.prettierrc b/templates/js-crawlee-stagehand-chrome/.prettierrc new file mode 100644 index 00000000..ca0c14e7 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/.prettierrc @@ -0,0 +1,5 @@ +{ + "printWidth": 120, + "tabWidth": 4, + "singleQuote": true +} diff --git a/templates/js-crawlee-stagehand-chrome/Dockerfile b/templates/js-crawlee-stagehand-chrome/Dockerfile new file mode 100644 index 00000000..84c4ba70 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/Dockerfile @@ -0,0 +1,34 @@ +# Specify the base Docker image. You can read more about +# the available images at https://crawlee.dev/docs/guides/docker-images +# You can also use any other image from Docker Hub. +FROM apify/actor-node-playwright-chrome:22-1.58.1 + +# Check preinstalled packages +RUN npm ls @crawlee/core apify puppeteer playwright + +# Copy just package.json and package-lock.json +# to speed up the build using Docker layer cache. +COPY --chown=myuser:myuser package*.json Dockerfile ./ + +# Check Playwright version is the same as the one from base image. +RUN node check-playwright-version.mjs + +# Install NPM packages, skip optional and development dependencies to +# keep the image small. 
Avoid logging too much and print the dependency +# tree for debugging +RUN npm --quiet set progress=false \ + && npm install --omit=dev --omit=optional \ + && echo "Installed NPM packages:" \ + && (npm list --omit=dev --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version \ + && rm -r ~/.npm + +# Next, copy the remaining files and directories with the source code. +# Since we do this after NPM install, quick build will be really fast +# for most source file changes. +COPY --chown=myuser:myuser . ./ + +CMD ["node", "src/main.js"] diff --git a/templates/js-crawlee-stagehand-chrome/README.md b/templates/js-crawlee-stagehand-chrome/README.md new file mode 100644 index 00000000..3bad4d33 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/README.md @@ -0,0 +1,23 @@ +## StagehandCrawler template + + + +This template is a production-ready boilerplate for developing an [Actor](https://apify.com/actors) with `StagehandCrawler`. It combines [Crawlee](https://crawlee.dev)'s powerful crawling infrastructure with [Stagehand](https://github.com/browserbase/stagehand)'s AI-powered browser automation, enabling you to interact with web pages and extract structured data using natural language instructions. 
+ +The template demonstrates two key Stagehand capabilities: +- **`act()`** — Perform actions on a page using natural language (e.g., closing cookie dialogs) +- **`extract()`** — Extract structured data from pages using Zod schemas and AI + +## Resources + +If you're looking for examples or want to learn more visit: + +- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform) +- [Stagehand documentation](https://docs.stagehand.dev) +- [Crawlee documentation](https://crawlee.dev/docs/quick-start) +- [Node.js tutorials](https://docs.apify.com/academy/node-js) in Academy +- [Integration with Zapier](https://apify.com/integrations), Make, GitHub, Google Drive and other apps +- [Video guide on getting data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM) +- A short guide on how to create Actors using code templates: + +[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w) diff --git a/templates/js-crawlee-stagehand-chrome/eslint.config.mjs b/templates/js-crawlee-stagehand-chrome/eslint.config.mjs new file mode 100644 index 00000000..368ae6ee --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/eslint.config.mjs @@ -0,0 +1,6 @@ +import prettier from 'eslint-config-prettier'; + +import apify from '@apify/eslint-config/js.js'; + +// eslint-disable-next-line import/no-default-export +export default [{ ignores: ['**/dist'] }, ...apify, prettier]; diff --git a/templates/js-crawlee-stagehand-chrome/package.json b/templates/js-crawlee-stagehand-chrome/package.json new file mode 100644 index 00000000..b83ac4af --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/package.json @@ -0,0 +1,30 @@ +{ + "name": "crawlee-stagehand-javascript", + "version": "0.0.1", + "type": "module", + "description": "This is an example of an Apify Actor.", + "dependencies": { + "apify": "^3.5.2", + "@crawlee/stagehand": "^3.16.0", + "playwright": "1.58.1", + "openai": "^4.96.0", + "zod": "^3.25.0" + }, + "devDependencies": { + 
"@apify/eslint-config": "^1.0.0", + "eslint": "^9.29.0", + "eslint-config-prettier": "^10.1.5", + "prettier": "^3.5.3" + }, + "scripts": { + "start": "node src/main.js", + "format": "prettier --write .", + "format:check": "prettier --check .", + "lint": "eslint", + "lint:fix": "eslint --fix", + "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1", + "postinstall": "npx crawlee install-playwright-browsers" + }, + "author": "It's not you it's me", + "license": "ISC" +} diff --git a/templates/js-crawlee-stagehand-chrome/src/main.js b/templates/js-crawlee-stagehand-chrome/src/main.js new file mode 100644 index 00000000..c0bc8867 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/src/main.js @@ -0,0 +1,44 @@ +/** + * This template is a production ready boilerplate for developing with `StagehandCrawler`. + * Use this to bootstrap your projects using the most up-to-date code. + * If you're looking for examples or want to learn more, see README. + */ + +// For more information, see https://crawlee.dev +import { StagehandCrawler } from '@crawlee/stagehand'; +// For more information, see https://docs.apify.com/sdk/js +import { Actor } from 'apify'; + +// this is ESM project, and as such, it requires you to specify extensions in your relative imports +// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions +import { router } from './routes.js'; + +// Initialize the Apify SDK +await Actor.init(); + +const { startUrls = ['https://apify.com'] } = (await Actor.getInput()) ?? {}; + +// `checkAccess` flag ensures the proxy credentials are valid, but the check can take a few hundred milliseconds. 
+// Disable it for short runs if you are sure your proxy configuration is correct +const proxyConfiguration = await Actor.createProxyConfiguration({ checkAccess: true }); + +const crawler = new StagehandCrawler({ + proxyConfiguration, + requestHandler: router, + stagehandOptions: { + model: 'openai/gpt-4o-mini', + apiKey: process.env.OPENAI_API_KEY, + }, + launchContext: { + launchOptions: { + args: [ + '--disable-gpu', // Mitigates the "crashing GPU process" issue in Docker containers + ], + }, + }, +}); + +await crawler.run(startUrls); + +// Exit successfully +await Actor.exit(); diff --git a/templates/js-crawlee-stagehand-chrome/src/routes.js b/templates/js-crawlee-stagehand-chrome/src/routes.js new file mode 100644 index 00000000..01647461 --- /dev/null +++ b/templates/js-crawlee-stagehand-chrome/src/routes.js @@ -0,0 +1,35 @@ +import { createStagehandRouter, Dataset } from '@crawlee/stagehand'; +import { z } from 'zod'; + +export const router = createStagehandRouter(); + +router.addDefaultHandler(async ({ enqueueLinks, log }) => { + log.info('enqueueing new URLs'); + await enqueueLinks({ + globs: ['https://apify.com/*'], + label: 'detail', + }); +}); + +router.addHandler('detail', async ({ request, page, log }) => { + const title = await page.title(); + log.info(`${title}`, { url: request.loadedUrl }); + + // Use Stagehand act() to interact with the page + await page.act('Close any cookie consent dialogs or popups if present'); + + // Use Stagehand extract() to pull structured data with a Zod schema + const extracted = await page.extract({ + instruction: 'Extract the main heading and a brief description of this page', + schema: z.object({ + heading: z.string().describe('The main heading of the page'), + description: z.string().describe('A brief description or subtitle of the page'), + }), + }); + + await Dataset.pushData({ + url: request.loadedUrl, + title, + ...extracted, + }); +}); diff --git a/templates/manifest.json b/templates/manifest.json index 
56345e55..a35c7ab4 100644 --- a/templates/manifest.json +++ b/templates/manifest.json @@ -413,6 +413,22 @@ "showcaseFiles": ["src/main.js", "src/routes.js"], "useCases": ["WEB_SCRAPING"] }, + { + "id": "js-crawlee-stagehand-chrome", + "name": "project_stagehand_crawler_js", + "label": "Crawlee + Stagehand + Chrome", + "category": "javascript", + "technologies": ["nodejs", "crawlee", "stagehand", "chrome"], + "description": "AI-powered web scraper using Crawlee and Stagehand. Uses natural language to interact with pages and extract structured data using LLMs.", + "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/js-crawlee-stagehand-chrome.zip?raw=true", + "defaultRunOptions": { + "build": "latest", + "memoryMbytes": 4096, + "timeoutSecs": 3600 + }, + "showcaseFiles": ["src/main.js", "src/routes.js"], + "useCases": ["AI", "WEB_SCRAPING"] + }, { "id": "js-crawlee-playwright-camoufox", "name": "project_playwright_camoufox_crawler_js", @@ -494,6 +510,22 @@ "showcaseFiles": ["src/main.ts", "src/routes.ts"], "useCases": ["WEB_SCRAPING"] }, + { + "id": "ts-crawlee-stagehand-chrome", + "name": "project_stagehand_crawler_ts", + "label": "Crawlee + Stagehand + Chrome", + "category": "typescript", + "technologies": ["nodejs", "crawlee", "stagehand", "chrome"], + "description": "AI-powered web scraper using Crawlee and Stagehand. 
Uses natural language to interact with pages and extract structured data using LLMs.", + "archiveUrl": "https://github.com/apify/actor-templates/blob/master/dist/templates/ts-crawlee-stagehand-chrome.zip?raw=true", + "defaultRunOptions": { + "build": "latest", + "memoryMbytes": 4096, + "timeoutSecs": 3600 + }, + "showcaseFiles": ["src/main.ts", "src/routes.ts"], + "useCases": ["AI", "WEB_SCRAPING"] + }, { "id": "ts-crawlee-playwright-camoufox", "name": "project_playwright_camoufox_crawler_ts", diff --git a/templates/ts-crawlee-stagehand-chrome/.actor/actor.json b/templates/ts-crawlee-stagehand-chrome/.actor/actor.json new file mode 100644 index 00000000..287a1246 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.actor/actor.json @@ -0,0 +1,20 @@ +{ + "actorSpecification": 1, + "name": "project-stagehand-crawler-typescript", + "title": "Project Stagehand Crawler Typescript", + "description": "Crawlee and Stagehand project in typescript.", + "version": "0.0", + "meta": { + "templateId": "ts-crawlee-stagehand-chrome", + "generatedBy": "" + }, + "input": "./input_schema.json", + "output": "./output_schema.json", + "storages": { + "dataset": "./dataset_schema.json" + }, + "dockerfile": "../Dockerfile", + "environmentVariables": { + "OPENAI_API_KEY": "@OPENAI_API_KEY" + } +} diff --git a/templates/ts-crawlee-stagehand-chrome/.actor/dataset_schema.json b/templates/ts-crawlee-stagehand-chrome/.actor/dataset_schema.json new file mode 100644 index 00000000..99c05ae8 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.actor/dataset_schema.json @@ -0,0 +1,33 @@ +{ + "actorSpecification": 1, + "fields": {}, + "views": { + "overview": { + "title": "Overview", + "transformation": { + "fields": ["title", "heading", "description", "url"] + }, + "display": { + "component": "table", + "properties": { + "title": { + "label": "Page Title", + "format": "text" + }, + "heading": { + "label": "Heading", + "format": "text" + }, + "description": { + "label": "Description", + 
"format": "text" + }, + "url": { + "label": "URL", + "format": "link" + } + } + } + } + } +} diff --git a/templates/ts-crawlee-stagehand-chrome/.actor/input_schema.json b/templates/ts-crawlee-stagehand-chrome/.actor/input_schema.json new file mode 100644 index 00000000..47cde6db --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.actor/input_schema.json @@ -0,0 +1,24 @@ +{ + "title": "StagehandCrawler Template", + "type": "object", + "schemaVersion": 1, + "properties": { + "startUrls": { + "title": "Start URLs", + "type": "array", + "description": "URLs to start with.", + "editor": "requestListSources", + "prefill": [ + { + "url": "https://apify.com" + } + ] + }, + "maxRequestsPerCrawl": { + "title": "Max Requests per Crawl", + "type": "integer", + "description": "Maximum number of requests that can be made by this crawler.", + "default": 100 + } + } +} diff --git a/templates/ts-crawlee-stagehand-chrome/.actor/output_schema.json b/templates/ts-crawlee-stagehand-chrome/.actor/output_schema.json new file mode 100644 index 00000000..666b1e6a --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.actor/output_schema.json @@ -0,0 +1,11 @@ +{ + "actorOutputSchemaVersion": 1, + "title": "Output schema", + "properties": { + "overview": { + "type": "string", + "title": "Overview", + "template": "{{links.apiDefaultDatasetUrl}}/items?view=overview" + } + } +} diff --git a/templates/ts-crawlee-stagehand-chrome/.dockerignore b/templates/ts-crawlee-stagehand-chrome/.dockerignore new file mode 100644 index 00000000..113a8a8e --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.dockerignore @@ -0,0 +1,15 @@ +# configurations +.idea +.vscode +.zed + +# crawlee and apify storage folders +apify_storage +crawlee_storage +storage + +# installed files +node_modules + +# git folder +.git diff --git a/templates/ts-crawlee-stagehand-chrome/.editorconfig b/templates/ts-crawlee-stagehand-chrome/.editorconfig new file mode 100644 index 00000000..81eba8cc --- /dev/null +++ 
b/templates/ts-crawlee-stagehand-chrome/.editorconfig @@ -0,0 +1,9 @@ +root = true + +[*] +indent_style = space +indent_size = 4 +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true +end_of_line = lf diff --git a/templates/ts-crawlee-stagehand-chrome/.gitignore b/templates/ts-crawlee-stagehand-chrome/.gitignore new file mode 100644 index 00000000..b0007f8c --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.gitignore @@ -0,0 +1,10 @@ +# This file tells Git which files shouldn't be added to source control + +.DS_Store +.idea +.vscode +.zed +dist +node_modules +apify_storage +storage diff --git a/templates/ts-crawlee-stagehand-chrome/.prettierignore b/templates/ts-crawlee-stagehand-chrome/.prettierignore new file mode 100644 index 00000000..12479419 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.prettierignore @@ -0,0 +1 @@ +.prettierignore diff --git a/templates/ts-crawlee-stagehand-chrome/.prettierrc b/templates/ts-crawlee-stagehand-chrome/.prettierrc new file mode 100644 index 00000000..0ff25f92 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/.prettierrc @@ -0,0 +1,5 @@ +{ + "printWidth": 120, + "singleQuote": true, + "tabWidth": 4 +} diff --git a/templates/ts-crawlee-stagehand-chrome/Dockerfile b/templates/ts-crawlee-stagehand-chrome/Dockerfile new file mode 100644 index 00000000..97e2e048 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/Dockerfile @@ -0,0 +1,59 @@ +# Specify the base Docker image. You can read more about +# the available images at https://crawlee.dev/docs/guides/docker-images +# You can also use any other image from Docker Hub. +FROM apify/actor-node-playwright-chrome:22-1.58.1 AS builder + +# Check preinstalled packages +RUN npm ls @crawlee/core apify puppeteer playwright + +# Copy just package.json and package-lock.json +# to speed up the build using Docker layer cache. 
+COPY --chown=myuser:myuser package*.json Dockerfile ./ + +# Check Playwright version is the same as the one from base image. +RUN node check-playwright-version.mjs + +# Install all dependencies. Don't audit to speed up the installation. +RUN npm install --include=dev --audit=false + +# Next, copy the source files using the user set +# in the base image. +COPY --chown=myuser:myuser . ./ + +# Install all dependencies and build the project. +# Don't audit to speed up the installation. +RUN npm run build + +# Create final image +FROM apify/actor-node-playwright-chrome:22-1.58.1 + +# Check preinstalled packages +RUN npm ls @crawlee/core apify puppeteer playwright + +# Copy just package.json and package-lock.json +# to speed up the build using Docker layer cache. +COPY --chown=myuser:myuser package*.json ./ + +# Install NPM packages, skip optional and development dependencies to +# keep the image small. Avoid logging too much and print the dependency +# tree for debugging +RUN npm --quiet set progress=false \ + && npm install --omit=dev --omit=optional \ + && echo "Installed NPM packages:" \ + && (npm list --omit=dev --all || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version \ + && rm -r ~/.npm + +# Copy built JS files from builder image +COPY --from=builder --chown=myuser:myuser /home/myuser/dist ./dist + +# Next, copy the remaining files and directories with the source code. +# Since we do this after NPM install, quick build will be really fast +# for most source file changes. +COPY --chown=myuser:myuser . ./ + +# Run the image. 
+CMD ["node", "dist/main.js"] diff --git a/templates/ts-crawlee-stagehand-chrome/README.md b/templates/ts-crawlee-stagehand-chrome/README.md new file mode 100644 index 00000000..3bad4d33 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/README.md @@ -0,0 +1,23 @@ +## StagehandCrawler template + + + +This template is a production-ready boilerplate for developing an [Actor](https://apify.com/actors) with `StagehandCrawler`. It combines [Crawlee](https://crawlee.dev)'s powerful crawling infrastructure with [Stagehand](https://github.com/browserbase/stagehand)'s AI-powered browser automation, enabling you to interact with web pages and extract structured data using natural language instructions. + +The template demonstrates two key Stagehand capabilities: +- **`act()`** — Perform actions on a page using natural language (e.g., closing cookie dialogs) +- **`extract()`** — Extract structured data from pages using Zod schemas and AI + +## Resources + +If you're looking for examples or want to learn more visit: + +- [Crawlee + Apify Platform guide](https://crawlee.dev/docs/guides/apify-platform) +- [Stagehand documentation](https://docs.stagehand.dev) +- [Crawlee documentation](https://crawlee.dev/docs/quick-start) +- [Node.js tutorials](https://docs.apify.com/academy/node-js) in Academy +- [Integration with Zapier](https://apify.com/integrations), Make, GitHub, Google Drive and other apps +- [Video guide on getting data using Apify API](https://www.youtube.com/watch?v=ViYYDHSBAKM) +- A short guide on how to create Actors using code templates: + +[web scraper template](https://www.youtube.com/watch?v=u-i-Korzf8w) diff --git a/templates/ts-crawlee-stagehand-chrome/eslint.config.mjs b/templates/ts-crawlee-stagehand-chrome/eslint.config.mjs new file mode 100644 index 00000000..3252277a --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/eslint.config.mjs @@ -0,0 +1,30 @@ +import prettier from 'eslint-config-prettier'; + +import apify from 
'@apify/eslint-config/ts.js'; +import globals from 'globals'; +import tsEslint from 'typescript-eslint'; + +// eslint-disable-next-line import/no-default-export +export default [ + { ignores: ['**/dist', 'eslint.config.mjs'] }, + ...apify, + prettier, + { + languageOptions: { + parser: tsEslint.parser, + parserOptions: { + project: 'tsconfig.json', + }, + globals: { + ...globals.node, + ...globals.jest, + }, + }, + plugins: { + '@typescript-eslint': tsEslint.plugin, + }, + rules: { + 'no-console': 0, + }, + }, +]; diff --git a/templates/ts-crawlee-stagehand-chrome/package.json b/templates/ts-crawlee-stagehand-chrome/package.json new file mode 100644 index 00000000..a17064e1 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/package.json @@ -0,0 +1,42 @@ +{ + "name": "crawlee-stagehand-typescript", + "version": "0.0.1", + "type": "module", + "description": "This is an example of an Apify Actor.", + "engines": { + "node": ">=18.0.0" + }, + "dependencies": { + "apify": "^3.5.2", + "@crawlee/stagehand": "^3.16.0", + "playwright": "1.58.1", + "openai": "^4.96.0", + "zod": "^3.25.0" + }, + "devDependencies": { + "@apify/eslint-config": "^1.0.0", + "@apify/tsconfig": "^0.1.1", + "@types/node": "^22.15.32", + "eslint": "^9.29.0", + "eslint-config-prettier": "^10.1.5", + "globals": "^17.0.0", + "prettier": "^3.5.3", + "tsx": "^4.20.3", + "typescript": "^5.9.3", + "typescript-eslint": "^8.34.1" + }, + "scripts": { + "start": "npm run start:dev", + "start:prod": "node dist/main.js", + "start:dev": "tsx src/main.ts", + "build": "tsc", + "lint": "eslint", + "lint:fix": "eslint --fix", + "format": "prettier --write .", + "format:check": "prettier --check .", + "test": "echo \"Error: oops, the Actor has no tests yet, sad!\" && exit 1", + "postinstall": "npx crawlee install-playwright-browsers" + }, + "author": "It's not you it's me", + "license": "ISC" +} diff --git a/templates/ts-crawlee-stagehand-chrome/src/main.ts b/templates/ts-crawlee-stagehand-chrome/src/main.ts new 
file mode 100644 index 00000000..f7e1da84 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/src/main.ts @@ -0,0 +1,58 @@ +/** + * This template is a production ready boilerplate for developing with `StagehandCrawler`. + * Use this to bootstrap your projects using the most up-to-date code. + * If you're looking for examples or want to learn more, see README. + */ + +// For more information, see https://crawlee.dev +import { StagehandCrawler } from '@crawlee/stagehand'; +// For more information, see https://docs.apify.com/sdk/js +import { Actor } from 'apify'; + +// this is ESM project, and as such, it requires you to specify extensions in your relative imports +// read more about this here: https://nodejs.org/docs/latest-v18.x/api/esm.html#mandatory-file-extensions +// note that we need to use `.js` even when inside TS files +import { router } from './routes.js'; + +interface Input { + startUrls: { + url: string; + method?: 'GET' | 'HEAD' | 'POST' | 'PUT' | 'DELETE' | 'TRACE' | 'OPTIONS' | 'CONNECT' | 'PATCH'; + headers?: Record<string, string>; + userData: Record<string, unknown>; + }[]; + maxRequestsPerCrawl: number; +} + +// Initialize the Apify SDK +await Actor.init(); + +// Structure of input is defined in input_schema.json +const { startUrls = ['https://apify.com'], maxRequestsPerCrawl = 100 } = + (await Actor.getInput<Input>()) ?? ({} as Input); + +// `checkAccess` flag ensures the proxy credentials are valid, but the check can take a few hundred milliseconds. 
+// Disable it for short runs if you are sure your proxy configuration is correct +const proxyConfiguration = await Actor.createProxyConfiguration({ checkAccess: true }); + +const crawler = new StagehandCrawler({ + proxyConfiguration, + maxRequestsPerCrawl, + requestHandler: router, + stagehandOptions: { + model: 'openai/gpt-4o-mini', + apiKey: process.env.OPENAI_API_KEY, + }, + launchContext: { + launchOptions: { + args: [ + '--disable-gpu', // Mitigates the "crashing GPU process" issue in Docker containers + ], + }, + }, +}); + +await crawler.run(startUrls); + +// Exit successfully +await Actor.exit(); diff --git a/templates/ts-crawlee-stagehand-chrome/src/routes.ts b/templates/ts-crawlee-stagehand-chrome/src/routes.ts new file mode 100644 index 00000000..01647461 --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/src/routes.ts @@ -0,0 +1,35 @@ +import { createStagehandRouter, Dataset } from '@crawlee/stagehand'; +import { z } from 'zod'; + +export const router = createStagehandRouter(); + +router.addDefaultHandler(async ({ enqueueLinks, log }) => { + log.info('enqueueing new URLs'); + await enqueueLinks({ + globs: ['https://apify.com/*'], + label: 'detail', + }); +}); + +router.addHandler('detail', async ({ request, page, log }) => { + const title = await page.title(); + log.info(`${title}`, { url: request.loadedUrl }); + + // Use Stagehand act() to interact with the page + await page.act('Close any cookie consent dialogs or popups if present'); + + // Use Stagehand extract() to pull structured data with a Zod schema + const extracted = await page.extract({ + instruction: 'Extract the main heading and a brief description of this page', + schema: z.object({ + heading: z.string().describe('The main heading of the page'), + description: z.string().describe('A brief description or subtitle of the page'), + }), + }); + + await Dataset.pushData({ + url: request.loadedUrl, + title, + ...extracted, + }); +}); diff --git 
a/templates/ts-crawlee-stagehand-chrome/tsconfig.json b/templates/ts-crawlee-stagehand-chrome/tsconfig.json new file mode 100644 index 00000000..9a7c145a --- /dev/null +++ b/templates/ts-crawlee-stagehand-chrome/tsconfig.json @@ -0,0 +1,13 @@ +{ + "extends": "@apify/tsconfig", + "compilerOptions": { + "module": "NodeNext", + "moduleResolution": "NodeNext", + "target": "ES2022", + "outDir": "dist", + "noUnusedLocals": false, + "skipLibCheck": true, + "lib": ["DOM"] + }, + "include": ["./src/**/*"] +}