Skip to content

Commit 03375b9

Browse files
committed
Initial commit: Python quickstart template for Plasmate
0 parents  commit 03375b9

6 files changed

Lines changed: 226 additions & 0 deletions

File tree

.github/workflows/test.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# CI workflow: smoke-tests the three quickstart scripts on every push/PR to main.
# Note: the scripts shell out to the `plasmate` CLI, so the Rust toolchain and
# `cargo install plasmate` steps must run before any Python step.
name: Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # plasmate is a Rust binary, not a pip package — install via cargo.
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install Plasmate
        run: cargo install plasmate

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # Each script is exercised against stable example domains so the
      # smoke test does not depend on volatile third-party pages.
      - name: Test fetch_page.py
        run: python fetch_page.py https://example.com

      - name: Test extract_links.py
        run: python extract_links.py https://example.com

      - name: Test batch_fetch.py
        run: python batch_fetch.py https://example.com https://example.org

README.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Plasmate Quickstart — Python
2+
3+
[![Use this template](https://img.shields.io/badge/Use%20this-template-blue?style=for-the-badge)](https://github.com/plasmate-labs/quickstart-python/generate)
4+
[![Test](https://github.com/plasmate-labs/quickstart-python/actions/workflows/test.yml/badge.svg)](https://github.com/plasmate-labs/quickstart-python/actions/workflows/test.yml)
5+
6+
A minimal template showing how to use [Plasmate](https://github.com/nickthecook/plasmate) from Python. Fetch web pages and get back a structured **Semantic Object Model (SOM)** instead of raw HTML.
7+
8+
## Prerequisites
9+
10+
Install Plasmate:
11+
12+
```bash
13+
cargo install plasmate
14+
```
15+
16+
## What's Included
17+
18+
| Script | Description |
19+
|--------|-------------|
20+
| `fetch_page.py` | Fetch a single URL and print the semantic content |
21+
| `batch_fetch.py` | Fetch multiple URLs and save results as JSON |
22+
| `extract_links.py` | Extract all links from a page using the SOM |
23+
24+
## Quick Start
25+
26+
```bash
27+
# Clone this template
28+
gh repo create my-scraper --template plasmate-labs/quickstart-python --clone
29+
cd my-scraper
30+
31+
# Fetch a page
32+
python fetch_page.py https://news.ycombinator.com
33+
34+
# Extract links
35+
python extract_links.py https://github.com/trending
36+
37+
# Batch fetch
38+
python batch_fetch.py https://example.com https://example.org
39+
```
40+
41+
## How It Works
42+
43+
Plasmate fetches web pages and returns a **Semantic Object Model** — a structured JSON representation of the page content organized by semantic regions (navigation, main content, sidebars, etc.) and elements (headings, links, text, images).
44+
45+
```python
46+
import subprocess
47+
import json
48+
49+
result = subprocess.run(["plasmate", "fetch", "https://example.com"], capture_output=True, text=True)
50+
som = json.loads(result.stdout)
51+
52+
# som = {
53+
# "title": "Example Domain",
54+
# "lang": "en",
55+
# "regions": [
56+
# {
57+
# "role": "main",
58+
# "id": "content",
59+
# "elements": [
60+
# {"role": "heading", "text": "Example Domain", "level": 1},
61+
# {"role": "text", "text": "This domain is for use in illustrative examples..."},
62+
# {"role": "link", "text": "More information...", "href": "https://www.iana.org/domains/example"}
63+
# ]
64+
# }
65+
# ]
66+
# }
67+
```
68+
69+
## License
70+
71+
MIT

batch_fetch.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
"""Fetch multiple URLs with Plasmate and save results as JSON."""
3+
import subprocess
4+
import json
5+
import sys
6+
import os
7+
from datetime import datetime
8+
9+
# Default URLs fetched when the script is run without command-line
# arguments (main() lets sys.argv[1:] override this list).
URLS = [
    "https://news.ycombinator.com",
    "https://github.com/trending",
    "https://www.reddit.com/r/programming",
]
14+
15+
def fetch_url(url: str) -> dict:
    """Fetch *url* via the ``plasmate`` CLI and return a result record.

    Returns ``{"url": ..., "som": ...}`` on success, or
    ``{"url": ..., "error": ...}`` on any failure (CLI not installed,
    non-zero exit status, or unparseable output), so a batch run never
    aborts part-way through.
    """
    try:
        result = subprocess.run(
            ["plasmate", "fetch", url],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # plasmate is a separate binary (`cargo install plasmate`); report
        # its absence as a per-URL error record instead of crashing the
        # whole batch with an uncaught exception.
        return {"url": url, "error": "plasmate executable not found"}
    if result.returncode != 0:
        return {"url": url, "error": result.stderr.strip()}
    try:
        som = json.loads(result.stdout)
        return {"url": url, "som": som}
    except json.JSONDecodeError as e:
        return {"url": url, "error": f"JSON decode error: {e}"}
29+
30+
def main():
    """CLI entry point: fetch the URLs given on the command line (or the
    module-level URLS defaults) and write all results to results.json."""
    # Local import keeps this fix self-contained: a timezone-aware "now"
    # replaces datetime.utcnow(), which is deprecated since Python 3.12.
    from datetime import timezone

    urls = sys.argv[1:] if len(sys.argv) > 1 else URLS
    print(f"Fetching {len(urls)} URLs...")

    results = []
    for url in urls:
        print(f"  → {url}")
        results.append(fetch_url(url))

    # Aware UTC timestamp; strip tzinfo before isoformat() so the output
    # keeps its original "...Z" suffix format rather than "+00:00".
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    output = {
        "fetched_at": now_utc.isoformat() + "Z",
        "count": len(results),
        "results": results,
    }

    output_file = "results.json"
    # Explicit encoding avoids platform-dependent defaults on Windows.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    # Every record has exactly one of "som" (success) or "error" (failure).
    successes = sum(1 for r in results if "som" in r)
    errors = sum(1 for r in results if "error" in r)
    print(f"\nDone: {successes} succeeded, {errors} failed")
    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    main()

extract_links.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
"""Extract all links from a web page using Plasmate's SOM."""
import subprocess
import json
import sys

# Separate option flags from positional arguments so that
# `python extract_links.py --json` does not treat "--json" as the URL.
positional = [a for a in sys.argv[1:] if not a.startswith("-")]
url = positional[0] if positional else "https://news.ycombinator.com"

result = subprocess.run(["plasmate", "fetch", url], capture_output=True, text=True)

if result.returncode != 0:
    print(f"Error: {result.stderr}", file=sys.stderr)
    sys.exit(1)

# A zero exit status does not guarantee parseable output; fail with a
# clear message instead of an uncaught JSONDecodeError traceback.
try:
    som = json.loads(result.stdout)
except json.JSONDecodeError as e:
    print(f"Error: could not parse plasmate output: {e}", file=sys.stderr)
    sys.exit(1)

print(f"Links from: {som.get('title', url)}\n")

# Walk every semantic region and collect link elements that carry an href.
links = []
for region in som.get("regions", []):
    for el in region.get("elements", []):
        if el.get("role") == "link" and el.get("href"):
            links.append({
                "text": el.get("text", "").strip(),
                "href": el["href"],
                "region": region.get("role", "unknown"),
            })

print(f"Found {len(links)} links:\n")
for i, link in enumerate(links, 1):
    text = link["text"][:60] if link["text"] else "(no text)"
    print(f"  {i:3d}. [{link['region']}] {text}")
    print(f"       → {link['href']}")

# Optionally save as JSON when --json is passed anywhere on the CLI.
if "--json" in sys.argv:
    output_file = "links.json"
    # Explicit encoding avoids platform-dependent defaults on Windows.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(links, f, indent=2)
    print(f"\nSaved to {output_file}")

fetch_page.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
"""Fetch a web page using Plasmate and print the semantic content."""
import subprocess
import json
import sys

url = sys.argv[1] if len(sys.argv) > 1 else "https://news.ycombinator.com"
result = subprocess.run(["plasmate", "fetch", url], capture_output=True, text=True)

if result.returncode != 0:
    print(f"Error: {result.stderr}", file=sys.stderr)
    sys.exit(1)

# A zero exit status does not guarantee parseable output; fail with a
# clear message instead of an uncaught JSONDecodeError traceback.
try:
    som = json.loads(result.stdout)
except json.JSONDecodeError as e:
    print(f"Error: could not parse plasmate output: {e}", file=sys.stderr)
    sys.exit(1)

print(f"Title: {som.get('title', 'N/A')}")
print(f"Language: {som.get('lang', 'N/A')}")
print(f"Regions: {len(som.get('regions', []))}")
print()
# Use .get() for region/element keys (consistent with extract_links.py)
# so a region or element missing an optional key cannot crash the report.
for region in som.get('regions', []):
    print(f"[{region.get('role', 'unknown')}] {region.get('id', '')}")
    for el in region.get('elements', [])[:5]:  # show first 5 elements per region
        text = el.get('text', '')[:80]
        print(f"  {el.get('role', 'unknown')}: {text}")

requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Plasmate is installed as a CLI tool, not a Python package.
2+
# Install it with: cargo install plasmate
3+
#
4+
# This file is here for any additional Python dependencies
5+
# your project may need.

0 commit comments

Comments
 (0)