Skip to content

Commit 03375b9

Browse files
committed
Initial commit: Python quickstart template for Plasmate
0 parents  commit 03375b9

6 files changed

Lines changed: 226 additions & 0 deletions

File tree

.github/workflows/test.yml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# CI workflow: smoke-tests the three quickstart scripts on every push/PR to main.
# Note: the scripts shell out to the `plasmate` CLI, so the Rust toolchain and
# `cargo install plasmate` steps must run before any Python step.
name: Test

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # plasmate is a Rust binary, not a pip package — install via cargo.
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      - name: Install Plasmate
        run: cargo install plasmate

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      # Each script is exercised against stable example domains so the
      # smoke test does not depend on volatile third-party pages.
      - name: Test fetch_page.py
        run: python fetch_page.py https://example.com

      - name: Test extract_links.py
        run: python extract_links.py https://example.com

      - name: Test batch_fetch.py
        run: python batch_fetch.py https://example.com https://example.org

README.md

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
# Plasmate Quickstart — Python
2+
3+
[![Use this template](https://img.shields.io/badge/Use%20this-template-blue?style=for-the-badge)](https://github.com/plasmate-labs/quickstart-python/generate)
4+
[![Test](https://github.com/plasmate-labs/quickstart-python/actions/workflows/test.yml/badge.svg)](https://github.com/plasmate-labs/quickstart-python/actions/workflows/test.yml)
5+
6+
A minimal template showing how to use [Plasmate](https://github.com/nickthecook/plasmate) from Python. Fetch web pages and get back a structured **Semantic Object Model (SOM)** instead of raw HTML.
7+
8+
## Prerequisites
9+
10+
Install Plasmate:
11+
12+
```bash
13+
cargo install plasmate
14+
```
15+
16+
## What's Included
17+
18+
| Script | Description |
19+
|--------|-------------|
20+
| `fetch_page.py` | Fetch a single URL and print the semantic content |
21+
| `batch_fetch.py` | Fetch multiple URLs and save results as JSON |
22+
| `extract_links.py` | Extract all links from a page using the SOM |
23+
24+
## Quick Start
25+
26+
```bash
27+
# Clone this template
28+
gh repo create my-scraper --template plasmate-labs/quickstart-python --clone
29+
cd my-scraper
30+
31+
# Fetch a page
32+
python fetch_page.py https://news.ycombinator.com
33+
34+
# Extract links
35+
python extract_links.py https://github.com/trending
36+
37+
# Batch fetch
38+
python batch_fetch.py https://example.com https://example.org
39+
```
40+
41+
## How It Works
42+
43+
Plasmate fetches web pages and returns a **Semantic Object Model** — a structured JSON representation of the page content organized by semantic regions (navigation, main content, sidebars, etc.) and elements (headings, links, text, images).
44+
45+
```python
46+
import subprocess
47+
import json
48+
49+
result = subprocess.run(["plasmate", "fetch", "https://example.com"], capture_output=True, text=True)
50+
som = json.loads(result.stdout)
51+
52+
# som = {
53+
# "title": "Example Domain",
54+
# "lang": "en",
55+
# "regions": [
56+
# {
57+
# "role": "main",
58+
# "id": "content",
59+
# "elements": [
60+
# {"role": "heading", "text": "Example Domain", "level": 1},
61+
# {"role": "text", "text": "This domain is for use in illustrative examples..."},
62+
# {"role": "link", "text": "More information...", "href": "https://www.iana.org/domains/example"}
63+
# ]
64+
# }
65+
# ]
66+
# }
67+
```
68+
69+
## License
70+
71+
MIT

batch_fetch.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python3
2+
"""Fetch multiple URLs with Plasmate and save results as JSON."""
3+
import subprocess
4+
import json
5+
import sys
6+
import os
7+
from datetime import datetime
8+
9+
# Default URLs fetched when the script is run without command-line
# arguments (main() lets sys.argv[1:] override this list).
URLS = [
    "https://news.ycombinator.com",
    "https://github.com/trending",
    "https://www.reddit.com/r/programming",
]
14+
15+
def fetch_url(url: str) -> dict:
    """Fetch *url* via the ``plasmate`` CLI and return a result record.

    Returns ``{"url": ..., "som": ...}`` on success, or
    ``{"url": ..., "error": ...}`` on any failure (CLI not installed,
    non-zero exit status, or unparseable output), so a batch run never
    aborts part-way through.
    """
    try:
        result = subprocess.run(
            ["plasmate", "fetch", url],
            capture_output=True,
            text=True,
        )
    except FileNotFoundError:
        # plasmate is a separate binary (`cargo install plasmate`); report
        # its absence as a per-URL error record instead of crashing the
        # whole batch with an uncaught exception.
        return {"url": url, "error": "plasmate executable not found"}
    if result.returncode != 0:
        return {"url": url, "error": result.stderr.strip()}
    try:
        som = json.loads(result.stdout)
        return {"url": url, "som": som}
    except json.JSONDecodeError as e:
        return {"url": url, "error": f"JSON decode error: {e}"}
29+
30+
def main():
    """CLI entry point: fetch the URLs given on the command line (or the
    module-level URLS defaults) and write all results to results.json."""
    # Local import keeps this fix self-contained: a timezone-aware "now"
    # replaces datetime.utcnow(), which is deprecated since Python 3.12.
    from datetime import timezone

    urls = sys.argv[1:] if len(sys.argv) > 1 else URLS
    print(f"Fetching {len(urls)} URLs...")

    results = []
    for url in urls:
        print(f"  → {url}")
        results.append(fetch_url(url))

    # Aware UTC timestamp; strip tzinfo before isoformat() so the output
    # keeps its original "...Z" suffix format rather than "+00:00".
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)
    output = {
        "fetched_at": now_utc.isoformat() + "Z",
        "count": len(results),
        "results": results,
    }

    output_file = "results.json"
    # Explicit encoding avoids platform-dependent defaults on Windows.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)

    # Every record has exactly one of "som" (success) or "error" (failure).
    successes = sum(1 for r in results if "som" in r)
    errors = sum(1 for r in results if "error" in r)
    print(f"\nDone: {successes} succeeded, {errors} failed")
    print(f"Results saved to {output_file}")


if __name__ == "__main__":
    main()

extract_links.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env python3
"""Extract all links from a web page using Plasmate's SOM."""
import subprocess
import json
import sys

# Separate option flags from positional arguments so that
# `python extract_links.py --json` does not treat "--json" as the URL.
positional = [a for a in sys.argv[1:] if not a.startswith("-")]
url = positional[0] if positional else "https://news.ycombinator.com"

result = subprocess.run(["plasmate", "fetch", url], capture_output=True, text=True)

if result.returncode != 0:
    print(f"Error: {result.stderr}", file=sys.stderr)
    sys.exit(1)

# A zero exit status does not guarantee parseable output; fail with a
# clear message instead of an uncaught JSONDecodeError traceback.
try:
    som = json.loads(result.stdout)
except json.JSONDecodeError as e:
    print(f"Error: could not parse plasmate output: {e}", file=sys.stderr)
    sys.exit(1)

print(f"Links from: {som.get('title', url)}\n")

# Walk every semantic region and collect link elements that carry an href.
links = []
for region in som.get("regions", []):
    for el in region.get("elements", []):
        if el.get("role") == "link" and el.get("href"):
            links.append({
                "text": el.get("text", "").strip(),
                "href": el["href"],
                "region": region.get("role", "unknown"),
            })

print(f"Found {len(links)} links:\n")
for i, link in enumerate(links, 1):
    text = link["text"][:60] if link["text"] else "(no text)"
    print(f"  {i:3d}. [{link['region']}] {text}")
    print(f"       → {link['href']}")

# Optionally save as JSON when --json is passed anywhere on the CLI.
if "--json" in sys.argv:
    output_file = "links.json"
    # Explicit encoding avoids platform-dependent defaults on Windows.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(links, f, indent=2)
    print(f"\nSaved to {output_file}")

fetch_page.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env python3
"""Fetch a web page using Plasmate and print the semantic content."""
import subprocess
import json
import sys

url = sys.argv[1] if len(sys.argv) > 1 else "https://news.ycombinator.com"
result = subprocess.run(["plasmate", "fetch", url], capture_output=True, text=True)

if result.returncode != 0:
    print(f"Error: {result.stderr}", file=sys.stderr)
    sys.exit(1)

# A zero exit status does not guarantee parseable output; fail with a
# clear message instead of an uncaught JSONDecodeError traceback.
try:
    som = json.loads(result.stdout)
except json.JSONDecodeError as e:
    print(f"Error: could not parse plasmate output: {e}", file=sys.stderr)
    sys.exit(1)

print(f"Title: {som.get('title', 'N/A')}")
print(f"Language: {som.get('lang', 'N/A')}")
print(f"Regions: {len(som.get('regions', []))}")
print()
# Use .get() for region/element keys (consistent with extract_links.py)
# so a region or element missing an optional key cannot crash the report.
for region in som.get('regions', []):
    print(f"[{region.get('role', 'unknown')}] {region.get('id', '')}")
    for el in region.get('elements', [])[:5]:  # show first 5 elements per region
        text = el.get('text', '')[:80]
        print(f"  {el.get('role', 'unknown')}: {text}")

requirements.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Plasmate is installed as a CLI tool, not a Python package.
2+
# Install it with: cargo install plasmate
3+
#
4+
# This file is here for any additional Python dependencies
5+
# your project may need.

0 commit comments

Comments
 (0)