-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract-structured-data.mjs
More file actions
90 lines (81 loc) · 2.35 KB
/
extract-structured-data.mjs
File metadata and controls
90 lines (81 loc) · 2.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env node
/**
* Extract structured data from a web page using Plasmate's SOM.
*
* Demonstrates how to walk the SOM and pull out specific data types:
* headings, links, images, and text content organized by region.
*/
import { execFileSync, execSync } from "node:child_process";
// Page to fetch: first CLI argument, falling back to a demo URL.
const url = process.argv[2] || "https://news.ycombinator.com";

// Upper bound on captured plasmate output. Node's default child-process cap
// (1 MiB) is easy to exceed with the JSON for a large page.
const MAX_SOM_BYTES = 64 * 1024 * 1024;

// Maximum characters kept per text block when collecting.
const TEXT_BLOCK_CHARS = 200;

// Maximum characters shown per text block / link label when printing.
const TEXT_PREVIEW_CHARS = 80;
const LINK_PREVIEW_CHARS = 50;

// Heading markers are kept within 1–6 "#" characters.
const MAX_HEADING_DEPTH = 6;

/**
 * Walk a SOM document and bucket its elements by role.
 *
 * Only the roles "heading", "link", "image" and "text" are collected; links
 * without an `href`, images without a `src`, and blank text elements are
 * skipped. Every collected entry records the role of the region it came from.
 *
 * @param {object} som - Parsed SOM; `regions` and each region's `elements`
 *   may be missing and are then treated as empty.
 * @returns {{headings: object[], links: object[], images: object[], textBlocks: object[]}}
 */
function collectStructuredData(som) {
  const data = {
    headings: [],
    links: [],
    images: [],
    textBlocks: [],
  };

  for (const region of som.regions ?? []) {
    for (const el of region.elements ?? []) {
      switch (el.role) {
        case "heading":
          data.headings.push({
            level: el.level,
            text: el.text,
            region: region.role,
          });
          break;
        case "link":
          if (el.href) {
            data.links.push({
              text: el.text?.trim(),
              href: el.href,
              region: region.role,
            });
          }
          break;
        case "image":
          if (el.src) {
            data.images.push({
              alt: el.alt ?? "",
              src: el.src,
              region: region.role,
            });
          }
          break;
        case "text":
          if (el.text?.trim()) {
            data.textBlocks.push({
              text: el.text.trim().slice(0, TEXT_BLOCK_CHARS),
              region: region.role,
            });
          }
          break;
      }
    }
  }

  return data;
}

/**
 * Format one heading as a "#"-prefixed line.
 *
 * A missing, non-integer or non-positive level falls back to 1, and levels
 * above 6 are capped, so `String.prototype.repeat` can never throw a
 * RangeError or build an oversized string. Missing text prints a placeholder
 * instead of the literal "undefined".
 *
 * @param {{level?: number, text?: string}} heading
 * @returns {string}
 */
function formatHeading({ level, text }) {
  const numeric = Number(level);
  const depth =
    Number.isInteger(numeric) && numeric >= 1
      ? Math.min(numeric, MAX_HEADING_DEPTH)
      : 1;
  return ` ${"#".repeat(depth)} ${text?.trim() || "(no text)"}`;
}

/**
 * Format one link as "label → href", shortening the label for display.
 *
 * @param {{text?: string, href: string}} link
 * @returns {string}
 */
function formatLink({ text, href }) {
  return ` ${text?.slice(0, LINK_PREVIEW_CHARS) || "(no text)"} → ${href}`;
}

/**
 * Format one image as "alt → src".
 *
 * @param {{alt: string, src: string}} image
 * @returns {string}
 */
function formatImage({ alt, src }) {
  return ` ${alt || "(no alt)"} → ${src}`;
}

/**
 * Format one text block as "[region] preview".
 *
 * The ellipsis is appended only when the text was actually shortened.
 *
 * @param {{region: string, text: string}} block
 * @returns {string}
 */
function formatTextBlock({ region, text }) {
  const preview =
    text.length > TEXT_PREVIEW_CHARS
      ? `${text.slice(0, TEXT_PREVIEW_CHARS)}...`
      : text;
  return ` [${region}] ${preview}`;
}

/**
 * Print a section header with the total count, then at most `limit` items.
 *
 * @param {string} title - Header text, printed before the count.
 * @param {object[]} items - All collected items of this kind.
 * @param {number} limit - Maximum number of items to print.
 * @param {(item: object) => string} formatItem - Renders one item as a line.
 */
function printSection(title, items, limit, formatItem) {
  console.log(`${title} (${items.length}):`);
  for (const item of items.slice(0, limit)) {
    console.log(formatItem(item));
  }
}

try {
  // Pass the URL as a discrete argument instead of interpolating it into a
  // shell command line: the URL comes from the command line and could
  // otherwise inject arbitrary shell syntax (quotes, `$(...)`, backticks).
  // NOTE(review): if plasmate is installed as a Windows `.cmd` shim, spawning
  // it without a shell may fail — confirm how the binary is distributed.
  const output = execFileSync("plasmate", ["fetch", url], {
    encoding: "utf-8",
    stdio: ["pipe", "pipe", "pipe"],
    maxBuffer: MAX_SOM_BYTES,
  });

  const som = JSON.parse(output);
  if (som === null || typeof som !== "object") {
    throw new Error("plasmate did not return a JSON object");
  }

  console.log(`Structured data from: ${som.title ?? url}\n`);

  const data = collectStructuredData(som);
  printSection("📑 Headings", data.headings, 10, formatHeading);
  printSection("\n🔗 Links", data.links, 10, formatLink);
  printSection("\n🖼 Images", data.images, 5, formatImage);
  printSection("\n📝 Text blocks", data.textBlocks, 5, formatTextBlock);
} catch (err) {
  console.error(`Error: ${err.message}`);
  // Set the exit status instead of calling process.exit() so the process
  // ends on its own once pending output has been written.
  process.exitCode = 1;
}