From 8586b27d137e5d156b77d142fab19682a968d446 Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 21:54:10 -0400 Subject: [PATCH 1/7] fix perf --- README.md | 31 +- bin/cli.test.ts | 41 - bin/cli.ts | 39 +- bun.lock | 5 + core-spec/v1/spec.md | 111 +- .../schemas/PUBLIC/tables/ACCOUNTS.yaml | 2 +- .../PUBLIC/tables/ANALYTIC_EVENTS.yaml | 2 +- .../schemas/PUBLIC/tables/FEEDBACK.yaml | 2 +- .../schemas/PUBLIC/tables/INVOICES.yaml | 2 +- .../schemas/PUBLIC/tables/ORDERS.yaml | 2 +- .../schemas/PUBLIC/tables/PEOPLE.yaml | 2 +- .../schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml | 12 - .../schemas/PUBLIC/tables/PEOPLE/STATE.yaml | 57 - .../schemas/PUBLIC/tables/PRODUCTS.yaml | 2 +- .../PUBLIC/tables/PRODUCTS/CATEGORY.yaml | 11 - .../PUBLIC/tables/PRODUCTS/RATING.yaml | 19 - .../schemas/PUBLIC/tables/REVIEWS.yaml | 2 +- examples/v1/field_values.json | 43 - examples/v1/table_metadata.json | 1083 ++++++++++++++--- package.json | 1 + src/extract-field-values.test.ts | 262 ---- src/extract-field-values.ts | 125 -- src/extract-table-metadata.test.ts | 31 +- src/extract-table-metadata.ts | 238 ++-- src/index.ts | 5 - src/lib.ts | 164 --- 26 files changed, 1081 insertions(+), 1213 deletions(-) delete mode 100644 examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml delete mode 100644 examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml delete mode 100644 examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/CATEGORY.yaml delete mode 100644 examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml delete mode 100644 examples/v1/field_values.json delete mode 100644 src/extract-field-values.test.ts delete mode 100644 src/extract-field-values.ts delete mode 100644 src/lib.ts diff --git a/README.md b/README.md index 5e9aaf1..1bfb070 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ Metabase represents database metadata — synced databases, their tables, and their fields — as a tree of YAML files. Files are diff-friendly: numeric IDs are omitted entirely, and foreign keys use natural-key tuples like `["Sample Database", "PUBLIC", "ORDERS"]` instead of database identifiers. -This repository contains the specification, examples, and a CLI that converts the `table_metadata.json` downloaded from a Metabase workspace page into YAML. +This repository contains the specification, examples, and a CLI that converts the `table_metadata.json` downloaded from a Metabase instance into YAML. ## Specification -The format is defined in **[core-spec/v1/spec.md](core-spec/v1/spec.md)** (v1.0.4). It covers entity keys, field types, folder structure, sampled field values, and the shape of each entity. +The format is defined in **[core-spec/v1/spec.md](core-spec/v1/spec.md)** (v1.0.4). It covers entity keys, field types, folder structure, and the shape of each entity. Reference output for the Sample Database lives in **[examples/v1/](examples/v1/)** — both the raw `table_metadata.json` and the extracted YAML tree. @@ -20,7 +20,7 @@ Reference output for the Sample Database lives in **[examples/v1/](examples/v1/) ## Obtaining metadata -Metadata is downloaded as `table_metadata.json` from the Metabase workspace page (Workspaces → the relevant workspace → "Download table_metadata.json"). The file is a flat JSON document with three arrays — `databases`, `tables`, and `fields` — that even warehouses with very large schemas can produce without exhausting server memory. +Metadata is fetched from Metabase's `GET /api/ee/serialization/metadata/export` endpoint as a `table_metadata.json` file — a flat JSON document with three arrays (`databases`, `tables`, and `fields`) streamed so even warehouses with very large schemas can be exported without exhausting server memory. ### Extracting metadata to YAML @@ -30,23 +30,9 @@ The CLI turns that JSON into the human- and agent-friendly YAML tree described i bunx @metabase/database-metadata extract-table-metadata ``` -- `` — path to the `table_metadata.json` downloaded from the workspace page. +- `` — path to the `table_metadata.json` downloaded from Metabase. - `` — destination directory. Database folders are created directly under it. -### Extracting field values - -Metabase keeps a sampled list of distinct values for each field that's low-cardinality enough to enumerate (the same list that powers filter dropdowns in the UI). Download `field_values.json` from the same workspace page and extract it alongside the metadata: - -```sh -bunx @metabase/database-metadata extract-field-values -``` - -- `` — the same `table_metadata.json` used by `extract-table-metadata`. Field values reference fields by numeric ID, which the CLI resolves to natural keys using the metadata. -- `` — path to the `field_values.json` downloaded from the workspace page. -- `` — destination directory; typically the same one used for `extract-table-metadata`, so values files land next to the table YAMLs they belong to. - -One YAML file is written per field that has values. Fields with empty samples are skipped; field IDs not present in the metadata are reported as orphans and skipped. See the spec's [Field Values](core-spec/v1/spec.md#field-values) section for the on-disk shape and when agents should consult these files. - ### Extracting the spec The bundled spec can be extracted to any file — convenient for agents that need to read it locally: @@ -82,20 +68,19 @@ On a large data warehouse the metadata export can easily reach **hundreds of meg Each developer (or a CI job) fetches metadata on demand from their own Metabase instance instead. -### 3. Download from the workspace page and extract +### 3. Download from Metabase and extract -Each developer downloads `table_metadata.json` (and optionally `field_values.json`) from the Metabase workspace page and drops them into `.metadata/`. Then run the extractors: +Each developer downloads `table_metadata.json` from their Metabase instance and drops it into `.metadata/`. Then run the extractor: ```sh mkdir -p .metadata -# Drop table_metadata.json (and optionally field_values.json) from the workspace page into .metadata/ +# Drop table_metadata.json from Metabase into .metadata/ rm -rf .metadata/databases bunx @metabase/database-metadata extract-table-metadata .metadata/table_metadata.json .metadata/databases -bunx @metabase/database-metadata extract-field-values .metadata/table_metadata.json .metadata/field_values.json .metadata/databases ``` -After this, tools and agents should read the YAML tree under `.metadata/databases/` — not `table_metadata.json` or `field_values.json`, which exist only as input to the extractors. +After this, tools and agents should read the YAML tree under `.metadata/databases/` — not `table_metadata.json`, which exists only as input to the extractor. ## Publishing to NPM diff --git a/bin/cli.test.ts b/bin/cli.test.ts index b5a9d2a..660f63c 100644 --- a/bin/cli.test.ts +++ b/bin/cli.test.ts @@ -6,7 +6,6 @@ import { join, resolve } from "path"; const REPO_ROOT = resolve(import.meta.dirname, ".."); const CLI = "bin/cli.ts"; const EXAMPLE_INPUT = "examples/v1/table_metadata.json"; -const EXAMPLE_FIELD_VALUES = "examples/v1/field_values.json"; type RunResult = { stdout: string; @@ -81,46 +80,6 @@ describe("cli", () => { }); }); - describe("extract-field-values", () => { - let workdir: string; - - beforeEach(() => { - workdir = mkdtempSync(join(tmpdir(), "database-metadata-values-cli-")); - }); - - afterEach(() => { - rmSync(workdir, { recursive: true, force: true }); - }); - - it("extracts the bundled example field values", () => { - const { stdout, exitCode } = runCli([ - "extract-field-values", - EXAMPLE_INPUT, - EXAMPLE_FIELD_VALUES, - workdir, - ]); - expect(exitCode).toBe(0); - expect(stdout).toContain("Extracted values for 4 fields"); - - const statePath = join( - workdir, - "Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml", - ); - expect(existsSync(statePath)).toBe(true); - }); - - it("errors when arguments are missing", () => { - const { stderr, exitCode } = runCli([ - "extract-field-values", - EXAMPLE_INPUT, - ]); - expect(exitCode).toBe(1); - expect(stderr).toContain( - ", , and arguments are required", - ); - }); - }); - describe("extract-spec", () => { let workdir: string; diff --git a/bin/cli.ts b/bin/cli.ts index d29b4d0..533317b 100644 --- a/bin/cli.ts +++ b/bin/cli.ts @@ -2,7 +2,6 @@ import { parseArgs } from "node:util"; -import { extractFieldValues } from "../src/extract-field-values.js"; import { extractTableMetadata } from "../src/extract-table-metadata.js"; import { extractSpec } from "../src/extract-spec.js"; @@ -18,11 +17,6 @@ Commands: Writes one YAML per database + one per table with fields nested inside. - extract-field-values - Extract field values JSON into YAML files - placed next to each table YAML, one per - field that has sampled values. - extract-spec Copy the bundled spec.md into a target file --file Destination file (default: ./spec.md) @@ -39,7 +33,7 @@ function parseArguments() { }); } -function handleExtractMetadata(positionals: string[]): void { +async function handleExtractMetadata(positionals: string[]): Promise { const inputFile = positionals[1]; const outputFolder = positionals[2]; @@ -50,43 +44,20 @@ function handleExtractMetadata(positionals: string[]): void { process.exit(1); } - const stats = extractTableMetadata({ inputFile, outputFolder }); + const stats = await extractTableMetadata({ inputFile, outputFolder }); console.log( `Extracted ${stats.databases} databases, ${stats.tables} tables, ${stats.fields} fields`, ); process.exit(0); } -function handleExtractFieldValues(positionals: string[]): void { - const metadataFile = positionals[1]; - const fieldValuesFile = positionals[2]; - const outputFolder = positionals[3]; - - if (!metadataFile || !fieldValuesFile || !outputFolder) { - console.error( - "Error: , , and arguments are required", - ); - process.exit(1); - } - - const stats = extractFieldValues({ - metadataFile, - fieldValuesFile, - outputFolder, - }); - console.log( - `Extracted values for ${stats.fieldsWithValues} fields (${stats.fieldsSkipped} skipped, ${stats.orphans} orphans)`, - ); - process.exit(0); -} - function handleExtractSpec(values: ParsedValues): void { const { target } = extractSpec({ file: values.file ?? "spec.md" }); console.log(`Spec extracted to ${target}`); process.exit(0); } -function main(): void { +async function main(): Promise { const { values, positionals } = parseArguments(); const command = positionals[0]; @@ -98,8 +69,6 @@ function main(): void { switch (command) { case "extract-table-metadata": return handleExtractMetadata(positionals); - case "extract-field-values": - return handleExtractFieldValues(positionals); case "extract-spec": return handleExtractSpec(values); default: @@ -108,4 +77,4 @@ function main(): void { } } -main(); +await main(); diff --git a/bun.lock b/bun.lock index de5ae96..5b9342d 100644 --- a/bun.lock +++ b/bun.lock @@ -5,6 +5,7 @@ "": { "name": "@metabase/database-metadata", "dependencies": { + "@streamparser/json-node": "^0.0.22", "js-yaml": "^4.1.0", }, "devDependencies": { @@ -84,6 +85,10 @@ "@oxfmt/binding-win32-x64-msvc": ["@oxfmt/binding-win32-x64-msvc@0.45.0", "", { "os": "win32", "cpu": "x64" }, "sha512-w5MMTRCK1dpQeRA+HHqXQXyN33DlG/N2LOYxJmaT4fJjcmZrbNnqw7SmIk7I2/a2493PPLZ+2E/Ar6t2iKVMug=="], + "@streamparser/json": ["@streamparser/json@0.0.22", "", {}, "sha512-b6gTSBjJ8G8SuO3Gbbj+zXbVx8NSs1EbpbMKpzGLWMdkR+98McH9bEjSz3+0mPJf68c5nxa3CrJHp5EQNXM6zQ=="], + + "@streamparser/json-node": ["@streamparser/json-node@0.0.22", "", { "dependencies": { "@streamparser/json": "^0.0.22" } }, "sha512-sJT2ptNRwqB1lIsQrQlCoWk5rF4tif9wDh+7yluAGijJamAhrHGYpFB/Zg3hJeceoZypi74ftXk8DHzwYpbZSg=="], + "@types/bun": ["@types/bun@1.3.12", "", { "dependencies": { "bun-types": "1.3.12" } }, "sha512-DBv81elK+/VSwXHDlnH3Qduw+KxkTIWi7TXkAeh24zpi5l0B2kUg9Ga3tb4nJaPcOFswflgi/yAvMVBPrxMB+A=="], "@types/esrecurse": ["@types/esrecurse@4.3.1", "", {}, "sha512-xJBAbDifo5hpffDBuHl0Y8ywswbiAp/Wi7Y/GtAgSlZyIABppyurxVueOPE8LUQOxdlgi6Zqce7uoEpqNTeiUw=="], diff --git a/core-spec/v1/spec.md b/core-spec/v1/spec.md index 68a98a6..ef6bb71 100644 --- a/core-spec/v1/spec.md +++ b/core-spec/v1/spec.md @@ -1,6 +1,6 @@ # Metabase Database Metadata Format -**Version:** 1.0.4 +**Version:** 1.0.0 ## Overview @@ -8,7 +8,7 @@ Metabase database metadata is a read-only snapshot of databases, tables, and fie The format is designed to be **portable** and **reviewable**: numeric IDs are omitted or replaced with human-readable natural keys (database name, `[database, schema, table]` tuples, etc.). Files can be diffed, grepped, and edited by hand. -The raw `table_metadata.json` (downloaded from the Metabase workspace page) is a single flat JSON document with `databases`, `tables`, and `fields` arrays, optimized for transport rather than reading. It can be arbitrarily large — tens or hundreds of megabytes on warehouses with many tables — and is not intended for direct consumption. Tools and humans should read the extracted YAML tree under `databases/` instead, where each entity lives in its own small file and foreign keys are resolved to natural-key tuples. +The raw `table_metadata.json` is a single flat JSON document with `databases`, `tables`, and `fields` arrays, optimized for transport rather than reading. It can be arbitrarily large — tens or hundreds of megabytes on warehouses with many tables — and is not intended for direct consumption. Tools and humans should read the extracted YAML tree under `databases/` instead, where each entity lives in its own small file. ## Table of Contents @@ -18,7 +18,6 @@ The raw `table_metadata.json` (downloaded from the Metabase workspace page) is a 4. [Database](#database) 5. [Table](#table) 6. [Field](#field) -7. [Field Values](#field-values) --- @@ -133,13 +132,9 @@ By convention, metadata is extracted under a `.metadata/databases/` directory, w ├── schemas/ │ └── {schema}/ │ └── tables/ - │ ├── {table}.yaml - │ └── {table}/ # Optional: one YAML per field - │ └── {field}.yaml # that has sampled values + │ └── {table}.yaml └── tables/ # Schemaless databases - ├── {table}.yaml - └── {table}/ - └── {field}.yaml + └── {table}.yaml ``` ### Path Construction Rules @@ -258,101 +253,3 @@ parent_id: - user ``` ---- - -## Field Values - -A `Field Values` entry records the **sampled distinct values** Metabase keeps for a single field. These power filter dropdowns in the Metabase UI and give agents a concrete sense of a column's domain — what values actually appear in the data, and what human-readable labels (if any) are associated with them. - -Field values are **sampled, not exhaustive**: Metabase caps the list (typically at ~1000 distinct values), and fields above that cap, or fields whose type doesn't lend itself to enumeration (long text, high-cardinality numerics), will not have a values file at all. Agents should treat a field values file as evidence that *these* values exist, not as a ground-truth enumeration of *all* values in the column. - -### Extraction order - -**Field values must be extracted *after* metadata, never before or in isolation.** The raw `field_values.json` references fields by numeric `field_id` only; resolving those IDs to the natural-key tuples used everywhere in this format requires the metadata index. The extractor takes both `table_metadata.json` and `field_values.json` as inputs, and the two **must come from the same Metabase workspace download at the same point in time** — a stale metadata file paired with a fresh values file (or vice versa) will silently drop entries as orphans whenever a field has been added, removed, or had its ID reassigned. - -The recommended workflow is therefore strictly sequential: - -1. Download `table_metadata.json` from the Metabase workspace page. -2. Run `extract-table-metadata` to write the database/table/field YAML tree. -3. Download `field_values.json` from the **same** workspace, ideally back-to-back with step 1. -4. Run `extract-field-values` against the same output folder to drop per-field values files into the existing tree. - -Agents reading the tree can rely on this ordering: any `{table}/{field}.yaml` file is guaranteed to have a corresponding entry in the parent `{table}.yaml`'s `fields` array. - -### When to consult field values - -- Filtering by a categorical, enum-like, or low-cardinality column — the values file tells you the vocabulary you can filter against. -- Checking whether a particular value appears in a field. -- Showing example values or options to users. -- Distinguishing between display labels and stored values (e.g., a numeric `RATING` column stored as `0-5` but displayed as `Unrated`, `Poor`, …, `Excellent`). - -### Folder layout - -Field values live one directory down from the table YAML, in a folder named after the table: - -``` -schemas/{schema}/tables/ -├── {table}.yaml -└── {table}/ - └── {field}.yaml -``` - -For schemaless databases, the same pattern applies directly under `tables/`. - -**Absence is meaningful:** if a table has no `{table}/` folder, or a field has no `{field}.yaml`, then no sampled values are available for that entity. This is not an error — it's the default for high-cardinality or non-enumerable fields. - -### Filename rule - -The field filename is the field's name directly (e.g., `STATUS.yaml`). For nested JSON-unfolded fields, the path is joined with `.`: - -``` -tables/EVENTS/ -└── DATA.user.name.yaml # represents DATA → user → name -``` - -Literal dots inside a field segment are escaped as `__DOT__` so the join remains unambiguous. - -### Schema - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `field_id` | array | Yes | Field FK (natural-key tuple, same form as in the enclosing table YAML) | -| `has_more_values` | boolean | Yes | `true` when Metabase truncated the list at its internal cap; the file contains a representative sample, not the full domain | -| `values` | array | Yes | Sampled distinct values. Two encodings — see below | - -#### `values` encoding - -When no human-readable labels exist, values are bare scalars: - -```yaml -field_id: - - Sample Database - - PUBLIC - - PEOPLE - - STATE -has_more_values: false -values: - - AK - - AL - - AR -``` - -When a field has display labels (typically a remapped FK or an enum with friendly names), each entry is a `{value, label}` object: - -```yaml -field_id: - - Sample Database - - PUBLIC - - PRODUCTS - - RATING -has_more_values: false -values: - - value: 0 - label: Unrated - - value: 1 - label: Poor - - value: 5 - label: Excellent -``` - -The two encodings are mutually exclusive per file; a single YAML never mixes scalar and object entries. diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ACCOUNTS.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ACCOUNTS.yaml index 37ce31d..87d9723 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ACCOUNTS.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ACCOUNTS.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: ACCOUNTS schema: PUBLIC description: Information on customer accounts registered with Piespace. Each account represents a new organization signing up for on-demand pies. -db_id: Sample Database fields: - name: LONGITUDE base_type: type/Float diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ANALYTIC_EVENTS.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ANALYTIC_EVENTS.yaml index 2f942ab..f6d9cf4 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ANALYTIC_EVENTS.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ANALYTIC_EVENTS.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: ANALYTIC_EVENTS schema: PUBLIC description: Piespace does some anonymous analytics tracking on how users interact with their platform. They’ve only had time to implement a few events, but you know how it is. Pies come first. -db_id: Sample Database fields: - name: BUTTON_LABEL base_type: type/Text diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/FEEDBACK.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/FEEDBACK.yaml index 42bb44b..d3724c8 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/FEEDBACK.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/FEEDBACK.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: FEEDBACK schema: PUBLIC description: With each order of pies sent out, Piespace includes a place for customers to submit feedback and review their order. -db_id: Sample Database fields: - name: ID base_type: type/BigInteger diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/INVOICES.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/INVOICES.yaml index 11fc50e..09d8009 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/INVOICES.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/INVOICES.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: INVOICES schema: PUBLIC description: Confirmed payments from Piespace’s customers. Most accounts pay for their pie subscription on a monthly basis. -db_id: Sample Database fields: - name: PLAN base_type: type/Text diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ORDERS.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ORDERS.yaml index b0c0b40..38d94cf 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ORDERS.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/ORDERS.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: ORDERS schema: PUBLIC description: Confirmed Sample Company orders for a product, from a user. -db_id: Sample Database fields: - name: QUANTITY description: Number of products bought. diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE.yaml index 4f4ac7b..f27375d 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: PEOPLE schema: PUBLIC description: Information on the user accounts registered with Sample Company. -db_id: Sample Database fields: - name: STATE description: The state or province of the account’s billing address diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml deleted file mode 100644 index 562ac97..0000000 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml +++ /dev/null @@ -1,12 +0,0 @@ -field_id: - - Sample Database - - PUBLIC - - PEOPLE - - SOURCE -has_more_values: false -values: - - Affiliate - - Facebook - - Google - - Organic - - Twitter diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml deleted file mode 100644 index 22ea4ff..0000000 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml +++ /dev/null @@ -1,57 +0,0 @@ -field_id: - - Sample Database - - PUBLIC - - PEOPLE - - STATE -has_more_values: false -values: - - AK - - AL - - AR - - AZ - - CA - - CO - - CT - - DE - - FL - - GA - - HI - - IA - - ID - - IL - - IN - - KS - - KY - - LA - - MA - - MD - - ME - - MI - - MN - - MO - - MS - - MT - - NC - - ND - - NE - - NH - - NJ - - NM - - NV - - NY - - OH - - OK - - OR - - PA - - RI - - SC - - SD - - TN - - TX - - UT - - VA - - VT - - WA - - WI - - WV - - WY diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS.yaml index e417fa5..0b98b74 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: PRODUCTS schema: PUBLIC description: Includes a catalog of all the products ever sold by the famed Sample Company. -db_id: Sample Database fields: - name: ID description: The numerical product number. Only used internally. All external communication should use the title or EAN. diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/CATEGORY.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/CATEGORY.yaml deleted file mode 100644 index 0cb1231..0000000 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/CATEGORY.yaml +++ /dev/null @@ -1,11 +0,0 @@ -field_id: - - Sample Database - - PUBLIC - - PRODUCTS - - CATEGORY -has_more_values: false -values: - - Doohickey - - Gadget - - Gizmo - - Widget diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml deleted file mode 100644 index e068f36..0000000 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml +++ /dev/null @@ -1,19 +0,0 @@ -field_id: - - Sample Database - - PUBLIC - - PRODUCTS - - RATING -has_more_values: false -values: - - value: 0 - label: Unrated - - value: 1 - label: Poor - - value: 2 - label: Fair - - value: 3 - label: Good - - value: 4 - label: Great - - value: 5 - label: Excellent diff --git a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/REVIEWS.yaml b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/REVIEWS.yaml index 8c77fea..6cd318f 100644 --- a/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/REVIEWS.yaml +++ b/examples/v1/databases/Sample Database/schemas/PUBLIC/tables/REVIEWS.yaml @@ -1,7 +1,7 @@ +db_id: Sample Database name: REVIEWS schema: PUBLIC description: Reviews that Sample Company customers have left on our products. -db_id: Sample Database fields: - name: RATING description: The rating (on a scale of 1-5) the user left. diff --git a/examples/v1/field_values.json b/examples/v1/field_values.json deleted file mode 100644 index dc9e89c..0000000 --- a/examples/v1/field_values.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "field_values": [ - { - "field_id": 1, - "values": [ - "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE", - "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", - "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", - "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", - "NV", "NY", "OH", "OK", "OR", "PA", "RI", "SC", - "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", - "WV", "WY" - ], - "human_readable_values": [], - "has_more_values": false - }, - { - "field_id": 30, - "values": ["Affiliate", "Facebook", "Google", "Organic", "Twitter"], - "human_readable_values": [], - "has_more_values": false - }, - { - "field_id": 18, - "values": ["Doohickey", "Gadget", "Gizmo", "Widget"], - "human_readable_values": [], - "has_more_values": false - }, - { - "field_id": 16, - "values": [0, 1, 2, 3, 4, 5], - "human_readable_values": [ - "Unrated", - "Poor", - "Fair", - "Good", - "Great", - "Excellent" - ], - "has_more_values": false - } - ] -} diff --git a/examples/v1/table_metadata.json b/examples/v1/table_metadata.json index b0d47da..91ffc23 100644 --- a/examples/v1/table_metadata.json +++ b/examples/v1/table_metadata.json @@ -1,82 +1,132 @@ { "databases": [ { - "id": 1, + "id": "Sample Database", "name": "Sample Database", "engine": "postgres" } ], "tables": [ { - "id": 1, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], + "db_id": "Sample Database", "name": "PEOPLE", "schema": "PUBLIC", "description": "Information on the user accounts registered with Sample Company." }, { - "id": 2, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], + "db_id": "Sample Database", "name": "ORDERS", "schema": "PUBLIC", "description": "Confirmed Sample Company orders for a product, from a user." }, { - "id": 3, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], + "db_id": "Sample Database", "name": "PRODUCTS", "schema": "PUBLIC", "description": "Includes a catalog of all the products ever sold by the famed Sample Company." }, { - "id": 4, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], + "db_id": "Sample Database", "name": "REVIEWS", "schema": "PUBLIC", "description": "Reviews that Sample Company customers have left on our products." }, { - "id": 5, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], + "db_id": "Sample Database", "name": "FEEDBACK", "schema": "PUBLIC", "description": "With each order of pies sent out, Piespace includes a place for customers to submit feedback and review their order." }, { - "id": 6, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], + "db_id": "Sample Database", "name": "ACCOUNTS", "schema": "PUBLIC", "description": "Information on customer accounts registered with Piespace. Each account represents a new organization signing up for on-demand pies." }, { - "id": 7, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], + "db_id": "Sample Database", "name": "ANALYTIC_EVENTS", "schema": "PUBLIC", - "description": "Piespace does some anonymous analytics tracking on how users interact with their platform. They’ve only had time to implement a few events, but you know how it is. Pies come first." + "description": "Piespace does some anonymous analytics tracking on how users interact with their platform. They\u2019ve only had time to implement a few events, but you know how it is. Pies come first." }, { - "id": 8, - "db_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], + "db_id": "Sample Database", "name": "INVOICES", "schema": "PUBLIC", - "description": "Confirmed payments from Piespace’s customers. Most accounts pay for their pie subscription on a monthly basis." + "description": "Confirmed payments from Piespace\u2019s customers. Most accounts pay for their pie subscription on a monthly basis." } ], "fields": [ { - "id": 1, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "STATE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "STATE", - "description": "The state or province of the account’s billing address", + "description": "The state or province of the account\u2019s billing address", "base_type": "type/Text", "database_type": "CHARACTER", "semantic_type": "type/State" }, { - "id": 4, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "ID", "description": "A unique identifier given to each user.", "base_type": "type/BigInteger", @@ -84,8 +134,17 @@ "semantic_type": "type/PK" }, { - "id": 7, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "EMAIL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "EMAIL", "description": "The contact email for the account.", "base_type": "type/Text", @@ -93,16 +152,34 @@ "semantic_type": "type/Email" }, { - "id": 12, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "BIRTH_DATE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "BIRTH_DATE", "description": "The date of birth of the user", "base_type": "type/Date", "database_type": "DATE" }, { - "id": 30, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "SOURCE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "SOURCE", "description": "The channel through which we acquired this user. Valid values include: Affiliate, Facebook, Google, Organic and Twitter", "base_type": "type/Text", @@ -110,8 +187,17 @@ "semantic_type": "type/Source" }, { - "id": 48, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "NAME" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "NAME", "description": "The name of the user who owns an account", "base_type": "type/Text", @@ -119,25 +205,52 @@ "semantic_type": "type/Name" }, { - "id": 50, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "CREATED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "CREATED_AT", - "description": "The date the user record was created. Also referred to as the user’s \"join date\"", + "description": "The date the user record was created. Also referred to as the user\u2019s \"join date\"", "base_type": "type/DateTime", "database_type": "TIMESTAMP", "semantic_type": "type/CreationTimestamp" }, { - "id": 51, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "ADDRESS" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "ADDRESS", - "description": "The street address of the account’s billing address", + "description": "The street address of the account\u2019s billing address", "base_type": "type/Text", "database_type": "CHARACTER VARYING" }, { - "id": 52, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "LATITUDE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "LATITUDE", "description": "This is the latitude of the user on sign-up. It might be updated in the future to the last seen location.", "base_type": "type/Float", @@ -145,25 +258,52 @@ "semantic_type": "type/Latitude" }, { - "id": 53, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "CITY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "CITY", - "description": "The city of the account’s billing address", + "description": "The city of the account\u2019s billing address", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/City" }, { - "id": 54, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "PASSWORD" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "PASSWORD", "description": "This is the salted password of the user. It should not be visible", "base_type": "type/Text", "database_type": "CHARACTER VARYING" }, { - "id": 58, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "LONGITUDE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "LONGITUDE", "description": "This is the longitude of the user on sign-up. It might be updated in the future to the last seen location.", "base_type": "type/Float", @@ -171,17 +311,35 @@ "semantic_type": "type/Longitude" }, { - "id": 61, - "table_id": 1, + "id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "ZIP" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE" + ], "name": "ZIP", - "description": "The postal code of the account’s billing address", + "description": "The postal code of the account\u2019s billing address", "base_type": "type/Text", "database_type": "CHARACTER", "semantic_type": "type/ZipCode" }, { - "id": 2, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "QUANTITY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "QUANTITY", "description": "Number of products bought.", "base_type": "type/Integer", @@ -189,8 +347,17 @@ "semantic_type": "type/Quantity" }, { - "id": 3, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "DISCOUNT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "DISCOUNT", "description": "Discount amount.", "base_type": "type/Float", @@ -198,51 +365,110 @@ "semantic_type": "type/Discount" }, { - "id": 5, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "TOTAL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "TOTAL", "description": "The total billed amount.", "base_type": "type/Float", "database_type": "DOUBLE PRECISION" }, { - "id": 6, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "TAX" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "TAX", "description": "This is the amount of local and federal taxes that are collected on the purchase. Note that other governmental fees on some products are not included here, but instead are accounted for in the subtotal.", "base_type": "type/Float", "database_type": "DOUBLE PRECISION" }, { - "id": 9, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "ID", - "description": "This is a unique ID for the product. It is also called the “Invoice number” or “Confirmation number” in customer facing emails and screens.", + "description": "This is a unique ID for the product. It is also called the \u201cInvoice number\u201d or \u201cConfirmation number\u201d in customer facing emails and screens.", "base_type": "type/BigInteger", "database_type": "BIGINT", "semantic_type": "type/PK" }, { - "id": 10, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "SUBTOTAL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "SUBTOTAL", "description": "The raw, pre-tax cost of the order. Note that this might be different in the future from the product price due to promotions, credits, etc.", "base_type": "type/Float", "database_type": "DOUBLE PRECISION" }, { - "id": 11, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "USER_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "USER_ID", - "fk_target_field_id": 4, "description": "The id of the user who made this order. Note that in some cases where an order was created on behalf of a customer who phoned the order in, this might be the employee who handled the request.", "base_type": "type/Integer", "database_type": "INTEGER", - "semantic_type": "type/FK" - }, - { - "id": 13, - "table_id": 2, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "PEOPLE", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "CREATED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "CREATED_AT", "description": "The date and time an order was submitted.", "base_type": "type/DateTime", @@ -250,18 +476,41 @@ "semantic_type": "type/CreationTimestamp" }, { - "id": 14, - "table_id": 2, + "id": [ + "Sample Database", + "PUBLIC", + "ORDERS", + "PRODUCT_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ORDERS" + ], "name": "PRODUCT_ID", - "fk_target_field_id": 8, "description": "The product ID. This is an internal identifier for the product, NOT the SKU.", "base_type": "type/Integer", "database_type": "INTEGER", - "semantic_type": "type/FK" - }, - { - "id": 8, - "table_id": 3, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "ID", "description": "The numerical product number. Only used internally. All external communication should use the title or EAN.", "base_type": "type/BigInteger", @@ -269,16 +518,34 @@ "semantic_type": "type/PK" }, { - "id": 15, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "EAN" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "EAN", "description": "The international article number. A 13 digit number uniquely identifying the product.", "base_type": "type/Text", "database_type": "CHARACTER" }, { - "id": 16, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "RATING" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "RATING", "description": "The average rating users have given the product. This ranges from 1 - 5", "base_type": "type/Float", @@ -286,8 +553,17 @@ "semantic_type": "type/Score" }, { - "id": 17, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "TITLE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "TITLE", "description": "The name of the product as it should be displayed to customers.", "base_type": "type/Text", @@ -295,8 +571,17 @@ "semantic_type": "type/Title" }, { - "id": 18, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "CATEGORY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "CATEGORY", "description": "The type of product, valid values include: Doohicky, Gadget, Gizmo and Widget", "base_type": "type/Text", @@ -304,8 +589,17 @@ "semantic_type": "type/Category" }, { - "id": 34, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "VENDOR" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "VENDOR", "description": "The source of the product.", "base_type": "type/Text", @@ -313,16 +607,34 @@ "semantic_type": "type/Company" }, { - "id": 44, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "PRICE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "PRICE", "description": "The list price of the product. Note that this is not always the price the product sold for due to discounts, promotions, etc.", "base_type": "type/Float", "database_type": "DOUBLE PRECISION" }, { - "id": 63, - "table_id": 3, + "id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "CREATED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS" + ], "name": "CREATED_AT", "description": "The date the product was added to our catalog.", "base_type": "type/DateTime", @@ -330,8 +642,17 @@ "semantic_type": "type/CreationTimestamp" }, { - "id": 19, - "table_id": 4, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "RATING" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "RATING", "description": "The rating (on a scale of 1-5) the user left.", "base_type": "type/Integer", @@ -339,8 +660,17 @@ "semantic_type": "type/Score" }, { - "id": 55, - "table_id": 4, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "CREATED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "CREATED_AT", "description": "The day and time a review was written by a user.", "base_type": "type/DateTime", @@ -348,8 +678,17 @@ "semantic_type": "type/CreationTimestamp" }, { - "id": 59, - "table_id": 4, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "ID", "description": "A unique internal identifier for the review. Should not be used externally.", "base_type": "type/BigInteger", @@ -357,26 +696,58 @@ "semantic_type": "type/PK" }, { - "id": 65, - "table_id": 4, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "PRODUCT_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "PRODUCT_ID", - "fk_target_field_id": 8, "description": "The product the review was for", "base_type": "type/Integer", "database_type": "INTEGER", - "semantic_type": "type/FK" - }, - { - "id": 67, - "table_id": 4, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "PRODUCTS", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "REVIEWER" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "REVIEWER", "description": "The user who left the review", "base_type": "type/Text", "database_type": "CHARACTER VARYING" }, { - "id": 69, - "table_id": 4, + "id": [ + "Sample Database", + "PUBLIC", + "REVIEWS", + "BODY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "REVIEWS" + ], "name": "BODY", "description": "The review the user left. Limited to 2000 characters.", "base_type": "type/Text", @@ -384,273 +755,603 @@ "semantic_type": "type/Description" }, { - "id": 20, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "ID", "base_type": "type/BigInteger", "database_type": "BIGINT", "semantic_type": "type/PK" }, { - "id": 22, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "RATING" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "RATING", "base_type": "type/Integer", "database_type": "SMALLINT", "semantic_type": "type/Score" }, { - "id": 23, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "RATING_MAPPED" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "RATING_MAPPED", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Category" }, { - "id": 25, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "ACCOUNT_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "ACCOUNT_ID", - "fk_target_field_id": 24, "base_type": "type/BigInteger", "database_type": "BIGINT", - "semantic_type": "type/FK" - }, - { - "id": 26, - "table_id": 5, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "EMAIL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "EMAIL", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Email" }, { - "id": 27, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "DATE_RECEIVED" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "DATE_RECEIVED", "base_type": "type/DateTime", "database_type": "TIMESTAMP" }, { - "id": 28, - "table_id": 5, + "id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK", + "BODY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "FEEDBACK" + ], "name": "BODY", "base_type": "type/Text", "database_type": "CHARACTER LARGE OBJECT" }, { - "id": 21, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "LONGITUDE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "LONGITUDE", "base_type": "type/Float", "database_type": "DOUBLE PRECISION", "semantic_type": "type/Longitude" }, { - "id": 24, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "ID", "base_type": "type/BigInteger", "database_type": "BIGINT", "semantic_type": "type/PK" }, { - "id": 31, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "CREATED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "CREATED_AT", "base_type": "type/DateTime", "database_type": "TIMESTAMP", "semantic_type": "type/CreationTimestamp" }, { - "id": 33, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "SEATS" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "SEATS", "base_type": "type/Integer", "database_type": "INTEGER" }, { - "id": 36, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "LAST_NAME" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "LAST_NAME", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Name" }, { - "id": 37, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "FIRST_NAME" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "FIRST_NAME", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Name" }, { - "id": 38, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "TRIAL_CONVERTED" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "TRIAL_CONVERTED", "base_type": "type/Boolean", "database_type": "BOOLEAN" }, { - "id": 39, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "CANCELED_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "CANCELED_AT", "base_type": "type/DateTime", "database_type": "TIMESTAMP", "semantic_type": "type/CancelationTimestamp" }, { - "id": 40, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "TRIAL_ENDS_AT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "TRIAL_ENDS_AT", "base_type": "type/DateTime", "database_type": "TIMESTAMP" }, { - "id": 41, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "SOURCE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "SOURCE", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Source" }, { - "id": 42, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "ACTIVE_SUBSCRIPTION" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "ACTIVE_SUBSCRIPTION", "base_type": "type/Boolean", "database_type": "BOOLEAN" }, { - "id": 43, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "PLAN" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "PLAN", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Category" }, { - "id": 47, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "LEGACY_PLAN" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "LEGACY_PLAN", "base_type": "type/Boolean", "database_type": "BOOLEAN" }, { - "id": 56, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "COUNTRY" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "COUNTRY", "base_type": "type/Text", "database_type": "CHARACTER", "semantic_type": "type/Country" }, { - "id": 57, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "LATITUDE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "LATITUDE", "base_type": "type/Float", "database_type": "DOUBLE PRECISION", "semantic_type": "type/Latitude" }, { - "id": 62, - "table_id": 6, + "id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "EMAIL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS" + ], "name": "EMAIL", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Email" }, { - "id": 29, - "table_id": 7, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "BUTTON_LABEL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "BUTTON_LABEL", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Category" }, { - "id": 32, - "table_id": 7, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "EVENT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "EVENT", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Category" }, { - "id": 35, - "table_id": 7, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "TIMESTAMP" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "TIMESTAMP", "base_type": "type/DateTime", "database_type": "TIMESTAMP" }, { - "id": 45, - "table_id": 7, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "ACCOUNT_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "ACCOUNT_ID", - "fk_target_field_id": 24, "base_type": "type/BigInteger", "database_type": "BIGINT", - "semantic_type": "type/FK" - }, - { - "id": 49, - "table_id": 7, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "PAGE_URL" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "PAGE_URL", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/URL" }, { - "id": 60, - "table_id": 7, + "id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "ANALYTIC_EVENTS" + ], "name": "ID", "base_type": "type/BigInteger", "database_type": "BIGINT", "semantic_type": "type/PK" }, { - "id": 46, - "table_id": 8, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "PLAN" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "PLAN", "base_type": "type/Text", "database_type": "CHARACTER VARYING", "semantic_type": "type/Category" }, { - "id": 64, - "table_id": 8, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "PAYMENT" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "PAYMENT", "base_type": "type/Float", "database_type": "DOUBLE PRECISION" }, { - "id": 66, - "table_id": 8, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "ACCOUNT_ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "ACCOUNT_ID", - "fk_target_field_id": 24, "base_type": "type/BigInteger", "database_type": "BIGINT", - "semantic_type": "type/FK" - }, - { - "id": 68, - "table_id": 8, + "semantic_type": "type/FK", + "fk_target_field_id": [ + "Sample Database", + "PUBLIC", + "ACCOUNTS", + "ID" + ] + }, + { + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "ID" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "ID", "base_type": "type/BigInteger", "database_type": "BIGINT", "semantic_type": "type/PK" }, { - "id": 70, - "table_id": 8, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "EXPECTED_INVOICE" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "EXPECTED_INVOICE", "base_type": "type/Boolean", "database_type": "BOOLEAN" }, { - "id": 71, - "table_id": 8, + "id": [ + "Sample Database", + "PUBLIC", + "INVOICES", + "DATE_RECEIVED" + ], + "table_id": [ + "Sample Database", + "PUBLIC", + "INVOICES" + ], "name": "DATE_RECEIVED", "base_type": "type/DateTime", "database_type": "TIMESTAMP" diff --git a/package.json b/package.json index 7e2264b..b06c6a0 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,7 @@ "test": "bun test" }, "dependencies": { + "@streamparser/json-node": "^0.0.22", "js-yaml": "^4.1.0" }, "devDependencies": { diff --git a/src/extract-field-values.test.ts b/src/extract-field-values.test.ts deleted file mode 100644 index 021a492..0000000 --- a/src/extract-field-values.test.ts +++ /dev/null @@ -1,262 +0,0 @@ -import { afterEach, beforeEach, describe, expect, it } from "bun:test"; -import { - existsSync, - mkdtempSync, - readFileSync, - rmSync, - writeFileSync, -} from "fs"; -import { tmpdir } from "os"; -import { join, resolve } from "path"; -import yaml from "js-yaml"; - -import { extractFieldValues } from "./extract-field-values.js"; - -const REPO_ROOT = resolve(import.meta.dirname, ".."); -const EXAMPLE_METADATA = join(REPO_ROOT, "examples/v1/table_metadata.json"); -const EXAMPLE_FIELD_VALUES = join(REPO_ROOT, "examples/v1/field_values.json"); - -describe("extractFieldValues", () => { - let workdir: string; - - beforeEach(() => { - workdir = mkdtempSync(join(tmpdir(), "database-metadata-values-")); - }); - - afterEach(() => { - rmSync(workdir, { recursive: true, force: true }); - }); - - it("writes one YAML per non-empty field values entry", () => { - const stats = extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile: EXAMPLE_FIELD_VALUES, - outputFolder: workdir, - }); - - expect(stats).toEqual({ - fieldsWithValues: 4, - fieldsSkipped: 0, - orphans: 0, - }); - - const statePath = join( - workdir, - "Sample Database", - "schemas", - "PUBLIC", - "tables", - "PEOPLE", - "STATE.yaml", - ); - expect(existsSync(statePath)).toBe(true); - }); - - it("emits bare scalars when human_readable_values is empty", () => { - extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile: EXAMPLE_FIELD_VALUES, - outputFolder: workdir, - }); - - const statePath = join( - workdir, - "Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml", - ); - const doc = yaml.load(readFileSync(statePath, "utf8")) as { - field_id: unknown[]; - has_more_values: boolean; - values: unknown[]; - }; - - expect(doc.field_id).toEqual([ - "Sample Database", - "PUBLIC", - "PEOPLE", - "STATE", - ]); - expect(doc.has_more_values).toBe(false); - expect(doc.values.slice(0, 3)).toEqual(["AK", "AL", "AR"]); - }); - - it("emits {value, label} objects when human_readable_values is provided", () => { - extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile: EXAMPLE_FIELD_VALUES, - outputFolder: workdir, - }); - - const ratingPath = join( - workdir, - "Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml", - ); - const doc = yaml.load(readFileSync(ratingPath, "utf8")) as { - values: Array<{ value: number; label: string }>; - }; - - expect(doc.values[0]).toEqual({ value: 0, label: "Unrated" }); - expect(doc.values[5]).toEqual({ value: 5, label: "Excellent" }); - }); - - it("skips entries with an empty values array", () => { - const metadata = JSON.parse(readFileSync(EXAMPLE_METADATA, "utf8")); - const fieldValues = { - field_values: [ - { - field_id: metadata.fields[0].id, - values: [], - human_readable_values: [], - has_more_values: false, - }, - ], - }; - const fieldValuesFile = join(workdir, "field_values.json"); - writeFileSync(fieldValuesFile, JSON.stringify(fieldValues)); - - const stats = extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile, - outputFolder: join(workdir, "out"), - }); - - expect(stats).toEqual({ - fieldsWithValues: 0, - fieldsSkipped: 1, - orphans: 0, - }); - }); - - it("counts and skips orphaned field_ids", () => { - const fieldValues = { - field_values: [ - { - field_id: 999999, - values: ["x"], - human_readable_values: [], - has_more_values: false, - }, - ], - }; - const fieldValuesFile = join(workdir, "field_values.json"); - writeFileSync(fieldValuesFile, JSON.stringify(fieldValues)); - - const stats = extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile, - outputFolder: join(workdir, "out"), - }); - - expect(stats).toEqual({ - fieldsWithValues: 0, - fieldsSkipped: 0, - orphans: 1, - }); - }); - - it("joins nested JSON field paths with dots in the filename", () => { - const metadataPath = join(workdir, "table_metadata.json"); - writeFileSync( - metadataPath, - JSON.stringify({ - databases: [{ id: 1, name: "DB", engine: "postgres" }], - tables: [{ id: 10, db_id: 1, name: "EVENTS", schema: "PUBLIC" }], - fields: [ - { id: 100, table_id: 10, name: "DATA", base_type: "type/Structured" }, - { id: 101, table_id: 10, name: "user", parent_id: 100 }, - { id: 102, table_id: 10, name: "name", parent_id: 101 }, - ], - }), - ); - - const fieldValuesPath = join(workdir, "field_values.json"); - writeFileSync( - fieldValuesPath, - JSON.stringify({ - field_values: [ - { - field_id: 102, - values: ["alice", "bob"], - has_more_values: false, - }, - ], - }), - ); - - const out = join(workdir, "out"); - extractFieldValues({ - metadataFile: metadataPath, - fieldValuesFile: fieldValuesPath, - outputFolder: out, - }); - - const nestedPath = join( - out, - "DB/schemas/PUBLIC/tables/EVENTS/DATA.user.name.yaml", - ); - expect(existsSync(nestedPath)).toBe(true); - - const doc = yaml.load(readFileSync(nestedPath, "utf8")) as { - field_id: unknown[]; - }; - expect(doc.field_id).toEqual([ - "DB", - "PUBLIC", - "EVENTS", - "DATA", - "user", - "name", - ]); - }); - - it("passes has_more_values through unchanged", () => { - const fieldValues = { - field_values: [ - { - field_id: 1, - values: ["foo"], - human_readable_values: [], - has_more_values: true, - }, - ], - }; - const fieldValuesFile = join(workdir, "field_values.json"); - writeFileSync(fieldValuesFile, JSON.stringify(fieldValues)); - - extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile, - outputFolder: workdir, - }); - - const statePath = join( - workdir, - "Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml", - ); - const doc = yaml.load(readFileSync(statePath, "utf8")) as { - has_more_values: boolean; - }; - expect(doc.has_more_values).toBe(true); - }); - - it("regenerates output that matches the bundled examples", () => { - extractFieldValues({ - metadataFile: EXAMPLE_METADATA, - fieldValuesFile: EXAMPLE_FIELD_VALUES, - outputFolder: workdir, - }); - - for (const relative of [ - "Sample Database/schemas/PUBLIC/tables/PEOPLE/STATE.yaml", - "Sample Database/schemas/PUBLIC/tables/PEOPLE/SOURCE.yaml", - "Sample Database/schemas/PUBLIC/tables/PRODUCTS/CATEGORY.yaml", - "Sample Database/schemas/PUBLIC/tables/PRODUCTS/RATING.yaml", - ]) { - const checkedIn = readFileSync( - join(REPO_ROOT, "examples/v1/databases", relative), - "utf8", - ); - const generated = readFileSync(join(workdir, relative), "utf8"); - expect(generated).toBe(checkedIn); - } - }); -}); diff --git a/src/extract-field-values.ts b/src/extract-field-values.ts deleted file mode 100644 index dce3568..0000000 --- a/src/extract-field-values.ts +++ /dev/null @@ -1,125 +0,0 @@ -import { readFileSync } from "node:fs"; -import { join } from "node:path"; - -import { - buildIndex, - createFolder, - escapeFilename, - getFieldKey, - getTablesFolder, - getFieldFilename, - writeYaml, - type FieldID, - type FieldKey, - type RawMetadata, -} from "./lib.js"; - -type RawFieldValueEntry = { - field_id: FieldID; - values: unknown[]; - human_readable_values?: string[]; - has_more_values?: boolean; -}; - -type RawFieldValues = { - field_values: RawFieldValueEntry[]; -}; - -type FormattedValue = unknown | { value: unknown; label: string }; - -type FormattedFieldValues = { - field_id: FieldKey; - has_more_values: boolean; - values: FormattedValue[]; -}; - -export type ExtractFieldValuesOptions = { - metadataFile: string; - fieldValuesFile: string; - outputFolder: string; -}; - -export type ExtractFieldValuesResult = { - fieldsWithValues: number; - fieldsSkipped: number; - orphans: number; -}; - -function formatValues(entry: RawFieldValueEntry): FormattedValue[] { - const labels = entry.human_readable_values ?? []; - return entry.values.map((value, index) => { - const label = labels[index]; - if (label !== undefined && label !== null && label !== "") { - return { value, label }; - } - return value; - }); -} - -export function extractFieldValues({ - metadataFile, - fieldValuesFile, - outputFolder, -}: ExtractFieldValuesOptions): ExtractFieldValuesResult { - const metadata = JSON.parse( - readFileSync(metadataFile, "utf-8"), - ) as RawMetadata; - const rawFieldValues = JSON.parse( - readFileSync(fieldValuesFile, "utf-8"), - ) as RawFieldValues; - - const index = buildIndex(metadata); - const { fieldsById, tablesById, databasesById } = index; - - let fieldsWithValues = 0; - let fieldsSkipped = 0; - let orphans = 0; - const createdFolders = new Set(); - - for (const entry of rawFieldValues.field_values ?? []) { - const field = fieldsById.get(entry.field_id); - if (!field) { - orphans += 1; - console.warn( - `Skipping field values for unknown field_id ${entry.field_id}`, - ); - continue; - } - - if (entry.values.length === 0) { - fieldsSkipped += 1; - continue; - } - - const table = tablesById.get(field.table_id); - const db = table && databasesById.get(table.db_id); - const fieldKey = table && db && getFieldKey(db, table, field, fieldsById); - if (!table || !db || !fieldKey) { - orphans += 1; - console.warn( - `Skipping field values for field_id ${entry.field_id}: could not resolve field path`, - ); - continue; - } - - const tableFolder = join( - getTablesFolder(outputFolder, db, table), - escapeFilename(table.name), - ); - if (!createdFolders.has(tableFolder)) { - createFolder(tableFolder); - createdFolders.add(tableFolder); - } - - const payload: FormattedFieldValues = { - field_id: fieldKey, - has_more_values: entry.has_more_values ?? false, - values: formatValues(entry), - }; - - writeYaml(join(tableFolder, `${getFieldFilename(fieldKey)}.yaml`), payload); - fieldsWithValues += 1; - } - - return { fieldsWithValues, fieldsSkipped, orphans }; -} diff --git a/src/extract-table-metadata.test.ts b/src/extract-table-metadata.test.ts index ad03ed3..0ffb936 100644 --- a/src/extract-table-metadata.test.ts +++ b/src/extract-table-metadata.test.ts @@ -26,8 +26,8 @@ describe("extractTableMetadata", () => { rmSync(workdir, { recursive: true, force: true }); }); - it("extracts the bundled sample database to YAML", () => { - const stats = extractTableMetadata({ + it("extracts the bundled sample database to YAML", async () => { + const stats = await extractTableMetadata({ inputFile: EXAMPLE_INPUT, outputFolder: workdir, }); @@ -48,8 +48,11 @@ describe("extractTableMetadata", () => { expect(existsSync(ordersPath)).toBe(true); }); - it("strips numeric ids and uses natural-key db_id on tables", () => { - extractTableMetadata({ inputFile: EXAMPLE_INPUT, outputFolder: workdir }); + it("strips numeric ids and uses natural-key db_id on tables", async () => { + await extractTableMetadata({ + inputFile: EXAMPLE_INPUT, + outputFolder: workdir, + }); const tablePath = join( workdir, "Sample Database", @@ -68,8 +71,11 @@ describe("extractTableMetadata", () => { expect(Array.isArray(table.fields)).toBe(true); }); - it("rewrites fk_target_field_id as a natural-key tuple", () => { - extractTableMetadata({ inputFile: EXAMPLE_INPUT, outputFolder: workdir }); + it("rewrites fk_target_field_id as a natural-key tuple", async () => { + await extractTableMetadata({ + inputFile: EXAMPLE_INPUT, + outputFolder: workdir, + }); const tablePath = join( workdir, "Sample Database", @@ -92,26 +98,29 @@ describe("extractTableMetadata", () => { ]); }); - it("escapes slashes in entity names", () => { + it("escapes slashes in entity names", async () => { const input = join(workdir, "input.json"); writeFileSync( input, JSON.stringify({ - databases: [{ id: 1, name: "weird/name" }], + databases: [{ id: "weird/name", name: "weird/name", engine: "h2" }], tables: [], fields: [], }), ); const out = join(workdir, "out"); - extractTableMetadata({ inputFile: input, outputFolder: out }); + await extractTableMetadata({ inputFile: input, outputFolder: out }); expect( existsSync(join(out, "weird__SLASH__name", "weird__SLASH__name.yaml")), ).toBe(true); }); - it("regenerates output that matches the bundled examples", () => { - extractTableMetadata({ inputFile: EXAMPLE_INPUT, outputFolder: workdir }); + it("regenerates output that matches the bundled examples", async () => { + await extractTableMetadata({ + inputFile: EXAMPLE_INPUT, + outputFolder: workdir, + }); const checkedIn = readFileSync( join( diff --git a/src/extract-table-metadata.ts b/src/extract-table-metadata.ts index 526a530..d51fe37 100644 --- a/src/extract-table-metadata.ts +++ b/src/extract-table-metadata.ts @@ -1,45 +1,40 @@ -import { readFileSync } from "node:fs"; - -import { - buildIndex, - createFolder, - getDatabaseFolder, - getDatabaseKey, - getDatabasePath, - getFieldKey, - getTablePath, - getTablesFolder, - writeYaml, - type DatabaseKey, - type FieldKey, - type MetadataIndex, - type RawDatabase, - type RawField, - type RawMetadata, - type RawTable, -} from "./lib.js"; +import { createReadStream } from "node:fs"; +import { access, appendFile, mkdir, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import yaml from "js-yaml"; +import { JSONParser } from "@streamparser/json-node"; + +type DatabaseId = string; +type TableId = [DatabaseId, string | null, string]; +type FieldId = [...TableId, string, ...string[]]; type Database = { + id: DatabaseId; name: string; engine: string; }; -type Field = { +type Table = { + id: TableId; + db_id: DatabaseId; name: string; + schema: string | null; description?: string; - base_type?: string; - database_type?: string; - semantic_type?: string; - parent_id?: FieldKey; - fk_target_field_id?: FieldKey; }; -type Table = { +type Field = { + id: FieldId; + table_id: TableId; name: string; - schema: string | null; description?: string; - db_id: DatabaseKey; - fields: Field[]; + base_type?: string; + database_type?: string; + effective_type?: string; + semantic_type?: string; + coercion_strategy?: string; + parent_id?: FieldId; + fk_target_field_id?: FieldId; + nfc_path?: string[]; }; export type ExtractMetadataOptions = { @@ -53,86 +48,131 @@ export type ExtractMetadataResult = { fields: number; }; -function formatDatabase(db: RawDatabase): Database { - const { id: _id, ...result } = db; - return result; +const YAML_OPTS = { lineWidth: -1, noRefs: true } as const; + +async function fileExists(path: string): Promise { + try { + await access(path); + return true; + } catch { + return false; + } } -function formatTable(db: RawDatabase, table: RawTable): Omit { - const { id: _id, db_id: _db_id, ...rest } = table; - return { ...rest, db_id: getDatabaseKey(db) }; +function escapeFilename(name: string): string { + return name.replace(/\//g, "__SLASH__").replace(/\\/g, "__BACKSLASH__"); } -function formatField( - db: RawDatabase, - table: RawTable, - field: RawField, - index: MetadataIndex, -): Field { - const { fieldsById, tablesById, databasesById } = index; - const { - id: _id, - table_id: _table_id, - parent_id, - fk_target_field_id, - ...rest - } = field; - const result: Field = { ...rest }; - // Silently drop parent_id / fk_target_field_id if the referenced entity can't be resolved. - if (parent_id) { - const parent = fieldsById.get(parent_id); - const parentKey = parent && getFieldKey(db, table, parent, fieldsById); - if (parentKey) { - result.parent_id = parentKey; - } - } - if (fk_target_field_id) { - const targetField = fieldsById.get(fk_target_field_id); - const targetTable = targetField && tablesById.get(targetField.table_id); - const targetDb = targetTable && databasesById.get(targetTable.db_id); - const targetKey = - targetDb && - targetTable && - targetField && - getFieldKey(targetDb, targetTable, targetField, fieldsById); - if (targetKey) { - result.fk_target_field_id = targetKey; - } - } - return result; +function getDatabasePath(outputFolder: string, dbName: string): string { + const safe = escapeFilename(dbName); + return join(outputFolder, safe, `${safe}.yaml`); } -function buildStats(metadata: RawMetadata): ExtractMetadataResult { - return { - databases: metadata.databases.length, - tables: metadata.tables.length, - fields: metadata.fields.length, - }; +function getTablePath( + outputFolder: string, + dbName: DatabaseId, + tableSchema: string | null, + tableName: string, +): string { + const dbFolder = join(outputFolder, escapeFilename(dbName)); + const tablesFolder = tableSchema + ? join(dbFolder, "schemas", escapeFilename(tableSchema), "tables") + : join(dbFolder, "tables"); + return join(tablesFolder, `${escapeFilename(tableName)}.yaml`); } -export function extractTableMetadata({ +function indentLines(text: string, prefix: string): string { + return text.replace(/^(?=.)/gm, prefix); +} + +function formatDatabase(db: Database) { + const { id: _id, ...rest } = db; + return rest; +} + +function formatTable(table: Table) { + const { id: _id, ...rest } = table; + return rest; +} + +function formatField(field: Field) { + const { id: _id, table_id: _table_id, ...rest } = field; + return rest; +} + +function createParser(inputFile: string, key: string): JSONParser { + const parser = new JSONParser({ + paths: [`$.${key}.*`], + keepStack: false, + }); + createReadStream(inputFile).pipe(parser); + return parser; +} + +export async function extractTableMetadata({ inputFile, outputFolder, -}: ExtractMetadataOptions): ExtractMetadataResult { - const metadata = JSON.parse(readFileSync(inputFile, "utf-8")) as RawMetadata; - const index = buildIndex(metadata); - const { databases, tablesByDbId, fieldsByTableId } = index; - - for (const db of databases) { - createFolder(getDatabaseFolder(outputFolder, db)); - writeYaml(getDatabasePath(outputFolder, db), formatDatabase(db)); - - for (const table of tablesByDbId.get(db.id) ?? []) { - const fields = (fieldsByTableId.get(table.id) ?? []).map((field) => - formatField(db, table, field, index), - ); - createFolder(getTablesFolder(outputFolder, db, table)); - writeYaml(getTablePath(outputFolder, db, table), { - ...formatTable(db, table), - fields, - }); +}: ExtractMetadataOptions): Promise { + let databases = 0; + let tables = 0; + let fields = 0; + + // Pass 1 — databases: write each database yaml. + for await (const { value } of createParser(inputFile, "databases")) { + const db: Database = value; + await mkdir(join(outputFolder, escapeFilename(db.name)), { + recursive: true, + }); + await writeFile( + getDatabasePath(outputFolder, db.name), + yaml.dump(formatDatabase(db), YAML_OPTS), + ); + databases++; + } + + // Pass 2 — fields (touch): create an empty file at each parent table's path so pass 3 + // can detect "this table has fields" via fileExists. + for await (const { value } of createParser(inputFile, "fields")) { + const field: Field = value; + const [dbName, tableSchema, tableName] = field.table_id; + const path = getTablePath(outputFolder, dbName, tableSchema, tableName); + if (!(await fileExists(path))) { + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, ""); } } - return buildStats(metadata); + // Pass 3 — tables: write each table's metadata; when pass 2 already created the file, + // append a bare `fields:` so pass 4 can stream items underneath. + for await (const { value } of createParser(inputFile, "tables")) { + const table: Table = value; + const path = getTablePath( + outputFolder, + table.db_id, + table.schema, + table.name, + ); + const hasFields = await fileExists(path); + if (!hasFields) { + await mkdir(dirname(path), { recursive: true }); + } + let content = yaml.dump(formatTable(table), YAML_OPTS); + if (hasFields) { + content += "fields:\n"; + } + await writeFile(path, content); + tables++; + } + + // Pass 4 — fields (write): append each field as a 2-space-indented YAML list item. + for await (const { value } of createParser(inputFile, "fields")) { + const field: Field = value; + const [dbName, tableSchema, tableName] = field.table_id; + const path = getTablePath(outputFolder, dbName, tableSchema, tableName); + const item = yaml.dump([formatField(field)], YAML_OPTS); + await appendFile(path, indentLines(item, " ")); + fields++; + } + + return { databases, tables, fields }; } diff --git a/src/index.ts b/src/index.ts index dc82dcf..96f8d6f 100644 --- a/src/index.ts +++ b/src/index.ts @@ -3,11 +3,6 @@ export { type ExtractMetadataOptions, type ExtractMetadataResult, } from "./extract-table-metadata.js"; -export { - extractFieldValues, - type ExtractFieldValuesOptions, - type ExtractFieldValuesResult, -} from "./extract-field-values.js"; export { extractSpec, type ExtractSpecOptions, diff --git a/src/lib.ts b/src/lib.ts deleted file mode 100644 index 30d3284..0000000 --- a/src/lib.ts +++ /dev/null @@ -1,164 +0,0 @@ -import { mkdirSync, writeFileSync } from "node:fs"; -import { join } from "node:path"; -import yaml from "js-yaml"; - -export type DatabaseID = number; -type TableID = number; -export type FieldID = number; - -export type DatabaseKey = string; -export type TableKey = [DatabaseKey, string | null, string]; -export type FieldKey = [...TableKey, string, ...string[]]; - -export type RawDatabase = { - id: DatabaseID; - name: string; - engine: string; -}; - -export type RawTable = { - id: TableID; - db_id: DatabaseID; - name: string; - schema: string | null; - description?: string; -}; - -export type RawField = { - id: FieldID; - table_id: TableID; - name: string; - description?: string; - base_type?: string; - database_type?: string; - semantic_type?: string; - parent_id?: FieldID | null; - fk_target_field_id?: FieldID | null; -}; - -export type RawMetadata = { - databases: RawDatabase[]; - tables: RawTable[]; - fields: RawField[]; -}; - -export type MetadataIndex = { - databases: RawDatabase[]; - databasesById: Map; - tablesByDbId: Map; - tablesById: Map; - fieldsByTableId: Map; - fieldsById: Map; -}; - -export function escapeFilename(name: string): string { - return name.replace(/\//g, "__SLASH__").replace(/\\/g, "__BACKSLASH__"); -} - -// Field-values filenames join nested JSON paths with dots, so a literal dot in -// a field segment would produce an ambiguous path (e.g. `a.b.yaml` could mean -// either a nested path or a single field named `a.b`). We escape dots inside -// each segment before joining. -function escapeFieldSegment(name: string): string { - return escapeFilename(name).replace(/\./g, "__DOT__"); -} - -export function getDatabaseFolder( - outputFolder: string, - db: RawDatabase, -): string { - return join(outputFolder, escapeFilename(db.name)); -} - -export function getTablesFolder( - outputFolder: string, - db: RawDatabase, - table: RawTable, -): string { - const dbFolder = getDatabaseFolder(outputFolder, db); - if (table.schema) { - return join(dbFolder, "schemas", escapeFilename(table.schema), "tables"); - } - return join(dbFolder, "tables"); -} - -export function getDatabasePath(outputFolder: string, db: RawDatabase): string { - return join( - getDatabaseFolder(outputFolder, db), - `${escapeFilename(db.name)}.yaml`, - ); -} - -export function getTablePath( - outputFolder: string, - db: RawDatabase, - table: RawTable, -): string { - return join( - getTablesFolder(outputFolder, db, table), - `${escapeFilename(table.name)}.yaml`, - ); -} - -export function getDatabaseKey(db: RawDatabase): DatabaseKey { - return db.name; -} - -export function getTableKey(db: RawDatabase, table: RawTable): TableKey { - return [getDatabaseKey(db), table.schema ?? null, table.name]; -} - -export function getFieldKey( - db: RawDatabase, - table: RawTable, - field: RawField, - fieldsById: Map, -): FieldKey | null { - if (!field.parent_id) { - return [...getTableKey(db, table), field.name]; - } - const parent = fieldsById.get(field.parent_id); - if (!parent) { - return null; - } - const parentKey = getFieldKey(db, table, parent, fieldsById); - return parentKey && [...parentKey, field.name]; -} - -export function getFieldFilename(fieldKey: FieldKey): string { - const [, , , ...fieldPath] = fieldKey; - return fieldPath.map(escapeFieldSegment).join("."); -} - -export function createFolder(folderPath: string): void { - mkdirSync(folderPath, { recursive: true }); -} - -export function writeYaml(filePath: string, data: unknown): void { - writeFileSync(filePath, yaml.dump(data, { lineWidth: -1, noRefs: true })); -} - -function groupBy(items: T[], keyFn: (item: T) => K): Map { - const result = new Map(); - for (const item of items) { - const key = keyFn(item); - const existing = result.get(key); - if (existing) { - existing.push(item); - } else { - result.set(key, [item]); - } - } - return result; -} - -export function buildIndex(metadata: RawMetadata): MetadataIndex { - return { - databases: metadata.databases, - databasesById: new Map(metadata.databases.map((d) => [d.id, d])), - tablesByDbId: groupBy(metadata.tables, (t) => t.db_id), - tablesById: new Map(metadata.tables.map((t) => [t.id, t])), - fieldsByTableId: groupBy(metadata.fields, (f) => f.table_id), - fieldsById: new Map(metadata.fields.map((f) => [f.id, f])), - }; -} From 86c2f47b6a9d518a5b34f712e18e49a527647731 Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:19:33 -0400 Subject: [PATCH 2/7] fixes --- src/extract-table-metadata.ts | 247 ++++++++++++++++++++++++---------- 1 file changed, 173 insertions(+), 74 deletions(-) diff --git a/src/extract-table-metadata.ts b/src/extract-table-metadata.ts index d51fe37..bdd1a9f 100644 --- a/src/extract-table-metadata.ts +++ b/src/extract-table-metadata.ts @@ -1,5 +1,10 @@ -import { createReadStream } from "node:fs"; -import { access, appendFile, mkdir, writeFile } from "node:fs/promises"; +import { + appendFileSync, + createReadStream, + existsSync, + mkdirSync, + writeFileSync, +} from "node:fs"; import { dirname, join } from "node:path"; import yaml from "js-yaml"; import { JSONParser } from "@streamparser/json-node"; @@ -48,16 +53,18 @@ export type ExtractMetadataResult = { fields: number; }; -const YAML_OPTS = { lineWidth: -1, noRefs: true } as const; +type Order = "tables-first" | "fields-first"; -async function fileExists(path: string): Promise { - try { - await access(path); - return true; - } catch { - return false; - } -} +type TouchState = { + lastTouched: string | null +}; + +type FieldState = { + buffer: string + bufferedPath: string | null; +}; + +const YAML_OPTS = { lineWidth: -1, noRefs: true } as const; function escapeFilename(name: string): string { return name.replace(/\//g, "__SLASH__").replace(/\\/g, "__BACKSLASH__"); @@ -100,79 +107,171 @@ function formatField(field: Field) { return rest; } -function createParser(inputFile: string, key: string): JSONParser { - const parser = new JSONParser({ - paths: [`$.${key}.*`], - keepStack: false, - }); - createReadStream(inputFile).pipe(parser); - return parser; +function isDatabase(value: unknown): value is Database { + return typeof value === "object" && value !== null && "engine" in value; } -export async function extractTableMetadata({ - inputFile, - outputFolder, -}: ExtractMetadataOptions): Promise { - let databases = 0; - let tables = 0; - let fields = 0; - - // Pass 1 — databases: write each database yaml. - for await (const { value } of createParser(inputFile, "databases")) { - const db: Database = value; - await mkdir(join(outputFolder, escapeFilename(db.name)), { - recursive: true, - }); - await writeFile( - getDatabasePath(outputFolder, db.name), - yaml.dump(formatDatabase(db), YAML_OPTS), - ); - databases++; +function isField(value: unknown): value is Field { + return typeof value === "object" && value !== null && "table_id" in value; +} + +function isTable(value: unknown): value is Table { + return typeof value === "object" && value !== null && "db_id" in value; +} + +// --- Subpass 1: write a database yaml file. +function writeDatabase( + outputFolder: string, + db: Database, + stats: ExtractMetadataResult, +): void { + mkdirSync(join(outputFolder, escapeFilename(db.name)), { recursive: true }); + writeFileSync( + getDatabasePath(outputFolder, db.name), + yaml.dump(formatDatabase(db), YAML_OPTS), + ); + stats.databases++; +} + +// --- Subpass 2: touch each parent table file so the table phase can detect "has fields" +// via existsSync. Skips the syscall for runs of consecutive fields sharing a path. +function touchTableFile( + outputFolder: string, + field: Field, + state: TouchState, +): void { + const [dbName, tableSchema, tableName] = field.table_id; + const path = getTablePath(outputFolder, dbName, tableSchema, tableName); + if (path === state.lastTouched) { + return; } + if (!existsSync(path)) { + mkdirSync(dirname(path), { recursive: true }); + writeFileSync(path, ""); + } + state.lastTouched = path; +} - // Pass 2 — fields (touch): create an empty file at each parent table's path so pass 3 - // can detect "this table has fields" via fileExists. - for await (const { value } of createParser(inputFile, "fields")) { - const field: Field = value; - const [dbName, tableSchema, tableName] = field.table_id; - const path = getTablePath(outputFolder, dbName, tableSchema, tableName); - if (!(await fileExists(path))) { - await mkdir(dirname(path), { recursive: true }); - await writeFile(path, ""); - } +// --- Subpass 3: write the table yaml; if the file already exists (touched by subpass 2), +// append a bare `fields:` trailer so subpass 4 can stream items underneath. +function writeTable( + outputFolder: string, + table: Table, + stats: ExtractMetadataResult, +): void { + const path = getTablePath( + outputFolder, + table.db_id, + table.schema, + table.name, + ); + const hasFields = existsSync(path); + if (!hasFields) { + mkdirSync(dirname(path), { recursive: true }); + } + let content = yaml.dump(formatTable(table), YAML_OPTS); + if (hasFields) { + content += "fields:\n"; } + writeFileSync(path, content); + stats.tables++; +} - // Pass 3 — tables: write each table's metadata; when pass 2 already created the file, - // append a bare `fields:` so pass 4 can stream items underneath. - for await (const { value } of createParser(inputFile, "tables")) { - const table: Table = value; - const path = getTablePath( - outputFolder, - table.db_id, - table.schema, - table.name, - ); - const hasFields = await fileExists(path); - if (!hasFields) { - await mkdir(dirname(path), { recursive: true }); +// --- Subpass 4: append a field as a 2-space-indented YAML list item, buffering +// consecutive fields sharing a path so they coalesce into one appendFileSync per table. +// The caller flushes the trailing buffer once the stream ends. +function writeField( + outputFolder: string, + field: Field, + state: FieldState, + stats: ExtractMetadataResult, +): void { + const [dbName, tableSchema, tableName] = field.table_id; + const path = getTablePath(outputFolder, dbName, tableSchema, tableName); + if (path !== state.bufferedPath) { + if (state.bufferedPath !== null) { + appendFileSync(state.bufferedPath, state.buffer); } - let content = yaml.dump(formatTable(table), YAML_OPTS); - if (hasFields) { - content += "fields:\n"; + state.bufferedPath = path; + state.buffer = ""; + } + state.buffer += indentLines(yaml.dump([formatField(field)], YAML_OPTS), " "); + stats.fields++; +} + +function streamAll(inputFile: string, paths: string[]): JSONParser { + const parser = new JSONParser({ paths, keepStack: false }); + createReadStream(inputFile).pipe(parser); + return parser; +} + +// --- Pass 1: stream the entire JSON. Always run subpass 1 (dbs) + subpass 2 (touch). +// Detect order from the first non-database hit; if fields appear before tables, also run +// subpass 3 (writeTable) here so pass 2 only has to write fields. +async function firstPass( + inputFile: string, + outputFolder: string, + stats: ExtractMetadataResult, +): Promise { + let order: Order | null = null; + const state: TouchState = { lastTouched: null }; + + for await (const { value } of streamAll(inputFile, [ + "$.databases.*", + "$.tables.*", + "$.fields.*", + ])) { + if (isDatabase(value)) { + writeDatabase(outputFolder, value, stats); + } else if (isField(value)) { + if (order === null) { + order = "fields-first"; + } + touchTableFile(outputFolder, value, state); + } else if (isTable(value)) { + if (order === null) { + order = "tables-first"; + } + if (order === "fields-first") { + writeTable(outputFolder, value, stats); + } + // tables-first: skip — pass 2 will write them. } - await writeFile(path, content); - tables++; } - // Pass 4 — fields (write): append each field as a 2-space-indented YAML list item. - for await (const { value } of createParser(inputFile, "fields")) { - const field: Field = value; - const [dbName, tableSchema, tableName] = field.table_id; - const path = getTablePath(outputFolder, dbName, tableSchema, tableName); - const item = yaml.dump([formatField(field)], YAML_OPTS); - await appendFile(path, indentLines(item, " ")); - fields++; + return order ?? "tables-first"; +} + +// --- Pass 2: in tables-first mode, run subpass 3 (writeTable) + subpass 4 (writeField). +// In fields-first mode, only subpass 4 — tables were already written in pass 1. +async function secondPass( + inputFile: string, + outputFolder: string, + order: Order, + stats: ExtractMetadataResult, +): Promise { + const state: FieldState = { buffer: "", bufferedPath: null }; + const paths = + order === "tables-first" ? ["$.tables.*", "$.fields.*"] : ["$.fields.*"]; + + for await (const { value } of streamAll(inputFile, paths)) { + if (isTable(value)) { + writeTable(outputFolder, value, stats); + } else if (isField(value)) { + writeField(outputFolder, value, state, stats); + } } + if (state.bufferedPath !== null) { + appendFileSync(state.bufferedPath, state.buffer); + } +} - return { databases, tables, fields }; +export async function extractTableMetadata({ + inputFile, + outputFolder, +}: ExtractMetadataOptions): Promise { + const stats: ExtractMetadataResult = { databases: 0, tables: 0, fields: 0 }; + const order = await firstPass(inputFile, outputFolder, stats); + await secondPass(inputFile, outputFolder, order, stats); + return stats; } From e50cf44d65be9bc50e66980662b20f8bdf30ef8e Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:42:31 -0400 Subject: [PATCH 3/7] limit --- src/extract-table-metadata.ts | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/src/extract-table-metadata.ts b/src/extract-table-metadata.ts index bdd1a9f..6a054f2 100644 --- a/src/extract-table-metadata.ts +++ b/src/extract-table-metadata.ts @@ -60,12 +60,15 @@ type TouchState = { }; type FieldState = { - buffer: string + buffer: string; bufferedPath: string | null; }; const YAML_OPTS = { lineWidth: -1, noRefs: true } as const; +// Per-table field buffer size before flushing +const FIELD_BUFFER_LIMIT = 1024 * 1024; + function escapeFilename(name: string): string { return name.replace(/\//g, "__SLASH__").replace(/\\/g, "__BACKSLASH__"); } @@ -177,8 +180,16 @@ function writeTable( stats.tables++; } +function flushFieldBuffer(state: FieldState): void { + if (state.bufferedPath !== null) { + appendFileSync(state.bufferedPath, state.buffer); + state.buffer = ""; + } +} + // --- Subpass 4: append a field as a 2-space-indented YAML list item, buffering // consecutive fields sharing a path so they coalesce into one appendFileSync per table. +// Wide tables flush mid-stream once the buffer exceeds FIELD_BUFFER_LIMIT bytes. // The caller flushes the trailing buffer once the stream ends. function writeField( outputFolder: string, @@ -189,13 +200,13 @@ function writeField( const [dbName, tableSchema, tableName] = field.table_id; const path = getTablePath(outputFolder, dbName, tableSchema, tableName); if (path !== state.bufferedPath) { - if (state.bufferedPath !== null) { - appendFileSync(state.bufferedPath, state.buffer); - } + flushFieldBuffer(state); state.bufferedPath = path; - state.buffer = ""; } state.buffer += indentLines(yaml.dump([formatField(field)], YAML_OPTS), " "); + if (state.buffer.length >= FIELD_BUFFER_LIMIT) { + flushFieldBuffer(state); + } stats.fields++; } @@ -261,9 +272,7 @@ async function secondPass( writeField(outputFolder, value, state, stats); } } - if (state.bufferedPath !== null) { - appendFileSync(state.bufferedPath, state.buffer); - } + flushFieldBuffer(state); } export async function extractTableMetadata({ From e9e6dd90441f64e1ef7fe9dbe2afc9ff8bf013ec Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:44:24 -0400 Subject: [PATCH 4/7] fix --- src/extract-table-metadata.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/extract-table-metadata.ts b/src/extract-table-metadata.ts index 6a054f2..f4283da 100644 --- a/src/extract-table-metadata.ts +++ b/src/extract-table-metadata.ts @@ -122,7 +122,7 @@ function isTable(value: unknown): value is Table { return typeof value === "object" && value !== null && "db_id" in value; } -// --- Subpass 1: write a database yaml file. +// Subpass 1: write a database yaml file. function writeDatabase( outputFolder: string, db: Database, @@ -136,7 +136,7 @@ function writeDatabase( stats.databases++; } -// --- Subpass 2: touch each parent table file so the table phase can detect "has fields" +// Subpass 2: touch each parent table file so the table phase can detect "has fields" // via existsSync. Skips the syscall for runs of consecutive fields sharing a path. function touchTableFile( outputFolder: string, @@ -155,7 +155,7 @@ function touchTableFile( state.lastTouched = path; } -// --- Subpass 3: write the table yaml; if the file already exists (touched by subpass 2), +// Subpass 3: write the table yaml; if the file already exists (touched by subpass 2), // append a bare `fields:` trailer so subpass 4 can stream items underneath. function writeTable( outputFolder: string, @@ -187,7 +187,7 @@ function flushFieldBuffer(state: FieldState): void { } } -// --- Subpass 4: append a field as a 2-space-indented YAML list item, buffering +// Subpass 4: append a field as a 2-space-indented YAML list item, buffering // consecutive fields sharing a path so they coalesce into one appendFileSync per table. // Wide tables flush mid-stream once the buffer exceeds FIELD_BUFFER_LIMIT bytes. // The caller flushes the trailing buffer once the stream ends. @@ -216,7 +216,7 @@ function streamAll(inputFile: string, paths: string[]): JSONParser { return parser; } -// --- Pass 1: stream the entire JSON. Always run subpass 1 (dbs) + subpass 2 (touch). +// Pass 1: stream the entire JSON. Always run subpass 1 (dbs) + subpass 2 (touch). // Detect order from the first non-database hit; if fields appear before tables, also run // subpass 3 (writeTable) here so pass 2 only has to write fields. async function firstPass( @@ -253,7 +253,7 @@ async function firstPass( return order ?? "tables-first"; } -// --- Pass 2: in tables-first mode, run subpass 3 (writeTable) + subpass 4 (writeField). +// Pass 2: in tables-first mode, run subpass 3 (writeTable) + subpass 4 (writeField). // In fields-first mode, only subpass 4 — tables were already written in pass 1. async function secondPass( inputFile: string, From 201e2c9d04a99df794d24e1251cc1d22edd0c28d Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:48:19 -0400 Subject: [PATCH 5/7] fix --- examples/v1/{table_metadata.json => metadata.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/v1/{table_metadata.json => metadata.json} (100%) diff --git a/examples/v1/table_metadata.json b/examples/v1/metadata.json similarity index 100% rename from examples/v1/table_metadata.json rename to examples/v1/metadata.json From 3e12535d8d33d8a30580b9917faeabeb6ea8d2d3 Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:48:19 -0400 Subject: [PATCH 6/7] fix --- README.md | 22 +++++++++++----------- bin/cli.test.ts | 2 +- core-spec/v1/spec.md | 2 +- package.json | 2 +- src/extract-table-metadata.test.ts | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 1bfb070..50333f7 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,13 @@ Metabase represents database metadata — synced databases, their tables, and their fields — as a tree of YAML files. Files are diff-friendly: numeric IDs are omitted entirely, and foreign keys use natural-key tuples like `["Sample Database", "PUBLIC", "ORDERS"]` instead of database identifiers. -This repository contains the specification, examples, and a CLI that converts the `table_metadata.json` downloaded from a Metabase instance into YAML. +This repository contains the specification, examples, and a CLI that converts the `metadata.json` downloaded from a Metabase instance into YAML. ## Specification The format is defined in **[core-spec/v1/spec.md](core-spec/v1/spec.md)** (v1.0.4). It covers entity keys, field types, folder structure, and the shape of each entity. -Reference output for the Sample Database lives in **[examples/v1/](examples/v1/)** — both the raw `table_metadata.json` and the extracted YAML tree. +Reference output for the Sample Database lives in **[examples/v1/](examples/v1/)** — both the raw `metadata.json` and the extracted YAML tree. ### Entities @@ -20,7 +20,7 @@ Reference output for the Sample Database lives in **[examples/v1/](examples/v1/) ## Obtaining metadata -Metadata is fetched from Metabase's `GET /api/ee/serialization/metadata/export` endpoint as a `table_metadata.json` file — a flat JSON document with three arrays (`databases`, `tables`, and `fields`) streamed so even warehouses with very large schemas can be exported without exhausting server memory. +Metadata is fetched from Metabase's `GET /api/ee/serialization/metadata/export` endpoint as a `metadata.json` file — a flat JSON document with three arrays (`databases`, `tables`, and `fields`) streamed so even warehouses with very large schemas can be exported without exhausting server memory. ### Extracting metadata to YAML @@ -30,7 +30,7 @@ The CLI turns that JSON into the human- and agent-friendly YAML tree described i bunx @metabase/database-metadata extract-table-metadata ``` -- `` — path to the `table_metadata.json` downloaded from Metabase. +- `` — path to the `metadata.json` downloaded from Metabase. - `` — destination directory. Database folders are created directly under it. ### Extracting the spec @@ -49,11 +49,11 @@ The following is the **default** workflow for a project that wants to use Metaba ### 1. A `.metadata/` directory at the repo root -Create a top-level `.metadata/` directory and **add it to `.gitignore`**. This is where the raw `table_metadata.json` and the extracted `databases/` YAML tree live: +Create a top-level `.metadata/` directory and **add it to `.gitignore`**. This is where the raw `metadata.json` and the extracted `databases/` YAML tree live: ``` .metadata/ -├── table_metadata.json +├── metadata.json └── databases/ └── … ``` @@ -70,17 +70,17 @@ Each developer (or a CI job) fetches metadata on demand from their own Metabase ### 3. Download from Metabase and extract -Each developer downloads `table_metadata.json` from their Metabase instance and drops it into `.metadata/`. Then run the extractor: +Each developer downloads `metadata.json` from their Metabase instance and drops it into `.metadata/`. Then run the extractor: ```sh mkdir -p .metadata -# Drop table_metadata.json from Metabase into .metadata/ +# Drop metadata.json from Metabase into .metadata/ rm -rf .metadata/databases -bunx @metabase/database-metadata extract-table-metadata .metadata/table_metadata.json .metadata/databases +bunx @metabase/database-metadata extract-table-metadata .metadata/metadata.json .metadata/databases ``` -After this, tools and agents should read the YAML tree under `.metadata/databases/` — not `table_metadata.json`, which exists only as input to the extractor. +After this, tools and agents should read the YAML tree under `.metadata/databases/` — not `metadata.json`, which exists only as input to the extractor. ## Publishing to NPM @@ -94,7 +94,7 @@ The workflow requires an `NPM_RELEASE_TOKEN` secret with publish access to the ` ```sh bun install -bun bin/cli.ts extract-table-metadata examples/v1/table_metadata.json /tmp/.metadata/databases +bun bin/cli.ts extract-table-metadata examples/v1/metadata.json /tmp/.metadata/databases ``` ### Scripts diff --git a/bin/cli.test.ts b/bin/cli.test.ts index 660f63c..5ecbf49 100644 --- a/bin/cli.test.ts +++ b/bin/cli.test.ts @@ -5,7 +5,7 @@ import { join, resolve } from "path"; const REPO_ROOT = resolve(import.meta.dirname, ".."); const CLI = "bin/cli.ts"; -const EXAMPLE_INPUT = "examples/v1/table_metadata.json"; +const EXAMPLE_INPUT = "examples/v1/metadata.json"; type RunResult = { stdout: string; diff --git a/core-spec/v1/spec.md b/core-spec/v1/spec.md index ef6bb71..ec8ef27 100644 --- a/core-spec/v1/spec.md +++ b/core-spec/v1/spec.md @@ -8,7 +8,7 @@ Metabase database metadata is a read-only snapshot of databases, tables, and fie The format is designed to be **portable** and **reviewable**: numeric IDs are omitted or replaced with human-readable natural keys (database name, `[database, schema, table]` tuples, etc.). Files can be diffed, grepped, and edited by hand. -The raw `table_metadata.json` is a single flat JSON document with `databases`, `tables`, and `fields` arrays, optimized for transport rather than reading. It can be arbitrarily large — tens or hundreds of megabytes on warehouses with many tables — and is not intended for direct consumption. Tools and humans should read the extracted YAML tree under `databases/` instead, where each entity lives in its own small file. +The raw `metadata.json` is a single flat JSON document with `databases`, `tables`, and `fields` arrays, optimized for transport rather than reading. It can be arbitrarily large — tens or hundreds of megabytes on warehouses with many tables — and is not intended for direct consumption. Tools and humans should read the extracted YAML tree under `databases/` instead, where each entity lives in its own small file. ## Table of Contents diff --git a/package.json b/package.json index b06c6a0..9786f00 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@metabase/database-metadata", - "version": "1.0.5", + "version": "1.0.6", "description": "CLI tool to extract Metabase database metadata into YAML files", "license": "SEE LICENSE IN LICENSE.txt", "repository": { diff --git a/src/extract-table-metadata.test.ts b/src/extract-table-metadata.test.ts index 0ffb936..38fb274 100644 --- a/src/extract-table-metadata.test.ts +++ b/src/extract-table-metadata.test.ts @@ -13,7 +13,7 @@ import yaml from "js-yaml"; import { extractTableMetadata } from "./extract-table-metadata.js"; const REPO_ROOT = resolve(import.meta.dirname, ".."); -const EXAMPLE_INPUT = join(REPO_ROOT, "examples/v1/table_metadata.json"); +const EXAMPLE_INPUT = join(REPO_ROOT, "examples/v1/metadata.json"); describe("extractTableMetadata", () => { let workdir: string; From 14f49b30901599e325ced3f312691444c459301f Mon Sep 17 00:00:00 2001 From: Alexander Polyankin Date: Mon, 4 May 2026 22:51:58 -0400 Subject: [PATCH 7/7] fix --- .github/workflows/validate.yml | 5 +---- src/extract-table-metadata.ts | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 3517c52..f350112 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -18,10 +18,7 @@ jobs: - run: bun install - name: Extract metadata - run: bun run bin/cli.ts extract-table-metadata examples/v1/table_metadata.json /tmp/databases - - - name: Extract field values - run: bun run bin/cli.ts extract-field-values examples/v1/table_metadata.json examples/v1/field_values.json /tmp/databases + run: bun run bin/cli.ts extract-table-metadata examples/v1/metadata.json /tmp/databases - name: Diff examples run: diff -r examples/v1/databases /tmp/databases diff --git a/src/extract-table-metadata.ts b/src/extract-table-metadata.ts index f4283da..7ddfb68 100644 --- a/src/extract-table-metadata.ts +++ b/src/extract-table-metadata.ts @@ -56,7 +56,7 @@ export type ExtractMetadataResult = { type Order = "tables-first" | "fields-first"; type TouchState = { - lastTouched: string | null + lastTouched: string | null; }; type FieldState = {