Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ log = "0.4"
tracing = "0.1"
wxyc-etl = "0.3.0"
postgres = "0.19"
rusqlite = { version = "0.31", features = ["bundled"] }

[dev-dependencies]
assert_cmd = "2"
Expand Down
66 changes: 66 additions & 0 deletions migrations/0002_wxyc_library_v2.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
-- wxyc_library v2 hook (consolidated cross-cache identity schema)
--
-- Lands E1 §4.1.3 of the cross-cache-identity plan:
-- https://github.com/WXYC/wiki/blob/main/plans/library-hook-canonicalization.md#413-homebrew-wikidata-port-5435
--
-- Creates the consolidated `wxyc_library` table per §3.1. The wikidata cache
-- is small (~262 MB, ≤64K rows) and `wxyc_library` is loaded once per
-- rebuild, so per §4.1.3 every index — including the GIN trigram indexes —
-- is built INLINE (no CONCURRENTLY), which keeps this a regular sqlx
-- in-transaction migration.
--
-- Idempotency: every statement uses IF NOT EXISTS so re-applying against a
-- populated cache is a no-op (mandated by CLAUDE.md "Idempotency is
-- mandatory" — every monthly rebuild runs `sqlx migrate run` against the
-- destination DB before the rebuild kicks off).
--
-- artist_id / label_id / format_id / release_year are nullable per §3.1:
-- this cache reads from a SQLite library.db (a wxyc-catalog export) which
-- does not carry Backend's integer IDs. They exist for forward compatibility
-- with a future Backend-direct loader.
--
-- snapshot_source CHECK matches the canonical {backend|tubafrenzy|llm} set
-- defined in §3.1; the loader-side argument validation in
-- `src/wxyc_loader.rs` mirrors this.

-- One row per library_id (primary key). IF NOT EXISTS makes re-applying this
-- statement against an already-populated cache a no-op.
CREATE TABLE IF NOT EXISTS wxyc_library (
library_id INTEGER PRIMARY KEY,
-- Nullable: the SQLite library.db source does not carry Backend's integer IDs.
artist_id INTEGER,
artist_name TEXT NOT NULL,
album_title TEXT NOT NULL,
-- Nullable for the same reason as artist_id.
label_id INTEGER,
label_name TEXT,
-- Nullable for the same reason as artist_id.
format_id INTEGER,
format_name TEXT,
wxyc_genre TEXT,
call_letters TEXT,
call_numbers INTEGER,
-- Nullable per §3.1, grouped with the *_id columns in the header note above.
release_year SMALLINT,
-- norm_artist / norm_title are the columns targeted by the B-tree and GIN
-- trigram indexes created later in this file.
-- NOTE(review): presumably normalized forms of the name/title/label columns —
-- confirm against the loader.
norm_artist TEXT NOT NULL,
norm_title TEXT NOT NULL,
norm_label TEXT,
snapshot_at TIMESTAMPTZ NOT NULL,
-- Only the three origins listed in the CHECK are accepted.
snapshot_source TEXT NOT NULL
CHECK (snapshot_source IN ('backend', 'tubafrenzy', 'llm'))
);

-- B-tree indexes per §3.1. Inline-safe because this cache is small
-- (§4.1.3 explicitly waives the CONCURRENTLY requirement).
-- No USING clause is given, so each statement gets PostgreSQL's default
-- index method (B-tree). IF NOT EXISTS keeps every statement re-runnable.
CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_idx
ON wxyc_library (norm_artist);
CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_idx
ON wxyc_library (norm_title);
CREATE INDEX IF NOT EXISTS wxyc_library_artist_id_idx
ON wxyc_library (artist_id);
CREATE INDEX IF NOT EXISTS wxyc_library_format_id_idx
ON wxyc_library (format_id);
CREATE INDEX IF NOT EXISTS wxyc_library_release_year_idx
ON wxyc_library (release_year);

-- GIN trigram indexes for fuzzy lookup. pg_trgm is created by 0001_initial.
-- Inline (no CONCURRENTLY) per §4.1.3 — keeps the migration in a single
-- transaction.
-- gin_trgm_ops is the GIN operator class supplied by the pg_trgm extension,
-- so these statements fail if that extension is missing.
CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_trgm_idx
ON wxyc_library USING GIN (norm_artist gin_trgm_ops);
CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_trgm_idx
ON wxyc_library USING GIN (norm_title gin_trgm_ops);
41 changes: 41 additions & 0 deletions schema/create_database.sql
Original file line number Diff line number Diff line change
Expand Up @@ -62,3 +62,44 @@ CREATE TABLE IF NOT EXISTS occupation (
occupation_qid TEXT NOT NULL,
PRIMARY KEY (entity_qid, occupation_qid)
);

-- wxyc_library v2 hook (consolidated cross-cache identity schema). Mirrored
-- from migrations/0002_wxyc_library_v2.sql per the dual-source pattern in
-- CLAUDE.md "Migrations". Per E1 §4.1.3 of the cross-cache-identity plan,
-- this cache is small enough that every index is built inline.
-- See: https://github.com/WXYC/wiki/blob/main/plans/library-hook-canonicalization.md#413-homebrew-wikidata-port-5435
-- One row per library_id (primary key). Keep this definition in step with
-- migrations/0002_wxyc_library_v2.sql, which it mirrors.
CREATE TABLE IF NOT EXISTS wxyc_library (
library_id INTEGER PRIMARY KEY,
-- The *_id columns and release_year are nullable.
-- NOTE(review): the migration's header attributes this to the SQLite
-- library.db source not carrying Backend's integer IDs — confirm it still holds.
artist_id INTEGER,
artist_name TEXT NOT NULL,
album_title TEXT NOT NULL,
label_id INTEGER,
label_name TEXT,
format_id INTEGER,
format_name TEXT,
wxyc_genre TEXT,
call_letters TEXT,
call_numbers INTEGER,
release_year SMALLINT,
-- norm_artist / norm_title are the columns targeted by the indexes that
-- follow this table.
norm_artist TEXT NOT NULL,
norm_title TEXT NOT NULL,
norm_label TEXT,
snapshot_at TIMESTAMPTZ NOT NULL,
-- Only the three origins listed in the CHECK are accepted.
snapshot_source TEXT NOT NULL
CHECK (snapshot_source IN ('backend', 'tubafrenzy', 'llm'))
);

-- Plain indexes (no USING clause, so PostgreSQL's default B-tree method) on
-- the normalized name columns, two of the ID columns and release_year.
CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_idx
ON wxyc_library (norm_artist);
CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_idx
ON wxyc_library (norm_title);
CREATE INDEX IF NOT EXISTS wxyc_library_artist_id_idx
ON wxyc_library (artist_id);
CREATE INDEX IF NOT EXISTS wxyc_library_format_id_idx
ON wxyc_library (format_id);
CREATE INDEX IF NOT EXISTS wxyc_library_release_year_idx
ON wxyc_library (release_year);
-- GIN trigram indexes. gin_trgm_ops is the operator class supplied by the
-- pg_trgm extension.
-- NOTE(review): confirm the extension is created earlier in this file.
CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_trgm_idx
ON wxyc_library USING GIN (norm_artist gin_trgm_ops);
CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_trgm_idx
ON wxyc_library USING GIN (norm_title gin_trgm_ops);
50 changes: 40 additions & 10 deletions src/import_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,32 @@ use postgres::Client;
/// The DDL SQL embedded from `schema/create_database.sql`.
pub const DDL: &str = include_str!("../schema/create_database.sql");

/// The 8 CSV streaming-filter output tables in FK-safe import order (parent first).
///
/// Used by the bulk-import lifecycle (`truncate_all`, `set_tables_unlogged`,
/// `set_tables_logged`, `vacuum_full`) wired into the CSV `import`
/// subcommand. The list intentionally excludes `wxyc_library` — that table
/// is populated by the separate `import-wxyc-library` subcommand and must
/// NOT be truncated/vacuumed by the CSV import path, otherwise running
/// `wikidata-cache import` would silently wipe the cross-cache identity hook.
///
/// Order matters: callers iterate the slice forwards when parents must come
/// first and reversed when children must come first.
pub const CSV_IMPORT_TABLES: &[&str] = &[
"entity",
"discogs_mapping",
"influence",
"genre",
"record_label",
"label_hierarchy",
"entity_alias",
"occupation",
];

/// Every table managed by this repo, including the `wxyc_library` cross-cache
/// identity hook (E1 §4.1.3, see `migrations/0002_wxyc_library_v2.sql`).
///
/// Used by `drop_schema` (so `--fresh` drops everything) and by the schema
/// test that asserts every table is created. The bulk-import lifecycle
/// functions deliberately use `CSV_IMPORT_TABLES` instead — see that
/// constant's docstring for why.
pub const ALL_TABLES: &[&str] = &[
"entity",
"discogs_mapping",
Expand All @@ -19,6 +44,7 @@ pub const ALL_TABLES: &[&str] = &[
"label_hierarchy",
"entity_alias",
"occupation",
"wxyc_library",
];

/// Apply the wikidata-cache schema DDL to the database.
Expand Down Expand Up @@ -54,38 +80,42 @@ pub fn drop_schema(client: &mut Client) -> Result<()> {
Ok(())
}

/// Truncate all tables (in reverse FK order) for idempotent re-import.
/// Truncate the CSV-import tables (in reverse FK order) for idempotent re-import.
///
/// Excludes `wxyc_library` — that table is owned by the `import-wxyc-library`
/// subcommand and must survive a CSV `import` rerun.
pub fn truncate_all(client: &mut Client) -> Result<()> {
for table in ALL_TABLES.iter().rev() {
for table in CSV_IMPORT_TABLES.iter().rev() {
client.batch_execute(&format!("TRUNCATE {table} CASCADE"))?;
}
Ok(())
}

/// Set tables to UNLOGGED mode for faster bulk import (disables WAL).
/// Set the CSV-import tables to UNLOGGED mode for faster bulk import (disables WAL).
///
/// Processes child tables first (reverse FK order), then parent tables,
/// because a logged table cannot reference an unlogged table.
/// because a logged table cannot reference an unlogged table. `wxyc_library`
/// stays LOGGED — it isn't part of the bulk-import path.
pub fn set_tables_unlogged(client: &mut Client) -> Result<()> {
for table in ALL_TABLES.iter().rev() {
for table in CSV_IMPORT_TABLES.iter().rev() {
client.batch_execute(&format!("ALTER TABLE {table} SET UNLOGGED"))?;
}
Ok(())
}

/// Restore tables to LOGGED mode (re-enables WAL durability).
/// Restore the CSV-import tables to LOGGED mode (re-enables WAL durability).
///
/// Processes parent tables first (FK order) so child tables can reference them.
pub fn set_tables_logged(client: &mut Client) -> Result<()> {
for table in ALL_TABLES {
for table in CSV_IMPORT_TABLES {
client.batch_execute(&format!("ALTER TABLE {table} SET LOGGED"))?;
}
Ok(())
}

/// Run VACUUM FULL on all tables to reclaim space after bulk import.
/// Run VACUUM FULL on the CSV-import tables to reclaim space after bulk import.
pub fn vacuum_full(client: &mut Client) -> Result<()> {
for table in ALL_TABLES {
for table in CSV_IMPORT_TABLES {
client.batch_execute(&format!("VACUUM FULL {table}"))?;
}
Ok(())
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ pub mod import;
pub mod import_schema;
pub mod model;
pub mod writer;
pub mod wxyc_loader;
60 changes: 60 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use wikidata_cache::import;
use wikidata_cache::import_schema;
use wikidata_cache::model::Entity;
use wikidata_cache::writer::CsvOutput;
use wikidata_cache::wxyc_loader;

const DATABASE_URL_ENV: &str = "DATABASE_URL_WIKIDATA";

Expand Down Expand Up @@ -75,6 +76,25 @@ enum Commands {
#[arg(long, hide = true)]
csv_dir: Option<PathBuf>,
},
/// Populate the `wxyc_library` cross-cache identity hook from a SQLite
/// `library.db` (E1 §4.1.3 of the cross-cache-identity plan).
///
/// Idempotent on `library_id` (`ON CONFLICT DO NOTHING`); safe to re-run.
/// Expects the `wxyc_library` table to already exist — apply
/// `migrations/0002_wxyc_library_v2.sql` (via `sqlx migrate run`) or run
/// the `import` subcommand first to create the table inline.
ImportWxycLibrary {
/// Path to the SQLite `library.db` (a wxyc-catalog export).
#[arg(long)]
library_db: PathBuf,

#[command(flatten)]
db: DatabaseArgs,

/// Origin of this snapshot. Must be one of: backend | tubafrenzy | llm.
#[arg(long, default_value = "backend")]
snapshot_source: String,
},
}

fn main() -> Result<()> {
Expand All @@ -83,6 +103,9 @@ fn main() -> Result<()> {
let (tool, step) = match &cli.command {
Commands::Build { .. } => ("wikidata-cache build", "build"),
Commands::Import { .. } => ("wikidata-cache import", "import"),
Commands::ImportWxycLibrary { .. } => {
("wikidata-cache import-wxyc-library", "import-wxyc-library")
}
};
let _logger_guard = logger::init(LoggerConfig {
repo: "wikidata-cache",
Expand Down Expand Up @@ -118,10 +141,47 @@ fn main() -> Result<()> {
.context("Failed to resolve database URL")?;
run_import(&data_dir, &database_url, import.fresh)
}
Commands::ImportWxycLibrary {
library_db,
db,
snapshot_source,
} => {
let database_url = resolve_database_url(&db, DATABASE_URL_ENV)
.context("Failed to resolve database URL")?;
run_import_wxyc_library(&library_db, &database_url, &snapshot_source)
}
}
})
}

/// Load the `wxyc_library` hook table from a SQLite `library.db` into PostgreSQL.
///
/// Opens a non-TLS connection to `database_url`, hands it to
/// `wxyc_loader::populate_wxyc_library_v2` together with the SQLite path and
/// the snapshot source, then prints the elapsed time, the count returned by
/// the loader and the source path to stderr.
///
/// # Errors
///
/// Fails if the PostgreSQL connection cannot be opened or if the loader
/// returns an error.
fn run_import_wxyc_library(
    library_db: &Path,
    database_url: &str,
    snapshot_source: &str,
) -> Result<()> {
    // Timer covers the connection attempt as well as the load itself.
    let timer = Instant::now();
    let source_path = library_db.display();

    eprintln!("Connecting to PostgreSQL...");
    let mut pg = postgres::Client::connect(database_url, postgres::NoTls)
        .context("Failed to connect to PostgreSQL")?;

    eprintln!(
        "Loading wxyc_library hook from {} (snapshot_source={})...",
        source_path, snapshot_source
    );
    let attempted = wxyc_loader::populate_wxyc_library_v2(&mut pg, library_db, snapshot_source)?;

    let seconds = timer.elapsed().as_secs_f64();
    eprintln!();
    eprintln!("Done in {:.1}s", seconds);
    eprintln!(" Rows attempted: {attempted:>10}");
    eprintln!(" Source: {}", source_path);

    Ok(())
}

/// Resolve the working directory, honouring the deprecated alias if it was passed.
fn resolve_data_dir(data_dir: PathBuf, deprecated: Option<PathBuf>, alias: &str) -> PathBuf {
if let Some(path) = deprecated {
Expand Down
Loading
Loading