diff --git a/Cargo.lock b/Cargo.lock index d065b64..eeda300 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2935,6 +2935,7 @@ dependencies = [ "predicates", "pretty_assertions", "rayon", + "rusqlite", "serde", "serde_json", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 2f15310..128779b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ log = "0.4" tracing = "0.1" wxyc-etl = "0.3.0" postgres = "0.19" +rusqlite = { version = "0.31", features = ["bundled"] } [dev-dependencies] assert_cmd = "2" diff --git a/migrations/0002_wxyc_library_v2.sql b/migrations/0002_wxyc_library_v2.sql new file mode 100644 index 0000000..22b05e9 --- /dev/null +++ b/migrations/0002_wxyc_library_v2.sql @@ -0,0 +1,66 @@ +-- wxyc_library v2 hook (consolidated cross-cache identity schema) +-- +-- Lands E1 §4.1.3 of the cross-cache-identity plan: +-- https://github.com/WXYC/wiki/blob/main/plans/library-hook-canonicalization.md#413-homebrew-wikidata-port-5435 +-- +-- Creates the consolidated `wxyc_library` table per §3.1. The wikidata cache +-- is small (~262 MB, ≤64K rows) and `wxyc_library` is loaded once per +-- rebuild, so per §4.1.3 every index — including the GIN trigram indexes — +-- is built INLINE (no CONCURRENTLY), which keeps this a regular sqlx +-- in-transaction migration. +-- +-- Idempotency: every statement uses IF NOT EXISTS so re-applying against a +-- populated cache is a no-op (mandated by CLAUDE.md "Idempotency is +-- mandatory" — every monthly rebuild runs `sqlx migrate run` against the +-- destination DB before the rebuild kicks off). +-- +-- artist_id / label_id / format_id / release_year are nullable per §3.1: +-- this cache reads from a SQLite library.db (a wxyc-catalog export) which +-- does not carry Backend's integer IDs. They exist for forward compatibility +-- with a future Backend-direct loader. 
+-- +-- snapshot_source CHECK matches the canonical {backend|tubafrenzy|llm} set +-- defined in §3.1; the loader-side argument validation in +-- `src/wxyc_loader.rs` mirrors this. + +CREATE TABLE IF NOT EXISTS wxyc_library ( + library_id INTEGER PRIMARY KEY, + artist_id INTEGER, + artist_name TEXT NOT NULL, + album_title TEXT NOT NULL, + label_id INTEGER, + label_name TEXT, + format_id INTEGER, + format_name TEXT, + wxyc_genre TEXT, + call_letters TEXT, + call_numbers INTEGER, + release_year SMALLINT, + norm_artist TEXT NOT NULL, + norm_title TEXT NOT NULL, + norm_label TEXT, + snapshot_at TIMESTAMPTZ NOT NULL, + snapshot_source TEXT NOT NULL + CHECK (snapshot_source IN ('backend', 'tubafrenzy', 'llm')) +); + +-- B-tree indexes per §3.1. Inline-safe because this cache is small +-- (§4.1.3 explicitly waives the CONCURRENTLY requirement). +CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_idx + ON wxyc_library (norm_artist); +CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_idx + ON wxyc_library (norm_title); +CREATE INDEX IF NOT EXISTS wxyc_library_artist_id_idx + ON wxyc_library (artist_id); +CREATE INDEX IF NOT EXISTS wxyc_library_format_id_idx + ON wxyc_library (format_id); +CREATE INDEX IF NOT EXISTS wxyc_library_release_year_idx + ON wxyc_library (release_year); + +-- GIN trigram indexes for fuzzy lookup. pg_trgm is created by 0001_initial. +-- Inline (no CONCURRENTLY) per §4.1.3 — keeps the migration in a single +-- transaction. 
+CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_trgm_idx + ON wxyc_library USING GIN (norm_artist gin_trgm_ops); +CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_trgm_idx + ON wxyc_library USING GIN (norm_title gin_trgm_ops); diff --git a/schema/create_database.sql b/schema/create_database.sql index 98387e6..b9148f9 100644 --- a/schema/create_database.sql +++ b/schema/create_database.sql @@ -62,3 +62,44 @@ CREATE TABLE IF NOT EXISTS occupation ( occupation_qid TEXT NOT NULL, PRIMARY KEY (entity_qid, occupation_qid) ); + +-- wxyc_library v2 hook (consolidated cross-cache identity schema). Mirrored +-- from migrations/0002_wxyc_library_v2.sql per the dual-source pattern in +-- CLAUDE.md "Migrations". Per E1 §4.1.3 of the cross-cache-identity plan, +-- this cache is small enough that every index is built inline. +-- See: https://github.com/WXYC/wiki/blob/main/plans/library-hook-canonicalization.md#413-homebrew-wikidata-port-5435 +CREATE TABLE IF NOT EXISTS wxyc_library ( + library_id INTEGER PRIMARY KEY, + artist_id INTEGER, + artist_name TEXT NOT NULL, + album_title TEXT NOT NULL, + label_id INTEGER, + label_name TEXT, + format_id INTEGER, + format_name TEXT, + wxyc_genre TEXT, + call_letters TEXT, + call_numbers INTEGER, + release_year SMALLINT, + norm_artist TEXT NOT NULL, + norm_title TEXT NOT NULL, + norm_label TEXT, + snapshot_at TIMESTAMPTZ NOT NULL, + snapshot_source TEXT NOT NULL + CHECK (snapshot_source IN ('backend', 'tubafrenzy', 'llm')) +); + +CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_idx + ON wxyc_library (norm_artist); +CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_idx + ON wxyc_library (norm_title); +CREATE INDEX IF NOT EXISTS wxyc_library_artist_id_idx + ON wxyc_library (artist_id); +CREATE INDEX IF NOT EXISTS wxyc_library_format_id_idx + ON wxyc_library (format_id); +CREATE INDEX IF NOT EXISTS wxyc_library_release_year_idx + ON wxyc_library (release_year); +CREATE INDEX IF NOT EXISTS wxyc_library_norm_artist_trgm_idx + ON 
wxyc_library USING GIN (norm_artist gin_trgm_ops); +CREATE INDEX IF NOT EXISTS wxyc_library_norm_title_trgm_idx + ON wxyc_library USING GIN (norm_title gin_trgm_ops); diff --git a/src/import_schema.rs b/src/import_schema.rs index 04f8c93..db00c74 100644 --- a/src/import_schema.rs +++ b/src/import_schema.rs @@ -9,7 +9,32 @@ use postgres::Client; /// The DDL SQL embedded from `schema/create_database.sql`. pub const DDL: &str = include_str!("../schema/create_database.sql"); -/// All 8 wikidata-cache tables in FK-safe import order (parent first). +/// CSV streaming-filter output tables in FK-safe import order (parent first). +/// +/// Used by the bulk-import lifecycle (`truncate_all`, `set_tables_unlogged`, +/// `set_tables_logged`, `vacuum_full`) wired into the CSV `import` +/// subcommand. The list intentionally excludes `wxyc_library` — that table +/// is populated by the separate `import-wxyc-library` subcommand and must +/// NOT be truncated/vacuumed by the CSV import path, otherwise running +/// `wikidata-cache import` would silently wipe the cross-cache identity hook. +pub const CSV_IMPORT_TABLES: &[&str] = &[ + "entity", + "discogs_mapping", + "influence", + "genre", + "record_label", + "label_hierarchy", + "entity_alias", + "occupation", +]; + +/// Every table managed by this repo, including the `wxyc_library` cross-cache +/// identity hook (E1 §4.1.3, see `migrations/0002_wxyc_library_v2.sql`). +/// +/// Used by `drop_schema` (so `--fresh` drops everything) and by the schema +/// test that asserts every table is created. The bulk-import lifecycle +/// functions deliberately use `CSV_IMPORT_TABLES` instead — see that +/// constant's docstring for why. pub const ALL_TABLES: &[&str] = &[ "entity", "discogs_mapping", @@ -19,6 +44,7 @@ pub const ALL_TABLES: &[&str] = &[ "label_hierarchy", "entity_alias", "occupation", + "wxyc_library", ]; /// Apply the wikidata-cache schema DDL to the database. 
@@ -54,38 +80,42 @@ pub fn drop_schema(client: &mut Client) -> Result<()> { Ok(()) } -/// Truncate all tables (in reverse FK order) for idempotent re-import. +/// Truncate the CSV-import tables (in reverse FK order) for idempotent re-import. +/// +/// Excludes `wxyc_library` — that table is owned by the `import-wxyc-library` +/// subcommand and must survive a CSV `import` rerun. pub fn truncate_all(client: &mut Client) -> Result<()> { - for table in ALL_TABLES.iter().rev() { + for table in CSV_IMPORT_TABLES.iter().rev() { client.batch_execute(&format!("TRUNCATE {table} CASCADE"))?; } Ok(()) } -/// Set tables to UNLOGGED mode for faster bulk import (disables WAL). +/// Set the CSV-import tables to UNLOGGED mode for faster bulk import (disables WAL). /// /// Processes child tables first (reverse FK order), then parent tables, -/// because a logged table cannot reference an unlogged table. +/// because a logged table cannot reference an unlogged table. `wxyc_library` +/// stays LOGGED — it isn't part of the bulk-import path. pub fn set_tables_unlogged(client: &mut Client) -> Result<()> { - for table in ALL_TABLES.iter().rev() { + for table in CSV_IMPORT_TABLES.iter().rev() { client.batch_execute(&format!("ALTER TABLE {table} SET UNLOGGED"))?; } Ok(()) } -/// Restore tables to LOGGED mode (re-enables WAL durability). +/// Restore the CSV-import tables to LOGGED mode (re-enables WAL durability). /// /// Processes parent tables first (FK order) so child tables can reference them. pub fn set_tables_logged(client: &mut Client) -> Result<()> { - for table in ALL_TABLES { + for table in CSV_IMPORT_TABLES { client.batch_execute(&format!("ALTER TABLE {table} SET LOGGED"))?; } Ok(()) } -/// Run VACUUM FULL on all tables to reclaim space after bulk import. +/// Run VACUUM FULL on the CSV-import tables to reclaim space after bulk import. 
pub fn vacuum_full(client: &mut Client) -> Result<()> { - for table in ALL_TABLES { + for table in CSV_IMPORT_TABLES { client.batch_execute(&format!("VACUUM FULL {table}"))?; } Ok(()) diff --git a/src/lib.rs b/src/lib.rs index cc63d9e..2e1a811 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,3 +4,4 @@ pub mod import; pub mod import_schema; pub mod model; pub mod writer; +pub mod wxyc_loader; diff --git a/src/main.rs b/src/main.rs index 415deff..4765d44 100644 --- a/src/main.rs +++ b/src/main.rs @@ -24,6 +24,7 @@ use wikidata_cache::import; use wikidata_cache::import_schema; use wikidata_cache::model::Entity; use wikidata_cache::writer::CsvOutput; +use wikidata_cache::wxyc_loader; const DATABASE_URL_ENV: &str = "DATABASE_URL_WIKIDATA"; @@ -75,6 +76,25 @@ enum Commands { #[arg(long, hide = true)] csv_dir: Option, }, + /// Populate the `wxyc_library` cross-cache identity hook from a SQLite + /// `library.db` (E1 §4.1.3 of the cross-cache-identity plan). + /// + /// Idempotent on `library_id` (`ON CONFLICT DO NOTHING`); safe to re-run. + /// Expects the `wxyc_library` table to already exist — apply + /// `migrations/0002_wxyc_library_v2.sql` (via `sqlx migrate run`) or run + /// the `import` subcommand first to create the table inline. + ImportWxycLibrary { + /// Path to the SQLite `library.db` (a wxyc-catalog export). + #[arg(long)] + library_db: PathBuf, + + #[command(flatten)] + db: DatabaseArgs, + + /// Origin of this snapshot. Must be one of: backend | tubafrenzy | llm. + #[arg(long, default_value = "backend")] + snapshot_source: String, + }, } fn main() -> Result<()> { @@ -83,6 +103,9 @@ fn main() -> Result<()> { let (tool, step) = match &cli.command { Commands::Build { .. } => ("wikidata-cache build", "build"), Commands::Import { .. } => ("wikidata-cache import", "import"), + Commands::ImportWxycLibrary { .. 
} => { + ("wikidata-cache import-wxyc-library", "import-wxyc-library") + } }; let _logger_guard = logger::init(LoggerConfig { repo: "wikidata-cache", @@ -118,10 +141,47 @@ fn main() -> Result<()> { .context("Failed to resolve database URL")?; run_import(&data_dir, &database_url, import.fresh) } + Commands::ImportWxycLibrary { + library_db, + db, + snapshot_source, + } => { + let database_url = resolve_database_url(&db, DATABASE_URL_ENV) + .context("Failed to resolve database URL")?; + run_import_wxyc_library(&library_db, &database_url, &snapshot_source) + } } }) } +fn run_import_wxyc_library( + library_db: &Path, + database_url: &str, + snapshot_source: &str, +) -> Result<()> { + let start = Instant::now(); + + eprintln!("Connecting to PostgreSQL..."); + let mut client = postgres::Client::connect(database_url, postgres::NoTls) + .context("Failed to connect to PostgreSQL")?; + + eprintln!( + "Loading wxyc_library hook from {} (snapshot_source={})...", + library_db.display(), + snapshot_source + ); + let attempted = + wxyc_loader::populate_wxyc_library_v2(&mut client, library_db, snapshot_source)?; + + let elapsed = start.elapsed(); + eprintln!(); + eprintln!("Done in {:.1}s", elapsed.as_secs_f64()); + eprintln!(" Rows attempted: {attempted:>10}"); + eprintln!(" Source: {}", library_db.display()); + + Ok(()) +} + /// Resolve the working directory, honouring the deprecated alias if it was passed. fn resolve_data_dir(data_dir: PathBuf, deprecated: Option, alias: &str) -> PathBuf { if let Some(path) = deprecated { diff --git a/src/wxyc_loader.rs b/src/wxyc_loader.rs new file mode 100644 index 0000000..df22efc --- /dev/null +++ b/src/wxyc_loader.rs @@ -0,0 +1,429 @@ +//! WXYC library hook loader for the Homebrew wikidata cache. +//! +//! Implements E1 §4.1.3 of the cross-cache-identity plan: +//! <https://github.com/WXYC/wiki/blob/main/plans/library-hook-canonicalization.md#413-homebrew-wikidata-port-5435> +//! +//! Reads a SQLite `library.db` (a wxyc-catalog export of the WXYC library +//! catalog) and populates the consolidated `wxyc_library` PostgreSQL table. +//! +//! 
# Idempotency +//! +//! The writer uses `INSERT ... ON CONFLICT (library_id) DO NOTHING`, so +//! re-running the loader against an already-populated cache is a no-op for +//! existing library_ids. New rows are inserted; conflicts are silently +//! skipped. This matches the pattern used by `discogs-etl`'s +//! `loaders/wxyc.py::populate_wxyc_library_v2` (PR #185 / issue #178). +//! +//! # Normalization +//! +//! Per the plan §3.3 / E3 step 4, this loader is locked onto the canonical +//! identity normalizers from `wxyc-etl` 0.3.0: +//! +//! - [`wxyc_etl::text::to_identity_match_form`] — used for both +//! `norm_artist` AND `norm_label` (labels share the artist-side pipeline; +//! no `_label` variant exists or is needed). +//! - [`wxyc_etl::text::to_identity_match_form_title`] — title-side variant, +//! used for `norm_title`. +//! +//! The opt-in variants (`_with_punctuation`, `_with_disambiguator_strip`) +//! are deliberately not invoked here — the cross-cache-identity hook stays +//! on the locked-on baseline so every consumer cache normalizes identically. +//! `wxyc_etl::text::to_match_form` (the WX-2 comparison form) is a different +//! normalizer and must NOT be substituted. +//! +//! # Nullability +//! +//! Per §3.1, `artist_id` / `label_id` / `format_id` / `release_year` are all +//! nullable. This cache reads from library.db (a SQLite catalog export) +//! which does not carry Backend's integer IDs; for the foreseeable future +//! every row from this loader stamps NULL on those four columns. They +//! exist for forward compatibility with a future Backend-direct loader. + +use std::path::Path; +use std::time::SystemTime; + +use anyhow::{Context, Result}; +use postgres::Client; +use wxyc_etl::text::{to_identity_match_form, to_identity_match_form_title}; + +/// Audit string surfaced in INFO logs and asserted by integration tests. +/// Pinning the string makes a future API rename in `wxyc-etl` observable. 
+pub const NORMALIZER_NAME: &str = "wxyc_etl::text::to_identity_match_form"; + +/// Allowed values for `wxyc_library.snapshot_source`. Mirrors the §3.1 CHECK +/// constraint at the loader-argument boundary so callers get a friendly +/// error instead of a Postgres CheckViolation buried in a transaction. +pub const ALLOWED_SNAPSHOT_SOURCES: &[&str] = &["backend", "tubafrenzy", "llm"]; + +/// One row read from `library.db`. Mirrors §3.1's column list. The four +/// nullable columns (`artist_id`, `label_id`, `format_id`, `release_year`) +/// are always `None` from this loader — see the module-level "Nullability" +/// note. +#[derive(Debug, Clone)] +pub struct LibraryRow { + pub library_id: i32, + pub artist_name: String, + pub album_title: String, + pub label_name: Option, + pub format_name: Option, + pub wxyc_genre: Option, + pub call_letters: Option, + pub call_numbers: Option, +} + +/// Read every row from a SQLite `library.db` into [`LibraryRow`] records. +/// +/// The minimal-fixture schema is `(id, artist, title)`; the production schema +/// adds `format`, `label`, `genre`, `call_letters`, `release_call_number`. +/// We adapt to whatever optional columns are present rather than failing — +/// the same shape as `discogs-etl/loaders/wxyc.py::_read_library_db`. +pub fn read_library_db(library_db: &Path) -> Result> { + let conn = rusqlite::Connection::open(library_db) + .with_context(|| format!("Failed to open {}", library_db.display()))?; + + let cols = existing_columns(&conn, "library")?; + + // Required columns. The PRAGMA-driven approach below is forgiving of + // schema drift; if `id` / `artist` / `title` are missing the SELECT + // itself will surface the problem with a clear SQLite error. 
+ let mut select_parts: Vec<&str> = vec!["id", "artist", "title"]; + for c in &[ + "label", + "format", + "genre", + "call_letters", + "release_call_number", + ] { + if cols.contains(*c) { + select_parts.push(c); + } + } + let query = format!("SELECT {} FROM library", select_parts.join(", ")); + + let mut stmt = conn.prepare(&query)?; + let rows = stmt.query_map([], |row| { + // i32 truncation on the integer columns must surface as a hard + // error, never silently land NULL — the cache schema declares + // INTEGER (PG i32) for both library_id and call_numbers, so an + // out-of-range source value is upstream corruption that should + // halt the loader rather than corrupt the hook. + let id_i64: i64 = row.get("id")?; + let library_id = i32::try_from(id_i64).map_err(|_| { + rusqlite::Error::FromSqlConversionFailure( + 0, + rusqlite::types::Type::Integer, + format!("library.id {id_i64} does not fit in i32").into(), + ) + })?; + let mut r = LibraryRow { + library_id, + artist_name: row.get("artist")?, + album_title: row.get("title")?, + label_name: None, + format_name: None, + wxyc_genre: None, + call_letters: None, + call_numbers: None, + }; + if cols.contains("label") { + r.label_name = row.get("label").ok(); + } + if cols.contains("format") { + r.format_name = row.get("format").ok(); + } + if cols.contains("genre") { + r.wxyc_genre = row.get("genre").ok(); + } + if cols.contains("call_letters") { + r.call_letters = row.get("call_letters").ok(); + } + if cols.contains("release_call_number") { + r.call_numbers = row + .get::<_, Option>("release_call_number")? + .map(|n| { + i32::try_from(n).map_err(|_| { + rusqlite::Error::FromSqlConversionFailure( + 0, + rusqlite::types::Type::Integer, + format!("library.release_call_number {n} does not fit in i32").into(), + ) + }) + }) + .transpose()?; + } + Ok(r) + })?; + + let mut out = Vec::new(); + for r in rows { + out.push(r?); + } + Ok(out) +} + +/// Read column names from `PRAGMA table_info`, normalized to lowercase. 
+/// +/// SQLite identifiers are case-insensitive in DDL but `PRAGMA` returns them +/// as declared (`Label` vs `label`). Lowercasing the set lets the +/// optional-column probe in `read_library_db` match regardless of source +/// casing — otherwise a future `library.db` declared with `Label` would +/// silently miss the lookup and the column would stay `None` with no error. +fn existing_columns( + conn: &rusqlite::Connection, + table: &str, +) -> Result> { + let mut stmt = conn.prepare(&format!("PRAGMA table_info({table})"))?; + let names = stmt.query_map([], |row| row.get::<_, String>(1))?; + let mut set = std::collections::HashSet::new(); + for n in names { + set.insert(n?.to_ascii_lowercase()); + } + Ok(set) +} + +/// Strip NUL bytes (U+0000) from a TEXT value at the PostgreSQL write +/// boundary, matching the org-wide WX-3.B policy ([WXYC/docs#18]) and +/// `import.rs::escape_copy_text`. PostgreSQL TEXT cannot store NUL; in +/// library metadata it's always corruption, never intentional signal. +/// +/// [WXYC/docs#18]: https://github.com/WXYC/docs/issues/18 +fn strip_pg_null_bytes(s: &str) -> String { + s.chars().filter(|c| *c != '\0').collect() +} + +fn strip_pg_null_bytes_opt(s: Option<&str>) -> Option { + s.map(strip_pg_null_bytes) +} + +/// Identity-tier normalization for the optional `norm_label` column. +/// +/// `to_identity_match_form` returns an empty string for empty input; we want +/// NULL to flow through to PostgreSQL for the nullable `norm_label` column +/// so downstream NULL-aware joins behave correctly. The `.filter` collapses +/// both the `None` input case AND a `Some("")` post-normalization case +/// (e.g. a `Some(" ")` whitespace-only label) to a single `None`. +fn norm_label(value: Option<&str>) -> Option { + value.map(to_identity_match_form).filter(|s| !s.is_empty()) +} + +/// Populate `wxyc_library` from a SQLite `library.db`. +/// +/// Per E1 §4.1.3 + §3.1: every library row is written (Option B; no filter). 
+/// Idempotent on `library_id` (`ON CONFLICT DO NOTHING`). +/// +/// `snapshot_source` MUST be one of `backend` | `tubafrenzy` | `llm` per +/// §3.1; the function returns an error otherwise (mirrors the database-side +/// CHECK constraint at the loader boundary so the error message is +/// human-readable). +/// +/// Returns the number of rows attempted (pre-conflict). With a clean target +/// table this equals the row count of `library.db`; on a re-run the report +/// is identical but `COUNT(*)` in `wxyc_library` does not change. +pub fn populate_wxyc_library_v2( + client: &mut Client, + library_db: &Path, + snapshot_source: &str, +) -> Result { + if !ALLOWED_SNAPSHOT_SOURCES.contains(&snapshot_source) { + anyhow::bail!( + "snapshot_source must be one of {:?}, got {snapshot_source:?}", + ALLOWED_SNAPSHOT_SOURCES + ); + } + + let rows = read_library_db(library_db)?; + if rows.is_empty() { + log::warn!( + "populate_wxyc_library_v2: no rows from {}", + library_db.display() + ); + return Ok(0); + } + + // Single timestamp for the whole snapshot — cross-cache freshness is + // observable via the spread of snapshot_at across caches, so all rows + // from one load should share an instant. + let snapshot_at = SystemTime::now(); + + // Use a single prepared statement and a transaction. The cache is small + // (≤64K rows) so a per-row INSERT inside a tx is plenty; if this ever + // becomes a bottleneck the right move is `COPY ... FROM STDIN` like + // `import.rs::import_csv`, but ON CONFLICT semantics complicate that. 
+ let stmt_sql = " + INSERT INTO wxyc_library ( + library_id, artist_id, artist_name, album_title, + label_id, label_name, format_id, format_name, + wxyc_genre, call_letters, call_numbers, release_year, + norm_artist, norm_title, norm_label, + snapshot_at, snapshot_source + ) VALUES ( + $1, $2, $3, $4, + $5, $6, $7, $8, + $9, $10, $11, $12, + $13, $14, $15, + $16, $17 + ) + ON CONFLICT (library_id) DO NOTHING + "; + + let mut tx = client.transaction()?; + let stmt = tx.prepare(stmt_sql)?; + + let mut attempted: u64 = 0; + for r in &rows { + // Strip NUL bytes BEFORE normalization so derived columns inherit the + // PG-safe form. Reversing the order would let a NUL byte in the source + // pass through into norm_artist / norm_title / norm_label and crash + // the INSERT — every TEXT column that hits PostgreSQL must have been + // stripped, including the derived ones. + let artist_name = strip_pg_null_bytes(&r.artist_name); + let album_title = strip_pg_null_bytes(&r.album_title); + let label_name = strip_pg_null_bytes_opt(r.label_name.as_deref()); + let format_name = strip_pg_null_bytes_opt(r.format_name.as_deref()); + let wxyc_genre = strip_pg_null_bytes_opt(r.wxyc_genre.as_deref()); + let call_letters = strip_pg_null_bytes_opt(r.call_letters.as_deref()); + + // norm_artist / norm_title are NOT NULL per §3.1, but Postgres `NOT + // NULL` rejects SQL NULL — NOT empty strings. An empty artist or + // title would silently land with an empty norm_* column, defeating + // downstream NULL-aware joins. Catch it here so the upstream issue + // (an empty or NUL-only source value; a SQLite NULL already fails in + // `read_library_db`) surfaces clearly rather than corrupting the cache. + if artist_name.is_empty() || album_title.is_empty() { + anyhow::bail!( + "library_id {}: artist_name or album_title is empty after NUL-stripping \ + (artist={:?}, title={:?}). 
library.db rows must have non-empty artist/title; \ + fix the source row before re-running the loader.", + r.library_id, + artist_name, + album_title, + ); + } + let norm_artist = to_identity_match_form(&artist_name); + let norm_title = to_identity_match_form_title(&album_title); + let norm_label_v = norm_label(label_name.as_deref()); + + // For this cache today, every row stamps NULL on artist_id / + // label_id / format_id / release_year — see the module-level + // "Nullability" note. + let artist_id: Option = None; + let label_id: Option = None; + let format_id: Option = None; + let release_year: Option = None; + + tx.execute( + &stmt, + &[ + &r.library_id, + &artist_id, + &artist_name, + &album_title, + &label_id, + &label_name, + &format_id, + &format_name, + &wxyc_genre, + &call_letters, + &r.call_numbers, + &release_year, + &norm_artist, + &norm_title, + &norm_label_v, + &snapshot_at, + &snapshot_source, + ], + )?; + attempted += 1; + } + tx.commit()?; + + log::info!( + "populate_wxyc_library_v2: wrote {} rows to wxyc_library (snapshot_source={}, normalizer={})", + attempted, + snapshot_source, + NORMALIZER_NAME, + ); + Ok(attempted) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn strip_pg_null_bytes_drops_nul() { + assert_eq!(strip_pg_null_bytes("a\0b"), "ab"); + assert_eq!(strip_pg_null_bytes("\0a\0b\0"), "ab"); + } + + #[test] + fn strip_pg_null_bytes_clean_input_unchanged() { + assert_eq!(strip_pg_null_bytes("Stereolab"), "Stereolab"); + } + + #[test] + fn norm_label_passes_none_through() { + assert_eq!(norm_label(None), None); + } + + #[test] + fn norm_label_normalizes_some() { + // to_identity_match_form lowercases + collapses; full algorithmic + // pin lives in the integration tests against PG. 
+ let v = norm_label(Some("Sonamos")); + assert_eq!(v.as_deref(), Some("sonamos")); + } + + #[test] + fn norm_label_drops_empty_string() { + // `Some("")` and `Some(" ")` (whitespace that the normalizer + // collapses to "") must come back as `None` — a non-NULL empty + // string would defeat downstream NULL-aware lookups on norm_label. + // The docstring promises this; the test pins it. + assert_eq!(norm_label(Some("")), None); + assert_eq!(norm_label(Some(" ")), None); + } + + #[test] + fn allowed_snapshot_sources_pinned() { + assert_eq!(ALLOWED_SNAPSHOT_SOURCES, &["backend", "tubafrenzy", "llm"]); + } + + /// `read_library_db` adapts to whichever optional columns happen to be + /// present. The integration tests exercise the full-prod schema; this + /// unit test pins the minimal-schema branch (`id, artist, title` only). + /// Older library.db snapshots — and the smallest test fixtures — don't + /// carry label / format / genre / call_letters / release_call_number, + /// and the loader needs to handle those without a PRAGMA-keyed panic. + #[test] + fn read_library_db_minimal_schema() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("library.db"); + + let conn = rusqlite::Connection::open(&db_path).unwrap(); + conn.execute_batch( + "CREATE TABLE library (\ + id INTEGER PRIMARY KEY, \ + artist TEXT NOT NULL, \ + title TEXT NOT NULL\ + );\ + INSERT INTO library (id, artist, title) VALUES \ + (1, 'Juana Molina', 'DOGA'), \ + (2, 'Stereolab', 'Aluminum Tunes');", + ) + .unwrap(); + drop(conn); + + let rows = read_library_db(&db_path).unwrap(); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0].library_id, 1); + assert_eq!(rows[0].artist_name, "Juana Molina"); + assert_eq!(rows[0].album_title, "DOGA"); + // Optional columns must be None when the source schema doesn't carry them. 
+ assert!(rows[0].label_name.is_none()); + assert!(rows[0].format_name.is_none()); + assert!(rows[0].wxyc_genre.is_none()); + assert!(rows[0].call_letters.is_none()); + assert!(rows[0].call_numbers.is_none()); + } +} diff --git a/tests/import_test.rs b/tests/import_test.rs index 1a9abf8..df22054 100644 --- a/tests/import_test.rs +++ b/tests/import_test.rs @@ -8,8 +8,10 @@ use assert_cmd::Command; use postgres::{Client, NoTls}; +use rusqlite::Connection as SqliteConnection; use std::path::Path; use std::sync::{Mutex, MutexGuard}; +use tempfile::TempDir; use wikidata_cache::import; use wikidata_cache::import_schema; @@ -669,3 +671,138 @@ fn test_import_subcommand() { .get(0); assert_eq!(count, 5, "Import subcommand should load 5 entities"); } + +/// CLI integration test for the `import-wxyc-library` subcommand. +/// +/// Pattern parity with `test_import_subcommand` for the CSV `import` path — +/// every subcommand should have at least one end-to-end binary invocation +/// in this file. Validates `--library-db` / `--database-url` / `--snapshot-source` +/// wiring AND the through-line from CLI parse -> `run_import_wxyc_library` -> +/// `populate_wxyc_library_v2` -> PostgreSQL row. +#[test] +fn test_import_wxyc_library_subcommand() { + let _lock = lock_db(); + + // Ensure the schema exists (the subcommand expects wxyc_library to be present). + let mut client = test_client(); + fresh_schema(&mut client); + drop(client); + + // Build a tiny library.db fixture in a temp dir. 
+ let tmp = TempDir::new().unwrap(); + let library_db_path = tmp.path().join("library.db"); + let conn = SqliteConnection::open(&library_db_path).unwrap(); + conn.execute_batch( + "CREATE TABLE library (\ + id INTEGER PRIMARY KEY, \ + artist TEXT NOT NULL, \ + title TEXT NOT NULL, \ + label TEXT, \ + format TEXT\ + );\ + INSERT INTO library (id, artist, title, label, format) VALUES \ + (101, 'Juana Molina', 'DOGA', 'Sonamos', 'LP'), \ + (102, 'Stereolab', 'Aluminum Tunes', 'Duophonic', 'CD');", + ) + .unwrap(); + drop(conn); + + Command::cargo_bin("wikidata-cache") + .unwrap() + .arg("import-wxyc-library") + .arg("--library-db") + .arg(&library_db_path) + .arg("--database-url") + .arg(TEST_DB_URL) + .arg("--snapshot-source") + .arg("backend") + .assert() + .success(); + + let mut client = test_client(); + let count: i64 = client + .query_one("SELECT COUNT(*) FROM wxyc_library", &[]) + .unwrap() + .get(0); + assert_eq!(count, 2, "import-wxyc-library should have written 2 rows"); + + // Spot-check normalization made it through the pipe end-to-end. + let norm_artist: String = client + .query_one( + "SELECT norm_artist FROM wxyc_library WHERE library_id = 101", + &[], + ) + .unwrap() + .get(0); + assert_eq!(norm_artist, "juana molina"); + + // Reject path: invalid --snapshot-source must fail before any write. + Command::cargo_bin("wikidata-cache") + .unwrap() + .arg("import-wxyc-library") + .arg("--library-db") + .arg(&library_db_path) + .arg("--database-url") + .arg(TEST_DB_URL) + .arg("--snapshot-source") + .arg("bogus") + .assert() + .failure(); +} + +/// Regression test for the bulk-import-clobbers-wxyc_library bug. +/// +/// `truncate_all` is called by the CSV `import` subcommand on every run. +/// `wxyc_library` is loaded by the separate `import-wxyc-library` +/// subcommand and must NOT be wiped by a CSV import. Before the +/// `CSV_IMPORT_TABLES` / `ALL_TABLES` split, every CSV import would +/// silently truncate `wxyc_library`. This test pins the new contract. 
+#[test] +fn test_truncate_all_preserves_wxyc_library() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + // Seed both an `entity` row (CSV table; should be truncated) and a + // `wxyc_library` row (hook table; should survive). + client + .execute( + "INSERT INTO entity (qid, label, description, entity_type) VALUES ($1, $2, $3, $4)", + &[&"Q1", &"sentinel", &"sentinel-desc", &"human"], + ) + .unwrap(); + client + .execute( + "INSERT INTO wxyc_library (\ + library_id, artist_name, album_title, \ + norm_artist, norm_title, snapshot_at, snapshot_source\ + ) VALUES ($1, $2, $3, $4, $5, NOW(), $6)", + &[ + &1_i32, + &"Sentinel Artist", + &"Sentinel Album", + &"sentinel artist", + &"sentinel album", + &"backend", + ], + ) + .unwrap(); + + import_schema::truncate_all(&mut client).unwrap(); + + let csv_count: i64 = client + .query_one("SELECT COUNT(*) FROM entity", &[]) + .unwrap() + .get(0); + assert_eq!(csv_count, 0, "truncate_all should clear CSV tables"); + + let hook_count: i64 = client + .query_one("SELECT COUNT(*) FROM wxyc_library", &[]) + .unwrap() + .get(0); + assert_eq!( + hook_count, 1, + "truncate_all MUST NOT touch wxyc_library — that table is owned \ + by the import-wxyc-library subcommand" + ); +} diff --git a/tests/wxyc_library_v2_test.rs b/tests/wxyc_library_v2_test.rs new file mode 100644 index 0000000..3cc746a --- /dev/null +++ b/tests/wxyc_library_v2_test.rs @@ -0,0 +1,445 @@ +//! Integration tests for the v2 `wxyc_library` hook (E1 §4.1.3 of +//! `plans/library-hook-canonicalization.md`). +//! +//! Validates the migration `0002_wxyc_library_v2.sql` (mirrored into +//! `schema/create_database.sql`) and the matching loader in +//! `src/wxyc_loader.rs`. Per the wiki §4.1.3, this cache is small and +//! schema-validation only — there is no in-repo legacy predecessor, so the +//! loader is verified against the input fixture's row count rather than a +//! parity comparator. 
Modeled after `discogs-etl` PR #185's 5-test suite. +//! +//! Like `tests/import_test.rs`, these tests are not gated behind `#[ignore]` +//! — they expect a PostgreSQL instance running on localhost:5435 with +//! user=wikidata, password=wikidata, dbname=wikidata_test +//! Start with: docker compose up -d +//! Run with: cargo test --test wxyc_library_v2_test + +use postgres::{Client, NoTls}; +use std::path::{Path, PathBuf}; +use std::sync::{Mutex, MutexGuard}; + +use wikidata_cache::import_schema; +use wikidata_cache::wxyc_loader::{ + ALLOWED_SNAPSHOT_SOURCES, NORMALIZER_NAME, populate_wxyc_library_v2, +}; + +const TEST_DB_URL: &str = + "host=localhost port=5435 user=wikidata password=wikidata dbname=wikidata_test"; + +/// Serialize all database tests on this binary to avoid race conditions on +/// the shared test database. Mirrors the pattern in `tests/import_test.rs`. +static DB_LOCK: Mutex<()> = Mutex::new(()); + +fn lock_db() -> MutexGuard<'static, ()> { + DB_LOCK.lock().unwrap_or_else(|e| e.into_inner()) +} + +fn test_client() -> Client { + Client::connect(TEST_DB_URL, NoTls) + .expect("Failed to connect to test database. Is `docker compose up -d` running?") +} + +/// Drop and re-create the schema (which now includes `wxyc_library`). +fn fresh_schema(client: &mut Client) { + import_schema::drop_schema(client).unwrap(); + import_schema::create_schema(client).unwrap(); +} + +/// Canonical fixture matching `discogs-etl/tests/integration/test_wxyc_library_v2.py`. +/// Uses WXYC-representative artists per the org-level CLAUDE.md "Example +/// Music Data" guidance. Row 6 (Nilüfer Yanya) carries the diacritic +/// canary — the normalizer pin test asserts ü is folded. 
+const FIXTURE_ROWS: &[(i64, &str, &str, &str, &str, &str)] = &[ + (1, "Juana Molina", "DOGA", "LP", "Sonamos", "Rock"), + ( + 2, + "Jessica Pratt", + "On Your Own Love Again", + "LP", + "Drag City", + "Rock", + ), + ( + 3, + "Chuquimamani-Condori", + "Edits", + "CD", + "self-released", + "Electronic", + ), + ( + 4, + "Duke Ellington & John Coltrane", + "Duke Ellington & John Coltrane", + "LP", + "Impulse Records", + "Jazz", + ), + (5, "Stereolab", "Aluminum Tunes", "CD", "Duophonic", "Rock"), + // Diacritic-bearing canonical name from `wxycCanonicalArtistNames`. + (6, "Nilüfer Yanya", "Painless", "LP", "ATO Records", "Rock"), +]; + +/// Build a tiny `library.db` SQLite file with the canonical fixture rows. +/// Returns the temp dir and the path so the dir lives long enough. +fn build_library_db(dir: &Path) -> PathBuf { + let db_path = dir.join("library.db"); + let conn = rusqlite::Connection::open(&db_path).unwrap(); + conn.execute_batch( + "CREATE TABLE library ( + id INTEGER PRIMARY KEY, + artist TEXT NOT NULL, + title TEXT NOT NULL, + format TEXT, + label TEXT, + genre TEXT + );", + ) + .unwrap(); + let mut stmt = conn + .prepare( + "INSERT INTO library (id, artist, title, format, label, genre) \ + VALUES (?1, ?2, ?3, ?4, ?5, ?6)", + ) + .unwrap(); + for (id, artist, title, format, label, genre) in FIXTURE_ROWS { + stmt.execute(rusqlite::params![id, artist, title, format, label, genre]) + .unwrap(); + } + db_path +} + +// --------------------------------------------------------------------------- +// 1. Schema lands all 8 indexes. +// --------------------------------------------------------------------------- + +#[test] +fn test_migration_creates_wxyc_library_with_indexes() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + // §3.1 names 8 indexes — pkey + 5 b-tree + 2 GIN trgm. The pkey index + // is auto-named `wxyc_library_pkey` by Postgres. 
+ let expected: &[&str] = &[ + "wxyc_library_pkey", + "wxyc_library_norm_artist_idx", + "wxyc_library_norm_title_idx", + "wxyc_library_artist_id_idx", + "wxyc_library_format_id_idx", + "wxyc_library_release_year_idx", + "wxyc_library_norm_artist_trgm_idx", + "wxyc_library_norm_title_trgm_idx", + ]; + + let rows = client + .query( + "SELECT indexname FROM pg_indexes \ + WHERE schemaname = 'public' AND tablename = 'wxyc_library'", + &[], + ) + .unwrap(); + let present: std::collections::HashSet = + rows.iter().map(|r| r.get::<_, String>(0)).collect(); + for idx in expected { + assert!( + present.contains(*idx), + "index '{idx}' missing after migration; present: {present:?}" + ); + } + assert_eq!( + present.len(), + expected.len(), + "expected exactly {} indexes on wxyc_library; got {}: {:?}", + expected.len(), + present.len(), + present + ); +} + +// --------------------------------------------------------------------------- +// 2. Loader writes every fixture row. +// --------------------------------------------------------------------------- + +#[test] +fn test_v2_loader_writes_every_fixture_row() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + let tmp = tempfile::tempdir().unwrap(); + let library_db = build_library_db(tmp.path()); + + let attempted = populate_wxyc_library_v2(&mut client, &library_db, "backend").unwrap(); + assert_eq!(attempted as usize, FIXTURE_ROWS.len()); + + let count: i64 = client + .query_one("SELECT COUNT(*) FROM wxyc_library", &[]) + .unwrap() + .get(0); + assert_eq!(count as usize, FIXTURE_ROWS.len()); + + // Every fixture library_id must be present, with populated norm_artist / + // norm_title and snapshot_source = 'backend'. 
+ for (id, _, _, _, _, _) in FIXTURE_ROWS { + let row = client + .query_one( + "SELECT artist_name, album_title, norm_artist, norm_title, snapshot_source \ + FROM wxyc_library WHERE library_id = $1", + &[&(*id as i32)], + ) + .unwrap(); + let artist_name: &str = row.get(0); + let album_title: &str = row.get(1); + let norm_artist: &str = row.get(2); + let norm_title: &str = row.get(3); + let snapshot_source: &str = row.get(4); + assert!(!artist_name.is_empty(), "artist_name empty for id={id}"); + assert!(!album_title.is_empty(), "album_title empty for id={id}"); + assert!(!norm_artist.is_empty(), "norm_artist empty for id={id}"); + assert!(!norm_title.is_empty(), "norm_title empty for id={id}"); + assert_eq!(snapshot_source, "backend", "wrong source for id={id}"); + } +} + +// --------------------------------------------------------------------------- +// 3. Loader is idempotent on re-run. +// --------------------------------------------------------------------------- + +#[test] +fn test_v2_loader_is_idempotent() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + let tmp = tempfile::tempdir().unwrap(); + let library_db = build_library_db(tmp.path()); + + // ON CONFLICT DO NOTHING means both calls report rows-attempted, not + // rows-inserted; idempotency is observable in COUNT(*). + let first = populate_wxyc_library_v2(&mut client, &library_db, "backend").unwrap(); + let second = populate_wxyc_library_v2(&mut client, &library_db, "backend").unwrap(); + assert_eq!(first, second); + assert_eq!(first as usize, FIXTURE_ROWS.len()); + + let count: i64 = client + .query_one("SELECT COUNT(*) FROM wxyc_library", &[]) + .unwrap() + .get(0); + assert_eq!(count as usize, FIXTURE_ROWS.len()); +} + +// --------------------------------------------------------------------------- +// 4. Loader rejects invalid snapshot_source (mirrors §3.1 CHECK constraint). 
+// --------------------------------------------------------------------------- + +#[test] +fn test_v2_loader_rejects_invalid_snapshot_source() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + let tmp = tempfile::tempdir().unwrap(); + let library_db = build_library_db(tmp.path()); + + let err = populate_wxyc_library_v2(&mut client, &library_db, "bogus").unwrap_err(); + let msg = format!("{err}"); + assert!( + msg.contains("snapshot_source"), + "error message should mention snapshot_source; got: {msg}" + ); + + // Sanity: the constant pinning what's allowed. + assert_eq!(ALLOWED_SNAPSHOT_SOURCES, &["backend", "tubafrenzy", "llm"]); +} + +// --------------------------------------------------------------------------- +// 5. Normalizer is locked to `to_identity_match_form` (no algorithm drift). +// --------------------------------------------------------------------------- + +#[test] +fn test_normalizer_is_to_identity_match_form() { + let _lock = lock_db(); + let mut client = test_client(); + fresh_schema(&mut client); + + let tmp = tempfile::tempdir().unwrap(); + let library_db = build_library_db(tmp.path()); + + populate_wxyc_library_v2(&mut client, &library_db, "backend").unwrap(); + + // The audit string names the locked-on baseline. + assert_eq!(NORMALIZER_NAME, "wxyc_etl::text::to_identity_match_form"); + + // Hard-coded value pin: catches algorithm drift in `wxyc-etl`. Library + // row 1 is "Juana Molina" / "DOGA" / "Sonamos" — no diacritics, no + // leading articles, just lowercasing. 
+ let row = client + .query_one( + "SELECT norm_artist, norm_title, norm_label \ + FROM wxyc_library WHERE library_id = 1", + &[], + ) + .unwrap(); + let norm_artist: &str = row.get(0); + let norm_title: &str = row.get(1); + let norm_label: &str = row.get(2); + assert_eq!(norm_artist, "juana molina"); + assert_eq!(norm_title, "doga"); + assert_eq!(norm_label, "sonamos"); + + // Equality with the canonical functions — robust to other normalization + // changes that don't affect these particular inputs. + assert_eq!( + norm_artist, + wxyc_etl::text::to_identity_match_form("Juana Molina") + ); + assert_eq!( + norm_title, + wxyc_etl::text::to_identity_match_form_title("DOGA") + ); + assert_eq!( + norm_label, + wxyc_etl::text::to_identity_match_form("Sonamos") + ); + + // Diacritic-fold pin: row 6 has ü which must fold to u in storage. + let row = client + .query_one( + "SELECT norm_artist FROM wxyc_library WHERE library_id = 6", + &[], + ) + .unwrap(); + let norm_a_diacritic: &str = row.get(0); + assert_eq!( + norm_a_diacritic, "nilufer yanya", + "Nilüfer Yanya did not fold to ASCII as expected: {norm_a_diacritic:?}" + ); + assert!( + !norm_a_diacritic.contains('ü'), + "diacritic survived normalization: {norm_a_diacritic:?}" + ); +} + +/// Regression test: a library.db row with an empty `artist` or `title` +/// must be rejected by the loader before it lands in `wxyc_library`. +/// +/// Postgres `NOT NULL` rejects SQL NULL but NOT empty strings; without an +/// explicit guard, an empty input would silently land with empty norm +/// columns and defeat downstream NULL-aware joins. Pin the loud-failure +/// behavior so a future refactor can't silently regress it. 
#[test]
fn test_loader_rejects_empty_artist_or_title() {
    let _lock = lock_db();
    let mut client = test_client();
    fresh_schema(&mut client);

    // Hand-built library.db holding a single row whose artist is "".
    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("library.db");
    let conn = rusqlite::Connection::open(&db_path).unwrap();
    conn.execute_batch(
        "CREATE TABLE library (\
         id INTEGER PRIMARY KEY, \
         artist TEXT NOT NULL, \
         title TEXT NOT NULL\
         );",
    )
    .unwrap();
    // Empty artist string — must be rejected.
    conn.execute(
        "INSERT INTO library (id, artist, title) VALUES (?, ?, ?)",
        rusqlite::params![1_i64, "", "Some Title"],
    )
    .unwrap();
    // Close the SQLite handle before the loader opens the same file.
    drop(conn);

    let err = populate_wxyc_library_v2(&mut client, &db_path, "backend").unwrap_err();
    let msg = format!("{err}");
    assert!(
        msg.contains("artist_name or album_title is empty"),
        "expected empty-input error, got: {msg}"
    );

    // No rows should have made it into the table.
    let count: i64 = client
        .query_one("SELECT COUNT(*) FROM wxyc_library", &[])
        .unwrap()
        .get(0);
    assert_eq!(count, 0, "loader must not write any rows when bailing");
}

/// Regression test: NUL bytes (U+0000) in source strings must be stripped
/// from EVERY TEXT column the loader writes — including the derived
/// `norm_artist` / `norm_title` / `norm_label` columns.
///
/// PostgreSQL TEXT cannot store `\0`; an unstripped NUL on any column would
/// crash the INSERT. An earlier version of the loader normalized BEFORE
/// stripping, which left NUL bytes intact in the norm columns; the strip
/// now happens first, and the norms derive from the cleaned strings.
#[test]
fn test_loader_strips_nul_bytes_from_norm_columns() {
    let _lock = lock_db();
    let mut client = test_client();
    fresh_schema(&mut client);

    // library.db with NUL bytes embedded in artist/title/label.
    let tmp = tempfile::TempDir::new().unwrap();
    let db_path = tmp.path().join("library.db");
    let conn = rusqlite::Connection::open(&db_path).unwrap();
    conn.execute_batch(
        "CREATE TABLE library (\
         id INTEGER PRIMARY KEY, \
         artist TEXT NOT NULL, \
         title TEXT NOT NULL, \
         label TEXT\
         );",
    )
    .unwrap();
    conn.execute(
        "INSERT INTO library (id, artist, title, label) VALUES (?, ?, ?, ?)",
        rusqlite::params![42_i64, "Juana\0 Molina", "DO\0GA", "Sona\0mos",],
    )
    .unwrap();
    // Close the SQLite handle before the loader opens the same file.
    drop(conn);

    let written = populate_wxyc_library_v2(&mut client, &db_path, "backend").unwrap();
    assert_eq!(written, 1, "loader should attempt 1 row");

    // Read back the display columns and the derived norm columns together.
    let row = client
        .query_one(
            "SELECT artist_name, album_title, label_name, \
             norm_artist, norm_title, norm_label \
             FROM wxyc_library WHERE library_id = 42",
            &[],
        )
        .unwrap();
    let artist_name: String = row.get(0);
    let album_title: String = row.get(1);
    // label_name / norm_label are read as `Option<String>` (the migration
    // declares both columns without NOT NULL).
    let label_name: Option<String> = row.get(2);
    let norm_artist: String = row.get(3);
    let norm_title: String = row.get(4);
    let norm_label: Option<String> = row.get(5);

    // Every element is an explicit `&str`, so the array type does not depend
    // on coercion between `&String` and `&str`.
    for col in [
        artist_name.as_str(),
        album_title.as_str(),
        label_name.as_deref().unwrap(),
        norm_artist.as_str(),
        norm_title.as_str(),
        norm_label.as_deref().unwrap(),
    ] {
        assert!(
            !col.contains('\0'),
            "NUL byte survived in column value {col:?} — every TEXT column must be stripped, including derived norm columns"
        );
    }

    // Sanity-check the cleaned forms.
    assert_eq!(artist_name, "Juana Molina");
    assert_eq!(album_title, "DOGA");
    assert_eq!(label_name.as_deref(), Some("Sonamos"));
    assert_eq!(norm_artist, "juana molina");
    assert_eq!(norm_title, "doga");
    assert_eq!(norm_label.as_deref(), Some("sonamos"));
}