diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8caaaef..b13b872 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -46,6 +46,27 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@stable
+      - name: Install wxyc_unaccent rules into the PG service container
+        # Required by migrations/0003_wxyc_identity_match_functions.sql.
+        # Alpine layout: /usr/local/share/postgresql/tsearch_data.
+        run: |
+          PG_CID=$(docker ps -q --filter ancestor=postgres:16-alpine)
+          if [ -z "$PG_CID" ]; then
+            echo "no postgres:16-alpine service container running" >&2
+            exit 1
+          fi
+          # `docker ps -q` prints one container ID per line. With more than one
+          # match, "$PG_CID:<path>" is not a usable `docker cp` target, so stop
+          # here with a clear message instead of a confusing docker error.
+          if [ "$(printf '%s\n' "$PG_CID" | wc -l)" -ne 1 ]; then
+            echo "expected exactly one postgres:16-alpine container, got:" >&2
+            printf '%s\n' "$PG_CID" >&2
+            exit 1
+          fi
+          docker cp vendor/wxyc-etl/wxyc_unaccent.rules \
+            "$PG_CID:/usr/local/share/postgresql/tsearch_data/wxyc_unaccent.rules"
+          docker cp vendor/wxyc-etl/wxyc_unaccent.version \
+            "$PG_CID:/usr/local/share/postgresql/tsearch_data/wxyc_unaccent.version"
       - name: Run PostgreSQL integration tests
         env:
           TEST_DATABASE_URL: postgresql://wikidata:wikidata@localhost:5435/wikidata_test
@@ -54,3 +75,7 @@ jobs:
         env:
           TEST_DATABASE_URL: postgresql://wikidata:wikidata@localhost:5435/wikidata_test
         run: cargo test --test charset_torture -- --test-threads=1 --include-ignored
+      - name: Run wxyc_identity_match_* parity test
+        env:
+          TEST_DATABASE_URL: postgresql://wikidata:wikidata@localhost:5435/wikidata_test
+        run: cargo test --test wxyc_identity_match_parity_test -- --ignored --test-threads=1
diff --git a/CLAUDE.md b/CLAUDE.md
index 7ac4524..5f13d11 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -94,6 +94,10 @@ The full rebuild (`build` then `import --fresh`) is scheduled via `.github/workf
 **Runner-capacity caveat:** the Wikidata JSON dump is roughly 130GB gzipped and a full rebuild can take many hours. GitHub-hosted `ubuntu-latest` runners have a 6-hour job timeout and only ~14GB of free disk, so the scheduled run will likely fail on disk or timeout. 
The workflow is intentionally a scheduling skeleton — the actual rebuild needs to migrate to a self-hosted runner, a Railway job, or a dedicated EC2 box. Until then, treat the `workflow_dispatch` trigger as the supported path (e.g., for small-dump smoke tests) and run real rebuilds out-of-band. +## wxyc-etl vendoring (cross-cache-identity) + +This repo deploys the `wxyc_identity_match_*` plpgsql function family for cross-cache identity matching. The canonical sources live in WXYC/wxyc-etl@v0.4.0 (`data/`); we vendor byte-for-byte copies under `vendor/wxyc-etl/` (top-level `data/` would conflict with `--data-dir` runtime use). SHA-pinned in `wxyc-etl-pin.txt`. Migration `migrations/0003_wxyc_identity_match_functions.sql` is a sqlx-cli-friendly wrapper that sets up the `wxyc_unaccent` text-search dictionary and then inlines the canonical SQL. The parity test in `tests/wxyc_identity_match_parity_test.rs` enforces pin freshness, migration-vs-canonical byte-equality, and PG-side fixture parity (252 rows + idempotence). + ## Migrations Schema changes ship as numbered SQL files under `migrations/`, applied with [sqlx-cli](https://crates.io/crates/sqlx-cli). The baseline `migrations/0001_initial.sql` mirrors `schema/create_database.sql`. 
diff --git a/Cargo.lock b/Cargo.lock index eeda300..4ed0793 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -168,6 +168,15 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + [[package]] name = "block-buffer" version = "0.12.0" @@ -244,7 +253,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.3.0", "rand_core 0.10.1", ] @@ -306,6 +315,15 @@ version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + [[package]] name = "cpufeatures" version = "0.3.0" @@ -358,6 +376,16 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + [[package]] name = "crypto-common" version = "0.2.1" @@ -434,15 +462,25 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" +[[package]] +name = 
"digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer 0.10.4", + "crypto-common 0.1.7", +] + [[package]] name = "digest" version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4850db49bf08e663084f7fb5c87d202ef91a3907271aff24a94eb97ff039153c" dependencies = [ - "block-buffer", + "block-buffer 0.12.0", "const-oid", - "crypto-common", + "crypto-common 0.2.1", "ctutils", ] @@ -625,6 +663,16 @@ dependencies = [ "slab", ] +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -723,7 +771,7 @@ version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" dependencies = [ - "digest", + "digest 0.11.2", ] [[package]] @@ -1122,7 +1170,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" dependencies = [ "cfg-if", - "digest", + "digest 0.11.2", ] [[package]] @@ -1511,7 +1559,7 @@ dependencies = [ "md-5", "memchr", "rand 0.10.1", - "sha2", + "sha2 0.11.0", "stringprep", ] @@ -2158,6 +2206,17 @@ dependencies = [ "serde", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + [[package]] name = "sha2" version = "0.11.0" @@ -2165,8 +2224,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.3.0", + "digest 0.11.2", ] [[package]] @@ -2938,6 +2997,7 @@ dependencies = [ "rusqlite", "serde", "serde_json", + "sha2 0.10.9", "tempfile", "tracing", "wxyc-etl", @@ -3207,9 +3267,9 @@ checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" [[package]] name = "wxyc-etl" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9f4f32d293cb6ab6d935ebfb343dd0581f0e2592c69e85612c78c05a171339b" +checksum = "a8692087abc2998a8ef2f1a872059b0ff8bbd2e9da6c75e108fa86cc752dee9e" dependencies = [ "anyhow", "clap", diff --git a/Cargo.toml b/Cargo.toml index 128779b..176009f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ csv = "1.3" anyhow = "1" log = "0.4" tracing = "0.1" -wxyc-etl = "0.3.0" +wxyc-etl = "0.4.0" postgres = "0.19" rusqlite = { version = "0.31", features = ["bundled"] } @@ -27,6 +27,7 @@ tempfile = "3" postgres = "0.19" pretty_assertions = "1" csv = "1.3" +sha2 = "0.10" [profile.release] lto = true diff --git a/migrations/0003_wxyc_identity_match_functions.sql b/migrations/0003_wxyc_identity_match_functions.sql new file mode 100644 index 0000000..f237c88 --- /dev/null +++ b/migrations/0003_wxyc_identity_match_functions.sql @@ -0,0 +1,328 @@ +-- Cross-cache-identity match form: Postgres analog of +-- `wxyc_etl::text::to_identity_match_form` and its three siblings. +-- +-- Vendored from WXYC/wxyc-etl@v0.4.0 — see `wxyc-etl-pin.txt` at repo root. +-- Refresh by re-vendoring the three files under `vendor/wxyc-etl/`, then +-- re-pasting `wxyc_identity_match_functions.sql` after the @BEGIN CANONICAL +-- marker below. The parity test checks SHAs + byte-equality after the +-- sentinel; it does not regenerate the wrapper automatically. 
+-- +-- The DROP+CREATE on the wxyc_unaccent dictionary below is safe today +-- because no other migration depends on it. If a future migration adds a +-- functional index or generated column over `unaccent('wxyc_unaccent', ...)`, +-- this re-apply will fail without CASCADE — coordinate the rules-file +-- refresh with that dependency at the time it lands. +-- +-- Spec: WXYC/wiki `plans/library-hook-canonicalization.md` §3.3.5. + +CREATE EXTENSION IF NOT EXISTS unaccent; + +DROP TEXT SEARCH DICTIONARY IF EXISTS wxyc_unaccent; +CREATE TEXT SEARCH DICTIONARY wxyc_unaccent ( + TEMPLATE = unaccent, + RULES = 'wxyc_unaccent' +); + +-- @BEGIN CANONICAL BODY (do not edit; vendored from wxyc-etl) +-- Canonical SQL implementation of the cross-cache-identity match form. +-- +-- Vendored verbatim into every cache repo (discogs-etl, musicbrainz-cache, +-- wikidata-cache) and Backend-Service. The four function bodies must produce +-- byte-identical output to the corresponding Rust entry points in +-- `wxyc_etl::text::identity`: +-- +-- wxyc_identity_match_artist <-> to_identity_match_form +-- wxyc_identity_match_title <-> to_identity_match_form_title +-- wxyc_identity_match_with_punctuation <-> to_identity_match_form_with_punctuation +-- wxyc_identity_match_with_disambiguator_strip +-- <-> to_identity_match_form_with_disambiguator_strip +-- +-- Parity is asserted by `wxyc-etl/tests/postgres_parity_test.rs` against the +-- 252-row fixture in `wxyc-etl/tests/fixtures/identity_normalization_cases.csv`. +-- +-- Required Postgres version: 16+ (Unicode property classes, `normalize()`, +-- stable regex behavior). Required extension: `unaccent` configured with the +-- `wxyc_unaccent` text-search dictionary installed from +-- `data/wxyc_unaccent.rules`. +-- +-- Vendoring contract: each consumer carries `wxyc-etl-pin.txt` recording the +-- SHA-256 of `data/wxyc_unaccent.rules` and the version header read from the +-- file's first comment line. Mismatch fails CI. 
See
+-- `wxyc-etl/docs/postgres-analog-vendoring.md`.
+
+-- Abort the migration on servers older than Postgres 16.
+DO $$
+BEGIN
+    IF current_setting('server_version_num')::int < 160000 THEN
+        RAISE EXCEPTION 'wxyc identity-match functions require Postgres 16+; got %',
+            current_setting('server_version');
+    END IF;
+END $$;
+
+-- The wxyc_unaccent dictionary must be created before this file loads.
+-- Consumer migrations do:
+--   CREATE EXTENSION IF NOT EXISTS unaccent;
+--   CREATE TEXT SEARCH DICTIONARY wxyc_unaccent (
+--     TEMPLATE = unaccent, RULES = 'wxyc_unaccent'
+--   );
+-- followed by the rules-file SHA verification block (see vendoring docs).
+
+-- ---------------------------------------------------------------------------
+-- Base match-form pipeline.
+--
+-- Mirror of `wxyc_etl::text::to_match_form` after the storage-form pass
+-- (no mojibake repair — callers responsible for storing pre-cleaned bytes).
+-- Pipeline:
+--   normalize NFKC -> lower -> wxyc_unaccent dictionary -> strip-Cf-except-ZWJ
+--   -> collapse-ASCII-space + trim.
+--
+-- Parameter: s — raw text; NULL is returned as NULL.
+-- Returns:   the match form of s.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_match_form(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    r text;
+    cf_pattern text;
+BEGIN
+    IF s IS NULL THEN RETURN NULL; END IF;
+    r := normalize(s, NFKC);
+    r := lower(r);
+    r := unaccent('wxyc_unaccent', r);
+    -- Strip Cf (format) characters except U+200D ZWJ (emoji integrity), matching
+    -- `strip_cf_except_zwj` in the Rust pipeline. Postgres regex has no
+    -- `\p{Cf}` and no char-class subtraction; build the class from explicit
+    -- BMP Cf codepoints split around ZWJ. Supplementary-plane Cf (U+E0001 etc.)
+    -- is rare in music-catalog data and intentionally not handled here.
+    --
+    -- ZWJ is chr(8205); it is kept simply by never appearing in the class (the
+    -- U+200B-U+200C and U+200E-U+200F ranges stop either side of it).
+    --
+    -- NOTE(review): U+0600-U+0605 and U+0890-U+0891 are General_Category=Cf in
+    -- the BMP and were missing from this list. They are added here on the
+    -- assumption that the Rust side strips every Cf code point — confirm
+    -- against `strip_cf_except_zwj`. These two entries are not in the upstream
+    -- wxyc-etl text this file is vendored from: land the same change upstream
+    -- and re-vendor, otherwise the byte-equality parity check will flag it.
+    -- NOTE(review): the U+2060-U+206F range also covers U+2065, which is
+    -- currently unassigned rather than Cf; left as-is.
+    cf_pattern :=
+        '['
+        || chr(173)                          -- U+00AD soft hyphen
+        || chr(1536) || '-' || chr(1541)     -- U+0600-U+0605 Arabic number signs
+        || chr(1564)                         -- U+061C ALM
+        || chr(1757)                         -- U+06DD ARABIC END OF AYAH
+        || chr(1807)                         -- U+070F SYRIAC ABBREV MARK
+        || chr(2192) || '-' || chr(2193)     -- U+0890-U+0891 Arabic currency marks above
+        || chr(2274)                         -- U+08E2 ARABIC DISPUTED END OF AYAH
+        || chr(6158)                         -- U+180E MONG VOWEL SEP
+        || chr(8203) || '-' || chr(8204)     -- U+200B-U+200C (200D ZWJ skipped)
+        || chr(8206) || '-' || chr(8207)     -- U+200E-U+200F
+        || chr(8234) || '-' || chr(8238)     -- U+202A-U+202E
+        || chr(8288) || '-' || chr(8303)     -- U+2060-U+206F
+        || chr(65279)                        -- U+FEFF BOM
+        || chr(65529) || '-' || chr(65531)   -- U+FFF9-U+FFFB
+        || ']';
+    -- ZWJ is excluded from the class above, so no placeholder swap needed.
+    r := regexp_replace(r, cf_pattern, '', 'g');
+    -- Collapse runs of ASCII space + trim. Other whitespace (TAB etc.) preserved.
+    r := regexp_replace(r, ' +', ' ', 'g');
+    r := regexp_replace(r, '^ | $', '', 'g');
+    RETURN r;
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Helper: strip a single trailing (...) or [...] group.
+--
+-- Mirror of `strip_trailing_parens` in `wxyc_etl::text::identity`. Returns
+-- input unchanged when: no trailing close-bracket, brackets unbalanced,
+-- or the matching open is at position 0 (would reduce stem to empty).
+-- One pass only.
+--
+-- Parameter: s — text to inspect; NULL is returned as NULL.
+-- Returns:   s without its trailing bracket group and the ASCII spaces before
+--            it, or s itself when nothing is stripped.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_strip_trailing_parens(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    trimmed text;
+    open_chr char;
+    close_chr char;
+    ch char;
+    depth int := 0;
+    open_idx int := -1;
+    i int;
+    stem text;
+BEGIN
+    IF s IS NULL THEN RETURN NULL; END IF;
+    -- Only trailing ASCII spaces are ignored when looking for the closer.
+    trimmed := regexp_replace(s, ' +$', '');
+    IF length(trimmed) = 0 THEN RETURN s; END IF;
+    -- The last character decides which bracket pair is tracked.
+    ch := right(trimmed, 1);
+    IF ch = ')' THEN
+        open_chr := '('; close_chr := ')';
+    ELSIF ch = ']' THEN
+        open_chr := '['; close_chr := ']';
+    ELSE
+        RETURN s;
+    END IF;
+    -- Scan right-to-left for the matching open.
+    FOR i IN REVERSE length(trimmed)..1 LOOP
+        ch := substr(trimmed, i, 1);
+        IF ch = close_chr THEN
+            depth := depth + 1;
+        ELSIF ch = open_chr THEN
+            depth := depth - 1;
+            IF depth = 0 THEN
+                open_idx := i;
+                EXIT;
+            END IF;
+        END IF;
+    END LOOP;
+    -- substr() is 1-based, so open_idx = 1 is the "position 0" case above.
+    IF open_idx < 0 OR open_idx = 1 THEN
+        -- Unbalanced or full-string brackets — preserve.
+        RETURN s;
+    END IF;
+    stem := substr(trimmed, 1, open_idx - 1);
+    stem := regexp_replace(stem, ' +$', '');
+    RETURN stem;
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Helper: drop a leading article or trailing comma-form article.
+--
+-- Mirror of `drop_articles` in `wxyc_etl::text::identity`. At most one
+-- match is consumed. The leading form requires the article followed by
+-- ASCII space (`the `, `a `, `an `); `theater` does not match. The comma
+-- form requires `, the` / `, a` / `, an` at end-of-string with a
+-- non-empty stem; `Beatles, the Best Of` does not match.
+--
+-- Parameter: s — text to inspect; NULL is returned as NULL. The comparisons
+--            are case-sensitive against lowercase articles.
+-- Returns:   s without the matched article, or s itself when none matches.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_drop_articles(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    art text;
+    stripped text;
+BEGIN
+    IF s IS NULL THEN RETURN NULL; END IF;
+    -- Leading form is tried first; the first hit returns immediately.
+    FOREACH art IN ARRAY ARRAY['the ', 'a ', 'an '] LOOP
+        IF starts_with(s, art) THEN
+            RETURN substr(s, length(art) + 1);
+        END IF;
+    END LOOP;
+    FOREACH art IN ARRAY ARRAY[', the', ', a', ', an'] LOOP
+        -- Suffix check via `right()` rather than `LIKE '%' || art` so a future
+        -- article containing `%` or `_` doesn't trigger wildcard semantics.
+        IF length(s) >= length(art) AND right(s, length(art)) = art THEN
+            stripped := substr(s, 1, length(s) - length(art));
+            IF length(stripped) > 0 THEN
+                RETURN stripped;
+            END IF;
+        END IF;
+    END LOOP;
+    RETURN s;
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Helper: identity baseline (steps 4 + 5).
+--
+-- Mirror of `identity_baseline` in `wxyc_etl::text::identity`. The shared
+-- body of artist + title entry points.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_identity_baseline(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    stem text;
+BEGIN
+    -- NULL in, NULL out.
+    IF s IS NULL THEN
+        RETURN NULL;
+    END IF;
+    -- Match form first, then the trailing-group strip, then the article drop.
+    stem := wxyc_drop_articles(wxyc_strip_trailing_parens(wxyc_match_form(s)));
+    -- Collapse ASCII-space runs again, then remove one leading/trailing space.
+    RETURN regexp_replace(regexp_replace(stem, ' +', ' ', 'g'), '^ | $', '', 'g');
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Public entry point for artist names.
+-- Counterpart of `wxyc_etl::text::to_identity_match_form`; delegates to
+-- wxyc_identity_baseline without further processing.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_identity_match_artist(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    matched text := wxyc_identity_baseline(s);
+BEGIN
+    RETURN matched;
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Public entry point for titles.
+-- Counterpart of `wxyc_etl::text::to_identity_match_form_title`. Currently the
+-- same delegation as the artist entry point; kept as its own function so call
+-- sites state which kind of value they pass, and so a later step-6 promotion
+-- for artists would not silently alter titles that gain nothing from it
+-- (`Side A/2` etc.).
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_identity_match_title(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    matched text := wxyc_identity_baseline(s);
+BEGIN
+    RETURN matched;
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Public entry point: identity match + opt-in punctuation collapse (step 6).
+-- Mirror of `wxyc_etl::text::to_identity_match_form_with_punctuation`.
+-- Each run of one-or-more non-letter, non-number, non-whitespace codepoints
+-- becomes a single ASCII space; result is re-collapsed and re-trimmed.
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_identity_match_with_punctuation(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    normalized text;
+BEGIN
+    -- NULL in, NULL out.
+    IF s IS NULL THEN
+        RETURN NULL;
+    END IF;
+    -- Same three helpers as the baseline, in the same order.
+    normalized := wxyc_drop_articles(wxyc_strip_trailing_parens(wxyc_match_form(s)));
+    -- Step 6: every run of characters outside the POSIX alpha/digit/space
+    -- classes becomes one ASCII space. Postgres regex has no `\p{L}`, so the
+    -- POSIX classes stand in for it.
+    -- NOTE(review): this assumes the database locale classifies non-ASCII
+    -- letters as [:alpha:] (the fixture expects Greek, Cyrillic and Han text to
+    -- survive) — confirm for every deployment locale.
+    -- NOTE(review): whether [:digit:] covers non-ASCII digits depends on the
+    -- server's locale provider — confirm it agrees with the Rust number class.
+    normalized := regexp_replace(normalized, '[^[:alpha:][:digit:][:space:]]+', ' ', 'g');
+    -- Collapse ASCII-space runs, then remove one leading/trailing space.
+    RETURN regexp_replace(regexp_replace(normalized, ' +', ' ', 'g'), '^ | $', '', 'g');
+END
+$$;
+
+-- ---------------------------------------------------------------------------
+-- Public entry point: identity match + opt-in `/N` disambiguator strip (step 8).
+-- Mirror of `wxyc_etl::text::to_identity_match_form_with_disambiguator_strip`.
+--
+-- Artists only. The leading whitespace before `/` is REQUIRED (`John Smith /1`
+-- strips; `Track 1/12` does not — matches Rust's `\s+/\d+$` not `\s*`).
+-- ---------------------------------------------------------------------------
+CREATE OR REPLACE FUNCTION wxyc_identity_match_with_disambiguator_strip(s text)
+    RETURNS text
+    LANGUAGE plpgsql
+    IMMUTABLE
+    PARALLEL SAFE
+AS $$
+DECLARE
+    r text;
+BEGIN
+    -- Parameter: s — artist name; NULL is returned as NULL.
+    -- Returns:   the identity baseline of s with one trailing
+    --            "<whitespace>/<digits>" disambiguator removed, if present.
+    IF s IS NULL THEN RETURN NULL; END IF;
+    r := wxyc_identity_baseline(s);
+    -- Use `\s+`, the pattern documented for the Rust side, rather than ' +'.
+    -- The baseline only collapses and trims ASCII spaces, so a TAB (or other
+    -- non-space whitespace) in front of `/N` can reach this point; with ' +'
+    -- such input kept its disambiguator. At least one whitespace character
+    -- is still required, so `track 1/12` is left alone.
+    -- NOTE(review): this line differs from the upstream wxyc-etl text this file
+    -- is vendored from. Land the same change upstream and re-vendor, otherwise
+    -- the byte-equality parity check will flag this file.
+    r := regexp_replace(r, '\s+/\d+$', '');
+    RETURN r;
+END
+$$;
diff --git a/tests/fixtures/identity_normalization_cases.csv b/tests/fixtures/identity_normalization_cases.csv
new file mode 100644
index 0000000..2652753
--- /dev/null
+++ b/tests/fixtures/identity_normalization_cases.csv
@@ -0,0 +1,274 @@
+input,expected,variant,category,notes
+# Categorical parity matrix for plan §3.3.2 steps 4-6 + 8.
+# Variant column: which function to call.
+# base → to_identity_match_form
+# title → to_identity_match_form_title
+# punct → to_identity_match_form_with_punctuation
+# disamb → to_identity_match_form_with_disambiguator_strip
+# Category column: which §3.3.2 step or behavior is exercised. Empty notes
+# column is allowed; lowercase ASCII expected (to_match_form pre-pass).
+# Steps 1-3 + 7 coverage delegated to charset-torture.json (WX-1).
+ +# ===== trailing-parens-artist (50) ===== +Stereolab (Live),stereolab,base,trailing-parens-artist, +Cat Power (Remastered),cat power,base,trailing-parens-artist, +Animal Collective (Live at Lincoln Hall),animal collective,base,trailing-parens-artist, +Beach House (Demo),beach house,base,trailing-parens-artist, +Big Thief (Acoustic),big thief,base,trailing-parens-artist, +Aphex Twin (1992),aphex twin,base,trailing-parens-artist, +Autechre (Reissue),autechre,base,trailing-parens-artist, +Boards of Canada (Original Mix),boards of canada,base,trailing-parens-artist, +Broadcast (Live),broadcast,base,trailing-parens-artist, +Brian Eno (Ambient),brian eno,base,trailing-parens-artist, +Burial (12 Inch),burial,base,trailing-parens-artist, +Cocteau Twins (1984),cocteau twins,base,trailing-parens-artist, +Colin Stetson (Live),colin stetson,base,trailing-parens-artist, +Damien Jurado (Demo),damien jurado,base,trailing-parens-artist, +Deerhoof (Live in Tokyo),deerhoof,base,trailing-parens-artist, +Elliott Smith (Posthumous),elliott smith,base,trailing-parens-artist, +Father John Misty (Solo),father john misty,base,trailing-parens-artist, +Flying Lotus (Brainfeeder),flying lotus,base,trailing-parens-artist, +Four Tet (Remix),four tet,base,trailing-parens-artist, +Gil Scott-Heron (Live),gil scott-heron,base,trailing-parens-artist, +Hermanos Gutiérrez (En Vivo),hermanos gutierrez,base,trailing-parens-artist, +Jessica Pratt (Live in LA),jessica pratt,base,trailing-parens-artist, +Juana Molina (Acoustic),juana molina,base,trailing-parens-artist, +Stereolab [Live],stereolab,base,trailing-parens-artist,square brackets +Cat Power [Remastered],cat power,base,trailing-parens-artist,square brackets +Animal Collective [Demos],animal collective,base,trailing-parens-artist,square brackets +Beach House [Acoustic],beach house,base,trailing-parens-artist,square brackets +Big Thief [Live],big thief,base,trailing-parens-artist,square brackets +Aphex Twin [1992],aphex 
twin,base,trailing-parens-artist,square brackets +Autechre [Bonus],autechre,base,trailing-parens-artist,square brackets +Boards of Canada [Original Mix],boards of canada,base,trailing-parens-artist,square brackets +Broadcast [Live],broadcast,base,trailing-parens-artist,square brackets +Brian Eno [Ambient],brian eno,base,trailing-parens-artist,square brackets +Stereolab (Live),stereolab,base,trailing-parens-artist,doubled space before group → match-form collapses to single +Cat Power (1995),cat power,base,trailing-parens-artist,triple space before group +Stereolab(),stereolab,base,trailing-parens-artist,no space before parens +Cat Power(Live),cat power,base,trailing-parens-artist,no space before parens +Aphex Twin (selected ambient works),aphex twin,base,trailing-parens-artist,multi-word inside parens +Autechre (lp5),autechre,base,trailing-parens-artist, +Beach House (b-sides and rarities),beach house,base,trailing-parens-artist, +Big Thief (live at the brooklyn academy),big thief,base,trailing-parens-artist, +Stereolab (mars audiac quintet),stereolab,base,trailing-parens-artist, +Stereolab (Live (1999) Edition),stereolab,base,trailing-parens-artist,nested parens balanced +Cat Power (Sun (Remastered)),cat power,base,trailing-parens-artist,nested parens balanced +Beach House (Bloom (10th)),beach house,base,trailing-parens-artist,nested parens balanced +Stereolab ((Live)),stereolab,base,trailing-parens-artist,double-open +(Cat Power),(cat power),base,trailing-parens-artist,only-parens negative — preserved +[Stereolab],[stereolab],base,trailing-parens-artist,only-brackets negative — preserved +Autechre ),autechre ),base,trailing-parens-artist,unbalanced close — passes through +Stereolab (Live (1999),stereolab (live,base,trailing-parens-artist,unbalanced inner-open: outer scan strips only the (1999) +Broadcast (Live) [2003],broadcast (live),base,trailing-parens-artist,one pass — only outermost stripped + +# ===== trailing-parens-title (50) ===== +"What Time Is It, Mr. 
Fox? (Live)","what time is it, mr. fox?",title,trailing-parens-title,strip outer paren only — comma preserved +Sugar Hill (Acoustic),sugar hill,title,trailing-parens-title, +La Paradoja (Single Edit),la paradoja,title,trailing-parens-title, +"Back, Baby (Demo)","back, baby",title,trailing-parens-title,comma in title preserved +Call Your Name (Edits),call your name,title,trailing-parens-title, +In a Sentimental Mood (Live),in a sentimental mood,title,trailing-parens-title, +Aluminum Tunes (Vol 1),aluminum tunes,title,trailing-parens-title, +DOGA (Album Version),doga,title,trailing-parens-title, +On Your Own Love Again (Reissue),on your own love again,title,trailing-parens-title, +Edits (Bonus),edits,title,trailing-parens-title, +Wedding Song (Live),wedding song,title,trailing-parens-title, +Saturday (Acoustic),saturday,title,trailing-parens-title, +Blue Moon (1956),blue moon,title,trailing-parens-title, +The Birthday Party (1973),birthday party,title,trailing-parens-title,title variant runs full baseline (paren strip + article drop) +"Track 1, Side A (Live)","track 1, side a",title,trailing-parens-title,comma preserved +Sugar Hill [Demo],sugar hill,title,trailing-parens-title,square +La Paradoja [Reissue],la paradoja,title,trailing-parens-title,square +"Back, Baby [Acoustic]","back, baby",title,trailing-parens-title,square + comma +Call Your Name [Bonus],call your name,title,trailing-parens-title,square +In a Sentimental Mood [Live 1962],in a sentimental mood,title,trailing-parens-title,square +Sugar Hill (),sugar hill,title,trailing-parens-title,empty parens stripped +Sugar Hill ( ),sugar hill,title,trailing-parens-title,whitespace-only inside parens +DOGA (Edit),doga,title,trailing-parens-title,multi-space before parens collapsed by match-form +DOGA(Edit),doga,title,trailing-parens-title,no leading space +Edits (Vol 1) [2025],edits (vol 1),title,trailing-parens-title,one-pass — outermost only +Saturday (Take 1) (Take 2),saturday (take 
1),title,trailing-parens-title,one-pass +(Sugar Hill),(sugar hill),title,trailing-parens-title,only-parens preserved +[Sugar Hill],[sugar hill],title,trailing-parens-title,only-brackets preserved +Sugar Hill ),sugar hill ),title,trailing-parens-title,unbalanced close +Sugar Hill (Live (1999) Edition),sugar hill,title,trailing-parens-title,nested +Wedding Song (1971),wedding song,title,trailing-parens-title, +Wedding Song [1971],wedding song,title,trailing-parens-title, +Saturday Morning (Edit),saturday morning,title,trailing-parens-title, +Blue (Live in Berlin),blue,title,trailing-parens-title, +Blue [Live in Berlin],blue,title,trailing-parens-title, +Suite for Solo Cello (Bach),suite for solo cello,title,trailing-parens-title, +For Free (Joni Mitchell Cover),for free,title,trailing-parens-title, +For Free [Joni Mitchell Cover],for free,title,trailing-parens-title, +La Paradoja (Vol 2),la paradoja,title,trailing-parens-title, +La Paradoja II (Live),la paradoja ii,title,trailing-parens-title, +Edits (Compilation),edits,title,trailing-parens-title, +Edits Vol. 2 (Mini),edits vol. 2,title,trailing-parens-title, +Edits Vol. 2 [Mini],edits vol. 
2,title,trailing-parens-title, +First Light (Take 1),first light,title,trailing-parens-title, +First Light [Take 1],first light,title,trailing-parens-title, +First Light (Take 1) (Take 2),first light (take 1),title,trailing-parens-title, +First Light [Take 1] [Take 2],first light [take 1],title,trailing-parens-title, +The Cat in the Hat (Original),cat in the hat,title,trailing-parens-title,title runs full baseline — article dropped +The Sun Also Rises (Edit),sun also rises,title,trailing-parens-title,title runs full baseline — article dropped +The Goldberg Variations (Bach),goldberg variations,title,trailing-parens-title,title runs full baseline — article dropped + +# ===== leading-article (25) ===== +The Field,field,base,leading-article,canonical WXYC artist +A Tribe Called Quest,tribe called quest,base,leading-article,canonical WXYC artist +The Beatles,beatles,base,leading-article,classic example +A Place to Bury Strangers,place to bury strangers,base,leading-article, +An Albatross,albatross,base,leading-article, +The Microphones,microphones,base,leading-article, +A Silver Mt. Zion,silver mt. 
zion,base,leading-article, +The The,the,base,leading-article,only first article dropped +The Sun Also Rises,sun also rises,base,leading-article, +The Field (Live),field,base,leading-article,article + paren strip composed +A Tribe Called Quest (Demos),tribe called quest,base,leading-article,article + paren strip composed +The Microphones [1999],microphones,base,leading-article,bracket variant +"Field, The",field,base,leading-article,Discogs comma form +"Tribe Called Quest, A",tribe called quest,base,leading-article,Discogs comma form +"Albatross, An",albatross,base,leading-article,Discogs comma form +"Microphones, The",microphones,base,leading-article,Discogs comma form +"Sun Also Rises, The",sun also rises,base,leading-article,Discogs comma form +"Field, The (1995)",field,base,leading-article,comma + paren strip composed +"Tribe Called Quest, A [Demos]",tribe called quest,base,leading-article,comma + bracket strip composed +Theater of Tragedy,theater of tragedy,base,leading-article,prefix substring negative +Animal Collective,animal collective,base,leading-article,no article negative +Andy Human and the Reptoids,andy human and the reptoids,base,leading-article,article in middle preserved +"Beatles, the Best Of","beatles, the best of",base,leading-article,trailing comma not at end-of-string +Aphex Twin,aphex twin,base,leading-article,no article negative +"Stereolab, The",stereolab,base,leading-article,Discogs comma form + +# ===== punct (50) — to_identity_match_form_with_punctuation ===== +M.I.A.,m i a,punct,punct,classic dotted abbreviation +R.E.M.,r e m,punct,punct, +O.D.B.,o d b,punct,punct, ++/-,,punct,punct,reduces to empty +!!!,,punct,punct,reduces to empty +"10,000 Maniacs",10 000 maniacs,punct,punct,comma collapses +Godspeed You! 
Black Emperor,godspeed you black emperor,punct,punct, +Of Montreal,of montreal,punct,punct,no punctuation passthrough +Sigur Rós,sigur ros,punct,punct,no punctuation passthrough +Big Thief,big thief,punct,punct, +Dam-Funk,dam funk,punct,punct,hyphen collapses +Gil Scott-Heron,gil scott heron,punct,punct,hyphen collapses +"Crosby, Stills & Nash",crosby stills nash,punct,punct, +"Earth, Wind & Fire",earth wind fire,punct,punct, +"Tyler, the Creator",tyler the creator,punct,punct, +"Jay-Z",jay z,punct,punct, +"Diiv",diiv,punct,punct, +A.A. Bondy,a a bondy,punct,punct, +A.S.A.P. Rocky,a s a p rocky,punct,punct, +Cluster & Eno,cluster eno,punct,punct,ampersand collapses +"Tortoise & Bonnie 'Prince' Billy",tortoise bonnie prince billy,punct,punct,straight quotes collapse +Bonnie 'Prince' Billy,bonnie prince billy,punct,punct, +"D'Angelo",d angelo,punct,punct,apostrophe collapses +Don't Be a Stranger,don t be a stranger,punct,punct,apostrophe collapses +Sun O))),sun o,punct,punct,reduces to letters only — paren strip first removes nothing because no balanced group +Foo!Bar (Live),foo bar,punct,punct,paren strip then punct collapse +Foo!Bar [Demo],foo bar,punct,punct,paren strip then punct collapse +The M.I.A.,m i a,punct,punct,article drop then punct +"M.I.A., The",m i a,punct,punct,Discogs comma + punct collapse +Black Midi,black midi,punct,punct,no punctuation passthrough +Mount Eerie,mount eerie,punct,punct, +Aphex Twin,aphex twin,punct,punct, +Stereolab,stereolab,punct,punct, +Cat Power,cat power,punct,punct, +Beach House,beach house,punct,punct, +Big Thief,big thief,punct,punct, +Animal Collective,animal collective,punct,punct, +Boards of Canada,boards of canada,punct,punct, +Brian Eno,brian eno,punct,punct, +Broadcast,broadcast,punct,punct, +Burial,burial,punct,punct, +Cocteau Twins,cocteau twins,punct,punct, +Colin Stetson,colin stetson,punct,punct, +Csillagrablók,csillagrablok,punct,punct,Unicode letter survives +Στελλάς,στελλασ,punct,punct,Greek letter survives 
+Молчат Дома,молчат дома,punct,punct,Cyrillic letter survives +細野晴臣,細野晴臣,punct,punct,Han letters survive +Sigur Rós,sigur ros,punct,punct, +Hermanos Gutiérrez,hermanos gutierrez,punct,punct, +Aşıq Altay,asıq altay,punct,punct,Turkish dotless i survives + +# ===== disamb (50) — to_identity_match_form_with_disambiguator_strip ===== +John Smith /1,john smith,disamb,disamb, +John Smith /2,john smith,disamb,disamb, +Various /17,various,disamb,disamb, +Stereolab /1,stereolab,disamb,disamb, +Stereolab /17,stereolab,disamb,disamb, +Cat Power /3,cat power,disamb,disamb, +Big Thief /2,big thief,disamb,disamb, +Beach House /1,beach house,disamb,disamb, +Aphex Twin /1,aphex twin,disamb,disamb, +Autechre /2,autechre,disamb,disamb, +Boards of Canada /1,boards of canada,disamb,disamb, +Broadcast /2,broadcast,disamb,disamb, +Brian Eno /1,brian eno,disamb,disamb, +Burial /1,burial,disamb,disamb, +Cocteau Twins /1,cocteau twins,disamb,disamb, +Colin Stetson /1,colin stetson,disamb,disamb, +Damien Jurado /1,damien jurado,disamb,disamb, +Deerhoof /1,deerhoof,disamb,disamb, +Elliott Smith /1,elliott smith,disamb,disamb, +Father John Misty /1,father john misty,disamb,disamb, +Flying Lotus /1,flying lotus,disamb,disamb, +Four Tet /1,four tet,disamb,disamb, +Gil Scott-Heron /1,gil scott-heron,disamb,disamb, +Hermanos Gutiérrez /1,hermanos gutierrez,disamb,disamb, +Jessica Pratt /1,jessica pratt,disamb,disamb, +Juana Molina /1,juana molina,disamb,disamb, +The Beatles /1,beatles,disamb,disamb,article drop + disamb composed +A Tribe Called Quest /1,tribe called quest,disamb,disamb,article + disamb composed +"Beatles, The /1","beatles, the",disamb,disamb,Discogs comma form is masked by /1 in baseline; only /1 strips at the disamb step (comma form does not re-fire) +The Field /2 (Live),field,disamb,disamb,article + paren + disamb composed +The Microphones /3,microphones,disamb,disamb, +Track 1/12,track 1/12,disamb,disamb,no whitespace before slash — preserved +Side A/B,side 
a/b,disamb,disamb,trailing letter — preserved +Track 1/12345,track 1/12345,disamb,disamb,no whitespace +Section 4/5,section 4/5,disamb,disamb,no whitespace +1/4,1/4,disamb,disamb,bare digits +"Stereolab /17 (Live)",stereolab,disamb,disamb,paren first then disamb +Cat Power /3 [Live],cat power,disamb,disamb,bracket then disamb +"Cat Power /3 (Demo)",cat power,disamb,disamb,paren then disamb +Stereolab,stereolab,disamb,disamb,no disambiguator passthrough +Cat Power,cat power,disamb,disamb,no disambiguator passthrough +Beach House,beach house,disamb,disamb,no disambiguator passthrough +M /1,m,disamb,disamb,single-letter stem +M.I.A. /1,m.i.a.,disamb,disamb,disamb strips trailing /N; dotted abbreviation survives because disamb does not punct-collapse +"Stereolab /1, The",stereolab,disamb,disamb,trailing comma form before disamb digits — wait this has /1 first +Various Artists /1,various artists,disamb,disamb, +Various Artists /99,various artists,disamb,disamb, +Compilation /1,compilation,disamb,disamb, +Compilation /5,compilation,disamb,disamb, +Various /999,various,disamb,disamb,three-digit +Bach /1,bach,disamb,disamb, + +# ===== disamb-title-negative (25) — title variant must NOT strip /N ===== +Track 1/12,track 1/12,title,disamb-title-negative, +Side A/B,side a/b,title,disamb-title-negative, +Track 2/12,track 2/12,title,disamb-title-negative, +Track 11/12,track 11/12,title,disamb-title-negative, +Section 4/5,section 4/5,title,disamb-title-negative, +1/4,1/4,title,disamb-title-negative, +Wedding Song /1,wedding song /1,title,disamb-title-negative,title variant preserves /N +Sugar Hill /2,sugar hill /2,title,disamb-title-negative, +La Paradoja /3,la paradoja /3,title,disamb-title-negative, +"Back, Baby /1","back, baby /1",title,disamb-title-negative, +Call Your Name /4,call your name /4,title,disamb-title-negative, +In a Sentimental Mood /1,in a sentimental mood /1,title,disamb-title-negative, +Suite No. 1/2,suite no. 
1/2,title,disamb-title-negative,no whitespace +Side 1/2,side 1/2,title,disamb-title-negative, +Movement 4/4,movement 4/4,title,disamb-title-negative, +Variation 1/30,variation 1/30,title,disamb-title-negative, +"Symphony No. 9, Mvmt. 4/4","symphony no. 9, mvmt. 4/4",title,disamb-title-negative, +Bagatelle 11/11,bagatelle 11/11,title,disamb-title-negative, +Etude 24/24,etude 24/24,title,disamb-title-negative, +Prelude 1/24,prelude 1/24,title,disamb-title-negative, +Concerto 5/6,concerto 5/6,title,disamb-title-negative, +Mass 1/4,mass 1/4,title,disamb-title-negative, +Volume 1/3,volume 1/3,title,disamb-title-negative, +Disc 1/2,disc 1/2,title,disamb-title-negative, +CD 1/2,cd 1/2,title,disamb-title-negative, diff --git a/tests/wxyc_identity_match_parity_test.rs b/tests/wxyc_identity_match_parity_test.rs new file mode 100644 index 0000000..3533daf --- /dev/null +++ b/tests/wxyc_identity_match_parity_test.rs @@ -0,0 +1,296 @@ +//! Parity check for the four `wxyc_identity_match_*` plpgsql functions +//! deployed by `migrations/0003_wxyc_identity_match_functions.sql`. +//! +//! Three independent assertions: +//! +//! 1. **Pin freshness** — `vendor/wxyc-etl/wxyc_unaccent.rules`, +//! `vendor/wxyc-etl/wxyc_identity_match_functions.sql`, and +//! `tests/fixtures/identity_normalization_cases.csv` hash to the SHA-256 +//! values recorded in `wxyc-etl-pin.txt`. If any vendored file drifts +//! from the pin, fail with a re-vendoring hint. +//! 2. **Migration freshness** — `migrations/0003_wxyc_identity_match_functions.sql` +//! ends with the canonical SQL byte-for-byte. The prefix is the +//! wrapper (CREATE EXTENSION + CREATE DICTIONARY) that sqlx-cli needs +//! because it can't `\i` external files. +//! 3. **Postgres byte-equality** (`#[ignore]`-gated, runs in CI's +//! test-postgres job) — each of the 252 fixture rows is fed through +//! the corresponding plpgsql function on the live PG service; the +//! result must match the fixture's `expected` column. Implicit +//! 
+//!    Rust↔PG parity: the fixture IS the Rust-validated reference.
+
+use std::collections::HashMap;
+use std::fs;
+use std::path::PathBuf;
+
+use sha2::{Digest, Sha256};
+
+/// Crate root directory, as reported by Cargo at compile time.
+fn repo_root() -> PathBuf {
+    PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+}
+
+/// Reads a repo-relative file into memory.
+///
+/// Panics with the resolved path and the I/O error when the read fails.
+fn read(path: &str) -> Vec<u8> {
+    let mut p = repo_root();
+    p.push(path);
+    fs::read(&p).unwrap_or_else(|e| panic!("read {}: {e}", p.display()))
+}
+
+/// Lower-case hex SHA-256 digest of `bytes`.
+fn sha256_hex(bytes: &[u8]) -> String {
+    let mut h = Sha256::new();
+    h.update(bytes);
+    format!("{:x}", h.finalize())
+}
+
+/// Parses `wxyc-etl-pin.txt` into a `key -> value` map.
+///
+/// Blank lines and `#` comments are skipped; keys and values are trimmed.
+/// Lines without `=` are ignored.
+fn pin_map() -> HashMap<String, String> {
+    let bytes = read("wxyc-etl-pin.txt");
+    let text = String::from_utf8(bytes).expect("pin file is UTF-8");
+    let mut m = HashMap::new();
+    for line in text.lines() {
+        let line = line.trim();
+        if line.is_empty() || line.starts_with('#') {
+            continue;
+        }
+        if let Some((k, v)) = line.split_once('=') {
+            m.insert(k.trim().to_string(), v.trim().to_string());
+        }
+    }
+    m
+}
+
+#[test]
+fn pin_file_sha256s_match_vendored_files() {
+    let pin = pin_map();
+    let pairs = [
+        (
+            "vendor/wxyc-etl/wxyc_unaccent.rules",
+            "unaccent_rules_sha256",
+        ),
+        (
+            "vendor/wxyc-etl/wxyc_identity_match_functions.sql",
+            "functions_sql_sha256",
+        ),
+        (
+            "tests/fixtures/identity_normalization_cases.csv",
+            "fixture_csv_sha256",
+        ),
+    ];
+    for (path, key) in pairs {
+        let actual = sha256_hex(&read(path));
+        let expected = pin
+            .get(key)
+            .unwrap_or_else(|| panic!("missing pin entry {key:?}"));
+        assert_eq!(
+            &actual,
+            expected,
+            "{path} drifted from pin {key} — re-vendor from wxyc-etl@v{} and bump wxyc-etl-pin.txt",
+            pin.get("wxyc_etl_version")
+                .map(String::as_str)
+                .unwrap_or("?")
+        );
+    }
+}
+
+/// The pin file records `unaccent_rules_version` next to the SHA-256 values;
+/// this checks that it agrees with the vendored
+/// `vendor/wxyc-etl/wxyc_unaccent.version` file.
+#[test]
+fn pin_file_version_matches_vendored_version_file() {
+    let pin = pin_map();
+    let pinned = pin
+        .get("unaccent_rules_version")
+        .unwrap_or_else(|| panic!("missing pin entry {:?}", "unaccent_rules_version"));
+    let vendored = String::from_utf8(read("vendor/wxyc-etl/wxyc_unaccent.version"))
+        .expect("version file is UTF-8");
+    assert_eq!(
+        vendored.trim(),
+        pinned.as_str(),
+        "vendor/wxyc-etl/wxyc_unaccent.version disagrees with pin unaccent_rules_version — re-vendor and bump wxyc-etl-pin.txt"
+    );
+}
+
+/// Sentinel emitted by the wrapper prelude just before the canonical body.
+/// Anchoring the split here (rather than the first line of the canonical)
+/// prevents a future wrapper edit that happens to paste that first line from
+/// silently moving the split point.
+const CANONICAL_SENTINEL: &str = "-- @BEGIN CANONICAL BODY (do not edit; vendored from wxyc-etl)\n";
+
+#[test]
+fn migration_inlines_canonical_sql_byte_for_byte() {
+    let migration = read("migrations/0003_wxyc_identity_match_functions.sql");
+    let canonical = read("vendor/wxyc-etl/wxyc_identity_match_functions.sql");
+    let migration = String::from_utf8(migration).expect("migration is UTF-8");
+    let canonical = String::from_utf8(canonical).expect("canonical SQL is UTF-8");
+    let sentinel_idx = migration.find(CANONICAL_SENTINEL).unwrap_or_else(|| {
+        panic!(
+            "migration is missing the `{}` sentinel that marks the start of the vendored canonical body — re-generate the migration by appending the sentinel + vendor/wxyc-etl/wxyc_identity_match_functions.sql to the wrapper prelude",
+            CANONICAL_SENTINEL.trim_end()
+        )
+    });
+    // Everything after the sentinel line must be the vendored file, verbatim.
+    let body_start = sentinel_idx + CANONICAL_SENTINEL.len();
+    assert_eq!(
+        &migration[body_start..],
+        canonical,
+        "migration body after the @BEGIN CANONICAL BODY sentinel diverges from vendor/wxyc-etl/wxyc_identity_match_functions.sql — re-vendor and regenerate"
+    );
+}
+
+// -- live-PG fixture parity -------------------------------------------------
+
+use postgres::Client;
+
+/// One data row of `tests/fixtures/identity_normalization_cases.csv`.
+#[derive(Debug)]
+struct Row {
+    /// 1-based line number in the fixture file (for failure messages).
+    line_no: usize,
+    input: String,
+    expected: String,
+    /// Selects the PG function under test; see `pg_function`.
+    variant: String,
+    category: String,
+}
+
+/// Loads every data row of the fixture CSV.
+///
+/// Blank lines and `#` lines are skipped, as is a first line starting with
+/// `input,`. Each remaining line must split into exactly five fields; the
+/// fifth field is not stored in `Row`.
+fn fixture_rows() -> Vec<Row> {
+    let bytes = read("tests/fixtures/identity_normalization_cases.csv");
+    let text = String::from_utf8(bytes).expect("fixture UTF-8");
+    let mut rows = Vec::new();
+    for (i, line) in text.lines().enumerate() {
+        let line_no = i + 1;
+        let trimmed = line.trim();
+        if trimmed.is_empty() || trimmed.starts_with('#') {
+            continue;
+        }
+        if line_no == 1 && trimmed.starts_with("input,") {
+            continue;
+        }
+        let fields = parse_csv_line(line);
+        assert_eq!(fields.len(), 5, "line {line_no} fields={}", fields.len());
+        rows.push(Row {
+            line_no,
+            input: fields[0].clone(),
+            expected: fields[1].clone(),
+            variant: fields[2].clone(),
+            category: fields[3].clone(),
+        });
+    }
+    rows
+}
+
+/// Splits a single CSV line into fields.
+///
+/// Supports double-quoted fields (commas inside quotes are literal) and `""`
+/// as an escaped quote inside a quoted field. Works on one line only, so
+/// quoted fields containing newlines are not supported.
+fn parse_csv_line(line: &str) -> Vec<String> {
+    let mut out = Vec::new();
+    let mut field = String::new();
+    let mut in_quotes = false;
+    let mut iter = line.chars().peekable();
+    while let Some(c) = iter.next() {
+        match (c, in_quotes) {
+            ('"', true) => {
+                if iter.peek() == Some(&'"') {
+                    // `""` inside a quoted field is one literal quote.
+                    field.push('"');
+                    iter.next();
+                } else {
+                    in_quotes = false;
+                }
+            }
+            ('"', false) => in_quotes = true,
+            (',', false) => out.push(std::mem::take(&mut field)),
+            (other, _) => field.push(other),
+        }
+    }
+    out.push(field);
+    out
+}
+
+/// Maps a fixture `variant` value to the PG function that implements it.
+///
+/// Panics on an unknown variant.
+fn pg_function(variant: &str) -> &'static str {
+    match variant {
+        "base" => "wxyc_identity_match_artist",
+        "title" => "wxyc_identity_match_title",
+        "punct" => "wxyc_identity_match_with_punctuation",
+        "disamb" => "wxyc_identity_match_with_disambiguator_strip",
+        other => panic!("unknown variant {other:?}"),
+    }
+}
+
+/// Executes the whole 0003 migration file on `client` as one batch.
+fn apply_migration(client: &mut Client) {
+    let migration = read("migrations/0003_wxyc_identity_match_functions.sql");
+    let sql = String::from_utf8(migration).expect("migration UTF-8");
+    client
+        .batch_execute(&sql)
+        .expect("apply 0003_wxyc_identity_match_functions migration");
+}
+
+#[test]
+#[ignore]
+fn postgres_functions_match_fixture_row_for_row() {
+    let Ok(db_url) = std::env::var("TEST_DATABASE_URL") else {
+        eprintln!("TEST_DATABASE_URL unset — skipping");
+        return;
+    };
+    let mut client = Client::connect(&db_url, postgres::NoTls).expect("connect to test PG");
+    apply_migration(&mut client);
+
+    let rows = fixture_rows();
+    assert!(rows.len() >= 250, "fixture row count {} < 250", rows.len());
+
+    // Collect every mismatch so a single run reports all diverging rows.
+    let mut failures: Vec<String> = Vec::new();
+    for row in &rows {
+        let fn_name = pg_function(&row.variant);
+        let pg_row = client
+            .query_one(&format!("SELECT {fn_name}($1)"), &[&row.input])
+            .unwrap_or_else(|e| {
+                panic!(
+                    "PG query failed line {} ({fn_name}, input={:?}): {e:?}",
+                    row.line_no, row.input
+                )
+            });
+        let pg_out: Option<String> = pg_row.get(0);
+        // A NULL result for a non-NULL fixture input is itself a divergence;
+        // do not let it pass as "" on rows whose expected value is empty.
+        if pg_out.as_deref() != Some(row.expected.as_str()) {
+            failures.push(format!(
+                " line {} [{}/{}] input={:?}\n expected={:?}\n pg={:?}",
+                row.line_no, row.variant, row.category, row.input, row.expected, pg_out
+            ));
+        }
+    }
+    if !failures.is_empty() {
+        panic!(
+            "{} of {} parity rows failed:\n{}",
+            failures.len(),
+            rows.len(),
+            failures.join("\n")
+        );
+    }
+}
+
+#[test]
+#[ignore]
+fn migration_double_apply_is_a_no_op() {
+    // Re-applying the whole migration must not throw and must leave the
+    // functions in the same state. CREATE OR REPLACE FUNCTION + DROP/CREATE
+    // TEXT SEARCH DICTIONARY are individually idempotent, but proving they
+    // compose cleanly when the migration is replayed end-to-end pins the
+    // contract this template makes to every consumer.
+    let Ok(db_url) = std::env::var("TEST_DATABASE_URL") else {
+        eprintln!("TEST_DATABASE_URL unset — skipping");
+        return;
+    };
+    let mut client = Client::connect(&db_url, postgres::NoTls).expect("connect to test PG");
+    apply_migration(&mut client);
+    apply_migration(&mut client);
+    let row = client
+        .query_one("SELECT wxyc_identity_match_artist('Stereolab')", &[])
+        .expect("query after double-apply");
+    let got: Option<String> = row.get(0);
+    assert_eq!(got.as_deref(), Some("stereolab"));
+}
+
+#[test]
+#[ignore]
+fn postgres_functions_idempotent() {
+    let Ok(db_url) = std::env::var("TEST_DATABASE_URL") else {
+        eprintln!("TEST_DATABASE_URL unset — skipping");
+        return;
+    };
+    let mut client = Client::connect(&db_url, postgres::NoTls).expect("connect to test PG");
+    apply_migration(&mut client);
+
+    // f(f(x)) must equal f(x) for each of the four public entry points.
+    let probe = " The Foo Fighters (1995) ";
+    for fn_name in [
+        "wxyc_identity_match_artist",
+        "wxyc_identity_match_title",
+        "wxyc_identity_match_with_punctuation",
+        "wxyc_identity_match_with_disambiguator_strip",
+    ] {
+        let once_row = client
+            .query_one(&format!("SELECT {fn_name}($1)"), &[&probe])
+            .unwrap_or_else(|e| panic!("{fn_name} call 1 failed: {e:?}"));
+        let once: Option<String> = once_row.get(0);
+        let once = once.expect("non-null");
+        let twice_row = client
+            .query_one(&format!("SELECT {fn_name}($1)"), &[&once])
+            .unwrap_or_else(|e| panic!("{fn_name} call 2 failed: {e:?}"));
+        let twice: Option<String> = twice_row.get(0);
+        assert_eq!(
+            twice.as_deref(),
+            Some(once.as_str()),
+            "{fn_name} not idempotent: once={once:?} twice={twice:?}"
+        );
+    }
+}
diff --git a/vendor/wxyc-etl/wxyc_identity_match_functions.sql b/vendor/wxyc-etl/wxyc_identity_match_functions.sql
new file mode 100644
index 0000000..9774468
--- /dev/null
+++ b/vendor/wxyc-etl/wxyc_identity_match_functions.sql
@@ -0,0 +1,302 @@
+-- Canonical SQL implementation of the cross-cache-identity match form.
+--
+-- Vendored verbatim into every cache repo (discogs-etl, musicbrainz-cache,
+-- wikidata-cache) and Backend-Service. The four function bodies must produce
+-- byte-identical output to the corresponding Rust entry points in
+-- `wxyc_etl::text::identity`:
+--
+-- wxyc_identity_match_artist <-> to_identity_match_form
+-- wxyc_identity_match_title <-> to_identity_match_form_title
+-- wxyc_identity_match_with_punctuation <-> to_identity_match_form_with_punctuation
+-- wxyc_identity_match_with_disambiguator_strip
+-- <-> to_identity_match_form_with_disambiguator_strip
+--
+-- Parity is asserted by `wxyc-etl/tests/postgres_parity_test.rs` against the
+-- 252-row fixture in `wxyc-etl/tests/fixtures/identity_normalization_cases.csv`.
+--
+-- Required Postgres version: 16+ (Unicode property classes, `normalize()`,
+-- stable regex behavior). Required extension: `unaccent` configured with the
+-- `wxyc_unaccent` text-search dictionary installed from
+-- `data/wxyc_unaccent.rules`.
+--
+-- Vendoring contract: each consumer carries `wxyc-etl-pin.txt` recording the
+-- SHA-256 of `data/wxyc_unaccent.rules` and the version header read from the
+-- file's first comment line. Mismatch fails CI. See
+-- `wxyc-etl/docs/postgres-analog-vendoring.md`.
+ +DO $$ +BEGIN + IF current_setting('server_version_num')::int < 160000 THEN + RAISE EXCEPTION 'wxyc identity-match functions require Postgres 16+; got %', + current_setting('server_version'); + END IF; +END $$; + +-- The wxyc_unaccent dictionary must be created before this file loads. +-- Consumer migrations do: +-- CREATE EXTENSION IF NOT EXISTS unaccent; +-- CREATE TEXT SEARCH DICTIONARY wxyc_unaccent ( +-- TEMPLATE = unaccent, RULES = 'wxyc_unaccent' +-- ); +-- followed by the rules-file SHA verification block (see vendoring docs). + +-- --------------------------------------------------------------------------- +-- Base match-form pipeline. +-- +-- Mirror of `wxyc_etl::text::to_match_form` after the storage-form pass +-- (no mojibake repair — callers responsible for storing pre-cleaned bytes). +-- Pipeline: +-- normalize NFKC -> lower -> wxyc_unaccent dictionary -> strip-Cf-except-ZWJ +-- -> collapse-ASCII-space + trim. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_match_form(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + r text; + zwj text := chr(8205); -- U+200D + cf_pattern text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + r := normalize(s, NFKC); + r := lower(r); + r := unaccent('wxyc_unaccent', r); + -- Strip Cf (format) characters except U+200D ZWJ (emoji integrity), matching + -- `strip_cf_except_zwj` in the Rust pipeline. Postgres regex has no + -- `\p{Cf}` and no char-class subtraction; build the class from explicit + -- BMP Cf codepoints split around ZWJ. Supplementary-plane Cf (U+E0001 etc.) + -- is rare in music-catalog data and intentionally not handled here. 
+ cf_pattern := + '[' + || chr(173) -- U+00AD soft hyphen + || chr(1564) -- U+061C ALM + || chr(1757) -- U+06DD ARABIC END OF AYAH + || chr(1807) -- U+070F SYRIAC ABBREV MARK + || chr(2274) -- U+08E2 ARABIC DISPUTED END OF AYAH + || chr(6158) -- U+180E MONG VOWEL SEP + || chr(8203) || '-' || chr(8204) -- U+200B-U+200C (200D ZWJ skipped) + || chr(8206) || '-' || chr(8207) -- U+200E-U+200F + || chr(8234) || '-' || chr(8238) -- U+202A-U+202E + || chr(8288) || '-' || chr(8303) -- U+2060-U+206F + || chr(65279) -- U+FEFF BOM + || chr(65529) || '-' || chr(65531) -- U+FFF9-U+FFFB + || ']'; + -- ZWJ is excluded from the class above, so no placeholder swap needed. + r := regexp_replace(r, cf_pattern, '', 'g'); + -- Collapse runs of ASCII space + trim. Other whitespace (TAB etc.) preserved. + r := regexp_replace(r, ' +', ' ', 'g'); + r := regexp_replace(r, '^ | $', '', 'g'); + RETURN r; +END +$$; + +-- --------------------------------------------------------------------------- +-- Helper: strip a single trailing (...) or [...] group. +-- +-- Mirror of `strip_trailing_parens` in `wxyc_etl::text::identity`. Returns +-- input unchanged when: no trailing close-bracket, brackets unbalanced, +-- or the matching open is at position 0 (would reduce stem to empty). +-- One pass only. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_strip_trailing_parens(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + trimmed text; + open_chr char; + close_chr char; + ch char; + depth int := 0; + open_idx int := -1; + i int; + stem text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + trimmed := regexp_replace(s, ' +$', ''); + IF length(trimmed) = 0 THEN RETURN s; END IF; + ch := right(trimmed, 1); + IF ch = ')' THEN + open_chr := '('; close_chr := ')'; + ELSIF ch = ']' THEN + open_chr := '['; close_chr := ']'; + ELSE + RETURN s; + END IF; + -- Scan right-to-left for the matching open. 
+ FOR i IN REVERSE length(trimmed)..1 LOOP + ch := substr(trimmed, i, 1); + IF ch = close_chr THEN + depth := depth + 1; + ELSIF ch = open_chr THEN + depth := depth - 1; + IF depth = 0 THEN + open_idx := i; + EXIT; + END IF; + END IF; + END LOOP; + IF open_idx < 0 OR open_idx = 1 THEN + -- Unbalanced or full-string brackets — preserve. + RETURN s; + END IF; + stem := substr(trimmed, 1, open_idx - 1); + stem := regexp_replace(stem, ' +$', ''); + RETURN stem; +END +$$; + +-- --------------------------------------------------------------------------- +-- Helper: drop a leading article or trailing comma-form article. +-- +-- Mirror of `drop_articles` in `wxyc_etl::text::identity`. At most one +-- match is consumed. The leading form requires the article followed by +-- ASCII space (`the `, `a `, `an `); `theater` does not match. The comma +-- form requires `, the` / `, a` / `, an` at end-of-string with a +-- non-empty stem; `Beatles, the Best Of` does not match. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_drop_articles(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + art text; + stripped text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + FOREACH art IN ARRAY ARRAY['the ', 'a ', 'an '] LOOP + IF starts_with(s, art) THEN + RETURN substr(s, length(art) + 1); + END IF; + END LOOP; + FOREACH art IN ARRAY ARRAY[', the', ', a', ', an'] LOOP + -- Suffix check via `right()` rather than `LIKE '%' || art` so a future + -- article containing `%` or `_` doesn't trigger wildcard semantics. + IF length(s) >= length(art) AND right(s, length(art)) = art THEN + stripped := substr(s, 1, length(s) - length(art)); + IF length(stripped) > 0 THEN + RETURN stripped; + END IF; + END IF; + END LOOP; + RETURN s; +END +$$; + +-- --------------------------------------------------------------------------- +-- Helper: identity baseline (steps 4 + 5). 
+-- +-- Mirror of `identity_baseline` in `wxyc_etl::text::identity`. The shared +-- body of artist + title entry points. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_identity_baseline(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + r text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + r := wxyc_match_form(s); + r := wxyc_strip_trailing_parens(r); + r := wxyc_drop_articles(r); + r := regexp_replace(r, ' +', ' ', 'g'); + r := regexp_replace(r, '^ | $', '', 'g'); + RETURN r; +END +$$; + +-- --------------------------------------------------------------------------- +-- Public entry point: artist identity match. +-- Mirror of `wxyc_etl::text::to_identity_match_form`. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_identity_match_artist(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +BEGIN + RETURN wxyc_identity_baseline(s); +END +$$; + +-- --------------------------------------------------------------------------- +-- Public entry point: title identity match. +-- Mirror of `wxyc_etl::text::to_identity_match_form_title`. Same body as +-- artist today; separate function so callers type-distinguish at the call +-- site and a future step-6 promotion does not silently change titles that +-- would not benefit (`Side A/2` etc.). +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_identity_match_title(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +BEGIN + RETURN wxyc_identity_baseline(s); +END +$$; + +-- --------------------------------------------------------------------------- +-- Public entry point: identity match + opt-in punctuation collapse (step 6). +-- Mirror of `wxyc_etl::text::to_identity_match_form_with_punctuation`. 
+-- Each run of one-or-more non-letter, non-number, non-whitespace codepoints +-- becomes a single ASCII space; result is re-collapsed and re-trimmed. +-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_identity_match_with_punctuation(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + r text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + r := wxyc_match_form(s); + r := wxyc_strip_trailing_parens(r); + r := wxyc_drop_articles(r); + -- Step 6: replace each run of non-{Letter,Number,Whitespace} with one space. + -- Postgres regex doesn't support `\p{L}` directly, but POSIX `[:alpha:]` / + -- `[:digit:]` / `[:space:]` are locale-aware (en_US.UTF-8 collation = + -- full Unicode coverage). + r := regexp_replace(r, '[^[:alpha:][:digit:][:space:]]+', ' ', 'g'); + r := regexp_replace(r, ' +', ' ', 'g'); + r := regexp_replace(r, '^ | $', '', 'g'); + RETURN r; +END +$$; + +-- --------------------------------------------------------------------------- +-- Public entry point: identity match + opt-in `/N` disambiguator strip (step 8). +-- Mirror of `wxyc_etl::text::to_identity_match_form_with_disambiguator_strip`. +-- +-- Artists only. The leading whitespace before `/` is REQUIRED (`John Smith /1` +-- strips; `Track 1/12` does not — matches Rust's `\s+/\d+$` not `\s*`). 
+-- --------------------------------------------------------------------------- +CREATE OR REPLACE FUNCTION wxyc_identity_match_with_disambiguator_strip(s text) + RETURNS text + LANGUAGE plpgsql + IMMUTABLE + PARALLEL SAFE +AS $$ +DECLARE + r text; +BEGIN + IF s IS NULL THEN RETURN NULL; END IF; + r := wxyc_identity_baseline(s); + r := regexp_replace(r, ' +/\d+$', ''); + RETURN r; +END +$$; diff --git a/vendor/wxyc-etl/wxyc_unaccent.rules b/vendor/wxyc-etl/wxyc_unaccent.rules new file mode 100644 index 0000000..a303530 --- /dev/null +++ b/vendor/wxyc-etl/wxyc_unaccent.rules @@ -0,0 +1,433 @@ +ª a +² 2 +³ 3 +µ μ +¹ 1 +º o +¼ 1⁄4 +½ 1⁄2 +¾ 3⁄4 +à a +á a +â a +ã a +ä a +å a +æ ae +ç c +è e +é e +ê e +ë e +ì i +í i +î i +ï i +ñ n +ò o +ó o +ô o +õ o +ö o +ù u +ú u +û u +ü u +ý y +ÿ y +ā a +ă a +ą a +ć c +ĉ c +ċ c +č c +ď d +ē e +ĕ e +ė e +ę e +ě e +ĝ g +ğ g +ġ g +ģ g +ĥ h +ĩ i +ī i +ĭ i +į i +ij ij +ĵ j +ķ k +ĺ l +ļ l +ľ l +ŀ l· +ń n +ņ n +ň n +ʼn ʼn +ō o +ŏ o +ő o +œ oe +ŕ r +ŗ r +ř r +ś s +ŝ s +ş s +š s +ţ t +ť t +ũ u +ū u +ŭ u +ů u +ű u +ų u +ŵ w +ŷ y +ź z +ż z +ž z +ſ s +ơ o +ư u +dž dz +lj lj +nj nj +ǎ a +ǐ i +ǒ o +ǔ u +ǖ u +ǘ u +ǚ u +ǜ u +ǟ a +ǡ a +ǣ ae +ǧ g +ǩ k +ǫ o +ǭ o +ǰ j +dz dz +ǵ g +ǹ n +ǻ a +ǽ ae +ǿ ø +ȁ a +ȃ a +ȅ e +ȇ e +ȉ i +ȋ i +ȍ o +ȏ o +ȑ r +ȓ r +ȕ u +ȗ u +ș s +ț t +ȟ h +ȧ a +ȩ e +ȫ o +ȭ o +ȯ o +ȱ o +ȳ y +ʹ ʹ +; ; +· · +ΐ ι +ά α +έ ε +ή η +ί ι +ΰ υ +ς σ +ϊ ι +ϋ υ +ό ο +ύ υ +ώ ω +ϐ β +ϑ θ +ϒ υ +ϓ υ +ϔ υ +ϕ φ +ϖ π +ϰ κ +ϱ ρ +ϲ σ +ϵ ε +ḁ a +ḃ b +ḅ b +ḇ b +ḉ c +ḋ d +ḍ d +ḏ d +ḑ d +ḓ d +ḕ e +ḗ e +ḙ e +ḛ e +ḝ e +ḟ f +ḡ g +ḣ h +ḥ h +ḧ h +ḩ h +ḫ h +ḭ i +ḯ i +ḱ k +ḳ k +ḵ k +ḷ l +ḹ l +ḻ l +ḽ l +ḿ m +ṁ m +ṃ m +ṅ n +ṇ n +ṉ n +ṋ n +ṍ o +ṏ o +ṑ o +ṓ o +ṕ p +ṗ p +ṙ r +ṛ r +ṝ r +ṟ r +ṡ s +ṣ s +ṥ s +ṧ s +ṩ s +ṫ t +ṭ t +ṯ t +ṱ t +ṳ u +ṵ u +ṷ u +ṹ u +ṻ u +ṽ v +ṿ v +ẁ w +ẃ w +ẅ w +ẇ w +ẉ w +ẋ x +ẍ x +ẏ y +ẑ z +ẓ z +ẕ z +ẖ h +ẗ t +ẘ w +ẙ y +ẚ aʾ +ẛ s +ạ a +ả a +ấ a +ầ a +ẩ a +ẫ a +ậ a +ắ a +ằ a +ẳ a +ẵ a +ặ a +ẹ e +ẻ e +ẽ e +ế e +ề e +ể e +ễ e +ệ e +ỉ i +ị i +ọ o +ỏ o 
+ố o +ồ o +ổ o +ỗ o +ộ o +ớ o +ờ o +ở o +ỡ o +ợ o +ụ u +ủ u +ứ u +ừ u +ử u +ữ u +ự u +ỳ y +ỵ y +ỷ y +ỹ y +ἀ α +ἁ α +ἂ α +ἃ α +ἄ α +ἅ α +ἆ α +ἇ α +ἐ ε +ἑ ε +ἒ ε +ἓ ε +ἔ ε +ἕ ε +ἠ η +ἡ η +ἢ η +ἣ η +ἤ η +ἥ η +ἦ η +ἧ η +ἰ ι +ἱ ι +ἲ ι +ἳ ι +ἴ ι +ἵ ι +ἶ ι +ἷ ι +ὀ ο +ὁ ο +ὂ ο +ὃ ο +ὄ ο +ὅ ο +ὐ υ +ὑ υ +ὒ υ +ὓ υ +ὔ υ +ὕ υ +ὖ υ +ὗ υ +ὠ ω +ὡ ω +ὢ ω +ὣ ω +ὤ ω +ὥ ω +ὦ ω +ὧ ω +ὰ α +ά α +ὲ ε +έ ε +ὴ η +ή η +ὶ ι +ί ι +ὸ ο +ό ο +ὺ υ +ύ υ +ὼ ω +ώ ω +ᾀ α +ᾁ α +ᾂ α +ᾃ α +ᾄ α +ᾅ α +ᾆ α +ᾇ α +ᾐ η +ᾑ η +ᾒ η +ᾓ η +ᾔ η +ᾕ η +ᾖ η +ᾗ η +ᾠ ω +ᾡ ω +ᾢ ω +ᾣ ω +ᾤ ω +ᾥ ω +ᾦ ω +ᾧ ω +ᾰ α +ᾱ α +ᾲ α +ᾳ α +ᾴ α +ᾶ α +ᾷ α +ι ι +ῂ η +ῃ η +ῄ η +ῆ η +ῇ η +ῐ ι +ῑ ι +ῒ ι +ΐ ι +ῖ ι +ῗ ι +ῠ υ +ῡ υ +ῢ υ +ΰ υ +ῤ ρ +ῥ ρ +ῦ υ +ῧ υ +` ` +ῲ ω +ῳ ω +ῴ ω +ῶ ω +ῷ ω +ⱼ j +ⱽ v +ꝰ ꝯ +꟱ s +ꟲ c +ꟳ f +ꟴ q +ꟸ ħ +ꟹ oe +ꭜ ꜧ +ꭝ ꬷ +ꭞ ɫ +ꭟ ꭒ +ꭩ ʍ diff --git a/vendor/wxyc-etl/wxyc_unaccent.version b/vendor/wxyc-etl/wxyc_unaccent.version new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/vendor/wxyc-etl/wxyc_unaccent.version @@ -0,0 +1 @@ +0.1.0 diff --git a/wxyc-etl-pin.txt b/wxyc-etl-pin.txt new file mode 100644 index 0000000..f8fc1e9 --- /dev/null +++ b/wxyc-etl-pin.txt @@ -0,0 +1,19 @@ +# wxyc-etl postgres-analog vendor pin. +# +# These SHAs pin the canonical bytes from WXYC/wxyc-etl@v0.4.0 (`data/`), +# vendored here under `vendor/wxyc-etl/`. +# The migration at `migrations/0003_wxyc_identity_match_functions.sql` is +# a generated wrapper: setup prelude + the canonical SQL inlined for sqlx-cli +# (which can't `\i` external files). A test in `tests/wxyc_identity_match_parity_test.rs` +# asserts that the migration body matches `vendor/wxyc-etl/wxyc_identity_match_functions.sql` +# byte-for-byte after the wrapper prefix is stripped. +# +# Refresh procedure: re-vendor both files from a new wxyc-etl tag, bump the +# SHAs + version below, regenerate the migration, run the parity test. +# See WXYC/wxyc-etl/docs/postgres-analog-vendoring.md. 
+ +unaccent_rules_version = 0.1.0 +unaccent_rules_sha256 = fc51eceb722904fa0d80734d4b2f4bbcffb0fcfecc133450f774210a067b9d26 +functions_sql_sha256 = 4beb7db2b44d479a4878a386e4d1b626d26fd3738e34371b23abe1ab4ccdc21a +fixture_csv_sha256 = dc07253b12dab04bc008a23464d7cb5904953649fca49ae08c0104c89c99771a +wxyc_etl_version = 0.4.0