WXYC · jakebromberg · May 20, 2026 · May 20, 2026 · May 20, 2026
diff --git a/.github/workflows/charset-corpus-drift.yml b/.github/workflows/charset-corpus-drift.yml
@@ -12,11 +12,18 @@ on:
   schedule:
     - cron: '17 8 * * 1'
 
+# The reusable workflow runs `npm pack @wxyc/shared` against npm.pkg.github.com,
+# so the GITHUB_TOKEN we pass in as `npm-token` needs packages:read. Every job
+# in this workflow is limited to these read-only scopes; none gets write access.
+permissions:
+  contents: read
+  packages: read
+
 jobs:
   drift:
-    uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@main
+    uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@gha/v1
     with:
-      pinned-sha256: 75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a
-      package-version: '0.10.0'
+      pinned-sha256: 41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66
+      package-version: '1.6.0'
     secrets:
       npm-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/tests/charset_torture.rs b/tests/charset_torture.rs
@@ -35,7 +35,13 @@ fn load_corpus() -> Corpus {
 
 /// Inputs whose CSV→COPY round-trip cannot succeed today.
 fn expected_failures() -> HashMap<&'static str, &'static str> {
-    HashMap::new()
+    let mut m = HashMap::new();
+    // U+0000 is invalid in PG TEXT (SQL standard).
+    m.insert(
+        "null\x00byte",
+        "[wjf:pg-null-byte] PostgreSQL TEXT rejects U+0000 (SQL standard)",
+    );
+    m
 }
 
 const TEST_DB_URL: &str =
@@ -89,12 +95,15 @@ fn corpus_csv_pg_roundtrip() {
 
     let tmp = tempfile::tempdir().expect("tempdir");
 
-    // Build entity.csv with one row per corpus entry. NUL (U+0000) is
-    // carried through the CSV layer; `escape_copy_text` strips it before
-    // the bytes reach PostgreSQL, per WX-3.B (WXYC/docs#18).
+    // Build entity.csv with one row per corpus entry. Skip entries containing
+    // NUL: the CSV layer carries U+0000 without complaint, but PostgreSQL COPY
+    // rejects it on load, so those rows are filtered here for tidier reporting.
     let mut entity_csv = String::from("qid,label,description,entity_type\n");
     let mut written: Vec<(usize, &str)> = Vec::new();
     for (id, _, input, _) in &entries {
+        if input.contains('\0') {
+            continue;
+        }
         let qid = format!("Q{}", id);
         entity_csv.push_str(&qid);
         entity_csv.push(',');
@@ -136,11 +145,7 @@ fn corpus_csv_pg_roundtrip() {
             .flatten();
         let actual: Option<String> = row.map(|r| r.get(0));
 
-        // WX-3.B: U+0000 is stripped at the PG TEXT write boundary
-        // (WXYC/docs#18), so the expected stored value is the input
-        // with NUL bytes removed.
-        let expected: String = input.replace('\0', "");
-        let passed = actual.as_deref() == Some(expected.as_str());
+        let passed = actual.as_deref() == Some(*input);
         match (passed, known) {
             (true, None) => {}
             (true, Some(_tag)) => {

diff --git a/tests/fixtures/charset-torture.json b/tests/fixtures/charset-torture.json
@@ -4,8 +4,8 @@
     "version": 1,
     "schema": {
       "input": "Raw UTF-8 string the storage layer must round-trip losslessly.",
-      "expected_storage": "Canonical (mojibake-fixed) form. Equals input for non-mojibake categories.",
-      "expected_match_form": "Output of to_match_form(expected_storage). NFKD + strip combining marks + lowercase + Greek sigma fold + trim. Null when WX-2 has not formalized the rule for this script.",
+      "expected_storage": "Output of to_storage_form(input). ftfy-style mojibake repair, then NFC normalization, then ASCII whitespace trim. Equals input for clean NFC input; differs for mojibake (repaired) and NFD (re-NFC'd). Two byte forms of the same visual string canonicalize to one.",
+      "expected_match_form": "Output of to_match_form(expected_storage). NFKC + Unicode default-caseless casefold + strip combining marks + WXYC fold registry (sigma, ligatures) + strip Cf format chars except U+200D ZWJ + collapse internal whitespace + trim. Null when WX-2 has not formalized the rule for this script.",
       "expected_ascii_form": "Output of to_ascii_form(expected_storage). Lowercase ASCII transliteration. Null for scripts without a defined transliteration in v1.",
       "notes": "Why this entry exists; which encoding incident or hazard category it pins."
     },
@@ -279,23 +279,23 @@
       {
         "input": "Hello\u200EWorld",
         "expected_storage": "Hello\u200EWorld",
-        "expected_match_form": null,
+        "expected_match_form": "helloworld",
         "expected_ascii_form": "helloworld",
-        "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — it is NOT stripped by NFKD + is_mark filter. Match-form behavior is undefined in v1 (WX-2 must decide whether to strip Cf characters)."
+        "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — survives NFKD + is_mark filter. Match-form strips it (WX-2 charter: strip all Cf except U+200D ZWJ); storage preserves it for round-trip fidelity."
       },
       {
         "input": "Hello\u200FWorld",
         "expected_storage": "Hello\u200FWorld",
-        "expected_match_form": null,
+        "expected_match_form": "helloworld",
         "expected_ascii_form": "helloworld",
-        "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM. Invisible character that survives most pipelines and corrupts visual rendering later."
+        "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM; same charter resolution — match-form strips, storage preserves."
       },
       {
         "input": "\u202EReversed\u202C",
         "expected_storage": "\u202EReversed\u202C",
-        "expected_match_form": null,
+        "expected_match_form": "reversed",
         "expected_ascii_form": "reversed",
-        "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector; must round-trip but should ideally be flagged on storage (out of scope for WX-1)."
+        "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector — both are Cf, both stripped from match-form; storage preserves the bytes (write-boundary defense is WX-3 scope, not WX-1 corpus scope)."
       }
     ],
     "zwj": [
@@ -327,28 +327,28 @@
         "expected_storage": "café",
         "expected_match_form": "cafe",
         "expected_ascii_form": "cafe",
-        "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Storage MUST preserve as-is (no silent re-normalization). Match-form folds to bare ASCII via NFKD + strip combining."
+        "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Already canonical; storage form is a no-op. Match-form folds to bare ASCII via NFKC + casefold + strip combining."
       },
       {
         "input": "cafe\u0301",
-        "expected_storage": "cafe\u0301",
+        "expected_storage": "café",
         "expected_match_form": "cafe",
         "expected_ascii_form": "cafe",
-        "notes": "NFD form of café: U+0063 U+0061 U+0066 U+0065 U+0301. Visually identical to the NFC entry above; bytes differ. Both must round-trip; both must produce the same match form."
+        "notes": "NFD form of café (U+0063 U+0061 U+0066 U+0065 U+0301) — three codepoints, four bytes after NFC. Storage form NFC-normalizes to the precomposed form so two byte-distinct submissions of the same visual string canonicalize to one row at the catalog. Match form collides with the NFC entry above."
       },
       {
         "input": "ñ",
         "expected_storage": "ñ",
         "expected_match_form": "n",
         "expected_ascii_form": "n",
-        "notes": "NFC form: U+00F1. Single precomposed Spanish ñ."
+        "notes": "NFC form: U+00F1. Single precomposed Spanish ñ; storage form is a no-op."
       },
       {
         "input": "n\u0303",
-        "expected_storage": "n\u0303",
+        "expected_storage": "ñ",
         "expected_match_form": "n",
         "expected_ascii_form": "n",
-        "notes": "NFD form of ñ: U+006E + U+0303. Bytes differ from NFC entry; match form must collide."
+        "notes": "NFD form of ñ (U+006E + U+0303). Storage form NFC-normalizes to the precomposed U+00F1 — same canonicalization rule as café-NFD above. Match form collides with both NFC and NFD ñ entries."
       }
     ],
     "mojibake_known": [

diff --git a/tests/fixtures/charset-torture.json.sha256 b/tests/fixtures/charset-torture.json.sha256
@@ -1 +1 @@
-75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a  charset-torture.json
+41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66  charset-torture.json
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a charset-torture.json
		41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 charset-torture.json