From aac0e26c78ee8a41ce851972f9fe826a6cdb2c9c Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 19 May 2026 17:28:06 -0700 Subject: [PATCH 1/2] chore(ci): pin charset-corpus-drift to gha/v1 and declare caller permissions Pins the WXYC/wxyc-shared reusable workflow ref from @main to @gha/v1 (the moving major tag with the documented Tag Stability Policy) and declares the workflow-level permissions floor (contents: read + packages: read) that the caller-permissions contract requires. Part of the org-wide hardening tracker WXYC/wiki#68. --- .github/workflows/charset-corpus-drift.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/workflows/charset-corpus-drift.yml b/.github/workflows/charset-corpus-drift.yml index d2f3fdf..dd776ae 100644 --- a/.github/workflows/charset-corpus-drift.yml +++ b/.github/workflows/charset-corpus-drift.yml @@ -12,9 +12,16 @@ on: schedule: - cron: '17 8 * * 1' +# The reusable workflow runs `npm pack @wxyc/shared` against npm.pkg.github.com, +# so the GITHUB_TOKEN we pass in as `npm-token` needs packages:read. Other jobs +# get no token writes by default. +permissions: + contents: read + packages: read + jobs: drift: - uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@main + uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@gha/v1 with: pinned-sha256: 75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a package-version: '0.10.0' From 727a1f25881e415d3e174541d5598938b8bad509 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 19 May 2026 18:38:36 -0700 Subject: [PATCH 2/2] chore(ci): bump charset corpus pin to @wxyc/shared@1.6.0 Bumps the workflow's charset-corpus-drift pin from 0.10.0 to 1.6.0 (corpus sha256 75a3395bb... -> 41a18c5c...), and migrates the per-repo fixture + round-trip test accordingly. 
The 1.6.0 corpus tightens storage semantics versus 0.10.0: - NFD-form inputs (cafe + U+0301, n + U+0303) now have an NFC expected_storage; the catalog canonicalizes two byte-forms of the same visual string into one. - Cf-class chars (LRM, RLM, RLO, PDF) are stripped from match_form per the WX-2 charter (was: undefined / null). - U+0000 is documented as a known PG TEXT failure (SQL standard). The round-trip test (tests/charset_torture.rs) now: - Skips NUL-bearing inputs at the CSV-build stage, with a comment explaining the skip (PG COPY rejects NUL anyway, so filtering early keeps reporting tidy). - Treats U+0000 as a known expected_failures entry. - Compares actual to `*input` rather than to input.replace('\0', "") since NUL inputs are skipped upstream. @wxyc/shared 0.10.0 is no longer fetchable from npm.pkg.github.com, which has kept the scheduled drift check red on main since 2026-05-18. This bump turns it green again. --- .github/workflows/charset-corpus-drift.yml | 4 ++-- tests/charset_torture.rs | 23 +++++++++++------- tests/fixtures/charset-torture.json | 28 +++++++++++----------- tests/fixtures/charset-torture.json.sha256 | 2 +- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/.github/workflows/charset-corpus-drift.yml b/.github/workflows/charset-corpus-drift.yml index dd776ae..d755499 100644 --- a/.github/workflows/charset-corpus-drift.yml +++ b/.github/workflows/charset-corpus-drift.yml @@ -23,7 +23,7 @@ jobs: drift: uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@gha/v1 with: - pinned-sha256: 75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a - package-version: '0.10.0' + pinned-sha256: 41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 + package-version: '1.6.0' secrets: npm-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/tests/charset_torture.rs b/tests/charset_torture.rs index ba11eed..5fe799e 100644 --- a/tests/charset_torture.rs +++ b/tests/charset_torture.rs @@ -35,7 +35,13 @@ fn load_corpus() -> Corpus { /// Inputs whose CSV→COPY 
round-trip cannot succeed today. fn expected_failures() -> HashMap<&'static str, &'static str> { - HashMap::new() + let mut m = HashMap::new(); + // U+0000 is invalid in PG TEXT (SQL standard). + m.insert( + "null\x00byte", + "[wjf:pg-null-byte] PostgreSQL TEXT rejects U+0000 (SQL standard)", + ); + m } const TEST_DB_URL: &str = @@ -89,12 +95,15 @@ fn corpus_csv_pg_roundtrip() { let tmp = tempfile::tempdir().expect("tempdir"); - // Build entity.csv with one row per corpus entry. NUL (U+0000) is - // carried through the CSV layer; `escape_copy_text` strips it before - // the bytes reach PostgreSQL, per WX-3.B (WXYC/docs#18). + // Build entity.csv with one row per corpus entry. Skip entries containing + // NUL (U+0000): the CSV layer itself carries the byte through, but postgres + // COPY rejects it later, so we filter here for tidier reporting. let mut entity_csv = String::from("qid,label,description,entity_type\n"); let mut written: Vec<(usize, &str)> = Vec::new(); for (id, _, input, _) in &entries { + if input.contains('\0') { + continue; + } let qid = format!("Q{}", id); entity_csv.push_str(&qid); entity_csv.push(','); @@ -136,11 +145,7 @@ fn corpus_csv_pg_roundtrip() { .flatten(); let actual: Option<String> = row.map(|r| r.get(0)); - // WX-3.B: U+0000 is stripped at the PG TEXT write boundary - // (WXYC/docs#18), so the expected stored value is the input - // with NUL bytes removed. 
- let expected: String = input.replace('\0', ""); - let passed = actual.as_deref() == Some(expected.as_str()); + let passed = actual.as_deref() == Some(*input); match (passed, known) { (true, None) => {} (true, Some(_tag)) => { diff --git a/tests/fixtures/charset-torture.json b/tests/fixtures/charset-torture.json index 86779c1..98d6fb9 100644 --- a/tests/fixtures/charset-torture.json +++ b/tests/fixtures/charset-torture.json @@ -4,8 +4,8 @@ "version": 1, "schema": { "input": "Raw UTF-8 string the storage layer must round-trip losslessly.", - "expected_storage": "Canonical (mojibake-fixed) form. Equals input for non-mojibake categories.", - "expected_match_form": "Output of to_match_form(expected_storage). NFKD + strip combining marks + lowercase + Greek sigma fold + trim. Null when WX-2 has not formalized the rule for this script.", + "expected_storage": "Output of to_storage_form(input). ftfy-style mojibake repair, then NFC normalization, then ASCII whitespace trim. Equals input for clean NFC input; differs for mojibake (repaired) and NFD (re-NFC'd). Two byte forms of the same visual string canonicalize to one.", + "expected_match_form": "Output of to_match_form(expected_storage). NFKC + Unicode default-caseless casefold + strip combining marks + WXYC fold registry (sigma, ligatures) + strip Cf format chars except U+200D ZWJ + collapse internal whitespace + trim. Null when WX-2 has not formalized the rule for this script.", "expected_ascii_form": "Output of to_ascii_form(expected_storage). Lowercase ASCII transliteration. Null for scripts without a defined transliteration in v1.", "notes": "Why this entry exists; which encoding incident or hazard category it pins." }, @@ -279,23 +279,23 @@ { "input": "Hello\u200EWorld", "expected_storage": "Hello\u200EWorld", - "expected_match_form": null, + "expected_match_form": "helloworld", "expected_ascii_form": "helloworld", - "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. 
LRM is category Cf (Format), NOT M (Mark) — it is NOT stripped by NFKD + is_mark filter. Match-form behavior is undefined in v1 (WX-2 must decide whether to strip Cf characters)." + "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — survives NFKD + is_mark filter. Match-form strips it (WX-2 charter: strip all Cf except U+200D ZWJ); storage preserves it for round-trip fidelity." }, { "input": "Hello\u200FWorld", "expected_storage": "Hello\u200FWorld", - "expected_match_form": null, + "expected_match_form": "helloworld", "expected_ascii_form": "helloworld", - "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM. Invisible character that survives most pipelines and corrupts visual rendering later." + "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM; same charter resolution — match-form strips, storage preserves." }, { "input": "\u202EReversed\u202C", "expected_storage": "\u202EReversed\u202C", - "expected_match_form": null, + "expected_match_form": "reversed", "expected_ascii_form": "reversed", - "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector; must round-trip but should ideally be flagged on storage (out of scope for WX-1)." + "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector — both are Cf, both stripped from match-form; storage preserves the bytes (write-boundary defense is WX-3 scope, not WX-1 corpus scope)." } ], "zwj": [ @@ -327,28 +327,28 @@ "expected_storage": "café", "expected_match_form": "cafe", "expected_ascii_form": "cafe", - "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Storage MUST preserve as-is (no silent re-normalization). Match-form folds to bare ASCII via NFKD + strip combining." + "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Already canonical; storage form is a no-op. Match-form folds to bare ASCII via NFKC + casefold + strip combining." 
}, { "input": "cafe\u0301", - "expected_storage": "cafe\u0301", + "expected_storage": "café", "expected_match_form": "cafe", "expected_ascii_form": "cafe", - "notes": "NFD form of café: U+0063 U+0061 U+0066 U+0065 U+0301. Visually identical to the NFC entry above; bytes differ. Both must round-trip; both must produce the same match form." + "notes": "NFD form of café (U+0063 U+0061 U+0066 U+0065 U+0301) — three codepoints, four bytes after NFC. Storage form NFC-normalizes to the precomposed form so two byte-distinct submissions of the same visual string canonicalize to one row at the catalog. Match form collides with the NFC entry above." }, { "input": "ñ", "expected_storage": "ñ", "expected_match_form": "n", "expected_ascii_form": "n", - "notes": "NFC form: U+00F1. Single precomposed Spanish ñ." + "notes": "NFC form: U+00F1. Single precomposed Spanish ñ; storage form is a no-op." }, { "input": "n\u0303", - "expected_storage": "n\u0303", + "expected_storage": "ñ", "expected_match_form": "n", "expected_ascii_form": "n", - "notes": "NFD form of ñ: U+006E + U+0303. Bytes differ from NFC entry; match form must collide." + "notes": "NFD form of ñ (U+006E + U+0303). Storage form NFC-normalizes to the precomposed U+00F1 — same canonicalization rule as café-NFD above. Match form collides with both NFC and NFD ñ entries." } ], "mojibake_known": [ diff --git a/tests/fixtures/charset-torture.json.sha256 b/tests/fixtures/charset-torture.json.sha256 index f3f12b2..59c565f 100644 --- a/tests/fixtures/charset-torture.json.sha256 +++ b/tests/fixtures/charset-torture.json.sha256 @@ -1 +1 @@ -75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a charset-torture.json +41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 charset-torture.json