From 735669b4b378adf2faddce68cd890dec16b3d0b8 Mon Sep 17 00:00:00 2001 From: Jake Bromberg Date: Tue, 5 May 2026 12:36:06 -0700 Subject: [PATCH] test: bump charset-torture corpus pin to v0.12.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cascade WX-2.2.1 (`to_storage_form`) corpus updates from WXYC/wxyc-shared#100 + WXYC/wxyc-etl#89. Corpus moves from v0.10.0 (sha 75a3395b…) to v0.12.0 (sha 41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66). The new corpus formalizes `expected_match_form` for the bidi_marks category (`Hello‎World` → `helloworld`, `Hello‏World` → `helloworld`, `‮Reversed‬` → `reversed`) and adds NFC `expected_storage` for two NFD `normalization` entries. This repo's `tests/charset_torture.rs` detector is a CSV → PG COPY byte round-trip — it asserts `actual == input`, not `actual == expected_match_form` or `actual == expected_storage`. The bidi_marks bytes round-trip cleanly (LRM/RLM/RLO/PDF are all valid PG TEXT) so no new `[wjf:...]` xfails are needed; only the existing `[wjf:pg-null-byte]` xfail remains. Verified with `cargo test -- --ignored` against the docker-compose PG service (1 passed, 0 failed). Drift-guard CI (`.github/workflows/charset-corpus-drift.yml`) is repointed to the new sha + `'0.12.0'` package version. It is expected to fail until `@wxyc/shared` v0.12.0 is published to GitHub Packages. 
--- .github/workflows/charset-corpus-drift.yml | 4 ++-- tests/fixtures/charset-torture.json | 28 +++++++++++----------- tests/fixtures/charset-torture.json.sha256 | 2 +- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/.github/workflows/charset-corpus-drift.yml b/.github/workflows/charset-corpus-drift.yml index d2f3fdf..21be0e5 100644 --- a/.github/workflows/charset-corpus-drift.yml +++ b/.github/workflows/charset-corpus-drift.yml @@ -16,7 +16,7 @@ jobs: drift: uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@main with: - pinned-sha256: 75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a - package-version: '0.10.0' + pinned-sha256: 41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 + package-version: '0.12.0' secrets: npm-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/tests/fixtures/charset-torture.json b/tests/fixtures/charset-torture.json index 86779c1..98d6fb9 100644 --- a/tests/fixtures/charset-torture.json +++ b/tests/fixtures/charset-torture.json @@ -4,8 +4,8 @@ "version": 1, "schema": { "input": "Raw UTF-8 string the storage layer must round-trip losslessly.", - "expected_storage": "Canonical (mojibake-fixed) form. Equals input for non-mojibake categories.", - "expected_match_form": "Output of to_match_form(expected_storage). NFKD + strip combining marks + lowercase + Greek sigma fold + trim. Null when WX-2 has not formalized the rule for this script.", + "expected_storage": "Output of to_storage_form(input). ftfy-style mojibake repair, then NFC normalization, then ASCII whitespace trim. Equals input for clean NFC input; differs for mojibake (repaired) and NFD (re-NFC'd). Two byte forms of the same visual string canonicalize to one.", + "expected_match_form": "Output of to_match_form(expected_storage). NFKC + Unicode default-caseless casefold + strip combining marks + WXYC fold registry (sigma, ligatures) + strip Cf format chars except U+200D ZWJ + collapse internal whitespace + trim. 
Null when WX-2 has not formalized the rule for this script.", "expected_ascii_form": "Output of to_ascii_form(expected_storage). Lowercase ASCII transliteration. Null for scripts without a defined transliteration in v1.", "notes": "Why this entry exists; which encoding incident or hazard category it pins." }, @@ -279,23 +279,23 @@ { "input": "Hello\u200EWorld", "expected_storage": "Hello\u200EWorld", - "expected_match_form": null, + "expected_match_form": "helloworld", "expected_ascii_form": "helloworld", - "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — it is NOT stripped by NFKD + is_mark filter. Match-form behavior is undefined in v1 (WX-2 must decide whether to strip Cf characters)." + "notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — survives NFKD + is_mark filter. Match-form strips it (WX-2 charter: strip all Cf except U+200D ZWJ); storage preserves it for round-trip fidelity." }, { "input": "Hello\u200FWorld", "expected_storage": "Hello\u200FWorld", - "expected_match_form": null, + "expected_match_form": "helloworld", "expected_ascii_form": "helloworld", - "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM. Invisible character that survives most pipelines and corrupts visual rendering later." + "notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM; same charter resolution — match-form strips, storage preserves." }, { "input": "\u202EReversed\u202C", "expected_storage": "\u202EReversed\u202C", - "expected_match_form": null, + "expected_match_form": "reversed", "expected_ascii_form": "reversed", - "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector; must round-trip but should ideally be flagged on storage (out of scope for WX-1)." + "notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. 
Spoofing-attack vector — both are Cf, both stripped from match-form; storage preserves the bytes (write-boundary defense is WX-3 scope, not WX-1 corpus scope)." } ], "zwj": [ @@ -327,28 +327,28 @@ "expected_storage": "café", "expected_match_form": "cafe", "expected_ascii_form": "cafe", - "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Storage MUST preserve as-is (no silent re-normalization). Match-form folds to bare ASCII via NFKD + strip combining." + "notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Already canonical; storage form is a no-op. Match-form folds to bare ASCII via NFKC + casefold + strip combining." }, { "input": "cafe\u0301", - "expected_storage": "cafe\u0301", + "expected_storage": "café", "expected_match_form": "cafe", "expected_ascii_form": "cafe", - "notes": "NFD form of café: U+0063 U+0061 U+0066 U+0065 U+0301. Visually identical to the NFC entry above; bytes differ. Both must round-trip; both must produce the same match form." + "notes": "NFD form of café (U+0063 U+0061 U+0066 U+0065 U+0301) — three codepoints, four bytes after NFC. Storage form NFC-normalizes to the precomposed form so two byte-distinct submissions of the same visual string canonicalize to one row at the catalog. Match form collides with the NFC entry above." }, { "input": "ñ", "expected_storage": "ñ", "expected_match_form": "n", "expected_ascii_form": "n", - "notes": "NFC form: U+00F1. Single precomposed Spanish ñ." + "notes": "NFC form: U+00F1. Single precomposed Spanish ñ; storage form is a no-op." }, { "input": "n\u0303", - "expected_storage": "n\u0303", + "expected_storage": "ñ", "expected_match_form": "n", "expected_ascii_form": "n", - "notes": "NFD form of ñ: U+006E + U+0303. Bytes differ from NFC entry; match form must collide." + "notes": "NFD form of ñ (U+006E + U+0303). Storage form NFC-normalizes to the precomposed U+00F1 — same canonicalization rule as café-NFD above. Match form collides with both NFC and NFD ñ entries." 
} ], "mojibake_known": [ diff --git a/tests/fixtures/charset-torture.json.sha256 b/tests/fixtures/charset-torture.json.sha256 index f3f12b2..59c565f 100644 --- a/tests/fixtures/charset-torture.json.sha256 +++ b/tests/fixtures/charset-torture.json.sha256 @@ -1 +1 @@ -75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a charset-torture.json +41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 charset-torture.json