Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions .github/workflows/charset-corpus-drift.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,18 @@ on:
schedule:
- cron: '17 8 * * 1'

# The reusable workflow runs `npm pack @wxyc/shared` against npm.pkg.github.com,
# so the GITHUB_TOKEN we pass in as `npm-token` needs packages:read. Other jobs
# get no token writes by default.
permissions:
contents: read
packages: read

jobs:
drift:
uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@main
uses: WXYC/wxyc-shared/.github/workflows/check-charset-corpus-drift.yml@gha/v1
with:
pinned-sha256: 75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a
package-version: '0.10.0'
pinned-sha256: 41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66
package-version: '1.6.0'
secrets:
npm-token: ${{ secrets.GITHUB_TOKEN }}
23 changes: 14 additions & 9 deletions tests/charset_torture.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,13 @@ fn load_corpus() -> Corpus {

/// Inputs whose CSV→COPY round-trip cannot succeed today.
///
/// Returns a map from an input string to a tagged, human-readable reason
/// explaining why that input is a known failure. The keys are presumably
/// corpus `input` values; the lookup site is outside this excerpt, so
/// confirm against the caller.
fn expected_failures() -> HashMap<&'static str, &'static str> {
    let mut m = HashMap::new();
    // U+0000 is invalid in PG TEXT (SQL standard).
    m.insert(
        "null\x00byte",
        "[wjf:pg-null-byte] PostgreSQL TEXT rejects U+0000 (SQL standard)",
    );
    m
}

const TEST_DB_URL: &str =
Expand Down Expand Up @@ -89,12 +95,15 @@ fn corpus_csv_pg_roundtrip() {

let tmp = tempfile::tempdir().expect("tempdir");

// Build entity.csv with one row per corpus entry. NUL (U+0000) is
// carried through the CSV layer; `escape_copy_text` strips it before
// the bytes reach PostgreSQL, per WX-3.B (WXYC/docs#18).
// Build entity.csv with one row per corpus entry. Skip entries containing
// NUL (U+0000): CSV itself does not reject the byte, but postgres COPY
// rejects it later, so we filter here for tidier reporting.
let mut entity_csv = String::from("qid,label,description,entity_type\n");
let mut written: Vec<(usize, &str)> = Vec::new();
for (id, _, input, _) in &entries {
if input.contains('\0') {
continue;
}
let qid = format!("Q{}", id);
entity_csv.push_str(&qid);
entity_csv.push(',');
Expand Down Expand Up @@ -136,11 +145,7 @@ fn corpus_csv_pg_roundtrip() {
.flatten();
let actual: Option<String> = row.map(|r| r.get(0));

// WX-3.B: U+0000 is stripped at the PG TEXT write boundary
// (WXYC/docs#18), so the expected stored value is the input
// with NUL bytes removed.
let expected: String = input.replace('\0', "");
let passed = actual.as_deref() == Some(expected.as_str());
let passed = actual.as_deref() == Some(*input);
match (passed, known) {
(true, None) => {}
(true, Some(_tag)) => {
Expand Down
28 changes: 14 additions & 14 deletions tests/fixtures/charset-torture.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"version": 1,
"schema": {
"input": "Raw UTF-8 string the storage layer must round-trip losslessly.",
"expected_storage": "Canonical (mojibake-fixed) form. Equals input for non-mojibake categories.",
"expected_match_form": "Output of to_match_form(expected_storage). NFKD + strip combining marks + lowercase + Greek sigma fold + trim. Null when WX-2 has not formalized the rule for this script.",
"expected_storage": "Output of to_storage_form(input). ftfy-style mojibake repair, then NFC normalization, then ASCII whitespace trim. Equals input for clean NFC input; differs for mojibake (repaired) and NFD (re-NFC'd). Two byte forms of the same visual string canonicalize to one.",
"expected_match_form": "Output of to_match_form(expected_storage). NFKC + Unicode default-caseless casefold + strip combining marks + WXYC fold registry (sigma, ligatures) + strip Cf format chars except U+200D ZWJ + collapse internal whitespace + trim. Null when WX-2 has not formalized the rule for this script.",
"expected_ascii_form": "Output of to_ascii_form(expected_storage). Lowercase ASCII transliteration. Null for scripts without a defined transliteration in v1.",
"notes": "Why this entry exists; which encoding incident or hazard category it pins."
},
Expand Down Expand Up @@ -279,23 +279,23 @@
{
"input": "Hello\u200EWorld",
"expected_storage": "Hello\u200EWorld",
"expected_match_form": null,
"expected_match_form": "helloworld",
"expected_ascii_form": "helloworld",
"notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — it is NOT stripped by NFKD + is_mark filter. Match-form behavior is undefined in v1 (WX-2 must decide whether to strip Cf characters)."
"notes": "U+200E LEFT-TO-RIGHT MARK embedded in ASCII. LRM is category Cf (Format), NOT M (Mark) — survives NFKD + is_mark filter. Match-form strips it (WX-2 charter: strip all Cf except U+200D ZWJ); storage preserves it for round-trip fidelity."
},
{
"input": "Hello\u200FWorld",
"expected_storage": "Hello\u200FWorld",
"expected_match_form": null,
"expected_match_form": "helloworld",
"expected_ascii_form": "helloworld",
"notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM. Invisible character that survives most pipelines and corrupts visual rendering later."
"notes": "U+200F RIGHT-TO-LEFT MARK. Same Cf-not-stripped hazard as LRM; same charter resolution — match-form strips, storage preserves."
},
{
"input": "\u202EReversed\u202C",
"expected_storage": "\u202EReversed\u202C",
"expected_match_form": null,
"expected_match_form": "reversed",
"expected_ascii_form": "reversed",
"notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector; must round-trip but should ideally be flagged on storage (out of scope for WX-1)."
"notes": "U+202E RIGHT-TO-LEFT OVERRIDE + U+202C POP DIRECTIONAL FORMATTING. Spoofing-attack vector — both are Cf, both stripped from match-form; storage preserves the bytes (write-boundary defense is WX-3 scope, not WX-1 corpus scope)."
}
],
"zwj": [
Expand Down Expand Up @@ -327,28 +327,28 @@
"expected_storage": "café",
"expected_match_form": "cafe",
"expected_ascii_form": "cafe",
"notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Storage MUST preserve as-is (no silent re-normalization). Match-form folds to bare ASCII via NFKD + strip combining."
"notes": "NFC form: U+0063 U+0061 U+0066 U+00E9. Already canonical; storage form is a no-op. Match-form folds to bare ASCII via NFKC + casefold + strip combining."
},
{
"input": "cafe\u0301",
"expected_storage": "cafe\u0301",
"expected_storage": "café",
"expected_match_form": "cafe",
"expected_ascii_form": "cafe",
"notes": "NFD form of café: U+0063 U+0061 U+0066 U+0065 U+0301. Visually identical to the NFC entry above; bytes differ. Both must round-trip; both must produce the same match form."
"notes": "NFD form of café (U+0063 U+0061 U+0066 U+0065 U+0301) — five codepoints, four after NFC. Storage form NFC-normalizes to the precomposed form so two byte-distinct submissions of the same visual string canonicalize to one row at the catalog. Match form collides with the NFC entry above."
},
{
"input": "ñ",
"expected_storage": "ñ",
"expected_match_form": "n",
"expected_ascii_form": "n",
"notes": "NFC form: U+00F1. Single precomposed Spanish ñ."
"notes": "NFC form: U+00F1. Single precomposed Spanish ñ; storage form is a no-op."
},
{
"input": "n\u0303",
"expected_storage": "n\u0303",
"expected_storage": "ñ",
"expected_match_form": "n",
"expected_ascii_form": "n",
"notes": "NFD form of ñ: U+006E + U+0303. Bytes differ from NFC entry; match form must collide."
"notes": "NFD form of ñ (U+006E + U+0303). Storage form NFC-normalizes to the precomposed U+00F1 — same canonicalization rule as café-NFD above. Match form collides with the NFC ñ entry above."
}
],
"mojibake_known": [
Expand Down
2 changes: 1 addition & 1 deletion tests/fixtures/charset-torture.json.sha256
Original file line number Diff line number Diff line change
@@ -1 +1 @@
75a3395bb10894480dba95bf5b7f379f5056645098d6a1bf9e94416709e5214a charset-torture.json
41a18c5c0a92d129ec4b575827b6874196bfb7591e4bdf237a918a5da2de7b66 charset-torture.json
Loading