.bin.gz} in the devtest dir, decodes
+ * them as {@code sourceCharset}, then re-encodes the resulting Unicode
+ * under the synthetic charset (the map key). The resulting bytes are
+ * then treated as if they had been read from a labeled-with-the-key
+ * file.
+ *
+ * Needed because the training corpus only has Windows codepages
+ * ({@code windows-1250.bin.gz}, etc.) but the real-world Polish
+ * {@code ci¶nienia} failure mode requires ISO-8859-2 bytes (where
+ * {@code ś = 0xB6}, not the windows-1250 {@code 0x9C}). Re-encoding
+ * windows-1250 Polish records as ISO-8859-2 gives us labeled
+ * ISO-8859-2 data that, when cross-decoded back as windows-1250,
+ * reproduces the {@code ¶}-splits-word pattern.
+ */
+ private static final Map SYNTHETIC_SOURCES;
+
+ static {
+ // LinkedHashMap: the synthetic loop in main iterates entrySet(), so
+ // entries are visited in the order they are registered here.
+ Map m = new LinkedHashMap<>();
+ // Key = synthetic (target) charset name; value = source file basename
+ // and the charset used to decode that file's records.
+ m.put("ISO-8859-2",
+ new SyntheticSource("windows-1250", "windows-1250"));
+ // Exposed as a read-only view.
+ SYNTHETIC_SOURCES = Collections.unmodifiableMap(m);
+ }
+
+ /**
+ * Describes where a synthetic charset's records come from: the devtest
+ * file to read and the charset name used to decode that file's bytes.
+ */
+ private static final class SyntheticSource {
+ final String sourceFileBasename; // without .bin.gz suffix
+ // Charset name the source records are decoded with before re-encoding.
+ final String sourceCharset;
+
+ SyntheticSource(String sourceFileBasename, String sourceCharset) {
+ this.sourceFileBasename = sourceFileBasename;
+ this.sourceCharset = sourceCharset;
+ }
+ }
+
public static void main(String[] args) throws IOException {
Path devtestDir = Paths.get(System.getProperty("user.home"),
"data", "charsets", "devtest");
@@ -117,8 +264,13 @@ public static void main(String[] args) throws IOException {
int maxRecords = DEFAULT_MAX_RECORDS;
int[] lengths = DEFAULT_LENGTHS;
float threshold = DEFAULT_THRESHOLD;
- List wrongCharsets = DEFAULT_WRONG_CHARSETS;
+ List wrongCharsetsOverride = null;
List labeledFilter = null;
+ boolean usePerSourceMap = true;
+ List stripModes = DEFAULT_STRIP_MODES;
+ Path perRecordPath = null;
+ int perRecordMax = DEFAULT_PER_RECORD_MAX;
+ boolean wordMode = false;
for (int i = 0; i < args.length; i++) {
switch (args[i]) {
@@ -142,11 +294,27 @@ public static void main(String[] args) throws IOException {
.mapToInt(Integer::parseInt).toArray();
break;
case "--wrong-charsets":
- wrongCharsets = Arrays.asList(args[++i].split(","));
+ wrongCharsetsOverride = Arrays.asList(args[++i].split(","));
+ usePerSourceMap = false;
+ break;
+ case "--no-per-source-map":
+ usePerSourceMap = false;
break;
case "--only":
labeledFilter = Arrays.asList(args[++i].split(","));
break;
+ case "--strip-modes":
+ stripModes = parseStripModes(args[++i]);
+ break;
+ case "--per-record":
+ perRecordPath = Paths.get(args[++i]);
+ break;
+ case "--per-record-max":
+ perRecordMax = Integer.parseInt(args[++i]);
+ break;
+ case "--word-mode":
+ wordMode = true;
+ break;
default:
System.err.println("Unknown arg: " + args[i]);
printUsage();
@@ -172,17 +340,12 @@ public static void main(String[] args) throws IOException {
System.err.println(" max-records: " + maxRecords);
System.err.println(" lengths: " + Arrays.toString(lengths));
System.err.println(" threshold: " + threshold);
- System.err.println(" wrong-cs: " + wrongCharsets);
-
- // Resolve wrong charsets (skip any the JVM doesn't have)
- Map resolvedWrong = new LinkedHashMap<>();
- for (String name : wrongCharsets) {
- Charset cs = tryGetCharset(name);
- if (cs == null) {
- System.err.println(" WARN: wrong-charset unavailable: " + name);
- continue;
- }
- resolvedWrong.put(name, cs);
+ System.err.println(" strip-modes: " + stripModes);
+ System.err.println(" per-source map: " + (usePerSourceMap ? "yes" : "no"));
+ System.err.println(" word-mode: " + wordMode);
+ if (perRecordPath != null) {
+ System.err.println(" per-record: " + perRecordPath
+ + " (max " + perRecordMax + " per cell)");
}
List files;
@@ -204,13 +367,21 @@ public static void main(String[] args) throws IOException {
List allRows = new ArrayList<>();
try (PrintWriter detail = new PrintWriter(
- Files.newBufferedWriter(detailPath, StandardCharsets.UTF_8))) {
+ Files.newBufferedWriter(detailPath, StandardCharsets.UTF_8));
+ PrintWriter perRecord = perRecordPath != null
+ ? new PrintWriter(Files.newBufferedWriter(perRecordPath,
+ StandardCharsets.UTF_8))
+ : null) {
- detail.println("labeled_cs\tscript\twrong_cs\tlength\tn"
+ detail.println("strip_mode\tlabeled_cs\tscript\twrong_cs\tlength\tn"
+ "\tmean_clean_z\tmean_mojibake_z\tcohens_d"
+ "\tmean_margin\tp5_margin\tp50_margin"
+ "\tfpr\ttpr");
+ if (perRecord != null) {
+ perRecord.println(perRecordHeader());
+ }
+
for (Path file : files) {
String labeledName = filenameToCharsetName(file);
if (labeledFilter != null && !labeledFilter.contains(labeledName)) {
@@ -221,77 +392,213 @@ public static void main(String[] args) throws IOException {
System.err.println(" SKIP: labeled charset unavailable: " + labeledName);
continue;
}
-
List records = readRecords(file, maxRecords);
- if (records.size() < MIN_SAMPLES_PER_CELL) {
- System.err.printf(" SKIP %s: only %d records%n",
- labeledName, records.size());
+ processLabeled(detector, detail, perRecord, allRows,
+ labeledName, labeled, records,
+ stripModes, lengths, threshold,
+ wrongCharsetsOverride, usePerSourceMap, perRecordMax,
+ false, wordMode);
+ }
+
+ // Synthetic sources: re-encode a known charset's records under
+ // a charset that has no devtest file of its own. Used to
+ // reproduce the Polish ¶ failure (synthesize ISO-8859-2 bytes
+ // from windows-1250 records).
+ for (Map.Entry e : SYNTHETIC_SOURCES.entrySet()) {
+ String synthName = e.getKey();
+ if (labeledFilter != null && !labeledFilter.contains(synthName)) {
+ continue;
+ }
+ Charset synth = tryGetCharset(synthName);
+ if (synth == null) {
+ System.err.println(" SKIP synthetic: charset unavailable: " + synthName);
+ continue;
+ }
+ SyntheticSource src = e.getValue();
+ Path sourceFile = devtestDir.resolve(src.sourceFileBasename + ".bin.gz");
+ if (!Files.isReadable(sourceFile)) {
+ System.err.println(" SKIP synthetic " + synthName
+ + ": source file missing: " + sourceFile);
continue;
}
+ Charset sourceCs = tryGetCharset(src.sourceCharset);
+ if (sourceCs == null) {
+ System.err.println(" SKIP synthetic " + synthName
+ + ": source charset unavailable: " + src.sourceCharset);
+ continue;
+ }
+ List sourceRecords = readRecords(sourceFile, maxRecords);
+ List synthRecords =
+ synthesizeRecords(sourceRecords, sourceCs, synth);
+ System.err.printf("%n=== synthetic %s ← %s round-trip: kept %d / %d records ===%n",
+ synthName, src.sourceCharset, synthRecords.size(),
+ sourceRecords.size());
+ processLabeled(detector, detail, perRecord, allRows,
+ synthName, synth, synthRecords,
+ stripModes, lengths, threshold,
+ wrongCharsetsOverride, usePerSourceMap, perRecordMax,
+ true, wordMode);
+ }
+ }
- System.err.printf("%n--- %s (%d records) ---%n",
- labeledName, records.size());
+ writeSummary(summaryPath, allRows, lengths);
+ writeScriptPivot(pivotPath, allRows);
- for (int len : lengths) {
- List slices = sliceToLength(records, len);
- if (slices.size() < MIN_SAMPLES_PER_CELL) {
- continue;
- }
+ System.err.println("\nWrote " + detailPath);
+ System.err.println("Wrote " + summaryPath);
+ System.err.println("Wrote " + pivotPath);
+ if (perRecordPath != null) {
+ System.err.println("Wrote " + perRecordPath);
+ }
+ System.err.println("Done.");
+ }
- // Decode all slices under labeled (clean) once
- List cleanTexts = decodeAll(slices, labeled);
- List cleanZs = scoreAll(detector, cleanTexts);
- if (cleanZs.size() < MIN_SAMPLES_PER_CELL) {
- continue;
- }
+ // -----------------------------------------------------------------------
+ // Per-labeled-charset processing (shared by real-file and synthetic loops)
+ // -----------------------------------------------------------------------
- // Detect script from a sample of the clean decoded text
- String script = detectDominantScript(
- cleanTexts.get(cleanTexts.size() / 2));
+ /**
+ * Runs the evaluation grid for one labeled charset. For every strip mode
+ * and length bucket, the record slices are decoded under {@code labeled}
+ * (clean) and under each resolved wrong charset (mojibake); each scored
+ * pair set becomes a {@code Row} that is appended to {@code allRows} and
+ * written to {@code detail}. When {@code perRecord} is non-null, feature
+ * snapshots are also written per record.
+ *
+ * Returns early with a SKIP message when fewer than
+ * {@code MIN_SAMPLES_PER_CELL} records are supplied; individual cells
+ * with too few slices or clean scores are skipped silently.
+ *
+ * @param synthetic only changes log output and the label passed to
+ * the per-record writer
+ * @param wordMode use whitespace-delimited tokens (single sentinel
+ * length bucket) instead of fixed-length slices
+ */
+ private static void processLabeled(JunkDetector detector,
+ PrintWriter detail, PrintWriter perRecord,
+ List allRows,
+ String labeledName, Charset labeled,
+ List records,
+ List stripModes,
+ int[] lengths,
+ float threshold,
+ List wrongCharsetsOverride,
+ boolean usePerSourceMap,
+ int perRecordMax,
+ boolean synthetic,
+ boolean wordMode) {
+ if (records.size() < MIN_SAMPLES_PER_CELL) {
+ System.err.printf(" SKIP %s: only %d records%n",
+ labeledName, records.size());
+ return;
+ }
+ List wrongCharsetNames = resolveWrongCharsets(
+ labeledName, wrongCharsetsOverride, usePerSourceMap);
+ Map resolvedWrong = resolveCharsets(wrongCharsetNames);
+
+ System.err.printf("%n--- %s%s (%d records, wrong=%s) ---%n",
+ labeledName, synthetic ? " (synthetic)" : "",
+ records.size(), wrongCharsetNames);
+
+ // Word-mode replaces the length-bucket loop with a single bucket of
+ // whitespace-delimited tokens (length-in-bytes = WORD_MODE_LEN_SENTINEL
+ // for reporting; actual sizes vary per token).
+ int[] effectiveLengths = wordMode
+ ? new int[]{WORD_MODE_LEN_SENTINEL} : lengths;
+
+ for (StripMode strip : stripModes) {
+ for (int len : effectiveLengths) {
+ // NOTE(review): the slices do not depend on `strip`; they are
+ // rebuilt for every strip mode.
+ List slices = wordMode
+ ? extractTokens(records, labeled)
+ : sliceToLength(records, len);
+ if (slices.size() < MIN_SAMPLES_PER_CELL) {
+ continue;
+ }
- for (Map.Entry entry : resolvedWrong.entrySet()) {
- String wrongName = entry.getKey();
- Charset wrongCs = entry.getValue();
- if (equalCharset(labeled, wrongCs)) {
- continue; // can't be its own mojibake
- }
+ List cleanTextsRaw = decodeAll(slices, labeled);
+ List cleanTexts = applyStrip(cleanTextsRaw, strip);
+ List cleanZs = scoreAll(detector, cleanTexts);
+ if (cleanZs.size() < MIN_SAMPLES_PER_CELL) {
+ continue;
+ }
- List mojiTexts = decodeAll(slices, wrongCs);
- // Pair cleanTexts[i] with mojiTexts[i] by source record
- Row row = scorePairs(detector, script, labeledName,
- wrongName, len, cleanTexts, mojiTexts,
- cleanZs, threshold);
- if (row == null) {
- continue;
- }
- allRows.add(row);
- detail.println(row.toTsv());
+ // Script is detected on the un-stripped text of the middle sample.
+ String script = detectDominantScript(
+ cleanTextsRaw.get(cleanTextsRaw.size() / 2));
+
+ // Clean-side snapshots are computed once per cell and reused for
+ // every wrong charset; skipped entirely without per-record output.
+ List cleanFeats = perRecord != null
+ ? snapshotAll(detector, cleanTexts)
+ : null;
+
+ for (Map.Entry entry : resolvedWrong.entrySet()) {
+ String wrongName = entry.getKey();
+ Charset wrongCs = entry.getValue();
+ if (equalCharset(labeled, wrongCs)) {
+ continue; // can't be its own mojibake
+ }
+
+ List mojiTextsRaw = decodeAll(slices, wrongCs);
+ List mojiTexts = applyStrip(mojiTextsRaw, strip);
+
+ Row row = scorePairs(detector, strip, script, labeledName,
+ wrongName, len, cleanTexts, mojiTexts,
+ cleanZs, threshold);
+ if (row == null) {
+ continue;
}
- detail.flush();
- System.err.printf(" len=%4d n_clean=%d cells=%d%n",
- len, cleanZs.size(),
- allRows.stream()
- .filter(r -> r.labeledCs.equals(labeledName)
- && r.length == len)
- .count());
+ allRows.add(row);
+ detail.println(row.toTsv());
+
+ if (perRecord != null) {
+ List mojiFeats =
+ snapshotAll(detector, mojiTexts);
+ writePerRecord(perRecord, labeledName + (synthetic ? " (syn)" : ""),
+ strip, labeledName, script, wrongName, len,
+ cleanTexts, mojiTexts,
+ cleanFeats, mojiFeats, perRecordMax);
+ }
+ }
+ detail.flush();
+ if (perRecord != null) {
+ perRecord.flush();
}
+ System.err.printf(" strip=%-18s len=%4d n_clean=%d%n",
+ strip, len, cleanZs.size());
}
}
+ }
- writeSummary(summaryPath, allRows, lengths);
- writeScriptPivot(pivotPath, allRows);
+ // -----------------------------------------------------------------------
+ // Synthetic source generation: source bytes → Unicode → synthetic bytes.
+ // -----------------------------------------------------------------------
- System.err.println("\nWrote " + detailPath);
- System.err.println("Wrote " + summaryPath);
- System.err.println("Wrote " + pivotPath);
- System.err.println("Done.");
+ /**
+ * Re-encodes each source record: bytes are decoded with {@code sourceCs}
+ * and the resulting text is encoded with {@code synthCs}. Both coders
+ * are configured with {@code CodingErrorAction.REPLACE}; the encoder
+ * substitutes {@code '?'} (0x3F) for unmappable characters.
+ *
+ * Records that decode to empty text, encode to zero bytes, or raise a
+ * {@code CharacterCodingException} are dropped, so the result may be
+ * shorter than the input. Output order follows input order.
+ */
+ private static List synthesizeRecords(List sourceRecords,
+ Charset sourceCs,
+ Charset synthCs) {
+ CharsetDecoder dec = sourceCs.newDecoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE);
+ java.nio.charset.CharsetEncoder enc = synthCs.newEncoder()
+ .onMalformedInput(CodingErrorAction.REPLACE)
+ .onUnmappableCharacter(CodingErrorAction.REPLACE)
+ .replaceWith(new byte[]{(byte) 0x3F}); // '?' for unmappable
+ List out = new ArrayList<>();
+ for (byte[] src : sourceRecords) {
+ String text;
+ try {
+ text = dec.decode(ByteBuffer.wrap(src)).toString();
+ } catch (CharacterCodingException e) {
+ // Record is dropped; the caller reports the kept/total counts.
+ continue;
+ }
+ if (text.isEmpty()) {
+ continue;
+ }
+ byte[] synth;
+ try {
+ java.nio.ByteBuffer bb = enc.encode(java.nio.CharBuffer.wrap(text));
+ // Copy only the encoded region out of the buffer.
+ synth = new byte[bb.remaining()];
+ bb.get(synth);
+ } catch (CharacterCodingException e) {
+ continue;
+ }
+ // No fidelity filter — lossy round-trips just mean the synthetic
+ // bytes contain more '?' chars (which become a replacement-ratio
+ // signal in the eval, not a discard reason).
+ if (synth.length > 0) {
+ out.add(synth);
+ }
+ }
+ return out;
}
// -----------------------------------------------------------------------
- // Per-cell scoring (one labeled × wrong × length cell)
+ // Per-cell scoring (one strip × labeled × wrong × length cell)
// -----------------------------------------------------------------------
private static Row scorePairs(JunkDetector detector,
+ StripMode strip,
String script,
String labeledName, String wrongName,
int length,
@@ -299,8 +606,6 @@ private static Row scorePairs(JunkDetector detector,
List mojiTexts,
List cleanZsPre,
float threshold) {
- // cleanZsPre is the already-scored clean text (avoid re-scoring per wrong cs).
- // We re-score only the mojibake side here.
int n = Math.min(cleanTexts.size(), mojiTexts.size());
List cleanZs = new ArrayList<>(n);
List mojiZs = new ArrayList<>(n);
@@ -312,6 +617,9 @@ private static Row scorePairs(JunkDetector detector,
continue;
}
float mz = ms.getZScore();
+ if (Float.isNaN(cz)) {
+ continue;
+ }
cleanZs.add(cz);
mojiZs.add(mz);
margins.add(cz - mz);
@@ -319,10 +627,228 @@ private static Row scorePairs(JunkDetector detector,
if (margins.size() < MIN_SAMPLES_PER_CELL) {
return null;
}
- return new Row(labeledName, script, wrongName, length,
+ return new Row(strip, labeledName, script, wrongName, length,
cleanZs, mojiZs, margins, threshold);
}
+ // -----------------------------------------------------------------------
+ // Per-record output (Phase 2 feature study)
+ // -----------------------------------------------------------------------
+
+    /**
+     * Builds the tab-separated header line of the per-record TSV. Column
+     * order must stay in step with the values printed by writePerRecord.
+     */
+    private static String perRecordHeader() {
+        String[] columns = {
+            // cell identity
+            "strip_mode", "labeled_cs", "script", "wrong_cs", "length", "record_idx",
+            // combined score
+            "clean_logit", "wrong_logit", "margin",
+            // per-component z-scores
+            "clean_z1", "clean_z2", "clean_z3", "clean_z4",
+            "wrong_z1", "wrong_z2", "wrong_z3", "wrong_z4",
+            // text-quality features, clean/wrong side by side
+            "clean_alphabetic_ratio", "wrong_alphabetic_ratio",
+            "clean_letter_pair_density", "wrong_letter_pair_density",
+            "clean_high_byte_entropy", "wrong_high_byte_entropy",
+            "clean_replacement_ratio", "wrong_replacement_ratio",
+            "clean_replacement_count", "wrong_replacement_count",
+            "clean_per_word_script_purity", "wrong_per_word_script_purity",
+            "clean_combining_mark_ratio", "wrong_combining_mark_ratio",
+            "clean_letter_adj_mark_ratio", "wrong_letter_adj_mark_ratio",
+            "clean_dominant_script", "wrong_dominant_script",
+            // codepoint statistics and the (escaped) texts themselves
+            "n_cp_clean", "n_differing_cp",
+            "clean_text", "wrong_text",
+        };
+        return String.join("\t", columns);
+    }
+
+ /**
+ * Writes one TSV row per (clean, mojibake) record pair of a single cell,
+ * in the column order produced by {@code perRecordHeader()}.
+ *
+ * Pairs where either logit is NaN are skipped and do not count against
+ * the cap; at most {@code maxRecords} rows are emitted. The
+ * {@code record_idx} column is the pair's index in the input lists.
+ * Both texts are passed through {@code escapeForTsv} with a limit of 80
+ * codepoints.
+ *
+ * NOTE(review): {@code fileName} is not read anywhere in this method.
+ */
+ private static void writePerRecord(PrintWriter out, String fileName,
+ StripMode strip,
+ String labeledName, String script,
+ String wrongName, int len,
+ List cleanTexts,
+ List mojiTexts,
+ List cleanFeats,
+ List mojiFeats,
+ int maxRecords) {
+ int n = Math.min(cleanFeats.size(), mojiFeats.size());
+ int emitted = 0;
+ for (int i = 0; i < n && emitted < maxRecords; i++) {
+ FeatureSnapshot c = cleanFeats.get(i);
+ FeatureSnapshot w = mojiFeats.get(i);
+ if (c.logit != c.logit || w.logit != w.logit) { // NaN check
+ continue;
+ }
+ String cText = cleanTexts.get(i);
+ String wText = mojiTexts.get(i);
+ int nCpClean = cText.codePointCount(0, cText.length());
+ int nDiffCp = countDifferingCodepoints(cText, wText);
+ float margin = c.logit - w.logit;
+ // One format fragment per header group; the argument lines below
+ // follow the same grouping.
+ out.printf("%s\t%s\t%s\t%s\t%d\t%d"
+ + "\t%.4f\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%d\t%d"
+ + "\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%.4f\t%.4f"
+ + "\t%s\t%s"
+ + "\t%d\t%d"
+ + "\t%s\t%s%n",
+ strip, labeledName, script, wrongName, len, i,
+ c.logit, w.logit, margin,
+ c.z1, c.z2, c.z3, c.z4,
+ w.z1, w.z2, w.z3, w.z4,
+ c.alphabeticRatio, w.alphabeticRatio,
+ c.letterPairDensity, w.letterPairDensity,
+ c.highByteEntropy, w.highByteEntropy,
+ c.replacementRatio, w.replacementRatio,
+ c.replacementCount, w.replacementCount,
+ c.perWordScriptPurity, w.perWordScriptPurity,
+ c.combiningMarkRatio, w.combiningMarkRatio,
+ c.letterAdjacentToMarkRatio, w.letterAdjacentToMarkRatio,
+ c.dominantScript, w.dominantScript,
+ nCpClean, nDiffCp,
+ escapeForTsv(cText, 80),
+ escapeForTsv(wText, 80));
+ emitted++;
+ }
+ }
+
+    /**
+     * Render {@code s} for inclusion in a TSV cell: replace every control,
+     * format, tab/newline and backslash codepoint with a {@code <U+XXXX>}
+     * escape (uppercase hex, at least four digits) so the row remains
+     * parseable. Truncate to {@code maxCp} codepoints with a trailing
+     * ellipsis to keep TSV rows manageable.
+     *
+     * @param s     text to render; {@code null} yields the empty string
+     * @param maxCp maximum number of source codepoints to keep
+     * @return the escaped, possibly truncated text
+     */
+    private static String escapeForTsv(String s, int maxCp) {
+        if (s == null) {
+            return "";
+        }
+        StringBuilder sb = new StringBuilder();
+        int kept = 0;
+        for (int i = 0; i < s.length(); ) {
+            int cp = s.codePointAt(i);
+            i += Character.charCount(cp);
+            if (kept >= maxCp) {
+                // Only reached when at least one more codepoint exists.
+                sb.append("…");
+                break;
+            }
+            kept++;
+            int type = Character.getType(cp);
+            if (cp == '\t' || cp == '\n' || cp == '\r' || cp == '\\'
+                    || type == Character.CONTROL
+                    || type == Character.FORMAT) {
+                // An empty format string would silently delete the
+                // character; emit a visible escape instead.
+                sb.append(String.format("<U+%04X>", cp));
+            } else {
+                sb.appendCodePoint(cp);
+            }
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Counts positions at which the two strings differ when compared
+     * codepoint by codepoint; codepoints beyond the end of the shorter
+     * string each count as one difference.
+     */
+    private static int countDifferingCodepoints(String a, String b) {
+        int[] left = a.codePoints().toArray();
+        int[] right = b.codePoints().toArray();
+        int shared = Math.min(left.length, right.length);
+        // Start from the length gap: unpaired trailing codepoints all differ.
+        int mismatches = Math.abs(left.length - right.length);
+        for (int pos = 0; pos < shared; pos++) {
+            mismatches += (left[pos] == right[pos]) ? 0 : 1;
+        }
+        return mismatches;
+    }
+
+ /**
+ * Builds one {@code FeatureSnapshot} per input text, in input order, so
+ * the result can be indexed in parallel with {@code texts}.
+ */
+ private static List snapshotAll(JunkDetector detector,
+ List texts) {
+ List out = new ArrayList<>(texts.size());
+ for (String t : texts) {
+ out.add(FeatureSnapshot.of(detector, t));
+ }
+ return out;
+ }
+
+ /**
+ * Immutable bundle of the values written per record: the detector's
+ * component scores plus the text-quality features computed on the same
+ * text. Instances are created through {@link #of}.
+ */
+ private static final class FeatureSnapshot {
+ // Copied from JunkDetector.FeatureComponents (see of()).
+ final float z1, z2, z3, z4, logit;
+ // "-" when the detector reports no dominant script.
+ final String dominantScript;
+ // The remaining fields come from TextQualityFeatures.
+ final double alphabeticRatio;
+ final double letterPairDensity;
+ final double highByteEntropy;
+ final double replacementRatio;
+ final int replacementCount;
+ final double perWordScriptPurity;
+ final double combiningMarkRatio;
+ final double letterAdjacentToMarkRatio;
+
+ FeatureSnapshot(float z1, float z2, float z3, float z4, float logit,
+ String dominantScript,
+ double alphabeticRatio, double letterPairDensity,
+ double highByteEntropy,
+ double replacementRatio, int replacementCount,
+ double perWordScriptPurity,
+ double combiningMarkRatio,
+ double letterAdjacentToMarkRatio) {
+ this.z1 = z1;
+ this.z2 = z2;
+ this.z3 = z3;
+ this.z4 = z4;
+ this.logit = logit;
+ this.dominantScript = dominantScript;
+ this.alphabeticRatio = alphabeticRatio;
+ this.letterPairDensity = letterPairDensity;
+ this.highByteEntropy = highByteEntropy;
+ this.replacementRatio = replacementRatio;
+ this.replacementCount = replacementCount;
+ this.perWordScriptPurity = perWordScriptPurity;
+ this.combiningMarkRatio = combiningMarkRatio;
+ this.letterAdjacentToMarkRatio = letterAdjacentToMarkRatio;
+ }
+
+ /**
+ * Scores {@code text} with the detector and computes every
+ * text-quality feature on the same string.
+ */
+ static FeatureSnapshot of(JunkDetector detector, String text) {
+ JunkDetector.FeatureComponents fc =
+ detector.scoreWithFeatureComponents(text);
+ return new FeatureSnapshot(
+ fc.z1, fc.z2, fc.z3, fc.z4, fc.logit,
+ fc.dominantScript == null ? "-" : fc.dominantScript,
+ TextQualityFeatures.alphabeticRatio(text),
+ TextQualityFeatures.letterPairDensity(text),
+ TextQualityFeatures.highByteEntropy(text),
+ TextQualityFeatures.replacementRatio(text),
+ TextQualityFeatures.replacementCount(text),
+ TextQualityFeatures.perWordScriptPurity(text),
+ TextQualityFeatures.combiningMarkRatio(text),
+ TextQualityFeatures.letterAdjacentToMarkRatio(text));
+ }
+ }
+
+ // -----------------------------------------------------------------------
+ // Wrong-charset list resolution
+ // -----------------------------------------------------------------------
+
+ /**
+ * Picks the wrong-charset name list for one labeled charset. Precedence:
+ * an explicit {@code override} list, then (if {@code usePerSourceMap})
+ * the entry for {@code labeledName} in PER_SOURCE_WRONG_CHARSETS, then
+ * DEFAULT_WRONG_CHARSETS.
+ */
+ private static List resolveWrongCharsets(String labeledName,
+ List override,
+ boolean usePerSourceMap) {
+ if (override != null) {
+ return override;
+ }
+ if (usePerSourceMap) {
+ List perSource = PER_SOURCE_WRONG_CHARSETS.get(labeledName);
+ // No entry for this charset: fall through to the default list.
+ if (perSource != null) {
+ return perSource;
+ }
+ }
+ return DEFAULT_WRONG_CHARSETS;
+ }
+
+ /**
+ * Looks up each name with {@code tryGetCharset} and returns the ones
+ * that resolve, keyed by the name as given and kept in input order.
+ * Names that do not resolve are reported on stderr and omitted.
+ */
+ private static Map resolveCharsets(List names) {
+ Map out = new LinkedHashMap<>();
+ for (String n : names) {
+ Charset cs = tryGetCharset(n);
+ if (cs != null) {
+ out.put(n, cs);
+ } else {
+ System.err.println(" WARN: wrong-charset unavailable: " + n);
+ }
+ }
+ return out;
+ }
+
// -----------------------------------------------------------------------
// I/O: read the gzipped length-prefixed record format
// -----------------------------------------------------------------------
@@ -347,16 +873,97 @@ private static List readRecords(Path file, int maxRecords) throws IOExce
return records;
}
+ /**
+ * Tile each record into contiguous {@code len}-byte slices. A 500-byte
+ * record at {@code len=10} yields 50 slices, not 1. This dramatically
+ * increases the short-length sample count without needing more records,
+ * which matters because the short-string buckets (5, 10, 15) are the
+ * focus of this eval. Caps total slices per length to avoid runaway
+ * memory on extreme combinations. Trailing bytes that do not fill a
+ * whole slice are discarded. A non-positive {@code len} yields no
+ * slices.
+ */
private static List sliceToLength(List records, int len) {
List slices = new ArrayList<>();
+ if (len <= 0) {
+ // len == 0 would divide by zero below (reachable via --lengths 0);
+ // a negative len never produced slices anyway.
+ return slices;
+ }
+ int cap = MAX_SLICES_PER_LENGTH;
+ outer:
for (byte[] r : records) {
- if (r.length >= len) {
- slices.add(Arrays.copyOf(r, len));
+ int n = r.length / len;
+ for (int i = 0; i < n; i++) {
+ slices.add(Arrays.copyOfRange(r, i * len, (i + 1) * len));
+ if (slices.size() >= cap) {
+ break outer;
+ }
}
}
return slices;
}
+ /** Cap on slices emitted per length bucket — prevents runaway growth at
+ * small lengths (e.g. 1000-byte records × len=5 = 200 slices each). */
+ private static final int MAX_SLICES_PER_LENGTH = 20000;
+
+ /**
+ * Extract whitespace-delimited tokens from records decoded under
+ * {@code sourceCs}, then re-encode each token under {@code sourceCs} to
+ * get token-sized byte sequences. Used by {@code --word-mode}: directly
+ * tests the "single Polish word" failure case the eval is for. Token
+ * length filter (in codepoints) is set by {@link #WORD_MODE_MIN_CP} and
+ * {@link #WORD_MODE_MAX_CP}.
+ *
+ * Does NOT work for CJK cohorts — CJK text has no inter-character
+ * whitespace, so each record collapses to one giant token that exceeds
+ * {@link #WORD_MODE_MAX_CP} and gets dropped. For CJK use the
+ * fixed-length slicing path (default).
+ *
+ * Stops as soon as {@link #MAX_SLICES_PER_LENGTH} tokens are collected.
+ */
+ private static List extractTokens(List records,
+ Charset sourceCs) {
+ List out = new ArrayList<>();
+ for (byte[] r : records) {
+ String text = decode(r, sourceCs);
+ int len = text.length();
+ int i = 0;
+ while (i < len) {
+ // Skip the whitespace run in front of the next token.
+ int cp = text.codePointAt(i);
+ if (Character.isWhitespace(cp)) {
+ i += Character.charCount(cp);
+ continue;
+ }
+ int tokenStart = i;
+ int cps = 0;
+ // Advance to the next whitespace (or end), counting codepoints.
+ while (i < len) {
+ int c = text.codePointAt(i);
+ if (Character.isWhitespace(c)) {
+ break;
+ }
+ cps++;
+ i += Character.charCount(c);
+ }
+ if (cps >= WORD_MODE_MIN_CP && cps <= WORD_MODE_MAX_CP) {
+ String token = text.substring(tokenStart, i);
+ byte[] tokenBytes = token.getBytes(sourceCs);
+ if (tokenBytes.length > 0) {
+ out.add(tokenBytes);
+ }
+ }
+ // Same cap as the fixed-length slicing path.
+ if (out.size() >= MAX_SLICES_PER_LENGTH) {
+ return out;
+ }
+ }
+ }
+ return out;
+ }
+
+ // Inclusive token-length bounds, in codepoints, applied by extractTokens.
+ private static final int WORD_MODE_MIN_CP = 3;
+ private static final int WORD_MODE_MAX_CP = 30;
+
+ /**
+ * Length column written for word-mode rows. Word-mode produces samples
+ * of varying byte length, so we report a sentinel ({@code -1}) rather
+ * than tagging each row with the per-record token length (which would
+ * shatter the per-cell aggregation). Use {@code grep '\t-1\t'} or
+ * filter {@code length == -1} in the TSV to isolate word-mode rows.
+ *
+ * NOTE(review): writeSummary iterates only the configured
+ * {@code lengths} array, so rows carrying this sentinel appear to be
+ * left out of the summary file unless that array contains -1 — confirm.
+ */
+ private static final int WORD_MODE_LEN_SENTINEL = -1;
+
private static List decodeAll(List slices, Charset cs) {
List texts = new ArrayList<>(slices.size());
for (byte[] s : slices) {
@@ -376,6 +983,17 @@ private static String decode(byte[] bytes, Charset cs) {
}
}
+ /**
+ * Applies {@code TextQualityFeatures.strip} with {@code mode} to every
+ * text, preserving order. For {@code StripMode.NONE} the input list
+ * itself is returned (no copy).
+ */
+ private static List applyStrip(List texts, StripMode mode) {
+ if (mode == StripMode.NONE) {
+ return texts;
+ }
+ List out = new ArrayList<>(texts.size());
+ for (String t : texts) {
+ out.add(TextQualityFeatures.strip(t, mode));
+ }
+ return out;
+ }
+
private static List scoreAll(JunkDetector detector, List texts) {
List zs = new ArrayList<>(texts.size());
for (String t : texts) {
@@ -397,92 +1015,103 @@ private static void writeSummary(Path summaryPath, List rows,
int[] lengths) throws IOException {
try (PrintWriter out = new PrintWriter(
Files.newBufferedWriter(summaryPath, StandardCharsets.UTF_8))) {
- out.println("script\tlength\tn_cells"
+ out.println("strip_mode\tscript\tlength\tn_cells"
+ "\tmacro_cohens_d\tmacro_mean_margin\tmacro_p5_margin"
+ "\tmacro_fpr\tmacro_tpr");
- // Group by (script, length)
- Map>> bucketed = new HashMap<>();
+ // Group by (strip_mode, script, length)
+ Map>>> bucketed = new LinkedHashMap<>();
for (Row r : rows) {
bucketed
+ .computeIfAbsent(r.stripMode, k -> new LinkedHashMap<>())
.computeIfAbsent(r.script, k -> new HashMap<>())
.computeIfAbsent(r.length, k -> new ArrayList<>())
.add(r);
}
- List scripts = new ArrayList<>(bucketed.keySet());
- Collections.sort(scripts);
- for (String script : scripts) {
- for (int len : lengths) {
- List cell = bucketed.get(script).get(len);
- if (cell == null || cell.isEmpty()) {
- continue;
+ for (Map.Entry>>> e
+ : bucketed.entrySet()) {
+ StripMode strip = e.getKey();
+ List scripts = new ArrayList<>(e.getValue().keySet());
+ Collections.sort(scripts);
+ for (String script : scripts) {
+ for (int len : lengths) {
+ List cell = e.getValue().get(script).get(len);
+ if (cell == null || cell.isEmpty()) {
+ continue;
+ }
+ double macroD = cell.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .mapToDouble(r -> r.cohensD)
+ .average().orElse(Double.NaN);
+ double macroMargin = cell.stream()
+ .mapToDouble(r -> r.meanMargin)
+ .average().orElse(Double.NaN);
+ double macroP5 = cell.stream()
+ .mapToDouble(r -> r.p5Margin)
+ .average().orElse(Double.NaN);
+ double macroFpr = cell.stream()
+ .mapToDouble(r -> r.fpr)
+ .average().orElse(Double.NaN);
+ double macroTpr = cell.stream()
+ .mapToDouble(r -> r.tpr)
+ .average().orElse(Double.NaN);
+ out.printf("%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f%n",
+ strip, script, len, cell.size(),
+ macroD, macroMargin, macroP5, macroFpr, macroTpr);
}
- double macroD = cell.stream()
- .filter(r -> !Double.isNaN(r.cohensD))
- .mapToDouble(r -> r.cohensD)
- .average().orElse(Double.NaN);
- double macroMargin = cell.stream()
- .mapToDouble(r -> r.meanMargin)
- .average().orElse(Double.NaN);
- double macroP5 = cell.stream()
- .mapToDouble(r -> r.p5Margin)
- .average().orElse(Double.NaN);
- double macroFpr = cell.stream()
- .mapToDouble(r -> r.fpr)
- .average().orElse(Double.NaN);
- double macroTpr = cell.stream()
- .mapToDouble(r -> r.tpr)
- .average().orElse(Double.NaN);
- out.printf("%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f%n",
- script, len, cell.size(),
- macroD, macroMargin, macroP5, macroFpr, macroTpr);
}
}
}
}
// -----------------------------------------------------------------------
- // Aggregation: script_pivot.tsv (single line per script — quick triage)
+ // Aggregation: script_pivot.tsv (one line per strip × script — quick triage)
// -----------------------------------------------------------------------
+ /**
+ * Writes the pivot TSV: one line per (strip mode, script) with the cell
+ * count, the means of Cohen's d, mean margin and p5 margin over that
+ * group's cells, and the labels of the cells with the smallest d and the
+ * smallest mean margin. Cells whose d is NaN are excluded from the d
+ * mean and from the min-d pick. Strip modes are emitted in first-seen
+ * order; scripts are sorted within each strip mode.
+ */
private static void writeScriptPivot(Path path, List rows) throws IOException {
try (PrintWriter out = new PrintWriter(
Files.newBufferedWriter(path, StandardCharsets.UTF_8))) {
- out.println("script\tn_cells"
+ out.println("strip_mode\tscript\tn_cells"
+ "\tmean_d\tmean_margin\tmean_p5_margin"
+ "\tmin_d_cell\tmin_margin_cell");
- Map> byScript = new HashMap<>();
+ Map>> byStripScript = new LinkedHashMap<>();
for (Row r : rows) {
- byScript.computeIfAbsent(r.script, k -> new ArrayList<>()).add(r);
+ byStripScript
+ .computeIfAbsent(r.stripMode, k -> new LinkedHashMap<>())
+ .computeIfAbsent(r.script, k -> new ArrayList<>())
+ .add(r);
}
- List scripts = new ArrayList<>(byScript.keySet());
- Collections.sort(scripts);
- for (String script : scripts) {
- List cells = byScript.get(script);
- double meanD = cells.stream()
- .filter(r -> !Double.isNaN(r.cohensD))
- .mapToDouble(r -> r.cohensD)
- .average().orElse(Double.NaN);
- double meanMargin = cells.stream()
- .mapToDouble(r -> r.meanMargin)
- .average().orElse(Double.NaN);
- double meanP5 = cells.stream()
- .mapToDouble(r -> r.p5Margin)
- .average().orElse(Double.NaN);
- Row minDCell = cells.stream()
- .filter(r -> !Double.isNaN(r.cohensD))
- .min((a, b) -> Double.compare(a.cohensD, b.cohensD))
- .orElse(null);
- Row minMarginCell = cells.stream()
- .min((a, b) -> Double.compare(a.meanMargin, b.meanMargin))
- .orElse(null);
- out.printf("%s\t%d\t%.3f\t%.3f\t%.3f\t%s\t%s%n",
- script, cells.size(),
- meanD, meanMargin, meanP5,
- minDCell != null ? cellLabel(minDCell) : "-",
- minMarginCell != null ? cellLabel(minMarginCell) : "-");
+ for (Map.Entry>> e : byStripScript.entrySet()) {
+ StripMode strip = e.getKey();
+ List scripts = new ArrayList<>(e.getValue().keySet());
+ Collections.sort(scripts);
+ for (String script : scripts) {
+ List cells = e.getValue().get(script);
+ double meanD = cells.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .mapToDouble(r -> r.cohensD)
+ .average().orElse(Double.NaN);
+ double meanMargin = cells.stream()
+ .mapToDouble(r -> r.meanMargin)
+ .average().orElse(Double.NaN);
+ double meanP5 = cells.stream()
+ .mapToDouble(r -> r.p5Margin)
+ .average().orElse(Double.NaN);
+ Row minDCell = cells.stream()
+ .filter(r -> !Double.isNaN(r.cohensD))
+ .min((a, b) -> Double.compare(a.cohensD, b.cohensD))
+ .orElse(null);
+ Row minMarginCell = cells.stream()
+ .min((a, b) -> Double.compare(a.meanMargin, b.meanMargin))
+ .orElse(null);
+ out.printf("%s\t%s\t%d\t%.3f\t%.3f\t%.3f\t%s\t%s%n",
+ strip, script, cells.size(),
+ meanD, meanMargin, meanP5,
+ minDCell != null ? cellLabel(minDCell) : "-",
+ minMarginCell != null ? cellLabel(minMarginCell) : "-");
+ }
}
}
}
@@ -517,9 +1146,16 @@ private static boolean equalCharset(Charset a, Charset b) {
|| b.aliases().contains(a.name());
}
+    /**
+     * Parses the comma-separated value of {@code --strip-modes} into
+     * {@code StripMode} constants. Each token is trimmed and upper-cased
+     * before lookup, so names are matched case-insensitively.
+     *
+     * @param s comma-separated mode names, e.g. {@code "none,whitespace"}
+     * @return the modes in the order given
+     * @throws IllegalArgumentException if a token names no {@code StripMode}
+     */
+    private static List<StripMode> parseStripModes(String s) {
+        List<StripMode> out = new ArrayList<>();
+        for (String tok : s.split(",")) {
+            // Locale.ROOT keeps the upper-casing locale-independent; under a
+            // Turkish default locale "whitespace" would otherwise become
+            // "WHİTESPACE" and fail the enum lookup.
+            out.add(StripMode.valueOf(
+                    tok.trim().toUpperCase(java.util.Locale.ROOT)));
+        }
+        return out;
+    }
+
// -----------------------------------------------------------------------
- // Script detection (parallels JunkDetector.detectDominantScript, which is
- // package-private; small enough to inline)
+ // Script detection (parallels JunkDetector.detectDominantScript)
// -----------------------------------------------------------------------
private static final Map SCRIPT_FALLBACK = Map.of(
@@ -557,6 +1193,7 @@ private static String detectDominantScript(String text) {
// -----------------------------------------------------------------------
private static final class Row {
+ final StripMode stripMode;
final String labeledCs;
final String script;
final String wrongCs;
@@ -571,9 +1208,10 @@ private static final class Row {
final double fpr;
final double tpr;
- Row(String labeledCs, String script, String wrongCs, int length,
+ Row(StripMode stripMode, String labeledCs, String script, String wrongCs, int length,
List cleanZs, List mojiZs, List margins,
float threshold) {
+ this.stripMode = stripMode;
this.labeledCs = labeledCs;
this.script = script;
this.wrongCs = wrongCs;
@@ -591,8 +1229,8 @@ private static final class Row {
String toTsv() {
return String.format(
- "%s\t%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f",
- labeledCs, script, wrongCs, length, n,
+ "%s\t%s\t%s\t%s\t%d\t%d\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f",
+ stripMode, labeledCs, script, wrongCs, length, n,
meanCleanZ, meanMojiZ, cohensD,
meanMargin, p5Margin, p50Margin,
fpr, tpr);
@@ -682,7 +1320,12 @@ private static void printUsage() {
System.err.println(" [--max-records N] (default 2000)");
System.err.println(" [--threshold F] (default -2.0)");
System.err.println(" [--lengths 20,50,...]");
- System.err.println(" [--wrong-charsets a,b,...]");
+ System.err.println(" [--strip-modes NONE,WHITESPACE,WHITESPACE_CONTROL,ALL_COMMON]");
+ System.err.println(" [--wrong-charsets a,b,...] (override per-source map)");
+ System.err.println(" [--no-per-source-map] (use the global default list)");
System.err.println(" [--only labeledCs,...] (filter for spot runs)");
+ // --per-record takes a value (main stores it in perRecordPath), so name
+ // the placeholder; otherwise the line reads as a bare flag.
+ System.err.println(" [--per-record <path>] (write wide per-record TSV)");
+ System.err.println(" [--per-record-max N] (cap per cell, default 50)");
+ System.err.println(" [--word-mode] (use whitespace-delimited tokens, not byte slices)");
}
}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
deleted file mode 100644
index 47ee346d37f..00000000000
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/PrototypeCodepointHash.java
+++ /dev/null
@@ -1,1208 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.ml.junkdetect.tools;
-
-import java.io.DataInputStream;
-import java.io.EOFException;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.nio.ByteBuffer;
-import java.nio.charset.CharacterCodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.CharsetDecoder;
-import java.nio.charset.CodingErrorAction;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Stream;
-import java.util.zip.GZIPInputStream;
-
-import org.apache.tika.ml.chardetect.HtmlByteStripper;
-import org.apache.tika.ml.junkdetect.JunkDetector;
-import org.apache.tika.quality.TextQualityScore;
-
-/**
- * Throwaway prototype: validates the v6 codepoint-bigram-hash architecture
- * (Bloom-gated lookup with unigram backoff) by training on locally-available
- * text and measuring margins on the AIT5-class failure case (UTF-8 multi-
- * language records cross-decoded as GB18030).
- *
- * Goal: prove the codepoint-bigram-hash approach opens the
- * UTF-8→GB18030 mojibake margin meaningfully above v5's ~1 z-unit
- * baseline BEFORE committing to a multi-day production retrain.
- *
- *
Training corpus: decode {@code ~/data/charsets/devtest/GB18030.bin.gz}
- * (Chinese) + first 80% of {@code UTF-8.bin.gz} (multi-language Wikipedia)
- * under their labeled charsets, iterate codepoints, count bigrams and unigrams,
- * hash into N buckets, build Bloom filter of seen pairs. Held-out: last 20%
- * of UTF-8 records.
- *
- *
Eval: for each held-out UTF-8 record, slice to length buckets
- * {20, 50, 100, 200, 500, 1000} source bytes. Decode each slice under
- * UTF-8 (clean) and GB18030 (mojibake-as-HAN). Score both with the
- * prototype model. Margin = clean_score - mojibake_score. Report
- * mean and 5th-percentile margin per length.
- *
- *
Sweep: {bigramBuckets, alpha} grid. Pick the configuration that
- * maximises margin. Compare to v5 baseline (mean margin ~1 z-unit
- * across all lengths in the same cohort).
- *
- *
Outputs:
- *
- * - prototype-sweep.tsv: one row per
- * (bigram_buckets, alpha, length). Columns: n, mean_clean,
- * mean_moji, mean_margin, std_margin, p5_margin, p50_margin,
- * margin_in_clean_stds (effective z-units).
- *
- *
- * Usage:
- *
- * ./mvnw -pl tika-ml/tika-ml-junkdetect exec:java \
- * -Dexec.mainClass=org.apache.tika.ml.junkdetect.tools.PrototypeCodepointHash \
- * -Dexec.args="--devtest-dir ~/data/charsets/devtest --output-dir /tmp/v6-prototype"
- *
- */
-public class PrototypeCodepointHash {
-
- // --- Hyperparameter sweep grid ---
- private static final int[] BIGRAM_BUCKETS = {4096, 8192, 16384, 32768};
- private static final double[] ALPHAS = {1.0, 0.4};
- private static final int UNIGRAM_BUCKETS = 8192;
- private static final int BLOOM_BITS = 4 * 1024 * 1024; // 512 KB
- private static final int BLOOM_K = 7;
-
- // --- Smoothing ---
- private static final double ADD_ALPHA = 0.01;
-
- // --- Eval ---
- private static final int[] LENGTHS = {20, 50, 100, 200, 500, 1000};
- private static final int MAX_RECORDS_PER_FILE = 5000;
- private static final double HOLDOUT_FRACTION = 0.20;
- private static final int MIN_SCORE_CODEPOINTS = 3;
-
- public static void main(String[] args) throws IOException {
- Path devtestDir = Paths.get(System.getProperty("user.home"),
- "data", "charsets", "devtest");
- Path outputDir = Paths.get("/tmp/v6-prototype");
- int maxRecords = MAX_RECORDS_PER_FILE;
- List fixturesDirs = new ArrayList<>();
- String wrongCharsetName = "GB18030";
- boolean singleModel = false;
- List candidates = List.of(
- "UTF-8", "GB18030", "windows-1252", "windows-1251", "windows-1257",
- "Shift_JIS", "EUC-JP", "ISO-2022-JP", "UTF-16LE", "UTF-16BE");
- List forceCandidates = null; // when set, skip base detectors
- String expected = "UTF-8";
- int[] probeSizes = null; // when set, sweep these probe sizes per fixture
-
- for (int i = 0; i < args.length; i++) {
- switch (args[i]) {
- case "--devtest-dir":
- devtestDir = Paths.get(args[++i]);
- break;
- case "--output-dir":
- outputDir = Paths.get(args[++i]);
- break;
- case "--max-records":
- maxRecords = Integer.parseInt(args[++i]);
- break;
- case "--fixtures-dir":
- fixturesDirs.add(Paths.get(args[++i]));
- break;
- case "--wrong-charset":
- wrongCharsetName = args[++i];
- break;
- case "--single-model":
- // Skip prototype training; run N-way fixture eval on bundled JunkDetector only.
- singleModel = true;
- break;
- case "--candidates":
- candidates = Arrays.asList(args[++i].split(","));
- break;
- case "--force-candidates":
- // Bypass base detectors; pairwise tournament directly on these.
- forceCandidates = Arrays.asList(args[++i].split(","));
- break;
- case "--expected":
- expected = args[++i];
- break;
- case "--probe-sizes":
- // Comma-separated probe sizes (bytes). Each fixture
- // gets one row per size, so you can see how length
- // affects UNKNOWN vs scored.
- String[] sizes = args[++i].split(",");
- probeSizes = new int[sizes.length];
- for (int k = 0; k < sizes.length; k++) {
- probeSizes[k] = Integer.parseInt(sizes[k].trim());
- }
- break;
- default:
- System.err.println("Unknown arg: " + args[i]);
- System.exit(1);
- }
- }
- Files.createDirectories(outputDir);
-
- // --single-model bypasses the v5/v6-prototype comparison apparatus.
- // Requires --force-candidates to specify the charsets to compare;
- // the base-detector-driven path was removed to keep tika-ml-junkdetect
- // free of heavy encoding-detector deps.
- if (singleModel) {
- if (fixturesDirs.isEmpty()) {
- System.err.println("--single-model requires --fixtures-dir");
- System.exit(1);
- }
- if (forceCandidates == null || forceCandidates.isEmpty()) {
- System.err.println("--single-model requires --force-candidates "
- + "(e.g. --force-candidates UTF-8,GB18030)");
- System.exit(1);
- }
- evalFixturesSingleModel(fixturesDirs, forceCandidates, expected,
- probeSizes, outputDir);
- return;
- }
-
- System.err.println("=== PrototypeCodepointHash ===");
- System.err.println(" devtest-dir: " + devtestDir);
- System.err.println(" output-dir: " + outputDir);
- System.err.println(" max-records: " + maxRecords);
- System.err.println(" bigram_buckets sweep: " + Arrays.toString(BIGRAM_BUCKETS));
- System.err.println(" alpha sweep: " + Arrays.toString(ALPHAS));
- System.err.println(" unigram_buckets: " + UNIGRAM_BUCKETS);
- System.err.println(" bloom_bits: " + BLOOM_BITS
- + " (" + (BLOOM_BITS / 8 / 1024) + " KB, k=" + BLOOM_K + ")");
-
- // -------- Load corpus --------
-
- Charset utf8 = StandardCharsets.UTF_8;
- Charset gb18030 = Charset.forName("GB18030");
-
- System.err.println("\n--- Loading corpus ---");
- List utf8Records = readRecords(
- devtestDir.resolve("UTF-8.bin.gz"), maxRecords);
- List gbRecords = readRecords(
- devtestDir.resolve("GB18030.bin.gz"), maxRecords);
- System.err.printf(" UTF-8.bin.gz: %d records%n", utf8Records.size());
- System.err.printf(" GB18030.bin.gz: %d records%n", gbRecords.size());
-
- // Train/eval split on UTF-8 records. GB18030 records all go to training.
- int holdoutCount = (int) (utf8Records.size() * HOLDOUT_FRACTION);
- int utf8TrainSize = utf8Records.size() - holdoutCount;
- List utf8TrainBytes = utf8Records.subList(0, utf8TrainSize);
- List utf8EvalBytes = utf8Records.subList(utf8TrainSize, utf8Records.size());
- System.err.printf(" UTF-8 train: %d eval: %d%n",
- utf8TrainBytes.size(), utf8EvalBytes.size());
-
- // Decode training corpus to codepoint streams
- System.err.println("\n--- Decoding training corpus ---");
- List trainStreams = new ArrayList<>();
- long totalTrainCp = 0;
- for (byte[] r : utf8TrainBytes) {
- int[] cps = toCodepoints(decode(r, utf8));
- if (cps.length >= 2) trainStreams.add(cps);
- totalTrainCp += cps.length;
- }
- for (byte[] r : gbRecords) {
- int[] cps = toCodepoints(decode(r, gb18030));
- if (cps.length >= 2) trainStreams.add(cps);
- totalTrainCp += cps.length;
- }
- System.err.printf(" total training codepoints: %,d across %d records%n",
- totalTrainCp, trainStreams.size());
-
- // Count unique pairs (for Bloom sizing sanity)
- Set uniquePairs = new HashSet<>();
- for (int[] cps : trainStreams) {
- for (int i = 0; i + 1 < cps.length; i++) {
- uniquePairs.add(packPair(cps[i], cps[i + 1]));
- if (uniquePairs.size() >= 2_000_000) break;
- }
- if (uniquePairs.size() >= 2_000_000) break;
- }
- System.err.printf(" unique codepoint-pairs in training: ~%,d%n",
- uniquePairs.size());
-
- // -------- Hyperparameter sweep --------
-
- Path sweepPath = outputDir.resolve("prototype-sweep.tsv");
- try (PrintWriter out = new PrintWriter(
- Files.newBufferedWriter(sweepPath, StandardCharsets.UTF_8))) {
- out.println("bigram_buckets\talpha\tlength\tn"
- + "\tmean_clean\tstd_clean\tmean_moji"
- + "\tmean_margin\tstd_margin\tp5_margin\tp50_margin"
- + "\tmargin_in_clean_stds\tbloom_seen_frac_clean\tbloom_seen_frac_moji");
-
- for (int buckets : BIGRAM_BUCKETS) {
- for (double alpha : ALPHAS) {
- System.err.printf("%n--- Config: bigram_buckets=%d alpha=%.1f ---%n",
- buckets, alpha);
-
- Model m = train(trainStreams, buckets, UNIGRAM_BUCKETS,
- BLOOM_BITS, BLOOM_K, ADD_ALPHA, alpha);
-
- // Calibrate on a sample of training streams (for the
- // "margin_in_clean_stds" effective-z normalization)
- double[] muSigma = calibrate(m, trainStreams);
- System.err.printf(" train mu=%.3f sigma=%.3f%n", muSigma[0], muSigma[1]);
-
- // Eval on held-out UTF-8 records
- for (int len : LENGTHS) {
- EvalCell cell = evalAtLength(m, utf8EvalBytes, len, utf8, gb18030);
- if (cell == null) continue;
- double effZ = cell.meanMargin / Math.max(muSigma[1], 1e-6);
- out.printf("%d\t%.2f\t%d\t%d\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.3f\t%.3f\t%.3f%n",
- buckets, alpha, len, cell.n,
- cell.meanClean, cell.stdClean, cell.meanMoji,
- cell.meanMargin, cell.stdMargin,
- cell.p5Margin, cell.p50Margin,
- effZ, cell.bloomSeenFracClean, cell.bloomSeenFracMoji);
- System.err.printf(" len=%4d n=%-5d mean_margin=%6.3f p5=%6.3f"
- + " eff_z=%5.2f bloom_clean=%.2f bloom_moji=%.2f%n",
- len, cell.n, cell.meanMargin, cell.p5Margin, effZ,
- cell.bloomSeenFracClean, cell.bloomSeenFracMoji);
- out.flush();
- }
- }
- }
- }
- System.err.println("\nWrote " + sweepPath);
-
- // -------- Fixture eval (AIT5-class HTML files) --------
-
- if (!fixturesDirs.isEmpty()) {
- evalFixtures(trainStreams, fixturesDirs, wrongCharsetName, outputDir);
- }
-
- System.err.println("Done.");
- }
-
- // -----------------------------------------------------------------------
- // Real-life fixture eval: runs the production base detectors (BOM +
- // HtmlEncodingDetector + UniversalEncodingDetector) and asks the
- // JunkDetector to pick among their candidates via pairwise compare.
- // Mirrors the production charset-detection arbitration.
- // -----------------------------------------------------------------------
-
- private static void evalFixturesSingleModel(List fixturesDirs,
- List forceCandidates,
- String expected,
- int[] probeSizes,
- Path outputDir) throws IOException {
- System.err.println("\n--- Forced-candidates fixture eval ---");
- System.err.println(" candidates: " + forceCandidates);
- JunkDetector detector = JunkDetector.loadFromClasspath();
- System.err.println(" model version: " + detector.getModelVersion());
- System.err.println(" expected: " + expected);
-
- List forced = new ArrayList<>();
- for (String n : forceCandidates) {
- try {
- forced.add(Charset.forName(n));
- } catch (Exception e) {
- System.err.println(" skip unsupported charset: " + n);
- }
- }
-
- Path out = outputDir.resolve("fixtures-real-life.tsv");
- try (PrintWriter pw = new PrintWriter(
- Files.newBufferedWriter(out, StandardCharsets.UTF_8))) {
- pw.println("dir\tfile\tn_bytes\tprobe_size\texpected\tbom_cs\thtml_cs\tuniversal_cs"
- + "\tcandidates\twinner\tmargin\tstatus\tnotes");
- int pass = 0, fail = 0, skip = 0, agree = 0;
- double passMarginSum = 0.0;
- List failingLines = new ArrayList<>();
-
- for (Path dir : fixturesDirs) {
- if (!Files.isDirectory(dir)) {
- System.err.println(" WARN: not a directory: " + dir);
- continue;
- }
- try (Stream stream = Files.walk(dir)) {
- List files = new ArrayList<>();
- stream.filter(Files::isRegularFile).forEach(files::add);
- Collections.sort(files);
- int[] sizes = probeSizes != null ? probeSizes : new int[]{16_384};
- for (Path f : files) {
- for (int sz : sizes) {
- FixtureResult r =
- evalOneForced(f, expected, detector, forced, sz);
- pw.println(r.toTsvLine());
- switch (r.status) {
- case "PASS":
- pass++;
- passMarginSum += r.margin;
- break;
- case "FAIL":
- fail++;
- failingLines.add(r.dir + "/" + r.shortName
- + "@" + sz + " -> " + r.winner
- + " (expected " + r.expected + ")");
- break;
- case "AGREE":
- agree++;
- break;
- default:
- skip++;
- }
- }
- }
- }
- }
- int n = pass + fail;
- System.err.println();
- System.err.println("=== Summary ===");
- System.err.printf("Pass: %d / %d (%.1f%%) — JunkDetector picked the expected charset%n",
- pass, n, n == 0 ? 0.0 : 100.0 * pass / n);
- System.err.printf("Fail: %d%n", fail);
- System.err.printf("Agree: %d (all detectors agreed; no arbitration needed)%n", agree);
- System.err.printf("Skip: %d%n", skip);
- if (pass > 0) {
- System.err.printf("Mean margin on pass: %.3f%n", passMarginSum / pass);
- }
- if (!failingLines.isEmpty()) {
- System.err.println("Failing:");
- Collections.sort(failingLines);
- for (String line : failingLines) {
- System.err.println(" " + line);
- }
- }
- }
- System.err.println("Wrote " + out);
- }
-
- private static FixtureResult evalOneForced(Path file, String expected,
- JunkDetector detector,
- List forced,
- int probeBytes) throws IOException {
- byte[] raw = Files.readAllBytes(file);
- FixtureResult r = new FixtureResult();
- r.dir = file.getParent().getFileName().toString();
- String fname = file.getFileName().toString();
- r.shortName = fname.length() > 24 ? fname.substring(0, 24) : fname;
- r.bytes = raw.length;
- r.probeSize = probeBytes;
- r.expected = expected;
-
- if (isBinaryMagic(raw)) {
- r.status = "SKIP_BIN";
- return r;
- }
- // Strip HTML on the WHOLE raw buffer first, then slice to probeBytes
- // from the stripped content. Otherwise a small probe slice can land
- // entirely inside // boilerplate and leave
- // nothing to score after strip.
- byte[] strippedFull = stripHtmlBytes(raw);
- byte[] forDecode = strippedFull.length > probeBytes
- ? Arrays.copyOf(strippedFull, probeBytes) : strippedFull;
- r.candidatesStr = forced.stream().map(Charset::name)
- .reduce((a, b) -> a + "," + b).orElse("-");
-
- // Always log every candidate in notes — even those JunkDetector
- // rejects as unknown — so the failure mode is visible. An
- // "unknown" score itself is meaningful information when the other
- // candidate scored fine.
- String winner = null;
- String runner = null;
- float winnerZ = Float.NEGATIVE_INFINITY;
- float runnerZ = Float.NEGATIVE_INFINITY;
- StringBuilder notes = new StringBuilder();
- int decoded_scored = 0;
- for (Charset cs : forced) {
- String decoded = applyEntityVariant(new String(forDecode, cs), "expanded");
- int cps = toCodepoints(decoded).length;
- if (cps < 3) {
- notes.append(cs.name()).append("=TOO_SHORT(").append(cps).append(") ");
- continue;
- }
- TextQualityScore s = detector.score(decoded);
- if (s.isUnknown()) {
- // Diagnose: is this script-not-in-model (neutral case) or
- // all-runs-fragmented-too-short (a real mojibake signal)?
- String why = diagnoseUnknown(decoded, detector);
- notes.append(cs.name()).append("=UNK[").append(why).append("] ");
- continue;
- }
- float z = s.getZScore();
- notes.append(cs.name()).append("=").append(String.format("%.2f", z)).append(" ");
- decoded_scored++;
- if (z > winnerZ) {
- runner = winner;
- runnerZ = winnerZ;
- winner = cs.name();
- winnerZ = z;
- } else if (z > runnerZ) {
- runner = cs.name();
- runnerZ = z;
- }
- }
- if (winner == null) {
- r.status = "NO_DECODE";
- r.notes = notes.toString().trim();
- return r;
- }
- r.winner = winner;
- if (decoded_scored < 2) {
- // Only one candidate scored; no real arbitration happened.
- r.margin = Float.NaN;
- r.status = safeCanonical(winner).equals(safeCanonical(expected))
- ? "ONLY_EXPECTED_SCORED" : "ONLY_WRONG_SCORED";
- } else {
- r.margin = winnerZ - runnerZ;
- r.status = safeCanonical(winner).equals(safeCanonical(expected)) ? "PASS" : "FAIL";
- }
- r.notes = notes.toString().trim();
- return r;
- }
-
- /**
- * Diagnose why JunkDetector returned UNKNOWN for {@code text}. Walks
- * the same script-run logic, then classifies the failure mode:
- *
- * - {@code EMPTY} — input had no characters.
- * - {@code NO_MODELED_SCRIPT} — all runs are in scripts the model
- * doesn't know (legit reason to be neutral).
- * - {@code ALL_RUNS_TOO_SHORT(N)} — runs exist in modeled scripts
- * but every one is <2 UTF-8 bytes. Strong mojibake signal —
- * text is a salad of single codepoints from many scripts.
- * - {@code MIXED} — some runs were modeled-but-too-short and
- * some were unmodeled.
- *
- */
- private static String diagnoseUnknown(String text, JunkDetector detector) {
- if (text == null || text.isEmpty()) {
- return "EMPTY";
- }
- Set modeled = detector.knownScripts();
- // Walk codepoints, splitting on script boundaries — same as
- // JunkDetector.buildScriptRuns conceptually. Track per-script:
- // longest UTF-8-byte run length, plus a separate "unmodeled" tally.
- java.util.Map longestModeled = new java.util.HashMap<>();
- int unmodeledRuns = 0;
- int modeledTooShortRuns = 0;
- int currentBytes = 0;
- String currentScript = null;
- for (int i = 0; i < text.length(); ) {
- int cp = text.codePointAt(i);
- int charCount = Character.charCount(cp);
- String script = Character.UnicodeScript.of(cp).name();
- // COMMON / INHERITED / UNKNOWN attach to preceding run, but for
- // diagnosis we don't need to be that precise — treat them as a
- // continuation.
- if ("COMMON".equals(script) || "INHERITED".equals(script)
- || "UNKNOWN".equals(script)) {
- if (currentScript != null) {
- currentBytes += new String(new int[]{cp}, 0, 1)
- .getBytes(StandardCharsets.UTF_8).length;
- }
- } else if (script.equals(currentScript)) {
- currentBytes += new String(new int[]{cp}, 0, 1)
- .getBytes(StandardCharsets.UTF_8).length;
- } else {
- // close out previous run
- tallyRun(currentScript, currentBytes, modeled, longestModeled);
- if (currentScript != null) {
- if (!modeled.contains(currentScript)) {
- unmodeledRuns++;
- } else if (currentBytes < 2) {
- modeledTooShortRuns++;
- }
- }
- currentScript = script;
- currentBytes = new String(new int[]{cp}, 0, 1)
- .getBytes(StandardCharsets.UTF_8).length;
- }
- i += charCount;
- }
- // close final run
- if (currentScript != null) {
- if (!modeled.contains(currentScript)) {
- unmodeledRuns++;
- } else if (currentBytes < 2) {
- modeledTooShortRuns++;
- } else {
- longestModeled.merge(currentScript, currentBytes, Math::max);
- }
- }
- boolean anyModeledLong = !longestModeled.isEmpty();
- if (anyModeledLong) {
- // Some modeled run is ≥2 bytes — shouldn't have hit UNKNOWN.
- // (Possible discrepancy with the production logic; reported as MIXED.)
- return "MIXED(modeled_long=" + longestModeled.size() + ")";
- }
- if (modeledTooShortRuns > 0 && unmodeledRuns > 0) {
- return "MIXED(short=" + modeledTooShortRuns
- + ",unmodeled=" + unmodeledRuns + ")";
- }
- if (modeledTooShortRuns > 0) {
- return "ALL_RUNS_TOO_SHORT(" + modeledTooShortRuns + ")";
- }
- if (unmodeledRuns > 0) {
- return "NO_MODELED_SCRIPT(" + unmodeledRuns + ")";
- }
- return "OTHER";
- }
-
- private static void tallyRun(String script, int bytes, Set modeled,
- java.util.Map longestModeled) {
- if (script == null) {
- return;
- }
- if (modeled.contains(script) && bytes >= 2) {
- longestModeled.merge(script, bytes, Math::max);
- }
- }
-
- /**
- * Run HtmlByteStripper over the entire input; return the stripped
- * content bytes (or the input verbatim if no tags found).
- */
- private static byte[] stripHtmlBytes(byte[] raw) {
- byte[] dst = new byte[raw.length];
- HtmlByteStripper.Result r =
- HtmlByteStripper.strip(raw, 0, raw.length, dst, 0);
- if (r.tagCount > 0 && r.length > 0) {
- return Arrays.copyOf(dst, r.length);
- }
- return raw;
- }
-
- private static boolean isBinaryMagic(byte[] b) {
- if (b.length < 4) {
- return false;
- }
- if (b[0] == 0x50 && b[1] == 0x4B
- && (b[2] == 0x03 || b[2] == 0x05 || b[2] == 0x07)) {
- return true; // ZIP / JAR / APK / docx
- }
- if ((b[0] & 0xFF) == 0x1F && (b[1] & 0xFF) == 0x8B) {
- return true; // gzip
- }
- if (b[0] == '%' && b[1] == 'P' && b[2] == 'D' && b[3] == 'F') {
- return true; // PDF
- }
- if ((b[0] & 0xFF) == 0xD0 && (b[1] & 0xFF) == 0xCF) {
- return true; // OLE2
- }
- return false;
- }
-
- private static String safeCanonical(String charset) {
- if (charset == null) {
- return "";
- }
- try {
- return Charset.forName(charset).name();
- } catch (Exception e) {
- return charset.toUpperCase();
- }
- }
-
- private static final class FixtureResult {
- String dir;
- String shortName;
- int bytes;
- int probeSize;
- String expected;
- String bomCs;
- String htmlCs;
- String universalCs;
- String candidatesStr = "-";
- String winner = "-";
- float margin = Float.NaN;
- String status = "";
- String notes = "";
-
- String toTsvLine() {
- return String.format("%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s",
- dir, shortName, bytes, probeSize, expected,
- str(bomCs), str(htmlCs), str(universalCs),
- candidatesStr, str(winner),
- Float.isNaN(margin) ? "-" : String.format("%.3f", margin),
- status, notes.isEmpty() ? "-" : notes);
- }
-
- private static String str(String s) {
- return s == null ? "-" : s;
- }
- }
-
- // -----------------------------------------------------------------------
- // Fixture eval: score real-world AIT5-class HTML files under v5 and v6
- // prototype, with byte-level HTML stripping and entity-variant comparison.
- // -----------------------------------------------------------------------
-
- private static void evalFixtures(List trainStreams,
- List fixturesDirs,
- String wrongCharsetName,
- Path outputDir) throws IOException {
- System.err.println("\n--- Fixture eval (best config: 4096 buckets, alpha=1.0) ---");
- Model v6 = train(trainStreams, 4096, UNIGRAM_BUCKETS,
- BLOOM_BITS, BLOOM_K, ADD_ALPHA, 1.0);
- double[] muSigma = calibrate(v6, trainStreams);
- float mu = (float) muSigma[0];
- float sigma = (float) Math.max(muSigma[1], 1e-6);
- System.err.printf(" v6 train mu=%.3f sigma=%.3f%n", mu, sigma);
-
- JunkDetector v5 = JunkDetector.loadFromClasspath();
- Charset cleanCs = StandardCharsets.UTF_8;
- Charset wrongCs = Charset.forName(wrongCharsetName);
- System.err.println(" v5 model version: " + v5.getModelVersion());
- System.err.println(" clean charset: " + cleanCs.name());
- System.err.println(" mojibake charset: " + wrongCs.name());
-
- Path fixturesPath = outputDir.resolve("fixtures.tsv");
- try (PrintWriter out = new PrintWriter(
- Files.newBufferedWriter(fixturesPath, StandardCharsets.UTF_8))) {
- out.println("cluster\tfile\tentity_variant\tn_clean_cp\tn_moji_cp"
- + "\tv5_clean_z\tv5_moji_z\tv5_margin"
- + "\tv6_F1_clean\tv6_F1_moji\tv6_F1_margin"
- + "\tv6_combo_clean\tv6_combo_moji\tv6_combo_margin"
- + "\tdominant_script"
- + "\tv5_winner\tv6_F1_winner\tv6_combo_winner");
-
- for (Path dir : fixturesDirs) {
- if (!Files.isDirectory(dir)) {
- System.err.println(" WARN: not a directory: " + dir);
- continue;
- }
- try (java.util.stream.Stream files = Files.walk(dir)) {
- List sorted = new ArrayList<>();
- files.filter(Files::isRegularFile).forEach(sorted::add);
- Collections.sort(sorted);
- for (Path f : sorted) {
- evalOneFixture(f, v6, mu, sigma, v5, cleanCs, wrongCs, out);
- }
- }
- }
- }
- System.err.println("Wrote " + fixturesPath);
- }
-
- private static void evalOneFixture(Path file, Model v6, float v6Mu, float v6Sigma,
- JunkDetector v5,
- Charset cleanCs, Charset wrongCs,
- PrintWriter out) throws IOException {
- byte[] rawBytes = Files.readAllBytes(file);
- if (rawBytes.length > 16384) {
- rawBytes = Arrays.copyOf(rawBytes, 16384);
- }
- // Byte-level HTML strip (matches JunkFilterEncodingDetector production pipeline)
- byte[] stripDst = new byte[rawBytes.length];
- HtmlByteStripper.Result strip =
- HtmlByteStripper.strip(rawBytes, 0, rawBytes.length, stripDst, 0);
- byte[] forDecode = rawBytes;
- if (strip.tagCount > 0 && strip.length > 0) {
- forDecode = new byte[strip.length];
- System.arraycopy(stripDst, 0, forDecode, 0, strip.length);
- }
-
- String cluster = file.getParent().getFileName().toString();
- String fname = file.getFileName().toString();
- // shorten long content-hash names for readability in output
- String shortName = fname.length() > 12 ? fname.substring(0, 12) : fname;
-
- String cleanRaw = decode(forDecode, cleanCs);
- String mojiRaw = decode(forDecode, wrongCs);
-
- for (String variant : List.of("raw", "expanded", "removed")) {
- String clean = applyEntityVariant(cleanRaw, variant);
- String moji = applyEntityVariant(mojiRaw, variant);
- int[] cleanCps = toCodepoints(clean);
- int[] mojiCps = toCodepoints(moji);
- if (cleanCps.length < 3 || mojiCps.length < 3) continue;
-
- // --- v5 full pipeline (existing) ---
- TextQualityScore v5cs = v5.score(clean);
- TextQualityScore v5ms = v5.score(moji);
- float v5cleanZ = v5cs.isUnknown() ? Float.NaN : v5cs.getZScore();
- float v5mojiZ = v5ms.isUnknown() ? Float.NaN : v5ms.getZScore();
- float v5Margin = v5cleanZ - v5mojiZ;
-
- // --- v6 Feature 1 alone (codepoint-bigram-hash + Bloom + unigram backoff) ---
- ScoreResult v6c = score(v6, cleanCps);
- ScoreResult v6m = score(v6, mojiCps);
- double v6Margin = v6c.meanLogP - v6m.meanLogP;
-
- // --- v6 combined: substitute v6's F1 z-score into v5's classifier ---
- JunkDetector.FeatureComponents cleanFc = v5.scoreWithFeatureComponents(clean);
- JunkDetector.FeatureComponents mojiFc = v5.scoreWithFeatureComponents(moji);
- float v6F1zClean = (float) (v6c.meanLogP - v6Mu) / v6Sigma;
- float v6F1zMoji = (float) (v6m.meanLogP - v6Mu) / v6Sigma;
- float comboClean = recombineLogit(v6F1zClean, cleanFc);
- float comboMoji = recombineLogit(v6F1zMoji, mojiFc);
- float comboMargin = comboClean - comboMoji;
- String dominantScript = cleanFc != null ? cleanFc.dominantScript : "?";
-
- String v5Winner = Float.isNaN(v5Margin) ? "?" : (v5Margin > 0 ? "CLEAN" : "MOJI");
- String v6F1Winner = Double.isNaN(v6Margin) ? "?" : (v6Margin > 0 ? "CLEAN" : "MOJI");
- String v6cWinner = Float.isNaN(comboMargin) ? "?" : (comboMargin > 0 ? "CLEAN" : "MOJI");
-
- out.printf("%s\t%s\t%s\t%d\t%d"
- + "\t%.3f\t%.3f\t%.3f"
- + "\t%.4f\t%.4f\t%.4f"
- + "\t%.3f\t%.3f\t%.3f"
- + "\t%s\t%s\t%s\t%s%n",
- cluster, shortName, variant,
- cleanCps.length, mojiCps.length,
- v5cleanZ, v5mojiZ, v5Margin,
- v6c.meanLogP, v6m.meanLogP, v6Margin,
- comboClean, comboMoji, comboMargin,
- dominantScript,
- v5Winner, v6F1Winner, v6cWinner);
- out.flush();
- System.err.printf(" [%s/%s %-8s] v5: Δ%+6.2f %s v6F1: Δ%+6.3f %s v6combo: Δ%+6.2f %s script=%s%n",
- cluster, shortName, variant,
- v5Margin, v5Winner,
- v6Margin, v6F1Winner,
- comboMargin, v6cWinner,
- dominantScript);
- }
- }
-
- /**
- * Recomputes v5's per-script classifier logit with v6's F1 z-score
- * substituted for v5's z1. Approximation: keeps v5's classifier weights
- * (w1..w4, bias) which were trained on the OLD F1 distribution. A true
- * v6 retrain would re-fit w1 on the new F1 distribution; this version
- * gives a directional estimate of "what if we just swap F1?"
- */
- private static float recombineLogit(float v6F1z, JunkDetector.FeatureComponents fc) {
- if (fc == null || fc.classifierWeights == null) {
- return Float.NaN;
- }
- float[] cw = fc.classifierWeights;
- int nFeat = cw.length - 1;
- float logit = cw[nFeat]; // bias
- if (nFeat >= 1) logit += cw[0] * v6F1z;
- if (nFeat >= 2) logit += cw[1] * fc.z2;
- if (nFeat >= 3) logit += cw[2] * fc.z3;
- if (nFeat >= 4) logit += cw[3] * fc.z4;
- return logit;
- }
-
- // -----------------------------------------------------------------------
- // HTML entity expansion / removal (regex-based, sufficient for fixtures)
- // -----------------------------------------------------------------------
-
- private static final Pattern NUM_DEC = Pattern.compile("(\\d{1,7});");
- private static final Pattern NUM_HEX = Pattern.compile("[xX]([0-9a-fA-F]{1,6});");
- private static final Pattern NAMED =
- Pattern.compile("&(amp|lt|gt|quot|apos|nbsp|copy|reg);");
-
- private static String applyEntityVariant(String s, String variant) {
- switch (variant) {
- case "raw": return s;
- case "expanded": return expandEntities(s);
- case "removed": return removeEntities(s);
- default: throw new IllegalArgumentException(variant);
- }
- }
-
- private static String expandEntities(String in) {
- String s = in;
- s = NUM_DEC.matcher(s).replaceAll(mr -> {
- try {
- int cp = Integer.parseInt(mr.group(1));
- if (cp >= 0 && cp <= 0x10FFFF) {
- return Matcher.quoteReplacement(new String(Character.toChars(cp)));
- }
- } catch (NumberFormatException ignored) {
- // fall through, leave unchanged
- }
- return Matcher.quoteReplacement(mr.group());
- });
- s = NUM_HEX.matcher(s).replaceAll(mr -> {
- try {
- int cp = Integer.parseInt(mr.group(1), 16);
- if (cp >= 0 && cp <= 0x10FFFF) {
- return Matcher.quoteReplacement(new String(Character.toChars(cp)));
- }
- } catch (NumberFormatException ignored) {
- // fall through, leave unchanged
- }
- return Matcher.quoteReplacement(mr.group());
- });
- s = NAMED.matcher(s).replaceAll(mr -> {
- switch (mr.group(1)) {
- case "amp": return "&";
- case "lt": return "<";
- case "gt": return ">";
- case "quot": return "\"";
- case "apos": return "'";
- case "nbsp": return " ";
- case "copy": return "©";
- case "reg": return "®";
- default: return Matcher.quoteReplacement(mr.group());
- }
- });
- return s;
- }
-
- private static String removeEntities(String s) {
- s = NUM_DEC.matcher(s).replaceAll("");
- s = NUM_HEX.matcher(s).replaceAll("");
- s = NAMED.matcher(s).replaceAll("");
- return s;
- }
-
- // -----------------------------------------------------------------------
- // Training
- // -----------------------------------------------------------------------
-
- private static Model train(List streams,
- int bigramBuckets, int unigramBuckets,
- int bloomBits, int bloomK,
- double addAlpha, double backoffAlpha) {
- if (Integer.bitCount(bigramBuckets) != 1 || Integer.bitCount(unigramBuckets) != 1) {
- throw new IllegalArgumentException("Bucket counts must be powers of 2");
- }
- long[] bigramCounts = new long[bigramBuckets];
- long[] unigramCounts = new long[unigramBuckets];
- long bigramTotal = 0;
- long unigramTotal = 0;
- long[] bloomBitArr = new long[(bloomBits + 63) / 64];
-
- for (int[] cps : streams) {
- for (int i = 0; i < cps.length; i++) {
- int cp = cps[i];
- int uBucket = (int) (fnv1aUnigram(cp) & (unigramBuckets - 1));
- unigramCounts[uBucket]++;
- unigramTotal++;
- if (i + 1 < cps.length) {
- int cpNext = cps[i + 1];
- int bBucket = (int) (fnv1aBigram(cp, cpNext) & (bigramBuckets - 1));
- bigramCounts[bBucket]++;
- bigramTotal++;
- bloomAdd(bloomBitArr, bloomBits, bloomK, cp, cpNext);
- }
- }
- }
-
- // Convert to log-probabilities with add-alpha smoothing
- float[] bigramLogP = new float[bigramBuckets];
- double bigramDenom = bigramTotal + addAlpha * bigramBuckets;
- for (int i = 0; i < bigramBuckets; i++) {
- double p = (bigramCounts[i] + addAlpha) / bigramDenom;
- bigramLogP[i] = (float) Math.log(p);
- }
- float[] unigramLogP = new float[unigramBuckets];
- double unigramDenom = unigramTotal + addAlpha * unigramBuckets;
- for (int i = 0; i < unigramBuckets; i++) {
- double p = (unigramCounts[i] + addAlpha) / unigramDenom;
- unigramLogP[i] = (float) Math.log(p);
- }
-
- return new Model(bigramBuckets, unigramBuckets, bigramLogP, unigramLogP,
- bloomBitArr, bloomBits, bloomK, backoffAlpha);
- }
-
- private static double[] calibrate(Model m, List streams) {
- double s = 0;
- double s2 = 0;
- int n = 0;
- // Use a stride to avoid scoring every single train record
- int stride = Math.max(1, streams.size() / 1000);
- for (int i = 0; i < streams.size(); i += stride) {
- int[] cps = streams.get(i);
- if (cps.length < MIN_SCORE_CODEPOINTS) continue;
- ScoreResult r = score(m, cps);
- s += r.meanLogP;
- s2 += r.meanLogP * r.meanLogP;
- n++;
- }
- if (n == 0) return new double[]{0, 1};
- double mu = s / n;
- double var = Math.max(0, s2 / n - mu * mu);
- double sigma = Math.sqrt(var);
- return new double[]{mu, sigma};
- }
-
- // -----------------------------------------------------------------------
- // Scoring
- // -----------------------------------------------------------------------
-
- private static ScoreResult score(Model m, int[] cps) {
- if (cps.length < 2) return new ScoreResult(Double.NaN, 0, 0);
- double sum = 0;
- int n = 0;
- int seen = 0;
- for (int i = 0; i + 1 < cps.length; i++) {
- int cp1 = cps[i];
- int cp2 = cps[i + 1];
- double logP;
- if (bloomContains(m.bloomBits, m.bloomBitCount, m.bloomK, cp1, cp2)) {
- int b = (int) (fnv1aBigram(cp1, cp2) & (m.bigramBuckets - 1));
- logP = m.bigramLogP[b];
- seen++;
- } else {
- int u1 = (int) (fnv1aUnigram(cp1) & (m.unigramBuckets - 1));
- int u2 = (int) (fnv1aUnigram(cp2) & (m.unigramBuckets - 1));
- logP = m.backoffAlpha * (m.unigramLogP[u1] + m.unigramLogP[u2]);
- }
- sum += logP;
- n++;
- }
- return new ScoreResult(sum / n, n, seen);
- }
-
- private static final class ScoreResult {
- final double meanLogP;
- final int nPairs;
- final int seenPairs;
- ScoreResult(double m, int n, int s) {
- this.meanLogP = m;
- this.nPairs = n;
- this.seenPairs = s;
- }
- }
-
- // -----------------------------------------------------------------------
- // Eval at one length bucket
- // -----------------------------------------------------------------------
-
- private static EvalCell evalAtLength(Model m, List evalBytes, int length,
- Charset cleanCs, Charset wrongCs) {
- List cleans = new ArrayList<>();
- List mojis = new ArrayList<>();
- List margins = new ArrayList<>();
- double seenSumClean = 0, seenSumMoji = 0;
- int nSeenObs = 0;
- for (byte[] rec : evalBytes) {
- if (rec.length < length) continue;
- byte[] slice = Arrays.copyOf(rec, length);
- int[] cleanCps = toCodepoints(decode(slice, cleanCs));
- int[] mojiCps = toCodepoints(decode(slice, wrongCs));
- if (cleanCps.length < MIN_SCORE_CODEPOINTS
- || mojiCps.length < MIN_SCORE_CODEPOINTS) continue;
- ScoreResult sc = score(m, cleanCps);
- ScoreResult sm = score(m, mojiCps);
- if (Double.isNaN(sc.meanLogP) || Double.isNaN(sm.meanLogP)) continue;
- cleans.add(sc.meanLogP);
- mojis.add(sm.meanLogP);
- margins.add(sc.meanLogP - sm.meanLogP);
- if (sc.nPairs > 0) seenSumClean += (double) sc.seenPairs / sc.nPairs;
- if (sm.nPairs > 0) seenSumMoji += (double) sm.seenPairs / sm.nPairs;
- nSeenObs++;
- }
- if (margins.size() < 30) return null;
- EvalCell cell = new EvalCell();
- cell.n = margins.size();
- cell.meanClean = mean(cleans);
- cell.stdClean = std(cleans, cell.meanClean);
- cell.meanMoji = mean(mojis);
- cell.meanMargin = mean(margins);
- cell.stdMargin = std(margins, cell.meanMargin);
- cell.p5Margin = percentile(margins, 0.05);
- cell.p50Margin = percentile(margins, 0.50);
- cell.bloomSeenFracClean = nSeenObs > 0 ? seenSumClean / nSeenObs : Double.NaN;
- cell.bloomSeenFracMoji = nSeenObs > 0 ? seenSumMoji / nSeenObs : Double.NaN;
- return cell;
- }
-
- private static final class EvalCell {
- int n;
- double meanClean, stdClean;
- double meanMoji;
- double meanMargin, stdMargin;
- double p5Margin, p50Margin;
- double bloomSeenFracClean, bloomSeenFracMoji;
- }
-
- // -----------------------------------------------------------------------
- // FNV-1a hashing for codepoint bigram / unigram + Bloom filter
- // -----------------------------------------------------------------------
-
- private static final long FNV_OFFSET = 0xcbf29ce484222325L;
- private static final long FNV_PRIME = 0x100000001b3L;
-
- private static long fnv1aBigram(int cp1, int cp2) {
- long h = FNV_OFFSET;
- h = (h ^ ((cp1 >>> 24) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp1 >>> 16) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp1 >>> 8) & 0xFF)) * FNV_PRIME;
- h = (h ^ (cp1 & 0xFF)) * FNV_PRIME;
- h = (h ^ 0xFF) * FNV_PRIME; // separator
- h = (h ^ ((cp2 >>> 24) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp2 >>> 16) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp2 >>> 8) & 0xFF)) * FNV_PRIME;
- h = (h ^ (cp2 & 0xFF)) * FNV_PRIME;
- return h;
- }
-
- private static long fnv1aUnigram(int cp) {
- long h = FNV_OFFSET;
- h = (h ^ ((cp >>> 24) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp >>> 16) & 0xFF)) * FNV_PRIME;
- h = (h ^ ((cp >>> 8) & 0xFF)) * FNV_PRIME;
- h = (h ^ (cp & 0xFF)) * FNV_PRIME;
- return h;
- }
-
- private static long secondaryHash(int cp1, int cp2) {
- // Independent secondary hash for Bloom double-hashing. Just shuffle
- // the inputs differently.
- long h = 0xff51afd7ed558ccdL;
- h = (h ^ Integer.reverse(cp1)) * 0xc4ceb9fe1a85ec53L;
- h = (h ^ Integer.reverse(cp2)) * 0xc4ceb9fe1a85ec53L;
- h ^= h >>> 33;
- return h;
- }
-
- private static void bloomAdd(long[] bits, int bitCount, int k, int cp1, int cp2) {
- long h1 = fnv1aBigram(cp1, cp2);
- long h2 = secondaryHash(cp1, cp2);
- for (int i = 0; i < k; i++) {
- long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount;
- bits[(int) (pos >>> 6)] |= 1L << (pos & 63);
- }
- }
-
- private static boolean bloomContains(long[] bits, int bitCount, int k,
- int cp1, int cp2) {
- long h1 = fnv1aBigram(cp1, cp2);
- long h2 = secondaryHash(cp1, cp2);
- for (int i = 0; i < k; i++) {
- long pos = ((h1 + (long) i * h2) & 0x7FFFFFFFFFFFFFFFL) % bitCount;
- if ((bits[(int) (pos >>> 6)] & (1L << (pos & 63))) == 0) return false;
- }
- return true;
- }
-
- private static long packPair(int cp1, int cp2) {
- return ((long) cp1 << 32) | (cp2 & 0xFFFFFFFFL);
- }
-
- // -----------------------------------------------------------------------
- // I/O and decode utilities (copied from EvalJunkOnCharsetDevtest)
- // -----------------------------------------------------------------------
-
- private static List readRecords(Path file, int maxRecords) throws IOException {
- List records = new ArrayList<>();
- try (FileInputStream fis = new FileInputStream(file.toFile());
- GZIPInputStream gis = new GZIPInputStream(fis);
- DataInputStream dis = new DataInputStream(gis)) {
- while (records.size() < maxRecords) {
- int len;
- try {
- len = dis.readUnsignedShort();
- } catch (EOFException eof) {
- break;
- }
- byte[] rec = new byte[len];
- dis.readFully(rec);
- records.add(rec);
- }
- }
- return records;
- }
-
- private static String decode(byte[] bytes, Charset cs) {
- CharsetDecoder dec = cs.newDecoder()
- .onMalformedInput(CodingErrorAction.REPLACE)
- .onUnmappableCharacter(CodingErrorAction.REPLACE);
- try {
- return dec.decode(ByteBuffer.wrap(bytes)).toString();
- } catch (CharacterCodingException e) {
- return new String(bytes, cs);
- }
- }
-
- private static int[] toCodepoints(String s) {
- int[] cps = new int[s.length()];
- int n = 0;
- for (int i = 0; i < s.length(); ) {
- int cp = s.codePointAt(i);
- cps[n++] = cp;
- i += Character.charCount(cp);
- }
- return Arrays.copyOf(cps, n);
- }
-
- // -----------------------------------------------------------------------
- // Stats
- // -----------------------------------------------------------------------
-
- private static double mean(List xs) {
- double s = 0;
- int n = 0;
- for (double v : xs) {
- if (!Double.isNaN(v)) {
- s += v;
- n++;
- }
- }
- return n == 0 ? Double.NaN : s / n;
- }
-
- private static double std(List xs, double mu) {
- if (xs.size() < 2) return 0;
- double s = 0;
- int n = 0;
- for (double v : xs) {
- if (!Double.isNaN(v)) {
- s += (v - mu) * (v - mu);
- n++;
- }
- }
- return n < 2 ? 0 : Math.sqrt(s / (n - 1));
- }
-
- private static double percentile(List xs, double p) {
- List sorted = new ArrayList<>(xs);
- sorted.removeIf(v -> Double.isNaN(v));
- if (sorted.isEmpty()) return Double.NaN;
- Collections.sort(sorted);
- int idx = (int) Math.floor(p * (sorted.size() - 1));
- return sorted.get(idx);
- }
-
- // -----------------------------------------------------------------------
- // Model
- // -----------------------------------------------------------------------
-
- private static final class Model {
- final int bigramBuckets;
- final int unigramBuckets;
- final float[] bigramLogP;
- final float[] unigramLogP;
- final long[] bloomBits;
- final int bloomBitCount;
- final int bloomK;
- final double backoffAlpha;
- Model(int bb, int ub, float[] blp, float[] ulp,
- long[] bloom, int bbc, int bk, double a) {
- this.bigramBuckets = bb;
- this.unigramBuckets = ub;
- this.bigramLogP = blp;
- this.unigramLogP = ulp;
- this.bloomBits = bloom;
- this.bloomBitCount = bbc;
- this.bloomK = bk;
- this.backoffAlpha = a;
- }
- }
-}
diff --git a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
index cf52a9eedfc..88807c1cdbc 100644
--- a/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
+++ b/tika-ml/tika-ml-junkdetect/src/main/java/org/apache/tika/ml/junkdetect/tools/TrainJunkModel.java
@@ -24,19 +24,16 @@
import java.nio.ByteOrder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
-import java.nio.charset.UnsupportedCharsetException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
-import java.util.Set;
import java.util.TreeMap;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@@ -127,8 +124,11 @@
public class TrainJunkModel {
static final String MAGIC = "JUNKDET1";
- /** Sole supported file-format version. Matches JunkDetector.VERSION. */
- static final byte VERSION = 7;
+ /** Current file-format version produced by this trainer; matches
+ * {@link JunkDetector#VERSION}. v8 added two global calibrations (z5
+ * letter-adjacent-to-mark, z6 replacement-char) after the script-transition
+ * calibration and 6-feature per-script LR weights. TODO: describe v9..v13. */
+ static final byte VERSION = 13;
// -----------------------------------------------------------------------
// v7 model constants (per-script open-addressing codepoint-bigram tables)
@@ -156,25 +156,141 @@ public class TrainJunkModel {
static final float CONTROL_BYTE_MIN_SIGMA = 0.005f;
/**
- * Codec pairs used to build wrong-codec remap tables for training.
- * Each entry is {sourceCodec, wrongCodec}: text encoded in sourceCodec but
- * decoded as wrongCodec. Pairs within the same script family (e.g. CP1250↔CP1252)
- * produce wrong-accent distortions that shift characters between Unicode blocks
- * while staying in LATIN. Cross-script pairs (CP1252↔CP1255) additionally change
- * the Unicode script, which z4 also detects.
+ * Per-script extra positive-sample sources. For each entry the trainer
+ * reads length-prefixed records from {@code file}, decodes under the
+ * declared {@code charset}, and adds the resulting Unicode strings to
+ * the per-script positive class at fraction {@code fraction} of the
+ * primary corpus. Phase C of the JunkDetector cleanup uses this to
+ * augment the LATIN positive class with under-represented Central
+ * European and South-East Asian languages (Vietnamese, Polish, Czech,
+ * Baltic) sourced from the charset-detection training corpus.
*/
- static final String[][] WRONG_CODEC_PAIRS = {
- {"windows-1252", "windows-1250"}, // Western ↔ Central European (wrong accents)
- {"windows-1250", "windows-1252"}, // reverse
- {"windows-1252", "windows-1257"}, // Western ↔ Baltic (wrong accents)
- {"windows-1257", "windows-1252"}, // reverse
- {"windows-1252", "windows-1254"}, // Western ↔ Turkish (wrong accents)
- {"windows-1251", "windows-1252"}, // Cyrillic → Latin (cross-script)
- {"windows-1252", "windows-1251"}, // Latin → Cyrillic (cross-script)
- {"windows-1253", "windows-1252"}, // Greek → Latin (cross-script)
- {"windows-1252", "windows-1253"}, // Latin → Greek (cross-script)
- {"windows-1255", "windows-1252"}, // Hebrew → Latin (cross-script)
- {"windows-1252", "windows-1255"}, // Latin → Hebrew (the German vcard case)
+ static final Map> EXTRA_POSITIVE_SOURCES;
+
+ static {
+ Map> m = new LinkedHashMap<>();
+ Path charsetTrain = Paths.get(System.getProperty("user.home"),
+ "data", "charsets", "train");
+ // Fractions chosen small (0.04 / 0.04 / 0.02) to nudge bigram
+ // coverage without drowning out the primary LATIN corpus or
+ // collapsing per-script bias/discrimination on Western-Latin
+ // (English/Spanish/French) and Baltic test fixtures. Initial
+ // larger fractions (0.15/0.10/0.05) helped Vietnamese but
+ // dropped LATIN bias from ~1.6 to ~0.4 and broke the cp1257
+ // Baltic discrimination test.
+ m.put("LATIN", List.of(
+ // Vietnamese (the deferred Phase C target). windows-1258
+ // bytes decoded as windows-1258 give Unicode Vietnamese text
+ // that lifts the LATIN bigram model's Vietnamese coverage.
+ new ExtraPositiveSource(charsetTrain.resolve("windows-1258.bin.gz"),
+ "windows-1258", 0.04),
+ // Central European (Polish, Czech, Slovak, Hungarian,
+ // Croatian) — similarly under-represented.
+ new ExtraPositiveSource(charsetTrain.resolve("windows-1250.bin.gz"),
+ "windows-1250", 0.04),
+ // Baltic — modest boost for windows-1257 cohort coverage.
+ new ExtraPositiveSource(charsetTrain.resolve("windows-1257.bin.gz"),
+ "windows-1257", 0.02)));
+ EXTRA_POSITIVE_SOURCES = Collections.unmodifiableMap(m);
+ }
+
+ static final class ExtraPositiveSource { // immutable value holder: one entry of EXTRA_POSITIVE_SOURCES
+ final Path file; // file of length-prefixed records to read
+ final String charsetName; // declared charset the records are decoded under
+ final double fraction; // share of the primary corpus to add to the positive class
+
+ ExtraPositiveSource(Path file, String charsetName, double fraction) {
+ this.file = file;
+ this.charsetName = charsetName;
+ this.fraction = fraction; // stored as given; no range check is done here
+ }
+ }
+
+ /**
+ * Full-text byte-level mojibake pairs used by {@link #byteLevelMojibake}.
+ * Each entry is {sourceCodec, wrongCodec}: training text gets encoded in
+ * sourceCodec, then the resulting bytes are re-decoded as wrongCodec to
+ * produce realistic mojibake. Covers SBCS sibling confusion (1252↔1250,
+ * etc.), UTF-8 ↔ Latin (TIKA-4683), and CJK siblings (the GB18030↔EUC-JP
+ * cohort that was -14817 in the 29K eval). For codec pairs that share
+ * an ASCII subset, ASCII-only training samples pass through unchanged
+ * (no-op corruption), so the list is safe to apply across all scripts.
+ */
+ static final String[][] BYTE_LEVEL_MOJIBAKE_PAIRS = {
+ // SBCS Western family
+ {"windows-1252", "windows-1250"},
+ {"windows-1250", "windows-1252"},
+ {"windows-1252", "windows-1257"},
+ {"windows-1257", "windows-1252"},
+ {"windows-1252", "windows-1254"},
+ {"ISO-8859-1", "windows-1252"},
+ {"windows-1252", "ISO-8859-1"},
+ {"x-MacRoman", "windows-1252"},
+ // SBCS Cyrillic / Greek / RTL
+ {"windows-1251", "windows-1252"},
+ {"windows-1252", "windows-1251"},
+ {"windows-1253", "windows-1252"},
+ {"windows-1252", "windows-1253"},
+ {"windows-1255", "windows-1252"},
+ {"windows-1252", "windows-1255"},
+ {"windows-1256", "windows-1252"},
+ // Polish ¶ emblem and Central European
+ {"ISO-8859-2", "windows-1250"},
+ {"windows-1250", "ISO-8859-2"},
+ {"ISO-8859-3", "windows-1250"},
+ // Vietnamese
+ {"windows-1258", "windows-1252"},
+ {"windows-1252", "windows-1258"},
+ // UTF-8 → Latin (TIKA-4683 / AIT5)
+ {"UTF-8", "windows-1252"},
+ {"UTF-8", "ISO-8859-1"},
+ // UTF-8 → UTF-16 — UTF-8 bytes decoded as UTF-16 produce dense CJK
+ // ideographs (the AIT5 / TIKA-4683 shape); included for HAN-classifier
+ // training against this cohort.
+ {"UTF-8", "UTF-16LE"},
+ {"UTF-8", "UTF-16BE"},
+ // CJK siblings
+ {"GB18030", "EUC-JP"},
+ {"EUC-JP", "GB18030"}, // reverse
+ {"GB18030", "Shift_JIS"}, // CJK siblings
+ {"Shift_JIS", "GB18030"}, // reverse
+ {"Big5-HKSCS", "GB18030"}, // CJK siblings
+ {"GB18030", "Big5-HKSCS"}, // reverse
+ // Latin → CJK: the SPECIFIC pattern that produces our 66 wrong-CJK
+ // over-adoption cases. Western European accents (0xC0-0xFE in
+ // windows-1252) are valid 2-byte CJK lead bytes; GB18030/Shift_JIS/etc
+ // decoders consume them as the lead of a multi-byte sequence, which
+ // (a) inserts singleton Han characters scattered through Latin text
+ // and (b) eats the byte after each accent. Produces the
+ // long-Latin-with-singleton-HAN fragmentation that z9 measures.
+ // Without these pairs the LATIN classifier never sees this pattern
+ // in its negatives and the LR fits w9 = 0.
+ {"windows-1252", "GB18030"},
+ {"windows-1252", "Shift_JIS"},
+ {"windows-1252", "EUC-JP"},
+ {"windows-1252", "Big5-HKSCS"},
+ {"ISO-8859-1", "GB18030"},
+ {"ISO-8859-1", "Shift_JIS"},
+ };
+
+ /**
+ * Same pairs as the LATIN→CJK block of {@link #BYTE_LEVEL_MOJIBAKE_PAIRS},
+ * isolated for the sampling boost in {@link #trainClassifierV7}: when
+ * training the LATIN classifier, half of the case-2 (byte-level-mojibake)
+ * picks come from this subset rather than from the full pair list.
+ * Without the boost, LATIN→CJK pairs were estimated at ~6/54 = 11% of case-2
+ * (~1.4% of all LATIN negatives) — too rare to lift w9 (script-alternation
+ * ratio) above the L2 floor. Boosting to 50% of case-2 (~6% of all negatives)
+ * gives the LR enough z9 signal to fit a meaningful weight. NOTE(review): the
+ * full list has 36 entries, so 6/54 looks stale (6/36 ≈ 17%) — confirm.
+ */
+ static final String[][] LATIN_TO_CJK_PAIRS = {
+ {"windows-1252", "GB18030"},
+ {"windows-1252", "Shift_JIS"},
+ {"windows-1252", "EUC-JP"},
+ {"windows-1252", "Big5-HKSCS"},
+ {"ISO-8859-1", "GB18030"},
+ {"ISO-8859-1", "Shift_JIS"},
};
/**
@@ -299,6 +415,10 @@ public static void main(String[] args) throws IOException {
t0 = System.currentTimeMillis();
System.out.print(" Training named-block table... ");
float[] blockTable = trainBlockTable(trainFile);
+ // Round-trip through int8 quantization so the calibration sees
+ // the same precision the inference path will see (Phase F:
+ // eliminates train/infer drift on F2 dequantized lookups).
+ blockTable = quantizeDequantizeRoundTrip(blockTable);
System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0);
t0 = System.currentTimeMillis();
@@ -344,6 +464,9 @@ public static void main(String[] args) throws IOException {
t0 = System.currentTimeMillis();
System.out.print(" Training script-transition table... ");
float[] scriptTransTable = trainScriptTransitionTable(allTrainFiles, scriptBucketMap, numScriptBuckets);
+ // Round-trip through int8 quantization so calibration sees the
+ // values inference will see (Phase F: F4 is also stored quantized).
+ scriptTransTable = quantizeDequantizeRoundTrip(scriptTransTable);
System.out.printf("done (%dms)%n", System.currentTimeMillis() - t0);
t0 = System.currentTimeMillis();
@@ -353,24 +476,10 @@ public static void main(String[] args) throws IOException {
System.out.printf("done — mu=%.4f sigma=%.4f (%dms)%n",
scriptTransCal[0], scriptTransCal[1], System.currentTimeMillis() - t0);
- t0 = System.currentTimeMillis();
- System.out.print(" Collecting per-script codepoint pools... ");
- Map> scriptCodepoints = collectScriptCodepoints(allTrainFiles, 200);
- System.out.printf("done — %d scripts (%dms)%n",
- scriptCodepoints.size(), System.currentTimeMillis() - t0);
-
- System.out.print(" Building wrong-codec remap tables... ");
- List