Merge pull request #7 from tk1024/feat/boundary-heuristic

tk1024 · web-flow · commit 553950777b6d · 2026-03-22T22:27:26.000+09:00
feat: 文字種境界スコアリング + OOV姓ペナルティ + カタカナ姓例外フロー
diff --git a/src/core/normalize.ts b/src/core/normalize.ts
@@ -40,6 +40,75 @@ const VARIANT_MAP: Record<string, string> = {
   "條": "条", "圓": "円",
 };
 
+type ScriptType = "kanji" | "hiragana" | "katakana" | "other"; // coarse script class of one character
+
+function scriptOf(ch: string): ScriptType { // classify a single character by script
+  if (/[\p{Script=Han}々〆ヶ]/u.test(ch)) return "kanji"; // tested first: ヶ (U+30F6) also lies inside the katakana range below
+  if (/[\u3041-\u3096]/.test(ch)) return "hiragana";
+  if (/[\u30A1-\u30F6\u30FC]/.test(ch)) return "katakana"; // \u30FC is the long-vowel mark ー
+  return "other";
+}
+
+export interface ScriptBoundary { // result type of findSingleScriptBoundary
+  index: number; // code-point index of the first character after the boundary
+  direction: "kanji-to-kana" | "kana-to-kanji";
+}
+
+/**
+ * Find the split position of the single kanji/kana script boundary (in either direction).
+ * Returns its character index and direction, or undefined if no unique boundary exists.
+ * A hiragana↔katakana switch counts as a transition; a switch to or from any other script aborts.
+ *
+ * Examples:
+ *   "夏色まつり" → { index: 2, direction: "kanji-to-kana" }
+ *   "白銀ノエル" → { index: 2, direction: "kanji-to-kana" }
+ *   "デーモン閣下" → { index: 4, direction: "kana-to-kanji" }
+ *   "もこ田めめめ" → undefined (2 transitions)
+ *   "田中太郎" → undefined (all kanji, no transition)
+ */
+export function findSingleScriptBoundary(fullName: string): ScriptBoundary | undefined {
+  const chars = [...fullName]; // split by code point, not by UTF-16 unit
+  let transitionCount = 0;
+  let splitIndex: number | undefined;
+  let fromScript: ScriptType | undefined;
+  let toScript: ScriptType | undefined;
+
+  for (let i = 1; i < chars.length; i++) {
+    const prev = scriptOf(chars[i - 1]);
+    const next = scriptOf(chars[i]);
+    if (prev === next) continue;
+    if (prev === "other" || next === "other") return undefined; // a non-kanji/kana neighbour disqualifies the name
+    transitionCount++;
+    if (transitionCount > 1) return undefined; // a second transition means the boundary is not unique
+    splitIndex = i;
+    fromScript = prev;
+    toScript = next;
+  }
+
+  if (transitionCount !== 1 || splitIndex === undefined) return undefined; // no transition at all
+
+  const fromIsKanji = fromScript === "kanji";
+  const toIsKanji = toScript === "kanji";
+  const fromIsKana = fromScript === "hiragana" || fromScript === "katakana";
+  const toIsKana = toScript === "hiragana" || toScript === "katakana";
+
+  if (fromIsKanji && toIsKana) {
+    return { index: splitIndex, direction: "kanji-to-kana" };
+  }
+  if (fromIsKana && toIsKanji) {
+    return { index: splitIndex, direction: "kana-to-kanji" };
+  }
+
+  return undefined; // the only transition was hiragana↔katakana
+}
+
+/** @deprecated Use findSingleScriptBoundary instead; this wrapper reports kanji→kana boundaries only. */
+export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined {
+  // Delegate to the two-way finder, then hide kana→kanji results to keep the legacy contract.
+  const boundary = findSingleScriptBoundary(fullName);
+  return boundary?.direction === "kanji-to-kana" ? boundary.index : undefined;
+}
+
 /**
  * Fold variant kanji to their canonical forms.
  */
diff --git a/src/core/scorer.ts b/src/core/scorer.ts
@@ -30,6 +30,21 @@ const MEI_LENGTH_SCORE: Record<number, number> = {
 const PAIR_BONUS = 0.8;
 const BOTH_SINGLE_CHAR_PENALTY = -1.0;
 
+// Surname hit is stronger evidence than given name hit,
+// because surnames are a finite known set while given names are creative
+const SEI_HIT_BONUS = 0.5; // added when seiMatch is "surface" or "folded"
+
+// Script boundary scoring (only when the name has a unique script boundary)
+const BOUNDARY_MATCH_BONUS = 1.2; // split falls exactly on the boundary
+const BOUNDARY_MATCH_WITH_DICT_BONUS = 0.8; // extra when that split's sei is also a surface/folded hit
+const BOUNDARY_BEFORE_PENALTY = -3.0; // split falls before the boundary
+const BOUNDARY_AFTER_PENALTY = -1.8; // split falls after the boundary
+
+// Sei mixed-script penalty: OOV surname containing kana is unnatural
+const SEI_MIXED_SINGLE_HIRA_PENALTY = -2.5; // kanji run + exactly one hiragana
+const SEI_MIXED_SINGLE_KATA_PENALTY = -3.0; // kanji run + exactly one katakana
+const SEI_MIXED_MULTI_KANA_PENALTY = -1.5; // kanji run + two or more kana
+
 // Cache for Set-based lookups built from string[]
 const setCache = new WeakMap<PackedLexicon, { sei: Set<string>; mei: Set<string> }>();
 
@@ -45,6 +60,61 @@ function getSets(lexicon: PackedLexicon): { sei: Set<string>; mei: Set<string> }
   return cached;
 }
 
+const RE_KANJI = /[\p{Script=Han}々〆ヶ]/u; // Han characters plus 々, 〆 and ヶ, which are treated as kanji here
+const RE_HIRAGANA = /[\u3041-\u3096]/;
+const RE_KATAKANA = /[\u30A1-\u30F6\u30FC]/; // range also contains ヶ (U+30F6); \u30FC is the long-vowel mark ー
+
+function scriptOf(ch: string): "K" | "H" | "T" | "O" { // K = kanji, H = hiragana, T = katakana, O = other
+  return RE_KANJI.test(ch) ? "K" // kanji is tested first, so ヶ (also in the katakana range) maps to "K"
+    : RE_HIRAGANA.test(ch) ? "H"
+    : RE_KATAKANA.test(ch) ? "T"
+    : "O";
+}
+
+function scriptPattern(s: string): string { // one script tag per code point, e.g. kanji-kanji-hiragana gives "KKH"
+  return Array.from(s, scriptOf).join("");
+}
+
+/**
+ * Score adjustment for an out-of-dictionary surname candidate shaped like
+ * "kanji run + trailing kana run" (e.g. 宝鐘マ, 星街すい).
+ * Returns 0 when the surname has a dictionary hit or any other shape;
+ * otherwise a negative constant selected by the trailing kana run.
+ */
+function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number {
+  if (seiMatch !== "none") return 0;
+
+  // Capture the trailing kana run; no match means the shape is not K+[HT]+.
+  const kanaTail = /^K+([HT]+)$/.exec(scriptPattern(sei))?.[1];
+  if (kanaTail === undefined) return 0;
+
+  // Two or more trailing kana share one penalty, whatever the kana type.
+  if (kanaTail.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;
+
+  // Exactly one trailing kana: katakana ("T") and hiragana have separate penalties.
+  return kanaTail === "T" ? SEI_MIXED_SINGLE_KATA_PENALTY : SEI_MIXED_SINGLE_HIRA_PENALTY;
+}
+
+/**
+ * Penalty for OOV given names that start with kana followed by kanji (e.g. モン閣下, イク眞木).
+ * When a kana→kanji boundary exists, the mei side should be pure kanji; returns 0 if mei has a dictionary hit or another shape.
+ * NOTE(review): no call site is visible in this diff and calcScore's comment says mei is not penalized — confirm whether this is dead code.
+ */
+function meiMixedScriptPenalty(mei: string, meiMatch: MatchType): number {
+  if (meiMatch !== "none") return 0;
+
+  const p = scriptPattern(mei);
+  if (!/^[HT]+K+$/.test(p)) return 0; // only "kana run + kanji run" shapes are penalized
+
+  const prefix = p.match(/^[HT]+/)![0]; // non-null is safe: the test above guarantees a leading kana run
+  if (prefix.length === 1) {
+    return prefix[0] === "T"
+      ? SEI_MIXED_SINGLE_KATA_PENALTY // reuses the surname-side constants
+      : SEI_MIXED_SINGLE_HIRA_PENALTY;
+  }
+  return SEI_MIXED_MULTI_KANA_PENALTY;
+}
+
 /**
  * Look up a candidate string in the lexicon.
  * Returns the match type: surface > folded > reading > none.
@@ -87,19 +157,33 @@ export function lookupMatch(
 
 /**
  * Calculate the score for a split candidate.
+ *
+ * @param sei - the surname candidate string
+ * @param splitIndex - the character index where this candidate splits (i.e. sei length)
+ * @param boundaryIndex - the unique kanji↔kana script boundary position (either direction), or undefined if none
  */
 export function calcScore(
+  sei: string,
+  mei: string,
   seiMatch: MatchType,
   meiMatch: MatchType,
   seiLen: number,
   meiLen: number,
+  splitIndex: number,
+  boundaryIndex: number | undefined,
 ): number {
   let score = 0;
 
   // Match scores (primary signal)
   score += MATCH_SCORE[seiMatch];
   score += MATCH_SCORE[meiMatch];
 
+  // Surname hit bonus: surnames are a known finite set,
+  // so a dictionary hit on sei is stronger evidence than on mei
+  if (seiMatch === "surface" || seiMatch === "folded") {
+    score += SEI_HIT_BONUS;
+  }
+
   // Length scores (secondary signal)
   score += SEI_LENGTH_SCORE[Math.min(seiLen, 6)] ?? -0.5;
   score += MEI_LENGTH_SCORE[Math.min(meiLen, 6)] ?? -0.5;
@@ -124,5 +208,23 @@ export function calcScore(
     score = -Infinity;
   }
 
+  // Script boundary scoring
+  if (boundaryIndex !== undefined) {
+    if (splitIndex === boundaryIndex) {
+      score += BOUNDARY_MATCH_BONUS;
+      if (seiMatch === "surface" || seiMatch === "folded") {
+        score += BOUNDARY_MATCH_WITH_DICT_BONUS;
+      }
+    } else if (splitIndex < boundaryIndex) {
+      score += BOUNDARY_BEFORE_PENALTY;
+    } else {
+      score += BOUNDARY_AFTER_PENALTY;
+    }
+  }
+
+  // OOV surname mixed-script penalty (mei side is not penalized —
+  // names like よね子, ルミ子, 美つ子 naturally mix scripts)
+  score += seiMixedScriptPenalty(sei, seiMatch);
+
   return score;
 }
diff --git a/src/core/splitter.ts b/src/core/splitter.ts
@@ -6,12 +6,17 @@ import type {
   SeimeiResult,
   SplitOptions,
 } from "./types.js";
-import { isAllHiragana, isAllKatakana, isNonJapanese } from "./normalize.js";
+import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleScriptBoundary } from "./normalize.js";
 import { calcScore, lookupMatch } from "./scorer.js";
 
 const CONFIDENCE_THRESHOLD = 6.0;
 const CONFIDENCE_GAP = 1.0;
 
+// Boundary confidence: when the best candidate aligns with a script boundary
+// and has dictionary evidence, grant confidence 0.8
+const BOUNDARY_CONFIDENCE = 0.8;
+const BOUNDARY_CONFIDENCE_GAP = 0.5;
+
 let defaultLexicon: PackedLexicon | undefined;
 let defaultReading: ReadingData | undefined;
 
@@ -73,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
 
   const lexicon = options?.lexicon ?? defaultLexicon;
   if (!lexicon) {
-    // No lexicon loaded: return unsplit
     return {
       best: { sei: trimmed, mei: "" },
       confidence: 0,
@@ -94,6 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
 
   const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed);
   const maxSplit = Math.min(lexicon.maxSeiLen, n - 1);
+  const boundary = findSingleScriptBoundary(trimmed);
+  const boundaryIndex = boundary?.index;
   const candidates: SeimeiCandidate[] = [];
 
   for (let i = 1; i <= maxSplit; i++) {
@@ -107,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
     const readingData = options?.readingData ?? defaultReading;
     const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData);
     const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData);
-    const score = calcScore(seiMatch, meiMatch, seiLen, meiLen);
+    const score = calcScore(sei, mei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex);
 
     candidates.push({ sei, mei, score, seiMatch, meiMatch });
   }
@@ -129,15 +135,67 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
   const confident =
     best.score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP;
 
-  if (confident || options?.allowLowConfidence) {
+  // 1. Normal confidence: dictionary-based high score
+  if (confident) {
+    return {
+      best: { sei: best.sei, mei: best.mei },
+      confidence: 1.0,
+      candidates,
+    };
+  }
+
+  // 2. Boundary confidence: best candidate aligns with script boundary
+  //    and has dictionary evidence on the appropriate side
+  if (boundaryIndex !== undefined && [...best.sei].length === boundaryIndex) {
+    const hasDictEvidence = boundary?.direction === "kanji-to-kana"
+      ? (best.seiMatch === "surface" || best.seiMatch === "folded")
+      : (best.meiMatch === "surface" || best.meiMatch === "folded");
+
+    if (
+      hasDictEvidence &&
+      best.score >= CONFIDENCE_THRESHOLD &&
+      gap >= BOUNDARY_CONFIDENCE_GAP
+    ) {
+      return {
+        best: { sei: best.sei, mei: best.mei },
+        confidence: BOUNDARY_CONFIDENCE,
+        candidates,
+      };
+    }
+  }
+
+  // 3. Katakana-sei exception: real Japanese surnames are never pure katakana, so for a
+  //    kana→kanji name (e.g. ジャガー/横田) take the boundary candidate whose sei is all katakana —
+  //    regardless of its score rank — if its mei has a surface/folded hit in the sei dict.
+  if (boundary?.direction === "kana-to-kanji" && boundaryIndex !== undefined) {
+    const boundaryCandidate = candidates.find(
+      (c) => [...c.sei].length === boundaryIndex && isAllKatakana(c.sei)
+    );
+    if (boundaryCandidate) {
+      const meiAsSei = lookupMatch(
+        boundaryCandidate.mei, "sei", lexicon, false,
+        options?.readingData ?? defaultReading,
+      );
+      if (meiAsSei === "surface" || meiAsSei === "folded") {
+        return {
+          best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei },
+          confidence: BOUNDARY_CONFIDENCE,
+          candidates,
+        };
+      }
+    }
+  }
+
+  // 4. Low confidence mode
+  if (options?.allowLowConfidence) {
     return {
       best: { sei: best.sei, mei: best.mei },
-      confidence: confident ? 1.0 : best.score / CONFIDENCE_THRESHOLD,
+      confidence: best.score / CONFIDENCE_THRESHOLD,
       candidates,
     };
   }
 
-  // Not confident enough: return unsplit
+  // 5. Not confident enough: return unsplit
   return {
     best: { sei: trimmed, mei: "" },
     confidence: 0,
diff --git a/test/unit/split.test.ts b/test/unit/split.test.ts