Skip to content

Commit 5539507

Browse files
authored
Merge pull request #7 from tk1024/feat/boundary-heuristic
feat: 文字種境界スコアリング + OOV姓ペナルティ + カタカナ姓例外フロー
2 parents 451c446 + a020453 commit 5539507

4 files changed

Lines changed: 358 additions & 10 deletions

File tree

src/core/normalize.ts

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,75 @@ const VARIANT_MAP: Record<string, string> = {
4040
"條": "条", "圓": "円",
4141
};
4242

43+
type ScriptType = "kanji" | "hiragana" | "katakana" | "other";

/**
 * Classify a single character by the Japanese script it belongs to.
 *
 * - hiragana: ぁ–ゖ (U+3041–U+3096) plus the iteration marks ゝ ゞ (U+309D, U+309E),
 *   which occur in real names such as みすゞ.
 * - katakana: ァ–ヺ (U+30A1–U+30FA), the prolonged sound mark ー (U+30FC) and the
 *   iteration marks ヽ ヾ (U+30FD, U+30FE). The middle dot ・ (U+30FB) is
 *   deliberately excluded because it is a separator, not a letter.
 * - kanji: anything in Unicode Script=Han.
 *
 * @param ch - a single character (one code point)
 * @returns the script class, or "other" for anything that is not kana or kanji
 */
function scriptOf(ch: string): ScriptType {
  if (/[\u3041-\u3096\u309D\u309E]/.test(ch)) return "hiragana";
  if (/[\u30A1-\u30FA\u30FC-\u30FE]/.test(ch)) return "katakana";
  if (/[\p{Script=Han}]/u.test(ch)) return "kanji";
  return "other";
}
51+
52+
/**
 * Location of the one script transition inside a full name,
 * as reported by findSingleScriptBoundary.
 */
export interface ScriptBoundary {
  /** Code-point index of the first character after the transition. */
  index: number;
  /** Whether the name goes from kanji into kana or from kana into kanji. */
  direction: "kanji-to-kana" | "kana-to-kanji";
}

/**
 * Find the split position where a single script boundary occurs
 * between kanji and kana (in either direction).
 *
 * Every change of script between adjacent characters counts as a transition,
 * including hiragana ↔ katakana. The result is undefined when:
 * - there is no transition at all,
 * - there is more than one transition,
 * - any transition touches a character that is neither kana nor kanji, or
 * - the only transition is between hiragana and katakana.
 *
 * Examples:
 *   "夏色まつり" → { index: 2, direction: "kanji-to-kana" }
 *   "白銀ノエル" → { index: 2, direction: "kanji-to-kana" }
 *   "デーモン閣下" → { index: 4, direction: "kana-to-kanji" }
 *   "もこ田めめめ" → undefined (2 transitions)
 *   "田中太郎" → undefined (all kanji, no transition)
 *
 * @param fullName - the name to inspect; indices are counted in code points
 * @returns the character index and direction, or undefined if no unique boundary exists
 */
export function findSingleScriptBoundary(fullName: string): ScriptBoundary | undefined {
  const chars = [...fullName];
  let transition: { index: number; from: ScriptType; to: ScriptType } | undefined;

  for (let i = 1; i < chars.length; i++) {
    const prev = scriptOf(chars[i - 1]);
    const next = scriptOf(chars[i]);
    if (prev === next) continue;
    // A transition into or out of a non-Japanese character disqualifies the name.
    if (prev === "other" || next === "other") return undefined;
    // A second transition means the boundary is not unique.
    if (transition !== undefined) return undefined;
    transition = { index: i, from: prev, to: next };
  }

  if (transition === undefined) return undefined;

  const isKana = (s: ScriptType): boolean => s === "hiragana" || s === "katakana";

  if (transition.from === "kanji" && isKana(transition.to)) {
    return { index: transition.index, direction: "kanji-to-kana" };
  }
  if (isKana(transition.from) && transition.to === "kanji") {
    return { index: transition.index, direction: "kana-to-kanji" };
  }

  // Remaining case: a lone hiragana ↔ katakana transition.
  return undefined;
}
104+
105+
/**
 * Legacy wrapper that reports only kanji→kana boundaries.
 *
 * @deprecated Use findSingleScriptBoundary instead
 * @param fullName - the name to inspect
 * @returns the boundary index when the unique boundary runs from kanji to kana,
 *          otherwise undefined
 */
export function findSingleKanjiToKanaBoundary(fullName: string): number | undefined {
  const boundary = findSingleScriptBoundary(fullName);
  if (boundary === undefined || boundary.direction !== "kanji-to-kana") {
    return undefined;
  }
  return boundary.index;
}
111+
43112
/**
44113
* Fold variant kanji to their canonical forms.
45114
*/

src/core/scorer.ts

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,21 @@ const MEI_LENGTH_SCORE: Record<number, number> = {
3030
const PAIR_BONUS = 0.8;
3131
const BOTH_SINGLE_CHAR_PENALTY = -1.0;
3232

33+
// Extra score added by calcScore when the surname candidate is a "surface"
// or "folded" dictionary hit.
// Surname hit is stronger evidence than given name hit,
34+
// because surnames are a finite known set while given names are creative
35+
const SEI_HIT_BONUS = 0.5;
36+
37+
// Script boundary scoring. Applied by calcScore only when the full name has a
// unique script boundary (boundaryIndex is defined):
//   BOUNDARY_MATCH_BONUS           - the candidate splits exactly at the boundary
//   BOUNDARY_MATCH_WITH_DICT_BONUS - added on top when that candidate's surname
//                                    is also a "surface" or "folded" hit
//   BOUNDARY_BEFORE_PENALTY        - the candidate splits before the boundary
//   BOUNDARY_AFTER_PENALTY         - the candidate splits after the boundary
38+
const BOUNDARY_MATCH_BONUS = 1.2;
39+
const BOUNDARY_MATCH_WITH_DICT_BONUS = 0.8;
40+
const BOUNDARY_BEFORE_PENALTY = -3.0;
41+
const BOUNDARY_AFTER_PENALTY = -1.8;
42+
43+
// Sei mixed-script penalty: an out-of-vocabulary (OOV) surname made of kanji
// followed by kana is unnatural. The value depends on the trailing kana run:
// exactly one hiragana, exactly one katakana, or two or more kana.
44+
const SEI_MIXED_SINGLE_HIRA_PENALTY = -2.5;
45+
const SEI_MIXED_SINGLE_KATA_PENALTY = -3.0;
46+
const SEI_MIXED_MULTI_KANA_PENALTY = -1.5;
47+
3348
// Cache for Set-based lookups built from string[]
3449
const setCache = new WeakMap<PackedLexicon, { sei: Set<string>; mei: Set<string> }>();
3550

@@ -45,6 +60,61 @@ function getSets(lexicon: PackedLexicon): { sei: Set<string>; mei: Set<string> }
4560
return cached;
4661
}
4762

63+
// Single-character script classifiers.
// Hiragana covers ぁ–ゖ plus the iteration marks ゝ ゞ (U+309D, U+309E).
// Katakana covers ァ–ヺ, the prolonged sound mark ー (U+30FC) and the iteration
// marks ヽ ヾ (U+30FD, U+30FE); the middle dot ・ (U+30FB) is intentionally left out.
const RE_KANJI = /[\p{Script=Han}]/u;
const RE_HIRAGANA = /[\u3041-\u3096\u309D\u309E]/;
const RE_KATAKANA = /[\u30A1-\u30FA\u30FC-\u30FE]/;

/**
 * Map one character to a one-letter script code:
 * "K" = kanji, "H" = hiragana, "T" = katakana, "O" = anything else.
 *
 * @param ch - a single character (one code point)
 */
function scriptOf(ch: string): "K" | "H" | "T" | "O" {
  if (RE_KANJI.test(ch)) return "K";
  if (RE_HIRAGANA.test(ch)) return "H";
  if (RE_KATAKANA.test(ch)) return "T";
  return "O";
}
73+
74+
/**
 * Encode a string as its per-character script codes, one letter per code point
 * (see scriptOf), e.g. a kanji-kanji-katakana string becomes "KKT".
 */
function scriptPattern(s: string): string {
  let pattern = "";
  for (const ch of s) {
    pattern += scriptOf(ch);
  }
  return pattern;
}
77+
78+
/**
 * Penalty for OOV surnames that contain kana (e.g. 宝鐘マ, 星街すい).
 * Real Japanese surnames are almost always pure kanji.
 * Only applied when the surname has no dictionary hit, and only when the
 * surname is one or more kanji followed by one or more kana.
 *
 * @param sei - the surname candidate
 * @param seiMatch - dictionary match type for `sei`; anything but "none" yields 0
 * @returns 0 when no penalty applies, otherwise a negative score adjustment
 *          chosen by the trailing kana run (single hiragana, single katakana,
 *          or two or more kana)
 */
function seiMixedScriptPenalty(sei: string, seiMatch: MatchType): number {
  if (seiMatch !== "none") return 0;

  // One match both validates the kanji+kana shape and captures the kana tail.
  const kanaTail = /^K+([HT]+)$/.exec(scriptPattern(sei))?.[1];
  if (kanaTail === undefined) return 0;

  if (kanaTail.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;
  return kanaTail === "T"
    ? SEI_MIXED_SINGLE_KATA_PENALTY
    : SEI_MIXED_SINGLE_HIRA_PENALTY;
}
97+
98+
/**
 * Penalty for OOV given names that start with kana followed by kanji (e.g. モン閣下, イク眞木).
 * When a kana→kanji boundary exists, the mei side should be pure kanji.
 * Only applied when the given name has no dictionary hit, and only when the
 * given name is one or more kana followed by one or more kanji.
 *
 * The penalty values are shared with the surname-side check (SEI_MIXED_*).
 *
 * NOTE(review): the boundary-scoring section of calcScore applies only the
 * surname-side penalty and states that the mei side is not penalized —
 * confirm whether this helper is still needed.
 *
 * @param mei - the given-name candidate
 * @param meiMatch - dictionary match type for `mei`; anything but "none" yields 0
 * @returns 0 when no penalty applies, otherwise a negative score adjustment
 *          chosen by the leading kana run (single hiragana, single katakana,
 *          or two or more kana)
 */
function meiMixedScriptPenalty(mei: string, meiMatch: MatchType): number {
  if (meiMatch !== "none") return 0;

  // One match both validates the kana+kanji shape and captures the kana head.
  const kanaHead = /^([HT]+)K+$/.exec(scriptPattern(mei))?.[1];
  if (kanaHead === undefined) return 0;

  if (kanaHead.length > 1) return SEI_MIXED_MULTI_KANA_PENALTY;
  return kanaHead === "T"
    ? SEI_MIXED_SINGLE_KATA_PENALTY
    : SEI_MIXED_SINGLE_HIRA_PENALTY;
}
117+
48118
/**
49119
* Look up a candidate string in the lexicon.
50120
* Returns the match type: surface > folded > reading > none.
@@ -87,19 +157,33 @@ export function lookupMatch(
87157

88158
/**
89159
* Calculate the score for a split candidate.
160+
*
161+
* @param sei - the surname candidate string
162+
* @param splitIndex - the character index where this candidate splits (i.e. sei length)
163+
* @param boundaryIndex - the unique kanji↔kana script boundary position (either direction), or undefined if none
90164
*/
91165
export function calcScore(
166+
sei: string,
167+
mei: string,
92168
seiMatch: MatchType,
93169
meiMatch: MatchType,
94170
seiLen: number,
95171
meiLen: number,
172+
splitIndex: number,
173+
boundaryIndex: number | undefined,
96174
): number {
97175
let score = 0;
98176

99177
// Match scores (primary signal)
100178
score += MATCH_SCORE[seiMatch];
101179
score += MATCH_SCORE[meiMatch];
102180

181+
// Surname hit bonus: surnames are a known finite set,
182+
// so a dictionary hit on sei is stronger evidence than on mei
183+
if (seiMatch === "surface" || seiMatch === "folded") {
184+
score += SEI_HIT_BONUS;
185+
}
186+
103187
// Length scores (secondary signal)
104188
score += SEI_LENGTH_SCORE[Math.min(seiLen, 6)] ?? -0.5;
105189
score += MEI_LENGTH_SCORE[Math.min(meiLen, 6)] ?? -0.5;
@@ -124,5 +208,23 @@ export function calcScore(
124208
score = -Infinity;
125209
}
126210

211+
// Script boundary scoring
212+
if (boundaryIndex !== undefined) {
213+
if (splitIndex === boundaryIndex) {
214+
score += BOUNDARY_MATCH_BONUS;
215+
if (seiMatch === "surface" || seiMatch === "folded") {
216+
score += BOUNDARY_MATCH_WITH_DICT_BONUS;
217+
}
218+
} else if (splitIndex < boundaryIndex) {
219+
score += BOUNDARY_BEFORE_PENALTY;
220+
} else {
221+
score += BOUNDARY_AFTER_PENALTY;
222+
}
223+
}
224+
225+
// OOV surname mixed-script penalty (mei side is not penalized —
226+
// names like よね子, ルミ子, 美つ子 naturally mix scripts)
227+
score += seiMixedScriptPenalty(sei, seiMatch);
228+
127229
return score;
128230
}

src/core/splitter.ts

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,17 @@ import type {
66
SeimeiResult,
77
SplitOptions,
88
} from "./types.js";
9-
import { isAllHiragana, isAllKatakana, isNonJapanese } from "./normalize.js";
9+
import { isAllHiragana, isAllKatakana, isNonJapanese, findSingleScriptBoundary } from "./normalize.js";
1010
import { calcScore, lookupMatch } from "./scorer.js";
1111

1212
const CONFIDENCE_THRESHOLD = 6.0;
1313
const CONFIDENCE_GAP = 1.0;
1414

15+
// Boundary confidence: when the best candidate aligns with a script boundary
16+
// and has dictionary evidence, grant confidence 0.8
// (BOUNDARY_CONFIDENCE is the confidence value returned on that path).
17+
const BOUNDARY_CONFIDENCE = 0.8;
18+
// Minimum score gap required on the boundary path. It is smaller than
// CONFIDENCE_GAP (1.0); the best score must still reach CONFIDENCE_THRESHOLD.
const BOUNDARY_CONFIDENCE_GAP = 0.5;
19+
1520
let defaultLexicon: PackedLexicon | undefined;
1621
let defaultReading: ReadingData | undefined;
1722

@@ -73,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
7378

7479
const lexicon = options?.lexicon ?? defaultLexicon;
7580
if (!lexicon) {
76-
// No lexicon loaded: return unsplit
7781
return {
7882
best: { sei: trimmed, mei: "" },
7983
confidence: 0,
@@ -94,6 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
9498

9599
const isKana = isAllHiragana(trimmed) || isAllKatakana(trimmed);
96100
const maxSplit = Math.min(lexicon.maxSeiLen, n - 1);
101+
const boundary = findSingleScriptBoundary(trimmed);
102+
const boundaryIndex = boundary?.index;
97103
const candidates: SeimeiCandidate[] = [];
98104

99105
for (let i = 1; i <= maxSplit; i++) {
@@ -107,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
107113
const readingData = options?.readingData ?? defaultReading;
108114
const seiMatch = lookupMatch(sei, "sei", lexicon, isKana, readingData);
109115
const meiMatch = lookupMatch(mei, "mei", lexicon, isKana, readingData);
110-
const score = calcScore(seiMatch, meiMatch, seiLen, meiLen);
116+
const score = calcScore(sei, mei, seiMatch, meiMatch, seiLen, meiLen, i, boundaryIndex);
111117

112118
candidates.push({ sei, mei, score, seiMatch, meiMatch });
113119
}
@@ -129,15 +135,67 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
129135
const confident =
130136
best.score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP;
131137

132-
if (confident || options?.allowLowConfidence) {
138+
// 1. Normal confidence: dictionary-based high score
139+
if (confident) {
140+
return {
141+
best: { sei: best.sei, mei: best.mei },
142+
confidence: 1.0,
143+
candidates,
144+
};
145+
}
146+
147+
// 2. Boundary confidence: best candidate aligns with script boundary
148+
// and has dictionary evidence on the appropriate side
149+
if (boundaryIndex !== undefined && [...best.sei].length === boundaryIndex) {
150+
const hasDictEvidence = boundary?.direction === "kanji-to-kana"
151+
? (best.seiMatch === "surface" || best.seiMatch === "folded")
152+
: (best.meiMatch === "surface" || best.meiMatch === "folded");
153+
154+
if (
155+
hasDictEvidence &&
156+
best.score >= CONFIDENCE_THRESHOLD &&
157+
gap >= BOUNDARY_CONFIDENCE_GAP
158+
) {
159+
return {
160+
best: { sei: best.sei, mei: best.mei },
161+
confidence: BOUNDARY_CONFIDENCE,
162+
candidates,
163+
};
164+
}
165+
}
166+
167+
// 3. Katakana-sei exception: when the best candidate's sei is all katakana
168+
// (e.g. ジャガー/横田), real Japanese surnames are never pure katakana.
169+
// Re-score the boundary candidate by looking up mei in sei dict instead.
170+
if (boundary?.direction === "kana-to-kanji" && boundaryIndex !== undefined) {
171+
const boundaryCandidate = candidates.find(
172+
(c) => [...c.sei].length === boundaryIndex && isAllKatakana(c.sei)
173+
);
174+
if (boundaryCandidate) {
175+
const meiAsSei = lookupMatch(
176+
boundaryCandidate.mei, "sei", lexicon, false,
177+
options?.readingData ?? defaultReading,
178+
);
179+
if (meiAsSei === "surface" || meiAsSei === "folded") {
180+
return {
181+
best: { sei: boundaryCandidate.sei, mei: boundaryCandidate.mei },
182+
confidence: BOUNDARY_CONFIDENCE,
183+
candidates,
184+
};
185+
}
186+
}
187+
}
188+
189+
// 4. Low confidence mode
190+
if (options?.allowLowConfidence) {
133191
return {
134192
best: { sei: best.sei, mei: best.mei },
135-
confidence: confident ? 1.0 : best.score / CONFIDENCE_THRESHOLD,
193+
confidence: best.score / CONFIDENCE_THRESHOLD,
136194
candidates,
137195
};
138196
}
139197

140-
// Not confident enough: return unsplit
198+
// 5. Not confident enough: return unsplit
141199
return {
142200
best: { sei: trimmed, mei: "" },
143201
confidence: 0,

0 commit comments

Comments
 (0)