@@ -6,12 +6,17 @@ import type {
66 SeimeiResult ,
77 SplitOptions ,
88} from "./types.js" ;
9- import { isAllHiragana , isAllKatakana , isNonJapanese } from "./normalize.js" ;
9+ import { isAllHiragana , isAllKatakana , isNonJapanese , findSingleScriptBoundary } from "./normalize.js" ;
1010import { calcScore , lookupMatch } from "./scorer.js" ;
1111
1212const CONFIDENCE_THRESHOLD = 6.0 ;
1313const CONFIDENCE_GAP = 1.0 ;
1414
15+ // Boundary confidence: when the best candidate aligns with a script boundary and has
16+ // dictionary evidence, grant confidence 0.8; the required score gap is relaxed to 0.5.
17+ const BOUNDARY_CONFIDENCE = 0.8 ;
18+ const BOUNDARY_CONFIDENCE_GAP = 0.5 ;
19+
1520let defaultLexicon : PackedLexicon | undefined ;
1621let defaultReading : ReadingData | undefined ;
1722
@@ -73,7 +78,6 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
7378
7479 const lexicon = options ?. lexicon ?? defaultLexicon ;
7580 if ( ! lexicon ) {
76- // No lexicon loaded: return unsplit
7781 return {
7882 best : { sei : trimmed , mei : "" } ,
7983 confidence : 0 ,
@@ -94,6 +98,8 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
9498
9599 const isKana = isAllHiragana ( trimmed ) || isAllKatakana ( trimmed ) ;
96100 const maxSplit = Math . min ( lexicon . maxSeiLen , n - 1 ) ;
101+ const boundary = findSingleScriptBoundary ( trimmed ) ;
102+ const boundaryIndex = boundary ?. index ;
97103 const candidates : SeimeiCandidate [ ] = [ ] ;
98104
99105 for ( let i = 1 ; i <= maxSplit ; i ++ ) {
@@ -107,7 +113,7 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
107113 const readingData = options ?. readingData ?? defaultReading ;
108114 const seiMatch = lookupMatch ( sei , "sei" , lexicon , isKana , readingData ) ;
109115 const meiMatch = lookupMatch ( mei , "mei" , lexicon , isKana , readingData ) ;
110- const score = calcScore ( seiMatch , meiMatch , seiLen , meiLen ) ;
116+ const score = calcScore ( sei , mei , seiMatch , meiMatch , seiLen , meiLen , i , boundaryIndex ) ;
111117
112118 candidates . push ( { sei, mei, score, seiMatch, meiMatch } ) ;
113119 }
@@ -129,15 +135,67 @@ export function analyze(fullName: string, options?: SplitOptions): AnalyzeResult
129135 const confident =
130136 best . score >= CONFIDENCE_THRESHOLD && gap >= CONFIDENCE_GAP ;
131137
132- if ( confident || options ?. allowLowConfidence ) {
138+ // 1. Normal confidence: dictionary-based high score
139+ if ( confident ) {
140+ return {
141+ best : { sei : best . sei , mei : best . mei } ,
142+ confidence : 1.0 ,
143+ candidates,
144+ } ;
145+ }
146+
147+ // 2. Boundary confidence: best candidate aligns with script boundary
148+ // and has dictionary evidence on the appropriate side
149+ if ( boundaryIndex !== undefined && [ ...best . sei ] . length === boundaryIndex ) {
150+ const hasDictEvidence = boundary ?. direction === "kanji-to-kana"
151+ ? ( best . seiMatch === "surface" || best . seiMatch === "folded" )
152+ : ( best . meiMatch === "surface" || best . meiMatch === "folded" ) ;
153+
154+ if (
155+ hasDictEvidence &&
156+ best . score >= CONFIDENCE_THRESHOLD &&
157+ gap >= BOUNDARY_CONFIDENCE_GAP
158+ ) {
159+ return {
160+ best : { sei : best . sei , mei : best . mei } ,
161+ confidence : BOUNDARY_CONFIDENCE ,
162+ candidates,
163+ } ;
164+ }
165+ }
166+
167+ // 3. Katakana-sei exception: when the candidate split at a kana-to-kanji
168+ // boundary has an all-katakana sei (e.g. ジャガー/横田); real Japanese surnames are
169+ // never pure katakana. Accept that split if its mei matches the sei dictionary.
170+ if ( boundary ?. direction === "kana-to-kanji" && boundaryIndex !== undefined ) {
171+ const boundaryCandidate = candidates . find (
172+ ( c ) => [ ...c . sei ] . length === boundaryIndex && isAllKatakana ( c . sei )
173+ ) ;
174+ if ( boundaryCandidate ) {
175+ const meiAsSei = lookupMatch (
176+ boundaryCandidate . mei , "sei" , lexicon , false ,
177+ options ?. readingData ?? defaultReading ,
178+ ) ;
179+ if ( meiAsSei === "surface" || meiAsSei === "folded" ) {
180+ return {
181+ best : { sei : boundaryCandidate . sei , mei : boundaryCandidate . mei } ,
182+ confidence : BOUNDARY_CONFIDENCE ,
183+ candidates,
184+ } ;
185+ }
186+ }
187+ }
188+
189+ // 4. Low confidence mode
190+ if ( options ?. allowLowConfidence ) {
133191 return {
134192 best : { sei : best . sei , mei : best . mei } ,
135- confidence : confident ? 1.0 : best . score / CONFIDENCE_THRESHOLD ,
193+ confidence : best . score / CONFIDENCE_THRESHOLD ,
136194 candidates,
137195 } ;
138196 }
139197
140- // Not confident enough: return unsplit
198+ // 5. Not confident enough: return unsplit
141199 return {
142200 best : { sei : trimmed , mei : "" } ,
143201 confidence : 0 ,
0 commit comments