From 9f1c8da3dddc5f2f56d458d69198a5f8e422eb18 Mon Sep 17 00:00:00 2001 From: zho Date: Tue, 19 May 2026 00:46:24 +0800 Subject: [PATCH 1/5] fixed reassemble and query format module --- src/server/format-query-result.test.ts | 72 ++++++++++++++++ src/server/format-query-result.ts | 7 +- src/server/reassemble-documents.test.ts | 107 +++++++++++++++++++++++- src/server/reassemble-documents.ts | 22 ++++- 4 files changed, 203 insertions(+), 5 deletions(-) create mode 100644 src/server/format-query-result.test.ts diff --git a/src/server/format-query-result.test.ts b/src/server/format-query-result.test.ts new file mode 100644 index 0000000..d5da2f9 --- /dev/null +++ b/src/server/format-query-result.test.ts @@ -0,0 +1,72 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import * as loggerModule from '../logger.js'; +import type { SearchResult } from '../types.js'; +import { + formatQueryResultRows, + formatSearchResultAsRow, + resetPaperNumberDeprecationLatchForTests, +} from './format-query-result.js'; + +const DEPRECATION_SUBSTRING = + 'paper_number is deprecated and will be removed in the next major release'; + +describe('formatSearchResultAsRow / formatQueryResultRows', () => { + let warnSpy: ReturnType; + + beforeEach(() => { + resetPaperNumberDeprecationLatchForTests(); + warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + + it('warns once on first row even when document_id is null', () => { + const doc: SearchResult = { + id: 'v1', + content: 'hello world', + score: 0.99, + metadata: { title: 'T' }, + reranked: false, + }; + const row = formatSearchResultAsRow(doc); + expect(row.document_id).toBeNull(); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(String(warnSpy.mock.calls[0]?.[0])).toContain(DEPRECATION_SUBSTRING); + }); + + it('does not warn again on second formatSearchResultAsRow call', () => { + const doc: SearchResult = { + id: 'v1', + content: 'a', + score: 0.5, + metadata: { document_number: 'P1' }, + reranked: false, + }; + formatSearchResultAsRow(doc); + formatSearchResultAsRow(doc); + expect(warnSpy).toHaveBeenCalledTimes(1); + }); + + it('warns at most once for formatQueryResultRows with multiple hits', () => { + const results: SearchResult[] = [ + { + id: 'a', + content: 'one', + score: 0.9, + metadata: { document_number: 'D1' }, + reranked: false, + }, + { + id: 'b', + content: 'two', + score: 0.8, + metadata: { document_number: 'D2' }, + reranked: false, + }, + ]; + formatQueryResultRows(results); + expect(warnSpy).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/server/format-query-result.ts b/src/server/format-query-result.ts index 1364c4c..5c01aa4 100644 --- a/src/server/format-query-result.ts +++ b/src/server/format-query-result.ts @@ -21,8 +21,9 @@ export interface QueryResultRow { document_id: string | null; /** * @deprecated Use `document_id`. Kept for one minor cycle and removed in - * the next major release. The first time a row is emitted with this alias - * during a session, a `WARN` log is fired so consumers see the deadline. + * the next major release. The first formatted query-result row in the + * process triggers a single `WARN` log per session so consumers see the + * deprecation deadline. */ paper_number: string | null; title: string; @@ -79,7 +80,7 @@ export function formatSearchResultAsRow( : null) ?? null; - if (document_id !== null && !deprecationWarnedThisSession) { + if (!deprecationWarnedThisSession) { deprecationWarnedThisSession = true; logWarn( 'paper_number is deprecated and will be removed in the next major release; use document_id instead.' diff --git a/src/server/reassemble-documents.test.ts b/src/server/reassemble-documents.test.ts index 33448eb..7f49d06 100644 --- a/src/server/reassemble-documents.test.ts +++ b/src/server/reassemble-documents.test.ts @@ -1,8 +1,19 @@ -import { describe, it, expect } from 'vitest'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import * as loggerModule from '../logger.js'; import { reassembleByDocument } from './reassemble-documents.js'; import type { SearchResult } from '../types.js'; describe('reassembleByDocument', () => { + let warnSpy: ReturnType; + + beforeEach(() => { + warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + it('groups chunks by document_number', () => { const results: SearchResult[] = [ { @@ -85,4 +96,98 @@ describe('reassembleByDocument', () => { expect(docs[0].chunk_count).toBe(3); expect(docs[0].merged_content).toBe('Chunk 0\n\nChunk 1\n\nChunk 2'); }); + + it('does not warn when all hits have a document key', () => { + const results: SearchResult[] = [ + { + id: 'c1', + content: 'Only doc.', + score: 0.9, + metadata: { document_number: 'P1' }, + reranked: false, + }, + ]; + reassembleByDocument(results); + expect(warnSpy).not.toHaveBeenCalled(); + }); + + it('warns once with count when hits lack document key and empty vector id', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'Orphan chunk.', + score: 0.5, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(0); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=/); + }); + + it('aggregates multiple skipped hits into one warn with total count', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'A', + score: 0.5, + metadata: {}, + reranked: false, + }, + { + id: '', + content: 'B', + score: 0.4, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(0); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 2 hit/); + }); + + it('warns only for invalid hits when mixed with valid document keys', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'Skipped.', + score: 0.3, + metadata: {}, + reranked: false, + }, + { + id: 'vec-1', + content: 'Kept by id.', + score: 0.9, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(1); + expect(docs[0].document_id).toBe('vec-1'); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=/); + }); + + it('includes sample vector ids in skip warning up to limit', () => { + const emptyDocNum = { document_number: '' } as Record; + const results: SearchResult[] = [ + { id: 'a', content: 'x', score: 0.1, metadata: emptyDocNum, reranked: false }, + { id: 'b', content: 'y', score: 0.2, metadata: emptyDocNum, reranked: false }, + { id: 'c', content: 'z', score: 0.3, metadata: emptyDocNum, reranked: false }, + { id: 'd', content: 'w', score: 0.4, metadata: emptyDocNum, reranked: false }, + ]; + reassembleByDocument(results); + expect(warnSpy).toHaveBeenCalledTimes(1); + const msg = String(warnSpy.mock.calls[0]?.[0]); + expect(msg).toMatch(/skipped 4 hit/); + expect(msg.match(/sample_ids=([^\s.]+)/)?.[1]).toBe('a,b,c'); + }); }); diff --git a/src/server/reassemble-documents.ts b/src/server/reassemble-documents.ts index ef900dd..648939c 100644 --- a/src/server/reassemble-documents.ts +++ b/src/server/reassemble-documents.ts @@ -5,10 +5,14 @@ */ import type { PineconeMetadataValue, SearchResult } from '../types.js'; +import { warn as logWarn } from '../logger.js'; /** Default metadata keys tried for chunk ordering (RecursiveCharacterTextSplitter often adds these). */ const CHUNK_ORDER_KEYS = ['chunk_index', 'start_index', 'loc'] as const; +/** Max vector ids listed in the one-per-call skip warning (for grep-friendly debugging). */ +const SKIP_WARN_SAMPLE_IDS = 3; + /** Derive a stable document key from hit metadata: document_number → url → doc_id → hit.id. */ function getDocumentKey(hit: SearchResult): string { const m = hit.metadata || {}; @@ -68,10 +72,18 @@ export function reassembleByDocument( const separator = options?.contentSeparator ?? '\n\n'; const byDoc = new Map(); + let skippedHits = 0; + const sampleIdsForWarn: string[] = []; for (const hit of results) { const key = getDocumentKey(hit); - if (!key) continue; + if (!key) { + skippedHits++; + if (sampleIdsForWarn.length < SKIP_WARN_SAMPLE_IDS) { + sampleIdsForWarn.push(hit.id.trim() === '' ? '' : hit.id); + } + continue; + } let list = byDoc.get(key); if (!list) { list = []; @@ -80,6 +92,14 @@ export function reassembleByDocument( list.push(hit); } + if (skippedHits > 0) { + const sample = + sampleIdsForWarn.length > 0 ? ` sample_ids=${sampleIdsForWarn.join(',')}` : ''; + logWarn( + `reassembleByDocument: skipped ${skippedHits} hit(s) with no document key (document_number, url, doc_id, or non-empty vector id).${sample}` + ); + } + const out: ReassembledDocument[] = []; for (const [docId, chunks] of byDoc) { From d72ad7fd8287c4d9a0a03f70a191a13c2068e916 Mon Sep 17 00:00:00 2001 From: zho Date: Tue, 19 May 2026 01:40:52 +0800 Subject: [PATCH 2/5] fixed format errors --- src/server/reassemble-documents.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/server/reassemble-documents.ts b/src/server/reassemble-documents.ts index 648939c..a94e4b4 100644 --- a/src/server/reassemble-documents.ts +++ b/src/server/reassemble-documents.ts @@ -93,8 +93,7 @@ export function reassembleByDocument( } if (skippedHits > 0) { - const sample = - sampleIdsForWarn.length > 0 ? ` sample_ids=${sampleIdsForWarn.join(',')}` : ''; + const sample = sampleIdsForWarn.length > 0 ? ` sample_ids=${sampleIdsForWarn.join(',')}` : ''; logWarn( `reassembleByDocument: skipped ${skippedHits} hit(s) with no document key (document_number, url, doc_id, or non-empty vector id).${sample}` ); From e5b071869b6c17fde49c09c2470d9a138c16a8d5 Mon Sep 17 00:00:00 2001 From: zho Date: Tue, 19 May 2026 13:13:08 +0800 Subject: [PATCH 3/5] addressed coderabbitai review --- src/server/reassemble-documents.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/server/reassemble-documents.ts b/src/server/reassemble-documents.ts index a94e4b4..b0cea52 100644 --- a/src/server/reassemble-documents.ts +++ b/src/server/reassemble-documents.ts @@ -19,11 +19,14 @@ function getDocumentKey(hit: SearchResult): string { const docNumber = m['document_number']; const url = m['url']; const docId = m['doc_id']; + const nonEmpty = (v: unknown): string | undefined => + typeof v === 'string' && v.trim() !== '' ? v : undefined; + return ( - (typeof docNumber === 'string' ? docNumber : undefined) ?? - (typeof url === 'string' ? url : undefined) ?? - (typeof docId === 'string' ? docId : undefined) ?? - hit.id ?? + nonEmpty(docNumber) ?? + nonEmpty(url) ?? + nonEmpty(docId) ?? + nonEmpty(hit.id) ?? '' ); } From 634c2f5b5b0713df3fc8c252c560661c020172a8 Mon Sep 17 00:00:00 2001 From: zho Date: Tue, 19 May 2026 13:16:18 +0800 Subject: [PATCH 4/5] fixed format error --- src/server/reassemble-documents.ts | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/server/reassemble-documents.ts b/src/server/reassemble-documents.ts index b0cea52..54aafec 100644 --- a/src/server/reassemble-documents.ts +++ b/src/server/reassemble-documents.ts @@ -22,13 +22,7 @@ function getDocumentKey(hit: SearchResult): string { const nonEmpty = (v: unknown): string | undefined => typeof v === 'string' && v.trim() !== '' ? v : undefined; - return ( - nonEmpty(docNumber) ?? - nonEmpty(url) ?? - nonEmpty(docId) ?? - nonEmpty(hit.id) ?? - '' - ); + return nonEmpty(docNumber) ?? nonEmpty(url) ?? nonEmpty(docId) ?? nonEmpty(hit.id) ?? ''; } /** Return a numeric order for sorting chunks from metadata (chunk_index, start_index, or loc). */ From 77f40d9103b797a7244bf78082478b760ff6357b Mon Sep 17 00:00:00 2001 From: zho Date: Tue, 19 May 2026 13:20:05 +0800 Subject: [PATCH 5/5] added test --- src/server/reassemble-documents.test.ts | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/server/reassemble-documents.test.ts b/src/server/reassemble-documents.test.ts index 7f49d06..b14d086 100644 --- a/src/server/reassemble-documents.test.ts +++ b/src/server/reassemble-documents.test.ts @@ -176,18 +176,22 @@ describe('reassembleByDocument', () => { expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=/); }); - it('includes sample vector ids in skip warning up to limit', () => { - const emptyDocNum = { document_number: '' } as Record; - const results: SearchResult[] = [ - { id: 'a', content: 'x', score: 0.1, metadata: emptyDocNum, reranked: false }, - { id: 'b', content: 'y', score: 0.2, metadata: emptyDocNum, reranked: false }, - { id: 'c', content: 'z', score: 0.3, metadata: emptyDocNum, reranked: false }, - { id: 'd', content: 'w', score: 0.4, metadata: emptyDocNum, reranked: false }, - ]; + it('includes at most three sample entries in skip warning when many hits are skipped', () => { + const results: SearchResult[] = Array.from({ length: 4 }, (_, i) => ({ + id: '', + content: `chunk-${i}`, + score: 0.1 * (i + 1), + metadata: {}, + reranked: false, + })); reassembleByDocument(results); expect(warnSpy).toHaveBeenCalledTimes(1); const msg = String(warnSpy.mock.calls[0]?.[0]); expect(msg).toMatch(/skipped 4 hit/); - expect(msg.match(/sample_ids=([^\s.]+)/)?.[1]).toBe('a,b,c'); + const samplePart = msg.match(/sample_ids=(.+)$/)?.[1]; + expect(samplePart).toBeDefined(); + const samples = samplePart!.split(','); + expect(samples).toHaveLength(3); + expect(samples.every((s) => s === '')).toBe(true); }); });