diff --git a/src/server/format-query-result.test.ts b/src/server/format-query-result.test.ts new file mode 100644 index 0000000..d5da2f9 --- /dev/null +++ b/src/server/format-query-result.test.ts @@ -0,0 +1,72 @@ +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import * as loggerModule from '../logger.js'; +import type { SearchResult } from '../types.js'; +import { + formatQueryResultRows, + formatSearchResultAsRow, + resetPaperNumberDeprecationLatchForTests, +} from './format-query-result.js'; + +const DEPRECATION_SUBSTRING = + 'paper_number is deprecated and will be removed in the next major release'; + +describe('formatSearchResultAsRow / formatQueryResultRows', () => { + let warnSpy: ReturnType; + + beforeEach(() => { + resetPaperNumberDeprecationLatchForTests(); + warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + + it('warns once on first row even when document_id is null', () => { + const doc: SearchResult = { + id: 'v1', + content: 'hello world', + score: 0.99, + metadata: { title: 'T' }, + reranked: false, + }; + const row = formatSearchResultAsRow(doc); + expect(row.document_id).toBeNull(); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(String(warnSpy.mock.calls[0]?.[0])).toContain(DEPRECATION_SUBSTRING); + }); + + it('does not warn again on second formatSearchResultAsRow call', () => { + const doc: SearchResult = { + id: 'v1', + content: 'a', + score: 0.5, + metadata: { document_number: 'P1' }, + reranked: false, + }; + formatSearchResultAsRow(doc); + formatSearchResultAsRow(doc); + expect(warnSpy).toHaveBeenCalledTimes(1); + }); + + it('warns at most once for formatQueryResultRows with multiple hits', () => { + const results: SearchResult[] = [ + { + id: 'a', + content: 'one', + score: 0.9, + metadata: { document_number: 'D1' }, + reranked: false, + }, + { + id: 'b', + content: 'two', + score: 0.8, + metadata: { document_number: 'D2' }, + reranked: false, + }, + ]; + formatQueryResultRows(results); + expect(warnSpy).toHaveBeenCalledTimes(1); + }); +}); diff --git a/src/server/format-query-result.ts b/src/server/format-query-result.ts index 1364c4c..5c01aa4 100644 --- a/src/server/format-query-result.ts +++ b/src/server/format-query-result.ts @@ -21,8 +21,9 @@ export interface QueryResultRow { document_id: string | null; /** * @deprecated Use `document_id`. Kept for one minor cycle and removed in - * the next major release. The first time a row is emitted with this alias - * during a session, a `WARN` log is fired so consumers see the deadline. + * the next major release. The first formatted query-result row in the + * process triggers a single `WARN` log per session so consumers see the + * deprecation deadline. */ paper_number: string | null; title: string; @@ -79,7 +80,7 @@ export function formatSearchResultAsRow( : null) ?? null; - if (document_id !== null && !deprecationWarnedThisSession) { + if (!deprecationWarnedThisSession) { deprecationWarnedThisSession = true; logWarn( 'paper_number is deprecated and will be removed in the next major release; use document_id instead.' diff --git a/src/server/reassemble-documents.test.ts b/src/server/reassemble-documents.test.ts index 33448eb..b14d086 100644 --- a/src/server/reassemble-documents.test.ts +++ b/src/server/reassemble-documents.test.ts @@ -1,8 +1,19 @@ -import { describe, it, expect } from 'vitest'; +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import * as loggerModule from '../logger.js'; import { reassembleByDocument } from './reassemble-documents.js'; import type { SearchResult } from '../types.js'; describe('reassembleByDocument', () => { + let warnSpy: ReturnType; + + beforeEach(() => { + warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {}); + }); + + afterEach(() => { + warnSpy.mockRestore(); + }); + it('groups chunks by document_number', () => { const results: SearchResult[] = [ { @@ -85,4 +96,102 @@ describe('reassembleByDocument', () => { expect(docs[0].chunk_count).toBe(3); expect(docs[0].merged_content).toBe('Chunk 0\n\nChunk 1\n\nChunk 2'); }); + + it('does not warn when all hits have a document key', () => { + const results: SearchResult[] = [ + { + id: 'c1', + content: 'Only doc.', + score: 0.9, + metadata: { document_number: 'P1' }, + reranked: false, + }, + ]; + reassembleByDocument(results); + expect(warnSpy).not.toHaveBeenCalled(); + }); + + it('warns once with count when hits lack document key and empty vector id', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'Orphan chunk.', + score: 0.5, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(0); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=/); + }); + + it('aggregates multiple skipped hits into one warn with total count', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'A', + score: 0.5, + metadata: {}, + reranked: false, + }, + { + id: '', + content: 'B', + score: 0.4, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(0); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 2 hit/); + }); + + it('warns only for invalid hits when mixed with valid document keys', () => { + const results: SearchResult[] = [ + { + id: '', + content: 'Skipped.', + score: 0.3, + metadata: {}, + reranked: false, + }, + { + id: 'vec-1', + content: 'Kept by id.', + score: 0.9, + metadata: {}, + reranked: false, + }, + ]; + const docs = reassembleByDocument(results); + expect(docs).toHaveLength(1); + expect(docs[0].document_id).toBe('vec-1'); + expect(warnSpy).toHaveBeenCalledTimes(1); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/); + expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=/); + }); + + it('includes at most three sample entries in skip warning when many hits are skipped', () => { + const results: SearchResult[] = Array.from({ length: 4 }, (_, i) => ({ + id: '', + content: `chunk-${i}`, + score: 0.1 * (i + 1), + metadata: {}, + reranked: false, + })); + reassembleByDocument(results); + expect(warnSpy).toHaveBeenCalledTimes(1); + const msg = String(warnSpy.mock.calls[0]?.[0]); + expect(msg).toMatch(/skipped 4 hit/); + const samplePart = msg.match(/sample_ids=(.+)$/)?.[1]; + expect(samplePart).toBeDefined(); + const samples = samplePart!.split(','); + expect(samples).toHaveLength(3); + expect(samples.every((s) => s === '')).toBe(true); + }); }); diff --git a/src/server/reassemble-documents.ts b/src/server/reassemble-documents.ts index ef900dd..54aafec 100644 --- a/src/server/reassemble-documents.ts +++ b/src/server/reassemble-documents.ts @@ -5,23 +5,24 @@ */ import type { PineconeMetadataValue, SearchResult } from '../types.js'; +import { warn as logWarn } from '../logger.js'; /** Default metadata keys tried for chunk ordering (RecursiveCharacterTextSplitter often adds these). */ const CHUNK_ORDER_KEYS = ['chunk_index', 'start_index', 'loc'] as const; +/** Max vector ids listed in the one-per-call skip warning (for grep-friendly debugging). */ +const SKIP_WARN_SAMPLE_IDS = 3; + /** Derive a stable document key from hit metadata: document_number → url → doc_id → hit.id. */ function getDocumentKey(hit: SearchResult): string { const m = hit.metadata || {}; const docNumber = m['document_number']; const url = m['url']; const docId = m['doc_id']; - return ( - (typeof docNumber === 'string' ? docNumber : undefined) ?? - (typeof url === 'string' ? url : undefined) ?? - (typeof docId === 'string' ? docId : undefined) ?? - hit.id ?? - '' - ); + const nonEmpty = (v: unknown): string | undefined => + typeof v === 'string' && v.trim() !== '' ? v : undefined; + + return nonEmpty(docNumber) ?? nonEmpty(url) ?? nonEmpty(docId) ?? nonEmpty(hit.id) ?? ''; } /** Return a numeric order for sorting chunks from metadata (chunk_index, start_index, or loc). */ @@ -68,10 +69,18 @@ export function reassembleByDocument( const separator = options?.contentSeparator ?? '\n\n'; const byDoc = new Map(); + let skippedHits = 0; + const sampleIdsForWarn: string[] = []; for (const hit of results) { const key = getDocumentKey(hit); - if (!key) continue; + if (!key) { + skippedHits++; + if (sampleIdsForWarn.length < SKIP_WARN_SAMPLE_IDS) { + sampleIdsForWarn.push(hit.id.trim() === '' ? '' : hit.id); + } + continue; + } let list = byDoc.get(key); if (!list) { list = []; @@ -80,6 +89,13 @@ export function reassembleByDocument( list.push(hit); } + if (skippedHits > 0) { + const sample = sampleIdsForWarn.length > 0 ? ` sample_ids=${sampleIdsForWarn.join(',')}` : ''; + logWarn( + `reassembleByDocument: skipped ${skippedHits} hit(s) with no document key (document_number, url, doc_id, or non-empty vector id).${sample}` + ); + } + const out: ReassembledDocument[] = []; for (const [docId, chunks] of byDoc) {