Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions src/server/format-query-result.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import * as loggerModule from '../logger.js';
import type { SearchResult } from '../types.js';
import {
formatQueryResultRows,
formatSearchResultAsRow,
resetPaperNumberDeprecationLatchForTests,
} from './format-query-result.js';

const DEPRECATION_SUBSTRING =
'paper_number is deprecated and will be removed in the next major release';

describe('formatSearchResultAsRow / formatQueryResultRows', () => {
let warnSpy: ReturnType<typeof vi.spyOn>;

beforeEach(() => {
resetPaperNumberDeprecationLatchForTests();
warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {});
});

afterEach(() => {
warnSpy.mockRestore();
});

it('warns once on first row even when document_id is null', () => {
const doc: SearchResult = {
id: 'v1',
content: 'hello world',
score: 0.99,
metadata: { title: 'T' },
reranked: false,
};
const row = formatSearchResultAsRow(doc);
expect(row.document_id).toBeNull();
expect(warnSpy).toHaveBeenCalledTimes(1);
expect(String(warnSpy.mock.calls[0]?.[0])).toContain(DEPRECATION_SUBSTRING);
});

it('does not warn again on second formatSearchResultAsRow call', () => {
const doc: SearchResult = {
id: 'v1',
content: 'a',
score: 0.5,
metadata: { document_number: 'P1' },
reranked: false,
};
formatSearchResultAsRow(doc);
formatSearchResultAsRow(doc);
expect(warnSpy).toHaveBeenCalledTimes(1);
});

it('warns at most once for formatQueryResultRows with multiple hits', () => {
const results: SearchResult[] = [
{
id: 'a',
content: 'one',
score: 0.9,
metadata: { document_number: 'D1' },
reranked: false,
},
{
id: 'b',
content: 'two',
score: 0.8,
metadata: { document_number: 'D2' },
reranked: false,
},
];
formatQueryResultRows(results);
expect(warnSpy).toHaveBeenCalledTimes(1);
});
});
7 changes: 4 additions & 3 deletions src/server/format-query-result.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ export interface QueryResultRow {
document_id: string | null;
/**
* @deprecated Use `document_id`. Kept for one minor cycle and removed in
* the next major release. The first time a row is emitted with this alias
* during a session, a `WARN` log is fired so consumers see the deadline.
* the next major release. The first formatted query-result row in the
* process triggers a single `WARN` log per session so consumers see the
* deprecation deadline.
*/
paper_number: string | null;
title: string;
Expand Down Expand Up @@ -79,7 +80,7 @@ export function formatSearchResultAsRow(
: null) ??
null;

if (document_id !== null && !deprecationWarnedThisSession) {
if (!deprecationWarnedThisSession) {
deprecationWarnedThisSession = true;
logWarn(
'paper_number is deprecated and will be removed in the next major release; use document_id instead.'
Expand Down
111 changes: 110 additions & 1 deletion src/server/reassemble-documents.test.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
import { describe, it, expect } from 'vitest';
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
import * as loggerModule from '../logger.js';
import { reassembleByDocument } from './reassemble-documents.js';
import type { SearchResult } from '../types.js';

describe('reassembleByDocument', () => {
let warnSpy: ReturnType<typeof vi.spyOn>;

beforeEach(() => {
warnSpy = vi.spyOn(loggerModule, 'warn').mockImplementation(() => {});
});

afterEach(() => {
warnSpy.mockRestore();
});

it('groups chunks by document_number', () => {
const results: SearchResult[] = [
{
Expand Down Expand Up @@ -85,4 +96,102 @@ describe('reassembleByDocument', () => {
expect(docs[0].chunk_count).toBe(3);
expect(docs[0].merged_content).toBe('Chunk 0\n\nChunk 1\n\nChunk 2');
});

it('does not warn when all hits have a document key', () => {
const results: SearchResult[] = [
{
id: 'c1',
content: 'Only doc.',
score: 0.9,
metadata: { document_number: 'P1' },
reranked: false,
},
];
reassembleByDocument(results);
expect(warnSpy).not.toHaveBeenCalled();
});

it('warns once with count when hits lack document key and empty vector id', () => {
const results: SearchResult[] = [
{
id: '',
content: 'Orphan chunk.',
score: 0.5,
metadata: {},
reranked: false,
},
];
const docs = reassembleByDocument(results);
expect(docs).toHaveLength(0);
expect(warnSpy).toHaveBeenCalledTimes(1);
expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/);
expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=<empty>/);
});

it('aggregates multiple skipped hits into one warn with total count', () => {
const results: SearchResult[] = [
{
id: '',
content: 'A',
score: 0.5,
metadata: {},
reranked: false,
},
{
id: '',
content: 'B',
score: 0.4,
metadata: {},
reranked: false,
},
];
const docs = reassembleByDocument(results);
expect(docs).toHaveLength(0);
expect(warnSpy).toHaveBeenCalledTimes(1);
expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 2 hit/);
});

it('warns only for invalid hits when mixed with valid document keys', () => {
const results: SearchResult[] = [
{
id: '',
content: 'Skipped.',
score: 0.3,
metadata: {},
reranked: false,
},
{
id: 'vec-1',
content: 'Kept by id.',
score: 0.9,
metadata: {},
reranked: false,
},
];
const docs = reassembleByDocument(results);
expect(docs).toHaveLength(1);
expect(docs[0].document_id).toBe('vec-1');
expect(warnSpy).toHaveBeenCalledTimes(1);
expect(warnSpy.mock.calls[0]?.[0]).toMatch(/skipped 1 hit/);
expect(warnSpy.mock.calls[0]?.[0]).toMatch(/sample_ids=<empty>/);
});

it('includes at most three sample entries in skip warning when many hits are skipped', () => {
const results: SearchResult[] = Array.from({ length: 4 }, (_, i) => ({
id: '',
content: `chunk-${i}`,
score: 0.1 * (i + 1),
metadata: {},
reranked: false,
}));
reassembleByDocument(results);
expect(warnSpy).toHaveBeenCalledTimes(1);
const msg = String(warnSpy.mock.calls[0]?.[0]);
expect(msg).toMatch(/skipped 4 hit/);
const samplePart = msg.match(/sample_ids=(.+)$/)?.[1];
expect(samplePart).toBeDefined();
const samples = samplePart!.split(',');
expect(samples).toHaveLength(3);
expect(samples.every((s) => s === '<empty>')).toBe(true);
});
});
32 changes: 24 additions & 8 deletions src/server/reassemble-documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,23 +5,24 @@
*/

import type { PineconeMetadataValue, SearchResult } from '../types.js';
import { warn as logWarn } from '../logger.js';

/** Default metadata keys tried for chunk ordering (RecursiveCharacterTextSplitter often adds these). */
const CHUNK_ORDER_KEYS = ['chunk_index', 'start_index', 'loc'] as const;

/** Max vector ids listed in the one-per-call skip warning (for grep-friendly debugging). */
const SKIP_WARN_SAMPLE_IDS = 3;

/** Derive a stable document key from hit metadata: document_number → url → doc_id → hit.id. */
function getDocumentKey(hit: SearchResult): string {
const m = hit.metadata || {};
const docNumber = m['document_number'];
const url = m['url'];
const docId = m['doc_id'];
return (
(typeof docNumber === 'string' ? docNumber : undefined) ??
(typeof url === 'string' ? url : undefined) ??
(typeof docId === 'string' ? docId : undefined) ??
hit.id ??
''
);
const nonEmpty = (v: unknown): string | undefined =>
typeof v === 'string' && v.trim() !== '' ? v : undefined;

return nonEmpty(docNumber) ?? nonEmpty(url) ?? nonEmpty(docId) ?? nonEmpty(hit.id) ?? '';
}

/** Return a numeric order for sorting chunks from metadata (chunk_index, start_index, or loc). */
Expand Down Expand Up @@ -68,10 +69,18 @@ export function reassembleByDocument(
const separator = options?.contentSeparator ?? '\n\n';

const byDoc = new Map<string, SearchResult[]>();
let skippedHits = 0;
const sampleIdsForWarn: string[] = [];

for (const hit of results) {
const key = getDocumentKey(hit);
if (!key) continue;
if (!key) {
skippedHits++;
if (sampleIdsForWarn.length < SKIP_WARN_SAMPLE_IDS) {
sampleIdsForWarn.push(hit.id.trim() === '' ? '<empty>' : hit.id);
}
continue;
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
let list = byDoc.get(key);
if (!list) {
list = [];
Expand All @@ -80,6 +89,13 @@ export function reassembleByDocument(
list.push(hit);
}

if (skippedHits > 0) {
const sample = sampleIdsForWarn.length > 0 ? ` sample_ids=${sampleIdsForWarn.join(',')}` : '';
logWarn(
`reassembleByDocument: skipped ${skippedHits} hit(s) with no document key (document_number, url, doc_id, or non-empty vector id).${sample}`
);
}

const out: ReassembledDocument[] = [];

for (const [docId, chunks] of byDoc) {
Expand Down
Loading