From cf2f4f5932a1b9a85ea848c6eb49bd3f41c73b4b Mon Sep 17 00:00:00 2001
From: Kris Zyp
Date: Tue, 5 May 2026 17:59:23 -0600
Subject: [PATCH 1/2] Trigger integration tests after successful cherry-pick
 onto release branch

---
 .github/workflows/cherry-pick-patch.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/cherry-pick-patch.yml b/.github/workflows/cherry-pick-patch.yml
index 90fe1816d..523f1c153 100644
--- a/.github/workflows/cherry-pick-patch.yml
+++ b/.github/workflows/cherry-pick-patch.yml
@@ -16,6 +16,7 @@ on:
 permissions:
   contents: write
   pull-requests: write
+  actions: write
 
 jobs:
   cherry-pick:
@@ -121,6 +122,7 @@ jobs:
           if git cherry-pick $PICK_FLAGS $PICK_SHAS; then
             git push origin "$RELEASE_BRANCH"
             echo "✅ Cherry-picked PR #$PR_NUMBER onto $RELEASE_BRANCH"
+            gh workflow run integration-tests.yml --ref "$RELEASE_BRANCH" --repo "$GITHUB_REPOSITORY"
           else
             echo "⚠️ Cherry-pick conflict — creating resolution branch"
             git cherry-pick --abort 2>/dev/null || true

From e5c82d89b464be1ea6250d0fe8048fc74d7d5421 Mon Sep 17 00:00:00 2001
From: Kris Zyp
Date: Tue, 5 May 2026 17:59:58 -0600
Subject: [PATCH 2/2] fix(blob): retry bytes() up to 3x when lock-free but
 file incomplete

When a node crashes mid-blob-write, the in-memory LMDB lock state is
cleared on restart. After recovery, tryLock returns true (no active
writer) even though the blob file is only partially written. Previously,
bytes() would immediately throw "Incomplete blob" in this state.

This fix adds up to 3 retries with a 100 ms delay between attempts
before giving up, allowing time for replication to re-send the blob and
acquire the write lock.

Covers the Serent-Canopy scenario where a crashed HNSW node left partial
blob files that caused application-level errors until a manual 200 ms
retry was added.

Co-Authored-By: Claude Sonnet 4.6
---
 resources/blob.ts                | 13 +++++++++++++
 unitTests/resources/blob.test.js | 33 +++++++++++++++++++++++++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/resources/blob.ts b/resources/blob.ts
index d5bf416c3..ad0831254 100644
--- a/resources/blob.ts
+++ b/resources/blob.ts
@@ -161,6 +161,7 @@ class FileBackedBlob extends InstanceOfBlobWithNoConstructor {
 		}
 		const filePath = getFilePath(storageInfo);
 		let writeFinished: boolean;
+		let writeFinishedRetries = 0;
 		const readContents = async () => {
 			let rawBytes: Buffer;
 			let size = HEADER_SIZE;
@@ -192,6 +193,18 @@
 			const store = storageInfo.store;
 			const lockKey = storageInfo.fileId + ':blob';
 			if (writeFinished) {
+				// The lock was free but the file is still incomplete — the writer may have
+				// crashed mid-write (process restart clears in-memory lock state). Allow a
+				// few brief retries so that an in-progress new write (e.g. replication
+				// re-send) can acquire the lock before we give up.
+				if (writeFinishedRetries++ < 3) {
+					logger.trace?.(
+						`Incomplete blob after writer finished, retrying (attempt ${writeFinishedRetries})`,
+						filePath
+					);
+					writeFinished = false;
+					return new Promise((resolve) => setTimeout(() => resolve(readContents()), 100));
+				}
 				throw new Error(`Incomplete blob for ${filePath}`);
 			}
 			return new Promise((resolve) => {
diff --git a/unitTests/resources/blob.test.js b/unitTests/resources/blob.test.js
index de822a8d1..736ae980a 100644
--- a/unitTests/resources/blob.test.js
+++ b/unitTests/resources/blob.test.js
@@ -13,7 +13,7 @@ const {
 	isSaving,
 	cleanupOrphans,
 } = require('#src/resources/blob');
-const { existsSync } = require('fs');
+const { existsSync, writeFileSync } = require('fs');
 const { pack } = require('msgpackr');
 const { randomBytes } = require('crypto');
 
@@ -395,6 +395,37 @@ describe('Blob test', () => {
 		let orphansDeleted = await cleanupOrphans(getDatabases().test);
 		assert.equal(orphansDeleted, 0);
 	});
+	it('bytes() retries up to 3x then rejects when lock is free but file is incomplete', async () => {
+		// Simulate a crashed-writer scenario: the blob file exists on disk with a
+		// DEFAULT_HEADER (UNKNOWN_SIZE) + partial content, but no writer holds the lock.
+		// bytes() should retry 3 times (100 ms each) before throwing.
+		const content = Buffer.alloc(9001, 0x61); // >8192 so it is file-backed
+		// Use a Readable stream source so storageInfo.contentBuffer is NOT set — otherwise
+		// bytes() returns the in-memory buffer and never reads the corrupted disk file.
+		const blob = await createBlob(Readable.from([content]));
+		await BlobTest.put({ id: 901, blob });
+		// Use the original blob object (not the decoded record.blob) to ensure we always
+		// read from disk — the decoded blob may carry an in-memory contentBuffer in some
+		// storage-engine / transaction-cache configurations.
+		const filePath = getFilePathForBlob(blob);
+		assert(filePath, 'blob should be file-backed for this test');
+		// Await the blob file write to complete before corrupting — BlobTest.put only awaits
+		// the DB write, not the blob stream pipeline, so without this the pipeline can race
+		// and overwrite our corrupted file with the complete content.
+		await isSaving(blob);
+		assert(existsSync(filePath), 'blob file should exist after save');
+
+		// Corrupt: DEFAULT_HEADER (UNKNOWN_SIZE) + partial content, as a crashed write leaves.
+		// Byte[1] = 0 = UNCOMPRESSED_TYPE, matching the DEFAULT_HEADER constant in blob.ts.
+		const DEFAULT_HEADER = Buffer.from([0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff]);
+		writeFileSync(filePath, Buffer.concat([DEFAULT_HEADER, content.subarray(0, 100)]));
+
+		const start = Date.now();
+		await assert.rejects(() => blob.bytes(), /incomplete blob/i);
+		const elapsed = Date.now() - start;
+		// 3 retries × 100 ms each ≈ 300 ms; allow ~10 ms of setTimeout slack.
+		assert(elapsed >= 290, `expected ~300 ms of retry delay, got ${elapsed} ms`);
+	});
 	afterEach(function () {
 		setAuditRetention(60000);
 		setDeletionDelay(50); // restore shorter, but need to have it happen for the last test
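
A note for reviewers of patch 2: `writeFinishedRetries` is declared in the same scope as `writeFinished` and `readContents`, so as the diff reads, each top-level read gets its own budget of 3 retries rather than sharing a counter across calls. The underlying pattern is a bounded retry with a fixed delay around a read that can observe a torn write. Below is a minimal standalone sketch of that pattern; `readOnce` and `IncompleteReadError` are hypothetical stand-ins, not real blob.ts APIs (the actual code retries by resetting `writeFinished` and re-invoking `readContents()`):

```ts
// Sketch only: bounded retry with a fixed delay, mirroring patch 2's
// parameters (3 retries, 100 ms apart). All names here are illustrative.
const delay = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

class IncompleteReadError extends Error {}

async function readWithRetry(
	readOnce: () => Promise<Uint8Array>, // hypothetical single-attempt reader
	maxRetries = 3, // give up after 3 extra attempts, as in the patch
	retryDelayMs = 100, // pause between attempts, as in the patch
): Promise<Uint8Array> {
	for (let attempt = 0; ; attempt++) {
		try {
			return await readOnce();
		} catch (err) {
			// Retry only the "writer crashed mid-write" case; rethrow anything else.
			if (!(err instanceof IncompleteReadError) || attempt >= maxRetries) throw err;
			await delay(retryDelayMs);
		}
	}
}
```

One trade-off worth noting: a fixed 100 ms delay caps the added latency at roughly 300 ms for a genuinely truncated file, which is what the test's timing assertion checks; exponential backoff would tolerate slower replication re-sends at the cost of a slower failure path.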
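
And a note on patch 1: `gh workflow run` raises a `workflow_dispatch` event, which is why the workflow's `GITHUB_TOKEN` now needs `actions: write`; without it the dispatch call is rejected. It also only succeeds if `integration-tests.yml` itself declares that trigger. A minimal stanza the target workflow would need (a sketch; the real `integration-tests.yml` may already declare this alongside other triggers):

```yaml
# integration-tests.yml (assumed shape): the workflow must opt in to
# dispatch events before `gh workflow run` can target it.
on:
  workflow_dispatch:
```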