From 1e349f52d93537d2731e6d5432addd9ac6c6c9a7 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 5 May 2026 18:36:44 +0200 Subject: [PATCH 01/10] refactor!: Trim the storage subclient interfaces --- packages/core/src/storages/dataset.ts | 95 +--- packages/core/src/storages/key_value_store.ts | 83 +--- .../core/src/storages/request_provider.ts | 2 +- .../src/resource-clients/dataset.ts | 165 ++----- .../src/resource-clients/key-value-store.ts | 261 +--------- .../src/resource-clients/request-queue.ts | 82 +--- packages/memory-storage/src/utils.ts | 150 ------ .../test/async-iteration.test.ts | 464 +----------------- .../memory-storage/test/fs-fallback.test.ts | 8 +- .../test/key-value-store/stream.test.ts | 4 +- .../test/no-crash-on-big-buffers.test.ts | 2 +- .../test/no-writing-to-disk.test.ts | 4 +- .../test/request-queue/forefront.test.ts | 4 +- .../handledRequestCount-should-update.test.ts | 13 - .../test/reverse-datataset-list.test.ts | 12 +- .../test/write-metadata.test.ts | 4 +- packages/types/src/storages.ts | 101 ++-- .../adaptive_playwright_crawler.test.ts | 8 +- .../crawlers/rendering_type_predictor.test.ts | 2 +- test/core/recoverable_state.test.ts | 2 +- test/core/storages/dataset.test.ts | 219 +++------ test/core/storages/key_value_store.test.ts | 249 ++++------ test/core/storages/request_queue.test.ts | 117 +++-- test/shared/MemoryStorageEmulator.ts | 4 +- 24 files changed, 473 insertions(+), 1582 deletions(-) diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 8d1721ea4226..c8b06c1edd9e 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -1,9 +1,7 @@ -import type { DatasetClient, DatasetInfo, Dictionary, PaginatedList } from '@crawlee/types'; +import type { DatasetClient, DatasetInfo, Dictionary } from '@crawlee/types'; import { stringify } from 'csv-stringify/sync'; import ow from 'ow'; -import { MAX_PAYLOAD_SIZE_BYTES } from '@apify/consts'; - import { 
Configuration } from '../configuration.js'; import type { CrawleeLogger } from '../log.js'; import { serviceLocator } from '../service_locator.js'; @@ -18,15 +16,15 @@ import { purgeDefaultStorages } from './utils.js'; /** @internal */ export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000; -const SAFETY_BUFFER_PERCENT = 0.01 / 100; // 0.01% - /** - * Accepts a JSON serializable object as an input, validates its serializability, - * and validates its serialized size against limitBytes. Optionally accepts its index - * in an array to provide better error messages. Returns serialized object. + * Validates that the given value is a plain JSON-serializable object + * (not an array, not a primitive, not circular). + * + * @param item The value to validate. + * @param index Optional index for error messages when validating inside an array. * @ignore */ -export function checkAndSerialize(item: T, limitBytes: number, index?: number): string { +export function isJsonSerializable(item: T, index?: number): void { const s = typeof index === 'number' ? ` at index ${index} ` : ' '; const isItemObject = item && typeof item === 'object' && !Array.isArray(item); @@ -34,20 +32,12 @@ export function checkAndSerialize(item: T, limitBytes: number, index?: number throw new Error(`Data item${s}is not an object. You can push only objects into a dataset.`); } - let payload; try { - payload = JSON.stringify(item); + JSON.stringify(item); } catch (e) { const err = e as Error; throw new Error(`Data item${s}is not serializable to JSON.\nCause: ${err.message}`); } - - const bytes = Buffer.byteLength(payload); - if (bytes > limitBytes) { - throw new Error(`Data item${s}is too large (size: ${bytes} bytes, limit: ${limitBytes} bytes)`); - } - - return payload; } /** @@ -260,44 +250,21 @@ export class Dataset { * **IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`, * otherwise the crawler process might finish before the data is stored! 
* - * The size of the data is limited by the receiving API and therefore `pushData()` will only - * allow objects whose JSON representation is smaller than 9MB. When an array is passed, - * none of the included objects - * may be larger than 9MB, but the array itself may be of any size. - * - * The function internally - * chunks the array into separate items and pushes them sequentially. - * The chunking process is stable (keeps order of data), but it does not provide a transaction - * safety mechanism. Therefore, in the event of an uploading error (after several automatic retries), - * the function's Promise will reject and the dataset will be left in a state where some of - * the items have already been saved to the dataset while other items from the source array were not. - * To overcome this limitation, the developer may, for example, read the last item saved in the dataset - * and re-attempt the save of the data from this item onwards to prevent duplicates. * @param data Object or array of objects containing data to be stored in the default dataset. - * The objects must be serializable to JSON and the JSON representation of each object must be smaller than 9MB. + * The objects must be serializable to JSON. 
*/ async pushData(data: Data | Data[]): Promise { checkStorageAccess(); ow(data, 'data', ow.object); - const dispatch = async (payload: string) => this.client.pushItems(payload); - const limit = MAX_PAYLOAD_SIZE_BYTES - Math.ceil(MAX_PAYLOAD_SIZE_BYTES * SAFETY_BUFFER_PERCENT); - - // Handle singular Objects - if (!Array.isArray(data)) { - const payload = checkAndSerialize(data, limit); - await dispatch(payload); - return; - } - - // Handle Arrays - const payloads = data.map((item, index) => checkAndSerialize(item, limit, index)); - const chunks = chunkBySize(payloads, limit); - // Invoke client in series to preserve order of data - for (const chunk of chunks) { - await dispatch(chunk); + // Normalize to array and validate each item + const items = Array.isArray(data) ? data : [data]; + for (let i = 0; i < items.length; i++) { + isJsonSerializable(items[i], i); } + + await this.client.pushData(items); } /** @@ -307,7 +274,7 @@ export class Dataset { checkStorageAccess(); try { - return await this.client.listItems(options); + return await this.client.getData(options); } catch (e) { const error = e as Error; if (error.message.includes('Cannot create a string longer than')) { @@ -330,7 +297,7 @@ export class Dataset { const fetchNextChunk = async (offset = 0): Promise => { const limit = 1000; - const value = await this.client.listItems({ offset, limit, ...options }); + const value = await this.client.getData({ offset, limit, ...options }); if (value.count === 0) { return; @@ -616,20 +583,10 @@ export class Dataset { * * @param options Options for the iteration. 
*/ - values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise> { + async *values(options: DatasetIteratorOptions = {}): AsyncGenerator { checkStorageAccess(); - const result = this.client.listItems(options) as AsyncIterable & Promise>; - - if (!(Symbol.asyncIterator in result)) { - Object.defineProperty(result, Symbol.asyncIterator, { - get() { - throw new Error('Resource client "listItems" method does not return an async iterable.'); - }, - }); - } - - return result; + yield* this.client.iterateItems(options); } /** @@ -646,16 +603,14 @@ export class Dataset { * * @param options Options for the iteration. */ - entries( - options: DatasetIteratorOptions = {}, - ): AsyncIterable<[number, Data]> & Promise> { + async *entries(options: DatasetIteratorOptions = {}): AsyncGenerator<[number, Data]> { checkStorageAccess(); - if (!this.client.listEntries) { - throw new Error('Resource client is missing the "listEntries" method.'); - } + let index = options.offset ?? 0; - return this.client.listEntries(options); + for await (const item of this.client.iterateItems(options)) { + yield [index++, item]; + } } /** @@ -681,7 +636,7 @@ export class Dataset { async drop(): Promise { checkStorageAccess(); - await this.client.delete(); + await this.client.drop(); serviceLocator.getStorageInstanceManager().removeFromCache(this); } diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index e04866a63435..57e47cb780b9 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -227,7 +227,7 @@ export class KeyValueStore { checkStorageAccess(); ow(key, ow.string.nonEmpty); - const record = await this.client.getRecord(key); + const record = await this.client.getValue(key); return (record?.value as T) ?? defaultValue ?? null; } @@ -393,21 +393,15 @@ export class KeyValueStore { } // In this case delete the record. 
- if (value === null) return this.client.deleteRecord(key); + if (value === null) return this.client.deleteValue(key); value = maybeStringify(value, optionsCopy); - return this.client.setRecord( - { - key, - value, - contentType: optionsCopy.contentType, - }, - { - timeoutSecs: optionsCopy.timeoutSecs, - doNotRetryTimeouts: optionsCopy.doNotRetryTimeouts, - }, - ); + return this.client.setValue({ + key, + value, + contentType: optionsCopy.contentType, + }); } /** @@ -417,7 +411,7 @@ export class KeyValueStore { async drop(): Promise { checkStorageAccess(); - await this.client.delete(); + await this.client.drop(); serviceLocator.getStorageInstanceManager().removeFromCache(this); } @@ -432,7 +426,7 @@ export class KeyValueStore { * Iterates over key-value store keys, yielding each in turn to an `iteratee` function. * Each invocation of `iteratee` is called with three arguments: `(key, index, info)`, where `key` * is the record key, `index` is a zero-based index of the key in the current iteration - * (regardless of `options.exclusiveStartKey`) and `info` is an object that contains a single property `size` + * and `info` is an object that contains a single property `size` * indicating size of the record in bytes. * * If the `iteratee` function returns a Promise then it is awaited before the next call. 
@@ -452,33 +446,18 @@ export class KeyValueStore { async forEachKey(iteratee: KeyConsumer, options: KeyValueStoreIteratorOptions = {}): Promise { checkStorageAccess(); - return this._forEachKey(iteratee, options); - } - - private async _forEachKey( - iteratee: KeyConsumer, - options: KeyValueStoreIteratorOptions = {}, - index = 0, - ): Promise { - const { exclusiveStartKey, prefix, collection } = options; ow(iteratee, ow.function); ow( options, ow.object.exactShape({ - exclusiveStartKey: ow.optional.string, prefix: ow.optional.string, - collection: ow.optional.string, }), ); - const response = await this.client.listKeys({ exclusiveStartKey, prefix, collection }); - const { nextExclusiveStartKey, isTruncated, items } = response; - for (const item of items) { + let index = 0; + for await (const item of this.client.iterateKeys(options)) { await iteratee(item.key, index++, { size: item.size }); } - return isTruncated - ? this._forEachKey(iteratee, { exclusiveStartKey: nextExclusiveStartKey, prefix, collection }, index) - : undefined; // [].forEach() returns undefined. } /** @@ -498,11 +477,9 @@ export class KeyValueStore { async *keys(options: KeyValueStoreIteratorOptions = {}): AsyncGenerator { checkStorageAccess(); - if (!this.client.keys) { - throw new Error('Resource client is missing the "keys" method.'); + for await (const item of this.client.iterateKeys(options)) { + yield item.key; } - - yield* this.client.keys(options); } /** @@ -519,14 +496,15 @@ export class KeyValueStore { * * @param options Options for the iteration. 
*/ - values(options: KeyValueStoreIteratorOptions = {}): AsyncIterable & Promise { + async *values(options: KeyValueStoreIteratorOptions = {}): AsyncGenerator { checkStorageAccess(); - if (!this.client.values) { - throw new Error('Resource client is missing the "values" method.'); + for await (const item of this.client.iterateKeys(options)) { + const record = await this.client.getValue(item.key); + if (record) { + yield record.value as T; + } } - - return this.client.values(options) as AsyncIterable & Promise; } /** @@ -543,16 +521,17 @@ export class KeyValueStore { * * @param options Options for the iteration. */ - entries( + async *entries( options: KeyValueStoreIteratorOptions = {}, - ): AsyncIterable<[string, T]> & Promise<[string, T][]> { + ): AsyncGenerator<[string, T], void, undefined> { checkStorageAccess(); - if (!this.client.entries) { - throw new Error('Resource client is missing the "entries" method.'); + for await (const item of this.client.iterateKeys(options)) { + const record = await this.client.getValue(item.key); + if (record) { + yield [item.key, record.value as T]; + } } - - return this.client.entries(options) as AsyncIterable<[string, T]> & Promise<[string, T][]>; } /** @@ -579,7 +558,7 @@ export class KeyValueStore { * @param key The key of the record to generate the public URL for. */ async getPublicUrl(key: string): Promise { - return this.client.getRecordPublicUrl(key); + return this.client.getPublicUrl(key); } /** @@ -855,16 +834,8 @@ export interface RecordOptions { } export interface KeyValueStoreIteratorOptions { - /** - * All keys up to this one (including) are skipped from the result. - */ - exclusiveStartKey?: string; /** * If set, only keys that start with this prefix are returned. */ prefix?: string; - /** - * Collection name to use for listing keys. 
- */ - collection?: string; } diff --git a/packages/core/src/storages/request_provider.ts b/packages/core/src/storages/request_provider.ts index cb692c3c7321..6fd6433d83b9 100644 --- a/packages/core/src/storages/request_provider.ts +++ b/packages/core/src/storages/request_provider.ts @@ -722,7 +722,7 @@ export abstract class RequestProvider implements IStorage, IRequestManager { async drop(): Promise { checkStorageAccess(); - await this.client.delete(); + await this.client.drop(); serviceLocator.getStorageInstanceManager().removeFromCache(this); } diff --git a/packages/memory-storage/src/resource-clients/dataset.ts b/packages/memory-storage/src/resource-clients/dataset.ts index 6e0854968d76..99ded54f52a9 100644 --- a/packages/memory-storage/src/resource-clients/dataset.ts +++ b/packages/memory-storage/src/resource-clients/dataset.ts @@ -6,14 +6,11 @@ import { resolve } from 'node:path'; import type * as storage from '@crawlee/types'; import type { Dictionary } from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { move } from 'fs-extra'; import { scheduleBackgroundTask } from '../background-handler/index.js'; -import { StorageTypes } from '../consts.js'; import type { StorageImplementation } from '../fs/common.js'; import { createDatasetStorageImplementation } from '../fs/dataset/index.js'; import type { MemoryStorage } from '../index.js'; -import { createPaginatedEntryList, createPaginatedList } from '../utils.js'; import { BaseClient } from './common/base-client.js'; /** @@ -61,42 +58,7 @@ export class DatasetClient return this.toDatasetInfo(); } - async update(newFields: storage.DatasetClientUpdateOptions = {}): Promise { - const parsed = s - .object({ - name: s.string().lengthGreaterThan(0).optional(), - }) - .parse(newFields); - - // Skip if no changes - if (!parsed.name) { - return this.toDatasetInfo(); - } - - // Check that name is not in use already - const existingStoreByName = this.client.datasetClientCache.find( - (store) => 
store.name?.toLowerCase() === parsed.name!.toLowerCase(), - ); - - if (existingStoreByName) { - this.throwOnDuplicateEntry(StorageTypes.Dataset, 'name', parsed.name); - } - - this.name = parsed.name; - - const previousDir = this.datasetDirectory; - - this.datasetDirectory = resolve(this.client.datasetsDirectory, parsed.name ?? this.name ?? this.id); - - await move(previousDir, this.datasetDirectory, { overwrite: true }); - - // Update timestamps - this.updateTimestamps(true); - - return this.toDatasetInfo(); - } - - async delete(): Promise { + async drop(): Promise { const storeIndex = this.client.datasetClientCache.findIndex((store) => store.id === this.id); if (storeIndex !== -1) { @@ -108,13 +70,25 @@ export class DatasetClient } } - async downloadItems(): Promise { - throw new Error('This method is not implemented in @crawlee/memory-storage'); + async purge(): Promise { + this.itemCount = 0; + this.datasetEntries.clear(); + + // Remove item files from disk but keep the directory + if (this.client.persistStorage) { + const { readdir } = await import('node:fs/promises'); + const entries = await readdir(this.datasetDirectory).catch(() => []); + for (const entry of entries) { + if (entry !== '__metadata__.json') { + await rm(resolve(this.datasetDirectory, entry), { force: true }); + } + } + } + + this.updateTimestamps(true); } - listItems( - options: storage.DatasetClientListOptions = {}, - ): AsyncIterable & Promise> { + getData(options: storage.DatasetClientListOptions = {}): Promise> { const { desc, limit, offset } = s .object({ desc: s.boolean().optional(), @@ -123,21 +97,19 @@ export class DatasetClient }) .parse(options); - return createPaginatedList( - (pageOffset, pageLimit) => - this.listItemsPage({ - desc, - offset: pageOffset, - limit: Math.min(pageLimit, LIST_ITEMS_LIMIT), - }), - { offset, limit }, - ); + return this.getDataPage({ + desc, + offset: offset ?? 0, + limit: Math.min(limit ?? 
LIST_ITEMS_LIMIT, LIST_ITEMS_LIMIT), + }); } - listEntries( - options: storage.DatasetClientListOptions = {}, - ): AsyncIterable<[number, Data]> & Promise> { - const { desc, limit, offset } = s + async *iterateItems(options: storage.DatasetClientListOptions = {}): AsyncIterable { + const { + desc, + limit, + offset: startOffset, + } = s .object({ desc: s.boolean().optional(), limit: s.number().int().optional(), @@ -145,18 +117,30 @@ export class DatasetClient }) .parse(options); - return createPaginatedEntryList( - (pageOffset, pageLimit) => - this.listItemsPage({ - desc, - offset: pageOffset, - limit: Math.min(pageLimit, LIST_ITEMS_LIMIT), - }), - { offset, limit }, - ); + let offset = startOffset ?? 0; + let yielded = 0; + const pageSize = 1000; + + while (true) { + const pageLimit = limit !== undefined ? Math.min(pageSize, limit - yielded) : pageSize; + if (pageLimit <= 0) break; + + const page = await this.getDataPage({ desc, offset, limit: pageLimit }); + + for (const item of page.items) { + yield item; + yielded++; + } + + if (page.items.length < pageLimit || (limit !== undefined && yielded >= limit)) { + break; + } + + offset += page.items.length; + } } - private async listItemsPage(options: storage.DatasetClientListOptions = {}): Promise> { + private async getDataPage(options: storage.DatasetClientListOptions = {}): Promise> { const { limit = LIST_ITEMS_LIMIT, offset = 0, desc } = options; const [start, end] = this.getStartAndEndIndexes( @@ -183,18 +167,8 @@ export class DatasetClient }; } - async pushItems(items: string | Data | string[] | Data[]): Promise { - const rawItems = s - .union([ - s.string(), - s.object({} as Data).passthrough(), - s.array(s.union([s.string(), s.object({} as Data).passthrough()])), - ]) - .parse(items) as Data[]; - - const normalized = this.normalizeItems(rawItems); - - for (const entry of normalized) { + async pushData(items: Data[]): Promise { + for (const entry of items) { const idx = 
this.generateLocalEntryName(++this.itemCount); const storageEntry = createDatasetStorageImplementation({ entityId: idx, @@ -231,39 +205,6 @@ export class DatasetClient return [start, end] as const; } - /** - * To emulate API and split arrays of items into individual dataset items, - * we need to normalize the input items - which can be strings, objects - * or arrays of those - into objects, so that we can save them one by one - * later. We could potentially do this directly with strings, but let's - * not optimize prematurely. - */ - private normalizeItems(items: string | Data | (string | Data)[]): Data[] { - if (typeof items === 'string') { - items = JSON.parse(items); - } - - return Array.isArray(items) ? items.map((item) => this.normalizeItem(item)) : [this.normalizeItem(items)]; - } - - private normalizeItem(item: string | Data): Data { - if (typeof item === 'string') { - item = JSON.parse(item) as Data; - } - - if (Array.isArray(item)) { - throw new Error( - `Each dataset item can only be a single JSON object, not an array. Received: [${item.join(',\n')}]`, - ); - } - - if (typeof item !== 'object' || item === null) { - throw new Error(`Each dataset item must be a JSON object. 
Received: ${item}`); - } - - return item; - } - private updateTimestamps(hasBeenModified: boolean) { this.accessedAt = new Date(); diff --git a/packages/memory-storage/src/resource-clients/key-value-store.ts b/packages/memory-storage/src/resource-clients/key-value-store.ts index 1b8df20eecd8..f0cd4558de58 100644 --- a/packages/memory-storage/src/resource-clients/key-value-store.ts +++ b/packages/memory-storage/src/resource-clients/key-value-store.ts @@ -1,25 +1,20 @@ import { randomUUID } from 'node:crypto'; import { rm } from 'node:fs/promises'; import { resolve } from 'node:path'; -import { Readable } from 'node:stream'; import type * as storage from '@crawlee/types'; import { s } from '@sapphire/shapeshift'; -import { move } from 'fs-extra'; -import mime from 'mime-types'; -import pLimit from 'p-limit'; import { scheduleBackgroundTask } from '../background-handler/index.js'; import { maybeParseBody } from '../body-parser.js'; -import { DEFAULT_API_PARAM_LIMIT, StorageTypes } from '../consts.js'; import type { StorageImplementation } from '../fs/common.js'; import { createKeyValueStorageImplementation } from '../fs/key-value-store/index.js'; import type { MemoryStorage } from '../index.js'; -import { createKeyList, createKeyStringList, createLazyIterablePromise, isBuffer, isStream } from '../utils.js'; +import { isBuffer, isStream } from '../utils.js'; import { BaseClient } from './common/base-client.js'; +import mime from 'mime-types'; const DEFAULT_LOCAL_FILE_EXTENSION = 'bin'; -const GET_RECORD_CONCURRENCY = 25; export interface KeyValueStoreClientOptions { name?: string; @@ -36,7 +31,7 @@ export interface InternalKeyRecord { filePath?: string; } -export class KeyValueStoreClient extends BaseClient { +export class KeyValueStoreClient extends BaseClient implements storage.KeyValueStoreClient { name?: string; createdAt = new Date(); accessedAt = new Date(); @@ -58,42 +53,7 @@ export class KeyValueStoreClient extends BaseClient { return 
this.toKeyValueStoreInfo(); } - async update(newFields: storage.KeyValueStoreClientUpdateOptions = {}): Promise { - const parsed = s - .object({ - name: s.string().lengthGreaterThan(0).optional(), - }) - .parse(newFields); - - // Skip if no changes - if (!parsed.name) { - return this.toKeyValueStoreInfo(); - } - - // Check that name is not in use already - const existingStoreByName = this.client.keyValueStoreCache.find( - (store) => store.name?.toLowerCase() === parsed.name!.toLowerCase(), - ); - - if (existingStoreByName) { - this.throwOnDuplicateEntry(StorageTypes.KeyValueStore, 'name', parsed.name); - } - - this.name = parsed.name; - - const previousDir = this.keyValueStoreDirectory; - - this.keyValueStoreDirectory = resolve(this.client.keyValueStoresDirectory, parsed.name ?? this.name ?? this.id); - - await move(previousDir, this.keyValueStoreDirectory, { overwrite: true }); - - // Update timestamps - this.updateTimestamps(true); - - return this.toKeyValueStoreInfo(); - } - - async delete(): Promise { + async drop(): Promise { const storeIndex = this.client.keyValueStoreCache.findIndex((store) => store.id === this.id); if (storeIndex !== -1) { @@ -104,158 +64,27 @@ export class KeyValueStoreClient extends BaseClient { } } - listKeys( - options: storage.KeyValueStoreClientListOptions = {}, - ): AsyncIterable & Promise { - const { limit, exclusiveStartKey, prefix } = s - .object({ - limit: s.number().greaterThan(0).optional(), - exclusiveStartKey: s.string().optional(), - collection: s.string().optional(), // This is ignored, but kept for validation consistency with API client. - prefix: s.string().optional(), - }) - .parse(options); + async purge(): Promise { + // Delete all entries + const entriesToDelete = [...this.keyValueEntries.entries()]; + for (const [key, entry] of entriesToDelete) { + this.keyValueEntries.delete(key); + await entry.delete(); + } - return createKeyList( - (pageExclusiveStartKey) => - this.listKeysPage({ - limit: limit ?? 
DEFAULT_API_PARAM_LIMIT, - exclusiveStartKey: pageExclusiveStartKey, - prefix, - }), - { exclusiveStartKey, limit }, - ); + this.updateTimestamps(true); } - keys( - options: storage.KeyValueStoreClientListOptions = {}, - ): AsyncIterable & Promise { - const { limit, exclusiveStartKey, prefix } = s + async *iterateKeys( + options: storage.KeyValueStoreIterateKeysOptions = {}, + ): AsyncIterable { + const { prefix } = s .object({ - limit: s.number().greaterThan(0).optional(), - exclusiveStartKey: s.string().optional(), - collection: s.string().optional(), prefix: s.string().optional(), }) .parse(options); - return createKeyStringList( - (pageExclusiveStartKey) => - this.listKeysPage({ - limit: limit ?? DEFAULT_API_PARAM_LIMIT, - exclusiveStartKey: pageExclusiveStartKey, - prefix, - }), - { exclusiveStartKey, limit }, - ); - } - - values(options: storage.KeyValueStoreClientListOptions = {}): AsyncIterable & Promise { - const keys = this.keys.bind(this); - const getRecord = this.getRecord.bind(this); - const limit = options.limit; - - const firstPageKeysPromise = keys(options); - - const getFirstPageValues = async () => { - const firstPageKeys = await firstPageKeysPromise; - const keysToFetch = limit !== undefined ? 
firstPageKeys.items.slice(0, limit) : firstPageKeys.items; - const limiter = pLimit(GET_RECORD_CONCURRENCY); - const results = await Promise.all(keysToFetch.map((item) => limiter(() => getRecord(item.key)))); - return results.filter((r) => r !== undefined).map((r) => r.value); - }; - - async function* asyncGenerator(): AsyncGenerator { - const firstPageKeys = await firstPageKeysPromise; - let yielded = 0; - - for (const item of firstPageKeys.items) { - if (limit !== undefined && yielded >= limit) return; - const record = await getRecord(item.key); - if (record) { - yield record.value; - yielded++; - } - } - - if (firstPageKeys.nextExclusiveStartKey && (limit === undefined || yielded < limit)) { - for await (const key of keys({ - ...options, - exclusiveStartKey: firstPageKeys.nextExclusiveStartKey, - })) { - if (limit !== undefined && yielded >= limit) return; - const record = await getRecord(key); - if (record) { - yield record.value; - yielded++; - } - } - } - } - - return createLazyIterablePromise(getFirstPageValues, asyncGenerator); - } - - entries( - options: storage.KeyValueStoreClientListOptions = {}, - ): AsyncIterable<[string, unknown]> & Promise<[string, unknown][]> { - const keys = this.keys.bind(this); - const getRecord = this.getRecord.bind(this); - const limit = options.limit; - - const firstPageKeysPromise = keys(options); - - const getFirstPageEntries = async () => { - const firstPageKeys = await firstPageKeysPromise; - const keysToFetch = limit !== undefined ? 
firstPageKeys.items.slice(0, limit) : firstPageKeys.items; - const limiter = pLimit(GET_RECORD_CONCURRENCY); - const results = await Promise.all( - keysToFetch.map((item) => - limiter(() => getRecord(item.key).then((record) => ({ key: item.key, record }))), - ), - ); - return results - .filter((r) => r.record !== undefined) - .map((r) => [r.key, r.record!.value] as [string, unknown]); - }; - - async function* asyncGenerator(): AsyncGenerator<[string, unknown]> { - const firstPageKeys = await firstPageKeysPromise; - let yielded = 0; - - for (const item of firstPageKeys.items) { - if (limit !== undefined && yielded >= limit) return; - const record = await getRecord(item.key); - if (record) { - yield [item.key, record.value]; - yielded++; - } - } - - if (firstPageKeys.nextExclusiveStartKey && (limit === undefined || yielded < limit)) { - for await (const key of keys({ - ...options, - exclusiveStartKey: firstPageKeys.nextExclusiveStartKey, - })) { - if (limit !== undefined && yielded >= limit) return; - const record = await getRecord(key); - if (record) { - yield [key, record.value]; - yielded++; - } - } - } - } - - return createLazyIterablePromise(getFirstPageEntries, asyncGenerator); - } - - private async listKeysPage( - options: storage.KeyValueStoreClientListOptions = {}, - ): Promise { - const { limit = DEFAULT_API_PARAM_LIMIT, exclusiveStartKey, prefix } = options; - - const items = []; + const items: storage.KeyValueStoreItemData[] = []; for (const storageEntry of this.keyValueEntries.values()) { const record = await storageEntry.get(); @@ -268,36 +97,15 @@ export class KeyValueStoreClient extends BaseClient { } // Lexically sort to emulate API. 
- // TODO(vladfrangu): ensure the sorting works the same way as before (if it matters) - items.sort((a, b) => { - return a.key.localeCompare(b.key); - }); + items.sort((a, b) => a.key.localeCompare(b.key)); const filteredItems = items.filter((item) => !prefix || item.key.startsWith(prefix)); - let truncatedItems = filteredItems; - if (exclusiveStartKey) { - const keyPos = filteredItems.findIndex((item) => item.key === exclusiveStartKey); - if (keyPos !== -1) truncatedItems = filteredItems.slice(keyPos + 1); - } - - const limitedItems = truncatedItems.slice(0, limit); - - const lastItemInStore = filteredItems.at(-1); - const lastSelectedItem = limitedItems.at(-1); - const isLastSelectedItemAbsolutelyLast = lastItemInStore === lastSelectedItem; - const nextExclusiveStartKey = isLastSelectedItemAbsolutelyLast ? undefined : lastSelectedItem?.key; - this.updateTimestamps(false); - return { - count: limitedItems.length, - limit, - exclusiveStartKey, - isTruncated: !isLastSelectedItemAbsolutelyLast, - nextExclusiveStartKey, - items: limitedItems, - }; + for (const item of filteredItems) { + yield item; + } } /** @@ -306,7 +114,7 @@ export class KeyValueStoreClient extends BaseClient { * Returns `undefined` if the record does not exist or has no associated file path (i.e., it is not stored as a file). * @param key The key of the record to generate the public URL for. */ - async getRecordPublicUrl(key: string): Promise { + async getPublicUrl(key: string): Promise { s.string().parse(key); const storageEntry = await this.keyValueEntries.get(key)?.get(); @@ -326,18 +134,8 @@ export class KeyValueStoreClient extends BaseClient { return this.keyValueEntries.has(key); } - async getRecord( - key: string, - options: storage.KeyValueStoreClientGetRecordOptions = {}, - ): Promise { + async getValue(key: string): Promise { s.string().parse(key); - s.object({ - buffer: s.boolean().optional(), - // These options are ignored, but kept here - // for validation consistency with API client. 
- stream: s.boolean().optional(), - disableRedirect: s.boolean().optional(), - }).parse(options); const storageEntry = this.keyValueEntries.get(key); @@ -353,20 +151,15 @@ export class KeyValueStoreClient extends BaseClient { contentType: entry.contentType ?? (mime.contentType(entry.extension) as string), }; - if (options.stream) { - record.value = Readable.from(record.value); - } else if (options.buffer) { - record.value = Buffer.from(record.value); - } else { - record.value = maybeParseBody(record.value, record.contentType!); - } + // Auto-parse the body (JSON → object, text → string, etc.) + record.value = maybeParseBody(record.value, record.contentType!); this.updateTimestamps(false); return record; } - async setRecord(record: storage.KeyValueStoreRecord): Promise { + async setValue(record: storage.KeyValueStoreRecord): Promise { s.object({ key: s.string().lengthGreaterThan(0), value: s.union([ @@ -437,7 +230,7 @@ export class KeyValueStoreClient extends BaseClient { this.updateTimestamps(true); } - async deleteRecord(key: string): Promise { + async deleteValue(key: string): Promise { s.string().parse(key); const entry = this.keyValueEntries.get(key); diff --git a/packages/memory-storage/src/resource-clients/request-queue.ts b/packages/memory-storage/src/resource-clients/request-queue.ts index 60e253d3d80b..78feebe458a2 100644 --- a/packages/memory-storage/src/resource-clients/request-queue.ts +++ b/packages/memory-storage/src/resource-clients/request-queue.ts @@ -5,12 +5,10 @@ import { resolve } from 'node:path'; import type * as storage from '@crawlee/types'; import { AsyncQueue } from '@sapphire/async-queue'; import { s } from '@sapphire/shapeshift'; -import { move } from 'fs-extra/esm'; import type { RequestQueueFileSystemEntry } from '../fs/request-queue/fs.js'; import type { RequestQueueMemoryEntry } from '../fs/request-queue/memory.js'; import { scheduleBackgroundTask } from '../background-handler/index.js'; -import { StorageTypes } from '../consts.js'; 
import { createRequestQueueStorageImplementation } from '../fs/request-queue/index.js'; import type { MemoryStorage } from '../index.js'; import { purgeNullsFromObject, uniqueKeyToRequestId } from '../utils.js'; @@ -78,45 +76,7 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue return this.toRequestQueueInfo(); } - async update(newFields: { name?: string | undefined }): Promise { - // The validation is intentionally loose to prevent issues - // when swapping to a remote queue in production. - const parsed = s - .object({ - name: s.string().lengthGreaterThan(0).optional(), - }) - .passthrough() - .parse(newFields); - - // Skip if no changes - if (!parsed.name) { - return this.toRequestQueueInfo(); - } - - // Check that name is not in use already - const existingQueueByName = this.client.requestQueueCache.find( - (queue) => queue.name?.toLowerCase() === parsed.name!.toLowerCase(), - ); - - if (existingQueueByName) { - this.throwOnDuplicateEntry(StorageTypes.RequestQueue, 'name', parsed.name); - } - - this.name = parsed.name; - - const previousDir = this.requestQueueDirectory; - - this.requestQueueDirectory = resolve(this.client.requestQueuesDirectory, parsed.name ?? this.name ?? 
this.id); - - await move(previousDir, this.requestQueueDirectory, { overwrite: true }); - - // Update timestamps - this.updateTimestamps(true); - - return this.toRequestQueueInfo(); - } - - async delete(): Promise { + async drop(): Promise { const storeIndex = this.client.requestQueueCache.findIndex((queue) => queue.id === this.id); if (storeIndex !== -1) { @@ -128,6 +88,27 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue } } + async purge(): Promise { + // Clear all in-memory state + this.requests.clear(); + this.forefrontRequestIds = []; + this.handledRequestCount = 0; + this.pendingRequestCount = 0; + + // Remove request files from disk but keep the directory + if (this.client.persistStorage) { + const { readdir } = await import('node:fs/promises'); + const entries = await readdir(this.requestQueueDirectory).catch(() => []); + for (const entry of entries) { + if (entry !== '__metadata__.json') { + await rm(resolve(this.requestQueueDirectory, entry), { force: true }); + } + } + } + + this.updateTimestamps(true); + } + private *requestKeyIterator(rqClient: RequestQueueClient): IterableIterator { for (let i = this.forefrontRequestIds.length - 1; i >= 0; i--) { yield this.forefrontRequestIds[i]; @@ -520,25 +501,6 @@ export class RequestQueueClient extends BaseClient implements storage.RequestQue }; } - async deleteRequest(id: string): Promise { - const entry = this.requests.get(id); - - if (entry) { - const request = await entry.get(); - - this.requests.delete(id); - this.updateTimestamps(true); - - if (request.orderNo) { - this.pendingRequestCount -= 1; - } else { - this.handledRequestCount -= 1; - } - - await entry.delete(); - } - } - toRequestQueueInfo(): storage.RequestQueueInfo { return { accessedAt: this.accessedAt, diff --git a/packages/memory-storage/src/utils.ts b/packages/memory-storage/src/utils.ts index 94fb8460dc90..f62476328d39 100644 --- a/packages/memory-storage/src/utils.ts +++ 
b/packages/memory-storage/src/utils.ts @@ -107,153 +107,3 @@ export function createPaginatedList( value: asyncGenerator, }) as AsyncIterable & Promise>; } - -/** - * Creates a hybrid Promise + AsyncIterable for cursor-based pagination (KeyValueStore.listKeys). - * - * The returned object can be: - * - Awaited directly to get the first page (backward compatible) - * - Used with `for await...of` to iterate through all keys - */ -export function createKeyList( - getPage: (exclusiveStartKey?: string) => Promise, - options: { exclusiveStartKey?: string; limit?: number } = {}, -): AsyncIterable & Promise { - // Immediately fetch the first page - const firstPagePromise = getPage(options.exclusiveStartKey); - - async function* asyncGenerator(): AsyncGenerator { - let currentPage = await firstPagePromise; - yield* currentPage.items; - - let remainingItems = options.limit ? options.limit - currentPage.items.length : undefined; - - while ( - currentPage.items.length > 0 && - currentPage.nextExclusiveStartKey !== undefined && - (remainingItems === undefined || remainingItems > 0) - ) { - currentPage = await getPage(currentPage.nextExclusiveStartKey); - yield* currentPage.items; - if (remainingItems !== undefined) { - remainingItems -= currentPage.items.length; - } - } - } - - return Object.defineProperty(firstPagePromise, Symbol.asyncIterator, { - value: asyncGenerator, - }) as AsyncIterable & Promise; -} - -/** - * Creates a hybrid Promise + AsyncIterable that yields only key strings (KeyValueStore.keys). 
- * - * The returned object can be: - * - Awaited directly to get the first page (backward compatible) - * - Used with `for await...of` to iterate through all key strings - */ -export function createKeyStringList( - getPage: (exclusiveStartKey?: string) => Promise, - options: { exclusiveStartKey?: string; limit?: number } = {}, -): AsyncIterable & Promise { - // Immediately fetch the first page - const firstPagePromise = getPage(options.exclusiveStartKey); - - async function* asyncGenerator(): AsyncGenerator { - let currentPage = await firstPagePromise; - for (const item of currentPage.items) { - yield item.key; - } - - let remainingItems = options.limit ? options.limit - currentPage.items.length : undefined; - - while ( - currentPage.items.length > 0 && - currentPage.nextExclusiveStartKey !== undefined && - (remainingItems === undefined || remainingItems > 0) - ) { - currentPage = await getPage(currentPage.nextExclusiveStartKey); - for (const item of currentPage.items) { - yield item.key; - } - if (remainingItems !== undefined) { - remainingItems -= currentPage.items.length; - } - } - } - - return Object.defineProperty(firstPagePromise, Symbol.asyncIterator, { - value: asyncGenerator, - }) as AsyncIterable & Promise; -} - -/** - * Creates a hybrid Promise + AsyncIterable for offset-based pagination with index-value entries (Dataset.listEntries). - * - * The returned object can be: - * - Awaited directly to get the first page with [index, item] tuples (backward compatible) - * - Used with `for await...of` to iterate through all entries as [index, item] tuples - */ -export function createPaginatedEntryList( - getPage: (offset: number, limit: number) => Promise>, - options: { offset?: number; limit?: number } = {}, -): AsyncIterable<[number, Data]> & Promise> { - const offset = options.offset ?? 0; - - // Immediately fetch the first page and transform items to entries - const firstPagePromise = getPage(offset, options.limit ?? 
Infinity).then((result) => ({ - ...result, - items: result.items.map((item, i) => [offset + i, item] as [number, Data]), - })); - - async function* asyncGenerator(): AsyncGenerator<[number, Data]> { - let currentIndex = offset; - for await (const item of createPaginatedList(getPage, options)) { - yield [currentIndex++, item]; - } - } - - return Object.defineProperty(firstPagePromise, Symbol.asyncIterator, { - value: asyncGenerator, - }) as AsyncIterable<[number, Data]> & Promise>; -} - -/** - * Creates an object that acts as both a lazy Promise and an AsyncIterable. - * - When awaited, it triggers `promiseFactory` (bulk fetch, cached after first call). - * - When iterated with `for await...of`, it uses `iteratorFactory` (streaming, no bulk fetch). - */ -export function createLazyIterablePromise( - promiseFactory: () => Promise, - iteratorFactory: () => AsyncGenerator, -): AsyncIterable & Promise { - let cached: Promise | null = null; - function getOrCreate(): Promise { - if (!cached) { - cached = promiseFactory(); - } - return cached; - } - - const result = { - then( - onfulfilled?: ((value: TPromise) => TResult1 | PromiseLike) | null, - onrejected?: ((reason: any) => TResult2 | PromiseLike) | null, - ): Promise { - return getOrCreate().then(onfulfilled, onrejected); - }, - catch( - onrejected?: ((reason: any) => TResult | PromiseLike) | null, - ): Promise { - return getOrCreate().catch(onrejected); - }, - finally(onfinally?: (() => void) | null): Promise { - return getOrCreate().finally(onfinally); - }, - [Symbol.asyncIterator]: iteratorFactory, - [Symbol.toStringTag]: 'Promise' as const, - }; - - return result as AsyncIterable & Promise; -} diff --git a/packages/memory-storage/test/async-iteration.test.ts b/packages/memory-storage/test/async-iteration.test.ts index 71c7a4c35dd1..a1acab02bf2f 100644 --- a/packages/memory-storage/test/async-iteration.test.ts +++ b/packages/memory-storage/test/async-iteration.test.ts @@ -3,9 +3,6 @@ import path from 'node:path'; 
import { MemoryStorage } from '@crawlee/memory-storage'; import type { DatasetClient, KeyValueStoreClient } from '@crawlee/types'; -import { vi } from 'vitest'; - -import { createLazyIterablePromise } from '../src/utils'; describe('Async iteration support', () => { const localDataDirectory = path.resolve(__dirname, './tmp/async-iteration'); @@ -18,17 +15,19 @@ describe('Async iteration support', () => { await rm(localDataDirectory, { force: true, recursive: true }); }); - describe('Dataset.listItems', () => { + describe('Dataset.getData / iterateItems', () => { const elements = Array.from({ length: 25 }, (_, i) => ({ index: i })); - let dataset: DatasetClient; + let dataset: DatasetClient<{ index: number }>; beforeAll(async () => { - dataset = await storage.createDatasetClient({ name: 'async-iteration-dataset' }); - await dataset.pushItems(elements); + dataset = (await storage.createDatasetClient({ name: 'async-iteration-dataset' })) as DatasetClient<{ + index: number; + }>; + await dataset.pushData(elements); }); - test('can be awaited directly (backward compatibility)', async () => { - const result = await dataset.listItems({ limit: 10 }); + test('getData returns a paginated result', async () => { + const result = await dataset.getData({ limit: 10 }); expect(result.items).toHaveLength(10); expect(result.total).toBe(25); @@ -39,7 +38,7 @@ describe('Async iteration support', () => { test('can be used with for await...of to iterate all items', async () => { const items: { index: number }[] = []; - for await (const item of dataset.listItems()) { + for await (const item of dataset.iterateItems()) { items.push(item); } @@ -50,7 +49,7 @@ describe('Async iteration support', () => { test('respects limit option when iterating', async () => { const items: { index: number }[] = []; - for await (const item of dataset.listItems({ limit: 10 })) { + for await (const item of dataset.iterateItems({ limit: 10 })) { items.push(item); } @@ -61,7 +60,7 @@ describe('Async iteration 
support', () => { test('respects offset option when iterating', async () => { const items: { index: number }[] = []; - for await (const item of dataset.listItems({ offset: 5 })) { + for await (const item of dataset.iterateItems({ offset: 5 })) { items.push(item); } @@ -72,7 +71,7 @@ describe('Async iteration support', () => { test('respects both offset and limit options when iterating', async () => { const items: { index: number }[] = []; - for await (const item of dataset.listItems({ offset: 5, limit: 10 })) { + for await (const item of dataset.iterateItems({ offset: 5, limit: 10 })) { items.push(item); } @@ -83,7 +82,7 @@ describe('Async iteration support', () => { test('respects desc option when iterating', async () => { const items: { index: number }[] = []; - for await (const item of dataset.listItems({ desc: true, limit: 5 })) { + for await (const item of dataset.iterateItems({ desc: true, limit: 5 })) { items.push(item); } @@ -92,7 +91,7 @@ describe('Async iteration support', () => { }); }); - describe('KeyValueStore.listKeys', () => { + describe('KeyValueStore.iterateKeys', () => { const keys = Array.from({ length: 25 }, (_, i) => `key-${String(i).padStart(2, '0')}`); let kvStore: KeyValueStoreClient; @@ -100,22 +99,14 @@ describe('Async iteration support', () => { kvStore = await storage.createKeyValueStoreClient({ name: 'async-iteration-kvs' }); for (const key of keys) { - await kvStore.setRecord({ key, value: { data: key } }); + await kvStore.setValue({ key, value: { data: key } }); } }); - test('can be awaited directly (backward compatibility)', async () => { - const result = await kvStore.listKeys({ limit: 10 }); - - expect(result.items).toHaveLength(10); - expect(result.isTruncated).toBe(true); - expect(result.items.map((i) => i.key)).toStrictEqual(keys.slice(0, 10)); - }); - test('can be used with for await...of to iterate all keys', async () => { const items: string[] = []; - for await (const item of kvStore.listKeys()) { + for await (const item of 
kvStore.iterateKeys()) { items.push(item.key); } @@ -123,42 +114,11 @@ describe('Async iteration support', () => { expect(items).toStrictEqual(keys); }); - test('respects limit option when iterating (10 items, limit 2)', async () => { - // Create a fresh store with exactly 10 items to match the reported bug scenario - const testStore = await storage.createKeyValueStoreClient({ name: 'limit-test-kvs' }); - - for (let i = 0; i < 10; i++) { - await testStore.setRecord({ key: `key-${i}`, value: `value-${i}` }); - } - - const items: string[] = []; - - // This should only return 2 items, matching apify-client behavior - for await (const item of testStore.listKeys({ limit: 2 })) { - items.push(item.key); - } - - // Should only get 2 items, not all 10 - expect(items).toHaveLength(2); - }); - - test('respects exclusiveStartKey option when iterating', async () => { - const items: string[] = []; - - // Start after key-04 (index 4), should get keys 5-24 - for await (const item of kvStore.listKeys({ exclusiveStartKey: 'key-04' })) { - items.push(item.key); - } - - expect(items).toHaveLength(20); - expect(items).toStrictEqual(keys.slice(5)); - }); - test('respects prefix option when iterating', async () => { const items: string[] = []; // Only keys starting with 'key-0' (key-00 to key-09) - for await (const item of kvStore.listKeys({ prefix: 'key-0' })) { + for await (const item of kvStore.iterateKeys({ prefix: 'key-0' })) { items.push(item.key); } @@ -166,394 +126,4 @@ describe('Async iteration support', () => { expect(items).toStrictEqual(keys.slice(0, 10)); }); }); - - describe('KeyValueStore.keys', () => { - const keys = Array.from({ length: 25 }, (_, i) => `key-${String(i).padStart(2, '0')}`); - let kvStore: KeyValueStoreClient; - - beforeAll(async () => { - kvStore = await storage.createKeyValueStoreClient({ name: 'async-iteration-kvs-keys' }); - - for (const key of keys) { - await kvStore.setRecord({ key, value: { data: key } }); - } - }); - - test('can be awaited 
directly (backward compatibility)', async () => { - const result = await kvStore.keys({ limit: 10 }); - - // When awaited, returns the same structure as listKeys - expect(result.items).toHaveLength(10); - expect(result.isTruncated).toBe(true); - expect(result.items.map((i) => i.key)).toStrictEqual(keys.slice(0, 10)); - }); - - test('can be used with for await...of to iterate all keys as strings', async () => { - const items: string[] = []; - - for await (const key of kvStore.keys()) { - items.push(key); - } - - expect(items).toHaveLength(25); - expect(items).toStrictEqual(keys); - }); - - test('yields strings directly, not objects', async () => { - // eslint-disable-next-line no-unreachable-loop - for await (const key of kvStore.keys()) { - expect(typeof key).toBe('string'); - break; // Only need to check the first one - } - }); - - test('respects limit option when iterating', async () => { - const items: string[] = []; - - for await (const key of kvStore.keys({ limit: 10 })) { - items.push(key); - } - - expect(items).toHaveLength(10); - expect(items).toStrictEqual(keys.slice(0, 10)); - }); - - test('respects exclusiveStartKey option when iterating', async () => { - const items: string[] = []; - - // Start after key-04 (index 4), should get keys 5-24 - for await (const key of kvStore.keys({ exclusiveStartKey: 'key-04' })) { - items.push(key); - } - - expect(items).toHaveLength(20); - expect(items).toStrictEqual(keys.slice(5)); - }); - - test('respects prefix option when iterating', async () => { - const items: string[] = []; - - // Only keys starting with 'key-0' (key-00 to key-09) - for await (const key of kvStore.keys({ prefix: 'key-0' })) { - items.push(key); - } - - expect(items).toHaveLength(10); - expect(items).toStrictEqual(keys.slice(0, 10)); - }); - - test('respects both exclusiveStartKey and limit options', async () => { - const items: string[] = []; - - for await (const key of kvStore.keys({ exclusiveStartKey: 'key-04', limit: 5 })) { - items.push(key); 
- } - - expect(items).toHaveLength(5); - expect(items).toStrictEqual(keys.slice(5, 10)); - }); - }); - - describe('KeyValueStore.values', () => { - const keys = Array.from({ length: 25 }, (_, i) => `key-${String(i).padStart(2, '0')}`); - let kvStore: KeyValueStoreClient; - - beforeAll(async () => { - kvStore = await storage.createKeyValueStoreClient({ name: 'async-iteration-kvs-values' }); - - for (const key of keys) { - await kvStore.setRecord({ key, value: { data: key } }); - } - }); - - test('can be awaited directly (backward compatibility)', async () => { - const values = await kvStore.values({ limit: 10 }); - - expect(values).toHaveLength(10); - expect(Array.isArray(values)).toBe(true); - expect(values[0]).toStrictEqual({ data: 'key-00' }); - }); - - test('can be used with for await...of to iterate all values', async () => { - const values: unknown[] = []; - - for await (const value of kvStore.values()) { - values.push(value); - } - - expect(values).toHaveLength(25); - expect(values.every((v) => v && typeof v === 'object')).toBe(true); - }); - - test('yields values directly, not KeyValueStoreRecord objects', async () => { - // eslint-disable-next-line no-unreachable-loop - for await (const value of kvStore.values()) { - // Should be the actual value, not a record wrapper - expect(value).toStrictEqual({ data: 'key-00' }); - expect(value).not.toHaveProperty('contentType'); - break; // Only need to check the first one - } - }); - - test('respects limit option when iterating', async () => { - const values: unknown[] = []; - - for await (const value of kvStore.values({ limit: 10 })) { - values.push(value); - } - - expect(values).toHaveLength(10); - }); - - test('respects exclusiveStartKey option when iterating', async () => { - const values: unknown[] = []; - - // Start after key-04 (index 4), should get keys 5-24 - for await (const value of kvStore.values({ exclusiveStartKey: 'key-04' })) { - values.push(value); - } - - expect(values).toHaveLength(20); - }); - - 
test('respects prefix option when iterating', async () => { - const values: unknown[] = []; - - // Only keys starting with 'key-0' (key-00 to key-09) - for await (const value of kvStore.values({ prefix: 'key-0' })) { - values.push(value); - } - - expect(values).toHaveLength(10); - }); - - test('fetches actual record values', async () => { - const values: unknown[] = []; - - for await (const value of kvStore.values({ limit: 3 })) { - values.push(value); - } - - expect(values[0]).toStrictEqual({ data: 'key-00' }); - expect(values[1]).toStrictEqual({ data: 'key-01' }); - expect(values[2]).toStrictEqual({ data: 'key-02' }); - }); - }); - - describe('KeyValueStore.entries', () => { - const keys = Array.from({ length: 25 }, (_, i) => `key-${String(i).padStart(2, '0')}`); - let kvStore: KeyValueStoreClient; - - beforeAll(async () => { - kvStore = await storage.createKeyValueStoreClient({ name: 'async-iteration-kvs-entries' }); - - for (const key of keys) { - await kvStore.setRecord({ key, value: { data: key } }); - } - }); - - test('can be awaited directly (backward compatibility)', async () => { - const entries = await kvStore.entries({ limit: 10 }); - - expect(entries).toHaveLength(10); - expect(Array.isArray(entries)).toBe(true); - // Each entry is a [key, value] tuple - expect(entries[0][0]).toBe('key-00'); - expect(entries[0][1]).toStrictEqual({ data: 'key-00' }); - }); - - test('can be used with for await...of to iterate all entries', async () => { - const entries: [string, unknown][] = []; - - for await (const entry of kvStore.entries()) { - entries.push(entry); - } - - expect(entries).toHaveLength(25); - expect(entries.map(([key]) => key)).toStrictEqual(keys); - }); - - test('yields [key, value] tuples', async () => { - // eslint-disable-next-line no-unreachable-loop - for await (const [key, value] of kvStore.entries()) { - expect(typeof key).toBe('string'); - expect(key).toBe('key-00'); - expect(value).toStrictEqual({ data: 'key-00' }); - // Value should not be a 
record wrapper - expect(value).not.toHaveProperty('contentType'); - break; // Only need to check the first one - } - }); - - test('respects limit option when iterating', async () => { - const entries: [string, unknown][] = []; - - for await (const entry of kvStore.entries({ limit: 10 })) { - entries.push(entry); - } - - expect(entries).toHaveLength(10); - expect(entries.map(([key]) => key)).toStrictEqual(keys.slice(0, 10)); - }); - - test('respects exclusiveStartKey option when iterating', async () => { - const entries: [string, unknown][] = []; - - // Start after key-04 (index 4), should get keys 5-24 - for await (const entry of kvStore.entries({ exclusiveStartKey: 'key-04' })) { - entries.push(entry); - } - - expect(entries).toHaveLength(20); - expect(entries.map(([key]) => key)).toStrictEqual(keys.slice(5)); - }); - - test('respects prefix option when iterating', async () => { - const entries: [string, unknown][] = []; - - // Only keys starting with 'key-0' (key-00 to key-09) - for await (const entry of kvStore.entries({ prefix: 'key-0' })) { - entries.push(entry); - } - - expect(entries).toHaveLength(10); - expect(entries.map(([key]) => key)).toStrictEqual(keys.slice(0, 10)); - }); - - test('values in entries match expected data', async () => { - for await (const [key, value] of kvStore.entries({ limit: 5 })) { - expect(value).toStrictEqual({ data: key }); - } - }); - }); - - describe('createLazyIterablePromise', () => { - test('promise factory is not called until awaited', async () => { - const promiseFactory = vi.fn(() => Promise.resolve([1, 2, 3])); - async function* iteratorFactory() { - yield 1; - yield 2; - yield 3; - } - - const result = createLazyIterablePromise(promiseFactory, iteratorFactory); - - // Factory should not be called yet - expect(promiseFactory).not.toHaveBeenCalled(); - - // Now await it - const values = await result; - expect(promiseFactory).toHaveBeenCalledTimes(1); - expect(values).toStrictEqual([1, 2, 3]); - }); - - test('iterating 
does not trigger the promise factory', async () => { - const promiseFactory = vi.fn(() => Promise.resolve([1, 2, 3])); - async function* iteratorFactory() { - yield 10; - yield 20; - yield 30; - } - - const result = createLazyIterablePromise(promiseFactory, iteratorFactory); - - const items: number[] = []; - for await (const item of result) { - items.push(item); - } - - expect(items).toStrictEqual([10, 20, 30]); - expect(promiseFactory).not.toHaveBeenCalled(); - }); - - test('promise factory result is cached across multiple awaits', async () => { - const promiseFactory = vi.fn(() => Promise.resolve([1, 2, 3])); - async function* iteratorFactory() { - yield 1; - } - - const result = createLazyIterablePromise(promiseFactory, iteratorFactory); - - await result; - await result; - await result; - - expect(promiseFactory).toHaveBeenCalledTimes(1); - }); - }); - - describe('KeyValueStore.values lazy promise behavior', () => { - let kvStore: KeyValueStoreClient; - - beforeAll(async () => { - kvStore = await storage.createKeyValueStoreClient({ name: 'lazy-test-kvs-values' }); - - for (let i = 0; i < 5; i++) { - await kvStore.setRecord({ key: `key-${i}`, value: { data: i } }); - } - }); - - test('calling values() does not immediately fetch records', async () => { - const getRecordSpy = vi.spyOn(kvStore, 'getRecord'); - - // Call values() but do not await or iterate - const result = kvStore.values(); - - // getRecord should not have been called yet (lazy) - // Note: keys may be fetched eagerly, but record values should not - // We need to wait a tick to ensure no async work triggered getRecord - await new Promise((resolve) => setTimeout(resolve, 50)); - expect(getRecordSpy).not.toHaveBeenCalled(); - - // Clean up: consume the result to avoid dangling promises - await result; - getRecordSpy.mockRestore(); - }); - - test('iterating and awaiting produce the same values', async () => { - const awaited = await kvStore.values(); - - const iterated: unknown[] = []; - for await 
(const value of kvStore.values()) { - iterated.push(value); - } - - expect(awaited).toStrictEqual(iterated); - }); - }); - - describe('KeyValueStore.entries lazy promise behavior', () => { - let kvStore: KeyValueStoreClient; - - beforeAll(async () => { - kvStore = await storage.createKeyValueStoreClient({ name: 'lazy-test-kvs-entries' }); - - for (let i = 0; i < 5; i++) { - await kvStore.setRecord({ key: `key-${i}`, value: { data: i } }); - } - }); - - test('calling entries() does not immediately fetch records', async () => { - const getRecordSpy = vi.spyOn(kvStore, 'getRecord'); - - const result = kvStore.entries(); - - await new Promise((resolve) => setTimeout(resolve, 50)); - expect(getRecordSpy).not.toHaveBeenCalled(); - - await result; - getRecordSpy.mockRestore(); - }); - - test('iterating and awaiting produce the same entries', async () => { - const awaited = await kvStore.entries(); - - const iterated: [string, unknown][] = []; - for await (const entry of kvStore.entries()) { - iterated.push(entry); - } - - expect(awaited).toStrictEqual(iterated); - }); - }); }); diff --git a/packages/memory-storage/test/fs-fallback.test.ts b/packages/memory-storage/test/fs-fallback.test.ts index ca8ac5ff7c59..21c4f6434b91 100644 --- a/packages/memory-storage/test/fs-fallback.test.ts +++ b/packages/memory-storage/test/fs-fallback.test.ts @@ -61,7 +61,7 @@ describe('fallback to fs for reading', () => { expect(defaultStoreInfo.name).toEqual('default'); expect(defaultStoreInfo.createdAt).toEqual(expectedFsDate); - const input = await defaultStore.getRecord('INPUT'); + const input = await defaultStore.getValue('INPUT'); expect(input).toStrictEqual({ key: 'INPUT', value: { foo: 'bar but from fs' }, @@ -72,7 +72,7 @@ describe('fallback to fs for reading', () => { test('attempting to read "other" key value store with no "__metadata__" present should read from fs, even if accessed without generating id first', async () => { const otherStore = await 
storage.createKeyValueStoreClient({ name: 'other' }); - const input = await otherStore.getRecord('INPUT'); + const input = await otherStore.getValue('INPUT'); expect(input).toStrictEqual({ key: 'INPUT', value: { foo: 'bar but from fs' }, @@ -89,7 +89,7 @@ describe('fallback to fs for reading', () => { test('attempting to read "no-ext" key value store should load the missing extension file correctly', async () => { const noExtStore = await storage.createKeyValueStoreClient({ name: 'no-ext' }); - const input = await noExtStore.getRecord('INPUT'); + const input = await noExtStore.getValue('INPUT'); expect(input).toStrictEqual({ key: 'INPUT', value: JSON.stringify({ foo: 'bar but from fs' }), @@ -100,7 +100,7 @@ describe('fallback to fs for reading', () => { test('attempting to read "invalid-json" key value store should ignore the invalid "INPUT" json file', async () => { const invalidJsonStore = await storage.createKeyValueStoreClient({ name: 'invalid-json' }); - const input = await invalidJsonStore.getRecord('INPUT'); + const input = await invalidJsonStore.getValue('INPUT'); expect(input).toBeUndefined(); }); }); diff --git a/packages/memory-storage/test/key-value-store/stream.test.ts b/packages/memory-storage/test/key-value-store/stream.test.ts index 8577a4266a15..bfbd05cd3931 100644 --- a/packages/memory-storage/test/key-value-store/stream.test.ts +++ b/packages/memory-storage/test/key-value-store/stream.test.ts @@ -12,11 +12,11 @@ describe('KeyValueStore should drain streams when setting records', () => { test('should drain stream', async () => { const defaultStore = await storage.createKeyValueStoreClient({ name: 'default' }); - await defaultStore.setRecord({ key: 'streamz', value: fsStream, contentType: 'text/plain' }); + await defaultStore.setValue({ key: 'streamz', value: fsStream, contentType: 'text/plain' }); expect(fsStream.destroyed).toBeTruthy(); - const record = await defaultStore.getRecord('streamz'); + const record = await 
defaultStore.getValue('streamz'); expect(record!.value.toString('utf8')).toEqual('helloworld'); }); }); diff --git a/packages/memory-storage/test/no-crash-on-big-buffers.test.ts b/packages/memory-storage/test/no-crash-on-big-buffers.test.ts index 473bc0b3f423..136960b149a0 100644 --- a/packages/memory-storage/test/no-crash-on-big-buffers.test.ts +++ b/packages/memory-storage/test/no-crash-on-big-buffers.test.ts @@ -36,7 +36,7 @@ describe('MemoryStorage should not crash when saving a big buffer', () => { } try { - await store.setRecord({ key: 'owo.zip', value: zip }); + await store.setValue({ key: 'owo.zip', value: zip }); } catch (err) { expect(err).not.toBeDefined(); } diff --git a/packages/memory-storage/test/no-writing-to-disk.test.ts b/packages/memory-storage/test/no-writing-to-disk.test.ts index b7cc86f5234d..9006957a2484 100644 --- a/packages/memory-storage/test/no-writing-to-disk.test.ts +++ b/packages/memory-storage/test/no-writing-to-disk.test.ts @@ -21,7 +21,7 @@ describe('persistStorage option', () => { test('creating a key-value pair in a key-value store should not write data to the disk', async () => { const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setRecord({ key: 'foo', value: 'test' }); + await keyValueStore.setValue({ key: 'foo', value: 'test' }); // We check that reading the directory for the store throws an error, which means it wasn't created on disk await expect(async () => readdir(localDataDirectory)).rejects.toThrow(); @@ -38,7 +38,7 @@ describe('persistStorage option', () => { test('creating a key-value pair in a key-value store should not write data to the disk, but it should write the __metadata__ file', async () => { const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setRecord({ key: 'foo', value: 'test' }); + await keyValueStore.setValue({ key: 'foo', value: 'test' }); const keyValueStoreInfo = await keyValueStore.getMetadata(); const storePath = 
resolve(storage.keyValueStoresDirectory, keyValueStoreInfo.id); diff --git a/packages/memory-storage/test/request-queue/forefront.test.ts b/packages/memory-storage/test/request-queue/forefront.test.ts index 45553b59990a..7cb02e60862c 100644 --- a/packages/memory-storage/test/request-queue/forefront.test.ts +++ b/packages/memory-storage/test/request-queue/forefront.test.ts @@ -15,7 +15,7 @@ describe('RequestQueueV1 respects `forefront` in `listHead`', () => { }); afterEach(async () => { - await requestQueue.delete(); + await requestQueue.drop(); }); test('requests without `forefront` respect sequential order', async () => { @@ -202,7 +202,7 @@ describe('RequestQueueV2 respects `forefront` in `listAndLockHead`', () => { }); afterEach(async () => { - await requestQueue.delete(); + await requestQueue.drop(); }); test('requests without `forefront` respect sequential order', async () => { diff --git a/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts b/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts index 8d23a4cfbdbf..38674f73e202 100644 --- a/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts +++ b/packages/memory-storage/test/request-queue/handledRequestCount-should-update.test.ts @@ -36,17 +36,4 @@ describe('RequestQueue handledRequestCount should update', () => { const updatedStatistics = await requestQueue.getMetadata(); expect(updatedStatistics.handledRequestCount).toEqual(2); }); - - test('deleting a request should decrement the handledRequestCount', async () => { - const { requestId } = await requestQueue.addRequest({ - url: 'http://example.com/3', - uniqueKey: '3', - handledAt: new Date().toISOString(), - }); - - await requestQueue.deleteRequest(requestId); - - const updatedStatistics = await requestQueue.getMetadata(); - expect(updatedStatistics.handledRequestCount).toEqual(2); - }); }); diff --git 
a/packages/memory-storage/test/reverse-datataset-list.test.ts b/packages/memory-storage/test/reverse-datataset-list.test.ts index d28553795723..28b0388c8a99 100644 --- a/packages/memory-storage/test/reverse-datataset-list.test.ts +++ b/packages/memory-storage/test/reverse-datataset-list.test.ts @@ -6,7 +6,7 @@ import type { DatasetClient } from '@crawlee/types'; const elements = Array.from({ length: 10 }, (_, i) => ({ number: i })); -describe('Dataset#listItems respects the desc option', () => { +describe('Dataset#getData respects the desc option', () => { const localDataDirectory = resolve(import.meta.dirname, './tmp/desc'); const storage = new MemoryStorage({ localDataDirectory, @@ -22,32 +22,32 @@ describe('Dataset#listItems respects the desc option', () => { beforeAll(async () => { dataset = await storage.createDatasetClient({ name: 'false' }); - await dataset.pushItems(elements); + await dataset.pushData(elements); }); test('with desc: false', async () => { - const result = await dataset.listItems({ desc: false, limit: 5 }); + const result = await dataset.getData({ desc: false, limit: 5 }); expect(result.items).toHaveLength(5); expect(result.items).toStrictEqual(elements.slice(0, 5)); }); test('with desc: true', async () => { - const result = await dataset.listItems({ desc: true, limit: 5 }); + const result = await dataset.getData({ desc: true, limit: 5 }); expect(result.items).toHaveLength(5); expect(result.items).toStrictEqual(elements.slice().reverse().slice(0, 5)); }); test('with desc: false and offset: 2', async () => { - const result = await dataset.listItems({ desc: false, limit: 5, offset: 2 }); + const result = await dataset.getData({ desc: false, limit: 5, offset: 2 }); expect(result.items).toHaveLength(5); expect(result.items).toStrictEqual(elements.slice(2, 7)); }); test('with desc: true and offset: 2', async () => { - const result = await dataset.listItems({ desc: true, limit: 5, offset: 2 }); + const result = await dataset.getData({ desc: true, 
limit: 5, offset: 2 }); expect(result.items).toHaveLength(5); expect(result.items).toStrictEqual(elements.slice().reverse().slice(2, 7)); diff --git a/packages/memory-storage/test/write-metadata.test.ts b/packages/memory-storage/test/write-metadata.test.ts index 68a465a902ee..80a6686acca6 100644 --- a/packages/memory-storage/test/write-metadata.test.ts +++ b/packages/memory-storage/test/write-metadata.test.ts @@ -30,7 +30,7 @@ describe('writeMetadata option', () => { test('creating a key-value pair in a key-value store should not write __metadata__.json file for the value', async () => { const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setRecord({ key: 'foo', value: 'test' }); + await keyValueStore.setValue({ key: 'foo', value: 'test' }); const keyValueStoreInfo = await keyValueStore.getMetadata(); const expectedFilePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.txt`); @@ -62,7 +62,7 @@ describe('writeMetadata option', () => { test('creating a key-value pair in a key-value store should write __metadata__.json file for the value', async () => { const keyValueStore = await storage.createKeyValueStoreClient(); - await keyValueStore.setRecord({ key: 'foo', value: 'test' }); + await keyValueStore.setValue({ key: 'foo', value: 'test' }); const keyValueStoreInfo = await keyValueStore.getMetadata(); const expectedFilePath = resolve(storage.keyValueStoresDirectory, `${keyValueStoreInfo.id}/foo.txt`); diff --git a/packages/types/src/storages.ts b/packages/types/src/storages.ts index f0d11453c537..7ce0f8f853b7 100644 --- a/packages/types/src/storages.ts +++ b/packages/types/src/storages.ts @@ -30,10 +30,6 @@ export interface PaginatedList { items: Data[]; } -export interface DatasetClientUpdateOptions { - name?: string; -} - export interface DatasetClientListOptions { desc?: boolean; limit?: number; @@ -66,14 +62,21 @@ export interface DatasetClient { * for a storage that has been removed. 
*/ getMetadata(): Promise; - update(newFields: DatasetClientUpdateOptions): Promise>; - delete(): Promise; - downloadItems(...args: unknown[]): Promise; - listItems(options?: DatasetClientListOptions): AsyncIterable & Promise>; - listEntries?( - options?: DatasetClientListOptions, - ): AsyncIterable<[number, Data]> & Promise>; - pushItems(items: Data | Data[] | string | string[]): Promise; + + /** Remove the dataset and all its data. */ + drop(): Promise; + + /** Remove all items from the dataset but keep the dataset itself. */ + purge(): Promise; + + /** Add items to the dataset. */ + pushData(items: Data[]): Promise; + + /** Fetch a page of items from the dataset. */ + getData(options?: DatasetClientListOptions): Promise>; + + /** Iterate over all items in the dataset. */ + iterateItems(options?: DatasetClientListOptions): AsyncIterable; } export interface KeyValueStoreStats { @@ -102,19 +105,8 @@ export interface KeyValueStoreRecord { contentType?: string; } -export interface KeyValueStoreRecordOptions { - timeoutSecs?: number; - doNotRetryTimeouts?: boolean; -} - -export interface KeyValueStoreClientUpdateOptions { - name?: string; -} - -export interface KeyValueStoreClientListOptions { - limit?: number; - exclusiveStartKey?: string; - collection?: string; +export interface KeyValueStoreIterateKeysOptions { + /** If set, only keys that start with this prefix are returned. */ prefix?: string; } @@ -123,20 +115,6 @@ export interface KeyValueStoreItemData { size: number; } -export interface KeyValueStoreClientListData { - count: number; - limit: number; - exclusiveStartKey?: string; - isTruncated: boolean; - nextExclusiveStartKey?: string; - items: KeyValueStoreItemData[]; -} - -export interface KeyValueStoreClientGetRecordOptions { - buffer?: boolean; - stream?: boolean; -} - /** * Key-value Store client. */ @@ -149,19 +127,30 @@ export interface KeyValueStoreClient { * for a storage that has been removed. 
*/ getMetadata(): Promise; - update(newFields: KeyValueStoreClientUpdateOptions): Promise>; - delete(): Promise; - listKeys( - options?: KeyValueStoreClientListOptions, - ): Partial> & Promise; - keys?(options?: KeyValueStoreClientListOptions): AsyncIterable & Promise; - values?(options?: KeyValueStoreClientListOptions): AsyncIterable & Promise; - entries?(options?: KeyValueStoreClientListOptions): AsyncIterable<[string, unknown]> & Promise<[string, unknown][]>; + + /** Remove the key-value store and all its data. */ + drop(): Promise; + + /** Remove all records from the store but keep the store itself. */ + purge(): Promise; + + /** Get a record by key. Returns the record (its data is under `value`), or `undefined` if not found. */ + getValue(key: string): Promise; + + /** Set a record value. */ + setValue(record: KeyValueStoreRecord): Promise; + + /** Delete a record by key. */ + deleteValue(key: string): Promise; + + /** Iterate over all keys in the store. */ + iterateKeys(options?: KeyValueStoreIterateKeysOptions): AsyncIterable; + + /** Get the public URL for a record, or `undefined` if unavailable. */ + getPublicUrl(key: string): Promise; + + /** Check whether a record with the given key exists. */ recordExists(key: string): Promise; - getRecordPublicUrl(key: string): Promise; - getRecord(key: string, options?: KeyValueStoreClientGetRecordOptions): Promise; - setRecord(record: KeyValueStoreRecord, options?: KeyValueStoreRecordOptions): Promise; - deleteRecord(key: string): Promise; } export interface RequestQueueStats { @@ -284,14 +273,18 @@ export interface RequestQueueClient { * for a storage that has been removed. */ getMetadata(): Promise; - update(newFields: { name?: string }): Promise | undefined>; - delete(): Promise; + + /** Remove the request queue and all its data. */ + drop(): Promise; + + /** Remove all requests from the queue but keep the queue itself.
*/ + purge(): Promise; + listHead(options?: ListOptions): Promise; addRequest(request: RequestSchema, options?: RequestOptions): Promise; batchAddRequests(requests: RequestSchema[], options?: RequestOptions): Promise; getRequest(id: string): Promise; updateRequest(request: UpdateRequestSchema, options?: RequestOptions): Promise; - deleteRequest(id: string): Promise; listAndLockHead(options: ListAndLockOptions): Promise; prolongRequestLock(id: string, options: ProlongRequestLockOptions): Promise; deleteRequestLock(id: string, options?: DeleteRequestLockOptions): Promise; diff --git a/test/core/crawlers/adaptive_playwright_crawler.test.ts b/test/core/crawlers/adaptive_playwright_crawler.test.ts index cdf8ba7fd4b0..ba7af8366efc 100644 --- a/test/core/crawlers/adaptive_playwright_crawler.test.ts +++ b/test/core/crawlers/adaptive_playwright_crawler.test.ts @@ -458,9 +458,9 @@ describe('AdaptivePlaywrightCrawler', () => { await crawler.run(); const store = await localStorageEmulator.getKeyValueStore(); - expect((await store.getRecord('1'))!.value).toEqual({ content: 42 }); - expect((await store.getRecord('2'))!.value).toEqual({ content: 42 }); - expect((await store.getRecord('3'))!.value).toEqual({ content: 42 }); + expect((await store.getValue('1'))!.value).toEqual({ content: 42 }); + expect((await store.getValue('2'))!.value).toEqual({ content: 42 }); + expect((await store.getValue('3'))!.value).toEqual({ content: 42 }); }); test('should not allow direct key-value store manipulation', async () => { @@ -494,7 +494,7 @@ describe('AdaptivePlaywrightCrawler', () => { ); const store = await localStorageEmulator.getKeyValueStore(); - expect(await store.getRecord('1')).toBeUndefined(); + expect(await store.getValue('1')).toBeUndefined(); }); test('should persist RenderingTypePredictor state on PERSIST_STATE events', async () => { diff --git a/test/core/crawlers/rendering_type_predictor.test.ts b/test/core/crawlers/rendering_type_predictor.test.ts index 
ac44fc089ce2..9abedb8a5d81 100644 --- a/test/core/crawlers/rendering_type_predictor.test.ts +++ b/test/core/crawlers/rendering_type_predictor.test.ts @@ -39,7 +39,7 @@ describe('RenderingTypePredictor', () => { await predictor['state'].persistState(); // Access private state for persistence // Verify state was persisted - const persistedRecord = await store.getRecord(persistStateKey); + const persistedRecord = await store.getValue(persistStateKey); expect(persistedRecord).not.toBeNull(); expect(persistedRecord?.value).toBeDefined(); diff --git a/test/core/recoverable_state.test.ts b/test/core/recoverable_state.test.ts index 947ea7bfff45..c5a659d3ef77 100644 --- a/test/core/recoverable_state.test.ts +++ b/test/core/recoverable_state.test.ts @@ -180,7 +180,7 @@ describe('RecoverableState', () => { await recoverableState.persistState(); const persistedState = JSON.parse( - (await (await localStorageEmulator.getKeyValueStore()).getRecord('test-key'))?.value, + (await (await localStorageEmulator.getKeyValueStore()).getValue('test-key'))?.value, ); expect(persistedState).toMatchObject({ data: { value: 'updated' }, diff --git a/test/core/storages/dataset.test.ts b/test/core/storages/dataset.test.ts index c4021cf9b030..23c7b4169894 100644 --- a/test/core/storages/dataset.test.ts +++ b/test/core/storages/dataset.test.ts @@ -1,4 +1,4 @@ -import { checkAndSerialize, chunkBySize, Configuration, Dataset, KeyValueStore, serviceLocator } from '@crawlee/core'; +import { isJsonSerializable, chunkBySize, Dataset, KeyValueStore, serviceLocator } from '@crawlee/core'; import type { Dictionary } from '@crawlee/utils'; import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; @@ -30,28 +30,28 @@ describe('dataset', () => { test('should work', async () => { const dataset = await createDataset(); - const pushItemSpy = vitest.spyOn(dataset.client, 'pushItems'); + const pushDataSpy = vitest.spyOn(dataset.client, 'pushData'); - const mockPushItems = 
pushItemSpy.mockResolvedValueOnce(undefined); + const mockPushData = pushDataSpy.mockResolvedValueOnce(undefined); await dataset.pushData({ foo: 'bar' }); - expect(mockPushItems).toBeCalledTimes(1); - expect(mockPushItems).toBeCalledWith(JSON.stringify({ foo: 'bar' })); + expect(mockPushData).toHaveBeenCalledTimes(1); + expect(mockPushData).toHaveBeenCalledWith([{ foo: 'bar' }]); - const mockPushItems2 = pushItemSpy.mockResolvedValueOnce(undefined); + const mockPushData2 = pushDataSpy.mockResolvedValueOnce(undefined); await dataset.pushData([{ foo: 'hotel;' }, { foo: 'restaurant' }]); - expect(mockPushItems2).toBeCalledTimes(2); - expect(mockPushItems2).toBeCalledWith(JSON.stringify([{ foo: 'hotel;' }, { foo: 'restaurant' }])); + expect(mockPushData2).toHaveBeenCalledTimes(2); + expect(mockPushData2).toHaveBeenCalledWith([{ foo: 'hotel;' }, { foo: 'restaurant' }]); - const mockDelete = vitest.spyOn(dataset.client, 'delete').mockResolvedValueOnce(undefined); + const mockDrop = vitest.spyOn(dataset.client, 'drop').mockResolvedValueOnce(undefined); await dataset.drop(); - expect(mockDelete).toBeCalledTimes(1); - expect(mockDelete).toHaveBeenLastCalledWith(); + expect(mockDrop).toHaveBeenCalledTimes(1); + expect(mockDrop).toHaveBeenLastCalledWith(); }); test('should successfully save large data', async () => { @@ -59,15 +59,13 @@ describe('dataset', () => { const dataset = await createDataset(); - const mockPushItems = vitest.spyOn(dataset.client, 'pushItems'); - mockPushItems.mockResolvedValueOnce(undefined); - mockPushItems.mockResolvedValueOnce(undefined); + const mockPushData = vitest.spyOn(dataset.client, 'pushData'); + mockPushData.mockResolvedValueOnce(undefined); await dataset.pushData([{ foo: half }, { bar: half }]); - expect(mockPushItems).toBeCalledTimes(2); - expect(mockPushItems).toHaveBeenNthCalledWith(1, JSON.stringify([{ foo: half }])); - expect(mockPushItems).toHaveBeenNthCalledWith(2, JSON.stringify([{ bar: half }])); + 
expect(mockPushData).toHaveBeenCalledTimes(1); + expect(mockPushData).toHaveBeenCalledWith([{ foo: half }, { bar: half }]); }); test('should successfully save lots of small data', async () => { @@ -75,43 +73,16 @@ describe('dataset', () => { const string = mockData(MAX_PAYLOAD_SIZE_BYTES / count); const chunk = { foo: string, bar: 'baz' }; const data = Array(count).fill(chunk); - const expectedFirst = JSON.stringify(Array(count - 1).fill(chunk)); - const expectedSecond = JSON.stringify([chunk]); const dataset = await createDataset(); - const mockPushItems = vitest.spyOn(dataset.client, 'pushItems'); - mockPushItems.mockResolvedValueOnce(undefined); - mockPushItems.mockResolvedValueOnce(undefined); + const mockPushData = vitest.spyOn(dataset.client, 'pushData'); + mockPushData.mockResolvedValueOnce(undefined); await dataset.pushData(data); - expect(mockPushItems).toBeCalledTimes(2); - expect(mockPushItems).toHaveBeenNthCalledWith(1, expectedFirst); - expect(mockPushItems).toHaveBeenNthCalledWith(2, expectedSecond); - }); - - test('should throw on too large file', async () => { - const full = mockData(MAX_PAYLOAD_SIZE_BYTES); - const dataset = await createDataset(); - try { - await dataset.pushData({ foo: full }); - throw new Error('Should fail!'); - } catch (err) { - expect(err).toBeInstanceOf(Error); - expect((err as Error).message).toMatch('Data item is too large'); - } - }); - test('should throw on too large file in an array', async () => { - const full = mockData(MAX_PAYLOAD_SIZE_BYTES); - const dataset = await createDataset(); - try { - await dataset.pushData([{ foo: 0 }, { foo: 1 }, { foo: 2 }, { foo: full }, { foo: 4 }]); - throw new Error('Should fail!'); - } catch (err) { - expect(err).toBeInstanceOf(Error); - expect((err as Error).message).toMatch('Data item at index 3 is too large'); - } + expect(mockPushData).toHaveBeenCalledTimes(1); + expect(mockPushData).toHaveBeenCalledWith(data); }); test('getData() should work', async () => { @@ -126,27 +97,22 @@ 
describe('dataset', () => { desc: false, }; - const mockListItems = vitest.spyOn(dataset.client, 'listItems'); - mockListItems.mockResolvedValueOnce(expected); + const mockGetData = vitest.spyOn(dataset.client, 'getData'); + mockGetData.mockResolvedValueOnce(expected); const result = await dataset.getData({ limit: 2, offset: 3 }); - expect(mockListItems).toHaveBeenLastCalledWith({ + expect(mockGetData).toHaveBeenLastCalledWith({ limit: 2, offset: 3, }); expect(result).toEqual(expected); - let e; - const spy = vitest.spyOn(dataset.client, 'listItems').mockImplementation(() => { + + vitest.spyOn(dataset.client, 'getData').mockImplementation(() => { throw new Error('Cannot create a string longer than 0x3fffffe7 characters'); }); - try { - await dataset.getData(); - } catch (err) { - e = err; - } - expect((e as Error).message).toEqual( + await expect(dataset.getData()).rejects.toThrow( 'dataset.getData(): The response is too large for parsing. You can fix this by lowering the "limit" option.', ); }); @@ -192,17 +158,17 @@ describe('dataset', () => { desc: false, }; - const mockListItems = vitest.spyOn(dataset.client, 'listItems'); - mockListItems.mockResolvedValueOnce(firstResolve); - mockListItems.mockResolvedValueOnce(secondResolve); + const mockGetData = vitest.spyOn(dataset.client, 'getData'); + mockGetData.mockResolvedValueOnce(firstResolve); + mockGetData.mockResolvedValueOnce(secondResolve); const restoreAndVerify = () => { - expect(mockListItems).toBeCalledTimes(2); - expect(mockListItems).toHaveBeenNthCalledWith(1, { + expect(mockGetData).toHaveBeenCalledTimes(2); + expect(mockGetData).toHaveBeenNthCalledWith(1, { limit: 2, offset: 0, }); - expect(mockListItems).toHaveBeenNthCalledWith(2, { + expect(mockGetData).toHaveBeenNthCalledWith(2, { limit: 2, offset: 2, }); @@ -331,8 +297,8 @@ describe('dataset', () => { test('reduce() uses first value as memo if no memo is provided', async () => { const dataset = await createDataset('some-id', 'some-name'); - const 
mockListItems = vitest.spyOn(dataset.client, 'listItems'); - mockListItems.mockResolvedValueOnce({ + const mockGetData = vitest.spyOn(dataset.client, 'getData'); + mockGetData.mockResolvedValueOnce({ items: [{ foo: 4 }, { foo: 5 }], limit: 2, total: 4, @@ -340,7 +306,7 @@ describe('dataset', () => { count: 2, desc: false, }); - mockListItems.mockResolvedValueOnce({ + mockGetData.mockResolvedValueOnce({ items: [{ foo: 4 }, { foo: 1 }], limit: 2, total: 4, @@ -362,12 +328,12 @@ describe('dataset', () => { }, ); - expect(mockListItems).toBeCalledTimes(2); - expect(mockListItems).toHaveBeenNthCalledWith(1, { + expect(mockGetData).toHaveBeenCalledTimes(2); + expect(mockGetData).toHaveBeenNthCalledWith(1, { limit: 2, offset: 0, }); - expect(mockListItems).toHaveBeenNthCalledWith(2, { + expect(mockGetData).toHaveBeenNthCalledWith(2, { limit: 2, offset: 2, }); @@ -401,7 +367,7 @@ describe('dataset', () => { 'Expected `data` to be of type `object` but received type `boolean`', ); await expect(dataset.pushData(() => {})).rejects.toThrow( - 'Data item is not an object. You can push only objects into a dataset.', + 'Data item at index 0 is not an object. 
You can push only objects into a dataset.', ); const circularObj = {} as Dictionary; @@ -412,26 +378,21 @@ describe('dataset', () => { }); describe('utils', () => { - test('checkAndSerialize() works', () => { - // Basic - const obj = { foo: 'bar' }; - const json = JSON.stringify(obj); - expect(checkAndSerialize({}, 100)).toBe('{}'); - expect(checkAndSerialize(obj, 100)).toEqual(json); - // With index - expect(checkAndSerialize(obj, 100, 1)).toEqual(json); - // Too large - expect(() => checkAndSerialize(obj, 5)).toThrowError(Error); - expect(() => checkAndSerialize(obj, 5, 7)).toThrowError(Error); - // Bad JSON + test('isJsonSerializable() works', () => { + // Valid objects + expect(() => isJsonSerializable({})).not.toThrow(); + expect(() => isJsonSerializable({ foo: 'bar' })).not.toThrow(); + expect(() => isJsonSerializable({ foo: 'bar' }, 1)).not.toThrow(); + // Circular reference const bad = {} as Dictionary; bad.bad = bad; - expect(() => checkAndSerialize(bad, 100)).toThrowError(Error); - // Bad data - const str = 'hello'; - expect(() => checkAndSerialize(str, 100)).toThrowError(Error); - expect(() => checkAndSerialize([], 100)).toThrowError(Error); - expect(() => checkAndSerialize([str, str], 100)).toThrowError(Error); + expect(() => isJsonSerializable(bad)).toThrow('not serializable to JSON'); + // Non-objects + expect(() => isJsonSerializable('hello')).toThrow('not an object'); + expect(() => isJsonSerializable([])).toThrow('not an object'); + expect(() => isJsonSerializable(['a', 'b'])).toThrow('not an object'); + // With index in error message + expect(() => isJsonSerializable('hello', 3)).toThrow('at index 3'); }); test('chunkBySize', () => { const obj = { foo: 'bar' }; @@ -574,39 +535,30 @@ describe('dataset', () => { expect(items).toEqual(testData); }); - test('values() can be awaited directly (hybrid usage)', async () => { - const dataset = await Dataset.open(); - await dataset.pushData(testData); - - const result = await dataset.values(); - - 
expect(result.items).toEqual(testData); - expect(result.total).toBe(3); - expect(result.count).toBe(3); - expect(result.offset).toBe(0); - }); - - test('values() respects limit when awaited directly', async () => { + test('values() respects limit when iterating', async () => { const dataset = await Dataset.open(); await dataset.pushData(testData); - const result = await dataset.values({ limit: 2 }); + const items = []; + for await (const item of dataset.values({ limit: 2 })) { + items.push(item); + } - expect(result.items).toHaveLength(2); - expect(result.items).toEqual(testData.slice(0, 2)); - expect(result.total).toBe(3); - expect(result.count).toBe(2); + expect(items).toHaveLength(2); + expect(items).toEqual(testData.slice(0, 2)); }); - test('values() respects offset when awaited directly', async () => { + test('values() respects offset when iterating', async () => { const dataset = await Dataset.open(); await dataset.pushData(testData); - const result = await dataset.values({ offset: 1 }); + const items = []; + for await (const item of dataset.values({ offset: 1 })) { + items.push(item); + } - expect(result.items).toHaveLength(2); - expect(result.items).toEqual(testData.slice(1)); - expect(result.offset).toBe(1); + expect(items).toHaveLength(2); + expect(items).toEqual(testData.slice(1)); }); test('entries() should iterate over index-item pairs', async () => { @@ -640,49 +592,36 @@ describe('dataset', () => { ]); }); - test('entries() can be awaited directly (hybrid usage)', async () => { - const dataset = await Dataset.open(); - await dataset.pushData(testData); - - const result = await dataset.entries(); - - expect(result.items).toEqual([ - [0, { id: 1, name: 'Alice' }], - [1, { id: 2, name: 'Bob' }], - [2, { id: 3, name: 'Charlie' }], - ]); - expect(result.total).toBe(3); - expect(result.count).toBe(3); - expect(result.offset).toBe(0); - }); - - test('entries() respects limit when awaited directly', async () => { + test('entries() respects limit when 
iterating', async () => { const dataset = await Dataset.open(); await dataset.pushData(testData); - const result = await dataset.entries({ limit: 2 }); + const entries = []; + for await (const entry of dataset.entries({ limit: 2 })) { + entries.push(entry); + } - expect(result.items).toHaveLength(2); - expect(result.items).toEqual([ + expect(entries).toHaveLength(2); + expect(entries).toEqual([ [0, { id: 1, name: 'Alice' }], [1, { id: 2, name: 'Bob' }], ]); - expect(result.total).toBe(3); - expect(result.count).toBe(2); }); - test('entries() respects offset when awaited directly', async () => { + test('entries() respects offset when iterating', async () => { const dataset = await Dataset.open(); await dataset.pushData(testData); - const result = await dataset.entries({ offset: 1 }); + const entries = []; + for await (const entry of dataset.entries({ offset: 1 })) { + entries.push(entry); + } - expect(result.items).toHaveLength(2); - expect(result.items).toEqual([ + expect(entries).toHaveLength(2); + expect(entries).toEqual([ [1, { id: 2, name: 'Bob' }], [2, { id: 3, name: 'Charlie' }], ]); - expect(result.offset).toBe(1); }); test('Symbol.asyncIterator should iterate over items', async () => { diff --git a/test/core/storages/key_value_store.test.ts b/test/core/storages/key_value_store.test.ts index d28d39b54179..1de8a5b0d5ac 100644 --- a/test/core/storages/key_value_store.test.ts +++ b/test/core/storages/key_value_store.test.ts @@ -1,6 +1,6 @@ import { PassThrough } from 'node:stream'; -import { Configuration, KeyValueStore, maybeStringify, serviceLocator } from '@crawlee/core'; +import { KeyValueStore, maybeStringify, serviceLocator } from '@crawlee/core'; import type { Dictionary } from '@crawlee/utils'; import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; @@ -32,30 +32,24 @@ describe('KeyValueStore', () => { const recordStr = JSON.stringify(record, null, 2); // Set record - const mockSetRecord = vitest + const mockSetValue = vitest // 
@ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); await store.setValue('key-1', record); - expect(mockSetRecord).toBeCalledTimes(1); - expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value: recordStr, - contentType: 'application/json; charset=utf-8', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value: recordStr, + contentType: 'application/json; charset=utf-8', + }); // Get Record - const mockGetRecord = vitest + const mockGetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'getRecord') + .spyOn(store.client, 'getValue') .mockResolvedValueOnce({ key: 'key-1', value: record, @@ -64,8 +58,8 @@ describe('KeyValueStore', () => { const response = await store.getValue('key-1'); - expect(mockGetRecord).toBeCalledTimes(1); - expect(mockGetRecord).toBeCalledWith('key-1'); + expect(mockGetValue).toHaveBeenCalledTimes(1); + expect(mockGetValue).toHaveBeenCalledWith('key-1'); expect(response).toEqual(record); // Record Exists @@ -76,31 +70,31 @@ describe('KeyValueStore', () => { const exists = await store.recordExists('key-1'); - expect(mockRecordExists).toBeCalledTimes(1); - expect(mockRecordExists).toBeCalledWith('key-1'); + expect(mockRecordExists).toHaveBeenCalledTimes(1); + expect(mockRecordExists).toHaveBeenCalledWith('key-1'); expect(exists).toBe(true); // Delete Record - const mockDeleteRecord = vitest + const mockDeleteValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'deleteRecord') + .spyOn(store.client, 'deleteValue') .mockResolvedValueOnce(undefined); await store.setValue('key-1', null); - expect(mockDeleteRecord).toBeCalledTimes(1); - expect(mockDeleteRecord).toBeCalledWith('key-1'); + expect(mockDeleteValue).toHaveBeenCalledTimes(1); + 
expect(mockDeleteValue).toHaveBeenCalledWith('key-1'); // Drop store - const mockDelete = vitest + const mockDrop = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'delete') + .spyOn(store.client, 'drop') .mockResolvedValueOnce(undefined); await store.drop(); - expect(mockDelete).toBeCalledTimes(1); - expect(mockDelete).toHaveBeenLastCalledWith(); + expect(mockDrop).toHaveBeenCalledTimes(1); + expect(mockDrop).toHaveBeenLastCalledWith(); }); describe('getValue', () => { @@ -127,13 +121,13 @@ describe('KeyValueStore', () => { getValueSpy.mockImplementationOnce(async () => 123); const val = await KeyValueStore.getValue('key-1'); - expect(getValueSpy).toBeCalledTimes(1); - expect(getValueSpy).toBeCalledWith('key-1', undefined); + expect(getValueSpy).toHaveBeenCalledTimes(1); + expect(getValueSpy).toHaveBeenCalledWith('key-1', undefined); expect(val).toBe(123); const val2 = await KeyValueStore.getValue('key-2', 321); - expect(getValueSpy).toBeCalledTimes(2); - expect(getValueSpy).toBeCalledWith('key-2', 321); + expect(getValueSpy).toHaveBeenCalledTimes(2); + expect(getValueSpy).toHaveBeenCalledWith('key-2', 321); expect(val2).toBe(321); }); }); @@ -162,8 +156,8 @@ describe('KeyValueStore', () => { recordExistsSpy.mockImplementationOnce(async () => false); const val = await KeyValueStore.recordExists('key-1'); - expect(recordExistsSpy).toBeCalledTimes(1); - expect(recordExistsSpy).toBeCalledWith('key-1'); + expect(recordExistsSpy).toHaveBeenCalledTimes(1); + expect(recordExistsSpy).toHaveBeenCalledWith('key-1'); expect(val).toBe(false); }); }); @@ -264,25 +258,19 @@ describe('KeyValueStore', () => { test('correctly adds charset to content type', async () => { const store = await createKeyValueStore('my-store-id-1'); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); await 
store.setValue('key-1', 'xxxx', { contentType: 'text/plain; charset=utf-8' }); - expect(mockSetRecord).toBeCalledTimes(1); - expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value: 'xxxx', - contentType: 'text/plain; charset=utf-8', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value: 'xxxx', + contentType: 'text/plain; charset=utf-8', + }); }); test('correctly passes object values as JSON', async () => { @@ -291,25 +279,19 @@ describe('KeyValueStore', () => { const record = { foo: 'bar' }; const recordStr = JSON.stringify(record, null, 2); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); await store.setValue('key-1', record); - expect(mockSetRecord).toBeCalledTimes(1); - expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value: recordStr, - contentType: 'application/json; charset=utf-8', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value: recordStr, + contentType: 'application/json; charset=utf-8', + }); }); test('correctly passes timeout options', async () => { @@ -318,9 +300,9 @@ describe('KeyValueStore', () => { const record = { foo: 'bar' }; const recordStr = JSON.stringify(record, null, 2); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); await store.setValue('key-1', record, { @@ -328,75 +310,57 @@ describe('KeyValueStore', () => { doNotRetryTimeouts: true, }); - expect(mockSetRecord).toBeCalledTimes(1); - 
expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value: recordStr, - contentType: 'application/json; charset=utf-8', - }, - { - doNotRetryTimeouts: true, - timeoutSecs: 1, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value: recordStr, + contentType: 'application/json; charset=utf-8', + }); }); test('correctly passes raw string values', async () => { const store = await createKeyValueStore('my-store-id-1'); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); await store.setValue('key-1', 'xxxx', { contentType: 'text/plain; charset=utf-8' }); - expect(mockSetRecord).toBeCalledTimes(1); - expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value: 'xxxx', - contentType: 'text/plain; charset=utf-8', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value: 'xxxx', + contentType: 'text/plain; charset=utf-8', + }); }); test('correctly passes raw Buffer values', async () => { const store = await createKeyValueStore('my-store-id-1'); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); const value = Buffer.from('some text value'); await store.setValue('key-1', value, { contentType: 'image/jpeg; charset=something' }); - expect(mockSetRecord).toBeCalledTimes(1); - expect(mockSetRecord).toBeCalledWith( - { - key: 'key-1', - value, - contentType: 'image/jpeg; charset=something', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + 
expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value, + contentType: 'image/jpeg; charset=something', + }); }); test('correctly passes a stream', async () => { const store = await createKeyValueStore('my-store-id-1'); - const mockSetRecord = vitest + const mockSetValue = vitest // @ts-expect-error Accessing private property - .spyOn(store.client, 'setRecord') + .spyOn(store.client, 'setValue') .mockResolvedValueOnce(undefined); const value = new PassThrough(); @@ -405,18 +369,12 @@ describe('KeyValueStore', () => { value.end(); value.destroy(); - expect(mockSetRecord).toHaveBeenCalledTimes(1); - expect(mockSetRecord).toHaveBeenCalledWith( - { - key: 'key-1', - value, - contentType: 'plain/text', - }, - { - doNotRetryTimeouts: undefined, - timeoutSecs: undefined, - }, - ); + expect(mockSetValue).toHaveBeenCalledTimes(1); + expect(mockSetValue).toHaveBeenCalledWith({ + key: 'key-1', + value, + contentType: 'plain/text', + }); }); }); @@ -447,7 +405,7 @@ describe('KeyValueStore', () => { const obj = {} as Dictionary; obj.self = obj; - expect(() => maybeStringify(obj, { contentType: null as any })).toThrowError( + expect(() => maybeStringify(obj, { contentType: null as any })).toThrow( 'The "value" parameter cannot be stringified to JSON: Converting circular structure to JSON', ); }); @@ -516,52 +474,27 @@ describe('KeyValueStore', () => { const store = await createKeyValueStore('my-store-id-1'); // @ts-expect-error Accessing private property - const mockListKeys = vitest.spyOn(store.client, 'listKeys'); - mockListKeys.mockResolvedValueOnce({ - isTruncated: true, - exclusiveStartKey: 'key0', - nextExclusiveStartKey: 'key2', - items: [ - { key: 'key1', size: 1 }, - { key: 'key2', size: 2 }, - ], - count: 2, - limit: 2, - }); - - mockListKeys.mockResolvedValueOnce({ - isTruncated: true, - exclusiveStartKey: 'key0', - nextExclusiveStartKey: 'key4', - items: [ - { key: 'key3', size: 3 }, - { key: 'key4', size: 4 }, - ], - count: 1, - limit: 2, - }); - - 
mockListKeys.mockResolvedValueOnce({ - isTruncated: false, - exclusiveStartKey: 'key0', - nextExclusiveStartKey: undefined, - items: [{ key: 'key5', size: 5 }], - count: 1, - limit: 1, - }); + const mockIterateKeys = vitest.spyOn(store.client, 'iterateKeys'); + mockIterateKeys.mockReturnValueOnce( + (async function* () { + yield { key: 'key1', size: 1 }; + yield { key: 'key2', size: 2 }; + yield { key: 'key3', size: 3 }; + yield { key: 'key4', size: 4 }; + yield { key: 'key5', size: 5 }; + })(), + ); const results: [string, number, { size: number }][] = []; await store.forEachKey( async (key, index, info) => { results.push([key, index, info]); }, - { exclusiveStartKey: 'key0', prefix: 'img/' }, + { prefix: 'img/' }, ); - expect(mockListKeys).toBeCalledTimes(3); - expect(mockListKeys).toHaveBeenNthCalledWith(1, { exclusiveStartKey: 'key0', prefix: 'img/' }); - expect(mockListKeys).toHaveBeenNthCalledWith(2, { exclusiveStartKey: 'key2', prefix: 'img/' }); - expect(mockListKeys).toHaveBeenNthCalledWith(3, { exclusiveStartKey: 'key4', prefix: 'img/' }); + expect(mockIterateKeys).toHaveBeenCalledTimes(1); + expect(mockIterateKeys).toHaveBeenCalledWith({ prefix: 'img/' }); expect(results).toHaveLength(5); results.forEach((r, i) => { diff --git a/test/core/storages/request_queue.test.ts b/test/core/storages/request_queue.test.ts index be8b1ba11b79..6c73a2b182b0 100644 --- a/test/core/storages/request_queue.test.ts +++ b/test/core/storages/request_queue.test.ts @@ -2,7 +2,6 @@ import { API_PROCESSED_REQUESTS_DELAY_MILLIS, - Configuration, ProxyConfiguration, QUERY_HEAD_MIN_LENGTH, Request, @@ -12,8 +11,6 @@ import { STORAGE_CONSISTENCY_DELAY_MILLIS, } from '@crawlee/core'; import { sleep } from '@crawlee/utils'; -import { gotScraping } from 'got-scraping'; -import type { MockedFunction } from 'vitest'; import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; @@ -64,8 +61,8 @@ describe('RequestQueue remote', () => { }); 
expect(queue['queueHeadIds'].length()).toBe(1); - expect(mockAddRequest).toBeCalledTimes(1); - expect(mockAddRequest).toBeCalledWith(requestA, { forefront: false }); + expect(mockAddRequest).toHaveBeenCalledTimes(1); + expect(mockAddRequest).toHaveBeenCalledWith(requestA, { forefront: false }); // Try to add again a request with the same URL const queueOperationInfo2 = await queue.addRequest(requestOptions); @@ -86,7 +83,7 @@ describe('RequestQueue remote', () => { mockAddRequest.mockResolvedValueOnce(secondResolveValue); await queue.addRequest(requestB, { forefront: true }); - expect(mockAddRequest).toBeCalledTimes(2); + expect(mockAddRequest).toHaveBeenCalledTimes(2); expect(mockAddRequest).toHaveBeenLastCalledWith(requestB, { forefront: true }); expect(queue['queueHeadIds'].length()).toBe(2); @@ -97,7 +94,7 @@ describe('RequestQueue remote', () => { mockGetRequest.mockResolvedValueOnce({ ...requestB, id: 'b' }); const requestBFromQueue = await queue.fetchNextRequest(); - expect(mockGetRequest).toBeCalledTimes(1); + expect(mockGetRequest).toHaveBeenCalledTimes(1); expect(mockGetRequest).toHaveBeenLastCalledWith('b'); expect(requestBFromQueue).toEqual({ ...requestB, id: 'b' }); @@ -117,7 +114,7 @@ describe('RequestQueue remote', () => { mockGetRequest.mockResolvedValueOnce(undefined); const requestXFromQueue = await queue.getRequest('non-existent'); - expect(mockGetRequest).toBeCalledTimes(2); + expect(mockGetRequest).toHaveBeenCalledTimes(2); expect(mockGetRequest).toHaveBeenLastCalledWith('non-existent'); expect(requestXFromQueue).toBe(null); @@ -133,7 +130,7 @@ describe('RequestQueue remote', () => { }); await queue.reclaimRequest(requestBFromQueue!, { forefront: true }); - expect(mockUpdateRequest).toBeCalledTimes(1); + expect(mockUpdateRequest).toHaveBeenCalledTimes(1); expect(mockUpdateRequest).toHaveBeenLastCalledWith(requestBFromQueue, { forefront: true }); expect(queue['queueHeadIds'].length()).toBe(1); @@ -147,7 +144,7 @@ describe('RequestQueue remote', 
() => { mockGetRequest.mockResolvedValueOnce(requestBFromQueue as never); const requestBFromQueue2 = await queue.fetchNextRequest(); - expect(mockGetRequest).toBeCalledTimes(3); + expect(mockGetRequest).toHaveBeenCalledTimes(3); expect(mockGetRequest).toHaveBeenLastCalledWith('b'); expect(requestBFromQueue2).toEqual(requestBFromQueue); @@ -165,7 +162,7 @@ describe('RequestQueue remote', () => { }); await queue.markRequestHandled(requestBFromQueue!); - expect(mockUpdateRequest).toBeCalledTimes(2); + expect(mockUpdateRequest).toHaveBeenCalledTimes(2); expect(mockUpdateRequest).toHaveBeenLastCalledWith(requestBFromQueue); expect(queue['queueHeadIds'].length()).toBe(1); @@ -186,9 +183,9 @@ describe('RequestQueue remote', () => { mockGetRequest.mockResolvedValueOnce({ ...requestA, id: 'a' }); const requestAFromQueue = await queue.fetchNextRequest(); - expect(mockGetRequest).toBeCalledTimes(4); + expect(mockGetRequest).toHaveBeenCalledTimes(4); expect(mockGetRequest).toHaveBeenLastCalledWith('a'); - expect(mockListHead).toBeCalledTimes(1); + expect(mockListHead).toHaveBeenCalledTimes(1); expect(mockListHead).toHaveBeenLastCalledWith({ limit: QUERY_HEAD_MIN_LENGTH }); expect(requestAFromQueue).toEqual({ ...requestA, id: 'a' }); @@ -196,12 +193,12 @@ describe('RequestQueue remote', () => { expect(queue.inProgressCount()).toBe(1); // Drop queue. 
- const mockDelete = vitest.spyOn(queue.client, 'delete'); - mockDelete.mockResolvedValueOnce(undefined); + const mockDrop = vitest.spyOn(queue.client, 'drop'); + mockDrop.mockResolvedValueOnce(undefined); await queue.drop(); - expect(mockDelete).toBeCalledTimes(1); - expect(mockDelete).toHaveBeenLastCalledWith(); + expect(mockDrop).toHaveBeenCalledTimes(1); + expect(mockDrop).toHaveBeenLastCalledWith(); }); test('addRequests', async () => { @@ -233,8 +230,8 @@ describe('RequestQueue remote', () => { // Ensure the client method was actually called, and added expect(queue['queueHeadIds'].length()).toBe(1); - expect(mockAddRequests).toBeCalledTimes(1); - expect(mockAddRequests).toBeCalledWith([requestA], { forefront: false }); + expect(mockAddRequests).toHaveBeenCalledTimes(1); + expect(mockAddRequests).toHaveBeenCalledWith([requestA], { forefront: false }); // Try to add a request with the same URL again, expecting cached const addRequestsResult2 = await queue.addRequests([requestOptions]); @@ -285,7 +282,7 @@ describe('RequestQueue remote', () => { expect(queue['queueHeadIds'].length()).toBe(3); expect(mockAddRequests).toHaveBeenCalled(); - expect(mockAddRequests).toBeCalledWith([requestB, requestC], { forefront: true }); + expect(mockAddRequests).toHaveBeenCalledWith([requestB, requestC], { forefront: true }); }); test('should cache new requests locally', async () => { @@ -303,13 +300,13 @@ describe('RequestQueue remote', () => { }); await queue.addRequest(requestA); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(addRequestMock).toHaveBeenLastCalledWith(requestA, { forefront: false }); // Add request B that has same unique so that addRequest() is not called because it's already cached. 
// mock.expects('addRequest').never(); const queueOperationInfo = await queue.addRequest(requestB); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(queueOperationInfo).toEqual({ requestId: 'a', uniqueKey: requestA.uniqueKey, @@ -334,13 +331,13 @@ describe('RequestQueue remote', () => { }); await queue.addRequest(requestX); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(addRequestMock).toHaveBeenLastCalledWith(requestX, { forefront: false }); // Add request Y that has same unique so that addRequest() is not called because it's already cached. // mock.expects('addRequest').never(); const queueOperationInfo = await queue.addRequest(requestY); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(queueOperationInfo).toEqual({ requestId: 'x', uniqueKey: requestX.uniqueKey, @@ -360,7 +357,7 @@ describe('RequestQueue remote', () => { } as never); expect(await queue.isEmpty()).toBe(false); - expect(listHeadMock).toBeCalledTimes(1); + expect(listHeadMock).toHaveBeenCalledTimes(1); expect(listHeadMock).toHaveBeenLastCalledWith({ limit: QUERY_HEAD_MIN_LENGTH }); // Add request A and addRequest is not called because was already cached. 
@@ -368,7 +365,7 @@ describe('RequestQueue remote', () => { const addRequestMock = vitest.spyOn(queue.client, 'addRequest'); const queueOperationInfo = await queue.addRequest(requestA); - expect(addRequestMock).toBeCalledTimes(0); + expect(addRequestMock).toHaveBeenCalledTimes(0); expect(queueOperationInfo).toEqual({ requestId: 'a', uniqueKey: 'aaa', @@ -393,7 +390,7 @@ describe('RequestQueue remote', () => { }); await queue.addRequest(requestA, { forefront: true }); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(addRequestMock).toHaveBeenLastCalledWith(requestA, { forefront: true }); expect(queue['queueHeadIds'].length()).toBe(1); @@ -403,13 +400,13 @@ describe('RequestQueue remote', () => { getRequestMock.mockResolvedValueOnce(undefined); const fetchedRequest = await queue.fetchNextRequest(); - expect(getRequestMock).toBeCalledTimes(1); + expect(getRequestMock).toHaveBeenCalledTimes(1); expect(getRequestMock).toHaveBeenLastCalledWith('a'); expect(fetchedRequest).toBe(null); // Give queue time to mark request 'a' as not in progress await sleep(STORAGE_CONSISTENCY_DELAY_MILLIS + 10); - expect(listHeadMock).not.toBeCalled(); + expect(listHeadMock).not.toHaveBeenCalled(); // Should try it once again (the queue head is queried again) getRequestMock.mockResolvedValueOnce({ @@ -422,9 +419,9 @@ describe('RequestQueue remote', () => { } as never); const fetchedRequest2 = await queue.fetchNextRequest(); - expect(getRequestMock).toBeCalledTimes(2); + expect(getRequestMock).toHaveBeenCalledTimes(2); expect(getRequestMock).toHaveBeenLastCalledWith('a'); - expect(listHeadMock).toBeCalledTimes(1); + expect(listHeadMock).toHaveBeenCalledTimes(1); expect(listHeadMock).toHaveBeenLastCalledWith({ limit: QUERY_HEAD_MIN_LENGTH }); expect(fetchedRequest2).toEqual({ ...requestA, id: 'a' }); }); @@ -449,12 +446,12 @@ describe('RequestQueue remote', () => { } as never); await queue.addRequest(requestA, { forefront: true }); - 
expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(addRequestMock).toHaveBeenLastCalledWith(requestA, { forefront: true }); const fetchedRequest = await queue.fetchNextRequest(); - expect(getRequestMock).not.toBeCalled(); - expect(listHeadMock).toBeCalledTimes(1); + expect(getRequestMock).not.toHaveBeenCalled(); + expect(listHeadMock).toHaveBeenCalledTimes(1); expect(listHeadMock).toHaveBeenLastCalledWith({ limit: QUERY_HEAD_MIN_LENGTH }); expect(fetchedRequest).toBe(null); }); @@ -470,7 +467,7 @@ describe('RequestQueue remote', () => { const requestOpts = { url: 'http://example.com/a' }; await queue.addRequest(requestOpts); - expect(addRequestMock).toBeCalledTimes(1); + expect(addRequestMock).toHaveBeenCalledTimes(1); expect(addRequestMock).toHaveBeenLastCalledWith(new Request(requestOpts), { forefront: false }); }); @@ -482,7 +479,7 @@ describe('RequestQueue remote', () => { } as never); const count = await queue.handledCount(); expect(count).toBe(33); - expect(getMock).toBeCalledTimes(1); + expect(getMock).toHaveBeenCalledTimes(1); expect(getMock).toHaveBeenLastCalledWith(); }); @@ -506,7 +503,7 @@ describe('RequestQueue remote', () => { const isFinished = await queue.isFinished(); expect(isFinished).toBe(true); - expect(listHeadMock).toBeCalledTimes(2); + expect(listHeadMock).toHaveBeenCalledTimes(2); expect(listHeadMock).toHaveBeenNthCalledWith(1, { limit: QUERY_HEAD_MIN_LENGTH }); expect(listHeadMock).toHaveBeenNthCalledWith(2, { limit: QUERY_HEAD_MIN_LENGTH }); }); @@ -534,7 +531,7 @@ describe('RequestQueue remote', () => { expect(queue.inProgressCount()).toBe(0); expect(queue.assumedTotalCount).toBe(2); expect(queue.assumedHandledCount).toBe(0); - expect(addRequestMock).toBeCalledTimes(2); + expect(addRequestMock).toHaveBeenCalledTimes(2); expect(addRequestMock).toHaveBeenNthCalledWith(1, requestA, { forefront: true }); expect(addRequestMock).toHaveBeenNthCalledWith(2, requestB, { forefront: true }); @@ 
-543,7 +540,7 @@ describe('RequestQueue remote', () => { const isFinished = await queue.isFinished(); expect(isFinished).toBe(false); - expect(listHeadMock).not.toBeCalled(); + expect(listHeadMock).not.toHaveBeenCalled(); // Fetch them from queue. const getRequestMock = vitest.spyOn(queue.client, 'getRequest'); @@ -552,11 +549,11 @@ describe('RequestQueue remote', () => { const requestBFromQueue = await queue.fetchNextRequest(); expect(requestBFromQueue).toEqual(requestBWithId); - expect(getRequestMock).toBeCalledTimes(1); + expect(getRequestMock).toHaveBeenCalledTimes(1); expect(getRequestMock).toHaveBeenLastCalledWith('b'); const requestAFromQueue = await queue.fetchNextRequest(); expect(requestAFromQueue).toEqual(requestAWithId); - expect(getRequestMock).toBeCalledTimes(2); + expect(getRequestMock).toHaveBeenCalledTimes(2); expect(getRequestMock).toHaveBeenLastCalledWith('a'); expect(queue['queueHeadIds'].length()).toBe(0); @@ -566,20 +563,20 @@ describe('RequestQueue remote', () => { // It won't query the head as there is something in progress or pending. expect(await queue.isFinished()).toBe(false); - expect(listHeadMock).not.toBeCalled(); + expect(listHeadMock).not.toHaveBeenCalled(); // Reclaim one and mark another one handled. 
const updateRequestMock = vitest.spyOn(queue.client, 'updateRequest'); updateRequestMock.mockResolvedValueOnce({ requestId: 'b', wasAlreadyHandled: false, wasAlreadyPresent: true }); await queue.markRequestHandled(requestBWithId); - expect(updateRequestMock).toBeCalledTimes(1); + expect(updateRequestMock).toHaveBeenCalledTimes(1); expect(updateRequestMock).toHaveBeenLastCalledWith(requestBWithId); updateRequestMock.mockResolvedValueOnce({ requestId: 'a', wasAlreadyHandled: false, wasAlreadyPresent: true }); await queue.reclaimRequest(requestAWithId, { forefront: true }); - expect(updateRequestMock).toBeCalledTimes(2); + expect(updateRequestMock).toHaveBeenCalledTimes(2); expect(updateRequestMock).toHaveBeenLastCalledWith(requestAWithId, { forefront: true }); expect(queue['queueHeadIds'].length()).toBe(0); @@ -595,7 +592,7 @@ describe('RequestQueue remote', () => { // It won't query the head as there is something in progress or pending. expect(await queue.isFinished()).toBe(false); - expect(listHeadMock).not.toBeCalled(); + expect(listHeadMock).not.toHaveBeenCalled(); // Fetch again. // @ts-expect-error Argument of type 'Request' is not assignable to parameter of type @@ -604,7 +601,7 @@ describe('RequestQueue remote', () => { const requestAFromQueue2 = await queue.fetchNextRequest(); expect(requestAFromQueue2).toEqual(requestAWithId); - expect(getRequestMock).toBeCalledTimes(3); + expect(getRequestMock).toHaveBeenCalledTimes(3); expect(getRequestMock).toHaveBeenLastCalledWith('a'); expect(queue['queueHeadIds'].length()).toBe(0); @@ -614,13 +611,13 @@ describe('RequestQueue remote', () => { // It won't query the head as there is something in progress or pending. expect(await queue.isFinished()).toBe(false); - expect(listHeadMock).not.toBeCalled(); + expect(listHeadMock).not.toHaveBeenCalled(); // Mark handled. 
updateRequestMock.mockResolvedValueOnce({ requestId: 'a', wasAlreadyHandled: false, wasAlreadyPresent: true }); await queue.markRequestHandled(requestAWithId); - expect(updateRequestMock).toBeCalledTimes(3); + expect(updateRequestMock).toHaveBeenCalledTimes(3); expect(updateRequestMock).toHaveBeenLastCalledWith(requestAWithId); expect(queue['queueHeadIds'].length()).toBe(0); @@ -638,7 +635,7 @@ describe('RequestQueue remote', () => { }); expect(await queue.isFinished()).toBe(true); - expect(listHeadMock).toBeCalledTimes(1); + expect(listHeadMock).toHaveBeenCalledTimes(1); expect(listHeadMock).toHaveBeenLastCalledWith({ limit: QUERY_HEAD_MIN_LENGTH }); }); @@ -699,17 +696,17 @@ describe('RequestQueue remote', () => { const result = await queue.getInfo(); expect(result).toEqual(expected); - expect(getMock).toBeCalledTimes(1); + expect(getMock).toHaveBeenCalledTimes(1); expect(getMock).toHaveBeenLastCalledWith(); }); test('drop() works', async () => { const queue = await createRequestQueue('some-id', 'some-name'); - const deleteMock = vitest.spyOn(queue.client, 'delete').mockResolvedValueOnce(undefined); + const dropMock = vitest.spyOn(queue.client, 'drop').mockResolvedValueOnce(undefined); await queue.drop(); - expect(deleteMock).toBeCalledTimes(1); - expect(deleteMock).toHaveBeenLastCalledWith(); + expect(dropMock).toHaveBeenCalledTimes(1); + expect(dropMock).toHaveBeenLastCalledWith(); }); test('Request.userData.__crawlee internal object is non-enumerable and always defined', async () => { @@ -786,9 +783,9 @@ describe('RequestQueue with requestsFromUrl', () => { expect(await queue.fetchNextRequest()).toMatchObject({ method: 'POST', url: list2[0] }); expect(await queue.fetchNextRequest()).toMatchObject({ method: 'POST', url: list2[1] }); - expect(spy).toBeCalledTimes(2); - expect(spy).toBeCalledWith({ url: 'http://example.com/list-1', urlRegExp: undefined }); - expect(spy).toBeCalledWith({ url: 'http://example.com/list-2', urlRegExp: undefined }); + 
expect(spy).toHaveBeenCalledTimes(2); + expect(spy).toHaveBeenCalledWith({ url: 'http://example.com/list-1', urlRegExp: undefined }); + expect(spy).toHaveBeenCalledWith({ url: 'http://example.com/list-2', urlRegExp: undefined }); }); test('should use regex parameter to parse urls', async () => { @@ -811,7 +808,7 @@ describe('RequestQueue with requestsFromUrl', () => { expect(await queue.fetchNextRequest()).toMatchObject({ method: 'GET', url: listArr[1] }); await queue.drop(); - expect(mockHttpClient.sendRequest).toBeCalled(); + expect(mockHttpClient.sendRequest).toHaveBeenCalled(); expect(mockHttpClient.sendRequest.mock.calls[0][0].url).toBe('http://example.com/list-1'); }); @@ -855,8 +852,8 @@ describe('RequestQueue with requestsFromUrl', () => { expect(await queue.fetchNextRequest()).toBe(null); - expect(spy).toBeCalledTimes(1); - expect(spy).toBeCalledWith({ url: 'http://example.com/list-1', urlRegExp: undefined }); + expect(spy).toHaveBeenCalledTimes(1); + expect(spy).toHaveBeenCalledWith({ url: 'http://example.com/list-1', urlRegExp: undefined }); }); test('should use the defined proxy server when using `requestsFromUrl`', async () => { @@ -876,7 +873,7 @@ describe('RequestQueue with requestsFromUrl', () => { { requestsFromUrl: 'http://example.com/list-3' }, ]); - expect(spy).not.toBeCalledWith(expect.not.objectContaining({ proxyUrl: expect.any(String) })); + expect(spy).not.toHaveBeenCalledWith(expect.not.objectContaining({ proxyUrl: expect.any(String) })); }); }); diff --git a/test/shared/MemoryStorageEmulator.ts b/test/shared/MemoryStorageEmulator.ts index ba1537b64b1f..c48a826c906d 100644 --- a/test/shared/MemoryStorageEmulator.ts +++ b/test/shared/MemoryStorageEmulator.ts @@ -36,7 +36,7 @@ export class MemoryStorageEmulator extends StorageEmulator { async getDatasetItems(id?: string) { const dataset = await this.getDataset(id); - return (await dataset.listItems()).items; + return (await dataset.getData()).items; } getRequestQueue(id?: string) { @@ -54,7 
+54,7 @@ export class MemoryStorageEmulator extends StorageEmulator { } async getState() { - return await (await this.getKeyValueStore()).getRecord('CRAWLEE_STATE'); + return await (await this.getKeyValueStore()).getValue('CRAWLEE_STATE'); } } From f7246357f39962646d04450f6736c3ab35c4e37b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 12 May 2026 15:07:33 +0200 Subject: [PATCH 02/10] Address review comments --- packages/core/src/storages/dataset.ts | 21 +++++---- packages/core/src/storages/key_value_store.ts | 36 ++++++++++----- packages/core/src/storages/utils.ts | 45 +++++++++++++++++++ test/core/storages/dataset.test.ts | 20 ++++----- 4 files changed, 92 insertions(+), 30 deletions(-) diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index c8b06c1edd9e..89a834941ca1 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -11,7 +11,7 @@ import { KeyValueStore } from './key_value_store.js'; import type { StorageIdentifier } from './storage_instance_manager.js'; import type { StorageOpenOptions } from './utils.js'; import { resolveStorageIdentifier } from './storage_instance_manager.js'; -import { purgeDefaultStorages } from './utils.js'; +import { dualAsyncIterable, purgeDefaultStorages } from './utils.js'; /** @internal */ export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000; @@ -24,7 +24,7 @@ export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000; * @param index Optional index for error messages when validating inside an array. * @ignore */ -export function isJsonSerializable(item: T, index?: number): void { +export function assertJsonSerializable(item: T, index?: number): void { const s = typeof index === 'number' ? ` at index ${index} ` : ' '; const isItemObject = item && typeof item === 'object' && !Array.isArray(item); @@ -261,7 +261,7 @@ export class Dataset { // Normalize to array and validate each item const items = Array.isArray(data) ? 
data : [data]; for (let i = 0; i < items.length; i++) { - isJsonSerializable(items[i], i); + assertJsonSerializable(items[i], i); } await this.client.pushData(items); @@ -583,10 +583,10 @@ export class Dataset { * * @param options Options for the iteration. */ - async *values(options: DatasetIteratorOptions = {}): AsyncGenerator { + values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); - yield* this.client.iterateItems(options); + return dualAsyncIterable(this.client.iterateItems(options)); } /** @@ -603,14 +603,19 @@ export class Dataset { * * @param options Options for the iteration. */ - async *entries(options: DatasetIteratorOptions = {}): AsyncGenerator<[number, Data]> { + entries(options: DatasetIteratorOptions = {}): AsyncIterable<[number, Data]> & Promise<[number, Data][]> { checkStorageAccess(); + const iterable = this.client.iterateItems(options); let index = options.offset ?? 0; - for await (const item of this.client.iterateItems(options)) { - yield [index++, item]; + async function* enumerate(): AsyncGenerator<[number, Data]> { + for await (const item of iterable) { + yield [index++, item]; + } } + + return dualAsyncIterable(enumerate()); } /** diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 57e47cb780b9..8c210900c059 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -15,7 +15,7 @@ import { checkStorageAccess } from './access_checking.js'; import type { StorageIdentifier } from './storage_instance_manager.js'; import type { StorageOpenOptions } from './utils.js'; import { resolveStorageIdentifier } from './storage_instance_manager.js'; -import { purgeDefaultStorages } from './utils.js'; +import { dualAsyncIterable, purgeDefaultStorages } from './utils.js'; /** * Helper function to possibly stringify value if options.contentType is not set. 
@@ -496,15 +496,21 @@ export class KeyValueStore { * * @param options Options for the iteration. */ - async *values(options: KeyValueStoreIteratorOptions = {}): AsyncGenerator { + values(options: KeyValueStoreIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); - for await (const item of this.client.iterateKeys(options)) { - const record = await this.client.getValue(item.key); - if (record) { - yield record.value as T; + const client = this.client; + + async function* iterate(): AsyncGenerator { + for await (const item of client.iterateKeys(options)) { + const record = await client.getValue(item.key); + if (record) { + yield record.value as T; + } } } + + return dualAsyncIterable(iterate()); } /** @@ -521,17 +527,23 @@ export class KeyValueStore { * * @param options Options for the iteration. */ - async *entries( + entries( options: KeyValueStoreIteratorOptions = {}, - ): AsyncGenerator<[string, T], void, undefined> { + ): AsyncIterable<[string, T]> & Promise<[string, T][]> { checkStorageAccess(); - for await (const item of this.client.iterateKeys(options)) { - const record = await this.client.getValue(item.key); - if (record) { - yield [item.key, record.value as T]; + const client = this.client; + + async function* iterate(): AsyncGenerator<[string, T]> { + for await (const item of client.iterateKeys(options)) { + const record = await client.getValue(item.key); + if (record) { + yield [item.key, record.value as T]; + } } } + + return dualAsyncIterable(iterate()); } /** diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index 82e210b5804a..91e368ee63a2 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -141,6 +141,51 @@ export const API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000; */ export const MAX_QUERIES_FOR_CONSISTENCY = 6; +/** + * Wraps an `AsyncIterable` so that it can be used both as an async iterable + * (`for await...of`) **and** as a `Promise` (`await`). 
+ * + * This is a convenience for callers: they can choose between streaming one-by-one + * or collecting everything into an array. + * + * @internal + */ +export function dualAsyncIterable(source: AsyncIterable): AsyncIterable & Promise { + // Lazily collect all items when the result is awaited as a Promise. + const collectAll = async (): Promise => { + const items: T[] = []; + for await (const item of source) { + items.push(item); + } + return items; + }; + + // We attach `then` / `catch` so that `await result` works, + // and `Symbol.asyncIterator` so that `for await (const x of result)` works. + const result = { + [Symbol.asyncIterator]() { + return source[Symbol.asyncIterator](); + }, + then( + onfulfilled?: ((value: T[]) => TResult1 | PromiseLike) | null, + onrejected?: ((reason: any) => TResult2 | PromiseLike) | null, + ): Promise { + return collectAll().then(onfulfilled, onrejected); + }, + catch( + onrejected?: ((reason: any) => TResult | PromiseLike) | null, + ): Promise { + return collectAll().catch(onrejected); + }, + finally(onfinally?: (() => void) | null): Promise { + return collectAll().finally(onfinally); + }, + [Symbol.toStringTag]: 'DualAsyncIterable', + } as AsyncIterable & Promise; + + return result; +} + /** * Options for the static `open()` method on storage classes ({@apilink Dataset}, {@apilink KeyValueStore}, {@apilink RequestQueue}). 
*/ diff --git a/test/core/storages/dataset.test.ts b/test/core/storages/dataset.test.ts index 23c7b4169894..1328b40b7802 100644 --- a/test/core/storages/dataset.test.ts +++ b/test/core/storages/dataset.test.ts @@ -1,4 +1,4 @@ -import { isJsonSerializable, chunkBySize, Dataset, KeyValueStore, serviceLocator } from '@crawlee/core'; +import { assertJsonSerializable, chunkBySize, Dataset, KeyValueStore, serviceLocator } from '@crawlee/core'; import type { Dictionary } from '@crawlee/utils'; import { MemoryStorageEmulator } from '../../shared/MemoryStorageEmulator.js'; @@ -378,21 +378,21 @@ describe('dataset', () => { }); describe('utils', () => { - test('isJsonSerializable() works', () => { + test('assertJsonSerializable() works', () => { // Valid objects - expect(() => isJsonSerializable({})).not.toThrow(); - expect(() => isJsonSerializable({ foo: 'bar' })).not.toThrow(); - expect(() => isJsonSerializable({ foo: 'bar' }, 1)).not.toThrow(); + expect(() => assertJsonSerializable({})).not.toThrow(); + expect(() => assertJsonSerializable({ foo: 'bar' })).not.toThrow(); + expect(() => assertJsonSerializable({ foo: 'bar' }, 1)).not.toThrow(); // Circular reference const bad = {} as Dictionary; bad.bad = bad; - expect(() => isJsonSerializable(bad)).toThrow('not serializable to JSON'); + expect(() => assertJsonSerializable(bad)).toThrow('not serializable to JSON'); // Non-objects - expect(() => isJsonSerializable('hello')).toThrow('not an object'); - expect(() => isJsonSerializable([])).toThrow('not an object'); - expect(() => isJsonSerializable(['a', 'b'])).toThrow('not an object'); + expect(() => assertJsonSerializable('hello')).toThrow('not an object'); + expect(() => assertJsonSerializable([])).toThrow('not an object'); + expect(() => assertJsonSerializable(['a', 'b'])).toThrow('not an object'); // With index in error message - expect(() => isJsonSerializable('hello', 3)).toThrow('at index 3'); + expect(() => assertJsonSerializable('hello', 3)).toThrow('at index 3'); }); 
test('chunkBySize', () => { const obj = { foo: 'bar' }; From 1786bcd9d4b3d6f8d4781ac057e55d659645536d Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 13:12:31 +0200 Subject: [PATCH 03/10] Improve dual iterator semantics --- packages/core/src/storages/dataset.ts | 85 +++++++++--- packages/core/src/storages/key_value_store.ts | 121 ++++++++++++++---- packages/core/src/storages/utils.ts | 44 +++---- .../src/resource-clients/dataset.ts | 36 ------ .../src/resource-clients/key-value-store.ts | 25 ++-- .../test/async-iteration.test.ts | 104 +++++++-------- packages/types/src/storages.ts | 13 +- test/core/storages/key_value_store.test.ts | 21 ++- 8 files changed, 264 insertions(+), 185 deletions(-) diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 89a834941ca1..467836ec4f3a 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -1,4 +1,4 @@ -import type { DatasetClient, DatasetInfo, Dictionary } from '@crawlee/types'; +import type { DatasetClient, DatasetInfo, Dictionary, PaginatedList } from '@crawlee/types'; import { stringify } from 'csv-stringify/sync'; import ow from 'ow'; @@ -11,7 +11,7 @@ import { KeyValueStore } from './key_value_store.js'; import type { StorageIdentifier } from './storage_instance_manager.js'; import type { StorageOpenOptions } from './utils.js'; import { resolveStorageIdentifier } from './storage_instance_manager.js'; -import { dualAsyncIterable, purgeDefaultStorages } from './utils.js'; +import { createDualIterable, purgeDefaultStorages } from './utils.js'; /** @internal */ export const DATASET_ITERATORS_DEFAULT_LIMIT = 10000; @@ -570,28 +570,61 @@ export class Dataset { } /** - * Iterates over dataset items using an async generator, - * allowing the use of `for await...of` syntax. + * Returns dataset items. + * + * When awaited (`await dataset.values()`), returns the first page as a {@apilink PaginatedList}. 
+ * When used as an async iterable (`for await...of`), streams all items across pages. * * **Example usage:** * ```javascript * const dataset = await Dataset.open('my-results'); + * + * // Stream all items * for await (const item of dataset.values()) { * console.log(item); * } + * + * // Or fetch a single page + * const page = await dataset.values(); + * console.log(page.items, page.total); * ``` * * @param options Options for the iteration. */ - values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise { + values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise> { checkStorageAccess(); - return dualAsyncIterable(this.client.iterateItems(options)); + const client = this.client; + const firstPage = client.getData(options); + + async function* iterateAll(): AsyncGenerator { + let offset = options.offset ?? 0; + const totalLimit = options.limit; + const pageSize = DATASET_ITERATORS_DEFAULT_LIMIT; + let yielded = 0; + + while (true) { + const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize; + if (fetchLimit <= 0) break; + + const page = await client.getData({ ...options, offset, limit: fetchLimit }); + for (const item of page.items) { + yield item; + yielded++; + } + if (page.items.length < fetchLimit || offset + page.items.length >= page.total) break; + offset += page.items.length; + } + } + + return createDualIterable(firstPage, iterateAll()); } /** - * Iterates over dataset entries (index-value pairs) using an async generator, - * allowing the use of `for await...of` syntax. + * Returns dataset entries (index-value pairs). + * + * When awaited, returns the first page as a {@apilink PaginatedList} of `[index, item]` tuples. + * When used as an async iterable (`for await...of`), streams all entries across pages. * * **Example usage:** * ```javascript @@ -603,19 +636,39 @@ export class Dataset { * * @param options Options for the iteration. 
*/ - entries(options: DatasetIteratorOptions = {}): AsyncIterable<[number, Data]> & Promise<[number, Data][]> { + entries( + options: DatasetIteratorOptions = {}, + ): AsyncIterable<[number, Data]> & Promise> { checkStorageAccess(); - const iterable = this.client.iterateItems(options); - let index = options.offset ?? 0; - - async function* enumerate(): AsyncGenerator<[number, Data]> { - for await (const item of iterable) { - yield [index++, item]; + const client = this.client; + const startOffset = options.offset ?? 0; + + const firstPage = client.getData(options).then((page) => ({ + ...page, + items: page.items.map((item, i) => [startOffset + i, item] as [number, Data]), + })); + + async function* iterateAll(): AsyncGenerator<[number, Data]> { + let offset = startOffset; + const totalLimit = options.limit; + const pageSize = DATASET_ITERATORS_DEFAULT_LIMIT; + let yielded = 0; + + while (true) { + const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize; + if (fetchLimit <= 0) break; + + const page = await client.getData({ ...options, offset, limit: fetchLimit }); + for (const item of page.items) { + yield [offset++, item]; + yielded++; + } + if (page.items.length < fetchLimit || offset >= page.total) break; } } - return dualAsyncIterable(enumerate()); + return createDualIterable(firstPage, iterateAll()); } /** diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 8c210900c059..ed6cea9f9934 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -15,7 +15,10 @@ import { checkStorageAccess } from './access_checking.js'; import type { StorageIdentifier } from './storage_instance_manager.js'; import type { StorageOpenOptions } from './utils.js'; import { resolveStorageIdentifier } from './storage_instance_manager.js'; -import { dualAsyncIterable, purgeDefaultStorages } from './utils.js'; +import { 
createDualIterable, purgeDefaultStorages } from './utils.js'; + +/** @internal */ +const KVS_KEYS_DEFAULT_LIMIT = 1000; /** * Helper function to possibly stringify value if options.contentType is not set. @@ -455,43 +458,80 @@ export class KeyValueStore { ); let index = 0; - for await (const item of this.client.iterateKeys(options)) { - await iteratee(item.key, index++, { size: item.size }); + let exclusiveStartKey: string | undefined; + const limit = KVS_KEYS_DEFAULT_LIMIT; + + while (true) { + const items = await this.client.listKeys({ ...options, exclusiveStartKey, limit }); + for (const item of items) { + await iteratee(item.key, index++, { size: item.size }); + } + if (items.length < limit) break; + exclusiveStartKey = items[items.length - 1].key; } } /** - * Iterates over key-value store keys using an async generator, - * allowing the use of `for await...of` syntax. + * Returns key-value store keys. + * + * When awaited (`await store.keys()`), returns the first page of keys as `string[]`. + * When used as an async iterable (`for await...of`), streams all keys across pages. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); + * + * // Stream all keys * for await (const key of keyValueStore.keys()) { * console.log(key); * } + * + * // Or fetch first page + * const firstPageKeys = await keyValueStore.keys(); * ``` * * @param options Options for the iteration. 
*/ - async *keys(options: KeyValueStoreIteratorOptions = {}): AsyncGenerator { + keys(options: KeyValueStoreIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); - for await (const item of this.client.iterateKeys(options)) { - yield item.key; + const client = this.client; + const firstPage = client.listKeys(options).then((items) => items.map((item) => item.key)); + + async function* iterateAll(): AsyncGenerator { + let exclusiveStartKey: string | undefined; + const limit = KVS_KEYS_DEFAULT_LIMIT; + + while (true) { + const items = await client.listKeys({ ...options, exclusiveStartKey, limit }); + for (const item of items) { + yield item.key; + } + if (items.length < limit) break; + exclusiveStartKey = items[items.length - 1].key; + } } + + return createDualIterable(firstPage, iterateAll()); } /** - * Iterates over key-value store values using an async generator, - * allowing the use of `for await...of` syntax. + * Returns key-value store values. + * + * When awaited (`await store.values()`), returns the first page of values as `T[]`. + * When used as an async iterable (`for await...of`), streams all values across pages. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); + * + * // Stream all values * for await (const value of keyValueStore.values()) { * console.log(value); * } + * + * // Or fetch first page + * const firstPageValues = await keyValueStore.values(); * ``` * * @param options Options for the iteration. 
@@ -501,28 +541,50 @@ export class KeyValueStore { const client = this.client; - async function* iterate(): AsyncGenerator { - for await (const item of client.iterateKeys(options)) { + const firstPage = client.listKeys(options).then(async (items) => { + const results: T[] = []; + for (const item of items) { const record = await client.getValue(item.key); - if (record) { - yield record.value as T; + if (record) results.push(record.value as T); + } + return results; + }); + + async function* iterateAll(): AsyncGenerator { + let exclusiveStartKey: string | undefined; + const limit = KVS_KEYS_DEFAULT_LIMIT; + + while (true) { + const items = await client.listKeys({ ...options, exclusiveStartKey, limit }); + for (const item of items) { + const record = await client.getValue(item.key); + if (record) yield record.value as T; } + if (items.length < limit) break; + exclusiveStartKey = items[items.length - 1].key; } } - return dualAsyncIterable(iterate()); + return createDualIterable(firstPage, iterateAll()); } /** - * Iterates over key-value store entries (key-value pairs) using an async generator, - * allowing the use of `for await...of` syntax. + * Returns key-value store entries (key-value pairs). + * + * When awaited (`await store.entries()`), returns the first page of entries as `[key, value][]`. + * When used as an async iterable (`for await...of`), streams all entries across pages. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); + * + * // Stream all entries * for await (const [key, value] of keyValueStore.entries()) { * console.log(`${key}: ${value}`); * } + * + * // Or fetch first page + * const firstPageEntries = await keyValueStore.entries(); * ``` * * @param options Options for the iteration. 
@@ -534,16 +596,31 @@ export class KeyValueStore { const client = this.client; - async function* iterate(): AsyncGenerator<[string, T]> { - for await (const item of client.iterateKeys(options)) { + const firstPage = client.listKeys(options).then(async (items) => { + const results: [string, T][] = []; + for (const item of items) { const record = await client.getValue(item.key); - if (record) { - yield [item.key, record.value as T]; + if (record) results.push([item.key, record.value as T]); + } + return results; + }); + + async function* iterateAll(): AsyncGenerator<[string, T]> { + let exclusiveStartKey: string | undefined; + const limit = KVS_KEYS_DEFAULT_LIMIT; + + while (true) { + const items = await client.listKeys({ ...options, exclusiveStartKey, limit }); + for (const item of items) { + const record = await client.getValue(item.key); + if (record) yield [item.key, record.value as T]; } + if (items.length < limit) break; + exclusiveStartKey = items[items.length - 1].key; } } - return dualAsyncIterable(iterate()); + return createDualIterable(firstPage, iterateAll()); } /** diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index 91e368ee63a2..bedc50415a4d 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -142,46 +142,38 @@ export const API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000; export const MAX_QUERIES_FOR_CONSISTENCY = 6; /** - * Wraps an `AsyncIterable` so that it can be used both as an async iterable - * (`for await...of`) **and** as a `Promise` (`await`). + * Creates an object that is both an `AsyncIterable` (for `for await...of`) + * and a `Promise` (for `await`). * - * This is a convenience for callers: they can choose between streaming one-by-one - * or collecting everything into an array. + * - `await result` resolves to `firstPage` (a single page / bounded fetch). + * - `for await (const item of result)` streams items from `allItems`. 
* * @internal */ -export function dualAsyncIterable(source: AsyncIterable): AsyncIterable & Promise { - // Lazily collect all items when the result is awaited as a Promise. - const collectAll = async (): Promise => { - const items: T[] = []; - for await (const item of source) { - items.push(item); - } - return items; - }; - - // We attach `then` / `catch` so that `await result` works, - // and `Symbol.asyncIterator` so that `for await (const x of result)` works. +export function createDualIterable( + firstPage: Promise, + allItems: AsyncIterable, +): AsyncIterable & Promise { const result = { [Symbol.asyncIterator]() { - return source[Symbol.asyncIterator](); + return allItems[Symbol.asyncIterator](); }, - then( - onfulfilled?: ((value: T[]) => TResult1 | PromiseLike) | null, + then( + onfulfilled?: ((value: TPage) => TResult1 | PromiseLike) | null, onrejected?: ((reason: any) => TResult2 | PromiseLike) | null, ): Promise { - return collectAll().then(onfulfilled, onrejected); + return firstPage.then(onfulfilled, onrejected); }, catch( onrejected?: ((reason: any) => TResult | PromiseLike) | null, - ): Promise { - return collectAll().catch(onrejected); + ): Promise { + return firstPage.catch(onrejected); }, - finally(onfinally?: (() => void) | null): Promise { - return collectAll().finally(onfinally); + finally(onfinally?: (() => void) | null): Promise { + return firstPage.finally(onfinally); }, - [Symbol.toStringTag]: 'DualAsyncIterable', - } as AsyncIterable & Promise; + [Symbol.toStringTag]: 'DualIterable', + } as AsyncIterable & Promise; return result; } diff --git a/packages/memory-storage/src/resource-clients/dataset.ts b/packages/memory-storage/src/resource-clients/dataset.ts index 99ded54f52a9..7e911fca0040 100644 --- a/packages/memory-storage/src/resource-clients/dataset.ts +++ b/packages/memory-storage/src/resource-clients/dataset.ts @@ -104,42 +104,6 @@ export class DatasetClient }); } - async *iterateItems(options: storage.DatasetClientListOptions = {}): 
AsyncIterable { - const { - desc, - limit, - offset: startOffset, - } = s - .object({ - desc: s.boolean().optional(), - limit: s.number().int().optional(), - offset: s.number().int().optional(), - }) - .parse(options); - - let offset = startOffset ?? 0; - let yielded = 0; - const pageSize = 1000; - - while (true) { - const pageLimit = limit !== undefined ? Math.min(pageSize, limit - yielded) : pageSize; - if (pageLimit <= 0) break; - - const page = await this.getDataPage({ desc, offset, limit: pageLimit }); - - for (const item of page.items) { - yield item; - yielded++; - } - - if (page.items.length < pageLimit || (limit !== undefined && yielded >= limit)) { - break; - } - - offset += page.items.length; - } - } - private async getDataPage(options: storage.DatasetClientListOptions = {}): Promise> { const { limit = LIST_ITEMS_LIMIT, offset = 0, desc } = options; diff --git a/packages/memory-storage/src/resource-clients/key-value-store.ts b/packages/memory-storage/src/resource-clients/key-value-store.ts index f0cd4558de58..918bc4c1f9e2 100644 --- a/packages/memory-storage/src/resource-clients/key-value-store.ts +++ b/packages/memory-storage/src/resource-clients/key-value-store.ts @@ -75,12 +75,12 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS this.updateTimestamps(true); } - async *iterateKeys( - options: storage.KeyValueStoreIterateKeysOptions = {}, - ): AsyncIterable { - const { prefix } = s + async listKeys(options: storage.KeyValueStoreListKeysOptions = {}): Promise { + const { prefix, exclusiveStartKey, limit } = s .object({ prefix: s.string().optional(), + exclusiveStartKey: s.string().optional(), + limit: s.number().int().greaterThan(0).optional(), }) .parse(options); @@ -99,13 +99,22 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS // Lexically sort to emulate API. 
items.sort((a, b) => a.key.localeCompare(b.key)); - const filteredItems = items.filter((item) => !prefix || item.key.startsWith(prefix)); + let filteredItems = items.filter((item) => !prefix || item.key.startsWith(prefix)); - this.updateTimestamps(false); + if (exclusiveStartKey) { + const keyPos = filteredItems.findIndex((item) => item.key === exclusiveStartKey); + if (keyPos !== -1) { + filteredItems = filteredItems.slice(keyPos + 1); + } + } - for (const item of filteredItems) { - yield item; + if (limit !== undefined) { + filteredItems = filteredItems.slice(0, limit); } + + this.updateTimestamps(false); + + return filteredItems; } /** diff --git a/packages/memory-storage/test/async-iteration.test.ts b/packages/memory-storage/test/async-iteration.test.ts index a1acab02bf2f..8fc6f96f667a 100644 --- a/packages/memory-storage/test/async-iteration.test.ts +++ b/packages/memory-storage/test/async-iteration.test.ts @@ -15,7 +15,7 @@ describe('Async iteration support', () => { await rm(localDataDirectory, { force: true, recursive: true }); }); - describe('Dataset.getData / iterateItems', () => { + describe('Dataset.getData', () => { const elements = Array.from({ length: 25 }, (_, i) => ({ index: i })); let dataset: DatasetClient<{ index: number }>; @@ -35,63 +35,36 @@ describe('Async iteration support', () => { expect(result.items).toStrictEqual(elements.slice(0, 10)); }); - test('can be used with for await...of to iterate all items', async () => { - const items: { index: number }[] = []; - - for await (const item of dataset.iterateItems()) { - items.push(item); - } - - expect(items).toHaveLength(25); - expect(items).toStrictEqual(elements); - }); - - test('respects limit option when iterating', async () => { - const items: { index: number }[] = []; - - for await (const item of dataset.iterateItems({ limit: 10 })) { - items.push(item); - } + test('respects limit option', async () => { + const result = await dataset.getData({ limit: 10 }); - 
expect(items).toHaveLength(10); - expect(items).toStrictEqual(elements.slice(0, 10)); + expect(result.items).toHaveLength(10); + expect(result.items).toStrictEqual(elements.slice(0, 10)); }); - test('respects offset option when iterating', async () => { - const items: { index: number }[] = []; - - for await (const item of dataset.iterateItems({ offset: 5 })) { - items.push(item); - } + test('respects offset option', async () => { + const result = await dataset.getData({ offset: 5 }); - expect(items).toHaveLength(20); - expect(items).toStrictEqual(elements.slice(5)); + expect(result.items).toHaveLength(20); + expect(result.items).toStrictEqual(elements.slice(5)); }); - test('respects both offset and limit options when iterating', async () => { - const items: { index: number }[] = []; + test('respects both offset and limit options', async () => { + const result = await dataset.getData({ offset: 5, limit: 10 }); - for await (const item of dataset.iterateItems({ offset: 5, limit: 10 })) { - items.push(item); - } - - expect(items).toHaveLength(10); - expect(items).toStrictEqual(elements.slice(5, 15)); + expect(result.items).toHaveLength(10); + expect(result.items).toStrictEqual(elements.slice(5, 15)); }); - test('respects desc option when iterating', async () => { - const items: { index: number }[] = []; + test('respects desc option', async () => { + const result = await dataset.getData({ desc: true, limit: 5 }); - for await (const item of dataset.iterateItems({ desc: true, limit: 5 })) { - items.push(item); - } - - expect(items).toHaveLength(5); - expect(items).toStrictEqual(elements.slice().reverse().slice(0, 5)); + expect(result.items).toHaveLength(5); + expect(result.items).toStrictEqual(elements.slice().reverse().slice(0, 5)); }); }); - describe('KeyValueStore.iterateKeys', () => { + describe('KeyValueStore.listKeys', () => { const keys = Array.from({ length: 25 }, (_, i) => `key-${String(i).padStart(2, '0')}`); let kvStore: KeyValueStoreClient; @@ -103,27 +76,40 
@@ describe('Async iteration support', () => { } }); - test('can be used with for await...of to iterate all keys', async () => { - const items: string[] = []; - - for await (const item of kvStore.iterateKeys()) { - items.push(item.key); - } + test('returns all keys', async () => { + const items = await kvStore.listKeys(); expect(items).toHaveLength(25); - expect(items).toStrictEqual(keys); + expect(items.map((i) => i.key)).toStrictEqual(keys); }); - test('respects prefix option when iterating', async () => { - const items: string[] = []; - + test('respects prefix option', async () => { // Only keys starting with 'key-0' (key-00 to key-09) - for await (const item of kvStore.iterateKeys({ prefix: 'key-0' })) { - items.push(item.key); - } + const items = await kvStore.listKeys({ prefix: 'key-0' }); expect(items).toHaveLength(10); - expect(items).toStrictEqual(keys.slice(0, 10)); + expect(items.map((i) => i.key)).toStrictEqual(keys.slice(0, 10)); + }); + + test('respects exclusiveStartKey option', async () => { + const items = await kvStore.listKeys({ exclusiveStartKey: 'key-09' }); + + expect(items).toHaveLength(15); + expect(items.map((i) => i.key)).toStrictEqual(keys.slice(10)); + }); + + test('respects limit option', async () => { + const items = await kvStore.listKeys({ limit: 5 }); + + expect(items).toHaveLength(5); + expect(items.map((i) => i.key)).toStrictEqual(keys.slice(0, 5)); + }); + + test('respects exclusiveStartKey and limit together', async () => { + const items = await kvStore.listKeys({ exclusiveStartKey: 'key-04', limit: 5 }); + + expect(items).toHaveLength(5); + expect(items.map((i) => i.key)).toStrictEqual(keys.slice(5, 10)); }); }); }); diff --git a/packages/types/src/storages.ts b/packages/types/src/storages.ts index 7ce0f8f853b7..b992a5817dd2 100644 --- a/packages/types/src/storages.ts +++ b/packages/types/src/storages.ts @@ -74,9 +74,6 @@ export interface DatasetClient { /** Fetch a page of items from the dataset. 
*/ getData(options?: DatasetClientListOptions): Promise>; - - /** Iterate over all items in the dataset. */ - iterateItems(options?: DatasetClientListOptions): AsyncIterable; } export interface KeyValueStoreStats { @@ -105,9 +102,13 @@ export interface KeyValueStoreRecord { contentType?: string; } -export interface KeyValueStoreIterateKeysOptions { +export interface KeyValueStoreListKeysOptions { /** If set, only keys that start with this prefix are returned. */ prefix?: string; + /** All keys up to this one are skipped from the result. */ + exclusiveStartKey?: string; + /** Maximum number of keys to return. */ + limit?: number; } export interface KeyValueStoreItemData { @@ -143,8 +144,8 @@ export interface KeyValueStoreClient { /** Delete a record by key. */ deleteValue(key: string): Promise; - /** Iterate over all keys in the store. */ - iterateKeys(options?: KeyValueStoreIterateKeysOptions): AsyncIterable; + /** List keys in the store. Returns at most `limit` keys starting after `exclusiveStartKey`. */ + listKeys(options?: KeyValueStoreListKeysOptions): Promise; /** Get the public URL for a record, or `undefined` if unavailable. 
*/ getPublicUrl(key: string): Promise; diff --git a/test/core/storages/key_value_store.test.ts b/test/core/storages/key_value_store.test.ts index 1de8a5b0d5ac..7bff2492b81f 100644 --- a/test/core/storages/key_value_store.test.ts +++ b/test/core/storages/key_value_store.test.ts @@ -474,16 +474,14 @@ describe('KeyValueStore', () => { const store = await createKeyValueStore('my-store-id-1'); // @ts-expect-error Accessing private property - const mockIterateKeys = vitest.spyOn(store.client, 'iterateKeys'); - mockIterateKeys.mockReturnValueOnce( - (async function* () { - yield { key: 'key1', size: 1 }; - yield { key: 'key2', size: 2 }; - yield { key: 'key3', size: 3 }; - yield { key: 'key4', size: 4 }; - yield { key: 'key5', size: 5 }; - })(), - ); + const mockListKeys = vitest.spyOn(store.client, 'listKeys'); + mockListKeys.mockResolvedValueOnce([ + { key: 'key1', size: 1 }, + { key: 'key2', size: 2 }, + { key: 'key3', size: 3 }, + { key: 'key4', size: 4 }, + { key: 'key5', size: 5 }, + ]); const results: [string, number, { size: number }][] = []; await store.forEachKey( @@ -493,8 +491,7 @@ describe('KeyValueStore', () => { { prefix: 'img/' }, ); - expect(mockIterateKeys).toHaveBeenCalledTimes(1); - expect(mockIterateKeys).toHaveBeenCalledWith({ prefix: 'img/' }); + expect(mockListKeys).toHaveBeenCalledTimes(1); expect(results).toHaveLength(5); results.forEach((r, i) => { From b999174aa4d127b1297171018a28a042cd0136a7 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 14:56:43 +0200 Subject: [PATCH 04/10] Pass limit to first-page call in KVS --- packages/core/src/storages/key_value_store.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index ed6cea9f9934..005cf213ec77 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -496,7 +496,9 @@ export class KeyValueStore { 
checkStorageAccess(); const client = this.client; - const firstPage = client.listKeys(options).then((items) => items.map((item) => item.key)); + const firstPage = client + .listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }) + .then((items) => items.map((item) => item.key)); async function* iterateAll(): AsyncGenerator { let exclusiveStartKey: string | undefined; @@ -541,7 +543,7 @@ export class KeyValueStore { const client = this.client; - const firstPage = client.listKeys(options).then(async (items) => { + const firstPage = client.listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }).then(async (items) => { const results: T[] = []; for (const item of items) { const record = await client.getValue(item.key); @@ -596,7 +598,7 @@ export class KeyValueStore { const client = this.client; - const firstPage = client.listKeys(options).then(async (items) => { + const firstPage = client.listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }).then(async (items) => { const results: [string, T][] = []; for (const item of items) { const record = await client.getValue(item.key); From 07d67930734ffa215f8868a15b91d752c26d404b Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 15:30:18 +0200 Subject: [PATCH 05/10] Remove obsolete options --- packages/core/src/storages/key_value_store.ts | 21 +--------------- test/core/storages/key_value_store.test.ts | 24 ------------------- 2 files changed, 1 insertion(+), 44 deletions(-) diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 005cf213ec77..5e064bf20478 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -276,19 +276,12 @@ export class KeyValueStore { return; } - // use half the interval of `persistState` to avoid race conditions - const persistStateIntervalMillis = this.config.persistStateIntervalMillis; - const timeoutSecs = persistStateIntervalMillis / 2_000; - 
serviceLocator.getEventManager().on('persistState', async () => { const promises: Promise[] = []; for (const [key, value] of this.cache) { promises.push( - this.setValue(key, value, { - timeoutSecs, - doNotRetryTimeouts: true, - }).catch((error) => + this.setValue(key, value).catch((error) => serviceLocator.getLogger().warning(`Failed to persist the state value to ${key}`, { error }), ), ); @@ -370,8 +363,6 @@ export class KeyValueStore { options, ow.object.exactShape({ contentType: ow.optional.string.nonEmpty, - timeoutSecs: ow.optional.number, - doNotRetryTimeouts: ow.optional.boolean, }), ); @@ -912,16 +903,6 @@ export interface RecordOptions { * Specifies a custom MIME content type of the record. */ contentType?: string; - - /** - * Specifies a custom timeout for the `set-record` API call, in seconds. - */ - timeoutSecs?: number; - - /** - * If set to `true`, the `set-record` API call will not be retried if it times out. - */ - doNotRetryTimeouts?: boolean; } export interface KeyValueStoreIteratorOptions { diff --git a/test/core/storages/key_value_store.test.ts b/test/core/storages/key_value_store.test.ts index 7bff2492b81f..2e22ded8cd69 100644 --- a/test/core/storages/key_value_store.test.ts +++ b/test/core/storages/key_value_store.test.ts @@ -294,30 +294,6 @@ describe('KeyValueStore', () => { }); }); - test('correctly passes timeout options', async () => { - const store = await createKeyValueStore('my-store-id-1'); - - const record = { foo: 'bar' }; - const recordStr = JSON.stringify(record, null, 2); - - const mockSetValue = vitest - // @ts-expect-error Accessing private property - .spyOn(store.client, 'setValue') - .mockResolvedValueOnce(undefined); - - await store.setValue('key-1', record, { - timeoutSecs: 1, - doNotRetryTimeouts: true, - }); - - expect(mockSetValue).toHaveBeenCalledTimes(1); - expect(mockSetValue).toHaveBeenCalledWith({ - key: 'key-1', - value: recordStr, - contentType: 'application/json; charset=utf-8', - }); - }); - test('correctly 
passes raw string values', async () => { const store = await createKeyValueStore('my-store-id-1'); From 76c25433f199b34d90098ef872222da8b2da8f3d Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 15:40:50 +0200 Subject: [PATCH 06/10] Throw on missing exclusiveStartKey in memory storage --- .../src/resource-clients/key-value-store.ts | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/packages/memory-storage/src/resource-clients/key-value-store.ts b/packages/memory-storage/src/resource-clients/key-value-store.ts index 918bc4c1f9e2..3a83f323dcc8 100644 --- a/packages/memory-storage/src/resource-clients/key-value-store.ts +++ b/packages/memory-storage/src/resource-clients/key-value-store.ts @@ -103,9 +103,13 @@ export class KeyValueStoreClient extends BaseClient implements storage.KeyValueS if (exclusiveStartKey) { const keyPos = filteredItems.findIndex((item) => item.key === exclusiveStartKey); - if (keyPos !== -1) { - filteredItems = filteredItems.slice(keyPos + 1); + if (keyPos === -1) { + throw new Error( + `exclusiveStartKey "${exclusiveStartKey}" was not found in the key-value store. 
` + + `This is likely a bug — the key may have been deleted between paginated listKeys calls.`, + ); } + filteredItems = filteredItems.slice(keyPos + 1); } if (limit !== undefined) { From 1d6129be47f69d1bf90aadad41c538c45023fad1 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 17:31:25 +0200 Subject: [PATCH 07/10] Simplify --- packages/core/src/storages/dataset.ts | 112 +++++++--------- packages/core/src/storages/key_value_store.ts | 121 ++++++------------ packages/core/src/storages/utils.ts | 66 +++++++--- 3 files changed, 135 insertions(+), 164 deletions(-) diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 467836ec4f3a..0ccc547bd4d5 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -295,22 +295,9 @@ export class Dataset { const items: Data[] = []; - const fetchNextChunk = async (offset = 0): Promise => { - const limit = 1000; - const value = await this.client.getData({ offset, limit, ...options }); - - if (value.count === 0) { - return; - } - - items.push(...value.items); - - if (value.total > offset + value.count) { - await fetchNextChunk(offset + value.count); - } - }; - - await fetchNextChunk(); + for await (const page of this.fetchPages(options)) { + items.push(...page.items); + } return items; } @@ -569,6 +556,37 @@ export class Dataset { return currentMemo; } + private async *fetchEntryPages(options: DatasetIteratorOptions): AsyncGenerator> { + let index = options.offset ?? 0; + for await (const page of this.fetchPages(options)) { + yield { + ...page, + items: page.items.map((item) => [index++, item] as [number, Data]), + }; + } + } + + private async *fetchPages( + options: DatasetIteratorOptions, + pageSize = DATASET_ITERATORS_DEFAULT_LIMIT, + ): AsyncGenerator> { + let offset = options.offset ?? 0; + const totalLimit = options.limit; + let yielded = 0; + + while (true) { + const fetchLimit = totalLimit !== undefined ? 
Math.min(pageSize, totalLimit - yielded) : pageSize; + if (fetchLimit <= 0) break; + + const page = await this.client.getData({ ...options, offset, limit: fetchLimit }); + yield page; + + yielded += page.items.length; + if (page.items.length < fetchLimit || offset + page.items.length >= page.total) break; + offset += page.items.length; + } + } + /** * Returns dataset items. * @@ -594,30 +612,11 @@ export class Dataset { values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise> { checkStorageAccess(); - const client = this.client; - const firstPage = client.getData(options); - - async function* iterateAll(): AsyncGenerator { - let offset = options.offset ?? 0; - const totalLimit = options.limit; - const pageSize = DATASET_ITERATORS_DEFAULT_LIMIT; - let yielded = 0; - - while (true) { - const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize; - if (fetchLimit <= 0) break; - - const page = await client.getData({ ...options, offset, limit: fetchLimit }); - for (const item of page.items) { - yield item; - yielded++; - } - if (page.items.length < fetchLimit || offset + page.items.length >= page.total) break; - offset += page.items.length; - } - } - - return createDualIterable(firstPage, iterateAll()); + return createDualIterable({ + createPages: () => this.fetchPages(options), + extractItems: (page) => page.items, + mapFirstPage: (page) => page, + }); } /** @@ -641,34 +640,11 @@ export class Dataset { ): AsyncIterable<[number, Data]> & Promise> { checkStorageAccess(); - const client = this.client; - const startOffset = options.offset ?? 
0; - - const firstPage = client.getData(options).then((page) => ({ - ...page, - items: page.items.map((item, i) => [startOffset + i, item] as [number, Data]), - })); - - async function* iterateAll(): AsyncGenerator<[number, Data]> { - let offset = startOffset; - const totalLimit = options.limit; - const pageSize = DATASET_ITERATORS_DEFAULT_LIMIT; - let yielded = 0; - - while (true) { - const fetchLimit = totalLimit !== undefined ? Math.min(pageSize, totalLimit - yielded) : pageSize; - if (fetchLimit <= 0) break; - - const page = await client.getData({ ...options, offset, limit: fetchLimit }); - for (const item of page.items) { - yield [offset++, item]; - yielded++; - } - if (page.items.length < fetchLimit || offset >= page.total) break; - } - } - - return createDualIterable(firstPage, iterateAll()); + return createDualIterable({ + createPages: () => this.fetchEntryPages(options), + extractItems: (page) => page.items, + mapFirstPage: (page) => page, + }); } /** diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 5e064bf20478..2c64acc7284c 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -1,7 +1,7 @@ import { readFile } from 'node:fs/promises'; import { join } from 'node:path'; -import type { Dictionary, KeyValueStoreClient } from '@crawlee/types'; +import type { Dictionary, KeyValueStoreClient, KeyValueStoreItemData } from '@crawlee/types'; import JSON5 from 'json5'; import ow, { ArgumentError } from 'ow'; @@ -293,6 +293,34 @@ export class KeyValueStore { this.persistStateEventStarted = true; } + private async *fetchKeyValuePages( + options: KeyValueStoreIteratorOptions, + mapRecord: (key: string, value: unknown) => T, + ): AsyncGenerator { + for await (const page of this.fetchKeyPages(options)) { + const results: T[] = []; + for (const item of page) { + const record = await this.client.getValue(item.key); + if (record) 
results.push(mapRecord(item.key, record.value)); + } + yield results; + } + } + + private async *fetchKeyPages( + options: KeyValueStoreIteratorOptions, + limit = KVS_KEYS_DEFAULT_LIMIT, + ): AsyncGenerator { + let exclusiveStartKey: string | undefined; + + while (true) { + const items = await this.client.listKeys({ ...options, exclusiveStartKey, limit }); + yield items; + if (items.length < limit) break; + exclusiveStartKey = items[items.length - 1].key; + } + } + /** * Saves or deletes a record in the key-value store. * The function returns a promise that resolves once the record has been saved or deleted. @@ -449,16 +477,11 @@ export class KeyValueStore { ); let index = 0; - let exclusiveStartKey: string | undefined; - const limit = KVS_KEYS_DEFAULT_LIMIT; - while (true) { - const items = await this.client.listKeys({ ...options, exclusiveStartKey, limit }); - for (const item of items) { + for await (const page of this.fetchKeyPages(options)) { + for (const item of page) { await iteratee(item.key, index++, { size: item.size }); } - if (items.length < limit) break; - exclusiveStartKey = items[items.length - 1].key; } } @@ -486,26 +509,10 @@ export class KeyValueStore { keys(options: KeyValueStoreIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); - const client = this.client; - const firstPage = client - .listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }) - .then((items) => items.map((item) => item.key)); - - async function* iterateAll(): AsyncGenerator { - let exclusiveStartKey: string | undefined; - const limit = KVS_KEYS_DEFAULT_LIMIT; - - while (true) { - const items = await client.listKeys({ ...options, exclusiveStartKey, limit }); - for (const item of items) { - yield item.key; - } - if (items.length < limit) break; - exclusiveStartKey = items[items.length - 1].key; - } - } - - return createDualIterable(firstPage, iterateAll()); + return createDualIterable({ + createPages: () => this.fetchKeyPages(options), + extractItems: (page) => 
page.map((item) => item.key), + }); } /** @@ -532,33 +539,10 @@ export class KeyValueStore { values(options: KeyValueStoreIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); - const client = this.client; - - const firstPage = client.listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }).then(async (items) => { - const results: T[] = []; - for (const item of items) { - const record = await client.getValue(item.key); - if (record) results.push(record.value as T); - } - return results; + return createDualIterable({ + createPages: () => this.fetchKeyValuePages(options, (_key, value) => value as T), + extractItems: (page) => page, }); - - async function* iterateAll(): AsyncGenerator { - let exclusiveStartKey: string | undefined; - const limit = KVS_KEYS_DEFAULT_LIMIT; - - while (true) { - const items = await client.listKeys({ ...options, exclusiveStartKey, limit }); - for (const item of items) { - const record = await client.getValue(item.key); - if (record) yield record.value as T; - } - if (items.length < limit) break; - exclusiveStartKey = items[items.length - 1].key; - } - } - - return createDualIterable(firstPage, iterateAll()); } /** @@ -587,33 +571,10 @@ export class KeyValueStore { ): AsyncIterable<[string, T]> & Promise<[string, T][]> { checkStorageAccess(); - const client = this.client; - - const firstPage = client.listKeys({ ...options, limit: KVS_KEYS_DEFAULT_LIMIT }).then(async (items) => { - const results: [string, T][] = []; - for (const item of items) { - const record = await client.getValue(item.key); - if (record) results.push([item.key, record.value as T]); - } - return results; + return createDualIterable({ + createPages: () => this.fetchKeyValuePages<[string, T]>(options, (key, value) => [key, value as T]), + extractItems: (page) => page, }); - - async function* iterateAll(): AsyncGenerator<[string, T]> { - let exclusiveStartKey: string | undefined; - const limit = KVS_KEYS_DEFAULT_LIMIT; - - while (true) { - const items = await 
client.listKeys({ ...options, exclusiveStartKey, limit }); - for (const item of items) { - const record = await client.getValue(item.key); - if (record) yield [item.key, record.value as T]; - } - if (items.length < limit) break; - exclusiveStartKey = items[items.length - 1].key; - } - } - - return createDualIterable(firstPage, iterateAll()); } /** diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index bedc50415a4d..fd6dbe4bf98d 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -141,39 +141,73 @@ export const API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000; */ export const MAX_QUERIES_FOR_CONSISTENCY = 6; +/** @internal */ +export interface DualIterableOptions { + /** Factory that returns an async generator yielding pages. */ + createPages: () => AsyncGenerator; + /** Extracts individual items from a page (for iteration). */ + extractItems: (page: TRawPage) => TItem[]; + /** Transforms the first page into the await result. Defaults to `extractItems`. */ + mapFirstPage?: (page: TRawPage) => TAwaitResult; +} + /** * Creates an object that is both an `AsyncIterable` (for `for await...of`) - * and a `Promise` (for `await`). + * and a `Promise` (for `await`) from a single async page generator. + * + * - `await result` consumes only the first page from a fresh generator and + * transforms it via `mapFirstPage`. + * - `for await (const item of result)` streams all items across all pages, + * extracting items from each page via `getItems`. * - * - `await result` resolves to `firstPage` (a single page / bounded fetch). - * - `for await (const item of result)` streams items from `allItems`. + * Each usage path creates its own generator instance, so `await` and + * `for await...of` never interfere with each other. 
* * @internal */ -export function createDualIterable( - firstPage: Promise, - allItems: AsyncIterable, -): AsyncIterable & Promise { +export function createDualIterable( + options: DualIterableOptions, +): AsyncIterable & Promise { + const { createPages, extractItems } = options; + const resolveFirstPage = + options.mapFirstPage ?? ((page: TRawPage) => extractItems(page) as unknown as TAwaitResult); + let cached: Promise | null = null; + + function getOrCreate(): Promise { + if (!cached) { + cached = createPages() + .next() + .then((result) => resolveFirstPage(result.value)); + } + return cached; + } + + async function* iterateAll(): AsyncGenerator { + for await (const page of createPages()) { + yield* extractItems(page); + } + } + const result = { [Symbol.asyncIterator]() { - return allItems[Symbol.asyncIterator](); + return iterateAll(); }, - then( - onfulfilled?: ((value: TPage) => TResult1 | PromiseLike) | null, + then( + onfulfilled?: ((value: TAwaitResult) => TResult1 | PromiseLike) | null, onrejected?: ((reason: any) => TResult2 | PromiseLike) | null, ): Promise { - return firstPage.then(onfulfilled, onrejected); + return getOrCreate().then(onfulfilled, onrejected); }, catch( onrejected?: ((reason: any) => TResult | PromiseLike) | null, - ): Promise { - return firstPage.catch(onrejected); + ): Promise { + return getOrCreate().catch(onrejected); }, - finally(onfinally?: (() => void) | null): Promise { - return firstPage.finally(onfinally); + finally(onfinally?: (() => void) | null): Promise { + return getOrCreate().finally(onfinally); }, [Symbol.toStringTag]: 'DualIterable', - } as AsyncIterable & Promise; + } as AsyncIterable & Promise; return result; } From 9469b2e5735d9a656e95a3856951e4484dac5de6 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 17:50:43 +0200 Subject: [PATCH 08/10] Remove timeout from state persistence callbacks --- packages/core/src/crawlers/statistics.ts | 8 +------- packages/core/src/session_pool/session_pool.ts | 8 
+------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/packages/core/src/crawlers/statistics.ts b/packages/core/src/crawlers/statistics.ts index 29aaae3f537c..8056c23444e6 100644 --- a/packages/core/src/crawlers/statistics.ts +++ b/packages/core/src/crawlers/statistics.ts @@ -330,14 +330,8 @@ export class Statistics { this.log.debug('Persisting state', { persistStateKey: this.persistStateKey }); - // use half the interval of `persistState` to avoid race conditions - const persistStateIntervalMillis = serviceLocator.getConfiguration().persistStateIntervalMillis; - const timeoutSecs = persistStateIntervalMillis / 2_000; await this.keyValueStore - .setValue(this.persistStateKey, this.toJSON(), { - timeoutSecs, - doNotRetryTimeouts: true, - }) + .setValue(this.persistStateKey, this.toJSON()) .catch((error) => this.log.warning(`Failed to persist the statistics to ${this.persistStateKey}`, { error }), ); diff --git a/packages/core/src/session_pool/session_pool.ts b/packages/core/src/session_pool/session_pool.ts index 3bad85c7405d..90656c993b83 100644 --- a/packages/core/src/session_pool/session_pool.ts +++ b/packages/core/src/session_pool/session_pool.ts @@ -389,14 +389,8 @@ export class SessionPool extends EventEmitter { persistStateKey: this.persistStateKey, }); - // use half the interval of `persistState` to avoid race conditions - const persistStateIntervalMillis = serviceLocator.getConfiguration().persistStateIntervalMillis; - const timeoutSecs = persistStateIntervalMillis / 2_000; await this.keyValueStore - ?.setValue(this.persistStateKey, await this.getState(), { - timeoutSecs, - doNotRetryTimeouts: true, - }) + ?.setValue(this.persistStateKey, await this.getState()) .catch((error) => this.log.warning(`Failed to persist the session pool stats to ${this.persistStateKey}`, { error }), ); From 10ee5a40b1b172a215beb8acfeebe379bcbd6ae9 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Wed, 13 May 2026 18:33:30 +0200 Subject: [PATCH 09/10] Update tests 
--- test/core/crawlers/statistics.test.ts | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/core/crawlers/statistics.test.ts b/test/core/crawlers/statistics.test.ts index 50790992cd96..454ce70828e1 100644 --- a/test/core/crawlers/statistics.test.ts +++ b/test/core/crawlers/statistics.test.ts @@ -189,10 +189,6 @@ describe('Statistics', () => { // @ts-expect-error Accessing private prop stats.persistStateKey, { ...state, ...rest }, - { - doNotRetryTimeouts: true, - timeoutSecs: 30, - }, ); }, 2000); }); From 6e70953cc9afc205aa42deba7f57cde5be85eef5 Mon Sep 17 00:00:00 2001 From: Jan Buchar Date: Tue, 19 May 2026 15:59:32 +0200 Subject: [PATCH 10/10] awaiting a dual iterator yields all items --- packages/core/src/storages/dataset.ts | 32 +++++----- packages/core/src/storages/key_value_store.ts | 36 ++++++----- packages/core/src/storages/utils.ts | 44 ++++++------- test/core/storages/dataset.test.ts | 62 +++++++++++++++++++ test/core/storages/key_value_store.test.ts | 58 +++++++++++++++++ 5 files changed, 181 insertions(+), 51 deletions(-) diff --git a/packages/core/src/storages/dataset.ts b/packages/core/src/storages/dataset.ts index 0ccc547bd4d5..b423aa647d1e 100644 --- a/packages/core/src/storages/dataset.ts +++ b/packages/core/src/storages/dataset.ts @@ -590,60 +590,64 @@ export class Dataset { /** * Returns dataset items. * - * When awaited (`await dataset.values()`), returns the first page as a {@apilink PaginatedList}. - * When used as an async iterable (`for await...of`), streams all items across pages. + * When awaited (`await dataset.values()`), returns all items as a flat `Data[]` array. + * When used as an async iterable (`for await...of`), streams all items across pages + * without buffering everything in memory. 
* * **Example usage:** * ```javascript * const dataset = await Dataset.open('my-results'); * - * // Stream all items + * // Stream all items (memory-efficient for large datasets) * for await (const item of dataset.values()) { * console.log(item); * } * - * // Or fetch a single page - * const page = await dataset.values(); - * console.log(page.items, page.total); + * // Or fetch all items at once + * const items = await dataset.values(); + * console.log(items); * ``` * * @param options Options for the iteration. */ - values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise> { + values(options: DatasetIteratorOptions = {}): AsyncIterable & Promise { checkStorageAccess(); return createDualIterable({ createPages: () => this.fetchPages(options), extractItems: (page) => page.items, - mapFirstPage: (page) => page, }); } /** * Returns dataset entries (index-value pairs). * - * When awaited, returns the first page as a {@apilink PaginatedList} of `[index, item]` tuples. - * When used as an async iterable (`for await...of`), streams all entries across pages. + * When awaited (`await dataset.entries()`), returns all entries as a flat `[index, item][]` array. + * When used as an async iterable (`for await...of`), streams all entries across pages + * without buffering everything in memory. * * **Example usage:** * ```javascript * const dataset = await Dataset.open('my-results'); + * + * // Stream all entries * for await (const [index, item] of dataset.entries()) { * console.log(`Item at ${index}: ${JSON.stringify(item)}`); * } + * + * // Or fetch all at once + * const entries = await dataset.entries(); + * console.log(entries); * ``` * * @param options Options for the iteration. 
*/ - entries( - options: DatasetIteratorOptions = {}, - ): AsyncIterable<[number, Data]> & Promise> { + entries(options: DatasetIteratorOptions = {}): AsyncIterable<[number, Data]> & Promise<[number, Data][]> { checkStorageAccess(); return createDualIterable({ createPages: () => this.fetchEntryPages(options), extractItems: (page) => page.items, - mapFirstPage: (page) => page, }); } diff --git a/packages/core/src/storages/key_value_store.ts b/packages/core/src/storages/key_value_store.ts index 2c64acc7284c..d4004ad34f31 100644 --- a/packages/core/src/storages/key_value_store.ts +++ b/packages/core/src/storages/key_value_store.ts @@ -488,20 +488,22 @@ export class KeyValueStore { /** * Returns key-value store keys. * - * When awaited (`await store.keys()`), returns the first page of keys as `string[]`. - * When used as an async iterable (`for await...of`), streams all keys across pages. + * When awaited (`await store.keys()`), returns all keys as a flat `string[]` array. + * When used as an async iterable (`for await...of`), streams all keys across pages + * without buffering everything in memory. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); * - * // Stream all keys + * // Stream all keys (memory-efficient for large stores) * for await (const key of keyValueStore.keys()) { * console.log(key); * } * - * // Or fetch first page - * const firstPageKeys = await keyValueStore.keys(); + * // Or fetch all keys at once + * const allKeys = await keyValueStore.keys(); + * console.log(allKeys); * ``` * * @param options Options for the iteration. @@ -518,20 +520,22 @@ export class KeyValueStore { /** * Returns key-value store values. * - * When awaited (`await store.values()`), returns the first page of values as `T[]`. - * When used as an async iterable (`for await...of`), streams all values across pages. + * When awaited (`await store.values()`), returns all values as a flat `T[]` array. 
+ * When used as an async iterable (`for await...of`), streams all values across pages + * without buffering everything in memory. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); * - * // Stream all values + * // Stream all values (memory-efficient for large stores) * for await (const value of keyValueStore.values()) { * console.log(value); * } * - * // Or fetch first page - * const firstPageValues = await keyValueStore.values(); + * // Or fetch all values at once + * const allValues = await keyValueStore.values(); + * console.log(allValues); * ``` * * @param options Options for the iteration. @@ -548,20 +552,22 @@ export class KeyValueStore { /** * Returns key-value store entries (key-value pairs). * - * When awaited (`await store.entries()`), returns the first page of entries as `[key, value][]`. - * When used as an async iterable (`for await...of`), streams all entries across pages. + * When awaited (`await store.entries()`), returns all entries as a flat `[key, value][]` array. + * When used as an async iterable (`for await...of`), streams all entries across pages + * without buffering everything in memory. * * **Example usage:** * ```javascript * const keyValueStore = await KeyValueStore.open(); * - * // Stream all entries + * // Stream all entries (memory-efficient for large stores) * for await (const [key, value] of keyValueStore.entries()) { * console.log(`${key}: ${value}`); * } * - * // Or fetch first page - * const firstPageEntries = await keyValueStore.entries(); + * // Or fetch all entries at once + * const allEntries = await keyValueStore.entries(); + * console.log(allEntries); * ``` * * @param options Options for the iteration. 
diff --git a/packages/core/src/storages/utils.ts b/packages/core/src/storages/utils.ts index fd6dbe4bf98d..b3f687d78c91 100644 --- a/packages/core/src/storages/utils.ts +++ b/packages/core/src/storages/utils.ts @@ -142,42 +142,42 @@ export const API_PROCESSED_REQUESTS_DELAY_MILLIS = 10_000; export const MAX_QUERIES_FOR_CONSISTENCY = 6; /** @internal */ -export interface DualIterableOptions { +export interface DualIterableOptions { /** Factory that returns an async generator yielding pages. */ createPages: () => AsyncGenerator; /** Extracts individual items from a page (for iteration). */ extractItems: (page: TRawPage) => TItem[]; - /** Transforms the first page into the await result. Defaults to `extractItems`. */ - mapFirstPage?: (page: TRawPage) => TAwaitResult; } /** * Creates an object that is both an `AsyncIterable` (for `for await...of`) - * and a `Promise` (for `await`) from a single async page generator. + * and a `Promise` (for `await`) from a single async page generator. * - * - `await result` consumes only the first page from a fresh generator and - * transforms it via `mapFirstPage`. + * - `await result` drains all pages from a fresh generator and returns every + * item as a flat array. * - `for await (const item of result)` streams all items across all pages, - * extracting items from each page via `getItems`. + * yielding them one by one without buffering everything in memory. * * Each usage path creates its own generator instance, so `await` and * `for await...of` never interfere with each other. * * @internal */ -export function createDualIterable( - options: DualIterableOptions, -): AsyncIterable & Promise { +export function createDualIterable( + options: DualIterableOptions, +): AsyncIterable & Promise { const { createPages, extractItems } = options; - const resolveFirstPage = - options.mapFirstPage ?? 
((page: TRawPage) => extractItems(page) as unknown as TAwaitResult); - let cached: Promise | null = null; + let cached: Promise | null = null; - function getOrCreate(): Promise { + function getOrCreate(): Promise { if (!cached) { - cached = createPages() - .next() - .then((result) => resolveFirstPage(result.value)); + cached = (async () => { + const items: TItem[] = []; + for await (const page of createPages()) { + items.push(...extractItems(page)); + } + return items; + })(); } return cached; } @@ -192,22 +192,22 @@ export function createDualIterable( [Symbol.asyncIterator]() { return iterateAll(); }, - then( - onfulfilled?: ((value: TAwaitResult) => TResult1 | PromiseLike) | null, + then( + onfulfilled?: ((value: TItem[]) => TResult1 | PromiseLike) | null, onrejected?: ((reason: any) => TResult2 | PromiseLike) | null, ): Promise { return getOrCreate().then(onfulfilled, onrejected); }, catch( onrejected?: ((reason: any) => TResult | PromiseLike) | null, - ): Promise { + ): Promise { return getOrCreate().catch(onrejected); }, - finally(onfinally?: (() => void) | null): Promise { + finally(onfinally?: (() => void) | null): Promise { return getOrCreate().finally(onfinally); }, [Symbol.toStringTag]: 'DualIterable', - } as AsyncIterable & Promise; + } as AsyncIterable & Promise; return result; } diff --git a/test/core/storages/dataset.test.ts b/test/core/storages/dataset.test.ts index 1328b40b7802..a1bb2458645d 100644 --- a/test/core/storages/dataset.test.ts +++ b/test/core/storages/dataset.test.ts @@ -646,6 +646,68 @@ describe('dataset', () => { expect(items).toEqual([]); }); + + test('await values() should return all items as a flat array', async () => { + const dataset = await Dataset.open(); + await dataset.pushData(testData); + + const items = await dataset.values(); + + expect(items).toEqual(testData); + }); + + test('await values() should respect limit', async () => { + const dataset = await Dataset.open(); + await dataset.pushData(testData); + + const items = 
await dataset.values({ limit: 2 }); + + expect(items).toHaveLength(2); + expect(items).toEqual(testData.slice(0, 2)); + }); + + test('await values() should respect offset', async () => { + const dataset = await Dataset.open(); + await dataset.pushData(testData); + + const items = await dataset.values({ offset: 1 }); + + expect(items).toHaveLength(2); + expect(items).toEqual(testData.slice(1)); + }); + + test('await entries() should return all entries as a flat array', async () => { + const dataset = await Dataset.open(); + await dataset.pushData(testData); + + const entries = await dataset.entries(); + + expect(entries).toEqual([ + [0, { id: 1, name: 'Alice' }], + [1, { id: 2, name: 'Bob' }], + [2, { id: 3, name: 'Charlie' }], + ]); + }); + + test('await entries() should respect offset', async () => { + const dataset = await Dataset.open(); + await dataset.pushData(testData); + + const entries = await dataset.entries({ offset: 1 }); + + expect(entries).toEqual([ + [1, { id: 2, name: 'Bob' }], + [2, { id: 3, name: 'Charlie' }], + ]); + }); + + test('await on empty dataset should return empty array', async () => { + const dataset = await Dataset.open(); + + const items = await dataset.values(); + + expect(items).toEqual([]); + }); }); }); }); diff --git a/test/core/storages/key_value_store.test.ts b/test/core/storages/key_value_store.test.ts index 2e22ded8cd69..41f0ed767186 100644 --- a/test/core/storages/key_value_store.test.ts +++ b/test/core/storages/key_value_store.test.ts @@ -587,5 +587,63 @@ describe('KeyValueStore', () => { ['key2', { value: 2 }], ]); }); + + test('await keys() should return all keys as a flat array', async () => { + const store = await KeyValueStore.open(); + + const testData = { + key1: { value: 1 }, + key2: { value: 2 }, + key3: { value: 3 }, + }; + + for (const [key, value] of Object.entries(testData)) { + await store.setValue(key, value); + } + + const keys = await store.keys(); + + expect(keys).toEqual(['key1', 'key2', 'key3']); + }); + 
+ test('await values() should return all values as a flat array', async () => { + const store = await KeyValueStore.open(); + + const testData = { + key1: { value: 1 }, + key2: { value: 2 }, + key3: { value: 3 }, + }; + + for (const [key, value] of Object.entries(testData)) { + await store.setValue(key, value); + } + + const values = await store.values<{ value: number }>(); + + expect(values).toEqual([{ value: 1 }, { value: 2 }, { value: 3 }]); + }); + + test('await entries() should return all entries as a flat array', async () => { + const store = await KeyValueStore.open(); + + const testData = { + key1: { value: 1 }, + key2: { value: 2 }, + key3: { value: 3 }, + }; + + for (const [key, value] of Object.entries(testData)) { + await store.setValue(key, value); + } + + const entries = await store.entries<{ value: number }>(); + + expect(entries).toEqual([ + ['key1', { value: 1 }], + ['key2', { value: 2 }], + ['key3', { value: 3 }], + ]); + }); }); });