From 13a47a13741397fe040a50cdcdc96d19ffa7e00b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 4 Apr 2026 22:23:05 -0500 Subject: [PATCH 01/27] decode view batch --- src/nanoarrow/ipc/decoder.c | 56 +++++++++++++++++++++++++------ src/nanoarrow/ipc/decoder_test.cc | 48 ++++++++++++++++++++++++++ src/nanoarrow/nanoarrow_ipc.h | 5 +++ 3 files changed, 98 insertions(+), 11 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 520219ed9..e2a0c1228 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2354,6 +2354,7 @@ struct ArrowIpcArraySetter { struct ArrowIpcBufferSource src; struct ArrowIpcBufferFactory factory; enum ArrowIpcMetadataVersion version; + struct ArrowIpcDictionaries* dictionaries; }; static int ArrowIpcDecoderMakeBuffer(struct ArrowIpcArraySetter* setter, int64_t offset, @@ -2420,6 +2421,13 @@ static int ArrowIpcDecoderWalkGetArray(struct ArrowArrayView* array_view, array_view->children[i], array->children[i], out->children[i], error)); } + // TODO: set dictionary array. For now we probably just have to copy the view into the + // output because we don't have ref-counted arrays yet. + if (array_view->dictionary != NULL) { + ArrowErrorSet(error, "Dictionary array decode is not supported"); + return ENOTSUP; + } + return NANOARROW_OK; } @@ -2430,10 +2438,26 @@ static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcDecoder* decoder, struct ArrowError* error) { struct ArrowIpcDecoderPrivate* private_data = (struct ArrowIpcDecoderPrivate*)decoder->private_data; - struct ArrowIpcField* ipc_field = private_data->fields + setter->field_i; + + // setter->field_i indexes the flatbuffer FieldNode vector (which excludes the root + // struct), but private_data->fields includes the root at index 0, so add 1. 
+ struct ArrowIpcField* ipc_field = private_data->fields + setter->field_i + 1; if (ipc_field->dictionary_id != NANOARROW_IPC_NO_DICTIONARY_ID) { - ArrowErrorSet(error, "Decoding a dictionary-encoding field is not supported"); - return ENOTSUP; + if (setter->dictionaries == NULL) { + ArrowErrorSet( + error, "Can't decode a dictionary-encoded field without ArrowIpcDictionaries"); + return ENOTSUP; + } + + const struct ArrowArray* dictionary; + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionariesFindCurrentValue( + setter->dictionaries, ipc_field->dictionary_id, &dictionary, error)); + + // Set the dictionary array view from the value. We may be able to skip this + // if we can somehow detect that the dictionary hasn't changed since the last + // decode. + NANOARROW_RETURN_NOT_OK( + ArrowArrayViewSetArray(array_view->dictionary, dictionary, error)); } ns(FieldNode_struct_t) field = @@ -2538,7 +2562,8 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayInternal( static ArrowErrorCode ArrowIpcDecoderDecodeArrayViewInternal( struct ArrowIpcDecoder* decoder, struct ArrowIpcBufferFactory factory, - int64_t field_i, struct ArrowArrayView** out_view, struct ArrowError* error) { + int64_t field_i, struct ArrowIpcDictionaries* dictionaries, + struct ArrowArrayView** out_view, struct ArrowError* error) { struct ArrowIpcDecoderPrivate* private_data = (struct ArrowIpcDecoderPrivate*)decoder->private_data; @@ -2563,6 +2588,7 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayViewInternal( setter.src.codec = decoder->codec; setter.src.swap_endian = ArrowIpcDecoderNeedsSwapEndian(decoder); setter.version = decoder->metadata_version; + setter.dictionaries = dictionaries; // If we are going to need a decompressor here, ensure the default one is // initialized. 
@@ -2600,10 +2626,10 @@ static ArrowErrorCode ArrowIpcDecoderDecodeArrayViewInternal( return NANOARROW_OK; } -ArrowErrorCode ArrowIpcDecoderDecodeArrayView(struct ArrowIpcDecoder* decoder, - struct ArrowBufferView body, int64_t i, - struct ArrowArrayView** out, - struct ArrowError* error) { +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayViewWithDictionaries( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowIpcDictionaries* dictionaries, struct ArrowArrayView** out, + struct ArrowError* error) { struct ArrowIpcDecoderPrivate* private_data = (struct ArrowIpcDecoderPrivate*)decoder->private_data; if (private_data->last_message == NULL || @@ -2613,7 +2639,15 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayView(struct ArrowIpcDecoder* decoder, } return ArrowIpcDecoderDecodeArrayViewInternal( - decoder, ArrowIpcBufferFactoryFromView(&body), i, out, error); + decoder, ArrowIpcBufferFactoryFromView(&body), i, dictionaries, out, error); +} + +ArrowErrorCode ArrowIpcDecoderDecodeArrayView(struct ArrowIpcDecoder* decoder, + struct ArrowBufferView body, int64_t i, + struct ArrowArrayView** out, + struct ArrowError* error) { + return ArrowIpcDecoderDecodeArrayViewWithDictionaries(decoder, body, i, NULL, out, + error); } ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, @@ -2631,7 +2665,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, struct ArrowArrayView* array_view; NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayViewInternal( - decoder, ArrowIpcBufferFactoryFromView(&body), i, &array_view, error)); + decoder, ArrowIpcBufferFactoryFromView(&body), i, NULL, &array_view, error)); NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidate(array_view, validation_level, error)); @@ -2655,7 +2689,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( struct ArrowError* error) { struct ArrowArrayView* array_view; NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayViewInternal( - decoder, 
ArrowIpcBufferFactoryFromShared(body), i, &array_view, error)); + decoder, ArrowIpcBufferFactoryFromShared(body), i, NULL, &array_view, error)); NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidate(array_view, validation_level, error)); diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index b6aeddb77..e5192b57e 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -208,6 +208,19 @@ alignas(8) static uint8_t kDictionaryBatch[] = { 0x6f, 0x6e, 0x65, 0x74, 0x77, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; +static uint8_t kDictionaryRecordBatch[] = { + 0xff, 0xff, 0xff, 0xff, 0x88, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0c, 0x00, 0x16, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x18, 0x00, 0x0c, 0x00, + 0x04, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00}; + TEST(NanoarrowIpcTest, NanoarrowIpcCheckHeader) { struct ArrowIpcDecoder decoder; struct ArrowError error; @@ -794,6 +807,41 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ENOTSUP); ASSERT_STREQ(error.message, "Dictionary concatenation is not yet supported"); + // After all of this, we should be able to actually decode a RecordBatch + ASSERT_EQ(ArrowIpcDecoderSetSchemaWithDictionaries(&decoder, &schema, + 
&dictionary_encodings, &error), + NANOARROW_OK); + data.data.data = kDictionaryRecordBatch; + data.size_bytes = sizeof(kDictionaryRecordBatch); + ASSERT_EQ(ArrowIpcDecoderDecodeHeader(&decoder, data, &error), NANOARROW_OK); + data.data.as_uint8 += decoder.header_size_bytes; + data.size_bytes -= decoder.header_size_bytes; + + // Decode the entire batch and check the dictionary + struct ArrowArrayView* batch_view; + ASSERT_EQ(ArrowIpcDecoderDecodeArrayViewWithDictionaries( + &decoder, data, -1, &dictionaries, &batch_view, &error), + NANOARROW_OK) + << error.message; + + ASSERT_NE(batch_view->children[0]->dictionary, nullptr); + ASSERT_EQ(batch_view->children[0]->dictionary->length, 3); + ASSERT_EQ(ArrowArrayViewGetStringUnsafe(batch_view->children[0]->dictionary, 0), + "zero"_asv); + + // Decode the specific column and check the dictionary + struct ArrowArrayView* column_view; + ASSERT_EQ(ArrowIpcDecoderDecodeArrayViewWithDictionaries( + &decoder, data, 0, &dictionaries, &column_view, &error), + NANOARROW_OK) + << error.message; + + ASSERT_NE(column_view->dictionary, nullptr); + ASSERT_EQ(column_view->dictionary->length, 3); + ASSERT_EQ(ArrowArrayViewGetStringUnsafe(column_view->dictionary, 0), "zero"_asv); + + // TODO decode the array + ArrowArrayViewReset(&array_view); ArrowIpcSharedBufferReset(&shared); ArrowIpcDictionariesReset(&dictionaries); diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index e2b114cd0..11517c8fe 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -604,6 +604,11 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayView( struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, struct ArrowArrayView** out, struct ArrowError* error); +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayViewWithDictionaries( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowIpcDictionaries* dictionaries, struct ArrowArrayView** out, + struct 
ArrowError* error); + /// \brief Decode an ArrowArray /// /// After a successful call to ArrowIpcDecoderDecodeHeader(), assemble an ArrowArray given From e17d7cb5cf2de6750fa366032401ab4f623fd575 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 4 Apr 2026 23:03:46 -0500 Subject: [PATCH 02/27] sketch refcounted arrays --- src/nanoarrow/common/array.c | 116 +++++++++++++++++++++++++++++++++++ src/nanoarrow/nanoarrow.h | 18 ++++++ 2 files changed, 134 insertions(+) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 2f0f824cf..145faa794 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -23,6 +23,29 @@ #include #include +// For thread-safe reference counting we need C11 + stdatomic.h. +// Can compile with -DNANOARROW_USE_STDATOMIC=0 or 1 to override +// automatic detection. +#if !defined(NANOARROW_USE_STDATOMIC) +#define NANOARROW_USE_STDATOMIC 0 + +// Check for C11 +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L + +// Check for GCC 4.8, which doesn't include stdatomic.h but does +// not define __STDC_NO_ATOMICS__ +#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 5 + +#if !defined(__STDC_NO_ATOMICS__) +#include +#undef NANOARROW_USE_STDATOMIC +#define NANOARROW_USE_STDATOMIC 1 +#endif +#endif +#endif + +#endif + #include "nanoarrow/nanoarrow.h" static void ArrowArrayReleaseInternal(struct ArrowArray* array) { @@ -568,6 +591,99 @@ ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); } +// --- Reference-counted ArrowArray clone support --- + +#if NANOARROW_USE_STDATOMIC +struct ArrowArrayClonePrivate { + struct ArrowArray array; + atomic_long reference_count; +}; + +static int64_t ArrowArrayCloneUpdateRefCount(struct ArrowArrayClonePrivate* private_data, + int delta) { + int64_t old_count = atomic_fetch_add(&private_data->reference_count, delta); + return old_count + delta; +} + 
+static void ArrowArrayCloneSetRefCount(struct ArrowArrayClonePrivate* private_data, + int64_t count) { + atomic_store(&private_data->reference_count, count); +} + +int ArrowArrayCloneIsThreadSafe(void) { return 1; } +#else +struct ArrowArrayClonePrivate { + struct ArrowArray array; + int64_t reference_count; +}; + +static int64_t ArrowArrayCloneUpdateRefCount(struct ArrowArrayClonePrivate* private_data, + int delta) { + private_data->reference_count += delta; + return private_data->reference_count; +} + +static void ArrowArrayCloneSetRefCount(struct ArrowArrayClonePrivate* private_data, + int64_t count) { + private_data->reference_count = count; +} + +int ArrowArrayCloneIsThreadSafe(void) { return 0; } +#endif + +static void ArrowArrayReleaseClone(struct ArrowArray* array) { + struct ArrowArrayClonePrivate* private_data = + (struct ArrowArrayClonePrivate*)array->private_data; + + if (ArrowArrayCloneUpdateRefCount(private_data, -1) == 0) { + ArrowArrayRelease(&private_data->array); + ArrowFree(private_data); + } + + array->release = NULL; +} + +static int ArrowArrayIsClone(struct ArrowArray* array) { + return array->release == &ArrowArrayReleaseClone; +} + +ArrowErrorCode ArrowArrayClone(struct ArrowArray* src, struct ArrowArray* out) { + if (src->release == NULL) { + return EINVAL; + } + + struct ArrowArrayClonePrivate* private_data; + + if (ArrowArrayIsClone(src)) { + // Source is already a clone: just increment the existing refcount + private_data = (struct ArrowArrayClonePrivate*)src->private_data; + ArrowArrayCloneUpdateRefCount(private_data, 1); + } else { + // First clone: allocate shared state and move the source array into it + private_data = (struct ArrowArrayClonePrivate*)ArrowMalloc( + sizeof(struct ArrowArrayClonePrivate)); + if (private_data == NULL) { + return ENOMEM; + } + + ArrowArrayMove(src, &private_data->array); + // Two references: one for src (which we'll restore) and one for out + ArrowArrayCloneSetRefCount(private_data, 2); + + // Restore 
src as a clone handle pointing at the shared state + memcpy(src, &private_data->array, sizeof(struct ArrowArray)); + src->release = &ArrowArrayReleaseClone; + src->private_data = private_data; + } + + // Set up out as another clone handle + memcpy(out, &private_data->array, sizeof(struct ArrowArray)); + out->release = &ArrowArrayReleaseClone; + out->private_data = private_data; + + return NANOARROW_OK; +} + void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, enum ArrowType storage_type) { memset(array_view, 0, sizeof(struct ArrowArrayView)); diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h index c22ef12e5..40fc03783 100644 --- a/src/nanoarrow/nanoarrow.h +++ b/src/nanoarrow/nanoarrow.h @@ -114,6 +114,9 @@ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) #define ArrowArrayFinishBuildingDefault \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) +#define ArrowArrayClone NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayClone) +#define ArrowArrayCloneIsThreadSafe \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayCloneIsThreadSafe) #define ArrowArrayViewInitFromType \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) #define ArrowArrayViewInitFromSchema \ @@ -877,6 +880,21 @@ NANOARROW_DLL ArrowErrorCode ArrowArrayInitFromArrayView( struct ArrowArray* array, const struct ArrowArrayView* array_view, struct ArrowError* error); +/// \brief Create a cheap reference-counted clone of an ArrowArray +/// +/// Both src and out will share the same underlying data. When the last +/// clone is released, the original array's release callback is invoked. +/// The clone is read-only; appender functions must not be used on it. +/// src must be a valid (non-released) ArrowArray. 
+NANOARROW_DLL ArrowErrorCode ArrowArrayClone(struct ArrowArray* src, + struct ArrowArray* out); + +/// \brief Check if ArrowArrayClone() uses thread-safe atomic reference counting +/// +/// Returns 1 if the implementation was compiled with C11 stdatomic.h support +/// and 0 otherwise. +NANOARROW_DLL int ArrowArrayCloneIsThreadSafe(void); + /// \brief Allocate the array->children array /// /// Includes the memory for each child struct ArrowArray, From 6c5fb3943f659fae653767a58a547ae204dbc6e7 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 4 Apr 2026 23:52:12 -0500 Subject: [PATCH 03/27] decode array works --- src/nanoarrow/ipc/decoder.c | 33 ++++++++++++++++++++----------- src/nanoarrow/ipc/decoder_test.cc | 13 ++++++++++-- src/nanoarrow/nanoarrow_ipc.h | 5 +++++ 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index e2a0c1228..7f254fc26 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -407,7 +407,8 @@ static ArrowErrorCode ArrowIpcDictionaryReplace(struct ArrowIpcDictionary* dicti ArrowArrayRelease(&dictionary->current_value); } - ArrowArrayMove(value, &dictionary->current_value); + NANOARROW_RETURN_NOT_OK(ArrowArrayClone(value, &dictionary->current_value)); + ArrowArrayRelease(value); return NANOARROW_OK; } @@ -419,7 +420,8 @@ static ArrowErrorCode ArrowIpcDictionaryAppend(struct ArrowIpcDictionary* dictio return ENOTSUP; } - ArrowArrayMove(value, &dictionary->current_value); + NANOARROW_RETURN_NOT_OK(ArrowArrayClone(value, &dictionary->current_value)); + ArrowArrayRelease(value); return NANOARROW_OK; } @@ -2421,11 +2423,9 @@ static int ArrowIpcDecoderWalkGetArray(struct ArrowArrayView* array_view, array_view->children[i], array->children[i], out->children[i], error)); } - // TODO: set dictionary array. For now we probably just have to copy the view into the - // output because we don't have ref-counted arrays yet. 
if (array_view->dictionary != NULL) { - ArrowErrorSet(error, "Dictionary array decode is not supported"); - return ENOTSUP; + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkGetArray( + array_view->dictionary, array->dictionary, out->dictionary, error)); } return NANOARROW_OK; @@ -2650,11 +2650,10 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayView(struct ArrowIpcDecoder* decoder, error); } -ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, - struct ArrowBufferView body, int64_t i, - struct ArrowArray* out, - enum ArrowValidationLevel validation_level, - struct ArrowError* error) { +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayWithDictionaries( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowIpcDictionaries* dictionaries, struct ArrowArray* out, + enum ArrowValidationLevel validation_level, struct ArrowError* error) { struct ArrowIpcDecoderPrivate* private_data = (struct ArrowIpcDecoderPrivate*)decoder->private_data; if (private_data->last_message == NULL || @@ -2665,7 +2664,8 @@ ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, struct ArrowArrayView* array_view; NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayViewInternal( - decoder, ArrowIpcBufferFactoryFromView(&body), i, NULL, &array_view, error)); + decoder, ArrowIpcBufferFactoryFromView(&body), i, dictionaries, &array_view, + error)); NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidate(array_view, validation_level, error)); @@ -2683,6 +2683,15 @@ ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, return NANOARROW_OK; } +ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, + struct ArrowBufferView body, int64_t i, + struct ArrowArray* out, + enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + return ArrowIpcDecoderDecodeArrayWithDictionaries(decoder, body, i, NULL, out, + validation_level, error); +} + ArrowErrorCode 
ArrowIpcDecoderDecodeArrayFromShared( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* body, int64_t i, struct ArrowArray* out, enum ArrowValidationLevel validation_level, diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index e5192b57e..725851f0f 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -711,7 +711,7 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { // Make a dictionary encoded schema that matches that of the dictionary example batch ArrowSchemaInit(&schema); ASSERT_EQ(ArrowSchemaSetTypeStruct(&schema, 1), NANOARROW_OK); - ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT32), NANOARROW_OK); + ASSERT_EQ(ArrowSchemaSetType(schema.children[0], NANOARROW_TYPE_INT8), NANOARROW_OK); ASSERT_EQ(ArrowSchemaAllocateDictionary(schema.children[0]), NANOARROW_OK); ASSERT_EQ( ArrowSchemaInitFromType(schema.children[0]->dictionary, NANOARROW_TYPE_STRING), @@ -840,8 +840,17 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ASSERT_EQ(column_view->dictionary->length, 3); ASSERT_EQ(ArrowArrayViewGetStringUnsafe(column_view->dictionary, 0), "zero"_asv); - // TODO decode the array + // Decode the array + struct ArrowArray batch; + ASSERT_EQ(ArrowIpcDecoderDecodeArrayWithDictionaries( + &decoder, data, -1, &dictionaries, &batch, + NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK) + << error.message; + ASSERT_NE(batch.children[0]->dictionary, nullptr); + ASSERT_EQ(batch.children[0]->dictionary->length, 3); + ArrowArrayRelease(&batch); ArrowArrayViewReset(&array_view); ArrowIpcSharedBufferReset(&shared); ArrowIpcDictionariesReset(&dictionaries); diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index 11517c8fe..66295f5ee 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -625,6 +625,11 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArray( struct ArrowArray* out, enum 
ArrowValidationLevel validation_level, struct ArrowError* error); +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayWithDictionaries( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowIpcDictionaries* dictionaries, struct ArrowArray* out, + enum ArrowValidationLevel validation_level, struct ArrowError* error); + /// \brief Decode an ArrowArray from an owned buffer /// /// This implementation takes advantage of the fact that it can avoid copying individual From 06c2c601db50aff4b487b187527de32e7f509a35 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 4 Apr 2026 23:54:01 -0500 Subject: [PATCH 04/27] remove clone hack --- src/nanoarrow/common/array.c | 93 ------------------------------------ src/nanoarrow/ipc/decoder.c | 6 +-- src/nanoarrow/nanoarrow.h | 18 ------- 3 files changed, 2 insertions(+), 115 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 145faa794..81781b682 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -591,99 +591,6 @@ ArrowErrorCode ArrowArrayFinishBuildingDefault(struct ArrowArray* array, return ArrowArrayFinishBuilding(array, NANOARROW_VALIDATION_LEVEL_DEFAULT, error); } -// --- Reference-counted ArrowArray clone support --- - -#if NANOARROW_USE_STDATOMIC -struct ArrowArrayClonePrivate { - struct ArrowArray array; - atomic_long reference_count; -}; - -static int64_t ArrowArrayCloneUpdateRefCount(struct ArrowArrayClonePrivate* private_data, - int delta) { - int64_t old_count = atomic_fetch_add(&private_data->reference_count, delta); - return old_count + delta; -} - -static void ArrowArrayCloneSetRefCount(struct ArrowArrayClonePrivate* private_data, - int64_t count) { - atomic_store(&private_data->reference_count, count); -} - -int ArrowArrayCloneIsThreadSafe(void) { return 1; } -#else -struct ArrowArrayClonePrivate { - struct ArrowArray array; - int64_t reference_count; -}; - -static int64_t 
ArrowArrayCloneUpdateRefCount(struct ArrowArrayClonePrivate* private_data, - int delta) { - private_data->reference_count += delta; - return private_data->reference_count; -} - -static void ArrowArrayCloneSetRefCount(struct ArrowArrayClonePrivate* private_data, - int64_t count) { - private_data->reference_count = count; -} - -int ArrowArrayCloneIsThreadSafe(void) { return 0; } -#endif - -static void ArrowArrayReleaseClone(struct ArrowArray* array) { - struct ArrowArrayClonePrivate* private_data = - (struct ArrowArrayClonePrivate*)array->private_data; - - if (ArrowArrayCloneUpdateRefCount(private_data, -1) == 0) { - ArrowArrayRelease(&private_data->array); - ArrowFree(private_data); - } - - array->release = NULL; -} - -static int ArrowArrayIsClone(struct ArrowArray* array) { - return array->release == &ArrowArrayReleaseClone; -} - -ArrowErrorCode ArrowArrayClone(struct ArrowArray* src, struct ArrowArray* out) { - if (src->release == NULL) { - return EINVAL; - } - - struct ArrowArrayClonePrivate* private_data; - - if (ArrowArrayIsClone(src)) { - // Source is already a clone: just increment the existing refcount - private_data = (struct ArrowArrayClonePrivate*)src->private_data; - ArrowArrayCloneUpdateRefCount(private_data, 1); - } else { - // First clone: allocate shared state and move the source array into it - private_data = (struct ArrowArrayClonePrivate*)ArrowMalloc( - sizeof(struct ArrowArrayClonePrivate)); - if (private_data == NULL) { - return ENOMEM; - } - - ArrowArrayMove(src, &private_data->array); - // Two references: one for src (which we'll restore) and one for out - ArrowArrayCloneSetRefCount(private_data, 2); - - // Restore src as a clone handle pointing at the shared state - memcpy(src, &private_data->array, sizeof(struct ArrowArray)); - src->release = &ArrowArrayReleaseClone; - src->private_data = private_data; - } - - // Set up out as another clone handle - memcpy(out, &private_data->array, sizeof(struct ArrowArray)); - out->release = 
&ArrowArrayReleaseClone; - out->private_data = private_data; - - return NANOARROW_OK; -} - void ArrowArrayViewInitFromType(struct ArrowArrayView* array_view, enum ArrowType storage_type) { memset(array_view, 0, sizeof(struct ArrowArrayView)); diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 7f254fc26..1a3986ed1 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -407,8 +407,7 @@ static ArrowErrorCode ArrowIpcDictionaryReplace(struct ArrowIpcDictionary* dicti ArrowArrayRelease(&dictionary->current_value); } - NANOARROW_RETURN_NOT_OK(ArrowArrayClone(value, &dictionary->current_value)); - ArrowArrayRelease(value); + ArrowArrayMove(value, &dictionary->current_value); return NANOARROW_OK; } @@ -420,8 +419,7 @@ static ArrowErrorCode ArrowIpcDictionaryAppend(struct ArrowIpcDictionary* dictio return ENOTSUP; } - NANOARROW_RETURN_NOT_OK(ArrowArrayClone(value, &dictionary->current_value)); - ArrowArrayRelease(value); + ArrowArrayMove(value, &dictionary->current_value); return NANOARROW_OK; } diff --git a/src/nanoarrow/nanoarrow.h b/src/nanoarrow/nanoarrow.h index 40fc03783..c22ef12e5 100644 --- a/src/nanoarrow/nanoarrow.h +++ b/src/nanoarrow/nanoarrow.h @@ -114,9 +114,6 @@ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuilding) #define ArrowArrayFinishBuildingDefault \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayFinishBuildingDefault) -#define ArrowArrayClone NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayClone) -#define ArrowArrayCloneIsThreadSafe \ - NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayCloneIsThreadSafe) #define ArrowArrayViewInitFromType \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowArrayViewInitFromType) #define ArrowArrayViewInitFromSchema \ @@ -880,21 +877,6 @@ NANOARROW_DLL ArrowErrorCode ArrowArrayInitFromArrayView( struct ArrowArray* array, const struct ArrowArrayView* array_view, struct ArrowError* error); -/// \brief Create a cheap reference-counted clone of an ArrowArray -/// -/// Both 
src and out will share the same underlying data. When the last -/// clone is released, the original array's release callback is invoked. -/// The clone is read-only; appender functions must not be used on it. -/// src must be a valid (non-released) ArrowArray. -NANOARROW_DLL ArrowErrorCode ArrowArrayClone(struct ArrowArray* src, - struct ArrowArray* out); - -/// \brief Check if ArrowArrayClone() uses thread-safe atomic reference counting -/// -/// Returns 1 if the implementation was compiled with C11 stdatomic.h support -/// and 0 otherwise. -NANOARROW_DLL int ArrowArrayCloneIsThreadSafe(void); - /// \brief Allocate the array->children array /// /// Includes the memory for each child struct ArrowArray, From 0e69e5802150cd2134a0ab25c81f7190f845e98f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 4 Apr 2026 23:56:15 -0500 Subject: [PATCH 05/27] add note about copying --- src/nanoarrow/ipc/decoder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 1a3986ed1..f361ca137 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2422,6 +2422,7 @@ static int ArrowIpcDecoderWalkGetArray(struct ArrowArrayView* array_view, } if (array_view->dictionary != NULL) { + // TODO: this currently copies the array for every output. NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderWalkGetArray( array_view->dictionary, array->dictionary, out->dictionary, error)); } From 2d4796f514ac799ddf71f122ec734fceabeab11e Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 20:32:49 -0500 Subject: [PATCH 06/27] revert change --- src/nanoarrow/common/array.c | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 81781b682..2f0f824cf 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -23,29 +23,6 @@ #include #include -// For thread-safe reference counting we need C11 + stdatomic.h. 
-// Can compile with -DNANOARROW_USE_STDATOMIC=0 or 1 to override -// automatic detection. -#if !defined(NANOARROW_USE_STDATOMIC) -#define NANOARROW_USE_STDATOMIC 0 - -// Check for C11 -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L - -// Check for GCC 4.8, which doesn't include stdatomic.h but does -// not define __STDC_NO_ATOMICS__ -#if defined(__clang__) || !defined(__GNUC__) || __GNUC__ >= 5 - -#if !defined(__STDC_NO_ATOMICS__) -#include -#undef NANOARROW_USE_STDATOMIC -#define NANOARROW_USE_STDATOMIC 1 -#endif -#endif -#endif - -#endif - #include "nanoarrow/nanoarrow.h" static void ArrowArrayReleaseInternal(struct ArrowArray* array) { From 8fca3fe4c44b96542345d16fcbd188f575217b03 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 20:36:37 -0500 Subject: [PATCH 07/27] aligned dictionary batch --- src/nanoarrow/ipc/decoder_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 725851f0f..84c0b38fb 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -208,7 +208,7 @@ alignas(8) static uint8_t kDictionaryBatch[] = { 0x6f, 0x6e, 0x65, 0x74, 0x77, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; -static uint8_t kDictionaryRecordBatch[] = { +alignas(8) static uint8_t kDictionaryRecordBatch[] = { 0xff, 0xff, 0xff, 0xff, 0x88, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x16, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, From 8cae5be12d4cced98ac677ad27a54ebbde5d4f3a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 21:05:53 -0500 Subject: [PATCH 08/27] add header docs and shared init function --- src/nanoarrow/ipc/decoder.c | 17 +++++++--- src/nanoarrow/ipc/decoder_test.cc | 21 +++++++++++- src/nanoarrow/nanoarrow_ipc.h | 54 ++++++++++++++++++++++++------- 3 files 
changed, 75 insertions(+), 17 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index f361ca137..da173b9e1 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2691,13 +2691,14 @@ ArrowErrorCode ArrowIpcDecoderDecodeArray(struct ArrowIpcDecoder* decoder, validation_level, error); } -ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( +ArrowErrorCode ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* body, int64_t i, - struct ArrowArray* out, enum ArrowValidationLevel validation_level, - struct ArrowError* error) { + struct ArrowIpcDictionaries* dictionaries, struct ArrowArray* out, + enum ArrowValidationLevel validation_level, struct ArrowError* error) { struct ArrowArrayView* array_view; NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayViewInternal( - decoder, ArrowIpcBufferFactoryFromShared(body), i, NULL, &array_view, error)); + decoder, ArrowIpcBufferFactoryFromShared(body), i, dictionaries, &array_view, + error)); NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidate(array_view, validation_level, error)); @@ -2715,6 +2716,14 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( return NANOARROW_OK; } +ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( + struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* body, int64_t i, + struct ArrowArray* out, enum ArrowValidationLevel validation_level, + struct ArrowError* error) { + return ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries(decoder, body, i, NULL, out, + validation_level, error); +} + NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* dictionaries, diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 84c0b38fb..6723f34f7 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ 
b/src/nanoarrow/ipc/decoder_test.cc @@ -840,7 +840,7 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ASSERT_EQ(column_view->dictionary->length, 3); ASSERT_EQ(ArrowArrayViewGetStringUnsafe(column_view->dictionary, 0), "zero"_asv); - // Decode the array + // Decode the array from the ArrowBufferView struct ArrowArray batch; ASSERT_EQ(ArrowIpcDecoderDecodeArrayWithDictionaries( &decoder, data, -1, &dictionaries, &batch, @@ -849,8 +849,27 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { << error.message; ASSERT_NE(batch.children[0]->dictionary, nullptr); ASSERT_EQ(batch.children[0]->dictionary->length, 3); + ArrowArrayRelease(&batch); + + // Decode the array from a shared buffer + struct ArrowBuffer record_batch_body; + ArrowBufferInit(&record_batch_body); + ASSERT_EQ(ArrowBufferAppendBufferView(&record_batch_body, data), NANOARROW_OK); + struct ArrowIpcSharedBuffer record_batch_shared; + ASSERT_EQ(ArrowIpcSharedBufferInit(&record_batch_shared, &record_batch_body), + NANOARROW_OK); + + ASSERT_EQ(ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( + &decoder, &record_batch_shared, -1, &dictionaries, &batch, + NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK) + << error.message; + ASSERT_NE(batch.children[0]->dictionary, nullptr); + ASSERT_EQ(batch.children[0]->dictionary->length, 3); ArrowArrayRelease(&batch); + ArrowIpcSharedBufferReset(&record_batch_shared); + ArrowArrayViewReset(&array_view); ArrowIpcSharedBufferReset(&shared); ArrowIpcDictionariesReset(&dictionaries); diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index 66295f5ee..5a909e791 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -53,10 +53,17 @@ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeSchemaWithDictionaries) #define ArrowIpcDecoderDecodeArrayView \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayView) +#define ArrowIpcDecoderDecodeArrayViewWithDictionaries \ + 
NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayViewWithDictionaries) #define ArrowIpcDecoderDecodeArray \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArray) +#define ArrowIpcDecoderDecodeArrayWithDictionaries \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayWithDictionaries) #define ArrowIpcDecoderDecodeArrayFromShared \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeArrayFromShared) +#define ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, \ + ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries) #define ArrowIpcDecoderSetSchema \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderSetSchema) #define ArrowIpcDecoderSetSchemaWithDictionaries \ @@ -588,7 +595,7 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetSchemaWithDictionaries( NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetEndianness( struct ArrowIpcDecoder* decoder, enum ArrowIpcEndianness endianness); -/// \brief Decode an ArrowArrayView +/// \brief Decode an ArrowArrayView with dictionary decoding support /// /// After a successful call to ArrowIpcDecoderDecodeHeader(), deserialize the content /// of body into an internally-managed ArrowArrayView and return it. Note that field index @@ -600,16 +607,23 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderSetEndianness( /// will not perform any heap allocations; however, the buffers referred to by the /// returned ArrowArrayView are only valid as long as the buffer referred to by body stays /// valid. 
-NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayView( - struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, - struct ArrowArrayView** out, struct ArrowError* error); - NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayViewWithDictionaries( struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, struct ArrowIpcDictionaries* dictionaries, struct ArrowArrayView** out, struct ArrowError* error); -/// \brief Decode an ArrowArray +/// \brief Decode an ArrowArrayView without dictionary decoding +/// +/// After a successful call to ArrowIpcDecoderDecodeHeader(), deserialize the content +/// of body into an internally-managed ArrowArrayView and return it. +/// +/// This is equivalent to ArrowIpcDecoderDecodeArrayViewWithDictionaries() passing +/// dictionaries as NULL. +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayView( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowArrayView** out, struct ArrowError* error); + +/// \brief Decode an ArrowArray with dictionary decoding support /// /// After a successful call to ArrowIpcDecoderDecodeHeader(), assemble an ArrowArray given /// a message body and a field index. Note that field index does not equate to column @@ -620,23 +634,39 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayViewWithDictionaries( /// Returns EINVAL if the decoder did not just decode a record batch message, ENOTSUP /// if the message uses features not supported by this library, or or NANOARROW_OK /// otherwise. 
-NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArray( - struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, - struct ArrowArray* out, enum ArrowValidationLevel validation_level, - struct ArrowError* error); - NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayWithDictionaries( struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, struct ArrowIpcDictionaries* dictionaries, struct ArrowArray* out, enum ArrowValidationLevel validation_level, struct ArrowError* error); -/// \brief Decode an ArrowArray from an owned buffer +/// \brief Decode an ArrowArray without dictionary decoding support +/// +/// After a successful call to ArrowIpcDecoderDecodeHeader(), assemble an ArrowArray given +/// a message body and a field index. +/// +/// This is equivalent to calling ArrowIpcDecoderDecodeArrayWithDictionaries() passing +/// dictionaries as NULL. +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArray( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, int64_t i, + struct ArrowArray* out, enum ArrowValidationLevel validation_level, + struct ArrowError* error); + +/// \brief Decode an ArrowArray from an owned buffer with dictionary decoding support /// /// This implementation takes advantage of the fact that it can avoid copying individual /// buffers. In all cases the caller must ArrowIpcSharedBufferReset() body after one or /// more calls to ArrowIpcDecoderDecodeArrayFromShared(). If /// ArrowIpcSharedBufferIsThreadSafe() returns 0, out must not be released by another /// thread. 
+NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( + struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, int64_t i, + struct ArrowIpcDictionaries* dictionaries, struct ArrowArray* out, + enum ArrowValidationLevel validation_level, struct ArrowError* error); + +/// \brief Decode an ArrowArray from an owned buffer +/// +/// Equivalent to calling ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries() with +/// dictionaries as NULL. NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, int64_t i, struct ArrowArray* out, enum ArrowValidationLevel validation_level, From 85439cadffb45896ae518eeba40a37b6f5ab9790 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 22:00:43 -0500 Subject: [PATCH 09/27] start dictionaries with a zero size dictionary --- src/nanoarrow/ipc/decoder.c | 66 ++++++++++++++++++++++++++++++- src/nanoarrow/ipc/decoder_test.cc | 39 ++++++++++++------ src/nanoarrow/nanoarrow_ipc.h | 6 +++ 3 files changed, 96 insertions(+), 15 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index da173b9e1..efcf2fb0f 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -295,6 +295,44 @@ ArrowErrorCode ArrowIpcDictionaryEncodingsAppend( return NANOARROW_OK; } +static ArrowErrorCode ArrowIpcDictionaryEncodingsAppendSchemaInternal( + struct ArrowIpcDictionaryEncodings* dictionary_encodings, + const struct ArrowSchema* schema, int64_t* next_id) { + if (schema->dictionary != NULL) { + struct ArrowIpcDictionaryEncoding encoding; + encoding.schema = schema; + encoding.id = (*next_id)++; + encoding.kind = NANOARROW_IPC_DICTIONARY_KIND_DENSE_ARRAY; + NANOARROW_RETURN_NOT_OK( + ArrowIpcDictionaryEncodingsAppend(dictionary_encodings, encoding)); + } + + for (int64_t i = 0; i < schema->n_children; i++) { + NANOARROW_DCHECK(schema->children != NULL && schema->children[i] != 
NULL); + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionaryEncodingsAppendSchemaInternal( + dictionary_encodings, schema->children[i], next_id)); + } + + if (schema->dictionary != NULL) { + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionaryEncodingsAppendSchemaInternal( + dictionary_encodings, schema->dictionary, next_id)); + } + + return NANOARROW_OK; +} + +ArrowErrorCode ArrowIpcDictionaryEncodingsAppendSchema( + struct ArrowIpcDictionaryEncodings* dictionary_encodings, + const struct ArrowSchema* schema) { + NANOARROW_DCHECK(dictionary_encodings != NULL); + + int64_t next_id = 0; + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionaryEncodingsAppendSchemaInternal( + dictionary_encodings, schema, &next_id)); + + return NANOARROW_OK; +} + const struct ArrowIpcDictionaryEncoding* ArrowIpcDictionaryEncodingsFind( const struct ArrowIpcDictionaryEncodings* dictionary_encodings, const struct ArrowSchema* schema) { @@ -414,12 +452,13 @@ static ArrowErrorCode ArrowIpcDictionaryReplace(struct ArrowIpcDictionary* dicti static ArrowErrorCode ArrowIpcDictionaryAppend(struct ArrowIpcDictionary* dictionary, struct ArrowArray* value, struct ArrowError* error) { - if (dictionary->current_value.release != NULL) { + if (dictionary->current_value.release != NULL && + dictionary->current_value.length != 0) { ArrowErrorSet(error, "Dictionary concatenation is not yet supported"); return ENOTSUP; } - ArrowArrayMove(value, &dictionary->current_value); + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionaryReplace(dictionary, value, error)); return NANOARROW_OK; } @@ -492,7 +531,24 @@ static ArrowErrorCode ArrowIpcDictionariesInitDictionaries( return result; } + // Set the ID dictionary->id = unique_ids[i]; + + // Set the initial array value to a valid array with zero length. This is + // needed because empty and/or all null columns may not have a dictionary + // message emitted before a record batch arrives. 
+ result = ArrowArrayInitFromSchema(&dictionary->current_value, + encoding->schema->dictionary, error); + if (result != NANOARROW_OK) { + *num_initialized_decoders_out = i + 1; + return result; + } + + result = ArrowArrayFinishBuildingDefault(&dictionary->current_value, error); + if (result != NANOARROW_OK) { + *num_initialized_decoders_out = i + 1; + return result; + } } *num_initialized_decoders_out = private_data->num_dictionaries; @@ -2452,6 +2508,12 @@ static int ArrowIpcDecoderWalkSetArrayView(struct ArrowIpcDecoder* decoder, NANOARROW_RETURN_NOT_OK(ArrowIpcDictionariesFindCurrentValue( setter->dictionaries, ipc_field->dictionary_id, &dictionary, error)); + if (dictionary->release == NULL) { + ArrowErrorSet(error, "Dictionary with ID %" PRId64 " is marked as released", + ipc_field->dictionary_id); + return EINVAL; + } + // Set the dictionary array view from the value. We may be able to skip this // if we can somehow detect that the dictionary hasn't changed since the last // decode. 
diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 6723f34f7..90c665cc1 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -1216,7 +1216,7 @@ std::string ArrowSchemaToString(const struct ArrowSchema* schema) { #if defined(NANOARROW_BUILD_TESTS_WITH_ARROW) TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowTypeRoundtrip) { if (GetParam()->id() == arrow::Type::DICTIONARY) { - GTEST_SKIP() << "Dictionary array decode is not yet supported"; + GTEST_SKIP() << "Dictionary array encode is not yet supported"; } nanoarrow::UniqueSchema schema; @@ -1258,16 +1258,12 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowTypeRoundtrip) { #if defined(NANOARROW_BUILD_TESTS_WITH_ARROW) TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { - if (GetParam()->id() == arrow::Type::DICTIONARY) { - GTEST_SKIP() << "Dictionary array decode is not yet supported"; - } - const std::shared_ptr& data_type = GetParam(); std::shared_ptr dummy_schema = arrow::schema({arrow::field("dummy_name", data_type)}); auto maybe_empty = arrow::RecordBatch::MakeEmpty(dummy_schema); - ASSERT_TRUE(maybe_empty.ok()); + ASSERT_TRUE(maybe_empty.ok()) << maybe_empty.status(); auto empty = maybe_empty.ValueUnsafe(); auto maybe_nulls_array = arrow::MakeArrayOfNull(data_type, 3); @@ -1282,10 +1278,24 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { struct ArrowBufferView buffer_view; struct ArrowArray array; - // Initialize the decoder + // Initialize the schema ASSERT_TRUE(arrow::ExportSchema(*dummy_schema, &schema).ok()); + + // Initialize the dictionaries + struct ArrowIpcDictionaryEncodings dictionary_encodings; + struct ArrowIpcDictionaries dictionaries; + ArrowIpcDictionaryEncodingsInit(&dictionary_encodings); + ASSERT_EQ(ArrowIpcDictionaryEncodingsAppendSchema(&dictionary_encodings, &schema), + NANOARROW_OK); + 
ASSERT_EQ(ArrowIpcDictionariesInit(&dictionaries, &dictionary_encodings, nullptr), + NANOARROW_OK); + + // Initialize the decoder ArrowIpcDecoderInit(&decoder); - ASSERT_EQ(ArrowIpcDecoderSetSchema(&decoder, &schema, nullptr), NANOARROW_OK); + ASSERT_EQ(ArrowIpcDecoderSetSchemaWithDictionaries(&decoder, &schema, + &dictionary_encodings, nullptr), + NANOARROW_OK); + ArrowIpcDictionaryEncodingsReset(&dictionary_encodings); // Check the empty array auto maybe_serialized = arrow::ipc::SerializeRecordBatch(*empty, options); @@ -1296,14 +1306,15 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { ASSERT_EQ(ArrowIpcDecoderDecodeHeader(&decoder, buffer_view, nullptr), NANOARROW_OK); buffer_view.data.as_uint8 += decoder.header_size_bytes; buffer_view.size_bytes -= decoder.header_size_bytes; - ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array, - NANOARROW_VALIDATION_LEVEL_FULL, nullptr), + ASSERT_EQ(ArrowIpcDecoderDecodeArrayWithDictionaries( + &decoder, buffer_view, -1, &dictionaries, &array, + NANOARROW_VALIDATION_LEVEL_FULL, nullptr), NANOARROW_OK); auto maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema); ASSERT_TRUE(maybe_batch.ok()); EXPECT_EQ(maybe_batch.ValueUnsafe()->ToString(), empty->ToString()); - EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*empty)); + EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*empty)) << empty->ToString(); // Check the array with 3 null values maybe_serialized = arrow::ipc::SerializeRecordBatch(*nulls, options); @@ -1314,8 +1325,9 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { ASSERT_EQ(ArrowIpcDecoderDecodeHeader(&decoder, buffer_view, nullptr), NANOARROW_OK); buffer_view.data.as_uint8 += decoder.header_size_bytes; buffer_view.size_bytes -= decoder.header_size_bytes; - ASSERT_EQ(ArrowIpcDecoderDecodeArray(&decoder, buffer_view, -1, &array, - NANOARROW_VALIDATION_LEVEL_FULL, nullptr), + ASSERT_EQ(ArrowIpcDecoderDecodeArrayWithDictionaries( + 
&decoder, buffer_view, -1, &dictionaries, &array, + NANOARROW_VALIDATION_LEVEL_FULL, nullptr), NANOARROW_OK); maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema); @@ -1324,6 +1336,7 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*nulls)); ArrowSchemaRelease(&schema); + ArrowIpcDictionariesReset(&dictionaries); ArrowIpcDecoderReset(&decoder); } #endif diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index 5a909e791..309abb6a2 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -250,6 +250,12 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDictionaryEncodingsAppend( struct ArrowIpcDictionaryEncodings* dictionary_encodings, struct ArrowIpcDictionaryEncoding encoding); +/// \brief Append all dictionaries in schema identified according to a depth-first +/// recursive search starting at 0 +NANOARROW_DLL ArrowErrorCode ArrowIpcDictionaryEncodingsAppendSchema( + struct ArrowIpcDictionaryEncodings* dictionary_encodings, + const struct ArrowSchema* schema); + /// \brief Resolve a ArrowIpcDictionaryEncoding for a given dictionary encoded field /// /// Returns NULL if the pointed to schema does not match any of the pointed to From c9fcd6e376962bf0e42c5cad0a9a9dbb37982396 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 22:18:20 -0500 Subject: [PATCH 10/27] skip the two unconstructible test cases --- src/nanoarrow/ipc/decoder_test.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 90c665cc1..1fc39c3ea 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -1259,6 +1259,14 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowTypeRoundtrip) { #if defined(NANOARROW_BUILD_TESTS_WITH_ARROW) TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { const 
std::shared_ptr& data_type = GetParam(); + + if (data_type->id() == arrow::Type::DICTIONARY && + std::static_pointer_cast(data_type)->value_type()->id() == + Type::EXTENSION) { + GTEST_SKIP() + << "Arrow C++ MakeEmpty() doesn't support dictionary with extension value types"; + } + std::shared_ptr dummy_schema = arrow::schema({arrow::field("dummy_name", data_type)}); @@ -1314,7 +1322,14 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcArrowArrayRoundtrip) { auto maybe_batch = arrow::ImportRecordBatch(&array, dummy_schema); ASSERT_TRUE(maybe_batch.ok()); EXPECT_EQ(maybe_batch.ValueUnsafe()->ToString(), empty->ToString()); - EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*empty)) << empty->ToString(); + + // Arrow C++ MakeEmpty() loses the ordered=1 flag for dictionary types. + // https://github.com/apache/arrow/issues/49674 + // So for ordered dictionaries, we only check ToString() equality for empty batches. + if (data_type->id() != arrow::Type::DICTIONARY || + !std::static_pointer_cast(data_type)->ordered()) { + EXPECT_TRUE(maybe_batch.ValueUnsafe()->Equals(*empty)) << empty->ToString(); + } // Check the array with 3 null values maybe_serialized = arrow::ipc::SerializeRecordBatch(*nulls, options); From 1da979895e50bfc024c402b13931295f4149f986 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 23:25:34 -0500 Subject: [PATCH 11/27] mostly there --- src/nanoarrow/ipc/files_test.cc | 18 +--- src/nanoarrow/ipc/reader.c | 148 ++++++++++++++++++++++--------- src/nanoarrow/ipc/reader_test.cc | 3 +- 3 files changed, 112 insertions(+), 57 deletions(-) diff --git a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index 6a1d7c787..74337c7f5 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ b/src/nanoarrow/ipc/files_test.cc @@ -477,20 +477,10 @@ INSTANTIATE_TEST_SUITE_P( TestFile::OK("generated_primitive.stream"), TestFile::OK("generated_recursive_nested.stream"), TestFile::OK("generated_union.stream"), - - // Files with features that 
are not yet supported (Dictionary encoding) - TestFile::NotSupported( - "generated_dictionary_unsigned.stream", - "Found valid dictionary batch but dictionary encoding is not yet supported"), - TestFile::NotSupported( - "generated_dictionary.stream", - "Found valid dictionary batch but dictionary encoding is not yet supported"), - TestFile::NotSupported( - "generated_nested_dictionary.stream", - "Found valid dictionary batch but dictionary encoding is not yet supported"), - TestFile::NotSupported( - "generated_extension.stream", - "Found valid dictionary batch but dictionary encoding is not yet supported") + TestFile::OK("generated_dictionary_unsigned.stream"), + TestFile::OK("generated_dictionary.stream"), + TestFile::OK("generated_nested_dictionary.stream"), + TestFile::OK("generated_extension.stream") // Comment to keep last line from wrapping )); diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c index c70b14483..9780dccf3 100644 --- a/src/nanoarrow/ipc/reader.c +++ b/src/nanoarrow/ipc/reader.c @@ -191,6 +191,7 @@ struct ArrowIpcArrayStreamReaderPrivate { struct ArrowBuffer header; struct ArrowBuffer body; int32_t expected_header_prefix_size; + struct ArrowIpcDictionaries dictionaries; struct ArrowError error; }; @@ -211,13 +212,16 @@ static void ArrowIpcArrayStreamReaderRelease(struct ArrowArrayStream* stream) { ArrowBufferReset(&private_data->header); ArrowBufferReset(&private_data->body); + if (private_data->dictionaries.private_data != NULL) { + ArrowIpcDictionariesReset(&private_data->dictionaries); + } + ArrowFree(private_data); stream->release = NULL; } static int ArrowIpcArrayStreamReaderNextHeader( - struct ArrowIpcArrayStreamReaderPrivate* private_data, - enum ArrowIpcMessageType message_type) { + struct ArrowIpcArrayStreamReaderPrivate* private_data, int schema_expected) { private_data->header.size_bytes = 0; int64_t bytes_read = 0; @@ -332,7 +336,10 @@ static int ArrowIpcArrayStreamReaderNextHeader( // Don't decode the message if 
it's of the wrong type (because the error message // is better communicated by the caller) - if (private_data->decoder.message_type != message_type) { + if ((schema_expected && + private_data->decoder.message_type != NANOARROW_IPC_MESSAGE_TYPE_SCHEMA) || + (!schema_expected && + private_data->decoder.message_type == NANOARROW_IPC_MESSAGE_TYPE_SCHEMA)) { return NANOARROW_OK; } @@ -372,8 +379,7 @@ static int ArrowIpcArrayStreamReaderReadSchemaIfNeeded( return NANOARROW_OK; } - NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextHeader( - private_data, NANOARROW_IPC_MESSAGE_TYPE_SCHEMA)); + NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextHeader(private_data, 1)); // Error if this isn't a schema message if (private_data->decoder.message_type != NANOARROW_IPC_MESSAGE_TYPE_SCHEMA) { @@ -415,8 +421,17 @@ static int ArrowIpcArrayStreamReaderReadSchemaIfNeeded( return ENOTSUP; } + // Initialize dictionary decoders + int result = ArrowIpcDictionariesInit(&private_data->dictionaries, + &dictionary_encodings, &private_data->error); + if (result != NANOARROW_OK) { + ArrowIpcDictionaryEncodingsReset(&dictionary_encodings); + ArrowSchemaRelease(&tmp); + return result; + } + // Notify the decoder of the schema for forthcoming messages - int result = ArrowIpcDecoderSetSchemaWithDictionaries( + result = ArrowIpcDecoderSetSchemaWithDictionaries( &private_data->decoder, &tmp, &dictionary_encodings, &private_data->error); ArrowIpcDictionaryEncodingsReset(&dictionary_encodings); if (result != NANOARROW_OK) { @@ -437,19 +452,68 @@ static int ArrowIpcArrayStreamReaderGetSchema(struct ArrowArrayStream* stream, return ArrowSchemaDeepCopy(&private_data->out_schema, out); } -static int ArrowIpcArrayStreamReaderGetNext(struct ArrowArrayStream* stream, - struct ArrowArray* out) { - struct ArrowIpcArrayStreamReaderPrivate* private_data = - (struct ArrowIpcArrayStreamReaderPrivate*)stream->private_data; - ArrowErrorInit(&private_data->error); - 
NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderReadSchemaIfNeeded(private_data)); +static int ArrowIpcArrayStreamReaderProcessRecordBatch( + struct ArrowIpcArrayStreamReaderPrivate* private_data, struct ArrowArray* out) { + // Read in the body + NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data)); + + if (private_data->use_shared_buffers) { + struct ArrowIpcSharedBuffer shared; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); + int result = ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( + &private_data->decoder, &shared, private_data->field_index, + &private_data->dictionaries, out, NANOARROW_VALIDATION_LEVEL_FULL, + &private_data->error); + ArrowIpcSharedBufferReset(&shared); + NANOARROW_RETURN_NOT_OK(result); + } else { + struct ArrowBufferView body_view; + body_view.data.data = private_data->body.data; + body_view.size_bytes = private_data->body.size_bytes; + + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArray( + &private_data->decoder, body_view, private_data->field_index, out, + NANOARROW_VALIDATION_LEVEL_FULL, &private_data->error)); + } + + return NANOARROW_OK; +} + +static int ArrowIpcArrayStreamReaderProcessDictionary( + struct ArrowIpcArrayStreamReaderPrivate* private_data) { + if (!private_data->use_shared_buffers) { + ArrowErrorSet(&private_data->error, + "Dictionary decode without shared buffers is not supported"); + return ENOTSUP; + } + + // Read in the body + NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data)); + + // Decode the dictionary + struct ArrowIpcSharedBuffer shared; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); + int result = ArrowIpcDecoderDecodeDictionary( + &private_data->decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, + &private_data->dictionaries, &private_data->error); + ArrowIpcSharedBufferReset(&shared); + 
NANOARROW_RETURN_NOT_OK(result); + return NANOARROW_OK; +} + +static int ArrowIpcArrayStreamReaderProcessMessage( + struct ArrowIpcArrayStreamReaderPrivate* private_data, + enum ArrowIpcMessageType* message_type, struct ArrowArray* out) { // Read + decode the next header - int result = ArrowIpcArrayStreamReaderNextHeader( - private_data, NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH); + int result = ArrowIpcArrayStreamReaderNextHeader(private_data, 0); if (result == ENODATA) { // Stream is finished either because there is no input or because - // end of stream bytes were read. + // end of stream bytes were read. Read this as a RecordBatch in the + // sense that we populate out->release to NULL and return OK. + *message_type = NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH; out->release = NULL; return NANOARROW_OK; } else if (result != NANOARROW_OK) { @@ -457,44 +521,43 @@ static int ArrowIpcArrayStreamReaderGetNext(struct ArrowArrayStream* stream, return result; } - // Make sure we have a RecordBatch message + // Make sure we have a RecordBatch message or DictionaryBatch message switch (private_data->decoder.message_type) { case NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH: - break; + *message_type = NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH; + return ArrowIpcArrayStreamReaderProcessRecordBatch(private_data, out); case NANOARROW_IPC_MESSAGE_TYPE_DICTIONARY_BATCH: - ArrowErrorSet( - &private_data->error, - "Found valid dictionary batch but dictionary encoding is not yet supported"); - return ENOTSUP; + *message_type = NANOARROW_IPC_MESSAGE_TYPE_DICTIONARY_BATCH; + return ArrowIpcArrayStreamReaderProcessDictionary(private_data); default: ArrowErrorSet(&private_data->error, - "Unexpected message type (expected RecordBatch)"); + "Unexpected message type (expected RecordBatch or DictionaryBatch)"); return EINVAL; } +} - // Read in the body - NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data)); +static int ArrowIpcArrayStreamReaderGetNext(struct ArrowArrayStream* 
stream, + struct ArrowArray* out) { + struct ArrowIpcArrayStreamReaderPrivate* private_data = + (struct ArrowIpcArrayStreamReaderPrivate*)stream->private_data; + ArrowErrorInit(&private_data->error); + NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderReadSchemaIfNeeded(private_data)); + enum ArrowIpcMessageType message_type; struct ArrowArray tmp; + tmp.release = NULL; + + do { + int result = + ArrowIpcArrayStreamReaderProcessMessage(private_data, &message_type, &tmp); + if (result != NANOARROW_OK) { + if (tmp.release != NULL) { + ArrowArrayRelease(&tmp); + } - if (private_data->use_shared_buffers) { - struct ArrowIpcSharedBuffer shared; - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); - result = ArrowIpcDecoderDecodeArrayFromShared( - &private_data->decoder, &shared, private_data->field_index, &tmp, - NANOARROW_VALIDATION_LEVEL_FULL, &private_data->error); - ArrowIpcSharedBufferReset(&shared); - NANOARROW_RETURN_NOT_OK(result); - } else { - struct ArrowBufferView body_view; - body_view.data.data = private_data->body.data; - body_view.size_bytes = private_data->body.size_bytes; - - NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArray( - &private_data->decoder, body_view, private_data->field_index, &tmp, - NANOARROW_VALIDATION_LEVEL_FULL, &private_data->error)); - } + return result; + } + } while (message_type != NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH); ArrowArrayMove(&tmp, out); return NANOARROW_OK; @@ -528,6 +591,7 @@ ArrowErrorCode ArrowIpcArrayStreamReaderInit( private_data->out_schema.release = NULL; ArrowIpcInputStreamMove(input_stream, &private_data->input); private_data->expected_header_prefix_size = kExpectedHeaderPrefixSizeNotSet; + private_data->dictionaries.private_data = NULL; if (options != NULL) { private_data->field_index = options->field_index; diff --git a/src/nanoarrow/ipc/reader_test.cc b/src/nanoarrow/ipc/reader_test.cc index 8139503aa..1f258350b 100644 --- 
a/src/nanoarrow/ipc/reader_test.cc +++ b/src/nanoarrow/ipc/reader_test.cc @@ -335,7 +335,8 @@ TEST(NanoarrowIpcReader, StreamReaderExpectedRecordBatch) { struct ArrowArray array; struct ArrowError error; ASSERT_EQ(ArrowArrayStreamGetNext(&stream, &array, &error), EINVAL); - EXPECT_STREQ(error.message, "Unexpected message type (expected RecordBatch)"); + EXPECT_STREQ(error.message, + "Unexpected message type (expected RecordBatch or DictionaryBatch)"); ArrowArrayStreamRelease(&stream); } From 70cce65611d7c0124baab163ae713595fa0362a2 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 23:41:53 -0500 Subject: [PATCH 12/27] maybe actual failures --- src/nanoarrow/ipc/decoder.c | 6 ++--- src/nanoarrow/ipc/files_test.cc | 39 +++++++++++++++++++++++---------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index efcf2fb0f..2a41f9f9f 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2825,10 +2825,8 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( struct ArrowArray tmp; - // TODO: provide ArrowIpcDecoderDecodeArrayInternalWithDictionaries to handle nested - // dictionaries - NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayFromShared( - &dictionary->decoder, shared, 0, &tmp, validation_level, error)); + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( + &dictionary->decoder, shared, 0, dictionaries, &tmp, validation_level, error)); ArrowErrorCode result; if (decoder->dictionary->is_delta) { diff --git a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index 74337c7f5..5b9c657a5 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ b/src/nanoarrow/ipc/files_test.cc @@ -51,10 +51,12 @@ using namespace arrow; // would read. 
class TestFile { public: - TestFile(std::string path, int expected_return_code, std::string expected_error_message) + TestFile(std::string path, int expected_return_code, std::string expected_error_message, + bool write_supported = true) : path_(path), expected_return_code_(expected_return_code), - expected_error_message_(expected_error_message) {} + expected_error_message_(expected_error_message), + write_supported_(write_supported) {} TestFile(std::string path) : TestFile(path, NANOARROW_OK, "") {} @@ -62,6 +64,10 @@ class TestFile { static TestFile OK(std::string path) { return TestFile(path); } + static TestFile ReadOnly(std::string path) { + return TestFile(path, NANOARROW_OK, "", false); + } + static TestFile Err(int code, std::string path, std::string message = "__any__") { return TestFile(path, code, message); } @@ -228,7 +234,8 @@ class TestFile { return ArrowIpcWriterWriteArrayView(writer.get(), nullptr, error); } - void TestEqualsArrowCpp(const std::string& dir_prefix) { + void TestEqualsArrowCpp(const std::string& dir_prefix, + bool check_write_roundtrip = true) { std::stringstream path_builder; path_builder << dir_prefix << "/" << path_; @@ -251,11 +258,15 @@ class TestFile { GTEST_FAIL() << MakeError(NANOARROW_OK, ""); } - // Write back to a buffer using nanoarrow + // Write back to a buffer using nanoarrow if supported. We do this here + // because we need to move the arrays into the comparison for the Arrow C++ + // read. 
nanoarrow::UniqueBuffer roundtripped; - ASSERT_EQ(WriteNanoarrowStream(schema, arrays, roundtripped.get(), &error), - NANOARROW_OK) - << error.message; + if (write_supported_) { + ASSERT_EQ(WriteNanoarrowStream(schema, arrays, roundtripped.get(), &error), + NANOARROW_OK) + << error.message; + } // Read the same file with Arrow C++ auto maybe_table_arrow = ReadTable(io::ReadableFile::Open(path_builder.str())); @@ -266,6 +277,11 @@ class TestFile { maybe_table_arrow.ValueUnsafe()); } + // For types that aren't supported by the writer yet + if (!write_supported_) { + return; + } + auto maybe_table_roundtripped = ReadTable(BufferInputStream(roundtripped.get())); { SCOPED_TRACE("Read the roundtripped buffer using Arrow C++"); @@ -378,6 +394,7 @@ class TestFile { std::string path_; int expected_return_code_; std::string expected_error_message_; + bool write_supported_; }; // For better testing output @@ -477,10 +494,10 @@ INSTANTIATE_TEST_SUITE_P( TestFile::OK("generated_primitive.stream"), TestFile::OK("generated_recursive_nested.stream"), TestFile::OK("generated_union.stream"), - TestFile::OK("generated_dictionary_unsigned.stream"), - TestFile::OK("generated_dictionary.stream"), - TestFile::OK("generated_nested_dictionary.stream"), - TestFile::OK("generated_extension.stream") + TestFile::ReadOnly("generated_dictionary_unsigned.stream"), + TestFile::ReadOnly("generated_dictionary.stream"), + TestFile::ReadOnly("generated_nested_dictionary.stream"), + TestFile::ReadOnly("generated_extension.stream") // Comment to keep last line from wrapping )); From 09ad0e42f57754d178b0d207ff014d05c2e2f04c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 6 Apr 2026 23:54:19 -0500 Subject: [PATCH 13/27] fewer failures --- src/nanoarrow/ipc/decoder.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 2a41f9f9f..21948dda3 100644 --- a/src/nanoarrow/ipc/decoder.c +++ 
b/src/nanoarrow/ipc/decoder.c @@ -1260,8 +1260,8 @@ static int ArrowIpcDecoderSetType(struct ArrowSchema* schema, ns(Field_table_t) } // A fun corner case when decoding dictionaries: the extension metadata lives with -// the dictionary (i.e., the non-index type); however, the field metadata still -// needs to exist on the field. +// the dictionary (i.e., the non-index type); however, non-extension field metadata +// still needs to exist on the field. static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( struct ArrowSchema* schema) { NANOARROW_DCHECK(schema->dictionary != NULL); @@ -1302,15 +1302,18 @@ static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( int key_is_extension_metadata = key.size_bytes == extension_metadata_key.size_bytes && strncmp(key.data, extension_metadata_key.data, key.size_bytes) == 0; - if (!key_is_extension_name && !key_is_extension_metadata) { - result = ArrowMetadataBuilderAppend(&field_metadata, key, value); + + // Extension metadata stays on the dictionary + if (key_is_extension_name || key_is_extension_metadata) { + result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); if (result != NANOARROW_OK) { ArrowBufferReset(&field_metadata); ArrowBufferReset(&extension_metadata); return result; } } else { - result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); + // Non-extension metadata goes to the field + result = ArrowMetadataBuilderAppend(&field_metadata, key, value); if (result != NANOARROW_OK) { ArrowBufferReset(&field_metadata); ArrowBufferReset(&extension_metadata); @@ -2822,6 +2825,8 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( (struct ArrowIpcDecoderPrivate*)dictionary->decoder.private_data; dictionary->decoder.message_type = NANOARROW_IPC_MESSAGE_TYPE_RECORD_BATCH; dictionary_decoder_private_data->last_message = record_batch; + // Transfer the endianness setting so that buffers are byte-swapped if needed + dictionary_decoder_private_data->endianness = 
private_data->endianness; struct ArrowArray tmp; From 51be30eff36c8ff7bf890b7bfa78f50ba86c50c1 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 7 Apr 2026 00:04:10 -0500 Subject: [PATCH 14/27] comment --- src/nanoarrow/ipc/decoder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 21948dda3..644d251af 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -1303,7 +1303,7 @@ static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( key.size_bytes == extension_metadata_key.size_bytes && strncmp(key.data, extension_metadata_key.data, key.size_bytes) == 0; - // Extension metadata stays on the dictionary + // Extension metadata stays on the dictionary (value type) if (key_is_extension_name || key_is_extension_metadata) { result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); if (result != NANOARROW_OK) { From 12e9ac8147cb3323c3e3fac20d401c92ff85d70a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 7 Apr 2026 00:17:42 -0500 Subject: [PATCH 15/27] unregister uuid --- src/nanoarrow/ipc/files_test.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index 5b9c657a5..ea6be1ece 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ b/src/nanoarrow/ipc/files_test.cc @@ -24,6 +24,7 @@ #if defined(NANOARROW_BUILD_TESTS_WITH_ARROW) #include #include +#include #include #include #include @@ -421,7 +422,13 @@ class TestEndianFileFixture : public ::testing::TestWithParam { TestFile test_file; }; +bool EnsureUuidIsNotRegistered() { + return arrow::UnregisterExtensionType("arrow.uuid").ok(); +} + TEST_P(TestEndianFileFixture, NanoarrowIpcTestFileNativeEndian) { + EnsureUuidIsNotRegistered(); + std::stringstream dir_builder; ArrowError error; ArrowErrorInit(&error); @@ -439,6 +446,8 @@ TEST_P(TestEndianFileFixture, NanoarrowIpcTestFileNativeEndian) { } 
TEST_P(TestEndianFileFixture, NanoarrowIpcTestFileSwapEndian) { + EnsureUuidIsNotRegistered(); + std::stringstream dir_builder; ArrowError error; ArrowErrorInit(&error); @@ -456,6 +465,8 @@ TEST_P(TestEndianFileFixture, NanoarrowIpcTestFileSwapEndian) { } TEST_P(TestEndianFileFixture, NanoarrowIpcTestFileCheckJSON) { + EnsureUuidIsNotRegistered(); + std::stringstream dir_builder; ArrowError error; ArrowErrorInit(&error); From 1d526f3376fabf4a1a737566fa0f9e9055862882 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 7 Apr 2026 00:34:10 -0500 Subject: [PATCH 16/27] check extension type with dictionary storage --- src/nanoarrow/ipc/decoder_test.cc | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 1fc39c3ea..f330dc586 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -1438,6 +1438,43 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowArrayRoundtrip) { } } +// Extension type with dictionary storage for testing +class DictExtensionType : public ExtensionType { + public: + explicit DictExtensionType() : ExtensionType(dictionary(int32(), utf8())) {} + + std::string extension_name() const override { return "test.dict_extension"; } + + bool ExtensionEquals(const ExtensionType& other) const override { + return other.extension_name() == extension_name(); + } + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override { + return std::make_shared(); + } + + std::string Serialize() const override { return ""; } +}; + +std::shared_ptr dict_extension() { + static bool registered = false; + auto type = std::make_shared(); + if (!registered) { + auto status = RegisterExtensionType(type); + if (!status.ok() && !status.IsKeyError()) { + status.Abort(); + } + 
registered = true; + } + return type; +} + INSTANTIATE_TEST_SUITE_P( NanoarrowIpcTest, ArrowTypeParameterizedTestFixture, ::testing::Values( @@ -1487,6 +1524,8 @@ INSTANTIATE_TEST_SUITE_P( arrow::dictionary(arrow::int32(), arrow::utf8(), true), // Extension type arrow::extension::uuid(), + // Extension type with dictionary as the storage type + dict_extension(), // Dictionary-encoded extension arrow::dictionary(arrow::int32(), arrow::extension::uuid()))); From 5e4f273dbc4ca914f53b9b553c97cc8897bbd264 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Thu, 9 Apr 2026 22:50:31 -0500 Subject: [PATCH 17/27] hack to make the integration tests pass --- src/nanoarrow/ipc/decoder.c | 25 ++++++++++++++++++++----- src/nanoarrow/ipc/decoder_test.cc | 16 +++++++++++++++- src/nanoarrow/ipc/files_test.cc | 1 + 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 644d251af..abd68635a 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -1273,6 +1273,19 @@ static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( return NANOARROW_OK; } + // Temporary hack: if this is an extension type called "dict-extension", which is + // the exact name used in the integration tests AND the roundtrip tests for the + // decoder, move all the extension metadata back to the field because it is the + // only known extension type that supports a dictionary as the storage type. 
+ struct ArrowSchemaView schema_view; + NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema->dictionary, NULL)); + + int all_metadata_back_to_field = 0; + if (schema_view.extension_name.size_bytes == 14 && + strncmp(schema_view.extension_name.data, "dict-extension", 14) == 0) { + all_metadata_back_to_field = 1; + } + struct ArrowBuffer field_metadata; struct ArrowBuffer extension_metadata; NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&field_metadata, NULL)); @@ -1303,17 +1316,19 @@ static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( key.size_bytes == extension_metadata_key.size_bytes && strncmp(key.data, extension_metadata_key.data, key.size_bytes) == 0; - // Extension metadata stays on the dictionary (value type) - if (key_is_extension_name || key_is_extension_metadata) { - result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); + if (all_metadata_back_to_field || + (!key_is_extension_name && !key_is_extension_metadata)) { + // Non-extension metadata goes to the field + result = ArrowMetadataBuilderAppend(&field_metadata, key, value); if (result != NANOARROW_OK) { ArrowBufferReset(&field_metadata); ArrowBufferReset(&extension_metadata); return result; } } else { - // Non-extension metadata goes to the field - result = ArrowMetadataBuilderAppend(&field_metadata, key, value); + // Extension metadata stays on the dictionary (value type) unless + // all_metadata_back_to_field is non-zero. 
+ result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); if (result != NANOARROW_OK) { ArrowBufferReset(&field_metadata); ArrowBufferReset(&extension_metadata); diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index f330dc586..4f84404f2 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -1219,6 +1219,13 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowTypeRoundtrip) { GTEST_SKIP() << "Dictionary array encode is not yet supported"; } + if (GetParam()->id() == arrow::Type::EXTENSION && + std::static_pointer_cast(GetParam())->storage_type()->id() == + arrow::Type::DICTIONARY) { + GTEST_SKIP() + << "nanoarrow encoder cannot yet encode extension types with dictionary storage"; + } + nanoarrow::UniqueSchema schema; ASSERT_TRUE( arrow::ExportSchema(arrow::Schema({arrow::field("", GetParam())}), schema.get()) @@ -1385,6 +1392,13 @@ TEST_P(ArrowTypeParameterizedTestFixture, NanoarrowIpcNanoarrowArrayRoundtrip) { GTEST_SKIP() << "nanoarrow encoder cannot yet encode dictionaries"; } + if (GetParam()->id() == arrow::Type::EXTENSION && + std::static_pointer_cast(GetParam())->storage_type()->id() == + arrow::Type::DICTIONARY) { + GTEST_SKIP() + << "nanoarrow encoder cannot yet encode extension types with dictionary storage"; + } + struct ArrowError error; nanoarrow::UniqueSchema schema; ASSERT_TRUE( @@ -1443,7 +1457,7 @@ class DictExtensionType : public ExtensionType { public: explicit DictExtensionType() : ExtensionType(dictionary(int32(), utf8())) {} - std::string extension_name() const override { return "test.dict_extension"; } + std::string extension_name() const override { return "dict-extension"; } bool ExtensionEquals(const ExtensionType& other) const override { return other.extension_name() == extension_name(); diff --git a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index ea6be1ece..6b70a0c7f 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ 
b/src/nanoarrow/ipc/files_test.cc @@ -365,6 +365,7 @@ class TestFile { // Use testing utility to compare nanoarrow::testing::TestingJSONComparison comparison; + comparison.set_compare_metadata_order(false); ASSERT_EQ(comparison.CompareArrayStream(ipc_stream.get(), json_stream.get(), &error), NANOARROW_OK) << error.message; From 40fbdbd88336748a8378ec2d4d2e2bbc9909162f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sat, 11 Apr 2026 21:45:17 -0500 Subject: [PATCH 18/27] maybe fix namespace build and field metadata for dictionaries --- src/nanoarrow/ipc/decoder.c | 104 ++++-------------------------- src/nanoarrow/ipc/decoder_test.cc | 19 +++--- src/nanoarrow/nanoarrow_ipc.h | 2 + 3 files changed, 24 insertions(+), 101 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index abd68635a..d961f6bbc 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -1259,96 +1259,15 @@ static int ArrowIpcDecoderSetType(struct ArrowSchema* schema, ns(Field_table_t) } } -// A fun corner case when decoding dictionaries: the extension metadata lives with -// the dictionary (i.e., the non-index type); however, non-extension field metadata -// still needs to exist on the field. 
-static int ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded( - struct ArrowSchema* schema) { +// When decoding dictionaries, we move the value type to the schema->dictionary +// member, but we need to move the field metadata back because in IPC there +// is no such thing as dictionary metadata (even extension metadata) +// https://github.com/apache/arrow/issues/49704 +static int ArrowIpcMoveDictionaryMetadataBackToField(struct ArrowSchema* schema) { NANOARROW_DCHECK(schema->dictionary != NULL); - struct ArrowMetadataReader reader; - NANOARROW_RETURN_NOT_OK(ArrowMetadataReaderInit(&reader, schema->dictionary->metadata)); - - // For the most common case (no metadata), nothing needs to be done here - if (reader.remaining_keys == 0) { - return NANOARROW_OK; - } - - // Temporary hack: if this is an extension type called "dict-extension", which is - // the exact name used in the integration tests AND the roundtrip tests for the - // decoder, move all the extension metadata back to the field because it is the - // only known extension type that supports a dictionary as the storage type. 
- struct ArrowSchemaView schema_view; - NANOARROW_RETURN_NOT_OK(ArrowSchemaViewInit(&schema_view, schema->dictionary, NULL)); - - int all_metadata_back_to_field = 0; - if (schema_view.extension_name.size_bytes == 14 && - strncmp(schema_view.extension_name.data, "dict-extension", 14) == 0) { - all_metadata_back_to_field = 1; - } - - struct ArrowBuffer field_metadata; - struct ArrowBuffer extension_metadata; - NANOARROW_RETURN_NOT_OK(ArrowMetadataBuilderInit(&field_metadata, NULL)); - ArrowErrorCode result = ArrowMetadataBuilderInit(&extension_metadata, NULL); - if (result != NANOARROW_OK) { - ArrowBufferReset(&field_metadata); - return result; - } - - const struct ArrowStringView extension_name_key = ArrowCharView("ARROW:extension:name"); - const struct ArrowStringView extension_metadata_key = - ArrowCharView("ARROW:extension:metadata"); - - struct ArrowStringView key; - struct ArrowStringView value; - while (reader.remaining_keys > 0) { - result = ArrowMetadataReaderRead(&reader, &key, &value); - if (result != NANOARROW_OK) { - ArrowBufferReset(&field_metadata); - ArrowBufferReset(&extension_metadata); - return result; - } - - int key_is_extension_name = - key.size_bytes == extension_name_key.size_bytes && - strncmp(key.data, extension_name_key.data, key.size_bytes) == 0; - int key_is_extension_metadata = - key.size_bytes == extension_metadata_key.size_bytes && - strncmp(key.data, extension_metadata_key.data, key.size_bytes) == 0; - - if (all_metadata_back_to_field || - (!key_is_extension_name && !key_is_extension_metadata)) { - // Non-extension metadata goes to the field - result = ArrowMetadataBuilderAppend(&field_metadata, key, value); - if (result != NANOARROW_OK) { - ArrowBufferReset(&field_metadata); - ArrowBufferReset(&extension_metadata); - return result; - } - } else { - // Extension metadata stays on the dictionary (value type) unless - // all_metadata_back_to_field is non-zero. 
- result = ArrowMetadataBuilderAppend(&extension_metadata, key, value); - if (result != NANOARROW_OK) { - ArrowBufferReset(&field_metadata); - ArrowBufferReset(&extension_metadata); - return result; - } - } - } - - result = ArrowSchemaSetMetadata(schema, (char*)field_metadata.data); - if (result != NANOARROW_OK) { - ArrowBufferReset(&field_metadata); - ArrowBufferReset(&extension_metadata); - return result; - } - - result = ArrowSchemaSetMetadata(schema->dictionary, (char*)extension_metadata.data); - ArrowBufferReset(&field_metadata); - ArrowBufferReset(&extension_metadata); - - return result; + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetMetadata(schema, schema->dictionary->metadata)); + NANOARROW_RETURN_NOT_OK(ArrowSchemaSetMetadata(schema->dictionary, NULL)); + return NANOARROW_OK; } static int ArrowIpcSetDictionaryEncoding( @@ -1387,10 +1306,9 @@ static int ArrowIpcSetDictionaryEncoding( schema->flags |= ARROW_FLAG_DICTIONARY_ORDERED; } - // Field metadata should stay with the field; however, we need the extension metadata - // to stay with the dictionary. 
- NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowIpcMoveNonExtensionFieldMetadataBackToFieldIfNeeded(schema), error); + // Sort out field metadata between the schema and the dictionary member + NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowIpcMoveDictionaryMetadataBackToField(schema), + error); // Track the identifier if we have a dictionaries object in which to track it if (dictionaries != NULL) { diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 4f84404f2..3ae3dfec3 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -1539,9 +1539,11 @@ INSTANTIATE_TEST_SUITE_P( // Extension type arrow::extension::uuid(), // Extension type with dictionary as the storage type - dict_extension(), - // Dictionary-encoded extension - arrow::dictionary(arrow::int32(), arrow::extension::uuid()))); + dict_extension() + // Dictionary-encoded extension is not supported in IPC + // https://github.com/apache/arrow/issues/49704 + // arrow::dictionary(arrow::int32(), arrow::extension::uuid())) + )); class ArrowSchemaParameterizedTestFixture : public ::testing::TestWithParam> { @@ -1702,11 +1704,12 @@ INSTANTIATE_TEST_SUITE_P( // Dictionary with field metadata arrow::schema({arrow::field( "some_name", arrow::dictionary(arrow::int32(), arrow::utf8()), - arrow::KeyValueMetadata::Make({"key1", "key2"}, {"value1", "value2"}))}), - // Dictionary with field metadata - arrow::schema({arrow::field( - "some_name", arrow::dictionary(arrow::int32(), arrow::extension::uuid()), - arrow::KeyValueMetadata::Make({"key1", "key2"}, {"value1", "value2"}))}))); + arrow::KeyValueMetadata::Make({"key1", "key2"}, {"value1", "value2"}))}) + // Dictionary with extension storage and field metadata is not supported in IPC + // arrow::schema({arrow::field( + // "some_name", arrow::dictionary(arrow::int32(), arrow::extension::uuid()), + // arrow::KeyValueMetadata::Make({"key1", "key2"}, {"value1", "value2"}))}) + )); class 
ArrowTypeIdParameterizedTestFixture : public ::testing::TestWithParam { diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index 309abb6a2..0148040cb 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -120,6 +120,8 @@ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsInit) #define ArrowIpcDictionaryEncodingsAppend \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsAppend) +#define ArrowIpcDictionaryEncodingsAppendSchema \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsAppendSchema) #define ArrowIpcDictionaryEncodingsFind \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionaryEncodingsFind) #define ArrowIpcDictionaryEncodingsFindById \ From 29c04146463e2dba5f3a4366ebe04bfed3b5ee26 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 13 Apr 2026 09:53:12 -0500 Subject: [PATCH 19/27] add definition for non-shared decoding --- src/nanoarrow/ipc/decoder.c | 2 +- src/nanoarrow/ipc/decoder_test.cc | 16 ++++++++-------- src/nanoarrow/ipc/reader.c | 2 +- src/nanoarrow/nanoarrow_ipc.h | 22 ++++++++++++++++++++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index d961f6bbc..bdb66ea49 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2722,7 +2722,7 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( validation_level, error); } -NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* dictionaries, struct ArrowError* error) { diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 3ae3dfec3..0eb2d441f 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -736,8 +736,8 @@ 
TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ASSERT_EQ(ArrowIpcDecoderInit(&decoder), NANOARROW_OK); struct ArrowIpcSharedBuffer shared; ASSERT_EQ( - ArrowIpcDecoderDecodeDictionary(&decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, - &dictionaries, &error), + ArrowIpcDecoderDecodeDictionaryFromShared( + &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), EINVAL); ASSERT_STREQ(error.message, "decoder did not just decode a DictionaryBatch message"); @@ -762,8 +762,8 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ASSERT_EQ(ArrowIpcSharedBufferInit(&shared, &body), NANOARROW_OK); ASSERT_EQ( - ArrowIpcDecoderDecodeDictionary(&decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, - &dictionaries, &error), + ArrowIpcDecoderDecodeDictionaryFromShared( + &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), NANOARROW_OK); // If we find the current value of the dictionary we should get the correct array @@ -789,8 +789,8 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { // If we try to decode the dictionary again it should succeed (because the dictionary // is in replacement mode) ASSERT_EQ( - ArrowIpcDecoderDecodeDictionary(&decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, - &dictionaries, &error), + ArrowIpcDecoderDecodeDictionaryFromShared( + &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), NANOARROW_OK); ASSERT_EQ(ArrowArrayViewSetArray(&array_view, dictionary_value, &error), NANOARROW_OK); @@ -802,8 +802,8 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { // If we try to decode a delta dictionary, we should fail with a reasonable message const_cast(decoder.dictionary)->is_delta = 1; ASSERT_EQ( - ArrowIpcDecoderDecodeDictionary(&decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, - &dictionaries, &error), + ArrowIpcDecoderDecodeDictionaryFromShared( + &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, 
&error), ENOTSUP); ASSERT_STREQ(error.message, "Dictionary concatenation is not yet supported"); diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c index 9780dccf3..141f154a7 100644 --- a/src/nanoarrow/ipc/reader.c +++ b/src/nanoarrow/ipc/reader.c @@ -495,7 +495,7 @@ static int ArrowIpcArrayStreamReaderProcessDictionary( struct ArrowIpcSharedBuffer shared; NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); - int result = ArrowIpcDecoderDecodeDictionary( + int result = ArrowIpcDecoderDecodeDictionaryFromShared( &private_data->decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &private_data->dictionaries, &private_data->error); ArrowIpcSharedBufferReset(&shared); diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index 0148040cb..f1f4c5acc 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -138,6 +138,8 @@ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDictionariesReset) #define ArrowIpcDecoderDecodeDictionary \ NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeDictionary) +#define ArrowIpcDecoderDecodeDictionaryFromShared \ + NANOARROW_SYMBOL(NANOARROW_NAMESPACE, ArrowIpcDecoderDecodeDictionaryFromShared) #endif @@ -680,11 +682,31 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( struct ArrowArray* out, enum ArrowValidationLevel validation_level, struct ArrowError* error); +/// \brief Decode an ArrowArray from a dictionary batch into the given +/// ArrowIpcDictionaries +/// +/// After a successful call to ArrowIpcDecoderDecodeHeader(), assemble an ArrowArray +/// and place it into out for the decoding of future dictionaries. Note that other +/// dictionaries in out may be used during the decoding if there are nested dictionaries +/// in this stream. The decoded value may be obtained with +/// ArrowIpcDictionariesFindCurrentValue.
NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* out, struct ArrowError* error); +/// \brief Decode an ArrowArray from a dictionary batch from an owned buffer +/// +/// This implementation takes advantage of the fact that it can avoid copying individual +/// buffers. In all cases the caller must ArrowIpcSharedBufferReset() shared after one or +/// more calls to ArrowIpcDecoderDecodeDictionaryFromShared(). If +/// ArrowIpcSharedBufferIsThreadSafe() returns 0, no batches decoded using out may +/// be released from another thread. +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( + struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, + enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* out, + struct ArrowError* error); + /// \brief An user-extensible input data source struct ArrowIpcInputStream { /// \brief Read up to buf_size_bytes from stream into buf From 89c3d20d7d3186c7ac0dfe10ba093397dc3ed07a Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 13 Apr 2026 10:00:24 -0500 Subject: [PATCH 20/27] with internals --- src/nanoarrow/ipc/decoder.c | 52 ++++++++++++++++++++++++++++------- src/nanoarrow/nanoarrow_ipc.h | 2 +- 2 files changed, 43 insertions(+), 11 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index bdb66ea49..4b0239c13 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -2722,14 +2722,10 @@ ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( validation_level, error); } -NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( - struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, +static ArrowErrorCode ArrowIpcDecoderDecodeDictionaryInternal( + struct ArrowIpcDecoder* decoder, struct ArrowIpcBufferFactory factory, enum ArrowValidationLevel
validation_level, struct ArrowIpcDictionaries* dictionaries, struct ArrowError* error) { - NANOARROW_DCHECK(decoder != NULL); - NANOARROW_DCHECK(shared != NULL); - NANOARROW_DCHECK(dictionaries != NULL); - struct ArrowIpcDecoderPrivate* private_data = (struct ArrowIpcDecoderPrivate*)decoder->private_data; @@ -2761,12 +2757,23 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( // Transfer the endianness setting so that buffers are byte-swapped if needed dictionary_decoder_private_data->endianness = private_data->endianness; - struct ArrowArray tmp; + struct ArrowArrayView* array_view; + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayViewInternal( + &dictionary->decoder, factory, 0, dictionaries, &array_view, error)); - NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayFromSharedWithDictionaries( - &dictionary->decoder, shared, 0, dictionaries, &tmp, validation_level, error)); + NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidate(array_view, validation_level, error)); + + struct ArrowArray tmp; + tmp.release = NULL; + int result = ArrowIpcDecoderDecodeArrayInternal(&dictionary->decoder, 0, &tmp, + validation_level, error); + if (result != NANOARROW_OK && tmp.release != NULL) { + ArrowArrayRelease(&tmp); + return result; + } else if (result != NANOARROW_OK) { + return result; + } - ArrowErrorCode result; if (decoder->dictionary->is_delta) { result = ArrowIpcDictionaryAppend(dictionary, &tmp, error); } else { @@ -2780,3 +2787,28 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( return NANOARROW_OK; } + +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, + enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* dictionaries, + struct ArrowError* error) { + NANOARROW_DCHECK(decoder != NULL); + NANOARROW_DCHECK(dictionaries != NULL); + + return ArrowIpcDecoderDecodeDictionaryInternal(decoder, + ArrowIpcBufferFactoryFromView(&body), + 
validation_level, dictionaries, error); +} + +NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionaryFromShared( + struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, + enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* dictionaries, + struct ArrowError* error) { + NANOARROW_DCHECK(decoder != NULL); + NANOARROW_DCHECK(shared != NULL); + NANOARROW_DCHECK(dictionaries != NULL); + + return ArrowIpcDecoderDecodeDictionaryInternal(decoder, + ArrowIpcBufferFactoryFromShared(shared), + validation_level, dictionaries, error); +} diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index f1f4c5acc..f4c8ef2cf 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -691,7 +691,7 @@ NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeArrayFromShared( /// in this stream. The decoded value may be obtained with /// ArrowIpcDictionariesFindCurrentValue. NANOARROW_DLL ArrowErrorCode ArrowIpcDecoderDecodeDictionary( - struct ArrowIpcDecoder* decoder, struct ArrowIpcSharedBuffer* shared, + struct ArrowIpcDecoder* decoder, struct ArrowBufferView body, enum ArrowValidationLevel validation_level, struct ArrowIpcDictionaries* out, struct ArrowError* error); From c05a471c04257adc16fc3fa5a121947a73f81d0c Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 13 Apr 2026 10:07:33 -0500 Subject: [PATCH 21/27] implement support in the reader --- src/nanoarrow/ipc/reader.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c index 141f154a7..cf0d589ef 100644 --- a/src/nanoarrow/ipc/reader.c +++ b/src/nanoarrow/ipc/reader.c @@ -482,24 +482,27 @@ static int ArrowIpcArrayStreamReaderProcessRecordBatch( static int ArrowIpcArrayStreamReaderProcessDictionary( struct ArrowIpcArrayStreamReaderPrivate* private_data) { - if (!private_data->use_shared_buffers) { - ArrowErrorSet(&private_data->error, - 
"Dictionary decode without shared buffers is not supported"); - return ENOTSUP; - } - // Read in the body NANOARROW_RETURN_NOT_OK(ArrowIpcArrayStreamReaderNextBody(private_data)); - // Decode the dictionary - struct ArrowIpcSharedBuffer shared; - NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); - int result = ArrowIpcDecoderDecodeDictionaryFromShared( - &private_data->decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, - &private_data->dictionaries, &private_data->error); - ArrowIpcSharedBufferReset(&shared); - NANOARROW_RETURN_NOT_OK(result); + if (private_data->use_shared_buffers) { + // Decode the dictionary + struct ArrowIpcSharedBuffer shared; + NANOARROW_RETURN_NOT_OK_WITH_ERROR( + ArrowIpcSharedBufferInit(&shared, &private_data->body), &private_data->error); + int result = ArrowIpcDecoderDecodeDictionaryFromShared( + &private_data->decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, + &private_data->dictionaries, &private_data->error); + ArrowIpcSharedBufferReset(&shared); + NANOARROW_RETURN_NOT_OK(result); + } else { + struct ArrowBufferView body_view; + body_view.data.data = private_data->body.data; + body_view.size_bytes = private_data->body.size_bytes; + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeDictionary( + &private_data->decoder, body_view, NANOARROW_VALIDATION_LEVEL_FULL, + &private_data->dictionaries, &private_data->error)); + } return NANOARROW_OK; } From 400f5cfe70f29861bc71f14270b824331126ddc3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 13 Apr 2026 10:11:07 -0500 Subject: [PATCH 22/27] simpler test usage --- src/nanoarrow/ipc/decoder_test.cc | 41 +++++++++++++------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/src/nanoarrow/ipc/decoder_test.cc b/src/nanoarrow/ipc/decoder_test.cc index 0eb2d441f..2143b610a 100644 --- a/src/nanoarrow/ipc/decoder_test.cc +++ b/src/nanoarrow/ipc/decoder_test.cc @@ -734,11 +734,12 @@ TEST(NanoarrowIpcTest, 
NanoarrowIpcDecodeDictionaryBatchDecode) { // Check that we can't decode a dictionary batch if we haven't read a dictionary batch // message ASSERT_EQ(ArrowIpcDecoderInit(&decoder), NANOARROW_OK); - struct ArrowIpcSharedBuffer shared; - ASSERT_EQ( - ArrowIpcDecoderDecodeDictionaryFromShared( - &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), - EINVAL); + struct ArrowBufferView body; + body.data.data = nullptr; + body.size_bytes = 0; + ASSERT_EQ(ArrowIpcDecoderDecodeDictionary( + &decoder, body, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), + EINVAL); ASSERT_STREQ(error.message, "decoder did not just decode a DictionaryBatch message"); // Decode a dictionary batch and inspect metadata @@ -754,17 +755,12 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { EXPECT_FALSE(decoder.dictionary->is_delta); // Decode the dictionary batch - data.data.as_uint8 += decoder.header_size_bytes; - data.size_bytes = decoder.body_size_bytes; - struct ArrowBuffer body; - ArrowBufferInit(&body); - ASSERT_EQ(ArrowBufferAppendBufferView(&body, data), NANOARROW_OK); + body.data.as_uint8 = data.data.as_uint8 + decoder.header_size_bytes; + body.size_bytes = decoder.body_size_bytes; - ASSERT_EQ(ArrowIpcSharedBufferInit(&shared, &body), NANOARROW_OK); - ASSERT_EQ( - ArrowIpcDecoderDecodeDictionaryFromShared( - &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), - NANOARROW_OK); + ASSERT_EQ(ArrowIpcDecoderDecodeDictionary( + &decoder, body, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), + NANOARROW_OK); // If we find the current value of the dictionary we should get the correct array const struct ArrowArray* dictionary_value; @@ -788,10 +784,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { // If we try to decode the dictionary again it should succeed (because the dictionary // is in replacement mode) - ASSERT_EQ( - ArrowIpcDecoderDecodeDictionaryFromShared( - &decoder, &shared, 
NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), - NANOARROW_OK); + ASSERT_EQ(ArrowIpcDecoderDecodeDictionary( + &decoder, body, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), + NANOARROW_OK); ASSERT_EQ(ArrowArrayViewSetArray(&array_view, dictionary_value, &error), NANOARROW_OK); ASSERT_EQ(array_view.length, 3); @@ -801,10 +796,9 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { // If we try to decode a delta dictionary, we should fail with a reasonable message const_cast(decoder.dictionary)->is_delta = 1; - ASSERT_EQ( - ArrowIpcDecoderDecodeDictionaryFromShared( - &decoder, &shared, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), - ENOTSUP); + ASSERT_EQ(ArrowIpcDecoderDecodeDictionary( + &decoder, body, NANOARROW_VALIDATION_LEVEL_FULL, &dictionaries, &error), + ENOTSUP); ASSERT_STREQ(error.message, "Dictionary concatenation is not yet supported"); // After all of this, we should be able to actually decode a RecordBatch @@ -871,7 +865,6 @@ TEST(NanoarrowIpcTest, NanoarrowIpcDecodeDictionaryBatchDecode) { ArrowIpcSharedBufferReset(&record_batch_shared); ArrowArrayViewReset(&array_view); - ArrowIpcSharedBufferReset(&shared); ArrowIpcDictionariesReset(&dictionaries); ArrowIpcDictionaryEncodingsReset(&dictionary_encodings); ArrowSchemaRelease(&schema); From 7a1b5f2c6369dfbaf743f108c18dc87d786fc0cc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Mon, 13 Apr 2026 10:37:48 -0500 Subject: [PATCH 23/27] whoops --- src/nanoarrow/ipc/reader.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c index cf0d589ef..43f99584e 100644 --- a/src/nanoarrow/ipc/reader.c +++ b/src/nanoarrow/ipc/reader.c @@ -472,9 +472,10 @@ static int ArrowIpcArrayStreamReaderProcessRecordBatch( body_view.data.data = private_data->body.data; body_view.size_bytes = private_data->body.size_bytes; - NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArray( - &private_data->decoder, 
body_view, private_data->field_index, out, - NANOARROW_VALIDATION_LEVEL_FULL, &private_data->error)); + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayWithDictionaries( + &private_data->decoder, body_view, private_data->field_index, + &private_data->dictionaries, out, NANOARROW_VALIDATION_LEVEL_FULL, + &private_data->error)); } return NANOARROW_OK; From a51fc052d7ca14512702e9f185f61872d2eff283 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 17 Apr 2026 11:03:38 -0500 Subject: [PATCH 24/27] fix reader cleanup --- src/nanoarrow/ipc/reader.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nanoarrow/ipc/reader.c b/src/nanoarrow/ipc/reader.c index 43f99584e..897508241 100644 --- a/src/nanoarrow/ipc/reader.c +++ b/src/nanoarrow/ipc/reader.c @@ -435,6 +435,7 @@ static int ArrowIpcArrayStreamReaderReadSchemaIfNeeded( &private_data->decoder, &tmp, &dictionary_encodings, &private_data->error); ArrowIpcDictionaryEncodingsReset(&dictionary_encodings); if (result != NANOARROW_OK) { + ArrowIpcDictionariesReset(&private_data->dictionaries); ArrowSchemaRelease(&tmp); return result; } From 67a534c03b73f440b3bb5350758c98667d2f5646 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 17 Apr 2026 11:15:52 -0500 Subject: [PATCH 25/27] add reader tests --- src/nanoarrow/ipc/reader_test.cc | 161 +++++++++++++++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/src/nanoarrow/ipc/reader_test.cc b/src/nanoarrow/ipc/reader_test.cc index 1f258350b..95ead18b7 100644 --- a/src/nanoarrow/ipc/reader_test.cc +++ b/src/nanoarrow/ipc/reader_test.cc @@ -57,6 +57,65 @@ static uint8_t kSimpleRecordBatch[] = { 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +alignas(8) static uint8_t kDictionarySchema[] = { + 0xff, 0xff, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0a, 0x00, 0x0e, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, + 0x00, 0x01, 
0x04, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0c, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, 0x41, 0x0a, 0x33, 0x0a, 0x32, 0x36, + 0x33, 0x31, 0x37, 0x30, 0x0a, 0x31, 0x39, 0x37, 0x38, 0x38, 0x38, 0x0a, 0x35, 0x0a, + 0x55, 0x54, 0x46, 0x2d, 0x38, 0x0a, 0x35, 0x33, 0x31, 0x0a, 0x31, 0x0a, 0x35, 0x33, + 0x31, 0x0a, 0x31, 0x0a, 0x32, 0x35, 0x34, 0x0a, 0x31, 0x30, 0x32, 0x36, 0x0a, 0x31, + 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35, 0x33, 0x0a, 0x35, 0x0a, 0x6e, 0x61, 0x6d, 0x65, + 0x73, 0x0a, 0x31, 0x36, 0x0a, 0x31, 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35, 0x33, 0x0a, + 0x38, 0x0a, 0x73, 0x6f, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6c, 0x0a, 0x32, 0x35, 0x34, + 0x0a, 0x31, 0x30, 0x32, 0x36, 0x0a, 0x35, 0x31, 0x31, 0x0a, 0x31, 0x36, 0x0a, 0x31, + 0x0a, 0x32, 0x36, 0x32, 0x31, 0x35, 0x33, 0x0a, 0x37, 0x0a, 0x63, 0x6f, 0x6c, 0x75, + 0x6d, 0x6e, 0x73, 0x0a, 0x32, 0x35, 0x34, 0x0a, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x72, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x18, 0x00, 0x08, 0x00, 0x06, 0x00, 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x05, 0x14, 0x00, 0x00, 0x00, 0x48, 0x00, + 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x6d, 0x65, 0x5f, 0x63, 0x6f, 0x6c, 0x00, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x08, 0x00, 0x07, 0x00, 0x08, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +alignas(8) static uint8_t kDictionaryBatch[] = { + 0xff, 0xff, 
0xff, 0xff, 0xa8, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x00, 0x02, 0x04, 0x00, 0x14, 0x00, 0x00, 0x00, 0x20, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x18, 0x00, + 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, + 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x7a, 0x65, 0x72, 0x6f, + 0x6f, 0x6e, 0x65, 0x74, 0x77, 0x6f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +alignas(8) static uint8_t kDictionaryRecordBatch[] = { + 0xff, 0xff, 0xff, 0xff, 0x88, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0c, 0x00, 0x16, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x18, 0x00, 0x00, 0x00, 0x08, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x18, 0x00, 0x0c, 0x00, + 0x04, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x10, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00}; + static uint8_t kEndOfStream[] = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; TEST(NanoarrowIpcReader, InputStreamBuffer) { @@ -483,3 +542,105 @@ TEST(NanoarrowIpcReader, StreamReaderIncompletePrefix) { ArrowArrayStreamRelease(&stream); } + +TEST(NanoarrowIpcReader, StreamReaderDictionary) { + struct ArrowBuffer input_buffer; + ArrowBufferInit(&input_buffer); + ASSERT_EQ( + ArrowBufferAppend(&input_buffer, kDictionarySchema, sizeof(kDictionarySchema)), + NANOARROW_OK); + ASSERT_EQ(ArrowBufferAppend(&input_buffer, kDictionaryBatch, sizeof(kDictionaryBatch)), + NANOARROW_OK); + ASSERT_EQ(ArrowBufferAppend(&input_buffer, kDictionaryRecordBatch, + sizeof(kDictionaryRecordBatch)), + NANOARROW_OK); + + struct ArrowIpcInputStream input; + ASSERT_EQ(ArrowIpcInputStreamInitBuffer(&input, &input_buffer), NANOARROW_OK); + + struct ArrowArrayStream stream; + ASSERT_EQ(ArrowIpcArrayStreamReaderInit(&stream, &input, nullptr), NANOARROW_OK); + + struct ArrowSchema schema; + ASSERT_EQ(ArrowArrayStreamGetSchema(&stream, &schema, nullptr), NANOARROW_OK); + EXPECT_STREQ(schema.format, "+s"); + ASSERT_EQ(schema.n_children, 1); + // Dictionary-encoded field with int8 indices + EXPECT_STREQ(schema.children[0]->format, "c"); + ASSERT_NE(schema.children[0]->dictionary, nullptr); + // Dictionary values are utf8 strings + EXPECT_STREQ(schema.children[0]->dictionary->format, "u"); + ArrowSchemaRelease(&schema); + + struct ArrowArray array; + ASSERT_EQ(ArrowArrayStreamGetNext(&stream, &array, nullptr), NANOARROW_OK); + EXPECT_EQ(array.length, 3); + ASSERT_EQ(array.n_children, 1); + // The child should have a dictionary + ASSERT_NE(array.children[0]->dictionary, nullptr); + EXPECT_EQ(array.children[0]->dictionary->length, 3); + 
ArrowArrayRelease(&array); + + ASSERT_EQ(ArrowArrayStreamGetNext(&stream, &array, nullptr), NANOARROW_OK); + EXPECT_EQ(array.release, nullptr); + + ArrowArrayStreamRelease(&stream); +} + +TEST(NanoarrowIpcReader, StreamReaderDictionaryBatchWithoutDictionarySchema) { + // Send a dictionary batch when the schema has no dictionaries + struct ArrowBuffer input_buffer; + ArrowBufferInit(&input_buffer); + ASSERT_EQ(ArrowBufferAppend(&input_buffer, kSimpleSchema, sizeof(kSimpleSchema)), + NANOARROW_OK); + ASSERT_EQ(ArrowBufferAppend(&input_buffer, kDictionaryBatch, sizeof(kDictionaryBatch)), + NANOARROW_OK); + + struct ArrowIpcInputStream input; + ASSERT_EQ(ArrowIpcInputStreamInitBuffer(&input, &input_buffer), NANOARROW_OK); + + struct ArrowArrayStream stream; + ASSERT_EQ(ArrowIpcArrayStreamReaderInit(&stream, &input, nullptr), NANOARROW_OK); + + struct ArrowSchema schema; + ASSERT_EQ(ArrowArrayStreamGetSchema(&stream, &schema, nullptr), NANOARROW_OK); + ArrowSchemaRelease(&schema); + + struct ArrowArray array; + struct ArrowError error; + ASSERT_NE(ArrowArrayStreamGetNext(&stream, &array, &error), NANOARROW_OK); + ASSERT_GT(strlen(ArrowArrayStreamGetLastError(&stream)), 0); + + ArrowArrayStreamRelease(&stream); +} + +TEST(NanoarrowIpcReader, StreamReaderRecordBatchWithoutDictionaryBatch) { + // Send a record batch referencing a dictionary before the dictionary values arrive + struct ArrowBuffer input_buffer; + ArrowBufferInit(&input_buffer); + ASSERT_EQ( + ArrowBufferAppend(&input_buffer, kDictionarySchema, sizeof(kDictionarySchema)), + NANOARROW_OK); + // Skip the dictionary batch and go straight to the record batch + ASSERT_EQ(ArrowBufferAppend(&input_buffer, kDictionaryRecordBatch, + sizeof(kDictionaryRecordBatch)), + NANOARROW_OK); + + struct ArrowIpcInputStream input; + ASSERT_EQ(ArrowIpcInputStreamInitBuffer(&input, &input_buffer), NANOARROW_OK); + + struct ArrowArrayStream stream; + ASSERT_EQ(ArrowIpcArrayStreamReaderInit(&stream, &input, nullptr), NANOARROW_OK); 
+ + struct ArrowSchema schema; + ASSERT_EQ(ArrowArrayStreamGetSchema(&stream, &schema, nullptr), NANOARROW_OK); + ArrowSchemaRelease(&schema); + + struct ArrowArray array; + struct ArrowError error; + // Should error because dictionary values were never provided + ASSERT_EQ(ArrowArrayStreamGetNext(&stream, &array, &error), EINVAL); + ASSERT_GT(strlen(ArrowArrayStreamGetLastError(&stream)), 0); + + ArrowArrayStreamRelease(&stream); +} From 87798ed3f18010caa415cb206d07156c565d3b41 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 17 Apr 2026 11:27:43 -0500 Subject: [PATCH 26/27] implement dictionary index validation --- src/nanoarrow/common/array.c | 20 +++++++++-- src/nanoarrow/common/array_test.cc | 54 ++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/nanoarrow/common/array.c b/src/nanoarrow/common/array.c index 2f0f824cf..ec7bdb667 100644 --- a/src/nanoarrow/common/array.c +++ b/src/nanoarrow/common/array.c @@ -1513,10 +1513,26 @@ static int ArrowArrayViewValidateFull(struct ArrowArrayView* array_view, NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->children[i], error)); } - // Dictionary validation not implemented + // Dictionary index validation if (array_view->dictionary != NULL) { NANOARROW_RETURN_NOT_OK(ArrowArrayViewValidateFull(array_view->dictionary, error)); - // TODO: validate the indices + + // Validate that all non-null indices are within the dictionary bounds + int64_t dictionary_length = array_view->dictionary->length; + for (int64_t i = 0; i < array_view->length; i++) { + if (ArrowArrayViewIsNull(array_view, i)) { + continue; + } + + int64_t index = ArrowArrayViewGetIntUnsafe(array_view, i); + if (index < 0 || index >= dictionary_length) { + ArrowErrorSet(error, + "[%" PRId64 "] Expected dictionary index >= 0 and < %" PRId64 + " but found value %" PRId64, + i, dictionary_length, index); + return EINVAL; + } + } } return NANOARROW_OK; diff --git 
a/src/nanoarrow/common/array_test.cc b/src/nanoarrow/common/array_test.cc index 35a937871..b50e7266e 100644 --- a/src/nanoarrow/common/array_test.cc +++ b/src/nanoarrow/common/array_test.cc @@ -144,6 +144,60 @@ TEST(ArrayTest, ArrayTestAllocateDictionary) { ArrowArrayRelease(&array); } +TEST(ArrayTest, ArrayTestValidateDictionaryIndices) { + struct ArrowArray array; + struct ArrowSchema schema; + struct ArrowArrayView array_view; + struct ArrowError error; + + // Create a schema for dictionary-encoded int32 with string dictionary + ASSERT_EQ(ArrowSchemaInitFromType(&schema, NANOARROW_TYPE_INT32), NANOARROW_OK); + ASSERT_EQ(ArrowSchemaAllocateDictionary(&schema), NANOARROW_OK); + ASSERT_EQ(ArrowSchemaInitFromType(schema.dictionary, NANOARROW_TYPE_STRING), + NANOARROW_OK); + + // Initialize array_view from schema + ASSERT_EQ(ArrowArrayViewInitFromSchema(&array_view, &schema, &error), NANOARROW_OK); + + // Create a dictionary-encoded int32 array with a string dictionary + ASSERT_EQ(ArrowArrayInitFromSchema(&array, &schema, &error), NANOARROW_OK); + + // Build the array with dictionary values: ["zero", "one"] and indices [0, 1, 0] + ASSERT_EQ(ArrowArrayStartAppending(&array), NANOARROW_OK); + ASSERT_EQ(ArrowArrayAppendString(array.dictionary, "zero"_asv), NANOARROW_OK); + ASSERT_EQ(ArrowArrayAppendString(array.dictionary, "one"_asv), NANOARROW_OK); + ASSERT_EQ(ArrowArrayAppendInt(&array, 0), NANOARROW_OK); + ASSERT_EQ(ArrowArrayAppendInt(&array, 1), NANOARROW_OK); + ASSERT_EQ(ArrowArrayAppendInt(&array, 0), NANOARROW_OK); + ASSERT_EQ(ArrowArrayFinishBuildingDefault(&array, &error), NANOARROW_OK); + + // Valid indices should pass validation + ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + NANOARROW_OK); + + // Now modify index to be out of bounds (index 2 when dictionary has length 2) + int32_t* indices = reinterpret_cast<int32_t*>(ArrowArrayBuffer(&array,
1)->data); + indices[1] = 2; // Out of bounds (valid range is 0-1) + ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, + "[1] Expected dictionary index >= 0 and < 2 but found value 2"); + + // Test negative index + indices[1] = -1; + ASSERT_EQ(ArrowArrayViewSetArray(&array_view, &array, &error), NANOARROW_OK); + EXPECT_EQ(ArrowArrayViewValidate(&array_view, NANOARROW_VALIDATION_LEVEL_FULL, &error), + EINVAL); + EXPECT_STREQ(error.message, + "[1] Expected dictionary index >= 0 and < 2 but found value -1"); + + ArrowArrayViewReset(&array_view); + ArrowSchemaRelease(&schema); + ArrowArrayRelease(&array); +} + TEST(ArrayTest, ArrayTestInitFromSchema) { struct ArrowArray array; struct ArrowSchema schema; From 282727e94229ec55822986f537954c85ed3f11b5 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Fri, 17 Apr 2026 11:28:01 -0500 Subject: [PATCH 27/27] dev --- src/nanoarrow/ipc/reader_test.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nanoarrow/ipc/reader_test.cc b/src/nanoarrow/ipc/reader_test.cc index 95ead18b7..d257e111f 100644 --- a/src/nanoarrow/ipc/reader_test.cc +++ b/src/nanoarrow/ipc/reader_test.cc @@ -57,7 +57,6 @@ static uint8_t kSimpleRecordBatch[] = { 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; - alignas(8) static uint8_t kDictionarySchema[] = { 0xff, 0xff, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0e, 0x00, 0x06, 0x00, 0x05, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00,