From 344e8a6699060fe15f9a5980a5e67fc2e46445c3 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 3 May 2026 22:05:35 -0500 Subject: [PATCH 01/10] support decimal 32 and 64 in testing json parser --- src/nanoarrow/testing/testing.cc | 8 +++++++- src/nanoarrow/testing/testing_test.cc | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/nanoarrow/testing/testing.cc b/src/nanoarrow/testing/testing.cc index 597f42ef6..038359121 100644 --- a/src/nanoarrow/testing/testing.cc +++ b/src/nanoarrow/testing/testing.cc @@ -1154,6 +1154,12 @@ ArrowErrorCode SetTypeDecimal(ArrowSchema* schema, const json& value, ArrowError ArrowType type; switch (bit_width_int) { + case 32: + type = NANOARROW_TYPE_DECIMAL32; + break; + case 64: + type = NANOARROW_TYPE_DECIMAL64; + break; case 128: type = NANOARROW_TYPE_DECIMAL128; break; @@ -1161,7 +1167,7 @@ ArrowErrorCode SetTypeDecimal(ArrowSchema* schema, const json& value, ArrowError type = NANOARROW_TYPE_DECIMAL256; break; default: - ArrowErrorSet(error, "Type[name=='decimal'] bitWidth must be 128 or 256"); + ArrowErrorSet(error, "Type[name=='decimal'] bitWidth must be 32, 64, 128 or 256"); return EINVAL; } diff --git a/src/nanoarrow/testing/testing_test.cc b/src/nanoarrow/testing/testing_test.cc index 90abef8a1..0c057304c 100644 --- a/src/nanoarrow/testing/testing_test.cc +++ b/src/nanoarrow/testing/testing_test.cc @@ -1152,7 +1152,7 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDecimal) { R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); TestTypeError(R"({"name": "decimal", "bitWidth": 123, "precision": 10, "scale": 3})", - "Type[name=='decimal'] bitWidth must be 128 or 256"); + "Type[name=='decimal'] bitWidth must be 32, 64, 128 or 256"); // Ensure that omitted bitWidth maps to decimal128 TestingJSONReader reader; From e46396a25dbdb120201be6a3fd47ef64408964cc Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 3 May 2026 22:40:32 -0500 Subject: [PATCH 02/10] 
add more files and improve tests Co-authored-by: Copilot --- src/nanoarrow/ipc/decoder.c | 13 ++++++ src/nanoarrow/ipc/files_test.cc | 71 +++++++++++++++++++++------------ 2 files changed, 59 insertions(+), 25 deletions(-) diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 4b0239c13..3da01ea0f 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -1225,10 +1225,16 @@ static int ArrowIpcDecoderSetType(struct ArrowSchema* schema, ns(Field_table_t) case ns(Type_FixedSizeBinary): return ArrowIpcDecoderSetTypeFixedSizeBinary(schema, ns(Field_type_get(field)), error); + case ns(Type_BinaryView): + ArrowErrorSet(error, "BinaryView not yet supported in IPC reader"); + return ENOTSUP; case ns(Type_Utf8): return ArrowIpcDecoderSetTypeSimple(schema, NANOARROW_TYPE_STRING, error); case ns(Type_LargeUtf8): return ArrowIpcDecoderSetTypeSimple(schema, NANOARROW_TYPE_LARGE_STRING, error); + case ns(Type_Utf8View): + ArrowErrorSet(error, "Utf8View not yet supported in IPC reader"); + return ENOTSUP; case ns(Type_Date): return ArrowIpcDecoderSetTypeDate(schema, ns(Field_type_get(field)), error); case ns(Type_Time): @@ -1248,11 +1254,18 @@ static int ArrowIpcDecoderSetType(struct ArrowSchema* schema, ns(Field_table_t) case ns(Type_FixedSizeList): return ArrowIpcDecoderSetTypeFixedSizeList(schema, ns(Field_type_get(field)), error); + case ns(Type_ListView): + case ns(Type_LargeListView): + ArrowErrorSet(error, "ListView/LargeListView not yet supported in IPC reader"); + return ENOTSUP; case ns(Type_Map): return ArrowIpcDecoderSetTypeMap(schema, ns(Field_type_get(field)), error); case ns(Type_Union): return ArrowIpcDecoderSetTypeUnion(schema, ns(Field_type_get(field)), n_children, error); + case ns(Type_RunEndEncoded): + ArrowErrorSet(error, "RunEndEncoded not yet supported in IPC reader"); + return ENOTSUP; default: ArrowErrorSet(error, "Unrecognized Field type with value %d", type_type); return EINVAL; diff --git 
a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index 6b70a0c7f..06ac8adfd 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ b/src/nanoarrow/ipc/files_test.cc @@ -491,7 +491,6 @@ INSTANTIATE_TEST_SUITE_P( TestFile::OK("generated_datetime.stream"), TestFile::OK("generated_decimal.stream"), TestFile::OK("generated_decimal256.stream"), - TestFile::OK("generated_duplicate_fieldnames.stream"), TestFile::OK("generated_interval.stream"), TestFile::OK("generated_map_non_canonical.stream"), @@ -545,18 +544,10 @@ TEST_P(TestFileFixture, NanoarrowIpcTestFileIPCCheckJSON) { param.TestIPCCheckJSON(dir_builder.str()); } -// At least one Windows MSVC version does not allow the #if defined() -// to be within a macro invocation, so we define these two cases -// with some repetition. -#if defined(NANOARROW_IPC_WITH_ZSTD) && defined(NANOARROW_IPC_WITH_LZ4) INSTANTIATE_TEST_SUITE_P( NanoarrowIpcTest, TestFileFixture, ::testing::Values( // Testing of other files - TestFile::OK("2.0.0-compression/generated_uncompressible_zstd.stream"), - TestFile::OK("2.0.0-compression/generated_zstd.stream"), - TestFile::OK("2.0.0-compression/generated_uncompressible_lz4.stream"), - TestFile::OK("2.0.0-compression/generated_lz4.stream"), TestFile::OK("0.17.1/generated_union.stream"), TestFile::OK("0.14.1/generated_datetime.stream"), TestFile::OK("0.14.1/generated_decimal.stream"), @@ -565,24 +556,54 @@ INSTANTIATE_TEST_SUITE_P( TestFile::OK("0.14.1/generated_nested.stream"), TestFile::OK("0.14.1/generated_primitive.stream"), TestFile::OK("0.14.1/generated_primitive_no_batches.stream"), - TestFile::OK("0.14.1/generated_primitive_zerolength.stream") + TestFile::OK("0.14.1/generated_primitive_zerolength.stream"), + TestFile::OK("4.0.0-shareddict/generated_shared_dict"), + // cpp-21.0.0 regenerated gold files + TestFile::OK("cpp-21.0.0/generated_binary.stream"), + TestFile::OK("cpp-21.0.0/generated_binary_no_batches.stream"), + 
TestFile::OK("cpp-21.0.0/generated_binary_zerolength.stream"), + TestFile::OK("cpp-21.0.0/generated_custom_metadata.stream"), + TestFile::OK("cpp-21.0.0/generated_datetime.stream"), + TestFile::OK("cpp-21.0.0/generated_decimal.stream"), + TestFile::OK("cpp-21.0.0/generated_decimal256.stream"), + TestFile::OK("cpp-21.0.0/generated_decimal32.stream"), + TestFile::OK("cpp-21.0.0/generated_decimal64.stream"), + TestFile::OK("cpp-21.0.0/generated_duplicate_fieldnames.stream"), + TestFile::OK("cpp-21.0.0/generated_duration.stream"), + TestFile::OK("cpp-21.0.0/generated_interval.stream"), + TestFile::OK("cpp-21.0.0/generated_interval_mdn.stream"), + TestFile::OK("cpp-21.0.0/generated_large_binary.stream"), + TestFile::OK("cpp-21.0.0/generated_map.stream"), + TestFile::OK("cpp-21.0.0/generated_map_non_canonical.stream"), + TestFile::OK("cpp-21.0.0/generated_nested.stream"), + TestFile::OK("cpp-21.0.0/generated_nested_large_offsets.stream"), + TestFile::OK("cpp-21.0.0/generated_null.stream"), + TestFile::OK("cpp-21.0.0/generated_null_trivial.stream"), + TestFile::OK("cpp-21.0.0/generated_primitive.stream"), + TestFile::OK("cpp-21.0.0/generated_primitive_no_batches.stream"), + TestFile::OK("cpp-21.0.0/generated_primitive_zerolength.stream"), + TestFile::OK("cpp-21.0.0/generated_recursive_nested.stream"), + TestFile::OK("cpp-21.0.0/generated_union.stream"), + TestFile::ReadOnly("cpp-21.0.0/generated_dictionary.stream"), + TestFile::ReadOnly("cpp-21.0.0/generated_dictionary_unsigned.stream"), + TestFile::ReadOnly("cpp-21.0.0/generated_extension.stream"), + TestFile::ReadOnly("cpp-21.0.0/generated_nested_dictionary.stream"), + TestFile::NotSupported("cpp-21.0.0/generated_list_view.stream"), + TestFile::NotSupported("cpp-21.0.0/generated_binary_view.stream"), + TestFile::NotSupported("cpp-21.0.0/generated_run_end_encoded.stream") + // Comment to keep line from wrapping + )); + +#if defined(NANOARROW_IPC_WITH_ZSTD) && defined(NANOARROW_IPC_WITH_LZ4) +INSTANTIATE_TEST_SUITE_P( + 
NanoarrowIpcTestCompression, TestFileFixture, + ::testing::Values( + TestFile::OK("2.0.0-compression/generated_uncompressible_zstd.stream"), + TestFile::OK("2.0.0-compression/generated_zstd.stream"), + TestFile::OK("2.0.0-compression/generated_uncompressible_lz4.stream"), + TestFile::OK("2.0.0-compression/generated_lz4.stream") // Comment to keep line from wrapping )); -#else -INSTANTIATE_TEST_SUITE_P(NanoarrowIpcTest, TestFileFixture, - ::testing::Values( - // Testing of other files - TestFile::OK("0.17.1/generated_union.stream"), - TestFile::OK("0.14.1/generated_datetime.stream"), - TestFile::OK("0.14.1/generated_decimal.stream"), - TestFile::OK("0.14.1/generated_interval.stream"), - TestFile::OK("0.14.1/generated_map.stream"), - TestFile::OK("0.14.1/generated_nested.stream"), - TestFile::OK("0.14.1/generated_primitive.stream"), - TestFile::OK("0.14.1/generated_primitive_no_batches.stream"), - TestFile::OK("0.14.1/generated_primitive_zerolength.stream") - // Comment to keep line from wrapping - )); #endif #endif From 1e83811b349f107e43764811a203e825f9d45a77 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 3 May 2026 22:43:29 -0500 Subject: [PATCH 03/10] fix Co-authored-by: Copilot --- src/nanoarrow/ipc/files_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nanoarrow/ipc/files_test.cc b/src/nanoarrow/ipc/files_test.cc index 06ac8adfd..46c7203c1 100644 --- a/src/nanoarrow/ipc/files_test.cc +++ b/src/nanoarrow/ipc/files_test.cc @@ -557,7 +557,7 @@ INSTANTIATE_TEST_SUITE_P( TestFile::OK("0.14.1/generated_primitive.stream"), TestFile::OK("0.14.1/generated_primitive_no_batches.stream"), TestFile::OK("0.14.1/generated_primitive_zerolength.stream"), - TestFile::OK("4.0.0-shareddict/generated_shared_dict"), + TestFile::ReadOnly("4.0.0-shareddict/generated_shared_dict.stream"), // cpp-21.0.0 regenerated gold files TestFile::OK("cpp-21.0.0/generated_binary.stream"), TestFile::OK("cpp-21.0.0/generated_binary_no_batches.stream"), From 
11a84353a357946d84d03d04d920bdf5931bd362 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Sun, 3 May 2026 23:35:14 -0500 Subject: [PATCH 04/10] fix more decimal updates Co-authored-by: Copilot --- src/nanoarrow/testing/testing.cc | 18 ++++++++++++++++-- src/nanoarrow/testing/testing_test.cc | 6 ++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/nanoarrow/testing/testing.cc b/src/nanoarrow/testing/testing.cc index 038359121..df69cebe5 100644 --- a/src/nanoarrow/testing/testing.cc +++ b/src/nanoarrow/testing/testing.cc @@ -421,6 +421,12 @@ ArrowErrorCode WriteData(std::ostream& out, const ArrowArrayView* value, break; } + case NANOARROW_TYPE_DECIMAL32: + NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 32)); + break; + case NANOARROW_TYPE_DECIMAL64: + NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 64)); + break; case NANOARROW_TYPE_DECIMAL128: NANOARROW_RETURN_NOT_OK(WriteDecimalData(out, value, 128)); break; @@ -508,6 +514,8 @@ ArrowErrorCode WriteTypeFromView(std::ostream& out, const ArrowSchemaView* field case NANOARROW_TYPE_FIXED_SIZE_BINARY: out << R"("name": "fixedsizebinary", "byteWidth": )" << field->fixed_size; break; + case NANOARROW_TYPE_DECIMAL32: + case NANOARROW_TYPE_DECIMAL64: case NANOARROW_TYPE_DECIMAL128: case NANOARROW_TYPE_DECIMAL256: out << R"("name": "decimal", "bitWidth": )" << field->decimal_bitwidth @@ -1924,6 +1932,9 @@ ArrowErrorCode SetBufferDecimal(const json& value, ArrowBuffer* buffer, int bitw ArrowDecimal decimal; ArrowDecimalInit(&decimal, bitwidth, 0, 0); + // n_words is 0 for decimal32, so calculate byte size from bitwidth directly + size_t element_size_bytes = bitwidth / 8; + ArrowStringView item_view; for (const auto& item : value) { @@ -1934,8 +1945,7 @@ ArrowErrorCode SetBufferDecimal(const json& value, ArrowBuffer* buffer, int bitw item_view.size_bytes = item_str.size(); NANOARROW_RETURN_NOT_OK_WITH_ERROR(ArrowDecimalSetDigits(&decimal, item_view), error); 
NANOARROW_RETURN_NOT_OK_WITH_ERROR( - ArrowBufferAppend(buffer, decimal.words, decimal.n_words * sizeof(uint64_t)), - error); + ArrowBufferAppend(buffer, decimal.words, element_size_bytes), error); } return NANOARROW_OK; @@ -2059,6 +2069,10 @@ ArrowErrorCode SetArrayColumnBuffers(const json& value, ArrowArrayView* array_vi return SetBufferIntervalDayTime(data, buffer, error); case NANOARROW_TYPE_INTERVAL_MONTH_DAY_NANO: return SetBufferIntervalMonthDayNano(data, buffer, error); + case NANOARROW_TYPE_DECIMAL32: + return SetBufferDecimal(data, buffer, 32, error); + case NANOARROW_TYPE_DECIMAL64: + return SetBufferDecimal(data, buffer, 64, error); case NANOARROW_TYPE_DECIMAL128: return SetBufferDecimal(data, buffer, 128, error); case NANOARROW_TYPE_DECIMAL256: diff --git a/src/nanoarrow/testing/testing_test.cc b/src/nanoarrow/testing/testing_test.cc index 0c057304c..8c3cb4c36 100644 --- a/src/nanoarrow/testing/testing_test.cc +++ b/src/nanoarrow/testing/testing_test.cc @@ -1144,6 +1144,12 @@ TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldFixedSizeBinary) { } TEST(NanoarrowTestingTest, NanoarrowTestingTestFieldDecimal) { + TestTypeRoundtrip( + R"({"name": "decimal", "bitWidth": 32, "precision": 8, "scale": 3})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); + TestTypeRoundtrip( + R"({"name": "decimal", "bitWidth": 64, "precision": 10, "scale": 3})", + R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); TestTypeRoundtrip( R"({"name": "decimal", "bitWidth": 128, "precision": 10, "scale": 3})", R"({"name": null, "count": 3, "VALIDITY": [0, 1, 1], "DATA": ["0", "0", "258"]})"); From 3c22b72116ab9324df53da780028d1a42f5f6068 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 10:53:34 -0500 Subject: [PATCH 05/10] dict integration --- src/nanoarrow/integration/ipc_integration.cc | 39 +++++++++++++++++--- src/nanoarrow/ipc/decoder.c | 13 +++++++ src/nanoarrow/ipc/encoder.c | 2 + 
src/nanoarrow/nanoarrow_ipc.h | 2 + 4 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/nanoarrow/integration/ipc_integration.cc b/src/nanoarrow/integration/ipc_integration.cc index 47ec93927..786d4118c 100644 --- a/src/nanoarrow/integration/ipc_integration.cc +++ b/src/nanoarrow/integration/ipc_integration.cc @@ -195,11 +195,40 @@ struct MaterializedArrayStream { NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowSchemaDeepCopy(&decoder->footer->schema, schema.get()), error); - NANOARROW_RETURN_NOT_OK( - ArrowIpcDecoderSetSchema(decoder.get(), &decoder->footer->schema, error)); + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderSetSchemaWithDictionaries( + decoder.get(), &decoder->footer->schema, &decoder->footer->dictionaries, error)); NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowIpcDecoderSetEndianness(decoder.get(), decoder->endianness), error); + // Read dictionary blocks + nanoarrow::ipc::UniqueDictionaries dictionaries; + NANOARROW_RETURN_NOT_OK(ArrowIpcDictionariesInit( + dictionaries.get(), &decoder->footer->dictionaries, error)); + + nanoarrow::UniqueBuffer dictionary_blocks; + ArrowBufferMove(&decoder->footer->record_batch_blocks, dictionary_blocks.get()); + + for (int i = 0; i < dictionary_blocks->size_bytes / sizeof(struct ArrowIpcFileBlock); + i++) { + const auto& block = + reinterpret_cast<const struct ArrowIpcFileBlock*>(dictionary_blocks->data)[i]; + struct ArrowBufferView metadata_view = { + {bytes.data() + block.offset}, + block.metadata_length, + }; + NANOARROW_RETURN_NOT_OK( + ArrowIpcDecoderDecodeHeader(decoder.get(), metadata_view, error)); + + struct ArrowBufferView body_view = { + {metadata_view.data.as_uint8 + metadata_view.size_bytes}, + block.body_length, + }; + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeDictionary( + decoder.get(), body_view, NANOARROW_VALIDATION_LEVEL_FULL, dictionaries.get(), + error)); + } + + // Read record batch blocks nanoarrow::UniqueBuffer record_batch_blocks; ArrowBufferMove(&decoder->footer->record_batch_blocks, record_batch_blocks.get()); @@ -219,9
+248,9 @@ struct MaterializedArrayStream { block.body_length, }; nanoarrow::UniqueArray batch; - NANOARROW_RETURN_NOT_OK( - ArrowIpcDecoderDecodeArray(decoder.get(), body_view, -1, batch.get(), - NANOARROW_VALIDATION_LEVEL_FULL, error)); + NANOARROW_RETURN_NOT_OK(ArrowIpcDecoderDecodeArrayWithDictionaries( + decoder.get(), body_view, -1, dictionaries.get(), batch.get(), + NANOARROW_VALIDATION_LEVEL_FULL, error)); batches.push_back(std::move(batch)); } diff --git a/src/nanoarrow/ipc/decoder.c b/src/nanoarrow/ipc/decoder.c index 3da01ea0f..d9c8e1c4a 100644 --- a/src/nanoarrow/ipc/decoder.c +++ b/src/nanoarrow/ipc/decoder.c @@ -1898,6 +1898,19 @@ ArrowErrorCode ArrowIpcDecoderDecodeFooter(struct ArrowIpcDecoder* decoder, record_batches[i].body_length = ns(Block_bodyLength(blocks + i)); } + blocks = ns(Footer_dictionaries(footer)); + n = ns(Block_vec_len(blocks)); + NANOARROW_RETURN_NOT_OK(ArrowBufferResize(&private_data->footer.dictionary_blocks, + sizeof(struct ArrowIpcFileBlock) * n, + /*shrink_to_fit=*/0)); + struct ArrowIpcFileBlock* dictionaries = + (struct ArrowIpcFileBlock*)private_data->footer.dictionary_blocks.data; + for (int64_t i = 0; i < n; i++) { + dictionaries[i].offset = ns(Block_offset(blocks + i)); + dictionaries[i].metadata_length = ns(Block_metaDataLength(blocks + i)); + dictionaries[i].body_length = ns(Block_bodyLength(blocks + i)); + } + decoder->footer = &private_data->footer; return NANOARROW_OK; } diff --git a/src/nanoarrow/ipc/encoder.c b/src/nanoarrow/ipc/encoder.c index 8b3fc4896..27c9da954 100644 --- a/src/nanoarrow/ipc/encoder.c +++ b/src/nanoarrow/ipc/encoder.c @@ -629,6 +629,7 @@ ArrowErrorCode ArrowIpcEncoderEncodeSimpleRecordBatch( void ArrowIpcFooterInit(struct ArrowIpcFooter* footer) { footer->schema.release = NULL; ArrowBufferInit(&footer->record_batch_blocks); + ArrowBufferInit(&footer->dictionary_blocks); ArrowIpcDictionaryEncodingsInit(&footer->dictionaries); } @@ -637,6 +638,7 @@ void ArrowIpcFooterReset(struct ArrowIpcFooter* 
footer) { ArrowSchemaRelease(&footer->schema); } ArrowBufferReset(&footer->record_batch_blocks); + ArrowBufferReset(&footer->dictionary_blocks); ArrowIpcDictionaryEncodingsReset(&footer->dictionaries); } diff --git a/src/nanoarrow/nanoarrow_ipc.h b/src/nanoarrow/nanoarrow_ipc.h index f4c8ef2cf..f0cd71a51 100644 --- a/src/nanoarrow/nanoarrow_ipc.h +++ b/src/nanoarrow/nanoarrow_ipc.h @@ -954,6 +954,8 @@ struct ArrowIpcFooter { struct ArrowIpcDictionaryEncodings dictionaries; /// \brief all blocks containing RecordBatch Messages struct ArrowBuffer record_batch_blocks; + /// \brief all blocks containing DictionaryBatch Messages + struct ArrowBuffer dictionary_blocks; }; /// \brief Initialize a footer From 43a622673b29712cef93deb7d9ecb0b4d41739fa Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 13:42:51 -0500 Subject: [PATCH 06/10] maybe fix cmake scenarios --- examples/cmake-scenarios/run.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/cmake-scenarios/run.sh b/examples/cmake-scenarios/run.sh index 71f330795..ccc211714 100755 --- a/examples/cmake-scenarios/run.sh +++ b/examples/cmake-scenarios/run.sh @@ -27,13 +27,14 @@ WIN_DLL_NANOARROW_INSTALLED="$(pwd)/scratch/nanoarrow_install/bin" # The mismatched_shared_libs test is static-only, so no DLL path needed for dir in scratch/build*; do # Special cases where we have to set PATH on Windows - if [ "${dir}" = "scratch/build_against_fetched_shared" ] && [ "${OSTYPE}" = "msys" ]; then + # OSTYPE can be "msys" (Git Bash) or "cygwin" (Cygwin/GitHub Actions) + if [ "${dir}" = "scratch/build_against_fetched_shared" ] && [[ "${OSTYPE}" == msys* || "${OSTYPE}" == cygwin* ]]; then PATH="${PATH}:${WIN_DLL_NANOARROW_FETCHED}" ./${dir}/Debug/minimal_cpp_app - elif [ "${dir}" = "scratch/build_shared" ] && [ "${OSTYPE}" = "msys" ]; then + elif [ "${dir}" = "scratch/build_shared" ] && [[ "${OSTYPE}" == msys* || "${OSTYPE}" == cygwin* ]]; then 
PATH="${PATH}:${WIN_DLL_NANOARROW_BUILT}" ./${dir}/Debug/minimal_cpp_app - elif [ "${dir}" = "scratch/build_against_install_shared" ] && [ "${OSTYPE}" = "msys" ]; then + elif [ "${dir}" = "scratch/build_against_install_shared" ] && [[ "${OSTYPE}" == msys* || "${OSTYPE}" == cygwin* ]]; then PATH="${PATH}:${WIN_DLL_NANOARROW_INSTALLED}" ./${dir}/Debug/minimal_cpp_app - elif [ "${OSTYPE}" = "msys" ]; then + elif [[ "${OSTYPE}" == msys* || "${OSTYPE}" == cygwin* ]]; then ./${dir}/Debug/minimal_cpp_app else ./${dir}/minimal_cpp_app From f68f4487d2ca242b913ce08556fc78c162fab8cd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 13:45:04 -0500 Subject: [PATCH 07/10] reduce cuda size --- .github/workflows/build-and-test-device.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-and-test-device.yaml b/.github/workflows/build-and-test-device.yaml index 3e10d4950..b2088d3d0 100644 --- a/.github/workflows/build-and-test-device.yaml +++ b/.github/workflows/build-and-test-device.yaml @@ -80,8 +80,11 @@ jobs: - name: Install build dependencies if: matrix.config.label == 'with-cuda' run: | + # Install minimal CUDA packages needed for building (avoid full cuda-toolkit + # which includes nsight-systems and other large tools that exhaust disk space) sudo apt-get update && \ - sudo apt-get install -y cmake build-essential cuda-toolkit tzdata + sudo apt-get install -y cmake build-essential tzdata \ + cuda-nvcc-13-2 cuda-cudart-dev-13-2 libcublas-dev-13-2 # Install newer cmake for building Arrow C++ pip install cmake From 26f06eb248c68aa2369612d9b2cf959bfc23289b Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 14:18:55 -0500 Subject: [PATCH 08/10] add integration test test Co-authored-by: Copilot --- ci/scripts/run-ipc-integration-tests.sh | 209 +++++++++++++++++++ src/nanoarrow/integration/ipc_integration.cc | 12 +- 2 files changed, 216 insertions(+), 5 deletions(-) create mode 100755 
ci/scripts/run-ipc-integration-tests.sh diff --git a/ci/scripts/run-ipc-integration-tests.sh b/ci/scripts/run-ipc-integration-tests.sh new file mode 100755 index 000000000..318caa14c --- /dev/null +++ b/ci/scripts/run-ipc-integration-tests.sh @@ -0,0 +1,209 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This script runs the nanoarrow_ipc_integration VALIDATE command against +# test files from the arrow-testing repository. +# +# Usage: +# export NANOARROW_ARROW_TESTING_DIR=/path/to/arrow-testing +# ./dev/run_ipc_integration_tests.sh [build_dir] +# +# Arguments: +# build_dir: Optional path to the build directory containing +# nanoarrow_ipc_integration. Defaults to "build". + +set -e + +REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +BUILD_DIR="${1:-${REPO_ROOT}/build}" + +if [ -z "${NANOARROW_ARROW_TESTING_DIR}" ]; then + echo "Error: NANOARROW_ARROW_TESTING_DIR environment variable not set" + echo "Please set it to the path of a checkout of apache/arrow-testing" + exit 1 +fi + +if [ ! -d "${NANOARROW_ARROW_TESTING_DIR}" ]; then + echo "Error: NANOARROW_ARROW_TESTING_DIR does not exist: ${NANOARROW_ARROW_TESTING_DIR}" + exit 1 +fi + +INTEGRATION_BIN="${BUILD_DIR}/nanoarrow_ipc_integration" +if [ ! 
-x "${INTEGRATION_BIN}" ]; then + echo "Error: nanoarrow_ipc_integration not found at ${INTEGRATION_BIN}" + echo "Please build the project first or specify the build directory as an argument" + exit 1 +fi + +DATA_DIR="${NANOARROW_ARROW_TESTING_DIR}/data/arrow-ipc-stream/integration" + +# Create a temp directory for decompressed JSON files +TEMP_DIR=$(mktemp -d) +trap "rm -rf ${TEMP_DIR}" EXIT + +# Function to run VALIDATE for a given test file +run_validate() { + local subdir="$1" + local basename="$2" + + local stream_file="${DATA_DIR}/${subdir}/${basename}.stream" + local arrow_file="${DATA_DIR}/${subdir}/${basename}.arrow_file" + local json_gz="${DATA_DIR}/${subdir}/${basename}.json.gz" + local json_file="${TEMP_DIR}/${subdir}_${basename}.json" + + # The VALIDATE command uses FromIpcFile which requires Arrow file format + # (with ARROW1 magic and footer), not IPC stream format. + # So we prefer .arrow_file over .stream + local arrow_path="" + if [ -f "${arrow_file}" ]; then + arrow_path="${arrow_file}" + elif [ -f "${stream_file}" ]; then + # Note: .stream files may fail because FromIpcFile expects file format + arrow_path="${stream_file}" + else + echo "SKIP: ${subdir}/${basename} - no .arrow_file or .stream found" + return 0 + fi + + # Check if JSON exists (possibly gzipped) + if [ -f "${json_gz}" ]; then + gunzip -c "${json_gz}" > "${json_file}" + elif [ -f "${DATA_DIR}/${subdir}/${basename}.json" ]; then + json_file="${DATA_DIR}/${subdir}/${basename}.json" + else + echo "SKIP: ${subdir}/${basename} - no .json or .json.gz found" + return 0 + fi + + echo "Testing: ${subdir}/${basename}" + if COMMAND=VALIDATE ARROW_PATH="${arrow_path}" JSON_PATH="${json_file}" "${INTEGRATION_BIN}"; then + echo " PASS" + return 0 + else + echo " FAIL" + return 1 + fi +} + +# Track results +PASSED=0 +FAILED=0 +SKIPPED=0 + +run_test() { + if run_validate "$@"; then + if [[ $(run_validate "$@" 2>&1) == *"SKIP"* ]]; then + ((SKIPPED++)) + else + ((PASSED++)) + fi + else + 
((FAILED++)) + fi +} + +echo "=== Running IPC Integration Tests ===" +echo "Using arrow-testing at: ${NANOARROW_ARROW_TESTING_DIR}" +echo "Using integration binary at: ${INTEGRATION_BIN}" +echo "" + +# Test files in cpp-21.0.0 (includes decimal32, decimal64, and dictionaries) +CPP_21_FILES=( + "generated_decimal32" + "generated_decimal64" + "generated_decimal" + "generated_decimal256" + "generated_primitive" + "generated_primitive_no_batches" + "generated_primitive_zerolength" + "generated_datetime" + "generated_interval" + "generated_interval_mdn" + "generated_duration" + "generated_nested" + "generated_nested_large_offsets" + "generated_null" + "generated_null_trivial" + "generated_custom_metadata" + "generated_duplicate_fieldnames" + "generated_map" + "generated_map_non_canonical" + "generated_recursive_nested" + "generated_union" + "generated_binary" + "generated_binary_no_batches" + "generated_binary_zerolength" + "generated_large_binary" + "generated_dictionary" + "generated_dictionary_unsigned" + "generated_nested_dictionary" +) + +echo "=== Testing cpp-21.0.0 files ===" +for file in "${CPP_21_FILES[@]}"; do + if run_validate "cpp-21.0.0" "${file}"; then + ((PASSED++)) + else + exit_code=$? + if [ $exit_code -eq 0 ]; then + ((SKIPPED++)) + else + ((FAILED++)) + fi + fi +done + +# Test files in 1.0.0-littleendian +LITTLEENDIAN_FILES=( + "generated_decimal" + "generated_decimal256" + "generated_primitive" + "generated_datetime" + "generated_interval" + "generated_nested" + "generated_null" + "generated_custom_metadata" + "generated_map" + "generated_union" +) + +echo "" +echo "=== Testing 1.0.0-littleendian files ===" +for file in "${LITTLEENDIAN_FILES[@]}"; do + if run_validate "1.0.0-littleendian" "${file}"; then + ((PASSED++)) + else + exit_code=$? 
+ if [ $exit_code -eq 0 ]; then + ((SKIPPED++)) + else + ((FAILED++)) + fi + fi +done + +echo "" +echo "=== Summary ===" +echo "Passed: ${PASSED}" +echo "Failed: ${FAILED}" +echo "Skipped: ${SKIPPED}" + +if [ ${FAILED} -gt 0 ]; then + exit 1 +fi diff --git a/src/nanoarrow/integration/ipc_integration.cc b/src/nanoarrow/integration/ipc_integration.cc index 786d4118c..7a56fca70 100644 --- a/src/nanoarrow/integration/ipc_integration.cc +++ b/src/nanoarrow/integration/ipc_integration.cc @@ -200,14 +200,19 @@ struct MaterializedArrayStream { NANOARROW_RETURN_NOT_OK_WITH_ERROR( ArrowIpcDecoderSetEndianness(decoder.get(), decoder->endianness), error); - // Read dictionary blocks + // Initialize dictionaries storage nanoarrow::ipc::UniqueDictionaries dictionaries; NANOARROW_RETURN_NOT_OK(ArrowIpcDictionariesInit( dictionaries.get(), &decoder->footer->dictionaries, error)); + // Move both block buffers out of the footer BEFORE decoding any headers, + // because ArrowIpcDecoderDecodeHeader resets the footer nanoarrow::UniqueBuffer dictionary_blocks; - ArrowBufferMove(&decoder->footer->record_batch_blocks, dictionary_blocks.get()); + nanoarrow::UniqueBuffer record_batch_blocks; + ArrowBufferMove(&decoder->footer->dictionary_blocks, dictionary_blocks.get()); + ArrowBufferMove(&decoder->footer->record_batch_blocks, record_batch_blocks.get()); + // Read dictionary blocks for (int i = 0; i < dictionary_blocks->size_bytes / sizeof(struct ArrowIpcFileBlock); i++) { const auto& block = @@ -229,9 +234,6 @@ struct MaterializedArrayStream { } // Read record batch blocks - nanoarrow::UniqueBuffer record_batch_blocks; - ArrowBufferMove(&decoder->footer->record_batch_blocks, record_batch_blocks.get()); - for (int i = 0; i < record_batch_blocks->size_bytes / sizeof(struct ArrowIpcFileBlock); i++) { const auto& block = From ae295af5b8c2ee61a7dcb3cfc6dbaf7fb257a182 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 14:29:14 -0500 Subject: [PATCH 09/10] more futurey testing 
Co-authored-by: Copilot --- .github/workflows/build-and-test-ipc.yaml | 5 + ci/scripts/run-ipc-integration-tests.sh | 170 ++++++++-------------- 2 files changed, 68 insertions(+), 107 deletions(-) diff --git a/.github/workflows/build-and-test-ipc.yaml b/.github/workflows/build-and-test-ipc.yaml index 2ae83deaf..6257396e9 100644 --- a/.github/workflows/build-and-test-ipc.yaml +++ b/.github/workflows/build-and-test-ipc.yaml @@ -149,3 +149,8 @@ jobs: with: name: nanoarrow-ipc-memcheck path: build/Testing/Temporary/MemoryChecker.*.log + + - name: Run integration test validation for arrow-testing files + if: matrix.config.label == 'default-build' + run: | + ./ci/scripts/run-ipc-integration-tests.sh build diff --git a/ci/scripts/run-ipc-integration-tests.sh b/ci/scripts/run-ipc-integration-tests.sh index 318caa14c..f276f9432 100755 --- a/ci/scripts/run-ipc-integration-tests.sh +++ b/ci/scripts/run-ipc-integration-tests.sh @@ -18,18 +18,16 @@ # under the License. # This script runs the nanoarrow_ipc_integration VALIDATE command against -# test files from the arrow-testing repository. +# all test files from the arrow-testing repository's integration directory. # # Usage: # export NANOARROW_ARROW_TESTING_DIR=/path/to/arrow-testing -# ./dev/run_ipc_integration_tests.sh [build_dir] +# ./ci/scripts/run-ipc-integration-tests.sh [build_dir] # # Arguments: # build_dir: Optional path to the build directory containing # nanoarrow_ipc_integration. Defaults to "build". -set -e - REPO_ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" BUILD_DIR="${1:-${REPO_ROOT}/build}" @@ -57,28 +55,42 @@ DATA_DIR="${NANOARROW_ARROW_TESTING_DIR}/data/arrow-ipc-stream/integration" TEMP_DIR=$(mktemp -d) trap "rm -rf ${TEMP_DIR}" EXIT +# Track results +PASSED=0 +FAILED=0 +SKIPPED=0 + +# Known files that are expected to be skipped (unsupported types) +SKIP_PATTERNS=( + "generated_extension" # Extension types not fully supported + "generated_list_view" # ListView not supported + "generated_binary_view" # BinaryView not supported + "generated_run_end_encoded" # REE not supported +) + +# Function to check if a file should be skipped +should_skip() { + local basename="$1" + for pattern in "${SKIP_PATTERNS[@]}"; do + if [[ "${basename}" == *"${pattern}"* ]]; then + return 0 + fi + done + return 1 +} + # Function to run VALIDATE for a given test file run_validate() { local subdir="$1" local basename="$2" - local stream_file="${DATA_DIR}/${subdir}/${basename}.stream" local arrow_file="${DATA_DIR}/${subdir}/${basename}.arrow_file" local json_gz="${DATA_DIR}/${subdir}/${basename}.json.gz" local json_file="${TEMP_DIR}/${subdir}_${basename}.json" - # The VALIDATE command uses FromIpcFile which requires Arrow file format - # (with ARROW1 magic and footer), not IPC stream format. - # So we prefer .arrow_file over .stream - local arrow_path="" - if [ -f "${arrow_file}" ]; then - arrow_path="${arrow_file}" - elif [ -f "${stream_file}" ]; then - # Note: .stream files may fail because FromIpcFile expects file format - arrow_path="${stream_file}" - else - echo "SKIP: ${subdir}/${basename} - no .arrow_file or .stream found" - return 0 + # We require .arrow_file format (with ARROW1 magic and footer) + if [ ! 
-f "${arrow_file}" ]; then + return 2 # Skip - no arrow file fi # Check if JSON exists (possibly gzipped) @@ -87,34 +99,13 @@ run_validate() { elif [ -f "${DATA_DIR}/${subdir}/${basename}.json" ]; then json_file="${DATA_DIR}/${subdir}/${basename}.json" else - echo "SKIP: ${subdir}/${basename} - no .json or .json.gz found" - return 0 + return 2 # Skip - no JSON file fi - echo "Testing: ${subdir}/${basename}" - if COMMAND=VALIDATE ARROW_PATH="${arrow_path}" JSON_PATH="${json_file}" "${INTEGRATION_BIN}"; then - echo " PASS" - return 0 - else - echo " FAIL" - return 1 - fi -} - -# Track results -PASSED=0 -FAILED=0 -SKIPPED=0 - -run_test() { - if run_validate "$@"; then - if [[ $(run_validate "$@" 2>&1) == *"SKIP"* ]]; then - ((SKIPPED++)) - else - ((PASSED++)) - fi + if COMMAND=VALIDATE ARROW_PATH="${arrow_file}" JSON_PATH="${json_file}" "${INTEGRATION_BIN}" > /dev/null 2>&1; then + return 0 # Pass else - ((FAILED++)) + return 1 # Fail fi } @@ -123,82 +114,47 @@ echo "Using arrow-testing at: ${NANOARROW_ARROW_TESTING_DIR}" echo "Using integration binary at: ${INTEGRATION_BIN}" echo "" -# Test files in cpp-21.0.0 (includes decimal32, decimal64, and dictionaries) -CPP_21_FILES=( - "generated_decimal32" - "generated_decimal64" - "generated_decimal" - "generated_decimal256" - "generated_primitive" - "generated_primitive_no_batches" - "generated_primitive_zerolength" - "generated_datetime" - "generated_interval" - "generated_interval_mdn" - "generated_duration" - "generated_nested" - "generated_nested_large_offsets" - "generated_null" - "generated_null_trivial" - "generated_custom_metadata" - "generated_duplicate_fieldnames" - "generated_map" - "generated_map_non_canonical" - "generated_recursive_nested" - "generated_union" - "generated_binary" - "generated_binary_no_batches" - "generated_binary_zerolength" - "generated_large_binary" - "generated_dictionary" - "generated_dictionary_unsigned" - "generated_nested_dictionary" -) +# Find all subdirectories in the integration 
directory +for subdir_path in "${DATA_DIR}"/*/; do + [ -d "${subdir_path}" ] || continue + subdir=$(basename "${subdir_path}") -echo "=== Testing cpp-21.0.0 files ===" -for file in "${CPP_21_FILES[@]}"; do - if run_validate "cpp-21.0.0" "${file}"; then - ((PASSED++)) - else - exit_code=$? - if [ $exit_code -eq 0 ]; then + # Skip versions before 1.0.0 + if [[ "${subdir}" == 0.* ]]; then + continue + fi + + echo "=== Testing ${subdir} ===" + + # Find all unique basenames (from .arrow_file files) + for arrow_file in "${subdir_path}"*.arrow_file; do + [ -f "${arrow_file}" ] || continue + + basename=$(basename "${arrow_file}" .arrow_file) + + # Check if this file should be skipped + if should_skip "${basename}"; then ((SKIPPED++)) - else - ((FAILED++)) + continue fi - fi -done -# Test files in 1.0.0-littleendian -LITTLEENDIAN_FILES=( - "generated_decimal" - "generated_decimal256" - "generated_primitive" - "generated_datetime" - "generated_interval" - "generated_nested" - "generated_null" - "generated_custom_metadata" - "generated_map" - "generated_union" -) + run_validate "${subdir}" "${basename}" + result=$? -echo "" -echo "=== Testing 1.0.0-littleendian files ===" -for file in "${LITTLEENDIAN_FILES[@]}"; do - if run_validate "1.0.0-littleendian" "${file}"; then - ((PASSED++)) - else - exit_code=$? 
- if [ $exit_code -eq 0 ]; then + if [ $result -eq 0 ]; then + echo " PASS: ${basename}" + ((PASSED++)) + elif [ $result -eq 2 ]; then + echo " SKIP: ${basename} (missing files)" ((SKIPPED++)) else + echo " FAIL: ${basename}" ((FAILED++)) fi - fi + done + echo "" done -echo "" echo "=== Summary ===" echo "Passed: ${PASSED}" echo "Failed: ${FAILED}" From 2dded79124a35c8a74cc855d886bb39dbe5d7571 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 May 2026 14:30:59 -0500 Subject: [PATCH 10/10] don't skip supported files --- ci/scripts/run-ipc-integration-tests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/scripts/run-ipc-integration-tests.sh b/ci/scripts/run-ipc-integration-tests.sh index f276f9432..266f8e5d5 100755 --- a/ci/scripts/run-ipc-integration-tests.sh +++ b/ci/scripts/run-ipc-integration-tests.sh @@ -62,7 +62,6 @@ SKIPPED=0 # Known files that are expected to be skipped (unsupported types) SKIP_PATTERNS=( - "generated_extension" # Extension types not fully supported "generated_list_view" # ListView not supported "generated_binary_view" # BinaryView not supported "generated_run_end_encoded" # REE not supported