From 24ef3469d1d323ce6b166fcb60fe9596cb524bcd Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 Nov 2024 14:20:38 -0600 Subject: [PATCH 1/4] one attempt --- src/nanoarrow/hpp/schema.hpp | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 src/nanoarrow/hpp/schema.hpp diff --git a/src/nanoarrow/hpp/schema.hpp b/src/nanoarrow/hpp/schema.hpp new file mode 100644 index 000000000..0540a781c --- /dev/null +++ b/src/nanoarrow/hpp/schema.hpp @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef NANOARROW_HPP_SCHEMA_HPP_INCLUDED +#define NANOARROW_HPP_SCHEMA_HPP_INCLUDED + +#include + +#include "nanoarrow/hpp/exception.hpp" +#include "nanoarrow/hpp/unique.hpp" +#include "nanoarrow/nanoarrow.h" + +NANOARROW_CXX_NAMESPACE_BEGIN + +class Schema { + public: + explicit Schema(UniqueSchema schema) : schema_(std::move(schema)) {} + + // Make conversion from a raw pointer explicit, including copies + static Schema Copy(const ArrowSchema* schema) { + Schema out; + NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(schema, out.schema_.get())); + return out; + } + + static Schema Move(ArrowSchema* schema) { + Schema out; + out.schema_.reset(schema); + return out; + } + + // Movable + Schema(Schema&& rhs) : Schema(std::move(rhs.schema_)) {} + Schema& operator=(Schema&& rhs) { + schema_ = std::move(rhs.schema_); + return *this; + } + // Not copyable + Schema(const Schema& rhs) = delete; + + // Implicitly convertable to const ArrowSchema + const ArrowSchema* data() const { return schema_.get(); } + operator const ArrowSchema*() const { return schema_.get(); } + + bool IsValid() const { return schema_->release != nullptr; } + + int64_t NumChildren() const { + NANOARROW_DCHECK(IsValid()); + return schema_->n_children; + } + + Schema Child(int64_t i) const { + NANOARROW_DCHECK(IsValid() && i < schema_->n_children && i > 0); + return Schema::Copy(schema_->children[i]); + } + + Schema Dictionary() { + NANOARROW_DCHECK(IsValid() && schema_->dictionary != nullptr); + return Schema::Copy(schema_->dictionary); + } + + private: + UniqueSchema schema_; + + Schema() = default; +}; + +class SchemaBuilder { + public: + SchemaBuilder() { ArrowSchemaInit(schema_.get()); } + SchemaBuilder(const ArrowSchema* schema) { + NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(schema, schema_.get())); + } + + SchemaBuilder(const UniqueSchema schema) : SchemaBuilder(schema.get()) {} + + // Movable + SchemaBuilder(SchemaBuilder&& rhs) : SchemaBuilder(std::move(rhs.schema_)) {} + SchemaBuilder& operator=(SchemaBuilder&& rhs) { + schema_ = std::move(rhs.schema_); + return *this; + } + // Not copyable + SchemaBuilder(const SchemaBuilder& rhs) = delete; + + private: + UniqueSchema schema_; +}; + +NANOARROW_CXX_NAMESPACE_END + +#endif From 68392b61a4768591ece4d6066ca008d012c498b0 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 Nov 2024 15:31:24 -0600 Subject: [PATCH 2/4] try on iterators --- src/nanoarrow/hpp/schema.hpp | 151 +++++++++++++++++++++-------------- 1 file changed, 93 insertions(+), 58 deletions(-) diff --git a/src/nanoarrow/hpp/schema.hpp b/src/nanoarrow/hpp/schema.hpp index 0540a781c..c8dbdc463 100644 --- a/src/nanoarrow/hpp/schema.hpp +++ b/src/nanoarrow/hpp/schema.hpp @@ -22,85 +22,120 @@ #include "nanoarrow/hpp/exception.hpp" #include "nanoarrow/hpp/unique.hpp" +#include "nanoarrow/hpp/view.hpp" #include "nanoarrow/nanoarrow.h" NANOARROW_CXX_NAMESPACE_BEGIN -class Schema { +template +class ViewMetadata { public: - explicit Schema(UniqueSchema schema) : schema_(std::move(schema)) {} + explicit ViewMetadata(const char* metadata) : metadata_(metadata) {} - // Make conversion from a raw pointer explicit, including copies - static Schema Copy(const ArrowSchema* schema) { - Schema out; - NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(schema, out.schema_.get())); - return out; - } - - static Schema Move(ArrowSchema* schema) { - Schema out; - out.schema_.reset(schema); - return out; - } - - // Movable - Schema(Schema&& rhs) : Schema(std::move(rhs.schema_)) {} - Schema& operator=(Schema&& rhs) { - schema_ = std::move(rhs.schema_); - return *this; - } - // Not copyable - Schema(const Schema& rhs) = delete; + private: + const char* metadata_; - // Implicitly convertable to const ArrowSchema - const ArrowSchema* data() const { return schema_.get(); } - operator const ArrowSchema*() const { return schema_.get(); } + public: + class iterator { + const ViewMetadata& outer_; + ArrowMetadataReader reader_{}; + ArrowStringView key_{}; + ArrowStringView value_{}; + + public: + explicit iterator(const ViewMetadata& outer, int64_t remaining_keys) : outer_(outer) { + if (remaining_keys != 0) { + NANOARROW_THROW_NOT_OK(ArrowMetadataReaderInit(&reader_, outer.metadata_)); + } + } + + iterator& operator++() { + ArrowMetadataReaderRead(&reader_, &key_, &value_); + return *this; + } + + iterator operator++(int) { + iterator retval = *this; + ++(*this); + return retval; + } + + bool operator==(iterator other) const { + return outer_.metadata_ == other.outer_.metadata_ && + reader_.remaining_keys == other.reader_.remaining_keys; + } + + bool operator!=(iterator other) const { return !(*this == other); } + + std::pair operator*() const { + return {StringT{key_.data, key_.size_bytes}, + StringT{value_.data, value_.size_bytes}}; + } + + using iterator_category = std::forward_iterator_tag; + }; + + iterator begin() const { return iterator(*this); } + iterator end() const { return iterator(*this, 0); } +}; - bool IsValid() const { return schema_->release != nullptr; } +class ViewSchemaChildren; - int64_t NumChildren() const { - NANOARROW_DCHECK(IsValid()); - return schema_->n_children; - } +class ViewSchema { + public: + ViewSchema(const ArrowSchema* schema) : schema_{schema} {} - Schema Child(int64_t i) const { - NANOARROW_DCHECK(IsValid() && i < schema_->n_children && i > 0); - return Schema::Copy(schema_->children[i]); + template + ViewMetadata Metadata() { + return ViewMetadata(schema_->metadata); } - Schema Dictionary() { - NANOARROW_DCHECK(IsValid() && schema_->dictionary != nullptr); - return Schema::Copy(schema_->dictionary); - } + ViewSchemaChildren Children(); private: - UniqueSchema schema_; - - Schema() = default; + const ArrowSchema* schema_; }; -class SchemaBuilder { +class ViewSchemaChildren { public: - SchemaBuilder() { ArrowSchemaInit(schema_.get()); } - SchemaBuilder(const ArrowSchema* schema) { - NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(schema, schema_.get())); - } - - SchemaBuilder(const UniqueSchema schema) : SchemaBuilder(schema.get()) {} - - // Movable - SchemaBuilder(SchemaBuilder&& rhs) : SchemaBuilder(std::move(rhs.schema_)) {} - SchemaBuilder& operator=(SchemaBuilder&& rhs) { - schema_ = std::move(rhs.schema_); - return *this; - } - // Not copyable - SchemaBuilder(const SchemaBuilder& rhs) = delete; + explicit ViewSchemaChildren(const ArrowSchema* schema) : schema_(schema) {} private: - UniqueSchema schema_; + const ArrowSchema* schema_{}; + + public: + class iterator { + const ViewSchemaChildren& outer_; + int64_t i_ = 0; + + public: + explicit iterator(const ViewSchemaChildren& outer, int i = 0) + : outer_(outer), i_(i) {} + iterator& operator++() { + i_++; + return *this; + } + iterator operator++(int) { + iterator retval = *this; + ++(*this); + return retval; + } + bool operator==(iterator other) const { + return outer_.schema_ == other.outer_.schema_ && i_ == other.i_; + } + bool operator!=(iterator other) const { return !(*this == other); } + ViewSchema operator*() const { return ViewSchema(outer_.schema_->children[i_]); } + using iterator_category = std::forward_iterator_tag; + }; + + iterator begin() const { return iterator(*this); } + iterator end() const { return iterator(*this, schema_->n_children); } }; +inline ViewSchemaChildren ViewSchema::Children() { return ViewSchemaChildren(schema_); } + + + NANOARROW_CXX_NAMESPACE_END #endif From ae03964e3b31cc537456494977dd69f0287cdfb4 Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 Nov 2024 16:05:11 -0600 Subject: [PATCH 3/4] iterator stuff --- src/nanoarrow/hpp/schema.hpp | 75 +++++++++++++++++++++++++++------- src/nanoarrow/hpp/view_test.cc | 32 +++++++++++++-- 2 files changed, 88 insertions(+), 19 deletions(-) diff --git a/src/nanoarrow/hpp/schema.hpp b/src/nanoarrow/hpp/schema.hpp index c8dbdc463..209e9a456 100644 --- a/src/nanoarrow/hpp/schema.hpp +++ b/src/nanoarrow/hpp/schema.hpp @@ -18,6 +18,7 @@ #ifndef NANOARROW_HPP_SCHEMA_HPP_INCLUDED #define NANOARROW_HPP_SCHEMA_HPP_INCLUDED +#include #include #include "nanoarrow/hpp/exception.hpp" @@ -27,11 +28,18 @@ NANOARROW_CXX_NAMESPACE_BEGIN -template class ViewMetadata { public: explicit ViewMetadata(const char* metadata) : metadata_(metadata) {} + int64_t size() { + if (metadata_ == nullptr) { + return 0; + } + + return end() - begin(); + } + private: const char* metadata_; @@ -50,7 +58,7 @@ class ViewMetadata { } iterator& operator++() { - ArrowMetadataReaderRead(&reader_, &key_, &value_); + NANOARROW_THROW_NOT_OK(ArrowMetadataReaderRead(&reader_, &key_, &value_)); return *this; } @@ -60,6 +68,10 @@ class ViewMetadata { return retval; } + int64_t operator-(iterator other) const { + return reader_.remaining_keys - other.reader_.remaining_keys; + } + bool operator==(iterator other) const { return outer_.metadata_ == other.outer_.metadata_ && reader_.remaining_keys == other.reader_.remaining_keys; @@ -67,15 +79,15 @@ class ViewMetadata { bool operator!=(iterator other) const { return !(*this == other); } - std::pair operator*() const { - return {StringT{key_.data, key_.size_bytes}, - StringT{value_.data, value_.size_bytes}}; + std::pair operator*() const { + return {{key_.data, static_cast(key_.size_bytes)}, + {value_.data, static_cast(value_.size_bytes)}}; } using iterator_category = std::forward_iterator_tag; }; - iterator begin() const { return iterator(*this); } + iterator begin() const { return iterator(*this, -1); } iterator end() const { return iterator(*this, 0); } }; @@ -83,23 +95,56 @@ class ViewSchemaChildren; class ViewSchema { public: - ViewSchema(const ArrowSchema* schema) : schema_{schema} {} + ViewSchema(const ArrowSchema* schema) : schema_{schema} { + // Probably need to do something better with this error here + NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view_, schema_, nullptr)); + } + + std::string_view format() const { + if (schema_->name) { + return ""; + } else { + return schema_->name; + } + } - template - ViewMetadata Metadata() { - return ViewMetadata(schema_->metadata); + std::string_view name() const { + if (schema_->name) { + return ""; + } else { + return schema_->name; + } + } + + ViewMetadata metadata() const { return ViewMetadata(schema_->metadata); } + + ViewSchemaChildren children() const; + + std::optional dictionary() const { + if (schema_->dictionary) { + return ViewSchema(schema_->dictionary); + } else { + return std::nullopt; + } } - ViewSchemaChildren Children(); + bool is_extension() const { return schema_view_.extension_name.size_bytes > 0; } + + ArrowType type() const { return schema_view_.type; } + + ArrowType storage_type() { return schema_view_.storage_type; } private: const ArrowSchema* schema_; + ArrowSchemaView schema_view_{}; }; class ViewSchemaChildren { public: explicit ViewSchemaChildren(const ArrowSchema* schema) : schema_(schema) {} + int64_t size() const { return schema_->n_children; } + private: const ArrowSchema* schema_{}; @@ -109,7 +154,7 @@ class ViewSchemaChildren { int64_t i_ = 0; public: - explicit iterator(const ViewSchemaChildren& outer, int i = 0) + explicit iterator(const ViewSchemaChildren& outer, int64_t i = 0) : outer_(outer), i_(i) {} iterator& operator++() { i_++; @@ -132,9 +177,9 @@ class ViewSchemaChildren { iterator end() const { return iterator(*this, schema_->n_children); } }; -inline ViewSchemaChildren ViewSchema::Children() { return ViewSchemaChildren(schema_); } - - +inline ViewSchemaChildren ViewSchema::children() const { + return ViewSchemaChildren(schema_); +} NANOARROW_CXX_NAMESPACE_END diff --git a/src/nanoarrow/hpp/view_test.cc b/src/nanoarrow/hpp/view_test.cc index e18c7c639..9306ea838 100644 --- a/src/nanoarrow/hpp/view_test.cc +++ b/src/nanoarrow/hpp/view_test.cc @@ -18,6 +18,7 @@ #include #include +#include "nanoarrow/hpp/schema.hpp" #include "nanoarrow/nanoarrow.hpp" #include "nanoarrow/nanoarrow_gtest_util.hpp" @@ -32,7 +33,7 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayAsTest) { std::vector{8, 4, 2, 1, .5, .25, .125}); const void* buffers[] = {is_valid->data, floats->data}; - struct ArrowArray array {}; + struct ArrowArray array{}; array.length = 7; array.null_count = 2; array.n_buffers = 2; @@ -63,7 +64,7 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayAsBytesTest) { nanoarrow::BufferInitSequence(data.get(), std::string{"abcdefghi"}); const void* buffers[] = {is_valid->data, offsets->data, data->data}; - struct ArrowArray array {}; + struct ArrowArray array{}; array.length = 7; array.null_count = 2; array.n_buffers = 2; @@ -93,7 +94,7 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayAsFixedSizeBytesTest) { data.get(), std::string{"foo"} + "bar" + "foo" + "bar" + "foo" + "bar" + "foo"); const void* buffers[] = {is_valid->data, data->data}; - struct ArrowArray array {}; + struct ArrowArray array{}; array.length = 7; array.null_count = 2; array.n_buffers = 2; @@ -113,7 +114,7 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayAsFixedSizeBytesTest) { TEST(NanoarrowHppTest, NanoarrowHppViewArrayStreamTest) { static int32_t slot = 1; - struct ArrowArrayStream stream {}; + struct ArrowArrayStream stream{}; stream.get_schema = [](struct ArrowArrayStream*, struct ArrowSchema* out) { return ArrowSchemaInitFromType(out, NANOARROW_TYPE_INT32); }; @@ -135,3 +136,26 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayStreamTest) { EXPECT_EQ(stream_view.code(), ENOMEM); EXPECT_STREQ(stream_view.error()->message, "foo bar"); } + +TEST(SchemaHpp, SchemaHppDump) { + std::stringstream ss; + + nanoarrow::UniqueSchema schema; + ASSERT_EQ(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_INT32), NANOARROW_OK); + + nanoarrow::ViewSchema view(schema.get()); + + ss << "metadata size: " << view.metadata().size() << std::endl; + + for (const auto& item : view.metadata()) { + ss << item.first << ", " << item.second << std::endl; + } + + for (const auto& child : view.children()) { + ss << "child: " << child.name() << std::endl; + } + + if (view.dictionary()) { + ss << "has dictionary\n"; + } +} From f17204a613bce4f11021fbcf06b29356ff0d039f Mon Sep 17 00:00:00 2001 From: Dewey Dunnington Date: Tue, 5 Nov 2024 16:45:40 -0600 Subject: [PATCH 4/4] maybe maybe maybe --- src/nanoarrow/hpp/schema.hpp | 135 +++++++++++++++++++++++++++++++++ src/nanoarrow/hpp/view_test.cc | 6 +- 2 files changed, 137 insertions(+), 4 deletions(-) diff --git a/src/nanoarrow/hpp/schema.hpp b/src/nanoarrow/hpp/schema.hpp index 209e9a456..9d4d01fc3 100644 --- a/src/nanoarrow/hpp/schema.hpp +++ b/src/nanoarrow/hpp/schema.hpp @@ -20,6 +20,7 @@ #include #include +#include #include "nanoarrow/hpp/exception.hpp" #include "nanoarrow/hpp/unique.hpp" @@ -28,6 +29,140 @@ NANOARROW_CXX_NAMESPACE_BEGIN +class SchemaBuilder { + public: + // Let some implicit magic construction happen + SchemaBuilder() { ArrowSchemaInit(schema_.get()); } + SchemaBuilder(const ArrowSchema* schema) { + NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(schema, schema_.get())); + } + SchemaBuilder(ArrowType type) : SchemaBuilder() { set_type(type); } + SchemaBuilder(const UniqueSchema schema) : SchemaBuilder(schema.get()) {} + + // Movable + SchemaBuilder(SchemaBuilder&& rhs) : SchemaBuilder(std::move(rhs.schema_)) {} + SchemaBuilder& operator=(SchemaBuilder&& rhs) { + schema_ = std::move(rhs.schema_); + return *this; + } + // Copyable + SchemaBuilder(const SchemaBuilder& rhs) : SchemaBuilder(rhs.data()) {} + + SchemaBuilder& operator=(SchemaBuilder& rhs) { + NANOARROW_THROW_NOT_OK(ArrowSchemaDeepCopy(rhs.data(), schema_.get())); + return *this; + } + + // Implicitly convertable to const ArrowSchema + operator const ArrowSchema*() const { return schema_.get(); } + + // Get schema pointer + const ArrowSchema* data() const { return schema_.get(); } + ArrowSchema* data() { return schema_.get(); } + + // Move the schema out + void Export(ArrowSchema* out) { ArrowSchemaMove(schema_.get(), out); } + + SchemaBuilder& set_type(ArrowType type) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(schema_.get(), type)); + return *this; + } + + SchemaBuilder& set_type_datetime(ArrowType type, ArrowTimeUnit time_unit, + const char* tz = "") { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(data(), type, time_unit, tz)); + return *this; + } + + SchemaBuilder& set_type_list(ArrowType type, SchemaBuilder child, + int32_t fixed_size = -1) { + if (fixed_size > 0) { + NANOARROW_THROW_NOT_OK( + ArrowSchemaSetTypeFixedSize(schema_.get(), type, fixed_size)); + } else { + set_type(type); + } + + child.set_name(data()->children[0]->name); + ArrowSchemaRelease(data()->children[0]); + child.Export(data()->children[0]); + return *this; + } + + SchemaBuilder& set_name(const char* name) { + NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(schema_.get(), name)); + return *this; + } + + SchemaBuilder& allocate_children(int64_t n_children) { + NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateChildren(schema_.get(), n_children)); + return *this; + } + + SchemaBuilder& set_child(int64_t i, SchemaBuilder child) { + if (data()->children[i]->release) { + ArrowSchemaRelease(data()->children[i]); + } + + child.Export(data()->children[i]); + return *this; + } + + SchemaBuilder& set_child(int64_t i, SchemaBuilder child, const char* name) { + child.set_name(name); + if (data()->children[i]->release) { + ArrowSchemaRelease(data()->children[i]); + } + + child.Export(data()->children[i]); + return *this; + } + + private: + UniqueSchema schema_; +}; + +namespace schema { + +SchemaBuilder int32() { return NANOARROW_TYPE_INT32; } + +SchemaBuilder string() { return NANOARROW_TYPE_STRING; } + +SchemaBuilder list(SchemaBuilder child) { + SchemaBuilder out; + out.set_type_list(NANOARROW_TYPE_LIST, std::move(child)); + return out; +} + +SchemaBuilder fixed_size_list(SchemaBuilder child) { + SchemaBuilder out; + out.set_type_list(NANOARROW_TYPE_LIST, std::move(child)); + return out; +} + +SchemaBuilder struct_(std::vector children) { + SchemaBuilder out(NANOARROW_TYPE_STRUCT); + out.allocate_children(static_cast(children.size())); + for (int64_t i = 0; i < static_cast(children.size()); i++) { + out.set_child(i, std::move(children[i])); + } + + return out; +} + +SchemaBuilder struct_(std::vector> children) { + SchemaBuilder out(NANOARROW_TYPE_STRUCT); + out.allocate_children(static_cast(children.size())); + for (int64_t i = 0; i < static_cast(children.size()); i++) { + auto child = std::move(children[i]); + out.set_child(i, std::move(child.second), child.first.c_str()); + } + + return out; +} + +} // namespace schema + class ViewMetadata { public: explicit ViewMetadata(const char* metadata) : metadata_(metadata) {} diff --git a/src/nanoarrow/hpp/view_test.cc b/src/nanoarrow/hpp/view_test.cc index 9306ea838..343244042 100644 --- a/src/nanoarrow/hpp/view_test.cc +++ b/src/nanoarrow/hpp/view_test.cc @@ -140,10 +140,8 @@ TEST(NanoarrowHppTest, NanoarrowHppViewArrayStreamTest) { TEST(SchemaHpp, SchemaHppDump) { std::stringstream ss; - nanoarrow::UniqueSchema schema; - ASSERT_EQ(ArrowSchemaInitFromType(schema.get(), NANOARROW_TYPE_INT32), NANOARROW_OK); - - nanoarrow::ViewSchema view(schema.get()); + auto schema = nanoarrow::schema::struct_({{"foofy", NANOARROW_TYPE_INT32}}); + nanoarrow::ViewSchema view(schema); ss << "metadata size: " << view.metadata().size() << std::endl;