From 58b900201c65a93c6bc972d1ad24f58304ba3bb9 Mon Sep 17 00:00:00 2001 From: lihangyu-x Date: Wed, 27 May 2026 22:00:57 +0800 Subject: [PATCH] [fix](variant) Bind Variant search to nested indexes ### What problem does this PR solve? Issue Number: N/A Related PR: #63660 Problem Summary: Backport #63660 to branch-4.1. Bind Variant inverted-index search to the resolved scalar or nested Variant index reader, map nested leaf results back to the expected document scope, and preserve null bitmap semantics for empty bitset truth bitmaps. Adapt the segment index iterator call to the branch-4.1 ColumnReader API. Cherry-picked from commits 8310d28f9899f4bdf2af43aa93fdc036ad11d2d4 and 315ad31794625dad8579a9956cf83aa19efd6798. ### Release note Fix Variant inverted-index search binding for scalar and nested Variant paths. ### Check List (For Author) - Test: - Unit Test: ./run-be-ut.sh --run --filter='*Variant*:FunctionSearchTest.TestBuildLeafQueryDirectUnknownClauseUsesLeafMapper:FunctionSearchNestedTest.*:BitSetQueryTest.EmptyTruthBitmapPreservesNullBitmap' - Behavior changed: Yes. Fixes Variant inverted-index search binding and null bitmap handling. - Does this need documentation: No --- be/src/exprs/function/function_search.cpp | 720 +++++------------- be/src/exprs/function/function_search.h | 99 +-- .../variant_inverted_index_search.cpp | 720 ++++++++++++++++++ .../function/variant_inverted_index_search.h | 218 ++++++ be/src/exprs/vsearch.cpp | 56 ++ .../index/inverted/inverted_index_profile.h | 9 + .../index/inverted/inverted_index_stats.h | 14 + .../query_v2/bit_set_query/bit_set_query.h | 8 +- .../query_v2/bit_set_query/bit_set_weight.h | 6 +- be/src/storage/segment/segment.cpp | 47 +- be/src/storage/segment/segment_iterator.cpp | 33 +- .../segment/variant/variant_column_reader.cpp | 51 +- .../segment/variant/variant_column_reader.h | 4 +- .../function/function_search_nested_test.cpp | 353 ++++++++- .../exprs/function/function_search_test.cpp | 463 +++++++++++ .../inverted/query_v2/boolean_query_test.cpp | 22 + 16 files changed, 2165 insertions(+), 658 deletions(-) create mode 100644 be/src/exprs/function/variant_inverted_index_search.cpp create mode 100644 be/src/exprs/function/variant_inverted_index_search.h diff --git a/be/src/exprs/function/function_search.cpp b/be/src/exprs/function/function_search.cpp index 6c96da46c5be0c..9c33752598ccd3 100644 --- a/be/src/exprs/function/function_search.cpp +++ b/be/src/exprs/function/function_search.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -34,8 +35,11 @@ #include "common/status.h" #include "core/block/columns_with_type_and_name.h" #include "core/column/column_const.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_string.h" #include "exprs/function/simple_function_factory.h" +#include "exprs/function/variant_inverted_index_search.h" #include "exprs/vexpr_context.h" #include "runtime/runtime_profile.h" #include "storage/index/index_file_reader.h" @@ -59,12 +63,11 @@ #include "storage/index/inverted/query_v2/term_query/term_query.h" #include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h" #include "storage/index/inverted/util/string_helper.h" -#include "storage/segment/segment.h" -#include "storage/segment/variant/nested_group_path.h" +#include "storage/olap_common.h" #include "storage/segment/variant/nested_group_provider.h" -#include "storage/segment/variant/variant_column_reader.h" #include "storage/types.h" #include "util/debug_points.h" +#include "util/string_parser.hpp" #include "util/string_util.h" #include "util/thrift_util.h" @@ -119,277 +122,102 @@ bool is_nested_group_search_supported() { return provider != nullptr && provider->should_enable_nested_group_read_path(); } -class ResolverNullBitmapAdapter final : public query_v2::NullBitmapResolver { -public: - explicit ResolverNullBitmapAdapter(const FieldReaderResolver& resolver) : _resolver(resolver) {} - - segment_v2::IndexIterator* iterator_for(const query_v2::Scorer& /*scorer*/, - const std::string& logical_field) const override { - if (logical_field.empty()) { - return nullptr; - } - return _resolver.get_iterator(logical_field); +query_v2::QueryPtr make_unknown_query(uint32_t num_rows) { + auto null_bitmap = std::make_shared(); + if (num_rows > 0) { + null_bitmap->addRange(0, num_rows); } + return std::make_shared(std::make_shared(), + std::move(null_bitmap)); +} -private: - const FieldReaderResolver& _resolver; -}; - -void populate_binding_context(const FieldReaderResolver& resolver, - query_v2::QueryExecutionContext* exec_ctx) { - DCHECK(exec_ctx != nullptr); - exec_ctx->readers = resolver.readers(); - exec_ctx->reader_bindings = resolver.reader_bindings(); - exec_ctx->field_reader_bindings = resolver.field_readers(); - for (const auto& [binding_key, binding] : resolver.binding_cache()) { - if (binding_key.empty()) { - continue; +DataTypePtr unwrap_direct_index_value_type(DataTypePtr column_type) { + DataTypePtr value_type = remove_nullable(std::move(column_type)); + while (value_type != nullptr && + value_type->get_storage_field_type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { + const auto* array_type = dynamic_cast(value_type.get()); + if (array_type == nullptr) { + return value_type; } - query_v2::FieldBindingContext binding_ctx; - binding_ctx.logical_field_name = binding.logical_field_name; - binding_ctx.stored_field_name = binding.stored_field_name; - binding_ctx.stored_field_wstr = binding.stored_field_wstr; - exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx)); + value_type = remove_nullable(array_type->get_nested_type()); } + return value_type; } -query_v2::QueryExecutionContext build_query_execution_context( - uint32_t segment_num_rows, const FieldReaderResolver& resolver, - query_v2::NullBitmapResolver* null_resolver) { - query_v2::QueryExecutionContext exec_ctx; - exec_ctx.segment_num_rows = segment_num_rows; - populate_binding_context(resolver, &exec_ctx); - exec_ctx.null_resolver = null_resolver; - return exec_ctx; -} - -} // namespace - -Status FieldReaderResolver::resolve(const std::string& field_name, - InvertedIndexQueryType query_type, - FieldReaderBinding* binding) { - DCHECK(binding != nullptr); - - // Check if this is a variant subcolumn - bool is_variant_sub = is_variant_subcolumn(field_name); - - auto data_it = _data_type_with_names.find(field_name); - if (data_it == _data_type_with_names.end()) { - // For variant subcolumns, not finding the index is normal (the subcolumn may not exist in this segment) - // Return OK but with null binding to signal "no match" - if (is_variant_sub) { - VLOG_DEBUG << "Variant subcolumn '" << field_name - << "' not found in this segment, treating as no match"; - *binding = FieldReaderBinding(); - return Status::OK(); - } - // For normal fields, this is an error - return Status::Error( - "field '{}' not found in inverted index metadata", field_name); +template +Status parse_integral_search_value(const std::string& value, Field* field) { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + CppType parsed = + StringParser::string_to_int(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as {}", value, + type_to_string(primitive_type)); } + *field = Field::create_field(parsed); + return Status::OK(); +} - const auto& stored_field_name = data_it->second.first; - const auto binding_key = binding_key_for(stored_field_name, query_type); - - auto cache_it = _cache.find(binding_key); - if (cache_it != _cache.end()) { - *binding = cache_it->second; - return Status::OK(); +Status parse_scalar_search_value(const DataTypePtr& column_type, const std::string& value, + Field* field) { + if (column_type == nullptr || field == nullptr) { + return Status::InvalidArgument("missing column type for scalar search value"); } - auto iterator_it = _iterators.find(field_name); - if (iterator_it == _iterators.end() || iterator_it->second == nullptr) { - // For variant subcolumns, not finding the iterator is normal - if (is_variant_sub) { - VLOG_DEBUG << "Variant subcolumn '" << field_name - << "' iterator not found in this segment, treating as no match"; - *binding = FieldReaderBinding(); - return Status::OK(); + switch (column_type->get_storage_field_type()) { + case FieldType::OLAP_FIELD_TYPE_BOOL: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + bool parsed = StringParser::string_to_bool(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as bool", value); } - return Status::Error( - "iterator not found for field '{}'", field_name); - } - - auto* inverted_iterator = dynamic_cast(iterator_it->second); - if (inverted_iterator == nullptr) { - return Status::Error( - "iterator for field '{}' is not InvertedIndexIterator", field_name); + *field = Field::create_field(parsed); + return Status::OK(); } - - // For variant subcolumns, FE resolves the field pattern to a specific index and sends - // its index_properties via TSearchFieldBinding. When FE picks an analyzer-based index, - // upgrade EQUAL_QUERY/WILDCARD_QUERY to MATCH_ANY_QUERY so select_best_reader picks the - // FULLTEXT reader instead of STRING_TYPE. Without this upgrade: - // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory - // - WILDCARD clauses would enumerate terms from the wrong index, returning empty results - // - // For regular (non-variant) columns with multiple indexes, the caller (build_leaf_query) - // is responsible for passing the appropriate query_type: MATCH_ANY_QUERY for tokenized - // queries (TERM) and EQUAL_QUERY for exact-match queries (EXACT). This ensures - // select_best_reader picks FULLTEXT vs STRING_TYPE correctly without needing an explicit - // analyzer key, since the query_type alone drives the reader type preference. - InvertedIndexQueryType effective_query_type = query_type; - auto fb_it = _field_binding_map.find(field_name); - std::string analyzer_key; - if (is_variant_sub && fb_it != _field_binding_map.end() && - fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { - analyzer_key = normalize_analyzer_key( - build_analyzer_key_from_properties(fb_it->second->index_properties)); - if (inverted_index::InvertedIndexAnalyzer::should_analyzer( - fb_it->second->index_properties) && - (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || - effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { - effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + case FieldType::OLAP_FIELD_TYPE_TINYINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_SMALLINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_INT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_BIGINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_LARGEINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_FLOAT: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + Float32 parsed = + StringParser::string_to_float(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as float", value); } + *field = Field::create_field(parsed); + return Status::OK(); } - - Result reader_result; - const auto& column_type = data_it->second.second; - if (column_type) { - reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type, - analyzer_key); - } else { - reader_result = inverted_iterator->select_best_reader(analyzer_key); - } - - if (!reader_result.has_value()) { - return reader_result.error(); - } - - auto inverted_reader = reader_result.value(); - if (inverted_reader == nullptr) { - return Status::Error( - "selected reader is null for field '{}'", field_name); - } - - auto index_file_reader = inverted_reader->get_index_file_reader(); - if (index_file_reader == nullptr) { - return Status::Error( - "index file reader is null for field '{}'", field_name); - } - - // Use InvertedIndexSearcherCache to avoid re-opening index files repeatedly, - // respecting the enable_inverted_index_searcher_cache session variable. - auto index_file_key = - index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); - InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); - InvertedIndexCacheHandle searcher_cache_handle; - - bool searcher_cache_enabled = - _context->runtime_state != nullptr && - _context->runtime_state->query_options().enable_inverted_index_searcher_cache; - - bool cache_hit = false; - if (searcher_cache_enabled) { - int64_t lookup_dummy = 0; - SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_lookup_timer - : &lookup_dummy); - cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, - &searcher_cache_handle); - } - - std::shared_ptr reader_holder; - if (cache_hit) { - if (_context->stats) { - _context->stats->inverted_index_searcher_cache_hit++; - } - auto searcher_variant = searcher_cache_handle.get_index_searcher(); - auto* searcher_ptr = std::get_if(&searcher_variant); - if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { - reader_holder = std::shared_ptr( - (*searcher_ptr)->getReader(), - [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); + case FieldType::OLAP_FIELD_TYPE_DOUBLE: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + Float64 parsed = + StringParser::string_to_float(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as double", value); } + *field = Field::create_field(parsed); + return Status::OK(); } - - if (!reader_holder) { - if (_context->stats) { - _context->stats->inverted_index_searcher_cache_miss++; - } - // Cache miss: open directory, build IndexSearcher, insert into cache - int64_t dummy_timer = 0; - SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer - : &dummy_timer); - RETURN_IF_ERROR( - index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); - auto directory = DORIS_TRY( - index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); - - auto index_searcher_builder = DORIS_TRY( - IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); - auto searcher_result = - DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); - auto reader_size = index_searcher_builder->get_reader_size(); - - // Initialization reads are done. Clear io_ctx on the main stream so the - // cached searcher does not carry a stale reference. Subsequent query-phase - // reads receive the caller's io_ctx through the CLucene API parameters - // (termDocs/termPositions/terms) — the same pattern used by the MATCH path - // in InvertedIndexReader::create_index_searcher(). - auto* stream = static_cast(directory.get())->getDorisIndexInput(); - DBUG_EXECUTE_IF( - "FieldReaderResolver.resolve.io_ctx", ({ - const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext(); - if (cur_io_ctx->file_cache_stats) { - if (cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) { - LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: " - << cur_io_ctx->file_cache_stats << " vs " - << &_context->stats->file_cache_stats; - } - } - })); - stream->setIoContext(nullptr); - stream->setIndexFile(false); - - auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), - reader_size, UnixMillis()); - InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, - &searcher_cache_handle); - - auto new_variant = searcher_cache_handle.get_index_searcher(); - auto* new_ptr = std::get_if(&new_variant); - if (new_ptr != nullptr && *new_ptr != nullptr) { - reader_holder = std::shared_ptr( - (*new_ptr)->getReader(), - [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); - } - - if (!reader_holder) { - return Status::Error( - "failed to build IndexSearcher for field '{}'", field_name); - } + default: + return Status::NotSupported("scalar search does not support storage field type {}", + static_cast(column_type->get_storage_field_type())); } +} - _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); - - FieldReaderBinding resolved; - resolved.logical_field_name = field_name; - resolved.stored_field_name = stored_field_name; - resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); - resolved.column_type = column_type; - resolved.query_type = effective_query_type; - resolved.inverted_reader = inverted_reader; - resolved.lucene_reader = reader_holder; - // Prefer FE-provided index_properties (needed for variant subcolumn field_pattern matching) - // Reuse fb_it from earlier lookup above. - if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && - !fb_it->second->index_properties.empty()) { - resolved.index_properties = fb_it->second->index_properties; - } else { - resolved.index_properties = inverted_reader->get_index_properties(); +InvertedIndexQueryType direct_index_query_type_for_clause(const std::string& clause_type) { + if (clause_type == "TERM" || clause_type == "EXACT") { + return InvertedIndexQueryType::EQUAL_QUERY; } - resolved.binding_key = binding_key; - resolved.analyzer_key = - normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); - - _binding_readers[binding_key] = reader_holder; - _field_readers[resolved.stored_field_wstr] = reader_holder; - _readers.emplace_back(reader_holder); - _cache.emplace(binding_key, resolved); - *binding = resolved; - return Status::OK(); + return InvertedIndexQueryType::UNKNOWN_QUERY; } +} // namespace + Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/, const ColumnNumbers& /*arguments*/, uint32_t /*result*/, size_t /*input_rows_count*/) const { @@ -507,78 +335,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( context->collection_similarity = std::make_shared(); } - // NESTED() queries evaluate predicates on the flattened "element space" of a nested group. - // For VARIANT nested groups, the indexed lucene field (stored_field_name) uses: - // parent_unique_id + "." + - // where the nested path is rooted at either: - // - "__D0_root__" for top-level array (NESTED(data, ...)) - // - "" for object fields (NESTED(data.items, ...)) - // - // FE field bindings are expressed using logical column paths (e.g. "data.items.msg"), so for - // NESTED() we normalize stored_field_name suffix to be consistent with the nested group root. - std::unordered_map patched_data_type_with_names; const auto* effective_data_type_with_names = &data_type_with_names; - if (is_nested_query && search_param.root.__isset.nested_path) { - const std::string& nested_path = search_param.root.nested_path; - const auto dot_pos = nested_path.find('.'); - const std::string root_field = - (dot_pos == std::string::npos) ? nested_path : nested_path.substr(0, dot_pos); - const std::string root_prefix = root_field + "."; - const std::string array_path = (dot_pos == std::string::npos) - ? std::string(segment_v2::kRootNestedGroupPath) - : nested_path.substr(dot_pos + 1); - - bool copied = false; - for (const auto& fb : search_param.field_bindings) { - if (!fb.__isset.is_variant_subcolumn || !fb.is_variant_subcolumn) { - continue; - } - if (fb.field_name.empty()) { - continue; - } - const auto it_orig = data_type_with_names.find(fb.field_name); - if (it_orig == data_type_with_names.end()) { - continue; - } - const std::string& old_stored = it_orig->second.first; - const auto first_dot = old_stored.find('.'); - if (first_dot == std::string::npos) { - continue; - } - std::string sub_path; - if (fb.__isset.subcolumn_path && !fb.subcolumn_path.empty()) { - sub_path = fb.subcolumn_path; - } else if (fb.field_name.starts_with(nested_path + ".")) { - sub_path = fb.field_name.substr(nested_path.size() + 1); - } else if (fb.field_name.starts_with(root_prefix)) { - sub_path = fb.field_name.substr(root_prefix.size()); - } else { - sub_path = fb.field_name; - } - if (sub_path.empty()) { - continue; - } - const std::string array_prefix = array_path + "."; - const std::string suffix_path = - sub_path.starts_with(array_prefix) ? sub_path : (array_prefix + sub_path); - const std::string parent_uid = old_stored.substr(0, first_dot); - const std::string expected_stored = parent_uid + "." + suffix_path; - if (old_stored == expected_stored) { - continue; - } - - if (!copied) { - patched_data_type_with_names = data_type_with_names; - effective_data_type_with_names = &patched_data_type_with_names; - copied = true; - } - auto it = patched_data_type_with_names.find(fb.field_name); - if (it == patched_data_type_with_names.end()) { - continue; - } - it->second.first = expected_stored; - } - } // Pass field_bindings to resolver for variant subcolumn detection FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context, @@ -586,9 +343,10 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( if (is_nested_query) { std::shared_ptr row_bitmap; - RETURN_IF_ERROR(evaluate_nested_query(search_param, search_param.root, context, resolver, - num_rows, index_exec_ctx, field_name_to_column_id, - row_bitmap)); + VariantNestedSearchEvaluator nested_evaluator(*this); + RETURN_IF_ERROR(nested_evaluator.evaluate(search_param, search_param.root, context, + resolver, num_rows, index_exec_ctx, + field_name_to_column_id, row_bitmap)); bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap), std::make_shared()); bitmap_result.mask_out_null(); @@ -617,7 +375,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy); RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query, &root_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); } if (root_query == nullptr) { LOG(INFO) << "search: Query tree resolved to empty query, dsl:" @@ -627,9 +385,9 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } - ResolverNullBitmapAdapter null_resolver(resolver); + VariantSearchNullBitmapAdapter null_resolver(resolver); query_v2::QueryExecutionContext exec_ctx = - build_query_execution_context(num_rows, resolver, &null_resolver); + build_variant_search_query_execution_context(num_rows, resolver, &null_resolver); bool enable_scoring = false; bool is_asc = false; @@ -713,139 +471,6 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } -Status FunctionSearch::evaluate_nested_query( - const TSearchParam& search_param, const TSearchClause& nested_clause, - const std::shared_ptr& context, FieldReaderResolver& resolver, - uint32_t num_rows, const IndexExecContext* index_exec_ctx, - const std::unordered_map& field_name_to_column_id, - std::shared_ptr& result_bitmap) const { - (void)field_name_to_column_id; - if (!(nested_clause.__isset.nested_path)) { - return Status::InvalidArgument("NESTED clause missing nested_path"); - } - if (!(nested_clause.__isset.children) || nested_clause.children.empty()) { - return Status::InvalidArgument("NESTED clause missing inner query"); - } - if (result_bitmap == nullptr) { - result_bitmap = std::make_shared(); - } else { - *result_bitmap = roaring::Roaring(); - } - - // 1. Get the nested group chain directly - std::string root_field = nested_clause.nested_path; - auto dot_pos = nested_clause.nested_path.find('.'); - if (dot_pos != std::string::npos) { - root_field = nested_clause.nested_path.substr(0, dot_pos); - } - if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) { - return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment"); - } - auto* segment = index_exec_ctx->segment(); - const int32_t ordinal = segment->tablet_schema()->field_index(root_field); - if (ordinal < 0) { - return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query", - root_field); - } - const ColumnId column_id = static_cast(ordinal); - - std::shared_ptr column_reader; - RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id), - &column_reader, - index_exec_ctx->column_iter_opts().stats)); - auto* variant_reader = dynamic_cast(column_reader.get()); - if (variant_reader == nullptr) { - return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field); - } - - std::string array_path; - if (dot_pos == std::string::npos) { - array_path = std::string(segment_v2::kRootNestedGroupPath); - } else { - array_path = nested_clause.nested_path.substr(dot_pos + 1); - } - - auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path); - if (!found || group_chain.empty()) { - return Status::OK(); - } - - // Use the read provider for element counting and bitmap mapping. - auto read_provider = segment_v2::create_nested_group_read_provider(); - if (!read_provider || !read_provider->should_enable_nested_group_read_path()) { - return Status::NotSupported( - "NestedGroup search is an enterprise capability, not available in this build"); - } - - auto& leaf_group = group_chain.back(); - uint64_t total_elements = 0; - RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(), - leaf_group, &total_elements)); - if (total_elements == 0) { - return Status::OK(); - } - - // 3. Evaluate inner query - std::string default_operator = "or"; - if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { - default_operator = search_param.default_operator; - } - int32_t minimum_should_match = -1; - if (search_param.__isset.minimum_should_match) { - minimum_should_match = search_param.minimum_should_match; - } - - query_v2::QueryPtr inner_query; - std::string inner_binding_key; - RETURN_IF_ERROR(build_query_recursive(nested_clause.children[0], context, resolver, - &inner_query, &inner_binding_key, default_operator, - minimum_should_match)); - if (inner_query == nullptr) { - return Status::OK(); - } - - if (total_elements > std::numeric_limits::max()) { - return Status::InvalidArgument("nested element_count exceeds uint32_t max"); - } - - ResolverNullBitmapAdapter null_resolver(resolver); - query_v2::QueryExecutionContext exec_ctx = build_query_execution_context( - static_cast(total_elements), resolver, &null_resolver); - - auto weight = inner_query->weight(false); - if (!weight) { - return Status::OK(); - } - auto scorer = weight->scorer(exec_ctx, inner_binding_key); - if (!scorer) { - return Status::OK(); - } - - roaring::Roaring element_bitmap; - uint32_t doc = scorer->doc(); - while (doc != query_v2::TERMINATED) { - element_bitmap.add(doc); - doc = scorer->advance(); - } - - if (scorer->has_null_bitmap(exec_ctx.null_resolver)) { - const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); - if (bitmap != nullptr && !bitmap->isEmpty()) { - element_bitmap -= *bitmap; - } - } - - // 4. Map element-level hits back to row-level hits through NestedGroup chain. - if (result_bitmap == nullptr) { - result_bitmap = std::make_shared(); - } - roaring::Roaring parent_bitmap; - RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords( - group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap)); - *result_bitmap = std::move(parent_bitmap); - return Status::OK(); -} - // Aligned with FE QsClauseType enum - uses enum.name() as clause_type FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category( const std::string& clause_type) const { @@ -955,13 +580,11 @@ static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) { } } -Status FunctionSearch::build_query_recursive(const TSearchClause& clause, - const std::shared_ptr& context, - FieldReaderResolver& resolver, - inverted_index::query_v2::QueryPtr* out, - std::string* binding_key, - const std::string& default_operator, - int32_t minimum_should_match) const { +Status FunctionSearch::build_query_recursive( + const TSearchClause& clause, const std::shared_ptr& context, + FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, + std::string* binding_key, const std::string& default_operator, int32_t minimum_should_match, + uint32_t num_rows) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -991,7 +614,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, &child_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); // Determine occur type from child clause query_v2::Occur occur = query_v2::Occur::MUST; // default @@ -1027,7 +650,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, &child_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); // Add all children including empty BitSetQuery // BooleanQuery will handle the logic: // - AND with empty bitmap → result is empty @@ -1042,7 +665,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, } return build_leaf_query(clause, context, resolver, out, binding_key, default_operator, - minimum_should_match); + minimum_should_match, num_rows); } Status FunctionSearch::build_leaf_query(const TSearchClause& clause, @@ -1051,7 +674,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, const std::string& default_operator, - int32_t minimum_should_match) const { + int32_t minimum_should_match, uint32_t num_rows) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -1083,26 +706,75 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; } + auto finish_leaf_query = [&](query_v2::QueryPtr query) -> Status { + *out = std::move(query); + return resolver.map_leaf_query(field_name, out); + }; + FieldReaderBinding binding; RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding)); - // Check if binding is empty (variant subcolumn not found in this segment) - if (binding.lucene_reader == nullptr) { + if (!binding.is_bound()) { LOG(INFO) << "search: No inverted index for field '" << field_name << "' in this segment, clause_type='" << clause_type - << "', query_type=" << static_cast(query_type) << ", returning no matches"; - // Variant subcolumn doesn't exist - create empty BitSetQuery (no matches) - *out = std::make_shared(roaring::Roaring()); + << "', query_type=" << static_cast(query_type) + << ", returning UNKNOWN bitmap"; if (binding_key) { binding_key->clear(); } - return Status::OK(); + return finish_leaf_query(make_unknown_query(num_rows)); } if (binding_key) { *binding_key = binding.binding_key; } + if (binding.use_direct_index_reader()) { + auto direct_query_type = direct_index_query_type_for_clause(clause_type); + if (direct_query_type == InvertedIndexQueryType::UNKNOWN_QUERY) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + + auto value_type = unwrap_direct_index_value_type(binding.column_type); + Field param_value; + auto parse_status = parse_scalar_search_value(value_type, value, ¶m_value); + if (!parse_status.ok()) { + LOG(INFO) << "search: scalar leaf value is unsupported, field=" << field_name + << ", value='" << value << "', reason=" << parse_status.to_string(); + return finish_leaf_query(make_unknown_query(num_rows)); + } + + auto* iterator = resolver.get_iterator(field_name); + if (iterator == nullptr) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + + segment_v2::InvertedIndexParam param; + param.column_name = binding.stored_field_name; + param.column_type = value_type; + param.query_value = param_value; + param.query_type = direct_query_type; + param.num_rows = num_rows; + param.roaring = std::make_shared(); + RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam {¶m})); + + std::shared_ptr null_bitmap = std::make_shared(); + auto has_null = iterator->has_null(); + if (has_null.has_value() && has_null.value()) { + segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle; + RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap_cache_handle)); + if (auto bitmap = null_bitmap_cache_handle.get_bitmap(); bitmap != nullptr) { + null_bitmap = bitmap; + } + } + return finish_leaf_query(std::make_shared(std::move(param.roaring), + std::move(null_bitmap))); + } + + if (binding.lucene_reader == nullptr) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type); std::wstring field_wstr = binding.stored_field_wstr; std::wstring value_wstr = StringHelper::to_wstring(value); @@ -1118,8 +790,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (binding.index_properties.empty()) { LOG(WARNING) << "search: analyzer required but index properties empty for field '" << field_name << "'"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1129,14 +800,13 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, LOG(WARNING) << "search: No terms found after tokenization for TERM query, field=" << field_name << ", value='" << value << "', returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } if (term_infos.size() == 1) { std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); - *out = make_term_query(term_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(term_wstr)); } // When minimum_should_match is specified, use OccurBooleanQuery @@ -1151,8 +821,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); builder->add(make_term_query(term_wstr), occur); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } // Use default_operator to determine how to combine tokenized terms @@ -1165,12 +834,10 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) { @@ -1180,16 +847,14 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (!should_analyze) { VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name << "', falling back to TERM"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (binding.index_properties.empty()) { LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE " "query on field '" << field_name << "'"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1199,8 +864,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field=" << field_name << ", value='" << value << "', returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } std::vector phrase_term_infos = @@ -1209,7 +874,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, const auto& term_info = phrase_term_infos[0]; if (term_info.is_single_term()) { std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); - *out = std::make_shared(context, field_wstr, term_wstr); + return finish_leaf_query( + std::make_shared(context, field_wstr, term_wstr)); } else { auto builder = create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR); @@ -1217,15 +883,15 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term); builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); + return finish_leaf_query(builder->build()); } } else { if (QueryHelper::is_simple_phrase(phrase_term_infos)) { - *out = std::make_shared(context, field_wstr, - phrase_term_infos); + return finish_leaf_query(std::make_shared( + context, field_wstr, phrase_term_infos)); } else { - *out = std::make_shared(context, field_wstr, - phrase_term_infos); + return finish_leaf_query(std::make_shared( + context, field_wstr, phrase_term_infos)); } } @@ -1233,23 +899,20 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, } if (clause_type == "MATCH") { VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (clause_type == "ANY" || clause_type == "ALL") { bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( binding.index_properties); if (!should_analyze) { - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (binding.index_properties.empty()) { LOG(WARNING) << "search: index properties empty for tokenized clause '" << clause_type << "' field=" << field_name; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1258,8 +921,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (term_infos.empty()) { LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type << "', field=" << field_name << ", returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR; @@ -1269,8 +932,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (term_infos.size() == 1) { std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); - *out = make_term_query(term_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(term_wstr)); } auto builder = create_operator_boolean_query_builder(bool_type); @@ -1278,13 +940,11 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } // Default tokenized clause fallback - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) { @@ -1293,10 +953,9 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase // If only tokenized index exists, EXACT may return empty results because // tokenized indexes store individual tokens, not complete strings - *out = make_term_query(value_wstr); VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='" << value << "'"; - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (clause_type == "PREFIX") { // Apply lowercase only if: @@ -1308,21 +967,20 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, get_parser_lowercase_from_properties(binding.index_properties); bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); std::string pattern = should_lowercase ? to_lower(value) : value; - *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" << pattern << "' (original='" << value << "', has_parser=" << has_parser << ", lower_case=" << lowercase_setting << ")"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, pattern)); } if (clause_type == "WILDCARD") { // Standalone wildcard "*" matches all non-null values for this field // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery if (value == "*") { - *out = std::make_shared(field_wstr, true); VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field=" << field_name; - return Status::OK(); + return finish_leaf_query(std::make_shared(field_wstr, true)); } // Apply lowercase only if: // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) @@ -1333,33 +991,31 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, get_parser_lowercase_from_properties(binding.index_properties); bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); std::string pattern = should_lowercase ? to_lower(value) : value; - *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='" << pattern << "' (original='" << value << "', has_parser=" << has_parser << ", lower_case=" << lowercase_setting << ")"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, pattern)); } if (clause_type == "REGEXP") { // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching) // This matches ES query_string behavior where regex patterns bypass analysis - *out = std::make_shared(context, field_wstr, value); VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" << value << "'"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, value)); } if (clause_type == "RANGE" || clause_type == "LIST") { VLOG_DEBUG << "search: clause type '" << clause_type << "' not implemented, fallback to TERM"; } - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } void register_function_search(SimpleFunctionFactory& factory) { diff --git a/be/src/exprs/function/function_search.h b/be/src/exprs/function/function_search.h index 376e1aa07282dd..343db747583a70 100644 --- a/be/src/exprs/function/function_search.h +++ b/be/src/exprs/function/function_search.h @@ -31,6 +31,7 @@ #include "core/data_type/data_type_number.h" #include "core/types.h" #include "exprs/function/function.h" +#include "exprs/function/variant_inverted_index_search.h" #include "storage/index/index_query_context.h" #include "storage/index/inverted/inverted_index_cache.h" #include "storage/index/inverted/query_v2/boolean_query/operator_boolean_query.h" @@ -42,91 +43,6 @@ using namespace doris::segment_v2; class IndexExecContext; -struct FieldReaderBinding { - std::string logical_field_name; - std::string stored_field_name; - std::wstring stored_field_wstr; - DataTypePtr column_type; - InvertedIndexQueryType query_type; - InvertedIndexReaderPtr inverted_reader; - std::shared_ptr lucene_reader; - std::map index_properties; - std::string binding_key; - std::string analyzer_key; -}; - -class FieldReaderResolver { -public: - FieldReaderResolver( - const std::unordered_map& data_type_with_names, - const std::unordered_map& iterators, - std::shared_ptr context, - const std::vector& field_bindings = {}) - : _data_type_with_names(data_type_with_names), - _iterators(iterators), - _context(std::move(context)), - _field_bindings(field_bindings) { - // Build lookup maps for quick access - for (const auto& binding : _field_bindings) { - if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { - _variant_subcolumn_fields.insert(binding.field_name); - } - _field_binding_map[binding.field_name] = &binding; - } - } - - Status resolve(const std::string& field_name, InvertedIndexQueryType query_type, - FieldReaderBinding* binding); - - // Check if a field is a variant subcolumn - bool is_variant_subcolumn(const std::string& field_name) const { - return _variant_subcolumn_fields.count(field_name) > 0; - } - - const std::vector>& readers() const { - return _readers; - } - - const std::unordered_map>& - reader_bindings() const { - return _binding_readers; - } - - const std::unordered_map>& - field_readers() const { - return _field_readers; - } - - const std::unordered_map& binding_cache() const { - return _cache; - } - - IndexIterator* get_iterator(const std::string& field_name) const { - auto it = _iterators.find(field_name); - return (it != _iterators.end()) ? it->second : nullptr; - } - -private: - std::string binding_key_for(const std::string& stored_field_name, - InvertedIndexQueryType query_type) const { - return stored_field_name + "#" + std::to_string(static_cast(query_type)); - } - - const std::unordered_map& _data_type_with_names; - const std::unordered_map& _iterators; - std::shared_ptr _context; - std::vector _field_bindings; - std::unordered_map _field_binding_map; - std::unordered_set _variant_subcolumn_fields; - std::unordered_map _cache; - std::vector> _readers; - std::unordered_map> _binding_readers; - std::unordered_map> _field_readers; - // Keep searcher cache handles alive for the resolver's lifetime. - // This pins cached IndexSearcher entries so extracted IndexReaders remain valid. - std::vector _searcher_cache_handles; -}; - class FunctionSearch : public IFunction { public: static constexpr auto name = "search"; @@ -177,13 +93,6 @@ class FunctionSearch : public IFunction { const std::unordered_map& field_name_to_column_id, const std::shared_ptr& index_query_context = nullptr) const; - Status evaluate_nested_query( - const TSearchParam& search_param, const TSearchClause& nested_clause, - const std::shared_ptr& context, FieldReaderResolver& resolver, - uint32_t num_rows, const IndexExecContext* index_exec_ctx, - const std::unordered_map& field_name_to_column_id, - std::shared_ptr& result_bitmap) const; - // Public methods for testing enum class ClauseTypeCategory { NON_TOKENIZED, // TERM, PREFIX, WILDCARD, REGEXP, RANGE, LIST - no tokenization, use EQUAL_QUERY @@ -204,14 +113,14 @@ class FunctionSearch : public IFunction { const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, - const std::string& default_operator, - int32_t minimum_should_match) const; + const std::string& default_operator, int32_t minimum_should_match, + uint32_t num_rows = 0) const; Status build_leaf_query(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, const std::string& default_operator, - int32_t minimum_should_match) const; + int32_t minimum_should_match, uint32_t num_rows = 0) const; }; } // namespace doris diff --git a/be/src/exprs/function/variant_inverted_index_search.cpp b/be/src/exprs/function/variant_inverted_index_search.cpp new file mode 100644 index 00000000000000..cf3fc0505188c6 --- /dev/null +++ b/be/src/exprs/function/variant_inverted_index_search.cpp @@ -0,0 +1,720 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/function/variant_inverted_index_search.h" + +#include +#include +#include + +#include +#include +#include + +#include "common/config.h" +#include "common/exception.h" +#include "common/logging.h" +#include "exprs/function/function_search.h" +#include "exprs/vexpr_context.h" +#include "runtime/runtime_state.h" +#include "storage/index/index_file_reader.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/inverted_index_compound_reader.h" +#include "storage/index/inverted/inverted_index_parser.h" +#include "storage/index/inverted/inverted_index_searcher.h" +#include "storage/index/inverted/query_v2/bit_set_query/bit_set_scorer.h" +#include "storage/index/inverted/query_v2/doc_set.h" +#include "storage/index/inverted/query_v2/scorer.h" +#include "storage/index/inverted/query_v2/term_query/term_query.h" +#include "storage/index/inverted/query_v2/weight.h" +#include "storage/index/inverted/util/string_helper.h" +#include "storage/segment/segment.h" +#include "storage/segment/variant/nested_group_path.h" +#include "storage/segment/variant/nested_group_provider.h" +#include "storage/segment/variant/variant_column_reader.h" +#include "storage/utils.h" +#include "util/debug_points.h" +#include "util/time.h" + +namespace doris { + +namespace query_v2 = segment_v2::inverted_index::query_v2; + +namespace { + +void add_search_binding_diagnostic(const std::shared_ptr& context, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (context != nullptr && context->stats != nullptr) { + context->stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + +} // namespace + +FieldReaderResolver::FieldReaderResolver( + const std::unordered_map& data_type_with_names, + const std::unordered_map& iterators, + std::shared_ptr context, + const std::vector& field_bindings) + : _data_type_with_names(data_type_with_names), + _iterators(iterators), + _context(std::move(context)), + _field_bindings(field_bindings) { + for (const auto& binding : _field_bindings) { + if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { + _variant_subcolumn_fields.insert(binding.field_name); + } + _field_binding_map[binding.field_name] = &binding; + } +} + +Status FieldReaderResolver::resolve(const std::string& field_name, + InvertedIndexQueryType query_type, + FieldReaderBinding* binding) { + DCHECK(binding != nullptr); + + const bool is_variant_sub = is_variant_subcolumn(field_name); + + auto data_it = _data_type_with_names.find(field_name); + if (data_it == _data_type_with_names.end()) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=no_metadata " + "logical_field={} query_type={} reason=field_not_found", + field_name, query_type_to_string(query_type))); + *binding = FieldReaderBinding(); + return Status::OK(); + } + return Status::Error( + "field '{}' not found in inverted index metadata", field_name); + } + + const auto& stored_field_name = data_it->second.first; + const auto binding_key = binding_key_for(stored_field_name, query_type); + + auto cache_it = _cache.find(binding_key); + if (cache_it != _cache.end()) { + *binding = cache_it->second; + return Status::OK(); + } + + auto iterator_it = _iterators.find(field_name); + if (iterator_it == _iterators.end() || iterator_it->second == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=no_iterator " + "logical_field={} stored_field={} query_type={} " + "reason=iterator_not_found", + field_name, stored_field_name, query_type_to_string(query_type))); + *binding = FieldReaderBinding(); + return Status::OK(); + } + return Status::Error( + "iterator not found for field '{}'", field_name); + } + + auto* inverted_iterator = dynamic_cast(iterator_it->second); + if (inverted_iterator == nullptr) { + return Status::Error( + "iterator for field '{}' is not InvertedIndexIterator", field_name); + } + + InvertedIndexQueryType effective_query_type = query_type; + const auto& column_type = data_it->second.second; + const bool is_text_field = + column_type != nullptr && is_string_type(column_type->get_storage_field_type()); + auto fb_it = _field_binding_map.find(field_name); + std::string analyzer_key; + if (is_text_field && is_variant_sub && fb_it != _field_binding_map.end() && + fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { + analyzer_key = normalize_analyzer_key( + build_analyzer_key_from_properties(fb_it->second->index_properties)); + if (inverted_index::InvertedIndexAnalyzer::should_analyzer( + fb_it->second->index_properties) && + (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || + effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { + effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + } + } + + Result reader_result; + if (column_type) { + reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type, + is_text_field ? analyzer_key : ""); + } else { + reader_result = inverted_iterator->select_best_reader(is_text_field ? analyzer_key : ""); + } + + if (!reader_result.has_value()) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} analyzer_key={} reason={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), analyzer_key, + reader_result.error().to_string())); + } + return reader_result.error(); + } + + auto inverted_reader = reader_result.value(); + if (inverted_reader == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} reason=selected_reader_null", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type))); + } + return Status::Error( + "selected reader is null for field '{}'", field_name); + } + + FieldReaderBinding resolved; + resolved.logical_field_name = field_name; + resolved.stored_field_name = stored_field_name; + resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); + resolved.column_type = column_type; + resolved.query_type = effective_query_type; + resolved.inverted_reader = inverted_reader; + resolved.binding_key = binding_key; + resolved.state = SearchFieldBindingState::BOUND; + if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && + !fb_it->second->index_properties.empty()) { + resolved.index_properties = fb_it->second->index_properties; + } else { + resolved.index_properties = inverted_reader->get_index_properties(); + } + resolved.analyzer_key = + normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); + + auto index_file_reader = inverted_reader->get_index_file_reader(); + if (index_file_reader == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} index_id={} suffix={} " + "reason=index_file_reader_null", + field_name, stored_field_name, inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix())); + } + return Status::Error( + "index file reader is null for field '{}'", field_name); + } + + if (inverted_reader->type() == InvertedIndexReaderType::BKD) { + _cache.emplace(binding_key, resolved); + if (is_variant_sub) { + bool index_file_exists = false; + auto probe_status = index_file_reader->index_file_exist( + &inverted_reader->get_index_meta(), &index_file_exists); + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=selected_direct " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} index_id={} suffix={} reader_type={} " + "index_file_exists={} probe_status={} index_file={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), + inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix(), + reader_type_to_string(inverted_reader->type()), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string(), + index_file_reader->get_index_file_path( + &inverted_reader->get_index_meta()))); + } + *binding = resolved; + return Status::OK(); + } + + auto index_file_key = + index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); + InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); + InvertedIndexCacheHandle searcher_cache_handle; + + bool searcher_cache_enabled = + _context->runtime_state != nullptr && + _context->runtime_state->query_options().enable_inverted_index_searcher_cache; + + bool cache_hit = false; + if (searcher_cache_enabled) { + int64_t lookup_dummy = 0; + SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_lookup_timer + : &lookup_dummy); + cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, + &searcher_cache_handle); + } + + std::shared_ptr reader_holder; + if (cache_hit) { + if (_context->stats) { + _context->stats->inverted_index_searcher_cache_hit++; + } + auto searcher_variant = searcher_cache_handle.get_index_searcher(); + auto* searcher_ptr = std::get_if(&searcher_variant); + if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*searcher_ptr)->getReader(), [](lucene::index::IndexReader*) {}); + } + } + + if (!reader_holder) { + if (_context->stats) { + _context->stats->inverted_index_searcher_cache_miss++; + } + int64_t dummy_timer = 0; + SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer + : &dummy_timer); + RETURN_IF_ERROR( + index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); + auto directory = DORIS_TRY( + index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); + + auto index_searcher_builder = DORIS_TRY( + IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); + auto searcher_result = + DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); + auto reader_size = index_searcher_builder->get_reader_size(); + + auto* stream = static_cast(directory.get())->getDorisIndexInput(); + DBUG_EXECUTE_IF( + "FieldReaderResolver.resolve.io_ctx", ({ + const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext(); + if (cur_io_ctx->file_cache_stats) { + if (cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) { + LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: " + << cur_io_ctx->file_cache_stats << " vs " + << &_context->stats->file_cache_stats; + } + } + })); + stream->setIoContext(nullptr); + stream->setIndexFile(false); + + auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), + reader_size, UnixMillis()); + InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, + &searcher_cache_handle); + + auto new_variant = searcher_cache_handle.get_index_searcher(); + auto* new_ptr = std::get_if(&new_variant); + if (new_ptr != nullptr && *new_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*new_ptr)->getReader(), [](lucene::index::IndexReader*) {}); + } + + if (!reader_holder) { + return Status::Error( + "failed to build IndexSearcher for field '{}'", field_name); + } + } + + _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); + + resolved.lucene_reader = reader_holder; + _binding_readers[binding_key] = reader_holder; + _field_readers[resolved.stored_field_wstr] = reader_holder; + _readers.emplace_back(reader_holder); + _cache.emplace(binding_key, resolved); + if (is_variant_sub) { + bool index_file_exists = false; + auto probe_status = index_file_reader->index_file_exist(&inverted_reader->get_index_meta(), + &index_file_exists); + add_search_binding_diagnostic( + _context, + fmt::format( + "[VariantSearchBinding] phase=field_resolve result=selected " + "logical_field={} stored_field={} query_type={} effective_query_type={} " + "index_id={} suffix={} reader_type={} analyzer_key={} " + "field_pattern={} index_file_exists={} probe_status={} " + "searcher_cache={} index_file={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix(), + reader_type_to_string(inverted_reader->type()), resolved.analyzer_key, + inverted_reader->get_index_meta().field_pattern(), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string(), + cache_hit ? "hit" : "miss", + index_file_reader->get_index_file_path( + &inverted_reader->get_index_meta()))); + } + *binding = resolved; + return Status::OK(); +} + +segment_v2::IndexIterator* VariantSearchNullBitmapAdapter::iterator_for( + const query_v2::Scorer& /*scorer*/, const std::string& logical_field) const { + if (logical_field.empty()) { + return nullptr; + } + return _resolver.get_iterator(logical_field); +} + +void populate_variant_search_binding_context(const FieldReaderResolver& resolver, + query_v2::QueryExecutionContext* exec_ctx) { + DCHECK(exec_ctx != nullptr); + exec_ctx->readers = resolver.readers(); + exec_ctx->reader_bindings = resolver.reader_bindings(); + exec_ctx->field_reader_bindings = resolver.field_readers(); + for (const auto& [binding_key, binding] : resolver.binding_cache()) { + if (binding_key.empty()) { + continue; + } + query_v2::FieldBindingContext binding_ctx; + binding_ctx.logical_field_name = binding.logical_field_name; + binding_ctx.stored_field_name = binding.stored_field_name; + binding_ctx.stored_field_wstr = binding.stored_field_wstr; + exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx)); + } +} + +query_v2::QueryExecutionContext build_variant_search_query_execution_context( + uint32_t segment_num_rows, const FieldReaderResolver& resolver, + query_v2::NullBitmapResolver* null_resolver) { + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = segment_num_rows; + populate_variant_search_binding_context(resolver, &exec_ctx); + exec_ctx.null_resolver = null_resolver; + return exec_ctx; +} + +namespace { + +class VariantNestedDocMappingWeight final : public query_v2::Weight { +public: + VariantNestedDocMappingWeight( + query_v2::WeightPtr child_weight, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) + : _child_weight(std::move(child_weight)), + _child_to_parent_chain(std::move(child_to_parent_chain)), + _read_provider(read_provider), + _column_iter_opts(std::move(column_iter_opts)) {} + + query_v2::ScorerPtr scorer(const query_v2::QueryExecutionContext& context, + const std::string& binding_key) override { + if (_child_weight == nullptr || _read_provider == nullptr || + _child_to_parent_chain.empty()) { + return std::make_shared(); + } + + auto child_scorer = _child_weight->scorer(context, binding_key); + if (child_scorer == nullptr) { + return std::make_shared(); + } + + roaring::Roaring child_true; + uint32_t doc = child_scorer->doc(); + while (doc != query_v2::TERMINATED) { + child_true.add(doc); + doc = child_scorer->advance(); + } + + auto mapped_true = std::make_shared(); + if (!child_true.isEmpty()) { + auto status = _read_provider->map_elements_to_parent_ords( + _child_to_parent_chain, _column_iter_opts, child_true, mapped_true.get()); + if (!status.ok()) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "failed to map nested search true bitmap: {}", status.to_string()); + } + } + + std::shared_ptr mapped_null; + if (child_scorer->has_null_bitmap(context.null_resolver)) { + const auto* child_null = child_scorer->get_null_bitmap(context.null_resolver); + if (child_null != nullptr && !child_null->isEmpty()) { + mapped_null = std::make_shared(); + auto status = _read_provider->map_elements_to_parent_ords( + _child_to_parent_chain, _column_iter_opts, *child_null, mapped_null.get()); + if (!status.ok()) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "failed to map nested search null bitmap: {}", + status.to_string()); + } + *mapped_null -= *mapped_true; + if (mapped_null->isEmpty()) { + mapped_null.reset(); + } + } + } + + if (mapped_true->isEmpty() && (mapped_null == nullptr || mapped_null->isEmpty())) { + return std::make_shared(); + } + return std::make_shared(std::move(mapped_true), + std::move(mapped_null)); + } + +private: + query_v2::WeightPtr _child_weight; + std::vector _child_to_parent_chain; + const segment_v2::NestedGroupReadProvider* _read_provider; + segment_v2::ColumnIteratorOptions _column_iter_opts; +}; + +class VariantNestedDocMappingQuery final : public query_v2::Query { +public: + VariantNestedDocMappingQuery( + query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) + : _child_query(std::move(child_query)), + _child_to_parent_chain(std::move(child_to_parent_chain)), + _read_provider(read_provider), + _column_iter_opts(std::move(column_iter_opts)) {} + + query_v2::WeightPtr weight(bool enable_scoring) override { + if (_child_query == nullptr) { + return nullptr; + } + return std::make_shared(_child_query->weight(enable_scoring), + _child_to_parent_chain, + _read_provider, _column_iter_opts); + } + +private: + query_v2::QueryPtr _child_query; + std::vector _child_to_parent_chain; + const segment_v2::NestedGroupReadProvider* _read_provider; + segment_v2::ColumnIteratorOptions _column_iter_opts; +}; + +bool starts_with_root_field(const std::string& logical_field_name, const std::string& root_field) { + if (logical_field_name == root_field) { + return true; + } + return logical_field_name.size() > root_field.size() && + logical_field_name.compare(0, root_field.size(), root_field) == 0 && + logical_field_name[root_field.size()] == '.'; +} + +} // namespace + +query_v2::QueryPtr make_variant_nested_doc_mapping_query( + query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) { + if (child_to_parent_chain.empty()) { + return child_query; + } + return std::make_shared( + std::move(child_query), std::move(child_to_parent_chain), read_provider, + std::move(column_iter_opts)); +} + +Status map_variant_nested_leaf_query_to_active_group(const VariantNestedDocMapperContext& context, + const std::string& logical_field_name, + query_v2::QueryPtr* query) { + if (query == nullptr || *query == nullptr || context.variant_reader == nullptr || + context.read_provider == nullptr || context.active_group_chain.empty() || + context.root_field.empty()) { + return Status::OK(); + } + if (!starts_with_root_field(logical_field_name, context.root_field)) { + return Status::OK(); + } + + std::string relative_path; + if (logical_field_name.size() > context.root_field.size()) { + relative_path = logical_field_name.substr(context.root_field.size() + 1); + } + if (relative_path.empty()) { + return Status::OK(); + } + + auto [found, leaf_group_chain, _] = + context.variant_reader->collect_nested_group_chain(relative_path); + if (!found) { + return Status::OK(); + } + if (leaf_group_chain.size() < context.active_group_chain.size()) { + return Status::InvalidArgument( + "nested search leaf field '{}' is outside active nested path", logical_field_name); + } + for (size_t i = 0; i < context.active_group_chain.size(); ++i) { + if (leaf_group_chain[i] != context.active_group_chain[i]) { + return Status::InvalidArgument( + "nested search leaf field '{}' is outside active nested path", + logical_field_name); + } + } + if (leaf_group_chain.size() == context.active_group_chain.size()) { + return Status::OK(); + } + + std::vector child_to_parent_chain( + leaf_group_chain.begin() + context.active_group_chain.size(), leaf_group_chain.end()); + *query = make_variant_nested_doc_mapping_query(std::move(*query), + std::move(child_to_parent_chain), + context.read_provider, context.column_iter_opts); + return Status::OK(); +} + +Status VariantNestedSearchEvaluator::evaluate( + const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, + FieldReaderResolver& resolver, uint32_t num_rows, const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) const { + (void)num_rows; + (void)field_name_to_column_id; + if (!(nested_clause.__isset.nested_path)) { + return Status::InvalidArgument("NESTED clause missing nested_path"); + } + if (!(nested_clause.__isset.children) || nested_clause.children.empty()) { + return Status::InvalidArgument("NESTED clause missing inner query"); + } + if (result_bitmap == nullptr) { + result_bitmap = std::make_shared(); + } else { + *result_bitmap = roaring::Roaring(); + } + + std::string root_field = nested_clause.nested_path; + auto dot_pos = nested_clause.nested_path.find('.'); + if (dot_pos != std::string::npos) { + root_field = nested_clause.nested_path.substr(0, dot_pos); + } + if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) { + return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment"); + } + auto* segment = index_exec_ctx->segment(); + const int32_t ordinal = segment->tablet_schema()->field_index(root_field); + if (ordinal < 0) { + return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query", + root_field); + } + const ColumnId column_id = static_cast(ordinal); + + std::shared_ptr column_reader; + RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id), + &column_reader, + index_exec_ctx->column_iter_opts().stats)); + auto* variant_reader = dynamic_cast(column_reader.get()); + if (variant_reader == nullptr) { + return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field); + } + + std::string array_path; + if (dot_pos == std::string::npos) { + array_path = std::string(segment_v2::kRootNestedGroupPath); + } else { + array_path = nested_clause.nested_path.substr(dot_pos + 1); + } + + auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path); + if (!found || group_chain.empty()) { + return Status::OK(); + } + + auto read_provider = segment_v2::create_nested_group_read_provider(); + if (!read_provider || !read_provider->should_enable_nested_group_read_path()) { + return Status::NotSupported( + "NestedGroup search is an enterprise capability, not available in this build"); + } + + auto& leaf_group = group_chain.back(); + uint64_t total_elements = 0; + RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(), + leaf_group, &total_elements)); + if (total_elements == 0) { + return Status::OK(); + } + if (total_elements > std::numeric_limits::max()) { + return Status::InvalidArgument("nested element_count exceeds uint32_t max"); + } + + std::string default_operator = "or"; + if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { + default_operator = search_param.default_operator; + } + int32_t minimum_should_match = -1; + if (search_param.__isset.minimum_should_match) { + minimum_should_match = search_param.minimum_should_match; + } + + query_v2::QueryPtr inner_query; + std::string inner_binding_key; + VariantNestedDocMapperContext mapper_context; + mapper_context.root_field = root_field; + mapper_context.active_group_chain = group_chain; + mapper_context.variant_reader = variant_reader; + mapper_context.read_provider = read_provider.get(); + mapper_context.column_iter_opts = index_exec_ctx->column_iter_opts(); + resolver.set_leaf_query_mapper( + [mapper_context](const std::string& logical_field_name, query_v2::QueryPtr* query) { + return map_variant_nested_leaf_query_to_active_group(mapper_context, + logical_field_name, query); + }); + struct ScopedLeafMapperReset { + FieldReaderResolver& resolver; + ~ScopedLeafMapperReset() { resolver.set_leaf_query_mapper(nullptr); } + } mapper_reset {resolver}; + RETURN_IF_ERROR(_function_search.build_query_recursive( + nested_clause.children[0], context, resolver, &inner_query, &inner_binding_key, + default_operator, minimum_should_match, static_cast(total_elements))); + if (inner_query == nullptr) { + return Status::OK(); + } + + VariantSearchNullBitmapAdapter null_resolver(resolver); + query_v2::QueryExecutionContext exec_ctx = build_variant_search_query_execution_context( + static_cast(total_elements), resolver, &null_resolver); + + auto weight = inner_query->weight(false); + if (!weight) { + return Status::OK(); + } + auto scorer = weight->scorer(exec_ctx, inner_binding_key); + if (!scorer) { + return Status::OK(); + } + + roaring::Roaring element_bitmap; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + element_bitmap.add(doc); + doc = scorer->advance(); + } + + if (scorer->has_null_bitmap(exec_ctx.null_resolver)) { + const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); + if (bitmap != nullptr && !bitmap->isEmpty()) { + element_bitmap -= *bitmap; + } + } + + roaring::Roaring parent_bitmap; + RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords( + group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap)); + *result_bitmap = std::move(parent_bitmap); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/exprs/function/variant_inverted_index_search.h b/be/src/exprs/function/variant_inverted_index_search.h new file mode 100644 index 00000000000000..973c9c8c826c55 --- /dev/null +++ b/be/src/exprs/function/variant_inverted_index_search.h @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/block/columns_with_type_and_name.h" +#include "core/data_type/data_type.h" +#include "storage/index/index_query_context.h" +#include "storage/index/inverted/inverted_index_cache.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/inverted/inverted_index_reader.h" +#include "storage/index/inverted/query_v2/query.h" +#include "storage/index/inverted/query_v2/weight.h" +#include "storage/olap_common.h" +#include "storage/segment/column_reader.h" + +namespace doris::segment_v2::inverted_index::query_v2 { +class Query; +} + +namespace doris::segment_v2 { +class NestedGroupReadProvider; +struct NestedGroupReader; +class VariantColumnReader; +} // namespace doris::segment_v2 + +namespace doris { + +using namespace doris::segment_v2; + +class FunctionSearch; +class IndexExecContext; + +using SearchLeafQueryMapper = std::function*)>; + +enum class SearchFieldBindingState { + BOUND, + MISSING_IN_SEGMENT, +}; + +struct FieldReaderBinding { + std::string logical_field_name; + std::string stored_field_name; + std::wstring stored_field_wstr; + DataTypePtr column_type; + InvertedIndexQueryType query_type; + InvertedIndexReaderPtr inverted_reader; + std::shared_ptr lucene_reader; + std::map index_properties; + std::string binding_key; + std::string analyzer_key; + SearchFieldBindingState state = SearchFieldBindingState::MISSING_IN_SEGMENT; + + bool is_bound() const { + return state == SearchFieldBindingState::BOUND || inverted_reader != nullptr || + lucene_reader != nullptr; + } + bool use_direct_index_reader() const { + return is_bound() && inverted_reader != nullptr && lucene_reader == nullptr; + } +}; + +class FieldReaderResolver { +public: + FieldReaderResolver( + const std::unordered_map& data_type_with_names, + const std::unordered_map& iterators, + std::shared_ptr context, + const std::vector& field_bindings = {}); + + Status resolve(const std::string& field_name, InvertedIndexQueryType query_type, + FieldReaderBinding* binding); + + bool is_variant_subcolumn(const std::string& field_name) const { + return _variant_subcolumn_fields.count(field_name) > 0; + } + + const std::vector>& readers() const { + return _readers; + } + + const std::unordered_map>& + reader_bindings() const { + return _binding_readers; + } + + const std::unordered_map>& + field_readers() const { + return _field_readers; + } + + const std::unordered_map& binding_cache() const { + return _cache; + } + + IndexIterator* get_iterator(const std::string& field_name) const { + auto it = _iterators.find(field_name); + return (it != _iterators.end()) ? it->second : nullptr; + } + + void set_leaf_query_mapper(SearchLeafQueryMapper mapper) { + _leaf_query_mapper = std::move(mapper); + } + + Status map_leaf_query( + const std::string& field_name, + std::shared_ptr* query) const { + if (!_leaf_query_mapper || query == nullptr || *query == nullptr) { + return Status::OK(); + } + return _leaf_query_mapper(field_name, query); + } + +private: + std::string binding_key_for(const std::string& stored_field_name, + InvertedIndexQueryType query_type) const { + return stored_field_name + "#" + std::to_string(static_cast(query_type)); + } + + const std::unordered_map& _data_type_with_names; + const std::unordered_map& _iterators; + std::shared_ptr _context; + std::vector _field_bindings; + std::unordered_map _field_binding_map; + std::unordered_set _variant_subcolumn_fields; + std::unordered_map _cache; + std::vector> _readers; + std::unordered_map> _binding_readers; + std::unordered_map> _field_readers; + std::vector _searcher_cache_handles; + SearchLeafQueryMapper _leaf_query_mapper; +}; + +class VariantSearchNullBitmapAdapter final : public inverted_index::query_v2::NullBitmapResolver { +public: + explicit VariantSearchNullBitmapAdapter(const FieldReaderResolver& resolver) + : _resolver(resolver) {} + + segment_v2::IndexIterator* iterator_for(const inverted_index::query_v2::Scorer& scorer, + const std::string& logical_field) const override; + +private: + const FieldReaderResolver& _resolver; +}; + +void populate_variant_search_binding_context( + const FieldReaderResolver& resolver, + inverted_index::query_v2::QueryExecutionContext* exec_ctx); + +inverted_index::query_v2::QueryExecutionContext build_variant_search_query_execution_context( + uint32_t segment_num_rows, const FieldReaderResolver& resolver, + inverted_index::query_v2::NullBitmapResolver* null_resolver); + +struct VariantNestedDocMapperContext { + std::string root_field; + std::vector active_group_chain; + const segment_v2::VariantColumnReader* variant_reader = nullptr; + const segment_v2::NestedGroupReadProvider* read_provider = nullptr; + segment_v2::ColumnIteratorOptions column_iter_opts; +}; + +Status map_variant_nested_leaf_query_to_active_group(const VariantNestedDocMapperContext& context, + const std::string& logical_field_name, + inverted_index::query_v2::QueryPtr* query); + +inverted_index::query_v2::QueryPtr make_variant_nested_doc_mapping_query( + inverted_index::query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts); + +class VariantNestedSearchEvaluator { +public: + explicit VariantNestedSearchEvaluator(const FunctionSearch& function_search) + : _function_search(function_search) {} + + Status evaluate(const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, + FieldReaderResolver& resolver, uint32_t num_rows, + const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) const; + +private: + const FunctionSearch& _function_search; +}; + +} // namespace doris diff --git a/be/src/exprs/vsearch.cpp b/be/src/exprs/vsearch.cpp index f4ed11e95fcd59..3b5da89d29011e 100644 --- a/be/src/exprs/vsearch.cpp +++ b/be/src/exprs/vsearch.cpp @@ -17,6 +17,8 @@ #include "exprs/vsearch.h" +#include + #include #include @@ -30,6 +32,7 @@ #include "glog/logging.h" #include "runtime/runtime_state.h" #include "storage/index/inverted/inverted_index_reader.h" +#include "storage/olap_common.h" #include "storage/segment/segment.h" namespace doris { @@ -45,6 +48,18 @@ struct SearchInputBundle { ColumnsWithTypeAndName literal_args; }; +void add_search_binding_diagnostic(const IndexExecContext* index_context, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (index_context == nullptr) { + return; + } + const auto& index_query_context = index_context->get_index_query_context(); + if (index_query_context != nullptr && index_query_context->stats != nullptr) { + index_query_context->stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, SearchInputBundle* bundle) { DCHECK(bundle != nullptr); @@ -158,9 +173,24 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, if (base_column_index >= 0) { bundle->column_ids.emplace_back(base_column_index); } + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=parent_fallback logical_field={} " + "parent_field={} sub_path={} base_column_id={} " + "stored_field={} reason=slot_iterator_missing", + field_name, binding->parent_field_name, sub_path, + base_column_id, prefix + "." + sub_path)); field_added = true; } } + } else { + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=reject logical_field={} parent_field={} " + "reason=parent_column_not_found", + field_name, binding->parent_field_name)); } } @@ -174,6 +204,15 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, bundle->iterators.emplace(field_name, iterator); bundle->field_types.emplace(field_name, *storage_name_type); bundle->column_ids.emplace_back(column_id); + if (binding != nullptr && binding->__isset.is_variant_subcolumn && + binding->is_variant_subcolumn) { + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=direct_iterator logical_field={} column_id={} " + "stored_field={}", + field_name, column_id, storage_name_type->first)); + } } child_index++; @@ -187,6 +226,18 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, field_bindings[child_index].__isset.is_variant_subcolumn && field_bindings[child_index].is_variant_subcolumn) { // Variant subcolumn not materialized - skip, will create empty BitSetQuery in function_search + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=unmaterialized_element_at logical_field={} " + "parent_field={} sub_path={} reason=no_slot_ref", + field_bindings[child_index].field_name, + field_bindings[child_index].__isset.parent_field_name + ? field_bindings[child_index].parent_field_name + : "", + field_bindings[child_index].__isset.subcolumn_path + ? field_bindings[child_index].subcolumn_path + : "")); child_index++; continue; } @@ -252,6 +303,11 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm if (bundle.iterators.empty() && !is_nested_query) { LOG(WARNING) << "VSearchExpr: No indexed columns available for evaluation, DSL: " << _original_dsl; + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=evaluate_search result=no_iterator " + "dsl={} reason=no_indexed_columns", + _original_dsl)); auto empty_bitmap = InvertedIndexResultBitmap(std::make_shared(), std::make_shared()); index_context->set_index_result_for_expr(this, std::move(empty_bitmap)); diff --git a/be/src/storage/index/inverted/inverted_index_profile.h b/be/src/storage/index/inverted/inverted_index_profile.h index 393c33d711b3c1..eddb20a990503f 100644 --- a/be/src/storage/index/inverted/inverted_index_profile.h +++ b/be/src/storage/index/inverted/inverted_index_profile.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -31,6 +32,14 @@ class InvertedIndexProfileReporter { ~InvertedIndexProfileReporter() = default; void update(RuntimeProfile* profile, const InvertedIndexStatistics* statistics) { + if (!statistics->binding_diagnostics.empty()) { + std::string info; + for (const auto& diagnostic : statistics->binding_diagnostics) { + info += "\n" + diagnostic; + } + profile->add_info_string("VariantSearchBindingDiagnostics", info); + } + // Determine the iteration limit: the smaller of 20 or the size of statistics->stats size_t iteration_limit = std::min(20, statistics->stats.size()); diff --git a/be/src/storage/index/inverted/inverted_index_stats.h b/be/src/storage/index/inverted/inverted_index_stats.h index b82b230f41d71e..863a5bf0219776 100644 --- a/be/src/storage/index/inverted/inverted_index_stats.h +++ b/be/src/storage/index/inverted/inverted_index_stats.h @@ -17,6 +17,9 @@ #pragma once +#include +#include +#include #include namespace doris { @@ -28,7 +31,18 @@ struct InvertedIndexQueryStatistics { }; struct InvertedIndexStatistics { + void add_binding_diagnostic(std::string diagnostic) { + if (binding_diagnostics.size() >= kMaxBindingDiagnostics) { + return; + } + binding_diagnostics.emplace_back(std::move(diagnostic)); + } + std::vector stats; + std::vector binding_diagnostics; + +private: + static constexpr size_t kMaxBindingDiagnostics = 64; }; } // namespace doris diff --git a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h index 5531fb8e62aaf3..35528905471ff3 100644 --- a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h +++ b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h @@ -28,18 +28,22 @@ namespace doris::segment_v2::inverted_index::query_v2 { class BitSetQuery : public Query { public: explicit BitSetQuery(std::shared_ptr bitmap) : _bitmap(std::move(bitmap)) {} + BitSetQuery(std::shared_ptr bitmap, + std::shared_ptr null_bitmap) + : _bitmap(std::move(bitmap)), _null_bitmap(std::move(null_bitmap)) {} BitSetQuery(const roaring::Roaring& bitmap) : _bitmap(std::make_shared(bitmap)) {} ~BitSetQuery() override = default; WeightPtr weight(bool /*enable_scoring*/) override { - return std::make_shared(_bitmap); + return std::make_shared(_bitmap, _null_bitmap); } private: std::shared_ptr _bitmap; + std::shared_ptr _null_bitmap; }; using BitSetQueryPtr = std::shared_ptr; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h index f1a726edd8fc3d..6d3f9b8f038363 100644 --- a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h +++ b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h @@ -33,10 +33,12 @@ class BitSetWeight final : public Weight { ~BitSetWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& /*context*/) override { - if (_bitmap == nullptr || _bitmap->isEmpty()) { + if ((_bitmap == nullptr || _bitmap->isEmpty()) && + (_null_bitmap == nullptr || _null_bitmap->isEmpty())) { return std::make_shared(); } - return std::make_shared(_bitmap, _null_bitmap); + auto bitmap = _bitmap ? _bitmap : std::make_shared(); + return std::make_shared(std::move(bitmap), _null_bitmap); } private: diff --git a/be/src/storage/segment/segment.cpp b/be/src/storage/segment/segment.cpp index 7563299a856826..e3df7b70f34dc9 100644 --- a/be/src/storage/segment/segment.cpp +++ b/be/src/storage/segment/segment.cpp @@ -802,7 +802,52 @@ Status Segment::new_index_iterator(const TabletColumn& tablet_column, const Tabl // to avoid data race during parallel method calls RETURN_IF_ERROR(_index_file_reader_open.call([&] { return _open_index_file_reader(); })); // after DorisCallOnce.call, _index_file_reader is guaranteed to be not nullptr - RETURN_IF_ERROR(reader->new_index_iterator(_index_file_reader, index_meta, iter)); + const bool need_binding_diagnostic = tablet_column.is_variant_type() || + tablet_column.is_extracted_column() || + !index_meta->get_index_suffix().empty(); + bool index_file_exists = false; + Status probe_status; + if (need_binding_diagnostic) { + probe_status = _index_file_reader->init(config::inverted_index_read_buffer_size, + &read_options.io_ctx); + if (probe_status.ok()) { + probe_status = _index_file_reader->index_file_exist(index_meta, &index_file_exists); + } + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=index_file_probe tablet_id={} rowset_id={} " + "segment_id={} column={} logical_path={} index_id={} suffix={} exists={} " + "status={}", + read_options.tablet_id, _rowset_id.to_string(), _segment_id, + tablet_column.name(), + tablet_column.has_path_info() ? tablet_column.path_info_ptr()->get_path() + : tablet_column.name(), + index_meta->index_id(), index_meta->get_index_suffix(), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string()); + VLOG_DEBUG << diagnostic; + if (read_options.stats != nullptr) { + read_options.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } + } + Status iter_status = reader->new_index_iterator(_index_file_reader, index_meta, iter); + if (!iter_status.ok()) { + if (need_binding_diagnostic) { + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=index_iterator_create result=reject " + "tablet_id={} rowset_id={} segment_id={} column={} logical_path={} " + "index_id={} suffix={} reason={}", + read_options.tablet_id, _rowset_id.to_string(), _segment_id, + tablet_column.name(), + tablet_column.has_path_info() ? tablet_column.path_info_ptr()->get_path() + : tablet_column.name(), + index_meta->index_id(), index_meta->get_index_suffix(), + iter_status.to_string()); + VLOG_DEBUG << diagnostic; + if (read_options.stats != nullptr) { + read_options.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } + } + return iter_status; + } return Status::OK(); } return Status::OK(); diff --git a/be/src/storage/segment/segment_iterator.cpp b/be/src/storage/segment/segment_iterator.cpp index 3d542307318ae7..3a9c37aafcd5f4 100644 --- a/be/src/storage/segment/segment_iterator.cpp +++ b/be/src/storage/segment/segment_iterator.cpp @@ -1600,8 +1600,8 @@ Status SegmentIterator::_init_index_iterators() { data_type = inferred_type; } } - inverted_indexs_holder = - variant_reader->find_subcolumn_tablet_indexes(column, data_type); + inverted_indexs_holder = variant_reader->find_subcolumn_tablet_indexes( + column, data_type, _opts.stats); // Extract raw pointers from shared_ptr for iteration for (const auto& index_ptr : inverted_indexs_holder) { inverted_indexs.push_back(index_ptr.get()); @@ -1611,9 +1611,38 @@ Status SegmentIterator::_init_index_iterators() { else { inverted_indexs = _segment->_tablet_schema->inverted_indexs(column); } + if (column.is_extracted_column() && inverted_indexs.empty() && _opts.stats != nullptr) { + const auto relative_path = column.path_info_ptr()->copy_pop_front().get_path(); + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=init_index_iterators " + "result=no_candidate tablet_id={} rowset_id={} segment_id={} cid={} " + "logical_path={} relative_path={} materialized_column={}", + _tablet_id, _segment->rowset_id().to_string(), _segment->id(), cid, + column.path_info_ptr()->get_path(), relative_path, column.name()); + VLOG_DEBUG << diagnostic; + _opts.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } for (const auto& inverted_index : inverted_indexs) { + const bool had_iterator = _index_iterators[cid] != nullptr; RETURN_IF_ERROR(_segment->new_index_iterator(column, inverted_index, _opts, &_index_iterators[cid])); + if ((column.is_extracted_column() || column.is_variant_type()) && + _opts.stats != nullptr) { + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=init_index_iterators " + "result={} tablet_id={} rowset_id={} segment_id={} cid={} " + "logical_path={} materialized_column={} index_id={} suffix={} " + "field_pattern={} iterator_state={}", + _index_iterators[cid] == nullptr ? "no_iterator" : "accepted", + _tablet_id, _segment->rowset_id().to_string(), _segment->id(), cid, + column.has_path_info() ? column.path_info_ptr()->get_path() + : column.name(), + column.name(), inverted_index->index_id(), + inverted_index->get_index_suffix(), inverted_index->field_pattern(), + had_iterator ? "preserved" : "created"); + VLOG_DEBUG << diagnostic; + _opts.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } } if (_index_iterators[cid] != nullptr) { _index_iterators[cid]->set_context(_index_query_context); diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index a072eba289d7dc..ab63cb4f13f944 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -17,6 +17,7 @@ #include "storage/segment/variant/variant_column_reader.h" +#include #include #include @@ -28,6 +29,7 @@ #include #include "common/config.h" +#include "common/logging.h" #include "common/status.h" #include "core/assert_cast.h" #include "core/column/column_array.h" @@ -41,6 +43,7 @@ #include "io/fs/file_reader.h" #include "runtime/descriptors.h" #include "storage/key_coder.h" +#include "storage/olap_common.h" #include "storage/segment/column_meta_accessor.h" #include "storage/segment/column_reader.h" #include "storage/segment/column_reader_cache.h" @@ -67,6 +70,14 @@ bool is_compaction_or_checksum_reader(const StorageReadOptions* opts) { opts->io_ctx.reader_type == ReaderType::READER_CHECKSUM); } +void add_variant_search_binding_diagnostic(OlapReaderStatistics* stats, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (stats != nullptr) { + stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + // Nested-group whole/root-merge iterators dereference NestedGroupReader state that is owned by // VariantColumnReader. Hold the owning reader until the iterator itself is destroyed so query-time // iterator initialization cannot outlive the reader and hit a UAF. @@ -1418,11 +1429,14 @@ Status VariantColumnReader::load_external_meta_once() { } TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletColumn& column, - const DataTypePtr& data_type) { + const DataTypePtr& data_type, + OlapReaderStatistics* stats) { TabletSchema::SubColumnInfo sub_column_info; const auto& parent_index = _tablet_schema->inverted_indexs(column.parent_unique_id()); auto relative_path = column.path_info_ptr()->copy_pop_front(); DataTypePtr index_data_type = data_type; + const std::string logical_path = column.path_info_ptr()->get_path(); + const std::string relative_path_str = relative_path.get_path(); if (!relative_path.empty()) { auto [found, group_chain, child_path] = @@ -1443,6 +1457,16 @@ TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletCol if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), relative_path.get_path(), &sub_column_info) && !sub_column_info.indexes.empty()) { + for (const auto& index : sub_column_info.indexes) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=direct logical_path={} relative_path={} " + "materialized_column={} index_id={} suffix={} field_pattern={} " + "reason=generated_subcolumn_info", + logical_path, relative_path_str, column.name(), index->index_id(), + index->get_index_suffix(), index->field_pattern())); + } return sub_column_info.indexes; } @@ -1458,6 +1482,31 @@ TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletCol .parent_unique_id = column.parent_unique_id(), .path_info = index_path}); variant_util::inherit_index(parent_index, sub_column_info.indexes, target_column); + for (const auto& index : sub_column_info.indexes) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=parent_inherited logical_path={} relative_path={} " + "materialized_column={} index_id={} suffix={} field_pattern={} " + "reason=no_direct_subcolumn_index", + logical_path, relative_path_str, column.name(), index->index_id(), + index->get_index_suffix(), index->field_pattern())); + } + } else if (parent_index.empty()) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=none logical_path={} relative_path={} materialized_column={} " + "reason=parent_index_missing", + logical_path, relative_path_str, column.name())); + } else { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=none logical_path={} relative_path={} materialized_column={} " + "data_type={} reason=unsupported_inherited_index_type", + logical_path, relative_path_str, column.name(), + index_data_type ? index_data_type->get_name() : "null")); } // Return shared_ptr directly to maintain object lifetime return sub_column_info.indexes; diff --git a/be/src/storage/segment/variant/variant_column_reader.h b/be/src/storage/segment/variant/variant_column_reader.h index d9f35730e62efd..af9249e3e2ce81 100644 --- a/be/src/storage/segment/variant/variant_column_reader.h +++ b/be/src/storage/segment/variant/variant_column_reader.h @@ -52,6 +52,7 @@ namespace doris { class TabletIndex; class StorageReadOptions; class TabletSchema; +struct OlapReaderStatistics; namespace segment_v2 { @@ -215,7 +216,8 @@ class VariantColumnReader : public ColumnReader { // Return shared_ptr to ensure the lifetime of TabletIndex objects TabletIndexes find_subcolumn_tablet_indexes(const TabletColumn& target_column, - const DataTypePtr& data_type); + const DataTypePtr& data_type, + OlapReaderStatistics* stats = nullptr); bool exist_in_sparse_column(const PathInData& path) const; diff --git a/be/test/exprs/function/function_search_nested_test.cpp b/be/test/exprs/function/function_search_nested_test.cpp index b44587ba707d1f..1861e26131bc10 100644 --- a/be/test/exprs/function/function_search_nested_test.cpp +++ b/be/test/exprs/function/function_search_nested_test.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Unit tests for FunctionSearch::evaluate_nested_query and NESTED clause handling. +// Unit tests for VariantNestedSearchEvaluator and NESTED clause handling. // Migrated from function_search_test.cpp for maintainability. #include @@ -23,19 +23,154 @@ #include #include +#include #include +#include "common/exception.h" #include "core/block/block.h" #include "exprs/function/function_search.h" +#include "exprs/function/variant_inverted_index_search.h" +#include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h" +#include "storage/index/inverted/query_v2/query.h" +#include "storage/index/inverted/query_v2/weight.h" #include "storage/segment/variant/nested_group_provider.h" +#include "storage/segment/variant/variant_column_reader.h" namespace doris { +class FakeNestedGroupReadProvider final : public segment_v2::NestedGroupReadProvider { +public: + bool should_enable_nested_group_read_path() const override { return true; } + + Status init_readers(const segment_v2::ColumnReaderOptions&, + const std::shared_ptr&, + const std::shared_ptr&, segment_v2::ColumnMetaAccessor*, + int32_t, uint64_t, segment_v2::NestedGroupReaders&) override { + return Status::NotSupported("not implemented"); + } + + bool try_build_read_plan(const TabletSchema*, const segment_v2::NestedGroupReaders&, + const TabletColumn&, const StorageReadOptions*, int32_t, + const PathInData&, bool*, DataTypePtr*, PathInData*, std::string*, + std::string*, std::vector*, + std::optional*) const override { + return false; + } + + Status create_nested_group_iterator(bool, + const std::vector&, + const std::string&, const std::string&, + const std::optional&, + segment_v2::ColumnIteratorUPtr*, DataTypePtr*) override { + return Status::NotSupported("not implemented"); + } + + Status get_total_elements(const segment_v2::ColumnIteratorOptions&, + const segment_v2::NestedGroupReader*, uint64_t*) const override { + return Status::NotSupported("not implemented"); + } + + Status create_root_merge_iterator(segment_v2::ColumnIteratorUPtr, + const segment_v2::NestedGroupReaders&, + const StorageReadOptions*, + segment_v2::ColumnIteratorUPtr*) override { + return Status::NotSupported("not implemented"); + } + + Status map_elements_to_parent_ords(const std::vector&, + const segment_v2::ColumnIteratorOptions&, + const roaring::Roaring& element_bitmap, + roaring::Roaring* parent_bitmap) const override { + for (auto doc : element_bitmap) { + parent_bitmap->add(doc / 2); + } + return Status::OK(); + } +}; + +class ErrorNestedGroupReadProvider final : public segment_v2::NestedGroupReadProvider { +public: + bool should_enable_nested_group_read_path() const override { return true; } + + Status init_readers(const segment_v2::ColumnReaderOptions&, + const std::shared_ptr&, + const std::shared_ptr&, segment_v2::ColumnMetaAccessor*, + int32_t, uint64_t, segment_v2::NestedGroupReaders&) override { + return Status::NotSupported("not implemented"); + } + + bool try_build_read_plan(const TabletSchema*, const segment_v2::NestedGroupReaders&, + const TabletColumn&, const StorageReadOptions*, int32_t, + const PathInData&, bool*, DataTypePtr*, PathInData*, std::string*, + std::string*, std::vector*, + std::optional*) const override { + return false; + } + + Status create_nested_group_iterator(bool, + const std::vector&, + const std::string&, const std::string&, + const std::optional&, + segment_v2::ColumnIteratorUPtr*, DataTypePtr*) override { + return Status::NotSupported("not implemented"); + } + + Status get_total_elements(const segment_v2::ColumnIteratorOptions&, + const segment_v2::NestedGroupReader*, uint64_t*) const override { + return Status::NotSupported("not implemented"); + } + + Status create_root_merge_iterator(segment_v2::ColumnIteratorUPtr, + const segment_v2::NestedGroupReaders&, + const StorageReadOptions*, + segment_v2::ColumnIteratorUPtr*) override { + return Status::NotSupported("not implemented"); + } + + Status map_elements_to_parent_ords(const std::vector&, + const segment_v2::ColumnIteratorOptions&, + const roaring::Roaring&, roaring::Roaring*) const override { + return Status::InternalError("forced mapping failure"); + } +}; + +class NullWeightQuery final : public inverted_index::query_v2::Query { +public: + inverted_index::query_v2::WeightPtr weight(bool) override { return nullptr; } +}; + +class NullScorerWeight final : public inverted_index::query_v2::Weight { +public: + inverted_index::query_v2::ScorerPtr scorer( + const inverted_index::query_v2::QueryExecutionContext&, + const std::string& = {}) override { + return nullptr; + } +}; + +class NullScorerQuery final : public inverted_index::query_v2::Query { +public: + inverted_index::query_v2::WeightPtr weight(bool) override { + return std::make_shared(); + } +}; + class FunctionSearchNestedTest : public testing::Test { public: void SetUp() override { function_search = std::make_shared(); } protected: + Status evaluate_nested_query( + const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, FieldReaderResolver& resolver, + uint32_t num_rows, const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) { + VariantNestedSearchEvaluator evaluator(*function_search); + return evaluator.evaluate(search_param, nested_clause, context, resolver, num_rows, + index_exec_ctx, field_name_to_column_id, result_bitmap); + } + std::shared_ptr function_search; }; @@ -88,6 +223,187 @@ TEST_F(FunctionSearchNestedTest, NestedClauseMustBeTopLevel) { std::string::npos); } +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryMapsTruthAndNullBitmaps) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); + true_bitmap->add(4); + auto null_bitmap = std::make_shared(); + null_bitmap->add(7); + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + + roaring::Roaring actual_true; + uint32_t doc = scorer->doc(); + while (doc != inverted_index::query_v2::TERMINATED) { + actual_true.add(doc); + doc = scorer->advance(); + } + + EXPECT_TRUE(actual_true.contains(1)); + EXPECT_TRUE(actual_true.contains(2)); + EXPECT_EQ(2, actual_true.cardinality()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* actual_null = scorer->get_null_bitmap(); + ASSERT_NE(nullptr, actual_null); + EXPECT_TRUE(actual_null->contains(3)); + EXPECT_EQ(1, actual_null->cardinality()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryReturnsChildWhenNoMappingChain) { + auto child_query = std::make_shared(roaring::Roaring()); + + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, {}, nullptr, + segment_v2::ColumnIteratorOptions {}); + + EXPECT_EQ(child_query, mapped_query); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryHandlesNullChildWeight) { + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + FakeNestedGroupReadProvider read_provider; + auto mapped_query = make_variant_nested_doc_mapping_query(std::make_shared(), + chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryHandlesNullChildScorer) { + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + FakeNestedGroupReadProvider read_provider; + auto mapped_query = make_variant_nested_doc_mapping_query(std::make_shared(), + chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryDropsNullsCoveredByTrueHits) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); // maps to parent 1 + auto null_bitmap = std::make_shared(); + null_bitmap->add(3); // also maps to parent 1, then is removed from null bitmap + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(1u, scorer->doc()); + EXPECT_FALSE(scorer->has_null_bitmap()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryReturnsEmptyForEmptyChildResult) { + auto child_query = std::make_shared(roaring::Roaring()); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryThrowsOnMappingError) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); + auto child_query = std::make_shared(true_bitmap); + + ErrorNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + EXPECT_THROW((void)weight->scorer(exec_ctx), Exception); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryThrowsOnNullBitmapMappingError) { + auto true_bitmap = std::make_shared(); + auto null_bitmap = std::make_shared(); + null_bitmap->add(3); + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + ErrorNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + EXPECT_THROW((void)weight->scorer(exec_ctx), Exception); +} + +TEST_F(FunctionSearchNestedTest, VariantNestedLeafMapperEarlyExitBranches) { + inverted_index::query_v2::QueryPtr query = + std::make_shared(roaring::Roaring()); + auto original_query = query; + VariantNestedDocMapperContext mapper_context; + + ASSERT_TRUE( + map_variant_nested_leaf_query_to_active_group(mapper_context, "data.items.msg", &query) + .ok()); + EXPECT_EQ(original_query, query); + + segment_v2::NestedGroupReader nested_group; + FakeNestedGroupReadProvider read_provider; + segment_v2::VariantColumnReader variant_reader; + mapper_context.root_field = "data"; + mapper_context.active_group_chain = {&nested_group}; + mapper_context.read_provider = &read_provider; + mapper_context.variant_reader = &variant_reader; + + ASSERT_TRUE(map_variant_nested_leaf_query_to_active_group(mapper_context, "metrics.items.msg", + &query) + .ok()); + EXPECT_EQ(original_query, query); + + ASSERT_TRUE(map_variant_nested_leaf_query_to_active_group(mapper_context, "data", &query).ok()); + EXPECT_EQ(original_query, query); +} + // =========================================================================== // Community-edition fallback: NESTED root → NOT_IMPLEMENTED_ERROR // =========================================================================== @@ -151,9 +467,8 @@ TEST_F(FunctionSearchNestedTest, MissingNestedPath) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing nested_path"), std::string::npos); @@ -178,9 +493,8 @@ TEST_F(FunctionSearchNestedTest, MissingChildren) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing inner query"), std::string::npos); @@ -205,9 +519,8 @@ TEST_F(FunctionSearchNestedTest, EmptyChildrenList) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing inner query"), std::string::npos); @@ -237,9 +550,8 @@ TEST_F(FunctionSearchNestedTest, NullExecContext) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("IndexExecContext"), std::string::npos); @@ -263,9 +575,8 @@ TEST_F(FunctionSearchNestedTest, InitializesNullResultBitmap) { std::shared_ptr result_bitmap; // nullptr std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); // Should fail (nested_path not set), but no crash on null bitmap EXPECT_FALSE(status.ok()); } @@ -304,9 +615,8 @@ TEST_F(FunctionSearchNestedTest, BitmapClearedAfterPassingValidation) { std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); // Will fail later (null context), but bitmap should be cleared EXPECT_FALSE(status.ok()); ASSERT_NE(nullptr, result_bitmap); @@ -338,9 +648,8 @@ TEST_F(FunctionSearchNestedTest, DottedNestedPath) { std::unordered_map field_to_col_id; // null context → InvalidArgument about segment - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("IndexExecContext"), std::string::npos); diff --git a/be/test/exprs/function/function_search_test.cpp b/be/test/exprs/function/function_search_test.cpp index ba11058ca5fce9..ac57847f32e4de 100644 --- a/be/test/exprs/function/function_search_test.cpp +++ b/be/test/exprs/function/function_search_test.cpp @@ -21,12 +21,21 @@ #include #include +#include #include #include #include +#include #include "core/block/block.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/primitive_type.h" +#include "storage/index/index_file_reader.h" #include "storage/index/index_iterator.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/inverted/inverted_index_parser.h" #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h" #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_weight.h" #include "storage/index/inverted/query_v2/phrase_query/phrase_query.h" @@ -60,6 +69,103 @@ class DummyIndexIterator : public segment_v2::IndexIterator { Result has_null() override { return false; } }; +class RecordingIndexIterator : public segment_v2::IndexIterator { +public: + segment_v2::IndexReaderPtr get_reader( + segment_v2::IndexReaderType /*reader_type*/) const override { + return nullptr; + } + + Status read_from_index(const segment_v2::IndexParam& param) override { + auto* i_param_ptr = std::get_if(¶m); + if (i_param_ptr == nullptr || *i_param_ptr == nullptr) { + return Status::InvalidArgument("missing inverted index param"); + } + auto* i_param = *i_param_ptr; + last_column_name = i_param->column_name; + last_column_storage_type = i_param->column_type == nullptr + ? FieldType::OLAP_FIELD_TYPE_UNKNOWN + : i_param->column_type->get_storage_field_type(); + last_query_type = i_param->query_type; + last_query_value_type = i_param->query_value.get_type(); + if (i_param->query_value.get_type() == TYPE_BOOLEAN) { + last_bool_value = i_param->query_value.get(); + } + if (i_param->query_value.get_type() == TYPE_INT) { + last_int_value = i_param->query_value.get(); + } + if (i_param->roaring != nullptr) { + i_param->roaring->add(3); + } + return Status::OK(); + } + + Status read_null_bitmap(segment_v2::InvertedIndexQueryCacheHandle* /*cache_handle*/) override { + return Status::OK(); + } + + Result has_null() override { return false; } + + std::string last_column_name; + FieldType last_column_storage_type = FieldType::OLAP_FIELD_TYPE_UNKNOWN; + segment_v2::InvertedIndexQueryType last_query_type = + segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY; + PrimitiveType last_query_value_type = PrimitiveType::TYPE_NULL; + bool last_bool_value = false; + Int32 last_int_value = 0; +}; + +class DummyInvertedIndexReader final : public segment_v2::InvertedIndexReader { +public: + explicit DummyInvertedIndexReader(const TabletIndex* index_meta) + : segment_v2::InvertedIndexReader(index_meta, nullptr) {} + + DummyInvertedIndexReader(const TabletIndex* index_meta, + std::shared_ptr index_file_reader, + segment_v2::InvertedIndexReaderType reader_type) + : segment_v2::InvertedIndexReader(index_meta, std::move(index_file_reader)), + _reader_type(reader_type) {} + + Status new_iterator(std::unique_ptr* /*iterator*/) override { + return Status::OK(); + } + + Status query(const segment_v2::IndexQueryContextPtr& /*context*/, + const std::string& /*column_name*/, const Field& /*query_value*/, + segment_v2::InvertedIndexQueryType /*query_type*/, + std::shared_ptr& /*bit_map*/, + const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/ = nullptr) override { + return Status::OK(); + } + + Status try_query(const segment_v2::IndexQueryContextPtr& /*context*/, + const std::string& /*column_name*/, const Field& /*query_value*/, + segment_v2::InvertedIndexQueryType /*query_type*/, + size_t* /*count*/) override { + return Status::OK(); + } + + segment_v2::InvertedIndexReaderType type() override { return _reader_type; } + +private: + segment_v2::InvertedIndexReaderType _reader_type = segment_v2::InvertedIndexReaderType::BKD; +}; + +static TabletIndex make_test_inverted_index( + int64_t index_id, const std::map& properties = {}) { + TabletIndex index_meta; + TabletIndexPB pb; + pb.set_index_type(IndexType::INVERTED); + pb.set_index_id(index_id); + pb.set_index_name("test_index_" + std::to_string(index_id)); + pb.add_col_unique_id(1); + for (const auto& [key, value] : properties) { + (*pb.mutable_properties())[key] = value; + } + index_meta.init_from_pb(pb); + return index_meta; +} + TEST_F(FunctionSearchTest, TestGetName) { EXPECT_EQ("search", function_search->get_name()); } @@ -1630,6 +1736,363 @@ TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) { EXPECT_NE(phrase_query, nullptr); } +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantMissingFieldReturnsUnknown) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.missing"; + clause.value = "value"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + std::unordered_map iterators; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.missing"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + bool mapper_called = false; + resolver.set_leaf_query_mapper([&](const std::string& logical_field, + inverted_index::query_v2::QueryPtr* query) -> Status { + mapper_called = true; + EXPECT_EQ("var.items.missing", logical_field); + EXPECT_NE(nullptr, query); + EXPECT_NE(nullptr, *query); + return Status::OK(); + }); + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 5); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_TRUE(mapper_called); + EXPECT_TRUE(out_binding_key.empty()); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 5; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(null_bitmap, nullptr); + EXPECT_EQ(5u, null_bitmap->cardinality()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantSubcolumnWithMissingIterator) { + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + std::unordered_map iterators; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + ASSERT_TRUE(status.ok()); + EXPECT_FALSE(binding.is_bound()); + EXPECT_TRUE(resolver.binding_cache().empty()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantSubcolumnWithReaderSelectionError) { + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + + segment_v2::InvertedIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.level"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(ErrorCode::INVERTED_INDEX_NO_TERMS, status.code()); +} + +TEST_F(FunctionSearchTest, + TestFieldReaderResolverVariantAnalyzerUpgradeWithMissingIndexFileReader) { + auto context = std::make_shared(); + + std::map properties; + properties[INVERTED_INDEX_PARSER_KEY] = INVERTED_INDEX_PARSER_STANDARD; + auto index_meta = make_test_inverted_index(11, properties); + auto reader = std::make_shared( + &index_meta, nullptr, segment_v2::InvertedIndexReaderType::FULLTEXT); + + segment_v2::InvertedIndexIterator iterator; + iterator.add_reader(segment_v2::InvertedIndexReaderType::FULLTEXT, reader); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.msg", + IndexFieldNameAndTypePair {"1.var.items.msg", std::make_shared()}); + std::unordered_map iterators; + iterators["var.items.msg"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.msg"; + field_binding.is_variant_subcolumn = true; + field_binding.index_properties = properties; + field_binding.__isset.is_variant_subcolumn = true; + field_binding.__isset.index_properties = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = resolver.resolve("var.items.msg", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND, status.code()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantBkdDirectReader) { + auto context = std::make_shared(); + + auto index_meta = make_test_inverted_index(12); + auto index_file_reader = std::make_shared( + nullptr, "/tmp/variant_direct_idx", InvertedIndexStorageFormatPB::V2); + auto reader = std::make_shared( + &index_meta, index_file_reader, segment_v2::InvertedIndexReaderType::BKD); + + segment_v2::InvertedIndexIterator iterator; + iterator.add_reader(segment_v2::InvertedIndexReaderType::BKD, reader); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + std::unordered_map iterators; + iterators["var.items.level"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + ASSERT_TRUE(status.ok()) << status.to_string(); + EXPECT_TRUE(binding.use_direct_index_reader()); + EXPECT_EQ(reader, binding.inverted_reader); + EXPECT_EQ("var.items.level", binding.logical_field_name); + EXPECT_EQ("1.var.items.level", binding.stored_field_name); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, binding.query_type); + + const auto& cache = resolver.binding_cache(); + ASSERT_EQ(1u, cache.size()); + EXPECT_TRUE(cache.begin()->second.use_direct_index_reader()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryDirectUnknownClauseUsesLeafMapper) { + TSearchClause clause; + clause.clause_type = "PHRASE"; + clause.field_name = "var.items.active"; + clause.value = "true"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto bool_type = + std::make_shared(make_nullable(std::make_shared())); + data_type_with_names.emplace("var.items.active", + IndexFieldNameAndTypePair {"1.var.items.active", bool_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.active"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.active"; + binding.stored_field_name = "1.var.items.active"; + binding.stored_field_wstr = L"1.var.items.active"; + binding.column_type = bool_type; + binding.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = resolver.binding_key_for("1.var.items.active", + InvertedIndexQueryType::MATCH_PHRASE_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + bool mapper_called = false; + resolver.set_leaf_query_mapper([&](const std::string& logical_field, + inverted_index::query_v2::QueryPtr* query) -> Status { + mapper_called = true; + EXPECT_EQ("var.items.active", logical_field); + EXPECT_NE(nullptr, query); + EXPECT_NE(nullptr, *query); + return Status::OK(); + }); + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 4); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_TRUE(mapper_called); + EXPECT_EQ(key, out_binding_key); + EXPECT_TRUE(iterator.last_column_name.empty()); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 4; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(null_bitmap, nullptr); + EXPECT_EQ(4u, null_bitmap->cardinality()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantBoolUsesDirectIndexReader) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.active"; + clause.value = "true"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto bool_type = + std::make_shared(make_nullable(std::make_shared())); + data_type_with_names.emplace("var.items.active", + IndexFieldNameAndTypePair {"1.var.items.active", bool_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.active"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.active"; + binding.stored_field_name = "1.var.items.active"; + binding.stored_field_wstr = L"1.var.items.active"; + binding.column_type = bool_type; + binding.query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = + resolver.binding_key_for("1.var.items.active", InvertedIndexQueryType::MATCH_ANY_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 10); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_EQ(key, out_binding_key); + EXPECT_EQ("1.var.items.active", iterator.last_column_name); + EXPECT_EQ(FieldType::OLAP_FIELD_TYPE_BOOL, iterator.last_column_storage_type); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, iterator.last_query_type); + EXPECT_EQ(TYPE_BOOLEAN, iterator.last_query_value_type); + EXPECT_TRUE(iterator.last_bool_value); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 10; + auto scorer = weight->scorer(exec_ctx, out_binding_key); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(3u, scorer->doc()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantNestedIntUsesDirectIndexReader) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.flags.level"; + clause.value = "3"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto int_type = std::make_shared(make_nullable( + std::make_shared(make_nullable(std::make_shared())))); + data_type_with_names.emplace("var.items.flags.level", + IndexFieldNameAndTypePair {"1.var.items.flags.level", int_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.flags.level"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.flags.level"; + binding.stored_field_name = "1.var.items.flags.level"; + binding.stored_field_wstr = L"1.var.items.flags.level"; + binding.column_type = int_type; + binding.query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = resolver.binding_key_for("1.var.items.flags.level", + InvertedIndexQueryType::MATCH_ANY_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 10); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_EQ(key, out_binding_key); + EXPECT_EQ("1.var.items.flags.level", iterator.last_column_name); + EXPECT_EQ(FieldType::OLAP_FIELD_TYPE_INT, iterator.last_column_storage_type); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, iterator.last_query_type); + EXPECT_EQ(TYPE_INT, iterator.last_query_value_type); + EXPECT_EQ(3, iterator.last_int_value); +} + TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) { using doris::segment_v2::InvertedIndexQueryInfo; using doris::segment_v2::TermInfo; diff --git a/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp b/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp index eb965d49db8d05..c9ceaba5288399 100644 --- a/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp +++ b/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp @@ -40,6 +40,28 @@ namespace doris::segment_v2 { using namespace inverted_index; +TEST(BitSetQueryTest, EmptyTruthBitmapPreservesNullBitmap) { + auto true_bitmap = std::make_shared(); + auto null_bitmap = std::make_shared(); + null_bitmap->addRange(0, 4); + + query_v2::BitSetQuery query(std::move(true_bitmap), std::move(null_bitmap)); + auto weight = query.weight(false); + ASSERT_NE(nullptr, weight); + + query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + + const auto* scorer_null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(nullptr, scorer_null_bitmap); + EXPECT_EQ(4, scorer_null_bitmap->cardinality()); + EXPECT_TRUE(scorer_null_bitmap->contains(0)); + EXPECT_TRUE(scorer_null_bitmap->contains(3)); +} + class BooleanQueryTest : public testing::Test { public: const std::string kTestDir1 = "./ut_dir/query_test1";