diff --git a/be/src/exprs/function/function_search.cpp b/be/src/exprs/function/function_search.cpp index 6c96da46c5be0c..9c33752598ccd3 100644 --- a/be/src/exprs/function/function_search.cpp +++ b/be/src/exprs/function/function_search.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -34,8 +35,11 @@ #include "common/status.h" #include "core/block/columns_with_type_and_name.h" #include "core/column/column_const.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" #include "core/data_type/data_type_string.h" #include "exprs/function/simple_function_factory.h" +#include "exprs/function/variant_inverted_index_search.h" #include "exprs/vexpr_context.h" #include "runtime/runtime_profile.h" #include "storage/index/index_file_reader.h" @@ -59,12 +63,11 @@ #include "storage/index/inverted/query_v2/term_query/term_query.h" #include "storage/index/inverted/query_v2/wildcard_query/wildcard_query.h" #include "storage/index/inverted/util/string_helper.h" -#include "storage/segment/segment.h" -#include "storage/segment/variant/nested_group_path.h" +#include "storage/olap_common.h" #include "storage/segment/variant/nested_group_provider.h" -#include "storage/segment/variant/variant_column_reader.h" #include "storage/types.h" #include "util/debug_points.h" +#include "util/string_parser.hpp" #include "util/string_util.h" #include "util/thrift_util.h" @@ -119,277 +122,102 @@ bool is_nested_group_search_supported() { return provider != nullptr && provider->should_enable_nested_group_read_path(); } -class ResolverNullBitmapAdapter final : public query_v2::NullBitmapResolver { -public: - explicit ResolverNullBitmapAdapter(const FieldReaderResolver& resolver) : _resolver(resolver) {} - - segment_v2::IndexIterator* iterator_for(const query_v2::Scorer& /*scorer*/, - const std::string& logical_field) const override { - if (logical_field.empty()) { - return nullptr; - } - return _resolver.get_iterator(logical_field); +query_v2::QueryPtr make_unknown_query(uint32_t num_rows) { + auto null_bitmap = std::make_shared(); + if (num_rows > 0) { + null_bitmap->addRange(0, num_rows); } + return std::make_shared(std::make_shared(), + std::move(null_bitmap)); +} -private: - const FieldReaderResolver& _resolver; -}; - -void populate_binding_context(const FieldReaderResolver& resolver, - query_v2::QueryExecutionContext* exec_ctx) { - DCHECK(exec_ctx != nullptr); - exec_ctx->readers = resolver.readers(); - exec_ctx->reader_bindings = resolver.reader_bindings(); - exec_ctx->field_reader_bindings = resolver.field_readers(); - for (const auto& [binding_key, binding] : resolver.binding_cache()) { - if (binding_key.empty()) { - continue; +DataTypePtr unwrap_direct_index_value_type(DataTypePtr column_type) { + DataTypePtr value_type = remove_nullable(std::move(column_type)); + while (value_type != nullptr && + value_type->get_storage_field_type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { + const auto* array_type = dynamic_cast(value_type.get()); + if (array_type == nullptr) { + return value_type; } - query_v2::FieldBindingContext binding_ctx; - binding_ctx.logical_field_name = binding.logical_field_name; - binding_ctx.stored_field_name = binding.stored_field_name; - binding_ctx.stored_field_wstr = binding.stored_field_wstr; - exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx)); + value_type = remove_nullable(array_type->get_nested_type()); } + return value_type; } -query_v2::QueryExecutionContext build_query_execution_context( - uint32_t segment_num_rows, const FieldReaderResolver& resolver, - query_v2::NullBitmapResolver* null_resolver) { - query_v2::QueryExecutionContext exec_ctx; - exec_ctx.segment_num_rows = segment_num_rows; - populate_binding_context(resolver, &exec_ctx); - exec_ctx.null_resolver = null_resolver; - return exec_ctx; -} - -} // namespace - -Status FieldReaderResolver::resolve(const std::string& field_name, - InvertedIndexQueryType query_type, - FieldReaderBinding* binding) { - DCHECK(binding != nullptr); - - // Check if this is a variant subcolumn - bool is_variant_sub = is_variant_subcolumn(field_name); - - auto data_it = _data_type_with_names.find(field_name); - if (data_it == _data_type_with_names.end()) { - // For variant subcolumns, not finding the index is normal (the subcolumn may not exist in this segment) - // Return OK but with null binding to signal "no match" - if (is_variant_sub) { - VLOG_DEBUG << "Variant subcolumn '" << field_name - << "' not found in this segment, treating as no match"; - *binding = FieldReaderBinding(); - return Status::OK(); - } - // For normal fields, this is an error - return Status::Error( - "field '{}' not found in inverted index metadata", field_name); +template +Status parse_integral_search_value(const std::string& value, Field* field) { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + CppType parsed = + StringParser::string_to_int(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as {}", value, + type_to_string(primitive_type)); } + *field = Field::create_field(parsed); + return Status::OK(); +} - const auto& stored_field_name = data_it->second.first; - const auto binding_key = binding_key_for(stored_field_name, query_type); - - auto cache_it = _cache.find(binding_key); - if (cache_it != _cache.end()) { - *binding = cache_it->second; - return Status::OK(); +Status parse_scalar_search_value(const DataTypePtr& column_type, const std::string& value, + Field* field) { + if (column_type == nullptr || field == nullptr) { + return Status::InvalidArgument("missing column type for scalar search value"); } - auto iterator_it = _iterators.find(field_name); - if (iterator_it == _iterators.end() || iterator_it->second == nullptr) { - // For variant subcolumns, not finding the iterator is normal - if (is_variant_sub) { - VLOG_DEBUG << "Variant subcolumn '" << field_name - << "' iterator not found in this segment, treating as no match"; - *binding = FieldReaderBinding(); - return Status::OK(); + switch (column_type->get_storage_field_type()) { + case FieldType::OLAP_FIELD_TYPE_BOOL: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + bool parsed = StringParser::string_to_bool(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as bool", value); } - return Status::Error( - "iterator not found for field '{}'", field_name); - } - - auto* inverted_iterator = dynamic_cast(iterator_it->second); - if (inverted_iterator == nullptr) { - return Status::Error( - "iterator for field '{}' is not InvertedIndexIterator", field_name); + *field = Field::create_field(parsed); + return Status::OK(); } - - // For variant subcolumns, FE resolves the field pattern to a specific index and sends - // its index_properties via TSearchFieldBinding. When FE picks an analyzer-based index, - // upgrade EQUAL_QUERY/WILDCARD_QUERY to MATCH_ANY_QUERY so select_best_reader picks the - // FULLTEXT reader instead of STRING_TYPE. Without this upgrade: - // - TERM (EQUAL_QUERY) clauses would open the wrong (untokenized) index directory - // - WILDCARD clauses would enumerate terms from the wrong index, returning empty results - // - // For regular (non-variant) columns with multiple indexes, the caller (build_leaf_query) - // is responsible for passing the appropriate query_type: MATCH_ANY_QUERY for tokenized - // queries (TERM) and EQUAL_QUERY for exact-match queries (EXACT). This ensures - // select_best_reader picks FULLTEXT vs STRING_TYPE correctly without needing an explicit - // analyzer key, since the query_type alone drives the reader type preference. - InvertedIndexQueryType effective_query_type = query_type; - auto fb_it = _field_binding_map.find(field_name); - std::string analyzer_key; - if (is_variant_sub && fb_it != _field_binding_map.end() && - fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { - analyzer_key = normalize_analyzer_key( - build_analyzer_key_from_properties(fb_it->second->index_properties)); - if (inverted_index::InvertedIndexAnalyzer::should_analyzer( - fb_it->second->index_properties) && - (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || - effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { - effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + case FieldType::OLAP_FIELD_TYPE_TINYINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_SMALLINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_INT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_BIGINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_LARGEINT: + return parse_integral_search_value(value, field); + case FieldType::OLAP_FIELD_TYPE_FLOAT: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + Float32 parsed = + StringParser::string_to_float(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as float", value); } + *field = Field::create_field(parsed); + return Status::OK(); } - - Result reader_result; - const auto& column_type = data_it->second.second; - if (column_type) { - reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type, - analyzer_key); - } else { - reader_result = inverted_iterator->select_best_reader(analyzer_key); - } - - if (!reader_result.has_value()) { - return reader_result.error(); - } - - auto inverted_reader = reader_result.value(); - if (inverted_reader == nullptr) { - return Status::Error( - "selected reader is null for field '{}'", field_name); - } - - auto index_file_reader = inverted_reader->get_index_file_reader(); - if (index_file_reader == nullptr) { - return Status::Error( - "index file reader is null for field '{}'", field_name); - } - - // Use InvertedIndexSearcherCache to avoid re-opening index files repeatedly, - // respecting the enable_inverted_index_searcher_cache session variable. - auto index_file_key = - index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); - InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); - InvertedIndexCacheHandle searcher_cache_handle; - - bool searcher_cache_enabled = - _context->runtime_state != nullptr && - _context->runtime_state->query_options().enable_inverted_index_searcher_cache; - - bool cache_hit = false; - if (searcher_cache_enabled) { - int64_t lookup_dummy = 0; - SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_lookup_timer - : &lookup_dummy); - cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, - &searcher_cache_handle); - } - - std::shared_ptr reader_holder; - if (cache_hit) { - if (_context->stats) { - _context->stats->inverted_index_searcher_cache_hit++; - } - auto searcher_variant = searcher_cache_handle.get_index_searcher(); - auto* searcher_ptr = std::get_if(&searcher_variant); - if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { - reader_holder = std::shared_ptr( - (*searcher_ptr)->getReader(), - [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); + case FieldType::OLAP_FIELD_TYPE_DOUBLE: { + StringParser::ParseResult parse_result = StringParser::PARSE_FAILURE; + Float64 parsed = + StringParser::string_to_float(value.data(), value.size(), &parse_result); + if (parse_result != StringParser::PARSE_SUCCESS) { + return Status::InvalidArgument("failed to parse '{}' as double", value); } + *field = Field::create_field(parsed); + return Status::OK(); } - - if (!reader_holder) { - if (_context->stats) { - _context->stats->inverted_index_searcher_cache_miss++; - } - // Cache miss: open directory, build IndexSearcher, insert into cache - int64_t dummy_timer = 0; - SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer - : &dummy_timer); - RETURN_IF_ERROR( - index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); - auto directory = DORIS_TRY( - index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); - - auto index_searcher_builder = DORIS_TRY( - IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); - auto searcher_result = - DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); - auto reader_size = index_searcher_builder->get_reader_size(); - - // Initialization reads are done. Clear io_ctx on the main stream so the - // cached searcher does not carry a stale reference. Subsequent query-phase - // reads receive the caller's io_ctx through the CLucene API parameters - // (termDocs/termPositions/terms) — the same pattern used by the MATCH path - // in InvertedIndexReader::create_index_searcher(). - auto* stream = static_cast(directory.get())->getDorisIndexInput(); - DBUG_EXECUTE_IF( - "FieldReaderResolver.resolve.io_ctx", ({ - const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext(); - if (cur_io_ctx->file_cache_stats) { - if (cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) { - LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: " - << cur_io_ctx->file_cache_stats << " vs " - << &_context->stats->file_cache_stats; - } - } - })); - stream->setIoContext(nullptr); - stream->setIndexFile(false); - - auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), - reader_size, UnixMillis()); - InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, - &searcher_cache_handle); - - auto new_variant = searcher_cache_handle.get_index_searcher(); - auto* new_ptr = std::get_if(&new_variant); - if (new_ptr != nullptr && *new_ptr != nullptr) { - reader_holder = std::shared_ptr( - (*new_ptr)->getReader(), - [](lucene::index::IndexReader*) { /* lifetime managed by searcher cache */ }); - } - - if (!reader_holder) { - return Status::Error( - "failed to build IndexSearcher for field '{}'", field_name); - } + default: + return Status::NotSupported("scalar search does not support storage field type {}", + static_cast(column_type->get_storage_field_type())); } +} - _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); - - FieldReaderBinding resolved; - resolved.logical_field_name = field_name; - resolved.stored_field_name = stored_field_name; - resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); - resolved.column_type = column_type; - resolved.query_type = effective_query_type; - resolved.inverted_reader = inverted_reader; - resolved.lucene_reader = reader_holder; - // Prefer FE-provided index_properties (needed for variant subcolumn field_pattern matching) - // Reuse fb_it from earlier lookup above. - if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && - !fb_it->second->index_properties.empty()) { - resolved.index_properties = fb_it->second->index_properties; - } else { - resolved.index_properties = inverted_reader->get_index_properties(); +InvertedIndexQueryType direct_index_query_type_for_clause(const std::string& clause_type) { + if (clause_type == "TERM" || clause_type == "EXACT") { + return InvertedIndexQueryType::EQUAL_QUERY; } - resolved.binding_key = binding_key; - resolved.analyzer_key = - normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); - - _binding_readers[binding_key] = reader_holder; - _field_readers[resolved.stored_field_wstr] = reader_holder; - _readers.emplace_back(reader_holder); - _cache.emplace(binding_key, resolved); - *binding = resolved; - return Status::OK(); + return InvertedIndexQueryType::UNKNOWN_QUERY; } +} // namespace + Status FunctionSearch::execute_impl(FunctionContext* /*context*/, Block& /*block*/, const ColumnNumbers& /*arguments*/, uint32_t /*result*/, size_t /*input_rows_count*/) const { @@ -507,78 +335,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( context->collection_similarity = std::make_shared(); } - // NESTED() queries evaluate predicates on the flattened "element space" of a nested group. - // For VARIANT nested groups, the indexed lucene field (stored_field_name) uses: - // parent_unique_id + "." + - // where the nested path is rooted at either: - // - "__D0_root__" for top-level array (NESTED(data, ...)) - // - "" for object fields (NESTED(data.items, ...)) - // - // FE field bindings are expressed using logical column paths (e.g. "data.items.msg"), so for - // NESTED() we normalize stored_field_name suffix to be consistent with the nested group root. - std::unordered_map patched_data_type_with_names; const auto* effective_data_type_with_names = &data_type_with_names; - if (is_nested_query && search_param.root.__isset.nested_path) { - const std::string& nested_path = search_param.root.nested_path; - const auto dot_pos = nested_path.find('.'); - const std::string root_field = - (dot_pos == std::string::npos) ? nested_path : nested_path.substr(0, dot_pos); - const std::string root_prefix = root_field + "."; - const std::string array_path = (dot_pos == std::string::npos) - ? std::string(segment_v2::kRootNestedGroupPath) - : nested_path.substr(dot_pos + 1); - - bool copied = false; - for (const auto& fb : search_param.field_bindings) { - if (!fb.__isset.is_variant_subcolumn || !fb.is_variant_subcolumn) { - continue; - } - if (fb.field_name.empty()) { - continue; - } - const auto it_orig = data_type_with_names.find(fb.field_name); - if (it_orig == data_type_with_names.end()) { - continue; - } - const std::string& old_stored = it_orig->second.first; - const auto first_dot = old_stored.find('.'); - if (first_dot == std::string::npos) { - continue; - } - std::string sub_path; - if (fb.__isset.subcolumn_path && !fb.subcolumn_path.empty()) { - sub_path = fb.subcolumn_path; - } else if (fb.field_name.starts_with(nested_path + ".")) { - sub_path = fb.field_name.substr(nested_path.size() + 1); - } else if (fb.field_name.starts_with(root_prefix)) { - sub_path = fb.field_name.substr(root_prefix.size()); - } else { - sub_path = fb.field_name; - } - if (sub_path.empty()) { - continue; - } - const std::string array_prefix = array_path + "."; - const std::string suffix_path = - sub_path.starts_with(array_prefix) ? sub_path : (array_prefix + sub_path); - const std::string parent_uid = old_stored.substr(0, first_dot); - const std::string expected_stored = parent_uid + "." + suffix_path; - if (old_stored == expected_stored) { - continue; - } - - if (!copied) { - patched_data_type_with_names = data_type_with_names; - effective_data_type_with_names = &patched_data_type_with_names; - copied = true; - } - auto it = patched_data_type_with_names.find(fb.field_name); - if (it == patched_data_type_with_names.end()) { - continue; - } - it->second.first = expected_stored; - } - } // Pass field_bindings to resolver for variant subcolumn detection FieldReaderResolver resolver(*effective_data_type_with_names, iterators, context, @@ -586,9 +343,10 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( if (is_nested_query) { std::shared_ptr row_bitmap; - RETURN_IF_ERROR(evaluate_nested_query(search_param, search_param.root, context, resolver, - num_rows, index_exec_ctx, field_name_to_column_id, - row_bitmap)); + VariantNestedSearchEvaluator nested_evaluator(*this); + RETURN_IF_ERROR(nested_evaluator.evaluate(search_param, search_param.root, context, + resolver, num_rows, index_exec_ctx, + field_name_to_column_id, row_bitmap)); bitmap_result = InvertedIndexResultBitmap(std::move(row_bitmap), std::make_shared()); bitmap_result.mask_out_null(); @@ -617,7 +375,7 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( SCOPED_RAW_TIMER(stats ? &stats->inverted_index_searcher_search_init_timer : &init_dummy); RETURN_IF_ERROR(build_query_recursive(search_param.root, context, resolver, &root_query, &root_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); } if (root_query == nullptr) { LOG(INFO) << "search: Query tree resolved to empty query, dsl:" @@ -627,9 +385,9 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } - ResolverNullBitmapAdapter null_resolver(resolver); + VariantSearchNullBitmapAdapter null_resolver(resolver); query_v2::QueryExecutionContext exec_ctx = - build_query_execution_context(num_rows, resolver, &null_resolver); + build_variant_search_query_execution_context(num_rows, resolver, &null_resolver); bool enable_scoring = false; bool is_asc = false; @@ -713,139 +471,6 @@ Status FunctionSearch::evaluate_inverted_index_with_search_param( return Status::OK(); } -Status FunctionSearch::evaluate_nested_query( - const TSearchParam& search_param, const TSearchClause& nested_clause, - const std::shared_ptr& context, FieldReaderResolver& resolver, - uint32_t num_rows, const IndexExecContext* index_exec_ctx, - const std::unordered_map& field_name_to_column_id, - std::shared_ptr& result_bitmap) const { - (void)field_name_to_column_id; - if (!(nested_clause.__isset.nested_path)) { - return Status::InvalidArgument("NESTED clause missing nested_path"); - } - if (!(nested_clause.__isset.children) || nested_clause.children.empty()) { - return Status::InvalidArgument("NESTED clause missing inner query"); - } - if (result_bitmap == nullptr) { - result_bitmap = std::make_shared(); - } else { - *result_bitmap = roaring::Roaring(); - } - - // 1. Get the nested group chain directly - std::string root_field = nested_clause.nested_path; - auto dot_pos = nested_clause.nested_path.find('.'); - if (dot_pos != std::string::npos) { - root_field = nested_clause.nested_path.substr(0, dot_pos); - } - if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) { - return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment"); - } - auto* segment = index_exec_ctx->segment(); - const int32_t ordinal = segment->tablet_schema()->field_index(root_field); - if (ordinal < 0) { - return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query", - root_field); - } - const ColumnId column_id = static_cast(ordinal); - - std::shared_ptr column_reader; - RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id), - &column_reader, - index_exec_ctx->column_iter_opts().stats)); - auto* variant_reader = dynamic_cast(column_reader.get()); - if (variant_reader == nullptr) { - return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field); - } - - std::string array_path; - if (dot_pos == std::string::npos) { - array_path = std::string(segment_v2::kRootNestedGroupPath); - } else { - array_path = nested_clause.nested_path.substr(dot_pos + 1); - } - - auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path); - if (!found || group_chain.empty()) { - return Status::OK(); - } - - // Use the read provider for element counting and bitmap mapping. - auto read_provider = segment_v2::create_nested_group_read_provider(); - if (!read_provider || !read_provider->should_enable_nested_group_read_path()) { - return Status::NotSupported( - "NestedGroup search is an enterprise capability, not available in this build"); - } - - auto& leaf_group = group_chain.back(); - uint64_t total_elements = 0; - RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(), - leaf_group, &total_elements)); - if (total_elements == 0) { - return Status::OK(); - } - - // 3. Evaluate inner query - std::string default_operator = "or"; - if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { - default_operator = search_param.default_operator; - } - int32_t minimum_should_match = -1; - if (search_param.__isset.minimum_should_match) { - minimum_should_match = search_param.minimum_should_match; - } - - query_v2::QueryPtr inner_query; - std::string inner_binding_key; - RETURN_IF_ERROR(build_query_recursive(nested_clause.children[0], context, resolver, - &inner_query, &inner_binding_key, default_operator, - minimum_should_match)); - if (inner_query == nullptr) { - return Status::OK(); - } - - if (total_elements > std::numeric_limits::max()) { - return Status::InvalidArgument("nested element_count exceeds uint32_t max"); - } - - ResolverNullBitmapAdapter null_resolver(resolver); - query_v2::QueryExecutionContext exec_ctx = build_query_execution_context( - static_cast(total_elements), resolver, &null_resolver); - - auto weight = inner_query->weight(false); - if (!weight) { - return Status::OK(); - } - auto scorer = weight->scorer(exec_ctx, inner_binding_key); - if (!scorer) { - return Status::OK(); - } - - roaring::Roaring element_bitmap; - uint32_t doc = scorer->doc(); - while (doc != query_v2::TERMINATED) { - element_bitmap.add(doc); - doc = scorer->advance(); - } - - if (scorer->has_null_bitmap(exec_ctx.null_resolver)) { - const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); - if (bitmap != nullptr && !bitmap->isEmpty()) { - element_bitmap -= *bitmap; - } - } - - // 4. Map element-level hits back to row-level hits through NestedGroup chain. - if (result_bitmap == nullptr) { - result_bitmap = std::make_shared(); - } - roaring::Roaring parent_bitmap; - RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords( - group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap)); - *result_bitmap = std::move(parent_bitmap); - return Status::OK(); -} - // Aligned with FE QsClauseType enum - uses enum.name() as clause_type FunctionSearch::ClauseTypeCategory FunctionSearch::get_clause_type_category( const std::string& clause_type) const { @@ -955,13 +580,11 @@ static query_v2::Occur map_thrift_occur(TSearchOccur::type thrift_occur) { } } -Status FunctionSearch::build_query_recursive(const TSearchClause& clause, - const std::shared_ptr& context, - FieldReaderResolver& resolver, - inverted_index::query_v2::QueryPtr* out, - std::string* binding_key, - const std::string& default_operator, - int32_t minimum_should_match) const { +Status FunctionSearch::build_query_recursive( + const TSearchClause& clause, const std::shared_ptr& context, + FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, + std::string* binding_key, const std::string& default_operator, int32_t minimum_should_match, + uint32_t num_rows) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -991,7 +614,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, &child_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); // Determine occur type from child clause query_v2::Occur occur = query_v2::Occur::MUST; // default @@ -1027,7 +650,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, std::string child_binding_key; RETURN_IF_ERROR(build_query_recursive(child_clause, context, resolver, &child_query, &child_binding_key, default_operator, - minimum_should_match)); + minimum_should_match, num_rows)); // Add all children including empty BitSetQuery // BooleanQuery will handle the logic: // - AND with empty bitmap → result is empty @@ -1042,7 +665,7 @@ Status FunctionSearch::build_query_recursive(const TSearchClause& clause, } return build_leaf_query(clause, context, resolver, out, binding_key, default_operator, - minimum_should_match); + minimum_should_match, num_rows); } Status FunctionSearch::build_leaf_query(const TSearchClause& clause, @@ -1051,7 +674,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, const std::string& default_operator, - int32_t minimum_should_match) const { + int32_t minimum_should_match, uint32_t num_rows) const { DCHECK(out != nullptr); *out = nullptr; if (binding_key) { @@ -1083,26 +706,75 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; } + auto finish_leaf_query = [&](query_v2::QueryPtr query) -> Status { + *out = std::move(query); + return resolver.map_leaf_query(field_name, out); + }; + FieldReaderBinding binding; RETURN_IF_ERROR(resolver.resolve(field_name, query_type, &binding)); - // Check if binding is empty (variant subcolumn not found in this segment) - if (binding.lucene_reader == nullptr) { + if (!binding.is_bound()) { LOG(INFO) << "search: No inverted index for field '" << field_name << "' in this segment, clause_type='" << clause_type - << "', query_type=" << static_cast(query_type) << ", returning no matches"; - // Variant subcolumn doesn't exist - create empty BitSetQuery (no matches) - *out = std::make_shared(roaring::Roaring()); + << "', query_type=" << static_cast(query_type) + << ", returning UNKNOWN bitmap"; if (binding_key) { binding_key->clear(); } - return Status::OK(); + return finish_leaf_query(make_unknown_query(num_rows)); } if (binding_key) { *binding_key = binding.binding_key; } + if (binding.use_direct_index_reader()) { + auto direct_query_type = direct_index_query_type_for_clause(clause_type); + if (direct_query_type == InvertedIndexQueryType::UNKNOWN_QUERY) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + + auto value_type = unwrap_direct_index_value_type(binding.column_type); + Field param_value; + auto parse_status = parse_scalar_search_value(value_type, value, ¶m_value); + if (!parse_status.ok()) { + LOG(INFO) << "search: scalar leaf value is unsupported, field=" << field_name + << ", value='" << value << "', reason=" << parse_status.to_string(); + return finish_leaf_query(make_unknown_query(num_rows)); + } + + auto* iterator = resolver.get_iterator(field_name); + if (iterator == nullptr) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + + segment_v2::InvertedIndexParam param; + param.column_name = binding.stored_field_name; + param.column_type = value_type; + param.query_value = param_value; + param.query_type = direct_query_type; + param.num_rows = num_rows; + param.roaring = std::make_shared(); + RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam {¶m})); + + std::shared_ptr null_bitmap = std::make_shared(); + auto has_null = iterator->has_null(); + if (has_null.has_value() && has_null.value()) { + segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle; + RETURN_IF_ERROR(iterator->read_null_bitmap(&null_bitmap_cache_handle)); + if (auto bitmap = null_bitmap_cache_handle.get_bitmap(); bitmap != nullptr) { + null_bitmap = bitmap; + } + } + return finish_leaf_query(std::make_shared(std::move(param.roaring), + std::move(null_bitmap))); + } + + if (binding.lucene_reader == nullptr) { + return finish_leaf_query(make_unknown_query(num_rows)); + } + FunctionSearch::ClauseTypeCategory category = get_clause_type_category(clause_type); std::wstring field_wstr = binding.stored_field_wstr; std::wstring value_wstr = StringHelper::to_wstring(value); @@ -1118,8 +790,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (binding.index_properties.empty()) { LOG(WARNING) << "search: analyzer required but index properties empty for field '" << field_name << "'"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1129,14 +800,13 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, LOG(WARNING) << "search: No terms found after tokenization for TERM query, field=" << field_name << ", value='" << value << "', returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } if (term_infos.size() == 1) { std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); - *out = make_term_query(term_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(term_wstr)); } // When minimum_should_match is specified, use OccurBooleanQuery @@ -1151,8 +821,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); builder->add(make_term_query(term_wstr), occur); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } // Use default_operator to determine how to combine tokenized terms @@ -1165,12 +834,10 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (category == FunctionSearch::ClauseTypeCategory::TOKENIZED) { @@ -1180,16 +847,14 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (!should_analyze) { VLOG_DEBUG << "search: PHRASE on non-tokenized field '" << field_name << "', falling back to TERM"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (binding.index_properties.empty()) { LOG(WARNING) << "search: analyzer required but index properties empty for PHRASE " "query on field '" << field_name << "'"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1199,8 +864,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, LOG(WARNING) << "search: No terms found after tokenization for PHRASE query, field=" << field_name << ", value='" << value << "', returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } std::vector phrase_term_infos = @@ -1209,7 +874,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, const auto& term_info = phrase_term_infos[0]; if (term_info.is_single_term()) { std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); - *out = std::make_shared(context, field_wstr, term_wstr); + return finish_leaf_query( + std::make_shared(context, field_wstr, term_wstr)); } else { auto builder = create_operator_boolean_query_builder(query_v2::OperatorType::OP_OR); @@ -1217,15 +883,15 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term); builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); + return finish_leaf_query(builder->build()); } } else { if (QueryHelper::is_simple_phrase(phrase_term_infos)) { - *out = std::make_shared(context, field_wstr, - phrase_term_infos); + return finish_leaf_query(std::make_shared( + context, field_wstr, phrase_term_infos)); } else { - *out = std::make_shared(context, field_wstr, - phrase_term_infos); + return finish_leaf_query(std::make_shared( + context, field_wstr, phrase_term_infos)); } } @@ -1233,23 +899,20 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, } if (clause_type == "MATCH") { VLOG_DEBUG << "search: MATCH clause not implemented, fallback to TERM"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (clause_type == "ANY" || clause_type == "ALL") { bool should_analyze = inverted_index::InvertedIndexAnalyzer::should_analyzer( binding.index_properties); if (!should_analyze) { - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (binding.index_properties.empty()) { LOG(WARNING) << "search: index properties empty for tokenized clause '" << clause_type << "' field=" << field_name; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } std::vector term_infos = @@ -1258,8 +921,8 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (term_infos.empty()) { LOG(WARNING) << "search: tokenization yielded no terms for clause '" << clause_type << "', field=" << field_name << ", returning empty BitSetQuery"; - *out = std::make_shared(roaring::Roaring()); - return Status::OK(); + return finish_leaf_query( + std::make_shared(roaring::Roaring())); } query_v2::OperatorType bool_type = query_v2::OperatorType::OP_OR; @@ -1269,8 +932,7 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, if (term_infos.size() == 1) { std::wstring term_wstr = StringHelper::to_wstring(term_infos[0].get_single_term()); - *out = make_term_query(term_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(term_wstr)); } auto builder = create_operator_boolean_query_builder(bool_type); @@ -1278,13 +940,11 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, std::wstring term_wstr = StringHelper::to_wstring(term_info.get_single_term()); builder->add(make_term_query(term_wstr), binding.binding_key); } - *out = builder->build(); - return Status::OK(); + return finish_leaf_query(builder->build()); } // Default tokenized clause fallback - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (category == FunctionSearch::ClauseTypeCategory::NON_TOKENIZED) { @@ -1293,10 +953,9 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, // Note: EXACT prefers untokenized index (STRING_TYPE) which doesn't support lowercase // If only tokenized index exists, EXACT may return empty results because // tokenized indexes store individual tokens, not complete strings - *out = make_term_query(value_wstr); VLOG_DEBUG << "search: EXACT clause processed, field=" << field_name << ", value='" << value << "'"; - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } if (clause_type == "PREFIX") { // Apply lowercase only if: @@ -1308,21 +967,20 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, get_parser_lowercase_from_properties(binding.index_properties); bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); std::string pattern = should_lowercase ? to_lower(value) : value; - *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: PREFIX clause processed, field=" << field_name << ", pattern='" << pattern << "' (original='" << value << "', has_parser=" << has_parser << ", lower_case=" << lowercase_setting << ")"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, pattern)); } if (clause_type == "WILDCARD") { // Standalone wildcard "*" matches all non-null values for this field // Consistent with ES query_string behavior where field:* becomes FieldExistsQuery if (value == "*") { - *out = std::make_shared(field_wstr, true); VLOG_DEBUG << "search: WILDCARD '*' converted to AllQuery(nullable=true), field=" << field_name; - return Status::OK(); + return finish_leaf_query(std::make_shared(field_wstr, true)); } // Apply lowercase only if: // 1. There's a parser/analyzer (otherwise lower_case has no effect on indexing) @@ -1333,33 +991,31 @@ Status FunctionSearch::build_leaf_query(const TSearchClause& clause, get_parser_lowercase_from_properties(binding.index_properties); bool should_lowercase = has_parser && (lowercase_setting == INVERTED_INDEX_PARSER_TRUE); std::string pattern = should_lowercase ? to_lower(value) : value; - *out = std::make_shared(context, field_wstr, pattern); VLOG_DEBUG << "search: WILDCARD clause processed, field=" << field_name << ", pattern='" << pattern << "' (original='" << value << "', has_parser=" << has_parser << ", lower_case=" << lowercase_setting << ")"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, pattern)); } if (clause_type == "REGEXP") { // ES-compatible: regex patterns are NOT lowercased (case-sensitive matching) // This matches ES query_string behavior where regex patterns bypass analysis - *out = std::make_shared(context, field_wstr, value); VLOG_DEBUG << "search: REGEXP clause processed, field=" << field_name << ", pattern='" << value << "'"; - return Status::OK(); + return finish_leaf_query( + std::make_shared(context, field_wstr, value)); } if (clause_type == "RANGE" || clause_type == "LIST") { VLOG_DEBUG << "search: clause type '" << clause_type << "' not implemented, fallback to TERM"; } - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } LOG(WARNING) << "search: Unexpected clause type '" << clause_type << "', using TERM fallback"; - *out = make_term_query(value_wstr); - return Status::OK(); + return finish_leaf_query(make_term_query(value_wstr)); } void register_function_search(SimpleFunctionFactory& factory) { diff --git a/be/src/exprs/function/function_search.h b/be/src/exprs/function/function_search.h index 376e1aa07282dd..343db747583a70 100644 --- a/be/src/exprs/function/function_search.h +++ b/be/src/exprs/function/function_search.h @@ -31,6 +31,7 @@ #include "core/data_type/data_type_number.h" #include "core/types.h" #include "exprs/function/function.h" +#include "exprs/function/variant_inverted_index_search.h" #include "storage/index/index_query_context.h" #include "storage/index/inverted/inverted_index_cache.h" #include "storage/index/inverted/query_v2/boolean_query/operator_boolean_query.h" @@ -42,91 +43,6 @@ using namespace doris::segment_v2; class IndexExecContext; -struct FieldReaderBinding { - std::string logical_field_name; - std::string stored_field_name; - std::wstring stored_field_wstr; - DataTypePtr column_type; - InvertedIndexQueryType query_type; - InvertedIndexReaderPtr inverted_reader; - std::shared_ptr lucene_reader; - std::map index_properties; - std::string binding_key; - std::string analyzer_key; -}; - -class FieldReaderResolver { -public: - FieldReaderResolver( - const std::unordered_map& data_type_with_names, - const std::unordered_map& iterators, - std::shared_ptr context, - const std::vector& field_bindings = {}) - : _data_type_with_names(data_type_with_names), - _iterators(iterators), - _context(std::move(context)), - _field_bindings(field_bindings) { - // Build lookup maps for quick access - for (const auto& binding : _field_bindings) { - if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { - _variant_subcolumn_fields.insert(binding.field_name); - } - _field_binding_map[binding.field_name] = &binding; - } - } - - Status resolve(const std::string& field_name, InvertedIndexQueryType query_type, - FieldReaderBinding* binding); - - // Check if a field is a variant subcolumn - bool is_variant_subcolumn(const std::string& field_name) const { - return _variant_subcolumn_fields.count(field_name) > 0; - } - - const std::vector>& readers() const { - return _readers; - } - - const std::unordered_map>& - reader_bindings() const { - return _binding_readers; - } - - const std::unordered_map>& - field_readers() const { - return _field_readers; - } - - const std::unordered_map& binding_cache() const { - return _cache; - } - - IndexIterator* get_iterator(const std::string& field_name) const { - auto it = _iterators.find(field_name); - return (it != _iterators.end()) ? it->second : nullptr; - } - -private: - std::string binding_key_for(const std::string& stored_field_name, - InvertedIndexQueryType query_type) const { - return stored_field_name + "#" + std::to_string(static_cast(query_type)); - } - - const std::unordered_map& _data_type_with_names; - const std::unordered_map& _iterators; - std::shared_ptr _context; - std::vector _field_bindings; - std::unordered_map _field_binding_map; - std::unordered_set _variant_subcolumn_fields; - std::unordered_map _cache; - std::vector> _readers; - std::unordered_map> _binding_readers; - std::unordered_map> _field_readers; - // Keep searcher cache handles alive for the resolver's lifetime. - // This pins cached IndexSearcher entries so extracted IndexReaders remain valid. - std::vector _searcher_cache_handles; -}; - class FunctionSearch : public IFunction { public: static constexpr auto name = "search"; @@ -177,13 +93,6 @@ class FunctionSearch : public IFunction { const std::unordered_map& field_name_to_column_id, const std::shared_ptr& index_query_context = nullptr) const; - Status evaluate_nested_query( - const TSearchParam& search_param, const TSearchClause& nested_clause, - const std::shared_ptr& context, FieldReaderResolver& resolver, - uint32_t num_rows, const IndexExecContext* index_exec_ctx, - const std::unordered_map& field_name_to_column_id, - std::shared_ptr& result_bitmap) const; - // Public methods for testing enum class ClauseTypeCategory { NON_TOKENIZED, // TERM, PREFIX, WILDCARD, REGEXP, RANGE, LIST - no tokenization, use EQUAL_QUERY @@ -204,14 +113,14 @@ class FunctionSearch : public IFunction { const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, - const std::string& default_operator, - int32_t minimum_should_match) const; + const std::string& default_operator, int32_t minimum_should_match, + uint32_t num_rows = 0) const; Status build_leaf_query(const TSearchClause& clause, const std::shared_ptr& context, FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out, std::string* binding_key, const std::string& default_operator, - int32_t minimum_should_match) const; + int32_t minimum_should_match, uint32_t num_rows = 0) const; }; } // namespace doris diff --git a/be/src/exprs/function/variant_inverted_index_search.cpp b/be/src/exprs/function/variant_inverted_index_search.cpp new file mode 100644 index 00000000000000..cf3fc0505188c6 --- /dev/null +++ b/be/src/exprs/function/variant_inverted_index_search.cpp @@ -0,0 +1,720 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "exprs/function/variant_inverted_index_search.h" + +#include +#include +#include + +#include +#include +#include + +#include "common/config.h" +#include "common/exception.h" +#include "common/logging.h" +#include "exprs/function/function_search.h" +#include "exprs/vexpr_context.h" +#include "runtime/runtime_state.h" +#include "storage/index/index_file_reader.h" +#include "storage/index/inverted/analyzer/analyzer.h" +#include "storage/index/inverted/inverted_index_compound_reader.h" +#include "storage/index/inverted/inverted_index_parser.h" +#include "storage/index/inverted/inverted_index_searcher.h" +#include "storage/index/inverted/query_v2/bit_set_query/bit_set_scorer.h" +#include "storage/index/inverted/query_v2/doc_set.h" +#include "storage/index/inverted/query_v2/scorer.h" +#include "storage/index/inverted/query_v2/term_query/term_query.h" +#include "storage/index/inverted/query_v2/weight.h" +#include "storage/index/inverted/util/string_helper.h" +#include "storage/segment/segment.h" +#include "storage/segment/variant/nested_group_path.h" +#include "storage/segment/variant/nested_group_provider.h" +#include "storage/segment/variant/variant_column_reader.h" +#include "storage/utils.h" +#include "util/debug_points.h" +#include "util/time.h" + +namespace doris { + +namespace query_v2 = segment_v2::inverted_index::query_v2; + +namespace { + +void add_search_binding_diagnostic(const std::shared_ptr& context, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (context != nullptr && context->stats != nullptr) { + context->stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + +} // namespace + +FieldReaderResolver::FieldReaderResolver( + const std::unordered_map& data_type_with_names, + const std::unordered_map& iterators, + std::shared_ptr context, + const std::vector& field_bindings) + : _data_type_with_names(data_type_with_names), + _iterators(iterators), + _context(std::move(context)), + _field_bindings(field_bindings) { + for (const auto& binding : _field_bindings) { + if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { + _variant_subcolumn_fields.insert(binding.field_name); + } + _field_binding_map[binding.field_name] = &binding; + } +} + +Status FieldReaderResolver::resolve(const std::string& field_name, + InvertedIndexQueryType query_type, + FieldReaderBinding* binding) { + DCHECK(binding != nullptr); + + const bool is_variant_sub = is_variant_subcolumn(field_name); + + auto data_it = _data_type_with_names.find(field_name); + if (data_it == _data_type_with_names.end()) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=no_metadata " + "logical_field={} query_type={} reason=field_not_found", + field_name, query_type_to_string(query_type))); + *binding = FieldReaderBinding(); + return Status::OK(); + } + return Status::Error( + "field '{}' not found in inverted index metadata", field_name); + } + + const auto& stored_field_name = data_it->second.first; + const auto binding_key = binding_key_for(stored_field_name, query_type); + + auto cache_it = _cache.find(binding_key); + if (cache_it != _cache.end()) { + *binding = cache_it->second; + return Status::OK(); + } + + auto iterator_it = _iterators.find(field_name); + if (iterator_it == _iterators.end() || iterator_it->second == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=no_iterator " + "logical_field={} stored_field={} query_type={} " + "reason=iterator_not_found", + field_name, stored_field_name, query_type_to_string(query_type))); + *binding = FieldReaderBinding(); + return Status::OK(); + } + return Status::Error( + "iterator not found for field '{}'", field_name); + } + + auto* inverted_iterator = dynamic_cast(iterator_it->second); + if (inverted_iterator == nullptr) { + return Status::Error( + "iterator for field '{}' is not InvertedIndexIterator", field_name); + } + + InvertedIndexQueryType effective_query_type = query_type; + const auto& column_type = data_it->second.second; + const bool is_text_field = + column_type != nullptr && is_string_type(column_type->get_storage_field_type()); + auto fb_it = _field_binding_map.find(field_name); + std::string analyzer_key; + if (is_text_field && is_variant_sub && fb_it != _field_binding_map.end() && + fb_it->second->__isset.index_properties && !fb_it->second->index_properties.empty()) { + analyzer_key = normalize_analyzer_key( + build_analyzer_key_from_properties(fb_it->second->index_properties)); + if (inverted_index::InvertedIndexAnalyzer::should_analyzer( + fb_it->second->index_properties) && + (effective_query_type == InvertedIndexQueryType::EQUAL_QUERY || + effective_query_type == InvertedIndexQueryType::WILDCARD_QUERY)) { + effective_query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + } + } + + Result reader_result; + if (column_type) { + reader_result = inverted_iterator->select_best_reader(column_type, effective_query_type, + is_text_field ? analyzer_key : ""); + } else { + reader_result = inverted_iterator->select_best_reader(is_text_field ? analyzer_key : ""); + } + + if (!reader_result.has_value()) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} analyzer_key={} reason={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), analyzer_key, + reader_result.error().to_string())); + } + return reader_result.error(); + } + + auto inverted_reader = reader_result.value(); + if (inverted_reader == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} reason=selected_reader_null", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type))); + } + return Status::Error( + "selected reader is null for field '{}'", field_name); + } + + FieldReaderBinding resolved; + resolved.logical_field_name = field_name; + resolved.stored_field_name = stored_field_name; + resolved.stored_field_wstr = StringHelper::to_wstring(resolved.stored_field_name); + resolved.column_type = column_type; + resolved.query_type = effective_query_type; + resolved.inverted_reader = inverted_reader; + resolved.binding_key = binding_key; + resolved.state = SearchFieldBindingState::BOUND; + if (fb_it != _field_binding_map.end() && fb_it->second->__isset.index_properties && + !fb_it->second->index_properties.empty()) { + resolved.index_properties = fb_it->second->index_properties; + } else { + resolved.index_properties = inverted_reader->get_index_properties(); + } + resolved.analyzer_key = + normalize_analyzer_key(build_analyzer_key_from_properties(resolved.index_properties)); + + auto index_file_reader = inverted_reader->get_index_file_reader(); + if (index_file_reader == nullptr) { + if (is_variant_sub) { + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=reject " + "logical_field={} stored_field={} index_id={} suffix={} " + "reason=index_file_reader_null", + field_name, stored_field_name, inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix())); + } + return Status::Error( + "index file reader is null for field '{}'", field_name); + } + + if (inverted_reader->type() == InvertedIndexReaderType::BKD) { + _cache.emplace(binding_key, resolved); + if (is_variant_sub) { + bool index_file_exists = false; + auto probe_status = index_file_reader->index_file_exist( + &inverted_reader->get_index_meta(), &index_file_exists); + add_search_binding_diagnostic( + _context, + fmt::format("[VariantSearchBinding] phase=field_resolve result=selected_direct " + "logical_field={} stored_field={} query_type={} " + "effective_query_type={} index_id={} suffix={} reader_type={} " + "index_file_exists={} probe_status={} index_file={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), + inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix(), + reader_type_to_string(inverted_reader->type()), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string(), + index_file_reader->get_index_file_path( + &inverted_reader->get_index_meta()))); + } + *binding = resolved; + return Status::OK(); + } + + auto index_file_key = + index_file_reader->get_index_file_cache_key(&inverted_reader->get_index_meta()); + InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); + InvertedIndexCacheHandle searcher_cache_handle; + + bool searcher_cache_enabled = + _context->runtime_state != nullptr && + _context->runtime_state->query_options().enable_inverted_index_searcher_cache; + + bool cache_hit = false; + if (searcher_cache_enabled) { + int64_t lookup_dummy = 0; + SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_lookup_timer + : &lookup_dummy); + cache_hit = InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, + &searcher_cache_handle); + } + + std::shared_ptr reader_holder; + if (cache_hit) { + if (_context->stats) { + _context->stats->inverted_index_searcher_cache_hit++; + } + auto searcher_variant = searcher_cache_handle.get_index_searcher(); + auto* searcher_ptr = std::get_if(&searcher_variant); + if (searcher_ptr != nullptr && *searcher_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*searcher_ptr)->getReader(), [](lucene::index::IndexReader*) {}); + } + } + + if (!reader_holder) { + if (_context->stats) { + _context->stats->inverted_index_searcher_cache_miss++; + } + int64_t dummy_timer = 0; + SCOPED_RAW_TIMER(_context->stats ? &_context->stats->inverted_index_searcher_open_timer + : &dummy_timer); + RETURN_IF_ERROR( + index_file_reader->init(config::inverted_index_read_buffer_size, _context->io_ctx)); + auto directory = DORIS_TRY( + index_file_reader->open(&inverted_reader->get_index_meta(), _context->io_ctx)); + + auto index_searcher_builder = DORIS_TRY( + IndexSearcherBuilder::create_index_searcher_builder(inverted_reader->type())); + auto searcher_result = + DORIS_TRY(index_searcher_builder->get_index_searcher(directory.get())); + auto reader_size = index_searcher_builder->get_reader_size(); + + auto* stream = static_cast(directory.get())->getDorisIndexInput(); + DBUG_EXECUTE_IF( + "FieldReaderResolver.resolve.io_ctx", ({ + const auto* cur_io_ctx = (const io::IOContext*)stream->getIoContext(); + if (cur_io_ctx->file_cache_stats) { + if (cur_io_ctx->file_cache_stats != &_context->stats->file_cache_stats) { + LOG(FATAL) << "search: io_ctx file_cache_stats mismatch: " + << cur_io_ctx->file_cache_stats << " vs " + << &_context->stats->file_cache_stats; + } + } + })); + stream->setIoContext(nullptr); + stream->setIndexFile(false); + + auto* cache_value = new InvertedIndexSearcherCache::CacheValue(std::move(searcher_result), + reader_size, UnixMillis()); + InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, cache_value, + &searcher_cache_handle); + + auto new_variant = searcher_cache_handle.get_index_searcher(); + auto* new_ptr = std::get_if(&new_variant); + if (new_ptr != nullptr && *new_ptr != nullptr) { + reader_holder = std::shared_ptr( + (*new_ptr)->getReader(), [](lucene::index::IndexReader*) {}); + } + + if (!reader_holder) { + return Status::Error( + "failed to build IndexSearcher for field '{}'", field_name); + } + } + + _searcher_cache_handles.push_back(std::move(searcher_cache_handle)); + + resolved.lucene_reader = reader_holder; + _binding_readers[binding_key] = reader_holder; + _field_readers[resolved.stored_field_wstr] = reader_holder; + _readers.emplace_back(reader_holder); + _cache.emplace(binding_key, resolved); + if (is_variant_sub) { + bool index_file_exists = false; + auto probe_status = index_file_reader->index_file_exist(&inverted_reader->get_index_meta(), + &index_file_exists); + add_search_binding_diagnostic( + _context, + fmt::format( + "[VariantSearchBinding] phase=field_resolve result=selected " + "logical_field={} stored_field={} query_type={} effective_query_type={} " + "index_id={} suffix={} reader_type={} analyzer_key={} " + "field_pattern={} index_file_exists={} probe_status={} " + "searcher_cache={} index_file={}", + field_name, stored_field_name, query_type_to_string(query_type), + query_type_to_string(effective_query_type), inverted_reader->get_index_id(), + inverted_reader->get_index_meta().get_index_suffix(), + reader_type_to_string(inverted_reader->type()), resolved.analyzer_key, + inverted_reader->get_index_meta().field_pattern(), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string(), + cache_hit ? "hit" : "miss", + index_file_reader->get_index_file_path( + &inverted_reader->get_index_meta()))); + } + *binding = resolved; + return Status::OK(); +} + +segment_v2::IndexIterator* VariantSearchNullBitmapAdapter::iterator_for( + const query_v2::Scorer& /*scorer*/, const std::string& logical_field) const { + if (logical_field.empty()) { + return nullptr; + } + return _resolver.get_iterator(logical_field); +} + +void populate_variant_search_binding_context(const FieldReaderResolver& resolver, + query_v2::QueryExecutionContext* exec_ctx) { + DCHECK(exec_ctx != nullptr); + exec_ctx->readers = resolver.readers(); + exec_ctx->reader_bindings = resolver.reader_bindings(); + exec_ctx->field_reader_bindings = resolver.field_readers(); + for (const auto& [binding_key, binding] : resolver.binding_cache()) { + if (binding_key.empty()) { + continue; + } + query_v2::FieldBindingContext binding_ctx; + binding_ctx.logical_field_name = binding.logical_field_name; + binding_ctx.stored_field_name = binding.stored_field_name; + binding_ctx.stored_field_wstr = binding.stored_field_wstr; + exec_ctx->binding_fields.emplace(binding_key, std::move(binding_ctx)); + } +} + +query_v2::QueryExecutionContext build_variant_search_query_execution_context( + uint32_t segment_num_rows, const FieldReaderResolver& resolver, + query_v2::NullBitmapResolver* null_resolver) { + query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = segment_num_rows; + populate_variant_search_binding_context(resolver, &exec_ctx); + exec_ctx.null_resolver = null_resolver; + return exec_ctx; +} + +namespace { + +class VariantNestedDocMappingWeight final : public query_v2::Weight { +public: + VariantNestedDocMappingWeight( + query_v2::WeightPtr child_weight, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) + : _child_weight(std::move(child_weight)), + _child_to_parent_chain(std::move(child_to_parent_chain)), + _read_provider(read_provider), + _column_iter_opts(std::move(column_iter_opts)) {} + + query_v2::ScorerPtr scorer(const query_v2::QueryExecutionContext& context, + const std::string& binding_key) override { + if (_child_weight == nullptr || _read_provider == nullptr || + _child_to_parent_chain.empty()) { + return std::make_shared(); + } + + auto child_scorer = _child_weight->scorer(context, binding_key); + if (child_scorer == nullptr) { + return std::make_shared(); + } + + roaring::Roaring child_true; + uint32_t doc = child_scorer->doc(); + while (doc != query_v2::TERMINATED) { + child_true.add(doc); + doc = child_scorer->advance(); + } + + auto mapped_true = std::make_shared(); + if (!child_true.isEmpty()) { + auto status = _read_provider->map_elements_to_parent_ords( + _child_to_parent_chain, _column_iter_opts, child_true, mapped_true.get()); + if (!status.ok()) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "failed to map nested search true bitmap: {}", status.to_string()); + } + } + + std::shared_ptr mapped_null; + if (child_scorer->has_null_bitmap(context.null_resolver)) { + const auto* child_null = child_scorer->get_null_bitmap(context.null_resolver); + if (child_null != nullptr && !child_null->isEmpty()) { + mapped_null = std::make_shared(); + auto status = _read_provider->map_elements_to_parent_ords( + _child_to_parent_chain, _column_iter_opts, *child_null, mapped_null.get()); + if (!status.ok()) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "failed to map nested search null bitmap: {}", + status.to_string()); + } + *mapped_null -= *mapped_true; + if (mapped_null->isEmpty()) { + mapped_null.reset(); + } + } + } + + if (mapped_true->isEmpty() && (mapped_null == nullptr || mapped_null->isEmpty())) { + return std::make_shared(); + } + return std::make_shared(std::move(mapped_true), + std::move(mapped_null)); + } + +private: + query_v2::WeightPtr _child_weight; + std::vector _child_to_parent_chain; + const segment_v2::NestedGroupReadProvider* _read_provider; + segment_v2::ColumnIteratorOptions _column_iter_opts; +}; + +class VariantNestedDocMappingQuery final : public query_v2::Query { +public: + VariantNestedDocMappingQuery( + query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) + : _child_query(std::move(child_query)), + _child_to_parent_chain(std::move(child_to_parent_chain)), + _read_provider(read_provider), + _column_iter_opts(std::move(column_iter_opts)) {} + + query_v2::WeightPtr weight(bool enable_scoring) override { + if (_child_query == nullptr) { + return nullptr; + } + return std::make_shared(_child_query->weight(enable_scoring), + _child_to_parent_chain, + _read_provider, _column_iter_opts); + } + +private: + query_v2::QueryPtr _child_query; + std::vector _child_to_parent_chain; + const segment_v2::NestedGroupReadProvider* _read_provider; + segment_v2::ColumnIteratorOptions _column_iter_opts; +}; + +bool starts_with_root_field(const std::string& logical_field_name, const std::string& root_field) { + if (logical_field_name == root_field) { + return true; + } + return logical_field_name.size() > root_field.size() && + logical_field_name.compare(0, root_field.size(), root_field) == 0 && + logical_field_name[root_field.size()] == '.'; +} + +} // namespace + +query_v2::QueryPtr make_variant_nested_doc_mapping_query( + query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts) { + if (child_to_parent_chain.empty()) { + return child_query; + } + return std::make_shared( + std::move(child_query), std::move(child_to_parent_chain), read_provider, + std::move(column_iter_opts)); +} + +Status map_variant_nested_leaf_query_to_active_group(const VariantNestedDocMapperContext& context, + const std::string& logical_field_name, + query_v2::QueryPtr* query) { + if (query == nullptr || *query == nullptr || context.variant_reader == nullptr || + context.read_provider == nullptr || context.active_group_chain.empty() || + context.root_field.empty()) { + return Status::OK(); + } + if (!starts_with_root_field(logical_field_name, context.root_field)) { + return Status::OK(); + } + + std::string relative_path; + if (logical_field_name.size() > context.root_field.size()) { + relative_path = logical_field_name.substr(context.root_field.size() + 1); + } + if (relative_path.empty()) { + return Status::OK(); + } + + auto [found, leaf_group_chain, _] = + context.variant_reader->collect_nested_group_chain(relative_path); + if (!found) { + return Status::OK(); + } + if (leaf_group_chain.size() < context.active_group_chain.size()) { + return Status::InvalidArgument( + "nested search leaf field '{}' is outside active nested path", logical_field_name); + } + for (size_t i = 0; i < context.active_group_chain.size(); ++i) { + if (leaf_group_chain[i] != context.active_group_chain[i]) { + return Status::InvalidArgument( + "nested search leaf field '{}' is outside active nested path", + logical_field_name); + } + } + if (leaf_group_chain.size() == context.active_group_chain.size()) { + return Status::OK(); + } + + std::vector child_to_parent_chain( + leaf_group_chain.begin() + context.active_group_chain.size(), leaf_group_chain.end()); + *query = make_variant_nested_doc_mapping_query(std::move(*query), + std::move(child_to_parent_chain), + context.read_provider, context.column_iter_opts); + return Status::OK(); +} + +Status VariantNestedSearchEvaluator::evaluate( + const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, + FieldReaderResolver& resolver, uint32_t num_rows, const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) const { + (void)num_rows; + (void)field_name_to_column_id; + if (!(nested_clause.__isset.nested_path)) { + return Status::InvalidArgument("NESTED clause missing nested_path"); + } + if (!(nested_clause.__isset.children) || nested_clause.children.empty()) { + return Status::InvalidArgument("NESTED clause missing inner query"); + } + if (result_bitmap == nullptr) { + result_bitmap = std::make_shared(); + } else { + *result_bitmap = roaring::Roaring(); + } + + std::string root_field = nested_clause.nested_path; + auto dot_pos = nested_clause.nested_path.find('.'); + if (dot_pos != std::string::npos) { + root_field = nested_clause.nested_path.substr(0, dot_pos); + } + if (index_exec_ctx == nullptr || index_exec_ctx->segment() == nullptr) { + return Status::InvalidArgument("NESTED query requires IndexExecContext with valid segment"); + } + auto* segment = index_exec_ctx->segment(); + const int32_t ordinal = segment->tablet_schema()->field_index(root_field); + if (ordinal < 0) { + return Status::InvalidArgument("Column '{}' not found in tablet schema for nested query", + root_field); + } + const ColumnId column_id = static_cast(ordinal); + + std::shared_ptr column_reader; + RETURN_IF_ERROR(segment->get_column_reader(segment->tablet_schema()->column(column_id), + &column_reader, + index_exec_ctx->column_iter_opts().stats)); + auto* variant_reader = dynamic_cast(column_reader.get()); + if (variant_reader == nullptr) { + return Status::InvalidArgument("Column '{}' is not VARIANT for nested query", root_field); + } + + std::string array_path; + if (dot_pos == std::string::npos) { + array_path = std::string(segment_v2::kRootNestedGroupPath); + } else { + array_path = nested_clause.nested_path.substr(dot_pos + 1); + } + + auto [found, group_chain, _] = variant_reader->collect_nested_group_chain(array_path); + if (!found || group_chain.empty()) { + return Status::OK(); + } + + auto read_provider = segment_v2::create_nested_group_read_provider(); + if (!read_provider || !read_provider->should_enable_nested_group_read_path()) { + return Status::NotSupported( + "NestedGroup search is an enterprise capability, not available in this build"); + } + + auto& leaf_group = group_chain.back(); + uint64_t total_elements = 0; + RETURN_IF_ERROR(read_provider->get_total_elements(index_exec_ctx->column_iter_opts(), + leaf_group, &total_elements)); + if (total_elements == 0) { + return Status::OK(); + } + if (total_elements > std::numeric_limits::max()) { + return Status::InvalidArgument("nested element_count exceeds uint32_t max"); + } + + std::string default_operator = "or"; + if (search_param.__isset.default_operator && !search_param.default_operator.empty()) { + default_operator = search_param.default_operator; + } + int32_t minimum_should_match = -1; + if (search_param.__isset.minimum_should_match) { + minimum_should_match = search_param.minimum_should_match; + } + + query_v2::QueryPtr inner_query; + std::string inner_binding_key; + VariantNestedDocMapperContext mapper_context; + mapper_context.root_field = root_field; + mapper_context.active_group_chain = group_chain; + mapper_context.variant_reader = variant_reader; + mapper_context.read_provider = read_provider.get(); + mapper_context.column_iter_opts = index_exec_ctx->column_iter_opts(); + resolver.set_leaf_query_mapper( + [mapper_context](const std::string& logical_field_name, query_v2::QueryPtr* query) { + return map_variant_nested_leaf_query_to_active_group(mapper_context, + logical_field_name, query); + }); + struct ScopedLeafMapperReset { + FieldReaderResolver& resolver; + ~ScopedLeafMapperReset() { resolver.set_leaf_query_mapper(nullptr); } + } mapper_reset {resolver}; + RETURN_IF_ERROR(_function_search.build_query_recursive( + nested_clause.children[0], context, resolver, &inner_query, &inner_binding_key, + default_operator, minimum_should_match, static_cast(total_elements))); + if (inner_query == nullptr) { + return Status::OK(); + } + + VariantSearchNullBitmapAdapter null_resolver(resolver); + query_v2::QueryExecutionContext exec_ctx = build_variant_search_query_execution_context( + static_cast(total_elements), resolver, &null_resolver); + + auto weight = inner_query->weight(false); + if (!weight) { + return Status::OK(); + } + auto scorer = weight->scorer(exec_ctx, inner_binding_key); + if (!scorer) { + return Status::OK(); + } + + roaring::Roaring element_bitmap; + uint32_t doc = scorer->doc(); + while (doc != query_v2::TERMINATED) { + element_bitmap.add(doc); + doc = scorer->advance(); + } + + if (scorer->has_null_bitmap(exec_ctx.null_resolver)) { + const auto* bitmap = scorer->get_null_bitmap(exec_ctx.null_resolver); + if (bitmap != nullptr && !bitmap->isEmpty()) { + element_bitmap -= *bitmap; + } + } + + roaring::Roaring parent_bitmap; + RETURN_IF_ERROR(read_provider->map_elements_to_parent_ords( + group_chain, index_exec_ctx->column_iter_opts(), element_bitmap, &parent_bitmap)); + *result_bitmap = std::move(parent_bitmap); + return Status::OK(); +} + +} // namespace doris diff --git a/be/src/exprs/function/variant_inverted_index_search.h b/be/src/exprs/function/variant_inverted_index_search.h new file mode 100644 index 00000000000000..973c9c8c826c55 --- /dev/null +++ b/be/src/exprs/function/variant_inverted_index_search.h @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/status.h" +#include "core/block/columns_with_type_and_name.h" +#include "core/data_type/data_type.h" +#include "storage/index/index_query_context.h" +#include "storage/index/inverted/inverted_index_cache.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/inverted/inverted_index_reader.h" +#include "storage/index/inverted/query_v2/query.h" +#include "storage/index/inverted/query_v2/weight.h" +#include "storage/olap_common.h" +#include "storage/segment/column_reader.h" + +namespace doris::segment_v2::inverted_index::query_v2 { +class Query; +} + +namespace doris::segment_v2 { +class NestedGroupReadProvider; +struct NestedGroupReader; +class VariantColumnReader; +} // namespace doris::segment_v2 + +namespace doris { + +using namespace doris::segment_v2; + +class FunctionSearch; +class IndexExecContext; + +using SearchLeafQueryMapper = std::function*)>; + +enum class SearchFieldBindingState { + BOUND, + MISSING_IN_SEGMENT, +}; + +struct FieldReaderBinding { + std::string logical_field_name; + std::string stored_field_name; + std::wstring stored_field_wstr; + DataTypePtr column_type; + InvertedIndexQueryType query_type; + InvertedIndexReaderPtr inverted_reader; + std::shared_ptr lucene_reader; + std::map index_properties; + std::string binding_key; + std::string analyzer_key; + SearchFieldBindingState state = SearchFieldBindingState::MISSING_IN_SEGMENT; + + bool is_bound() const { + return state == SearchFieldBindingState::BOUND || inverted_reader != nullptr || + lucene_reader != nullptr; + } + bool use_direct_index_reader() const { + return is_bound() && inverted_reader != nullptr && lucene_reader == nullptr; + } +}; + +class FieldReaderResolver { +public: + FieldReaderResolver( + const std::unordered_map& data_type_with_names, + const std::unordered_map& iterators, + std::shared_ptr context, + const std::vector& field_bindings = {}); + + Status resolve(const std::string& field_name, InvertedIndexQueryType query_type, + FieldReaderBinding* binding); + + bool is_variant_subcolumn(const std::string& field_name) const { + return _variant_subcolumn_fields.count(field_name) > 0; + } + + const std::vector>& readers() const { + return _readers; + } + + const std::unordered_map>& + reader_bindings() const { + return _binding_readers; + } + + const std::unordered_map>& + field_readers() const { + return _field_readers; + } + + const std::unordered_map& binding_cache() const { + return _cache; + } + + IndexIterator* get_iterator(const std::string& field_name) const { + auto it = _iterators.find(field_name); + return (it != _iterators.end()) ? it->second : nullptr; + } + + void set_leaf_query_mapper(SearchLeafQueryMapper mapper) { + _leaf_query_mapper = std::move(mapper); + } + + Status map_leaf_query( + const std::string& field_name, + std::shared_ptr* query) const { + if (!_leaf_query_mapper || query == nullptr || *query == nullptr) { + return Status::OK(); + } + return _leaf_query_mapper(field_name, query); + } + +private: + std::string binding_key_for(const std::string& stored_field_name, + InvertedIndexQueryType query_type) const { + return stored_field_name + "#" + std::to_string(static_cast(query_type)); + } + + const std::unordered_map& _data_type_with_names; + const std::unordered_map& _iterators; + std::shared_ptr _context; + std::vector _field_bindings; + std::unordered_map _field_binding_map; + std::unordered_set _variant_subcolumn_fields; + std::unordered_map _cache; + std::vector> _readers; + std::unordered_map> _binding_readers; + std::unordered_map> _field_readers; + std::vector _searcher_cache_handles; + SearchLeafQueryMapper _leaf_query_mapper; +}; + +class VariantSearchNullBitmapAdapter final : public inverted_index::query_v2::NullBitmapResolver { +public: + explicit VariantSearchNullBitmapAdapter(const FieldReaderResolver& resolver) + : _resolver(resolver) {} + + segment_v2::IndexIterator* iterator_for(const inverted_index::query_v2::Scorer& scorer, + const std::string& logical_field) const override; + +private: + const FieldReaderResolver& _resolver; +}; + +void populate_variant_search_binding_context( + const FieldReaderResolver& resolver, + inverted_index::query_v2::QueryExecutionContext* exec_ctx); + +inverted_index::query_v2::QueryExecutionContext build_variant_search_query_execution_context( + uint32_t segment_num_rows, const FieldReaderResolver& resolver, + inverted_index::query_v2::NullBitmapResolver* null_resolver); + +struct VariantNestedDocMapperContext { + std::string root_field; + std::vector active_group_chain; + const segment_v2::VariantColumnReader* variant_reader = nullptr; + const segment_v2::NestedGroupReadProvider* read_provider = nullptr; + segment_v2::ColumnIteratorOptions column_iter_opts; +}; + +Status map_variant_nested_leaf_query_to_active_group(const VariantNestedDocMapperContext& context, + const std::string& logical_field_name, + inverted_index::query_v2::QueryPtr* query); + +inverted_index::query_v2::QueryPtr make_variant_nested_doc_mapping_query( + inverted_index::query_v2::QueryPtr child_query, + std::vector child_to_parent_chain, + const segment_v2::NestedGroupReadProvider* read_provider, + segment_v2::ColumnIteratorOptions column_iter_opts); + +class VariantNestedSearchEvaluator { +public: + explicit VariantNestedSearchEvaluator(const FunctionSearch& function_search) + : _function_search(function_search) {} + + Status evaluate(const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, + FieldReaderResolver& resolver, uint32_t num_rows, + const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) const; + +private: + const FunctionSearch& _function_search; +}; + +} // namespace doris diff --git a/be/src/exprs/vsearch.cpp b/be/src/exprs/vsearch.cpp index f4ed11e95fcd59..3b5da89d29011e 100644 --- a/be/src/exprs/vsearch.cpp +++ b/be/src/exprs/vsearch.cpp @@ -17,6 +17,8 @@ #include "exprs/vsearch.h" +#include + #include #include @@ -30,6 +32,7 @@ #include "glog/logging.h" #include "runtime/runtime_state.h" #include "storage/index/inverted/inverted_index_reader.h" +#include "storage/olap_common.h" #include "storage/segment/segment.h" namespace doris { @@ -45,6 +48,18 @@ struct SearchInputBundle { ColumnsWithTypeAndName literal_args; }; +void add_search_binding_diagnostic(const IndexExecContext* index_context, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (index_context == nullptr) { + return; + } + const auto& index_query_context = index_context->get_index_query_context(); + if (index_query_context != nullptr && index_query_context->stats != nullptr) { + index_query_context->stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, SearchInputBundle* bundle) { DCHECK(bundle != nullptr); @@ -158,9 +173,24 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, if (base_column_index >= 0) { bundle->column_ids.emplace_back(base_column_index); } + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=parent_fallback logical_field={} " + "parent_field={} sub_path={} base_column_id={} " + "stored_field={} reason=slot_iterator_missing", + field_name, binding->parent_field_name, sub_path, + base_column_id, prefix + "." + sub_path)); field_added = true; } } + } else { + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=reject logical_field={} parent_field={} " + "reason=parent_column_not_found", + field_name, binding->parent_field_name)); } } @@ -174,6 +204,15 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, bundle->iterators.emplace(field_name, iterator); bundle->field_types.emplace(field_name, *storage_name_type); bundle->column_ids.emplace_back(column_id); + if (binding != nullptr && binding->__isset.is_variant_subcolumn && + binding->is_variant_subcolumn) { + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=direct_iterator logical_field={} column_id={} " + "stored_field={}", + field_name, column_id, storage_name_type->first)); + } } child_index++; @@ -187,6 +226,18 @@ Status collect_search_inputs(const VSearchExpr& expr, VExprContext* context, field_bindings[child_index].__isset.is_variant_subcolumn && field_bindings[child_index].is_variant_subcolumn) { // Variant subcolumn not materialized - skip, will create empty BitSetQuery in function_search + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=collect_inputs " + "result=unmaterialized_element_at logical_field={} " + "parent_field={} sub_path={} reason=no_slot_ref", + field_bindings[child_index].field_name, + field_bindings[child_index].__isset.parent_field_name + ? field_bindings[child_index].parent_field_name + : "", + field_bindings[child_index].__isset.subcolumn_path + ? field_bindings[child_index].subcolumn_path + : "")); child_index++; continue; } @@ -252,6 +303,11 @@ Status VSearchExpr::evaluate_inverted_index(VExprContext* context, uint32_t segm if (bundle.iterators.empty() && !is_nested_query) { LOG(WARNING) << "VSearchExpr: No indexed columns available for evaluation, DSL: " << _original_dsl; + add_search_binding_diagnostic( + index_context.get(), + fmt::format("[VariantSearchBinding] phase=evaluate_search result=no_iterator " + "dsl={} reason=no_indexed_columns", + _original_dsl)); auto empty_bitmap = InvertedIndexResultBitmap(std::make_shared(), std::make_shared()); index_context->set_index_result_for_expr(this, std::move(empty_bitmap)); diff --git a/be/src/storage/index/inverted/inverted_index_profile.h b/be/src/storage/index/inverted/inverted_index_profile.h index 393c33d711b3c1..eddb20a990503f 100644 --- a/be/src/storage/index/inverted/inverted_index_profile.h +++ b/be/src/storage/index/inverted/inverted_index_profile.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -31,6 +32,14 @@ class InvertedIndexProfileReporter { ~InvertedIndexProfileReporter() = default; void update(RuntimeProfile* profile, const InvertedIndexStatistics* statistics) { + if (!statistics->binding_diagnostics.empty()) { + std::string info; + for (const auto& diagnostic : statistics->binding_diagnostics) { + info += "\n" + diagnostic; + } + profile->add_info_string("VariantSearchBindingDiagnostics", info); + } + // Determine the iteration limit: the smaller of 20 or the size of statistics->stats size_t iteration_limit = std::min(20, statistics->stats.size()); diff --git a/be/src/storage/index/inverted/inverted_index_stats.h b/be/src/storage/index/inverted/inverted_index_stats.h index b82b230f41d71e..863a5bf0219776 100644 --- a/be/src/storage/index/inverted/inverted_index_stats.h +++ b/be/src/storage/index/inverted/inverted_index_stats.h @@ -17,6 +17,9 @@ #pragma once +#include +#include +#include #include namespace doris { @@ -28,7 +31,18 @@ struct InvertedIndexQueryStatistics { }; struct InvertedIndexStatistics { + void add_binding_diagnostic(std::string diagnostic) { + if (binding_diagnostics.size() >= kMaxBindingDiagnostics) { + return; + } + binding_diagnostics.emplace_back(std::move(diagnostic)); + } + std::vector stats; + std::vector binding_diagnostics; + +private: + static constexpr size_t kMaxBindingDiagnostics = 64; }; } // namespace doris diff --git a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h index 5531fb8e62aaf3..35528905471ff3 100644 --- a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h +++ b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_query.h @@ -28,18 +28,22 @@ namespace doris::segment_v2::inverted_index::query_v2 { class BitSetQuery : public Query { public: explicit BitSetQuery(std::shared_ptr bitmap) : _bitmap(std::move(bitmap)) {} + BitSetQuery(std::shared_ptr bitmap, + std::shared_ptr null_bitmap) + : _bitmap(std::move(bitmap)), _null_bitmap(std::move(null_bitmap)) {} BitSetQuery(const roaring::Roaring& bitmap) : _bitmap(std::make_shared(bitmap)) {} ~BitSetQuery() override = default; WeightPtr weight(bool /*enable_scoring*/) override { - return std::make_shared(_bitmap); + return std::make_shared(_bitmap, _null_bitmap); } private: std::shared_ptr _bitmap; + std::shared_ptr _null_bitmap; }; using BitSetQueryPtr = std::shared_ptr; -} // namespace doris::segment_v2::inverted_index::query_v2 \ No newline at end of file +} // namespace doris::segment_v2::inverted_index::query_v2 diff --git a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h index f1a726edd8fc3d..6d3f9b8f038363 100644 --- a/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h +++ b/be/src/storage/index/inverted/query_v2/bit_set_query/bit_set_weight.h @@ -33,10 +33,12 @@ class BitSetWeight final : public Weight { ~BitSetWeight() override = default; ScorerPtr scorer(const QueryExecutionContext& /*context*/) override { - if (_bitmap == nullptr || _bitmap->isEmpty()) { + if ((_bitmap == nullptr || _bitmap->isEmpty()) && + (_null_bitmap == nullptr || _null_bitmap->isEmpty())) { return std::make_shared(); } - return std::make_shared(_bitmap, _null_bitmap); + auto bitmap = _bitmap ? _bitmap : std::make_shared(); + return std::make_shared(std::move(bitmap), _null_bitmap); } private: diff --git a/be/src/storage/segment/segment.cpp b/be/src/storage/segment/segment.cpp index 7563299a856826..e3df7b70f34dc9 100644 --- a/be/src/storage/segment/segment.cpp +++ b/be/src/storage/segment/segment.cpp @@ -802,7 +802,52 @@ Status Segment::new_index_iterator(const TabletColumn& tablet_column, const Tabl // to avoid data race during parallel method calls RETURN_IF_ERROR(_index_file_reader_open.call([&] { return _open_index_file_reader(); })); // after DorisCallOnce.call, _index_file_reader is guaranteed to be not nullptr - RETURN_IF_ERROR(reader->new_index_iterator(_index_file_reader, index_meta, iter)); + const bool need_binding_diagnostic = tablet_column.is_variant_type() || + tablet_column.is_extracted_column() || + !index_meta->get_index_suffix().empty(); + bool index_file_exists = false; + Status probe_status; + if (need_binding_diagnostic) { + probe_status = _index_file_reader->init(config::inverted_index_read_buffer_size, + &read_options.io_ctx); + if (probe_status.ok()) { + probe_status = _index_file_reader->index_file_exist(index_meta, &index_file_exists); + } + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=index_file_probe tablet_id={} rowset_id={} " + "segment_id={} column={} logical_path={} index_id={} suffix={} exists={} " + "status={}", + read_options.tablet_id, _rowset_id.to_string(), _segment_id, + tablet_column.name(), + tablet_column.has_path_info() ? tablet_column.path_info_ptr()->get_path() + : tablet_column.name(), + index_meta->index_id(), index_meta->get_index_suffix(), index_file_exists, + probe_status.ok() ? "OK" : probe_status.to_string()); + VLOG_DEBUG << diagnostic; + if (read_options.stats != nullptr) { + read_options.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } + } + Status iter_status = reader->new_index_iterator(_index_file_reader, index_meta, iter); + if (!iter_status.ok()) { + if (need_binding_diagnostic) { + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=index_iterator_create result=reject " + "tablet_id={} rowset_id={} segment_id={} column={} logical_path={} " + "index_id={} suffix={} reason={}", + read_options.tablet_id, _rowset_id.to_string(), _segment_id, + tablet_column.name(), + tablet_column.has_path_info() ? tablet_column.path_info_ptr()->get_path() + : tablet_column.name(), + index_meta->index_id(), index_meta->get_index_suffix(), + iter_status.to_string()); + VLOG_DEBUG << diagnostic; + if (read_options.stats != nullptr) { + read_options.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } + } + return iter_status; + } return Status::OK(); } return Status::OK(); diff --git a/be/src/storage/segment/segment_iterator.cpp b/be/src/storage/segment/segment_iterator.cpp index 3d542307318ae7..3a9c37aafcd5f4 100644 --- a/be/src/storage/segment/segment_iterator.cpp +++ b/be/src/storage/segment/segment_iterator.cpp @@ -1600,8 +1600,8 @@ Status SegmentIterator::_init_index_iterators() { data_type = inferred_type; } } - inverted_indexs_holder = - variant_reader->find_subcolumn_tablet_indexes(column, data_type); + inverted_indexs_holder = variant_reader->find_subcolumn_tablet_indexes( + column, data_type, _opts.stats); // Extract raw pointers from shared_ptr for iteration for (const auto& index_ptr : inverted_indexs_holder) { inverted_indexs.push_back(index_ptr.get()); @@ -1611,9 +1611,38 @@ Status SegmentIterator::_init_index_iterators() { else { inverted_indexs = _segment->_tablet_schema->inverted_indexs(column); } + if (column.is_extracted_column() && inverted_indexs.empty() && _opts.stats != nullptr) { + const auto relative_path = column.path_info_ptr()->copy_pop_front().get_path(); + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=init_index_iterators " + "result=no_candidate tablet_id={} rowset_id={} segment_id={} cid={} " + "logical_path={} relative_path={} materialized_column={}", + _tablet_id, _segment->rowset_id().to_string(), _segment->id(), cid, + column.path_info_ptr()->get_path(), relative_path, column.name()); + VLOG_DEBUG << diagnostic; + _opts.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } for (const auto& inverted_index : inverted_indexs) { + const bool had_iterator = _index_iterators[cid] != nullptr; RETURN_IF_ERROR(_segment->new_index_iterator(column, inverted_index, _opts, &_index_iterators[cid])); + if ((column.is_extracted_column() || column.is_variant_type()) && + _opts.stats != nullptr) { + const auto diagnostic = fmt::format( + "[VariantSearchBinding] phase=init_index_iterators " + "result={} tablet_id={} rowset_id={} segment_id={} cid={} " + "logical_path={} materialized_column={} index_id={} suffix={} " + "field_pattern={} iterator_state={}", + _index_iterators[cid] == nullptr ? "no_iterator" : "accepted", + _tablet_id, _segment->rowset_id().to_string(), _segment->id(), cid, + column.has_path_info() ? column.path_info_ptr()->get_path() + : column.name(), + column.name(), inverted_index->index_id(), + inverted_index->get_index_suffix(), inverted_index->field_pattern(), + had_iterator ? "preserved" : "created"); + VLOG_DEBUG << diagnostic; + _opts.stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } } if (_index_iterators[cid] != nullptr) { _index_iterators[cid]->set_context(_index_query_context); diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index a072eba289d7dc..ab63cb4f13f944 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -17,6 +17,7 @@ #include "storage/segment/variant/variant_column_reader.h" +#include #include #include @@ -28,6 +29,7 @@ #include #include "common/config.h" +#include "common/logging.h" #include "common/status.h" #include "core/assert_cast.h" #include "core/column/column_array.h" @@ -41,6 +43,7 @@ #include "io/fs/file_reader.h" #include "runtime/descriptors.h" #include "storage/key_coder.h" +#include "storage/olap_common.h" #include "storage/segment/column_meta_accessor.h" #include "storage/segment/column_reader.h" #include "storage/segment/column_reader_cache.h" @@ -67,6 +70,14 @@ bool is_compaction_or_checksum_reader(const StorageReadOptions* opts) { opts->io_ctx.reader_type == ReaderType::READER_CHECKSUM); } +void add_variant_search_binding_diagnostic(OlapReaderStatistics* stats, + const std::string& diagnostic) { + VLOG_DEBUG << diagnostic; + if (stats != nullptr) { + stats->inverted_index_stats.add_binding_diagnostic(diagnostic); + } +} + // Nested-group whole/root-merge iterators dereference NestedGroupReader state that is owned by // VariantColumnReader. Hold the owning reader until the iterator itself is destroyed so query-time // iterator initialization cannot outlive the reader and hit a UAF. @@ -1418,11 +1429,14 @@ Status VariantColumnReader::load_external_meta_once() { } TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletColumn& column, - const DataTypePtr& data_type) { + const DataTypePtr& data_type, + OlapReaderStatistics* stats) { TabletSchema::SubColumnInfo sub_column_info; const auto& parent_index = _tablet_schema->inverted_indexs(column.parent_unique_id()); auto relative_path = column.path_info_ptr()->copy_pop_front(); DataTypePtr index_data_type = data_type; + const std::string logical_path = column.path_info_ptr()->get_path(); + const std::string relative_path_str = relative_path.get_path(); if (!relative_path.empty()) { auto [found, group_chain, child_path] = @@ -1443,6 +1457,16 @@ TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletCol if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), relative_path.get_path(), &sub_column_info) && !sub_column_info.indexes.empty()) { + for (const auto& index : sub_column_info.indexes) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=direct logical_path={} relative_path={} " + "materialized_column={} index_id={} suffix={} field_pattern={} " + "reason=generated_subcolumn_info", + logical_path, relative_path_str, column.name(), index->index_id(), + index->get_index_suffix(), index->field_pattern())); + } return sub_column_info.indexes; } @@ -1458,6 +1482,31 @@ TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletCol .parent_unique_id = column.parent_unique_id(), .path_info = index_path}); variant_util::inherit_index(parent_index, sub_column_info.indexes, target_column); + for (const auto& index : sub_column_info.indexes) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=parent_inherited logical_path={} relative_path={} " + "materialized_column={} index_id={} suffix={} field_pattern={} " + "reason=no_direct_subcolumn_index", + logical_path, relative_path_str, column.name(), index->index_id(), + index->get_index_suffix(), index->field_pattern())); + } + } else if (parent_index.empty()) { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=none logical_path={} relative_path={} materialized_column={} " + "reason=parent_index_missing", + logical_path, relative_path_str, column.name())); + } else { + add_variant_search_binding_diagnostic( + stats, + fmt::format("[VariantSearchBinding] phase=subcolumn_index_candidates " + "source=none logical_path={} relative_path={} materialized_column={} " + "data_type={} reason=unsupported_inherited_index_type", + logical_path, relative_path_str, column.name(), + index_data_type ? index_data_type->get_name() : "null")); } // Return shared_ptr directly to maintain object lifetime return sub_column_info.indexes; diff --git a/be/src/storage/segment/variant/variant_column_reader.h b/be/src/storage/segment/variant/variant_column_reader.h index d9f35730e62efd..af9249e3e2ce81 100644 --- a/be/src/storage/segment/variant/variant_column_reader.h +++ b/be/src/storage/segment/variant/variant_column_reader.h @@ -52,6 +52,7 @@ namespace doris { class TabletIndex; class StorageReadOptions; class TabletSchema; +struct OlapReaderStatistics; namespace segment_v2 { @@ -215,7 +216,8 @@ class VariantColumnReader : public ColumnReader { // Return shared_ptr to ensure the lifetime of TabletIndex objects TabletIndexes find_subcolumn_tablet_indexes(const TabletColumn& target_column, - const DataTypePtr& data_type); + const DataTypePtr& data_type, + OlapReaderStatistics* stats = nullptr); bool exist_in_sparse_column(const PathInData& path) const; diff --git a/be/test/exprs/function/function_search_nested_test.cpp b/be/test/exprs/function/function_search_nested_test.cpp index b44587ba707d1f..1861e26131bc10 100644 --- a/be/test/exprs/function/function_search_nested_test.cpp +++ b/be/test/exprs/function/function_search_nested_test.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -// Unit tests for FunctionSearch::evaluate_nested_query and NESTED clause handling. +// Unit tests for VariantNestedSearchEvaluator and NESTED clause handling. // Migrated from function_search_test.cpp for maintainability. #include @@ -23,19 +23,154 @@ #include #include +#include #include +#include "common/exception.h" #include "core/block/block.h" #include "exprs/function/function_search.h" +#include "exprs/function/variant_inverted_index_search.h" +#include "storage/index/inverted/query_v2/bit_set_query/bit_set_query.h" +#include "storage/index/inverted/query_v2/query.h" +#include "storage/index/inverted/query_v2/weight.h" #include "storage/segment/variant/nested_group_provider.h" +#include "storage/segment/variant/variant_column_reader.h" namespace doris { +class FakeNestedGroupReadProvider final : public segment_v2::NestedGroupReadProvider { +public: + bool should_enable_nested_group_read_path() const override { return true; } + + Status init_readers(const segment_v2::ColumnReaderOptions&, + const std::shared_ptr&, + const std::shared_ptr&, segment_v2::ColumnMetaAccessor*, + int32_t, uint64_t, segment_v2::NestedGroupReaders&) override { + return Status::NotSupported("not implemented"); + } + + bool try_build_read_plan(const TabletSchema*, const segment_v2::NestedGroupReaders&, + const TabletColumn&, const StorageReadOptions*, int32_t, + const PathInData&, bool*, DataTypePtr*, PathInData*, std::string*, + std::string*, std::vector*, + std::optional*) const override { + return false; + } + + Status create_nested_group_iterator(bool, + const std::vector&, + const std::string&, const std::string&, + const std::optional&, + segment_v2::ColumnIteratorUPtr*, DataTypePtr*) override { + return Status::NotSupported("not implemented"); + } + + Status get_total_elements(const segment_v2::ColumnIteratorOptions&, + const segment_v2::NestedGroupReader*, uint64_t*) const override { + return Status::NotSupported("not implemented"); + } + + Status create_root_merge_iterator(segment_v2::ColumnIteratorUPtr, + const segment_v2::NestedGroupReaders&, + const StorageReadOptions*, + segment_v2::ColumnIteratorUPtr*) override { + return Status::NotSupported("not implemented"); + } + + Status map_elements_to_parent_ords(const std::vector&, + const segment_v2::ColumnIteratorOptions&, + const roaring::Roaring& element_bitmap, + roaring::Roaring* parent_bitmap) const override { + for (auto doc : element_bitmap) { + parent_bitmap->add(doc / 2); + } + return Status::OK(); + } +}; + +class ErrorNestedGroupReadProvider final : public segment_v2::NestedGroupReadProvider { +public: + bool should_enable_nested_group_read_path() const override { return true; } + + Status init_readers(const segment_v2::ColumnReaderOptions&, + const std::shared_ptr&, + const std::shared_ptr&, segment_v2::ColumnMetaAccessor*, + int32_t, uint64_t, segment_v2::NestedGroupReaders&) override { + return Status::NotSupported("not implemented"); + } + + bool try_build_read_plan(const TabletSchema*, const segment_v2::NestedGroupReaders&, + const TabletColumn&, const StorageReadOptions*, int32_t, + const PathInData&, bool*, DataTypePtr*, PathInData*, std::string*, + std::string*, std::vector*, + std::optional*) const override { + return false; + } + + Status create_nested_group_iterator(bool, + const std::vector&, + const std::string&, const std::string&, + const std::optional&, + segment_v2::ColumnIteratorUPtr*, DataTypePtr*) override { + return Status::NotSupported("not implemented"); + } + + Status get_total_elements(const segment_v2::ColumnIteratorOptions&, + const segment_v2::NestedGroupReader*, uint64_t*) const override { + return Status::NotSupported("not implemented"); + } + + Status create_root_merge_iterator(segment_v2::ColumnIteratorUPtr, + const segment_v2::NestedGroupReaders&, + const StorageReadOptions*, + segment_v2::ColumnIteratorUPtr*) override { + return Status::NotSupported("not implemented"); + } + + Status map_elements_to_parent_ords(const std::vector&, + const segment_v2::ColumnIteratorOptions&, + const roaring::Roaring&, roaring::Roaring*) const override { + return Status::InternalError("forced mapping failure"); + } +}; + +class NullWeightQuery final : public inverted_index::query_v2::Query { +public: + inverted_index::query_v2::WeightPtr weight(bool) override { return nullptr; } +}; + +class NullScorerWeight final : public inverted_index::query_v2::Weight { +public: + inverted_index::query_v2::ScorerPtr scorer( + const inverted_index::query_v2::QueryExecutionContext&, + const std::string& = {}) override { + return nullptr; + } +}; + +class NullScorerQuery final : public inverted_index::query_v2::Query { +public: + inverted_index::query_v2::WeightPtr weight(bool) override { + return std::make_shared(); + } +}; + class FunctionSearchNestedTest : public testing::Test { public: void SetUp() override { function_search = std::make_shared(); } protected: + Status evaluate_nested_query( + const TSearchParam& search_param, const TSearchClause& nested_clause, + const std::shared_ptr& context, FieldReaderResolver& resolver, + uint32_t num_rows, const IndexExecContext* index_exec_ctx, + const std::unordered_map& field_name_to_column_id, + std::shared_ptr& result_bitmap) { + VariantNestedSearchEvaluator evaluator(*function_search); + return evaluator.evaluate(search_param, nested_clause, context, resolver, num_rows, + index_exec_ctx, field_name_to_column_id, result_bitmap); + } + std::shared_ptr function_search; }; @@ -88,6 +223,187 @@ TEST_F(FunctionSearchNestedTest, NestedClauseMustBeTopLevel) { std::string::npos); } +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryMapsTruthAndNullBitmaps) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); + true_bitmap->add(4); + auto null_bitmap = std::make_shared(); + null_bitmap->add(7); + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + + roaring::Roaring actual_true; + uint32_t doc = scorer->doc(); + while (doc != inverted_index::query_v2::TERMINATED) { + actual_true.add(doc); + doc = scorer->advance(); + } + + EXPECT_TRUE(actual_true.contains(1)); + EXPECT_TRUE(actual_true.contains(2)); + EXPECT_EQ(2, actual_true.cardinality()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* actual_null = scorer->get_null_bitmap(); + ASSERT_NE(nullptr, actual_null); + EXPECT_TRUE(actual_null->contains(3)); + EXPECT_EQ(1, actual_null->cardinality()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryReturnsChildWhenNoMappingChain) { + auto child_query = std::make_shared(roaring::Roaring()); + + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, {}, nullptr, + segment_v2::ColumnIteratorOptions {}); + + EXPECT_EQ(child_query, mapped_query); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryHandlesNullChildWeight) { + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + FakeNestedGroupReadProvider read_provider; + auto mapped_query = make_variant_nested_doc_mapping_query(std::make_shared(), + chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryHandlesNullChildScorer) { + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + FakeNestedGroupReadProvider read_provider; + auto mapped_query = make_variant_nested_doc_mapping_query(std::make_shared(), + chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryDropsNullsCoveredByTrueHits) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); // maps to parent 1 + auto null_bitmap = std::make_shared(); + null_bitmap->add(3); // also maps to parent 1, then is removed from null bitmap + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(1u, scorer->doc()); + EXPECT_FALSE(scorer->has_null_bitmap()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryReturnsEmptyForEmptyChildResult) { + auto child_query = std::make_shared(roaring::Roaring()); + + FakeNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryThrowsOnMappingError) { + auto true_bitmap = std::make_shared(); + true_bitmap->add(2); + auto child_query = std::make_shared(true_bitmap); + + ErrorNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + EXPECT_THROW((void)weight->scorer(exec_ctx), Exception); +} + +TEST_F(FunctionSearchNestedTest, NestedDocMappingQueryThrowsOnNullBitmapMappingError) { + auto true_bitmap = std::make_shared(); + auto null_bitmap = std::make_shared(); + null_bitmap->add(3); + auto child_query = + std::make_shared(true_bitmap, null_bitmap); + + ErrorNestedGroupReadProvider read_provider; + segment_v2::NestedGroupReader nested_group; + std::vector chain {&nested_group}; + auto mapped_query = make_variant_nested_doc_mapping_query(child_query, chain, &read_provider, + segment_v2::ColumnIteratorOptions {}); + + auto weight = mapped_query->weight(false); + ASSERT_NE(nullptr, weight); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + EXPECT_THROW((void)weight->scorer(exec_ctx), Exception); +} + +TEST_F(FunctionSearchNestedTest, VariantNestedLeafMapperEarlyExitBranches) { + inverted_index::query_v2::QueryPtr query = + std::make_shared(roaring::Roaring()); + auto original_query = query; + VariantNestedDocMapperContext mapper_context; + + ASSERT_TRUE( + map_variant_nested_leaf_query_to_active_group(mapper_context, "data.items.msg", &query) + .ok()); + EXPECT_EQ(original_query, query); + + segment_v2::NestedGroupReader nested_group; + FakeNestedGroupReadProvider read_provider; + segment_v2::VariantColumnReader variant_reader; + mapper_context.root_field = "data"; + mapper_context.active_group_chain = {&nested_group}; + mapper_context.read_provider = &read_provider; + mapper_context.variant_reader = &variant_reader; + + ASSERT_TRUE(map_variant_nested_leaf_query_to_active_group(mapper_context, "metrics.items.msg", + &query) + .ok()); + EXPECT_EQ(original_query, query); + + ASSERT_TRUE(map_variant_nested_leaf_query_to_active_group(mapper_context, "data", &query).ok()); + EXPECT_EQ(original_query, query); +} + // =========================================================================== // Community-edition fallback: NESTED root → NOT_IMPLEMENTED_ERROR // =========================================================================== @@ -151,9 +467,8 @@ TEST_F(FunctionSearchNestedTest, MissingNestedPath) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing nested_path"), std::string::npos); @@ -178,9 +493,8 @@ TEST_F(FunctionSearchNestedTest, MissingChildren) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing inner query"), std::string::npos); @@ -205,9 +519,8 @@ TEST_F(FunctionSearchNestedTest, EmptyChildrenList) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("missing inner query"), std::string::npos); @@ -237,9 +550,8 @@ TEST_F(FunctionSearchNestedTest, NullExecContext) { std::shared_ptr result_bitmap; std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("IndexExecContext"), std::string::npos); @@ -263,9 +575,8 @@ TEST_F(FunctionSearchNestedTest, InitializesNullResultBitmap) { std::shared_ptr result_bitmap; // nullptr std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); // Should fail (nested_path not set), but no crash on null bitmap EXPECT_FALSE(status.ok()); } @@ -304,9 +615,8 @@ TEST_F(FunctionSearchNestedTest, BitmapClearedAfterPassingValidation) { std::unordered_map field_to_col_id; - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); // Will fail later (null context), but bitmap should be cleared EXPECT_FALSE(status.ok()); ASSERT_NE(nullptr, result_bitmap); @@ -338,9 +648,8 @@ TEST_F(FunctionSearchNestedTest, DottedNestedPath) { std::unordered_map field_to_col_id; // null context → InvalidArgument about segment - auto status = - function_search->evaluate_nested_query(search_param, nested_clause, context, resolver, - 100, nullptr, field_to_col_id, result_bitmap); + auto status = evaluate_nested_query(search_param, nested_clause, context, resolver, 100, + nullptr, field_to_col_id, result_bitmap); EXPECT_FALSE(status.ok()); EXPECT_TRUE(status.is()); EXPECT_NE(status.to_string().find("IndexExecContext"), std::string::npos); diff --git a/be/test/exprs/function/function_search_test.cpp b/be/test/exprs/function/function_search_test.cpp index ba11058ca5fce9..ac57847f32e4de 100644 --- a/be/test/exprs/function/function_search_test.cpp +++ b/be/test/exprs/function/function_search_test.cpp @@ -21,12 +21,21 @@ #include #include +#include #include #include #include +#include #include "core/block/block.h" +#include "core/data_type/data_type_array.h" +#include "core/data_type/data_type_nullable.h" +#include "core/data_type/data_type_string.h" +#include "core/data_type/primitive_type.h" +#include "storage/index/index_file_reader.h" #include "storage/index/index_iterator.h" +#include "storage/index/inverted/inverted_index_iterator.h" +#include "storage/index/inverted/inverted_index_parser.h" #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_query.h" #include "storage/index/inverted/query_v2/phrase_query/multi_phrase_weight.h" #include "storage/index/inverted/query_v2/phrase_query/phrase_query.h" @@ -60,6 +69,103 @@ class DummyIndexIterator : public segment_v2::IndexIterator { Result has_null() override { return false; } }; +class RecordingIndexIterator : public segment_v2::IndexIterator { +public: + segment_v2::IndexReaderPtr get_reader( + segment_v2::IndexReaderType /*reader_type*/) const override { + return nullptr; + } + + Status read_from_index(const segment_v2::IndexParam& param) override { + auto* i_param_ptr = std::get_if(¶m); + if (i_param_ptr == nullptr || *i_param_ptr == nullptr) { + return Status::InvalidArgument("missing inverted index param"); + } + auto* i_param = *i_param_ptr; + last_column_name = i_param->column_name; + last_column_storage_type = i_param->column_type == nullptr + ? FieldType::OLAP_FIELD_TYPE_UNKNOWN + : i_param->column_type->get_storage_field_type(); + last_query_type = i_param->query_type; + last_query_value_type = i_param->query_value.get_type(); + if (i_param->query_value.get_type() == TYPE_BOOLEAN) { + last_bool_value = i_param->query_value.get(); + } + if (i_param->query_value.get_type() == TYPE_INT) { + last_int_value = i_param->query_value.get(); + } + if (i_param->roaring != nullptr) { + i_param->roaring->add(3); + } + return Status::OK(); + } + + Status read_null_bitmap(segment_v2::InvertedIndexQueryCacheHandle* /*cache_handle*/) override { + return Status::OK(); + } + + Result has_null() override { return false; } + + std::string last_column_name; + FieldType last_column_storage_type = FieldType::OLAP_FIELD_TYPE_UNKNOWN; + segment_v2::InvertedIndexQueryType last_query_type = + segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY; + PrimitiveType last_query_value_type = PrimitiveType::TYPE_NULL; + bool last_bool_value = false; + Int32 last_int_value = 0; +}; + +class DummyInvertedIndexReader final : public segment_v2::InvertedIndexReader { +public: + explicit DummyInvertedIndexReader(const TabletIndex* index_meta) + : segment_v2::InvertedIndexReader(index_meta, nullptr) {} + + DummyInvertedIndexReader(const TabletIndex* index_meta, + std::shared_ptr index_file_reader, + segment_v2::InvertedIndexReaderType reader_type) + : segment_v2::InvertedIndexReader(index_meta, std::move(index_file_reader)), + _reader_type(reader_type) {} + + Status new_iterator(std::unique_ptr* /*iterator*/) override { + return Status::OK(); + } + + Status query(const segment_v2::IndexQueryContextPtr& /*context*/, + const std::string& /*column_name*/, const Field& /*query_value*/, + segment_v2::InvertedIndexQueryType /*query_type*/, + std::shared_ptr& /*bit_map*/, + const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/ = nullptr) override { + return Status::OK(); + } + + Status try_query(const segment_v2::IndexQueryContextPtr& /*context*/, + const std::string& /*column_name*/, const Field& /*query_value*/, + segment_v2::InvertedIndexQueryType /*query_type*/, + size_t* /*count*/) override { + return Status::OK(); + } + + segment_v2::InvertedIndexReaderType type() override { return _reader_type; } + +private: + segment_v2::InvertedIndexReaderType _reader_type = segment_v2::InvertedIndexReaderType::BKD; +}; + +static TabletIndex make_test_inverted_index( + int64_t index_id, const std::map& properties = {}) { + TabletIndex index_meta; + TabletIndexPB pb; + pb.set_index_type(IndexType::INVERTED); + pb.set_index_id(index_id); + pb.set_index_name("test_index_" + std::to_string(index_id)); + pb.add_col_unique_id(1); + for (const auto& [key, value] : properties) { + (*pb.mutable_properties())[key] = value; + } + index_meta.init_from_pb(pb); + return index_meta; +} + TEST_F(FunctionSearchTest, TestGetName) { EXPECT_EQ("search", function_search->get_name()); } @@ -1630,6 +1736,363 @@ TEST_F(FunctionSearchTest, TestBuildLeafQueryPhrase) { EXPECT_NE(phrase_query, nullptr); } +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantMissingFieldReturnsUnknown) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.missing"; + clause.value = "value"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + std::unordered_map iterators; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.missing"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + bool mapper_called = false; + resolver.set_leaf_query_mapper([&](const std::string& logical_field, + inverted_index::query_v2::QueryPtr* query) -> Status { + mapper_called = true; + EXPECT_EQ("var.items.missing", logical_field); + EXPECT_NE(nullptr, query); + EXPECT_NE(nullptr, *query); + return Status::OK(); + }); + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 5); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_TRUE(mapper_called); + EXPECT_TRUE(out_binding_key.empty()); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 5; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(null_bitmap, nullptr); + EXPECT_EQ(5u, null_bitmap->cardinality()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantSubcolumnWithMissingIterator) { + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + std::unordered_map iterators; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + ASSERT_TRUE(status.ok()); + EXPECT_FALSE(binding.is_bound()); + EXPECT_TRUE(resolver.binding_cache().empty()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantSubcolumnWithReaderSelectionError) { + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + + segment_v2::InvertedIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.level"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(ErrorCode::INVERTED_INDEX_NO_TERMS, status.code()); +} + +TEST_F(FunctionSearchTest, + TestFieldReaderResolverVariantAnalyzerUpgradeWithMissingIndexFileReader) { + auto context = std::make_shared(); + + std::map properties; + properties[INVERTED_INDEX_PARSER_KEY] = INVERTED_INDEX_PARSER_STANDARD; + auto index_meta = make_test_inverted_index(11, properties); + auto reader = std::make_shared( + &index_meta, nullptr, segment_v2::InvertedIndexReaderType::FULLTEXT); + + segment_v2::InvertedIndexIterator iterator; + iterator.add_reader(segment_v2::InvertedIndexReaderType::FULLTEXT, reader); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.msg", + IndexFieldNameAndTypePair {"1.var.items.msg", std::make_shared()}); + std::unordered_map iterators; + iterators["var.items.msg"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.msg"; + field_binding.is_variant_subcolumn = true; + field_binding.index_properties = properties; + field_binding.__isset.is_variant_subcolumn = true; + field_binding.__isset.index_properties = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = resolver.resolve("var.items.msg", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + EXPECT_FALSE(status.ok()); + EXPECT_EQ(ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND, status.code()); +} + +TEST_F(FunctionSearchTest, TestFieldReaderResolverVariantBkdDirectReader) { + auto context = std::make_shared(); + + auto index_meta = make_test_inverted_index(12); + auto index_file_reader = std::make_shared( + nullptr, "/tmp/variant_direct_idx", InvertedIndexStorageFormatPB::V2); + auto reader = std::make_shared( + &index_meta, index_file_reader, segment_v2::InvertedIndexReaderType::BKD); + + segment_v2::InvertedIndexIterator iterator; + iterator.add_reader(segment_v2::InvertedIndexReaderType::BKD, reader); + + std::unordered_map data_type_with_names; + data_type_with_names.emplace( + "var.items.level", + IndexFieldNameAndTypePair {"1.var.items.level", std::make_shared()}); + std::unordered_map iterators; + iterators["var.items.level"] = &iterator; + + TSearchFieldBinding field_binding; + field_binding.field_name = "var.items.level"; + field_binding.is_variant_subcolumn = true; + field_binding.__isset.is_variant_subcolumn = true; + + FieldReaderResolver resolver(data_type_with_names, iterators, context, {field_binding}); + FieldReaderBinding binding; + auto status = + resolver.resolve("var.items.level", InvertedIndexQueryType::EQUAL_QUERY, &binding); + + ASSERT_TRUE(status.ok()) << status.to_string(); + EXPECT_TRUE(binding.use_direct_index_reader()); + EXPECT_EQ(reader, binding.inverted_reader); + EXPECT_EQ("var.items.level", binding.logical_field_name); + EXPECT_EQ("1.var.items.level", binding.stored_field_name); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, binding.query_type); + + const auto& cache = resolver.binding_cache(); + ASSERT_EQ(1u, cache.size()); + EXPECT_TRUE(cache.begin()->second.use_direct_index_reader()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryDirectUnknownClauseUsesLeafMapper) { + TSearchClause clause; + clause.clause_type = "PHRASE"; + clause.field_name = "var.items.active"; + clause.value = "true"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto bool_type = + std::make_shared(make_nullable(std::make_shared())); + data_type_with_names.emplace("var.items.active", + IndexFieldNameAndTypePair {"1.var.items.active", bool_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.active"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.active"; + binding.stored_field_name = "1.var.items.active"; + binding.stored_field_wstr = L"1.var.items.active"; + binding.column_type = bool_type; + binding.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = resolver.binding_key_for("1.var.items.active", + InvertedIndexQueryType::MATCH_PHRASE_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + bool mapper_called = false; + resolver.set_leaf_query_mapper([&](const std::string& logical_field, + inverted_index::query_v2::QueryPtr* query) -> Status { + mapper_called = true; + EXPECT_EQ("var.items.active", logical_field); + EXPECT_NE(nullptr, query); + EXPECT_NE(nullptr, *query); + return Status::OK(); + }); + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 4); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_TRUE(mapper_called); + EXPECT_EQ(key, out_binding_key); + EXPECT_TRUE(iterator.last_column_name.empty()); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 4; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(inverted_index::query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + const auto* null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(null_bitmap, nullptr); + EXPECT_EQ(4u, null_bitmap->cardinality()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantBoolUsesDirectIndexReader) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.active"; + clause.value = "true"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto bool_type = + std::make_shared(make_nullable(std::make_shared())); + data_type_with_names.emplace("var.items.active", + IndexFieldNameAndTypePair {"1.var.items.active", bool_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.active"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.active"; + binding.stored_field_name = "1.var.items.active"; + binding.stored_field_wstr = L"1.var.items.active"; + binding.column_type = bool_type; + binding.query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = + resolver.binding_key_for("1.var.items.active", InvertedIndexQueryType::MATCH_ANY_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 10); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_EQ(key, out_binding_key); + EXPECT_EQ("1.var.items.active", iterator.last_column_name); + EXPECT_EQ(FieldType::OLAP_FIELD_TYPE_BOOL, iterator.last_column_storage_type); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, iterator.last_query_type); + EXPECT_EQ(TYPE_BOOLEAN, iterator.last_query_value_type); + EXPECT_TRUE(iterator.last_bool_value); + + auto weight = out->weight(false); + ASSERT_NE(weight, nullptr); + inverted_index::query_v2::QueryExecutionContext exec_ctx; + exec_ctx.segment_num_rows = 10; + auto scorer = weight->scorer(exec_ctx, out_binding_key); + ASSERT_NE(scorer, nullptr); + EXPECT_EQ(3u, scorer->doc()); +} + +TEST_F(FunctionSearchTest, TestBuildLeafQueryVariantNestedIntUsesDirectIndexReader) { + TSearchClause clause; + clause.clause_type = "TERM"; + clause.field_name = "var.items.flags.level"; + clause.value = "3"; + clause.__isset.field_name = true; + clause.__isset.value = true; + + auto context = std::make_shared(); + + std::unordered_map data_type_with_names; + auto int_type = std::make_shared(make_nullable( + std::make_shared(make_nullable(std::make_shared())))); + data_type_with_names.emplace("var.items.flags.level", + IndexFieldNameAndTypePair {"1.var.items.flags.level", int_type}); + + RecordingIndexIterator iterator; + std::unordered_map iterators; + iterators["var.items.flags.level"] = &iterator; + + FieldReaderResolver resolver(data_type_with_names, iterators, context); + + FieldReaderBinding binding; + binding.logical_field_name = "var.items.flags.level"; + binding.stored_field_name = "1.var.items.flags.level"; + binding.stored_field_wstr = L"1.var.items.flags.level"; + binding.column_type = int_type; + binding.query_type = InvertedIndexQueryType::MATCH_ANY_QUERY; + binding.state = SearchFieldBindingState::BOUND; + TabletIndex index_meta; + binding.inverted_reader = std::make_shared(&index_meta); + + std::string key = resolver.binding_key_for("1.var.items.flags.level", + InvertedIndexQueryType::MATCH_ANY_QUERY); + binding.binding_key = key; + resolver._cache[key] = binding; + + inverted_index::query_v2::QueryPtr out; + std::string out_binding_key; + Status st = function_search->build_leaf_query(clause, context, resolver, &out, &out_binding_key, + "OR", 0, 10); + ASSERT_TRUE(st.ok()); + ASSERT_NE(out, nullptr); + EXPECT_EQ(key, out_binding_key); + EXPECT_EQ("1.var.items.flags.level", iterator.last_column_name); + EXPECT_EQ(FieldType::OLAP_FIELD_TYPE_INT, iterator.last_column_storage_type); + EXPECT_EQ(InvertedIndexQueryType::EQUAL_QUERY, iterator.last_query_type); + EXPECT_EQ(TYPE_INT, iterator.last_query_value_type); + EXPECT_EQ(3, iterator.last_int_value); +} + TEST_F(FunctionSearchTest, TestMultiPhraseQueryCase) { using doris::segment_v2::InvertedIndexQueryInfo; using doris::segment_v2::TermInfo; diff --git a/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp b/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp index eb965d49db8d05..c9ceaba5288399 100644 --- a/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp +++ b/be/test/storage/index/inverted/query_v2/boolean_query_test.cpp @@ -40,6 +40,28 @@ namespace doris::segment_v2 { using namespace inverted_index; +TEST(BitSetQueryTest, EmptyTruthBitmapPreservesNullBitmap) { + auto true_bitmap = std::make_shared(); + auto null_bitmap = std::make_shared(); + null_bitmap->addRange(0, 4); + + query_v2::BitSetQuery query(std::move(true_bitmap), std::move(null_bitmap)); + auto weight = query.weight(false); + ASSERT_NE(nullptr, weight); + + query_v2::QueryExecutionContext exec_ctx; + auto scorer = weight->scorer(exec_ctx); + ASSERT_NE(nullptr, scorer); + EXPECT_EQ(query_v2::TERMINATED, scorer->doc()); + ASSERT_TRUE(scorer->has_null_bitmap()); + + const auto* scorer_null_bitmap = scorer->get_null_bitmap(); + ASSERT_NE(nullptr, scorer_null_bitmap); + EXPECT_EQ(4, scorer_null_bitmap->cardinality()); + EXPECT_TRUE(scorer_null_bitmap->contains(0)); + EXPECT_TRUE(scorer_null_bitmap->contains(3)); +} + class BooleanQueryTest : public testing::Test { public: const std::string kTestDir1 = "./ut_dir/query_test1";