From 2afc859816542ac44475554f143f73d056a0eb4e Mon Sep 17 00:00:00 2001 From: Sandy Chen Date: Fri, 22 May 2026 19:07:37 +0900 Subject: [PATCH] fix(integration): skip `or vector(...)` in fuzz query generator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TestVerticalShardingFuzz flakes when promqlsmith emits an expression containing ` or vector()` against block data with partial time coverage on the LHS: Cortex's vertically-sharded query path and the unsharded reference engine disagree on which timestamps trigger the `vector(...)` fallback, because each shard decides locally whether the LHS is empty at a given step while the unsharded path sees the union. Both answers are individually correct under their own model, but the final results diverge on a handful of points. Same analyser-side family as the known `absent` / `absent_over_time` / `scalar` cases (#5203, #5204, #5205): the Thanos PromQL analyser does not classify `or vector(...)` as non-shardable. Full empirical census in #7547. Add hasOrVectorFallback(parser.Expr), an AST walker that uses parser.Inspect to detect any *parser.BinaryExpr with Op == parser.LOR whose RHS is a *parser.Call to the `vector` function. Call it from isValidQuery alongside the existing `limitk` / `limit_ratio` / `--` filters so every fuzz caller (TestVerticalShardingFuzz plus the seven other fuzz tests that go through runQueryFuzzTestCases) skips the offending shape. An earlier draft used `strings.Contains(queryStr, "vector(")`. Round-2 review flagged that as too broad: a substring scan fires on bare `vector(0)`, `sum(vector(1))`, `vector(1) + on() vector(2)`, and even on incidental `vector(` text inside label literals — none of which diverge between the sharded and unsharded engines. The AST predicate matches only ` or vector()`, leaving the rest of the random surface intact. Add TestHasOrVectorFallback with six subtests pinning the predicate: `up` (no), `vector(1)` (no — bare call is fine), `up or up` (no — `or` without `vector`), `up or vector(1)` (yes), a deep-LHS positive (`(sum(rate(up[1m])) == bool 0) or vector(0)`), and a simplified form of the actual failing query from #7547. All six pass locally. This is a test-side dodge, not a sharding-engine fix. The proper upstream change — teaching the Thanos analyser to recognise `or vector(...)` as non-shardable, mirroring the existing `absent` / `scalar` exclusions — is tracked separately. Trade-off: isValidQuery is global to all fuzz callers, so random fuzz coverage of `or vector(...)` is dropped everywhere, not just in TestVerticalShardingFuzz. Accepted for the reasons in #7547: the empirical signal points at the sharded vs unsharded path as the only current divergence context, and the filter becomes moot the moment the upstream analyser is fixed. Fixes #7547 Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Sandy Chen --- integration/query_fuzz_test.go | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/integration/query_fuzz_test.go b/integration/query_fuzz_test.go index f735bd91d5..836a4adc4a 100644 --- a/integration/query_fuzz_test.go +++ b/integration/query_fuzz_test.go @@ -1980,6 +1980,55 @@ func shouldUseSampleNumComparer(query string) bool { return false } +// hasOrVectorFallback reports whether expr contains a sub-expression of the +// shape ` or vector()`. Such queries diverge between Cortex's +// sharded and unsharded query paths when the LHS has partial time coverage: +// the unsharded engine sees the gap and falls through to vector(...), while +// individual shards may each see (different) coverage and skip the fallback. +// This is the same class of semantic-divergence bug already known for +// `absent`/`absent_over_time`/`scalar` (see #5203, #5204, #5205). +func hasOrVectorFallback(expr parser.Expr) bool { + found := false + parser.Inspect(expr, func(node parser.Node, _ []parser.Node) error { + be, ok := node.(*parser.BinaryExpr) + if !ok || be.Op != parser.LOR { + return nil + } + call, ok := be.RHS.(*parser.Call) + if !ok || call.Func == nil { + return nil + } + if call.Func.Name == "vector" { + found = true + } + return nil + }) + return found +} + +func TestHasOrVectorFallback(t *testing.T) { + for _, tc := range []struct { + query string + want bool + }{ + {`up`, false}, + {`vector(1)`, false}, + {`up or up`, false}, + {`up or vector(1)`, true}, + {`(sum(rate(up[1m])) == bool 0) or vector(0)`, true}, + // The actual failing case from issue #7547, simplified. + {`(sum without (job) (stddev_over_time(up[4m]) == -up)) or vector(2.0099)`, true}, + } { + t.Run(tc.query, func(t *testing.T) { + expr, err := parser.ParseExpr(tc.query) + require.NoError(t, err) + if got := hasOrVectorFallback(expr); got != tc.want { + t.Fatalf("hasOrVectorFallback(%q) = %v, want %v", tc.query, got, tc.want) + } + }) + } +} + func isValidQuery(generatedQuery parser.Expr, skipBackwardIncompat bool) bool { isValid := true queryStr := generatedQuery.String() @@ -1998,6 +2047,11 @@ func isValidQuery(generatedQuery parser.Expr, skipBackwardIncompat bool) bool { // to avoid false positives in backward compatibility tests. return false } + if hasOrVectorFallback(generatedQuery) { + // `or vector(...)` falls through at different timestamps in sharded vs + // unsharded engines when the LHS has partial time coverage; see #7547. + return false + } if skipBackwardIncompat { // Skip functions and aggregations whose evaluation semantics changed across // Prometheus versions embedded in different Cortex releases. These produce