Introduce Symbolic Constraint Solver for SQL-Driven Data Generation #564
Conversation
There was a problem hiding this comment.
Halfway through the README.md. Will continue reading and then proceed to the code.
How does the system in general handle expressions where the values depend on each other?
E.g.:
SELECT * FROM test.suitcase WHERE width + height + length < 25
Does this need a new domain type?
| /** | ||
| * Copyright 2025 LinkedIn Corporation. All rights reserved. | ||
| * Licensed under the BSD-2 Clause license. | ||
| * See LICENSE in the project root for license information. | ||
| */ | ||
| package com.linkedin.coral.datagen.domain; | ||
|
|
||
| import java.util.Arrays; | ||
| import java.util.List; | ||
|
|
||
| import org.testng.annotations.Test; | ||
|
|
||
|
|
||
| /** | ||
| * Tests for IntegerDomain class. | ||
| */ | ||
| public class IntegerDomainTest { | ||
|
|
||
| @Test | ||
| public void testSingleValue() { | ||
| System.out.println("\n=== Single Value Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(42); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Is empty: " + domain.isEmpty()); | ||
| System.out.println("Contains 42: " + domain.contains(42)); | ||
| System.out.println("Contains 43: " + domain.contains(43)); | ||
| System.out.println("Samples: " + domain.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testSingleInterval() { | ||
| System.out.println("\n=== Single Interval Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Contains 10: " + domain.contains(10)); | ||
| System.out.println("Contains 15: " + domain.contains(15)); | ||
| System.out.println("Contains 20: " + domain.contains(20)); | ||
| System.out.println("Contains 21: " + domain.contains(21)); | ||
| System.out.println("Samples: " + domain.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultipleIntervals() { | ||
| System.out.println("\n=== Multiple Intervals Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 5), | ||
| new IntegerDomain.Interval(10, 15), new IntegerDomain.Interval(20, 30)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Domain: " + domain); | ||
| System.out.println("Contains 3: " + domain.contains(3)); | ||
| System.out.println("Contains 7: " + domain.contains(7)); | ||
| System.out.println("Contains 12: " + domain.contains(12)); | ||
| System.out.println("Contains 25: " + domain.contains(25)); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testIntersection() { | ||
| System.out.println("\n=== Intersection Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 20); | ||
| IntegerDomain domain2 = IntegerDomain.of(10, 30); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Samples: " + intersection.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testUnion() { | ||
| System.out.println("\n=== Union Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 10); | ||
| IntegerDomain domain2 = IntegerDomain.of(20, 30); | ||
| IntegerDomain union = domain1.union(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Union: " + union); | ||
| System.out.println("Samples: " + union.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAddConstant() { | ||
| System.out.println("\n=== Add Constant Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain shifted = domain.add(5); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After adding 5: " + shifted); | ||
| System.out.println("Samples: " + shifted.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultiplyConstant() { | ||
| System.out.println("\n=== Multiply Constant Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain scaled = domain.multiply(2); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After multiplying by 2: " + scaled); | ||
| System.out.println("Samples: " + scaled.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testNegativeMultiply() { | ||
| System.out.println("\n=== Negative Multiply Test ==="); | ||
| IntegerDomain domain = IntegerDomain.of(10, 20); | ||
| IntegerDomain scaled = domain.multiply(-1); | ||
| System.out.println("Original domain: " + domain); | ||
| System.out.println("After multiplying by -1: " + scaled); | ||
| System.out.println("Samples: " + scaled.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testOverlappingIntervalsMerge() { | ||
| System.out.println("\n=== Overlapping Intervals Merge Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 10), | ||
| new IntegerDomain.Interval(5, 15), new IntegerDomain.Interval(20, 30)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Input intervals: [1, 10], [5, 15], [20, 30]"); | ||
| System.out.println("Merged domain: " + domain); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testAdjacentIntervalsMerge() { | ||
| System.out.println("\n=== Adjacent Intervals Merge Test ==="); | ||
| List<IntegerDomain.Interval> intervals = Arrays.asList(new IntegerDomain.Interval(1, 10), | ||
| new IntegerDomain.Interval(11, 20), new IntegerDomain.Interval(30, 40)); | ||
| IntegerDomain domain = IntegerDomain.of(intervals); | ||
| System.out.println("Input intervals: [1, 10], [11, 20], [30, 40]"); | ||
| System.out.println("Merged domain: " + domain); | ||
| System.out.println("Samples: " + domain.sampleValues(10)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testEmptyDomain() { | ||
| System.out.println("\n=== Empty Domain Test ==="); | ||
| IntegerDomain empty = IntegerDomain.empty(); | ||
| System.out.println("Empty domain: " + empty); | ||
| System.out.println("Is empty: " + empty.isEmpty()); | ||
| System.out.println("Samples: " + empty.sampleValues(5)); | ||
| } | ||
|
|
||
| @Test | ||
| public void testIntersectionEmpty() { | ||
| System.out.println("\n=== Intersection Empty Test ==="); | ||
| IntegerDomain domain1 = IntegerDomain.of(1, 10); | ||
| IntegerDomain domain2 = IntegerDomain.of(20, 30); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Is empty: " + intersection.isEmpty()); | ||
| } | ||
|
|
||
| @Test | ||
| public void testComplexArithmetic() { | ||
| System.out.println("\n=== Complex Arithmetic Test ==="); | ||
| // Solve: 2*x + 5 = 25, where x in [0, 100] | ||
| // => 2*x = 20 | ||
| // => x = 10 | ||
| IntegerDomain output = IntegerDomain.of(25); | ||
| IntegerDomain afterSubtract = output.add(-5); // x = 20 | ||
| IntegerDomain solution = afterSubtract.multiply(1).intersect(IntegerDomain.of(0, 100)); | ||
|
|
||
| System.out.println("Equation: 2*x + 5 = 25"); | ||
| System.out.println("Output domain: " + output); | ||
| System.out.println("After subtracting 5: " + afterSubtract); | ||
| System.out.println("Solution (x must be in [0, 100]): " + solution); | ||
|
|
||
| // Verify | ||
| if (!solution.isEmpty()) { | ||
| long x = solution.sampleValues(1).get(0); | ||
| System.out.println("Sample x: " + x); | ||
| System.out.println("Verification: 2*" + x + " + 5 = " + (2 * x + 5)); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
| public void testMultiIntervalIntersection() { | ||
| System.out.println("\n=== Multi-Interval Intersection Test ==="); | ||
| List<IntegerDomain.Interval> intervals1 = | ||
| Arrays.asList(new IntegerDomain.Interval(1, 20), new IntegerDomain.Interval(30, 50)); | ||
| List<IntegerDomain.Interval> intervals2 = | ||
| Arrays.asList(new IntegerDomain.Interval(10, 35), new IntegerDomain.Interval(45, 60)); | ||
|
|
||
| IntegerDomain domain1 = IntegerDomain.of(intervals1); | ||
| IntegerDomain domain2 = IntegerDomain.of(intervals2); | ||
| IntegerDomain intersection = domain1.intersect(domain2); | ||
|
|
||
| System.out.println("Domain 1: " + domain1); | ||
| System.out.println("Domain 2: " + domain2); | ||
| System.out.println("Intersection: " + intersection); | ||
| System.out.println("Expected: [10, 20] ∪ [30, 35] ∪ [45, 50]"); | ||
| System.out.println("Samples: " + intersection.sampleValues(15)); | ||
| } | ||
| } |
There was a problem hiding this comment.
These tests don't have assertions. Some other files have tests like these too.
| @Test | ||
| public void testArithmeticExpression() { | ||
| testDomainInference("Arithmetic Expression Test", "SELECT * FROM test.T WHERE age * 2 + 5 = 25", inputDomain -> { | ||
| assertTrue(inputDomain instanceof IntegerDomain, "Should be IntegerDomain"); |
There was a problem hiding this comment.
When there is an error, a new test I'm adding still passes:
@Test
public void testMultiVariateArithmeticExpression() {
testDomainInference("Arithmetic Expression Test", "SELECT * FROM test.suitcase WHERE width + height + length < 25", inputDomain -> {
assertTrue(inputDomain instanceof IntegerDomain, "Should be IntegerDomain");
IntegerDomain intDomain = (IntegerDomain) inputDomain;
System.out.println(intDomain);
assertTrue(intDomain.contains(10), "Should contain 10 (since 10 * 2 + 5 = 25)");
assertTrue(intDomain.contains(10), "Should contain 10 (since 10 * 2 + 5 = 25)");
assertTrue(intDomain.isSingleton(), "Should be singleton");
});
}
There was a problem hiding this comment.
Good catch. The old testDomainInference helper used if guards (if (disjunct instanceof RexCall), if (operator == EQUALS)) that would silently skip the assertion lambda when the structure didn't match, making any test pass vacuously.
I've refactored the helper to replace those if guards with hard assertions (assertTrue(..., "disjunct should be a RexCall"), assertEquals(..., EQUALS, "operator should be EQUALS")), so a test like your testMultiVariateArithmeticExpression example would now fail explicitly at the operator check instead of passing silently.
Thanks for the review. This sounds like a type of "domain propagation" which is used for joins (e.g., when we resolve one variable, we resolve the other based on the relationship between them). However, this case is a bit more complex than a join because all expressions mutually depend on each other, and there is no obvious expression to start from and propagate to the rest. We will tackle this separately. |
| * - Single interval: [10, 20] | ||
| * - Multiple intervals: [1, 5] ∪ [10, 15] ∪ [20, 30] | ||
| */ | ||
| public class IntegerDomain extends Domain<Long, IntegerDomain> { |
There was a problem hiding this comment.
IntegerDomain.class and IntegerDomain$Interval.class have also been committed; they need to be removed.
There was a problem hiding this comment.
Good catch. Thanks. Removed.
| if (max == Long.MAX_VALUE || min == Long.MIN_VALUE) { | ||
| return Long.MAX_VALUE; // Unbounded | ||
| } | ||
| return max - min + 1; |
There was a problem hiding this comment.
Potential overflow: max - min + 1 can overflow for large intervals where neither bound is exactly Long.MIN_VALUE/Long.MAX_VALUE. For example, Interval(Long.MIN_VALUE + 1, Long.MAX_VALUE - 1) bypasses both guards but the arithmetic overflows.
There was a problem hiding this comment.
Now throws exception and handles gracefully.
| } | ||
|
|
||
| public boolean isAdjacent(Interval other) { | ||
| return this.max + 1 == other.min || other.max + 1 == this.min; |
There was a problem hiding this comment.
Potential overflow: this.max + 1 overflows when max == Long.MAX_VALUE (wraps to Long.MIN_VALUE), which could cause two non-adjacent intervals to incorrectly appear adjacent and get merged during normalization.
| { ':', '@' }, // colon to @ (common punctuation) | ||
| { '[', '`' }, // [ to backtick | ||
| { '{', '~' } // { to tilde | ||
| }; |
There was a problem hiding this comment.
Nit: ALPHABET_RANGES is hard to read — requires knowledge of ASCII table gaps. Consider deriving the alphabet programmatically from the printable ASCII range:
private static final String ALPHABET;
static {
char printableStart = ' '; // 0x20 — first printable ASCII character
char printableEnd = '~'; // 0x7E — last printable ASCII character
StringBuilder sb = new StringBuilder();
for (char c = printableStart; c <= printableEnd; c++) {
sb.append(c);
}
ALPHABET = sb.toString();
}
This covers the same set of characters and makes the intent explicit.
There was a problem hiding this comment.
enumerateAllowedChars can then just refer to the above.
There was a problem hiding this comment.
I would keep it simple for now.
| */ | ||
| private RegexDomain(Automaton automaton) { | ||
| this.regex = automaton.toString(); | ||
| this.automaton = automaton; |
There was a problem hiding this comment.
dk.brics Automaton is mutable — methods like determinize() and reduce() can modify internal state. Since intersection() and union() return new instances this is low-risk today, but as a defensive measure consider cloning:
private RegexDomain(Automaton automaton) {
this.automaton = automaton.clone();
this.regex = this.automaton.toString();
}
There was a problem hiding this comment.
I would keep it simple since the above pattern does not exist.
| * Creates a RegexDomain from an existing automaton. | ||
| */ | ||
| private RegexDomain(Automaton automaton) { | ||
| this.regex = automaton.toString(); |
There was a problem hiding this comment.
automaton.toString() returns a debug representation of states/transitions, not a valid regex pattern. This means this.regex here is not equivalent to the regex stored in the public constructor (line 41). This could be confusing if regex is used for display or debugging.
One option is to pass a descriptive string from the call sites, e.g.:
private RegexDomain(Automaton automaton, String regex) { ... }
// In intersect():
return new RegexDomain(intersection, "(" + this.regex + ")&(" + other.regex + ")");
// In union():
return new RegexDomain(union, "(" + this.regex + ")|(" + other.regex + ")");
Up to you whether this is worth addressing now.
There was a problem hiding this comment.
This also affects LowerRegexTransformer: if a RegexDomain produced by intersect()/union() is passed as the output domain, getRegex() returns the debug string. isLiteral() will return false (since the debug string contains special characters), so the transformer silently skips the case-insensitive inversion and returns the domain unchanged. Fixing toString() here would fix that too.
There was a problem hiding this comment.
No longer applicable. The regex String field has been eliminated entirely. RegexDomain is now purely automaton-driven. isLiteral() uses automaton.getFiniteStrings(2) and getLiteralValue() uses automaton.getFiniteStrings(1).
| */ | ||
| @Override | ||
| public List<String> sample(int limit) { | ||
| return sampleStrings(limit, 100); |
There was a problem hiding this comment.
Nit: consider extracting 100 to a named constant (e.g. DEFAULT_MAX_SAMPLE_LENGTH) to make the intent clearer.
There was a problem hiding this comment.
Extracted to DEFAULT_MAX_SAMPLE_LENGTH = 100. sample() now throws IllegalStateException when the automaton is non-empty but DFS yields zero samples.
| */ | ||
| @Override | ||
| public List<String> sample(int limit) { | ||
| return sampleStrings(limit, 100); |
There was a problem hiding this comment.
If the regex only matches strings longer than maxLength (100), this silently returns an empty list. The caller can't distinguish "empty domain" from "domain exists but all valid strings exceed the length cap." Consider logging a warning or throwing when the domain is non-empty but no samples could be generated.
|
|
||
| // Initialize generic domain inference program with all transformers | ||
| program = new DomainInferenceProgram(Arrays.asList(new LowerRegexTransformer(), new SubstringRegexTransformer(), | ||
| new PlusRegexTransformer(), new TimesRegexTransformer(), new CastRegexTransformer())); |
There was a problem hiding this comment.
As new transformers are added, every caller that assembles this list needs to remember to include them — easy to miss silently.
Consider adding a factory method to DomainInferenceProgram as the single source of truth:
public static DomainInferenceProgram withDefaultTransformers() {
return new DomainInferenceProgram(List.of(
new LowerRegexTransformer(),
new SubstringRegexTransformer(),
new PlusRegexTransformer(),
new TimesRegexTransformer(),
new CastRegexTransformer()));
}
You could also add a test that verifies all DomainTransformer implementations are included in the default list, so adding a new transformer without registering it fails a test.
There was a problem hiding this comment.
I was considering whether we need a ServiceLoader, but that seems like overkill.
There was a problem hiding this comment.
Agreed, ServiceLoader is overkill. Added DomainInferenceProgram.withDefaultTransformers() as the single source of truth for the default transformer list.
| * Convenience method for deriving IntegerDomain constraints. | ||
| * Throws if the result is not an IntegerDomain. | ||
| */ | ||
| public IntegerDomain deriveInputInteger(RexNode expr, IntegerDomain outputInteger) { |
There was a problem hiding this comment.
This method doesn't appear to be called anywhere. Consider removing it to avoid dead code, or adding test coverage if it's intended for future use.
|
|
||
| @Override | ||
| public boolean canHandle(RexNode expr) { | ||
| return expr instanceof RexCall && ((RexCall) expr).getOperator() == SqlStdOperatorTable.LOWER; |
There was a problem hiding this comment.
Should we also handle UPPER? The inversion logic is the same — both produce a case-insensitive regex. Consider generalizing this into a CaseRegexTransformer that handles both SqlStdOperatorTable.LOWER and SqlStdOperatorTable.UPPER, to avoid duplicating the class.
There was a problem hiding this comment.
Deferring to a follow-up. The logic is identical, but adding UPPER support can be done cleanly when it's actually needed.
| if (isStringType(targetTypeName) && !isStringType(sourceTypeName)) { | ||
| if (isDateType(sourceTypeName)) { | ||
| // Date to String | ||
| String dateFormatRegex = "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|30)$"; |
There was a problem hiding this comment.
The day-of-month part of this regex should accept 31 too. Right now the alternation ends at 30 ([12][0-9]|30), so a valid date such as 2025-01-31 is rejected.
| // For complex patterns, wrap with case-insensitive flag | ||
| // Note: Java regex doesn't have inline (?i) in automaton library, | ||
| // so we return the pattern as-is and rely on character-level matching | ||
| return outputRegex; |
There was a problem hiding this comment.
Consider adding a disabled test to track this known limitation, so it doesn't get silently forgotten:
@Test(enabled = false, description = "Non-literal LOWER patterns not yet case-insensitive inverted — see LowerRegexTransformer line 74")
public void testLowerWithComplexPattern() {
// LOWER(name) LIKE '%abc%' should produce .*[aA][bB][cC].*
}
This way it shows up in test reports as skipped rather than being invisible.
There was a problem hiding this comment.
The non-literal LOWER fallback (returning as-is for complex patterns like .*abc.*) is a known feature gap. A proper fix would walk the automaton transitions and expand alphabetic ranges to include both cases. Deferring to a follow-up alongside UPPER support.
| return new Output(scans, remapped); | ||
| } | ||
|
|
||
| private static final Map<RexNode, RelNode> predicateOriginMap = new IdentityHashMap<>(); |
There was a problem hiding this comment.
P0 — Thread-safety issue: predicateOriginMap is a static mutable field shared across all invocations. extract() calls .clear() (line 42) then populates it during collection (line 81). If two threads call extract() concurrently, they corrupt each other's data.
Fix: Make it a local variable inside extract() and pass it through as a parameter, or make the class non-static with an instance field.
There was a problem hiding this comment.
Fixed. Made predicateOriginMap a local variable in extract() and passed it as a parameter to collectPredicates(). The class remains fully static with a private constructor — no API change since collectPredicates is private.
| } | ||
| // Remove escape sequences | ||
| return result.replaceAll("\\\\(.)", "$1"); | ||
| } |
There was a problem hiding this comment.
Dead code: unescapeLiteral is defined but never called within this class. Consider removing it.
| } | ||
| // Remove escape sequences | ||
| return result.replaceAll("\\\\(.)", "$1"); | ||
| } |
There was a problem hiding this comment.
Duplicated code: this identical unescapeLiteral method also appears in SubstringRegexTransformer (line 119) and CastRegexTransformer (line 253, dead there). Consider extracting to a shared utility (e.g., RegexUtils.unescapeLiteral).
There was a problem hiding this comment.
Leaving as-is. Duplication across 2 active files is tolerable; the method is small and stable.
| } | ||
| } | ||
|
|
||
| private static class Node { |
There was a problem hiding this comment.
Dead code: inner class Node is never referenced anywhere. Likely a remnant of an earlier BFS-based sampling approach that was replaced by dfsCollect. Consider removing it.
There was a problem hiding this comment.
Gone after refactor.
| return interval.min; | ||
| } | ||
|
|
||
| long range = interval.max - interval.min; |
There was a problem hiding this comment.
Potential overflow: interval.max - interval.min can overflow for ranges exceeding Long.MAX_VALUE. For example, Interval(-1, Long.MAX_VALUE) produces a negative range, which would cause the range < Integer.MAX_VALUE check to behave unexpectedly and lead to incorrect sampling.
| * x + 5 in [20, 30] | ||
| * produces: x in [15, 25] | ||
| */ | ||
| public class PlusRegexTransformer implements DomainTransformer { |
There was a problem hiding this comment.
Misleading name: PlusRegexTransformer operates on IntegerDomain and throws if given RegexDomain. The "Regex" prefix is a historical artifact. Consider renaming to PlusIntegerTransformer. Same applies to TimesRegexTransformer → TimesIntegerTransformer.
There was a problem hiding this comment.
Renamed. PlusRegexTransformer → PlusIntegerTransformer, TimesRegexTransformer → TimesIntegerTransformer.
| @@ -0,0 +1,659 @@ | |||
| /** | |||
There was a problem hiding this comment.
AI-Assisted Review Summary (In addition to the above comments)
Note: This is an automated review comment generated by AI (Claude). It is intended for consumption by both humans and AI agents. Author: please reply with a sub-comment indicating which issues should be fixed (e.g.,
fix: 1, 2, 5 or fix: all or skip: all).
P1 — Should Fix
| # | File | Line | Issue |
|---|---|---|---|
| 1 | SubstringRegexTransformer.java | 33 | Operator name matching uses hardcoded "substr" but Hive's SUBSTRING may be registered as "substring" in Calcite. If mismatched, this transformer silently never fires. |
| 2 | RegexToIntegerDomainConverter.java | 462-465 | createIntegerDomainFromBounds computes a single [min, max] interval for large domains. Pattern ^(10|20|30)$ becomes [10, 30] including 20 false values. Should document the over-approximation explicitly. |
| 3 | RegexToIntegerDomainConverter.java | 109 | pattern.matches(".*[*+?].*") rejects *, +, ? anywhere — including inside character classes like [*+?] where they are literal. Could incorrectly reject valid patterns. |
| 4 | RegexToIntegerDomainConverter.java | 289 | No validation that quantifier min <= max. Pattern {5,2} is silently accepted; downstream loops iterate from min to max, producing no iterations rather than erroring. |
| 5 | CastRegexTransformer.java | 216-218 | break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision. |
| 6 | CanonicalPredicateExtractor.java | 84-89 | Join condition remapping applies a single base offset to all RexInputRefs. Correct only if Projects are already pulled up (stated in Javadoc precondition). No runtime validation — wrong results if called without pull-up. |
| 7 | ProjectPullUpRewriter.java | 241 | rightCount = join.getRowType().getFieldCount() - leftCount uses original join output type, but leftCount comes from leftProj.getRowType() (Project output, not input). Mismatch if the left Project changes field count. |
P2 — Consider
| # | File | Line | Issue |
|---|---|---|---|
| 8 | RegexToIntegerDomainConverter.java | 331-342 | estimateSize sums across repetition counts: [0-9]{2,3} gives 10^2 + 10^3 = 1100 but actual domain is 1000 values. Over-estimates and may unnecessarily trigger bounds path, losing precision. |
| 9 | CastRegexTransformer.java | 206-219 | convertIntegerDomainToRegex emits -?[0-9]+ (matches ANY integer) for ranges >100 values. [1000, 9999] loses all constraint info. Known limitation but severe. |
| 10 | RegexToIntegerDomainConverter.java | 551, 590 | computeMinString/computeMaxString for AlternationNode: if alt.alternatives is empty, returns null → NPE in caller's sb.append(). Parser shouldn't produce empty alternations but not validated. |
| 11 | build.gradle | 2, 5 | Uses deprecated compile/testCompile instead of implementation/testImplementation. |
| 12 | rel/ package | — | No unit tests for ProjectPullUpRewriter, CanonicalPredicateExtractor, DnfRewriter. Only tested indirectly via integration tests. |
There was a problem hiding this comment.
break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision.
Fixed with IntegerRangeAutomaton — a ~80-line utility that builds a minimal automaton accepting exactly the decimal representations of integers in [lo, hi] using recursive digit-by-digit construction. All intervals are now processed precisely via parts.add(IntegerRangeAutomaton.build(min, max)). The break and the generic -?[0-9]+ fallback are both gone. Added 19 tests in IntegerRangeAutomatonTest covering single values, cross-digit boundaries, large ranges, negative/mixed ranges, and leading-zero rejection.
There was a problem hiding this comment.
Join condition remapping applies a single base offset to all RexInputRefs. Correct only if Projects are already pulled up (stated in Javadoc precondition). No runtime validation — wrong results if called without pull-up.
The Javadoc precondition is sufficient. The single production caller always applies ProjectPullUpController.applyUntilFixedPoint() before extract().
There was a problem hiding this comment.
rightCount = join.getRowType().getFieldCount() - leftCount uses original join output type, but leftCount comes from leftProj.getRowType() (Project output, not input). Mismatch if the left Project changes field count.
Fixed. The "only left had Project" and "only right had Project" branches now use newLeft.getRowType().getFieldCount() for pass-through RexInputRef offsets and newJoin.getRowType() for field types.
There was a problem hiding this comment.
Uses deprecated compile/testCompile instead of implementation/testImplementation.
Fixed. Replaced throughout.
There was a problem hiding this comment.
P1-2, P1-3, P1-4:
RegexToIntegerDomainConverter issues (over-approximation, naive [*+?] check, no quantifier validation).
No longer applicable. The class was completely rewritten to use dk.brics automaton directly. isConvertible() checks a.isFinite() && isDigitOnly(a), convert() uses getFiniteStrings(5000) for enumeration with DFS fallback. The custom regex parser, estimateSize(), checkForInvalidConstructs(), computeMinString/computeMaxString, and createIntegerDomainFromBounds are all gone.
There was a problem hiding this comment.
break on first large interval (>100 values) loses remaining intervals. [10, 20] ∪ [1000, 2000] silently becomes -?[0-9]+, discarding the first interval's precision.
Fixed via IntegerRangeAutomaton (see CastRegexTransformer.java:216-218 reply above).
There was a problem hiding this comment.
estimateSize sums across repetition counts: [0-9]{2,3} gives 10^2 + 10^3 = 1100 but actual domain is 1000 values. Over-estimates and may unnecessarily trigger bounds path, losing precision.
No longer applicable (class rewritten, estimateSize removed).
There was a problem hiding this comment.
convertIntegerDomainToRegex emits -?[0-9]+ (matches ANY integer) for ranges >100 values. [1000, 9999] loses all constraint info. Known limitation but severe.
Fixed via IntegerRangeAutomaton.
There was a problem hiding this comment.
computeMinString/computeMaxString for AlternationNode: if alt.alternatives is empty, returns null → NPE in caller's sb.append(). Parser shouldn't produce empty alternations but not validated.
No longer applicable (class rewritten, methods removed).
Bug Bash: Failing Test Cases for Already-Identified Issues
Here are test cases for bugs already flagged in the review. They all fail on the current code and can be copy-pasted directly. Tests to add to
|
|
Consider moving all transformer implementations ( |
| } | ||
|
|
||
| @Override | ||
| public Domain<?, ?> refineInputDomain(RexNode expr, Domain<?, ?> outputDomain) { |
There was a problem hiding this comment.
Suggestion: Refactor to avoid the growing if-else chain
This method dispatches on (sourceType, targetType, domainType) via a series of if blocks. As more types are supported (decimal, timestamp, etc.), this will grow unwieldy.
Option A — Handler registry (simplest):
@FunctionalInterface
interface CastHandler {
Domain<?, ?> handle(RexCall call, Domain<?, ?> outputDomain);
}
enum TypeCategory { STRING, INTEGER, DATE, NUMERIC_OTHER }
// Register handlers keyed by (source, target, domainClass)
private final Map<CastKey, CastHandler> handlers = new HashMap<>();
private void registerHandlers() {
handlers.put(key(STRING, INTEGER, IntegerDomain.class), this::castStringToIntegerWithIntegerDomain);
handlers.put(key(STRING, INTEGER, RegexDomain.class), this::castStringToIntegerWithRegexDomain);
handlers.put(key(INTEGER, STRING, RegexDomain.class), this::castIntegerToStringWithRegexDomain);
handlers.put(key(DATE, STRING, RegexDomain.class), this::castDateToStringWithRegexDomain);
// ...add new entries as needed
}
Then refineInputDomain becomes:
public Domain<?, ?> refineInputDomain(RexNode expr, Domain<?, ?> outputDomain) {
// ... extract sourceType, targetType ...
if (sourceTypeName == targetTypeName) return outputDomain;
CastHandler handler = handlers.get(key(categorize(sourceTypeName), categorize(targetTypeName), outputDomain.getClass()));
if (handler != null) return handler.handle(call, outputDomain);
// fallback for numeric-to-numeric, unknown combos, etc.
return outputDomain;
}
Option B — Two-level dispatch: Group by (sourceCategory, targetCategory) into strategy objects, each of which handles the domain type internally. Better if strategies within a type pair share logic.
Option A is probably the right starting point — adding support for a new type is just adding a map entry and a private method, with no risk of breaking existing branches.
There was a problem hiding this comment.
Deferring. The current 8-branch if-else chain is manageable. Will refactor to a handler registry when it reaches ~15+ branches as more types are supported.
There was a problem hiding this comment.
A handler registry sounds like a good approach as a follow-up when more branches are needed.
| * | ||
| * @deprecated Use {@link DomainInferenceProgram} for full cross-domain support | ||
| */ | ||
| public class RegexDomainInferenceProgram { |
There was a problem hiding this comment.
This is dead code; it has no callers.
There was a problem hiding this comment.
Good catch. Removed.
| * <p>Usage pattern:</p> | ||
| * <pre> | ||
| * CanonicalPredicateExtractor.Output extracted = CanonicalPredicateExtractor.extract(rel); | ||
| * CanonicalPredicateDnf.Output dnf = CanonicalPredicateDnf.convert(extracted, rexBuilder); |
There was a problem hiding this comment.
Nit: Javadoc references CanonicalPredicateDnf.Output / CanonicalPredicateDnf.convert but the class is DnfRewriter.
| } | ||
|
|
||
| List<RelNode> newInputs = new ArrayList<>(); | ||
| boolean anyChanged = false; |
There was a problem hiding this comment.
Dead code: anyChanged is assigned at line 152 but never read — the method returns immediately after setting it.
|
|
||
| // Inline right Project if present | ||
| if (rightProj != null) { | ||
| int leftFieldCount = newLeft.getRowType().getFieldCount(); |
There was a problem hiding this comment.
Same category of bug as line 241: after newLeft = leftProj.getInput() (line 205), newLeft.getRowType().getFieldCount() gives the field count of the left project's input (child), not its output. If the left project changes field count (adds/drops columns), this offset will be wrong for right-side condition inlining.
There was a problem hiding this comment.
Good catch. Fixed. Captured leftFieldCount from leftProj.getRowType().getFieldCount() before newLeft is reassigned. Added a unit test (testJoinLeftProjectPullUp_FieldCountChanged) that confirms the fix.
|
Thanks for the Bug Bash tests. Added them. They pass after the above fixes. |
Moved to own package. |
- Move transformers to dedicated subpackage (domain/transformer/) - Rename PlusRegexTransformer → PlusIntegerTransformer, TimesRegexTransformer → TimesIntegerTransformer - Add DomainInferenceProgram.withDefaultTransformers() factory method - Rewrite RegexToIntegerDomainConverter with automaton-based approach and add IntegerRangeAutomaton for precise integer range constraints - Add unit tests for CanonicalPredicateExtractor, ProjectPullUpRewriter, and IntegerRangeAutomaton - Fix DnfRewriter Javadoc, extract RegexDomain magic number, make IntegerRangeAutomaton public for cross-package access - Remove dead code: deriveInputRegex(), deprecated sampleValues() - Normalize test conventions: setup method naming, section separators, camelCase method names, assertEquals parameter order, and IntegerDomain construction style
simbadzina
left a comment
There was a problem hiding this comment.
LGTM. Just merge conflicts need to be addressed.
- Move transformers to dedicated subpackage (domain/transformer/) - Rename PlusRegexTransformer → PlusIntegerTransformer, TimesRegexTransformer → TimesIntegerTransformer - Add DomainInferenceProgram.withDefaultTransformers() factory method - Rewrite RegexToIntegerDomainConverter with automaton-based approach and add IntegerRangeAutomaton for precise integer range constraints - Add unit tests for CanonicalPredicateExtractor, ProjectPullUpRewriter, and IntegerRangeAutomaton - Fix DnfRewriter Javadoc, extract RegexDomain magic number, make IntegerRangeAutomaton public for cross-package access - Remove dead code: deriveInputRegex(), deprecated sampleValues() - Normalize test conventions: setup method naming, section separators, camelCase method names, assertEquals parameter order, and IntegerDomain construction style
84d7ff4 to
7c1d771
Compare
Builds on the symbolic constraint solver from PR linkedin#564. The work breaks into five overlapping areas — new operator support, latent bug fixes surfaced by new tests, naming cleanup, weak-assertion strengthening, and one rewriter fix uncovered by the strengthened tests. ## New operator support Adds DomainTransformer implementations for ABS, CONCAT, TRIM, UPPER, unary minus, MINUS, FIELD access, and ITEM access — registered in DomainInferenceProgram.withDefaultTransformers() alongside the existing PLUS / TIMES / LOWER / SUBSTRING / CAST transformers. The transformer wiring is reordered into string ops → integer ops → cross-domain → structural pass-throughs for readability. Adds new methods on DomainInferenceProgram: deriveInputRegex, deriveInputInteger, deriveInputDomainFromPredicate, resolveAllPaths, resolveDisjunct, findAccessPath, plus helpers (unionDomains, intersectDomains, etc.). The IntegerDomain.Interval overflow guard is tightened. Naming: AbsRegexTransformer, MinusRegexTransformer, and NegateRegexTransformer were renamed to *IntegerTransformer to match the PlusIntegerTransformer / TimesIntegerTransformer convention — they operate on IntegerDomain, not RegexDomain. Error messages switched from hardcoded class names to getClass().getSimpleName() so future renames don't drift. ## Hive-compatibility bug fixes (caught by new integration tests) - ConcatRegexTransformer only matched SqlStdOperatorTable.CONCAT (the SQL || operator). Hive emits concat(...) as an OTHER_FUNCTION named "concat". Now matches either form. - TrimRegexTransformer only handled Calcite's 3-operand standard TRIM. Hive emits trim(...) with a single source operand. Now handles both arities via a sourceOperand helper. - deriveInputDomainFromPredicate rejected RexCall(UNARY_MINUS, RexLiteral) on the RHS, which is how Calcite encodes negative literals in many contexts. Now folds that case into a negate flag for createDomainFromComparison. 
- Trim.canHandle and Substring.canHandle now check operand arity (both index operands.get(2)). ## CAST(integer AS VARCHAR) inverse fix CastRegexTransformer.refineInputDomain previously called the permissive RegexToIntegerDomainConverter to invert CAST(integer AS VARCHAR). That converter accepts padded representations like "000" → 0, but CAST(integer AS VARCHAR) produces canonical decimal — "0", not "000". The two are different SQL semantics: STRING → INT parses leading zeros, INT → STRING does not produce them. Using the permissive converter for the INT → STRING direction admits integers whose canonical string does not match the regex (e.g., int 0 for /^[0-9]{3}$/). Fix by intersecting the output regex with /^(0|-?[1-9][0-9]*)$/ before inverting; canonical strings then map back to the correct integer set. ## ProjectPullUpRewriter join-condition offset shift When a left Project under a Join changed the left input's field count (e.g., Project(a,b) over a 3-column scan), the rewriter inlined the left-side InputRefs into the new frame but left right-side InputRefs at their old offsets. After replacing the left Project with its unprojected input, the right-side refs then pointed at the wrong column. For Join(Project(a,b)+T1, T2, b=x) the rewritten condition came out as =($1, $2) — $2 was T1.c (VARCHAR) instead of T2.x (INTEGER). Replace the separate inlineLeftSide / inlineRightSide helpers with a single remapJoinCondition pass that recomputes both sides using oldLeftCount and newLeftCount, so right-side refs are either absorbed by rightProj (using newLeftCount in adjustOffsets) or shifted by (newLeftCount - oldLeftCount) when rightProj is absent. ## Naming cleanup - ColumnPath → AccessPath. The path's elements are heterogeneous accesses — struct fields (FIELD), map key lookups (MAP_KEY), and array indexing (ARRAY_INDEX) — none of which are columns; only the root is. - DomainInferenceProgram.findColumnPath → findAccessPath. 
- DomainInferenceProgram.resolveAllColumns → resolveAllPaths. The method resolves a Map<AccessPath, Domain> where keys can be nested paths, not just flat columns. - Map<AccessPath, Domain> columnDomains → pathDomains across the resolver and ~18 test lambdas. - Stale @return javadoc on resolveAllPaths rewritten to describe what is actually returned. ## Test work - New IntegerTransformerTest cases for Minus, Negate, Abs (rigorous style: construct RexCalls via RexBuilder, invoke transformer.refineInputDomain directly, assert containment). - New RegexTransformerTest for Concat (prefix/suffix stripping, mismatch, empty operand, non-literal output). - New SQL-level integration tests in RegexDomainInferenceProgramTest for CONCAT, TRIM, and unary minus. - 33 weak tests strengthened across six files: replaced "instanceof RegexDomain && !isEmpty()" shape checks with isLiteral() + getLiteralValue() value checks for equality predicates on struct/map/ array paths, explicit accept/reject probes on automata, sample-value content checks for singleton integer domains, and field-name/type preservation across ProjectPullUp rewrites. - testJoinLeftProjectPullUpFieldCountChanged now pins the rewritten condition to =($1, $3) and forbids stale $2 — this assertion is what surfaced the offset-shift rewriter bug above. ## Cleanup Removed REGEX_TO_AUTOMATON_CONSTRUCTION_REPORT.md and REGEX_TO_AUTOMATON_MIGRATION.md — work-in-progress notes from a migration that has since landed (RegexDomain is automaton-only; getRegex() is gone). ## Verification Full module pipeline (build + javadoc + spotlessJavaCheck) is green; 238 tests pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-column resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based, multi-column inference New public entry points on `DomainInferenceProgram`: - `deriveInputRegex` / `deriveInputInteger` — convenience wrappers returning a typed domain or throwing if the inferred type is wrong. - `deriveInputDomainFromPredicate(RexCall)` — accepts a comparison predicate (`=`, `<`, `>`, `<=`, `>=`), supports negative literals encoded as `RexCall(UNARY_MINUS, RexLiteral)` (the form Calcite often produces), and dispatches into the transformer chain. - `resolveAllPaths(List<RexNode> disjuncts)` — resolves a `Map<AccessPath, Domain>` across DNF disjuncts: intersects within a disjunct (AND), unions across disjuncts (OR), and skips predicates the current transformer set cannot handle. Supporting private helpers: `resolveDisjunct`, `findAccessPath`, `isTerminalItemAccess`, `unionDomains`, `intersectDomains`, `createDomainFromComparison`, `isNumericType`. ## CAST inverse — canonical decimal restriction `CastRegexTransformer.refineInputDomain` previously inverted `CAST(integer AS VARCHAR)` by calling the permissive `RegexToIntegerDomainConverter`, which accepts padded representations such as `"000" → 0`. But standard SQL `CAST(integer AS VARCHAR)` produces canonical decimal (`"0"`, no padding), so the converter admitted integers whose canonical string does not actually match the output regex (e.g., it returned `0..999` for output regex `/^[0-9]{3}$/`, when the SQL truth is `100..999`). The transformer now intersects the output regex with the canonical-decimal regex `^(0|-?[1-9][0-9]*)$` before inverting. 
Two static `RegexDomain` constants — `CANONICAL_INTEGER_STRING` and `ANY_INTEGER_STRING` — are extracted so they're built once. ## Join-condition remap in ProjectPullUpRewriter When a left `Project` under a `Join` changed the left input's field count (e.g., `Project(a,b)` over a 3-column scan), the rewriter inlined the left-side `InputRef`s into the new frame but left right-side `InputRef`s at their old offsets. After replacing the left `Project` with its unprojected input, right-side refs then pointed at the wrong column — for `Join(Project(a,b)+T1, T2, b=x)` the rewritten condition came out as `=($1, $2)` where `$2` was `T1.c` (`VARCHAR`) instead of `T2.x` (`INTEGER`). The two separate inlining helpers (`inlineLeftSide`, `inlineRightSide`) are replaced by a single `remapJoinCondition` pass that recomputes both sides using `oldLeftCount` and `newLeftCount`, so right-side refs are either absorbed by `rightProj` (`adjustOffsets` called with `newLeftCount`) or shifted by `(newLeftCount - oldLeftCount)` when `rightProj` is absent. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). 
Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. `IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. ## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-column resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference across a full WHERE clause Master only exposed a single low-level entry point: `deriveInputDomain(RexNode expr, Domain<?, ?> outputDomain)` — given an expression tree and an *already-computed* output domain, it walks inward to the variable and returns the matching input domain. Useful, but the caller had to do every step of orchestration: extract the predicate from the plan, compute the comparison's output domain, deal with one column at a time, and combine results across AND/OR. This PR adds three layered entry points so callers can stay at the level they care about: - **`resolveAllPaths(List<RexNode> disjuncts)` — whole-query, top-level.** Takes the DNF disjuncts produced by `DnfRewriter` (one disjunct per conjunctive clause in DNF form) and returns `Map<AccessPath, Domain<?, ?>>`. Within each disjunct, predicates on the same path are intersected (AND); across disjuncts, domains for the same path are unioned (OR). Predicates the current transformer set cannot handle are silently skipped — notably column-to-column join predicates, which still require per-column literals. Example: for `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the returned map is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. 
- **`deriveInputDomainFromPredicate(RexCall predicate)` — single comparison.** Accepts a comparison `expr OP literal` for `=`, `<`, `>`, `<=`, `>=`, builds the output domain from the operator and literal (e.g., `> 5` → `IntegerDomain([6, ∞))`, `= 'abc'` → `RegexDomain.literal("abc")`), and recurses into `deriveInputDomain`. It also unwraps `RexCall(UNARY_MINUS, RexLiteral)` on the RHS — the form Calcite produces for negative literals in many contexts — so `age = -5` works the same as `age = 5`. - **`deriveInputRegex` / `deriveInputInteger` — typed convenience.** Thin wrappers around `deriveInputDomain` that cast the result and throw if the inferred type does not match the caller's expectation — saves the caller a manual `instanceof` + cast in tests and consumers that know the expected shape. ## CAST(integer AS VARCHAR) inverse: restrict to canonical decimal Concrete scenario: a query like `WHERE CAST(age AS VARCHAR) MATCHES '^[0-9]{3}$'` asks the system "which integer values of `age` produce a 3-character decimal string?" The SQL truth is `100..999` — `CAST(0 AS VARCHAR) = "0"` (length 1) does not match a 3-digit regex, so `0` should not appear in the input domain. Master's inverter returned `0..999`. The cause is an asymmetry in SQL CAST semantics: - `CAST(string AS INTEGER)` is *permissive* — it accepts leading zeros (`CAST('000' AS INTEGER) = 0`). - `CAST(integer AS VARCHAR)` is *canonical* — it produces the shortest decimal form, never padded (`CAST(0 AS VARCHAR) = "0"`, not `"000"`). `CastRegexTransformer.refineInputDomain` used the same converter (`RegexToIntegerDomainConverter`) for both directions. That converter correctly models the permissive direction — given `^[0-9]{3}$` it returns every integer that *some* matching 3-digit string parses to, which includes `0` (via the string `"000"`). Used to invert the canonical direction, it admits integers whose canonical string does not match. 
The fix is localized to the `CAST(integer AS VARCHAR)` branch: intersect the output regex with the canonical-decimal regex `^(0|-?[1-9][0-9]*)$` before passing it to the converter. After the intersection, only canonical strings remain (e.g., `"100".."999"`), so the permissive converter yields the canonical answer. The `string → int` branch is unchanged. Two `RegexDomain` constants (`CANONICAL_INTEGER_STRING`, `ANY_INTEGER_STRING`) are extracted so they build once at class load. ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. 
For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. 
`IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. ## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## CAST(integer AS VARCHAR) inverse: restrict to canonical decimal Concrete scenario: a query like `WHERE CAST(age AS VARCHAR) MATCHES '^[0-9]{3}$'` asks the system "which integer values of `age` produce a 3-character decimal string?" The SQL truth is `100..999` — `CAST(0 AS VARCHAR) = "0"` (length 1) does not match a 3-digit regex, so `0` should not appear in the input domain. Master's inverter returned `0..999`. The cause is an asymmetry in SQL CAST semantics: - `CAST(string AS INTEGER)` is *permissive* — it accepts leading zeros (`CAST('000' AS INTEGER) = 0`). - `CAST(integer AS VARCHAR)` is *canonical* — it produces the shortest decimal form, never padded (`CAST(0 AS VARCHAR) = "0"`, not `"000"`). `CastRegexTransformer.refineInputDomain` used the same converter (`RegexToIntegerDomainConverter`) for both directions. That converter correctly models the permissive direction — given `^[0-9]{3}$` it returns every integer that *some* matching 3-digit string parses to, which includes `0` (via the string `"000"`). Used to invert the canonical direction, it admits integers whose canonical string does not match. 
The fix is localized to the `CAST(integer AS VARCHAR)` branch: intersect the output regex with the canonical-decimal regex `^(0|-?[1-9][0-9]*)$` before passing it to the converter. After the intersection, only canonical strings remain (e.g., `"100".."999"` for the `^[0-9]{3}$` example), so the permissive converter yields the canonical answer. The `string → int` branch is unchanged. Two `RegexDomain` constants (`CANONICAL_INTEGER_STRING`, `ANY_INTEGER_STRING`) are extracted so that each is built only once, at class load.

## ProjectPullUpRewriter: remap the join condition when a left Project changes field count

Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up:

```
Join(condition: b = x)
├── Project(a, b)        keeps 2 of T1's 3 columns
│   └── Scan(T1)
└── Scan(T2)
```

The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`.

After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`:

```
Project(...)
└── Join(condition: ???)
    ├── Scan(T1)
    └── Scan(T2)
```

The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`.

Master inlined left-side `InputRef`s through the removed `Project` but kept right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, in which `$2` now points at `T1.c` (`VARCHAR`) in the new frame — not `T2.x` (`INTEGER`). The result is a reference to the wrong column and a type mismatch that breaks join evaluation.

The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass.
For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. 
`IntegerTransformerTest` adds rigorous cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty-domain case for `ABS` when the output interval is entirely negative.

`CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output.

`ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above.

## Verification

Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## CAST(integer AS VARCHAR) inverse: restrict to canonical decimal Concrete scenario: a query like `WHERE CAST(age AS VARCHAR) MATCHES '^[0-9]{3}$'` asks "which integer values of `age` produce a 3-character decimal string?" The SQL truth is `100..999` — `CAST(0 AS VARCHAR) = "0"` (length 1) does not match a 3-digit regex, so `0` should not appear in the input domain. Master returned `0..999`. The relevant code path lives in the `CAST(integer AS VARCHAR)` branch of `CastRegexTransformer.refineInputDomain`. To invert that cast, it asks `RegexToIntegerDomainConverter` for the integer set that the output regex represents. That converter is unidirectional: given a regex `R` it returns `{ n : some s ∈ R parses to integer n }` — the permissive-parse semantics. For `^[0-9]{3}$` it returns `{0..999}` because `"000"`, `"001"`, …, `"999"` all parse to integers in that range. That set is the right answer for a different question — inverting `CAST(string AS INTEGER)`, where SQL parsing is permissive about leading zeros (`CAST('005' AS INTEGER) = 5`). It is the wrong answer for the question this branch is solving. 
`CAST(integer AS VARCHAR)` produces exactly one string for each integer — its canonical decimal form — so the right answer is `{ n : canonical_decimal(n) ∈ R }`. `0`'s canonical decimal is `"0"`, which is not in `^[0-9]{3}$`, so `0` does not belong in the input set. The fix turns the unidirectional converter into a correct canonical inverter via one identity: ``` { n : canonical_decimal(n) ∈ R } == { n : some s ∈ (R ∩ canonical_strings) parses to n } ``` i.e., if we first intersect the output regex with the set of canonical decimal strings (`^(0|-?[1-9][0-9]*)$`), the permissive converter sees only canonical strings and produces the canonical answer. For `R = ^[0-9]{3}$`, the intersection is the set `"100".."999"`, and the converter then returns `{100..999}` — the SQL truth. Scope is localized to the int→string branch. The other branches of `refineInputDomain` are unchanged. Two `RegexDomain` constants (`CANONICAL_INTEGER_STRING`, `ANY_INTEGER_STRING`) are extracted so they build once at class load. ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. 
The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). 
Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. `IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. ## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## CAST(integer AS VARCHAR) inverse: restrict to canonical decimal The `CAST(int AS VARCHAR)` branch of `CastRegexTransformer.refineInputDomain` inverts the cast — its contract is ``` refineInputDomain(CAST(n AS VARCHAR), R) = { n : CAST(n AS VARCHAR) ∈ R } ``` SQL `CAST(integer AS VARCHAR)` produces canonical decimal (one string per integer, no leading zeros), so the contract reduces to ``` { n : canonical_decimal(n) ∈ R } ``` Master implemented this by calling `regexToIntegerConverter.convert(R)`, which is unidirectional and answers a different question: ``` { n : some s ∈ R parses to integer n } (permissive-parse semantics) ``` That answer is the right one for inverting `CAST(string AS INTEGER)` — SQL string-to-integer parsing IS permissive about leading zeros — but wrong for inverting the canonical direction. For `R = ^[0-9]{3}$` it includes `0` (via `"000"`), even though `CAST(0 AS VARCHAR) = "0"` is 1 char and does not match. The output came out as `{0..999}` when the SQL truth is `{100..999}`. 
Fix: use the identity ``` { n : canonical_decimal(n) ∈ R } == { n : some s ∈ (R ∩ canonical_strings) parses to n } ``` i.e., intersect the output regex with the regex describing canonical decimal strings (`^(0|-?[1-9][0-9]*)$`) before calling the unidirectional converter. The converter then sees only canonical strings and returns the canonical answer. For `R = ^[0-9]{3}$`, the intersection is `"100".."999"`, and the converter yields `{100..999}`. Localized to the int→string branch; the string→int branch is unchanged (its use of the same unidirectional converter is correct for that direction). ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. 
For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. 
`IntegerTransformerTest` adds rigorous cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty-domain case for `ABS` when the output interval is entirely negative.

`CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output.

`ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above.

## Verification

Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## CAST(integer AS VARCHAR) inverse: restrict to canonical decimal The `CAST(int AS VARCHAR)` branch of `CastRegexTransformer.refineInputDomain` must return the set of integer inputs whose CAST result matches the output regex. Concrete example: output regex `^[0-9]{3}$` (strings of exactly 3 digits). SQL `CAST(int AS VARCHAR)` produces canonical decimal — `0 → "0"`, `99 → "99"`, `100 → "100"`, `999 → "999"`. Only integers whose canonical decimal is exactly 3 chars long round-trip into a 3-digit regex, so the correct answer is `{100..999}`. Master returns `{0..999}` — wrong. Integers like `0`, `1`, …, `99` cannot actually CAST to a 3-digit string but appear in the output anyway. Fix: intersect the output regex with the regex matching canonical decimal strings (`^(0|-?[1-9][0-9]*)$`) before converting it back to integers. For `^[0-9]{3}$` the intersection is the strings `"100" .. "999"`, and the resulting integer set is `{100..999}`. Localized to the int→string branch; other branches are unchanged. ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). 
Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. 
## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. `IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`) and pins the canonical behavior for `CAST(int AS VARCHAR)` with a 3-digit regex output. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. 
## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all 238 tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## Tighten `RegexToIntegerDomainConverter`: accept only canonical decimal regexes Concrete example. A predicate like `WHERE CAST(age AS VARCHAR) MATCHES '^[0-9]{3}$'` asks "which integer values of `age` produce a 3-character decimal string?" SQL `CAST(integer AS VARCHAR)` produces canonical decimal — `0 → "0"`, `99 → "99"`, `100 → "100"`, `999 → "999"` — so the correct integer set is `{100..999}`. Master returned `{0..999}`. The root cause was that `RegexToIntegerDomainConverter` answered an under-specified question. Given a digit-only finite regex `R`, it returned the integer set `{ n : some s ∈ R parses to n }`. That answer is correct for inverting `CAST(string AS INTEGER)` (which is permissive about leading zeros: `CAST('000' AS INTEGER) = 0`), but wrong for inverting `CAST(integer AS VARCHAR)` (which is canonical: integer `0` produces only `"0"`, never `"000"`). Fix: the converter's contract is now narrowed to canonical-decimal regexes only. Its accept rule changed from "finite + digit-only" to "finite + subset of canonical-decimal strings", checked against the canonical-strings automaton `^(0|[1-9][0-9]*)$`. 
Non-canonical inputs like `^[0-9]{3}$` (admits `"000"`) and `^009$` are now rejected with `NonConvertibleDomainException`. The converter no longer silently picks an interpretation. `CastRegexTransformer.refineInputDomain` in the `CAST(int AS VARCHAR)` branch keeps its existing call shape — just `convert(outputRegex)` — and relies on the strict contract. Real SQL inference at this branch produces canonical outputs (literal `"500"`, alternations like `"100|999"`, etc.), so the converter accepts and returns the canonical integer set. Synthetic non-canonical shapes (which no real query path produces) fall through to a regex-format fallback. ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. 
For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. 
`IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `RegexToIntegerDomainConverterTest` is updated to match the new contract: tests that previously passed non-canonical regexes (e.g., `^[0-9]{3}$`, `^009$`, `^[0-9]?$`) now assert the converter rejects them with `NonConvertibleDomainException`. Parallel positive tests use canonical-form inputs (`^[1-9][0-9]{2}$` instead of `^[0-9]{3}$`). `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`), pins the canonical behavior of `CAST(int AS VARCHAR)` with a canonical 3-digit output, and documents the non-canonical fallback path. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. ## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## Tighten `RegexToIntegerDomainConverter`: accept only canonical decimal regexes `RegexToIntegerDomainConverter.convert(R)` previously accepted any finite digit-only regex `R` and returned the integer set of every string `R` matches. For `R = ^[0-9]{3}$` it returned `{0..999}` — including `0`, because `"000"` is matched by `R`. That answer is wrong for inverting SQL `CAST(integer AS VARCHAR)`, which produces canonical decimal (`0 → "0"`, never `"000"`); the correct integer set is `{100..999}`. Fix: narrow the converter's contract to canonical-decimal regexes only. The accept rule changes from "finite + digit-only" to "finite + subset of `^(0|[1-9][0-9]*)$`". Non-canonical inputs (`^[0-9]{3}$`, `^009$`, empty regex, …) are rejected with `NonConvertibleDomainException`. `CastRegexTransformer`'s `CAST(int AS VARCHAR)` branch keeps calling `convert(outputRegex)` directly and relies on this strict contract; real SQL inference at that branch produces canonical outputs and synthetic non-canonical shapes fall through to a regex-format fallback. 
## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. 
## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. `IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `RegexToIntegerDomainConverterTest` is updated to match the new contract: tests that previously passed non-canonical regexes (e.g., `^[0-9]{3}$`, `^009$`, `^[0-9]?$`) now assert the converter rejects them with `NonConvertibleDomainException`. Parallel positive tests use canonical-form inputs (`^[1-9][0-9]{2}$` instead of `^[0-9]{3}$`). 
`CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`), pins the canonical behavior of `CAST(int AS VARCHAR)` with a canonical 3-digit output, and documents the non-canonical fallback path. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. ## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all tests in the module pass.
…rence Extends `coral-data-generation` so the symbolic-constraint solver from PR linkedin#564 covers a wider class of WHERE predicates: more SQL operators, struct and map/array element access, and a predicate-based inference entry point that resolves per-path domains from a DNF query. Also tightens two inference paths whose existing rewrites silently produced wrong results for the new cases. ## New operator coverage Eight new `DomainTransformer` implementations are wired into `DomainInferenceProgram.withDefaultTransformers()`: | Transformer | SQL operator | | --- | --- | | `AbsIntegerTransformer` | `ABS(x)` | | `MinusIntegerTransformer` | binary `x - k` and `k - x` | | `NegateIntegerTransformer` | unary `-x` | | `UpperRegexTransformer` | `UPPER(x)` | | `ConcatRegexTransformer` | `CONCAT(x, lit)` / `CONCAT(lit, x)` | | `TrimRegexTransformer` | `TRIM(x)` — supports both Calcite's 3-operand standard form and Hive's 1-operand form | | `FieldAccessTransformer` | struct field access (`s.name`) on nested expressions | | `ItemTransformer` | `ITEM(coll, idx-or-key)` for array indexing and map lookup on nested expressions | `ConcatRegexTransformer` matches both `SqlStdOperatorTable.CONCAT` (the SQL `||` operator) and the `OTHER_FUNCTION` named `concat` that Hive emits. Existing transformers (`LowerRegexTransformer`, `PlusIntegerTransformer`, `TimesIntegerTransformer`, `SubstringRegexTransformer`) now accept `RexFieldAccess` as a valid variable operand, so expressions like `LOWER(s.name)`, `s.age + 5`, and `UPPER(sarr[0].name)` flow through. `SubstringRegexTransformer.canHandle` also gained an operand-arity check. The transformer registration is grouped into string ops → integer ops → cross-domain → structural pass-throughs for readability. ## Nested-type access New `AccessPath` value type identifies any value reachable from a root column index through a chain of struct fields (`FIELD`), map lookups (`MAP_KEY`), and array indices (`ARRAY_INDEX`). 
It's the key type of the new multi-path resolution API (below) and is also used in tests to assert which nested values were resolved. `DomainInferenceProgram.deriveInputDomain` gained two base cases so inference terminates correctly at nested column references — struct field access on a `RexInputRef` (e.g., `$3.name`) and ITEM access on a `RexInputRef` (e.g., `ITEM($2, 1)` for arrays, `ITEM($4, 'env')` for maps). ## Predicate-based inference: two reductions up the SQL evaluation hierarchy Master exposed one primitive — `deriveInputDomain(expr, outputDomain) → inputDomain` — which answers the leaf question: given an expression and a constraint on its output, derive the constraint on the input variable. Real callers, though, start higher up the SQL evaluation stack. The PR adds the two reductions that bridge a full WHERE clause down to the primitive: ``` WHERE clause (tree of AND / OR over comparisons) │ │ DnfRewriter (already exists) ▼ list of DNF disjuncts ── resolveAllPaths (new) │ │ for each disjunct, for each conjunct ▼ single comparison predicate (expr OP literal) ── deriveInputDomainFromPredicate (new) │ │ compute output domain from OP + literal ▼ (expression, output domain) pair ── deriveInputDomain (primitive) │ │ walk expr, refine via transformers ▼ domain on the input variable ``` - **`deriveInputDomainFromPredicate(RexCall predicate)`** is one reduction above the primitive. It takes a comparison `expr OP literal` (`=`, `<`, `>`, `<=`, `>=`), computes the output domain from the operator and literal — `> 5` ⇒ `IntegerDomain([6, ∞))`, `= 'abc'` ⇒ `RegexDomain.literal("abc")` — and reduces to `deriveInputDomain(expr, that)`. It also unwraps the `RexCall(UNARY_MINUS, RexLiteral)` shape Calcite uses for negative literals so `age = -5` works the same as `age = 5`. - **`resolveAllPaths(List<RexNode> disjuncts)`** is one reduction above that. 
Given the DNF disjuncts produced by `DnfRewriter`, it walks every disjunct, every conjunct, calls `deriveInputDomainFromPredicate` on each comparison, and combines the per-`AccessPath` results with AND semantics within a disjunct (intersection) and OR semantics across disjuncts (union). Predicates outside the comparison-with-literal shape are silently skipped — notably column-to-column join predicates, which still require per-column literals. For `WHERE (age > 10 AND name = 'foo') OR (age = 0)` the result is roughly `{ $age → IntegerDomain([11,∞) ∪ {0}), $name → RegexDomain("foo") }`. Nothing else is added: anything more specific belongs in a transformer, and anything less specific (such as converting a WHERE tree to DNF in the first place) was already the caller's job via `DnfRewriter`. ## Tighten `RegexToIntegerDomainConverter`: accept only canonical decimal regexes - **Input:** `R = ^[0-9]{3}$`. - **Master returns:** `IntegerDomain{0..999}`. - **Should return:** `IntegerDomain{100..999}` — SQL `CAST(integer AS VARCHAR)` produces canonical decimal (`0 → "0"`, never `"000"`), so `0` does not belong. - **Fix:** narrow the converter's contract to canonical-decimal regexes only. The accept rule changes from "finite + digit-only" to "finite + subset of `^(0|[1-9][0-9]*)$`". Non-canonical inputs (`^[0-9]{3}$`, `^009$`, empty regex, …) are now rejected with `NonConvertibleDomainException`. `CastRegexTransformer`'s `CAST(int AS VARCHAR)` branch keeps calling `convert(outputRegex)` directly and relies on this strict contract. ## ProjectPullUpRewriter: remap the join condition when a left Project changes field count Concrete scenario: tables `T1(a, b, c)` (3 cols) and `T2(x, y)` (2 cols). Plan before pull-up: ``` Join(condition: b = x) ├── Project(a, b) keeps 2 of T1's 3 columns │ └── Scan(T1) └── Scan(T2) ``` The join's row type is `[Project-output | T2] = [a, b, x, y]`, so inside the condition `b` resolves to `$1` and `x` to `$2`. The condition is `$1 = $2`. 
After pull-up, the `Project` moves above the `Join`, and the new join's left input is the raw `Scan(T1)`: ``` Project(...) └── Join(condition: ???) ├── Scan(T1) └── Scan(T2) ``` The new join's row type is `[T1 | T2] = [a, b, c, x, y]`. `b` is still `$1`, but `x` is now `$3` because the left input grew from 2 columns back to 3. The rewritten condition must be `$1 = $3`. Master inlined left-side `InputRef`s through the removed `Project` but left right-side `InputRef`s at their old positions. The rewritten condition came out as `$1 = $2`, which in the new frame points at `T1.c` (`VARCHAR`) — not `T2.x` (`INTEGER`). Wrong column, and a type mismatch that breaks join evaluation. The fix replaces the two side-specific helpers (`inlineLeftSide`, `inlineRightSide`) with a single `remapJoinCondition` pass. For every `InputRef` in the old condition it computes the position in the new frame using `oldLeftCount` (Project-output width) and `newLeftCount` (unprojected-left width): right-side references shift by `newLeftCount - oldLeftCount`; left-side references are remapped through the lifted projection expressions. ## IntegerDomain - New `negate()` method (returns `multiply(-1)`), used by the new `NegateIntegerTransformer`. - `Interval.isAdjacent` refactored to make the overflow guard explicit in two named booleans, matching the original behavior. ## Build `coral-data-generation/build.gradle` now applies the `java-library` plugin so the module exposes proper `api`/`implementation` configurations. ## Tests `RegexDomainInferenceProgramTest` is the main integration suite and grows substantially: it exercises every new operator individually, every new nested-type access pattern, and combined SQL queries with AND/OR over struct/map/array paths against four test tables (`test.T`, `test.complex`, `test.deep`, `test.interleaved`). 
Notable coverage areas: - single-operator tests for `SUBSTRING`, `LOWER`, `UPPER`, `CAST(int→str)`, `CAST(str→int)`, `CAST(str→date)`, arithmetic, `MINUS`, `ABS`, unary minus, `CONCAT`, `TRIM`, comparison operators with and without arithmetic - multi-column AND/OR with same-column intersection, disjoint ranges, range-with-equality, contradictory ranges, mixed regex/integer domains - struct field equality and arithmetic, map-element equality, array of structs, nested struct (`nested_struct.sub.value`), map of structs (`map_of_structs['key'].score`), and interleaved combinations - CAST cross-domain on struct fields, OR disjunction on struct fields, per-column union semantics `RegexTransformerTest` is a new dedicated unit-test class for `Concat`: prefix/suffix stripping, prefix/suffix mismatch (empty domain), empty suffix as identity, non-literal output passthrough. `IntegerTransformerTest` adds rigorous-style cases for `Minus`, `Negate`, and `Abs`: each test constructs the `RexCall` via `RexBuilder` and calls `transformer.refineInputDomain` directly, then asserts containment and boundaries — including the empty case for `ABS` over an all-negative output interval. `RegexToIntegerDomainConverterTest` is updated to match the new contract: tests that previously passed non-canonical regexes (e.g., `^[0-9]{3}$`, `^009$`, `^[0-9]?$`) now assert the converter rejects them with `NonConvertibleDomainException`. Parallel positive tests use canonical-form inputs (`^[1-9][0-9]{2}$` instead of `^[0-9]{3}$`). `CastRegexTransformerTest` adds concrete accept/reject probes for the returned regex (e.g., `getAutomaton().run("100")`), pins the canonical behavior of `CAST(int AS VARCHAR)` with a canonical 3-digit output, and documents the non-canonical fallback path. `ProjectPullUpRewriterTest` asserts row-type field-name and type preservation across pull-ups, and pins the rewritten join condition to `=($1, $3)` for the case described above. 
## Verification Full module pipeline (`build`, `javadoc`, `spotlessJavaCheck`) passes; all tests in the module pass.
Introduce Symbolic Constraint Solver for SQL-Driven Data Generation
Overview
This PR introduces coral-data-generation, a symbolic constraint solver that inverts SQL expressions to derive input domain constraints. Instead of forward evaluation (generate → test → reject), it solves backward from predicates to derive what inputs must satisfy, enabling efficient test data generation with guaranteed constraint satisfaction.
Motivation
Problem: Traditional test data generation uses rejection sampling—generate random values, evaluate SQL predicates, discard mismatches. This is inefficient for complex nested expressions and cannot detect unsatisfiable queries.
Solution: Symbolic inversion treats SQL expressions as mathematical transformations with inverse functions. Starting from output constraints (e.g.,
= '50'), the system walks expression trees inward, applying inverse operations to derive input domains.
Examples
1. Nested String Operations
2. Cross-Domain Arithmetic
3. Date Extraction with Type Casting
4. Complex Nested Substring
5. Contradiction Detection
6. Date String Pattern Matching
Key Components
1. Domain System
2. Transformer Architecture
Pluggable symbolic inversion functions implementing DomainTransformer:
- SUBSTRING(x, start, len) with positional constraints
- LOWER(x) via case-insensitive regex generation
- x + c = value → x = value - c
- x * c = value → x = value / c
3. Relational Preprocessing
Normalizes Calcite RelNode trees for symbolic analysis:
4. Solver
DomainInferenceProgram: Top-down expression tree traversal with domain refinement at each step, detecting contradictions via empty domain intersection.
Technical Approach
Symbolic Inversion: For nested expression
f(g(h(x))) = constant:
- f⁻¹ → intermediate domain
- g⁻¹ → refined domain
- h⁻¹ → input constraint on x
Contradiction Detection: Multiple predicates on the same variable → domain intersection. Empty result = unsatisfiable query.
Extensibility: Architecture supports multi-table inference (join propagation), fixed-point iteration (recursive constraints), and arbitrary domain types (date, decimal, enum).
Testing
Integration Tests (RegexDomainInferenceProgramTest): 14+ test scenarios covering simple/nested transformations, cross-domain CAST operations, arithmetic inversion, and contradiction detection. All tests validate generated samples satisfy original SQL predicates.
Documentation
This module comes with a comprehensive README with a conceptual model, examples, and an API reference.
Future Extensibility
The architecture naturally extends to additional domains (DecimalDomain, DateDomain), more transformers (CONCAT, REGEXP_EXTRACT), multi-table inference (join constraint propagation), and aggregate support (cardinality constraints).