From 6e921b578093702ab491a20edba98ffd495f0814 Mon Sep 17 00:00:00 2001 From: "manthapavankumar11@gmail.com" Date: Fri, 10 Apr 2026 06:57:42 +0530 Subject: [PATCH] enhanced with advanced filter logics --- README.md | 232 +++++++++++++++++++++++++++++++++++++---- src/qql/ast_nodes.py | 117 ++++++++++++++++++++- src/qql/executor.py | 136 +++++++++++++++++++++++- src/qql/lexer.py | 108 +++++++++++++++++-- src/qql/parser.py | 227 +++++++++++++++++++++++++++++++++++++++- tests/test_executor.py | 215 ++++++++++++++++++++++++++++++++++++++ tests/test_lexer.py | 76 ++++++++++++++ tests/test_parser.py | 194 ++++++++++++++++++++++++++++++++++ 8 files changed, 1263 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index cc2d3c0..28de472 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,16 @@ # QQL — Qdrant Query Language -A SQL-like CLI for [Qdrant](https://qdrant.tech), a high-performance vector database. Instead of writing Python SDK calls, you write natural query statements to insert, search, manage, and delete vector data. +A SQL-like CLI for [Qdrant](https://qdrant.tech), a high-performance vector database. Instead of writing Python SDK calls, you write natural query statements to insert, search, manage, and delete vector data — including rich SQL-style `WHERE` filters. 
``` -qql> INSERT INTO COLLECTION notes VALUES {'text': 'Qdrant is a vector database', 'author': 'alice'} +qql> INSERT INTO COLLECTION notes VALUES {'text': 'Qdrant is a vector database', 'author': 'alice', 'year': 2024} ✓ Inserted 1 point [3f2e1a4b-8c91-4d0e-b123-abc123def456] -qql> SEARCH notes SIMILAR TO 'vector storage engines' LIMIT 3 -✓ Found 2 result(s) +qql> SEARCH notes SIMILAR TO 'vector storage engines' LIMIT 3 WHERE year >= 2023 +✓ Found 1 result(s) Score │ ID │ Payload ────────┼──────────────────────────────────────┼────────────────────────────────────── - 0.8931 │ 3f2e1a4b-8c91-4d0e-b123-abc123def456 │ {'text': 'Qdrant is a ...', 'author': 'alice'} + 0.8931 │ 3f2e1a4b-8c91-4d0e-b123-abc123def456 │ {'text': 'Qdrant is a ...', 'author': 'alice', 'year': 2024} ``` --- @@ -24,6 +24,7 @@ qql> SEARCH notes SIMILAR TO 'vector storage engines' LIMIT 3 - [All QQL Operations](#all-qql-operations) - [INSERT — add a point](#insert--add-a-point) - [SEARCH — find similar points](#search--find-similar-points) + - [WHERE Clause Filters](#where-clause-filters) - [SHOW COLLECTIONS — list collections](#show-collections--list-collections) - [CREATE COLLECTION — create a collection](#create-collection--create-a-collection) - [DROP COLLECTION — delete a collection](#drop-collection--delete-a-collection) @@ -60,6 +61,8 @@ Your query string When you run `INSERT`, the `text` field in your dictionary is automatically converted into a dense vector using [Fastembed](https://github.com/qdrant/fastembed). The vector and the rest of your fields (stored as payload) are then upserted into Qdrant together. You never have to manage vectors manually. +`SEARCH` also embeds your query text and finds the nearest vectors by cosine similarity. An optional `WHERE` clause lets you pre-filter the candidate set using any payload field before similarity ranking — exactly like a SQL `WHERE` on top of a vector search. 
+ --- ## Installation @@ -195,7 +198,7 @@ Insert with a specific embedding model: INSERT INTO COLLECTION articles VALUES {'text': 'hello world'} USING MODEL 'BAAI/bge-small-en-v1.5' ``` -Insert with nested metadata: +Insert with nested metadata and tags: ```sql INSERT INTO COLLECTION articles VALUES { 'text': 'Attention is all you need', @@ -221,10 +224,13 @@ INSERT INTO COLLECTION articles VALUES { Performs a **semantic similarity search**: your query text is embedded with the same model used during insert, then Qdrant finds the nearest vectors by cosine distance. +An optional `WHERE` clause filters the candidate set **before** similarity ranking so you only get results that match both the semantic query and the payload conditions. + **Syntax:** ``` SEARCH <collection> SIMILAR TO '<query>' LIMIT <n> SEARCH <collection> SIMILAR TO '<query>' LIMIT <n> USING MODEL '<model>' +SEARCH <collection> SIMILAR TO '<query>' LIMIT <n> [USING MODEL '<model>'] WHERE <filter> ``` **Examples:** @@ -234,9 +240,19 @@ Basic search, return top 5 results: SEARCH articles SIMILAR TO 'machine learning algorithms' LIMIT 5 ``` -Search with a specific model: +Search only papers published after 2020: +```sql +SEARCH articles SIMILAR TO 'deep learning' LIMIT 10 WHERE year > 2020 +``` + +Search within a specific category, excluding drafts: ```sql -SEARCH articles SIMILAR TO 'deep learning' LIMIT 10 USING MODEL 'BAAI/bge-small-en-v1.5' +SEARCH articles SIMILAR TO 'neural networks' LIMIT 5 WHERE category = 'ml' AND status != 'draft' +``` + +Search with a model override and a filter: +```sql +SEARCH articles SIMILAR TO 'embeddings' LIMIT 10 USING MODEL 'BAAI/bge-small-en-v1.5' WHERE year >= 2022 +``` **Output:** @@ -258,6 +274,165 @@ Results are displayed as a table with three columns: --- +### WHERE Clause Filters + +The `WHERE` clause lets you filter on any payload field using SQL-style predicates. All standard comparison, range, membership, null-check, and full-text operators are supported. 
+ +#### Equality and inequality + +```sql +-- Exact match +SEARCH articles SIMILAR TO 'ml' LIMIT 10 WHERE category = 'paper' + +-- Not equal +SEARCH articles SIMILAR TO 'ml' LIMIT 10 WHERE status != 'draft' +``` + +#### Range comparisons + +```sql +-- Greater than / less than +SEARCH articles SIMILAR TO 'ai' LIMIT 5 WHERE score > 0.8 +SEARCH articles SIMILAR TO 'ai' LIMIT 5 WHERE year < 2024 + +-- Greater than or equal / less than or equal +SEARCH articles SIMILAR TO 'ai' LIMIT 5 WHERE score >= 0.75 +SEARCH articles SIMILAR TO 'ai' LIMIT 5 WHERE year <= 2023 +``` + +#### BETWEEN … AND + +```sql +-- Inclusive range (equivalent to year >= 2018 AND year <= 2023) +SEARCH articles SIMILAR TO 'history of ai' LIMIT 10 WHERE year BETWEEN 2018 AND 2023 +``` + +#### IN and NOT IN + +```sql +-- Field value must be one of the listed values +SEARCH articles SIMILAR TO 'retrieval' LIMIT 10 WHERE status IN ('published', 'reviewed') + +-- Field value must not be any of the listed values +SEARCH articles SIMILAR TO 'retrieval' LIMIT 10 WHERE status NOT IN ('deleted', 'archived') + +-- Trailing commas are allowed +SEARCH articles SIMILAR TO 'x' LIMIT 5 WHERE status IN ('a', 'b',) +``` + +#### IS NULL and IS NOT NULL + +```sql +-- Points where the reviewer field is present and explicitly null (absent fields do not match; use IS EMPTY for those) +SEARCH articles SIMILAR TO 'peer review' LIMIT 5 WHERE reviewer IS NULL + +-- Points where reviewer is not explicitly null (this also matches points where the field is absent) +SEARCH articles SIMILAR TO 'peer review' LIMIT 5 WHERE reviewer IS NOT NULL +``` + +#### IS EMPTY and IS NOT EMPTY + +```sql +-- Points where tags is an empty list, null, or missing +SEARCH articles SIMILAR TO 'untagged' LIMIT 5 WHERE tags IS EMPTY + +-- Points where tags is present with a non-null, non-empty value +SEARCH articles SIMILAR TO 'categorized' LIMIT 5 WHERE tags IS NOT EMPTY +``` + +#### Full-text MATCH + +```sql +-- All terms in the string must appear in the field (full-text index required) +SEARCH articles SIMILAR TO 'search' LIMIT 10 WHERE title MATCH 'vector database' + +-- Any 
term in the string can match +SEARCH articles SIMILAR TO 'search' LIMIT 10 WHERE title MATCH ANY 'embedding retrieval' + +-- The exact phrase must appear +SEARCH articles SIMILAR TO 'search' LIMIT 10 WHERE title MATCH PHRASE 'semantic search' +``` + +> Full-text MATCH requires a Qdrant full-text index on the field. Create one in the Qdrant dashboard or via the SDK before using MATCH filters. + +#### AND, OR, NOT — logical operators + +Operator precedence: `NOT` (highest) > `AND` > `OR` (lowest). Use parentheses to override. + +```sql +-- AND: both conditions must be true +SEARCH articles SIMILAR TO 'nlp' LIMIT 10 WHERE category = 'paper' AND year >= 2020 + +-- OR: either condition can be true +SEARCH articles SIMILAR TO 'llm' LIMIT 10 WHERE source = 'arxiv' OR source = 'pubmed' + +-- NOT: negate a condition +SEARCH articles SIMILAR TO 'benchmark' LIMIT 10 WHERE NOT status = 'draft' + +-- Chained AND (three conditions, all must hold) +SEARCH articles SIMILAR TO 'deep learning' LIMIT 20 + WHERE year >= 2019 AND category = 'cv' AND status != 'retracted' + +-- Parentheses to group OR inside AND +SEARCH articles SIMILAR TO 'conference paper' LIMIT 10 + WHERE (source = 'arxiv' OR source = 'ieee') AND year >= 2022 + +-- NOT on a parenthesized group +SEARCH articles SIMILAR TO 'x' LIMIT 5 WHERE NOT (status = 'draft' OR status = 'deleted') +``` + +#### Dot-notation for nested fields + +Qdrant supports nested payload fields accessed with dot notation. 
Use the same path syntax in `WHERE`: + +```sql +-- Filter on meta.source nested field +SEARCH articles SIMILAR TO 'wikipedia' LIMIT 5 WHERE meta.source = 'web' + +-- Filter on a deeply nested array field +SEARCH cities SIMILAR TO 'large city' LIMIT 5 WHERE country.cities[].population > 1000000 +``` + +#### Combined example + +```sql +-- Semantic search over research papers: +-- must be from arxiv or IEEE, published 2020–2023, not retracted, reviewer not explicitly null +SEARCH papers SIMILAR TO 'attention mechanism transformers' LIMIT 20 + WHERE (source = 'arxiv' OR source = 'ieee') + AND year BETWEEN 2020 AND 2023 + AND status != 'retracted' + AND reviewer IS NOT NULL +``` + +#### Full filter reference + +| WHERE syntax | Description | +|---|---| +| `field = 'x'` | Exact match | +| `field != 'x'` | Not equal | +| `field > n` | Greater than | +| `field >= n` | Greater than or equal | +| `field < n` | Less than | +| `field <= n` | Less than or equal | +| `field BETWEEN a AND b` | Inclusive range | +| `field IN ('a', 'b')` | Value in list | +| `field NOT IN ('a', 'b')` | Value not in list | +| `field IS NULL` | Field present with an explicit null value | +| `field IS NOT NULL` | Field not explicitly null (absent fields also match) | +| `field IS EMPTY` | Field is missing, null, or an empty list | +| `field IS NOT EMPTY` | Field present with a non-null, non-empty value | +| `field MATCH 'text'` | All terms present (full-text) | +| `field MATCH ANY 'text'` | Any term present (full-text) | +| `field MATCH PHRASE 'text'` | Exact phrase present (full-text) | +| `A AND B` | Both conditions must hold | +| `A OR B` | Either condition must hold | +| `NOT A` | Condition must not hold | +| `(A OR B) AND C` | Parentheses for grouping | +| `meta.source = 'x'` | Dot-notation nested field | + +--- + ### SHOW COLLECTIONS — list collections Lists all collections in the connected Qdrant instance. 
@@ -373,6 +548,12 @@ INSERT INTO docs VALUES {'text': 'hello'} USING MODEL 'BAAI/bge-small-en-v1.5' SEARCH docs SIMILAR TO 'hello' LIMIT 5 USING MODEL 'BAAI/bge-small-en-v1.5' ``` +`USING MODEL` and `WHERE` can be combined: + +```sql +SEARCH docs SIMILAR TO 'hello' LIMIT 5 USING MODEL 'BAAI/bge-small-en-v1.5' WHERE year >= 2022 +``` + ### Commonly available Fastembed models | Model | Dimensions | Notes | @@ -410,7 +591,7 @@ The `VALUES` dictionary (and nested dicts) supports these types: | Nested dict | `{'key': 'val'}` | Arbitrary nesting | | List | `['a', 'b', 1]` | Mixed types allowed | -**Examples of each:** +**Example using every type:** ```sql INSERT INTO demo VALUES { 'text': 'example document', @@ -468,21 +649,29 @@ QQL can also be used as a Python library without the CLI: ```python from qql import run_query -# Single query +# Insert a document result = run_query( - "INSERT INTO COLLECTION notes VALUES {'text': 'hello world', 'author': 'alice'}", + "INSERT INTO COLLECTION notes VALUES {'text': 'hello world', 'author': 'alice', 'year': 2024}", url="http://localhost:6333", ) print(result.message) # "Inserted 1 point []" print(result.data) # {"id": "...", "collection": "notes"} -# Search +# Basic search result = run_query( "SEARCH notes SIMILAR TO 'hello' LIMIT 5", url="http://localhost:6333", ) for hit in result.data: print(hit["score"], hit["id"], hit["payload"]) + +# Search with a WHERE filter +result = run_query( + "SEARCH notes SIMILAR TO 'hello' LIMIT 5 WHERE year >= 2023 AND author != 'bot'", + url="http://localhost:6333", +) +for hit in result.data: + print(hit["score"], hit["payload"]) ``` Or use the pipeline directly for more control: @@ -498,12 +687,13 @@ client = QdrantClient(url="http://localhost:6333") config = QQLConfig(url="http://localhost:6333") executor = Executor(client, config) -query = "SHOW COLLECTIONS" +query = "SEARCH articles SIMILAR TO 'deep learning' LIMIT 10 WHERE category = 'cv'" tokens = Lexer().tokenize(query) node = 
Parser(tokens).parse() result = executor.execute(node) -print(result.data) # ["notes", "articles", ...] +for hit in result.data: + print(hit["score"], hit["payload"]) ``` ### ExecutionResult @@ -541,14 +731,14 @@ qql/ │ ├── config.py # QQLConfig dataclass + ~/.qql/config.json I/O │ ├── exceptions.py # QQLError, QQLSyntaxError, QQLRuntimeError │ ├── lexer.py # Tokenizer: string → List[Token] -│ ├── ast_nodes.py # Frozen dataclasses for each statement type +│ ├── ast_nodes.py # Frozen dataclasses for each statement and filter type │ ├── parser.py # Recursive descent parser: tokens → AST node │ ├── embedder.py # Fastembed wrapper with per-model cache -│ └── executor.py # AST node → Qdrant client call +│ └── executor.py # AST node → Qdrant client call + filter conversion └── tests/ - ├── test_lexer.py # Tokenizer unit tests - ├── test_parser.py # Parser unit tests (all 6 statement types) - └── test_executor.py # Executor unit tests (mocked Qdrant client) + ├── test_lexer.py # Tokenizer unit tests (keywords, operators, dot-paths) + ├── test_parser.py # Parser unit tests (all statements + WHERE filters) + └── test_executor.py # Executor unit tests (mocked Qdrant client + filter builders) ``` --- @@ -561,7 +751,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked. pytest tests/ -v ``` -Expected output: **54 tests passing**. +Expected output: **118 tests passing**. --- @@ -577,3 +767,5 @@ Expected output: **54 tests passing**. 
| `Unexpected token '...'; expected a QQL statement keyword` | Unrecognized statement | Check the query syntax; QQL does not support SQL SELECT | | `Unterminated string literal (at position N)` | A string is missing its closing quote | Close the string with a matching `'` or `"` | | `Unexpected character '@' (at position N)` | A character not part of QQL syntax | Remove or quote the offending character | +| `Expected a filter operator after field '...'` | Unknown operator in WHERE clause | Use one of: `=`, `!=`, `>`, `>=`, `<`, `<=`, `IN`, `NOT IN`, `BETWEEN`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `MATCH` | +| `Expected ')' ...` | Unclosed parenthesis in WHERE clause | Add the missing `)` to close the group | diff --git a/src/qql/ast_nodes.py b/src/qql/ast_nodes.py index a346c57..b8cd943 100644 --- a/src/qql/ast_nodes.py +++ b/src/qql/ast_nodes.py @@ -1,12 +1,122 @@ +from __future__ import annotations + from dataclasses import dataclass -from typing import Any +from typing import Any, Union + + +# ── Filter expression leaf nodes ────────────────────────────────────────────── + +@dataclass(frozen=True) +class CompareExpr: + """field op literal — covers =, !=, >, >=, <, <=""" + field: str + op: str # one of: "=", "!=", ">", ">=", "<", "<=" + value: str | int | float + + +@dataclass(frozen=True) +class BetweenExpr: + """field BETWEEN low AND high""" + field: str + low: int | float + high: int | float + + +@dataclass(frozen=True) +class InExpr: + """field IN (v1, v2, ...)""" + field: str + values: tuple[str | int | float, ...] + + +@dataclass(frozen=True) +class NotInExpr: + """field NOT IN (v1, v2, ...)""" + field: str + values: tuple[str | int | float, ...] 
+ + +@dataclass(frozen=True) +class IsNullExpr: + """field IS NULL""" + field: str + + +@dataclass(frozen=True) +class IsNotNullExpr: + """field IS NOT NULL""" + field: str + + +@dataclass(frozen=True) +class IsEmptyExpr: + """field IS EMPTY""" + field: str + + +@dataclass(frozen=True) +class IsNotEmptyExpr: + """field IS NOT EMPTY""" + field: str + + +@dataclass(frozen=True) +class MatchTextExpr: + """field MATCH 'text' — all terms required (MatchText)""" + field: str + text: str + + +@dataclass(frozen=True) +class MatchAnyExpr: + """field MATCH ANY 'text' — any term matches (MatchTextAny)""" + field: str + text: str + + +@dataclass(frozen=True) +class MatchPhraseExpr: + """field MATCH PHRASE 'text' — exact phrase (MatchPhrase)""" + field: str + text: str + + +# ── Filter expression logical nodes ────────────────────────────────────────── + +@dataclass(frozen=True) +class AndExpr: + """A AND B AND C — flattened into a single node with N operands.""" + operands: tuple[FilterExpr, ...] + + +@dataclass(frozen=True) +class OrExpr: + """A OR B OR C""" + operands: tuple[FilterExpr, ...] 
+ + +@dataclass(frozen=True) +class NotExpr: + """NOT A""" + operand: FilterExpr + + +# Union type covering all filter expression nodes +FilterExpr = Union[ + CompareExpr, BetweenExpr, InExpr, NotInExpr, + IsNullExpr, IsNotNullExpr, IsEmptyExpr, IsNotEmptyExpr, + MatchTextExpr, MatchAnyExpr, MatchPhraseExpr, + AndExpr, OrExpr, NotExpr, +] + +# ── Statement nodes ─────────────────────────────────────────────────────────── @dataclass(frozen=True) class InsertStmt: collection: str values: dict[str, Any] # must contain "text" key - model: str | None # None → use default + model: str | None # None → use default @dataclass(frozen=True) @@ -30,6 +140,7 @@ class SearchStmt: query_text: str limit: int model: str | None + query_filter: FilterExpr | None = None # optional WHERE clause; default keeps existing tests valid @dataclass(frozen=True) @@ -38,7 +149,7 @@ class DeleteStmt: point_id: str | int -# Union type for all statement nodes +# Union type for all top-level statement nodes ASTNode = ( InsertStmt | CreateCollectionStmt diff --git a/src/qql/executor.py b/src/qql/executor.py index ac7a89c..e1936b0 100644 --- a/src/qql/executor.py +++ b/src/qql/executor.py @@ -6,14 +6,45 @@ from qdrant_client import QdrantClient from qdrant_client.http.exceptions import UnexpectedResponse -from qdrant_client.models import Distance, PointStruct, VectorParams +from qdrant_client.models import ( + Distance, + FieldCondition, + Filter, + IsEmptyCondition, + IsNullCondition, + MatchAny, + MatchExcept, + MatchPhrase, + MatchText, + MatchTextAny, + MatchValue, + PayloadField, + PointStruct, + Range, + VectorParams, +) from .ast_nodes import ( ASTNode, + AndExpr, + BetweenExpr, + CompareExpr, CreateCollectionStmt, DeleteStmt, DropCollectionStmt, + FilterExpr, + InExpr, InsertStmt, + IsEmptyExpr, + IsNotEmptyExpr, + IsNotNullExpr, + IsNullExpr, + MatchAnyExpr, + MatchPhraseExpr, + MatchTextExpr, + NotExpr, + NotInExpr, + OrExpr, SearchStmt, ShowCollectionsStmt, ) @@ -84,7 +115,6 @@ def 
_execute_create(self, node: CreateCollectionStmt) -> ExecutionResult: success=True, message=f"Collection '{node.collection}' already exists", ) - # Create with default model dimensions embedder = Embedder(self._config.default_model) dims = embedder.dimensions self._client.create_collection( @@ -122,11 +152,18 @@ def _execute_search(self, node: SearchStmt) -> ExecutionResult: embedder = Embedder(model_name) vector = embedder.embed(node.query_text) + qdrant_filter: Filter | None = None + if node.query_filter is not None: + qdrant_filter = self._wrap_as_filter( + self._build_qdrant_filter(node.query_filter) + ) + try: response = self._client.query_points( collection_name=node.collection, query=vector, limit=node.limit, + query_filter=qdrant_filter, ) except UnexpectedResponse as e: raise QQLRuntimeError(f"Qdrant error during SEARCH: {e}") from e @@ -160,7 +197,100 @@ def _execute_delete(self, node: DeleteStmt) -> ExecutionResult: message=f"Deleted point '{node.point_id}' from '{node.collection}'", ) - # ── Helpers ─────────────────────────────────────────────────────────── + # ── Filter conversion ───────────────────────────────────────────────── + + def _build_qdrant_filter(self, expr: FilterExpr) -> Any: + """Convert a FilterExpr AST node into a Qdrant model object. + + Returns one of: Filter, FieldCondition, IsNullCondition, IsEmptyCondition. + Use _wrap_as_filter() to guarantee the top-level result is a Filter. 
+ """ + # ── Logical combinators ─────────────────────────────────────────── + if isinstance(expr, AndExpr): + return Filter(must=[self._build_qdrant_filter(op) for op in expr.operands]) + + if isinstance(expr, OrExpr): + return Filter(should=[self._build_qdrant_filter(op) for op in expr.operands]) + + if isinstance(expr, NotExpr): + return Filter(must_not=[self._build_qdrant_filter(expr.operand)]) + + # ── Comparison ──────────────────────────────────────────────────── + if isinstance(expr, CompareExpr): + if expr.op == "=": + return FieldCondition( + key=expr.field, match=MatchValue(value=expr.value) + ) + if expr.op == "!=": + return Filter( + must_not=[ + FieldCondition(key=expr.field, match=MatchValue(value=expr.value)) + ] + ) + _range_key = {">": "gt", ">=": "gte", "<": "lt", "<=": "lte"}[expr.op] + return FieldCondition( + key=expr.field, range=Range(**{_range_key: expr.value}) + ) + + # ── BETWEEN ─────────────────────────────────────────────────────── + if isinstance(expr, BetweenExpr): + return FieldCondition( + key=expr.field, range=Range(gte=expr.low, lte=expr.high) + ) + + # ── IN / NOT IN ─────────────────────────────────────────────────── + if isinstance(expr, InExpr): + return FieldCondition( + key=expr.field, match=MatchAny(any=list(expr.values)) + ) + + if isinstance(expr, NotInExpr): + return FieldCondition( + key=expr.field, + match=MatchExcept(**{"except": list(expr.values)}), + ) + + # ── IS NULL / IS NOT NULL ───────────────────────────────────────── + if isinstance(expr, IsNullExpr): + return IsNullCondition(is_null=PayloadField(key=expr.field)) + + if isinstance(expr, IsNotNullExpr): + return Filter( + must_not=[IsNullCondition(is_null=PayloadField(key=expr.field))] + ) + + # ── IS EMPTY / IS NOT EMPTY ─────────────────────────────────────── + if isinstance(expr, IsEmptyExpr): + return IsEmptyCondition(is_empty=PayloadField(key=expr.field)) + + if isinstance(expr, IsNotEmptyExpr): + return Filter( + 
must_not=[IsEmptyCondition(is_empty=PayloadField(key=expr.field))] + ) + + # ── Full-text MATCH ─────────────────────────────────────────────── + if isinstance(expr, MatchTextExpr): + return FieldCondition(key=expr.field, match=MatchText(text=expr.text)) + + if isinstance(expr, MatchAnyExpr): + return FieldCondition( + key=expr.field, match=MatchTextAny(text_any=expr.text) + ) + + if isinstance(expr, MatchPhraseExpr): + return FieldCondition( + key=expr.field, match=MatchPhrase(phrase=expr.text) + ) + + raise QQLRuntimeError(f"Unknown filter expression type: {type(expr)}") + + def _wrap_as_filter(self, qdrant_expr: Any) -> Filter: + """Ensure the top-level expression is a Filter (required by query_points).""" + if isinstance(qdrant_expr, Filter): + return qdrant_expr + return Filter(must=[qdrant_expr]) + + # ── Collection helpers ──────────────────────────────────────────────── def _ensure_collection(self, name: str, vector_size: int) -> None: """Create the collection if it doesn't exist. 
Raises on dimension mismatch.""" diff --git a/src/qql/lexer.py b/src/qql/lexer.py index 7ed13a4..e3bf8e3 100644 --- a/src/qql/lexer.py +++ b/src/qql/lexer.py @@ -5,7 +5,7 @@ class TokenKind(Enum): - # Keywords + # ── Statement keywords ──────────────────────────────────────────────── INSERT = auto() INTO = auto() COLLECTION = auto() @@ -24,24 +24,45 @@ class TokenKind(Enum): FROM = auto() WHERE = auto() ID = auto() - # Literals & names + # ── Filter keywords ─────────────────────────────────────────────────── + AND = auto() + OR = auto() + NOT = auto() + IN = auto() + BETWEEN = auto() + IS = auto() + NULL = auto() + EMPTY = auto() + MATCH = auto() + ANY = auto() + PHRASE = auto() + # ── Literals & names ───────────────────────────────────────────────── IDENTIFIER = auto() STRING = auto() INTEGER = auto() FLOAT = auto() - # Punctuation + # ── Punctuation ─────────────────────────────────────────────────────── LBRACE = auto() RBRACE = auto() LBRACKET = auto() RBRACKET = auto() + LPAREN = auto() + RPAREN = auto() COLON = auto() COMMA = auto() EQUALS = auto() - # Control + # ── Comparison operators ────────────────────────────────────────────── + NOT_EQUALS = auto() # != + GT = auto() # > + GTE = auto() # >= + LT = auto() # < + LTE = auto() # <= + # ── Control ─────────────────────────────────────────────────────────── EOF = auto() _KEYWORDS: dict[str, TokenKind] = { + # Statement keywords "INSERT": TokenKind.INSERT, "INTO": TokenKind.INTO, "COLLECTION": TokenKind.COLLECTION, @@ -60,6 +81,18 @@ class TokenKind(Enum): "FROM": TokenKind.FROM, "WHERE": TokenKind.WHERE, "ID": TokenKind.ID, + # Filter keywords + "AND": TokenKind.AND, + "OR": TokenKind.OR, + "NOT": TokenKind.NOT, + "IN": TokenKind.IN, + "BETWEEN": TokenKind.BETWEEN, + "IS": TokenKind.IS, + "NULL": TokenKind.NULL, + "EMPTY": TokenKind.EMPTY, + "MATCH": TokenKind.MATCH, + "ANY": TokenKind.ANY, + "PHRASE": TokenKind.PHRASE, } @@ -83,7 +116,7 @@ def tokenize(self, query: str) -> list[Token]: ch = query[i] - # 
Single-character punctuation + # ── Braces / brackets / punctuation ────────────────────────── if ch == "{": tokens.append(Token(TokenKind.LBRACE, "{", i)) i += 1 @@ -96,17 +129,45 @@ def tokenize(self, query: str) -> list[Token]: elif ch == "]": tokens.append(Token(TokenKind.RBRACKET, "]", i)) i += 1 + elif ch == "(": + tokens.append(Token(TokenKind.LPAREN, "(", i)) + i += 1 + elif ch == ")": + tokens.append(Token(TokenKind.RPAREN, ")", i)) + i += 1 elif ch == ":": tokens.append(Token(TokenKind.COLON, ":", i)) i += 1 elif ch == ",": tokens.append(Token(TokenKind.COMMA, ",", i)) i += 1 + + # ── Comparison operators (multi-char look-ahead) ────────────── elif ch == "=": tokens.append(Token(TokenKind.EQUALS, "=", i)) i += 1 + elif ch == "!": + if i + 1 < n and query[i + 1] == "=": + tokens.append(Token(TokenKind.NOT_EQUALS, "!=", i)) + i += 2 + else: + raise QQLSyntaxError(f"Unexpected character '!'", i) + elif ch == ">": + if i + 1 < n and query[i + 1] == "=": + tokens.append(Token(TokenKind.GTE, ">=", i)) + i += 2 + else: + tokens.append(Token(TokenKind.GT, ">", i)) + i += 1 + elif ch == "<": + if i + 1 < n and query[i + 1] == "=": + tokens.append(Token(TokenKind.LTE, "<=", i)) + i += 2 + else: + tokens.append(Token(TokenKind.LT, "<", i)) + i += 1 - # String literals + # ── String literals ─────────────────────────────────────────── elif ch in ('"', "'"): start = i quote = ch @@ -114,7 +175,6 @@ def tokenize(self, query: str) -> list[Token]: buf: list[str] = [] while i < n: if query[i] == "\\" and i + 1 < n: - # Handle escape sequences next_ch = query[i + 1] if next_ch == "n": buf.append("\n") @@ -136,7 +196,7 @@ def tokenize(self, query: str) -> list[Token]: raise QQLSyntaxError("Unterminated string literal", start) tokens.append(Token(TokenKind.STRING, "".join(buf), start)) - # Numbers: optional leading minus + # ── Numbers: optional leading minus ─────────────────────────── elif ch.isdigit() or (ch == "-" and i + 1 < n and query[i + 1].isdigit()): start = i if 
ch == "-": @@ -151,14 +211,40 @@ def tokenize(self, query: str) -> list[Token]: else: tokens.append(Token(TokenKind.INTEGER, query[start:i], start)) - # Identifiers and keywords + # ── Identifiers, keywords, and dot-notation field paths ──────── elif ch.isalpha() or ch == "_": start = i + # Collect the base word while i < n and (query[i].isalnum() or query[i] == "_"): i += 1 + # Extend for dotted field paths: consume ".word" and "[].word" segments + # so that meta.source and country.cities[].population become single tokens. + while i < n: + if query[i] == "." and i + 1 < n and (query[i + 1].isalpha() or query[i + 1] == "_"): + # ".identifier" segment + i += 1 # consume "." + while i < n and (query[i].isalnum() or query[i] == "_"): + i += 1 + elif ( + i + 2 < n + and query[i : i + 3] == "[]." + and i + 3 < n + and (query[i + 3].isalpha() or query[i + 3] == "_") + ): + # "[]." array marker segment + i += 3 # consume "[]." + while i < n and (query[i].isalnum() or query[i] == "_"): + i += 1 + else: + break word = query[start:i] - upper = word.upper() - kind = _KEYWORDS.get(upper, TokenKind.IDENTIFIER) + # Keyword lookup uses the uppercased first segment only for dotted paths + # so that field names like "meta.from" are always IDENTIFIER, not keywords. + first_segment = word.split(".")[0].upper() + if "." 
not in word and first_segment in _KEYWORDS: + kind = _KEYWORDS[first_segment] + else: + kind = TokenKind.IDENTIFIER tokens.append(Token(kind, word, start)) else: diff --git a/src/qql/parser.py b/src/qql/parser.py index 06eb619..17fe0f9 100644 --- a/src/qql/parser.py +++ b/src/qql/parser.py @@ -2,16 +2,41 @@ from .ast_nodes import ( ASTNode, + AndExpr, + BetweenExpr, + CompareExpr, CreateCollectionStmt, DeleteStmt, DropCollectionStmt, + FilterExpr, + InExpr, InsertStmt, + IsEmptyExpr, + IsNotEmptyExpr, + IsNotNullExpr, + IsNullExpr, + MatchAnyExpr, + MatchPhraseExpr, + MatchTextExpr, + NotExpr, + NotInExpr, + OrExpr, SearchStmt, ShowCollectionsStmt, ) from .exceptions import QQLSyntaxError from .lexer import Token, TokenKind +# Comparison operator token → string symbol mapping +_CMP_OPS: dict[TokenKind, str] = { + TokenKind.EQUALS: "=", + TokenKind.NOT_EQUALS: "!=", + TokenKind.GT: ">", + TokenKind.GTE: ">=", + TokenKind.LT: "<", + TokenKind.LTE: "<=", +} + class Parser: def __init__(self, tokens: list[Token]) -> None: @@ -88,7 +113,17 @@ def _parse_search(self) -> SearchStmt: self._advance() self._expect(TokenKind.MODEL) model = self._expect(TokenKind.STRING).value - return SearchStmt(collection=collection, query_text=query_text, limit=limit, model=model) + query_filter: FilterExpr | None = None + if self._peek().kind == TokenKind.WHERE: + self._advance() # consume WHERE + query_filter = self._parse_filter_expr() + return SearchStmt( + collection=collection, + query_text=query_text, + limit=limit, + model=model, + query_filter=query_filter, + ) def _parse_delete(self) -> DeleteStmt: self._expect(TokenKind.DELETE) @@ -110,10 +145,187 @@ def _parse_delete(self) -> DeleteStmt: ) return DeleteStmt(collection=collection, point_id=point_id) - # ── Value parsers ───────────────────────────────────────────────────── + # ── WHERE clause filter parsing (precedence: NOT > AND > OR) ───────── + + def _parse_filter_expr(self) -> FilterExpr: + """filter_or ::= filter_and { OR 
filter_and }""" + left = self._parse_filter_and() + if self._peek().kind != TokenKind.OR: + return left + operands: list[FilterExpr] = [left] + while self._peek().kind == TokenKind.OR: + self._advance() # consume OR + operands.append(self._parse_filter_and()) + return OrExpr(operands=tuple(operands)) + + def _parse_filter_and(self) -> FilterExpr: + """filter_and ::= filter_not { AND filter_not }""" + left = self._parse_filter_not() + if self._peek().kind != TokenKind.AND: + return left + operands: list[FilterExpr] = [left] + while self._peek().kind == TokenKind.AND: + self._advance() # consume AND + operands.append(self._parse_filter_not()) + return AndExpr(operands=tuple(operands)) + + def _parse_filter_not(self) -> FilterExpr: + """filter_not ::= NOT filter_not | filter_primary""" + if self._peek().kind == TokenKind.NOT: + self._advance() # consume NOT + return NotExpr(operand=self._parse_filter_not()) # right-recursive + return self._parse_filter_primary() + + def _parse_filter_primary(self) -> FilterExpr: + """filter_primary ::= '(' filter_expr ')' | predicate""" + if self._peek().kind == TokenKind.LPAREN: + self._advance() # consume ( + expr = self._parse_filter_expr() + self._expect(TokenKind.RPAREN) + return expr + return self._parse_predicate() + + def _parse_predicate(self) -> FilterExpr: + """All leaf filter conditions.""" + field = self._parse_field_path() + tok = self._peek() + + # ── IS NULL / IS NOT NULL / IS EMPTY / IS NOT EMPTY ────────────── + if tok.kind == TokenKind.IS: + self._advance() # consume IS + if self._peek().kind == TokenKind.NOT: + self._advance() # consume NOT + if self._peek().kind == TokenKind.NULL: + self._advance() + return IsNotNullExpr(field=field) + if self._peek().kind == TokenKind.EMPTY: + self._advance() + return IsNotEmptyExpr(field=field) + raise QQLSyntaxError( + "Expected NULL or EMPTY after IS NOT", self._peek().pos + ) + if self._peek().kind == TokenKind.NULL: + self._advance() + return IsNullExpr(field=field) + if 
self._peek().kind == TokenKind.EMPTY: + self._advance() + return IsEmptyExpr(field=field) + raise QQLSyntaxError( + "Expected NULL, NOT NULL, EMPTY, or NOT EMPTY after IS", self._peek().pos + ) + + # ── IN ( ... ) ──────────────────────────────────────────────────── + if tok.kind == TokenKind.IN: + self._advance() # consume IN + values = self._parse_literal_list() + return InExpr(field=field, values=tuple(values)) + + # ── NOT IN ( ... ) ──────────────────────────────────────────────── + if tok.kind == TokenKind.NOT: + self._advance() # consume NOT + self._expect(TokenKind.IN) + values = self._parse_literal_list() + return NotInExpr(field=field, values=tuple(values)) + + # ── BETWEEN low AND high ────────────────────────────────────────── + if tok.kind == TokenKind.BETWEEN: + self._advance() # consume BETWEEN + low = self._parse_number() + self._expect(TokenKind.AND) # consumes AND as separator (not logical AND) + high = self._parse_number() + return BetweenExpr(field=field, low=low, high=high) + + # ── MATCH / MATCH ANY / MATCH PHRASE ───────────────────────────── + if tok.kind == TokenKind.MATCH: + self._advance() # consume MATCH + if self._peek().kind == TokenKind.ANY: + self._advance() + text = self._expect(TokenKind.STRING).value + return MatchAnyExpr(field=field, text=text) + if self._peek().kind == TokenKind.PHRASE: + self._advance() + text = self._expect(TokenKind.STRING).value + return MatchPhraseExpr(field=field, text=text) + # plain MATCH — all terms required + text = self._expect(TokenKind.STRING).value + return MatchTextExpr(field=field, text=text) + + # ── Comparison operators: =, !=, >, >=, <, <= ──────────────────── + if tok.kind in _CMP_OPS: + op = _CMP_OPS[tok.kind] + self._advance() + value = self._parse_literal() + return CompareExpr(field=field, op=op, value=value) + + raise QQLSyntaxError( + f"Expected a filter operator after field '{field}', got '{tok.value}'", + tok.pos, + ) + + # ── Filter parsing helpers 
──────────────────────────────────────────── + + def _parse_field_path(self) -> str: + """Dot-notation paths are already single IDENTIFIER tokens from the lexer.""" + tok = self._peek() + if tok.kind != TokenKind.IDENTIFIER: + raise QQLSyntaxError( + f"Expected a field name, got '{tok.value}'", tok.pos + ) + self._advance() + return tok.value + + def _parse_literal(self) -> str | int | float: + """STRING | INTEGER | FLOAT""" + tok = self._peek() + if tok.kind == TokenKind.STRING: + self._advance() + return tok.value + if tok.kind == TokenKind.INTEGER: + self._advance() + return int(tok.value) + if tok.kind == TokenKind.FLOAT: + self._advance() + return float(tok.value) + raise QQLSyntaxError( + f"Expected a literal value (string, integer, or float), got '{tok.value}'", + tok.pos, + ) + + def _parse_number(self) -> int | float: + """INTEGER | FLOAT only (used by BETWEEN).""" + tok = self._peek() + if tok.kind == TokenKind.INTEGER: + self._advance() + return int(tok.value) + if tok.kind == TokenKind.FLOAT: + self._advance() + return float(tok.value) + raise QQLSyntaxError( + f"Expected a number, got '{tok.value}'", tok.pos + ) + + def _parse_literal_list(self) -> list[str | int | float]: + """'(' literal { ',' literal } [','] ')' — used by IN / NOT IN.""" + self._expect(TokenKind.LPAREN) + items: list[str | int | float] = [] + if self._peek().kind == TokenKind.RPAREN: + self._advance() + return items + while True: + items.append(self._parse_literal()) + if self._peek().kind == TokenKind.COMMA: + self._advance() + if self._peek().kind == TokenKind.RPAREN: + break # trailing comma allowed + else: + break + self._expect(TokenKind.RPAREN) + return items + + # ── Dict / value parsers (for INSERT VALUES) ────────────────────────── def _parse_identifier(self) -> str: - """Accept either a bare IDENTIFIER or a quoted STRING as a name.""" + """Accept either a bare IDENTIFIER or a quoted STRING as a collection name.""" tok = self._peek() if tok.kind == TokenKind.IDENTIFIER: 
self._advance() @@ -144,9 +356,8 @@ def _parse_dict(self) -> dict[str, Any]: result[key] = value if self._peek().kind == TokenKind.COMMA: self._advance() - # Allow trailing comma if self._peek().kind == TokenKind.RBRACE: - break + break # trailing comma else: break self._expect(TokenKind.RBRACE) @@ -180,6 +391,10 @@ def _parse_value(self) -> Any: if tok.kind == TokenKind.INTEGER: self._advance() return int(tok.value) + if tok.kind == TokenKind.NULL: + # NULL is now a keyword token + self._advance() + return None if tok.kind == TokenKind.IDENTIFIER: upper = tok.value.upper() if upper == "TRUE": @@ -189,6 +404,8 @@ def _parse_value(self) -> Any: self._advance() return False if upper == "NULL": + # Fallback: handle 'null' that arrived as IDENTIFIER (shouldn't happen + # after lexer change, but kept for safety) self._advance() return None self._advance() diff --git a/tests/test_executor.py b/tests/test_executor.py index 3fd0a65..b3e45af 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -166,3 +166,218 @@ def test_delete_nonexistent_collection_raises(self, executor, mock_client): node = DeleteStmt(collection="ghost", point_id="x") with pytest.raises(QQLRuntimeError, match="does not exist"): executor.execute(node) + + +class TestSearchWithFilter: + """Tests for _build_qdrant_filter and filter pass-through in _execute_search.""" + + def _search_node(self, query_filter=None): + return SearchStmt( + collection="docs", query_text="hello", limit=5, model=None, + query_filter=query_filter, + ) + + def test_search_without_filter_passes_none_to_qdrant(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.points = [] + mock_client.query_points.return_value = mock_response + + executor.execute(self._search_node()) + + call_kwargs = mock_client.query_points.call_args.kwargs + assert call_kwargs.get("query_filter") is None + + def 
test_search_with_filter_passes_filter_to_qdrant(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.points = [] + mock_client.query_points.return_value = mock_response + + from qql.ast_nodes import CompareExpr + node = self._search_node(query_filter=CompareExpr(field="cat", op="=", value="ai")) + executor.execute(node) + + call_kwargs = mock_client.query_points.call_args.kwargs + assert call_kwargs.get("query_filter") is not None + + # ── _build_qdrant_filter unit tests (no Qdrant connection needed) ───── + + def test_build_equality(self, executor): + from qdrant_client.models import FieldCondition, Filter, MatchValue + from qql.ast_nodes import CompareExpr + + result = executor._wrap_as_filter( + executor._build_qdrant_filter(CompareExpr(field="status", op="=", value="active")) + ) + assert isinstance(result, Filter) + fc = result.must[0] + assert isinstance(fc, FieldCondition) + assert fc.match == MatchValue(value="active") + + def test_build_not_equals(self, executor): + from qdrant_client.models import Filter + from qql.ast_nodes import CompareExpr + + result = executor._build_qdrant_filter(CompareExpr(field="s", op="!=", value="x")) + assert isinstance(result, Filter) + assert result.must_not is not None and len(result.must_not) == 1 + + def test_build_range_gt(self, executor): + from qdrant_client.models import FieldCondition + from qql.ast_nodes import CompareExpr + + result = executor._build_qdrant_filter(CompareExpr(field="score", op=">", value=0.8)) + assert isinstance(result, FieldCondition) + assert result.range.gt == pytest.approx(0.8) + + def test_build_range_gte(self, executor): + from qdrant_client.models import FieldCondition + from qql.ast_nodes import CompareExpr + + result = executor._build_qdrant_filter(CompareExpr(field="year", op=">=", value=2020)) + assert isinstance(result, FieldCondition) + assert result.range.gte == 2020 + + def 
test_build_range_lt(self, executor): + from qdrant_client.models import FieldCondition + from qql.ast_nodes import CompareExpr + + result = executor._build_qdrant_filter(CompareExpr(field="year", op="<", value=2024)) + assert isinstance(result, FieldCondition) + assert result.range.lt == 2024 + + def test_build_range_lte(self, executor): + from qdrant_client.models import FieldCondition + from qql.ast_nodes import CompareExpr + + result = executor._build_qdrant_filter(CompareExpr(field="year", op="<=", value=2023)) + assert isinstance(result, FieldCondition) + assert result.range.lte == 2023 + + def test_build_between(self, executor): + from qdrant_client.models import FieldCondition + from qql.ast_nodes import BetweenExpr + + result = executor._build_qdrant_filter(BetweenExpr(field="year", low=2018, high=2023)) + assert isinstance(result, FieldCondition) + assert result.range.gte == 2018 + assert result.range.lte == 2023 + + def test_build_in(self, executor): + from qdrant_client.models import FieldCondition, MatchAny + from qql.ast_nodes import InExpr + + result = executor._build_qdrant_filter(InExpr(field="status", values=("a", "b"))) + assert isinstance(result, FieldCondition) + assert isinstance(result.match, MatchAny) + + def test_build_not_in(self, executor): + from qdrant_client.models import FieldCondition, MatchExcept + from qql.ast_nodes import NotInExpr + + result = executor._build_qdrant_filter(NotInExpr(field="status", values=("deleted",))) + assert isinstance(result, FieldCondition) + assert isinstance(result.match, MatchExcept) + + def test_build_is_null(self, executor): + from qdrant_client.models import IsNullCondition + from qql.ast_nodes import IsNullExpr + + result = executor._build_qdrant_filter(IsNullExpr(field="reviewer")) + assert isinstance(result, IsNullCondition) + + def test_build_is_not_null(self, executor): + from qdrant_client.models import Filter, IsNullCondition + from qql.ast_nodes import IsNotNullExpr + + result = 
executor._build_qdrant_filter(IsNotNullExpr(field="reviewer")) + assert isinstance(result, Filter) + assert isinstance(result.must_not[0], IsNullCondition) + + def test_build_is_empty(self, executor): + from qdrant_client.models import IsEmptyCondition + from qql.ast_nodes import IsEmptyExpr + + result = executor._build_qdrant_filter(IsEmptyExpr(field="tags")) + assert isinstance(result, IsEmptyCondition) + + def test_build_is_not_empty(self, executor): + from qdrant_client.models import Filter, IsEmptyCondition + from qql.ast_nodes import IsNotEmptyExpr + + result = executor._build_qdrant_filter(IsNotEmptyExpr(field="tags")) + assert isinstance(result, Filter) + assert isinstance(result.must_not[0], IsEmptyCondition) + + def test_build_match_text(self, executor): + from qdrant_client.models import FieldCondition, MatchText + from qql.ast_nodes import MatchTextExpr + + result = executor._build_qdrant_filter(MatchTextExpr(field="title", text="vector db")) + assert isinstance(result, FieldCondition) + assert isinstance(result.match, MatchText) + assert result.match.text == "vector db" + + def test_build_match_any(self, executor): + from qdrant_client.models import FieldCondition, MatchTextAny + from qql.ast_nodes import MatchAnyExpr + + result = executor._build_qdrant_filter(MatchAnyExpr(field="title", text="nlp ai")) + assert isinstance(result, FieldCondition) + assert isinstance(result.match, MatchTextAny) + + def test_build_match_phrase(self, executor): + from qdrant_client.models import FieldCondition, MatchPhrase + from qql.ast_nodes import MatchPhraseExpr + + result = executor._build_qdrant_filter(MatchPhraseExpr(field="title", text="quick fox")) + assert isinstance(result, FieldCondition) + assert isinstance(result.match, MatchPhrase) + + def test_build_and(self, executor): + from qdrant_client.models import Filter + from qql.ast_nodes import AndExpr, CompareExpr + + expr = AndExpr(operands=( + CompareExpr(field="a", op="=", value="x"), + 
CompareExpr(field="b", op="=", value="y"), + )) + result = executor._build_qdrant_filter(expr) + assert isinstance(result, Filter) + assert len(result.must) == 2 + + def test_build_or(self, executor): + from qdrant_client.models import Filter + from qql.ast_nodes import CompareExpr, OrExpr + + expr = OrExpr(operands=( + CompareExpr(field="src", op="=", value="arxiv"), + CompareExpr(field="src", op="=", value="ieee"), + )) + result = executor._build_qdrant_filter(expr) + assert isinstance(result, Filter) + assert len(result.should) == 2 + + def test_build_not(self, executor): + from qdrant_client.models import Filter + from qql.ast_nodes import CompareExpr, NotExpr + + expr = NotExpr(operand=CompareExpr(field="st", op="=", value="draft")) + result = executor._build_qdrant_filter(expr) + assert isinstance(result, Filter) + assert result.must_not is not None + + def test_wrap_as_filter_passthrough(self, executor): + from qdrant_client.models import Filter + + f = Filter(must=[]) + assert executor._wrap_as_filter(f) is f + + def test_wrap_as_filter_wraps_field_condition(self, executor): + from qdrant_client.models import FieldCondition, Filter, MatchValue + + fc = FieldCondition(key="x", match=MatchValue(value="y")) + result = executor._wrap_as_filter(fc) + assert isinstance(result, Filter) + assert result.must[0] is fc diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 3b13b09..93a00ea 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -105,6 +105,82 @@ def test_error_includes_position(self): assert exc_info.value.pos is not None +class TestNewOperators: + def test_not_equals(self): + tokens = tokenize("field != 'x'") + assert tokens[1].kind == TokenKind.NOT_EQUALS + assert tokens[1].value == "!=" + + def test_gt(self): + tokens = tokenize("score > 0.5") + assert tokens[1].kind == TokenKind.GT + assert tokens[1].value == ">" + + def test_gte(self): + tokens = tokenize("score >= 0.5") + assert tokens[1].kind == TokenKind.GTE + assert 
tokens[1].value == ">=" + + def test_lt(self): + tokens = tokenize("year < 2024") + assert tokens[1].kind == TokenKind.LT + assert tokens[1].value == "<" + + def test_lte(self): + tokens = tokenize("year <= 2023") + assert tokens[1].kind == TokenKind.LTE + assert tokens[1].value == "<=" + + def test_lparen_rparen(self): + ks = kinds("(a OR b)") + assert TokenKind.LPAREN in ks + assert TokenKind.RPAREN in ks + + def test_filter_keywords(self): + ks = kinds("AND OR NOT IN BETWEEN IS NULL EMPTY MATCH ANY PHRASE") + assert TokenKind.AND in ks + assert TokenKind.OR in ks + assert TokenKind.NOT in ks + assert TokenKind.IN in ks + assert TokenKind.BETWEEN in ks + assert TokenKind.IS in ks + assert TokenKind.NULL in ks + assert TokenKind.EMPTY in ks + assert TokenKind.MATCH in ks + assert TokenKind.ANY in ks + assert TokenKind.PHRASE in ks + + def test_filter_keywords_case_insensitive(self): + ks = kinds("and or not in between is null empty match any phrase") + assert TokenKind.AND in ks + assert TokenKind.OR in ks + assert TokenKind.NOT in ks + + def test_dotted_identifier(self): + tokens = tokenize("meta.source") + assert tokens[0].kind == TokenKind.IDENTIFIER + assert tokens[0].value == "meta.source" + + def test_three_level_dotted_identifier(self): + tokens = tokenize("a.b.c") + assert tokens[0].kind == TokenKind.IDENTIFIER + assert tokens[0].value == "a.b.c" + + def test_nested_array_path(self): + tokens = tokenize("country.cities[].population") + assert tokens[0].kind == TokenKind.IDENTIFIER + assert tokens[0].value == "country.cities[].population" + + def test_gt_does_not_consume_equals_sign(self): + # ">" followed by non-"=" should be GT only + tokens = tokenize("a > b") + assert tokens[1].kind == TokenKind.GT + + def test_bare_exclamation_raises(self): + with pytest.raises(QQLSyntaxError): + tokenize("field ! 
'x'") + + class TestEOF: def test_ends_with_eof(self): tokens = tokenize("hello") diff --git a/tests/test_parser.py b/tests/test_parser.py index d296c8b..adcb2ed 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,10 +1,24 @@ import pytest from qql.ast_nodes import ( + AndExpr, + BetweenExpr, + CompareExpr, CreateCollectionStmt, DeleteStmt, DropCollectionStmt, + InExpr, InsertStmt, + IsEmptyExpr, + IsNotEmptyExpr, + IsNotNullExpr, + IsNullExpr, + MatchAnyExpr, + MatchPhraseExpr, + MatchTextExpr, + NotExpr, + NotInExpr, + OrExpr, SearchStmt, ShowCollectionsStmt, ) @@ -132,3 +146,183 @@ def test_missing_collection_name(self): def test_empty_input(self): with pytest.raises(QQLSyntaxError): parse("") + + +class TestSearchWithWhere: + def test_no_where_clause(self): + node = parse("SEARCH docs SIMILAR TO 'ml' LIMIT 5") + assert node.query_filter is None + + def test_equality_filter(self): + node = parse("SEARCH docs SIMILAR TO 'ml' LIMIT 5 WHERE category = 'paper'") + f = node.query_filter + assert isinstance(f, CompareExpr) + assert f.field == "category" + assert f.op == "=" + assert f.value == "paper" + + def test_not_equals_filter(self): + node = parse("SEARCH docs SIMILAR TO 'ml' LIMIT 5 WHERE status != 'draft'") + f = node.query_filter + assert isinstance(f, CompareExpr) + assert f.op == "!=" + assert f.value == "draft" + + def test_range_gt(self): + node = parse("SEARCH docs SIMILAR TO 'ml' LIMIT 5 WHERE score > 0.8") + f = node.query_filter + assert isinstance(f, CompareExpr) + assert f.op == ">" + assert f.value == pytest.approx(0.8) + + def test_range_gte(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE year >= 2020") + assert isinstance(node.query_filter, CompareExpr) + assert node.query_filter.op == ">=" + assert node.query_filter.value == 2020 + + def test_range_lt(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE year < 2024") + assert isinstance(node.query_filter, CompareExpr) + assert node.query_filter.op == "<" 
+ + def test_range_lte(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE year <= 2023") + assert isinstance(node.query_filter, CompareExpr) + assert node.query_filter.op == "<=" + + def test_between(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE year BETWEEN 2018 AND 2023") + f = node.query_filter + assert isinstance(f, BetweenExpr) + assert f.field == "year" + assert f.low == 2018 + assert f.high == 2023 + + def test_in_expr(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE status IN ('a', 'b')") + f = node.query_filter + assert isinstance(f, InExpr) + assert f.field == "status" + assert f.values == ("a", "b") + + def test_in_with_trailing_comma(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE status IN ('a', 'b',)") + assert isinstance(node.query_filter, InExpr) + assert len(node.query_filter.values) == 2 + + def test_not_in_expr(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE status NOT IN ('deleted', 'archived')") + f = node.query_filter + assert isinstance(f, NotInExpr) + assert f.values == ("deleted", "archived") + + def test_is_null(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE reviewer IS NULL") + f = node.query_filter + assert isinstance(f, IsNullExpr) + assert f.field == "reviewer" + + def test_is_not_null(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE reviewer IS NOT NULL") + assert isinstance(node.query_filter, IsNotNullExpr) + assert node.query_filter.field == "reviewer" + + def test_is_empty(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE tags IS EMPTY") + assert isinstance(node.query_filter, IsEmptyExpr) + assert node.query_filter.field == "tags" + + def test_is_not_empty(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE tags IS NOT EMPTY") + assert isinstance(node.query_filter, IsNotEmptyExpr) + + def test_match_text(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE title MATCH 'deep learning'") + 
f = node.query_filter + assert isinstance(f, MatchTextExpr) + assert f.field == "title" + assert f.text == "deep learning" + + def test_match_any(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE title MATCH ANY 'nlp ai'") + f = node.query_filter + assert isinstance(f, MatchAnyExpr) + assert f.text == "nlp ai" + + def test_match_phrase(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE title MATCH PHRASE 'neural net'") + assert isinstance(node.query_filter, MatchPhraseExpr) + + def test_and_expr_two_operands(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE a = '1' AND b = '2'") + f = node.query_filter + assert isinstance(f, AndExpr) + assert len(f.operands) == 2 + assert all(isinstance(op, CompareExpr) for op in f.operands) + + def test_and_expr_three_operands_flattened(self): + node = parse( + "SEARCH d SIMILAR TO 'x' LIMIT 5 WHERE a = '1' AND b = '2' AND c = '3'" + ) + f = node.query_filter + assert isinstance(f, AndExpr) + assert len(f.operands) == 3 # flattened, not binary-nested + + def test_or_expr(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE a = '1' OR b = '2'") + f = node.query_filter + assert isinstance(f, OrExpr) + assert len(f.operands) == 2 + + def test_not_expr(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE NOT status = 'draft'") + f = node.query_filter + assert isinstance(f, NotExpr) + assert isinstance(f.operand, CompareExpr) + + def test_parenthesized_or_inside_and(self): + node = parse( + "SEARCH docs SIMILAR TO 'x' LIMIT 5 " + "WHERE (src = 'a' OR src = 'b') AND year > 2020" + ) + f = node.query_filter + assert isinstance(f, AndExpr) + assert isinstance(f.operands[0], OrExpr) + assert isinstance(f.operands[1], CompareExpr) + + def test_dotted_field_path(self): + node = parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE meta.source = 'web'") + assert isinstance(node.query_filter, CompareExpr) + assert node.query_filter.field == "meta.source" + + def 
test_using_model_then_where(self): + node = parse( + "SEARCH docs SIMILAR TO 'x' LIMIT 5 " + "USING MODEL 'my-model' WHERE category = 'paper'" + ) + assert node.model == "my-model" + assert isinstance(node.query_filter, CompareExpr) + + def test_between_and_does_not_confuse_logical_and(self): + # The AND inside BETWEEN must not be consumed by the logical AND loop + node = parse( + "SEARCH d SIMILAR TO 'x' LIMIT 5 WHERE year BETWEEN 2018 AND 2023 AND category = 'ai'" + ) + f = node.query_filter + assert isinstance(f, AndExpr) + assert isinstance(f.operands[0], BetweenExpr) + assert isinstance(f.operands[1], CompareExpr) + assert len(f.operands) == 2 + + def test_not_negates_parenthesized_group(self): + node = parse( + "SEARCH d SIMILAR TO 'x' LIMIT 5 WHERE NOT (a = '1' OR b = '2')" + ) + f = node.query_filter + assert isinstance(f, NotExpr) + assert isinstance(f.operand, OrExpr) + + def test_missing_rparen_raises(self): + with pytest.raises(QQLSyntaxError): + parse("SEARCH docs SIMILAR TO 'x' LIMIT 5 WHERE (a = '1'")