From 573f850c5f2ba04253748b193b3e25ea7e111e77 Mon Sep 17 00:00:00 2001 From: "manthapavankumar11@gmail.com" Date: Fri, 15 May 2026 13:21:36 +0530 Subject: [PATCH 1/4] 2 new concepts 1. Update and 2.Group By --- README.md | 21 +++- docs/collections.md | 65 ++++++++++++ docs/reference.md | 10 +- docs/search.md | 65 ++++++++++++ src/qql/ast_nodes.py | 21 ++++ src/qql/executor.py | 142 +++++++++++++++++++++++++++ src/qql/lexer.py | 12 +++ src/qql/parser.py | 70 +++++++++++++ tests/test_executor.py | 217 +++++++++++++++++++++++++++++++++++++++++ tests/test_lexer.py | 49 ++++++++++ tests/test_parser.py | 179 +++++++++++++++++++++++++++++++++ 11 files changed, 844 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 01e1fcd..ff85b01 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ [![PyPI version](https://img.shields.io/pypi/v/qql-cli?color=blue&label=PyPI)](https://pypi.org/project/qql-cli/) [![Python 3.12+](https://img.shields.io/pypi/pyversions/qql-cli)](https://pypi.org/project/qql-cli/) [![MIT License](https://img.shields.io/badge/license-MIT-green)](LICENSE) -[![Tests](https://img.shields.io/badge/tests-405%20passing-brightgreen)](tests/) +[![Tests](https://img.shields.io/badge/tests-485%20passing-brightgreen)](tests/) -Write `INSERT`, `SELECT`, `SEARCH`, `SCROLL`, `RECOMMEND`, `DELETE`, and `CREATE COLLECTION` statements instead of Python SDK calls. Supports hybrid dense+sparse vector search, cross-encoder reranking, quantization (scalar, turbo, binary, product), SQL-style `WHERE` filters, script execution, and collection dump/restore. +Write `INSERT`, `SELECT`, `SEARCH`, `SCROLL`, `RECOMMEND`, `UPDATE`, `DELETE`, and `CREATE COLLECTION` statements instead of Python SDK calls. Supports hybrid dense+sparse vector search, grouped search (GROUP BY), cross-encoder reranking, quantization (scalar, turbo, binary, product), SQL-style `WHERE` filters, script execution, and collection dump/restore. 
``` qql> INSERT INTO COLLECTION notes VALUES {'text': 'Qdrant is a vector database', 'author': 'alice', 'year': 2024} @@ -82,9 +82,9 @@ Full documentation lives in the [`docs/`](docs/) folder and at **[pavanjava.gith |---|---| | [Getting Started](docs/getting-started.md) | Installation, connecting, first queries | | [INSERT / INSERT BULK](docs/insert.md) | Adding documents, batch inserts, payload types | -| [SEARCH / SELECT / SCROLL / RECOMMEND / Hybrid / RERANK](docs/search.md) | Semantic search, point retrieval, pagination, hybrid, reranking, recommendations | +| [SEARCH / SELECT / SCROLL / RECOMMEND / Hybrid / GROUP BY / RERANK](docs/search.md) | Semantic search, grouped search, point retrieval, pagination, hybrid, reranking, recommendations | | [WHERE Filters](docs/filters.md) | Full SQL-style filter operators | -| [Collections & Quantization](docs/collections.md) | SHOW, CREATE, DROP, QUANTIZE (scalar/turbo/binary/product), CREATE INDEX | +| [Collections & Quantization](docs/collections.md) | SHOW, CREATE, DROP, QUANTIZE (scalar/turbo/binary/product), CREATE INDEX, UPDATE VECTOR, UPDATE PAYLOAD | | [Scripts: EXECUTE / DUMP](docs/scripts.md) | Script files, collection backup/restore | | [Programmatic Usage](docs/programmatic.md) | Use QQL as a Python library | | [Reference: Models / Config / Errors](docs/reference.md) | Embedding models, config file, error reference | @@ -128,6 +128,17 @@ SHOW COLLECTIONS SHOW COLLECTION articles DROP COLLECTION articles +-- Search with grouping +SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY category +SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY category GROUP_SIZE 3 +SEARCH articles SIMILAR TO 'query' LIMIT 5 WHERE year >= 2020 GROUP BY category GROUP_SIZE 2 +SEARCH articles SIMILAR TO 'query' LIMIT 5 USING HYBRID GROUP BY category + +-- Update +UPDATE articles SET VECTOR WHERE id = '3f2e1a4b-...' [0.1, 0.2, 0.3, 0.4] +UPDATE articles SET PAYLOAD WHERE id = '3f2e1a4b-...' 
{'year': 2025, 'status': 'active'} +UPDATE articles SET PAYLOAD WHERE category = 'draft' {'status': 'published'} + -- Delete DELETE FROM articles WHERE id = '3f2e1a4b-...' DELETE FROM articles WHERE year < 2020 @@ -147,7 +158,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked. pytest tests/ -v ``` -Expected: **405 tests passing**. +Expected: **485 tests passing**. --- diff --git a/docs/collections.md b/docs/collections.md index a36e6e6..fbc4d14 100644 --- a/docs/collections.md +++ b/docs/collections.md @@ -310,3 +310,68 @@ DELETE FROM articles WHERE year < 2020 AND status = 'draft' **Notes:** - If no points match the filter or ID, the operation succeeds silently with a count of 0. - The collection itself must exist; deleting from a non-existent collection raises an error. + +--- + +## UPDATE SET VECTOR — replace a point's dense vector + +Replaces the stored dense vector for a **single point** identified by its ID. The point must already exist in the collection. Use this when you want to refresh an embedding without changing the payload. + +**Syntax:** +``` +UPDATE <collection> SET VECTOR WHERE id = '<point_id>' [<vector>] +UPDATE <collection> SET VECTOR WHERE id = <integer_id> [<vector>] +``` + +The vector is provided as a JSON-style float array `[v1, v2, ..., vN]`. The array length must match the collection's configured vector dimensions. + +**Examples:** + +```sql +-- Replace vector by UUID +UPDATE articles SET VECTOR WHERE id = '3f2e1a4b-8c91-4d0e-b123-abc123def456' [0.1, 0.2, 0.3, 0.4] + +-- Replace vector by integer ID +UPDATE articles SET VECTOR WHERE id = 42 [0.1, 0.2, 0.3, 0.4] +``` + +**Notes:** +- Only single-point updates are supported (by ID). Bulk or filter-based vector updates are not supported. +- The point must already exist; this operation does not create new points. +- The collection must exist; updating a non-existent collection raises an error. +- For hybrid collections, the dense vector named `"dense"` is updated. Sparse vectors are managed separately. 
+ +--- + +## UPDATE SET PAYLOAD — merge fields into a point's payload + +Merges new key/value pairs into the payload of one or more points. **Existing fields not mentioned in the update are preserved** (additive merge, not a full replace). Use a `WHERE` filter to update multiple points at once. + +**Syntax:** +``` +UPDATE <collection> SET PAYLOAD WHERE id = '<point_id>' {<payload>} +UPDATE <collection> SET PAYLOAD WHERE id = <integer_id> {<payload>} +UPDATE <collection> SET PAYLOAD WHERE <filter> {<payload>} +``` + +**Examples:** + +```sql +-- Update a single point by UUID +UPDATE articles SET PAYLOAD WHERE id = '3f2e1a4b-8c91-4d0e-b123-abc123def456' {'year': 2025, 'status': 'active'} + +-- Update a single point by integer ID +UPDATE articles SET PAYLOAD WHERE id = 42 {'category': 'tech'} + +-- Update all points matching a filter +UPDATE articles SET PAYLOAD WHERE category = 'draft' {'status': 'published'} + +-- Compound filter update +UPDATE articles SET PAYLOAD WHERE year < 2020 AND status = 'draft' {'archived': true} +``` + +**Notes:** +- **Merge semantics:** only the fields in `{…}` are written; all other existing payload fields are preserved. +- If no points match the filter, the operation succeeds silently with no changes. +- The collection must exist; updating a non-existent collection raises an error. +- All `WHERE` filter operators supported by `DELETE` are also supported here (see [WHERE Filters](filters.md)). diff --git a/docs/reference.md b/docs/reference.md index 93c59b7..7cd863e 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -162,7 +162,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked. pytest tests/ -v ``` -Expected output: **405 tests passing**. +Expected output: **485 tests passing**. --- @@ -174,7 +174,7 @@ Expected output: **405 tests passing**. 
| `Connection failed: ...` | Qdrant unreachable at given URL | Check that Qdrant is running and the URL is correct | | `INSERT requires a 'text' field in VALUES` | `text` key missing from the VALUES dict | Add `'text': '...'` to your dict | | `Vector dimension mismatch: collection '...' expects X dims, but model produces Y dims` | Model used in INSERT differs from the one used to create the collection | Use `USING MODEL` to specify the same model as the collection was created with | -| `Collection '...' does not exist` | SEARCH / SCROLL / SELECT / DROP / DELETE on a non-existent collection | Check name spelling or run `SHOW COLLECTIONS` | +| `Collection '...' does not exist` | SEARCH / SCROLL / SELECT / DROP / DELETE / UPDATE on a non-existent collection | Check name spelling or run `SHOW COLLECTIONS` | | `Unexpected token '...'; expected a QQL statement keyword` | Unrecognized statement | Check the query syntax and supported statement list | | `SELECT requires a string or integer point id, got '...'` | `SELECT` used with a non-ID filter value | Use `SELECT * FROM <collection> WHERE id = '<point_id>'` or an integer ID | | `Unterminated string literal (at position N)` | A string is missing its closing quote | Close the string with a matching `'` or `"` | @@ -182,6 +182,12 @@ Expected output: **405 tests passing**. 
| `Expected a filter operator after field '...'` | Unknown operator in WHERE clause | Use one of: `=`, `!=`, `>`, `>=`, `<`, `<=`, `IN`, `NOT IN`, `BETWEEN`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `MATCH` | | `Expected ')' ...` | Unclosed parenthesis in WHERE clause | Add the missing `)` to close the group | | `Qdrant error during SEARCH: ...` | Hybrid search on a non-hybrid collection, or wrong vector names | Ensure the collection was created with `HYBRID` before using `USING HYBRID` in INSERT/SEARCH | +| `Qdrant error during GROUP BY SEARCH: ...` | GROUP BY on an unindexed field, or unsupported field type | Ensure the group-by field is indexed as `keyword` or `integer` via `CREATE INDEX` | +| `GROUP BY and RERANK cannot be combined ...` | Both GROUP BY and RERANK specified in the same SEARCH | Remove one of the two clauses | +| `Expected VECTOR or PAYLOAD after SET, got '...'` | Unknown keyword after SET in UPDATE | Use `UPDATE ... SET VECTOR ...` or `UPDATE ... SET PAYLOAD ...` | +| `Expected a vector list [...] after point ID in UPDATE SET VECTOR` | UPDATE SET VECTOR missing the `[...]` float array | Add the vector array: `UPDATE ... SET VECTOR WHERE id = '...' 
[0.1, 0.2, ...]` | +| `Qdrant error during UPDATE VECTOR: ...` | Point does not exist, or the vector dimensions do not match | Verify the point ID exists and the vector length matches the collection's dimensions | +| `Qdrant error during UPDATE PAYLOAD: ...` | Qdrant rejected the payload update | Check field values and collection state | | `Qdrant error during SCROLL: ...` | Qdrant rejected scroll request | Verify collection state, filter, and cursor (`AFTER`) value | | `Unknown index type '...'` | Invalid schema type in CREATE INDEX | Use one of: `keyword`, `integer`, `float`, `bool`, `text`, `geo`, `datetime` | | `Qdrant error during CREATE INDEX: ...` | Qdrant rejected the index creation | Check field name and collection state | diff --git a/docs/search.md b/docs/search.md index f475aa6..9f195fc 100644 --- a/docs/search.md +++ b/docs/search.md @@ -343,3 +343,68 @@ SEARCH articles SIMILAR TO 'semantic search' LIMIT 5 | Large collections with keyword-heavy queries | `USING HYBRID RERANK` | > **Note on scores:** After reranking, the `score` column shows the cross-encoder's raw logit (can be any real number, unbounded). Do not compare reranked scores to non-reranked cosine similarity scores. + +--- + +## SEARCH … GROUP BY — grouped results + +Returns the top-scoring points **grouped by a payload field value**. Instead of a single flat ranked list, results are organised into groups — each group contains the top-scoring points that share the same value for the specified field. + +Useful for **result diversification**: e.g. "return the 3 best articles from each category", or "show the top 2 papers per author". + +**Syntax:** +``` +SEARCH <collection> SIMILAR TO '<text>' LIMIT <n> GROUP BY <field> +SEARCH <collection> SIMILAR TO '<text>' LIMIT <n> GROUP BY <field> GROUP_SIZE <k> +SEARCH <collection> SIMILAR TO '<text>' LIMIT <n> [WHERE <filter>] GROUP BY <field> [GROUP_SIZE <k>] +SEARCH <collection> SIMILAR TO '<text>' LIMIT <n> USING HYBRID GROUP BY <field> [GROUP_SIZE <k>] +``` + +- **`LIMIT <n>`** — maximum number of **groups** to return. +- **`GROUP_SIZE <k>`** — maximum number of points per group (default: **3**). 
+- **`GROUP BY `** — the payload field whose values define the groups. Dot-notation is supported (e.g. `meta.author`). The field should be indexed as `keyword` or `integer` for best performance. +- `WHERE` filters, `USING HYBRID`, and `USING MODEL` are all compatible with GROUP BY. +- **`GROUP BY` and `RERANK` cannot be combined** in the same statement — this raises a syntax error. + +**Examples:** + +Top 5 categories, up to 3 articles each (default group_size): +```sql +SEARCH articles SIMILAR TO 'machine learning' LIMIT 5 GROUP BY category +``` + +Top 3 authors, up to 2 papers each: +```sql +SEARCH papers SIMILAR TO 'neural networks' LIMIT 3 GROUP BY author GROUP_SIZE 2 +``` + +Grouped search with a payload filter: +```sql +SEARCH articles SIMILAR TO 'deep learning' LIMIT 5 WHERE year >= 2022 GROUP BY category GROUP_SIZE 4 +``` + +Grouped hybrid search: +```sql +SEARCH articles SIMILAR TO 'vector databases' LIMIT 4 USING HYBRID GROUP BY category GROUP_SIZE 3 +``` + +**Output:** + +``` +✓ Found 3 group(s) by 'category' (grouped) +Group: machine-learning + Score │ ID │ Payload +────────┼──────────────────────────────────────┼──────────────────────────────────────── + 0.9312 │ 3f2e1a4b-8c91-4d0e-b123-abc123def456 │ {'text': '...', 'category': 'machine-learning'} + 0.8845 │ 9a1b2c3d-4e5f-6789-abcd-ef0123456789 │ {'text': '...', 'category': 'machine-learning'} + +Group: nlp + Score │ ID │ Payload +────────┼──────────────────────────────────────┼──────────────────────────────────────── + 0.9100 │ 1a2b3c4d-5e6f-7890-bcde-f01234567890 │ {'text': '...', 'category': 'nlp'} +``` + +> **Tip:** For GROUP BY to work efficiently, create a payload index on the grouping field first: +> ```sql +> CREATE INDEX ON COLLECTION articles FOR category TYPE keyword +> ``` diff --git a/src/qql/ast_nodes.py b/src/qql/ast_nodes.py index 99d3bae..cfb817c 100644 --- a/src/qql/ast_nodes.py +++ b/src/qql/ast_nodes.py @@ -213,6 +213,8 @@ class SearchStmt: rerank: bool = False # if True, apply 
cross-encoder reranking post-Qdrant rerank_model: str | None = None # cross-encoder model; None → CrossEncoderEmbedder.DEFAULT_MODEL with_clause: SearchWith | None = None + group_by: str | None = None # GROUP BY field name; None → normal flat search + group_size: int = 3 # max points per group (ignored when group_by is None) @dataclass(frozen=True) @@ -237,6 +239,23 @@ class DeleteStmt: query_filter: FilterExpr | None = None +@dataclass(frozen=True) +class UpdateVectorStmt: + """UPDATE SET VECTOR WHERE id = [vector...]""" + collection: str + point_id: str | int + vector: tuple[float, ...] # dense vector as immutable tuple (frozen=True compatible) + + +@dataclass(frozen=True) +class UpdatePayloadStmt: + """UPDATE SET PAYLOAD WHERE {payload}""" + collection: str + payload: dict[str, Any] + point_id: str | int | None = None # mutually exclusive with query_filter + query_filter: FilterExpr | None = None + + # Union type for all top-level statement nodes ASTNode = ( InsertStmt @@ -251,4 +270,6 @@ class DeleteStmt: | SearchStmt | RecommendStmt | DeleteStmt + | UpdateVectorStmt + | UpdatePayloadStmt ) diff --git a/src/qql/executor.py b/src/qql/executor.py index 6107c62..281b83d 100644 --- a/src/qql/executor.py +++ b/src/qql/executor.py @@ -31,6 +31,7 @@ PayloadField, PayloadSchemaType, PointStruct, + PointVectors, Prefetch, ProductQuantization, ProductQuantizationConfig, @@ -82,6 +83,8 @@ SearchWith, ShowCollectionStmt, ShowCollectionsStmt, + UpdateVectorStmt, + UpdatePayloadStmt, ) from .config import QQLConfig from .embedder import CrossEncoderEmbedder, Embedder, SparseEmbedder @@ -130,6 +133,10 @@ def execute(self, node: ASTNode) -> ExecutionResult: return self._execute_recommend(node) if isinstance(node, DeleteStmt): return self._execute_delete(node) + if isinstance(node, UpdateVectorStmt): + return self._execute_update_vector(node) + if isinstance(node, UpdatePayloadStmt): + return self._execute_update_payload(node) raise QQLRuntimeError(f"Unknown AST node type: 
{type(node)}") # ── Statement executors ─────────────────────────────────────────────── @@ -599,6 +606,10 @@ def _execute_search(self, node: SearchStmt) -> ExecutionResult: # enough material to reorder; only `node.limit` results are returned. fetch_limit = node.limit * _RERANK_FETCH_MULTIPLIER if node.rerank else node.limit + # ── GROUP BY SEARCH: delegate to query_points_groups() ───────────── + if node.group_by is not None: + return self._execute_search_groups(node, qdrant_filter, search_params) + # ── Hybrid SEARCH: prefetch dense+sparse, fuse with the requested strategy ── if node.hybrid: dense_model = node.model or self._config.default_model @@ -932,6 +943,137 @@ def _execute_delete(self, node: DeleteStmt) -> ExecutionResult: message=f"Deleted point '{node.point_id}' from '{node.collection}'", ) + def _execute_search_groups( + self, + node: SearchStmt, + qdrant_filter: Filter | None, + search_params: SearchParams | None, + ) -> ExecutionResult: + """Execute SEARCH ... GROUP BY using query_points_groups().""" + try: + if node.hybrid: + dense_model = node.model or self._config.default_model + sparse_model_name = node.sparse_model or SparseEmbedder.DEFAULT_MODEL + dense_vector = Embedder(dense_model).embed(node.query_text) + sparse_obj = SparseEmbedder(sparse_model_name).query_embed(node.query_text) + sparse_vector = SparseVector( + indices=sparse_obj["indices"], + values=sparse_obj["values"], + ) + response = self._client.query_points_groups( + collection_name=node.collection, + group_by=node.group_by, + prefetch=[ + Prefetch( + query=dense_vector, + using="dense", + limit=node.limit * _HYBRID_PREFETCH_MULTIPLIER, + params=search_params, + ), + Prefetch( + query=sparse_vector, + using="sparse", + limit=node.limit * _HYBRID_PREFETCH_MULTIPLIER, + params=search_params, + ), + ], + query=FusionQuery(fusion=self._resolve_hybrid_fusion(node.fusion)), + limit=node.limit, + group_size=node.group_size, + query_filter=qdrant_filter, + ) + label = "hybrid, grouped" + 
else: + model_name = node.model or self._config.default_model + vector = Embedder(model_name).embed(node.query_text) + query_using = self._get_dense_vector_name(node.collection) + response = self._client.query_points_groups( + collection_name=node.collection, + group_by=node.group_by, + query=vector, + using=query_using, + limit=node.limit, + group_size=node.group_size, + query_filter=qdrant_filter, + search_params=search_params, + ) + label = "grouped" + except UnexpectedResponse as e: + raise QQLRuntimeError(f"Qdrant error during GROUP BY SEARCH: {e}") from e + + groups = [ + { + "group_id": str(g.id), + "hits": [ + {"id": str(h.id), "score": round(h.score, 4), "payload": h.payload} + for h in g.hits + ], + } + for g in response.groups + ] + return ExecutionResult( + success=True, + message=f"Found {len(groups)} group(s) by '{node.group_by}' ({label})", + data=groups, + ) + + def _execute_update_vector(self, node: UpdateVectorStmt) -> ExecutionResult: + """Execute UPDATE ... SET VECTOR using update_vectors().""" + if not self._client.collection_exists(node.collection): + raise QQLRuntimeError(f"Collection '{node.collection}' does not exist") + # Named-vector collections (hybrid) use "dense"; unnamed use plain list. + vector_name = self._get_dense_vector_name(node.collection) + vector_struct: Any = ( + {vector_name: list(node.vector)} if vector_name else list(node.vector) + ) + try: + self._client.update_vectors( + collection_name=node.collection, + points=[PointVectors(id=node.point_id, vector=vector_struct)], + wait=True, + ) + except UnexpectedResponse as e: + raise QQLRuntimeError(f"Qdrant error during UPDATE VECTOR: {e}") from e + return ExecutionResult( + success=True, + message=f"Updated vector for point [{node.point_id}] in '{node.collection}'", + data=[], + ) + + def _execute_update_payload(self, node: UpdatePayloadStmt) -> ExecutionResult: + """Execute UPDATE ... 
SET PAYLOAD using set_payload().""" + if not self._client.collection_exists(node.collection): + raise QQLRuntimeError(f"Collection '{node.collection}' does not exist") + try: + if node.query_filter is not None: + qdrant_filter = self._wrap_as_filter( + self._build_qdrant_filter(node.query_filter) + ) + self._client.set_payload( + collection_name=node.collection, + payload=node.payload, + points=qdrant_filter, + wait=True, + ) + return ExecutionResult( + success=True, + message=f"Payload updated in '{node.collection}' (filter-based)", + data=[], + ) + self._client.set_payload( + collection_name=node.collection, + payload=node.payload, + points=[node.point_id], + wait=True, + ) + except UnexpectedResponse as e: + raise QQLRuntimeError(f"Qdrant error during UPDATE PAYLOAD: {e}") from e + return ExecutionResult( + success=True, + message=f"Payload updated for point [{node.point_id}] in '{node.collection}'", + data=[], + ) + # ── Filter conversion ───────────────────────────────────────────────── def _build_qdrant_filter(self, expr: FilterExpr) -> Any: diff --git a/src/qql/lexer.py b/src/qql/lexer.py index 0b397b0..3e1b277 100644 --- a/src/qql/lexer.py +++ b/src/qql/lexer.py @@ -59,6 +59,12 @@ class TokenKind(Enum): ID = auto() FOR = auto() TYPE = auto() + GROUP = auto() + BY = auto() + GROUP_SIZE = auto() + UPDATE = auto() + SET = auto() + PAYLOAD = auto() # ── Filter keywords ─────────────────────────────────────────────────── AND = auto() OR = auto() @@ -152,6 +158,12 @@ class TokenKind(Enum): "ID": TokenKind.ID, "FOR": TokenKind.FOR, "TYPE": TokenKind.TYPE, + "GROUP": TokenKind.GROUP, + "BY": TokenKind.BY, + "GROUP_SIZE": TokenKind.GROUP_SIZE, + "UPDATE": TokenKind.UPDATE, + "SET": TokenKind.SET, + "PAYLOAD": TokenKind.PAYLOAD, # Filter keywords "AND": TokenKind.AND, "OR": TokenKind.OR, diff --git a/src/qql/parser.py b/src/qql/parser.py index cd52d95..cae29bb 100644 --- a/src/qql/parser.py +++ b/src/qql/parser.py @@ -32,6 +32,8 @@ SearchWith, ShowCollectionStmt, 
ShowCollectionsStmt, + UpdateVectorStmt, + UpdatePayloadStmt, ) from .exceptions import QQLSyntaxError from .lexer import Token, TokenKind @@ -76,6 +78,8 @@ def parse(self) -> ASTNode: node = self._parse_recommend() elif tok.kind == TokenKind.DELETE: node = self._parse_delete() + elif tok.kind == TokenKind.UPDATE: + node = self._parse_update() else: raise QQLSyntaxError( f"Unexpected token '{tok.value}'; expected a QQL statement keyword", @@ -422,6 +426,20 @@ def _parse_search(self) -> SearchStmt: exact=parsed_with.exact or with_clause.exact, acorn=parsed_with.acorn or with_clause.acorn, ) + group_by: str | None = None + group_size: int = 3 + if self._peek().kind == TokenKind.GROUP: + self._advance() # consume GROUP + self._expect(TokenKind.BY) + group_by = self._parse_field_path() + if rerank: + raise QQLSyntaxError( + "GROUP BY and RERANK cannot be combined in the same SEARCH statement", + self._peek().pos, + ) + if self._peek().kind == TokenKind.GROUP_SIZE: + self._advance() # consume GROUP_SIZE + group_size = int(self._expect(TokenKind.INTEGER).value) return SearchStmt( collection=collection, query_text=query_text, @@ -435,6 +453,8 @@ def _parse_search(self) -> SearchStmt: rerank=rerank, rerank_model=rerank_model, with_clause=with_clause, + group_by=group_by, + group_size=group_size, ) def _parse_recommend(self) -> RecommendStmt: @@ -524,6 +544,56 @@ def _parse_delete(self) -> DeleteStmt: query_filter = self._parse_filter_expr() return DeleteStmt(collection=collection, query_filter=query_filter) + def _parse_update(self) -> UpdateVectorStmt | UpdatePayloadStmt: + """ + UPDATE SET VECTOR WHERE id = [] + UPDATE SET PAYLOAD WHERE id = {} + UPDATE SET PAYLOAD WHERE {} + """ + self._expect(TokenKind.UPDATE) + collection = self._parse_identifier() + self._expect(TokenKind.SET) + + if self._peek().kind == TokenKind.VECTOR: + self._advance() # consume VECTOR + self._expect(TokenKind.WHERE) + self._expect(TokenKind.ID) + self._expect(TokenKind.EQUALS) + point_id = 
self._parse_point_id_value("UPDATE SET VECTOR") + vector_val = self._parse_value() # parses [...] list + if not isinstance(vector_val, list): + raise QQLSyntaxError( + "Expected a vector list [...] after point ID in UPDATE SET VECTOR", + self._peek().pos, + ) + return UpdateVectorStmt( + collection=collection, + point_id=point_id, + vector=tuple(float(v) for v in vector_val), + ) + + if self._peek().kind == TokenKind.PAYLOAD: + self._advance() # consume PAYLOAD + self._expect(TokenKind.WHERE) + if self._peek().kind == TokenKind.ID: + self._advance() # consume ID + self._expect(TokenKind.EQUALS) + point_id = self._parse_point_id_value("UPDATE SET PAYLOAD") + payload = self._parse_dict() + return UpdatePayloadStmt( + collection=collection, point_id=point_id, payload=payload + ) + query_filter = self._parse_filter_expr() + payload = self._parse_dict() + return UpdatePayloadStmt( + collection=collection, query_filter=query_filter, payload=payload + ) + + tok = self._peek() + raise QQLSyntaxError( + f"Expected VECTOR or PAYLOAD after SET, got '{tok.value}'", tok.pos + ) + # ── WHERE clause filter parsing (precedence: NOT > AND > OR) ───────── def _parse_filter_expr(self) -> FilterExpr: diff --git a/tests/test_executor.py b/tests/test_executor.py index 39de579..1698aa4 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -2082,3 +2082,220 @@ def test_turbo_invalid_bits_at_executor_raises(self, executor, mock_client): qc = QuantizationConfig(type=QuantizationType.TURBO, turbo_bits=3.0) with pytest.raises(QQLErr, match="Unsupported TURBO bit depth"): executor._build_quantization_config(qc) + + +# ── New feature executor tests ──────────────────────────────────────────────── + +class TestSearchGroupBy: + def test_group_by_calls_query_points_groups(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_group = mocker.MagicMock() + mock_group.id = "tech" + mock_hit = mocker.MagicMock() + mock_hit.id = "abc-123" + 
mock_hit.score = 0.95 + mock_hit.payload = {"text": "hello"} + mock_group.hits = [mock_hit] + mock_response = mocker.MagicMock() + mock_response.groups = [mock_group] + mock_client.query_points_groups.return_value = mock_response + + node = SearchStmt( + collection="articles", query_text="ai", limit=5, model=None, + group_by="category", group_size=3, + ) + result = executor.execute(node) + mock_client.query_points_groups.assert_called_once() + assert result.success is True + + def test_group_by_result_structure(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_group = mocker.MagicMock() + mock_group.id = "science" + mock_hit = mocker.MagicMock() + mock_hit.id = "xyz-456" + mock_hit.score = 0.88 + mock_hit.payload = {"text": "deep learning"} + mock_group.hits = [mock_hit] + mock_response = mocker.MagicMock() + mock_response.groups = [mock_group] + mock_client.query_points_groups.return_value = mock_response + + node = SearchStmt( + collection="articles", query_text="ml", limit=3, model=None, + group_by="field", group_size=2, + ) + result = executor.execute(node) + assert len(result.data) == 1 + assert result.data[0]["group_id"] == "science" + assert len(result.data[0]["hits"]) == 1 + assert result.data[0]["hits"][0]["score"] == 0.88 + + def test_group_by_message_contains_field_name(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + group_by="category", + ) + result = executor.execute(node) + assert "category" in result.message + + def test_group_by_nonexistent_collection_raises(self, executor, mock_client): + mock_client.collection_exists.return_value = False + node = SearchStmt( + collection="ghost", query_text="q", limit=5, model=None, + group_by="field", + ) + with 
pytest.raises(QQLRuntimeError, match="does not exist"): + executor.execute(node) + + def test_group_by_passes_group_size_to_qdrant(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + group_by="tag", group_size=7, + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + assert kwargs["group_size"] == 7 + assert kwargs["group_by"] == "tag" + + def test_group_by_hybrid_uses_query_points_groups(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + mock_sparse_embedder = mocker.MagicMock() + mock_sparse_embedder.query_embed.return_value = {"indices": [0, 1], "values": [0.5, 0.5]} + mocker.patch("qql.executor.SparseEmbedder", return_value=mock_sparse_embedder) + + node = SearchStmt( + collection="articles", query_text="q", limit=3, model=None, + hybrid=True, group_by="category", group_size=2, + ) + result = executor.execute(node) + mock_client.query_points_groups.assert_called_once() + kwargs = mock_client.query_points_groups.call_args.kwargs + assert kwargs["group_by"] == "category" + assert "prefetch" in kwargs + + +class TestUpdateVector: + def test_update_vector_calls_update_vectors(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + node = UpdateVectorStmt( + collection="articles", point_id="abc-123", vector=(0.1, 0.2, 0.3) + ) + result = executor.execute(node) + mock_client.update_vectors.assert_called_once() + assert result.success is True + + def test_update_vector_passes_correct_point_id(self, executor, mock_client): + from qql.ast_nodes 
import UpdateVectorStmt + from qdrant_client.models import PointVectors + mock_client.collection_exists.return_value = True + mock_client.get_collection.return_value.config.params.vectors = {} # non-dict → unnamed + node = UpdateVectorStmt( + collection="notes", point_id=42, vector=(0.5, 0.6) + ) + executor.execute(node) + call_kwargs = mock_client.update_vectors.call_args.kwargs + points = call_kwargs["points"] + assert len(points) == 1 + assert points[0].id == 42 + + def test_update_vector_nonexistent_collection_raises(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = False + node = UpdateVectorStmt(collection="ghost", point_id=1, vector=(0.1,)) + with pytest.raises(QQLRuntimeError, match="does not exist"): + executor.execute(node) + + def test_update_vector_result_success_message(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + node = UpdateVectorStmt(collection="articles", point_id="id-1", vector=(0.1, 0.2)) + result = executor.execute(node) + assert result.success is True + assert "id-1" in result.message + + def test_update_vector_passes_wait_true(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + node = UpdateVectorStmt(collection="articles", point_id=1, vector=(0.1,)) + executor.execute(node) + kwargs = mock_client.update_vectors.call_args.kwargs + assert kwargs.get("wait") is True + + +class TestUpdatePayload: + def test_update_payload_by_id_calls_set_payload(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", point_id="abc-123", payload={"year": 2025} + ) + result = executor.execute(node) + mock_client.set_payload.assert_called_once() + assert result.success is True + + def 
test_update_payload_by_filter_calls_set_payload_with_filter( + self, executor, mock_client + ): + from qql.ast_nodes import UpdatePayloadStmt, CompareExpr + from qdrant_client.models import Filter + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", + payload={"status": "published"}, + query_filter=CompareExpr(field="category", op="=", value="draft"), + ) + result = executor.execute(node) + mock_client.set_payload.assert_called_once() + kwargs = mock_client.set_payload.call_args.kwargs + assert isinstance(kwargs["points"], Filter) + assert result.success is True + + def test_update_payload_nonexistent_collection_raises(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = False + node = UpdatePayloadStmt(collection="ghost", point_id=1, payload={"x": 1}) + with pytest.raises(QQLRuntimeError, match="does not exist"): + executor.execute(node) + + def test_update_payload_result_success_message(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", point_id="id-99", payload={"tag": "ai"} + ) + result = executor.execute(node) + assert result.success is True + assert "id-99" in result.message + + def test_update_payload_passes_correct_payload(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = True + payload = {"title": "New", "score": 0.9} + node = UpdatePayloadStmt(collection="articles", point_id=1, payload=payload) + executor.execute(node) + kwargs = mock_client.set_payload.call_args.kwargs + assert kwargs["payload"] == payload + + def test_update_payload_passes_wait_true(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt(collection="articles", 
point_id=1, payload={"x": 1}) + executor.execute(node) + kwargs = mock_client.set_payload.call_args.kwargs + assert kwargs.get("wait") is True diff --git a/tests/test_lexer.py b/tests/test_lexer.py index 1ac5e29..a9bfd57 100644 --- a/tests/test_lexer.py +++ b/tests/test_lexer.py @@ -289,3 +289,52 @@ def test_with_keyword(self): def test_acorn_keyword(self): ks = kinds("ACORN") assert ks[0] == TokenKind.ACORN + + +class TestUpdateGroupByKeywords: + def test_group_token(self): + ks = kinds("GROUP") + assert ks[0] == TokenKind.GROUP + + def test_by_token(self): + ks = kinds("BY") + assert ks[0] == TokenKind.BY + + def test_group_size_token(self): + ks = kinds("GROUP_SIZE") + assert ks[0] == TokenKind.GROUP_SIZE + + def test_update_token(self): + ks = kinds("UPDATE") + assert ks[0] == TokenKind.UPDATE + + def test_set_token(self): + ks = kinds("SET") + assert ks[0] == TokenKind.SET + + def test_payload_token(self): + ks = kinds("PAYLOAD") + assert ks[0] == TokenKind.PAYLOAD + + def test_group_by_sequence(self): + ks = kinds("GROUP BY category") + assert ks[0] == TokenKind.GROUP + assert ks[1] == TokenKind.BY + assert ks[2] == TokenKind.IDENTIFIER + + def test_group_size_followed_by_integer(self): + ks = kinds("GROUP_SIZE 5") + assert ks[0] == TokenKind.GROUP_SIZE + assert ks[1] == TokenKind.INTEGER + + def test_update_set_payload_sequence(self): + ks = kinds("UPDATE SET PAYLOAD") + assert ks[0] == TokenKind.UPDATE + assert ks[1] == TokenKind.SET + assert ks[2] == TokenKind.PAYLOAD + + def test_update_set_vector_sequence(self): + ks = kinds("UPDATE SET VECTOR") + assert ks[0] == TokenKind.UPDATE + assert ks[1] == TokenKind.SET + assert ks[2] == TokenKind.VECTOR diff --git a/tests/test_parser.py b/tests/test_parser.py index a840438..f7e36cf 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1189,3 +1189,182 @@ def test_turbo_invalid_bits_raises(self): def test_turbo_invalid_bits_float_raises(self): with pytest.raises(QQLSyntaxError): parse("CREATE 
COLLECTION articles QUANTIZE TURBO BITS 0.5") + + +# ── New feature tests ───────────────────────────────────────────────────────── + +class TestSearchGroupBy: + def test_group_by_basic(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY category") + assert isinstance(node, SearchStmt) + assert node.group_by == "category" + assert node.group_size == 3 # default + + def test_group_by_with_group_size(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY category GROUP_SIZE 5") + assert node.group_by == "category" + assert node.group_size == 5 + + def test_group_by_with_where(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 WHERE year >= 2020 GROUP BY category") + assert node.group_by == "category" + assert node.query_filter is not None + + def test_group_by_with_where_and_group_size(self): + node = parse( + "SEARCH articles SIMILAR TO 'query' LIMIT 5 WHERE year >= 2020 " + "GROUP BY category GROUP_SIZE 2" + ) + assert node.group_by == "category" + assert node.group_size == 2 + assert node.query_filter is not None + + def test_group_by_with_hybrid(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 USING HYBRID GROUP BY category") + assert node.hybrid is True + assert node.group_by == "category" + + def test_group_by_dotted_field(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY meta.author") + assert node.group_by == "meta.author" + + def test_group_by_rerank_raises(self): + with pytest.raises(QQLSyntaxError): + parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 RERANK GROUP BY category") + + def test_plain_search_has_no_group_by(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 10") + assert node.group_by is None + + def test_group_size_default_is_3(self): + node = parse("SEARCH articles SIMILAR TO 'query' LIMIT 5 GROUP BY tag") + assert node.group_size == 3 + + def test_group_by_with_model(self): + node = parse( + "SEARCH articles SIMILAR TO 'query' 
LIMIT 5 " + "USING MODEL 'BAAI/bge-base-en-v1.5' GROUP BY category" + ) + assert node.model == "BAAI/bge-base-en-v1.5" + assert node.group_by == "category" + + def test_group_by_collection_stored(self): + node = parse("SEARCH notes SIMILAR TO 'query' LIMIT 3 GROUP BY topic GROUP_SIZE 4") + assert node.collection == "notes" + assert node.limit == 3 + assert node.group_by == "topic" + assert node.group_size == 4 + + +class TestUpdateVector: + def test_update_vector_by_string_id(self): + from qql.ast_nodes import UpdateVectorStmt + node = parse("UPDATE articles SET VECTOR WHERE id = 'abc-123' [0.1, 0.2, 0.3]") + assert isinstance(node, UpdateVectorStmt) + assert node.collection == "articles" + assert node.point_id == "abc-123" + assert node.vector == (0.1, 0.2, 0.3) + + def test_update_vector_by_integer_id(self): + from qql.ast_nodes import UpdateVectorStmt + node = parse("UPDATE articles SET VECTOR WHERE id = 42 [0.1, 0.2]") + assert isinstance(node, UpdateVectorStmt) + assert node.point_id == 42 + + def test_update_vector_parses_float_list(self): + from qql.ast_nodes import UpdateVectorStmt + node = parse("UPDATE notes SET VECTOR WHERE id = 1 [0.1, 0.2, 0.3, 0.4]") + assert isinstance(node, UpdateVectorStmt) + assert len(node.vector) == 4 + assert all(isinstance(v, float) for v in node.vector) + + def test_update_vector_collection_stored(self): + from qql.ast_nodes import UpdateVectorStmt + node = parse("UPDATE my_col SET VECTOR WHERE id = 99 [0.5]") + assert node.collection == "my_col" + + def test_update_vector_wrong_keyword_raises(self): + with pytest.raises(QQLSyntaxError): + parse("UPDATE articles SET FOOBAR WHERE id = 1 [0.1]") + + def test_update_vector_missing_brackets_raises(self): + with pytest.raises(QQLSyntaxError): + parse("UPDATE articles SET VECTOR WHERE id = 1 0.1 0.2") + + def test_update_vector_missing_id_eq_raises(self): + with pytest.raises(QQLSyntaxError): + parse("UPDATE articles SET VECTOR WHERE 'abc' [0.1]") + + def 
test_update_vector_large_vector(self): + from qql.ast_nodes import UpdateVectorStmt + vec = ", ".join(["0.1"] * 384) + node = parse(f"UPDATE articles SET VECTOR WHERE id = 1 [{vec}]") + assert isinstance(node, UpdateVectorStmt) + assert len(node.vector) == 384 + + +class TestUpdatePayload: + def test_update_payload_by_string_id(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse("UPDATE articles SET PAYLOAD WHERE id = 'abc-123' {'year': 2025}") + assert isinstance(node, UpdatePayloadStmt) + assert node.collection == "articles" + assert node.point_id == "abc-123" + assert node.payload == {"year": 2025} + assert node.query_filter is None + + def test_update_payload_by_integer_id(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse("UPDATE articles SET PAYLOAD WHERE id = 42 {'status': 'active'}") + assert isinstance(node, UpdatePayloadStmt) + assert node.point_id == 42 + assert node.payload == {"status": "active"} + + def test_update_payload_by_filter(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse( + "UPDATE articles SET PAYLOAD WHERE category = 'draft' {'status': 'published'}" + ) + assert isinstance(node, UpdatePayloadStmt) + assert node.point_id is None + assert node.query_filter is not None + assert node.payload == {"status": "published"} + + def test_update_payload_compound_filter(self): + from qql.ast_nodes import UpdatePayloadStmt, AndExpr + node = parse( + "UPDATE articles SET PAYLOAD WHERE year < 2020 AND status = 'draft' " + "{'archived': true}" + ) + assert isinstance(node, UpdatePayloadStmt) + assert isinstance(node.query_filter, AndExpr) + assert node.payload == {"archived": True} + + def test_update_payload_dict_values_preserved(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse( + "UPDATE articles SET PAYLOAD WHERE id = 1 " + "{'title': 'New Title', 'year': 2025, 'score': 0.99}" + ) + assert isinstance(node, UpdatePayloadStmt) + assert node.payload["title"] == "New Title" + assert 
node.payload["year"] == 2025 + assert node.payload["score"] == pytest.approx(0.99) + + def test_update_payload_collection_stored(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse("UPDATE my_notes SET PAYLOAD WHERE id = 7 {'tag': 'ai'}") + assert node.collection == "my_notes" + + def test_update_payload_missing_dict_raises(self): + with pytest.raises(QQLSyntaxError): + parse("UPDATE articles SET PAYLOAD WHERE id = 1") + + def test_update_payload_dotted_filter_field(self): + from qql.ast_nodes import UpdatePayloadStmt + node = parse( + "UPDATE articles SET PAYLOAD WHERE meta.author = 'alice' {'reviewed': true}" + ) + assert isinstance(node, UpdatePayloadStmt) + assert node.query_filter is not None + assert node.payload == {"reviewed": True} From 301bad55d7cc20bca05ae41e1be7fb34eaa807b5 Mon Sep 17 00:00:00 2001 From: "manthapavankumar11@gmail.com" Date: Fri, 15 May 2026 15:52:51 +0530 Subject: [PATCH 2/4] Fixed all issues --- README.md | 4 +- docs/reference.md | 4 +- docs/search.md | 2 +- src/qql/cli.py | 38 +++++++++ src/qql/executor.py | 17 +++++ src/qql/parser.py | 15 +++- tests/test_executor.py | 170 +++++++++++++++++++++++++++++++++++++++++ tests/test_parser.py | 35 +++++++++ 8 files changed, 280 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ff85b01..77de62e 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![PyPI version](https://img.shields.io/pypi/v/qql-cli?color=blue&label=PyPI)](https://pypi.org/project/qql-cli/) [![Python 3.12+](https://img.shields.io/pypi/pyversions/qql-cli)](https://pypi.org/project/qql-cli/) [![MIT License](https://img.shields.io/badge/license-MIT-green)](LICENSE) -[![Tests](https://img.shields.io/badge/tests-485%20passing-brightgreen)](tests/) +[![Tests](https://img.shields.io/badge/tests-500%20passing-brightgreen)](tests/) Write `INSERT`, `SELECT`, `SEARCH`, `SCROLL`, `RECOMMEND`, `UPDATE`, `DELETE`, and `CREATE COLLECTION` statements instead of Python SDK calls. 
Supports hybrid dense+sparse vector search, grouped search (GROUP BY), cross-encoder reranking, quantization (scalar, turbo, binary, product), SQL-style `WHERE` filters, script execution, and collection dump/restore. @@ -158,7 +158,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked. pytest tests/ -v ``` -Expected: **485 tests passing**. +Expected: **500 tests passing**. --- diff --git a/docs/reference.md b/docs/reference.md index 7cd863e..7ef56b2 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -162,7 +162,7 @@ Tests do not require a running Qdrant instance — the Qdrant client is mocked. pytest tests/ -v ``` -Expected output: **485 tests passing**. +Expected output: **500 tests passing**. --- @@ -188,6 +188,8 @@ Expected output: **485 tests passing**. | `Expected a vector list [...] after point ID in UPDATE SET VECTOR` | UPDATE SET VECTOR missing the `[...]` float array | Add the vector array: `UPDATE ... SET VECTOR WHERE id = '...' [0.1, 0.2, ...]` | | `Qdrant error during UPDATE VECTOR: ...` | Point does not exist, or vector dimensions mismatch | Verify the point ID exists and the vector length matches the collection's dimensions | | `Qdrant error during UPDATE PAYLOAD: ...` | Qdrant rejected the payload update | Check field values and collection state | +| `Vector elements must be numeric; got invalid value: ...` | A non-numeric value (string, boolean, or null) was present in the vector array for `UPDATE SET VECTOR` | Ensure all vector elements are floats: `UPDATE … [0.1, 0.2, …, 0.N]` | +| `GROUP_SIZE must be a positive integer, got N` | `GROUP_SIZE 0` or a negative value was specified | Use a positive integer: `GROUP_SIZE 3` | | `Qdrant error during SCROLL: ...` | Qdrant rejected scroll request | Verify collection state, filter, and cursor (`AFTER`) value | | `Unknown index type '...'` | Invalid schema type in CREATE INDEX | Use one of: `keyword`, `integer`, `float`, `bool`, `text`, `geo`, `datetime` | | `Qdrant error 
during CREATE INDEX: ...` | Qdrant rejected the index creation | Check field name and collection state | diff --git a/docs/search.md b/docs/search.md index 9f195fc..f973cd3 100644 --- a/docs/search.md +++ b/docs/search.md @@ -362,7 +362,7 @@ SEARCH SIMILAR TO '' LIMIT USING HYBRID GROUP BY - **`LIMIT `** — maximum number of **groups** to return. - **`GROUP_SIZE `** — maximum number of points per group (default: **3**). -- **`GROUP BY `** — the payload field whose values define the groups. Dot-notation is supported (e.g. `meta.author`). The field should be indexed as `keyword` or `integer` for best performance. +- **`GROUP BY `** — the payload field whose values define the groups. **Must be a string (keyword) or number (integer) field** — this is enforced by Qdrant. Dot-notation is supported (e.g. `meta.author`). Array-valued fields are allowed: a point with multiple values for the field can appear in multiple groups. The field should be indexed as `keyword` or `integer` for best performance (see [CREATE INDEX](collections.md)). - `WHERE` filters, `USING HYBRID`, and `USING MODEL` are all compatible with GROUP BY. - **`GROUP BY` and `RERANK` cannot be combined** in the same statement — this raises a syntax error. diff --git a/src/qql/cli.py b/src/qql/cli.py index 560a4c3..e88e81a 100644 --- a/src/qql/cli.py +++ b/src/qql/cli.py @@ -71,6 +71,9 @@ Optional: [yellow]RERANK[/yellow] [MODEL ''] rerank results with a cross-encoder Optional: [yellow]EXACT[/yellow] bypass HNSW and perform exact search Optional: [yellow]WITH[/yellow] { hnsw_ef: , exact: , acorn: } search parameters + Optional: [yellow]GROUP BY[/yellow] [[yellow]GROUP_SIZE[/yellow] ] + Group results by a payload field value (default GROUP_SIZE: 3). + Field must be keyword or integer type. RERANK and GROUP BY cannot be combined. [yellow]RECOMMEND FROM[/yellow] [yellow]POSITIVE IDS[/yellow] (, ...) Find points similar to known examples. 
@@ -82,6 +85,15 @@ [yellow]DELETE FROM[/yellow] [yellow]WHERE id =[/yellow] '' Delete a point by its ID. + [yellow]UPDATE[/yellow] [yellow]SET VECTOR WHERE id =[/yellow] ''| [] + Replace the dense vector for a single point by ID. + The point must already exist. Vector is a float array: [0.1, 0.2, ..., 0.N] + + [yellow]UPDATE[/yellow] [yellow]SET PAYLOAD WHERE id =[/yellow] ''| {} + [yellow]UPDATE[/yellow] [yellow]SET PAYLOAD WHERE[/yellow] {} + Merge new key/value pairs into a point's payload (additive; existing fields preserved). + Supports all WHERE filter operators. Filter-based updates affect all matching points. + Script files (in-shell): [yellow]EXECUTE[/yellow] or [yellow]\\e[/yellow] Run a .qql script file. Statements are executed in order. @@ -458,6 +470,32 @@ def _run_and_print(executor: Executor, query: str) -> None: console.print(_format_collection_diagnostics(result.data)) return + # Pretty-print grouped search results (GROUP BY) + if ( + isinstance(result.data, list) + and result.data + and isinstance(result.data[0], dict) + and "group_id" in result.data[0] + ): + for group in result.data: + console.print(f"\n[bold cyan]Group: {group['group_id']}[/bold cyan]") + hits = group.get("hits", []) + if hits: + tbl = Table(show_header=True, header_style="bold") + tbl.add_column("Score", style="green", no_wrap=True, justify="right") + tbl.add_column("ID") + tbl.add_column("Payload") + for hit in hits: + tbl.add_row( + str(hit["score"]), + str(hit["id"]), + str(hit.get("payload", {})), + ) + console.print(tbl) + else: + console.print(" (no hits)") + return + # Pretty-print search results if isinstance(result.data, list) and result.data and isinstance(result.data[0], dict) and "score" in result.data[0]: table = Table(show_header=True, header_style="bold cyan") diff --git a/src/qql/executor.py b/src/qql/executor.py index 281b83d..7a6cf9b 100644 --- a/src/qql/executor.py +++ b/src/qql/executor.py @@ -983,6 +983,23 @@ def _execute_search_groups( 
query_filter=qdrant_filter, ) label = "hybrid, grouped" + elif node.sparse_only: + sparse_model_name = node.sparse_model or SparseEmbedder.DEFAULT_MODEL + sparse_obj = SparseEmbedder(sparse_model_name).query_embed(node.query_text) + sparse_vector = SparseVector( + indices=sparse_obj["indices"], + values=sparse_obj["values"], + ) + response = self._client.query_points_groups( + collection_name=node.collection, + group_by=node.group_by, + query=sparse_vector, + using="sparse", + limit=node.limit, + group_size=node.group_size, + query_filter=qdrant_filter, + ) + label = "sparse, grouped" else: model_name = node.model or self._config.default_model vector = Embedder(model_name).embed(node.query_text) diff --git a/src/qql/parser.py b/src/qql/parser.py index cae29bb..4f84730 100644 --- a/src/qql/parser.py +++ b/src/qql/parser.py @@ -439,7 +439,13 @@ def _parse_search(self) -> SearchStmt: ) if self._peek().kind == TokenKind.GROUP_SIZE: self._advance() # consume GROUP_SIZE + gs_tok = self._peek() group_size = int(self._expect(TokenKind.INTEGER).value) + if group_size <= 0: + raise QQLSyntaxError( + f"GROUP_SIZE must be a positive integer, got {group_size}", + gs_tok.pos, + ) return SearchStmt( collection=collection, query_text=query_text, @@ -566,10 +572,17 @@ def _parse_update(self) -> UpdateVectorStmt | UpdatePayloadStmt: "Expected a vector list [...] 
after point ID in UPDATE SET VECTOR", self._peek().pos, ) + try: + coerced = tuple(float(v) for v in vector_val) + except (ValueError, TypeError) as exc: + raise QQLSyntaxError( + f"Vector elements must be numeric; got invalid value: {exc}", + self._peek().pos, + ) from exc return UpdateVectorStmt( collection=collection, point_id=point_id, - vector=tuple(float(v) for v in vector_val), + vector=coerced, ) if self._peek().kind == TokenKind.PAYLOAD: diff --git a/tests/test_executor.py b/tests/test_executor.py index 1698aa4..9af6941 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -2299,3 +2299,173 @@ def test_update_payload_passes_wait_true(self, executor, mock_client): executor.execute(node) kwargs = mock_client.set_payload.call_args.kwargs assert kwargs.get("wait") is True + + +# ── PR #28 review gap fixes ─────────────────────────────────────────────────── + +class TestSearchGroupBySparse: + """Gap 1 & 6 — sparse-only grouped search must use the sparse path.""" + + def test_sparse_only_grouped_calls_query_points_groups(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + mock_sparse = mocker.MagicMock() + mock_sparse.query_embed.return_value = {"indices": [0, 1], "values": [0.5, 0.5]} + mocker.patch("qql.executor.SparseEmbedder", return_value=mock_sparse) + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + sparse_only=True, group_by="category", group_size=3, + ) + executor.execute(node) + mock_client.query_points_groups.assert_called_once() + kwargs = mock_client.query_points_groups.call_args.kwargs + assert kwargs.get("using") == "sparse" + # Must NOT have called dense Embedder + from qql.embedder import Embedder as _Embedder # noqa: F401 + # mock_embedder fixture patches Embedder; query_points not called confirms no dense path + 
mock_client.query_points.assert_not_called() + + def test_sparse_only_grouped_label_in_message(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + mock_sparse = mocker.MagicMock() + mock_sparse.query_embed.return_value = {"indices": [0], "values": [1.0]} + mocker.patch("qql.executor.SparseEmbedder", return_value=mock_sparse) + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + sparse_only=True, group_by="tag", group_size=2, + ) + result = executor.execute(node) + assert "sparse" in result.message + assert "grouped" in result.message + + +class TestSearchGroupByAdvanced: + """Gaps 7 & 8 — fusion and search params forwarding in grouped search.""" + + def test_grouped_hybrid_fusion_dbsf(self, executor, mock_client, mocker): + from qdrant_client.models import Fusion + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + mock_sparse = mocker.MagicMock() + mock_sparse.query_embed.return_value = {"indices": [0], "values": [1.0]} + mocker.patch("qql.executor.SparseEmbedder", return_value=mock_sparse) + + node = SearchStmt( + collection="articles", query_text="q", limit=3, model=None, + hybrid=True, fusion="dbsf", group_by="category", group_size=2, + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + fusion_query = kwargs.get("query") + assert fusion_query is not None + assert fusion_query.fusion == Fusion.DBSF + + def test_grouped_search_params_with_clause_forwarded(self, executor, mock_client, mocker): + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + node = SearchStmt( + 
collection="articles", query_text="q", limit=5, model=None, + with_clause=SearchWith(exact=True), group_by="category", + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + assert kwargs.get("search_params") is not None + + +class TestUpdateVectorVectorShape: + """Gaps 12 & 13 — verify exact vector shape sent to Qdrant for named/unnamed collections.""" + + def test_update_vector_unnamed_collection_sends_plain_list(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + # Unnamed collection: get_collection returns non-dict vectors + mock_vectors = mocker.MagicMock() if False else type("V", (), {})() + info = mock_client.get_collection.return_value + info.config.params.vectors = [None] # list → not a dict → unnamed + + node = UpdateVectorStmt(collection="articles", point_id=1, vector=(0.1, 0.2, 0.3)) + executor.execute(node) + kwargs = mock_client.update_vectors.call_args.kwargs + pv = kwargs["points"][0] + assert isinstance(pv.vector, list) + assert pv.vector == [0.1, 0.2, 0.3] + + def test_update_vector_named_collection_sends_dict(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + # Named collection: get_collection returns dict vectors + info = mock_client.get_collection.return_value + info.config.params.vectors = {"dense": object(), "sparse": object()} # dict → named + + node = UpdateVectorStmt(collection="articles", point_id="id-1", vector=(0.5, 0.6)) + executor.execute(node) + kwargs = mock_client.update_vectors.call_args.kwargs + pv = kwargs["points"][0] + assert isinstance(pv.vector, dict) + assert "dense" in pv.vector + assert pv.vector["dense"] == [0.5, 0.6] + + def test_update_vector_exact_values_preserved(self, executor, mock_client): + from qql.ast_nodes import UpdateVectorStmt + mock_client.collection_exists.return_value = True + info = 
mock_client.get_collection.return_value + info.config.params.vectors = [None] # unnamed + + vec = (0.11, 0.22, 0.33, 0.44) + node = UpdateVectorStmt(collection="articles", point_id=99, vector=vec) + executor.execute(node) + kwargs = mock_client.update_vectors.call_args.kwargs + assert kwargs["points"][0].vector == list(vec) + + +class TestUpdatePayloadMessages: + """Gaps 17 — assert specific message text for both update-payload branches.""" + + def test_filter_based_update_message_contains_filter_based(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt, CompareExpr + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", + payload={"status": "done"}, + query_filter=CompareExpr(field="year", op="<", value=2020), + ) + result = executor.execute(node) + assert "filter-based" in result.message + + def test_id_based_update_message_contains_point_id(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", point_id="abc-999", payload={"tag": "ai"} + ) + result = executor.execute(node) + assert "abc-999" in result.message + + def test_filter_based_set_payload_receives_filter_object(self, executor, mock_client): + from qql.ast_nodes import UpdatePayloadStmt, CompareExpr + from qdrant_client.models import Filter + mock_client.collection_exists.return_value = True + node = UpdatePayloadStmt( + collection="articles", + payload={"x": 1}, + query_filter=CompareExpr(field="cat", op="=", value="tech"), + ) + executor.execute(node) + kwargs = mock_client.set_payload.call_args.kwargs + # SDK verified: PointsSelector accepts rest.Filter — must receive Filter, not a list + assert isinstance(kwargs["points"], Filter) diff --git a/tests/test_parser.py b/tests/test_parser.py index f7e36cf..d3d33e8 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1368,3 +1368,38 @@ def 
test_update_payload_dotted_filter_field(self): assert isinstance(node, UpdatePayloadStmt) assert node.query_filter is not None assert node.payload == {"reviewed": True} + + +# ── PR #28 review gap fixes ─────────────────────────────────────────────────── + +class TestSearchGroupByValidation: + """Parser-level validation added for PR #28 gaps 2 and 2.""" + + def test_group_size_zero_raises(self): + with pytest.raises(QQLSyntaxError, match="GROUP_SIZE must be a positive integer"): + parse("SEARCH articles SIMILAR TO 'q' LIMIT 5 GROUP BY category GROUP_SIZE 0") + + def test_group_size_negative_raises(self): + with pytest.raises(QQLSyntaxError, match="GROUP_SIZE must be a positive integer"): + parse("SEARCH articles SIMILAR TO 'q' LIMIT 5 GROUP BY category GROUP_SIZE -1") + + +class TestUpdateVectorValidation: + """PR #28 gap 11 — non-numeric vector elements should raise QQLSyntaxError.""" + + def test_non_numeric_string_element_raises(self): + with pytest.raises(QQLSyntaxError, match="Vector elements must be numeric"): + parse("UPDATE articles SET VECTOR WHERE id = 1 ['abc', 0.2, 0.3]") + + def test_none_element_raises(self): + # null parsed as Python None → TypeError → QQLSyntaxError + with pytest.raises(QQLSyntaxError, match="Vector elements must be numeric"): + parse("UPDATE articles SET VECTOR WHERE id = 1 [null, 0.2]") + + +class TestUpdateSetInvalidTargetMessage: + """PR #28 gap 16 — explicit error message for bad SET target.""" + + def test_invalid_set_target_message(self): + with pytest.raises(QQLSyntaxError, match="Expected VECTOR or PAYLOAD after SET"): + parse("UPDATE articles SET FOOBAR WHERE id = 1 [0.1]") From 2557814f588acd3160547355914c72ee2ab2ab20 Mon Sep 17 00:00:00 2001 From: "manthapavankumar11@gmail.com" Date: Fri, 15 May 2026 15:53:57 +0530 Subject: [PATCH 3/4] Fixed version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 10bdabb..ef13698 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "qql-cli" -version = "2.2.0" +version = "2.3.0" description = "QQL is a SQL-like query language and CLI for Qdrant vector database. Write INSERT, SEARCH, RECOMMEND, DELETE, and CREATE COLLECTION statements instead of Python SDK calls. Supports hybrid dense+sparse vector search, cross-encoder reranking, quantization (scalar, turbo, binary, product), WHERE clause filters, script execution, and collection dump/restore." readme = "README.md" license = { file = "LICENSE" } From 967ab619850b951cdff87ad31897522acce9a892 Mon Sep 17 00:00:00 2001 From: "manthapavankumar11@gmail.com" Date: Fri, 15 May 2026 16:09:52 +0530 Subject: [PATCH 4/4] Fixed additional issues --- docs/reference.md | 3 +- src/qql/executor.py | 37 +++++--- src/qql/parser.py | 7 ++ tests/test_executor.py | 209 +++++++++++++++++++++++++++++++++++++++++ tests/test_parser.py | 10 ++ 5 files changed, 252 insertions(+), 14 deletions(-) diff --git a/docs/reference.md b/docs/reference.md index 7ef56b2..135f7fa 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -188,7 +188,8 @@ Expected output: **500 tests passing**. | `Expected a vector list [...] after point ID in UPDATE SET VECTOR` | UPDATE SET VECTOR missing the `[...]` float array | Add the vector array: `UPDATE ... SET VECTOR WHERE id = '...' 
[0.1, 0.2, ...]` | | `Qdrant error during UPDATE VECTOR: ...` | Point does not exist, or vector dimensions mismatch | Verify the point ID exists and the vector length matches the collection's dimensions | | `Qdrant error during UPDATE PAYLOAD: ...` | Qdrant rejected the payload update | Check field values and collection state | -| `Vector elements must be numeric; got invalid value: ...` | A non-numeric value (string, boolean, or null) was present in the vector array for `UPDATE SET VECTOR` | Ensure all vector elements are floats: `UPDATE … [0.1, 0.2, …, 0.N]` | +| `Vector elements must be numeric floats; boolean values are not allowed` | A boolean (`true` or `false`) was present in the vector array for `UPDATE SET VECTOR` — `float(True)` silently equals `1.0` in Python, so this is caught explicitly | Replace booleans with numeric floats: `UPDATE … [0.1, 0.2, …, 0.N]` | +| `Vector elements must be numeric; got invalid value: ...` | A non-numeric value (string or null) was present in the vector array for `UPDATE SET VECTOR` | Ensure all vector elements are floats: `UPDATE … [0.1, 0.2, …, 0.N]` | | `GROUP_SIZE must be a positive integer, got N` | `GROUP_SIZE 0` or a negative value was specified | Use a positive integer: `GROUP_SIZE 3` | | `Qdrant error during SCROLL: ...` | Qdrant rejected scroll request | Verify collection state, filter, and cursor (`AFTER`) value | | `Unknown index type '...'` | Invalid schema type in CREATE INDEX | Use one of: `keyword`, `integer`, `float`, `bool`, `text`, `geo`, `datetime` | diff --git a/src/qql/executor.py b/src/qql/executor.py index 7a6cf9b..b4e8651 100644 --- a/src/qql/executor.py +++ b/src/qql/executor.py @@ -614,14 +614,8 @@ def _execute_search(self, node: SearchStmt) -> ExecutionResult: if node.hybrid: dense_model = node.model or self._config.default_model sparse_model_name = node.sparse_model or SparseEmbedder.DEFAULT_MODEL - dense_embedder = Embedder(dense_model) - sparse_embedder = SparseEmbedder(sparse_model_name) - - 
dense_vector = dense_embedder.embed(node.query_text) - sparse_obj = sparse_embedder.query_embed(node.query_text) - sparse_vector = SparseVector( - indices=sparse_obj["indices"], - values=sparse_obj["values"], + dense_vector, sparse_vector = self._build_hybrid_vectors( + node.query_text, dense_model, sparse_model_name ) try: @@ -744,6 +738,26 @@ def _execute_search(self, node: SearchStmt) -> ExecutionResult: data=results, ) + def _build_hybrid_vectors( + self, + query_text: str, + dense_model: str, + sparse_model_name: str, + ) -> tuple[list[float], SparseVector]: + """Embed *query_text* with both dense and sparse models. + + Returns ``(dense_vector, sparse_vector)`` — a plain Python list for + dense and a :class:`SparseVector` for sparse. Extracted to eliminate + duplication between the flat-hybrid and grouped-hybrid paths. + """ + dense_vector: list[float] = Embedder(dense_model).embed(query_text) + sparse_obj = SparseEmbedder(sparse_model_name).query_embed(query_text) + sparse_vector = SparseVector( + indices=sparse_obj["indices"], + values=sparse_obj["values"], + ) + return dense_vector, sparse_vector + def _resolve_hybrid_fusion(self, fusion: str | None) -> Fusion: if fusion is None or fusion == "rrf": return Fusion.RRF @@ -954,11 +968,8 @@ def _execute_search_groups( if node.hybrid: dense_model = node.model or self._config.default_model sparse_model_name = node.sparse_model or SparseEmbedder.DEFAULT_MODEL - dense_vector = Embedder(dense_model).embed(node.query_text) - sparse_obj = SparseEmbedder(sparse_model_name).query_embed(node.query_text) - sparse_vector = SparseVector( - indices=sparse_obj["indices"], - values=sparse_obj["values"], + dense_vector, sparse_vector = self._build_hybrid_vectors( + node.query_text, dense_model, sparse_model_name ) response = self._client.query_points_groups( collection_name=node.collection, diff --git a/src/qql/parser.py b/src/qql/parser.py index 4f84730..4add845 100644 --- a/src/qql/parser.py +++ b/src/qql/parser.py @@ -573,6 
+573,13 @@ def _parse_update(self) -> UpdateVectorStmt | UpdatePayloadStmt: self._peek().pos, ) try: + for v in vector_val: + if isinstance(v, bool): + raise QQLSyntaxError( + "Vector elements must be numeric floats; " + "boolean values are not allowed", + self._peek().pos, + ) coerced = tuple(float(v) for v in vector_val) except (ValueError, TypeError) as exc: raise QQLSyntaxError( diff --git a/tests/test_executor.py b/tests/test_executor.py index 9af6941..294f93a 100644 --- a/tests/test_executor.py +++ b/tests/test_executor.py @@ -2469,3 +2469,212 @@ def test_filter_based_set_payload_receives_filter_object(self, executor, mock_cl kwargs = mock_client.set_payload.call_args.kwargs # SDK verified: PointsSelector accepts rest.Filter — must receive Filter, not a list assert isinstance(kwargs["points"], Filter) + + +# ── Round-2 review gap fixes ────────────────────────────────────────────────── + +class TestGroupedCustomModelForwarding: + """Gap 9 (round 2) — grouped search must forward custom model names to the embedders.""" + + def test_grouped_dense_custom_model_forwarded(self, executor, mock_client, mocker): + """Dense grouped search with USING MODEL must instantiate Embedder with that model.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + mock_client.get_collection.return_value.config.params.vectors = None # unnamed + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_instance = mocker.MagicMock() + embedder_instance.embed.return_value = [0.1] * 384 + embedder_cls.return_value = embedder_instance + + node = SearchStmt( + collection="articles", query_text="q", limit=5, + model="BAAI/bge-base-en-v1.5", + group_by="category", group_size=3, + ) + executor.execute(node) + # Embedder must have been instantiated with the custom model name + embedder_cls.assert_called_once_with("BAAI/bge-base-en-v1.5") + + def 
test_grouped_hybrid_custom_dense_model_forwarded(self, executor, mock_client, mocker): + """Hybrid grouped search must instantiate Embedder with the custom dense model.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.1] * 384 + + sparse_cls = mocker.patch("qql.executor.SparseEmbedder") + sparse_cls.return_value.query_embed.return_value = {"indices": [0], "values": [1.0]} + + node = SearchStmt( + collection="articles", query_text="q", limit=5, + model="BAAI/bge-large-en-v1.5", hybrid=True, + group_by="category", group_size=2, + ) + executor.execute(node) + embedder_cls.assert_called_once_with("BAAI/bge-large-en-v1.5") + + def test_grouped_hybrid_custom_sparse_model_forwarded(self, executor, mock_client, mocker): + """Hybrid grouped search must instantiate SparseEmbedder with the custom sparse model.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.1] * 384 + + sparse_cls = mocker.patch("qql.executor.SparseEmbedder") + sparse_cls.return_value.query_embed.return_value = {"indices": [0], "values": [1.0]} + + node = SearchStmt( + collection="articles", query_text="q", limit=5, + model=None, hybrid=True, sparse_model="prithivida/Splade_PP_en_v1", + group_by="category", group_size=2, + ) + executor.execute(node) + sparse_cls.assert_called_once_with("prithivida/Splade_PP_en_v1") + + +class TestGroupedSearchParamsDepth: + """Gap 8 (round 2) — verify hnsw_ef/acorn values are actually forwarded in grouped search.""" + + def test_grouped_search_hnsw_ef_value_forwarded(self, executor, mock_client, 
mocker): + """search_params for dense grouped search must carry the hnsw_ef value.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + mock_client.get_collection.return_value.config.params.vectors = None # unnamed + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.1] * 384 + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + with_clause=SearchWith(hnsw_ef=256), + group_by="category", group_size=3, + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + sp = kwargs.get("search_params") + assert sp is not None + assert sp.hnsw_ef == 256 + + def test_grouped_search_exact_value_forwarded(self, executor, mock_client, mocker): + """search_params for dense grouped search must carry exact=True when set.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + mock_client.get_collection.return_value.config.params.vectors = None # unnamed + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.1] * 384 + + node = SearchStmt( + collection="articles", query_text="q", limit=5, model=None, + with_clause=SearchWith(exact=True), + group_by="category", group_size=3, + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + sp = kwargs.get("search_params") + assert sp is not None + assert sp.exact is True + + def test_grouped_hybrid_prefetch_params_forwarded(self, executor, mock_client, mocker): + """Hybrid grouped search must forward search_params into each Prefetch.params.""" + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + 
mock_client.query_points_groups.return_value = mock_response + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.1] * 384 + + sparse_cls = mocker.patch("qql.executor.SparseEmbedder") + sparse_cls.return_value.query_embed.return_value = {"indices": [0], "values": [1.0]} + + node = SearchStmt( + collection="articles", query_text="q", limit=5, + model=None, hybrid=True, with_clause=SearchWith(hnsw_ef=128), + group_by="category", group_size=2, + ) + executor.execute(node) + kwargs = mock_client.query_points_groups.call_args.kwargs + prefetch_list = kwargs.get("prefetch") + assert prefetch_list is not None and len(prefetch_list) == 2 + for pf in prefetch_list: + assert pf.params is not None + assert pf.params.hnsw_ef == 128 + + +class TestBuildHybridVectorsHelper: + """Gap 10 (round 2) — _build_hybrid_vectors() is the single source of truth for both paths.""" + + def test_helper_returns_list_and_sparse_vector(self, executor, mocker): + """_build_hybrid_vectors must return (list[float], SparseVector).""" + from qdrant_client.models import SparseVector + + embedder_cls = mocker.patch("qql.executor.Embedder") + embedder_cls.return_value.embed.return_value = [0.5] * 4 + + sparse_cls = mocker.patch("qql.executor.SparseEmbedder") + sparse_cls.return_value.query_embed.return_value = {"indices": [1, 3], "values": [0.8, 0.2]} + + dense, sparse = executor._build_hybrid_vectors("test query", "dense-model", "sparse-model") + assert dense == [0.5] * 4 + assert isinstance(sparse, SparseVector) + assert sparse.indices == [1, 3] + assert sparse.values == [0.8, 0.2] + embedder_cls.assert_called_once_with("dense-model") + sparse_cls.assert_called_once_with("sparse-model") + + def test_flat_hybrid_search_uses_build_hybrid_vectors(self, executor, mock_client, mocker): + """The flat hybrid path must call _build_hybrid_vectors (not inline Embedder calls).""" + from qdrant_client.models import SparseVector + + 
mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.points = [] + mock_client.query_points.return_value = mock_response + + # Patch the shared helper on the executor instance + helper = mocker.patch.object( + executor, "_build_hybrid_vectors", + return_value=([0.1] * 384, SparseVector(indices=[0], values=[1.0])), + ) + + node = SearchStmt( + collection="articles", query_text="hello", limit=5, + model=None, hybrid=True, + ) + executor.execute(node) + helper.assert_called_once() + + def test_grouped_hybrid_search_uses_build_hybrid_vectors(self, executor, mock_client, mocker): + """The grouped hybrid path must call _build_hybrid_vectors (not inline Embedder calls).""" + from qdrant_client.models import SparseVector + + mock_client.collection_exists.return_value = True + mock_response = mocker.MagicMock() + mock_response.groups = [] + mock_client.query_points_groups.return_value = mock_response + + helper = mocker.patch.object( + executor, "_build_hybrid_vectors", + return_value=([0.1] * 384, SparseVector(indices=[0], values=[1.0])), + ) + + node = SearchStmt( + collection="articles", query_text="hello", limit=5, + model=None, hybrid=True, group_by="category", + ) + executor.execute(node) + helper.assert_called_once() diff --git a/tests/test_parser.py b/tests/test_parser.py index d3d33e8..1320fe1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1396,6 +1396,16 @@ def test_none_element_raises(self): with pytest.raises(QQLSyntaxError, match="Vector elements must be numeric"): parse("UPDATE articles SET VECTOR WHERE id = 1 [null, 0.2]") + def test_boolean_true_element_raises(self): + # bool is a subclass of int — float(True) == 1.0 would silently pass + # without an explicit isinstance(v, bool) guard. 
+ with pytest.raises(QQLSyntaxError, match="boolean values are not allowed"): + parse("UPDATE articles SET VECTOR WHERE id = 1 [true, 0.2, 0.3]") + + def test_boolean_false_element_raises(self): + with pytest.raises(QQLSyntaxError, match="boolean values are not allowed"): + parse("UPDATE articles SET VECTOR WHERE id = 1 [false, 0.5]") + class TestUpdateSetInvalidTargetMessage: """PR #28 gap 16 — explicit error message for bad SET target."""