pavanjava · srimon12 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026
diff --git a/README.md b/README.md
@@ -44,6 +44,7 @@ qql> SEARCH notes SIMILAR TO 'vector databases' LIMIT 5 USING HYBRID RERANK
   - [Cross-Encoder Reranking (RERANK)](#cross-encoder-reranking-rerank)
   - [SHOW COLLECTIONS — list collections](#show-collections--list-collections)
   - [CREATE COLLECTION — create a collection](#create-collection--create-a-collection)
+  - [Quantization — QUANTIZE clause](#quantization--quantize-clause)
   - [CREATE INDEX — create a payload index](#create-index--create-a-payload-index)
   - [DROP COLLECTION — delete a collection](#drop-collection--delete-a-collection)
   - [DELETE — remove points](#delete--remove-points)
@@ -903,6 +904,82 @@ If the collection already exists, the command succeeds with a message and does n
 
 ---
 
+### Quantization — QUANTIZE clause
+
+Quantization reduces the memory footprint of vector collections and speeds up search at the cost of a small, controllable accuracy loss. QQL supports all three Qdrant quantization strategies via an optional `QUANTIZE` clause appended to `CREATE COLLECTION`.
+
+**Three strategies:**
+
+| Type | Compression | Accuracy Loss | Best For |
+|---|---|---|---|
+| `SCALAR` | 4× (float32 → int8) | < 1% | Most collections — best balance |
+| `BINARY` | 32× (float32 → 1-bit) | Higher | High-dimensional vectors (768+), speed priority |
+| `PRODUCT` | 4× (fixed in this version) | Variable | Memory-constrained deployments |
+
+**Full syntax:**
+```
+CREATE COLLECTION <name> ... QUANTIZE SCALAR [QUANTILE <0.5–1.0>] [ALWAYS RAM]
+CREATE COLLECTION <name> ... QUANTIZE BINARY  [ALWAYS RAM]
+CREATE COLLECTION <name> ... QUANTIZE PRODUCT [ALWAYS RAM]
+```
+
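+- **`QUANTILE <float>`** — (scalar only) calibration quantile for the INT8 conversion; Qdrant expects a value in the range 0.5–1.0. When omitted, no quantile is sent and Qdrant uses the whole range of values. Lower values exclude more extreme values from the quantization bounds, giving finer resolution for typical values at the cost of clipping outliers.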
+- **`ALWAYS RAM`** — keep the **quantized** vectors cached in RAM regardless of how the original vectors are stored (maps to Qdrant's `always_ram` option). This uses some RAM but keeps searches over the quantized data fast. Supported by all three types.
+- **`QUANTIZE`** always appears **after** all other clauses (`HYBRID`, `USING MODEL`, etc.).
+- For `PRODUCT`, the compression ratio is fixed at **4×** in this version.
+- When used with `HYBRID` collections, quantization applies only to the **dense** vector (Qdrant's behavior).
+
+**Examples:**
+
+Scalar quantization (recommended default):
+```sql
+CREATE COLLECTION research_papers QUANTIZE SCALAR
+```
+
+Scalar with explicit calibration and quantized vectors kept in RAM:
+```sql
+CREATE COLLECTION research_papers QUANTIZE SCALAR QUANTILE 0.95 ALWAYS RAM
+```
+
+Binary quantization for large high-dimensional embeddings:
+```sql
+CREATE COLLECTION research_papers QUANTIZE BINARY
+```
+
+Product quantization (4× compression) with quantized vectors kept in RAM:
+```sql
+CREATE COLLECTION research_papers QUANTIZE PRODUCT ALWAYS RAM
+```
+
+Combined with hybrid collection:
+```sql
+CREATE COLLECTION research_papers HYBRID QUANTIZE SCALAR
+```
+
+Combined with a pinned model:
+```sql
+CREATE COLLECTION research_papers USING MODEL 'BAAI/bge-base-en-v1.5' QUANTIZE SCALAR QUANTILE 0.99
+```
+
+Combined with hybrid + dense model:
+```sql
+CREATE COLLECTION research_papers USING HYBRID DENSE MODEL 'BAAI/bge-base-en-v1.5' QUANTIZE BINARY
+```
+
+**Valid combinations:**
+
+| Base form | + QUANTIZE SCALAR | + QUANTIZE BINARY | + QUANTIZE PRODUCT |
+|---|---|---|---|
+| `CREATE COLLECTION name` | ✓ | ✓ | ✓ |
+| `... HYBRID` | ✓ | ✓ | ✓ |
+| `... USING MODEL 'x'` | ✓ | ✓ | ✓ |
+| `... USING HYBRID` | ✓ | ✓ | ✓ |
+| `... USING HYBRID DENSE MODEL 'x'` | ✓ | ✓ | ✓ |
+
+> **Note:** INSERT and SEARCH on quantized collections work exactly the same as on non-quantized ones — no changes to INSERT or SEARCH syntax are needed.
+
+---
+
 ### CREATE INDEX — create a payload index
 
 Creates a payload index on a collection field. Payload indexes speed up `WHERE` clause filtering by allowing Qdrant to efficiently match on indexed fields.

diff --git a/src/qql/ast_nodes.py b/src/qql/ast_nodes.py
@@ -1,9 +1,24 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from enum import Enum
 from typing import Any, Union
 
 
+class QuantizationType(Enum):  # strategy keyword that follows QUANTIZE; .value is shown in result messages
+    SCALAR  = "scalar"   # executor builds ScalarQuantization with ScalarType.INT8
+    BINARY  = "binary"   # executor builds BinaryQuantization
+    PRODUCT = "product"  # executor builds ProductQuantization with CompressionRatio.X4
+
+
+@dataclass(frozen=True)
+class QuantizationConfig:
+    """Immutable quantization settings parsed from a QUANTIZE clause."""
+    type: QuantizationType
+    quantile: float | None = None   # SCALAR only; None is forwarded unset. NOTE(review): Qdrant docs say unset means "whole range of values", not 0.99 — confirm
+    always_ram: bool = False        # all types; forwarded as the Qdrant config's always_ram flag; default False
+
+
 @dataclass(frozen=True)
 class SearchWith:
     """Query-time search params supported by Qdrant SearchParams."""
@@ -141,8 +156,9 @@ class InsertBulkStmt:
 @dataclass(frozen=True)
 class CreateCollectionStmt:
     collection: str
-    hybrid: bool = False    # if True, create with dense + sparse named vectors
-    model: str | None = None  # dense model; None → use config default
+    hybrid: bool = False                      # if True, create with dense + sparse named vectors
+    model: str | None = None                  # dense model; None → use config default
+    quantization: QuantizationConfig | None = None  # optional QUANTIZE clause
 
 
 @dataclass(frozen=True)

diff --git a/src/qql/cli.py b/src/qql/cli.py
@@ -38,6 +38,10 @@
       Create a new collection. Add HYBRID for dense+sparse BM25 vectors.
       Optional: [yellow]USING MODEL[/yellow] '<model>'
       Optional: [yellow]USING HYBRID[/yellow] [DENSE MODEL '<model>']
+      Optional: [yellow]QUANTIZE SCALAR[/yellow] [QUANTILE <0.0–1.0>] [ALWAYS RAM]
+      Optional: [yellow]QUANTIZE BINARY[/yellow] [ALWAYS RAM]
+      Optional: [yellow]QUANTIZE PRODUCT[/yellow] [ALWAYS RAM]   (4× compression)
+      QUANTIZE may be combined with any HYBRID or MODEL clause.
 
   [yellow]DROP COLLECTION[/yellow] <name>
       Delete a collection and all its points.

diff --git a/src/qql/executor.py b/src/qql/executor.py
@@ -9,6 +9,9 @@
 from qdrant_client.http.exceptions import UnexpectedResponse
 from qdrant_client.models import (
     AcornSearchParams,
+    BinaryQuantization,
+    BinaryQuantizationConfig,
+    CompressionRatio,
     Distance,
     FieldCondition,
     Filter,
@@ -29,10 +32,15 @@
     PayloadSchemaType,
     PointStruct,
     Prefetch,
+    ProductQuantization,
+    ProductQuantizationConfig,
     Range,
     RecommendInput,
     RecommendQuery,
     RecommendStrategy,
+    ScalarQuantization,
+    ScalarQuantizationConfig,
+    ScalarType,
     SearchParams,
     SparseVector,
     SparseVectorParams,
@@ -62,6 +70,8 @@
     NotExpr,
     NotInExpr,
     OrExpr,
+    QuantizationConfig,
+    QuantizationType,
     RecommendStmt,
     SearchStmt,
     SearchWith,
@@ -292,37 +302,55 @@ def _execute_create(self, node: CreateCollectionStmt) -> ExecutionResult:
 
         dense_model_name = node.model or self._config.default_model
 
+        # Build optional quantization config (None when QUANTIZE clause absent)
+        quant_config = (
+            self._build_quantization_config(node.quantization)
+            if node.quantization is not None
+            else None
+        )
+        quant_label = (
+            f", {node.quantization.type.value} quantization"
+            if node.quantization is not None
+            else ""
+        )
+
         # ── Hybrid collection: named dense + sparse vectors ────────────────
         if node.hybrid:
             embedder = Embedder(dense_model_name)
             dims = embedder.dimensions
-            self._create_collection_and_wait(
-                collection_name=node.collection,
-                vectors_config={
+            create_kwargs: dict[str, Any] = {
+                "collection_name": node.collection,
+                "vectors_config": {
                     "dense": VectorParams(size=dims, distance=Distance.COSINE)
                 },
-                sparse_vectors_config={
+                "sparse_vectors_config": {
                     "sparse": SparseVectorParams(modifier=Modifier.IDF)
                 },
-            )
+            }
+            if quant_config is not None:
+                create_kwargs["quantization_config"] = quant_config
+            self._create_collection_and_wait(**create_kwargs)
             return ExecutionResult(
                 success=True,
                 message=(
                     f"Collection '{node.collection}' created "
-                    f"(hybrid: {dims}-dim dense + BM25 sparse, cosine distance)"
+                    f"(hybrid: {dims}-dim dense + BM25 sparse, cosine distance{quant_label})"
                 ),
             )
 
         # ── Standard dense-only collection ─────────────────────────────────
         embedder = Embedder(dense_model_name)
         dims = embedder.dimensions
-        self._create_collection_and_wait(
-            collection_name=node.collection,
-            vectors_config=VectorParams(size=dims, distance=Distance.COSINE),
-        )
+        create_kwargs = {
+            "collection_name": node.collection,
+            "vectors_config": VectorParams(size=dims, distance=Distance.COSINE),
+        }
+        if quant_config is not None:
+            create_kwargs["quantization_config"] = quant_config
+        self._create_collection_and_wait(**create_kwargs)
         return ExecutionResult(
             success=True,
-            message=f"Collection '{node.collection}' created ({dims}-dimensional vectors, cosine distance)",
+            message=f"Collection '{node.collection}' created ({dims}-dimensional vectors, cosine distance{quant_label})",
         )
 
     def _execute_create_index(self, node: CreateIndexStmt) -> ExecutionResult:
@@ -816,6 +844,31 @@ def _wrap_as_filter(self, qdrant_expr: Any) -> Filter:
 
     # ── Collection helpers ────────────────────────────────────────────────
 
+    def _build_quantization_config(
+        self, qc: QuantizationConfig
+    ) -> ScalarQuantization | BinaryQuantization | ProductQuantization:
+        """Convert a parsed QuantizationConfig to the matching Qdrant SDK quantization object; raises QQLRuntimeError for an unhandled type."""
+        if qc.type == QuantizationType.SCALAR:
+            return ScalarQuantization(
+                scalar=ScalarQuantizationConfig(
+                    type=ScalarType.INT8,
+                    quantile=qc.quantile,      # None is forwarded as-is. NOTE(review): Qdrant docs describe unset as "whole range of values", not 0.99 — confirm
+                    always_ram=qc.always_ram,
+                )
+            )
+        if qc.type == QuantizationType.BINARY:
+            return BinaryQuantization(
+                binary=BinaryQuantizationConfig(always_ram=qc.always_ram)
+            )
+        if qc.type == QuantizationType.PRODUCT:
+            return ProductQuantization(
+                product=ProductQuantizationConfig(
+                    compression=CompressionRatio.X4,  # ratio is hard-coded; QuantizationConfig carries no field for it
+                    always_ram=qc.always_ram,
+                )
+            )
+        raise QQLRuntimeError(f"Unknown quantization type: {qc.type}")  # defensive: only reachable if the enum gains a member not handled above
+
     def _collection_is_hybrid(self, name: str) -> bool:
         """Return True if *name* exists and uses named vectors (hybrid collection)."""
         if not self._client.collection_exists(name):

diff --git a/src/qql/lexer.py b/src/qql/lexer.py
@@ -20,6 +20,13 @@ class TokenKind(Enum):
     EXACT = auto()
     WITH = auto()
     ACORN = auto()
+    QUANTIZE = auto()
+    SCALAR   = auto()
+    BINARY   = auto()
+    PRODUCT  = auto()
+    QUANTILE = auto()
+    ALWAYS   = auto()
+    RAM      = auto()
     CREATE = auto()
     INDEX = auto()
     ON = auto()
@@ -98,7 +105,14 @@ class TokenKind(Enum):
     "RERANK": TokenKind.RERANK,
     "EXACT": TokenKind.EXACT,
     "WITH": TokenKind.WITH,
-    "ACORN": TokenKind.ACORN,
+    "ACORN":    TokenKind.ACORN,
+    "QUANTIZE": TokenKind.QUANTIZE,
+    "SCALAR":   TokenKind.SCALAR,
+    "BINARY":   TokenKind.BINARY,
+    "PRODUCT":  TokenKind.PRODUCT,
+    "QUANTILE": TokenKind.QUANTILE,
+    "ALWAYS":   TokenKind.ALWAYS,
+    "RAM":      TokenKind.RAM,
     "CREATE": TokenKind.CREATE,
     "INDEX": TokenKind.INDEX,
     "ON": TokenKind.ON,

diff --git a/src/qql/parser.py b/src/qql/parser.py
@@ -23,6 +23,8 @@
     NotExpr,
     NotInExpr,
     OrExpr,
+    QuantizationConfig,
+    QuantizationType,
     RecommendStmt,
     SearchStmt,
     SearchWith,
@@ -149,7 +151,7 @@ def _parse_insert_bulk_body(self) -> InsertBulkStmt:
             model=model, hybrid=hybrid, sparse_model=sparse_model,
         )
 
-    def _parse_create(self) -> CreateCollectionStmt:
+    def _parse_create(self) -> CreateCollectionStmt | CreateIndexStmt:
         self._expect(TokenKind.CREATE)
         if self._peek().kind == TokenKind.COLLECTION:
             self._advance()
@@ -175,10 +177,17 @@ def _parse_create(self) -> CreateCollectionStmt:
                     self._expect(TokenKind.MODEL)
                     model = self._expect(TokenKind.STRING).value
 
+            # ── Optional QUANTIZE clause ──────────────────────────────────
+            quantization: QuantizationConfig | None = None
+            if self._peek().kind == TokenKind.QUANTIZE:
+                self._advance()  # consume QUANTIZE
+                quantization = self._parse_quantize_clause()
+
             return CreateCollectionStmt(
                 collection=collection,
                 hybrid=hybrid,
                 model=model,
+                quantization=quantization,
             )
 
         self._expect(TokenKind.INDEX)
@@ -191,6 +200,59 @@ def _parse_create(self) -> CreateCollectionStmt:
         schema = self._expect(TokenKind.IDENTIFIER).value.lower()
         return CreateIndexStmt(collection=collection, field_name=field_name, schema=schema)
 
+    def _parse_quantize_clause(self) -> QuantizationConfig:
+        """Parse: (SCALAR | BINARY | PRODUCT) [QUANTILE <float>] [ALWAYS RAM]
+
+        Called immediately after the QUANTIZE token has been consumed.
+        """
+        tok = self._peek()
+
+        if tok.kind == TokenKind.SCALAR:
+            self._advance()
+            quantile: float | None = None
+            always_ram: bool = False
+            if self._peek().kind == TokenKind.QUANTILE:
+                self._advance()
+                quantile_tok = self._peek()
+                quantile = float(self._parse_number())
+                if not 0.0 <= quantile <= 1.0:
+                    raise QQLSyntaxError(
+                        f"QUANTILE must be between 0 and 1 inclusive, got {quantile}",
+                        quantile_tok.pos,
+                    )
+            if self._peek().kind == TokenKind.ALWAYS:
+                self._advance()
+                self._expect(TokenKind.RAM)
+                always_ram = True
+            return QuantizationConfig(
+                type=QuantizationType.SCALAR,
+                quantile=quantile,
+                always_ram=always_ram,
+            )
+
+        if tok.kind == TokenKind.BINARY:
+            self._advance()
+            always_ram = False
+            if self._peek().kind == TokenKind.ALWAYS:
+                self._advance()
+                self._expect(TokenKind.RAM)
+                always_ram = True
+            return QuantizationConfig(type=QuantizationType.BINARY, always_ram=always_ram)
+
+        if tok.kind == TokenKind.PRODUCT:
+            self._advance()
+            always_ram = False
+            if self._peek().kind == TokenKind.ALWAYS:
+                self._advance()
+                self._expect(TokenKind.RAM)
+                always_ram = True
+            return QuantizationConfig(type=QuantizationType.PRODUCT, always_ram=always_ram)
+
+        raise QQLSyntaxError(
+            f"Expected SCALAR, BINARY, or PRODUCT after QUANTIZE, got '{tok.value}'",
+            tok.pos,
+        )
+
     def _parse_drop(self) -> DropCollectionStmt:
         self._expect(TokenKind.DROP)
         self._expect(TokenKind.COLLECTION)