Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@
# REDIS_TTL_MODELS=86400
# REDIS_KEY_RESPONSES=gomodel:response:
# REDIS_TTL_RESPONSES=3600
# Opt-in when config.yaml has no cache.response.simple block (e.g. env-only deploys). Omit otherwise.
# RESPONSE_CACHE_SIMPLE_ENABLED=true

# Optional: Custom cache directory for local file cache
# GOMODEL_CACHE_DIR=.cache
Expand Down
27 changes: 27 additions & 0 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,35 @@ permissions:
contents: write # Required to create releases

jobs:
semantic-cache-e2e:
runs-on: ubuntu-latest
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@v6

- name: Set up Go
uses: actions/setup-go@v6
with:
go-version: 1.26.1
cache: true

- name: Run semantic cache release scenarios
env:
OUTPUT_DIR: ${{ runner.temp }}/release-semantic-cache
run: tests/e2e/run-release-semantic-cache.sh

- name: Upload semantic cache logs
if: always()
uses: actions/upload-artifact@v4
with:
name: semantic-cache-release-e2e
path: ${{ runner.temp }}/release-semantic-cache

goreleaser:
runs-on: ubuntu-latest
needs: semantic-cache-e2e
steps:
- name: Checkout
uses: actions/checkout@v6
Expand Down
3 changes: 3 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ jobs:
steps:
- uses: actions/checkout@v6

- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y libsecret-1-dev

- name: Setup Node.js
uses: actions/setup-node@v6
with:
Expand Down
4 changes: 3 additions & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ make tidy # go mod tidy
make clean # Remove bin/
make record-api # Record API responses for contract tests
make swagger # Regenerate Swagger docs
make infra # Docker Compose: Redis, Postgres, MongoDB, Adminer only
make image # Docker Compose: full stack (GOModel + Prometheus)
```

**Single test:** `go test ./internal/providers -v -run TestName`
Expand Down Expand Up @@ -107,7 +109,7 @@ Full reference: `.env.template` and `config/config.yaml`
- **Storage:** `STORAGE_TYPE` (sqlite), `SQLITE_PATH` (data/gomodel.db), `POSTGRES_URL`, `MONGODB_URL`
- **Audit logging:** `LOGGING_ENABLED` (false), `LOGGING_LOG_BODIES` (false), `LOGGING_LOG_HEADERS` (false), `LOGGING_RETENTION_DAYS` (30)
- **Usage tracking:** `USAGE_ENABLED` (true), `ENFORCE_RETURNING_USAGE_DATA` (true), `USAGE_RETENTION_DAYS` (90)
- **Cache:** `CACHE_TYPE` (local), `CACHE_REFRESH_INTERVAL` (3600s), `REDIS_URL`, `REDIS_KEY_MODELS`, `REDIS_TTL_MODELS`, `REDIS_KEY_RESPONSES`, `REDIS_TTL_RESPONSES`
- **Cache:** `CACHE_TYPE` (local), `CACHE_REFRESH_INTERVAL` (3600s), `REDIS_URL`, `REDIS_KEY_MODELS`, `REDIS_TTL_MODELS`. Exact response cache uses `cache.response.simple` in `config.yaml` (optional `enabled`); `REDIS_KEY_RESPONSES`, `REDIS_TTL_RESPONSES`, and `REDIS_URL` apply only when that block exists or when `RESPONSE_CACHE_SIMPLE_ENABLED=true`. Semantic response cache uses `cache.response.semantic` (optional `enabled`); when enabled, `embedder.provider` must name a key in the top-level `providers` map (no default embedder). At runtime that key is resolved against the same env-merged, credential-filtered provider set as routing (not YAML-only), so env-only credentials apply. `vector_store.type` must be set explicitly to one of `qdrant`, `pgvector`, `pinecone`, `weaviate` (each has its own nested config and `SEMANTIC_CACHE_*` env vars). Tuning via `SEMANTIC_CACHE_*` applies when the semantic block exists or `SEMANTIC_CACHE_ENABLED=true`.
- **HTTP client:** `HTTP_TIMEOUT` (600s), `HTTP_RESPONSE_HEADER_TIMEOUT` (600s)
- **Resilience:** Configured via `config/config.yaml` — global `resilience.retry.*` and `resilience.circuit_breaker.*` defaults with optional per-provider overrides under `providers.<name>.resilience.retry.*` and `providers.<name>.resilience.circuit_breaker.*`. Retry defaults: `max_retries` (3), `initial_backoff` (1s), `max_backoff` (30s), `backoff_factor` (2.0), `jitter_factor` (0.1). Circuit breaker defaults: `failure_threshold` (5), `success_threshold` (2), `timeout` (30s)
- **Metrics:** `METRICS_ENABLED` (false), `METRICS_ENDPOINT` (/metrics)
Expand Down
12 changes: 10 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: all build run clean tidy test test-e2e test-integration test-contract test-all lint lint-fix record-api swagger install-tools perf-check perf-bench
.PHONY: all build run clean tidy test test-e2e test-integration test-contract test-all lint lint-fix record-api swagger install-tools perf-check perf-bench infra image

all: build

Expand All @@ -13,7 +13,7 @@ LDFLAGS := -X "gomodel/internal/version.Version=$(VERSION)" \
-X "gomodel/internal/version.Date=$(DATE)"

install-tools:
@command -v golangci-lint > /dev/null 2>&1 || (echo "Installing golangci-lint..." && go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.10.1)
@command -v golangci-lint > /dev/null 2>&1 || (echo "Installing golangci-lint..." && go install github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.4)
@command -v pre-commit > /dev/null 2>&1 || (echo "Installing pre-commit..." && pip install pre-commit==4.5.1)
@echo "All tools are ready"

Expand All @@ -31,6 +31,14 @@ clean:
tidy:
go mod tidy

# Docker Compose: Redis, PostgreSQL, MongoDB, Adminer (no app image build)
infra:
docker compose up -d

# Docker Compose: full stack (GOModel + Prometheus; builds app image when needed)
image:
docker compose --profile app up -d

# Run unit tests only
test:
go test ./internal/... ./config/... -v
Expand Down
22 changes: 15 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,14 +100,22 @@ Example model identifiers are illustrative and subject to change; consult provid
make run
```

### Docker Compose (Full Stack)
### Docker Compose

Includes GOModel + Redis + PostgreSQL + MongoDB + Adminer + Prometheus:
**Infrastructure only** (Redis, PostgreSQL, MongoDB, Adminer — no image build):

```bash
docker compose up -d
# or: make infra
```

**Full stack** (adds GOModel + Prometheus; builds the app image):

```bash
cp .env.template .env
# Add your API keys to .env
docker compose up -d
docker compose --profile app up -d
# or: make image
```

| Service | URL |
Expand Down Expand Up @@ -204,15 +212,15 @@ Or via environment variables: `REDIS_URL`, `REDIS_KEY_RESPONSES`, `REDIS_TTL_RES

Responses served from this layer carry `X-Cache: HIT (exact)`.

### Layer 2 — Semantic cache *(coming soon)*
### Layer 2 — Semantic cache

Embeds the last user message with `all-MiniLM-L6-v2` (local ONNX, zero external dependency) and performs a KNN vector search. Semantically equivalent queries — e.g. *"What's the capital of France?"* vs *"Which city is France's capital?"* — return the same cached response without an upstream LLM call.
Embeds the last user message via your configured provider’s OpenAI-compatible `/v1/embeddings` API (`cache.response.semantic.embedder.provider` must name a key in the top-level `providers` map) and performs a KNN vector search. Semantically equivalent queries — e.g. *"What's the capital of France?"* vs *"Which city is France's capital?"* — can return the same cached response without an upstream LLM call.

Expected hit rates: ~60–70% in high-repetition workloads vs. ~18% for exact-match alone.

Responses served from this layer carry `X-Cache: HIT (semantic)`.

Supported vector backends: `sqlite-vec` (default, embedded), `pgvector`, `qdrant`.
Supported vector backends: `qdrant`, `pgvector`, `pinecone`, `weaviate` (set `cache.response.semantic.vector_store.type` and the matching nested block).

Both cache layers run **after** guardrail/execution-plan patching so they always see the final prompt. Use `Cache-Control: no-cache` or `Cache-Control: no-store` to bypass caching per-request.

Expand All @@ -234,12 +242,12 @@ See [DEVELOPMENT.md](DEVELOPMENT.md) for testing, linting, and pre-commit setup.
| Administrative endpoints | ✅ | Admin API and dashboard ship with usage, audit, and model views. |
| Guardrails | ✅ | The guardrails pipeline is implemented and can be enabled from config. |
| System prompt guardrails | ✅ | `inject`, `override`, and `decorator` modes are supported. |
| Semantic response cache | ✅ | Exact-match Redis plus optional semantic layer (API embeddings, `qdrant` / `pgvector` / `pinecone` / `weaviate`) — see [ADR-0006](docs/adr/0006-semantic-response-cache.md). |

## In Progress

| Area | Status | Notes |
| ---- | :----: | ----- |
| Semantic response cache | 🚧 | Exact-match Redis cache is live. Semantic (vector KNN) layer with local `all-MiniLM-L6-v2` embedder is in progress — see [ADR-0006](docs/adr/0006-semantic-response-cache.md). |
| Billing management | 🚧 | Usage and pricing primitives exist, but billing workflows are not complete. |
| Budget management | 🚧 | Gateway-level budget enforcement and policy controls are not implemented yet. |
| Guardrails depth | 🚧 | The system prompt guardrail is available today; broader guardrail types are still to come. |
Expand Down
124 changes: 107 additions & 17 deletions config/cache_validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,15 +79,17 @@ func TestValidateCacheConfig_RedisOnly(t *testing.T) {
}
}

// boolPtr returns a pointer to a copy of b, for populating optional
// *bool config fields (e.g. SemanticCacheConfig.Enabled) in tests.
func boolPtr(b bool) *bool {
	v := b
	return &v
}

func TestValidateCacheConfig_SemanticDisabledIgnoresInvalidVectorStore(t *testing.T) {
cfg := &CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: SemanticCacheConfig{
Enabled: false,
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(false),
VectorStore: VectorStoreConfig{
Type: "qdrant",
// Intentionally missing URL — valid because semantic cache is off.
Expand All @@ -107,10 +109,11 @@ func TestValidateCacheConfig_SemanticEnabledRequiresQdrantURL(t *testing.T) {
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: SemanticCacheConfig{
Enabled: true,
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
SimilarityThreshold: 0.9,
TTL: 3600,
TTL: intPtr(3600),
Embedder: EmbedderConfig{Provider: "openai"},
VectorStore: VectorStoreConfig{
Type: "qdrant",
},
Expand All @@ -122,20 +125,47 @@ func TestValidateCacheConfig_SemanticEnabledRequiresQdrantURL(t *testing.T) {
}
}

func TestValidateCacheConfig_SemanticEnabledRequiresQdrantCollection(t *testing.T) {
cfg := &CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
SimilarityThreshold: 0.9,
TTL: intPtr(3600),
Embedder: EmbedderConfig{Provider: "openai"},
VectorStore: VectorStoreConfig{
Type: "qdrant",
Qdrant: QdrantConfig{URL: "http://localhost:6333"},
},
},
},
}
if err := ValidateCacheConfig(cfg); err == nil {
t.Fatal("expected error when qdrant collection empty")
}
}

func TestValidateCacheConfig_SemanticSimilarityThresholdInvalid(t *testing.T) {
base := CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: SemanticCacheConfig{
Enabled: true,
TTL: 3600,
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
TTL: intPtr(3600),
Embedder: EmbedderConfig{Provider: "openai"},
VectorStore: VectorStoreConfig{
Type: "sqlite-vec",
SQLiteVec: SQLiteVecConfig{
Path: ".cache/semantic.db",
Type: "pgvector",
PGVector: PGVectorConfig{
URL: "postgres://localhost/test",
Table: "gomodel_semantic_cache",
Dimension: 1536,
},
},
},
Expand Down Expand Up @@ -165,21 +195,81 @@ func TestValidateCacheConfig_SemanticSimilarityThresholdInvalid(t *testing.T) {
}
}

func TestValidateCacheConfig_SemanticRequiresEmbedderProvider(t *testing.T) {
cfg := &CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
SimilarityThreshold: 0.9,
TTL: intPtr(3600),
VectorStore: VectorStoreConfig{
Type: "pgvector",
PGVector: PGVectorConfig{
URL: "postgres://localhost/test",
Dimension: 768,
},
},
},
},
}
err := ValidateCacheConfig(cfg)
if err == nil {
t.Fatal("expected error when semantic enabled without embedder provider")
}
if !strings.Contains(err.Error(), "embedder.provider") {
t.Fatalf("unexpected error: %v", err)
}
}

func TestValidateCacheConfig_SemanticRejectsLocalEmbedder(t *testing.T) {
cfg := &CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
SimilarityThreshold: 0.9,
TTL: intPtr(3600),
Embedder: EmbedderConfig{Provider: "local"},
VectorStore: VectorStoreConfig{
Type: "pgvector",
PGVector: PGVectorConfig{
URL: "postgres://localhost/test",
Dimension: 768,
},
},
},
},
}
err := ValidateCacheConfig(cfg)
if err == nil {
t.Fatal("expected error for local embedder provider")
}
}

func TestValidateCacheConfig_SemanticNegativeTTL(t *testing.T) {
cfg := &CacheConfig{
Model: ModelCacheConfig{
Local: &LocalCacheConfig{CacheDir: ".cache"},
Redis: nil,
},
Response: ResponseCacheConfig{
Semantic: SemanticCacheConfig{
Enabled: true,
Semantic: &SemanticCacheConfig{
Enabled: boolPtr(true),
SimilarityThreshold: 0.9,
TTL: -1,
TTL: intPtr(-1),
Embedder: EmbedderConfig{Provider: "openai"},
VectorStore: VectorStoreConfig{
Type: "sqlite-vec",
SQLiteVec: SQLiteVecConfig{
Path: ".cache/semantic.db",
Type: "pgvector",
PGVector: PGVectorConfig{
URL: "postgres://localhost/test",
Dimension: 768,
},
},
},
Expand Down
31 changes: 31 additions & 0 deletions config/config.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,37 @@ cache:
# url: "redis://localhost:6379"
# key: "gomodel:models"
# ttl: 86400 # 24 hours in seconds
# response:
# simple: # omit the whole `simple` key to disable exact-match caching (unless RESPONSE_CACHE_SIMPLE_ENABLED=true)
# enabled: true # default when `simple` is present; set false to disable while keeping the block
# redis:
# url: "redis://localhost:6379"
# key: "gomodel:response:"
# ttl: 3600
# semantic: # omit the whole `semantic` key to disable semantic caching (unless SEMANTIC_CACHE_ENABLED=true)
# enabled: true
# embedder:
# provider: openai # must match a key under `providers` (openai, gemini, groq, …)
# model: text-embedding-3-small # optional; provider-specific defaults apply if omitted
# vector_store:
# type: qdrant # required: qdrant | pgvector | pinecone | weaviate (pick one block below)
# qdrant:
# url: "http://localhost:6333"
# collection: "gomodel_semantic"
# api_key: "" # optional for local Qdrant
# # pgvector:
# # url: "postgres://user:pass@localhost:5432/gomodel"
# # table: "gomodel_semantic_cache" # optional; default if omitted
# # dimension: 1536 # must match embedding model output
# # pinecone:
# # host: "https://your-index.svc.region.pinecone.io" # data-plane host
# # api_key: "..."
# # namespace: "" # optional
# # dimension: 1536 # must match your Pinecone index
# # weaviate:
# # url: "http://localhost:8080"
# # class: "GomodelSemanticCache" # PascalCase recommended (GraphQL)
# # api_key: "" # optional

storage:
type: "sqlite" # "sqlite", "postgresql", or "mongodb"
Expand Down
Loading