# vectoria/config.py (voidkey/vectoria, main branch)

from pydantic import SecretStr
from pydantic_settings import BaseSettings, SettingsConfigDict
from typing import Literal
from functools import lru_cache


class Settings(BaseSettings):
    """Application configuration loaded through pydantic-settings.

    Each attribute is one setting together with its default. Values are
    read from the ``.env`` file named in ``model_config``; keys that do
    not correspond to a declared setting are ignored (``extra="ignore"``).
    Secret-bearing settings are typed ``SecretStr`` — call
    ``.get_secret_value()`` to obtain the plain text.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        extra="ignore",
    )

    # ------------------------------------------------------------------
    # LLM
    # ------------------------------------------------------------------
    openai_base_url: str = "https://api.openai.com/v1"
    openai_api_key: SecretStr = SecretStr("")
    llm_model: str = "gpt-4o"

    # ------------------------------------------------------------------
    # Embedding. An empty base URL / API key means "reuse the LLM
    # setting"; read the ``effective_*`` properties to get the resolved
    # value.
    # ------------------------------------------------------------------
    embedding_base_url: str = ""
    embedding_api_key: SecretStr = SecretStr("")
    embedding_model: str = "text-embedding-3-small"
    embedding_dimensions: int = 1536
    embedding_batch_size: int = 4

    @property
    def effective_embedding_base_url(self) -> str:
        """Return the embedding base URL, or the LLM base URL if unset."""
        if self.embedding_base_url:
            return self.embedding_base_url
        return self.openai_base_url

    @property
    def effective_embedding_api_key(self) -> str:
        """Return the plain-text embedding API key, or the LLM key if unset."""
        dedicated_key = self.embedding_api_key.get_secret_value()
        if dedicated_key:
            return dedicated_key
        return self.openai_api_key.get_secret_value()

    # ------------------------------------------------------------------
    # Vector store. pgvector is the only supported backend: the chroma
    # enum option was removed in W6-6 because no chroma adapter was ever
    # implemented (vectorstore/ has only pgvector.py + base.py).
    # ------------------------------------------------------------------
    database_url: SecretStr = SecretStr(
        "postgresql+asyncpg://postgres:postgres@localhost/vectoria"
    )

    # ------------------------------------------------------------------
    # Object storage
    # ------------------------------------------------------------------
    storage_type: str = "s3"
    s3_endpoint: str = "http://localhost:9000"
    s3_region: str = ""
    s3_access_key: str = "minioadmin"
    s3_secret_key: SecretStr = SecretStr("minioadmin")
    s3_bucket: str = "vectoria"
    s3_addressing_style: str = "auto"  # auto|virtual|path
    s3_presign_expires: int = 3600

    # ------------------------------------------------------------------
    # Parsing and ingest limits
    # ------------------------------------------------------------------
    default_parse_engine: str = "auto"

    # Upper bound on parsed document content, in characters. Larger
    # content is rejected with 413 before splitting/embedding to avoid
    # OOM: the splitter and the embedding pipeline keep the full content
    # in memory and fan out into many intermediate copies.
    max_content_chars: int = 5_000_000

    # Chunking knobs. A chunk is the unit that gets embedded and indexed,
    # and also what the LLM receives as retrieval context. The defaults
    # are tuned for mixed CJK/Latin content at 1024 chars: big enough to
    # carry a paragraph's worth of context, yet still under the embedding
    # model's token limits at the char-to-token expansion ratio typical
    # of CJK.
    splitter_chunk_size: int = 1024
    splitter_chunk_overlap: int = 64

    # Upper bound on raw upload size, in bytes. Enforced at the HTTP
    # entry, before the file is buffered in memory.
    max_upload_bytes: int = 50 * 1024 * 1024

    # Upper bound on PDF page count. The byte cap above does not catch
    # "small file, many pages": a 19 MB scanned PDF can hide 1000+ pages
    # that mineru can't OCR within its 120 s per-call timeout, burning
    # 3 retries × 120 s of GPU time before fallback. Rejecting at upload
    # (after pypdfium2 reads the xref, ~ms) costs nothing. 200 covers the
    # long tail of business documents (reports, slide exports, manuals);
    # larger inputs should be split or routed via a dedicated long-doc
    # pipeline.
    max_pdf_pages: int = 200

    # Upper bound on PPTX slide count. Same shape of attack as PDF:
    # text-only slides compress small, but each slide still pays the full
    # per-slide parse + image-extraction + vision cost. Counting slides
    # is a zip directory listing — no XML parse, microseconds.
    max_pptx_slides: int = 200

    # Wall-clock timeout per parse, in seconds. Parsers run in a
    # subprocess pool; once this elapses the worker is terminated so a
    # stuck convert() can't block the API thread indefinitely.
    parser_timeout: float = 120.0

    # Whether heavy parsers run in a subprocess pool. On by default for
    # prod isolation; tests that rely on in-process patching (mocking
    # DocumentConverter etc.) turn it off via monkeypatch.
    parser_isolation: bool = True

    # --- Phase 1 ingest-quality knobs -----------------------------------
    # Minimum extracted content length (chars, post-.strip()) for a
    # document to count as "non-empty". Shorter content fails (or gets an
    # image_only rescue if a structured-source URL handler flagged
    # allow_image_only=True and image_urls is non-empty). The comparison
    # is strict-less-than, so a length exactly equal to this threshold
    # passes.
    min_content_chars: int = 50

    # Maximum number of image URLs any URL handler will carry per
    # document. Use a large number (e.g. 9999) to effectively disable it.
    url_image_cap: int = 50

    # True: reject uploads whose magic-byte-sniffed MIME family does NOT
    # match the claimed file extension. False: log + metric, but let the
    # upload through (safe rollback during rollout).
    strict_mime_check: bool = True

    # POST /documents/{file,url}?wait=true polls the Document row for up
    # to this many seconds before returning, so backward-compat clients
    # can still receive content in the response body. Once the timeout
    # passes we return whatever state the Document is in — queued is a
    # valid outcome for clients that didn't opt in.
    ingest_wait_timeout_seconds: float = 30.0
    ingest_wait_poll_interval_seconds: float = 0.25

    # ------------------------------------------------------------------
    # MinerU remote API
    # ------------------------------------------------------------------
    mineru_api_url: str = ""
    mineru_backend: str = "pipeline"
    mineru_language: str = "ch"
    # Circuit breaker: opens after N consecutive 5xx/timeout/network
    # failures, stays open for reset_timeout seconds, then probes with a
    # single request.
    mineru_breaker_threshold: int = 5
    mineru_breaker_reset_timeout: float = 300.0

    # ------------------------------------------------------------------
    # PaddleOCR-VL remote API (PDF primary; MinerU stays as fallback B).
    # Both URL and key are required: if either is empty the parser
    # advertises unavailable, so the registry falls straight through to
    # mineru. The gateway accepts JSON+base64 PDF (see docs in
    # ``parsers/paddle_parser.py``).
    # ------------------------------------------------------------------
    paddle_api_url: str = ""
    paddle_api_key: SecretStr = SecretStr("")
    # Wall-clock limit per VL call (s). The VL gateway's own ceiling is
    # 600 s and we stay close to it: long PDFs (~50-200 pages) routinely
    # sit at 60-90 s, and a 120 s client-side cut (the value MinerU uses)
    # would fail them prematurely while the gateway is still working.
    paddle_timeout: float = 600.0
    # Per-process cap on concurrent VL requests. A single-card GPU
    # serializes; >3 concurrent on image-heavy PDFs has been observed to
    # drop connections (see VL gateway docs §5). Multi-worker hosts get
    # N × ceiling; tune at worker count level.
    paddle_concurrency: int = 3
    paddle_breaker_threshold: int = 5
    paddle_breaker_reset_timeout: float = 300.0

    # ------------------------------------------------------------------
    # Vision LLM (image description + vision-native parser)
    # ------------------------------------------------------------------
    vision_base_url: str = ""
    vision_api_key: SecretStr = SecretStr("")
    vision_model: str = "gpt-4o"
    vision_breaker_threshold: int = 5
    vision_breaker_reset_timeout: float = 300.0
    # Rough per-call cost estimate in USD, consumed by the cost counter
    # and the daily-budget guardrail. Real cost depends on tokens; a flat
    # estimate is a small, conservative approximation. Adjust per vendor:
    # gpt-4o-mini ≈ 0.005, gpt-4o ≈ 0.02, qwen-vl ≈ 0.002.
    vision_cost_per_call_usd: float = 0.005
    # Soft daily budget. Once today's accumulated estimated spend crosses
    # it, the vision-native parser advertises is_available()=False and
    # the registry falls back to ocr-native (rapidocr). 0 = no cap.
    # State is per-process, so multi-worker hosts get an effective
    # ceiling of N × budget — tune conservatively.
    vision_daily_budget_usd: float = 0.0

    # ------------------------------------------------------------------
    # Embedding reliability. The threshold is higher than for
    # mineru/vision because the embedder already retries internally with
    # backoff; the breaker is the last resort when retries can't drain
    # the outage.
    # ------------------------------------------------------------------
    embedding_breaker_threshold: int = 10
    embedding_breaker_reset_timeout: float = 60.0

    # ------------------------------------------------------------------
    # Security
    # ------------------------------------------------------------------
    api_key: SecretStr = SecretStr("")
    cors_origins: list[str] = []

    # Explicit gate for "no auth configured" mode. When both ``api_key``
    # and ``jwt_secret`` are empty, ``verify_auth`` lets everything
    # through — handy for local dev but dangerous in prod if a sed
    # mistake wipes the .env secrets. With the default ``False`` that
    # combination raises 503 at request time instead, so the leak is
    # loud. Set to ``True`` only in dev / CI.
    allow_unauthenticated: bool = False

    # JWT auth (optional; enables X-Authorization-Token and
    # Authorization: Bearer alongside X-API-Key). Must match the signing
    # secret/algorithm of whatever service issues the tokens.
    jwt_secret: SecretStr = SecretStr("")
    # Restricted to go-atlas's supported set; `none` and other algos are
    # rejected at load time.
    jwt_algorithm: Literal["HS256", "HS384", "HS512"] = "HS256"
    # When set, tokens must carry a matching `iss` claim. When empty, the
    # issuer is not verified — tokens with any (or no) issuer are
    # accepted.
    jwt_issuer: str = ""

    # ------------------------------------------------------------------
    # RAG pipeline toggles
    # ------------------------------------------------------------------
    # Query rewrite: ``eval/reports/baseline-2026-04-22.json`` showed the
    # LLM rewriter drops CJK hit@1 from 0.70 → 0.55 on real
    # philosophy-text queries; 5/20 queries miss entirely with rewrite
    # on. The knob stays for opt-in experimentation (short queries /
    # English traffic may behave differently), but the default is off.
    enable_query_rewrite: bool = False
    enable_reranker: bool = False
    reranker_base_url: str = ""

    # ------------------------------------------------------------------
    # Observability
    # ------------------------------------------------------------------
    # Port the worker process binds for the prometheus_client stdlib HTTP
    # server. The API exposes /metrics on the main uvicorn port via
    # fastapi-instrumentator, so this only applies to worker pods. It is
    # intentionally different from the default API port (8001 in
    # scripts/deploy-host.sh) so host-mode deploys don't collide —
    # override in K8s if 9001 is needed elsewhere.
    worker_metrics_port: int = 9001

    # Redis URL for distributed state (rate-limit token buckets today;
    # shared circuit-breaker state / dedupe caches as future use cases).
    # Typed SecretStr so password-bearing URLs
    # (``redis://:pw@host:6379/0``) don't show up in debug dumps or error
    # traces.
    redis_url: SecretStr = SecretStr("redis://localhost:6379/0")

    # ------------------------------------------------------------------
    # Worker runtime limits
    # ------------------------------------------------------------------
    # RSS self-kill threshold in bytes. When a worker's resident memory
    # exceeds this between tasks, it exits cleanly and K8s restarts it.
    # 0 disables the check (dev default — no /proc on macOS anyway).
    # Production recommendation: 2 GiB, leaving headroom under a 4 GiB
    # container limit to absorb one more large task before the OOM-killer
    # hits.
    worker_rss_limit_bytes: int = 0

    # Comma-separated list of ``task_type`` values this worker instance
    # will consume from the queue. Empty = accept all task types
    # (default). Drives multi-deployment sharding: e.g.
    # ``WORKER_QUEUES=url_render`` in one K8s Deployment, everything-else
    # in another — same image, same code, different env.
    worker_queues: str = ""


@lru_cache
def get_settings() -> Settings:
    """Return the shared ``Settings`` instance.

    The result is memoized by ``lru_cache``: the first call constructs
    ``Settings()`` and later calls return that same cached object.
    """
    settings = Settings()
    return settings