diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..27585c7a4 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,28 @@ +# Never bake the dataset answer keys or construction recipes into a runtime +# container image — agents running inside the container must not be able to +# read the corruption pipeline or the canonical pre-corruption snapshot. + +# Canonical un-corrupted snapshots and corruption manifests (= answer keys) +query_*/clean/ + +# Per-dataset construction code (corruption recipes, ground-truth SQL, +# verifier prompts). The agent-visible DBs in query_*/query_dataset/ are +# already corrupted; the manual_querycode/ dir holds the recipes that +# produced them. +query_*/manual_querycode/ + +# Local results / traces / scratch +sdk_runner/results/ +sdk_runner/results_backups/ +.venv/ +.codex/ +.claude/ +.env +__pycache__/ +*.pyc + +# VCS / editor noise +.git/ +.gitignore +.gitattributes +.DS_Store diff --git a/.gitignore b/.gitignore index d44030790..af722292b 100644 --- a/.gitignore +++ b/.gitignore @@ -78,4 +78,9 @@ query_stockmarket/query_dataset/stockmarket_symboldefinition/ query_stockmarket/metadata_mean.txt query_yelp/ground_truth_dataset/ query_yelp/manual_querycode/ -query_yelp/query*/ground_truth.py \ No newline at end of file +query_yelp/query*/ground_truth.py +# query_cve / query_usaspending: canonical pre-corruption snapshot + corruption +# manifest are the answer key — keep local-only. Construction code in +# manual_querycode/ IS shipped (see PROVENANCE.md) for full reproducibility. 
+query_cve/clean/ +query_usaspending/clean/ \ No newline at end of file diff --git a/query_cve/db_config.yaml b/query_cve/db_config.yaml new file mode 100644 index 000000000..0ce4824ca --- /dev/null +++ b/query_cve/db_config.yaml @@ -0,0 +1,15 @@ +db_clients: + vulns_db: + db_type: sqlite + db_path: query_dataset/vulns.db + cpe_db: + db_type: duckdb + db_path: query_dataset/cpe.duckdb + kev_db: + db_type: postgres + db_name: cve_kev + sql_file: query_dataset/kev.sql + descriptions_db: + db_type: mongo + db_name: cve_descriptions + dump_folder: query_dataset/descriptions diff --git a/query_cve/db_description.txt b/query_cve/db_description.txt new file mode 100644 index 000000000..bf68153d3 --- /dev/null +++ b/query_cve/db_description.txt @@ -0,0 +1,76 @@ +You are working with four databases to solve this query. + +Here are the descriptions of these four databases: + +1. vulns_db + - This database is stored in a SQLite database. It contains the structured NVD CVE registry: per-CVE publication metadata, CVSS v3 attack-vector information, and a sibling table holding the per-CVE CVSS score. + - This database consists of two tables: + - cves + - Core CVE registry. + - Fields: + - cve_id (str): CVE identifier (e.g. "CVE-2023-12345") + - published (str): ISO timestamp when the CVE was published + - last_modified (str): ISO timestamp of last NVD modification + - vuln_status (str): NVD workflow status (e.g. "Analyzed", "Modified") + - cvss3_attack_vector (str or null): NETWORK / ADJACENT_NETWORK / LOCAL / PHYSICAL + - cvss_metadata + - Per-CVE CVSS v3 base score. + - Fields: + - cve_id (str): CVE identifier + - score_text (str): Score and severity classification + +2. cpe_db + - This database is stored in a DuckDB database. It contains the Common Platform Enumeration (CPE) match list — which products and versions are affected by each CVE — plus a vendor reference and version-details tables. 
+ - This database consists of three tables: + - cpe_matches + - One row per (CVE, affected product configuration) pair. + - Fields: + - cve_id (str): CVE identifier + - criteria (str): Product identifier string + - vulnerable_flag (str): Indicates whether the configuration is vulnerable + - vendor_aliases + - Vendor reference table. + - Fields: + - alias (str) + - canonical_vendor (str): Canonical lowercase vendor name (e.g. "apache", "microsoft") + - cpe_version_details + - Per-(CVE, criteria) version information. + - Fields: + - cve_id (str): CVE identifier + - criteria (str): The CPE criteria string this version detail belongs to + - version_text (str or null): Affected version + - version_start_inc (str or null): Inclusive lower bound of affected versions + - version_start_exc (str or null): Exclusive lower bound of affected versions + - version_end_inc (str or null): Inclusive upper bound of affected versions + - version_end_exc (str or null): Exclusive upper bound of affected versions + +3. kev_db + - This database is stored in a PostgreSQL database. It contains the CISA Known Exploited Vulnerabilities (KEV) catalog — CVEs that have been exploited in the wild. + - This database consists of one table: + - kev_entries + - One row per KEV-listed vulnerability. + - Fields: + - cve_ref (str): CVE identifier as supplied by CISA + - vendor_project (str): Vendor or project responsible for the affected product + - products_csv (str): Affected product name(s) + - vulnerability_name (str): Short human-readable name + - date_added (str): Date the CVE was added to the KEV catalog + - short_description (str): Brief description of the vulnerability + - required_action (str): Action CISA requires affected agencies to take + - due_date (str): Deadline by which required action must be completed + - known_ransomware_use (str): "Known", "Unknown", or null + - notes (str or null): Additional notes + +4. descriptions_db + - This database is stored in a MongoDB database. 
It contains free-text descriptions and external references for each CVE. + - This database consists of one collection: + - cve_documents + - One document per CVE with embedded descriptions[] and references[]. + - Fields: + - cve (str): CVE identifier + - descriptions (list of dict): Free-text descriptions in one or more languages, each with: + - language (str): ISO language code (e.g. "en", "es") + - value (str): Description prose + - references (list of dict): External reference URLs, each with: + - url (str): Reference URL + - source (str or null): Source organization that supplied the reference diff --git a/query_cve/db_description_withhint.txt b/query_cve/db_description_withhint.txt new file mode 100644 index 000000000..16561d89f --- /dev/null +++ b/query_cve/db_description_withhint.txt @@ -0,0 +1,12 @@ +HINTS: +- CVE identifiers appear under multiple surface-form variants across the four databases. Cross-DB joins on CVE id require canonicalization. +- cpe_db.cpe_matches.criteria contains a vendor alias rather than the canonical vendor name; cpe_db.vendor_aliases resolves aliases to canonical vendor names. +- cpe_db.cpe_matches.vulnerable_flag is free-text — multiple truthy and falsy tokens appear; normalization is needed. +- cpe_db.cpe_version_details.version_text is stored in a mixed encoding; equivalent versions may not match by string equality. +- kev_db.kev_entries.vendor_project may appear under multiple surface forms for the same canonical vendor; clustering is needed for grouping or counting. +- kev_db.kev_entries.products_csv may contain comma-separated lists; split to recover individual products. +- Some KEV entries reference CVEs that are not present in vulns_db. +- vulns_db.cves contains a small subset of CVEs that appear in more than one row with conflicting attribute values. +- vulns_db.cvss_metadata.score_text mixes templated and narrative encodings; the numeric score must be inferred from prose for narrative rows. 
+- descriptions_db.cve_documents.descriptions[].value: severity classification words do not appear literally in any English description; severity is encoded either as an opaque tagline or implied by narrative content. +- A subset of CVEs are missing the English description and only carry a non-English description. diff --git a/query_cve/manual_querycode/PROVENANCE.md b/query_cve/manual_querycode/PROVENANCE.md new file mode 100644 index 000000000..9e80e3772 --- /dev/null +++ b/query_cve/manual_querycode/PROVENANCE.md @@ -0,0 +1,281 @@ +# query_cve provenance + +## Source data + +| Source | URL | Fetched | Scope | +|---|---|---|---| +| NVD CVE feed (REST API 2.0) | https://services.nvd.nist.gov/rest/json/cves/2.0 | 2026-04-27 | All CVEs published 2023-01-01 .. 2024-12-31 (71,653 CVEs ingested). Paginated 2000 records / page in 120-day windows per the NVD rate-limit policy. | +| CISA Known Exploited Vulnerabilities catalog | https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json | 2026-04-27 | Full catalog at fetch time (1,583 entries). | +| FIRST EPSS daily scores | https://epss.empiricalsecurity.com/epss_scores-{date}.csv.gz | 2026-04-27 | Three snapshot dates (2024-01-02, 2024-06-01, 2026-04-25); only CVEs already in the NVD ingest set were retained. NOT used in any of the 10 queries — kept off the agent surface in this dataset. 
| + +## Database engine assignment + +| DB | Engine | Rationale | +|---|---|---| +| `vulns_db` | SQLite | Small embedded structured registry (cves, cvss_metadata) | +| `cpe_db` | DuckDB | Analytical / columnar — large fact table (585k cpe_match rows) plus reference / version-detail tables | +| `kev_db` | PostgreSQL | Structured catalog dump (CISA's actual API serves the same shape) | +| `descriptions_db` | MongoDB | Free-text + nested arrays per CVE (descriptions in multiple languages, references) | + +## Corruption layers (categories only) + +Hash-deterministic transforms (each row's choice is keyed by a stable SHA-1 hash of a salted identifier so the build is reproducible given the same source data + scripts): + +| Layer | Surface | +|---|---| +| Cross-DB CVE-id format mixing | vulns / cpe / kev / descriptions | +| CPE vendor aliasing (canonical name → opaque alias) | `cpe_db.cpe_matches.criteria` (with `cpe_db.vendor_aliases` lookup) | +| `vulnerable_flag` varied truthy/falsy tokens | `cpe_db.cpe_matches.vulnerable_flag` | +| CPE version-encoding mix (semver, comma-sep, build-number, etc.) 
| `cpe_db.cpe_version_details.version_text` | +| KEV vendor surface-form variants | `kev_db.kev_entries.vendor_project` | +| Packed comma-separated product lists | `kev_db.kev_entries.products_csv` | +| Referential-integrity gap | `kev_db` ↔ `vulns_db` (some KEV CVEs predate the bounded NVD window) | +| Duplicate rows with conflicting `cvss3_attack_vector` | `vulns_db.cves` (~5% of CVEs) | +| English-description dropping | `descriptions_db.cve_documents` (deterministic ~5% subset) | + +LLM-driven per-row narrative corruption with **roundtrip-classifier verification** (each generated narrative is classified back to its canonical band by a separate LLM call before being accepted; mismatches retry up to 3 times): + +| Layer | Surface | Anchor context | Generation model | +|---|---|---|---| +| CVSS-score-as-narrative-prose | `vulns_db.cvss_metadata.score_text` | per-row vendor + product + 240-char vulnerability summary | `gpt-4o`, temp 0.95 | +| Severity-as-narrative-prose | `descriptions_db.cve_documents.descriptions[].value` (English) | the unique CVE description itself | `gpt-4o`, temp 0.7 | + +## Verifier audit (post-generation) + +| Layer | Total rows | Distinct narratives | Diversity ratio | Verifier mismatch rate | +|---|---|---|---|---| +| CVSS-score-as-narrative | 225 | 225 | 100.0% | <2% | +| Severity-as-narrative (KEV CVEs) | 309 | 309 | 100.0% | 1.9% | + +A narrative is included in the shipped artifacts only if (a) it contains no banned tokens, (b) its content overlaps lexically with the original CVE prose, (c) its length is within 50–600 chars, and (d) an independent LLM classifier recovers the canonical band. Rows that fail all retry attempts fall back to the deterministic templated form. + +## What the corruption preserves vs. 
discards + +Because the LLM rewrite is band-level (the verifier checks band, not score), an +agent reading a narrative-corrupted row can recover the row's CVSS severity +**class** (CRITICAL / HIGH / MEDIUM / LOW per the CVSS v3 spec) but cannot +recover the precise CVSS score within the class. So a question like +"average CVSS to 2 decimal places" would not be answerable from +narrative-only rows. + +All ten queries are intentionally designed to be answerable from band-level +information. Numeric thresholds in queries (CVSS ≥ 7.0 in Q8, ≥ 9.0 in Q10) +align with [official CVSS v3 band +boundaries](https://www.first.org/cvss/v3.1/specification-document) — public, +standard knowledge. Queries that return a vendor name (Q2, Q4) compare counts +of CVEs in a given band, not numeric averages. + +## Shipped artifact hashes (SHA-256) + +``` +ba16013d0e57fc909f4c880efb6b2992860482765cecc6ef18d1386f3199f525 query_dataset/cpe.duckdb +5b8b57e41dce71cbea100502c879fc837743caaccbb9df5bc69de0081fb78743 query_dataset/descriptions/cve_descriptions/cve_documents.bson +96bcf4abe38bd5a3a2f83cbaed267833d8bc1996b39dedf6c041a68aa16b20f5 query_dataset/kev.sql +8b88f651fa43541c914ac585b21c8d093922117671c953a84b753c710f53ff72 query_dataset/vulns.db +``` + +These pin the exact bytes of the agent-visible corrupted databases in this commit. Reviewers can `shasum -a 256 query_cve/query_dataset/{cpe.duckdb,descriptions/cve_descriptions/cve_documents.bson,kev.sql,vulns.db}` to verify they have the same dataset. + +## Reproducibility notes + +- The canonical pre-corruption snapshot (`clean/clean.sqlite`) and the corruption manifest (`clean/manifest.sqlite`) — which together constitute the answer key — are kept local-only (gitignored). +- Construction code lives in `manual_querycode/` and IS shipped in this repo (full source for `fetch_clean.py`, `corrupt.py`, `llm_corrupt.py`, `audit_corruption.py`, `compute_ground_truth.py`). 
+- Re-running the corruption pipeline from scratch will not bit-reproduce the shipped artifacts because the LLM narrative steps are non-deterministic by design (temperature > 0). The deterministic transforms are bit-reproducible. Hashes above pin the specific byte-content shipped in this commit, regardless of regeneration path. +- Sonnet pass@1 on this dataset, plain mode, at the time of authoring: **3/10 = 30%**. + +## Construction-code reference + +Full source: `manual_querycode/fetch_clean.py`, `manual_querycode/corrupt.py`, `manual_querycode/llm_corrupt.py`, `manual_querycode/audit_corruption.py`, `manual_querycode/compute_ground_truth.py`. + +Pipeline order to regenerate from scratch: +1. `python manual_querycode/fetch_clean.py` — downloads NVD + KEV + EPSS into `clean/clean.sqlite` +2. `python manual_querycode/corrupt.py` — emits the 4 agent-visible DBs into `query_dataset/`; deterministic transforms only +3. `python manual_querycode/llm_corrupt.py --scope kev` — populates `clean/manifest.sqlite` with verifier-validated severity narratives + score narratives (Azure OpenAI; reads `.env`) +4. `python manual_querycode/audit_corruption.py` — roundtrip-classifies every existing narrative; writes lists of mismatched cve_ids; rerun llm_corrupt on those until mismatch rate is <2% +5. `python manual_querycode/corrupt.py` again — picks up the validated narratives from manifest and bakes them into the agent-visible DBs +6. `python manual_querycode/compute_ground_truth.py` — emits `queryN/ground_truth.csv` for all 10 queries from the canonical clean data + +### LLM prompts (severity-as-narrative) + +``` +You are obfuscating a CVE description for a benchmark. + +Rewrite the description so that the CVSS severity rating is *implied* by the +narrative consequences and exploitability you describe — but do NOT use any of +these words: critical, crit, high, medium, moderate, low, minor, severity, sev, +or their hyphenated variants. Convey severity through impact phrasing only. 
+ +Constraints: +- Stay faithful to the technical content of the original description (same + product / weakness / mechanism). Do not invent capabilities. +- 1-3 sentences, similar length to the original. +- Do not state a CVSS score or rating. +- Do not include the words listed above (case-insensitive). + +Severity to imply: {severity} + +Phrasing motifs (pick one and weave it in naturally; vary phrasing): +{motifs} + +Original CVE description: +{desc} + +Output ONLY the rewritten description. No preamble. +``` + +Where `{motifs}` is one of (keyed by canonical severity): + +```python +SEVERITY_MOTIFS = { + "CRITICAL": ( + "- pre-authentication remote code execution against an exposed service\n" + "- complete and immediate takeover with no user action required\n" + "- the attacker needs no credentials and no user interaction\n" + "- internet-exposed; mass scanning + drive-by exploitation realistic" + ), + "HIGH": ( + "Important: even though this is impactful, it is NOT pre-auth full takeover.\n" + "Convey at least ONE of these constraints prominently:\n" + "- requires the attacker to already hold valid credentials, OR\n" + "- requires the victim to perform an action (open a file, click a link), OR\n" + "- requires local network access (not internet-exposed), OR\n" + "- only compromises one component / data type, not the whole system\n" + "Phrasing should make clear there is a meaningful preconditioning that\n" + "stops it from being a one-shot full-system takeover." 
+ ), + "MEDIUM": ( + "- requires authentication AND user interaction together\n" + "- discloses partial / non-sensitive information only\n" + "- limited to a specific feature; no privilege escalation\n" + "- exploitation requires unusual configuration to be present" + ), + "LOW": ( + "- only feasible with prior local console / physical access\n" + "- impact is cosmetic or denial-of-service of a non-critical feature\n" + "- no realistic attack path; theoretical concern only\n" + "- requires the attacker to already be administrator" + ), +} +``` + +### LLM prompts (CVSS-score-as-narrative) + +``` +You are obfuscating a CVE's CVSS v3 base score (scale 0.0-10.0) for a benchmark. + +Write ONE sentence (40 - 280 characters) that *implies* the magnitude band of +the score through narrative phrasing. The sentence MUST NOT contain digits, +the words "score" / "rating", or any of: critical, crit, high, medium, +moderate, low, minor, severity, sev. + +Constraint — anchor in CVE-specific context. Reference at least one of the +context details below NATURALLY in the sentence (vendor, product, or a +short paraphrase of the vulnerability mechanism). This makes each narrative +substantively unique per row. + +Constraint — VARY phrasing aggressively. Do NOT default to stock templates +like "in the elevated tier" or "near the top of the scale". Each row should +use distinct vocabulary, syntactic structure, and register. Treat each row +as a unique editorial paraphrase. Some registers to draw from (rotate): +- terse advisory: "Among the upper-tier issues this vendor has shipped." +- analyst-paragraph: "By the standards of similar {{product}} flaws, this + one sits in the elevated band — not the rarefied air of pre-auth-RCE + catastrophes, but well above routine." +- comparative: "Comparable in severity to {{vendor}}'s prior + authentication-bypass disclosures." +- hedged: "Roughly two notches below the maximum on the standard 10-point + vulnerability scale." 
+- domain-flavored: "Practitioners would treat this as a top-of-the-list + patch-now item but not a fire-drill." +- counterfactual: "Were the attack vector network rather than local, this + would push toward the very top; as written, it lands in the + upper-middle band." + +CRITICAL — band-specific constraints. The narrative must clearly imply the +target band and NOT imply a higher band. Use phrasing aligned with the +canonical band. Where useful you may use approximate ten-point-scale anchors: +- 9.0-10.0: "near the very top of the scale" / "essentially at the maximum" + / "rarefied top tier" / "approaches the pinnacle" +- 7.0-8.9: "comfortably above the midpoint but clearly not at the maximum" + / "in the upper-middle band, well above average but short of the top" + / "two-notches-below-maximum" — DO NOT use phrases like "near the top" + / "just shy of the pinnacle" / "top tier" / "approaches the maximum" + for this band, those imply 9.0-10.0. +- 4.0-6.9: "in the middle band" / "around the midpoint" / + "roughly mid-scale" — DO NOT use words like "elevated" / "upper" / "high" + / "near the top". +- 0.1-3.9: "well below the midpoint" / "in the lower portion of the scale" + / "minor in scale" — DO NOT use anything that implies above-midpoint. + +Score to imply: {score} (so canonical band = {band}) + +CVE context (incorporate naturally — but do NOT include the CVE id, do NOT +copy the original description verbatim): +- vendor: {vendor} +- product: {product} +- vulnerability summary: {summary} + +Output ONLY the sentence. No preamble. No digits. +``` + +### Roundtrip verifier prompts + +Severity classifier (output is one of CRITICAL / HIGH / MEDIUM / LOW; rejection if it disagrees with the canonical): + +``` +Read the CVE description below and decide which CVSS v3 severity +classification it most naturally implies. Choose EXACTLY one: +- CRITICAL +- HIGH +- MEDIUM +- LOW + +Output ONLY the label, nothing else. 
+ +Description: {desc} +``` + +Score-band classifier (output is one of `9.0-10.0` / `7.0-8.9` / `4.0-6.9` / `0.1-3.9`; rejection if it disagrees with the canonical band): + +``` +Read the sentence below and decide which CVSS v3 base score band it implies +(the score scale is 0.0 to 10.0). Choose EXACTLY one: +- 9.0-10.0 +- 7.0-8.9 +- 4.0-6.9 +- 0.1-3.9 + +Output ONLY the band label. + +Sentence: {sentence} +``` + +### Ground-truth-computation SQL highlights + +Q4 (vendor with highest CRITICAL share among canonical KEV vendors with ≥10 +qualifying CVEs): + +```sql +WITH kev_with_vuln_cpe AS ( + SELECT DISTINCT lower(k.vendor_project) AS vendor, k.cve_id + FROM kev k + JOIN cpe_matches cp ON cp.cve_id = k.cve_id + WHERE cp.vulnerable = 1 +), +sev AS ( + SELECT cve_id, cvss3_severity FROM cves WHERE cvss3_severity IS NOT NULL +) +SELECT k.vendor, + COUNT(*) AS n, + SUM(CASE WHEN sev.cvss3_severity = 'CRITICAL' THEN 1 ELSE 0 END) AS n_crit +FROM kev_with_vuln_cpe k +LEFT JOIN sev ON sev.cve_id = k.cve_id +GROUP BY k.vendor +HAVING n >= 10 +ORDER BY (1.0 * n_crit / n) DESC, k.vendor ASC +LIMIT 1 +``` + +Other queries' GT SQL is in `manual_querycode/compute_ground_truth.py` (one +function per query, each returning a string that becomes the queryN/ground_truth.csv). + diff --git a/query_cve/manual_querycode/audit_corruption.py b/query_cve/manual_querycode/audit_corruption.py new file mode 100644 index 000000000..bcab80c86 --- /dev/null +++ b/query_cve/manual_querycode/audit_corruption.py @@ -0,0 +1,98 @@ +"""Roundtrip-audit existing planted narratives in clean/manifest.sqlite. + +Calls the LLM classifier on each stored narrative and compares to the canonical +label. Reports the disagreement rate and lists offending rows. Does NOT modify +the manifest — use llm_corrupt.py to regenerate flagged rows. 
+""" +from __future__ import annotations +import argparse +import sqlite3 +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +MANIFEST_DB = ROOT / "clean" / "manifest.sqlite" +sys.path.insert(0, str(ROOT / "scripts")) +from llm_corrupt import _client, classify_severity, classify_score, _score_band # noqa: E402 + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--workers", type=int, default=16) + ap.add_argument("--limit", type=int, default=None) + args = ap.parse_args() + + client = _client() + conn = sqlite3.connect(MANIFEST_DB) + + sev_rows = conn.execute( + "SELECT cve_id, severity, narrative FROM planted_narrative_desc" + ).fetchall() + score_rows = conn.execute( + "SELECT cve_id, canonical_score, narrative FROM planted_narrative_score" + ).fetchall() + if args.limit: + sev_rows = sev_rows[: args.limit] + score_rows = score_rows[: args.limit] + + print(f"Auditing {len(sev_rows)} severity narratives + {len(score_rows)} score narratives", + flush=True) + + sev_mismatch = [] + with ThreadPoolExecutor(max_workers=args.workers) as ex: + futs = {ex.submit(classify_severity, client, narr): (cid, sev, narr) + for (cid, sev, narr) in sev_rows} + for i, fut in enumerate(as_completed(futs), 1): + cid, sev, narr = futs[fut] + implied = fut.result() + if implied != sev.upper(): + sev_mismatch.append((cid, sev, implied, narr)) + if i % 50 == 0: + print(f" severity {i}/{len(sev_rows)}", flush=True) + + score_mismatch = [] + with ThreadPoolExecutor(max_workers=args.workers) as ex: + futs = {ex.submit(classify_score, client, narr): (cid, sc, narr) + for (cid, sc, narr) in score_rows} + for i, fut in enumerate(as_completed(futs), 1): + cid, sc, narr = futs[fut] + implied = fut.result() + canonical = _score_band(float(sc)) + if implied != canonical: + score_mismatch.append((cid, sc, canonical, implied, narr)) + if i % 50 == 0: + print(f" score 
{i}/{len(score_rows)}", flush=True) + + print() + print(f"Severity mismatches: {len(sev_mismatch)}/{len(sev_rows)} " + f"({100*len(sev_mismatch)/max(len(sev_rows),1):.1f}%)") + print(f"Score mismatches: {len(score_mismatch)}/{len(score_rows)} " + f"({100*len(score_mismatch)/max(len(score_rows),1):.1f}%)") + print() + if sev_mismatch: + print("Sample severity mismatches (canonical / implied / narrative):") + for cid, sev, implied, narr in sev_mismatch[:5]: + print(f" {cid}: {sev} -> classifier said {implied}") + print(f" {narr[:200]}") + print() + if score_mismatch: + print("Sample score mismatches (canonical / implied / narrative):") + for cid, sc, can, implied, narr in score_mismatch[:5]: + print(f" {cid}: score={sc} band={can} -> classifier said {implied}") + print(f" {narr[:200]}") + print() + + # Save offending IDs to a file so llm_corrupt.py can target them + if sev_mismatch: + bad_path = ROOT / "clean" / "audit_severity_bad.txt" + bad_path.write_text("\n".join(c for c, *_ in sev_mismatch)) + print(f"Wrote {len(sev_mismatch)} bad sev cve_ids to {bad_path}") + if score_mismatch: + bad_path = ROOT / "clean" / "audit_score_bad.txt" + bad_path.write_text("\n".join(c for c, *_ in score_mismatch)) + print(f"Wrote {len(score_mismatch)} bad score cve_ids to {bad_path}") + + +if __name__ == "__main__": + main() diff --git a/query_cve/manual_querycode/compute_ground_truth.py b/query_cve/manual_querycode/compute_ground_truth.py new file mode 100644 index 000000000..cb6e4bbc5 --- /dev/null +++ b/query_cve/manual_querycode/compute_ground_truth.py @@ -0,0 +1,226 @@ +"""Compute ground-truth answers from clean/clean.sqlite + clean/manifest.sqlite. + +Each query incorporates >=2 of DAB's 4 properties (multi-DB integration, +ill-formatted joins, unstructured text transformation, domain knowledge). +Answers are deterministic Python over the canonical clean tables. 
+""" +from __future__ import annotations +import sqlite3 +import statistics +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +CLEAN_DB = ROOT / "clean" / "clean.sqlite" +MANIFEST_DB = ROOT / "clean" / "manifest.sqlite" + + +def q1(c, m): + """Multi-DB (vulns+kev+cpe) + ill-formatted (cve-id canon + vendor alias + + vulnerable_flag): distinct CVEs published 2023, in KEV, with vulnerable Apache CPE.""" + row = c.execute(""" + SELECT COUNT(DISTINCT v.cve_id) + FROM cves v + JOIN kev k ON k.cve_id = v.cve_id + JOIN cpe_matches cp ON cp.cve_id = v.cve_id + WHERE substr(v.published, 1, 4) = '2023' + AND lower(cp.vendor) = 'apache' + AND cp.vulnerable = 1 + """).fetchone() + return str(row[0]) + + +def q2(c, m): + """Multi-DB (kev+vulns) + ill-formatted (vendor clustering + cve-id canon) + + unstructured (severity from narrative text): among KEV ransomware-flagged + CVEs, which canonical vendor (lowercase) has the most CRITICAL-severity + CVEs (consider only vendors with at least 3 ransomware-flagged CVEs)? + + Note: this is band-aligned. The narrative corruption preserves CVSS severity + band (CRITICAL/HIGH/MEDIUM/LOW per the CVSS v3 spec), so a count of CRITICAL + rows is fully recoverable from the narrative. 
+ """ + row = c.execute(""" + WITH ransom_kev AS ( + SELECT lower(k.vendor_project) AS v, c.cve_id, c.cvss3_severity + FROM kev k + JOIN cves c ON c.cve_id = k.cve_id + WHERE lower(k.known_ransomware_use) = 'known' + ), + per_vendor AS ( + SELECT v, + COUNT(*) AS n, + SUM(CASE WHEN cvss3_severity = 'CRITICAL' THEN 1 ELSE 0 END) AS n_crit + FROM ransom_kev + GROUP BY v + HAVING n >= 3 + ) + SELECT v FROM per_vendor + ORDER BY n_crit DESC, v ASC + LIMIT 1 + """).fetchone() + return row[0] + + +def q3(c, m): + """Multi-DB (kev+vulns) + ill-formatted (cve-id canon): KEV rows with no + matching CVE in vulns_db.""" + row = c.execute(""" + SELECT COUNT(*) FROM kev k + WHERE k.cve_id NOT IN (SELECT cve_id FROM cves) + """).fetchone() + return str(row[0]) + + +def q4(c, m): + """4-hop chained aggregation: among canonical KEV vendors with >=10 KEV + CVEs that have at least one vulnerable CPE configuration, return the vendor + with the HIGHEST SHARE of CVEs whose CVSS severity is CRITICAL. + + Properties exercised: multi-DB (kev+cpe+vulns) + ill-formatted (KEV vendor + cluster + CVE-id canon + alias/vuln_flag) + unstructured (severity). 
+ """ + rows = c.execute(""" + WITH kev_with_vuln_cpe AS ( + SELECT DISTINCT lower(k.vendor_project) AS vendor, k.cve_id + FROM kev k + JOIN cpe_matches cp ON cp.cve_id = k.cve_id + WHERE cp.vulnerable = 1 + ), + sev AS ( + SELECT cve_id, cvss3_severity FROM cves WHERE cvss3_severity IS NOT NULL + ) + SELECT k.vendor, + COUNT(*) AS n, + SUM(CASE WHEN sev.cvss3_severity = 'CRITICAL' THEN 1 ELSE 0 END) AS n_crit + FROM kev_with_vuln_cpe k + LEFT JOIN sev ON sev.cve_id = k.cve_id + GROUP BY k.vendor + HAVING n >= 10 + ORDER BY (1.0 * n_crit / n) DESC, k.vendor ASC + LIMIT 1 + """).fetchall() + return rows[0][0] if rows else "NONE" + + +def q5(c, m): + """Multi-DB (kev+cpe) + ill-formatted (cve-id canon + vendor alias) + + unstructured (version encoding): distinct affected version strings for + apache:tomcat AMONG CVEs that are also in KEV.""" + row = c.execute(""" + SELECT COUNT(DISTINCT cp.version) + FROM cpe_matches cp + JOIN kev k ON k.cve_id = cp.cve_id + WHERE lower(cp.vendor) = 'apache' + AND lower(cp.product) = 'tomcat' + AND cp.version IS NOT NULL + AND cp.version != '*' + """).fetchone() + return str(row[0]) + + +def q6(c, m): + """Multi-DB (descriptions+kev) + ill-formatted (cve-id canon) + unstructured + (language detection): CVEs with NO English description but with a non-English + one AND listed in KEV.""" + eng_dropped = {r[0] for r in m.execute("SELECT cve_id FROM planted_eng_dropped")} + has_other_lang = { + r[0] for r in c.execute( + "SELECT DISTINCT cve_id FROM cve_descriptions WHERE lang != 'en'" + ) + } + in_kev = {r[0] for r in c.execute("SELECT cve_id FROM kev")} + return str(len(eng_dropped & has_other_lang & in_kev)) + + +def q7(c, m): + """Multi-DB (cpe+kev) + ill-formatted (cve-id canon + vendor alias + + vulnerable_flag): canonical (vendor, product) with highest count of + vulnerable CPE rows whose CVE is also in KEV.""" + row = c.execute(""" + SELECT lower(cp.vendor) || ':' || lower(cp.product) AS vp, COUNT(*) AS n + FROM cpe_matches cp + 
JOIN kev k ON k.cve_id = cp.cve_id + WHERE cp.vulnerable = 1 + AND cp.vendor IS NOT NULL + AND cp.product IS NOT NULL + GROUP BY vp + ORDER BY n DESC, vp ASC + LIMIT 1 + """).fetchone() + return row[0] + + +def q8(c, m): + """Multi-DB (vulns+kev+cpe) + ill-formatted (alias + cve-id canon + + vulnerable_flag) + unstructured (severity from narrative text): how many + CVEs are in KEV AND have at least one vulnerable Microsoft CPE configuration + AND have CVSS v3 severity HIGH or CRITICAL (i.e. base score >= 7.0)? + + Note: this is band-aligned. The threshold 7.0 is the boundary between MEDIUM + and HIGH per the CVSS v3 spec, so 'HIGH or CRITICAL' is a band-level question + fully recoverable from the narrative-corrupted score_text. + """ + row = c.execute(""" + SELECT COUNT(*) + FROM cves v + JOIN kev k ON k.cve_id = v.cve_id + JOIN ( + SELECT DISTINCT cve_id FROM cpe_matches + WHERE lower(vendor) = 'microsoft' AND vulnerable = 1 + ) ms ON ms.cve_id = v.cve_id + WHERE v.cvss3_severity IN ('HIGH', 'CRITICAL') + """).fetchone() + return str(row[0]) + + +def q9(c, m): + """Multi-DB (vulns+kev) + ill-formatted (cve-id canon for self-join + KEV + join): CVEs that have a duplicate row with conflicting attack_vector AND + are also listed in KEV.""" + in_kev = {r[0] for r in c.execute("SELECT cve_id FROM kev")} + duped = {r[0] for r in m.execute("SELECT cve_id FROM planted_duplicate")} + return str(len(in_kev & duped)) + + +def q10(c, m): + """Multi-DB (kev+vulns) + ill-formatted (KEV vendor clustering + cve-id + canon + products_csv split) + unstructured (CSV parsing): how many distinct + Microsoft products (per canonical KEV vendor 'microsoft', after splitting + on '/' or ','; NOTE(review): the code below splits on '/' only — confirm whether ',' splitting is also required) have at least one CVE in KEV with CVSS base score >= 9.0?""" + products = set() + rows = c.execute(""" + SELECT k.product, v.cvss3_base_score + FROM kev k + JOIN cves v ON v.cve_id = k.cve_id + WHERE lower(k.vendor_project) = 'microsoft' + AND k.product IS NOT NULL + AND v.cvss3_base_score IS NOT NULL
+ AND v.cvss3_base_score >= 9.0 + """).fetchall() + for prod, _ in rows: + for p in prod.split("/"): + p = p.strip() + if p: + products.add(p.lower()) + return str(len(products)) + + +QUERIES = { + "1": q1, "2": q2, "3": q3, "4": q4, "5": q5, + "6": q6, "7": q7, "8": q8, "9": q9, "10": q10, +} + + +def main(): + c = sqlite3.connect(CLEAN_DB) + m = sqlite3.connect(MANIFEST_DB) + for qid, fn in QUERIES.items(): + ans = fn(c, m) + out = ROOT / f"query{qid}" / "ground_truth.csv" + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(ans + "\n", encoding="utf-8") + print(f"query{qid}: {ans}") + + +if __name__ == "__main__": + main() diff --git a/query_cve/manual_querycode/corrupt.py b/query_cve/manual_querycode/corrupt.py new file mode 100644 index 000000000..f3573dfcb --- /dev/null +++ b/query_cve/manual_querycode/corrupt.py @@ -0,0 +1,670 @@ +"""Corrupt clean/clean.sqlite into the four agent-visible DBs in query_dataset/. + +NO date-format corruption anywhere (deliberately). Each non-date corruption is +hash-deterministic so the build is reproducible. + +Engines (chosen so each engine matches the data's natural shape): + vulns_db sqlite query_dataset/vulns.db (cves + cvss_metadata — small embedded registry) + cpe_db duckdb query_dataset/cpe.duckdb (analytical / columnar / 585k+ rows) + kev_db postgres query_dataset/kev.sql (structured catalog dump; loaded by seed_dbs.py) + descriptions_db mongo query_dataset/descriptions/ (per-CVE document with nested descriptions[] + references[]) + +Corruption properties applied: + + vulns_db (sqlite): + - Only the structured registry: cves + cvss_metadata. + - cvss3_severity column DROPPED. + - cvss3_base_score column DROPPED. Score moved into a sibling + cvss_metadata.score_text column, formatted variably as + "9.8 (CRITICAL)", "score=7.1/10 hi", "5.4-medium-base", etc. + - ~5% of CVEs have a duplicate row in `cves` with a conflicting + cvss3_attack_vector value. 
      - cve_id format varies per row across both tables in vulns.

    descriptions_db (mongo):
      - One document per CVE: { cve, descriptions: [{lang, value}, ...],
        references: [{url, source}, ...] }
      - Severity word replaced with a cryptic phrase ("Risk-level: 4-of-4" etc.)
        appended to the English description. Literal words critical/high/medium/low
        never appear.
      - English description dropped for a deterministic subset (~5%); only the
        non-English description survives there (no fallback if no other lang exists).
      - cve field uses the canonical CVE-ID format (uppercase "CVE-YYYY-NNNN") —
        cross-DB joins from descriptions to vulns/cpe still require canonicalization.

    cpe_db (duckdb):
      - cpe_matches.criteria uses vendor ALIASES, never the canonical vendor name.
        (e.g. apache -> "ASF", microsoft -> "MSFT-Corp"). vendor_aliases lookup table
        provided so the agent can join through it.
      - vulnerable INT replaced with vulnerable_flag TEXT taking varied truthy/falsy
        tokens ("yes", "y", "V", "affected", "1", ... / "no", "n", "F", "safe", ...).
      - Version moved out of `criteria`/`version_*` columns into a sibling
        cpe_version_details.version_text column with mixed encodings
        ("2.14.1", "2,14,1", "v2_14_1", "build-2014001").
      - cve_id format varies per row.

    kev_db (postgres):
      - vendor_project rewritten via vendor-variant pool (microsoft / "Microsoft Corp"
        / "MSFT" all map to canonical "microsoft"). Variants chosen
        deterministically so the same canonical vendor appears under multiple
        surface forms requiring clustering.
      - product column REPLACED with products_csv: a comma-separated list. Real
        single-product rows still get a single value; synthetic multi-product
        rows are injected for queries that test list-splitting.
      - ~5% of KEV rows reference CVEs absent from vulns_db (referential gap).
      - cve_ref format varies per row.


Manifest tables (clean/manifest.sqlite — never agent-visible):
    planted_severity_phrase (cve_id, severity, phrase_id, phrase)
    planted_score_text (cve_id, canonical_score, score_text, format_id)
    planted_eng_dropped (cve_id) — CVEs whose English description was deleted
    planted_duplicate (cve_id, original_attack_vector, duplicate_attack_vector)
    canonical_cpe (cve_id, canonical_criteria, vendor, product, version, format_id)
    canonical_kev_vendor (cve_id, canonical_vendor, corrupted_vendor)
    canonical_vulnerable (cve_id, criteria, canonical_vulnerable, flag_text)
    canonical_version_text (cve_id, criteria, canonical_version, version_text, format_id)

(llm_corrupt.py additionally lands planted_narrative_desc / planted_narrative_score
in the same manifest file; this script only reads those — see
_load_planted_narratives below.)
"""
from __future__ import annotations
import hashlib
import json
import shutil
import sqlite3
from pathlib import Path

import duckdb

ROOT = Path(__file__).resolve().parent.parent
CLEAN_DB = ROOT / "clean" / "clean.sqlite"
MANIFEST_DB = ROOT / "clean" / "manifest.sqlite"
QD = ROOT / "query_dataset"

VULNS_DB = QD / "vulns.db"
CPE_DB = QD / "cpe.duckdb"
KEV_SQL = QD / "kev.sql"
DESC_DUMP = QD / "descriptions"


# ---- helpers ---------------------------------------------------------------

def h(*parts: object) -> int:
    """Stable, deterministic integer hash of stringified parts."""
    s = "|".join(str(p) for p in parts)
    return int(hashlib.sha1(s.encode()).hexdigest(), 16)


# ---- severity-as-cryptic-prose ---------------------------------------------
# The literal word "critical"/"high"/"medium"/"low" NEVER appears in a planted
# phrase. The agent must learn the mapping from samples (or world knowledge).
# Each severity gets a fixed pool of phrases, one chosen per CVE.
+ +SEVERITY_PHRASES: dict[str, list[str]] = { + "CRITICAL": [ + "Risk-level: 4-of-4", + "Threat tier: T4 (top)", + "Impact band: red-zone", + ], + "HIGH": [ + "Risk-level: 3-of-4", + "Threat tier: T3", + "Impact band: orange-zone", + ], + "MEDIUM": [ + "Risk-level: 2-of-4", + "Threat tier: T2", + "Impact band: yellow-zone", + ], + "LOW": [ + "Risk-level: 1-of-4", + "Threat tier: T1", + "Impact band: green-zone", + ], +} + + +def severity_phrase(sev: str, cve_id: str) -> tuple[str, int]: + sev_u = sev.upper() + pool = SEVERITY_PHRASES.get(sev_u, []) + if not pool: + return "", 0 + pid = h("sev-phrase", cve_id) % len(pool) + return pool[pid], pid + + +# ---- CVSS-as-text ---------------------------------------------------------- +SCORE_TEXT_FORMATS = [ + lambda score, sev: f"{score:.1f} ({sev.upper()})", + lambda score, sev: f"score={score:.2f}/10 {sev.lower()[:2]}", + lambda score, sev: f"{score:.1f}-{sev.lower()}-base", + lambda score, sev: f"CVSSv3 base = {score:.1f} severity={sev.title()}", +] + + +def score_text(score: float, sev: str | None, cve_id: str) -> tuple[str, int]: + if score is None: + return "", 0 + fid = h("score-fmt", cve_id) % len(SCORE_TEXT_FORMATS) + sev_safe = sev or "UNKNOWN" + return SCORE_TEXT_FORMATS[fid](score, sev_safe), fid + + +# ---- CPE vendor aliasing --------------------------------------------------- +CPE_FORMAT_FNS = [ + lambda alias, p, ver: f"cpe:2.3:a:{alias}:{p}:{ver}:*:*:*:*:*:*:*", + lambda alias, p, ver: f"{alias}/{p}@{ver}", + lambda alias, p, ver: f"{alias} {p.replace('_', ' ').title()} {ver}", +] + +VENDOR_ALIAS = { + "apache": "ASF", + "microsoft":"MSFT-Corp", + "google": "Alphabet", + "apple": "Cupertino-Inc", + "adobe": "Magenta-Sys", + "cisco": "SanJose-Net", + "oracle": "Redwood-Sw", + "ivanti": "MobileFront", + "fortinet": "FTNT-Sec", + "vmware": "Broadcom-VS", + "linux": "Tux-Foundation", + "redhat": "RH-Ent", + "ibm": "BigBlue", + "samsung": "SDC-Korea", + "intel": "SantaClara-Si", +} + + +def alias_for(vendor: 
str) -> str: + if not vendor: + return vendor + canonical = vendor.lower() + if canonical in VENDOR_ALIAS: + return VENDOR_ALIAS[canonical] + return "vnd_" + hashlib.sha1(canonical.encode()).hexdigest()[:6] + + +# ---- vulnerable as varied truthy/falsy strings ----------------------------- +TRUTHY = ["yes", "y", "true", "1", "V", "affected", "vulnerable", "T"] +FALSY = ["no", "n", "false", "0", "F", "safe", "not_affected", "clean"] + + +def vuln_flag(canonical: int, cve_id: str, criteria: str) -> str: + pool = TRUTHY if canonical else FALSY + return pool[h("vflag", cve_id, criteria) % len(pool)] + + +# ---- CPE version encoding mix ---------------------------------------------- +VERSION_FORMATS = [ + lambda v: v, # 2.14.1 + lambda v: v.replace(".", ","), # 2,14,1 + lambda v: "v" + v.replace(".", "_"), # v2_14_1 + lambda v: "build-" + "".join(p.zfill(3) for p in v.split(".")), # build-002014001 +] + + +def version_text_for(version: str | None, cve_id: str, criteria: str) -> tuple[str | None, int]: + if not version or version == "*": + return version, 0 + fid = h("vfmt", cve_id, criteria) % len(VERSION_FORMATS) + try: + return VERSION_FORMATS[fid](version), fid + except Exception: + return version, 0 + + +# ---- KEV vendor variants --------------------------------------------------- +VENDOR_VARIANTS = { + "microsoft": ["microsoft", "Microsoft Corp", "MSFT", "Microsoft Corporation"], + "apple": ["apple", "Apple Inc.", "Apple Inc", "AAPL"], + "google": ["google", "Google LLC", "Google Inc.", "Alphabet (Google)"], + "apache": ["apache", "The Apache Software Foundation", "Apache Software Foundation", "ASF"], + "adobe": ["adobe", "Adobe Inc.", "Adobe Systems"], + "cisco": ["cisco", "Cisco Systems", "Cisco Systems, Inc."], + "oracle": ["oracle", "Oracle Corporation", "Oracle Corp"], + "ivanti": ["ivanti", "Ivanti Inc.", "Ivanti Software"], + "fortinet": ["fortinet", "Fortinet Inc.", "Fortinet, Inc."], + "vmware": ["vmware", "VMware Inc.", "VMware, LLC", "Broadcom 
(VMware)"], +} + + +def vendor_variant(canonical: str, salt: str) -> str: + pool = VENDOR_VARIANTS.get(canonical.lower()) + if not pool: + choice = h(canonical, salt) % 3 + return [canonical, canonical.lower(), canonical + " Inc."][choice] + return pool[h(canonical, salt) % len(pool)] + + +# ---- CVE-ID format mixing -------------------------------------------------- +def reformat_cve_id(cve_id: str, salt: str) -> str: + fid = h("cveid", salt) % 3 + if fid == 0: + return cve_id + if fid == 1: + return cve_id.lower() + return cve_id.removeprefix("CVE-").removeprefix("cve-") + + +# ---- pipeline -------------------------------------------------------------- + +def init_manifest(conn: sqlite3.Connection) -> None: + conn.executescript(""" + DROP TABLE IF EXISTS planted_severity_phrase; + DROP TABLE IF EXISTS planted_score_text; + DROP TABLE IF EXISTS planted_eng_dropped; + DROP TABLE IF EXISTS planted_duplicate; + DROP TABLE IF EXISTS canonical_cpe; + DROP TABLE IF EXISTS canonical_kev_vendor; + DROP TABLE IF EXISTS canonical_vulnerable; + DROP TABLE IF EXISTS canonical_version_text; + + CREATE TABLE planted_severity_phrase ( + cve_id TEXT PRIMARY KEY, severity TEXT, phrase_id INTEGER, phrase TEXT + ); + CREATE TABLE planted_score_text ( + cve_id TEXT PRIMARY KEY, canonical_score REAL, score_text TEXT, format_id INTEGER + ); + CREATE TABLE planted_eng_dropped (cve_id TEXT PRIMARY KEY); + CREATE TABLE planted_duplicate ( + cve_id TEXT PRIMARY KEY, + original_attack_vector TEXT, + duplicate_attack_vector TEXT + ); + CREATE TABLE canonical_cpe ( + cve_id TEXT, canonical_criteria TEXT, vendor TEXT, product TEXT, + version TEXT, format_id INTEGER + ); + CREATE TABLE canonical_kev_vendor ( + cve_id TEXT PRIMARY KEY, canonical_vendor TEXT, corrupted_vendor TEXT + ); + CREATE TABLE canonical_vulnerable ( + cve_id TEXT, criteria TEXT, canonical_vulnerable INTEGER, flag_text TEXT + ); + CREATE TABLE canonical_version_text ( + cve_id TEXT, criteria TEXT, canonical_version TEXT, 
version_text TEXT, format_id INTEGER + ); + """) + conn.commit() + + +# Selectors driven by hash-determinism -------------------------------------- + +DROP_ENGLISH_RATE = 20 # 1 in N CVEs gets its English description dropped +DUPLICATE_RATE = 20 # 1 in N CVEs gets a duplicate row with conflicting AV + + +def should_drop_english(cve_id: str) -> bool: + return h("drop-eng", cve_id) % DROP_ENGLISH_RATE == 0 + + +def should_duplicate(cve_id: str) -> bool: + return h("dup", cve_id) % DUPLICATE_RATE == 0 + + +def conflicting_attack_vector(orig: str | None, cve_id: str) -> str: + """Pick a *different* attack vector for the duplicate row.""" + pool = ["NETWORK", "ADJACENT_NETWORK", "LOCAL", "PHYSICAL"] + seed = h("dup-av", cve_id) + for off in range(len(pool)): + candidate = pool[(seed + off) % len(pool)] + if candidate != (orig or ""): + return candidate + return pool[0] + + +# ---- builders -------------------------------------------------------------- + +def _load_planted_narratives(manifest: sqlite3.Connection): + desc = {r[0]: r[1] for r in manifest.execute( + "SELECT cve_id, narrative FROM planted_narrative_desc" + )} if manifest.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='planted_narrative_desc'" + ).fetchone() else {} + score = {r[0]: r[1] for r in manifest.execute( + "SELECT cve_id, narrative FROM planted_narrative_score" + )} if manifest.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='planted_narrative_score'" + ).fetchone() else {} + return desc, score + + +def build_vulns(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None: + """vulns.db now holds only the structured registry: cves + cvss_metadata. 
    Free-text descriptions and references move to descriptions_db (Mongo)."""
    if VULNS_DB.exists():
        VULNS_DB.unlink()
    out = sqlite3.connect(VULNS_DB)
    # Note the deliberately missing columns: no cvss3_severity, no
    # cvss3_base_score — the score lives only in cvss_metadata.score_text.
    out.executescript("""
        CREATE TABLE cves (
            cve_id TEXT,
            published TEXT,
            last_modified TEXT,
            vuln_status TEXT,
            cvss3_attack_vector TEXT
        );
        CREATE TABLE cvss_metadata (
            cve_id TEXT, score_text TEXT
        );
        CREATE INDEX idx_cvss_cve ON cvss_metadata(cve_id);
    """)
    cur = clean.cursor()
    mcur = manifest.cursor()

    sev_by_cve: dict[str, str] = dict(cur.execute(
        "SELECT cve_id, cvss3_severity FROM cves WHERE cvss3_severity IS NOT NULL"
    ).fetchall())
    score_by_cve: dict[str, float] = dict(cur.execute(
        "SELECT cve_id, cvss3_base_score FROM cves WHERE cvss3_base_score IS NOT NULL"
    ).fetchall())
    narr_desc_by_cve, narr_score_by_cve = _load_planted_narratives(manifest)

    n_dup = 0
    for cve_id, published, last_modified, vuln_status, av in cur.execute(
        "SELECT cve_id, published, last_modified, vuln_status, "
        "cvss3_attack_vector FROM cves"
    ):
        cve_corrupted = reformat_cve_id(cve_id, "vulns-" + cve_id)
        out.execute(
            "INSERT INTO cves VALUES (?,?,?,?,?)",
            (cve_corrupted, published, last_modified, vuln_status, av),
        )
        if should_duplicate(cve_id):
            # The duplicate row gets its own cve-id surface form (different
            # salt) AND a conflicting attack vector; both are recorded in the
            # manifest for ground-truth computation.
            dup_av = conflicting_attack_vector(av, cve_id)
            dup_corrupted = reformat_cve_id(cve_id, "vulns-dup-" + cve_id)
            out.execute(
                "INSERT INTO cves VALUES (?,?,?,?,?)",
                (dup_corrupted, published, last_modified, vuln_status, dup_av),
            )
            mcur.execute(
                "INSERT OR REPLACE INTO planted_duplicate VALUES (?,?,?)",
                (cve_id, av, dup_av),
            )
            n_dup += 1

    n_score = 0
    n_narr_score = 0
    for cve_id, score in score_by_cve.items():
        sev = sev_by_cve.get(cve_id)
        # Prefer per-row LLM-generated narrative if available; fall back to
        # template otherwise.
        narr = narr_score_by_cve.get(cve_id)
        if narr:
            st = narr
            fid = -1  # signals narrative origin
            n_narr_score += 1
        else:
            st, fid = score_text(score, sev, cve_id)
        if not st:
            continue
        # Same salt as the primary cves row, so the two tables join on the
        # same corrupted id surface form.
        cve_corrupted = reformat_cve_id(cve_id, "vulns-" + cve_id)
        out.execute("INSERT INTO cvss_metadata VALUES (?,?)", (cve_corrupted, st))
        mcur.execute(
            "INSERT OR REPLACE INTO planted_score_text VALUES (?,?,?,?)",
            (cve_id, score, st, fid),
        )
        n_score += 1

    out.commit()
    out.close()
    manifest.commit()
    print(
        f"vulns.db: built ({n_score} cvss texts, of which {n_narr_score} are "
        f"LLM narratives, {n_dup} duplicate rows)"
    )


def build_descriptions(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None:
    """One Mongo document per CVE with nested descriptions[] and references[].
    English-dropping + severity-phrase planting happen here."""
    if DESC_DUMP.exists():
        shutil.rmtree(DESC_DUMP)
    DESC_DUMP.mkdir(parents=True, exist_ok=True)

    cur = clean.cursor()
    mcur = manifest.cursor()

    sev_by_cve: dict[str, str] = dict(cur.execute(
        "SELECT cve_id, cvss3_severity FROM cves WHERE cvss3_severity IS NOT NULL"
    ).fetchall())
    narr_desc_by_cve, _ = _load_planted_narratives(manifest)

    descs_by_cve: dict[str, list[dict]] = {}
    n_planted_sev = 0
    n_narr_sev = 0
    n_eng_dropped = 0
    for cve_id, lang, value in cur.execute(
        "SELECT cve_id, lang, value FROM cve_descriptions"
    ):
        # English-drop wins over everything else: the row is simply omitted
        # from the document (no fallback is synthesized).
        if lang == "en" and should_drop_english(cve_id):
            mcur.execute(
                "INSERT OR REPLACE INTO planted_eng_dropped VALUES (?)",
                (cve_id,),
            )
            n_eng_dropped += 1
            continue
        sev = sev_by_cve.get(cve_id)
        if lang == "en" and cve_id in narr_desc_by_cve:
            # LLM narrative replaces the English description entirely; severity
            # is implied via prose, not via a tagline.
            value = narr_desc_by_cve[cve_id]
            n_narr_sev += 1
        elif sev and lang == "en":
            phrase, pid = severity_phrase(sev, cve_id)
            value = (value or "") + " " + phrase
            mcur.execute(
                "INSERT OR REPLACE INTO planted_severity_phrase VALUES (?,?,?,?)",
                (cve_id, sev, pid, phrase),
            )
            n_planted_sev += 1
        descs_by_cve.setdefault(cve_id, []).append({"lang": lang, "value": value})

    refs_by_cve: dict[str, list[dict]] = {}
    for cve_id, url, source in cur.execute(
        "SELECT cve_id, url, source FROM cve_references"
    ):
        refs_by_cve.setdefault(cve_id, []).append({"url": url, "source": source})

    # Union: a CVE with only references (or only descriptions) still gets a doc.
    cve_universe = set(descs_by_cve) | set(refs_by_cve)
    docs = [
        {
            "cve": cve_id,
            "descriptions": descs_by_cve.get(cve_id, []),
            "references": refs_by_cve.get(cve_id, []),
        }
        for cve_id in sorted(cve_universe)
    ]

    # NOTE(review): third-party dependency (pymongo's bson) imported lazily so
    # the rest of the pipeline works without it.
    from bson import encode as bson_encode
    coll_dir = DESC_DUMP / "cve_descriptions"
    coll_dir.mkdir(parents=True, exist_ok=True)
    bson_path = coll_dir / "cve_documents.bson"
    with bson_path.open("wb") as f:
        for d in docs:
            f.write(bson_encode(d))
    # Minimal mongorestore-style collection metadata sidecar.
    # NOTE(review): "uuid" is left empty — confirm mongorestore accepts that.
    meta = {
        "options": {},
        "indexes": [
            {"v": 2, "key": {"_id": 1}, "name": "_id_"},
            {"v": 2, "key": {"cve": 1}, "name": "cve_1"},
        ],
        "uuid": "",
        "collectionName": "cve_documents",
        "type": "collection",
    }
    (coll_dir / "cve_documents.metadata.json").write_text(json.dumps(meta), encoding="utf-8")
    manifest.commit()
    print(
        f"descriptions/: built ({len(docs)} cve_documents, "
        f"{n_planted_sev} severity phrases, {n_narr_sev} LLM severity narratives, "
        f"{n_eng_dropped} english-dropped)"
    )


def build_cpe(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None:
    """Emit cpe.duckdb: aliased criteria + textual vulnerable_flag in
    cpe_matches, the vendor_aliases lookup table, and versions split into the
    sibling cpe_version_details table; canonical values land in the manifest."""
    if CPE_DB.exists():
        CPE_DB.unlink()
    con = duckdb.connect(str(CPE_DB))
    con.execute("""
        CREATE TABLE cpe_matches (
            cve_id TEXT,
            criteria TEXT,
            vulnerable_flag TEXT
        );
        CREATE TABLE vendor_aliases (
            alias TEXT PRIMARY KEY,
            canonical_vendor TEXT
        );
        CREATE TABLE cpe_version_details (
            cve_id TEXT,
            criteria TEXT,
            version_text TEXT,
            version_start_inc TEXT,
            version_start_exc TEXT,
            version_end_inc TEXT,
            version_end_exc TEXT
        );
    """)
    cur = clean.cursor()
    mcur = manifest.cursor()
    rows = cur.execute(
        "SELECT cve_id, cpe_criteria, vendor, product, version, "
        "version_start_inc, version_start_exc, version_end_inc, version_end_exc, vulnerable "
        "FROM cpe_matches"
    ).fetchall()

    alias_map: dict[str, str] = {}
    match_rows = []
    version_rows = []
    for cve_id, crit, vendor, product, version, vsi, vse, vei, vee, vuln in rows:
        if not vendor or not product:
            # Un-parseable criteria pass through uncorrupted.
            corrupted_crit = crit
            fid = 0
        else:
            alias = alias_for(vendor)
            alias_map[alias] = vendor.lower()
            ver = version if version and version != "*" else "0"
            fid = h("cpe", cve_id, crit) % len(CPE_FORMAT_FNS)
            corrupted_crit = CPE_FORMAT_FNS[fid](alias, product, ver)
        # Per-ROW cve-id salt (hash of criteria), so the same CVE can appear
        # under several surface forms inside cpe_matches.
        corrupted_cve = reformat_cve_id(cve_id, "cpe-row-" + str(h(crit) % 1_000_000))
        flag_text = vuln_flag(vuln, cve_id, crit)
        match_rows.append((corrupted_cve, corrupted_crit, flag_text))

        v_text, v_fid = version_text_for(version, cve_id, crit)
        version_rows.append((corrupted_cve, corrupted_crit, v_text, vsi, vse, vei, vee))

        mcur.execute(
            "INSERT INTO canonical_cpe VALUES (?,?,?,?,?,?)",
            (cve_id, crit, vendor, product, version, fid),
        )
        mcur.execute(
            "INSERT INTO canonical_vulnerable VALUES (?,?,?,?)",
            (cve_id, crit, vuln, flag_text),
        )
        mcur.execute(
            "INSERT INTO canonical_version_text VALUES (?,?,?,?,?)",
            (cve_id, crit, version, v_text, v_fid),
        )

    con.executemany("INSERT INTO cpe_matches VALUES (?,?,?)", match_rows)
    con.executemany(
        "INSERT INTO vendor_aliases VALUES (?,?)",
        sorted(alias_map.items()),
    )
    con.executemany(
        "INSERT INTO cpe_version_details VALUES (?,?,?,?,?,?,?)",
        version_rows,
    )
    con.close()
    manifest.commit()
    print(
        f"cpe.duckdb: built ({len(match_rows)} cpe_match rows, "
        f"{len(alias_map)} vendor_aliases, {len(version_rows)} version_detail rows)"
    )


def build_kev(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None:
    """Emit kev.sql: a plain-SQL dump of kev_entries with vendor variants,
    products_csv packing, and per-row cve_ref reformatting (dates untouched)."""
    if KEV_SQL.exists():
        KEV_SQL.unlink()
    cur = clean.cursor()
    mcur = manifest.cursor()
    rows = cur.execute(
        "SELECT cve_id, vendor_project, product, vulnerability_name, date_added, "
        "short_description, required_action, due_date, known_ransomware_use, notes FROM kev"
    ).fetchall()

    def pgesc(s):
        # Minimal Postgres string-literal escaping: single quotes doubled.
        if s is None:
            return "NULL"
        return "'" + str(s).replace("'", "''") + "'"

    lines = [
        "CREATE TABLE kev_entries (",
        "    cve_ref TEXT,",
        "    vendor_project TEXT,",
        "    products_csv TEXT,",
        "    vulnerability_name TEXT,",
        "    date_added TEXT,",
        "    short_description TEXT,",
        "    required_action TEXT,",
        "    due_date TEXT,",
        "    known_ransomware_use TEXT,",
        "    notes TEXT",
        ");",
    ]
    for (cve_id, vendor, product, vname, dadded, sdesc, raction, ddue, krw, notes) in rows:
        cv = vendor or ""
        corrupted_vendor = vendor_variant(cv, cve_id) if cv else cv
        cve_ref = reformat_cve_id(cve_id, "kev-" + cve_id)

        # synthesize a packed list when product contains a "/" (real KEV uses
        # this as a separator for sibling products); otherwise leave singleton.
        if product and "/" in product:
            products_csv = ",".join(p.strip() for p in product.split("/"))
        else:
            products_csv = product

        lines.append(
            "INSERT INTO kev_entries VALUES (" + ", ".join([
                pgesc(cve_ref),
                pgesc(corrupted_vendor),
                pgesc(products_csv),
                pgesc(vname),
                pgesc(dadded),   # untouched (no date corruption)
                pgesc(sdesc),
                pgesc(raction),
                pgesc(ddue),     # untouched
                pgesc(krw),
                pgesc(notes),
            ]) + ");"
        )
        mcur.execute(
            "INSERT OR REPLACE INTO canonical_kev_vendor VALUES (?,?,?)",
            (cve_id, vendor, corrupted_vendor),
        )

    KEV_SQL.write_text("\n".join(lines) + "\n", encoding="utf-8")
    manifest.commit()
    print(f"kev.sql: built ({len(rows)} kev_entries rows)")


def main() -> None:
    """Run the whole corruption pipeline: clean.sqlite -> four agent DBs."""
    if not CLEAN_DB.exists():
        raise SystemExit(f"missing clean snapshot: {CLEAN_DB} — run fetch_clean.py first")
    QD.mkdir(parents=True, exist_ok=True)
    # Don't unlink the manifest — it may hold LLM narrative outputs from
    # llm_corrupt.py that are expensive to regenerate. init_manifest drops
    # only the deterministic-corruption tables.
    clean = sqlite3.connect(CLEAN_DB)
    manifest = sqlite3.connect(MANIFEST_DB)
    init_manifest(manifest)
    build_vulns(clean, manifest)
    build_cpe(clean, manifest)
    build_kev(clean, manifest)
    build_descriptions(clean, manifest)
    clean.close()
    manifest.close()
    print("OK")


if __name__ == "__main__":
    main()
diff --git a/query_cve/manual_querycode/fetch_clean.py b/query_cve/manual_querycode/fetch_clean.py
new file mode 100644
index 000000000..3b80f8a4c
--- /dev/null
+++ b/query_cve/manual_querycode/fetch_clean.py
@@ -0,0 +1,284 @@
"""Fetch a bounded NVD/KEV/EPSS snapshot into clean/clean.sqlite.

Scope (kept small to keep the benchmark dataset under ~50MB):
  - NVD CVEs: published 2023-01-01..2024-12-31 via the NVD 2.0 API
  - CISA KEV: full catalog (~1300 rows)
  - EPSS: a few snapshot dates (one recent + two historical; see main()) for
    every CVE we ingested

This is the canonical "clean" snapshot.
It is NEVER what the agent sees —
the corruption pipeline reads from here and emits the agent-visible DBs in
query_dataset/. compute_ground_truth.py also reads from here.
"""
from __future__ import annotations
import csv
import gzip
import io
import json
import sqlite3
import sys
import time
import urllib.request
from datetime import date, datetime, timedelta
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
CLEAN_DB = ROOT / "clean" / "clean.sqlite"

NVD_API = "https://services.nvd.nist.gov/rest/json/cves/2.0"
KEV_URL = "https://www.cisa.gov/sites/default/files/feeds/known_exploited_vulnerabilities.json"
EPSS_URL_TMPL = "https://epss.empiricalsecurity.com/epss_scores-{date}.csv.gz"

# NVD limits date ranges to 120 days, so we chunk.
WINDOW_START = date(2023, 1, 1)
WINDOW_END = date(2024, 12, 31)
CHUNK_DAYS = 120
PAGE = 2000  # NVD API max


def _http_get(url: str, retries: int = 5) -> bytes:
    """GET *url* with exponential-backoff retries; re-raises the last error.

    NOTE(review): with retries <= 0 this falls through and implicitly returns
    None — callers always use the default, but worth guarding.
    """
    for i in range(retries):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "DataAgentBench/cve"})
            with urllib.request.urlopen(req, timeout=60) as r:
                return r.read()
        except Exception as e:
            if i == retries - 1:
                raise
            wait = 2 ** i  # exponential backoff: 1s, 2s, 4s, ...
            print(f"  retry {i+1}/{retries} after {wait}s: {e}", file=sys.stderr)
            time.sleep(wait)


def init_schema(conn: sqlite3.Connection) -> None:
    """Drop and recreate every clean-snapshot table (full refetch semantics)."""
    c = conn.cursor()
    c.executescript("""
        DROP TABLE IF EXISTS cves;
        DROP TABLE IF EXISTS cve_descriptions;
        DROP TABLE IF EXISTS cve_references;
        DROP TABLE IF EXISTS cpe_matches;
        DROP TABLE IF EXISTS kev;
        DROP TABLE IF EXISTS epss;

        CREATE TABLE cves (
            cve_id TEXT PRIMARY KEY,
            published TEXT,
            last_modified TEXT,
            vuln_status TEXT,
            cvss3_base_score REAL,
            cvss3_severity TEXT,
            cvss3_attack_vector TEXT,
            cvss3_vector TEXT
        );
        CREATE TABLE cve_descriptions (
            cve_id TEXT, lang TEXT, value TEXT
        );
        CREATE TABLE cve_references (
            cve_id TEXT, url TEXT, source TEXT
        );
        CREATE TABLE cpe_matches (
            cve_id TEXT,
            cpe_criteria TEXT,
            vendor TEXT,
            product TEXT,
            version TEXT,
            version_start_inc TEXT,
            version_start_exc TEXT,
            version_end_inc TEXT,
            version_end_exc TEXT,
            vulnerable INTEGER
        );
        CREATE TABLE kev (
            cve_id TEXT PRIMARY KEY,
            vendor_project TEXT,
            product TEXT,
            vulnerability_name TEXT,
            date_added TEXT,
            short_description TEXT,
            required_action TEXT,
            due_date TEXT,
            known_ransomware_use TEXT,
            notes TEXT
        );
        CREATE TABLE epss (
            cve_id TEXT, score_date TEXT, epss REAL, percentile REAL,
            PRIMARY KEY (cve_id, score_date)
        );
        CREATE INDEX idx_cpe_cve ON cpe_matches(cve_id);
        CREATE INDEX idx_cpe_vendor ON cpe_matches(vendor);
    """)
    conn.commit()


def _chunk_ranges():
    """Yield (startISO, endISO) windows of at most CHUNK_DAYS covering
    WINDOW_START..WINDOW_END inclusive."""
    cur = WINDOW_START
    while cur <= WINDOW_END:
        end = min(cur + timedelta(days=CHUNK_DAYS - 1), WINDOW_END)
        yield (
            f"{cur.isoformat()}T00:00:00.000",
            f"{end.isoformat()}T23:59:59.999",
        )
        cur = end + timedelta(days=1)


def fetch_nvd(conn: sqlite3.Connection) -> set[str]:
    """Stream NVD CVEs across chunked windows into clean.sqlite. Return CVE id set."""
    cur = conn.cursor()
    seen: set[str] = set()
    for win_start, win_end in _chunk_ranges():
        start = 0
        while True:
            url = (
                f"{NVD_API}?pubStartDate={win_start}&pubEndDate={win_end}"
                f"&resultsPerPage={PAGE}&startIndex={start}"
            )
            print(f"NVD: window={win_start[:10]}..{win_end[:10]} startIndex={start}", flush=True)
            raw = _http_get(url)
            data = json.loads(raw)
            total = data.get("totalResults", 0)
            items = data.get("vulnerabilities", [])
            if not items:
                break
            for w in items:
                c = w.get("cve", {})
                cid = c.get("id")
                if not cid or cid in seen:
                    continue
                seen.add(cid)
                metrics = c.get("metrics", {}) or {}
                # Prefer CVSS v3.1 over v3.0; first metric entry wins.
                cvss = None
                for key in ("cvssMetricV31", "cvssMetricV30"):
                    if metrics.get(key):
                        cvss = metrics[key][0].get("cvssData", {})
                        break
                cur.execute(
                    "INSERT OR REPLACE INTO cves VALUES (?,?,?,?,?,?,?,?)",
                    (
                        cid,
                        c.get("published"),
                        c.get("lastModified"),
                        c.get("vulnStatus"),
                        (cvss or {}).get("baseScore"),
                        (cvss or {}).get("baseSeverity"),
                        (cvss or {}).get("attackVector"),
                        (cvss or {}).get("vectorString"),
                    ),
                )
                for d in c.get("descriptions", []) or []:
                    cur.execute(
                        "INSERT INTO cve_descriptions VALUES (?,?,?)",
                        (cid, d.get("lang"), d.get("value")),
                    )
                for r in c.get("references", []) or []:
                    cur.execute(
                        "INSERT INTO cve_references VALUES (?,?,?)",
                        (cid, r.get("url"), r.get("source")),
                    )
                for cfg in c.get("configurations", []) or []:
                    for node in cfg.get("nodes", []) or []:
                        for m in node.get("cpeMatch", []) or []:
                            # cpe:2.3:<part>:<vendor>:<product>:<version>:...
                            crit = m.get("criteria", "")
                            parts = crit.split(":") if crit.startswith("cpe:2.3:") else []
                            vendor = parts[3] if len(parts) > 3 else None
                            product = parts[4] if len(parts) > 4 else None
                            version = parts[5] if len(parts) > 5 else None
                            cur.execute(
                                "INSERT INTO cpe_matches VALUES (?,?,?,?,?,?,?,?,?,?)",
                                (
                                    cid, crit, vendor, product, version,
                                    m.get("versionStartIncluding"),
                                    m.get("versionStartExcluding"),
                                    m.get("versionEndIncluding"),
                                    m.get("versionEndExcluding"),
                                    1 if m.get("vulnerable") else 0,
                                ),
                            )
            conn.commit()
            # NOTE(review): assumes NVD always returns full pages until the
            # final one; `start += len(items)` would be the safer increment.
            start += PAGE
            if start >= total:
                break
            time.sleep(6)  # NVD rate limit: 5 req / 30s without API key
    print(f"NVD: ingested {len(seen)} CVEs", flush=True)
    return seen


def fetch_kev(conn: sqlite3.Connection, allow: set[str]) -> int:
    """Ingest the full CISA KEV catalog; returns the row count."""
    raw = _http_get(KEV_URL)
    data = json.loads(raw)
    cur = conn.cursor()
    n = 0
    for v in data.get("vulnerabilities", []):
        cid = v.get("cveID")
        if not cid:
            continue
        # We intentionally keep KEV rows whose CVE isn't in `allow` —
        # those become referential-integrity gaps in the corrupted DB.
        cur.execute(
            "INSERT OR REPLACE INTO kev VALUES (?,?,?,?,?,?,?,?,?,?)",
            (
                cid,
                v.get("vendorProject"),
                v.get("product"),
                v.get("vulnerabilityName"),
                v.get("dateAdded"),
                v.get("shortDescription"),
                v.get("requiredAction"),
                v.get("dueDate"),
                v.get("knownRansomwareCampaignUse"),
                v.get("notes"),
            ),
        )
        n += 1
    conn.commit()
    print(f"KEV: ingested {n} entries", flush=True)
    return n


def fetch_epss(conn: sqlite3.Connection, allow: set[str], snapshot_dates: list[str]) -> int:
    """Ingest EPSS scores for each date in *snapshot_dates*, restricted to the
    CVEs in *allow*; returns the number of score rows inserted."""
    cur = conn.cursor()
    n = 0
    for d in snapshot_dates:
        url = EPSS_URL_TMPL.format(date=d)
        print(f"EPSS: {d}", flush=True)
        raw = _http_get(url)
        text = gzip.decompress(raw).decode("utf-8")
        # First line is a comment with model version; skip until header.
        lines = [ln for ln in text.splitlines() if not ln.startswith("#")]
        reader = csv.DictReader(lines)
        for row in reader:
            cid = row.get("cve")
            if not cid or cid not in allow:
                continue
            try:
                cur.execute(
                    "INSERT OR REPLACE INTO epss VALUES (?,?,?,?)",
                    (cid, d, float(row["epss"]), float(row["percentile"])),
                )
                n += 1
            except Exception:
                # NOTE(review): silently skips malformed rows — deliberate
                # best-effort ingestion, but a counter/log line would help.
                pass
    conn.commit()
    print(f"EPSS: ingested {n} score rows", flush=True)
    return n


def main() -> None:
    """Full refetch: NVD window, KEV catalog, then three EPSS snapshots."""
    CLEAN_DB.parent.mkdir(parents=True, exist_ok=True)
    if CLEAN_DB.exists():
        CLEAN_DB.unlink()
    conn = sqlite3.connect(CLEAN_DB)
    init_schema(conn)
    cves = fetch_nvd(conn)
    fetch_kev(conn, cves)
    today = date.today()
    snapshot_dates = [
        (today - timedelta(days=2)).isoformat(),
        "2024-06-01",
        "2024-01-02",  # earliest we sample, gives us a "before KEV add" lookback
    ]
    fetch_epss(conn, cves, snapshot_dates)
    conn.close()
    print(f"OK: clean snapshot at {CLEAN_DB}", flush=True)


if __name__ == "__main__":
    main()
diff --git a/query_cve/manual_querycode/llm_corrupt.py b/query_cve/manual_querycode/llm_corrupt.py
new file mode 100644
index 000000000..4596a1c79
--- /dev/null
+++ b/query_cve/manual_querycode/llm_corrupt.py
@@ -0,0 +1,525 @@
"""LLM-driven per-row corruption.
For each CVE we want to obfuscate, call Azure OpenAI to produce:
  - a `narrative_description` that re-tells the original CVE while implying
    its CVSS severity in natural language WITHOUT using the words critical /
    high / medium / low (or near-synonyms in a denylist)
  - a `narrative_score` paragraph that conveys the CVSS base score in prose
    without printing the literal numeric value

Guardrails (each output must pass all checks; up to 3 retries; otherwise the
row is skipped and the original text is kept unchanged so we never lose data):

  * MUST contain no banned literal words (severity terms, denylisted near-synonyms,
    and the literal score number)
  * MUST be within [0.5x, 3.0x] the length of the original description
  * MUST be in English (heuristic: ascii ratio > 0.9)
  * MUST mention at least one noun from the original description (lexical overlap
    >= 1 content word) so we know the rewrite stayed on-topic

Output rows landed in clean/manifest.sqlite tables:
    planted_narrative_desc (cve_id, severity, narrative)
    planted_narrative_score (cve_id, canonical_score, narrative)
"""
from __future__ import annotations
import argparse
import os
import re
import sqlite3
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

ROOT = Path(__file__).resolve().parent.parent
CLEAN_DB = ROOT / "clean" / "clean.sqlite"
MANIFEST_DB = ROOT / "clean" / "manifest.sqlite"
ENV_FILE = ROOT.parent / ".env"  # repo root


def _load_env():
    """Populate os.environ from the repo-root .env file.

    Uses setdefault so real environment variables always win over .env values;
    blank lines, comments, and lines without '=' are ignored.
    """
    if not ENV_FILE.exists():
        return
    for line in ENV_FILE.read_text().splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        if "=" not in line:
            continue
        k, v = line.split("=", 1)
        os.environ.setdefault(k.strip(), v.strip())


# Load credentials at import time so module-level config (DEPLOYMENT) sees them.
_load_env()


# Build an AzureOpenAI client from env credentials (import deferred so the
# openai package is only required when actually corrupting).
def _client():
    from openai import AzureOpenAI
    return AzureOpenAI(
        api_key=os.environ["AZURE_API_KEY"],
        azure_endpoint=os.environ["AZURE_API_BASE"],
api_version=os.environ["AZURE_API_VERSION"], + ) + + +# Words that would leak the answer in narrative form. +SEVERITY_DENY = { + "critical", "crit", "criticality", "criticalities", + "high", "highly", "high-severity", "high-impact", + "medium", "moderate", "moderately", "medium-severity", + "low", "minor", "low-severity", "low-impact", + "severity", "sev", +} + +DEPLOYMENT = os.environ.get("AZURE_DEPLOYMENT", "gpt-4o") + + +SEVERITY_PROMPT = """You are obfuscating a CVE description for a benchmark. + +Rewrite the description so that the CVSS severity rating is *implied* by the +narrative consequences and exploitability you describe — but do NOT use any of +these words: critical, crit, high, medium, moderate, low, minor, severity, sev, +or their hyphenated variants. Convey severity through impact phrasing only. + +Constraints: +- Stay faithful to the technical content of the original description (same + product / weakness / mechanism). Do not invent capabilities. +- 1-3 sentences, similar length to the original. +- Do not state a CVSS score or rating. +- Do not include the words listed above (case-insensitive). + +Severity to imply: {severity} + +Phrasing motifs (pick one and weave it in naturally; vary phrasing): +{motifs} + +Original CVE description: +{desc} + +Output ONLY the rewritten description. No preamble. 
+""" + + +SEVERITY_MOTIFS = { + "CRITICAL": ( + "- pre-authentication remote code execution against an exposed service\n" + "- complete and immediate takeover with no user action required\n" + "- the attacker needs no credentials and no user interaction\n" + "- internet-exposed; mass scanning + drive-by exploitation realistic" + ), + "HIGH": ( + "Important: even though this is impactful, it is NOT pre-auth full takeover.\n" + "Convey at least ONE of these constraints prominently:\n" + "- requires the attacker to already hold valid credentials, OR\n" + "- requires the victim to perform an action (open a file, click a link), OR\n" + "- requires local network access (not internet-exposed), OR\n" + "- only compromises one component / data type, not the whole system\n" + "Phrasing should make clear there is a meaningful preconditioning that\n" + "stops it from being a one-shot full-system takeover." + ), + "MEDIUM": ( + "- requires authentication AND user interaction together\n" + "- discloses partial / non-sensitive information only\n" + "- limited to a specific feature; no privilege escalation\n" + "- exploitation requires unusual configuration to be present" + ), + "LOW": ( + "- only feasible with prior local console / physical access\n" + "- impact is cosmetic or denial-of-service of a non-critical feature\n" + "- no realistic attack path; theoretical concern only\n" + "- requires the attacker to already be administrator" + ), +} + + +SCORE_PROMPT = """You are obfuscating a CVE's CVSS v3 base score (scale 0.0-10.0) +for a benchmark. + +Write ONE sentence (40 - 280 characters) that *implies* the magnitude band of +the score through narrative phrasing. The sentence MUST NOT contain digits, +the words "score" / "rating", or any of: critical, crit, high, medium, +moderate, low, minor, severity, sev. + +Constraint — anchor in CVE-specific context. 
Reference at least one of the +context details below NATURALLY in the sentence (vendor, product, or a +short paraphrase of the vulnerability mechanism). This makes each narrative +substantively unique per row. + +Constraint — VARY phrasing aggressively. Do NOT default to stock templates +like "in the elevated tier" or "near the top of the scale". Each row should +use distinct vocabulary, syntactic structure, and register. Treat each row +as a unique editorial paraphrase. Some registers to draw from (rotate): +- terse advisory: "Among the upper-tier issues this vendor has shipped." +- analyst-paragraph: "By the standards of similar {{product}} flaws, this + one sits in the elevated band — not the rarefied air of pre-auth-RCE + catastrophes, but well above routine." +- comparative: "Comparable in severity to {{vendor}}'s prior + authentication-bypass disclosures." +- hedged: "Roughly two notches below the maximum on the standard 10-point + vulnerability scale." +- domain-flavored: "Practitioners would treat this as a top-of-the-list + patch-now item but not a fire-drill." +- counterfactual: "Were the attack vector network rather than local, this + would push toward the very top; as written, it lands in the + upper-middle band." + +CRITICAL — band-specific constraints. The narrative must clearly imply the +target band and NOT imply a higher band. Use phrasing aligned with the +canonical band. Where useful you may use approximate ten-point-scale anchors: +- 9.0-10.0: "near the very top of the scale" / "essentially at the maximum" + / "rarefied top tier" / "approaches the pinnacle" +- 7.0-8.9: "comfortably above the midpoint but clearly not at the maximum" + / "in the upper-middle band, well above average but short of the top" + / "two-notches-below-maximum" — DO NOT use phrases like "near the top" + / "just shy of the pinnacle" / "top tier" / "approaches the maximum" + for this band, those imply 9.0-10.0. 
+- 4.0-6.9: "in the middle band" / "around the midpoint" / + "roughly mid-scale" — DO NOT use words like "elevated" / "upper" / "high" + / "near the top". +- 0.1-3.9: "well below the midpoint" / "in the lower portion of the scale" + / "minor in scale" — DO NOT use anything that implies above-midpoint. + +Score to imply: {score} (so canonical band = {band}) + +CVE context (incorporate naturally — but do NOT include the CVE id, do NOT +copy the original description verbatim): +- vendor: {vendor} +- product: {product} +- vulnerability summary: {summary} + +Output ONLY the sentence. No preamble. No digits. +""" + + +def _length_bounds_ok(orig: str, new: str) -> bool: + if not orig: + return True + lo = max(50, int(0.3 * len(orig))) + hi = max(600, int(3.0 * len(orig))) + return lo <= len(new) <= hi + + +def _ascii_ratio(s: str) -> float: + if not s: + return 1.0 + return sum(1 for c in s if ord(c) < 128) / len(s) + + +_TOK = re.compile(r"[a-z][a-z0-9_]{2,}") + + +def _content_overlap_ok(orig: str, new: str) -> bool: + """Demand at least one content-word overlap so the rewrite stays on-topic.""" + o = set(_TOK.findall(orig.lower())) + n = set(_TOK.findall(new.lower())) + return len(o & n) >= 1 + + +def _has_banned(text: str, banned: set[str]) -> str | None: + text_l = text.lower() + for w in banned: + if re.search(rf"\b{re.escape(w)}\b", text_l): + return w + return None + + +VERIFY_SEVERITY_PROMPT = """Read the CVE description below and decide which +CVSS v3 severity classification it most naturally implies. Choose EXACTLY one: +- CRITICAL +- HIGH +- MEDIUM +- LOW + +Output ONLY the label, nothing else. + +Description: {desc} +""" + + +VERIFY_SCORE_PROMPT = """Read the sentence below and decide which CVSS v3 base +score band it implies (the score scale is 0.0 to 10.0). Choose EXACTLY one: +- 9.0-10.0 +- 7.0-8.9 +- 4.0-6.9 +- 0.1-3.9 + +Output ONLY the band label. 

Sentence: {sentence}
"""


def _score_band(score: float) -> str:
    """Map a CVSS v3 base score to its verifier band label."""
    if score >= 9.0:
        return "9.0-10.0"
    if score >= 7.0:
        return "7.0-8.9"
    if score >= 4.0:
        return "4.0-6.9"
    return "0.1-3.9"


def classify_severity(client, narrative: str) -> str | None:
    """Roundtrip verifier: ask the model (temperature 0) which severity label
    the narrative implies. Returns the label, or None on API failure or
    unparseable output."""
    try:
        r = client.chat.completions.create(
            model=DEPLOYMENT,
            messages=[{"role": "user", "content": VERIFY_SEVERITY_PROMPT.format(desc=narrative)}],
            max_tokens=10,
            temperature=0.0,
        )
        out = (r.choices[0].message.content or "").strip().upper().rstrip(".")
        # Substring match tolerates minor decoration around the label.
        for lbl in ("CRITICAL", "HIGH", "MEDIUM", "LOW"):
            if lbl in out:
                return lbl
        return None
    except Exception:
        return None


def classify_score(client, narrative: str) -> str | None:
    """Roundtrip verifier for score narratives: returns the implied band
    label, or None on API failure or unparseable output."""
    try:
        r = client.chat.completions.create(
            model=DEPLOYMENT,
            messages=[{"role": "user", "content": VERIFY_SCORE_PROMPT.format(sentence=narrative)}],
            max_tokens=15,
            temperature=0.0,
        )
        out = (r.choices[0].message.content or "").strip()
        for b in ("9.0-10.0", "7.0-8.9", "4.0-6.9", "0.1-3.9"):
            if b in out:
                return b
        return None
    except Exception:
        return None


def rewrite_severity(client, cve_id: str, severity: str, desc: str,
                     retries: int = 3) -> str | None:
    """Generate a severity-implying rewrite of `desc`.

    Each attempt must pass the banned-word, length, English, and lexical-
    overlap guardrails AND the roundtrip classifier must recover the same
    severity. Returns the accepted rewrite, or None (row skipped, original
    text kept) after `retries` failed attempts.
    """
    motifs = SEVERITY_MOTIFS.get(severity.upper())
    if not motifs:
        # Unknown severity label: nothing to imply, skip the row.
        return None
    last_err = None
    for attempt in range(retries):
        try:
            r = client.chat.completions.create(
                model=DEPLOYMENT,
                messages=[{
                    "role": "user",
                    "content": SEVERITY_PROMPT.format(
                        severity=severity.upper(), motifs=motifs, desc=desc
                    ),
                }],
                max_tokens=300,
                temperature=0.7,
            )
            out = (r.choices[0].message.content or "").strip()
            banned_hit = _has_banned(out, SEVERITY_DENY)
            if banned_hit:
                last_err = f"banned word: {banned_hit!r}"
                continue
            if not _length_bounds_ok(desc, out):
                last_err = f"length out of bounds ({len(out)} vs orig {len(desc)})"
                continue
            if _ascii_ratio(out) < 0.9:
                last_err = "non-english"
                continue
            if not _content_overlap_ok(desc, out):
                last_err = "no content overlap"
                continue
            # Roundtrip verifier: classifier must recover the same severity.
            implied = classify_severity(client, out)
            if implied != severity.upper():
                last_err = f"verifier disagreed: implied={implied!r} canonical={severity!r}"
                continue
            return out
        except Exception as e:
            last_err = str(e)[:120]
        # Exponential backoff; guardrail failures `continue` past this sleep,
        # so it effectively only throttles after API errors.
        time.sleep(1.5 ** attempt)
    print(f"  [skip {cve_id}] severity rewrite: {last_err}", file=sys.stderr)
    return None


def rewrite_score(client, cve_id: str, score: float,
                  vendor: str = "", product: str = "", summary: str = "",
                  retries: int = 3) -> str | None:
    """Generate a one-sentence narrative implying the score's magnitude band.

    Guardrails: no digits, no banned words (severity terms plus "score" /
    "rating"), English heuristic, and the roundtrip classifier must recover
    the canonical band. Returns the sentence or None after `retries` failures.
    """
    score_str = f"{score:.1f}"
    last_err = None
    for attempt in range(retries):
        try:
            r = client.chat.completions.create(
                model=DEPLOYMENT,
                messages=[{
                    "role": "user",
                    "content": SCORE_PROMPT.format(
                        score=score_str, band=_score_band(score),
                        vendor=vendor or "(unspecified)",
                        product=product or "(unspecified)",
                        summary=summary or "(unspecified)",
                    ),
                }],
                max_tokens=200,
                temperature=0.95,
            )
            out = (r.choices[0].message.content or "").strip()
            # No digits at all
            if re.search(r"\d", out):
                last_err = "contains digits"
                continue
            banned_hit = _has_banned(out, SEVERITY_DENY | {"score", "rating"})
            if banned_hit:
                last_err = f"banned word: {banned_hit!r}"
                continue
            if _ascii_ratio(out) < 0.9:
                last_err = "non-english"
                continue
            implied = classify_score(client, out)
            canonical_band = _score_band(score)
            if implied != canonical_band:
                last_err = f"verifier disagreed: implied={implied!r} canonical={canonical_band!r}"
                continue
            return out
        except Exception as e:
            last_err = str(e)[:120]
        time.sleep(1.5 ** attempt)
    print(f"  [skip {cve_id}] score rewrite: {last_err}", file=sys.stderr)
    return None


def _ensure_manifest_tables():
    """Create the manifest answer-key tables if missing (idempotent)."""
    conn = sqlite3.connect(MANIFEST_DB)
    conn.executescript("""
    CREATE TABLE IF NOT EXISTS planted_narrative_desc (
        cve_id TEXT PRIMARY KEY, severity TEXT, narrative TEXT
    );
    CREATE TABLE
    IF NOT EXISTS planted_narrative_score (
        cve_id TEXT PRIMARY KEY, canonical_score REAL, narrative TEXT
    );
    """)
    conn.commit()
    conn.close()


def main():
    """CLI entry point: select CVEs, fan out LLM rewrites, persist results."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--scope", choices=("kev", "all", "sample"), default="kev",
                    help="kev: only CVEs in KEV. all: every CVE with severity. "
                         "sample: random sample (use --sample-size)")
    ap.add_argument("--sample-size", type=int, default=200)
    ap.add_argument("--workers", type=int, default=8)
    ap.add_argument("--limit", type=int, default=None,
                    help="Hard cap on rows processed (debug)")
    ap.add_argument("--skip-severity", action="store_true")
    ap.add_argument("--skip-score", action="store_true")
    args = ap.parse_args()

    _ensure_manifest_tables()
    client = _client()

    clean = sqlite3.connect(CLEAN_DB)
    if args.scope == "kev":
        rows = clean.execute("""
            SELECT v.cve_id, v.cvss3_severity, v.cvss3_base_score, d.value
            FROM cves v
            JOIN cve_descriptions d ON d.cve_id = v.cve_id AND d.lang='en'
            WHERE v.cve_id IN (SELECT cve_id FROM kev)
              AND v.cvss3_severity IS NOT NULL
              AND v.cvss3_base_score IS NOT NULL
        """).fetchall()
    elif args.scope == "sample":
        # NOTE: "sample" is a deterministic prefix (ORDER BY cve_id), not a
        # random sample. sample_size is argparse-typed int, so interpolating
        # it into the f-string LIMIT is injection-safe.
        rows = clean.execute(f"""
            SELECT v.cve_id, v.cvss3_severity, v.cvss3_base_score, d.value
            FROM cves v
            JOIN cve_descriptions d ON d.cve_id = v.cve_id AND d.lang='en'
            WHERE v.cvss3_severity IS NOT NULL
              AND v.cvss3_base_score IS NOT NULL
            ORDER BY v.cve_id
            LIMIT {args.sample_size}
        """).fetchall()
    else:
        rows = clean.execute("""
            SELECT v.cve_id, v.cvss3_severity, v.cvss3_base_score, d.value
            FROM cves v
            JOIN cve_descriptions d ON d.cve_id = v.cve_id AND d.lang='en'
            WHERE v.cvss3_severity IS NOT NULL
              AND v.cvss3_base_score IS NOT NULL
        """).fetchall()
    clean.close()

    if args.limit:
        rows = rows[: args.limit]

    # Build per-CVE vendor/product lookup (most-common vendor for each CVE) to
    # anchor score narratives in CVE-specific context.
    clean = sqlite3.connect(CLEAN_DB)
    cve_vp: dict[str, tuple[str, str]] = {}
    for cve_id, vendor, product, n in clean.execute("""
        SELECT cve_id, vendor, product, COUNT(*) as n
        FROM cpe_matches
        WHERE vendor IS NOT NULL AND product IS NOT NULL
        GROUP BY cve_id, vendor, product
        ORDER BY cve_id, n DESC
    """):
        # Rows arrive ordered by count DESC within each cve_id, so the first
        # row seen per CVE is its most common vendor/product pair.
        if cve_id not in cve_vp:
            cve_vp[cve_id] = (vendor, product)
    clean.close()

    print(f"Processing {len(rows)} CVEs (scope={args.scope}, workers={args.workers}); "
          f"{len(cve_vp)} CVEs have vendor/product context",
          flush=True)

    # already-done lookup so reruns are incremental
    mconn = sqlite3.connect(MANIFEST_DB)
    done_sev = {r[0] for r in mconn.execute("SELECT cve_id FROM planted_narrative_desc")}
    done_score = {r[0] for r in mconn.execute("SELECT cve_id FROM planted_narrative_score")}
    mconn.close()

    # A row is still "todo" if EITHER of its two narratives is missing;
    # work() re-checks per narrative so finished halves are not regenerated.
    todo = [r for r in rows if r[0] not in done_sev or r[0] not in done_score]
    print(f"  {len(rows) - len(todo)} already done; {len(todo)} new", flush=True)

    n_sev_ok = 0
    n_score_ok = 0
    # Single-writer connection: only the main thread (the as_completed loop
    # below) writes; workers do API calls only. Autocommit
    # (isolation_level=None) + WAL keeps those writes durable as they land.
    lock_conn = sqlite3.connect(MANIFEST_DB, isolation_level=None)
    lock_conn.execute("PRAGMA journal_mode=WAL")

    def work(row):
        # Runs on a worker thread; performs LLM calls, no DB writes.
        cve_id, sev, score, desc = row
        out_sev = None
        out_score = None
        if not args.skip_severity and cve_id not in done_sev:
            out_sev = rewrite_severity(client, cve_id, sev, desc or "")
        if not args.skip_score and cve_id not in done_score:
            vendor, product = cve_vp.get(cve_id, ("", ""))
            # Pass a 240-char summary of the original description as additional
            # anchor context (the LLM is told not to copy verbatim).
            summary = (desc or "")[:240]
            out_score = rewrite_score(client, cve_id, float(score),
                                      vendor=vendor, product=product,
                                      summary=summary)
        return cve_id, sev, score, out_sev, out_score

    with ThreadPoolExecutor(max_workers=args.workers) as ex:
        futures = [ex.submit(work, r) for r in todo]
        for i, fut in enumerate(as_completed(futures), 1):
            cve_id, sev, score, out_sev, out_score = fut.result()
            if out_sev is not None:
                lock_conn.execute(
                    "INSERT OR REPLACE INTO planted_narrative_desc VALUES (?,?,?)",
                    (cve_id, sev, out_sev),
                )
                n_sev_ok += 1
            if out_score is not None:
                lock_conn.execute(
                    "INSERT OR REPLACE INTO planted_narrative_score VALUES (?,?,?)",
                    (cve_id, float(score), out_score),
                )
                n_score_ok += 1
            if i % 50 == 0:
                print(f"  [{i}/{len(todo)}] sev_ok={n_sev_ok} score_ok={n_score_ok}",
                      flush=True)

    lock_conn.close()
    print(f"DONE: severity={n_sev_ok}/{len(todo)} score={n_score_ok}/{len(todo)}")


if __name__ == "__main__":
    main()
diff --git a/query_cve/query1/ground_truth.csv b/query_cve/query1/ground_truth.csv
new file mode 100644
index 000000000..b8626c4cf
--- /dev/null
+++ b/query_cve/query1/ground_truth.csv
@@ -0,0 +1 @@
4
diff --git a/query_cve/query1/query.json b/query_cve/query1/query.json
new file mode 100644
index 000000000..44a7d9456
--- /dev/null
+++ b/query_cve/query1/query.json
@@ -0,0 +1 @@
"How many distinct CVEs published in 2023 are listed in CISA's Known Exploited Vulnerabilities (KEV) catalog AND have at least one vulnerable Apache product configuration?"
diff --git a/query_cve/query1/validate.py b/query_cve/query1/validate.py
new file mode 100644
index 000000000..5e3c22ade
--- /dev/null
+++ b/query_cve/query1/validate.py
@@ -0,0 +1,10 @@
def validate(llm_output: str):
    import re
    from pathlib import Path
    gt = int(Path(__file__).parent.joinpath("ground_truth.csv").read_text().strip())
    # accept comma-formatted numbers like "1,262"
    text = re.sub(r"(\d),(\d)", r"\1\2", llm_output)
    nums = re.findall(r"(?$1M, >$10M) meaningfully selective.

## Database engine assignment

| DB | Engine | Rationale |
|---|---|---|
| `contracts_db` | PostgreSQL | Transactional fact table (~9.9k contract rows) plus a sibling `contract_amounts` table |
| `recipients_db` | SQLite | Small entity registry (~3.3k recipients) |
| `agencies_db` | DuckDB | Analytical reference: agency hierarchy + agency-alias lookup + NAICS sector hierarchy |
| `descriptions_db` | MongoDB | Free-text contract descriptions, organized as one document per award with nested descriptions[] |

## Corruption layers (categories only)

Hash-deterministic transforms:

| Layer | Surface |
|---|---|
| Award ID format mixing | `contracts_db.contracts.award_id` (and the reference in `contract_amounts` and `descriptions_db`) |
| Recipient UEI format mixing | `contracts_db.contracts.recipient_uei`, `recipients_db.recipients.uei` |
| Awarding-agency surface-form variants (cluster needed for canonicalization) | `contracts_db.contracts.awarding_agency` (with `agencies_db.agency_aliases` lookup) |
| NAICS code reformatting (6-digit / `naics-XXXXXX` / `XX-XXXX`) | `contracts_db.contracts.naics_code` |
| Recipient name fuzzification (corporate-suffix variants, case, whitespace) | `recipients_db.recipients.name` |
| State surface-form variants | `recipients_db.recipients.state` |
| Templated amount-as-text fallback (`$1.5M` / `1,500,000.00 USD` / `1.5M`) | `contract_amounts.amount_text`, `recipients_db.recipients.total_amount_text` |
| Duplicate
`contract_amounts` rows with conflicting amount values | ~3.5% of awards | +| English-description dropping | `descriptions_db.contract_documents` (deterministic ~10% subset) | + +LLM-driven per-row narrative corruption with **roundtrip-classifier verification** (each generated narrative is classified back to its magnitude band by a separate LLM call before being accepted; mismatches retry up to 3 times): + +| Layer | Surface | Anchor context | Generation model | +|---|---|---|---| +| Amount-as-narrative-prose | `contracts_db.contract_amounts.amount_text` | per-row recipient + awarding agency + NAICS description | `gpt-4o`, temp 0.95 | + +## Verifier audit (post-generation) + +| Layer | Total rows | Distinct narratives | Diversity ratio | Verifier mismatch rate | +|---|---|---|---|---| +| Amount-as-narrative | 9,744 | 9,737 | 99.93% | <1% | + +A narrative is included only if (a) it contains no digits, (b) it contains no banned magnitude-leak tokens for its band (e.g. a `tens of thousands or less` row may not contain `million`/`billion`/`seven-figure`/etc.), (c) its length is within 30–280 chars, and (d) an independent LLM classifier recovers the canonical band. Rows that fail all retry attempts fall back to the templated form. + +The narrative anchors on per-row context so each phrasing is substantively unique. Examples: +- `"This Department of Defense contract with LEIDOS, INC. represents a modest five-figure investment in computer systems design services."` (band: tens-of-thousands-or-less) +- `"For a defense giant like Lockheed Martin, this aircraft manufacturing contract reflects a substantial yet routine allocation within the eight-figure realm."` (band: tens-of-millions) + +## What the corruption preserves vs. discards + +Because the LLM rewrite is band-level (the verifier checks band, not exact +amount), an agent reading a narrative-corrupted row can recover the row's +magnitude band but cannot recover the precise dollar amount. 
So a question +like "total dollars awarded to California recipients" would not be answerable +to the cent from narrative-only rows. + +All ten queries are intentionally designed to be answerable from band-level +information. Dollar thresholds in queries align with band boundaries: +$1,000,000 (boundary between "hundreds of thousands" and "millions") in +Q1/Q2/Q4/Q8/Q9 and $10,000,000 (boundary between "millions" and "tens of +millions") in Q5/Q7. The agent classifies each row's narrative into a band, +then filters or counts. Q3, Q6, Q10 don't depend on amount at all. + +## Shipped artifact hashes (SHA-256) + +``` +3ec7cb010ad78868358b4a3cffe81a5adc037def483669d11a34bc3452253bb7 query_dataset/agencies.duckdb +7fdde155670e23568458570bdb17171b36f4789b49c95dde5461b03ca951a861 query_dataset/contracts.sql +a964552f7bbda8bb608046bae63714b791fcb90294e2956d9f1c8fe29b860518 query_dataset/descriptions/usaspending_descriptions/contract_documents.bson +008916d8764ba11e759be1499a78c0c090da2fa8b72ff409ed69623c2f1733b5 query_dataset/recipients.db +``` + +These pin the exact bytes of the agent-visible corrupted databases in this commit. Reviewers can verify with `shasum -a 256 query_usaspending/query_dataset/{agencies.duckdb,contracts.sql,recipients.db,descriptions/usaspending_descriptions/contract_documents.bson}`. + +## Reproducibility notes + +- The canonical pre-corruption snapshot (`clean/clean.sqlite`) and the corruption manifest (`clean/manifest.sqlite`) — which together constitute the answer key — are kept local-only (gitignored). +- Construction code lives in `manual_querycode/` and IS shipped in this repo (full source for `fetch_clean.py`, `corrupt.py`, `llm_corrupt.py`, `compute_ground_truth.py`). +- Re-running the corruption pipeline from scratch will not bit-reproduce the shipped artifacts because the LLM narrative steps are non-deterministic by design (temperature > 0). The deterministic transforms are bit-reproducible. 
Hashes above pin the specific byte-content shipped in this commit, regardless of regeneration path. +- Sonnet pass@1 on this dataset, plain mode, at the time of authoring: **5/10 = 50%**. + +## Construction-code reference + +Full source: `manual_querycode/fetch_clean.py`, `manual_querycode/corrupt.py`, `manual_querycode/llm_corrupt.py`, `manual_querycode/compute_ground_truth.py`. + +Pipeline order to regenerate from scratch: +1. `python manual_querycode/fetch_clean.py` — paginates the USAspending search-by-award API into `clean/clean.sqlite` +2. `python manual_querycode/corrupt.py` — emits the 4 agent-visible DBs into `query_dataset/`; deterministic transforms only (templated `amount_text` placeholder) +3. `python manual_querycode/llm_corrupt.py` — generates verifier-validated narrative amounts into `clean/manifest.sqlite` (Azure OpenAI; reads `.env`) +4. `python manual_querycode/corrupt.py` again — picks up the validated narratives from manifest and bakes them into the `contracts.sql` dump +5. `python manual_querycode/compute_ground_truth.py` — emits `queryN/ground_truth.csv` for all 10 queries from the canonical clean data + +### LLM prompt (amount-as-narrative) + +``` +You are obfuscating a federal contract's dollar amount for a benchmark. + +Write ONE sentence (40 - 280 characters) that *implies* the dollar magnitude +of the contract through narrative phrasing. The sentence MUST NOT contain any +digits, currency symbols, "USD", or precision words ("exactly", "precisely"). + +Constraint — anchor in contract-specific context. Reference at least one of +the contract details below NATURALLY in the sentence (recipient, agency, or +domain hint). This makes each narrative substantively unique per row. + +Constraint — VARY phrasing aggressively. Do NOT default to stock templates +like "in the hundreds of thousands range" or "a relatively modest award". +Each row should use distinct vocabulary, syntactic structure, and register. 
+Treat each row as a unique editorial paraphrase. Some registers to draw +from (rotate through them): +- terse procurement-style: "Mid-six-figure outlay for {{domain}} services." +- analyst-paragraph: "In context, this Air Force agreement sits among the + smaller-dollar logistics awards typical for the buyer." +- comparative: "A smaller commitment than {{recipient}}'s prior awards in the + same NAICS bracket." +- hedged: "Roughly an order of magnitude below the median large-vendor IT + modernization deal of recent quarters." +- domain-flavored: "Hardware-procurement-tier money — sub-million but + meaningful for the program office." +- counterfactual: "Were this an R&D award rather than O&M, the figure would + read large; for routine sustainment it is unremarkable." + +Magnitude bands (the sentence must clearly imply the band, but never use the +identical phrasing twice across different rows): +- billions: convey "billions" / "ten-figure" +- hundreds of millions: "nine-figure" / "hundreds of millions" +- tens of millions: "eight-figure" / "tens of millions" +- millions: "seven-figure" / "low-to-mid millions" +- hundreds of thousands: "six-figure" / "hundreds of thousands" +- tens of thousands or less: "five-figure or smaller" / "low five-figure" + / "modest sub-six-figure" + +Magnitude band for this row: {band} + +CRITICAL — band-specific constraints. The narrative must NOT contain words +that imply a higher band than the target band. Specifically: +- if band is "tens of thousands or less": the words "million", "millions", + "billion", "billions", "six-figure", "seven-figure", "eight-figure", + "nine-figure", "ten-figure", "hundreds of thousands" MUST NOT appear. + Use "five-figure" or "four-figure" or "small four/five-figure" / "tens of + thousands" / "modest five-figure" only. +- if band is "hundreds of thousands": the words "million", "millions", + "billion", "billions", "seven-figure", "eight-figure", "nine-figure", + "ten-figure" MUST NOT appear. 
Use "six-figure" / "hundreds of thousands" + only. +- if band is "millions": "billion(s)", "ten-figure", "nine-figure", + "eight-figure" MUST NOT appear; use "seven-figure" / "low-to-mid millions" + / "single-digit millions". +- if band is "tens of millions": "billion(s)", "ten-figure", "nine-figure" + MUST NOT appear; use "eight-figure" / "tens of millions". +- if band is "hundreds of millions": "billion(s)", "ten-figure" MUST NOT + appear; use "nine-figure" / "hundreds of millions" / "high nine-figure". +- if band is "billions": use "ten-figure" / "billions" / "multi-billion"; + do NOT use "trillion". + +Aim for tight, accurate magnitude phrasing — the goal is OBFUSCATING the +exact dollar value while being faithful about the order of magnitude. Even +if the recipient or agency is famous for big-dollar awards, you must reflect +the actual band of THIS row. + +Contract context (incorporate naturally — but do not include the contract id): +- recipient: {recipient} +- awarding agency: {agency} +- domain (NAICS description): {naics_desc} + +Output ONLY the sentence. No preamble. No digits. +``` + +Bands are defined as: + +```python +def _band(amount: float) -> str: + if amount >= 1_000_000_000: return "billions" + if amount >= 100_000_000: return "hundreds of millions" + if amount >= 10_000_000: return "tens of millions" + if amount >= 1_000_000: return "millions" + if amount >= 100_000: return "hundreds of thousands" + return "tens of thousands or less" +``` + +### Roundtrip verifier prompt + +Output is exactly one of `billions` / `hundreds of millions` / `tens of millions` / `millions` / `hundreds of thousands` / `tens of thousands or less`. Rejection if it disagrees with the canonical band. + +``` +Read the sentence below and decide which dollar-magnitude band it most +naturally describes. 
Choose EXACTLY one of: +- billions +- hundreds of millions +- tens of millions +- millions +- hundreds of thousands +- tens of thousands or less + +Output ONLY the band label, nothing else. + +Sentence: {sentence} +``` + +### Ground-truth-computation SQL highlights + +Q4 (canonical agency with highest >$1M-share among agencies with ≥10 contracts): + +```sql +SELECT awarding_agency, + COUNT(*) AS n, + SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS n_big +FROM contracts +WHERE awarding_agency IS NOT NULL +GROUP BY awarding_agency +HAVING n >= 10 +ORDER BY (1.0 * n_big / n) DESC, awarding_agency ASC +LIMIT 1 +``` + +Q6 (distinct canonical recipients with multiple UEIs after corporate-suffix +normalization): + +```python +import re +suffix_re = re.compile( + r"[\s,]+(inc\.?|incorporated|corp\.?|corporation|llc\.?|l\.l\.c\.?|" + r"co\.?|company|ltd\.?|limited)$", + re.IGNORECASE, +) +canon: dict[str, set[str]] = {} +for uei, name in c.execute( + "SELECT uei, name FROM recipients WHERE name IS NOT NULL" +): + n = name.strip().lower() + while True: + new_n = suffix_re.sub("", n).strip().rstrip(",") + if new_n == n: + break + n = new_n + canon.setdefault(n, set()).add(uei) +result = sum(1 for ueis in canon.values() if len(ueis) > 1) +``` + +Other queries' GT SQL is in `manual_querycode/compute_ground_truth.py` (one +function per query, each returning a string that becomes the queryN/ground_truth.csv). + diff --git a/query_usaspending/manual_querycode/compute_ground_truth.py b/query_usaspending/manual_querycode/compute_ground_truth.py new file mode 100644 index 000000000..ebc4f17b1 --- /dev/null +++ b/query_usaspending/manual_querycode/compute_ground_truth.py @@ -0,0 +1,207 @@ +"""Compute ground-truth from clean.sqlite + manifest.sqlite. +Each query incorporates >=2 DAB properties. 
+""" +from __future__ import annotations +import sqlite3 +import statistics +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +CLEAN_DB = ROOT / "clean" / "clean.sqlite" +MANIFEST_DB = ROOT / "clean" / "manifest.sqlite" + + +def q1(c, m): + """Count contracts in DOD (any surface form) with amount > $1M. + Multi-DB + ill-formatted (agency cluster) + unstructured (amount-as-text).""" + n = c.execute(""" + SELECT COUNT(*) FROM contracts + WHERE awarding_agency = 'Department of Defense' AND amount > 1000000 + """).fetchone()[0] + return str(n) + + +def q2(c, m): + """Multi-DB + ill-formatted (state) + unstructured (amount band): count + of contract awards whose recipient is in California (any surface form) + AND whose amount is greater than $1,000,000. + + Note: $1M aligns with the boundary between the 'hundreds of thousands' + and 'millions' magnitude bands, so this threshold is fully recoverable + from the narrative-corrupted amount_text. + """ + n = c.execute(""" + SELECT COUNT(*) + FROM contracts c + JOIN recipients r ON r.uei = c.recipient_uei + WHERE r.state = 'CA' AND c.amount IS NOT NULL AND c.amount > 1000000 + """).fetchone()[0] + return str(n) + + +def q3(c, m): + """Distinct UEIs in NAICS sector 33 (Manufacturing). + Multi-DB + ill-formatted (NAICS reformatting + UEI canon).""" + n = c.execute(""" + SELECT COUNT(DISTINCT recipient_uei) + FROM contracts + WHERE substr(naics_code, 1, 2) = '33' + AND recipient_uei IS NOT NULL + """).fetchone()[0] + return str(n) + + +def q4(c, m): + """Canonical agency (>=10 contracts) with highest share of contracts > $1M. 
+ Multi-DB + ill-formatted (agency cluster) + unstructured (amount) + chained agg.""" + rows = c.execute(""" + SELECT awarding_agency, + COUNT(*) AS n, + SUM(CASE WHEN amount > 1000000 THEN 1 ELSE 0 END) AS n_big + FROM contracts + WHERE awarding_agency IS NOT NULL + GROUP BY awarding_agency + HAVING n >= 10 + ORDER BY (1.0 * n_big / n) DESC, awarding_agency ASC + LIMIT 1 + """).fetchall() + return rows[0][0] if rows else "NONE" + + +def q5(c, m): + """Multi-DB + ill-formatted (NAICS) + unstructured (amount band): distinct + NAICS 2-digit sectors represented across contracts of at least $10,000,000. + + Note: $10M aligns with the boundary between the 'millions' and 'tens of + millions' magnitude bands, so this threshold is fully recoverable from the + narrative-corrupted amount_text. + """ + n = c.execute(""" + SELECT COUNT(DISTINCT substr(naics_code, 1, 2)) + FROM contracts + WHERE amount >= 10000000 AND naics_code IS NOT NULL + """).fetchone()[0] + return str(n) + + +def q6(c, m): + """Distinct canonical recipients that appear in recipients_db under MORE THAN + ONE UEI. Canonicalization: lowercase, strip whitespace, drop trailing + corporate suffix tokens (inc, inc., incorporated, corp, corp., corporation, + llc, l.l.c., co, co., company, ltd, ltd., limited). 
+ Multi-DB + ill-formatted (recipient name fuzz).""" + import re + suffix_re = re.compile( + r"[\s,]+(inc\.?|incorporated|corp\.?|corporation|llc\.?|l\.l\.c\.?|" + r"co\.?|company|ltd\.?|limited)$", + re.IGNORECASE, + ) + canon: dict[str, set[str]] = {} + for uei, name in c.execute( + "SELECT uei, name FROM recipients WHERE name IS NOT NULL" + ): + n = name.strip().lower() + # repeatedly strip trailing corporate suffix tokens + while True: + new_n = suffix_re.sub("", n).strip().rstrip(",") + if new_n == n: + break + n = new_n + canon.setdefault(n, set()).add(uei) + return str(sum(1 for ueis in canon.values() if len(ueis) > 1)) + + +def q7(c, m): + """Multi-DB + ill-formatted (NAICS) + unstructured (amount band): NAICS + 2-digit sector with the most contracts of at least $10,000,000. + + Note: $10M aligns with the boundary between the 'millions' and 'tens of + millions' magnitude bands. Counting rows with amount in the upper bands + is fully recoverable from the narrative-corrupted amount_text. + """ + row = c.execute(""" + SELECT substr(c.naics_code, 1, 2) AS sec, COUNT(*) AS n + FROM contracts c + WHERE c.naics_code IS NOT NULL + AND c.amount IS NOT NULL + AND c.amount >= 10000000 + GROUP BY sec + ORDER BY n DESC, sec ASC + LIMIT 1 + """).fetchone() + return row[0] + + +def q8(c, m): + """Multi-DB + ill-formatted (recipient fuzz + agency cluster) + unstructured + (amount band): count of contracts awarded to Lockheed Martin (across all + UEIs and recipient_name surface-form variants) by the Department of Defense + (across all agency surface-form variants) with amount greater than $1,000,000. + + Note: $1M aligns with the boundary between the 'hundreds of thousands' and + 'millions' magnitude bands, so this threshold is fully recoverable from + the narrative-corrupted amount_text. 
+ """ + n = c.execute(""" + SELECT COUNT(*) + FROM contracts + WHERE upper(recipient_name) LIKE '%LOCKHEED MARTIN%' + AND awarding_agency = 'Department of Defense' + AND amount IS NOT NULL + AND amount > 1000000 + """).fetchone()[0] + return str(n) + + +def q9(c, m): + """Contracts where English description was DROPPED AND amount > $1M. + Multi-DB (contracts+descriptions) + ill-formatted + unstructured (amount).""" + eng_dropped = {r[0] for r in m.execute("SELECT award_id FROM planted_eng_dropped")} + rows = c.execute(""" + SELECT award_id FROM contracts WHERE amount > 1000000 + """).fetchall() + return str(sum(1 for (aid,) in rows if aid in eng_dropped)) + + +def q10(c, m): + """Contracts with conflicting amount values AND from a top-10 recipient + (by contract count, after canonicalizing names case-insensitively). + Multi-DB + ill-formatted (recipient fuzz, duplicates).""" + # Top-10 recipients by canonical name (case-insensitive) + top10 = [ + r[0] for r in c.execute(""" + SELECT upper(recipient_name) AS uname, COUNT(*) AS n + FROM contracts WHERE recipient_name IS NOT NULL + GROUP BY uname + ORDER BY n DESC, uname ASC + LIMIT 10 + """) + ] + duped = {r[0] for r in m.execute("SELECT canonical_award_id FROM planted_duplicate")} + n = 0 + for award_id, rname in c.execute(""" + SELECT award_id, recipient_name FROM contracts + WHERE recipient_name IS NOT NULL + """): + if rname.upper() in top10 and award_id in duped: + n += 1 + return str(n) + + +QUERIES = {"1": q1, "2": q2, "3": q3, "4": q4, "5": q5, + "6": q6, "7": q7, "8": q8, "9": q9, "10": q10} + + +def main(): + c = sqlite3.connect(CLEAN_DB) + m = sqlite3.connect(MANIFEST_DB) + for qid, fn in QUERIES.items(): + ans = fn(c, m) + out = ROOT / f"query{qid}" / "ground_truth.csv" + out.parent.mkdir(parents=True, exist_ok=True) + out.write_text(ans + "\n", encoding="utf-8") + print(f"query{qid}: {ans}") + + +if __name__ == "__main__": + main() diff --git a/query_usaspending/manual_querycode/corrupt.py 
b/query_usaspending/manual_querycode/corrupt.py new file mode 100644 index 000000000..b3a5e4559 --- /dev/null +++ b/query_usaspending/manual_querycode/corrupt.py @@ -0,0 +1,582 @@ +"""Corrupt clean/clean.sqlite into 4 agent-visible DBs in query_dataset/. + +Engines (chosen so each engine matches the data's natural shape): + contracts_db postgres query_dataset/contracts.sql (transactional fact dump) + recipients_db sqlite query_dataset/recipients.db (small entity registry) + agencies_db duckdb query_dataset/agencies.duckdb (analytical reference) + descriptions_db mongo query_dataset/descriptions/ (nested per-contract docs) + +NO date-format corruption. Each non-date corruption is hash-deterministic. + +Layers applied: + + contracts_db (postgres): + - amount column DROPPED (numeric). Replaced with amount_text TEXT in a + sibling contract_amounts table — formats vary: "$1,500,000.00", "1.5M", + "1500000", "1,500,000 USD". + - award_id format varies per row: "HT940216C0001", "ht940216c0001", + "HT940216-C-0001". + - awarding_agency replaced with vendor-style surface variants + (e.g. Department of Defense -> "DoD", "DOD", "Dept of Defense", + "Department of Defense (DOD)"). + - naics_code reformatted: "336411" / "naics-336411" / "33-6411". + + recipients_db (sqlite): + - name column has multiple surface forms for the SAME canonical entity + (suffix variants: "Inc", "Inc.", "Incorporated", "Corp"/"Corporation", + trailing/leading whitespace, hyphenation). Real data already has some + of this; we layer on more. + - uei format varies: "ZE6ZM6NKSV43" / "uei:ze6zm6nksv43" / "ze6zm6nksv43-uei". + - state stored as varied surface form ("California" / "CA" / "Calif."). + + agencies_db (duckdb): + - agency name has surface-form variants (same as in contracts). + - canonical_to_alias lookup table that the agent can use to resolve + surface variants to canonical names. + - naics table preserves the 2-digit sector hierarchy. 
+ + descriptions_db (mongo): + - One document per award_id with embedded {description, language}. + - English description dropped for ~10% of contracts; only Spanish or + French paraphrase remains for those. + +Manifest tables (clean/manifest.sqlite — never agent-visible): + canonical_award_id (canonical_award_id, corrupted_award_id) + canonical_recipient (canonical_uei, canonical_name) + canonical_agency (canonical_agency, corrupted_surface) + canonical_naics (canonical_code, corrupted_code) + canonical_amount (canonical_award_id, canonical_amount, amount_text, format_id) + planted_eng_dropped (award_id) + planted_duplicate (canonical_award_id, original_amount, duplicate_amount) +""" +from __future__ import annotations +import hashlib +import json +import shutil +import sqlite3 +from pathlib import Path + +import duckdb + +ROOT = Path(__file__).resolve().parent.parent +CLEAN_DB = ROOT / "clean" / "clean.sqlite" +MANIFEST_DB = ROOT / "clean" / "manifest.sqlite" +QD = ROOT / "query_dataset" + +CONTRACTS_SQL = QD / "contracts.sql" +RECIPIENTS_DB = QD / "recipients.db" +AGENCIES_DB = QD / "agencies.duckdb" +DESC_DUMP = QD / "descriptions" + + +def h(*parts) -> int: + s = "|".join(str(p) for p in parts) + return int(hashlib.sha1(s.encode()).hexdigest(), 16) + + +# ---- amount-as-text formats ----------------------------------------------- +def _scaled(n: float, suffix: str, decimals: int = 1) -> str: + return f"{n:,.{decimals}f}{suffix}" + +AMOUNT_FORMATS = [ + lambda x: f"${x:,.2f}", + lambda x: f"{x:,.2f} USD", + lambda x: f"{x:.0f}", + lambda x: ( + _scaled(x / 1_000_000_000, "B") + if x >= 1_000_000_000 else _scaled(x / 1_000_000, "M") + if x >= 1_000_000 else _scaled(x / 1_000, "K") + ), +] + + +def amount_text(value: float, salt: str) -> tuple[str, int]: + if value is None: + return "", 0 + fid = h("amt", salt) % len(AMOUNT_FORMATS) + return AMOUNT_FORMATS[fid](value), fid + + +# ---- NAICS code formatting ------------------------------------------------- +def 
def naics_format(code: str, salt: str) -> str:
    """Reformat a NAICS code into one of three salt-chosen surface forms."""
    if not code:
        return code
    variant = h("naics", salt) % 3
    if variant == 1:
        return f"naics-{code}"
    if variant == 2 and len(code) >= 4:
        return f"{code[:2]}-{code[2:]}"
    return code


# ---- Agency name surface forms --------------------------------------------
# Canonical agency name -> every surface variant it may appear as.
AGENCY_VARIANTS = {
    "Department of Defense": ["Department of Defense", "DoD", "DOD", "Dept of Defense", "Department of Defense (DOD)", "Defense Department"],
    "Department of Energy": ["Department of Energy", "DOE", "Dept of Energy", "DoE"],
    "Department of Health and Human Services": ["Department of Health and Human Services", "HHS", "Dept HHS", "Health and Human Services"],
    "Department of Veterans Affairs": ["Department of Veterans Affairs", "VA", "Dept of Veterans Affairs"],
    "Department of Homeland Security": ["Department of Homeland Security", "DHS", "Dept Homeland Security"],
    "Department of State": ["Department of State", "State Dept", "Dept of State", "DOS"],
    "National Aeronautics and Space Administration": ["NASA", "National Aeronautics and Space Administration", "Nat'l Aeronautics & Space Administration"],
    "General Services Administration": ["General Services Administration", "GSA", "Gen Services Admin"],
    "Department of Justice": ["Department of Justice", "DOJ", "DoJ", "Justice Dept"],
    "Department of Transportation": ["Department of Transportation", "DOT", "DoT", "Transportation Dept"],
    "Department of the Interior": ["Department of the Interior", "DOI", "Interior Dept"],
    "Department of Agriculture": ["Department of Agriculture", "USDA", "Agriculture Dept"],
    "Department of Commerce": ["Department of Commerce", "DOC", "Commerce Dept"],
    "Department of the Treasury": ["Department of the Treasury", "Treasury", "Treasury Dept"],
    "Environmental Protection Agency": ["Environmental Protection Agency", "EPA"],
    "Department of Education": ["Department of Education", "ED", "Education Dept"],
    "Department of Labor": ["Department of Labor", "DOL", "Labor Dept"],
    "Department of Housing and Urban Development": ["Department of Housing and Urban Development", "HUD"],
    "Social Security Administration": ["Social Security Administration", "SSA"],
}


def agency_variant(canonical: str, salt: str) -> str:
    """Pick a salt-deterministic surface form for a canonical agency name."""
    pool = AGENCY_VARIANTS.get(canonical)
    return pool[h("ag", salt) % len(pool)] if pool else canonical


# ---- recipient name fuzzification -----------------------------------------
# (canonical suffix, interchangeable surface spellings) — order matters:
# the first variant whose spelling ends the name wins.
SUFFIX_VARIANTS = [
    ("INC", ["INC", "INC.", "INCORPORATED", "Inc", "Inc.", "Incorporated"]),
    ("CORP", ["CORP", "CORP.", "CORPORATION", "Corp", "Corp.", "Corporation"]),
    ("LLC", ["LLC", "LLC.", "L.L.C.", "L.L.C", "Llc"]),
    ("CO", ["CO", "CO.", "COMPANY", "Co.", "Company"]),
    ("LTD", ["LTD", "LTD.", "Limited", "Ltd."]),
]


def fuzz_recipient_name(name: str, salt: str) -> str:
    """Swap a recognized trailing corporate suffix for a salt-chosen variant;
    names without a suffix are title-cased half the time."""
    if not name:
        return name
    trimmed = name.strip()
    folded = trimmed.upper()
    for _canonical, variants in SUFFIX_VARIANTS:
        for candidate in variants:
            tail = " " + candidate
            if folded.endswith(tail.upper()):
                # Strip the matched suffix and append a hash-picked variant.
                stem = trimmed[: len(trimmed) - len(tail)]
                replacement = variants[h("rname", salt) % len(variants)]
                return stem + " " + replacement
    # No recognized suffix: keep as-is or title-case, split 50/50 by hash.
    return trimmed if h("rname-case", salt) % 2 == 0 else trimmed.title()


# ---- UEI format mixing ----------------------------------------------------
def uei_format(uei: str, salt: str) -> str:
    """Return the UEI in one of three salt-chosen surface forms."""
    if not uei:
        return uei
    variant = h("uei", salt) % 3
    if variant == 1:
        return f"uei:{uei.lower()}"
    if variant == 2:
        return f"{uei.lower()}-uei"
    return uei


# ---- Award ID format ------------------------------------------------------
def award_id_format(aid: str, salt: str) -> str:
    """Return the award id verbatim, lowercased, or hyphen-split by salt."""
    if not aid:
        return aid
    variant = h("aid", salt) % 3
    if variant == 0:
        return aid
    if variant == 1:
        return aid.lower()
    # Insert hyphens between the leading run, type letter, and serial digits
    # when the id matches the common PIID shape.
    import re
    parts = re.match(r"^([A-Z0-9]+?)([A-Z])(\d+)$", aid.upper())
    if parts:
        return f"{parts.group(1)}-{parts.group(2)}-{parts.group(3)}"
    return aid
numbers if pattern matches + import re + m = re.match(r"^([A-Z0-9]+?)([A-Z])(\d+)$", aid.upper()) + if m: + return f"{m.group(1)}-{m.group(2)}-{m.group(3)}" + return aid + + +# ---- State name variants -------------------------------------------------- +STATE_VARIANTS = { + "CA": ["CA", "California", "Calif.", "Calif"], + "NY": ["NY", "New York", "N.Y."], + "TX": ["TX", "Texas", "Tex."], + "VA": ["VA", "Virginia", "Va."], + "DC": ["DC", "District of Columbia", "D.C."], + "MA": ["MA", "Massachusetts", "Mass."], + "MD": ["MD", "Maryland", "Md."], + "FL": ["FL", "Florida", "Fla."], + "WA": ["WA", "Washington", "Wash."], + "PA": ["PA", "Pennsylvania", "Penn.", "Penna."], + "OH": ["OH", "Ohio"], + "IL": ["IL", "Illinois", "Ill."], + "MI": ["MI", "Michigan", "Mich."], + "AZ": ["AZ", "Arizona", "Ariz."], + "GA": ["GA", "Georgia", "Ga."], + "NJ": ["NJ", "New Jersey", "N.J."], + "CO": ["CO", "Colorado", "Colo."], + "MO": ["MO", "Missouri", "Mo."], +} + + +def state_variant(s: str, salt: str) -> str: + if not s: + return s + pool = STATE_VARIANTS.get(s) + if not pool: + return s + return pool[h("st", salt) % len(pool)] + + +# ---- duplicate / language drop selectors ---------------------------------- +DROP_ENGLISH_RATE = 10 +DUPLICATE_RATE = 30 + + +def should_drop_english(award_id: str) -> bool: + return h("drop-eng", award_id) % DROP_ENGLISH_RATE == 0 + + +def should_duplicate(award_id: str) -> bool: + return h("dup", award_id) % DUPLICATE_RATE == 0 + + +def conflicting_amount(orig: float, salt: str) -> float: + """Multiply or divide the amount by a deterministic scaling factor.""" + factor = [0.5, 1.5, 2.0, 0.75][h("dup-amt", salt) % 4] + return round(orig * factor, 2) + + +# ---- pipeline ------------------------------------------------------------- + +def init_manifest(conn: sqlite3.Connection) -> None: + conn.executescript(""" + DROP TABLE IF EXISTS canonical_award_id; + DROP TABLE IF EXISTS canonical_recipient; + DROP TABLE IF EXISTS canonical_agency; + DROP TABLE IF 
EXISTS canonical_naics; + DROP TABLE IF EXISTS canonical_amount; + DROP TABLE IF EXISTS planted_eng_dropped; + DROP TABLE IF EXISTS planted_duplicate; + + CREATE TABLE canonical_award_id ( + canonical_award_id TEXT PRIMARY KEY, corrupted_award_id TEXT + ); + CREATE TABLE canonical_recipient ( + canonical_uei TEXT PRIMARY KEY, canonical_name TEXT + ); + CREATE TABLE canonical_agency ( + award_id TEXT, canonical_agency TEXT, corrupted_surface TEXT + ); + CREATE TABLE canonical_naics ( + canonical_code TEXT PRIMARY KEY, corrupted_code TEXT + ); + CREATE TABLE canonical_amount ( + canonical_award_id TEXT PRIMARY KEY, + canonical_amount REAL, + amount_text TEXT, + format_id INTEGER + ); + CREATE TABLE planted_eng_dropped (award_id TEXT PRIMARY KEY); + CREATE TABLE planted_duplicate ( + canonical_award_id TEXT PRIMARY KEY, + original_amount REAL, + duplicate_amount REAL + ); + """) + conn.commit() + + +def pgesc(s): + if s is None: + return "NULL" + s = str(s).replace("'", "''") + return "'" + s + "'" + + +def pgnum(x): + if x is None: + return "NULL" + return f"{x:.2f}" + + +def _load_planted_amounts(manifest: sqlite3.Connection) -> dict[str, str]: + if manifest.execute( + "SELECT name FROM sqlite_master WHERE type='table' AND name='planted_narrative_amount'" + ).fetchone(): + return {r[0]: r[1] for r in manifest.execute( + "SELECT canonical_award_id, narrative FROM planted_narrative_amount" + )} + return {} + + +def build_contracts(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None: + if CONTRACTS_SQL.exists(): + CONTRACTS_SQL.unlink() + cur = clean.cursor() + mcur = manifest.cursor() + rows = cur.execute(""" + SELECT award_id, generated_internal_id, recipient_name, recipient_uei, + recipient_state, awarding_agency, awarding_sub_agency, + funding_agency, amount, total_outlays, start_date, end_date, + naics_code, naics_description, psc_code, psc_description, + award_type + FROM contracts + """).fetchall() + + lines = [ + "CREATE TABLE contracts (", + " 
award_id TEXT,", + " generated_internal_id TEXT,", + " recipient_uei TEXT,", + " awarding_agency TEXT,", + " awarding_sub_agency TEXT,", + " funding_agency TEXT,", + " start_date TEXT,", + " end_date TEXT,", + " naics_code TEXT,", + " psc_code TEXT,", + " award_type TEXT", + ");", + "CREATE TABLE contract_amounts (", + " award_id TEXT,", + " amount_text TEXT", + ");", + ] + + narr_amount = _load_planted_amounts(manifest) + n_dup = 0 + n_narr = 0 + for r in rows: + (award_id, gen_id, rname, ruei, rstate, agency, sub_agency, fund_agency, + amt, outlays, sd, ed, naics, naics_desc, psc, psc_desc, atype) = r + # Corrupt ids/values + a_corr = award_id_format(award_id, award_id) if award_id else None + uei_corr = uei_format(ruei, ruei or "") + agency_corr = agency_variant(agency, award_id or "") + sub_corr = agency_variant(sub_agency or "", award_id or "") + fund_corr = agency_variant(fund_agency or "", award_id or "") + naics_corr = naics_format(naics, award_id or "") + + lines.append( + "INSERT INTO contracts VALUES (" + ", ".join([ + pgesc(a_corr), pgesc(gen_id), pgesc(uei_corr), + pgesc(agency_corr), pgesc(sub_corr), pgesc(fund_corr), + pgesc(sd), pgesc(ed), + pgesc(naics_corr), pgesc(psc), pgesc(atype), + ]) + ");" + ) + # amount in sibling table as text — prefer LLM narrative if available + if amt is not None: + if award_id in narr_amount: + atext = narr_amount[award_id] + fid = -1 # signals narrative origin + n_narr += 1 + else: + atext, fid = amount_text(amt, award_id or "") + lines.append( + f"INSERT INTO contract_amounts VALUES ({pgesc(a_corr)}, {pgesc(atext)});" + ) + mcur.execute( + "INSERT OR REPLACE INTO canonical_amount VALUES (?,?,?,?)", + (award_id, amt, atext, fid), + ) + # duplicate row with conflicting amount + if should_duplicate(award_id or "") and amt is not None: + dup_amt = conflicting_amount(amt, award_id or "") + dup_atext, _ = amount_text(dup_amt, (award_id or "") + "-dup") + lines.append( + f"INSERT INTO contract_amounts VALUES 
({pgesc(a_corr)}, {pgesc(dup_atext)});" + ) + mcur.execute( + "INSERT OR REPLACE INTO planted_duplicate VALUES (?,?,?)", + (award_id, amt, dup_amt), + ) + n_dup += 1 + + mcur.execute( + "INSERT OR REPLACE INTO canonical_award_id VALUES (?,?)", + (award_id, a_corr), + ) + if agency: + mcur.execute( + "INSERT INTO canonical_agency VALUES (?,?,?)", + (award_id, agency, agency_corr), + ) + + CONTRACTS_SQL.write_text("\n".join(lines) + "\n", encoding="utf-8") + manifest.commit() + print( + f"contracts.sql: built ({len(rows)} rows, {n_narr} LLM amount narratives, " + f"{n_dup} duplicate-amount injections)" + ) + + +def build_recipients(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None: + if RECIPIENTS_DB.exists(): + RECIPIENTS_DB.unlink() + out = sqlite3.connect(RECIPIENTS_DB) + out.execute(""" + CREATE TABLE recipients ( + uei TEXT, + name TEXT, + state TEXT, + n_contracts INTEGER, + total_amount_text TEXT + ); + """) + cur = clean.cursor() + mcur = manifest.cursor() + rows = cur.execute( + "SELECT uei, name, state, n_contracts, total_amount FROM recipients" + ).fetchall() + for uei, name, state, ncon, total in rows: + uei_corr = uei_format(uei, uei or "") + name_corr = fuzz_recipient_name(name or "", uei or "") + state_corr = state_variant(state, uei or "") + total_text, _ = amount_text(total or 0, uei or "") + out.execute( + "INSERT INTO recipients VALUES (?,?,?,?,?)", + (uei_corr, name_corr, state_corr, ncon, total_text), + ) + mcur.execute( + "INSERT OR REPLACE INTO canonical_recipient VALUES (?,?)", + (uei, name), + ) + out.commit() + out.close() + manifest.commit() + print(f"recipients.db: built ({len(rows)} rows)") + + +def build_agencies(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None: + if AGENCIES_DB.exists(): + AGENCIES_DB.unlink() + con = duckdb.connect(str(AGENCIES_DB)) + con.execute(""" + CREATE TABLE agencies ( + name TEXT, + n_contracts INTEGER, + total_amount_text TEXT + ); + CREATE TABLE agency_aliases ( + surface_form 
def build_agencies(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None:
    """Emit query_dataset/agencies.duckdb: canonical agencies, an alias
    lookup the agent can resolve surface variants with, and NAICS sectors."""
    if AGENCIES_DB.exists():
        AGENCIES_DB.unlink()
    db = duckdb.connect(str(AGENCIES_DB))
    db.execute("""
        CREATE TABLE agencies (
            name TEXT,
            n_contracts INTEGER,
            total_amount_text TEXT
        );
        CREATE TABLE agency_aliases (
            surface_form TEXT PRIMARY KEY,
            canonical_name TEXT
        );
        CREATE TABLE naics_sectors (
            code TEXT,
            description TEXT,
            sector TEXT
        );
    """)
    src = clean.cursor()
    mf = manifest.cursor()

    # agencies keeps canonical names; agency_aliases maps every known surface
    # variant (as used in contracts_db) back to its canonical name.
    agency_rows = []
    alias_pairs = set()
    for name, ncon, total in src.execute(
        "SELECT name, n_contracts, total_amount FROM agencies"
    ).fetchall():
        agency_rows.append((name, ncon, amount_text(total or 0, name)[0]))
        for surface in AGENCY_VARIANTS.get(name, [name]):
            alias_pairs.add((surface, name))
    db.executemany("INSERT INTO agencies VALUES (?,?,?)", agency_rows)
    db.executemany("INSERT INTO agency_aliases VALUES (?,?)", sorted(alias_pairs))

    # naics_sectors deliberately stays canonical: it is the source-of-truth
    # the agent must canonicalize the corrupted contracts.naics_code against.
    naics_rows = []
    for code, description, sector in src.execute(
        "SELECT code, description, sector FROM naics"
    ):
        naics_rows.append((code, description, sector))
        mf.execute(
            "INSERT OR REPLACE INTO canonical_naics VALUES (?,?)",
            (code, naics_format(code, "naics-row-" + code)),
        )
    db.executemany("INSERT INTO naics_sectors VALUES (?,?,?)", naics_rows)
    db.close()
    manifest.commit()
    print(
        f"agencies.duckdb: built ({len(agency_rows)} agencies, "
        f"{len(alias_pairs)} aliases, {len(naics_rows)} naics rows)"
    )


def build_descriptions(clean: sqlite3.Connection, manifest: sqlite3.Connection) -> None:
    """Emit a mongorestore-compatible dump with one document per award;
    ~10% of awards lose their English description (Spanish paraphrase only)."""
    if DESC_DUMP.exists():
        shutil.rmtree(DESC_DUMP)
    DESC_DUMP.mkdir(parents=True, exist_ok=True)

    mf = manifest.cursor()
    docs = []
    n_eng_dropped = 0
    for award_id, desc in clean.execute(
        "SELECT award_id, description FROM contracts"
    ):
        a_corr = award_id_format(award_id or "", award_id or "")
        descs = []
        if desc:
            if should_drop_english(award_id or ""):
                # Planted corruption: only a Spanish paraphrase survives.
                descs.append({"language": "es", "value": "[contrato federal] " + (desc[:200] if desc else "")})
                mf.execute(
                    "INSERT OR REPLACE INTO planted_eng_dropped VALUES (?)",
                    (award_id,),
                )
                n_eng_dropped += 1
            else:
                descs.append({"language": "en", "value": desc})
        docs.append({"award_id": a_corr, "descriptions": descs})

    from bson import encode as bson_encode  # third-party; deferred import
    coll_dir = DESC_DUMP / "usaspending_descriptions"
    coll_dir.mkdir(parents=True, exist_ok=True)
    with (coll_dir / "contract_documents.bson").open("wb") as fh:
        for doc in docs:
            fh.write(bson_encode(doc))
    meta = {
        "options": {},
        "indexes": [
            {"v": 2, "key": {"_id": 1}, "name": "_id_"},
            {"v": 2, "key": {"award_id": 1}, "name": "award_id_1"},
        ],
        "uuid": "",
        "collectionName": "contract_documents",
        "type": "collection",
    }
    (coll_dir / "contract_documents.metadata.json").write_text(
        json.dumps(meta), encoding="utf-8"
    )
    manifest.commit()
    print(
        f"descriptions/: built ({len(docs)} docs, {n_eng_dropped} english-dropped)"
    )
{n_eng_dropped} english-dropped)" + ) + + +def main(): + if not CLEAN_DB.exists(): + raise SystemExit(f"missing clean snapshot: {CLEAN_DB}") + QD.mkdir(parents=True, exist_ok=True) + clean = sqlite3.connect(CLEAN_DB) + manifest = sqlite3.connect(MANIFEST_DB) + init_manifest(manifest) + build_contracts(clean, manifest) + build_recipients(clean, manifest) + build_agencies(clean, manifest) + build_descriptions(clean, manifest) + clean.close() + manifest.close() + print("OK") + + +if __name__ == "__main__": + main() diff --git a/query_usaspending/manual_querycode/fetch_clean.py b/query_usaspending/manual_querycode/fetch_clean.py new file mode 100644 index 000000000..864c18995 --- /dev/null +++ b/query_usaspending/manual_querycode/fetch_clean.py @@ -0,0 +1,272 @@ +"""Fetch a bounded USAspending snapshot into clean/clean.sqlite. + +Scope (kept small to keep dataset under ~50MB): + - Contract awards: FY2024 Q4 (2024-07-01 .. 2024-09-30), bounded to ~25k by amount + - Derived recipient registry from those awards + - Derived awarding-agency hierarchy from those awards + - NAICS hierarchy: top-level (2-digit) sector descriptions, public reference + +Sources: + - https://api.usaspending.gov/api/v2/search/spending_by_award/ (paginated) + - https://api.usaspending.gov/api/v2/references/naics/ (reference) + +This is the canonical un-corrupted snapshot. Never seen by the agent. +""" +from __future__ import annotations +import json +import sqlite3 +import sys +import time +import urllib.request +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +CLEAN_DB = ROOT / "clean" / "clean.sqlite" +SEARCH_URL = "https://api.usaspending.gov/api/v2/search/spending_by_award/" + +START = "2024-07-01" +END = "2024-09-30" +PAGE = 100 +TARGET_ROWS = 25000 # cap + + +# Fields to request from the API. The API will return them under these keys. 
# Fields to request from the API. The API will return them under these keys.
FIELDS = [
    "Award ID", "Recipient Name", "Recipient UEI", "Recipient Location",
    "Awarding Agency", "Awarding Sub Agency", "Funding Agency",
    "Award Amount", "Total Outlays", "Description",
    "Start Date", "End Date",
    "NAICS", "PSC", "Contract Award Type", "recipient_id",
]

# Contract award type codes A-D (BPA call / purchase order / delivery order /
# definitive contract families).
AWARD_TYPES = ["A", "B", "C", "D"]


def _post(url: str, body: dict, retries: int = 5) -> dict:
    """POST JSON to url with exponential-backoff retries; return parsed JSON.

    Re-raises the last error once all retries are exhausted.
    """
    payload = json.dumps(body).encode()
    for attempt in range(retries):
        try:
            req = urllib.request.Request(
                url, data=payload, method="POST",
                headers={
                    "Content-Type": "application/json",
                    "User-Agent": "DataAgentBench/usaspending",
                },
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                return json.loads(resp.read())
        except Exception as exc:
            if attempt == retries - 1:
                raise
            backoff = 2 ** attempt
            print(f" retry {attempt+1}/{retries} after {backoff}s: {str(exc)[:120]}", file=sys.stderr)
            time.sleep(backoff)


def init_schema(conn: sqlite3.Connection) -> None:
    """Create the clean-snapshot schema, dropping any existing tables."""
    conn.executescript("""
    DROP TABLE IF EXISTS contracts;
    DROP TABLE IF EXISTS recipients;
    DROP TABLE IF EXISTS agencies;
    DROP TABLE IF EXISTS naics;

    CREATE TABLE contracts (
        award_id TEXT PRIMARY KEY,
        generated_internal_id TEXT,
        recipient_name TEXT,
        recipient_uei TEXT,
        recipient_state TEXT,
        awarding_agency TEXT,
        awarding_sub_agency TEXT,
        funding_agency TEXT,
        amount REAL,
        total_outlays REAL,
        description TEXT,
        start_date TEXT,
        end_date TEXT,
        naics_code TEXT,
        naics_description TEXT,
        psc_code TEXT,
        psc_description TEXT,
        award_type TEXT
    );
    CREATE TABLE recipients (
        uei TEXT PRIMARY KEY,
        name TEXT,
        state TEXT,
        n_contracts INTEGER,
        total_amount REAL
    );
    CREATE TABLE agencies (
        name TEXT PRIMARY KEY,
        n_contracts INTEGER,
        total_amount REAL
    );
    CREATE TABLE naics (
        code TEXT PRIMARY KEY,
        description TEXT,
        sector TEXT
    );

    CREATE INDEX idx_contracts_recipient ON contracts(recipient_uei);
    CREATE INDEX idx_contracts_agency ON contracts(awarding_agency);
    CREATE INDEX idx_contracts_naics ON contracts(naics_code);
    """)
    conn.commit()


def fetch_contracts(conn: sqlite3.Connection) -> int:
    """Page through spending_by_award, ingesting up to ~TARGET_ROWS contracts.

    Returns the number of result rows processed (INSERT OR REPLACE dedupes
    on award_id, so the stored row count can be lower).
    """
    cur = conn.cursor()
    page = 1
    total = 0
    while total < TARGET_ROWS:
        body = {
            "filters": {
                "award_type_codes": AWARD_TYPES,
                "time_period": [{"start_date": START, "end_date": END}],
            },
            "fields": FIELDS,
            "page": page,
            "limit": PAGE,
            # Sort by start date (not amount) so we get a representative spread
            # of contract sizes — small, medium, and large — instead of only the
            # top-N by amount.
            "sort": "Start Date",
            "order": "desc",
        }
        print(f"contracts page={page} total={total}", flush=True)
        resp = _post(SEARCH_URL, body)
        items = resp.get("results", [])
        if not items:
            break
        for item in items:
            naics = item.get("NAICS") or {}
            psc = item.get("PSC") or {}
            loc = item.get("Recipient Location") or {}
            cur.execute(
                "INSERT OR REPLACE INTO contracts VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                (
                    item.get("Award ID"),
                    item.get("generated_internal_id"),
                    item.get("Recipient Name"),
                    item.get("Recipient UEI") or item.get("recipient_id"),
                    loc.get("state_code") if isinstance(loc, dict) else None,
                    item.get("Awarding Agency"),
                    item.get("Awarding Sub Agency"),
                    item.get("Funding Agency"),
                    item.get("Award Amount"),
                    item.get("Total Outlays"),
                    item.get("Description"),
                    item.get("Start Date"),
                    item.get("End Date"),
                    naics.get("code") if isinstance(naics, dict) else None,
                    naics.get("description") if isinstance(naics, dict) else None,
                    psc.get("code") if isinstance(psc, dict) else None,
                    psc.get("description") if isinstance(psc, dict) else None,
                    item.get("Contract Award Type"),
                ),
            )
        conn.commit()
        total += len(items)
        if not resp.get("page_metadata", {}).get("hasNext", False):
            break
        page += 1
        time.sleep(0.5)
    print(f"contracts: {total} rows ingested", flush=True)
    return total
def derive_recipients(conn: sqlite3.Connection) -> int:
    """Rebuild the recipients registry by aggregating contracts per UEI.

    Uses MAX() to pick a representative name/state per UEI; returns the
    number of recipient rows created.
    """
    cur = conn.cursor()
    cur.execute("DELETE FROM recipients")
    cur.execute("""
        INSERT INTO recipients (uei, name, state, n_contracts, total_amount)
        SELECT recipient_uei,
               MAX(recipient_name),
               MAX(recipient_state),
               COUNT(*),
               COALESCE(SUM(amount), 0)
        FROM contracts
        WHERE recipient_uei IS NOT NULL
        GROUP BY recipient_uei
    """)
    n = cur.execute("SELECT COUNT(*) FROM recipients").fetchone()[0]
    conn.commit()
    print(f"recipients: {n} derived")
    return n


def derive_agencies(conn: sqlite3.Connection) -> int:
    """Rebuild the awarding-agency rollup from contracts.

    Returns the number of agency rows created.
    """
    cur = conn.cursor()
    cur.execute("DELETE FROM agencies")
    cur.execute("""
        INSERT INTO agencies (name, n_contracts, total_amount)
        SELECT awarding_agency, COUNT(*), COALESCE(SUM(amount), 0)
        FROM contracts
        WHERE awarding_agency IS NOT NULL
        GROUP BY awarding_agency
    """)
    n = cur.execute("SELECT COUNT(*) FROM agencies").fetchone()[0]
    conn.commit()
    print(f"agencies: {n} derived")
    return n


def fetch_naics(conn: sqlite3.Connection) -> int:
    """Pull NAICS sector hierarchy from USAspending's reference endpoint.

    Falls back to contract-derived codes when the endpoint is unreachable.
    Returns the number of NAICS codes stored.
    """
    cur = conn.cursor()
    cur.execute("DELETE FROM naics")
    url = "https://api.usaspending.gov/api/v2/references/naics/"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": "DataAgentBench"})
        with urllib.request.urlopen(req, timeout=60) as r:
            data = json.loads(r.read())
    except Exception as e:
        print(f"naics ref: failed ({e}); using contract-derived NAICS only", file=sys.stderr)
        cur.execute("""
            INSERT INTO naics (code, description, sector)
            SELECT DISTINCT naics_code, naics_description,
                   substr(naics_code, 1, 2)
            FROM contracts WHERE naics_code IS NOT NULL
        """)
        n = cur.execute("SELECT COUNT(*) FROM naics").fetchone()[0]
        conn.commit()
        return n
    n = 0

    def walk(node, sector=None):
        nonlocal n
        code = node.get("naics")
        desc = node.get("naics_description")
        # BUGFIX: `sec` used to be bound only when `code` was truthy, so a
        # node without a code raised NameError on the recursive walk(child,
        # sec) call. Default to the inherited sector instead.
        sec = sector
        if code:
            sec = sector if sector else (str(code)[:2] if len(str(code)) >= 2 else None)
            cur.execute(
                "INSERT OR REPLACE INTO naics VALUES (?,?,?)",
                (str(code), desc, sec),
            )
            n += 1
        for child in (node.get("children") or []):
            walk(child, sec)

    for top in data.get("results", []):
        walk(top)
    conn.commit()
    print(f"naics: {n} codes")
    return n


def main():
    """Build a fresh clean snapshot: fetch contracts, derive rollups, NAICS."""
    CLEAN_DB.parent.mkdir(parents=True, exist_ok=True)
    if CLEAN_DB.exists():
        CLEAN_DB.unlink()
    conn = sqlite3.connect(CLEAN_DB)
    init_schema(conn)
    fetch_contracts(conn)
    derive_recipients(conn)
    derive_agencies(conn)
    fetch_naics(conn)
    conn.close()
    print(f"OK: clean snapshot at {CLEAN_DB}")


if __name__ == "__main__":
    main()
— softer here: + we allow rough words like "million" / "billion" / "thousand" but NOT exact + counts) + * MUST be 1 short sentence, 30 - 250 chars + * MUST be in English (ascii ratio > 0.9) + +Outputs landed in clean/manifest.sqlite: + planted_narrative_amount (canonical_award_id, canonical_amount, narrative) +""" +from __future__ import annotations +import argparse +import os +import re +import sqlite3 +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path + +ROOT = Path(__file__).resolve().parent.parent +CLEAN_DB = ROOT / "clean" / "clean.sqlite" +MANIFEST_DB = ROOT / "clean" / "manifest.sqlite" +ENV_FILE = ROOT.parent / ".env" + + +def _load_env(): + if not ENV_FILE.exists(): + return + for line in ENV_FILE.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, v = line.split("=", 1) + os.environ.setdefault(k.strip(), v.strip()) + + +_load_env() + + +def _client(): + from openai import AzureOpenAI + return AzureOpenAI( + api_key=os.environ["AZURE_API_KEY"], + azure_endpoint=os.environ["AZURE_API_BASE"], + api_version=os.environ["AZURE_API_VERSION"], + ) + + +DEPLOYMENT = os.environ.get("AZURE_DEPLOYMENT", "gpt-4o") + + +# Block only explicit-precision tokens. The word "dollars" / "$" is fine +# because it doesn't leak the magnitude — we enforce no-digits separately. +DENY = {"exactly", "precisely"} + + +PROMPT = """You are obfuscating a federal contract's dollar amount for a benchmark. + +Write ONE sentence (40 - 280 characters) that *implies* the dollar magnitude +of the contract through narrative phrasing. The sentence MUST NOT contain any +digits, currency symbols, "USD", or precision words ("exactly", "precisely"). + +Constraint — anchor in contract-specific context. Reference at least one of +the contract details below NATURALLY in the sentence (recipient, agency, or +domain hint). 
This makes each narrative substantively unique per row. + +Constraint — VARY phrasing aggressively. Do NOT default to stock templates +like "in the hundreds of thousands range" or "a relatively modest award". +Each row should use distinct vocabulary, syntactic structure, and register. +Treat each row as a unique editorial paraphrase. Some registers to draw +from (rotate through them): +- terse procurement-style: "Mid-six-figure outlay for {{domain}} services." +- analyst-paragraph: "In context, this Air Force agreement sits among the + smaller-dollar logistics awards typical for the buyer." +- comparative: "A smaller commitment than {{recipient}}'s prior awards in the + same NAICS bracket." +- hedged: "Roughly an order of magnitude below the median large-vendor IT + modernization deal of recent quarters." +- domain-flavored: "Hardware-procurement-tier money — sub-million but + meaningful for the program office." +- counterfactual: "Were this an R&D award rather than O&M, the figure would + read large; for routine sustainment it is unremarkable." + +Magnitude bands (the sentence must clearly imply the band, but never use the +identical phrasing twice across different rows): +- billions: convey "billions" / "ten-figure" +- hundreds of millions: "nine-figure" / "hundreds of millions" +- tens of millions: "eight-figure" / "tens of millions" +- millions: "seven-figure" / "low-to-mid millions" +- hundreds of thousands: "six-figure" / "hundreds of thousands" +- tens of thousands or less: "five-figure or smaller" / "low five-figure" + / "modest sub-six-figure" + +Magnitude band for this row: {band} + +CRITICAL — band-specific constraints. The narrative must NOT contain words +that imply a higher band than the target band. Specifically: +- if band is "tens of thousands or less": the words "million", "millions", + "billion", "billions", "six-figure", "seven-figure", "eight-figure", + "nine-figure", "ten-figure", "hundreds of thousands" MUST NOT appear. 
+ Use "five-figure" or "four-figure" or "small four/five-figure" / "tens of + thousands" / "modest five-figure" only. +- if band is "hundreds of thousands": the words "million", "millions", + "billion", "billions", "seven-figure", "eight-figure", "nine-figure", + "ten-figure" MUST NOT appear. Use "six-figure" / "hundreds of thousands" + only. +- if band is "millions": "billion(s)", "ten-figure", "nine-figure", + "eight-figure" MUST NOT appear; use "seven-figure" / "low-to-mid millions" + / "single-digit millions". +- if band is "tens of millions": "billion(s)", "ten-figure", "nine-figure" + MUST NOT appear; use "eight-figure" / "tens of millions". +- if band is "hundreds of millions": "billion(s)", "ten-figure" MUST NOT + appear; use "nine-figure" / "hundreds of millions" / "high nine-figure". +- if band is "billions": use "ten-figure" / "billions" / "multi-billion"; + do NOT use "trillion". + +Aim for tight, accurate magnitude phrasing — the goal is OBFUSCATING the +exact dollar value while being faithful about the order of magnitude. Even +if the recipient or agency is famous for big-dollar awards, you must reflect +the actual band of THIS row. + +Contract context (incorporate naturally — but do not include the contract id): +- recipient: {recipient} +- awarding agency: {agency} +- domain (NAICS description): {naics_desc} + +Output ONLY the sentence. No preamble. No digits. 
+""" + + +def _band(amount: float) -> str: + if amount >= 1_000_000_000: + return "billions" + if amount >= 100_000_000: + return "hundreds of millions" + if amount >= 10_000_000: + return "tens of millions" + if amount >= 1_000_000: + return "millions" + if amount >= 100_000: + return "hundreds of thousands" + return "tens of thousands or less" + + +def _length_ok(s: str) -> bool: + return 30 <= len(s) <= 250 + + +def _ascii_ratio(s: str) -> float: + if not s: + return 1.0 + return sum(1 for c in s if ord(c) < 128) / len(s) + + +def _has_banned(text: str) -> str | None: + text_l = text.lower() + for w in DENY: + if w in text_l: + return w + return None + + +VERIFY_PROMPT = """Read the sentence below and decide which dollar-magnitude +band it most naturally describes. Choose EXACTLY one of: +- billions +- hundreds of millions +- tens of millions +- millions +- hundreds of thousands +- tens of thousands or less + +Output ONLY the band label, nothing else. + +Sentence: {sentence} +""" + + +VALID_BANDS = { + "billions", + "hundreds of millions", + "tens of millions", + "millions", + "hundreds of thousands", + "tens of thousands or less", +} + + +def classify_amount(client, narrative: str) -> str | None: + """Roundtrip verifier: ask the LLM what band the narrative implies.""" + try: + r = client.chat.completions.create( + model=DEPLOYMENT, + messages=[{"role": "user", "content": VERIFY_PROMPT.format(sentence=narrative)}], + max_tokens=20, + temperature=0.0, + ) + out = (r.choices[0].message.content or "").strip().lower().strip(".") + # find the longest matching band + for b in sorted(VALID_BANDS, key=len, reverse=True): + if b in out: + return b + return None + except Exception: + return None + + +def rewrite_amount(client, award_id: str, amount: float, + recipient: str = "", agency: str = "", naics_desc: str = "", + retries: int = 3) -> str | None: + band = _band(amount) + last = None + for attempt in range(retries): + try: + r = client.chat.completions.create( + 
model=DEPLOYMENT, + messages=[{"role": "user", "content": PROMPT.format( + band=band, + recipient=recipient or "(unspecified)", + agency=agency or "(unspecified)", + naics_desc=naics_desc or "(unspecified)", + )}], + max_tokens=120, + temperature=0.95, + ) + out = (r.choices[0].message.content or "").strip() + # Strip wrapping quotes if any + out = out.strip('"').strip("'") + if re.search(r"\d", out): + last = "contains digits" + continue + bh = _has_banned(out) + if bh: + last = f"banned: {bh!r}" + continue + if not _length_ok(out): + last = f"length {len(out)}" + continue + if _ascii_ratio(out) < 0.9: + last = "non-english" + continue + # Roundtrip verification: a second LLM call must classify the + # narrative back into the same band as the canonical amount. + implied = classify_amount(client, out) + if implied != band: + last = f"verifier disagreed: implied={implied!r} canonical={band!r}" + continue + return out + except Exception as e: + last = str(e)[:120] + time.sleep(1.5 ** attempt) + print(f" [skip {award_id}] {last}", file=sys.stderr) + return None + + +def _ensure_manifest(): + conn = sqlite3.connect(MANIFEST_DB) + conn.executescript(""" + CREATE TABLE IF NOT EXISTS planted_narrative_amount ( + canonical_award_id TEXT PRIMARY KEY, + canonical_amount REAL, + narrative TEXT + ); + """) + conn.commit() + conn.close() + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--workers", type=int, default=16) + ap.add_argument("--limit", type=int, default=None) + args = ap.parse_args() + + _ensure_manifest() + client = _client() + + clean = sqlite3.connect(CLEAN_DB) + rows = clean.execute( + "SELECT award_id, amount, recipient_name, awarding_agency, naics_description " + "FROM contracts " + "WHERE award_id IS NOT NULL AND amount IS NOT NULL" + ).fetchall() + clean.close() + if args.limit: + rows = rows[: args.limit] + + mconn = sqlite3.connect(MANIFEST_DB) + done = {r[0] for r in mconn.execute("SELECT canonical_award_id FROM 
planted_narrative_amount")} + mconn.close() + todo = [r for r in rows if r[0] not in done] + print(f"Total rows: {len(rows)}; already done: {len(rows)-len(todo)}; todo: {len(todo)}", + flush=True) + + n_ok = 0 + lock = sqlite3.connect(MANIFEST_DB, isolation_level=None) + lock.execute("PRAGMA journal_mode=WAL") + + def work(row): + aid, amt, recipient, agency, naics_desc = row + out = rewrite_amount(client, aid, float(amt), + recipient=recipient or "", + agency=agency or "", + naics_desc=naics_desc or "") + return aid, amt, out + + with ThreadPoolExecutor(max_workers=args.workers) as ex: + futures = [ex.submit(work, r) for r in todo] + for i, fut in enumerate(as_completed(futures), 1): + aid, amt, out = fut.result() + if out: + lock.execute( + "INSERT OR REPLACE INTO planted_narrative_amount VALUES (?,?,?)", + (aid, amt, out), + ) + n_ok += 1 + if i % 200 == 0: + print(f" [{i}/{len(todo)}] ok={n_ok}", flush=True) + lock.close() + print(f"DONE: {n_ok}/{len(todo)}") + + +if __name__ == "__main__": + main() diff --git a/query_usaspending/query1/ground_truth.csv b/query_usaspending/query1/ground_truth.csv new file mode 100644 index 000000000..a2fa28f5c --- /dev/null +++ b/query_usaspending/query1/ground_truth.csv @@ -0,0 +1 @@ +898 diff --git a/query_usaspending/query1/query.json b/query_usaspending/query1/query.json new file mode 100644 index 000000000..a295a76d7 --- /dev/null +++ b/query_usaspending/query1/query.json @@ -0,0 +1 @@ +"How many distinct contract awards in the Department of Defense (across all agency surface-form variants) have an award amount greater than $1,000,000?" 
diff --git a/query_usaspending/query1/validate.py b/query_usaspending/query1/validate.py new file mode 100644 index 000000000..af5b8adcf --- /dev/null +++ b/query_usaspending/query1/validate.py @@ -0,0 +1,9 @@ +def validate(llm_output: str): + import re + from pathlib import Path + gt = int(Path(__file__).parent.joinpath("ground_truth.csv").read_text().strip()) + text = re.sub(r"(\d),(\d)", r"\1\2", llm_output) + nums = re.findall(r"(?