From a065851ae04021e5cd5efce2f140ebc1c80b92b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Zieli=C5=84ski?= <hello@robertz.co>
Date: Wed, 27 May 2026 19:57:32 +0200
Subject: [PATCH 1/4] feat(backends): pluggable StorageBackend SPI
 (pg/redis/mongodb)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the durable-queue store selectable via
configure(db_backend="pg"|"redis"|"mongodb"). Additive + opt-in: pg (default)
is byte-compatible and the legacy Postgres engine is untouched; selecting
redis/mongo does NOT re-home the orchestrator/worker (a later milestone).

- backends/base.py: StorageBackend port — enqueue / claim-exactly-once /
  lease+reclaim / idempotent terminals / atomic outbox
  (complete_with_event/fail_with_event) / wake / heartbeat / ON-OFF control.
  No driver object in any signature (anti-leakage); each backend is
  namespace-bound (multi-tenant isolation).
- backends/postgres.py: reference adapter — FOR UPDATE SKIP LOCKED, RETURNING
  CAS, one-transaction outbox, LISTEN/NOTIFY, on its own qw_* tables.
- backends/redis.py: Lua-atomic claim/terminal/outbox, sorted-set
  priority+FIFO, pub/sub wake (single instance, not Cluster).
- backends/mongodb.py: find_one_and_update claim, multi-doc-transaction
  outbox, change-stream wake (replica set required).
- tests/test_backend_contract.py: one parametrized contract suite, green
  against all three live servers (incl. an 8-thread claim-once contention
  stress test); a backend whose server is unreachable is skipped.
- config: db_backend + db_namespace + redis/mongo URL envs; [redis]/[mongodb]
  optional extras (drivers import lazily — pg-only deploys need neither).
- docs/storage_backends.md: contract, capability matrix + caveats, audit, and
  the integration boundary. CHANGELOG 0.3.0; CLAUDE.md/AGENTS.md/README updated.

Audit (design/regressions/safety/leakage): fixed Redis counts() drift
(decrement prior status, not always running); added the concurrency test; no
engine-suite regressions (415 green).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 AGENTS.md                            |  52 +++-
 CHANGELOG.md                         |  10 +
 CLAUDE.md                            |  52 +++-
 README.md                            |  30 +-
 docs/storage_backends.md             | 123 ++++++++
 pyproject.toml                       |   4 +-
 queue_workflows/__init__.py          |  10 +
 queue_workflows/backends/__init__.py | 149 +++++++++
 queue_workflows/backends/base.py     | 271 +++++++++++++++++
 queue_workflows/backends/mongodb.py  | 324 ++++++++++++++++++++
 queue_workflows/backends/postgres.py | 438 +++++++++++++++++++++++++++
 queue_workflows/backends/redis.py    | 374 +++++++++++++++++++++++
 queue_workflows/config.py            |  18 ++
 tests/test_backend_contract.py       | 343 +++++++++++++++++++++
 14 files changed, 2180 insertions(+), 18 deletions(-)
 create mode 100644 docs/storage_backends.md
 create mode 100644 queue_workflows/backends/__init__.py
 create mode 100644 queue_workflows/backends/base.py
 create mode 100644 queue_workflows/backends/mongodb.py
 create mode 100644 queue_workflows/backends/postgres.py
 create mode 100644 queue_workflows/backends/redis.py
 create mode 100644 tests/test_backend_contract.py

diff --git a/AGENTS.md b/AGENTS.md
index 9ba2e34..8952b74 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## What this is
 
-`queue_workflows` is a standalone, pip-installable **Postgres-as-queue workflow engine**: a `SELECT … FOR UPDATE SKIP LOCKED` claim loop woken by `LISTEN`, lease reclaim, a DAG dispatcher with a durable outbox, a GPU warm-model cache, periodic ingest work, per-host hw-metrics telemetry, and an operator worker ON/OFF control plane (`worker_control` — hard-stop/park a `(host, queue)` worker; see `docs/worker_control.md`). **Postgres (via `psycopg` 3) is the only hard runtime dependency.**
+`queue_workflows` is a standalone, pip-installable **Postgres-as-queue workflow engine**: a `SELECT … FOR UPDATE SKIP LOCKED` claim loop woken by `LISTEN`, lease reclaim, a DAG dispatcher with a durable outbox, a GPU warm-model cache, periodic ingest work, per-host hw-metrics telemetry, a durable per-node event log, and an operator worker ON/OFF control plane (`worker_control` — hard-stop/park a `(host, queue)` worker; see `docs/worker_control.md` and `docs/watchdogs.md`). **Postgres (via `psycopg` 3) is the only hard runtime dependency.**
 
 ## Who uses it / why the `ai_leads` defaults
 
@@ -22,6 +22,10 @@ pip install -e '.[test]'        # add [metrics] for the psutil-based hw_metrics
 # creates it if missing (see tests/conftest.py).
 QUEUE_WORKFLOWS_TEST_DB_URL=postgresql://user:pw@host:port/queue_workflows_test python -m pytest
 #   (falls back to AI_LEADS_DB_URL with its db-name suffixed _test if the above is unset)
+#   The multi-backend contract suite (tests/test_backend_contract.py) also reads
+#   QUEUE_WORKFLOWS_TEST_REDIS_URL / _MONGO_URL (mongo needs a replica set); each
+#   backend SKIPS if its server is unset/unreachable. `.[test]` pulls redis +
+#   pymongo; standalone consumers add `.[redis]` / `.[mongodb]`. See docs/storage_backends.md.
 
 python -m pytest tests/test_node_queue.py            # one module
 python -m pytest tests/test_node_queue.py::test_name # one test
@@ -31,6 +35,7 @@ python -m pytest -k lease                            # by keyword
 queue-orchestrator                  # bootstrap migrations + NodePool (dispatch/outbox/reclaim/input)
 queue-claim-worker --queue=gpu      # one worker process; --queue ∈ {cpu,gpu} (DAG) ∪ config.ingest_queues (default {fetch,load})
 queue-scheduler                     # PG-native ingest ticker
+queue-worker-control --queue=gpu --off   # operator ON/OFF for a (host,queue) worker (migration 0012; docs/worker_control.md)
 ```
 
 There is no linter/formatter config and no CI in this repo; match the surrounding style (heavy module/function docstrings explaining *why*, `from __future__ import annotations` everywhere).
@@ -54,7 +59,7 @@ This codebase is built test-first, and the test suite is the spec. **Write the f
 All three run as separate processes against the same database; the DB *is* the message bus.
 
 1. **Orchestrator** (`orchestrator.py` → `node_pool.NodePool`) — the only process that bootstraps migrations. Its `NodePool` runs background threads: the **dispatch loop** (`_tick`) expands freshly-`queued` `mode='node'` runs into node-jobs via `dispatcher.start_run`, **drains the dispatch-event outbox**, and runs the **lease-reclaim sweeps** (node + ingest); plus an **`InputListener`** that polls `workflow_input_submissions` and resumes parked input nodes. No node bodies run here.
-2. **Claim worker** (`claim_worker.ClaimWorker`) — **one process == one worker, concurrency-1 by contract.** `run_forever` does `LISTEN <channel>` then drains the queue greedily on each wake (1 s safety poll covers a dropped NOTIFY). `cpu`/`gpu` draw DAG node-jobs from `workflow_node_jobs`; the **ingest-family** queues (`config.ingest_queues`, default `fetch`/`load`) draw standalone ingest jobs from `ingest_jobs`. The GPU worker owns the process-wide warm `ModelCache`. Every claimed job is bracketed by a `LeaseRenewer` + a wall-clock `Watchdog` — and, for a GPU node that reports per-step progress, a no-progress `StallWatchdog` (node-jobs add a run-cancel watcher). See *Lease + reclaim + watchdog* below.
+2. **Claim worker** (`claim_worker.ClaimWorker`) — **one process == one worker, concurrency-1 by contract.** `run_forever` does `LISTEN <channel>` then drains the queue greedily on each wake (1 s safety poll covers a dropped NOTIFY). `cpu`/`gpu` draw DAG node-jobs from `workflow_node_jobs`; the **ingest-family** queues (`config.ingest_queues`, default `fetch`/`load`) draw standalone ingest jobs from `ingest_jobs`. The GPU worker owns the process-wide warm `ModelCache` (cache logic lives in `model_cache.py`, DB-decoupled; `gpu_model_cache.py` wires the one process-wide instance and publishes `current_model` to `worker_heartbeats` for affinity routing). Every claimed job is bracketed by a `LeaseRenewer` + a wall-clock `Watchdog`; a GPU node additionally gets the health-driven `GpuHealthWatchdog` (and a `StallWatchdog` for non-video). A node-job also runs a `JobStatusWatcher` (run-cancel/reassign self-kill), and the worker process runs one `WorkerControlWatcher` (operator ON/OFF). See *Lease + reclaim + watchdog* below.
 3. **Scheduler** (`scheduler.Ticker`) — a Python loop (not pg_cron) that sleeps to the next scheduled minute and enqueues `ingest_jobs` rows; an ingest claim worker picks them up.
 
 ### The queue mechanism
@@ -65,15 +70,26 @@ All three run as separate processes against the same database; the DB *is* the m
 
 A live worker renews `lease_expires_at` (~every 10 s) while a job runs, so lease length is independent of job duration. A **dead/wedged** worker stops renewing → its lease lapses → the orchestrator's reclaim sweep flips the row back to `queued` (re-firing the NOTIFY); this sweep is the **sole** recovery path for an orphaned `running` row.
 
-**Two daemon watchdogs** bracket every claimed job (both in `claim_worker.py`, both funnelling their terminal action through `_fail_job_and_exit` so the outbox-atomicity contract — mark failed **and** write the `failed` dispatch event in one txn — is coded in exactly one place):
+**Three daemon watchdogs** can bracket a claimed job (all in `claim_worker.py`), each hard-exiting with a **distinct code** so the cause is readable from the exit status:
 
-- the wall-clock **`Watchdog`** trips on `elapsed ≥ budget_for(job)` and hard-exits `os._exit(75)`;
-- the no-progress **`StallWatchdog`** is *opt-in* (a non-video GPU node whose `run(...)` declares a `status_callback`), **inert until the first per-step `beat()` arms it after the model load** (so a minutes-long cold load is never policed), then trips on a beat gap ≥ `STALL_TIMEOUT_S` (120 s) and exits `76`.
+- the wall-clock **`Watchdog`** (exit `75`) trips on `elapsed ≥ budget_for(job)`;
+- the no-progress **`StallWatchdog`** (exit `76`) is *opt-in* (a non-video GPU node whose `run(...)` declares a `status_callback`), **inert until the first per-step `beat()` arms it after the model load** (so a minutes-long cold load is never policed), then trips on a beat gap ≥ `STALL_TIMEOUT_S` (120 s);
+- the **`GpuHealthWatchdog`** (exit `78`) is the GPU guard — **health-driven, not wall-clock**: every `interval_s` (default 300 s) it trips only when the per-container GPU stayed idle **and** RAM was static across the window (no GPU work *and* no memory movement ⇒ wedged), arming at job start with a 20-min `load_grace_s` first window (safe because a healthy load *moves* RAM and a healthy render *keeps the GPU busy*). It replaced the old fixed GPU wall-clock cap, which couldn't catch the Blackwell qwen 0 %-GPU stall yet false-killed long-but-healthy renders. Per-container GPU%/RAM samplers (`gpu_health.py`) use pmon `sm%` scoped to this container (excludes a co-tenant ollama sidecar) + cgroup RAM.
 
-Either trip lets the lease lapse so reclaim re-queues the work — which is why a GPU worker is one process holding one model: a hard exit kills exactly the hung job. See **`docs/watchdogs.md`** for the full design (why a wall-clock budget alone can't catch the Blackwell qwen 0 %-GPU stall, and the per-step beat plumbing).
+All three trips funnel through one policy point — **`_watchdog_trip`**, which does **re-queue-and-retry, not fail** (migration `0010`): for a DAG node-job it flips the row `running → queued`, bumps `watchdog_retries`, and writes **no** dispatch event (so the *run* stays `running` — only this node re-runs; `_requeue_job_and_exit`), then hard-exits so a fresh worker re-claims it. Only once `watchdog_retries` hits `AI_LEADS_WATCHDOG_MAX_RETRIES` (default 3) does it fall back to **`_fail_job_and_exit`** (mark failed **+** the `failed` dispatch event in one txn — the outbox-atomicity contract, coded in exactly one place). An **ingest** job (no run, no retry counter) always takes the fail path. So a single transient wedge no longer kills the whole workflow — and a GPU worker stays one process holding one model: a hard exit kills exactly the hung job. See **`docs/watchdogs.md`** for the full design.
+
+Two state-watcher threads sit alongside the watchdogs: **`JobStatusWatcher`** (exit `77`) self-kills a worker whose `running` row was cancelled/reassigned out from under it (avoids a double-run), and **`WorkerControlWatcher`** (exit `79`) enforces the operator ON/OFF control plane (below).
 
 **Last-resort recovery — the orchestrator-side dead-worker detector.** Every watchdog above is an in-process *thread*; a GPU **hardware-hang** can defeat all of them (the trip signal becomes unobservable from inside — e.g. on ROCm the box-level GPU probe still reads non-idle while *this* render is wedged — or, on a GIL-holding hang, the threads can't run at all). The worker then sits wedged while its `worker_heartbeats.last_seen` freezes. The orchestrator is a **separate process** (GIL-independent of the worker), so `NodePool._tick` adds `_sweep_dead_workers` → `node_queue.flag_stale_workers_holding_running_jobs`: it flags any worker whose heartbeat is stale (>30 s, 3× the cadence) **while it still owns a `running` job** (join `claimed_by = host_label`), stamping `worker_heartbeats.last_flagged_dead_at` (migration 0009) + an actionable `DEAD WORKER:` ERROR. The JOB is recovered by the lease-reclaim as usual; this flags the dead **process** for a host-supervisor to bounce (the orchestrator can't safely cross-host-kill it). A fresh heartbeat clears the flag. See **`docs/watchdogs.md` → "last-resort layer"** for the root-cause and the host-supervisor hook.
 
+### Durable node-event history (migration 0011)
+
+`workflow_node_jobs` is one *mutable* row per `(run_id, node_id)` — a watchdog re-queue overwrites `claimed_by`/timing and only bumps `watchdog_retries`, so the prior attempt's worker, timing, and trip reason are lost. `workflow_node_events` is an **append-only** forensic log of the per-attempt lifecycle (`claimed`, `model_load_*`, `progress_beat`, `stall_*`, `gpu_health_trip`, `budget_trip`, `requeued`, `reassigned`, terminal …; `attempt` = `watchdog_retries` at emit ties the tries of one node together). Writers in `node_queue`: `record_node_event` (best-effort — own connection, swallow-on-failure, so an event blip can **never** fail the load-bearing claim/terminal/watchdog path) and `record_node_event_in_txn` (terminal + `requeued` events ride the **same txn** as the state change — the dispatch-outbox atomicity pattern). The worker emits via `claim_worker._emit_node_event`; `NodePool` prunes old rows on a sweep (`prune_node_events`, default 30-day retention). **Append-only — no UPDATE path**, so it adds no new mutation invariant.
+
+### Operator worker ON/OFF control plane (migration 0012)
+
+`worker_controls` is **desired** state (an operator or host UI writes a `(host_label, queue)` row), kept deliberately separate from the *observed* `worker_heartbeats` — an OFF state must persist precisely while the worker isn't beating. A row trigger fires `pg_notify('worker_control', '<host>:<queue>')` inside the writer's txn, so a plain SQL write from any DB consumer wakes the worker with no app-side NOTIFY. `worker_control.WorkerControlWatcher` (LISTEN + 5 s safety poll) enforces it; on OFF it dispatches the row's `stop_policy` through the `STOP_POLICIES` registry — only `"hard"` exists today (`"drain"`/`"pause"` are reserved names that slot in with no schema change, which is why `stop_policy` is free-form TEXT, not a CHECK). **Hard stop = process exit (`os._exit(79)`)**: the node body runs inline on the worker's main thread, so killing the process is the only thing that reliably stops a wedged CUDA kernel and frees VRAM (it re-queues the in-flight job first, resume-style, with **no** `watchdog_retries` bump — an operator stop isn't a fault). The supervisor restarts the container; on boot the worker re-reads `worker_controls` and **parks** (idle, not claiming) while still OFF. A worker absent from the table — or a DB predating 0012 (`get_worker_control` swallows `UndefinedTable`) — is treated as **ON**, so the engine runs unchanged before 0012 (claim workers gate on schema 6/8, not 12). See `docs/worker_control.md`.
+
 ### DAG dispatch + the durable outbox (key decoupling)
 
 `dispatcher.py` is **pure DAG-walk logic** (unit-testable without a worker pool): expand a run's initial nodes, and on each node terminal event find downstream nodes whose deps are all `completed`/`skipped` and enqueue (or insert a `skipped` marker per `skip_if`). The worker→dispatcher handoff is an **outbox**: when a worker finalizes a node it writes the terminal status **and** a `workflow_dispatch_events` row in **one transaction** (`node_executor.execute_node`). The orchestrator drains that outbox and calls `on_node_completed`/`on_node_failed`/`on_node_awaiting_input`. So fan-out is retryable and never synchronously coupled to the worker; a failing callback is retried next tick (and poison-flagged after `_DISPATCH_MAX_ATTEMPTS`).
@@ -100,15 +116,35 @@ Everything domain-specific is an **injected hook** on a process-wide `EngineConf
 
 **Every hook has a safe default**, so `import queue_workflows` + `configure()` + a reachable Postgres runs standalone. When working in any engine module, never reach "up" into a host — add a config hook with a default instead. `config.py` is a **leaf** (imports nothing from other engine modules) to keep the dependency graph acyclic; respect that (e.g. it lazily imports `refs` only inside `get_resolve_ref`).
 
+### Pluggable storage backends (the `db_backend` seam — additive, v0.3.0)
+
+Beyond the host hooks, the **storage layer itself** is selectable:
+`configure(db_backend="pg"|"redis"|"mongodb")` resolves a `StorageBackend`
+(`queue_workflows/backends/`, **one provider per file**) — a generic durable-queue
+SPI (enqueue / claim-exactly-once / lease+reclaim / idempotent terminals / the
+**atomic outbox** `complete_with_event`/`fail_with_event` / wake / heartbeat /
+ON-OFF control). It is **additive and opt-in**: `pg` (default) is byte-compatible
+and the legacy engine modules still talk to Postgres directly — selecting
+redis/mongo does **not** re-home the orchestrator/worker (a later milestone), and
+the redis/pymongo drivers import lazily so a pg-only deploy needs neither. Two
+invariants make it honest: the port leaks **no** driver object (no
+cursor/pipeline/session in any signature — the anti-leakage rule), and every
+backend is **namespace-bound** so two tenants on one redis/mongo server can't see
+each other's jobs. pg uses `SKIP LOCKED`/`RETURNING`/one-txn-outbox/`LISTEN`;
+redis uses **Lua** (atomic claim+terminal+event) + pub/sub; mongodb uses
+`find_one_and_update` + a **multi-doc txn** + a change stream (**replica set
+required**). The contract is one parametrized suite (`tests/test_backend_contract.py`)
+green against all three live servers. See `docs/storage_backends.md`.
+
 ### Migrations — the engine owns one chain, hosts run a second
 
 The engine owns `queue_workflows/migrations/NNNN_*.sql` (+ paired `.down.sql`), shipped as package data, tracked in the `queue_schema_version` ledger. `db.bootstrap()` applies the chain idempotently; `db.downgrade()` reverses it. A host with its own domain tables runs a **second** chain via `db.bootstrap(migrations_dir=..., version_table=...)` against its own ledger — "two ORMs / two chains, one Postgres." **Only the orchestrator bootstraps** (`db.bootstrap` takes no advisory lock); claim workers call `db.wait_for_schema(min_version)` and block until the schema is ready rather than racing the migration run (`_REQUIRED_SCHEMA_VERSION` maps each queue to its minimum version).
 
-The chain: `0001` `workflow_runs` → `0002` `workflow_node_jobs` → `0003` `workflow_input_submissions` → `0004` `workflow_dispatch_events` → `0005` `worker_heartbeats` → `0006` lease columns + `node_job_ready` trigger → `0007` `ingest_jobs` + `ingest_job_ready` trigger → `0008` multi-tenant ingest (adds per-job `args JSONB`; drops the `fetch`/`load` queue CHECK and the `cpu`/`gpu`-only `worker_heartbeats` CHECK so those allow-lists move host-side — all additive/idempotent). Ingest queues therefore require schema version ≥ 8. `run_store` treats `parcel_id` as an opaque nullable column (the engine drops the host's parcels FK) so the engine never knows the host's domain.
+The chain: `0001` `workflow_runs` → `0002` `workflow_node_jobs` → `0003` `workflow_input_submissions` → `0004` `workflow_dispatch_events` → `0005` `worker_heartbeats` → `0006` lease columns + `node_job_ready` trigger → `0007` `ingest_jobs` + `ingest_job_ready` trigger → `0008` multi-tenant ingest (adds per-job `args JSONB`; drops the `fetch`/`load` queue CHECK and the `cpu`/`gpu`-only `worker_heartbeats` CHECK so those allow-lists move host-side — all additive/idempotent) → `0009` `worker_heartbeats.last_flagged_dead_at` (dead-worker flag) → `0010` `workflow_node_jobs.watchdog_retries` (watchdog re-queue counter) → `0011` `workflow_node_events` (append-only per-attempt event log) → `0012` `worker_controls` (+ `worker_control` NOTIFY trigger). Ingest queues require schema version ≥ 8; worker-control is read-optional — claim workers gate on 6/8, not 12, and treat a pre-0012 DB as all-ON. `run_store` treats `parcel_id` as an opaque nullable column (the engine drops the host's parcels FK) so the engine never knows the host's domain.
 
 ### Idempotency contracts to preserve
 
-`mark_completed`/`mark_failed`/`mark_awaiting_input` (and the ingest twins) all `UPDATE … WHERE status NOT IN ('completed','failed','cancelled') RETURNING *` and return `None` when the row was already terminal. This `WHERE` is load-bearing: it makes duplicate deliveries and claim-race losers safe, and stops a stray second call from clobbering a finalized `context_delta`. JSON columns are pre-validated (`json.dumps`) before any state mutation so a bad payload fails before the write. Keep this shape for any new state transition.
+`mark_completed`/`mark_failed`/`mark_awaiting_input` (and the ingest twins) all `UPDATE … WHERE status NOT IN ('completed','failed','cancelled') RETURNING *` and return `None` when the row was already terminal. This `WHERE` is load-bearing: it makes duplicate deliveries and claim-race losers safe, and stops a stray second call from clobbering a finalized `context_delta`. JSON columns are pre-validated (`json.dumps`) before any state mutation so a bad payload fails before the write. Keep this shape for any new state transition. (The one deliberate exception is `workflow_node_events` — **append-only, no UPDATE path** — whose terminal/`requeued` rows instead ride the state-change txn, like the dispatch outbox.)
 
 ### Telemetry (hw_metrics + cgroup attribution)
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c91b639..0dc2088 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -10,6 +10,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [0.3.0] — 2026-05-27
 
 ### Added
+- **Pluggable storage backends** — a `StorageBackend` SPI
+  (`queue_workflows.backends`) makes the durable-queue store selectable via
+  `configure(db_backend="pg"|"redis"|"mongodb")`, one provider per file. `pg`
+  (default) is byte-compatible and unchanged; `redis` (atomic claim/terminal via
+  Lua + pub/sub wake) and `mongodb` (`find_one_and_update` + multi-doc-txn outbox
+  + change-stream wake, replica set) reproduce the same contract — claim
+  exactly-once, lease/reclaim, idempotent terminals, the atomic outbox, and
+  per-namespace tenant isolation — pinned by a parametrized contract suite run
+  against all three live servers. Optional drivers: `queue_workflows[redis]` /
+  `[mongodb]`. See `docs/storage_backends.md`.
 - Operator worker ON/OFF control plane: a `worker_controls` table + a
   per-`(host, queue)` `WorkerControlWatcher` that hard-stops (kill in-flight +
   free RAM/VRAM) or parks a worker on command (migration 0012). See
diff --git a/CLAUDE.md b/CLAUDE.md
index 9ba2e34..8952b74 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -4,7 +4,7 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ## What this is
 
-`queue_workflows` is a standalone, pip-installable **Postgres-as-queue workflow engine**: a `SELECT … FOR UPDATE SKIP LOCKED` claim loop woken by `LISTEN`, lease reclaim, a DAG dispatcher with a durable outbox, a GPU warm-model cache, periodic ingest work, per-host hw-metrics telemetry, and an operator worker ON/OFF control plane (`worker_control` — hard-stop/park a `(host, queue)` worker; see `docs/worker_control.md`). **Postgres (via `psycopg` 3) is the only hard runtime dependency.**
+`queue_workflows` is a standalone, pip-installable **Postgres-as-queue workflow engine**: a `SELECT … FOR UPDATE SKIP LOCKED` claim loop woken by `LISTEN`, lease reclaim, a DAG dispatcher with a durable outbox, a GPU warm-model cache, periodic ingest work, per-host hw-metrics telemetry, a durable per-node event log, and an operator worker ON/OFF control plane (`worker_control` — hard-stop/park a `(host, queue)` worker; see `docs/worker_control.md` and `docs/watchdogs.md`). **Postgres (via `psycopg` 3) is the only hard runtime dependency.**
 
 ## Who uses it / why the `ai_leads` defaults
 
@@ -22,6 +22,10 @@ pip install -e '.[test]'        # add [metrics] for the psutil-based hw_metrics
 # creates it if missing (see tests/conftest.py).
 QUEUE_WORKFLOWS_TEST_DB_URL=postgresql://user:pw@host:port/queue_workflows_test python -m pytest
 #   (falls back to AI_LEADS_DB_URL with its db-name suffixed _test if the above is unset)
+#   The multi-backend contract suite (tests/test_backend_contract.py) also reads
+#   QUEUE_WORKFLOWS_TEST_REDIS_URL / _MONGO_URL (mongo needs a replica set); each
+#   backend SKIPS if its server is unset/unreachable. `.[test]` pulls redis +
+#   pymongo; standalone consumers add `.[redis]` / `.[mongodb]`. See docs/storage_backends.md.
 
 python -m pytest tests/test_node_queue.py            # one module
 python -m pytest tests/test_node_queue.py::test_name # one test
@@ -31,6 +35,7 @@ python -m pytest -k lease                            # by keyword
 queue-orchestrator                  # bootstrap migrations + NodePool (dispatch/outbox/reclaim/input)
 queue-claim-worker --queue=gpu      # one worker process; --queue ∈ {cpu,gpu} (DAG) ∪ config.ingest_queues (default {fetch,load})
 queue-scheduler                     # PG-native ingest ticker
+queue-worker-control --queue=gpu --off   # operator ON/OFF for a (host,queue) worker (migration 0012; docs/worker_control.md)
 ```
 
 There is no linter/formatter config and no CI in this repo; match the surrounding style (heavy module/function docstrings explaining *why*, `from __future__ import annotations` everywhere).
@@ -54,7 +59,7 @@ This codebase is built test-first, and the test suite is the spec. **Write the f
 All three run as separate processes against the same database; the DB *is* the message bus.
 
 1. **Orchestrator** (`orchestrator.py` → `node_pool.NodePool`) — the only process that bootstraps migrations. Its `NodePool` runs background threads: the **dispatch loop** (`_tick`) expands freshly-`queued` `mode='node'` runs into node-jobs via `dispatcher.start_run`, **drains the dispatch-event outbox**, and runs the **lease-reclaim sweeps** (node + ingest); plus an **`InputListener`** that polls `workflow_input_submissions` and resumes parked input nodes. No node bodies run here.
-2. **Claim worker** (`claim_worker.ClaimWorker`) — **one process == one worker, concurrency-1 by contract.** `run_forever` does `LISTEN <channel>` then drains the queue greedily on each wake (1 s safety poll covers a dropped NOTIFY). `cpu`/`gpu` draw DAG node-jobs from `workflow_node_jobs`; the **ingest-family** queues (`config.ingest_queues`, default `fetch`/`load`) draw standalone ingest jobs from `ingest_jobs`. The GPU worker owns the process-wide warm `ModelCache`. Every claimed job is bracketed by a `LeaseRenewer` + a wall-clock `Watchdog` — and, for a GPU node that reports per-step progress, a no-progress `StallWatchdog` (node-jobs add a run-cancel watcher). See *Lease + reclaim + watchdog* below.
+2. **Claim worker** (`claim_worker.ClaimWorker`) — **one process == one worker, concurrency-1 by contract.** `run_forever` does `LISTEN <channel>` then drains the queue greedily on each wake (1 s safety poll covers a dropped NOTIFY). `cpu`/`gpu` draw DAG node-jobs from `workflow_node_jobs`; the **ingest-family** queues (`config.ingest_queues`, default `fetch`/`load`) draw standalone ingest jobs from `ingest_jobs`. The GPU worker owns the process-wide warm `ModelCache` (cache logic lives in `model_cache.py`, DB-decoupled; `gpu_model_cache.py` wires the one process-wide instance and publishes `current_model` to `worker_heartbeats` for affinity routing). Every claimed job is bracketed by a `LeaseRenewer` + a wall-clock `Watchdog`; a GPU node additionally gets the health-driven `GpuHealthWatchdog` (and a `StallWatchdog` for non-video). A node-job also runs a `JobStatusWatcher` (run-cancel/reassign self-kill), and the worker process runs one `WorkerControlWatcher` (operator ON/OFF). See *Lease + reclaim + watchdog* below.
 3. **Scheduler** (`scheduler.Ticker`) — a Python loop (not pg_cron) that sleeps to the next scheduled minute and enqueues `ingest_jobs` rows; an ingest claim worker picks them up.
 
 ### The queue mechanism
@@ -65,15 +70,26 @@ All three run as separate processes against the same database; the DB *is* the m
 
 A live worker renews `lease_expires_at` (~every 10 s) while a job runs, so lease length is independent of job duration. A **dead/wedged** worker stops renewing → its lease lapses → the orchestrator's reclaim sweep flips the row back to `queued` (re-firing the NOTIFY); this sweep is the **sole** recovery path for an orphaned `running` row.
 
-**Two daemon watchdogs** bracket every claimed job (both in `claim_worker.py`, both funnelling their terminal action through `_fail_job_and_exit` so the outbox-atomicity contract — mark failed **and** write the `failed` dispatch event in one txn — is coded in exactly one place):
+**Three daemon watchdogs** can bracket a claimed job (all in `claim_worker.py`), each hard-exiting with a **distinct code** so the cause is readable from the exit status:
 
-- the wall-clock **`Watchdog`** trips on `elapsed ≥ budget_for(job)` and hard-exits `os._exit(75)`;
-- the no-progress **`StallWatchdog`** is *opt-in* (a non-video GPU node whose `run(...)` declares a `status_callback`), **inert until the first per-step `beat()` arms it after the model load** (so a minutes-long cold load is never policed), then trips on a beat gap ≥ `STALL_TIMEOUT_S` (120 s) and exits `76`.
+- the wall-clock **`Watchdog`** (exit `75`) trips on `elapsed ≥ budget_for(job)`;
+- the no-progress **`StallWatchdog`** (exit `76`) is *opt-in* (a non-video GPU node whose `run(...)` declares a `status_callback`), **inert until the first per-step `beat()` arms it after the model load** (so a minutes-long cold load is never policed), then trips on a beat gap ≥ `STALL_TIMEOUT_S` (120 s);
+- the **`GpuHealthWatchdog`** (exit `78`) is the GPU guard — **health-driven, not wall-clock**: every `interval_s` (default 300 s) it trips only when the per-container GPU stayed idle **and** RAM was static across the window (no GPU work *and* no memory movement ⇒ wedged), arming at job start with a 20-min `load_grace_s` first window (safe because a healthy load *moves* RAM and a healthy render *keeps the GPU busy*). It replaced the old fixed GPU wall-clock cap, which couldn't catch the Blackwell qwen 0 %-GPU stall yet false-killed long-but-healthy renders. Per-container GPU%/RAM samplers (`gpu_health.py`) use pmon `sm%` scoped to this container (excludes a co-tenant ollama sidecar) + cgroup RAM.
 
-Either trip lets the lease lapse so reclaim re-queues the work — which is why a GPU worker is one process holding one model: a hard exit kills exactly the hung job. See **`docs/watchdogs.md`** for the full design (why a wall-clock budget alone can't catch the Blackwell qwen 0 %-GPU stall, and the per-step beat plumbing).
+All three trips funnel through one policy point — **`_watchdog_trip`**, which does **re-queue-and-retry, not fail** (migration `0010`): for a DAG node-job it flips the row `running → queued`, bumps `watchdog_retries`, and writes **no** dispatch event (so the *run* stays `running` — only this node re-runs; `_requeue_job_and_exit`), then hard-exits so a fresh worker re-claims it. Only once `watchdog_retries` hits `AI_LEADS_WATCHDOG_MAX_RETRIES` (default 3) does it fall back to **`_fail_job_and_exit`** (mark failed **+** the `failed` dispatch event in one txn — the outbox-atomicity contract, coded in exactly one place). An **ingest** job (no run, no retry counter) always takes the fail path. So a single transient wedge no longer kills the whole workflow — and a GPU worker stays one process holding one model: a hard exit kills exactly the hung job. See **`docs/watchdogs.md`** for the full design.
+
+Two state-watcher threads sit alongside the watchdogs: **`JobStatusWatcher`** (exit `77`) self-kills a worker whose `running` row was cancelled/reassigned out from under it (avoids a double-run), and **`WorkerControlWatcher`** (exit `79`) enforces the operator ON/OFF control plane (below).
 
 **Last-resort recovery — the orchestrator-side dead-worker detector.** Every watchdog above is an in-process *thread*; a GPU **hardware-hang** can defeat all of them (the trip signal becomes unobservable from inside — e.g. on ROCm the box-level GPU probe still reads non-idle while *this* render is wedged — or, on a GIL-holding hang, the threads can't run at all). The worker then sits wedged while its `worker_heartbeats.last_seen` freezes. The orchestrator is a **separate process** (GIL-independent of the worker), so `NodePool._tick` adds `_sweep_dead_workers` → `node_queue.flag_stale_workers_holding_running_jobs`: it flags any worker whose heartbeat is stale (>30 s, 3× the cadence) **while it still owns a `running` job** (join `claimed_by = host_label`), stamping `worker_heartbeats.last_flagged_dead_at` (migration 0009) + an actionable `DEAD WORKER:` ERROR. The JOB is recovered by the lease-reclaim as usual; this flags the dead **process** for a host-supervisor to bounce (the orchestrator can't safely cross-host-kill it). A fresh heartbeat clears the flag. See **`docs/watchdogs.md` → "last-resort layer"** for the root-cause and the host-supervisor hook.
 
+### Durable node-event history (migration 0011)
+
+`workflow_node_jobs` is one *mutable* row per `(run_id, node_id)` — a watchdog re-queue overwrites `claimed_by`/timing and only bumps `watchdog_retries`, so the prior attempt's worker, timing, and trip reason are lost. `workflow_node_events` is an **append-only** forensic log of the per-attempt lifecycle (`claimed`, `model_load_*`, `progress_beat`, `stall_*`, `gpu_health_trip`, `budget_trip`, `requeued`, `reassigned`, terminal …; `attempt` = `watchdog_retries` at emit ties the tries of one node together). Writers in `node_queue`: `record_node_event` (best-effort — own connection, swallow-on-failure, so an event blip can **never** fail the load-bearing claim/terminal/watchdog path) and `record_node_event_in_txn` (terminal + `requeued` events ride the **same txn** as the state change — the dispatch-outbox atomicity pattern). The worker emits via `claim_worker._emit_node_event`; `NodePool` prunes old rows on a sweep (`prune_node_events`, default 30-day retention). **Append-only — no UPDATE path**, so it adds no new mutation invariant.
+
+### Operator worker ON/OFF control plane (migration 0012)
+
+`worker_controls` is **desired** state (an operator or host UI writes a `(host_label, queue)` row), kept deliberately separate from the *observed* `worker_heartbeats` — an OFF state must persist precisely while the worker isn't beating. A row trigger fires `pg_notify('worker_control', '<host>:<queue>')` inside the writer's txn, so a plain SQL write from any DB consumer wakes the worker with no app-side NOTIFY. `worker_control.WorkerControlWatcher` (LISTEN + 5 s safety poll) enforces it; on OFF it dispatches the row's `stop_policy` through the `STOP_POLICIES` registry — only `"hard"` exists today (`"drain"`/`"pause"` are reserved names that slot in with no schema change, which is why `stop_policy` is free-form TEXT, not a CHECK). **Hard stop = process exit (`os._exit(79)`)**: the node body runs inline on the worker's main thread, so killing the process is the only thing that reliably stops a wedged CUDA kernel and frees VRAM (it re-queues the in-flight job first, resume-style, with **no** `watchdog_retries` bump — an operator stop isn't a fault). The supervisor restarts the container; on boot the worker re-reads `worker_controls` and **parks** (idle, not claiming) while still OFF. A worker absent from the table — or a DB predating 0012 (`get_worker_control` swallows `UndefinedTable`) — is treated as **ON**, so the engine runs unchanged before 0012 (claim workers gate on schema 6/8, not 12). See `docs/worker_control.md`.
+
 ### DAG dispatch + the durable outbox (key decoupling)
 
 `dispatcher.py` is **pure DAG-walk logic** (unit-testable without a worker pool): expand a run's initial nodes, and on each node terminal event find downstream nodes whose deps are all `completed`/`skipped` and enqueue (or insert a `skipped` marker per `skip_if`). The worker→dispatcher handoff is an **outbox**: when a worker finalizes a node it writes the terminal status **and** a `workflow_dispatch_events` row in **one transaction** (`node_executor.execute_node`). The orchestrator drains that outbox and calls `on_node_completed`/`on_node_failed`/`on_node_awaiting_input`. So fan-out is retryable and never synchronously coupled to the worker; a failing callback is retried next tick (and poison-flagged after `_DISPATCH_MAX_ATTEMPTS`).
@@ -100,15 +116,35 @@ Everything domain-specific is an **injected hook** on a process-wide `EngineConf
 
 **Every hook has a safe default**, so `import queue_workflows` + `configure()` + a reachable Postgres runs standalone. When working in any engine module, never reach "up" into a host — add a config hook with a default instead. `config.py` is a **leaf** (imports nothing from other engine modules) to keep the dependency graph acyclic; respect that (e.g. it lazily imports `refs` only inside `get_resolve_ref`).
 
+### Pluggable storage backends (the `db_backend` seam — additive, v0.3.0)
+
+Beyond the host hooks, the **storage layer itself** is selectable:
+`configure(db_backend="pg"|"redis"|"mongodb")` resolves a `StorageBackend`
+(`queue_workflows/backends/`, **one provider per file**) — a generic durable-queue
+SPI (enqueue / claim-exactly-once / lease+reclaim / idempotent terminals / the
+**atomic outbox** `complete_with_event`/`fail_with_event` / wake / heartbeat /
+ON-OFF control). It is **additive and opt-in**: `pg` (default) is byte-compatible
+and the legacy engine modules still talk to Postgres directly — selecting
+redis/mongo does **not** re-home the orchestrator/worker (a later milestone), and
+the redis/pymongo drivers import lazily so a pg-only deploy needs neither. Two
+invariants make it honest: the port leaks **no** driver object (no
+cursor/pipeline/session in any signature — the anti-leakage rule), and every
+backend is **namespace-bound** so two tenants on one redis/mongo server can't see
+each other's jobs. pg uses `SKIP LOCKED`/`RETURNING`/one-txn-outbox/`LISTEN`;
+redis uses **Lua** (atomic claim+terminal+event) + pub/sub; mongodb uses
+`find_one_and_update` + a **multi-doc txn** + a change stream (**replica set
+required**). The contract is one parametrized suite (`tests/test_backend_contract.py`)
+green against all three live servers. See `docs/storage_backends.md`.
+
 ### Migrations — the engine owns one chain, hosts run a second
 
 The engine owns `queue_workflows/migrations/NNNN_*.sql` (+ paired `.down.sql`), shipped as package data, tracked in the `queue_schema_version` ledger. `db.bootstrap()` applies the chain idempotently; `db.downgrade()` reverses it. A host with its own domain tables runs a **second** chain via `db.bootstrap(migrations_dir=..., version_table=...)` against its own ledger — "two ORMs / two chains, one Postgres." **Only the orchestrator bootstraps** (`db.bootstrap` takes no advisory lock); claim workers call `db.wait_for_schema(min_version)` and block until the schema is ready rather than racing the migration run (`_REQUIRED_SCHEMA_VERSION` maps each queue to its minimum version).
 
-The chain: `0001` `workflow_runs` → `0002` `workflow_node_jobs` → `0003` `workflow_input_submissions` → `0004` `workflow_dispatch_events` → `0005` `worker_heartbeats` → `0006` lease columns + `node_job_ready` trigger → `0007` `ingest_jobs` + `ingest_job_ready` trigger → `0008` multi-tenant ingest (adds per-job `args JSONB`; drops the `fetch`/`load` queue CHECK and the `cpu`/`gpu`-only `worker_heartbeats` CHECK so those allow-lists move host-side — all additive/idempotent). Ingest queues therefore require schema version ≥ 8. `run_store` treats `parcel_id` as an opaque nullable column (the engine drops the host's parcels FK) so the engine never knows the host's domain.
+The chain: `0001` `workflow_runs` → `0002` `workflow_node_jobs` → `0003` `workflow_input_submissions` → `0004` `workflow_dispatch_events` → `0005` `worker_heartbeats` → `0006` lease columns + `node_job_ready` trigger → `0007` `ingest_jobs` + `ingest_job_ready` trigger → `0008` multi-tenant ingest (adds per-job `args JSONB`; drops the `fetch`/`load` queue CHECK and the `cpu`/`gpu`-only `worker_heartbeats` CHECK so those allow-lists move host-side — all additive/idempotent) → `0009` `worker_heartbeats.last_flagged_dead_at` (dead-worker flag) → `0010` `workflow_node_jobs.watchdog_retries` (watchdog re-queue counter) → `0011` `workflow_node_events` (append-only per-attempt event log) → `0012` `worker_controls` (+ `worker_control` NOTIFY trigger). Ingest queues require schema version ≥ 8; worker-control is read-optional — claim workers gate on 6/8, not 12, and treat a pre-0012 DB as all-ON. `run_store` treats `parcel_id` as an opaque nullable column (the engine drops the host's parcels FK) so the engine never knows the host's domain.
 
 ### Idempotency contracts to preserve
 
-`mark_completed`/`mark_failed`/`mark_awaiting_input` (and the ingest twins) all `UPDATE … WHERE status NOT IN ('completed','failed','cancelled') RETURNING *` and return `None` when the row was already terminal. This `WHERE` is load-bearing: it makes duplicate deliveries and claim-race losers safe, and stops a stray second call from clobbering a finalized `context_delta`. JSON columns are pre-validated (`json.dumps`) before any state mutation so a bad payload fails before the write. Keep this shape for any new state transition.
+`mark_completed`/`mark_failed`/`mark_awaiting_input` (and the ingest twins) all `UPDATE … WHERE status NOT IN ('completed','failed','cancelled') RETURNING *` and return `None` when the row was already terminal. This `WHERE` is load-bearing: it makes duplicate deliveries and claim-race losers safe, and stops a stray second call from clobbering a finalized `context_delta`. JSON columns are pre-validated (`json.dumps`) before any state mutation so a bad payload fails before the write. Keep this shape for any new state transition. (The one deliberate exception is `workflow_node_events` — **append-only, no UPDATE path** — whose terminal/`requeued` rows instead ride the state-change txn, like the dispatch outbox.)
 
 ### Telemetry (hw_metrics + cgroup attribution)
 
diff --git a/README.md b/README.md
index 9806099..b84e635 100644
--- a/README.md
+++ b/README.md
@@ -91,6 +91,9 @@ rationale.
   detector (last-resort recovery from a GPU hardware hang).
 - [`docs/worker_control.md`](docs/worker_control.md) — the operator ON/OFF
   control plane (hard-stop vs park a `(host, queue)` worker).
+- [`docs/storage_backends.md`](docs/storage_backends.md) — the pluggable
+  `StorageBackend` SPI (`pg` / `redis` / `mongodb`): the contract, the
+  capability matrix, and the integration boundary.
 
 ## Quick start
 
@@ -208,6 +211,29 @@ worker absent from `worker_controls` is treated as ON. See
 [`docs/worker_control.md`](docs/worker_control.md) for the full design (the
 `os._exit(79)` hard-stop contract and the extensible stop-policy seam).
 
+## Pluggable storage backends (`pg` · `redis` · `mongodb`)
+
+The queue store is selectable (since v0.3.0). Postgres is the default and the
+reference path; `redis` and `mongodb` are **opt-in** providers that reproduce the
+same durable-queue contract — claim exactly-once, lease/reclaim, idempotent
+terminals, an **atomic outbox**, and per-namespace tenant isolation.
+
+```python
+import queue_workflows
+queue_workflows.configure(db_backend="redis")     # or "mongodb" / "pg" (default)
+from queue_workflows.backends import get_backend
+be = get_backend()
+job_id = be.enqueue("cpu", {"task": "render"})
+job = be.claim("cpu", worker="box-1", lease_s=30)
+be.complete_with_event(job["id"], "completed", result={"ok": True})
+```
+
+`pip install 'queue_workflows[redis]'` / `'[mongodb]'` for the optional drivers
+(mongo needs a replica set). This is **additive** — it does not move the existing
+orchestrator/worker off Postgres. See
+[`docs/storage_backends.md`](docs/storage_backends.md) for the design, the
+capability matrix (and its caveats), and the integration boundary.
+
 ## Tests
 
 ```
@@ -217,4 +243,6 @@ QUEUE_WORKFLOWS_TEST_DB_URL=postgresql://user:pw@host:port/queue_workflows_test
 ```
 
 The suite forces a `*_test` DB and applies the engine migration chain only. See
-`tests/conftest.py`.
+`tests/conftest.py`. The multi-backend contract suite additionally reads
+`QUEUE_WORKFLOWS_TEST_REDIS_URL` / `QUEUE_WORKFLOWS_TEST_MONGO_URL` (each backend
+skips if its server is unset/unreachable).
diff --git a/docs/storage_backends.md b/docs/storage_backends.md
new file mode 100644
index 0000000..5f5fd2c
--- /dev/null
+++ b/docs/storage_backends.md
@@ -0,0 +1,123 @@
+# Pluggable storage backends (`pg` · `redis` · `mongodb`)
+
+`queue_workflows` is, by design, a **Postgres-as-queue** engine — its claim loop,
+lease/reclaim, dispatch outbox and wake are written directly in SQL. That stays
+the reference path. This document describes the **`StorageBackend` SPI**, an
+additive seam (since v0.3.0) that makes the *queue storage* selectable so the
+same durable-queue semantics can run on Redis or MongoDB.
+
+```python
+import queue_workflows
+queue_workflows.configure(db_backend="redis")          # or "mongodb", or "pg" (default)
+from queue_workflows.backends import get_backend
+be = get_backend()                                      # bound to config.db_namespace
+jid = be.enqueue("cpu", {"task": "render"})
+job = be.claim("cpu", worker="box-1", lease_s=30)
+be.complete_with_event(job["id"], "completed", result={"ok": True})
+```
+
+Selecting `pg` (the default) imports **nothing** new and changes nothing — the
+seam is opt-in and the legacy engine is untouched.
+
+## What the SPI is — and is not
+
+The port (`queue_workflows/backends/base.py`, `StorageBackend`) is a **generic
+durable work-queue with a transactional outbox**:
+
+- `enqueue` / `claim` (exactly-once) / `renew_lease` / `reclaim_expired` / `requeue_for_retry`
+- idempotent terminals (`mark_completed` / `mark_failed`) and the **atomic outbox**
+  (`complete_with_event` / `fail_with_event` — go terminal *and* append the event,
+  both-or-neither)
+- best-effort wake (`notify` / `subscribe`), `heartbeat` / `workers`, and the
+  operator `set_control` / `desired_state` (ON/OFF, default-ON)
+
+It is deliberately **not leaky**: no method takes or returns a driver handle
+(psycopg cursor, redis pipeline, pymongo session). Each backend is **bound to one
+namespace** and scopes every key/row/collection by it, so two tenants on one
+server are isolated.
+
+> **Integration boundary (read this).** The SPI is **additive**. Selecting
+> `redis`/`mongodb` does **not** re-home the existing orchestrator / claim-worker
+> / dispatcher — those still run on Postgres. v0.3.0 ships the SPI + three
+> backends that pass an identical contract suite; wiring the DAG orchestrator
+> end-to-end onto a non-PG backend is a later milestone. Use the SPI today as a
+> standalone pluggable durable queue.
+
+## How each backend reproduces the contract
+
+| Guarantee | `pg` | `redis` | `mongodb` |
+|---|---|---|---|
+| Claim exactly-once | `FOR UPDATE SKIP LOCKED` | `ZPOPMIN` inside a **Lua** script | `find_one_and_update` |
+| Priority + FIFO | `ORDER BY priority DESC, created_at` | sorted set, score `-priority`, FIFO tiebreak | sort `priority desc, seq asc` |
+| Lease / reclaim | lease column + `UPDATE … lease < now()` | per-queue running ZSET scored by expiry | `lease_expires_at < now` sweep |
+| Idempotent terminal | `UPDATE … WHERE status NOT IN (terminal) RETURNING *` | Lua status guard | `find_one_and_update` status guard |
+| **Atomic outbox** | one **transaction** | one **Lua script** | one **multi-doc transaction** |
+| Wake | `LISTEN` / `pg_notify` (in-txn) | **pub/sub** (fire-and-forget) | **change stream** on a capped coll. |
+| Namespace isolation | `namespace` column | key prefix `qw:<ns>:` | one **database** per namespace |
+
+### Caveats (be honest about the weakenings)
+
+- **Redis** has no cross-key ACID transaction; atomicity comes from **Lua**
+  (single server-side step), so this targets a **single Redis instance**, not
+  Cluster (Cluster needs all keys in one hash slot). The wake is pub/sub —
+  fire-and-forget — so a subscriber that is down misses it; the worker's safety
+  poll covers that, exactly as it does behind PG `LISTEN`.
+- **MongoDB** transactions *and* change streams require a **replica set** (a
+  single-node RS is fine); on a standalone `mongod`, `complete_with_event` /
+  `fail_with_event` and the wake will fail loudly. `ensure_schema()` pings on
+  connect.
+- **pg** computes `counts()` from the live `status`, so it can never drift;
+  redis keeps maintained counters (decremented against the *prior* status — see
+  the audit below); mongo counts live documents.
+
+## Configuration
+
+| `configure(...)` key | env (default) | meaning |
+|---|---|---|
+| `db_backend` | — | `"pg"` (default) / `"redis"` / `"mongodb"` (aliases `postgres`, `mongo`) |
+| `db_namespace` | — | tenant scope on a shared server (`""` ⇒ `"default"`) |
+| — | `QUEUE_WORKFLOWS_REDIS_URL` | redis DSN (`redis_url_env` renames it) |
+| — | `QUEUE_WORKFLOWS_MONGO_URL` | mongo DSN, incl. `?replicaSet=…` / `directConnection=true` |
+
+Install the optional driver: `pip install 'queue_workflows[redis]'` or
+`'queue_workflows[mongodb]'`. Selecting a backend whose driver is missing raises
+a clear `ImportError` naming the extra.
+
+## Tests
+
+`tests/test_backend_contract.py` is one parametrized suite run against **all three
+live servers** (a backend whose server is unreachable is skipped, not failed):
+
+```bash
+docker run -d --name qw_pg    -e POSTGRES_PASSWORD=postgres -p 5433:5432 postgres:16
+docker run -d --name qw_redis  -p 6380:6379 redis:7
+docker run -d --name qw_mongo  -p 27018:27017 mongo:7 --replSet rs0 --bind_ip_all
+docker exec qw_mongo mongosh --quiet --eval 'rs.initiate()'
+
+export QUEUE_WORKFLOWS_TEST_DB_URL="postgresql://postgres:postgres@localhost:5433/queue_workflows_test"
+export QUEUE_WORKFLOWS_TEST_REDIS_URL="redis://localhost:6380/0"
+export QUEUE_WORKFLOWS_TEST_MONGO_URL="mongodb://localhost:27018/?directConnection=true"
+python -m pytest tests/test_backend_contract.py
+```
+
+The suite pins claim-exactly-once (incl. an 8-thread contention stress test),
+lease/renew/reclaim, idempotent terminals, the atomic outbox (both-or-neither +
+no duplicate on re-delivery), the wake, heartbeat/control, and cross-namespace
+isolation.
+
+## Audit (v0.3.0)
+
+The SPI was audited across design / regressions / safety / internal data leakage:
+
+- **A1 (safety, fixed).** Redis `counts()` used maintained counters that
+  decremented `:running` unconditionally; a terminal/requeue applied to a job
+  that wasn't `running` would drift them. Fixed: the Lua now decrements the
+  job's **prior** status. (pg/mongo derive counts from live status — immune.)
+- **A2 (safety, added).** Added a multi-threaded contention test that proves
+  claim-exactly-once on all three backends, not just sequentially.
+- **A3 (design/leakage).** The SPI is additive (see *Integration boundary*); it
+  does not silently re-home the engine. No driver objects appear in the port, so
+  PG internals can't leak into redis/mongo call sites. Cross-namespace
+  claim/read/count/wake are isolated and tested.
+- **Regressions:** none — the full engine suite (415 tests) stays green; the new
+  drivers are imported lazily so a `pg`-only deploy needs neither installed.
diff --git a/pyproject.toml b/pyproject.toml
index 810c899..8bc6af8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,9 @@ dependencies = [
 
 [project.optional-dependencies]
 metrics = ["psutil"]               # hw_metrics CPU/RAM probe (GPU probe shells out, no dep)
-test = ["pytest", "pytest-cov"]
+redis = ["redis>=5"]               # the 'redis' StorageBackend (queue_workflows.backends.redis)
+mongodb = ["pymongo>=4.4"]         # the 'mongodb' StorageBackend (needs a replica set)
+test = ["pytest", "pytest-cov", "redis>=5", "pymongo>=4.4"]
 
 [project.scripts]
 queue-claim-worker   = "queue_workflows.claim_worker:main"   # generic CLIs (used standalone / other projects)
diff --git a/queue_workflows/__init__.py b/queue_workflows/__init__.py
index 5dda315..c344162 100644
--- a/queue_workflows/__init__.py
+++ b/queue_workflows/__init__.py
@@ -67,6 +67,8 @@ def configure(
     container_prefix: str | None = None,
     ingest_queues: frozenset[str] | None = None,
     ingest_default_budget_s: int | None = None,
+    db_backend: str | None = None,
+    db_namespace: str | None = None,
 ) -> EngineConfig:
     """Set engine configuration values. Only the passed keyword args are
     mutated; the rest keep their (ai_leads-byte-compatible) defaults. Returns
@@ -101,6 +103,14 @@ def configure(
             cfg.ingest_queues = iq
         if ingest_default_budget_s is not None:
             cfg.ingest_default_budget_s = int(ingest_default_budget_s)
+        if db_backend is not None:
+            # Validate against the backend registry (source of truth), imported
+            # lazily so the package root stays import-light and config a leaf.
+            from queue_workflows.backends import canonical_backend_name
+
+            cfg.db_backend = canonical_backend_name(db_backend)
+        if db_namespace is not None:
+            cfg.db_namespace = str(db_namespace)
     return cfg
 
 
diff --git a/queue_workflows/backends/__init__.py b/queue_workflows/backends/__init__.py
new file mode 100644
index 0000000..c1804d9
--- /dev/null
+++ b/queue_workflows/backends/__init__.py
@@ -0,0 +1,149 @@
+"""Storage-backend registry — resolve ``config.db_backend`` to a concrete
+:class:`~queue_workflows.backends.base.StorageBackend`.
+
+Three providers, one file each (split per the request): ``postgres`` (the
+reference/default), ``redis``, ``mongodb``. Provider modules are imported
+**lazily** — selecting ``pg`` never imports ``redis``/``pymongo``, so those stay
+*optional* dependencies (``pip install 'queue_workflows[redis]'`` /
+``[mongodb]``) and a pg-only deploy needs neither installed.
+
+Aliases collapse to a canonical name (``postgres``/``postgresql`` → ``pg``;
+``mongo`` → ``mongodb``) so ``configure(db_backend=...)`` is forgiving but the
+rest of the code only ever sees the canonical form.
+"""
+
+from __future__ import annotations
+
+import os
+import threading
+
+from queue_workflows.backends.base import (
+    Event,
+    Job,
+    StorageBackend,
+    WakeListener,
+)
+from queue_workflows.config import get_config
+
+__all__ = [
+    "StorageBackend",
+    "Job",
+    "Event",
+    "WakeListener",
+    "canonical_backend_name",
+    "is_known_backend",
+    "known_backends",
+    "build_backend",
+    "get_backend",
+    "close_all",
+]
+
+#: alias → canonical name. The canonical set is the source of truth ``configure``
+#: validates against.
+_ALIASES = {
+    "pg": "pg",
+    "postgres": "pg",
+    "postgresql": "pg",
+    "redis": "redis",
+    "mongo": "mongodb",
+    "mongodb": "mongodb",
+}
+
+# Dotted path to each provider class, imported only when first selected.
+_PROVIDERS = {
+    "pg": ("queue_workflows.backends.postgres", "PostgresBackend"),
+    "redis": ("queue_workflows.backends.redis", "RedisBackend"),
+    "mongodb": ("queue_workflows.backends.mongodb", "MongoBackend"),
+}
+
+_instances: dict[tuple[str, str, str], StorageBackend] = {}
+_lock = threading.RLock()
+
+
+def known_backends() -> frozenset[str]:
+    """The canonical backend names (``{"pg", "redis", "mongodb"}``)."""
+    return frozenset(_PROVIDERS)
+
+
+def canonical_backend_name(name: str) -> str:
+    """Normalize ``name`` to its canonical form, or raise ``ValueError`` listing
+    the valid names. Used by ``configure(db_backend=...)`` to fail fast."""
+    key = (name or "").strip().lower()
+    if key not in _ALIASES:
+        raise ValueError(
+            f"unknown db_backend {name!r}; valid: "
+            f"{sorted(set(_ALIASES))} (canonical {sorted(known_backends())})"
+        )
+    return _ALIASES[key]
+
+
+def is_known_backend(name: str) -> bool:
+    return (name or "").strip().lower() in _ALIASES
+
+
+def _load_provider(canonical: str) -> type[StorageBackend]:
+    module_path, cls_name = _PROVIDERS[canonical]
+    import importlib
+
+    try:
+        module = importlib.import_module(module_path)
+    except ImportError as exc:  # missing optional driver
+        extra = {"redis": "redis", "mongodb": "mongodb"}.get(canonical, canonical)
+        raise ImportError(
+            f"the {canonical!r} backend needs its driver: "
+            f"pip install 'queue_workflows[{extra}]'  ({exc})"
+        ) from exc
+    return getattr(module, cls_name)
+
+
+def build_backend(name: str, *, url: str, namespace: str = "") -> StorageBackend:
+    """Construct a backend explicitly (no config / no caching) — the seam tests
+    use this to point each provider at its dockerized server + a namespace."""
+    cls = _load_provider(canonical_backend_name(name))
+    return cls(url=url, namespace=namespace)
+
+
+def _url_for(canonical: str) -> str:
+    cfg = get_config()
+    if canonical == "pg":
+        from queue_workflows.db import db_url
+
+        return db_url()
+    env_name = cfg.redis_url_env if canonical == "redis" else cfg.mongo_url_env
+    url = os.environ.get(env_name)
+    if not url:
+        raise RuntimeError(
+            f"db_backend={canonical!r} but {env_name} is not set; "
+            f"export the {canonical} DSN there (or pass a different env via "
+            f"configure({'redis_url_env' if canonical == 'redis' else 'mongo_url_env'}=...))."
+        )
+    return url
+
+
+def get_backend(*, namespace: str | None = None) -> StorageBackend:
+    """Return the process-wide backend for the configured ``db_backend`` +
+    namespace, building (and caching) it on first use. Cached per
+    ``(backend, namespace, url)`` so repeated calls reuse one client/pool."""
+    cfg = get_config()
+    canonical = canonical_backend_name(cfg.db_backend)
+    ns = namespace if namespace is not None else cfg.db_namespace
+    url = _url_for(canonical)
+    key = (canonical, ns or "", url)
+    with _lock:
+        be = _instances.get(key)
+        if be is None:
+            be = _load_provider(canonical)(url=url, namespace=ns or "")
+            be.ensure_schema()
+            _instances[key] = be
+        return be
+
+
+def close_all() -> None:
+    """Close + drop every cached backend (orchestrator shutdown / test teardown)."""
+    with _lock:
+        for be in _instances.values():
+            try:
+                be.close()
+            except Exception:  # teardown best-effort
+                pass
+        _instances.clear()
diff --git a/queue_workflows/backends/base.py b/queue_workflows/backends/base.py
new file mode 100644
index 0000000..e42a98a
--- /dev/null
+++ b/queue_workflows/backends/base.py
@@ -0,0 +1,271 @@
+"""The :class:`StorageBackend` port — a host-agnostic durable work-queue SPI.
+
+WHY THIS EXISTS. The engine proper is, by design, *Postgres-as-queue*: its
+claim loop, lease/reclaim, dispatch outbox and wake are written directly in SQL
+(``FOR UPDATE SKIP LOCKED``, ``RETURNING`` CAS, ``pg_notify`` in-trigger). That
+is deliberate and stays the reference path. This module factors out the **subset
+of behaviour that defines "a queue backend"** into an abstract port so the
+*database type* becomes selectable (``configure(db_backend="pg"|"redis"|
+"mongodb")``) without each call site speaking SQL. It is **additive**: selecting
+``pg`` (the default) changes nothing for existing consumers.
+
+DESIGN — what the port is, and what it is NOT.
+
+  * It is a **generic durable queue with a transactional outbox**: enqueue →
+    claim-exactly-once → lease/renew → terminal(+atomic event) → reclaim, plus a
+    best-effort wake (NOTIFY/pub-sub/change-stream) and the operator heartbeat /
+    ON-OFF control rows. These are the contracts every backend must honour
+    identically — pinned by ``tests/test_backend_contract.py``.
+  * It is **NOT leaky**: no method takes or returns a driver handle (psycopg
+    cursor, redis pipeline, pymongo session). The outbox atomicity — "go
+    terminal AND append the event, both-or-neither" — is exposed as a single
+    high-level call (:meth:`complete_with_event` / :meth:`fail_with_event`) that
+    each backend implements atomically *in its own idiom* (PG txn / Redis Lua /
+    Mongo multi-doc txn). Hosts never hold a transaction object, so PG internals
+    can't bleed into Redis/Mongo call sites (audit dimension: internal data
+    leakage).
+  * Each backend instance is **bound to one namespace** (constructor arg). Every
+    key / row / collection it touches is scoped by that namespace, so two apps
+    sharing one Redis/Mongo server cannot claim or read each other's jobs. This
+    is the multi-tenant isolation guard — see the cross-namespace test.
+
+A backend is constructed with ``(url, namespace)`` and is safe to share across
+threads (each call borrows its own connection / pipeline). ``close()`` releases
+pooled resources.
+"""
+
+from __future__ import annotations
+
+import abc
+from typing import Any, Iterator, Protocol, TypedDict
+
+# ── job status vocabulary (identical across backends) ────────────────────────
+
+STATUS_QUEUED = "queued"
+STATUS_RUNNING = "running"
+STATUS_COMPLETED = "completed"
+STATUS_FAILED = "failed"
+#: Terminal states a row can never leave — the idempotency guard. A second
+#: ``mark_*`` / ``*_with_event`` on a row already in one of these is a no-op that
+#: returns ``None`` (and writes NO event), exactly like the PG engine's
+#: ``UPDATE … WHERE status NOT IN (...) RETURNING *`` shape.
+TERMINAL_STATUSES = frozenset({STATUS_COMPLETED, STATUS_FAILED})
+
+#: ``""`` namespace ⇒ this literal, so a key/collection always has a real scope.
+DEFAULT_NAMESPACE = "default"
+
+
+class Job(TypedDict, total=False):
+    """The canonical job shape every backend returns (a plain dict at runtime).
+
+    ``lease_expires_at`` / ``created_at`` / ``updated_at`` are epoch seconds
+    (float) so the shape is backend-neutral — each adapter converts to/from its
+    native time type. ``payload`` / ``result`` are JSON-able dicts."""
+
+    id: str
+    queue: str
+    namespace: str
+    status: str
+    payload: dict[str, Any]
+    priority: int
+    attempts: int
+    claimed_by: str | None
+    lease_expires_at: float | None
+    result: dict[str, Any] | None
+    error: str | None
+    created_at: float
+    updated_at: float
+
+
+class Event(TypedDict, total=False):
+    """An outbox event row (the durable dispatch-event analog)."""
+
+    seq: int
+    job_id: str
+    namespace: str
+    queue: str
+    event_type: str
+    detail: dict[str, Any]
+    created_at: float
+
+
+class WakeListener(Protocol):
+    """A subscription handle returned by :meth:`StorageBackend.subscribe`.
+
+    Used as a context manager. :meth:`wait` blocks up to ``timeout`` for the
+    next wake payload (a queue name) and returns it, or ``None`` on timeout — so
+    a worker loops ``while not stop: q = sub.wait(1.0)`` exactly as it loops on
+    ``LISTEN`` today (the timeout doubling as the dropped-NOTIFY safety poll)."""
+
+    def __enter__(self) -> WakeListener: ...
+    def __exit__(self, *exc: object) -> None: ...
+    def wait(self, timeout: float) -> str | None: ...
+
+
+class StorageBackend(abc.ABC):
+    """Abstract durable-queue backend. See module docstring for the contract.
+
+    Subclasses: :class:`~queue_workflows.backends.postgres.PostgresBackend`,
+    :class:`~queue_workflows.backends.redis.RedisBackend`,
+    :class:`~queue_workflows.backends.mongodb.MongoBackend`.
+    """
+
+    #: Registry name (``"pg"`` / ``"redis"`` / ``"mongodb"``); set on subclasses.
+    name: str = ""
+
+    def __init__(self, *, url: str, namespace: str = "") -> None:
+        self.url = url
+        self.namespace = namespace or DEFAULT_NAMESPACE
+
+    # ── schema / lifecycle ────────────────────────────────────────────────────
+
+    @abc.abstractmethod
+    def ensure_schema(self) -> None:
+        """Idempotently create whatever durable structures the backend needs
+        (PG tables / Mongo indexes; a no-op for Redis). Safe to call on boot."""
+
+    @abc.abstractmethod
+    def close(self) -> None:
+        """Release pooled connections / clients."""
+
+    # ── enqueue / claim / lease ────────────────────────────────────────────────
+
+    @abc.abstractmethod
+    def enqueue(
+        self, queue: str, payload: dict[str, Any], *,
+        job_id: str | None = None, priority: int = 0,
+    ) -> str:
+        """Append a ``queued`` job and fire the wake NOTIFY for ``queue``.
+        Returns the job id (generated when ``job_id`` is None). The wake must be
+        emitted as part of the same durable write (no "queued but no wake")."""
+
+    @abc.abstractmethod
+    def claim(self, queue: str, worker: str, *, lease_s: float) -> Job | None:
+        """Atomically take the single oldest claimable (``queued``, highest
+        priority) job on ``queue``: flip it ``running``, stamp ``claimed_by`` +
+        ``lease_expires_at = now+lease_s``, bump ``attempts``. Returns the job,
+        or ``None`` if none claimable. **Exactly-once under contention**: two
+        concurrent callers never receive the same job (the SKIP-LOCKED guarantee
+        each backend reproduces in its own idiom)."""
+
+    @abc.abstractmethod
+    def renew_lease(self, job_id: str, worker: str, *, lease_s: float) -> bool:
+        """Extend ``lease_expires_at`` to ``now+lease_s`` iff the job is still
+        ``running`` and still owned by ``worker``. Returns whether it renewed."""
+
+    @abc.abstractmethod
+    def reclaim_expired(self, *, queue: str | None = None) -> list[str]:
+        """Re-queue every ``running`` job whose lease has lapsed (optionally
+        filtered to ``queue``): flip ``running`` → ``queued``, clear the lease /
+        owner, re-fire the wake. Returns the reclaimed job ids. This is the sole
+        recovery path for an orphaned ``running`` row."""
+
+    @abc.abstractmethod
+    def requeue_for_retry(self, job_id: str) -> Job | None:
+        """Watchdog re-queue: flip a ``running`` job back to ``queued`` (clear
+        lease/owner, keep ``attempts`` as the retry counter), re-fire the wake,
+        and write **no** event. Returns the updated job, or ``None`` if it was
+        already terminal."""
+
+    # ── terminal transitions (idempotent) ──────────────────────────────────────
+
+    @abc.abstractmethod
+    def mark_completed(
+        self, job_id: str, *, result: dict[str, Any] | None = None,
+    ) -> Job | None:
+        """Flip ``job_id`` → ``completed`` iff not already terminal; stamp
+        ``result``. Returns the row, or ``None`` if it was already terminal
+        (the idempotency guard — a duplicate delivery is a safe no-op)."""
+
+    @abc.abstractmethod
+    def mark_failed(self, job_id: str, *, error: str | None = None) -> Job | None:
+        """Flip ``job_id`` → ``failed`` iff not already terminal; stamp
+        ``error``. Returns the row, or ``None`` if already terminal."""
+
+    @abc.abstractmethod
+    def complete_with_event(
+        self, job_id: str, event_type: str, *,
+        result: dict[str, Any] | None = None,
+        detail: dict[str, Any] | None = None,
+    ) -> Job | None:
+        """ATOMIC outbox: go ``completed`` **and** append one :class:`Event`,
+        both-or-neither. Returns the row, or ``None`` if already terminal — in
+        which case **no event is written** (the second-delivery no-op)."""
+
+    @abc.abstractmethod
+    def fail_with_event(
+        self, job_id: str, event_type: str, *,
+        error: str | None = None,
+        detail: dict[str, Any] | None = None,
+    ) -> Job | None:
+        """ATOMIC outbox twin of :meth:`complete_with_event` for the failure
+        path (mark ``failed`` + append the event in one unit, or neither)."""
+
+    # ── reads ──────────────────────────────────────────────────────────────────
+
+    @abc.abstractmethod
+    def get(self, job_id: str) -> Job | None:
+        """Return the job by id (within this namespace), or ``None``."""
+
+    @abc.abstractmethod
+    def counts(self, queue: str) -> dict[str, int]:
+        """``{queued, running, completed, failed}`` for ``queue`` (snapshot)."""
+
+    @abc.abstractmethod
+    def events(self, *, since: int = 0, limit: int = 1000) -> list[Event]:
+        """Outbox events with ``seq > since`` (oldest-first), for an event
+        drainer / the atomicity assertions. Namespace-scoped."""
+
+    # ── wake (NOTIFY / pub-sub / change-stream) ─────────────────────────────────
+
+    @abc.abstractmethod
+    def notify(self, queue: str) -> None:
+        """Best-effort wake for listeners on ``queue`` (out-of-band; ``enqueue``
+        already wakes). Mirrors a manual ``pg_notify``."""
+
+    @abc.abstractmethod
+    def subscribe(self, *queues: str) -> WakeListener:
+        """Open a :class:`WakeListener` for the given queues. Best-effort: a
+        dropped wake is covered by the caller's safety-poll timeout, identical to
+        the engine's 1 s ``LISTEN`` safety poll."""
+
+    # ── heartbeats + operator ON/OFF control ────────────────────────────────────
+
+    @abc.abstractmethod
+    def heartbeat(
+        self, host: str, queue: str, *,
+        current_model: str | None = None, stale_after_s: float = 30.0,
+    ) -> None:
+        """Upsert this worker's ``(host, queue)`` liveness row (TTL/last_seen)."""
+
+    @abc.abstractmethod
+    def workers(self, queue: str) -> list[dict[str, Any]]:
+        """Live workers on ``queue`` (heartbeat fresher than its TTL)."""
+
+    @abc.abstractmethod
+    def set_control(
+        self, host: str, queue: str, *,
+        desired_state: str, stop_policy: str = "hard",
+        requested_by: str | None = None,
+    ) -> None:
+        """Upsert the operator desired-state row for a ``(host, queue)`` worker."""
+
+    @abc.abstractmethod
+    def desired_state(self, host: str, queue: str) -> str:
+        """Effective desired state: ``"off"`` only when an explicit OFF row
+        exists, else ``"on"`` (absent ⇒ ON — the default-on contract)."""
+
+
+def normalized_namespace(namespace: str) -> str:
+    """``""`` → :data:`DEFAULT_NAMESPACE`, else verbatim (trimmed)."""
+    return (namespace or "").strip() or DEFAULT_NAMESPACE
+
+
+def drain_iter(listener: WakeListener, *, timeout: float) -> Iterator[str]:
+    """Yield wake payloads until a ``timeout`` elapses with none — a small helper
+    for tests/consumers. (Backends needn't override.)"""
+    while True:
+        q = listener.wait(timeout)
+        if q is None:
+            return
+        yield q
diff --git a/queue_workflows/backends/mongodb.py b/queue_workflows/backends/mongodb.py
new file mode 100644
index 0000000..5f0ce62
--- /dev/null
+++ b/queue_workflows/backends/mongodb.py
@@ -0,0 +1,324 @@
+"""MongoDB :class:`StorageBackend`.
+
+Mongo maps onto the contract more directly than Redis but with its own caveats,
+which the audit flagged up front:
+
+  * **Atomic claim via ``find_one_and_update``** — find the oldest, highest
+    priority ``queued`` job and flip it ``running`` in one atomic document op.
+    Two concurrent claimers get different jobs because the first's update removes
+    it from the ``status:'queued'`` filter before the second matches it
+    (the standard Mongo work-queue claim; reproduces SKIP-LOCKED's effect).
+  * **Atomic outbox via a MULTI-DOCUMENT TRANSACTION** — go terminal *and* insert
+    the event in one transaction, both-or-neither. This is why the backend needs
+    a **replica set** (transactions — and change streams — are unavailable on a
+    standalone ``mongod``); a single-node RS is enough.
+  * **Wake via a change stream** on a capped ``wake`` collection (also RS-only).
+    Each enqueue/reclaim/requeue inserts a tiny ``{queue}`` doc; the stream
+    surfaces it. Best-effort like every wake — the worker's safety poll covers a
+    stream that hasn't resumed yet.
+
+Each namespace is a SEPARATE DATABASE (``qw_<namespace>``), so two tenants on one
+server share nothing — the strongest possible isolation (the data-leakage guard).
+"""
+
+from __future__ import annotations
+
+import re
+import time
+import uuid
+from datetime import datetime, timezone
+from typing import Any
+
+from pymongo import ASCENDING, DESCENDING, MongoClient, ReturnDocument
+from pymongo.errors import CollectionInvalid
+
+from queue_workflows.backends.base import Event, Job, StorageBackend, WakeListener
+
+_TERMINAL = ("completed", "failed")
+
+
+def _db_name(namespace: str) -> str:
+    return "qw_" + re.sub(r"[^A-Za-z0-9_]", "_", namespace)[:48]
+
+
+class MongoBackend(StorageBackend):
+    name = "mongodb"
+
+    def __init__(self, *, url: str, namespace: str = "") -> None:
+        super().__init__(url=url, namespace=namespace)
+        self._client: MongoClient = MongoClient(
+            url, serverSelectionTimeoutMS=5000, tz_aware=True,
+        )
+        self._db = self._client[_db_name(self.namespace)]
+        self._jobs = self._db["jobs"]
+        self._events = self._db["events"]
+        self._workers = self._db["workers"]
+        self._controls = self._db["controls"]
+        self._counters = self._db["counters"]
+        self._wake = self._db["wake"]
+
+    # ── schema ──────────────────────────────────────────────────────────────────
+
+    def ensure_schema(self) -> None:
+        # Force server selection now so an unreachable server fails the probe.
+        self._client.admin.command("ping")
+        try:
+            self._db.create_collection("wake", capped=True, size=1 << 20)
+        except CollectionInvalid:
+            pass  # already exists
+        self._jobs.create_index(
+            [("queue", ASCENDING), ("status", ASCENDING),
+             ("priority", DESCENDING), ("seq", ASCENDING)]
+        )
+        self._jobs.create_index([("status", ASCENDING), ("lease_expires_at", ASCENDING)])
+        self._events.create_index([("seq", ASCENDING)])
+        self._controls.create_index(
+            [("host", ASCENDING), ("queue", ASCENDING)], unique=True
+        )
+        # TTL: stale heartbeats self-expire (the liveness window).
+        self._workers.create_index("last_seen", expireAfterSeconds=60)
+        self._workers.create_index([("queue", ASCENDING)])
+        for cid in ("seq", "eventseq"):
+            self._counters.update_one({"_id": cid}, {"$setOnInsert": {"n": 0}}, upsert=True)
+
+    def close(self) -> None:
+        self._client.close()
+
+    # ── counters ──────────────────────────────────────────────────────────────────
+
+    def _next(self, counter_id: str, *, session=None) -> int:
+        doc = self._counters.find_one_and_update(
+            {"_id": counter_id}, {"$inc": {"n": 1}},
+            upsert=True, return_document=ReturnDocument.AFTER, session=session,
+        )
+        return int(doc["n"])
+
+    def _wake_insert(self, queue: str) -> None:
+        try:
+            self._wake.insert_one({"queue": queue, "ts": time.time()})
+        except Exception:
+            pass  # wake is best-effort
+
+    # ── enqueue / claim / lease ────────────────────────────────────────────────
+
+    def enqueue(self, queue, payload, *, job_id=None, priority=0) -> str:
+        jid = job_id or uuid.uuid4().hex
+        now = time.time()
+        self._jobs.insert_one({
+            "_id": jid, "namespace": self.namespace, "queue": queue,
+            "status": "queued", "payload": payload or {}, "priority": int(priority),
+            "attempts": 0, "claimed_by": None, "lease_expires_at": None,
+            "result": None, "error": None, "seq": self._next("seq"),
+            "created_at": now, "updated_at": now,
+        })
+        self._wake_insert(queue)
+        return jid
+
+    def claim(self, queue, worker, *, lease_s) -> Job | None:
+        now = time.time()
+        doc = self._jobs.find_one_and_update(
+            {"queue": queue, "status": "queued"},
+            {"$set": {"status": "running", "claimed_by": worker,
+                      "lease_expires_at": now + float(lease_s), "updated_at": now},
+             "$inc": {"attempts": 1}},
+            sort=[("priority", DESCENDING), ("seq", ASCENDING)],
+            return_document=ReturnDocument.AFTER,
+        )
+        return _job(doc)
+
+    def renew_lease(self, job_id, worker, *, lease_s) -> bool:
+        now = time.time()
+        doc = self._jobs.find_one_and_update(
+            {"_id": job_id, "status": "running", "claimed_by": worker},
+            {"$set": {"lease_expires_at": now + float(lease_s), "updated_at": now}},
+            return_document=ReturnDocument.AFTER,
+        )
+        return doc is not None
+
+    def reclaim_expired(self, *, queue=None) -> list[str]:
+        now = time.time()
+        flt: dict[str, Any] = {"status": "running", "lease_expires_at": {"$lt": now}}
+        if queue is not None:
+            flt["queue"] = queue
+        out: list[str] = []
+        while True:
+            doc = self._jobs.find_one_and_update(
+                flt,
+                {"$set": {"status": "queued", "claimed_by": None,
+                          "lease_expires_at": None, "updated_at": time.time()}},
+                return_document=ReturnDocument.AFTER,
+            )
+            if doc is None:
+                break
+            out.append(doc["_id"])
+            self._wake_insert(doc["queue"])
+        return out
+
+    def requeue_for_retry(self, job_id) -> Job | None:
+        doc = self._jobs.find_one_and_update(
+            {"_id": job_id, "status": {"$nin": list(_TERMINAL)}},
+            {"$set": {"status": "queued", "claimed_by": None,
+                      "lease_expires_at": None, "updated_at": time.time()}},
+            return_document=ReturnDocument.AFTER,
+        )
+        if doc is not None:
+            self._wake_insert(doc["queue"])
+        return _job(doc)
+
+    # ── terminal transitions ────────────────────────────────────────────────────
+
+    def _mark(self, job_id, status, *, result, error) -> Job | None:
+        doc = self._jobs.find_one_and_update(
+            {"_id": job_id, "status": {"$nin": list(_TERMINAL)}},
+            {"$set": {"status": status, "result": result, "error": error,
+                      "updated_at": time.time()}},
+            return_document=ReturnDocument.AFTER,
+        )
+        return _job(doc)
+
+    def mark_completed(self, job_id, *, result=None) -> Job | None:
+        return self._mark(job_id, "completed", result=result, error=None)
+
+    def mark_failed(self, job_id, *, error=None) -> Job | None:
+        return self._mark(job_id, "failed", result=None, error=error)
+
+    def _terminal_with_event(self, job_id, status, event_type, *, result, error, detail):
+        # One transaction: the terminal flip + the event insert commit together,
+        # or — when the job is already terminal (0 docs matched) — neither does.
+        with self._client.start_session() as session:
+            with session.start_transaction():
+                doc = self._jobs.find_one_and_update(
+                    {"_id": job_id, "status": {"$nin": list(_TERMINAL)}},
+                    {"$set": {"status": status, "result": result, "error": error,
+                              "updated_at": time.time()}},
+                    return_document=ReturnDocument.AFTER, session=session,
+                )
+                if doc is None:
+                    return None  # already terminal → empty txn, NO event written
+                self._events.insert_one({
+                    "seq": self._next("eventseq", session=session),
+                    "namespace": self.namespace, "job_id": job_id,
+                    "queue": doc["queue"], "event_type": event_type,
+                    "detail": detail or {}, "created_at": time.time(),
+                }, session=session)
+        return _job(doc)
+
+    def complete_with_event(self, job_id, event_type, *, result=None, detail=None):
+        return self._terminal_with_event(
+            job_id, "completed", event_type, result=result, error=None, detail=detail
+        )
+
+    def fail_with_event(self, job_id, event_type, *, error=None, detail=None):
+        return self._terminal_with_event(
+            job_id, "failed", event_type, result=None, error=error, detail=detail
+        )
+
+    # ── reads ────────────────────────────────────────────────────────────────────
+
+    def get(self, job_id) -> Job | None:
+        return _job(self._jobs.find_one({"_id": job_id}))
+
+    def counts(self, queue) -> dict[str, int]:
+        return {
+            s: self._jobs.count_documents({"queue": queue, "status": s})
+            for s in ("queued", "running", "completed", "failed")
+        }
+
+    def events(self, *, since=0, limit=1000) -> list[Event]:
+        cur = self._events.find({"seq": {"$gt": since}}).sort("seq", ASCENDING).limit(limit)
+        return [
+            Event(
+                seq=int(d["seq"]), job_id=d["job_id"], namespace=self.namespace,
+                queue=d.get("queue"), event_type=d["event_type"],
+                detail=d.get("detail") or {}, created_at=float(d.get("created_at") or 0),
+            )
+            for d in cur
+        ]
+
+    # ── wake (change stream) ────────────────────────────────────────────────────
+
+    def notify(self, queue) -> None:
+        self._wake_insert(queue)
+
+    def subscribe(self, *queues) -> WakeListener:
+        return _MongoWakeListener(self._wake, frozenset(queues))
+
+    # ── heartbeats + control ───────────────────────────────────────────────────────
+
+    def heartbeat(self, host, queue, *, current_model=None, stale_after_s=30.0) -> None:
+        self._workers.update_one(
+            {"_id": f"{queue}:{host}"},
+            {"$set": {"host": host, "queue": queue, "current_model": current_model,
+                      "last_seen": datetime.now(timezone.utc)}},
+            upsert=True,
+        )
+
+    def workers(self, queue) -> list[dict[str, Any]]:
+        return [
+            {"host": d.get("host"), "queue": d.get("queue"),
+             "current_model": d.get("current_model"),
+             "last_seen": d["last_seen"].timestamp() if d.get("last_seen") else None}
+            for d in self._workers.find({"queue": queue})
+        ]
+
+    def set_control(self, host, queue, *, desired_state, stop_policy="hard",
+                    requested_by=None) -> None:
+        self._controls.update_one(
+            {"host": host, "queue": queue},
+            {"$set": {"desired_state": desired_state, "stop_policy": stop_policy,
+                      "requested_by": requested_by, "updated_at": time.time()}},
+            upsert=True,
+        )
+
+    def desired_state(self, host, queue) -> str:
+        doc = self._controls.find_one({"host": host, "queue": queue})
+        return "off" if (doc and doc.get("desired_state") == "off") else "on"
+
+
+def _job(doc: dict[str, Any] | None) -> Job | None:
+    if doc is None:
+        return None
+    return Job(
+        id=doc["_id"], queue=doc["queue"], namespace=doc.get("namespace", ""),
+        status=doc["status"], payload=doc.get("payload") or {},
+        priority=int(doc.get("priority") or 0), attempts=int(doc.get("attempts") or 0),
+        claimed_by=doc.get("claimed_by"), lease_expires_at=doc.get("lease_expires_at"),
+        result=doc.get("result"), error=doc.get("error"),
+        created_at=float(doc.get("created_at") or 0),
+        updated_at=float(doc.get("updated_at") or 0),
+    )
+
+
+class _MongoWakeListener:
+    """Change stream over the capped ``wake`` collection; ``wait`` returns the
+    inserted queue if it's one we subscribed to."""
+
+    def __init__(self, wake_collection, queues: frozenset[str]) -> None:
+        self._coll = wake_collection
+        self._queues = queues
+        self._stream = None
+
+    def __enter__(self):
+        # Open the stream BEFORE the caller enqueues, so the insert is captured.
+        self._stream = self._coll.watch(
+            [{"$match": {"operationType": "insert"}}], max_await_time_ms=250,
+        )
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        if self._stream is not None:
+            try:
+                self._stream.close()
+            finally:
+                self._stream = None
+
+    def wait(self, timeout: float) -> str | None:
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            change = self._stream.try_next()
+            if change is None:
+                continue  # try_next already blocked up to max_await_time_ms
+            q = (change.get("fullDocument") or {}).get("queue")
+            if q is not None and (not self._queues or q in self._queues):
+                return q
+        return None
diff --git a/queue_workflows/backends/postgres.py b/queue_workflows/backends/postgres.py
new file mode 100644
index 0000000..5c8fa85
--- /dev/null
+++ b/queue_workflows/backends/postgres.py
@@ -0,0 +1,438 @@
+"""PostgreSQL :class:`StorageBackend` — the reference adapter.
+
+This is the canonical implementation: it expresses the contract with the exact
+primitives the engine proper uses — ``FOR UPDATE SKIP LOCKED`` for the claim,
+``UPDATE … WHERE status NOT IN (terminal) RETURNING *`` for idempotent
+transitions, a single transaction for the atomic outbox, and ``pg_notify`` for
+the wake. It is the yardstick the redis / mongodb adapters are measured against
+by ``tests/test_backend_contract.py``.
+
+It uses its OWN small tables (``qw_jobs`` / ``qw_events`` / ``qw_workers`` /
+``qw_controls``), separate from the engine's ``workflow_*`` schema, so enabling
+the SPI never collides with — or migrates — a host's existing engine tables.
+Every row carries a ``namespace`` column and every query filters on it, so one
+Postgres database can host several isolated tenants (the data-leakage guard).
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import threading
+import uuid
+from contextlib import contextmanager
+from datetime import datetime, timezone
+from typing import Any, Iterator
+
+import psycopg
+from psycopg.rows import dict_row
+from psycopg.types.json import Json
+from psycopg_pool import ConnectionPool
+
+from queue_workflows.backends.base import (
+    STATUS_QUEUED,
+    STATUS_RUNNING,
+    TERMINAL_STATUSES,
+    Event,
+    Job,
+    StorageBackend,
+    WakeListener,
+)
+
+log = logging.getLogger(__name__)
+
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS qw_jobs (
+    id               TEXT PRIMARY KEY,
+    namespace        TEXT NOT NULL,
+    queue            TEXT NOT NULL,
+    status           TEXT NOT NULL,
+    payload          JSONB NOT NULL DEFAULT '{}'::jsonb,
+    priority         INTEGER NOT NULL DEFAULT 0,
+    attempts         INTEGER NOT NULL DEFAULT 0,
+    claimed_by       TEXT,
+    lease_expires_at TIMESTAMPTZ,
+    result           JSONB,
+    error            TEXT,
+    created_at       TIMESTAMPTZ NOT NULL DEFAULT now(),
+    updated_at       TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+CREATE INDEX IF NOT EXISTS qw_jobs_claim_idx
+    ON qw_jobs (namespace, queue, status, priority DESC, created_at);
+
+CREATE TABLE IF NOT EXISTS qw_events (
+    seq         BIGSERIAL PRIMARY KEY,
+    namespace   TEXT NOT NULL,
+    job_id      TEXT NOT NULL,
+    queue       TEXT,
+    event_type  TEXT NOT NULL,
+    detail      JSONB NOT NULL DEFAULT '{}'::jsonb,
+    created_at  TIMESTAMPTZ NOT NULL DEFAULT now()
+);
+CREATE INDEX IF NOT EXISTS qw_events_ns_seq_idx ON qw_events (namespace, seq);
+
+CREATE TABLE IF NOT EXISTS qw_workers (
+    namespace     TEXT NOT NULL,
+    host          TEXT NOT NULL,
+    queue         TEXT NOT NULL,
+    current_model TEXT,
+    last_seen     TIMESTAMPTZ NOT NULL DEFAULT now(),
+    PRIMARY KEY (namespace, host, queue)
+);
+
+CREATE TABLE IF NOT EXISTS qw_controls (
+    namespace     TEXT NOT NULL,
+    host          TEXT NOT NULL,
+    queue         TEXT NOT NULL,
+    desired_state TEXT NOT NULL DEFAULT 'on',
+    stop_policy   TEXT NOT NULL DEFAULT 'hard',
+    requested_by  TEXT,
+    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
+    PRIMARY KEY (namespace, host, queue)
+);
+"""
+
+
+def _epoch(dt: datetime | None) -> float | None:
+    return dt.timestamp() if isinstance(dt, datetime) else None
+
+
+def _channel(namespace: str) -> str:
+    """A NOTIFY channel name scoped to the namespace (so a wake never crosses
+    tenants). Sanitized to a valid unquoted identifier; the queue rides the
+    payload so one channel per namespace suffices."""
+    safe = re.sub(r"[^a-zA-Z0-9_]", "_", namespace)[:48]
+    return f"qwbe_{safe}"
+
+
+class PostgresBackend(StorageBackend):
+    name = "pg"
+
+    def __init__(self, *, url: str, namespace: str = "") -> None:
+        super().__init__(url=url, namespace=namespace)
+        self._pool: ConnectionPool | None = None
+        self._lock = threading.Lock()
+
+    # ── pool / schema ──────────────────────────────────────────────────────────
+
+    def _ensure_pool(self) -> ConnectionPool:
+        if self._pool is None:
+            with self._lock:
+                if self._pool is None:
+                    pool = ConnectionPool(
+                        self.url, min_size=1, max_size=5, open=False,
+                        kwargs={"row_factory": dict_row},
+                    )
+                    pool.open(wait=True, timeout=15.0)
+                    self._pool = pool
+        return self._pool
+
+    @contextmanager
+    def _conn(self) -> Iterator[psycopg.Connection]:
+        with self._ensure_pool().connection() as conn:
+            yield conn
+
+    def ensure_schema(self) -> None:
+        with self._conn() as conn:
+            conn.execute(_SCHEMA)
+            conn.commit()
+
+    def close(self) -> None:
+        with self._lock:
+            if self._pool is not None:
+                self._pool.close()
+                self._pool = None
+
+    # ── enqueue / claim / lease ────────────────────────────────────────────────
+
+    def enqueue(
+        self, queue: str, payload: dict[str, Any], *,
+        job_id: str | None = None, priority: int = 0,
+    ) -> str:
+        jid = job_id or uuid.uuid4().hex
+        with self._conn() as conn:
+            conn.execute(
+                "INSERT INTO qw_jobs (id, namespace, queue, status, payload, priority) "
+                "VALUES (%s, %s, %s, %s, %s, %s)",
+                (jid, self.namespace, queue, STATUS_QUEUED, Json(payload or {}), priority),
+            )
+            # The wake rides the same commit as the insert (no "queued but no wake").
+            conn.execute("SELECT pg_notify(%s, %s)", (_channel(self.namespace), queue))
+            conn.commit()
+        return jid
+
+    _CLAIM = """
+        UPDATE qw_jobs SET
+            status = 'running', claimed_by = %(worker)s,
+            lease_expires_at = now() + make_interval(secs => %(lease)s),
+            attempts = attempts + 1, updated_at = now()
+        WHERE id = (
+            SELECT id FROM qw_jobs
+            WHERE namespace = %(ns)s AND queue = %(queue)s AND status = 'queued'
+            ORDER BY priority DESC, created_at, id
+            FOR UPDATE SKIP LOCKED
+            LIMIT 1
+        )
+        RETURNING *
+    """
+
+    def claim(self, queue: str, worker: str, *, lease_s: float) -> Job | None:
+        with self._conn() as conn:
+            row = conn.execute(
+                self._CLAIM,
+                {"worker": worker, "lease": float(lease_s), "ns": self.namespace,
+                 "queue": queue},
+            ).fetchone()
+            conn.commit()
+        return _row_to_job(row)
+
+    def renew_lease(self, job_id: str, worker: str, *, lease_s: float) -> bool:
+        with self._conn() as conn:
+            row = conn.execute(
+                "UPDATE qw_jobs SET lease_expires_at = now() + make_interval(secs => %s), "
+                "updated_at = now() "
+                "WHERE id = %s AND namespace = %s AND status = 'running' "
+                "AND claimed_by = %s RETURNING id",
+                (float(lease_s), job_id, self.namespace, worker),
+            ).fetchone()
+            conn.commit()
+        return row is not None
+
+    def reclaim_expired(self, *, queue: str | None = None) -> list[str]:
+        sql = (
+            "UPDATE qw_jobs SET status = 'queued', claimed_by = NULL, "
+            "lease_expires_at = NULL, updated_at = now() "
+            "WHERE namespace = %s AND status = 'running' AND lease_expires_at < now()"
+        )
+        params: list[Any] = [self.namespace]
+        if queue is not None:
+            sql += " AND queue = %s"
+            params.append(queue)
+        sql += " RETURNING id, queue"
+        with self._conn() as conn:
+            rows = conn.execute(sql, params).fetchall()
+            for r in rows:  # re-fire the wake for each reclaimed queue
+                conn.execute(
+                    "SELECT pg_notify(%s, %s)", (_channel(self.namespace), r["queue"])
+                )
+            conn.commit()
+        return [r["id"] for r in rows]
+
+    def requeue_for_retry(self, job_id: str) -> Job | None:
+        with self._conn() as conn:
+            row = conn.execute(
+                "UPDATE qw_jobs SET status = 'queued', claimed_by = NULL, "
+                "lease_expires_at = NULL, updated_at = now() "
+                "WHERE id = %s AND namespace = %s "
+                "AND status NOT IN ('completed', 'failed') RETURNING *",
+                (job_id, self.namespace),
+            ).fetchone()
+            if row is not None:
+                conn.execute(
+                    "SELECT pg_notify(%s, %s)", (_channel(self.namespace), row["queue"])
+                )
+            conn.commit()
+        return _row_to_job(row)
+
+    # ── terminal transitions ────────────────────────────────────────────────────
+
+    def _mark(self, job_id: str, status: str, *, result, error) -> Job | None:
+        with self._conn() as conn:
+            row = self._mark_in(conn, job_id, status, result=result, error=error)
+            conn.commit()
+        return _row_to_job(row)
+
+    def _mark_in(self, conn, job_id, status, *, result, error):
+        return conn.execute(
+            "UPDATE qw_jobs SET status = %s, result = %s, error = %s, updated_at = now() "
+            "WHERE id = %s AND namespace = %s AND status NOT IN ('completed', 'failed') "
+            "RETURNING *",
+            (status, Json(result) if result is not None else None, error,
+             job_id, self.namespace),
+        ).fetchone()
+
+    def mark_completed(self, job_id, *, result=None) -> Job | None:
+        return self._mark(job_id, "completed", result=result, error=None)
+
+    def mark_failed(self, job_id, *, error=None) -> Job | None:
+        return self._mark(job_id, "failed", result=None, error=error)
+
+    def _terminal_with_event(
+        self, job_id, status, event_type, *, result, error, detail,
+    ) -> Job | None:
+        # ONE transaction: the terminal UPDATE and the event INSERT commit
+        # together, or (when the row is already terminal → 0 rows updated) neither
+        # is written. This is the outbox-atomicity contract in its purest form.
+        with self._conn() as conn:
+            row = self._mark_in(conn, job_id, status, result=result, error=error)
+            if row is None:
+                conn.rollback()
+                return None
+            conn.execute(
+                "INSERT INTO qw_events (namespace, job_id, queue, event_type, detail) "
+                "VALUES (%s, %s, %s, %s, %s)",
+                (self.namespace, job_id, row["queue"], event_type, Json(detail or {})),
+            )
+            conn.commit()
+        return _row_to_job(row)
+
+    def complete_with_event(self, job_id, event_type, *, result=None, detail=None):
+        return self._terminal_with_event(
+            job_id, "completed", event_type, result=result, error=None, detail=detail
+        )
+
+    def fail_with_event(self, job_id, event_type, *, error=None, detail=None):
+        return self._terminal_with_event(
+            job_id, "failed", event_type, result=None, error=error, detail=detail
+        )
+
+    # ── reads ────────────────────────────────────────────────────────────────────
+
+    def get(self, job_id: str) -> Job | None:
+        with self._conn() as conn:
+            row = conn.execute(
+                "SELECT * FROM qw_jobs WHERE id = %s AND namespace = %s",
+                (job_id, self.namespace),
+            ).fetchone()
+        return _row_to_job(row)
+
+    def counts(self, queue: str) -> dict[str, int]:
+        with self._conn() as conn:
+            rows = conn.execute(
+                "SELECT status, count(*) AS n FROM qw_jobs "
+                "WHERE namespace = %s AND queue = %s GROUP BY status",
+                (self.namespace, queue),
+            ).fetchall()
+        out = {"queued": 0, "running": 0, "completed": 0, "failed": 0}
+        for r in rows:
+            out[r["status"]] = int(r["n"])
+        return out
+
+    def events(self, *, since: int = 0, limit: int = 1000) -> list[Event]:
+        with self._conn() as conn:
+            rows = conn.execute(
+                "SELECT seq, job_id, queue, event_type, detail, created_at "
+                "FROM qw_events WHERE namespace = %s AND seq > %s "
+                "ORDER BY seq LIMIT %s",
+                (self.namespace, since, limit),
+            ).fetchall()
+        return [
+            Event(
+                seq=int(r["seq"]), job_id=r["job_id"], namespace=self.namespace,
+                queue=r["queue"], event_type=r["event_type"],
+                detail=r["detail"] or {}, created_at=_epoch(r["created_at"]),
+            )
+            for r in rows
+        ]
+
+    # ── wake ──────────────────────────────────────────────────────────────────────
+
+    def notify(self, queue: str) -> None:
+        with self._conn() as conn:
+            conn.execute("SELECT pg_notify(%s, %s)", (_channel(self.namespace), queue))
+            conn.commit()
+
+    def subscribe(self, *queues: str) -> WakeListener:
+        return _PgWakeListener(self.url, _channel(self.namespace), frozenset(queues))
+
+    # ── heartbeats + control ───────────────────────────────────────────────────────
+
+    def heartbeat(self, host, queue, *, current_model=None, stale_after_s=30.0) -> None:
+        with self._conn() as conn:
+            conn.execute(
+                "INSERT INTO qw_workers (namespace, host, queue, current_model, last_seen) "
+                "VALUES (%s, %s, %s, %s, now()) "
+                "ON CONFLICT (namespace, host, queue) DO UPDATE "
+                "SET current_model = EXCLUDED.current_model, last_seen = now()",
+                (self.namespace, host, queue, current_model),
+            )
+            conn.commit()
+
+    def workers(self, queue: str) -> list[dict[str, Any]]:
+        with self._conn() as conn:
+            rows = conn.execute(
+                "SELECT host, queue, current_model, last_seen FROM qw_workers "
+                "WHERE namespace = %s AND queue = %s "
+                "AND last_seen > now() - interval '60 seconds'",
+                (self.namespace, queue),
+            ).fetchall()
+        return [
+            {"host": r["host"], "queue": r["queue"],
+             "current_model": r["current_model"], "last_seen": _epoch(r["last_seen"])}
+            for r in rows
+        ]
+
+    def set_control(self, host, queue, *, desired_state, stop_policy="hard",
+                    requested_by=None) -> None:
+        with self._conn() as conn:
+            conn.execute(
+                "INSERT INTO qw_controls "
+                "(namespace, host, queue, desired_state, stop_policy, requested_by, updated_at) "
+                "VALUES (%s, %s, %s, %s, %s, %s, now()) "
+                "ON CONFLICT (namespace, host, queue) DO UPDATE "
+                "SET desired_state = EXCLUDED.desired_state, "
+                "stop_policy = EXCLUDED.stop_policy, "
+                "requested_by = EXCLUDED.requested_by, updated_at = now()",
+                (self.namespace, host, queue, desired_state, stop_policy, requested_by),
+            )
+            conn.commit()
+
+    def desired_state(self, host: str, queue: str) -> str:
+        with self._conn() as conn:
+            row = conn.execute(
+                "SELECT desired_state FROM qw_controls "
+                "WHERE namespace = %s AND host = %s AND queue = %s",
+                (self.namespace, host, queue),
+            ).fetchone()
+        return "off" if (row and row["desired_state"] == "off") else "on"
+
+
+def _row_to_job(row: dict[str, Any] | None) -> Job | None:
+    if row is None:
+        return None
+    return Job(
+        id=row["id"], queue=row["queue"], namespace=row["namespace"],
+        status=row["status"], payload=row["payload"] or {},
+        priority=int(row["priority"]), attempts=int(row["attempts"]),
+        claimed_by=row["claimed_by"], lease_expires_at=_epoch(row["lease_expires_at"]),
+        result=row["result"], error=row["error"],
+        created_at=_epoch(row["created_at"]), updated_at=_epoch(row["updated_at"]),
+    )
+
+
+class _PgWakeListener:
+    """LISTEN on the namespace channel; ``wait`` returns the queue payload if it
+    is one we subscribed to (else keeps waiting until the timeout). Uses its own
+    autocommit connection, as LISTEN must live outside the pooled txns."""
+
+    def __init__(self, url: str, channel: str, queues: frozenset[str]) -> None:
+        self._url = url
+        self._channel = channel
+        self._queues = queues
+        self._conn: psycopg.Connection | None = None
+
+    def __enter__(self) -> "_PgWakeListener":
+        self._conn = psycopg.connect(self._url, autocommit=True)
+        # channel is a sanitized identifier; quote defensively all the same.
+        self._conn.execute(f'LISTEN "{self._channel}"')
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        if self._conn is not None:
+            self._conn.close()
+            self._conn = None
+
+    def wait(self, timeout: float) -> str | None:
+        import time
+
+        assert self._conn is not None
+        deadline = time.monotonic() + timeout
+        while True:
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                return None
+            for note in self._conn.notifies(timeout=remaining, stop_after=1):
+                if not self._queues or note.payload in self._queues:
+                    return note.payload
+                # not one of ours — keep waiting within the remaining budget
+            # loop re-checks the deadline
diff --git a/queue_workflows/backends/redis.py b/queue_workflows/backends/redis.py
new file mode 100644
index 0000000..5a1b143
--- /dev/null
+++ b/queue_workflows/backends/redis.py
@@ -0,0 +1,374 @@
+"""Redis :class:`StorageBackend`.
+
+Redis has no SQL, no ``SKIP LOCKED`` and no cross-key ACID transaction, so the
+contract is reproduced with the tools Redis *does* give us:
+
+  * **Atomic claim / terminal / re-queue via Lua.** A registered Lua script runs
+    server-side as one indivisible step, which is how we get the engine's two
+    keystone guarantees on Redis: claim-exactly-once (no two callers pop the same
+    job) and the atomic outbox (go terminal *and* append the event, or neither —
+    a single script, so a crash can't split them).
+  * **Priority + FIFO via a sorted set per queue** (``…:q:<queue>``): score is
+    ``-priority`` so ``ZPOPMIN`` takes the highest priority first, and equal
+    scores break ties by member — a zero-padded monotonic sequence — i.e. FIFO.
+  * **Lease/reclaim via a per-queue running sorted set** scored by lease expiry;
+    ``reclaim_expired`` is a ``ZRANGEBYSCORE … now`` sweep.
+  * **Wake via pub/sub** on a per-namespace channel (payload = queue). Pub/sub is
+    fire-and-forget — a subscriber that is down misses the message — which is why
+    the worker keeps a safety poll exactly as it does behind PG ``LISTEN``.
+
+Everything is under the key prefix ``qw:<namespace>:`` so two tenants on one
+Redis server are fully isolated (the data-leakage guard). NOTE: keys are derived
+server-side inside the scripts, so this targets a **single Redis instance**, not
+Cluster (which would require all keys in one hash slot).
+"""
+
+from __future__ import annotations
+
+import json
+import time
+from typing import Any
+
+import redis  # the redis-py client (absolute import; not this module)
+
+from queue_workflows.backends.base import Event, Job, StorageBackend, WakeListener
+
+# ── Lua (each script is ONE atomic server-side step) ─────────────────────────
+
+_LUA_ENQUEUE = """
+local p = ARGV[1]
+local seq = redis.call('INCR', p..'seq')
+local jobkey = p..'job:'..ARGV[2]
+redis.call('HSET', jobkey, 'id',ARGV[2],'namespace',ARGV[7],'queue',ARGV[3],
+  'status','queued','payload',ARGV[4],'priority',ARGV[5],'attempts','0',
+  'claimed_by','','lease_expires_at','','result','','error','',
+  'created_at',ARGV[6],'updated_at',ARGV[6])
+redis.call('ZADD', KEYS[1], -tonumber(ARGV[5]), string.format('%020d|%s', seq, ARGV[2]))
+redis.call('SADD', p..'queues', ARGV[3])
+redis.call('INCR', p..'cnt:'..ARGV[3]..':queued')
+redis.call('PUBLISH', ARGV[8], ARGV[3])
+return ARGV[2]
+"""
+
+_LUA_CLAIM = """
+local p = ARGV[1]
+local popped = redis.call('ZPOPMIN', KEYS[1])
+if popped[1] == nil then return nil end
+local member = popped[1]
+local bar = string.find(member, '|', 1, true)
+local jobid = string.sub(member, bar + 1)
+local jobkey = p..'job:'..jobid
+local attempts = tonumber(redis.call('HGET', jobkey, 'attempts') or '0') + 1
+local lease_at = tonumber(ARGV[5]) + tonumber(ARGV[4])
+redis.call('HSET', jobkey, 'status','running','claimed_by',ARGV[3],
+  'lease_expires_at', tostring(lease_at), 'attempts', tostring(attempts),
+  'updated_at', ARGV[5])
+redis.call('ZADD', p..'running:'..ARGV[2], lease_at, jobid)
+redis.call('DECR', p..'cnt:'..ARGV[2]..':queued')
+redis.call('INCR', p..'cnt:'..ARGV[2]..':running')
+return redis.call('HGETALL', jobkey)
+"""
+
+_LUA_RENEW = """
+local p = ARGV[1]
+local st = redis.call('HGET', KEYS[1], 'status')
+local cb = redis.call('HGET', KEYS[1], 'claimed_by')
+if st == 'running' and cb == ARGV[2] then
+  local q = redis.call('HGET', KEYS[1], 'queue')
+  local lease_at = tonumber(ARGV[4]) + tonumber(ARGV[3])
+  redis.call('HSET', KEYS[1], 'lease_expires_at', tostring(lease_at), 'updated_at', ARGV[4])
+  redis.call('ZADD', p..'running:'..q, lease_at, ARGV[5])
+  return 1
+end
+return 0
+"""
+
+# Mark terminal (+ optionally append one event) — the atomic outbox. ARGV[7]
+# empty ⇒ plain mark_* with no event. Already-terminal/missing ⇒ nil (no-op, and
+# NO event written): the idempotency guard.
+_LUA_TERMINAL = """
+local p = ARGV[1]
+local st = redis.call('HGET', KEYS[1], 'status')
+if (not st) or st == 'completed' or st == 'failed' then return nil end
+local q = redis.call('HGET', KEYS[1], 'queue')
+redis.call('HSET', KEYS[1], 'status',ARGV[2],'result',ARGV[3],'error',ARGV[4],'updated_at',ARGV[5])
+redis.call('ZREM', p..'running:'..q, ARGV[6])
+redis.call('DECR', p..'cnt:'..q..':'..st)
+redis.call('INCR', p..'cnt:'..q..':'..ARGV[2])
+if ARGV[7] ~= '' then
+  local eseq = redis.call('INCR', p..'eventseq')
+  redis.call('HSET', p..'event:'..eseq, 'seq',eseq,'job_id',ARGV[6],'queue',q,
+    'event_type',ARGV[7],'detail',ARGV[8],'created_at',ARGV[5])
+  redis.call('RPUSH', p..'events', eseq)
+end
+return redis.call('HGETALL', KEYS[1])
+"""
+
+_LUA_REQUEUE = """
+local p = ARGV[1]
+local st = redis.call('HGET', KEYS[1], 'status')
+if (not st) or st == 'completed' or st == 'failed' then return nil end
+local q = redis.call('HGET', KEYS[1], 'queue')
+local pri = tonumber(redis.call('HGET', KEYS[1], 'priority') or '0')
+redis.call('HSET', KEYS[1], 'status','queued','claimed_by','','lease_expires_at','','updated_at',ARGV[2])
+redis.call('ZREM', p..'running:'..q, ARGV[4])
+local seq = redis.call('INCR', p..'seq')
+redis.call('ZADD', p..'q:'..q, -pri, string.format('%020d|%s', seq, ARGV[4]))
+redis.call('DECR', p..'cnt:'..q..':'..st)
+redis.call('INCR', p..'cnt:'..q..':queued')
+redis.call('PUBLISH', ARGV[3], q)
+return redis.call('HGETALL', KEYS[1])
+"""
+
+_LUA_RECLAIM = """
+local p = ARGV[1]
+local expired = redis.call('ZRANGEBYSCORE', KEYS[1], '-inf', ARGV[3])
+local out = {}
+for i, jobid in ipairs(expired) do
+  local jobkey = p..'job:'..jobid
+  if redis.call('HGET', jobkey, 'status') == 'running' then
+    redis.call('HSET', jobkey, 'status','queued','claimed_by','','lease_expires_at','','updated_at',ARGV[3])
+    redis.call('ZREM', KEYS[1], jobid)
+    local seq = redis.call('INCR', p..'seq')
+    local pri = tonumber(redis.call('HGET', jobkey, 'priority') or '0')
+    redis.call('ZADD', p..'q:'..ARGV[2], -pri, string.format('%020d|%s', seq, jobid))
+    redis.call('DECR', p..'cnt:'..ARGV[2]..':running')
+    redis.call('INCR', p..'cnt:'..ARGV[2]..':queued')
+    redis.call('PUBLISH', ARGV[4], ARGV[2])
+    out[#out + 1] = jobid
+  end
+end
+return out
+"""
+
+
+class RedisBackend(StorageBackend):
+    name = "redis"
+
+    def __init__(self, *, url: str, namespace: str = "") -> None:
+        super().__init__(url=url, namespace=namespace)
+        self._r = redis.Redis.from_url(url, decode_responses=True)
+        self._prefix = f"qw:{self.namespace}:"
+        self._wake_channel = f"{self._prefix}wake"
+        self._enqueue = self._r.register_script(_LUA_ENQUEUE)
+        self._claim = self._r.register_script(_LUA_CLAIM)
+        self._renew = self._r.register_script(_LUA_RENEW)
+        self._terminal = self._r.register_script(_LUA_TERMINAL)
+        self._requeue = self._r.register_script(_LUA_REQUEUE)
+        self._reclaim = self._r.register_script(_LUA_RECLAIM)
+
+    def _k(self, suffix: str) -> str:
+        return self._prefix + suffix
+
+    def ensure_schema(self) -> None:
+        self._r.ping()  # nothing to create; just verify connectivity
+
+    def close(self) -> None:
+        try:
+            self._r.close()
+        except Exception:
+            pass
+
+    # ── enqueue / claim / lease ────────────────────────────────────────────────
+
+    def enqueue(self, queue, payload, *, job_id=None, priority=0) -> str:
+        import uuid
+
+        jid = job_id or uuid.uuid4().hex
+        self._enqueue(
+            keys=[self._k(f"q:{queue}")],
+            args=[self._prefix, jid, queue, json.dumps(payload or {}),
+                  str(int(priority)), repr(time.time()), self.namespace,
+                  self._wake_channel],
+        )
+        return jid
+
+    def claim(self, queue, worker, *, lease_s) -> Job | None:
+        reply = self._claim(
+            keys=[self._k(f"q:{queue}")],
+            args=[self._prefix, queue, worker, repr(float(lease_s)), repr(time.time())],
+        )
+        return self._job(reply)
+
+    def renew_lease(self, job_id, worker, *, lease_s) -> bool:
+        return bool(self._renew(
+            keys=[self._k(f"job:{job_id}")],
+            args=[self._prefix, worker, repr(float(lease_s)), repr(time.time()), job_id],
+        ))
+
+    def reclaim_expired(self, *, queue=None) -> list[str]:
+        queues = [queue] if queue is not None else list(self._r.smembers(self._k("queues")))
+        now = repr(time.time())
+        out: list[str] = []
+        for q in queues:
+            ids = self._reclaim(
+                keys=[self._k(f"running:{q}")],
+                args=[self._prefix, q, now, self._wake_channel],
+            )
+            out.extend(ids or [])
+        return out
+
+    def requeue_for_retry(self, job_id) -> Job | None:
+        reply = self._requeue(
+            keys=[self._k(f"job:{job_id}")],
+            args=[self._prefix, repr(time.time()), self._wake_channel, job_id],
+        )
+        return self._job(reply)
+
+    # ── terminal transitions ────────────────────────────────────────────────────
+
+    def _do_terminal(self, job_id, status, *, result, error, event_type, detail):
+        reply = self._terminal(
+            keys=[self._k(f"job:{job_id}")],
+            args=[self._prefix, status,
+                  json.dumps(result) if result is not None else "",
+                  error or "", repr(time.time()), job_id,
+                  event_type or "", json.dumps(detail or {})],
+        )
+        return self._job(reply)
+
+    def mark_completed(self, job_id, *, result=None) -> Job | None:
+        return self._do_terminal(job_id, "completed", result=result, error=None,
+                                 event_type="", detail=None)
+
+    def mark_failed(self, job_id, *, error=None) -> Job | None:
+        return self._do_terminal(job_id, "failed", result=None, error=error,
+                                 event_type="", detail=None)
+
+    def complete_with_event(self, job_id, event_type, *, result=None, detail=None):
+        return self._do_terminal(job_id, "completed", result=result, error=None,
+                                 event_type=event_type, detail=detail)
+
+    def fail_with_event(self, job_id, event_type, *, error=None, detail=None):
+        return self._do_terminal(job_id, "failed", result=None, error=error,
+                                 event_type=event_type, detail=detail)
+
+    # ── reads ────────────────────────────────────────────────────────────────────
+
+    def get(self, job_id) -> Job | None:
+        h = self._r.hgetall(self._k(f"job:{job_id}"))
+        return self._job_from_dict(h) if h else None
+
+    def counts(self, queue) -> dict[str, int]:
+        keys = [self._k(f"cnt:{queue}:{s}") for s in
+                ("queued", "running", "completed", "failed")]
+        vals = self._r.mget(keys)
+        return {
+            s: max(0, int(v or 0))
+            for s, v in zip(("queued", "running", "completed", "failed"), vals)
+        }
+
+    def events(self, *, since=0, limit=1000) -> list[Event]:
+        seqs = self._r.lrange(self._k("events"), 0, -1)
+        wanted = [s for s in seqs if int(s) > since][:limit]
+        out: list[Event] = []
+        for s in wanted:
+            h = self._r.hgetall(self._k(f"event:{s}"))
+            if not h:
+                continue
+            out.append(Event(
+                seq=int(h["seq"]), job_id=h["job_id"], namespace=self.namespace,
+                queue=h.get("queue") or None, event_type=h["event_type"],
+                detail=json.loads(h.get("detail") or "{}"),
+                created_at=float(h.get("created_at") or 0),
+            ))
+        return out
+
+    # ── wake ──────────────────────────────────────────────────────────────────────
+
+    def notify(self, queue) -> None:
+        self._r.publish(self._wake_channel, queue)
+
+    def subscribe(self, *queues) -> WakeListener:
+        return _RedisWakeListener(self._r, self._wake_channel, frozenset(queues))
+
+    # ── heartbeats + control ───────────────────────────────────────────────────────
+
+    def heartbeat(self, host, queue, *, current_model=None, stale_after_s=30.0) -> None:
+        key = self._k(f"worker:{queue}:{host}")
+        self._r.hset(key, mapping={
+            "host": host, "queue": queue, "current_model": current_model or "",
+            "last_seen": repr(time.time()),
+        })
+        self._r.expire(key, max(1, int(stale_after_s)))
+
+    def workers(self, queue) -> list[dict[str, Any]]:
+        out = []
+        for key in self._r.scan_iter(match=self._k(f"worker:{queue}:*"), count=100):
+            h = self._r.hgetall(key)
+            if h:
+                out.append({"host": h.get("host"), "queue": h.get("queue"),
+                            "current_model": h.get("current_model") or None,
+                            "last_seen": float(h.get("last_seen") or 0)})
+        return out
+
+    def set_control(self, host, queue, *, desired_state, stop_policy="hard",
+                    requested_by=None) -> None:
+        self._r.hset(self._k(f"control:{host}:{queue}"), mapping={
+            "desired_state": desired_state, "stop_policy": stop_policy,
+            "requested_by": requested_by or "", "updated_at": repr(time.time()),
+        })
+
+    def desired_state(self, host, queue) -> str:
+        st = self._r.hget(self._k(f"control:{host}:{queue}"), "desired_state")
+        return "off" if st == "off" else "on"
+
+    # ── helpers ──────────────────────────────────────────────────────────────────
+
+    def _job(self, reply) -> Job | None:
+        """Script HGETALL replies arrive as a flat ``[k, v, k, v, …]`` list."""
+        if not reply:
+            return None
+        if isinstance(reply, list):
+            reply = dict(zip(reply[::2], reply[1::2]))
+        return self._job_from_dict(reply)
+
+    def _job_from_dict(self, h: dict[str, str]) -> Job:
+        return Job(
+            id=h["id"], queue=h["queue"], namespace=h.get("namespace", self.namespace),
+            status=h["status"], payload=json.loads(h.get("payload") or "{}"),
+            priority=int(h.get("priority") or 0), attempts=int(h.get("attempts") or 0),
+            claimed_by=(h.get("claimed_by") or None),
+            lease_expires_at=(float(h["lease_expires_at"]) if h.get("lease_expires_at") else None),
+            result=(json.loads(h["result"]) if h.get("result") else None),
+            error=(h.get("error") or None),
+            created_at=float(h.get("created_at") or 0),
+            updated_at=float(h.get("updated_at") or 0),
+        )
+
+
+class _RedisWakeListener:
+    """pub/sub subscription; ``wait`` returns the queue payload if subscribed."""
+
+    def __init__(self, client, channel: str, queues: frozenset[str]) -> None:
+        self._client = client
+        self._channel = channel
+        self._queues = queues
+        self._pubsub = None
+
+    def __enter__(self):
+        self._pubsub = self._client.pubsub(ignore_subscribe_messages=True)
+        self._pubsub.subscribe(self._channel)
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        if self._pubsub is not None:
+            try:
+                self._pubsub.close()
+            finally:
+                self._pubsub = None
+
+    def wait(self, timeout: float) -> str | None:
+        deadline = time.monotonic() + timeout
+        while True:
+            remaining = deadline - time.monotonic()
+            if remaining <= 0:
+                return None
+            msg = self._pubsub.get_message(timeout=remaining)
+            if msg is None:
+                continue
+            if msg.get("type") != "message":
+                continue
+            payload = msg.get("data")
+            if not self._queues or payload in self._queues:
+                return payload
diff --git a/queue_workflows/config.py b/queue_workflows/config.py
index 13a2b6d..d25c9c6 100644
--- a/queue_workflows/config.py
+++ b/queue_workflows/config.py
@@ -60,6 +60,10 @@ class EngineConfig:
     db_url_env: str = "AI_LEADS_DB_URL"
     host_label_env: str = "AI_LEADS_HOST_LABEL"
     host_priority_env: str = "AI_LEADS_GPU_CONSUMER_PRIORITY"
+    #: env vars holding the redis / mongodb DSN for those backends (read only when
+    #: ``db_backend`` selects them). New names (no ai_leads equivalent).
+    redis_url_env: str = "QUEUE_WORKFLOWS_REDIS_URL"
+    mongo_url_env: str = "QUEUE_WORKFLOWS_MONGO_URL"
 
     # ── value config ──────────────────────────────────────────────────────────
     #: GPU model ids on the tight per-job video render budget (claim_worker).
@@ -71,6 +75,20 @@ class EngineConfig:
     #: cgroup-attribution container-name prefix (hw_metrics per-container slice).
     container_prefix: str = "ai_leads-"
 
+    # ── storage backend selection (pluggable DB type) ──────────────────────────
+    #: Which provider the StorageBackend SPI (``queue_workflows.backends``)
+    #: resolves to: ``"pg"`` (default — Postgres, byte-compat), ``"redis"``, or
+    #: ``"mongodb"``. The legacy engine modules always use Postgres directly; this
+    #: only selects the backend the generic durable-queue SPI hands out. Validated
+    #: against the backend registry by ``configure()``.
+    db_backend: str = "pg"
+    #: Logical namespace isolating THIS tenant's jobs on a SHARED redis/mongodb
+    #: server — every key/collection is scoped by it, so two apps pointed at one
+    #: server can't claim or read each other's jobs (the multi-tenant data-leakage
+    #: guard). ``""`` ⇒ the literal namespace ``"default"``. For pg it scopes the
+    #: SPI rows via a ``namespace`` column.
+    db_namespace: str = ""
+
     # ── node-module resolver (overrides node_module_package when set) ──────────
     #: ``Callable[[str], module]`` — resolve a stored ``node_module`` string to
     #: an imported module exposing ``run(...)``. Default builds from
diff --git a/tests/test_backend_contract.py b/tests/test_backend_contract.py
new file mode 100644
index 0000000..afad243
--- /dev/null
+++ b/tests/test_backend_contract.py
@@ -0,0 +1,343 @@
+"""The storage-backend CONTRACT — one parametrized suite every provider must
+satisfy identically (pg / redis / mongodb).
+
+This is the spec (TDD): the adapters in ``queue_workflows/backends/`` are written
+to make these pass. Each backend is exercised against a real server; a backend
+whose server isn't reachable (env unset / down) is SKIPPED, not failed, so the
+suite runs anywhere — but CI/release must show all three green (see
+``docs/storage_backends.md``).
+
+Servers come from env (dockerized in dev):
+  * pg      — ``QUEUE_WORKFLOWS_TEST_DB_URL``      (shared with the engine suite)
+  * redis   — ``QUEUE_WORKFLOWS_TEST_REDIS_URL``
+  * mongodb — ``QUEUE_WORKFLOWS_TEST_MONGO_URL``   (replica set: txns + change streams)
+
+Each test gets a FRESH random namespace, so tests never see each other's jobs
+(and the cross-namespace test doubles as the data-leakage guard).
+"""
+
+from __future__ import annotations
+
+import os
+import time
+import uuid
+
+import pytest
+
+from queue_workflows.backends import build_backend
+from queue_workflows.backends.base import (
+    STATUS_COMPLETED,
+    STATUS_FAILED,
+    STATUS_QUEUED,
+    STATUS_RUNNING,
+)
+
+_URLS = {
+    "pg": os.environ.get("QUEUE_WORKFLOWS_TEST_DB_URL"),
+    "redis": os.environ.get("QUEUE_WORKFLOWS_TEST_REDIS_URL"),
+    "mongodb": os.environ.get("QUEUE_WORKFLOWS_TEST_MONGO_URL"),
+}
+
+# Probe each backend once per session; cache (ok, reason) so we don't reconnect
+# for every parametrized test.
+_REACHABLE: dict[str, tuple[bool, str]] = {}
+
+
+def _reachable(name: str) -> tuple[bool, str]:
+    if name in _REACHABLE:
+        return _REACHABLE[name]
+    url = _URLS.get(name)
+    if not url:
+        res = (False, f"{name}: set QUEUE_WORKFLOWS_TEST_{name.upper()}_URL")
+    else:
+        try:
+            be = build_backend(name, url=url, namespace="probe")
+            be.ensure_schema()
+            be.close()
+            res = (True, "")
+        except Exception as exc:  # server down / driver missing
+            res = (False, f"{name} unreachable: {type(exc).__name__}: {exc}")
+    _REACHABLE[name] = res
+    return res
+
+
+@pytest.fixture(params=["pg", "redis", "mongodb"])
+def backend(request):
+    name = request.param
+    ok, why = _reachable(name)
+    if not ok:
+        pytest.skip(why)
+    ns = f"t_{uuid.uuid4().hex[:12]}"
+    be = build_backend(name, url=_URLS[name], namespace=ns)
+    be.ensure_schema()
+    yield be
+    be.close()
+
+
+# ── enqueue / get ─────────────────────────────────────────────────────────────
+
+
+def test_enqueue_then_get(backend):
+    jid = backend.enqueue("cpu", {"x": 1}, priority=0)
+    assert isinstance(jid, str) and jid
+    job = backend.get(jid)
+    assert job is not None
+    assert job["status"] == STATUS_QUEUED
+    assert job["queue"] == "cpu"
+    assert job["payload"] == {"x": 1}
+    assert job["attempts"] == 0
+    assert job["namespace"] == backend.namespace
+
+
+def test_get_missing_returns_none(backend):
+    assert backend.get("does-not-exist") is None
+
+
+# ── claim: exactly-once under contention ───────────────────────────────────────
+
+
+def test_claim_then_second_claim_is_none(backend):
+    backend.enqueue("cpu", {"n": 1})
+    first = backend.claim("cpu", "w1", lease_s=30)
+    assert first is not None
+    assert first["status"] == STATUS_RUNNING
+    assert first["claimed_by"] == "w1"
+    assert first["attempts"] == 1
+    assert backend.claim("cpu", "w2", lease_s=30) is None  # nothing left
+
+
+def test_claim_returns_distinct_jobs_never_double(backend):
+    ids = {backend.enqueue("cpu", {"i": i}) for i in range(5)}
+    claimed = []
+    for _ in range(5):
+        job = backend.claim("cpu", "w", lease_s=30)
+        assert job is not None
+        claimed.append(job["id"])
+    assert backend.claim("cpu", "w", lease_s=30) is None
+    assert set(claimed) == ids  # every job claimed exactly once, no repeats
+    assert len(claimed) == len(set(claimed))
+
+
+def test_claim_respects_priority(backend):
+    backend.enqueue("cpu", {"p": "low"}, priority=0)
+    backend.enqueue("cpu", {"p": "high"}, priority=10)
+    job = backend.claim("cpu", "w", lease_s=30)
+    assert job["payload"]["p"] == "high"  # higher priority first
+
+
+def test_claim_isolated_per_queue(backend):
+    backend.enqueue("cpu", {"q": "cpu"})
+    assert backend.claim("gpu", "w", lease_s=30) is None  # wrong queue
+
+
+def test_claim_exactly_once_under_thread_contention(backend):
+    """The keystone, proven under REAL contention: N jobs, many threads racing
+    to claim — every job is claimed exactly once, none twice (SKIP-LOCKED-equiv:
+    PG ``FOR UPDATE SKIP LOCKED`` / Redis ``ZPOPMIN`` in Lua / Mongo
+    ``find_one_and_update``)."""
+    import threading
+
+    n = 40
+    ids = {backend.enqueue("cpu", {"i": i}) for i in range(n)}
+    claimed: list[str] = []
+    lock = threading.Lock()
+
+    def drain(worker: str) -> None:
+        while True:
+            job = backend.claim("cpu", worker, lease_s=30)
+            if job is None:
+                return
+            with lock:
+                claimed.append(job["id"])
+
+    threads = [threading.Thread(target=drain, args=(f"w{t}",)) for t in range(8)]
+    for t in threads:
+        t.start()
+    for t in threads:
+        t.join()
+    assert sorted(claimed) == sorted(ids)        # every job claimed
+    assert len(claimed) == len(set(claimed))     # and never twice
+
+
+# ── lease renew + reclaim ──────────────────────────────────────────────────────
+
+
+def test_renew_lease_only_by_owner(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w1", lease_s=30)
+    assert backend.renew_lease(job["id"], "w1", lease_s=60) is True
+    assert backend.renew_lease(job["id"], "someone-else", lease_s=60) is False
+
+
+def test_reclaim_expired_requeues(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w1", lease_s=0)  # already expired
+    assert job["status"] == STATUS_RUNNING
+    reclaimed = backend.reclaim_expired(queue="cpu")
+    assert job["id"] in reclaimed
+    again = backend.get(job["id"])
+    assert again["status"] == STATUS_QUEUED
+    assert again["claimed_by"] in (None, "")
+    # …and it's claimable again
+    assert backend.claim("cpu", "w2", lease_s=30)["id"] == job["id"]
+
+
+def test_reclaim_leaves_live_leases_alone(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w1", lease_s=300)  # healthy lease
+    assert backend.reclaim_expired(queue="cpu") == []
+    assert backend.get(job["id"])["status"] == STATUS_RUNNING
+
+
+# ── idempotent terminals ───────────────────────────────────────────────────────
+
+
+def test_mark_completed_idempotent(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w", lease_s=30)
+    done = backend.mark_completed(job["id"], result={"ok": True})
+    assert done is not None and done["status"] == STATUS_COMPLETED
+    assert done["result"] == {"ok": True}
+    assert backend.mark_completed(job["id"], result={"ok": False}) is None  # no-op
+    assert backend.get(job["id"])["result"] == {"ok": True}  # not clobbered
+
+
+def test_mark_failed_idempotent(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w", lease_s=30)
+    failed = backend.mark_failed(job["id"], error="boom")
+    assert failed is not None and failed["status"] == STATUS_FAILED
+    assert backend.mark_failed(job["id"], error="again") is None
+    assert backend.mark_completed(job["id"]) is None  # can't resurrect terminal
+
+
+# ── atomic outbox (the keystone) ───────────────────────────────────────────────
+
+
+def test_complete_with_event_atomic_and_idempotent(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w", lease_s=30)
+    out = backend.complete_with_event(job["id"], "completed", result={"r": 1})
+    assert out is not None and out["status"] == STATUS_COMPLETED
+    evs = backend.events()
+    assert len([e for e in evs if e["job_id"] == job["id"]]) == 1
+    ev = [e for e in evs if e["job_id"] == job["id"]][0]
+    assert ev["event_type"] == "completed"
+    # second delivery: no transition, and crucially NO duplicate event
+    assert backend.complete_with_event(job["id"], "completed") is None
+    evs2 = backend.events()
+    assert len([e for e in evs2 if e["job_id"] == job["id"]]) == 1
+
+
+def test_fail_with_event_atomic(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w", lease_s=30)
+    out = backend.fail_with_event(job["id"], "failed", error="x", detail={"code": 75})
+    assert out["status"] == STATUS_FAILED
+    evs = [e for e in backend.events() if e["job_id"] == job["id"]]
+    assert len(evs) == 1 and evs[0]["event_type"] == "failed"
+    assert evs[0]["detail"].get("code") == 75
+
+
+def test_events_are_ordered_and_filterable_by_seq(backend):
+    j1 = backend.enqueue("cpu", {})
+    j2 = backend.enqueue("cpu", {})
+    backend.fail_with_event(backend.claim("cpu", "w", lease_s=30)["id"], "failed")
+    first_seq = backend.events()[0]["seq"]
+    backend.complete_with_event(backend.claim("cpu", "w", lease_s=30)["id"], "completed")
+    after = backend.events(since=first_seq)
+    assert all(e["seq"] > first_seq for e in after)
+    assert len(after) == 1
+
+
+# ── watchdog re-queue (retry) ──────────────────────────────────────────────────
+
+
+def test_requeue_for_retry_keeps_attempts_no_event(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w1", lease_s=300)
+    assert job["attempts"] == 1
+    rq = backend.requeue_for_retry(job["id"])
+    assert rq is not None and rq["status"] == STATUS_QUEUED
+    assert backend.events() == [] or all(
+        e["job_id"] != job["id"] for e in backend.events()
+    )
+    # claimable again; attempts increments on the re-claim
+    again = backend.claim("cpu", "w2", lease_s=30)
+    assert again["id"] == job["id"]
+    assert again["attempts"] == 2
+
+
+def test_requeue_terminal_returns_none(backend):
+    backend.enqueue("cpu", {})
+    job = backend.claim("cpu", "w", lease_s=30)
+    backend.mark_completed(job["id"])
+    assert backend.requeue_for_retry(job["id"]) is None
+
+
+# ── counts ──────────────────────────────────────────────────────────────────────
+
+
+def test_counts(backend):
+    backend.enqueue("cpu", {})
+    backend.enqueue("cpu", {})
+    backend.enqueue("cpu", {})
+    backend.mark_completed(backend.claim("cpu", "w", lease_s=30)["id"])
+    c = backend.counts("cpu")
+    assert c["queued"] == 2
+    assert c["running"] == 0
+    assert c["completed"] == 1
+    assert c["failed"] == 0
+
+
+# ── wake (best-effort NOTIFY / pub-sub / change-stream) ─────────────────────────
+
+
+def test_wake_on_enqueue(backend):
+    with backend.subscribe("cpu") as sub:
+        time.sleep(0.2)  # let the subscription establish (pub/sub has no backlog)
+        backend.enqueue("cpu", {"wake": 1})
+        got = sub.wait(5.0)
+    assert got == "cpu"
+
+
+# ── heartbeats + operator ON/OFF control ────────────────────────────────────────
+
+
+def test_heartbeat_and_workers(backend):
+    backend.heartbeat("hostA", "gpu", current_model="m1")
+    workers = backend.workers("gpu")
+    assert any(w.get("host") == "hostA" for w in workers)
+
+
+def test_worker_control_default_on_then_off_then_on(backend):
+    assert backend.desired_state("hostA", "gpu") == "on"  # absent ⇒ ON
+    backend.set_control("hostA", "gpu", desired_state="off", requested_by="ops")
+    assert backend.desired_state("hostA", "gpu") == "off"
+    backend.set_control("hostA", "gpu", desired_state="on")
+    assert backend.desired_state("hostA", "gpu") == "on"
+    # per-queue isolation: turning gpu off must not touch cpu
+    backend.set_control("hostA", "gpu", desired_state="off")
+    assert backend.desired_state("hostA", "cpu") == "on"
+
+
+# ── multi-tenant isolation (the data-leakage guard) ─────────────────────────────
+
+
+def test_namespace_isolation(backend):
+    """A second backend on a DIFFERENT namespace, SAME server + queue, must not
+    see / claim / count this namespace's jobs."""
+    other = build_backend(
+        backend.name, url=backend.url, namespace=f"other_{uuid.uuid4().hex[:8]}"
+    )
+    other.ensure_schema()
+    try:
+        jid = backend.enqueue("cpu", {"secret": 1})
+        assert other.get(jid) is None
+        assert other.claim("cpu", "intruder", lease_s=30) is None
+        assert other.counts("cpu")["queued"] == 0
+        # and the reverse: other's job is invisible here
+        ojid = other.enqueue("cpu", {})
+        assert backend.get(ojid) is None
+    finally:
+        other.close()

From 6f4c39b852b8e0f8a934d06f6b53a1cef4951b7f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Zieli=C5=84ski?= <hello@robertz.co>
Date: Thu, 28 May 2026 14:01:25 +0200
Subject: [PATCH 2/4] feat(dispatcher): support the paint_mask input widget

A host-side canvas widget can pause a run on a paint_mask step and expect
the dispatcher to resolve the step's source ref into the source image the
operator paints over. _build_input_spec gains a paint_mask branch that
mirrors pick_perspective: it resolves source via _resolve_ref against the
run context and surfaces source_options, source_rel_path, source_abs_path
on the spec. The widget uploads the resulting binary mask PNG back through
the engine's existing multipart upload pipe.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 queue_workflows/dispatcher.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/queue_workflows/dispatcher.py b/queue_workflows/dispatcher.py
index e773999..55a7b56 100644
--- a/queue_workflows/dispatcher.py
+++ b/queue_workflows/dispatcher.py
@@ -513,6 +513,28 @@ def _build_input_spec(step: dict[str, Any], run: dict[str, Any]) -> dict[str, An
             spec["fisheye_rel_path"] = options[0].get("rel_path")
             spec["fisheye_abs_path"] = options[0].get("abs_path")
         spec["pano_meta"] = _resolve_pano_meta(run, options)
+    elif widget == "paint_mask":
+        # Resolves the source image the user will paint a mask on. The widget
+        # uploads a binary mask PNG back through the standard multipart pipe;
+        # the spec just needs to tell it which image to display.
+        context = _run_context_for_refs(run)
+        source = step.get("source")
+        try:
+            options = _resolve_ref(source, context) if source else []
+        except Exception as exc:  # noqa: BLE001
+            log.warning(
+                "[dispatcher] failed to resolve paint_mask source for "
+                "%s: %s", step["id"], exc,
+            )
+            options = []
+        if isinstance(options, dict):
+            options = [options]
+        elif not isinstance(options, list):
+            options = []
+        spec["source_options"] = options
+        if options:
+            spec["source_rel_path"] = options[0].get("rel_path")
+            spec["source_abs_path"] = options[0].get("abs_path")
     elif widget == "assign_walls":
         # Operator-facing wall-assignment widget for the facade-merge workflow.
         import json

From 430bcfcaa63e549f6da610375b07ed6d5b997c08 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Zieli=C5=84ski?= <hello@robertz.co>
Date: Thu, 28 May 2026 14:01:40 +0200
Subject: [PATCH 3/4] feat(node-pool): opt-in orphan-cancel sweep for terminal
 runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The host's cancel handler is typically a one-line UPDATE on workflow_runs
and stops there — it doesn't cascade into workflow_node_jobs. The claim
SQL's run-cancel guard refuses jobs of a cancelled/failed run, but they
linger in 'queued' forever and pollute the operator-facing queue gauges
(an audit looked like 'all GPU workers stuck' when in fact the workers
were healthy and three jobs of two-day-old cancelled runs were sitting
there).

New configure(cancel_orphan_queued_jobs=...) flag (default False) gates a
NodePool sweep that flips every queued job whose parent run is already
terminal to cancelled. The underlying join UPDATE is exposed as
node_queue.cancel_orphaned_queued_jobs() for direct use. Interval-gated
by AI_LEADS_ORPHAN_CANCEL_SWEEP_INTERVAL_S (default 30 s).

Default-off preserves pre-0.4 behaviour byte-for-byte; hosts opt in by
shipping configure(cancel_orphan_queued_jobs=True) in their startup.

TDD: tests/test_orphan_cancel_sweep.py — 12 tests cover the SQL semantics
(active runs untouched, only queued rows flipped, idempotent), the
configure() round-trip, and the NodePool wiring (flag-gated +
interval-gated + reached from _tick).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                      |  17 ++
 queue_workflows/__init__.py       |   3 +
 queue_workflows/config.py         |  11 ++
 queue_workflows/node_pool.py      |  50 ++++++
 queue_workflows/node_queue.py     |  28 +++
 tests/test_orphan_cancel_sweep.py | 276 ++++++++++++++++++++++++++++++
 6 files changed, 385 insertions(+)
 create mode 100644 tests/test_orphan_cancel_sweep.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0dc2088..6a707f7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,23 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+- `paint_mask` input widget — the dispatcher's `_build_input_spec` resolves
+  the step's `source` ref into `source_options` / `source_rel_path` /
+  `source_abs_path` so a host-side canvas widget can display the upstream
+  image and upload a binary mask PNG back through the standard multipart
+  pipe.
+- Opt-in orphan-cancel sweep: with
+  `configure(cancel_orphan_queued_jobs=True)` the `NodePool` periodically
+  flips `queued` `workflow_node_jobs` whose parent run is already
+  `cancelled` / `failed` to `cancelled`. The claim SQL's run-cancel guard
+  already refuses such jobs, but they linger in `queued` and pollute the
+  operator-facing queue gauges; this is the cleanup. Default `False`
+  preserves pre-0.4 behaviour byte-for-byte. New
+  `node_queue.cancel_orphaned_queued_jobs()` exposes the underlying join
+  UPDATE for direct use. Interval-gated by
+  `AI_LEADS_ORPHAN_CANCEL_SWEEP_INTERVAL_S` (default 30 s).
+
 ## [0.3.0] — 2026-05-27
 
 ### Added
diff --git a/queue_workflows/__init__.py b/queue_workflows/__init__.py
index c344162..98f7741 100644
--- a/queue_workflows/__init__.py
+++ b/queue_workflows/__init__.py
@@ -69,6 +69,7 @@ def configure(
     ingest_default_budget_s: int | None = None,
     db_backend: str | None = None,
     db_namespace: str | None = None,
+    cancel_orphan_queued_jobs: bool | None = None,
 ) -> EngineConfig:
     """Set engine configuration values. Only the passed keyword args are
     mutated; the rest keep their (ai_leads-byte-compatible) defaults. Returns
@@ -111,6 +112,8 @@ def configure(
             cfg.db_backend = canonical_backend_name(db_backend)
         if db_namespace is not None:
             cfg.db_namespace = str(db_namespace)
+        if cancel_orphan_queued_jobs is not None:
+            cfg.cancel_orphan_queued_jobs = bool(cancel_orphan_queued_jobs)
     return cfg
 
 
diff --git a/queue_workflows/config.py b/queue_workflows/config.py
index d25c9c6..a2c9b21 100644
--- a/queue_workflows/config.py
+++ b/queue_workflows/config.py
@@ -124,6 +124,17 @@ class EngineConfig:
     #: without forking ``execute_node``.
     invoke_context: Callable[[dict, dict], Any] | None = None
 
+    # ── orphan-cancel sweep (opt-in) ──────────────────────────────────────────
+    #: When True, the :class:`NodePool` periodically flips ``queued`` jobs whose
+    #: parent run is already ``cancelled`` / ``failed`` to ``cancelled``. The
+    #: host's cancel handler is usually a single ``UPDATE workflow_runs SET
+    #: status='cancelled'`` and does not cascade into ``workflow_node_jobs``; the
+    #: claim SQL refuses such jobs (run-cancel guard), but they linger in
+    #: ``queued`` and pollute queue gauges. Default ``False`` preserves the
+    #: engine's pre-0.4 behaviour byte-for-byte; hosts that want the cleanup
+    #: opt in via ``configure(cancel_orphan_queued_jobs=True)``.
+    cancel_orphan_queued_jobs: bool = False
+
     # ── ingest queue names + budget (host-configurable; G1) ────────────────────
     #: Ingest-family queue names. Migration 0008 dropped the fetch/load DB CHECK;
     #: the host validates ``queue`` against THIS set before enqueue (mirroring the
diff --git a/queue_workflows/node_pool.py b/queue_workflows/node_pool.py
index 943d336..7096479 100644
--- a/queue_workflows/node_pool.py
+++ b/queue_workflows/node_pool.py
@@ -125,6 +125,16 @@ def __init__(
         )
         self._node_event_prune_last_run: float = 0.0
 
+        # Orphan-cancel sweep — opt-in (``configure(cancel_orphan_queued_jobs=
+        # True)``). Flips ``queued`` jobs whose parent run is already terminal
+        # (``cancelled`` / ``failed``) to ``cancelled``. The claim SQL already
+        # refuses them, but the rows linger and pollute queue gauges. Disabled
+        # by default to preserve pre-0.4 behaviour byte-for-byte.
+        self._orphan_cancel_interval_s: float = float(
+            os.environ.get("AI_LEADS_ORPHAN_CANCEL_SWEEP_INTERVAL_S", "30")
+        )
+        self._orphan_cancel_last_run: float = 0.0
+
     def start(self) -> None:
         if self._register_builtins is not None:
             try:
@@ -401,6 +411,16 @@ def _tick(self) -> None:
         except Exception:
             log.exception("[node-pool] node-event retention sweep failed")
 
+        # 7) Orphan-cancel sweep — opt-in. Flip ``queued`` jobs of cancelled /
+        # failed runs to ``cancelled`` so the queue gauges don't read
+        # misleadingly. Default-off so the engine's behaviour pre-0.4 is
+        # unchanged; hosts that want the cleanup ship
+        # ``configure(cancel_orphan_queued_jobs=True)``.
+        try:
+            self._sweep_orphan_queued_jobs()
+        except Exception:
+            log.exception("[node-pool] orphan-cancel sweep failed")
+
     def _sweep_expired_leases(self) -> None:
         """Re-queue ``running`` rows whose PG lease has lapsed.
 
@@ -485,6 +505,36 @@ def _sweep_dead_workers(self) -> None:
                 row.get("last_seen"), row.get("running_jobs"),
             )
 
+    def _sweep_orphan_queued_jobs(self) -> None:
+        """Flip ``queued`` jobs of already-terminal runs to ``cancelled``.
+
+        Opt-in via :attr:`EngineConfig.cancel_orphan_queued_jobs` — default
+        ``False`` so the engine's pre-0.4 behaviour is preserved byte-for-byte.
+        When enabled, runs interval-gated (``_orphan_cancel_interval_s``,
+        default 30 s) so the 0.5 s dispatch loop doesn't run the join UPDATE
+        every tick. The underlying SQL is idempotent (no-op if there are no
+        orphans), so the gate is purely a load optimisation.
+        """
+        from queue_workflows.config import get_config
+
+        if not get_config().cancel_orphan_queued_jobs:
+            return
+
+        import time as _time
+
+        now = _time.time()
+        if now - self._orphan_cancel_last_run < self._orphan_cancel_interval_s:
+            return
+        self._orphan_cancel_last_run = now
+
+        flipped = node_queue.cancel_orphaned_queued_jobs()
+        if flipped:
+            log.info(
+                "[node-pool] cancelled %d orphaned queued job(s) of "
+                "terminal runs",
+                flipped,
+            )
+
     def _sweep_node_event_retention(self) -> None:
         """Prune ``workflow_node_events`` rows older than the retention window.
 
diff --git a/queue_workflows/node_queue.py b/queue_workflows/node_queue.py
index 3419c70..4c5b814 100644
--- a/queue_workflows/node_queue.py
+++ b/queue_workflows/node_queue.py
@@ -1208,6 +1208,34 @@ def cancel_siblings_after_failure(run_id: str) -> int:
     return cancel_queued_jobs_for_run(run_id)
 
 
+def cancel_orphaned_queued_jobs() -> int:
+    """Flip every ``queued`` job whose parent run is already terminal
+    (``cancelled`` / ``failed``) to ``cancelled``. Returns the number of rows
+    touched.
+
+    The host's cancel handler typically only updates ``workflow_runs.status``;
+    it does NOT cascade into ``workflow_node_jobs``. The claim SQL's run-cancel
+    guard prevents such jobs from ever running, but they linger in ``queued``
+    forever — operator-facing queue gauges then misleadingly suggest a worker
+    stall. This sweep is the cleanup. Only ``status='queued'`` rows are touched
+    so we don't race the cancel-watcher's cooperative ``running`` cancel.
+
+    Idempotent: a second call after the first returns 0.
+    """
+    with connection() as conn, conn.cursor() as cur:
+        cur.execute(
+            """
+            UPDATE workflow_node_jobs j
+               SET status = 'cancelled', finished_at = now()
+              FROM workflow_runs r
+             WHERE j.run_id = r.id
+               AND j.status = 'queued'
+               AND r.status IN ('cancelled', 'failed')
+            """
+        )
+        return cur.rowcount or 0
+
+
 def set_resolved_inputs(job_id: str, resolved_inputs: dict[str, Any]) -> None:
     """Write the execution-time snapshot of resolved inputs into the
     ``resolved_inputs`` column. Called by the worker just before
diff --git a/tests/test_orphan_cancel_sweep.py b/tests/test_orphan_cancel_sweep.py
new file mode 100644
index 0000000..eaadc23
--- /dev/null
+++ b/tests/test_orphan_cancel_sweep.py
@@ -0,0 +1,276 @@
+"""Orphan-cancel sweep — flips ``queued`` jobs whose parent run is already
+``cancelled`` or ``failed`` to ``cancelled``.
+
+The contract this closes: the host's cancel handler typically just flips
+``workflow_runs.status='cancelled'`` and stops there (Rails' ``DELETE
+/workflow/:id`` does exactly that). The claim SQL refuses such jobs (run-cancel
+guard, ``test_invariant_cancel``), so they're never executed — but they linger
+in ``queued`` forever, polluting the queue gauges and confusing operators into
+thinking workers are stuck.
+
+This sweep is opt-in via ``configure(cancel_orphan_queued_jobs=True)``. Default
+is ``False`` so the engine's existing behaviour is preserved byte-for-byte;
+hosts that want the cleanup ship the flag in their startup wiring.
+
+Covered:
+- ``node_queue.cancel_orphaned_queued_jobs()``
+  * flips queued jobs of a cancelled run to cancelled
+  * flips queued jobs of a failed run to cancelled
+  * leaves jobs of an active (running/queued/awaiting_input) run alone
+  * leaves non-queued jobs (running/completed/failed/cancelled) alone
+  * is idempotent (a second call after the first returns 0)
+  * stamps ``finished_at`` on the rows it flips
+- ``configure(cancel_orphan_queued_jobs=...)``
+  * defaults to False
+  * round-trips True
+- ``NodePool._tick`` wiring
+  * flag ON ⇒ orphan-cancel sweep runs and flips orphan rows
+  * flag OFF (default) ⇒ no flip happens
+  * interval-gated (a second tick within the window is a no-op)
+"""
+
+from __future__ import annotations
+
+import pytest
+
+import queue_workflows
+from queue_workflows import node_pool, node_queue, run_store
+from queue_workflows.db import connection
+from tests._cancel_helper import cancel_run_via_rails
+from tests._helpers import make_run
+
+
+# ── helpers ──────────────────────────────────────────────────────────────────
+
+
+def _orphan_pool() -> node_pool.NodePool:
+    """A NodePool wired with the orphan-cancel sweep interval forced to 0 so
+    a tight test loop isn't suppressed by the gate."""
+    pool = node_pool.NodePool(cpu_workers=0, gpu_workers=0, register_builtins=None)
+    pool._orphan_cancel_interval_s = 0.0
+    return pool
+
+
+def _job_status(job_id: str) -> str:
+    return node_queue.get_node_job(job_id)["status"]
+
+
+def _job_finished_at(job_id: str):
+    with connection() as c, c.cursor() as cur:
+        cur.execute(
+            "SELECT finished_at FROM workflow_node_jobs WHERE id=%s", (job_id,)
+        )
+        r = cur.fetchone()
+        return None if r is None else r["finished_at"]
+
+
+# ── node_queue.cancel_orphaned_queued_jobs() ────────────────────────────────
+
+
+def test_cancel_orphaned_queued_jobs_flips_jobs_of_cancelled_run():
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="gpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    n = node_queue.cancel_orphaned_queued_jobs()
+
+    assert n == 1
+    assert _job_status(job_id) == "cancelled"
+    assert _job_finished_at(job_id) is not None
+
+
+def test_cancel_orphaned_queued_jobs_flips_jobs_of_failed_run():
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="cpu",
+    )
+    run_store.update_run(run_id, status="failed", error="upstream blew up")
+
+    n = node_queue.cancel_orphaned_queued_jobs()
+
+    assert n == 1
+    assert _job_status(job_id) == "cancelled"
+
+
+def test_cancel_orphaned_queued_jobs_leaves_active_runs_alone():
+    """A ``queued`` job belonging to a still-active run must NOT be cancelled —
+    that's the entire point of the queue."""
+    run_id = make_run(status="running")
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="cpu",
+    )
+    n = node_queue.cancel_orphaned_queued_jobs()
+
+    assert n == 0
+    assert _job_status(job_id) == "queued"
+
+
+def test_cancel_orphaned_queued_jobs_ignores_non_queued_jobs():
+    """Only ``status='queued'`` rows are touched: a ``running`` row holding the
+    work in flight, or a terminal row, must be left alone. (The claim SQL plus
+    the cancel-watcher together handle the ``running`` case cooperatively; we
+    must NOT race them here.)"""
+    run_id = make_run()
+    j_queued = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="a", node_module="x", queue="cpu",
+    )
+    j_running = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="b", node_module="x", queue="cpu",
+    )
+    j_completed = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="c", node_module="x", queue="cpu",
+    )
+    with connection() as c, c.cursor() as cur:
+        cur.execute(
+            "UPDATE workflow_node_jobs SET status='running', started_at=now() "
+            "WHERE id=%s", (j_running,),
+        )
+        cur.execute(
+            "UPDATE workflow_node_jobs SET status='completed', finished_at=now() "
+            "WHERE id=%s", (j_completed,),
+        )
+
+    cancel_run_via_rails(run_id)
+    n = node_queue.cancel_orphaned_queued_jobs()
+
+    assert n == 1  # only the queued one
+    assert _job_status(j_queued) == "cancelled"
+    assert _job_status(j_running) == "running"
+    assert _job_status(j_completed) == "completed"
+
+
+def test_cancel_orphaned_queued_jobs_is_idempotent():
+    run_id = make_run()
+    node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="cpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    first = node_queue.cancel_orphaned_queued_jobs()
+    second = node_queue.cancel_orphaned_queued_jobs()
+
+    assert first == 1
+    assert second == 0
+
+
+def test_cancel_orphaned_queued_jobs_sweeps_across_multiple_runs():
+    """One sweep call must catch orphans across every cancelled/failed run, not
+    just the most recent one — that's the operator-facing fix for legacy zombie
+    rows that accumulated over weeks."""
+    rid_a = make_run()
+    rid_b = make_run()
+    rid_c = make_run(status="running")  # NOT orphaned
+    ja = node_queue.enqueue_node_job(run_id=rid_a, node_id="x", node_module="m", queue="gpu")
+    jb = node_queue.enqueue_node_job(run_id=rid_b, node_id="x", node_module="m", queue="gpu")
+    jc = node_queue.enqueue_node_job(run_id=rid_c, node_id="x", node_module="m", queue="gpu")
+    cancel_run_via_rails(rid_a)
+    run_store.update_run(rid_b, status="failed", error="kaboom")
+
+    n = node_queue.cancel_orphaned_queued_jobs()
+    assert n == 2
+    assert _job_status(ja) == "cancelled"
+    assert _job_status(jb) == "cancelled"
+    assert _job_status(jc) == "queued"
+
+
+# ── configure(cancel_orphan_queued_jobs=...) ─────────────────────────────────
+
+
+def test_cancel_orphan_queued_jobs_default_is_false():
+    cfg = queue_workflows.configure()
+    assert cfg.cancel_orphan_queued_jobs is False
+
+
+def test_configure_cancel_orphan_queued_jobs_roundtrips_true():
+    queue_workflows.configure(cancel_orphan_queued_jobs=True)
+    assert queue_workflows.get_config().cancel_orphan_queued_jobs is True
+
+
+# ── NodePool sweep wiring ───────────────────────────────────────────────────
+
+
+def test_node_pool_tick_sweeps_orphans_when_flag_enabled():
+    """With the flag on, NodePool's per-tick sweep flips queued jobs of
+    cancelled/failed runs to cancelled — closing the audit gap."""
+    queue_workflows.configure(cancel_orphan_queued_jobs=True)
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="gpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    pool = _orphan_pool()
+    pool._sweep_orphan_queued_jobs()
+
+    assert _job_status(job_id) == "cancelled"
+
+
+def test_node_pool_tick_does_not_sweep_when_flag_disabled():
+    """Default: flag OFF ⇒ the sweep is a no-op. Existing engine behaviour is
+    preserved byte-for-byte (orphans remain queued; the claim SQL still skips
+    them via the run-cancel guard)."""
+    # Explicit reaffirm of the default — relying on conftest reset is enough
+    # but stating it makes the contract obvious to a future reader.
+    assert queue_workflows.get_config().cancel_orphan_queued_jobs is False
+
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="gpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    pool = _orphan_pool()
+    pool._sweep_orphan_queued_jobs()
+
+    assert _job_status(job_id) == "queued"
+
+
+def test_node_pool_orphan_sweep_is_interval_gated():
+    """A second sweep within the configured interval must be a no-op even if a
+    new orphan landed — protects the 0.5 s dispatch tick from doing the join
+    UPDATE every iteration."""
+    queue_workflows.configure(cancel_orphan_queued_jobs=True)
+    pool = node_pool.NodePool(cpu_workers=0, gpu_workers=0, register_builtins=None)
+    pool._orphan_cancel_interval_s = 60.0  # well above the test window
+
+    # First sweep: no orphans, but it stamps ``last_run`` so the gate clamps
+    # subsequent calls.
+    pool._sweep_orphan_queued_jobs()
+
+    # Now introduce an orphan AFTER the gate has been primed.
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="cpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    pool._sweep_orphan_queued_jobs()  # gated — must not fire
+    assert _job_status(job_id) == "queued"
+
+
+def test_node_pool_tick_runs_orphan_sweep():
+    """Smoke: _tick() reaches the orphan sweep step. We don't need to drive a
+    full DAG here — wire a stub provider so start_run is a no-op, then assert
+    the sweep ran."""
+    queue_workflows.configure(cancel_orphan_queued_jobs=True)
+
+    # Empty workflow + pipeline so dispatcher.start_run is a no-op.
+    queue_workflows.set_workflow_provider(
+        lambda name: {"name": name, "steps": []},
+        lambda name: {"name": name, "nodes": []},
+    )
+
+    run_id = make_run()
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id="n", node_module="x", queue="cpu",
+    )
+    cancel_run_via_rails(run_id)
+
+    pool = node_pool.NodePool(cpu_workers=0, gpu_workers=0, register_builtins=None)
+    pool._orphan_cancel_interval_s = 0.0  # disable gate
+
+    pool._tick()
+
+    assert _job_status(job_id) == "cancelled"

From 7d9a7663b7149919c28d939a94352766e32c171f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Zieli=C5=84ski?= <hello@robertz.co>
Date: Thu, 28 May 2026 14:17:10 +0200
Subject: [PATCH 4/4] =?UTF-8?q?feat(node-queue):=20delete=5Fnon=5Fterminal?=
 =?UTF-8?q?=5Fjobs=5Ffor=5Frun=20=E2=80=94=20restart=20from=20failed?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restart primitive paired with the engine's existing _find_ready_nodes
cursor semantics: delete every job for a run whose status is NOT
completed/skipped (returning the deleted node_ids so the host can cascade
cleanup into its own artefacts), then a follow-up dispatcher.start_run
re-enqueues exactly the failed branch — the host's "retry whole run"
button no longer re-does the completed prefix.

Caller-policy: the primitive INCLUDES running rows in the deletion set.
A caller that needs to refuse mid-flight (the typical case — the
cancel-watcher polls at 5s, so a just-terminated run can briefly still
have running children) must guard on status='running' itself before
calling. Documented in the docstring.

TDD: tests/test_restart_from_failed.py — 6 tests cover the SQL semantics
(every status except completed/skipped is deleted, scoped per run,
idempotent) and the end-to-end resume (start_run after delete enqueues
only the failed branch).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 CHANGELOG.md                      |   9 ++
 queue_workflows/node_queue.py     |  35 +++++++
 tests/test_restart_from_failed.py | 162 ++++++++++++++++++++++++++++++
 3 files changed, 206 insertions(+)
 create mode 100644 tests/test_restart_from_failed.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a707f7..4f7aff9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- `node_queue.delete_non_terminal_jobs_for_run(run_id) -> list[node_id]` —
+  restart primitive that deletes every job whose status is NOT
+  `completed` / `skipped`, returning the deleted `node_id`s so the host can
+  cascade cleanup into its own artefacts (on-disk dirs, input submissions).
+  Paired with the engine's existing `_find_ready_nodes` (which treats
+  surviving `completed` / `skipped` rows as cursors), a follow-up
+  `dispatcher.start_run` resumes a failed run from exactly the failed
+  branch instead of re-doing the completed prefix. Caller-policy
+  in-flight guard is documented in the docstring.
 - `paint_mask` input widget — the dispatcher's `_build_input_spec` resolves
   the step's `source` ref into `source_options` / `source_rel_path` /
   `source_abs_path` so a host-side canvas widget can display the upstream
diff --git a/queue_workflows/node_queue.py b/queue_workflows/node_queue.py
index 4c5b814..7bd8ce9 100644
--- a/queue_workflows/node_queue.py
+++ b/queue_workflows/node_queue.py
@@ -1208,6 +1208,41 @@ def cancel_siblings_after_failure(run_id: str) -> int:
     return cancel_queued_jobs_for_run(run_id)
 
 
+def delete_non_terminal_jobs_for_run(run_id: str) -> list[str]:
+    """Restart primitive: delete every job for ``run_id`` whose status is NOT
+    ``completed`` / ``skipped``. Returns the list of deleted ``node_id``s so
+    the host can cascade into its own artefacts (on-disk node dirs, input
+    submissions). Idempotent — a second call returns ``[]``.
+
+    The engine's ``dispatcher._find_ready_nodes`` treats surviving
+    ``completed`` / ``skipped`` rows as cursors when re-expanding the DAG: only
+    nodes WITHOUT a row whose deps are completed/skipped get enqueued. So a
+    follow-up ``dispatcher.start_run`` after this call resumes the run from
+    exactly the deleted set (plus any downstream that never got enqueued
+    because of the original failure) — the host's "retry whole run" button no
+    longer re-does the completed prefix.
+
+    CALLER POLICY: this primitive INCLUDES ``running`` rows in the deletion
+    set. If the caller's retry contract needs to refuse while a worker is
+    mid-flight (the typical case — the cancel-watcher polls at 5 s, so a
+    just-terminated run can briefly still have ``running`` children winding
+    down), it must check ``WHERE status='running'`` itself BEFORE calling this.
+    The engine's lease-reclaim sweep eventually re-queues a stranded
+    ``running`` row, so the worst case here is a single straggler producing
+    output to a path the caller then `rm -rf`'s — recoverable, but worth a
+    409 from the caller.
+    """
+    with connection() as conn, conn.cursor() as cur:
+        cur.execute(
+            "DELETE FROM workflow_node_jobs "
+            "WHERE run_id = %s "
+            "  AND status NOT IN ('completed', 'skipped') "
+            "RETURNING node_id",
+            (run_id,),
+        )
+        return [r["node_id"] for r in cur.fetchall()]
+
+
 def cancel_orphaned_queued_jobs() -> int:
     """Flip every ``queued`` job whose parent run is already terminal
     (``cancelled`` / ``failed``) to ``cancelled``. Returns the number of rows
diff --git a/tests/test_restart_from_failed.py b/tests/test_restart_from_failed.py
new file mode 100644
index 0000000..6bbe40d
--- /dev/null
+++ b/tests/test_restart_from_failed.py
@@ -0,0 +1,162 @@
+"""Restart-from-failed primitive — keep ``completed`` / ``skipped`` job rows,
+delete the rest, so ``dispatcher.start_run`` resumes the run from the failed
+branch instead of re-doing the entire DAG.
+
+The engine's ``_find_ready_nodes`` already treats existing ``completed`` /
+``skipped`` rows as cursors when re-expanding the DAG (see ``dispatcher.py``):
+only nodes WITHOUT a job row whose deps are completed/skipped get enqueued. The
+missing primitive was a way to clear the failed branch without nuking the
+completed prefix; ``cancel_queued_jobs_for_run`` (the existing helper) flips
+queued → cancelled but leaves the rows in place, which would block re-enqueue.
+
+This new helper deletes every non-terminal-complete row and returns the deleted
+``node_id``s so the host can cascade into its own artefacts (on-disk dirs,
+input submissions). Pure delete-by-status — the dispatcher does the
+re-enqueueing on its next tick.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from queue_workflows import node_queue
+from queue_workflows.db import connection
+from tests._helpers import make_run
+
+
+def _enqueue_in_status(run_id: str, node_id: str, status: str, *, queue: str = "cpu") -> str:
+    job_id = node_queue.enqueue_node_job(
+        run_id=run_id, node_id=node_id, node_module="x", queue=queue,
+    )
+    if status != "queued":
+        with connection() as c, c.cursor() as cur:
+            cur.execute(
+                "UPDATE workflow_node_jobs SET status=%s WHERE id=%s",
+                (status, job_id),
+            )
+    return job_id
+
+
+def _statuses_by_node(run_id: str) -> dict[str, str]:
+    return {j["node_id"]: j["status"]
+            for j in node_queue.list_jobs_for_run(run_id)}
+
+
+# ── delete_non_terminal_jobs_for_run() ──────────────────────────────────────
+
+
+def test_delete_non_terminal_jobs_drops_failed_cancelled_queued_awaiting():
+    """Every status EXCEPT completed / skipped is deleted — those four are the
+    statuses a restart must re-do."""
+    run_id = make_run()
+    _enqueue_in_status(run_id, "n_queued", "queued")
+    _enqueue_in_status(run_id, "n_running", "running")  # also non-terminal
+    _enqueue_in_status(run_id, "n_failed", "failed")
+    _enqueue_in_status(run_id, "n_cancelled", "cancelled")
+    _enqueue_in_status(run_id, "n_awaiting", "awaiting_input")
+
+    deleted = node_queue.delete_non_terminal_jobs_for_run(run_id)
+
+    assert set(deleted) == {
+        "n_queued", "n_running", "n_failed", "n_cancelled", "n_awaiting",
+    }
+    assert _statuses_by_node(run_id) == {}
+
+
+def test_delete_non_terminal_jobs_keeps_completed_and_skipped():
+    """Completed + skipped rows are PRESERVED — they're the cursors
+    ``_find_ready_nodes`` reads to know "this node is done, move on"."""
+    run_id = make_run()
+    _enqueue_in_status(run_id, "ok",      "completed")
+    _enqueue_in_status(run_id, "skipped", "skipped")
+    _enqueue_in_status(run_id, "fail",    "failed")
+
+    deleted = node_queue.delete_non_terminal_jobs_for_run(run_id)
+
+    assert deleted == ["fail"]
+    assert _statuses_by_node(run_id) == {"ok": "completed", "skipped": "skipped"}
+
+
+def test_delete_non_terminal_jobs_scoped_to_one_run():
+    """A second run's rows MUST NOT be touched — the helper is per-run."""
+    rid_a = make_run()
+    rid_b = make_run()
+    _enqueue_in_status(rid_a, "a_failed", "failed")
+    _enqueue_in_status(rid_b, "b_failed", "failed")
+    _enqueue_in_status(rid_b, "b_ok",     "completed")
+
+    deleted = node_queue.delete_non_terminal_jobs_for_run(rid_a)
+
+    assert deleted == ["a_failed"]
+    assert _statuses_by_node(rid_a) == {}
+    assert _statuses_by_node(rid_b) == {"b_failed": "failed", "b_ok": "completed"}
+
+
+def test_delete_non_terminal_jobs_is_idempotent():
+    """A second call after the first returns ``[]`` and leaves the run alone —
+    no rows, nothing to do."""
+    run_id = make_run()
+    _enqueue_in_status(run_id, "n", "failed")
+
+    first = node_queue.delete_non_terminal_jobs_for_run(run_id)
+    second = node_queue.delete_non_terminal_jobs_for_run(run_id)
+
+    assert first == ["n"]
+    assert second == []
+
+
+def test_delete_non_terminal_jobs_returns_empty_when_only_terminal_complete():
+    """A run whose every job ran to completion returns ``[]`` — nothing to
+    re-do. This is the no-op case (operator clicked retry on an already-
+    completed run; the host's controller is responsible for rejecting that,
+    but the helper must be safe to call regardless)."""
+    run_id = make_run()
+    _enqueue_in_status(run_id, "a", "completed")
+    _enqueue_in_status(run_id, "b", "skipped")
+
+    deleted = node_queue.delete_non_terminal_jobs_for_run(run_id)
+
+    assert deleted == []
+    assert _statuses_by_node(run_id) == {"a": "completed", "b": "skipped"}
+
+
+# ── End-to-end: start_run resumes from the deleted set ──────────────────────
+
+
+def test_start_run_after_delete_re_enqueues_only_the_failed_branch():
+    """The whole point of the primitive: with completed prefix intact + failed
+    rows deleted, ``dispatcher.start_run`` re-enqueues ONLY the failed nodes
+    (and their downstream that never got a row in the first place)."""
+    import queue_workflows
+    from queue_workflows import dispatcher
+
+    # Wire a stub workflow + pipeline so the dispatcher has a DAG to traverse.
+    queue_workflows.set_workflow_provider(
+        lambda name: {
+            "name": name,
+            "steps": [{"id": "p", "kind": "pipeline", "pipeline": "_resume_test"}],
+        },
+        lambda name: {
+            "name": name,
+            "nodes": [
+                {"id": "a", "node": "x"},
+                {"id": "b", "node": "x", "depends_on": ["a"]},
+                {"id": "c", "node": "x", "depends_on": ["b"]},
+            ],
+        },
+    )
+
+    run_id = make_run(workflow_name="_resume_wf")
+    # a ran clean, b ran clean, c failed.
+    _enqueue_in_status(run_id, "p/a", "completed")
+    _enqueue_in_status(run_id, "p/b", "completed")
+    _enqueue_in_status(run_id, "p/c", "failed")
+
+    deleted = node_queue.delete_non_terminal_jobs_for_run(run_id)
+    assert deleted == ["p/c"]
+
+    # Resume: start_run should see a, b completed and re-enqueue c only.
+    enqueued = dispatcher.start_run(run_id)
+    assert enqueued == 1
+    jobs = _statuses_by_node(run_id)
+    assert jobs == {"p/a": "completed", "p/b": "completed", "p/c": "queued"}