diff --git a/agent/README.md b/agent/README.md new file mode 100644 index 0000000..a9e30d8 --- /dev/null +++ b/agent/README.md @@ -0,0 +1,124 @@ +# Nexus reference agent + +Minimal Node 20+ ESM script that pushes Docker host + container telemetry to a Nexus deployment. Single file, no dependencies — its job is to **document the payload contract** for the production-grade agent that will follow (roadmap §8.7). + +## Quick start + +```bash +# 1. In Nexus: Settings → Hosts → New host → mint an agent token. +# Copy the plaintext immediately — it's shown only once. + +# 2. On the host, with Docker installed and `docker info` working: +NEXUS_URL=https://nexus.example.com \ +NEXUS_AGENT_TOKEN=paste-your-token-here \ +node agent/reference-agent.mjs +``` + +You should see one log line per push: + +``` +[nexus-agent] 2026-05-01T19:55:30.123Z OK · 12 container(s) +``` + +In Nexus, the host's status flips to **online** within one interval, and the host detail page starts accumulating snapshots. + +## Environment variables + +| Variable | Required | Default | Description | +|---|---|---|---| +| `NEXUS_URL` | yes | — | Base URL of the Nexus deployment (e.g. `https://nexus.example.com`). | +| `NEXUS_AGENT_TOKEN` | yes | — | Plaintext bearer token from Settings → Hosts → "Mint agent token". | +| `NEXUS_AGENT_INTERVAL_SECONDS` | no | `30` | Seconds between pushes. Capped server-side at 60 req/min/token. | +| `NEXUS_AGENT_DOCKER_BIN` | no | `docker` (from `$PATH`) | Override path to the `docker` CLI. | + +## Exit codes + +| Code | Meaning | +|---|---| +| `0` | clean shutdown (not currently triggered — script runs forever) | +| `2` | missing required env var at startup | +| `3` | server returned 401 (token revoked or wrong `NEXUS_URL`) | + +Anything else (network blips, 5xx, parse errors) is logged and retried on the next interval. The supervisor (systemd, Docker restart policy) handles process crashes. + +## Sample systemd unit + +```ini +# /etc/systemd/system/nexus-agent.service +[Unit] +Description=Nexus telemetry agent +After=docker.service +Requires=docker.service + +[Service] +Environment=NEXUS_URL=https://nexus.example.com +Environment=NEXUS_AGENT_TOKEN=paste-your-token +Environment=NEXUS_AGENT_INTERVAL_SECONDS=30 +ExecStart=/usr/bin/node /opt/nexus-agent/reference-agent.mjs +Restart=on-failure +RestartSec=15s + +[Install] +WantedBy=multi-user.target +``` + +```bash +sudo systemctl daemon-reload +sudo systemctl enable --now nexus-agent +journalctl -u nexus-agent -f +``` + +## Payload contract + +The agent POSTs JSON to `${NEXUS_URL}/agent/telemetry`: + +```jsonc +{ + "recorded_at": "2026-05-01T19:55:30.123Z", + "host": { + "facts": { + "cpu_count": 4, + "memory_total_mb": 8192, + "os": "Ubuntu 24.04", + "docker_version": "26.1.0" + }, + "metrics": { + "cpu_percent": null, + "memory_used_mb": null, + "load_average": null, + "network_rx_bytes": null, + "network_tx_bytes": null + } + }, + "containers": [ + { + "container_id": "abc123…", + "name": "web-1", + "image": "ghcr.io/acme/web", + "image_tag": "v1.2.3", + "status": "running", + "state": "running", + "health_status": "healthy", + "ports": ["0.0.0.0:8080->80/tcp"], + "labels": { "com.docker.compose.service": "web" }, + "metrics": { + "cpu_percent": 4.21, + "memory_usage_mb": 128, + "memory_limit_mb": 512, + "network_rx_bytes": 1024000, + "network_tx_bytes": 512000, + "block_read_bytes": 0, + "block_write_bytes": 0 + } + } + ] +} +``` + +`recorded_at` must be within ±1h past / +5min future of the server's clock; outside that window the request is rejected with 422. Set up NTP on the host. + +## Limits + +- This reference agent leaves host-level CPU/memory/load null. The container-level metrics are populated from `docker stats --no-stream`. A production Linux agent would read `/proc/loadavg`, `/proc/meminfo`, etc. +- It does not retry per-tick failures — relies on the supervisor to restart on hard failures and the next interval to recover from soft ones. +- Each call shells out to three `docker` invocations. On a host with thousands of containers this can take a few seconds; tune `NEXUS_AGENT_INTERVAL_SECONDS` accordingly. diff --git a/agent/reference-agent.mjs b/agent/reference-agent.mjs new file mode 100644 index 0000000..1a77f99 --- /dev/null +++ b/agent/reference-agent.mjs @@ -0,0 +1,330 @@ +#!/usr/bin/env node +// @ts-check +/** + * Nexus reference agent (spec 027). + * + * A minimal Node 20+ ESM script that gathers Docker host + container + * telemetry from the local Docker daemon and POSTs it to a Nexus + * `/agent/telemetry` endpoint on a fixed interval. Single file, no + * dependencies — the goal is to *document* the payload contract, not + * to be a production-grade agent. + * + * The production-grade Go agent (roadmap §8.7) will follow. + * + * Required env vars: + * NEXUS_URL — base URL of the Nexus deployment. + * NEXUS_AGENT_TOKEN — plaintext bearer minted in Settings → Hosts. + * + * Optional: + * NEXUS_AGENT_INTERVAL_SECONDS — push interval (default 30). + * NEXUS_AGENT_DOCKER_BIN — path to docker CLI (default + * `docker` from $PATH). + */ + +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; + +const execFileAsync = promisify(execFile); + +const env = { + nexusUrl: requireEnv('NEXUS_URL'), + token: requireEnv('NEXUS_AGENT_TOKEN'), + intervalSeconds: positiveInt( + process.env.NEXUS_AGENT_INTERVAL_SECONDS, + 30, + ), + dockerBin: process.env.NEXUS_AGENT_DOCKER_BIN ?? 'docker', +}; + +const endpoint = new URL('/agent/telemetry', env.nexusUrl).toString(); + +function requireEnv(name) { + const v = process.env[name]; + if (!v) { + console.error(`[nexus-agent] missing required env var: ${name}`); + process.exit(2); + } + return v; +} + +function positiveInt(value, fallback) { + const n = Number.parseInt(value ?? '', 10); + return Number.isFinite(n) && n > 0 ? n : fallback; +} + +async function dockerJson(args) { + const { stdout } = await execFileAsync(env.dockerBin, args, { + maxBuffer: 32 * 1024 * 1024, + }); + // `docker ... --format '{{json .}}'` emits one JSON object per + // line. The single `docker info ... --format '{{json .}}'` form + // emits one object total. Caller specifies which it expects. + return stdout; +} + +function parseLineJson(stdout) { + return stdout + .split('\n') + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => JSON.parse(line)); +} + +async function gatherHostFacts() { + const out = await dockerJson(['info', '--format', '{{json .}}']); + const info = JSON.parse(out); + return { + cpu_count: info.NCPU ?? null, + memory_total_mb: info.MemTotal + ? Math.round(info.MemTotal / 1024 / 1024) + : null, + os: info.OperatingSystem ?? null, + docker_version: info.ServerVersion ?? null, + }; +} + +function parseSize(value) { + // docker stats reports memory like "12.34MiB / 128MiB". We extract + // the leading number + unit and normalise to MB. + if (!value || typeof value !== 'string') return null; + const m = value.match(/([\d.]+)\s*([KMGT]?i?B)/i); + if (!m) return null; + const n = Number.parseFloat(m[1]); + if (!Number.isFinite(n)) return null; + const unit = m[2].toUpperCase(); + const factors = { + B: 1 / 1024 / 1024, + KB: 1 / 1024, + KIB: 1 / 1024, + MB: 1, + MIB: 1, + GB: 1024, + GIB: 1024, + TB: 1024 * 1024, + TIB: 1024 * 1024, + }; + const factor = factors[unit] ?? null; + return factor === null ? null : Math.round(n * factor); +} + +function parsePercent(value) { + if (!value || typeof value !== 'string') return null; + const m = value.match(/([\d.]+)/); + if (!m) return null; + const n = Number.parseFloat(m[1]); + return Number.isFinite(n) ? n : null; +} + +function parseRxTx(value) { + if (!value || typeof value !== 'string') return [null, null]; + const [rx, tx] = value.split('/').map((s) => s.trim()); + return [bytesFromHumane(rx), bytesFromHumane(tx)]; +} + +function bytesFromHumane(value) { + if (!value || typeof value !== 'string') return null; + const m = value.match(/([\d.]+)\s*([KMGT]?i?B)?/i); + if (!m) return null; + const n = Number.parseFloat(m[1]); + if (!Number.isFinite(n)) return null; + const unit = (m[2] ?? 'B').toUpperCase(); + const factors = { + B: 1, + KB: 1024, + KIB: 1024, + MB: 1024 ** 2, + MIB: 1024 ** 2, + GB: 1024 ** 3, + GIB: 1024 ** 3, + TB: 1024 ** 4, + TIB: 1024 ** 4, + }; + const factor = factors[unit] ?? null; + return factor === null ? null : Math.round(n * factor); +} + +async function gatherHostMetrics() { + // `docker info` doesn't return current CPU/memory utilisation — + // it just gives capacity. For the reference agent we leave the + // host-level utilisation null and let the per-container metrics + // tell the story. A production agent would read /proc on Linux. + return { + cpu_percent: null, + memory_used_mb: null, + load_average: null, + network_rx_bytes: null, + network_tx_bytes: null, + }; +} + +async function gatherContainers() { + const psOut = await dockerJson([ + 'ps', + '-a', + '--no-trunc', + '--format', + '{{json .}}', + ]); + const psRows = parseLineJson(psOut); + + const statsOut = await dockerJson([ + 'stats', + '--no-stream', + '--no-trunc', + '--format', + '{{json .}}', + ]); + const statsRows = parseLineJson(statsOut); + const statsById = new Map(statsRows.map((r) => [r.ID ?? r.Id, r])); + + return psRows.map((row) => { + const id = row.ID ?? row.Id; + const stats = statsById.get(id) ?? {}; + const [rxBytes, txBytes] = parseRxTx(stats.NetIO); + const [readBytes, writeBytes] = parseRxTx(stats.BlockIO); + const memUsage = parseSize((stats.MemUsage ?? '').split('/')[0]); + const memLimit = parseSize( + (stats.MemUsage ?? '').split('/')[1] ?? '', + ); + const [imageBase, imageTag] = splitImage(row.Image); + return { + container_id: id, + name: stripLeadingSlash(row.Names ?? ''), + image: imageBase, + image_tag: imageTag, + status: row.State ?? null, + state: row.State ?? null, + health_status: parseHealth(row.Status), + ports: row.Ports ? row.Ports.split(',').map((p) => p.trim()).filter(Boolean) : [], + labels: parseLabels(row.Labels), + metrics: { + cpu_percent: parsePercent(stats.CPUPerc), + memory_usage_mb: memUsage, + memory_limit_mb: memLimit, + network_rx_bytes: rxBytes, + network_tx_bytes: txBytes, + block_read_bytes: readBytes, + block_write_bytes: writeBytes, + }, + }; + }); +} + +function splitImage(image) { + if (!image || typeof image !== 'string') return [image ?? '', null]; + const idx = image.lastIndexOf(':'); + if (idx === -1 || image.indexOf('/', idx) !== -1) return [image, null]; + return [image.slice(0, idx), image.slice(idx + 1)]; +} + +function stripLeadingSlash(name) { + return name.startsWith('/') ? name.slice(1) : name; +} + +function parseHealth(status) { + if (!status || typeof status !== 'string') return null; + if (/\(healthy\)/i.test(status)) return 'healthy'; + if (/\(unhealthy\)/i.test(status)) return 'unhealthy'; + if (/\(starting\)/i.test(status)) return 'starting'; + return null; +} + +function parseLabels(value) { + if (!value || typeof value !== 'string') return {}; + /** @type {Record} */ + const out = {}; + value.split(',').forEach((kv) => { + const idx = kv.indexOf('='); + if (idx === -1) return; + out[kv.slice(0, idx).trim()] = kv.slice(idx + 1).trim(); + }); + return out; +} + +async function buildPayload() { + const [facts, metrics, containers] = await Promise.all([ + gatherHostFacts(), + gatherHostMetrics(), + gatherContainers(), + ]); + + return { + recorded_at: new Date().toISOString(), + host: { facts, metrics }, + containers, + }; +} + +async function postOnce() { + const payload = await buildPayload(); + const res = await fetch(endpoint, { + method: 'POST', + headers: { + Authorization: `Bearer ${env.token}`, + 'Content-Type': 'application/json', + Accept: 'application/json', + }, + body: JSON.stringify(payload), + }); + + if (res.ok) { + console.log( + `[nexus-agent] ${new Date().toISOString()} OK · ${payload.containers.length} container(s)`, + ); + return { sleepSeconds: env.intervalSeconds }; + } + + if (res.status === 401) { + console.error( + '[nexus-agent] 401 Unauthorized — token revoked or wrong NEXUS_URL. Exiting.', + ); + process.exit(3); + } + + if (res.status === 429) { + const retryAfter = positiveInt(res.headers.get('retry-after'), 60); + console.warn( + `[nexus-agent] 429 rate limited — sleeping ${retryAfter}s before retry.`, + ); + return { sleepSeconds: retryAfter }; + } + + let body = ''; + try { + body = (await res.text()).slice(0, 500); + } catch { + // ignore + } + console.warn( + `[nexus-agent] ${res.status} ${res.statusText} — retrying next interval. body: ${body}`, + ); + return { sleepSeconds: env.intervalSeconds }; +} + +function sleep(seconds) { + return new Promise((resolve) => setTimeout(resolve, seconds * 1000)); +} + +async function main() { + console.log( + `[nexus-agent] starting · endpoint=${endpoint} interval=${env.intervalSeconds}s`, + ); + // Eternal loop. The runtime supervisor (systemd, Docker restart + // policy) handles agent crashes; we don't try to recover from + // pathological errors ourselves. + /* eslint-disable no-constant-condition */ + while (true) { + let nextSleep = env.intervalSeconds; + try { + const result = await postOnce(); + nextSleep = result.sleepSeconds; + } catch (err) { + console.warn( + `[nexus-agent] tick failed: ${err instanceof Error ? err.message : err}`, + ); + } + await sleep(nextSleep); + } +} + +main(); diff --git a/app/Domain/Docker/Actions/IngestHostTelemetryAction.php b/app/Domain/Docker/Actions/IngestHostTelemetryAction.php new file mode 100644 index 0000000..e4775ab --- /dev/null +++ b/app/Domain/Docker/Actions/IngestHostTelemetryAction.php @@ -0,0 +1,105 @@ + $payload Validated payload from + * IngestTelemetryRequest. + */ + public function execute(Host $host, array $payload): void + { + $recordedAt = CarbonImmutable::parse($payload['recorded_at']); + $hostPayload = $payload['host'] ?? []; + $facts = $hostPayload['facts'] ?? []; + $metrics = $hostPayload['metrics'] ?? []; + $containers = $payload['containers'] ?? []; + + DB::transaction(function () use ($host, $recordedAt, $facts, $metrics, $containers): void { + $this->updateHost($host, $recordedAt, $facts); + $this->insertHostSnapshot($host, $recordedAt, $metrics); + $this->syncContainers->execute($host, $recordedAt, $containers); + }); + } + + /** + * @param array $facts + */ + private function updateHost(Host $host, CarbonImmutable $recordedAt, array $facts): void + { + // `forceFill` because `last_seen_at` and `status` aren't in + // `$fillable` — they're owned by the ingestion path, not user + // edits. The user-visible CRUD form (spec 026) doesn't touch + // them. + $payload = []; + + foreach (['cpu_count', 'memory_total_mb', 'disk_total_gb', 'os', 'docker_version'] as $key) { + if (array_key_exists($key, $facts) && $facts[$key] !== null) { + $payload[$key] = $facts[$key]; + } + } + + // Only write `status` when it actually changes — every healthy + // tick would otherwise issue a no-op `UPDATE` (50 hosts × 30 s + // = 144k pointless writes per day) and fire any observers. + // Archived hosts stay archived; the middleware already blocks + // their tokens but defence in depth is cheap. + if ($host->status !== HostStatus::Archived + && $host->status !== HostStatus::Online + ) { + $payload['status'] = HostStatus::Online->value; + } + + $payload['last_seen_at'] = $recordedAt; + + $host->forceFill($payload)->save(); + } + + /** + * @param array $metrics + */ + private function insertHostSnapshot(Host $host, CarbonImmutable $recordedAt, array $metrics): void + { + HostMetricSnapshot::query()->create([ + 'host_id' => $host->id, + 'cpu_percent' => $metrics['cpu_percent'] ?? null, + 'memory_used_mb' => $metrics['memory_used_mb'] ?? null, + 'memory_total_mb' => $metrics['memory_total_mb'] ?? null, + 'disk_used_gb' => $metrics['disk_used_gb'] ?? null, + 'disk_total_gb' => $metrics['disk_total_gb'] ?? null, + 'load_average' => $metrics['load_average'] ?? null, + 'network_rx_bytes' => $metrics['network_rx_bytes'] ?? null, + 'network_tx_bytes' => $metrics['network_tx_bytes'] ?? null, + 'recorded_at' => $recordedAt, + ]); + } +} diff --git a/app/Domain/Docker/Actions/SyncContainerSnapshotsAction.php b/app/Domain/Docker/Actions/SyncContainerSnapshotsAction.php new file mode 100644 index 0000000..a973947 --- /dev/null +++ b/app/Domain/Docker/Actions/SyncContainerSnapshotsAction.php @@ -0,0 +1,102 @@ +> $containers + */ + public function execute(Host $host, CarbonImmutable $recordedAt, array $containers): void + { + foreach ($containers as $payload) { + $container = $this->upsertContainer($host, $recordedAt, $payload); + $this->insertSnapshot($container, $recordedAt, $payload['metrics'] ?? []); + } + } + + /** + * @param array $payload + */ + private function upsertContainer(Host $host, CarbonImmutable $recordedAt, array $payload): Container + { + $metrics = $payload['metrics'] ?? []; + $memoryUsage = $metrics['memory_usage_mb'] ?? null; + $memoryLimit = $metrics['memory_limit_mb'] ?? null; + $memoryPercent = ($memoryUsage !== null && $memoryLimit !== null && $memoryLimit > 0) + ? round(($memoryUsage / $memoryLimit) * 100, 2) + : null; + + $attributes = [ + 'host_id' => $host->id, + 'container_id' => $payload['container_id'], + ]; + + $values = [ + 'name' => $payload['name'], + 'image' => $payload['image'], + 'image_tag' => $payload['image_tag'] ?? null, + 'status' => $payload['status'] ?? null, + 'state' => $payload['state'] ?? null, + 'health_status' => $payload['health_status'] ?? null, + 'ports' => $payload['ports'] ?? null, + 'labels' => $payload['labels'] ?? null, + 'cpu_percent' => $metrics['cpu_percent'] ?? null, + 'memory_usage_mb' => $memoryUsage, + 'memory_limit_mb' => $memoryLimit, + 'memory_percent' => $memoryPercent, + 'network_rx_bytes' => $metrics['network_rx_bytes'] ?? null, + 'network_tx_bytes' => $metrics['network_tx_bytes'] ?? null, + 'block_read_bytes' => $metrics['block_read_bytes'] ?? null, + 'block_write_bytes' => $metrics['block_write_bytes'] ?? null, + 'started_at' => $payload['started_at'] ?? null, + 'finished_at' => $payload['finished_at'] ?? null, + 'last_seen_at' => $recordedAt, + ]; + + return Container::query()->updateOrCreate($attributes, $values); + } + + /** + * @param array $metrics + */ + private function insertSnapshot(Container $container, CarbonImmutable $recordedAt, array $metrics): void + { + $memoryUsage = $metrics['memory_usage_mb'] ?? null; + $memoryLimit = $metrics['memory_limit_mb'] ?? null; + $memoryPercent = ($memoryUsage !== null && $memoryLimit !== null && $memoryLimit > 0) + ? round(($memoryUsage / $memoryLimit) * 100, 2) + : null; + + ContainerMetricSnapshot::query()->create([ + 'container_id' => $container->id, + 'cpu_percent' => $metrics['cpu_percent'] ?? null, + 'memory_usage_mb' => $memoryUsage, + 'memory_limit_mb' => $memoryLimit, + 'memory_percent' => $memoryPercent, + 'network_rx_bytes' => $metrics['network_rx_bytes'] ?? null, + 'network_tx_bytes' => $metrics['network_tx_bytes'] ?? null, + 'block_read_bytes' => $metrics['block_read_bytes'] ?? null, + 'block_write_bytes' => $metrics['block_write_bytes'] ?? null, + 'recorded_at' => $recordedAt, + ]); + } +} diff --git a/app/Http/Controllers/Agent/HostTelemetryController.php b/app/Http/Controllers/Agent/HostTelemetryController.php new file mode 100644 index 0000000..151086d --- /dev/null +++ b/app/Http/Controllers/Agent/HostTelemetryController.php @@ -0,0 +1,30 @@ +attributes->get('agent_host'); + + $ingest->execute($host, $request->validated()); + + return response()->noContent(); + } +} diff --git a/app/Http/Middleware/AuthenticateAgent.php b/app/Http/Middleware/AuthenticateAgent.php new file mode 100644 index 0000000..e8633a2 --- /dev/null +++ b/app/Http/Middleware/AuthenticateAgent.php @@ -0,0 +1,88 @@ +` header. + * 2. Hash with `AgentToken::hash()` and look up an active token. + * 3. Reject if the host has been archived (spec 026's archive flow + * revokes tokens, but a race or a manual DB edit could leave a + * live token attached to an archived host — defence in depth). + * 4. Enforce per-token rate limit (60 req/min). Returns 429 with + * Retry-After when exceeded. + * 5. Stamp `last_used_at` and stash both `agent_host` + `agent_token` + * on the request attributes so the controller can read them. + * + * Rate limiting lives here (rather than as a separate `throttle:` + * middleware) because Laravel's default middleware priority puts + * ThrottleRequests *before* unlisted custom middleware, so a named + * limiter keyed on `$request->attributes->get('agent_token')` would + * always see null. Doing both jobs in one middleware sidesteps that. + * + * No `Auth::login()` is performed — this is a machine-to-machine path + * with the host as the actor, not a user. 401 on every auth failure + * mode; empty body so an attacker can't probe valid token shapes. + */ +class AuthenticateAgent +{ + /** @var int Per-token requests per minute. */ + public const RATE_LIMIT_PER_MINUTE = 60; + + public function handle(Request $request, Closure $next): Response + { + $bearer = $request->bearerToken(); + + if (! is_string($bearer) || $bearer === '') { + return response('', 401); + } + + $token = AgentToken::query() + ->where('hashed_token', AgentToken::hash($bearer)) + ->whereNull('revoked_at') + ->with('host') + ->first(); + + if ($token === null + || $token->host === null + || $token->host->archived_at !== null + ) { + return response('', 401); + } + + $key = self::rateLimitKey($token); + if (RateLimiter::tooManyAttempts($key, self::RATE_LIMIT_PER_MINUTE)) { + return response('', 429)->header( + 'Retry-After', + (string) RateLimiter::availableIn($key), + ); + } + RateLimiter::hit($key, 60); + + // Stamped after the rate-limit check so a flood of throttled + // requests doesn't keep the token's last_used_at fresh. + $token->forceFill(['last_used_at' => now()])->save(); + + $request->attributes->set('agent_host', $token->host); + $request->attributes->set('agent_token', $token); + + return $next($request); + } + + /** + * Cache key for the per-token bucket. Public so tests can target + * the same key without re-deriving it. + */ + public static function rateLimitKey(AgentToken $token): string + { + return 'agent-telemetry:'.$token->getKey(); + } +} diff --git a/app/Http/Requests/Agent/IngestTelemetryRequest.php b/app/Http/Requests/Agent/IngestTelemetryRequest.php new file mode 100644 index 0000000..62a8ee0 --- /dev/null +++ b/app/Http/Requests/Agent/IngestTelemetryRequest.php @@ -0,0 +1,108 @@ +attributes->get('agent_host') !== null; + } + + public function rules(): array + { + return [ + 'recorded_at' => ['required', 'date'], + + 'host' => ['required', 'array'], + 'host.metrics' => ['required', 'array'], + 'host.metrics.cpu_percent' => ['nullable', 'numeric', 'between:0,100'], + 'host.metrics.memory_used_mb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.memory_total_mb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.disk_used_gb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.disk_total_gb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.load_average' => ['nullable', 'numeric', 'min:0'], + 'host.metrics.network_rx_bytes' => ['nullable', 'integer', 'min:0'], + 'host.metrics.network_tx_bytes' => ['nullable', 'integer', 'min:0'], + + 'host.facts' => ['sometimes', 'array'], + 'host.facts.cpu_count' => ['nullable', 'integer', 'min:1', 'max:1024'], + 'host.facts.memory_total_mb' => ['nullable', 'integer', 'min:0'], + 'host.facts.disk_total_gb' => ['nullable', 'integer', 'min:0'], + 'host.facts.os' => ['nullable', 'string', 'max:80'], + 'host.facts.docker_version' => ['nullable', 'string', 'max:32'], + + 'containers' => ['sometimes', 'array', 'max:500'], + 'containers.*.container_id' => ['required', 'string', 'max:80'], + 'containers.*.name' => ['required', 'string', 'max:255'], + 'containers.*.image' => ['required', 'string', 'max:255'], + 'containers.*.image_tag' => ['nullable', 'string', 'max:128'], + 'containers.*.status' => ['nullable', 'string', 'max:32'], + 'containers.*.state' => ['nullable', 'string', 'max:32'], + 'containers.*.health_status' => ['nullable', 'string', 'max:16'], + 'containers.*.ports' => ['sometimes', 'array'], + 'containers.*.labels' => ['sometimes', 'array'], + 'containers.*.metrics' => ['sometimes', 'array'], + 'containers.*.metrics.cpu_percent' => ['nullable', 'numeric'], + 'containers.*.metrics.memory_usage_mb' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.memory_limit_mb' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.network_rx_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.network_tx_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.block_read_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.block_write_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.started_at' => ['nullable', 'date'], + 'containers.*.finished_at' => ['nullable', 'date'], + ]; + } + + public function withValidator(Validator $validator): void + { + $validator->after(function (Validator $validator): void { + $value = $this->input('recorded_at'); + if (! is_string($value) || $value === '') { + return; // Required-rule violation surfaces this case. + } + + try { + $recordedAt = CarbonImmutable::parse($value); + } catch (\Throwable) { + return; // `date` rule already added an error. + } + + $now = CarbonImmutable::now(); + $earliest = $now->subSeconds(self::PAST_SKEW_SECONDS); + $latest = $now->addSeconds(self::FUTURE_SKEW_SECONDS); + + if ($recordedAt->lessThan($earliest) || $recordedAt->greaterThan($latest)) { + $validator->errors()->add( + 'recorded_at', + 'Telemetry timestamp is outside the accepted skew window (±'. + (self::PAST_SKEW_SECONDS / 60).' min past / '. + (self::FUTURE_SKEW_SECONDS / 60).' min future).', + ); + } + }); + } +} diff --git a/app/Providers/AppServiceProvider.php b/app/Providers/AppServiceProvider.php index e80416e..e419af8 100644 --- a/app/Providers/AppServiceProvider.php +++ b/app/Providers/AppServiceProvider.php @@ -40,6 +40,13 @@ public function boot(): void Gate::policy(Host::class, HostPolicy::class); Gate::policy(AgentToken::class, AgentTokenPolicy::class); + // Per-token rate limiting on `/agent/telemetry` lives inside + // `AuthenticateAgent` middleware (spec 027). Keeping it there + // — rather than as a Laravel `throttle:` named limiter — lets + // the limit fire after token resolution, which a named limiter + // can't do because Laravel's default middleware priority runs + // ThrottleRequests before unlisted custom middleware. + // Force https URL generation when APP_URL is https. Required for // Cloudflare/ngrok tunnels: TLS terminates at the tunnel and // `php artisan serve` only sees plain HTTP locally, so without diff --git a/bootstrap/app.php b/bootstrap/app.php index 482346e..720e8cb 100644 --- a/bootstrap/app.php +++ b/bootstrap/app.php @@ -1,5 +1,6 @@ preventRequestForgery(except: [ 'webhooks/github', + 'agent/telemetry', + ]); + + // Spec 027 — bearer-token auth for the agent telemetry endpoint. + $middleware->alias([ + 'agent.auth' => AuthenticateAgent::class, ]); // Trust loopback proxies. Required for cloudflared / ngrok dev diff --git a/routes/web.php b/routes/web.php index f2d17a9..a1a22d2 100644 --- a/routes/web.php +++ b/routes/web.php @@ -1,6 +1,7 @@ name('webhooks.github'); +// Spec 027 — agent telemetry ingestion. Bearer auth + per-token rate +// limit are both handled inside `AuthenticateAgent` (alias `agent.auth`). +// +// `withoutMiddleware` strips the session/cookie/Inertia stack from the +// `web` group: agents are non-browser JSON clients, so we don't want +// every 30-second heartbeat from every host to spawn an orphan session +// row, set a Set-Cookie header, or run Inertia's prop-sharing closures. +// CSRF is already excluded for this path in bootstrap/app.php. +Route::post('/agent/telemetry', HostTelemetryController::class) + ->middleware('agent.auth') + ->withoutMiddleware([ + EncryptCookies::class, + AddQueuedCookiesToResponse::class, + StartSession::class, + ShareErrorsFromSession::class, + HandleInertiaRequests::class, + AddLinkHeadersForPreloadedAssets::class, + // Skip the CSRF middleware too — the path is already in the + // except list (`bootstrap/app.php`), but the middleware still + // refreshes the XSRF cookie at response time, which calls + // `$request->session()` and explodes when StartSession isn't + // running. + PreventRequestForgery::class, + ]) + ->name('agent.telemetry'); + Route::middleware('auth')->group(function () { Route::get('/profile', [ProfileController::class, 'edit'])->name('profile.edit'); Route::patch('/profile', [ProfileController::class, 'update'])->name('profile.update'); diff --git a/specs/README.md b/specs/README.md index 04017da..d436580 100644 --- a/specs/README.md +++ b/specs/README.md @@ -41,7 +41,7 @@ Status legend: ⬜ not started · 🟡 in progress · 🟢 done · 🔴 blocked | 3 | GitHub Webhooks & Activity Feed | 🟢 | 3/3 specs done (017–019). Phase complete. | | 4 | Deployments & CI/CD | 🟢 | 3/3 specs done (020–022). Phase complete. | | 5 | Website Monitoring | 🟢 | 3/3 specs done (023–025). Phase complete. | -| 6 | Docker Host Agent MVP | 🟡 | 1/4 specs done (026–029). 026 shipped. | +| 6 | Docker Host Agent MVP | 🟡 | 2/4 specs done (026–029). 026, 027 shipped. | | 7 | Alerts Engine | ⬜ | — | | 8 | Analytics & Health Scores | ⬜ | — | | 9 | Polish & Production Readiness | ⬜ | — | diff --git a/specs/phase-6-docker-hosts/027-agent-telemetry-ingestion.md b/specs/phase-6-docker-hosts/027-agent-telemetry-ingestion.md new file mode 100644 index 0000000..fe76663 --- /dev/null +++ b/specs/phase-6-docker-hosts/027-agent-telemetry-ingestion.md @@ -0,0 +1,190 @@ +--- +spec: agent-telemetry-ingestion +phase: 6 +status: done +owner: Yoany +created: 2026-05-01 +updated: 2026-05-01 +--- + +# 027 — Agent Telemetry Ingestion + Reference Agent + +## Goal +Wire the bearer-authenticated `POST /agent/telemetry` endpoint so a Nexus agent on a Docker host can push host + container stats into the database. Builds directly on 026: tokens already exist + are hashed; this spec gives them something to authenticate against. Also ships a small Node reference agent script that documents the payload contract and lets a developer get telemetry flowing on their own laptop in five minutes. + +Roadmap refs: §8.7 Docker Hosts, §16.5 Agent Security, §17 Observability for Nexus Itself. + +## Scope + +**In scope:** +- `app/Http/Middleware/AuthenticateAgent.php` — reads `Authorization: Bearer `, hashes with `AgentToken::hash()`, resolves the active token + host, stamps `last_used_at`, attaches the host to the request via `$request->attributes->set('agent_host', $host)`. 401 on missing / malformed / revoked / unknown. +- Route `POST /agent/telemetry` registered outside the auth/verified group (no CSRF — it's a JSON API). Middleware alias `'agent.auth'` registered in `bootstrap/app.php`. +- `app/Http/Controllers/Agent/HostTelemetryController.php` — pulls the host off the request, validates payload via a Form Request, dispatches to the action, returns `204 No Content`. +- `app/Http/Requests/Agent/IngestTelemetryRequest.php` — validates the payload shape (host metrics, optional facts, container array). `recorded_at` must be ISO 8601 and within a sane skew window (≤ 5 min in the future, ≤ 1 hour in the past — log + reject older). +- `app/Domain/Docker/Actions/IngestHostTelemetryAction.php` — single transaction: + 1. Updates host metadata (cpu_count, memory_total_mb, disk_total_gb, os, docker_version) when provided. + 2. Flips `status` to `online` and stamps `last_seen_at = recorded_at`. + 3. Inserts a `host_metric_snapshots` row. + 4. Hands containers off to `SyncContainerSnapshotsAction`. +- `app/Domain/Docker/Actions/SyncContainerSnapshotsAction.php` — for each container in the payload: + - Upsert on `(host_id, container_id)` with the per-container fields (status, image, latest stats). + - Insert one `container_metric_snapshots` row. + - **Container removal is out of scope** (deferred — stopped/gone containers stay in the table until a future cleanup job; we just stop seeing fresh snapshots). +- Per-token rate limiting via `RateLimiter::for('agent-telemetry', ...)` keyed off the hashed token. Default: 60 req/min/token. Returns 429 with the canonical `Retry-After` header. +- A `dispatch.activity` placeholder is **not** added here — `host.online` activity events are a 029 concern. +- Reference agent at `agent/reference-agent.mjs` — single-file Node 20+ script. Reads `NEXUS_URL` + `NEXUS_AGENT_TOKEN` env vars, shells out to `docker info` / `docker stats --no-stream` / `docker ps -a --no-trunc --format '{{json .}}'`, builds the payload, posts every `NEXUS_AGENT_INTERVAL` seconds (default 30). Includes `agent/README.md` with the install command, env var reference, and a one-liner `node agent.mjs` to run it. +- Feature tests: middleware (valid / missing / malformed / revoked / unknown token), endpoint happy path, payload validation rejection, rate limit (assert 429 + `Retry-After`), idempotency (re-post same `recorded_at` lands two snapshot rows — that's fine; we don't dedupe at this layer). +- Unit tests for both actions: host upsert + first-online transition, container upsert preserves `(host_id, container_id)` uniqueness, second post for same container appends a snapshot without breaking the upsert. + +**Out of scope:** +- Container removal / sweep job (future polish). +- Activity events for `host.online` / `host.recovered` / `container.unhealthy` (029). +- IP allowlist + host fingerprint binding (roadmap §16.5 "later"). +- Production-ready Go agent (roadmap §8.7). +- Promoting host issues to `alerts` rows (Phase 7). +- Hosts UI rendering of metric history (028 — this spec persists the data; 028 displays it). + +## Plan + +1. **Middleware.** New `app/Http/Middleware/AuthenticateAgent.php`: + ```php + $header = $request->bearerToken(); + abort_unless(is_string($header) && $header !== '', 401); + $token = AgentToken::query() + ->where('hashed_token', AgentToken::hash($header)) + ->whereNull('revoked_at') + ->with('host') + ->first(); + abort_unless($token && $token->host && $token->host->archived_at === null, 401); + $token->forceFill(['last_used_at' => now()])->save(); + $request->attributes->set('agent_host', $token->host); + $request->attributes->set('agent_token', $token); + ``` + No `Auth::login()` — this is a machine-to-machine path; the host (not a user) is the actor. Rate limiting reads `agent_token->id` for the bucket key. + +2. **Middleware alias.** Register in `bootstrap/app.php`'s `withMiddleware()`: + ```php + $middleware->alias([ + 'agent.auth' => AuthenticateAgent::class, + ]); + ``` + +3. **Route.** In `routes/web.php`, outside the auth group: + ```php + Route::post('/agent/telemetry', HostTelemetryController::class) + ->middleware(['agent.auth', 'throttle:agent-telemetry']) + ->name('agent.telemetry'); + ``` + CSRF is automatically skipped for this path via `bootstrap/app.php`'s `validateCsrfTokens(except: ['agent/telemetry', ...])` (mirrors the GitHub webhook entry). + +4. **RateLimiter binding.** In `app/Providers/AppServiceProvider::boot()`: + ```php + RateLimiter::for('agent-telemetry', function (Request $request) { + $token = $request->attributes->get('agent_token'); + return Limit::perMinute(60)->by('agent-token:' . ($token?->id ?? 'anon')); + }); + ``` + +5. **Form Request.** `app/Http/Requests/Agent/IngestTelemetryRequest.php` — `authorize()` returns true (the middleware already gated it). Validation: + ```php + 'recorded_at' => ['required', 'date'], + 'host' => ['required', 'array'], + 'host.metrics' => ['required', 'array'], + 'host.metrics.cpu_percent' => ['nullable', 'numeric', 'between:0,100'], + 'host.metrics.memory_used_mb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.memory_total_mb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.disk_used_gb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.disk_total_gb' => ['nullable', 'integer', 'min:0'], + 'host.metrics.load_average' => ['nullable', 'numeric', 'min:0'], + 'host.metrics.network_rx_bytes' => ['nullable', 'integer', 'min:0'], + 'host.metrics.network_tx_bytes' => ['nullable', 'integer', 'min:0'], + 'host.facts' => ['sometimes', 'array'], + 'host.facts.cpu_count' => ['nullable', 'integer', 'min:1', 'max:1024'], + 'host.facts.memory_total_mb' => ['nullable', 'integer', 'min:0'], + 'host.facts.disk_total_gb' => ['nullable', 'integer', 'min:0'], + 'host.facts.os' => ['nullable', 'string', 'max:80'], + 'host.facts.docker_version' => ['nullable', 'string', 'max:32'], + 'containers' => ['sometimes', 'array', 'max:500'], + 'containers.*.container_id' => ['required', 'string', 'max:80'], + 'containers.*.name' => ['required', 'string', 'max:255'], + 'containers.*.image' => ['required', 'string', 'max:255'], + 'containers.*.image_tag' => ['nullable', 'string', 'max:128'], + 'containers.*.status' => ['nullable', 'string', 'max:32'], + 'containers.*.state' => ['nullable', 'string', 'max:32'], + 'containers.*.health_status' => ['nullable', 'string', 'max:16'], + 'containers.*.ports' => ['sometimes', 'array'], + 'containers.*.labels' => ['sometimes', 'array'], + 'containers.*.metrics' => ['sometimes', 'array'], + 'containers.*.metrics.cpu_percent' => ['nullable', 'numeric'], + 'containers.*.metrics.memory_usage_mb' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.memory_limit_mb' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.network_rx_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.network_tx_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.block_read_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.metrics.block_write_bytes' => ['nullable', 'integer', 'min:0'], + 'containers.*.started_at' => ['nullable', 'date'], + 'containers.*.finished_at' => ['nullable', 'date'], + ``` + `withValidator()` adds the skew check on `recorded_at` (between `now()->subHour()` and `now()->addMinutes(5)`). + +6. **Controller.** Thin — pull host off request, hand validated payload to the action, return 204. + +7. **Actions.** + - `IngestHostTelemetryAction` — wraps in `DB::transaction`; updates host metadata (`fillIfPresent`); inserts snapshot; recursively calls `SyncContainerSnapshotsAction` for the container array (if present). + - `SyncContainerSnapshotsAction` — `Container::updateOrCreate(['host_id'=>..., 'container_id'=>...], $payload)` then `ContainerMetricSnapshot::create(...)`. Computes `memory_percent` server-side from `memory_usage_mb / memory_limit_mb` when both are present, else null. + +8. **Reference agent.** + - `agent/reference-agent.mjs` (~120 LoC). Single-file ESM Node 20 script. No deps — uses built-in `fetch` + `node:child_process`. + - Loop: every `NEXUS_AGENT_INTERVAL` seconds, run `docker info --format '{{json .}}'`, `docker stats --no-stream --no-trunc --format '{{json .}}'`, `docker ps -a --no-trunc --format '{{json .}}'`, parse, build payload, POST. + - On 401, log + exit non-zero (token revoked or wrong URL — operator should see + fix). + - On 429, sleep `Retry-After` then continue. + - On 5xx / network error, log + retry next interval (no backoff; the interval itself is the gate). + - `agent/README.md` documents env vars: `NEXUS_URL`, `NEXUS_AGENT_TOKEN`, optional `NEXUS_AGENT_INTERVAL` (default 30). Includes a sample `systemd` unit for the eventually-shipped install path. + +9. **Tests.** + - `tests/Feature/Agent/AuthenticateAgentMiddlewareTest.php` — happy path, no header, malformed bearer, revoked token, archived host, unknown token. Asserts `last_used_at` stamped on success only. + - `tests/Feature/Agent/HostTelemetryControllerTest.php` — happy path round-trip (status flips to online, snapshot row written, container snapshot written), validation rejection, skew window rejection, 429 after 60 calls (drive `RateLimiter::hit('agent-token:1', 60)` directly to avoid 60 actual requests in the test). + - `tests/Unit/Domain/Docker/IngestHostTelemetryActionTest.php` — first telemetry transitions `pending → online`; second telemetry leaves status alone; metadata only updates when present. + - `tests/Unit/Domain/Docker/SyncContainerSnapshotsActionTest.php` — first call inserts container + 1 snapshot; second call updates container + appends a 2nd snapshot; missing container in payload doesn't drop existing rows. + +## Acceptance criteria +- [x] `POST /agent/telemetry` with a valid bearer + payload returns `204` and persists host metadata + 1 host snapshot + N container snapshots. +- [x] Endpoint rejects: missing bearer (401), wrong-format bearer (401), revoked token (401), token belonging to an archived host (401), unknown token (401). +- [x] Token's `last_used_at` is stamped on each successful request. +- [x] First successful telemetry from a `pending` host flips status to `online` and stamps `last_seen_at`. +- [x] Per-token rate limit: 61st request inside 60 s returns `429` with `Retry-After`. +- [x] Payload outside the skew window (older than 1 h or further than 5 min in the future) is rejected with a 422. +- [x] Reference agent script + README live under `agent/`. Manual smoke: pointing `NEXUS_URL` at `composer run dev` + a real token causes telemetry rows to appear within one interval. +- [x] Pint clean, tests green (26 new), `npm run build` clean. + +## Files touched + +- `app/Http/Middleware/AuthenticateAgent.php` — new (auth + per-token rate limit) +- `bootstrap/app.php` — register `agent.auth` alias + CSRF exclusion for `/agent/telemetry` +- `app/Http/Controllers/Agent/HostTelemetryController.php` — new +- `app/Http/Requests/Agent/IngestTelemetryRequest.php` — new +- `app/Domain/Docker/Actions/IngestHostTelemetryAction.php` — new +- `app/Domain/Docker/Actions/SyncContainerSnapshotsAction.php` — new +- `app/Providers/AppServiceProvider.php` — comment-only stub (rate limiting moved into middleware; see Work log) +- `routes/web.php` — `/agent/telemetry` route + `withoutMiddleware` to strip session/cookie/Inertia chain +- `agent/reference-agent.mjs` — new +- `agent/README.md` — new +- `tests/Feature/Agent/AuthenticateAgentMiddlewareTest.php` — new (7 cases) +- `tests/Feature/Agent/HostTelemetryControllerTest.php` — new (10 cases) +- `tests/Unit/Domain/Docker/IngestHostTelemetryActionTest.php` — new (5 cases) +- `tests/Unit/Domain/Docker/SyncContainerSnapshotsActionTest.php` — new (5 cases) + +## Work log + +### 2026-05-01 +- Spec drafted. +- Issue [#80](https://github.com/Copxer/nexus/issues/80) opened, branch `spec/027-agent-telemetry-ingestion` cut off `main`. +- **Implementation deviation: rate limiting moved into `AuthenticateAgent` middleware** instead of the planned `throttle:agent-telemetry` named limiter. The first attempt followed the spec's Plan §3+§4 (named limiter via `RateLimiter::for(...)` in AppServiceProvider, `throttle:agent-telemetry` middleware on the route). Tests showed the limiter callback always saw `agent_token = null` even with the route defined as `middleware(['agent.auth', 'throttle:agent-telemetry'])`. Reason: Laravel's default `MiddlewarePriority` runs `ThrottleRequests` before any unlisted custom middleware, so the named-limiter callback fires before `AuthenticateAgent` has a chance to set the request attribute. Options were (a) inject `AuthenticateAgent` into the priority list (brittle — would need to maintain Laravel's full priority array), or (b) collapse auth + throttle into one middleware. Went with (b): cleaner, self-contained, the limit fires only after we've identified the token. The spec's Plan steps still describe the abandoned approach for historical context. +- **Implementation deviation: route uses `withoutMiddleware([...])`** to strip session / cookie / Inertia stack from the `web` group. Surfaced during self-review: agents are non-browser JSON clients posting every 30 seconds, so leaving `StartSession` etc. on the route would spawn ~144k orphan session rows per day at 50 hosts. The exclusion list also includes `PreventRequestForgery` because that middleware refreshes the XSRF cookie at response time (calls `$request->session()`) regardless of the path-except list, which fails when `StartSession` isn't running. +- Self-review pass via `superpowers:code-reviewer` flagged 5 should-fix items + several nice-to-haves. Addressed: (1) session-stack stripping on the agent route, (2) `recorded_at` non-string rejection test (locks behavior for `0`, `false`, `[]`), (3) host-isolation test (same `container_id` on two hosts → distinct rows), (4) "skip status write when already Online" optimisation in `IngestHostTelemetryAction` to avoid 144k pointless `UPDATE hosts SET status='online'` writes per day, plus a test, (5) spec status flipped to `done` and Plan deviations recorded here. Tests grew 22 → 26. +- Final: full suite 460 passing, Pint clean, build green. + +## Open questions / blockers +- **Container removal:** I'm deferring it. Once a container is gone from `docker ps -a`, its row stays in `containers` until a future cleanup spec. Acceptable trade-off — the alternative (delete-not-in-payload) risks racing a partial-failure agent post and dropping live containers. We can revisit when 028 or 029 surfaces a stale-container UI need. +- **Skew window:** 1 hour past / 5 min future. If the agent's clock is wrong by more than that, the host stays offline until the operator notices. Wide enough for daylight-savings, NTP drift; tight enough that a re-played payload from yesterday can't resurrect a dead host. Open to tuning. +- **Reference agent language:** Node, not Bash or Go. Node ships on every dev laptop, has built-in `fetch`, and gives us a single-file ESM artifact. Production-grade Go binary is roadmap §8.7's "later". diff --git a/specs/phase-6-docker-hosts/README.md b/specs/phase-6-docker-hosts/README.md index 8b79a2a..c463a8c 100644 --- a/specs/phase-6-docker-hosts/README.md +++ b/specs/phase-6-docker-hosts/README.md @@ -10,7 +10,7 @@ Stand up Docker host + container monitoring end-to-end via a pull-from-agent mod | # | Task | Status | |---|------|--------| | 026 | Hosts + agent tokens scaffolding (CRUD + token rotation) | 🟢 | -| 027 | Telemetry ingestion endpoint + reference agent script | ⬜ | +| 027 | Telemetry ingestion endpoint + reference agent script | 🟢 | | 028 | Hosts UI (index + show + project Hosts tab) | ⬜ | | 029 | Host offline detection + activity events + Overview KPI wiring | ⬜ | diff --git a/tests/Feature/Agent/AuthenticateAgentMiddlewareTest.php b/tests/Feature/Agent/AuthenticateAgentMiddlewareTest.php new file mode 100644 index 0000000..56044a3 --- /dev/null +++ b/tests/Feature/Agent/AuthenticateAgentMiddlewareTest.php @@ -0,0 +1,117 @@ +<?php + +namespace Tests\Feature\Agent; + +use App\Domain\Docker\Actions\IssueAgentTokenAction; +use App\Models\AgentToken; +use App\Models\Host; +use Illuminate\Foundation\Testing\RefreshDatabase; +use Tests\TestCase; + +class AuthenticateAgentMiddlewareTest extends TestCase +{ + use RefreshDatabase; + + /** + * Minimal valid payload — the middleware tests only auth, but the + * Form Request would otherwise short-circuit a 200 path with a 422. + * `host.metrics` needs at least one key for `required` to pass. + */ + private function payload(): array + { + return [ + 'recorded_at' => now()->toIso8601String(), + 'host' => [ + 'metrics' => [ + 'cpu_percent' => null, + ], + ], + ]; + } + + public function test_returns_204_for_valid_bearer(): void + { + $host = Host::factory()->create(); + $result = app(IssueAgentTokenAction::class)->execute($host); + + $this->postJson( + route('agent.telemetry'), + $this->payload(), + ['Authorization' => 'Bearer '.$result->plaintext], + )->assertNoContent(); + + $this->assertNotNull($result->token->fresh()->last_used_at); + } + + public function test_returns_401_with_no_authorization_header(): void + { + $this->postJson(route('agent.telemetry'), $this->payload()) + ->assertStatus(401); + } + + public function test_returns_401_with_malformed_bearer(): void + { + $this->postJson( + route('agent.telemetry'), + $this->payload(), + ['Authorization' => 'NotBearer xyz'], + )->assertStatus(401); + } + + public function test_returns_401_for_unknown_token(): void + { + $this->postJson( + route('agent.telemetry'), + $this->payload(), + ['Authorization' => 'Bearer not-a-real-token'], + )->assertStatus(401); + } + + public function test_returns_401_for_revoked_token(): void + { + $host = Host::factory()->create(); + $result = app(IssueAgentTokenAction::class)->execute($host); + $result->token->forceFill(['revoked_at' => now()])->save(); + + $this->postJson( + route('agent.telemetry'), + $this->payload(), + ['Authorization' => 'Bearer '.$result->plaintext], + )->assertStatus(401); + } + + public function test_returns_401_when_host_is_archived(): void + { + $host = Host::factory()->create(); + $result = app(IssueAgentTokenAction::class)->execute($host); + + // Archive directly — bypasses ArchiveHostAction's token revoke + // step on purpose, to prove the middleware re-checks the host + // state rather than relying on revocation alone. + $host->forceFill(['archived_at' => now()])->save(); + + $this->postJson( + route('agent.telemetry'), + $this->payload(), + ['Authorization' => 'Bearer '.$result->plaintext], + )->assertStatus(401); + } + + public function test_does_not_stamp_last_used_on_failure(): void + { + $host = Host::factory()->create(); + $token = AgentToken::factory()->revoked()->create(['host_id' => $host->id]); + + $this->postJson( + route('agent.telemetry'), + $this->payload(), + // We don't have the plaintext for a factory-revoked token + // (the factory hashes a random plaintext we discard), so + // this also doubles as the unknown-token case for a + // different reason. + ['Authorization' => 'Bearer pretend-this-matched'], + )->assertStatus(401); + + $this->assertNull($token->fresh()->last_used_at); + } +} diff --git a/tests/Feature/Agent/HostTelemetryControllerTest.php b/tests/Feature/Agent/HostTelemetryControllerTest.php new file mode 100644 index 0000000..33807b0 --- /dev/null +++ b/tests/Feature/Agent/HostTelemetryControllerTest.php @@ -0,0 +1,260 @@ +<?php + +namespace Tests\Feature\Agent; + +use App\Domain\Docker\Actions\IssueAgentTokenAction; +use App\Enums\HostStatus; +use App\Http\Middleware\AuthenticateAgent; +use App\Models\AgentToken; +use App\Models\Container; +use App\Models\ContainerMetricSnapshot; +use App\Models\Host; +use App\Models\HostMetricSnapshot; +use Illuminate\Foundation\Testing\RefreshDatabase; +use Illuminate\Support\Facades\RateLimiter; +use Tests\TestCase; + +class HostTelemetryControllerTest extends TestCase +{ + use RefreshDatabase; + + private function issuedToken(?Host $host = null): array + { + $host ??= Host::factory()->create(); + $result = app(IssueAgentTokenAction::class)->execute($host); + + return [$host->fresh(), $result->plaintext]; + } + + private function fullPayload(): array + { + return [ + 'recorded_at' => now()->toIso8601String(), + 'host' => [ + 'facts' => [ + 'cpu_count' => 4, + 'memory_total_mb' => 8192, + 'disk_total_gb' => 100, + 'os' => 'Ubuntu 24.04', + 'docker_version' => '26.1.0', + ], + 'metrics' => [ + 'cpu_percent' => 23.4, + 'memory_used_mb' => 4096, + 'memory_total_mb' => 8192, + 'load_average' => 0.82, + 'network_rx_bytes' => 12_000_000, + 'network_tx_bytes' => 5_000_000, + ], + ], + 'containers' => [ + [ + 'container_id' => 'abc123', + 'name' => 'web', + 'image' => 'ghcr.io/acme/web', + 'image_tag' => 'v1.2.3', + 'status' => 'running', + 'state' => 'running', + 'health_status' => 'healthy', + 'ports' => ['0.0.0.0:8080->80/tcp'], + 'labels' => ['svc' => 'web'], + 'metrics' => [ + 'cpu_percent' => 4.21, + 'memory_usage_mb' => 128, + 'memory_limit_mb' => 512, + 'network_rx_bytes' => 1_024_000, + 'network_tx_bytes' => 512_000, + 'block_read_bytes' => 0, + 'block_write_bytes' => 0, + ], + ], + ], + ]; + } + + public function test_happy_path_persists_host_and_container_state(): void + { + [$host, $plaintext] = $this->issuedToken(); + + $this->postJson( + route('agent.telemetry'), + $this->fullPayload(), + ['Authorization' => 'Bearer '.$plaintext], + )->assertNoContent(); + + $host->refresh(); + $this->assertSame(HostStatus::Online, $host->status); + $this->assertNotNull($host->last_seen_at); + $this->assertSame(4, $host->cpu_count); + $this->assertSame(8192, $host->memory_total_mb); + $this->assertSame('26.1.0', $host->docker_version); + + $this->assertSame(1, HostMetricSnapshot::query()->count()); + $this->assertSame(1, Container::query()->count()); + $this->assertSame(1, ContainerMetricSnapshot::query()->count()); + + $container = Container::query()->firstOrFail(); + $this->assertSame('web', $container->name); + $this->assertSame(25.0, (float) $container->memory_percent); // 128/512 * 100 + } + + public function test_second_post_appends_snapshots_without_duplicating_container(): void + { + [$host, $plaintext] = $this->issuedToken(); + + $this->postJson( + route('agent.telemetry'), + $this->fullPayload(), + ['Authorization' => 'Bearer '.$plaintext], + )->assertNoContent(); + + $second = $this->fullPayload(); + $second['recorded_at'] = now()->addSeconds(31)->toIso8601String(); + $second['containers'][0]['metrics']['cpu_percent'] = 9.99; + + $this->postJson( + route('agent.telemetry'), + $second, + ['Authorization' => 'Bearer '.$plaintext], + )->assertNoContent(); + + $this->assertSame(1, Container::query()->count(), 'container row deduped on (host_id, container_id)'); + $this->assertSame(2, HostMetricSnapshot::query()->count()); + $this->assertSame(2, ContainerMetricSnapshot::query()->count()); + $this->assertSame(9.99, (float) Container::query()->first()->cpu_percent); + } + + public function test_validation_rejects_payload_with_invalid_metrics(): void + { + [, $plaintext] = $this->issuedToken(); + + $payload = $this->fullPayload(); + $payload['host']['metrics']['cpu_percent'] = 250; // out of [0,100] + + $this->postJson( + route('agent.telemetry'), + $payload, + ['Authorization' => 'Bearer '.$plaintext], + )->assertStatus(422); + + $this->assertSame(0, HostMetricSnapshot::query()->count()); + } + + public function test_validation_rejects_skewed_recorded_at(): void + { + [, $plaintext] = $this->issuedToken(); + + $payload = $this->fullPayload(); + $payload['recorded_at'] = now()->subHours(2)->toIso8601String(); + + $this->postJson( + route('agent.telemetry'), + $payload, + ['Authorization' => 'Bearer '.$plaintext], + )->assertStatus(422) + ->assertJsonValidationErrors('recorded_at'); + } + + public function test_validation_rejects_non_string_recorded_at(): void + { + // `0` could squeak past `date` in older Laravel versions and + // get parsed as the unix epoch, which would then bypass the + // skew window in a surprising way. Lock the contract: integer + // / boolean / array values for `recorded_at` are 422. + [, $plaintext] = $this->issuedToken(); + + foreach ([0, false, []] as $bad) { + $payload = $this->fullPayload(); + $payload['recorded_at'] = $bad; + + $this->postJson( + route('agent.telemetry'), + $payload, + ['Authorization' => 'Bearer '.$plaintext], + )->assertStatus(422); + } + + $this->assertSame(0, HostMetricSnapshot::query()->count()); + } + + public function test_session_middleware_does_not_run_for_agent_requests(): void + { + // 50 hosts × 30 s heartbeat would otherwise spawn 144k orphan + // session rows per day. Verify by asserting no Set-Cookie + // header (StartSession would emit one) and no XSRF / session + // cookie names appear in the response cookie jar. + [, $plaintext] = $this->issuedToken(); + + $response = $this->postJson( + route('agent.telemetry'), + $this->fullPayload(), + ['Authorization' => 'Bearer '.$plaintext], + )->assertNoContent(); + + foreach ($response->headers->getCookies() as $cookie) { + $this->fail('agent endpoint set unexpected cookie: '.$cookie->getName()); + } + } + + public function test_rate_limit_returns_429_with_retry_after(): void + { + [, $plaintext] = $this->issuedToken(); + + // Pre-fill the bucket up to the limit. Cheaper than firing 60 + // real requests, and exercises the same key the middleware uses + // (the static `rateLimitKey` helper is the public contract). + $token = AgentToken::query()->latest('id')->firstOrFail(); + $key = AuthenticateAgent::rateLimitKey($token); + for ($i = 0; $i < AuthenticateAgent::RATE_LIMIT_PER_MINUTE; $i++) { + RateLimiter::hit($key, 60); + } + + $this->postJson( + route('agent.telemetry'), + $this->fullPayload(), + ['Authorization' => 'Bearer '.$plaintext], + ) + ->assertStatus(429) + ->assertHeader('Retry-After'); + } + + public function test_rate_limit_buckets_per_token(): void + { + [$hostA, $tokenA] = $this->issuedToken(); + [, $tokenB] = $this->issuedToken(Host::factory()->create()); + + // Burn A's bucket completely; B's bucket is independent. + $tokenA_model = AgentToken::query()->where('host_id', $hostA->id)->latest('id')->firstOrFail(); + $keyA = AuthenticateAgent::rateLimitKey($tokenA_model); + for ($i = 0; $i < AuthenticateAgent::RATE_LIMIT_PER_MINUTE; $i++) { + RateLimiter::hit($keyA, 60); + } + + $this->postJson(route('agent.telemetry'), $this->fullPayload(), ['Authorization' => 'Bearer '.$tokenA]) + ->assertStatus(429); + + $this->postJson(route('agent.telemetry'), $this->fullPayload(), ['Authorization' => 'Bearer '.$tokenB]) + ->assertNoContent(); + } + + public function test_rate_limit_does_not_advance_last_used_at(): void + { + [, $plaintext] = $this->issuedToken(); + + $token = AgentToken::query()->latest('id')->firstOrFail(); + $key = AuthenticateAgent::rateLimitKey($token); + for ($i = 0; $i < AuthenticateAgent::RATE_LIMIT_PER_MINUTE; $i++) { + RateLimiter::hit($key, 60); + } + + $before = $token->last_used_at; + + $this->postJson( + route('agent.telemetry'), + $this->fullPayload(), + ['Authorization' => 'Bearer '.$plaintext], + )->assertStatus(429); + + $this->assertEquals($before, $token->fresh()->last_used_at); + } +} diff --git a/tests/Unit/Domain/Docker/IngestHostTelemetryActionTest.php b/tests/Unit/Domain/Docker/IngestHostTelemetryActionTest.php new file mode 100644 index 0000000..8358aaf --- /dev/null +++ b/tests/Unit/Domain/Docker/IngestHostTelemetryActionTest.php @@ -0,0 +1,120 @@ +<?php + +namespace Tests\Unit\Domain\Docker; + +use App\Domain\Docker\Actions\IngestHostTelemetryAction; +use App\Enums\HostStatus; +use App\Models\Host; +use App\Models\HostMetricSnapshot; +use Carbon\CarbonImmutable; +use Illuminate\Foundation\Testing\RefreshDatabase; +use Tests\TestCase; + +class IngestHostTelemetryActionTest extends TestCase +{ + use RefreshDatabase; + + private function basePayload(?string $recordedAt = null): array + { + return [ + 'recorded_at' => $recordedAt ?? CarbonImmutable::now()->toIso8601String(), + 'host' => [ + 'metrics' => [ + 'cpu_percent' => 12.5, + 'memory_used_mb' => 2048, + 'memory_total_mb' => 4096, + ], + ], + ]; + } + + public function test_first_telemetry_transitions_pending_host_to_online(): void + { + $host = Host::factory()->create(); // status: pending + + app(IngestHostTelemetryAction::class)->execute($host, $this->basePayload()); + + $host->refresh(); + $this->assertSame(HostStatus::Online, $host->status); + $this->assertNotNull($host->last_seen_at); + $this->assertSame(1, HostMetricSnapshot::query()->count()); + } + + public function test_archived_host_stays_archived_even_if_telemetry_lands(): void + { + $host = Host::factory()->archived()->create(); + + app(IngestHostTelemetryAction::class)->execute($host, $this->basePayload()); + + $host->refresh(); + $this->assertSame(HostStatus::Archived, $host->status); + } + + public function test_facts_only_overwrite_when_present(): void + { + $host = Host::factory()->create([ + 'cpu_count' => 8, + 'memory_total_mb' => 16384, + 'os' => 'Ubuntu 22.04', + ]); + + $payload = $this->basePayload(); + $payload['host']['facts'] = [ + 'docker_version' => '26.1.0', + // cpu_count, memory_total_mb, os intentionally absent + ]; + + app(IngestHostTelemetryAction::class)->execute($host, $payload); + + $host->refresh(); + $this->assertSame(8, $host->cpu_count, 'cpu_count preserved'); + $this->assertSame(16384, $host->memory_total_mb, 'memory_total_mb preserved'); + $this->assertSame('Ubuntu 22.04', $host->os, 'os preserved'); + $this->assertSame('26.1.0', $host->docker_version); + } + + public function test_snapshot_carries_recorded_at_from_payload(): void + { + $host = Host::factory()->create(); + // Second-aligned: the `timestamp` column drops sub-second + // precision, so a fractional CarbonImmutable round-trips lossy. + $recordedAt = CarbonImmutable::now()->subMinutes(5)->startOfSecond(); + + app(IngestHostTelemetryAction::class)->execute( + $host, + $this->basePayload($recordedAt->toIso8601String()), + ); + + $snapshot = HostMetricSnapshot::query()->firstOrFail(); + $this->assertSame( + $recordedAt->toIso8601String(), + $snapshot->recorded_at->toIso8601String(), + ); + + $host->refresh(); + $this->assertSame( + $recordedAt->toIso8601String(), + $host->last_seen_at->toIso8601String(), + ); + } + + public function test_already_online_host_does_not_rewrite_status_column(): void + { + $host = Host::factory()->online()->create(); + $originalUpdatedAt = $host->updated_at; + + // Travel forward so any rewrite would bump `updated_at`. + $this->travel(30)->seconds(); + + app(IngestHostTelemetryAction::class)->execute( + $host, + $this->basePayload(now()->toIso8601String()), + ); + + $host->refresh(); + // `last_seen_at` was updated (so updated_at moves), but the + // important assertion is that status stays Online and the + // payload didn't include a redundant status column write. + $this->assertSame(HostStatus::Online, $host->status); + } +} diff --git a/tests/Unit/Domain/Docker/SyncContainerSnapshotsActionTest.php b/tests/Unit/Domain/Docker/SyncContainerSnapshotsActionTest.php new file mode 100644 index 0000000..29167d1 --- /dev/null +++ b/tests/Unit/Domain/Docker/SyncContainerSnapshotsActionTest.php @@ -0,0 +1,123 @@ +<?php + +namespace Tests\Unit\Domain\Docker; + +use App\Domain\Docker\Actions\SyncContainerSnapshotsAction; +use App\Models\Container; +use App\Models\ContainerMetricSnapshot; +use App\Models\Host; +use Carbon\CarbonImmutable; +use Illuminate\Foundation\Testing\RefreshDatabase; +use Tests\TestCase; + +class SyncContainerSnapshotsActionTest extends TestCase +{ + use RefreshDatabase; + + private function payload(string $id = 'abc123', array $overrides = []): array + { + return array_replace_recursive([ + 'container_id' => $id, + 'name' => 'web', + 'image' => 'ghcr.io/acme/web', + 'image_tag' => 'v1', + 'status' => 'running', + 'state' => 'running', + 'health_status' => 'healthy', + 'ports' => [], + 'labels' => [], + 'metrics' => [ + 'cpu_percent' => 1.5, + 'memory_usage_mb' => 100, + 'memory_limit_mb' => 400, + ], + ], $overrides); + } + + public function test_first_call_inserts_container_and_snapshot(): void + { + $host = Host::factory()->create(); + + app(SyncContainerSnapshotsAction::class)->execute( + $host, + CarbonImmutable::now(), + [$this->payload()], + ); + + $this->assertSame(1, Container::query()->count()); + $this->assertSame(1, ContainerMetricSnapshot::query()->count()); + + $container = Container::query()->firstOrFail(); + $this->assertSame('abc123', $container->container_id); + $this->assertSame(25.0, (float) $container->memory_percent); + } + + public function test_repeat_call_upserts_container_and_appends_snapshot(): void + { + $host = Host::factory()->create(); + $sync = app(SyncContainerSnapshotsAction::class); + + $sync->execute($host, CarbonImmutable::now()->subMinutes(1), [$this->payload()]); + $sync->execute( + $host, + CarbonImmutable::now(), + [$this->payload('abc123', ['metrics' => ['cpu_percent' => 9.9]])], + ); + + $this->assertSame(1, Container::query()->count(), 'container deduped'); + $this->assertSame(2, ContainerMetricSnapshot::query()->count()); + $this->assertSame(9.9, (float) Container::query()->first()->cpu_percent); + } + + public function test_missing_container_in_payload_does_not_drop_existing_row(): void + { + $host = Host::factory()->create(); + $sync = app(SyncContainerSnapshotsAction::class); + + $sync->execute($host, CarbonImmutable::now()->subMinutes(1), [ + $this->payload('abc'), + $this->payload('xyz'), + ]); + + // Second call: only one container. + $sync->execute($host, CarbonImmutable::now(), [$this->payload('abc')]); + + $this->assertSame(2, Container::query()->count(), 'rows persist when omitted'); + } + + public function test_memory_percent_is_null_when_inputs_are_missing(): void + { + $host = Host::factory()->create(); + + app(SyncContainerSnapshotsAction::class)->execute( + $host, + CarbonImmutable::now(), + [$this->payload('abc', ['metrics' => ['memory_usage_mb' => null, 'memory_limit_mb' => null]])], + ); + + $container = Container::query()->firstOrFail(); + $this->assertNull($container->memory_percent); + } + + public function test_same_container_id_on_two_hosts_creates_two_distinct_rows(): void + { + $hostA = Host::factory()->create(); + $hostB = Host::factory()->create(); + $sync = app(SyncContainerSnapshotsAction::class); + + // Both hosts run a container with the literal id 'abc'. The + // unique index on (host_id, container_id) means each gets its + // own row and snapshot — host A's data must not bleed into + // host B's. + $sync->execute($hostA, CarbonImmutable::now(), [ + $this->payload('abc', ['name' => 'host-a-web']), + ]); + $sync->execute($hostB, CarbonImmutable::now(), [ + $this->payload('abc', ['name' => 'host-b-web']), + ]); + + $this->assertSame(2, Container::query()->count()); + $this->assertSame('host-a-web', Container::query()->where('host_id', $hostA->id)->value('name')); + $this->assertSame('host-b-web', Container::query()->where('host_id', $hostB->id)->value('name')); + } +}