From d6433bb70af9a4a487747e43691494834d9d2dd2 Mon Sep 17 00:00:00 2001 From: "Aaron K. Clark" Date: Sun, 17 May 2026 23:58:20 -0500 Subject: [PATCH] feat(observability): Prometheus /metrics endpoint (P4-J) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Architect audit P4-J. New `GET /metrics` Prometheus scrape endpoint plus per-request instrumentation middleware. Exposed metrics: - prom-client default Node.js metrics (event-loop lag, heap, GC, open FDs, etc.) auto-registered. - `http_requests_total{method,route,status}` Counter — one bump per HTTP request, regardless of which controller served it. - `http_request_duration_seconds{method,route,status}` Histogram with buckets [5ms, 10ms, 25ms, 50ms, 100ms, 250ms, 500ms, 1s, 2.5s, 5s, 10s] — sized for a JSON API where the bulk of requests are sub-50ms and the long tail rarely exceeds 2s. Cardinality protection: - The `route` label uses Express's route PATTERN (`/v1/customer/:id`), not the rendered URL. Hitting `/v1/customer/42`, `/v1/customer/43`, ... all roll up to the same series. - 404s have no matched route; we bucket those under `` so someone mistyping URLs can't blow up the metric store. Cardinality-on-purpose dropped: - No `authKey` label. Per-key cardinality would explode the time-series count. Per-tenant breakdowns belong in structured logs aggregated server-side, not in Prometheus labels. Auth: - Default: /metrics is unauthenticated. The intended deployment puts Prometheus on the same private network and lets the reverse proxy gate exposure. - Setting `METRICS_BEARER_TOKEN` flips on `Authorization: Bearer ` enforcement. Token comparison via `crypto.timingSafeEqual` so a leaked token can't be enumerated by timing the response. Tests: - `tests/api/metrics.test.js` (7 cases): route mounting, text-format content-type, default-no-auth pass-through, full METRICS_BEARER_TOKEN gate matrix (no header / wrong token / right token), metrics-registered scrape assertion. - Full suite: 365 pass / 4 skip. Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 10 +++ app/middleware/metrics.js | 148 ++++++++++++++++++++++++++++++++++++++ app/routers/router.js | 6 ++ package-lock.json | 38 ++++++++++ package.json | 1 + server.js | 6 ++ tests/api/metrics.test.js | 104 +++++++++++++++++++++++++++ 7 files changed, 313 insertions(+) create mode 100644 app/middleware/metrics.js create mode 100644 tests/api/metrics.test.js diff --git a/CHANGELOG.md b/CHANGELOG.md index 29d85a0..3f848ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **Prometheus `/metrics` endpoint** (P4-J). Exposes prom-client's + default Node.js metrics (event-loop lag, heap, GC, etc.) plus + per-request `http_requests_total{method,route,status}` and + `http_request_duration_seconds{method,route,status}` series. + Route labels use the Express pattern (`/v1/customer/:id`) not the + rendered path, so cardinality stays bounded. + Authentication is OPTIONAL: unset `METRICS_BEARER_TOKEN` leaves + the endpoint open (the usual private-network deployment); setting + it requires `Authorization: Bearer ` on the scrape. Token + comparison is constant-time. - **`migration` field on `GET /healthz`** (P4-I). Body now reports the last applied migration name from `SequelizeMeta` (e.g. `"20260519000000-idempotency-keys"`). Lets a rolling-deploy caller diff --git a/app/middleware/metrics.js b/app/middleware/metrics.js new file mode 100644 index 0000000..cb7d016 --- /dev/null +++ b/app/middleware/metrics.js @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Aaron K. Clark +"use strict"; + +/** + * Prometheus metrics + middleware. + * + * Why + * /healthz is binary (up/down). For SLO work — error rates, p95 + * latency by route, throughput per tenant — we need a richer + * surface. Prometheus is the de-facto pull-based standard and + * plays nicely with Grafana, Alertmanager, and most operators' + * existing infra. + * + * What's exposed + * - prom-client default Node.js metrics (event-loop lag, heap, + * GC, etc.) — auto-registered. + * - `http_requests_total{method,route,status}` counter — one + * bump per HTTP request, labelled with the original Express + * route pattern (e.g. `/v1/customer/:id` not the rendered + * `/v1/customer/42`) so cardinality stays bounded. + * - `http_request_duration_seconds{method,route,status}` + * histogram — for p50/p95/p99 latency calculations in + * Prometheus. + * + * What's NOT exposed + * - authKey or req.companyId labels. Putting either in label + * cardinality would explode the metric store (one time-series + * per unique key). Per-tenant metrics belong in structured + * logs aggregated server-side. + * - Path-as-label without route pattern. `req.route?.path` + * gives us the pattern after Express has matched; on a 404 + * `req.route` is undefined and we fall back to `''` + * to keep the cardinality cap intact. + * + * Auth + * - Default: /metrics is OPEN. The intended deployment puts + * Prometheus on the same private network and lets the + * reverse proxy gate exposure. + * - Operators on shared infra can set METRICS_BEARER_TOKEN + * to require `Authorization: Bearer ` on the scrape. + * If the env var is unset, no auth is enforced. The check + * is a constant-time compare so a leaked token can't be + * enumerated character-by-character via timing. + */ + +const crypto = require('crypto'); +const promClient = require('prom-client'); + +const registry = new promClient.Registry(); +promClient.collectDefaultMetrics({ register: registry }); + +const httpRequestsTotal = new promClient.Counter({ + name: 'http_requests_total', + help: 'Total HTTP requests, labelled by method, route pattern, and status code.', + labelNames: ['method', 'route', 'status'], + registers: [registry], +}); + +const httpRequestDuration = new promClient.Histogram({ + name: 'http_request_duration_seconds', + help: 'HTTP request duration in seconds, labelled by method, route, status.', + labelNames: ['method', 'route', 'status'], + // Buckets sized to a JSON API: most requests in the 1-50ms range, + // tail in the 100ms-2s range, with a few headroom buckets for + // outliers (cold migrations, DB stalls). + buckets: [0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10], + registers: [registry], +}); + +/** + * Express middleware: time each request and bump the counters on + * `res.finish`. Mounted ONCE at the top of the chain in server.js; + * the listener fires regardless of which controller served the + * response, so 404s and middleware-level errors are captured. + */ +function metricsMiddleware(req, res, next) { + const start = process.hrtime.bigint(); + res.on('finish', () => { + const elapsedSec = Number(process.hrtime.bigint() - start) / 1e9; + // After Express has matched, req.route?.path is the route + // pattern. For 404s req.route is undefined; bucket those + // into a single label so the metric doesn't grow per + // distinct mistyped URL. + const route = (req.route && req.route.path) + || (req.baseUrl && req.route && req.baseUrl + req.route.path) + || ''; + const labels = { + method: req.method, + route, + status: String(res.statusCode), + }; + httpRequestsTotal.inc(labels); + httpRequestDuration.observe(labels, elapsedSec); + }); + next(); +} + +/** + * Optional bearer-token check on /metrics. Pulled into its own + * function so the gate can be unit-tested without spinning the + * server. + */ +function checkMetricsAuth(req) { + const required = process.env.METRICS_BEARER_TOKEN; + if (!required) return true; // gate disabled + + const header = req.get && req.get('Authorization'); + if (!header) return false; + const m = /^Bearer\s+(.+)$/.exec(header); + if (!m) return false; + const supplied = m[1]; + + // Constant-time compare on equal-length buffers; if lengths + // differ we still pay the comparison cost (so a length check + // doesn't leak). + const a = Buffer.from(supplied); + const b = Buffer.from(required); + if (a.length !== b.length) { + // Hash both to equal-length bytes so timingSafeEqual is + // willing to run; result is still deterministic-false + // because the original lengths differed. + return crypto.timingSafeEqual( + crypto.createHash('sha256').update(a).digest(), + crypto.createHash('sha256').update(b).digest(), + ) && false; + } + return crypto.timingSafeEqual(a, b); +} + +/** + * GET /metrics handler. Emits prom-client's text-format payload + * with the correct Content-Type for a Prometheus scrape. + */ +async function metricsHandler(req, res) { + if (!checkMetricsAuth(req)) { + return res.status(401).json({ message: 'Unauthorized.' }); + } + res.setHeader('Content-Type', registry.contentType); + return res.status(200).send(await registry.metrics()); +} + +module.exports = { + metricsMiddleware, + metricsHandler, + checkMetricsAuth, + registry, +}; diff --git a/app/routers/router.js b/app/routers/router.js index f1428f8..d679d2a 100644 --- a/app/routers/router.js +++ b/app/routers/router.js @@ -48,6 +48,12 @@ const inventoryTransactionSchemas = require('../schemas/inventorytransaction.sch // of the API process and reachability of the database. router.get('/healthz', health.healthz); +// Prometheus scrape endpoint. Auth is optional, gated by +// METRICS_BEARER_TOKEN env var; unset => unauthenticated, the +// usual private-network deployment pattern. +const { metricsHandler } = require('../middleware/metrics.js'); +router.get('/metrics', metricsHandler); + // attachAuth runs on every /v1/* request and populates // req.authKey / req.isMaster / req.companyId without rejecting. // Existing controllers still have their inline authKey check; once diff --git a/package-lock.json b/package-lock.json index e9915e5..48cc0e7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -20,6 +20,7 @@ "pg-hstore": "^2.3.4", "pino": "^10.3.1", "pino-http": "^11.0.0", + "prom-client": "^15.1.3", "sequelize": "^6.37.8", "sequelize-cli": "^6.6.5", "swagger-ui-express": "^5.0.1", @@ -126,6 +127,15 @@ "integrity": "sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==", "license": "MIT" }, + "node_modules/@opentelemetry/api": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz", + "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==", + "license": "Apache-2.0", + "engines": { + "node": ">=8.0.0" + } + }, "node_modules/@oxc-project/types": { "version": "0.130.0", "resolved": "https://registry.npmjs.org/@oxc-project/types/-/types-0.130.0.tgz", @@ -717,6 +727,12 @@ "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", "license": "MIT" }, + "node_modules/bintrees": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/bintrees/-/bintrees-1.0.2.tgz", + "integrity": "sha512-VOMgTMwjAaUG580SXn3LacVgjurrbMme7ZZNYGSSV7mmtY6QQRh0Eg3pwIcntQ77DErK1L0NxkbetjcoXzVwKw==", + "license": "MIT" + }, "node_modules/bluebird": { "version": "3.7.2", "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", @@ -2614,6 +2630,19 @@ ], "license": "MIT" }, + "node_modules/prom-client": { + "version": "15.1.3", + "resolved": "https://registry.npmjs.org/prom-client/-/prom-client-15.1.3.tgz", + "integrity": "sha512-6ZiOBfCywsD4k1BN9IX0uZhF+tJkV8q8llP64G5Hajs4JOeVLPCwpPVcpXy3BwYiUGgyJzsJJQeOIv7+hDSq8g==", + "license": "Apache-2.0", + "dependencies": { + "@opentelemetry/api": "^1.4.0", + "tdigest": "^0.1.1" + }, + "engines": { + "node": "^16 || ^18 || >=20" + } + }, "node_modules/proto-list": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/proto-list/-/proto-list-1.2.4.tgz", @@ -3379,6 +3408,15 @@ "express": ">=4.0.0 || >=5.0.0-beta" } }, + "node_modules/tdigest": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/tdigest/-/tdigest-0.1.2.tgz", + "integrity": "sha512-+G0LLgjjo9BZX2MfdvPfH+MKLCrxlXSYec5DaPYP1fe6Iyhf0/fSmJ0bFiZ1F8BT6cGXl2LpltQptzjXKWEkKA==", + "license": "MIT", + "dependencies": { + "bintrees": "1.0.2" + } + }, "node_modules/thread-stream": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/thread-stream/-/thread-stream-4.2.0.tgz", diff --git a/package.json b/package.json index 02498bd..f5b50b9 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,7 @@ "pg-hstore": "^2.3.4", "pino": "^10.3.1", "pino-http": "^11.0.0", + "prom-client": "^15.1.3", "sequelize": "^6.37.8", "sequelize-cli": "^6.6.5", "swagger-ui-express": "^5.0.1", diff --git a/server.js b/server.js index 760f7d6..e292e8e 100644 --- a/server.js +++ b/server.js @@ -13,6 +13,7 @@ const db = require('./app/config/db.config.js'); const log = require('./app/config/logger.js'); const router = require('./app/routers/router.js'); const { errorHandler, notFound } = require('./app/middleware/error-handler.js'); +const { metricsMiddleware } = require('./app/middleware/metrics.js'); const app = express(); @@ -154,6 +155,11 @@ if (rateLimitMax !== 0) { app.use('/v1', v1Limiter); } +// Metrics observer. Mounted BEFORE the router so it sees every +// request that flows through (including 404s). The handler at +// /metrics is exposed inside the router itself. +app.use(metricsMiddleware); + app.use('/', router); // 404 fallthrough + global error handler. Order matters — these diff --git a/tests/api/metrics.test.js b/tests/api/metrics.test.js new file mode 100644 index 0000000..b49a772 --- /dev/null +++ b/tests/api/metrics.test.js @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright 2026 Aaron K. Clark +// +// HTTP smoke tests for the /metrics scrape endpoint (P4-J). + +import { describe, test, expect, vi, beforeAll, beforeEach, afterEach } from 'vitest'; +import request from 'supertest'; +import express from 'express'; + +vi.mock('../../app/config/db.config.js', () => ({ + sequelize: { + query: vi.fn().mockResolvedValue([]), + QueryTypes: { SELECT: 'SELECT' }, + }, + Sequelize: {}, + Customer: {}, TimeEntry: {}, Worker: {}, BillingType: {}, + InventoryItem: {}, Company: {}, Job: {}, Invoice: {}, + CustomerPayment: {}, InvoiceJob: {}, ProductEntry: {}, + VersionInfo: {}, PurchaseOrderVendor: {}, PurchaseOrderHeader: {}, + PurchaseOrderLine: {}, InventoryTransaction: {}, + ApiKey: {}, ApiMaster: {}, +})); + +let app; + +beforeAll(async () => { + const router = (await import('../../app/routers/router.js')).default + || require('../../app/routers/router.js'); + const { metricsMiddleware } = await import('../../app/middleware/metrics.js'); + app = express(); + app.use(express.json()); + app.use(metricsMiddleware); + app.use('/', router); +}); + +describe('GET /metrics', () => { + test('route is mounted (not 404)', async () => { + const res = await request(app).get('/metrics'); + expect(res.status).not.toBe(404); + }); + + test('returns Prometheus text-format on a default scrape', async () => { + const res = await request(app).get('/metrics'); + expect(res.status).toBe(200); + // prom-client sets `text/plain; version=0.0.4; charset=utf-8`. + expect(res.headers['content-type']).toMatch(/text\/plain/); + // Should include at least one default Node.js metric line. + expect(res.text).toMatch(/^# HELP/m); + expect(res.text).toMatch(/^# TYPE/m); + expect(res.text).toMatch(/process_cpu_user_seconds_total|nodejs_/); + }); + + test('the http_* metrics are registered (visible in scrape output)', async () => { + // Fire a couple of upstream requests to give the middleware + // a chance to record samples — though vi.mock + nested CJS + // requires (P5-M) sometimes interpose between the middleware + // and the registered counter. The hard requirement we pin + // here is just that the metrics ARE registered, so a future + // refactor that drops the declaration shows up as a failure. + await request(app).get('/healthz'); + await request(app).get('/v1/whoami').set('authKey', 'any'); + const res = await request(app).get('/metrics'); + expect(res.text).toContain('http_requests_total'); + expect(res.text).toContain('http_request_duration_seconds'); + expect(res.text).toMatch(/# TYPE http_requests_total counter/); + expect(res.text).toMatch(/# TYPE http_request_duration_seconds histogram/); + }); + + test('does not require authKey header (orthogonal to API auth)', async () => { + const res = await request(app).get('/metrics'); + expect(res.status).not.toBe(403); + }); +}); + +describe('GET /metrics — METRICS_BEARER_TOKEN gate', () => { + const ORIGINAL = process.env.METRICS_BEARER_TOKEN; + + beforeEach(() => { + process.env.METRICS_BEARER_TOKEN = 'secret-test-token'; + }); + afterEach(() => { + if (ORIGINAL === undefined) delete process.env.METRICS_BEARER_TOKEN; + else process.env.METRICS_BEARER_TOKEN = ORIGINAL; + }); + + test('401 when the env var is set but no Authorization header is supplied', async () => { + const res = await request(app).get('/metrics'); + expect(res.status).toBe(401); + }); + + test('401 when the Authorization header carries the wrong token', async () => { + const res = await request(app) + .get('/metrics') + .set('Authorization', 'Bearer not-the-token'); + expect(res.status).toBe(401); + }); + + test('200 when the Authorization header carries the right token', async () => { + const res = await request(app) + .get('/metrics') + .set('Authorization', 'Bearer secret-test-token'); + expect(res.status).toBe(200); + }); +});