From eccc29a75f4f8aec99dcf28e69a2ba8d535b2c9b Mon Sep 17 00:00:00 2001
From: Jake Bromberg <jake@funlandresearch.com>
Date: Tue, 19 May 2026 11:25:04 -0700
Subject: [PATCH] Disable LML client limiter in CI to fix metadata-lml suite
 drain (BS#955)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

G4 (PR #948 / BS#906) added a module-level Semaphore(5) + TokenBucket(50/min) to apps/backend/services/lml/lml.client.ts as a prod safety net mirroring LML's Discogs ceilings. Under integration's --runInBand the bucket persists across spec files; the full suite triggers > 50 fire-and-forget /lookup calls during earlier suites, draining the bucket. By the time metadata-lml.spec.js runs, every call waits ~1200ms for a token and the proxy test times out at 30s. The unit test author for G4 had already documented this risk (tests/unit/services/lml.client.test.ts:792 sets `LML_CLIENT_RATE_PER_MIN: '60000'` with the comment "effectively no rate cap in tests") but the convention wasn't propagated to either CI environment.

CI for this repo has two surfaces that diverged independently:
- .github/workflows/test.yml `Start services` env — actual GitHub Actions CI, host node processes
- dev_env/docker-compose.yml ci backend env — local `npm run ci:testmock`

Both override LML_CLIENT_MAX_CONCURRENT and LML_CLIENT_RATE_PER_MIN to high values so the limiter is effectively a no-op in tests. The source-grep test pins both surfaces against future drift.

Verified locally: full integration suite shows `PASS tests/integration/metadata-lml.spec.js` with all 11 tests green, including the six that were failing on CI:
- adding a non-library track triggers LML search (was: lookupCalls.length === 0)
- artwork_url, streaming, artist_bio, search URLs (was: Received has value: null)
- GET /proxy/metadata/album returns enriched metadata (was: 30002 ms timeout) — now 7 ms
---
 .github/workflows/test.yml                    |   9 ++
 dev_env/docker-compose.yml                    |   8 ++
 .../unit/scripts/lml-limiter-test-env.test.ts | 129 ++++++++++++++++++
 3 files changed, 146 insertions(+)
 create mode 100644 tests/unit/scripts/lml-limiter-test-env.test.ts

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6134404a..d765161f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -357,6 +357,15 @@ jobs:
           # tubafrenzy-webhook integration spec needs this wired explicitly.
           # Matches the spec's INTERNAL_KEY fallback default.
           ETL_NOTIFY_KEY: test-secret-key
+          # LML client limiter: disable in CI. The integration suite runs
+          # hundreds of fire-and-forget /lookup calls across spec files under
+          # --runInBand; the module-level TokenBucket(50/min) would drain
+          # partway through and subsequent calls would queue ~1200ms each,
+          # timing out metadata-lml tests at 30s (BS#955). Mirrors the
+          # unit-test override convention at
+          # tests/unit/services/lml.client.test.ts:792-793.
+          LML_CLIENT_MAX_CONCURRENT: '10000'
+          LML_CLIENT_RATE_PER_MIN: '60000'
         run: |
           node dev_env/mock-api-server/dist/server.js > /tmp/mock-api.log 2>&1 &
           node apps/auth/dist/app.js > /tmp/auth.log 2>&1 &
diff --git a/dev_env/docker-compose.yml b/dev_env/docker-compose.yml
index 51b03ed4..64d6789d 100644
--- a/dev_env/docker-compose.yml
+++ b/dev_env/docker-compose.yml
@@ -185,6 +185,14 @@ services:
       - METADATA_ALBUM_CACHE_MAX_SIZE=${METADATA_ALBUM_CACHE_MAX_SIZE:-1000}
       - METADATA_ARTIST_CACHE_MAX_SIZE=${METADATA_ARTIST_CACHE_MAX_SIZE:-500}
       - METADATA_ROTATION_PRIORITY=${METADATA_ROTATION_PRIORITY:-2.0}
+      # LML client limiter: disable in CI. The integration suite runs hundreds
+      # of fire-and-forget /lookup calls across spec files under --runInBand;
+      # the module-level TokenBucket(50/min) would drain partway through and
+      # subsequent calls would queue ~1200ms each, timing out metadata-lml
+      # tests at 30s (BS#955). Mirrors the unit-test override convention at
+      # tests/unit/services/lml.client.test.ts:792-793.
+      - LML_CLIENT_MAX_CONCURRENT=10000
+      - LML_CLIENT_RATE_PER_MIN=60000
       - USE_MOCK_SERVICES=false
       # Anonymous device authentication
       - ANON_DEVICE_JWT_SECRET=${ANON_DEVICE_JWT_SECRET:-ci-test-secret-key-for-anonymous-devices}
diff --git a/tests/unit/scripts/lml-limiter-test-env.test.ts b/tests/unit/scripts/lml-limiter-test-env.test.ts
new file mode 100644
index 00000000..b89e60ba
--- /dev/null
+++ b/tests/unit/scripts/lml-limiter-test-env.test.ts
@@ -0,0 +1,129 @@
+/**
+ * Pin that BOTH CI environments disable the LML client's process-wide rate
+ * limiter (BS#955):
+ *
+ *   1. `dev_env/docker-compose.yml` — for `npm run ci:testmock` (local
+ *      docker-based CI repro).
+ *   2. `.github/workflows/test.yml` `Start services` env block — for actual
+ *      GitHub Actions CI, which runs the backend as a host node process and
+ *      does NOT load env from docker-compose.
+ *
+ * These two surfaces drift independently; the local repro is a docker stack
+ * and the production CI is host processes with inline workflow env. Both
+ * need to override or the integration suite regresses.
+ *
+ * Why: G4 (PR #948 / BS#906) added a module-level `Semaphore(5)` +
+ * `TokenBucket(50/min)` to `apps/backend/services/lml/lml.client.ts` as a
+ * prod safety net mirroring LML's Discogs ceilings. In integration tests
+ * (`--runInBand`), the bucket persists across spec files. The full
+ * integration suite triggers > 50 fire-and-forget `/lookup` calls during
+ * earlier suites, draining the bucket; by the time `metadata-lml.spec.js`
+ * runs, every call waits ~1200 ms for a token and the proxy test times out
+ * at 30 s.
+ *
+ * The unit-test author for G4 already set `LML_CLIENT_RATE_PER_MIN: '60000'`
+ * in the unit harness with the comment "effectively no rate cap in tests"
+ * (see `tests/unit/services/lml.client.test.ts:792`). This test pins the
+ * same convention into both CI environments so the next limiter env var
+ * (or a future tweak to G4's defaults) can't silently regress the suite.
+ *
+ * Source-grep test (no docker, no PG) — same style as the adjacent
+ * `init-db-clone-gate.test.ts`.
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+
+const repoRoot = path.resolve(__dirname, '../../..');
+const composePath = path.join(repoRoot, 'dev_env/docker-compose.yml');
+const workflowPath = path.join(repoRoot, '.github/workflows/test.yml');
+
+describe('LML client limiter env vars (BS#955)', () => {
+  describe('dev_env/docker-compose.yml ci backend (local ci:testmock)', () => {
+    const backendBlock = extractServiceBlock('backend');
+
+    it('sets LML_CLIENT_MAX_CONCURRENT to a high value (test override)', () => {
+      // Default in lml.client.ts is 5 (mirrors LML's discogs_max_concurrent).
+      // Local CI needs an effectively-unlimited override so the integration
+      // suite, which serializes hundreds of LML calls under --runInBand,
+      // never queues.
+      const captured = backendBlock.match(/LML_CLIENT_MAX_CONCURRENT=(\d+)/)?.[1];
+      expect(captured).toBeDefined();
+      expect(Number(captured)).toBeGreaterThanOrEqual(1000);
+    });
+
+    it('sets LML_CLIENT_RATE_PER_MIN to a high value (test override)', () => {
+      // Default in lml.client.ts is 50/min (mirrors LML's discogs_rate_limit).
+      // Local CI needs >= 10000/min so the TokenBucket cannot drain across
+      // the full integration suite. Matches the unit-test convention at
+      // tests/unit/services/lml.client.test.ts:792-793.
+      const captured = backendBlock.match(/LML_CLIENT_RATE_PER_MIN=(\d+)/)?.[1];
+      expect(captured).toBeDefined();
+      expect(Number(captured)).toBeGreaterThanOrEqual(10000);
+    });
+  });
+
+  describe('.github/workflows/test.yml Start services env (GHA CI)', () => {
+    // GHA's Start-services step has its own inline `env:` block that's the
+    // sole source of env vars for the host node processes. docker-compose is
+    // never touched on that path.
+    const startServicesBlock = extractStartServicesEnv();
+
+    it('sets LML_CLIENT_MAX_CONCURRENT to a high value (test override)', () => {
+      // String form expected ('10000') because the workflow env block is YAML
+      // — explicit quoting prevents YAML's number-typing from producing a
+      // value the loader handles differently than the compose-file path.
+      const captured = startServicesBlock.match(/LML_CLIENT_MAX_CONCURRENT:\s*['"]?(\d+)['"]?/)?.[1];
+      expect(captured).toBeDefined();
+      expect(Number(captured)).toBeGreaterThanOrEqual(1000);
+    });
+
+    it('sets LML_CLIENT_RATE_PER_MIN to a high value (test override)', () => {
+      const captured = startServicesBlock.match(/LML_CLIENT_RATE_PER_MIN:\s*['"]?(\d+)['"]?/)?.[1];
+      expect(captured).toBeDefined();
+      expect(Number(captured)).toBeGreaterThanOrEqual(10000);
+    });
+  });
+});
+
+/**
+ * Extract a single docker-compose service block by its name. Matches the
+ * `^  <name>:` line and everything up to (but not including) the next
+ * top-level service header (`^  <ident>:`). Throws if the service isn't
+ * found — silent fall-through to '' would let positive-match assertions
+ * trivially fail without surfacing the typo or layout drift.
+ *
+ * The end-of-file anchor `(?![\s\S])` covers the case where the matched
+ * service is the last block in the file; without it the lookahead
+ * `(?=^\s{2}\w)` would fail and the match would return null.
+ */
+function extractServiceBlock(name: string): string {
+  const compose = fs.readFileSync(composePath, 'utf-8');
+  const escaped = name.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
+  const pattern = new RegExp(`^\\s{2}${escaped}:[\\s\\S]*?(?=^\\s{2}\\w|(?![\\s\\S]))`, 'm');
+  const block = compose.match(pattern)?.[0];
+  if (!block) {
+    throw new Error(`docker-compose service block not found: ${name}`);
+  }
+  return block;
+}
+
+/**
+ * Extract the `env:` block of the `Start services` step from the integration
+ * test workflow. Anchors on the step `name:` line and slices up to the `run:`
+ * key — the env keys themselves live in between. Throws if either anchor
+ * isn't found so a workflow rename surfaces immediately.
+ */
+function extractStartServicesEnv(): string {
+  const workflow = fs.readFileSync(workflowPath, 'utf-8');
+  const startIdx = workflow.indexOf('- name: Start services');
+  if (startIdx === -1) {
+    throw new Error('workflow step `Start services` not found in test.yml');
+  }
+  const stepSlice = workflow.slice(startIdx);
+  const runIdx = stepSlice.indexOf('\n        run:');
+  if (runIdx === -1) {
+    throw new Error('workflow step `Start services` has no `run:` key');
+  }
+  return stepSlice.slice(0, runIdx);
+}