jjackson · jjackson · May 19, 2026 · May 19, 2026
diff --git a/video-production/connect-videos/scripts/render.ts b/video-production/connect-videos/scripts/render.ts
@@ -6,7 +6,7 @@ import { execSync } from "node:child_process";
 import { loadProgramSpec } from "../src/lib/spec.node";
 import { loadDefaults, resolveBeats, type ResolvedTimeline, type ResolvedBeat } from "../src/lib/beats.node";
 import { resolveRun, specPath, outputPath } from "../src/lib/runs.node";
-import { synthesize, synthesizePerBeat, type PerBeatNarration } from "../src/lib/voiceover";
+import { synthesize, synthesizePerBeat, readAlignment, wordStartSeconds, type PerBeatNarration } from "../src/lib/voiceover";
 import { estimateCaptionTimeline, captionsFromBeats } from "../src/lib/captions";
 import { resolveAssetRefs, formatMissingError } from "../src/lib/asset-resolver.node";
 
@@ -227,7 +227,34 @@ async function main() {
   for (const b of timeline.beats) {
     beatOverrides[b.id] = { seconds: b.seconds };
   }
-  const props = { programSlug: cli.program, specYaml, beatOverrides, captions };
+
+  // Extract cycle-step timestamps from the cycle beat's TTS alignment.
+  // ElevenLabs' /with-timestamps endpoint returns per-character start
+  // seconds in the synthesized audio. We look up where the four cycle
+  // keywords actually start being spoken (case-insensitive stem match:
+  // "verif" matches verify/verified; "paid" is tried first, then "pay") and
+  // pass those as concrete numbers to the Cycle component. With these,
+  // the highlight transitions on the spoken word — not on a guessed
+  // proportional position.
+  const cyclePerBeat = perBeat.find((p) => p.beatId === "cycle");
+  let cycleStepStartSeconds:
+    | { learn?: number; deliver?: number; verify?: number; pay?: number }
+    | undefined;
+  if (cyclePerBeat) {
+    const sidecar = cyclePerBeat.audioPath.replace(/\.mp3$/, ".json");
+    const alignment = readAlignment(sidecar);
+    if (alignment) {
+      cycleStepStartSeconds = {
+        learn: wordStartSeconds(alignment, "learn") ?? undefined,
+        deliver: wordStartSeconds(alignment, "deliver") ?? undefined,
+        verify: wordStartSeconds(alignment, "verif") ?? undefined,
+        pay: wordStartSeconds(alignment, "paid") ?? wordStartSeconds(alignment, "pay") ?? undefined,
+      };
+      console.log("Cycle step timings (seconds into cycle audio):", cycleStepStartSeconds);
+    }
+  }
+
+  const props = { programSlug: cli.program, specYaml, beatOverrides, captions, cycleStepStartSeconds };
   const tmpPropsFile = path.join(os.tmpdir(), `remotion-props-${Date.now()}.json`);
   fs.writeFileSync(tmpPropsFile, JSON.stringify(props));
   const propsArg = `--props=${JSON.stringify(tmpPropsFile)}`;

diff --git a/video-production/connect-videos/src/Root.tsx b/video-production/connect-videos/src/Root.tsx
@@ -31,6 +31,19 @@ interface VideoProps {
    */
   beatOverrides?: Record<string, { seconds?: number }>;
   captions?: { startFrame: number; endFrame: number; text: string }[];
+  /**
+   * Exact seconds-into-cycle-audio for each cycle keyword, extracted
+   * from the ElevenLabs alignment data at render time. When present,
+   * the Intro/Cycle component switches the highlight on the spoken
+   * word; when absent, falls back to the word-index proportional
+   * estimate. Studio preview omits this (no audio synth).
+   */
+  cycleStepStartSeconds?: {
+    learn?: number;
+    deliver?: number;
+    verify?: number;
+    pay?: number;
+  };
 }
 
 // Programs registered for Studio preview. Add new entries here as program
@@ -61,6 +74,7 @@ const ProgramVideo: React.FC<VideoProps> = ({
   specYaml,
   beatOverrides,
   captions = [],
+  cycleStepStartSeconds,
 }) => {
   // Render-CLI path: spec passed verbatim via props. Studio-preview
   // path: look up the slug in the bundled registry. The render CLI
@@ -101,7 +115,12 @@ const ProgramVideo: React.FC<VideoProps> = ({
           // Cycle highlight syncs to the keyword positions in this
           // beat's narration ("learn"/"deliver"/"verif"/"pay") so the
           // ring lights up the right step as the voiceover names it.
+          // When cycleStepStartSeconds is provided (post-2026-05-19,
+          // from ElevenLabs alignment), Cycle uses the exact spoken
+          // timestamps; otherwise it falls back to a word-index
+          // proportional estimate parsed from the narration text.
           cycleNarration={spec.narration?.by_beat?.cycle}
+          cycleStepStartSeconds={cycleStepStartSeconds}
         />
       </Sequence>
       <Sequence

diff --git a/video-production/connect-videos/src/compositions/Intro.tsx b/video-production/connect-videos/src/compositions/Intro.tsx
@@ -12,12 +12,20 @@ interface Props {
   programName: string;
   brand: Brand;
   beatFrames: { hook: number; cycle: number; handoff: number };
-  // Optional narration text for the cycle beat. When provided, the
-  // cycle highlight walks at the timestamps where "learn", "deliver",
-  // "verify(ied)" and "pay(ied/paid)" appear in the audio (estimated
-  // proportional to word position). Without it, falls back to evenly
-  // spaced quarters.
+  // Optional narration text for the cycle beat. Used only when
+  // cycleStepStartSeconds isn't provided — falls back to the
+  // word-index proportional estimate.
   cycleNarration?: string;
+  // Exact seconds-into-cycle-audio at which each spoken keyword
+  // starts. Extracted from ElevenLabs' alignment data at render time
+  // (see voiceover.ts::wordStartSeconds). When provided, the highlight
+  // transitions on the spoken word — no estimation.
+  cycleStepStartSeconds?: {
+    learn?: number;
+    deliver?: number;
+    verify?: number;
+    pay?: number;
+  };
 }
 
 const Hook: React.FC<{ tagline: string }> = ({ tagline }) => {
@@ -110,27 +118,63 @@ const Cycle: React.FC<{
   durationFrames: number;
   steps: readonly [string, string, string, string];
   narration?: string;
-}> = ({ durationFrames, steps, narration }) => {
+  stepStartSeconds?: {
+    learn?: number;
+    deliver?: number;
+    verify?: number;
+    pay?: number;
+  };
+}> = ({ durationFrames, steps, narration, stepStartSeconds }) => {
   const frame = useCurrentFrame();
-  // Reserve the first 12 frames (0.4s @ 30fps) for the stagger-in.
-  const STAGGER = 12;
-  const walkBudget = durationFrames - STAGGER;
-  const positions = keywordPositions(narration);
-  let activeIndex: number;
-  if (positions) {
-    // The highlight switches to step i at positions[i] * walkBudget.
-    // We pick the highest i whose boundary is already past.
-    const t = (frame - STAGGER) / walkBudget; // 0..1
-    activeIndex = 0;
-    for (let i = 0; i < 4; i++) {
-      if (t >= positions[i]) activeIndex = i;
+  const { fps } = useVideoConfig();
+
+  // Three timing strategies, in order of preference:
+  //   1. Concrete second-offsets from ElevenLabs alignment data
+  //      (stepStartSeconds). The highlight transitions on the spoken
+  //      word — what the user wants.
+  //   2. Word-index proportional positions parsed from the narration
+  //      text (positions[]). Reasonable estimate when alignment isn't
+  //      available (e.g. Studio preview, or programs that haven't
+  //      re-rendered post-alignment-switch).
+  //   3. Even quarters across the beat duration. Last-resort fallback
+  //      when there's no narration text at all.
+  let activeIndex = 0;
+
+  if (stepStartSeconds && (
+    stepStartSeconds.learn !== undefined ||
+    stepStartSeconds.deliver !== undefined ||
+    stepStartSeconds.verify !== undefined ||
+    stepStartSeconds.pay !== undefined
+  )) {
+    // The cycle audio runs synchronously with the cycle beat (the mux
+    // step places the per-beat clip at beat.startFrame). So
+    // "seconds into cycle audio" == "seconds into this Sequence" ==
+    // frame / fps.
+    const t = frame / fps;
+    if (stepStartSeconds.pay !== undefined && t >= stepStartSeconds.pay) {
+      activeIndex = 3;
+    } else if (stepStartSeconds.verify !== undefined && t >= stepStartSeconds.verify) {
+      activeIndex = 2;
+    } else if (stepStartSeconds.deliver !== undefined && t >= stepStartSeconds.deliver) {
+      activeIndex = 1;
+    } else if (stepStartSeconds.learn !== undefined && t >= stepStartSeconds.learn) {
+      activeIndex = 0;
     }
-    activeIndex = Math.min(3, Math.max(0, activeIndex));
   } else {
-    // Fallback: even quarters (legacy behaviour for programs that
-    // don't provide a cycle narration string).
-    const stepDuration = walkBudget / 4;
-    activeIndex = Math.min(3, Math.max(0, Math.floor((frame - STAGGER) / stepDuration)));
+    // Reserve the first 12 frames for the stagger-in.
+    const STAGGER = 12;
+    const walkBudget = durationFrames - STAGGER;
+    const positions = keywordPositions(narration);
+    if (positions) {
+      const t = (frame - STAGGER) / walkBudget;
+      for (let i = 0; i < 4; i++) {
+        if (t >= positions[i]) activeIndex = i;
+      }
+    } else {
+      const stepDuration = walkBudget / 4;
+      activeIndex = Math.floor((frame - STAGGER) / stepDuration);
+    }
+    activeIndex = Math.min(3, Math.max(0, activeIndex));
   }
   return (
     <AbsoluteFill
@@ -169,13 +213,24 @@ const Handoff: React.FC<{ programName: string }> = ({ programName }) => (
   </AbsoluteFill>
 );
 
-export const Intro: React.FC<Props> = ({ programName, brand, beatFrames, cycleNarration }) => (
+export const Intro: React.FC<Props> = ({
+  programName,
+  brand,
+  beatFrames,
+  cycleNarration,
+  cycleStepStartSeconds,
+}) => (
   <>
     <Sequence durationInFrames={beatFrames.hook}>
       <Hook tagline={brand.tagline} />
     </Sequence>
     <Sequence from={beatFrames.hook} durationInFrames={beatFrames.cycle}>
-      <Cycle durationFrames={beatFrames.cycle} steps={brand.cycleSteps} narration={cycleNarration} />
+      <Cycle
+        durationFrames={beatFrames.cycle}
+        steps={brand.cycleSteps}
+        narration={cycleNarration}
+        stepStartSeconds={cycleStepStartSeconds}
+      />
     </Sequence>
     <Sequence from={beatFrames.hook + beatFrames.cycle} durationInFrames={beatFrames.handoff}>
       <Handoff programName={programName} />

diff --git a/video-production/connect-videos/src/lib/voiceover.ts b/video-production/connect-videos/src/lib/voiceover.ts
@@ -1,5 +1,5 @@
 import { createHash } from "node:crypto";
-import { existsSync, mkdirSync, writeFileSync } from "node:fs";
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
 import path from "node:path";
 import { probeDurationSeconds } from "./probe";
 
@@ -19,49 +19,124 @@ export interface SynthesizeArgs {
   fetchImpl?: typeof fetch;
 }
 
+/**
+ * Per-character alignment timings returned by ElevenLabs'
+ * /with-timestamps endpoint. Arrays are index-aligned: `characters[i]`
+ * is the literal character and `*_seconds[i]` is when it starts/ends
+ * within the synthesized clip. Used to compute exact moments for
+ * "learn"/"deliver"/"verify"/"paid" so the cycle highlight transitions
+ * on the spoken word rather than on a proportional word-index estimate.
+ */
+export interface Alignment {
+  characters: string[]; // literal characters of the synthesized text
+  character_start_times_seconds: number[]; // start of characters[i], in seconds
+  character_end_times_seconds: number[]; // end of characters[i], in seconds
+}
+
+/**
+ * Start time (seconds, relative to the clip) of the first word that
+ * begins with `wordStem`. Case-insensitive prefix match at a word
+ * boundary: "verif" matches "verify" and "verified"; "paid" matches
+ * "paid" but not "pay". The stem is matched literally. Returns `null`
+ * for an empty stem, when no word starts with the stem, or when the
+ * alignment has no start time at the matched index.
+ */
+export function wordStartSeconds(
+  alignment: Alignment, wordStem: string,
+): number | null {
+  const stem = wordStem.toLowerCase();
+  if (stem === "") return null;
+  // One code unit per alignment entry keeps string index == array index
+  // even if an entry is empty or longer than one UTF-16 unit.
+  const text = alignment.characters
+    .map((ch) => ch.toLowerCase().charAt(0) || " ")
+    .join("");
+  // Literal stem at a word boundary (string start or a non-letter).
+  const escaped = stem.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const m = new RegExp(`(^|[^a-z])${escaped}`).exec(text);
+  if (!m) return null;
+  const idx = m.index + (m[1]?.length ?? 0);
+  return alignment.character_start_times_seconds[idx] ?? null;
+}
+
+/**
+ * Read the alignment sidecar for an already-synthesized clip. Returns
+ * `null` if the file is missing, unparsable, or lacks usable per-character
+ * timings (legacy sidecars don't carry `alignment`).
+ */
+export function readAlignment(jsonPath: string): Alignment | null {
+  try {
+    const a = JSON.parse(readFileSync(jsonPath, "utf8"))?.alignment;
+    const starts = a?.character_start_times_seconds;
+    // Start times are indexed by character position, so require both.
+    if (Array.isArray(a?.characters) && Array.isArray(starts)) {
+      return a as Alignment;
+    }
+  } catch { /* missing or malformed sidecar: treated as no alignment */ }
+  return null;
+}
+
 export async function synthesize(args: SynthesizeArgs): Promise<string> {
   const { script, voiceId, model, cacheDir, apiKey } = args;
   const key = cacheKey(script, voiceId, model);
   mkdirSync(cacheDir, { recursive: true });
   const mp3Path = path.join(cacheDir, `${key}.mp3`);
   const jsonPath = path.join(cacheDir, `${key}.json`);
-  if (existsSync(mp3Path) && existsSync(jsonPath)) return mp3Path;
 
-  if (!existsSync(mp3Path)) {
-    const fetchImpl = args.fetchImpl ?? fetch;
-    const resp = await fetchImpl(
-      `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
-      {
-        method: "POST",
-        headers: {
-          "xi-api-key": apiKey,
-          "content-type": "application/json",
-          accept: "audio/mpeg",
+  // Cache hit only when BOTH mp3 and sidecar with alignment exist —
+  // if the sidecar predates the alignment switch we re-synth so the
+  // cycle's word-timing path can drive off real character timings.
+  if (existsSync(mp3Path) && existsSync(jsonPath)) {
+    const cached = readAlignment(jsonPath);
+    if (cached) return mp3Path;
+  }
+
+  let alignment: Alignment | null = null;
+
+  const fetchImpl = args.fetchImpl ?? fetch;
+  // /with-timestamps returns base64-encoded mp3 + per-character timing.
+  // Both arrive in one call so the audio bytes and the alignment can't
+  // disagree (which they would if we synthesized once for the mp3 and
+  // re-synthesized later just for timings — ElevenLabs samples a fresh
+  // delivery each time even with stability tuned high).
+  const resp = await fetchImpl(
+    `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
+    {
+      method: "POST",
+      headers: {
+        "xi-api-key": apiKey,
+        "content-type": "application/json",
+        accept: "application/json",
+      },
+      body: JSON.stringify({
+        text: script,
+        model_id: model,
+        voice_settings: {
+          stability: 0.6,
+          similarity_boost: 0.45,
+          style: 0.2,
+          use_speaker_boost: true,
         },
-        body: JSON.stringify({
-          text: script,
-          model_id: model,
-          // Softer, more documentary-style delivery: higher stability for
-          // calmer pacing, lower similarity_boost for a more natural read,
-          // small style nudge away from a flat baseline.
-          voice_settings: {
-            stability: 0.6,
-            similarity_boost: 0.45,
-            style: 0.2,
-            use_speaker_boost: true,
-          },
-        }),
-      }
-    );
-    if (!resp.ok) {
-      throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`);
-    }
-    const buf = Buffer.from(await resp.arrayBuffer());
-    writeFileSync(mp3Path, buf);
+      }),
+    },
+  );
+  if (!resp.ok) {
+    throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`);
+  }
+  const payload = (await resp.json()) as {
+    audio_base64?: string;
+    alignment?: Alignment;
+  };
+  if (!payload.audio_base64) {
+    throw new Error("ElevenLabs response missing audio_base64");
   }
+  const buf = Buffer.from(payload.audio_base64, "base64");
+  writeFileSync(mp3Path, buf);
+  alignment = payload.alignment ?? null;
 
-  // Always (re)write the sidecar when missing — covers (a) brand-new
-  // synthesis and (b) pre-sidecar mp3s left over from an earlier render.
+  // Write the sidecar with alignment data. Legacy fields (voice_id,
+  // model, text, duration_sec, generated_at) stay so the audio library
+  // sync still parses them.
   writeFileSync(
     jsonPath,
     JSON.stringify(
@@ -71,6 +146,7 @@ export async function synthesize(args: SynthesizeArgs): Promise<string> {
         text: script,
         duration_sec: probeDurationSeconds(mp3Path),
         generated_at: new Date().toISOString(),
+        alignment,
       },
       null,
       2,