diff --git a/video-production/connect-videos/scripts/render.ts b/video-production/connect-videos/scripts/render.ts index 285bc00..3b77de4 100644 --- a/video-production/connect-videos/scripts/render.ts +++ b/video-production/connect-videos/scripts/render.ts @@ -6,7 +6,7 @@ import { execSync } from "node:child_process"; import { loadProgramSpec } from "../src/lib/spec.node"; import { loadDefaults, resolveBeats, type ResolvedTimeline, type ResolvedBeat } from "../src/lib/beats.node"; import { resolveRun, specPath, outputPath } from "../src/lib/runs.node"; -import { synthesize, synthesizePerBeat, type PerBeatNarration } from "../src/lib/voiceover"; +import { synthesize, synthesizePerBeat, readAlignment, wordStartSeconds, type PerBeatNarration } from "../src/lib/voiceover"; import { estimateCaptionTimeline, captionsFromBeats } from "../src/lib/captions"; import { resolveAssetRefs, formatMissingError } from "../src/lib/asset-resolver.node"; @@ -227,7 +227,34 @@ async function main() { for (const b of timeline.beats) { beatOverrides[b.id] = { seconds: b.seconds }; } - const props = { programSlug: cli.program, specYaml, beatOverrides, captions }; + + // Extract cycle-step timestamps from the cycle beat's TTS alignment. + // ElevenLabs' /with-timestamps endpoint returns per-character start + // seconds in the synthesized audio. We look up where the four cycle + // keywords actually start being spoken (case-insensitive stem match: + // "verif" matches verify/verified, "paid" matches paid/paying) and + // pass those as concrete numbers to the Cycle component. With these, + // the highlight transitions on the spoken word — not on a guessed + // proportional position. 
+ const cyclePerBeat = perBeat.find((p) => p.beatId === "cycle"); + let cycleStepStartSeconds: + | { learn?: number; deliver?: number; verify?: number; pay?: number } + | undefined; + if (cyclePerBeat) { + const sidecar = cyclePerBeat.audioPath.replace(/\.mp3$/, ".json"); + const alignment = readAlignment(sidecar); + if (alignment) { + cycleStepStartSeconds = { + learn: wordStartSeconds(alignment, "learn") ?? undefined, + deliver: wordStartSeconds(alignment, "deliver") ?? undefined, + verify: wordStartSeconds(alignment, "verif") ?? undefined, + pay: wordStartSeconds(alignment, "paid") ?? wordStartSeconds(alignment, "pay") ?? undefined, + }; + console.log("Cycle step timings (seconds into cycle audio):", cycleStepStartSeconds); + } + } + + const props = { programSlug: cli.program, specYaml, beatOverrides, captions, cycleStepStartSeconds }; const tmpPropsFile = path.join(os.tmpdir(), `remotion-props-${Date.now()}.json`); fs.writeFileSync(tmpPropsFile, JSON.stringify(props)); const propsArg = `--props=${JSON.stringify(tmpPropsFile)}`; diff --git a/video-production/connect-videos/src/Root.tsx b/video-production/connect-videos/src/Root.tsx index 534cd86..2ae4eba 100644 --- a/video-production/connect-videos/src/Root.tsx +++ b/video-production/connect-videos/src/Root.tsx @@ -31,6 +31,19 @@ interface VideoProps { */ beatOverrides?: Record; captions?: { startFrame: number; endFrame: number; text: string }[]; + /** + * Exact seconds-into-cycle-audio for each cycle keyword, extracted + * from the ElevenLabs alignment data at render time. When present, + * the Intro/Cycle component switches the highlight on the spoken + * word; when absent, falls back to the word-index proportional + * estimate. Studio preview omits this (no audio synth). + */ + cycleStepStartSeconds?: { + learn?: number; + deliver?: number; + verify?: number; + pay?: number; + }; } // Programs registered for Studio preview. 
Add new entries here as program @@ -61,6 +74,7 @@ const ProgramVideo: React.FC = ({ specYaml, beatOverrides, captions = [], + cycleStepStartSeconds, }) => { // Render-CLI path: spec passed verbatim via props. Studio-preview // path: look up the slug in the bundled registry. The render CLI @@ -101,7 +115,12 @@ const ProgramVideo: React.FC = ({ // Cycle highlight syncs to the keyword positions in this // beat's narration ("learn"/"deliver"/"verif"/"pay") so the // ring lights up the right step as the voiceover names it. + // When cycleStepStartSeconds is provided (post-2026-05-19, + // from ElevenLabs alignment), Cycle uses the exact spoken + // timestamps; otherwise it falls back to a word-index + // proportional estimate parsed from the narration text. cycleNarration={spec.narration?.by_beat?.cycle} + cycleStepStartSeconds={cycleStepStartSeconds} /> = ({ tagline }) => { @@ -110,27 +118,63 @@ const Cycle: React.FC<{ durationFrames: number; steps: readonly [string, string, string, string]; narration?: string; -}> = ({ durationFrames, steps, narration }) => { + stepStartSeconds?: { + learn?: number; + deliver?: number; + verify?: number; + pay?: number; + }; +}> = ({ durationFrames, steps, narration, stepStartSeconds }) => { const frame = useCurrentFrame(); - // Reserve the first 12 frames (0.4s @ 30fps) for the stagger-in. - const STAGGER = 12; - const walkBudget = durationFrames - STAGGER; - const positions = keywordPositions(narration); - let activeIndex: number; - if (positions) { - // The highlight switches to step i at positions[i] * walkBudget. - // We pick the highest i whose boundary is already past. - const t = (frame - STAGGER) / walkBudget; // 0..1 - activeIndex = 0; - for (let i = 0; i < 4; i++) { - if (t >= positions[i]) activeIndex = i; + const { fps } = useVideoConfig(); + + // Three timing strategies, in order of preference: + // 1. Concrete second-offsets from ElevenLabs alignment data + // (stepStartSeconds). 
The highlight transitions on the spoken + // word — what the user wants. + // 2. Word-index proportional positions parsed from the narration + // text (positions[]). Reasonable estimate when alignment isn't + // available (e.g. Studio preview, or programs that haven't + // re-rendered post-alignment-switch). + // 3. Even quarters across the beat duration. Last-resort fallback + // when there's no narration text at all. + let activeIndex = 0; + + if (stepStartSeconds && ( + stepStartSeconds.learn !== undefined || + stepStartSeconds.deliver !== undefined || + stepStartSeconds.verify !== undefined || + stepStartSeconds.pay !== undefined + )) { + // The cycle audio runs synchronously with the cycle beat (the mux + // step places the per-beat clip at beat.startFrame). So + // "seconds into cycle audio" == "seconds into this Sequence" == + // frame / fps. + const t = frame / fps; + if (stepStartSeconds.pay !== undefined && t >= stepStartSeconds.pay) { + activeIndex = 3; + } else if (stepStartSeconds.verify !== undefined && t >= stepStartSeconds.verify) { + activeIndex = 2; + } else if (stepStartSeconds.deliver !== undefined && t >= stepStartSeconds.deliver) { + activeIndex = 1; + } else if (stepStartSeconds.learn !== undefined && t >= stepStartSeconds.learn) { + activeIndex = 0; } - activeIndex = Math.min(3, Math.max(0, activeIndex)); } else { - // Fallback: even quarters (legacy behaviour for programs that - // don't provide a cycle narration string). - const stepDuration = walkBudget / 4; - activeIndex = Math.min(3, Math.max(0, Math.floor((frame - STAGGER) / stepDuration))); + // Reserve the first 12 frames for the stagger-in. 
+ const STAGGER = 12; + const walkBudget = durationFrames - STAGGER; + const positions = keywordPositions(narration); + if (positions) { + const t = (frame - STAGGER) / walkBudget; + for (let i = 0; i < 4; i++) { + if (t >= positions[i]) activeIndex = i; + } + } else { + const stepDuration = walkBudget / 4; + activeIndex = Math.floor((frame - STAGGER) / stepDuration); + } + activeIndex = Math.min(3, Math.max(0, activeIndex)); } return ( = ({ programName }) => ( ); -export const Intro: React.FC = ({ programName, brand, beatFrames, cycleNarration }) => ( +export const Intro: React.FC = ({ + programName, + brand, + beatFrames, + cycleNarration, + cycleStepStartSeconds, +}) => ( <> - + diff --git a/video-production/connect-videos/src/lib/voiceover.ts b/video-production/connect-videos/src/lib/voiceover.ts index 205736e..e4be166 100644 --- a/video-production/connect-videos/src/lib/voiceover.ts +++ b/video-production/connect-videos/src/lib/voiceover.ts @@ -1,5 +1,5 @@ import { createHash } from "node:crypto"; -import { existsSync, mkdirSync, writeFileSync } from "node:fs"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import path from "node:path"; import { probeDurationSeconds } from "./probe"; @@ -19,49 +19,124 @@ export interface SynthesizeArgs { fetchImpl?: typeof fetch; } +/** + * Per-character alignment timings returned by ElevenLabs' + * /with-timestamps endpoint. `characters[i]` is the literal character, + * `*_seconds[i]` is when it starts/ends within the synthesized clip. + * Used to compute exact moments for "learn"/"deliver"/"verify"/"paid" + * so the cycle highlight transitions on the spoken word rather than + * on a proportional word-index estimate. + */ +export interface Alignment { + characters: string[]; + character_start_times_seconds: number[]; + character_end_times_seconds: number[]; +} + +/** + * Find the start time (in seconds, relative to the clip) where `word` + * first appears in the alignment. 
/** Matches a single Unicode letter; used for the "start of word" test. */
const LETTER_RE = /\p{L}/u;

/**
 * Lowercase `s` one code point at a time. Whole-string `toLowerCase()`
 * is context-sensitive for a few characters, so folding per code point
 * keeps the stem and the per-character alignment entries comparable.
 */
function foldCase(s: string): string {
  return Array.from(s, (ch) => ch.toLowerCase()).join("");
}

/**
 * Find the start time (in seconds, relative to the clip) where a word
 * beginning with `wordStem` first appears in the alignment.
 *
 * The match is a case-insensitive, literal prefix match at a word
 * start: "verif" matches both "verify" and "verified"; "paid" matches
 * "paid" but not "pay"; "learn" does not match inside "relearn".
 * The stem is compared literally, so regex metacharacters in it are
 * safe.
 *
 * @param alignment - Per-character timings for the synthesized clip.
 * @param wordStem - Leading characters of the word to locate.
 * @returns The start time of the stem's first character, or `null`
 *   when the stem is empty, never occurs at a word start, or has no
 *   timing entry.
 */
export function wordStartSeconds(
  alignment: Alignment,
  wordStem: string,
): number | null {
  const stem = foldCase(wordStem);
  // An empty stem would "match" at offset 0 and report the first
  // character's time, which is never what a caller means.
  if (stem.length === 0) return null;

  // Rebuild the spoken text while recording, for every UTF-16 unit of
  // the rebuilt string, which `characters[]` entry produced it. String
  // offsets and entry indexes diverge when an entry is longer than one
  // unit (emoji) or when lowercasing changes its length.
  let text = "";
  const owner: number[] = [];
  alignment.characters.forEach((entry, entryIndex) => {
    const folded = foldCase(entry);
    text += folded;
    for (let k = 0; k < folded.length; k++) owner.push(entryIndex);
  });

  for (
    let at = text.indexOf(stem);
    at !== -1;
    at = text.indexOf(stem, at + 1)
  ) {
    // A word start is the beginning of the text or a position whose
    // preceding character is not a letter.
    if (at > 0 && LETTER_RE.test(text.charAt(at - 1))) continue;

    const entryIndex = owner[at];
    const start =
      entryIndex === undefined
        ? undefined
        : alignment.character_start_times_seconds[entryIndex];
    return typeof start === "number" ? start : null;
  }
  return null;
}

/** True when `value` is an array of exactly `length` finite numbers. */
function isTimingArray(value: unknown, length: number): value is number[] {
  return (
    Array.isArray(value) &&
    value.length === length &&
    value.every((v) => typeof v === "number" && Number.isFinite(v))
  );
}

/**
 * Runtime check that `value` has the full `Alignment` shape: a string
 * array plus start and end timing arrays of the same length.
 */
function isAlignment(value: unknown): value is Alignment {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  const chars = candidate.characters;
  if (!Array.isArray(chars) || !chars.every((c) => typeof c === "string")) {
    return false;
  }
  return (
    isTimingArray(candidate.character_start_times_seconds, chars.length) &&
    isTimingArray(candidate.character_end_times_seconds, chars.length)
  );
}

/**
 * Read the alignment stored in a clip's JSON sidecar.
 *
 * This is deliberately best-effort: callers treat `null` as "no usable
 * timings" and fall back to another strategy, so nothing is thrown.
 *
 * @param jsonPath - Path to the sidecar JSON file.
 * @returns The sidecar's `alignment` object, or `null` when the file is
 *   missing or unreadable, is not valid JSON, has no `alignment` field
 *   (legacy sidecars), or the field is incomplete or malformed.
 */
export function readAlignment(jsonPath: string): Alignment | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(jsonPath, "utf8"));
  } catch {
    // Missing file or corrupt JSON: same as having no alignment.
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;
  const alignment = (parsed as { alignment?: unknown }).alignment;
  return isAlignment(alignment) ? alignment : null;
}
fetch; - const resp = await fetchImpl( - `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`, - { - method: "POST", - headers: { - "xi-api-key": apiKey, - "content-type": "application/json", - accept: "audio/mpeg", + // Cache hit only when BOTH mp3 and sidecar with alignment exist — + // if the sidecar predates the alignment switch we re-synth so the + // cycle's word-timing path can drive off real character timings. + if (existsSync(mp3Path) && existsSync(jsonPath)) { + const cached = readAlignment(jsonPath); + if (cached) return mp3Path; + } + + let alignment: Alignment | null = null; + + const fetchImpl = args.fetchImpl ?? fetch; + // /with-timestamps returns base64-encoded mp3 + per-character timing. + // Both arrive in one call so the audio bytes and the alignment can't + // disagree (which they would if we synthesized once for the mp3 and + // re-synthesized later just for timings — ElevenLabs samples a fresh + // delivery each time even with stability tuned high). + const resp = await fetchImpl( + `https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`, + { + method: "POST", + headers: { + "xi-api-key": apiKey, + "content-type": "application/json", + accept: "application/json", + }, + body: JSON.stringify({ + text: script, + model_id: model, + voice_settings: { + stability: 0.6, + similarity_boost: 0.45, + style: 0.2, + use_speaker_boost: true, }, - body: JSON.stringify({ - text: script, - model_id: model, - // Softer, more documentary-style delivery: higher stability for - // calmer pacing, lower similarity_boost for a more natural read, - // small style nudge away from a flat baseline. 
- voice_settings: { - stability: 0.6, - similarity_boost: 0.45, - style: 0.2, - use_speaker_boost: true, - }, - }), - } - ); - if (!resp.ok) { - throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`); - } - const buf = Buffer.from(await resp.arrayBuffer()); - writeFileSync(mp3Path, buf); + }), + }, + ); + if (!resp.ok) { + throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`); + } + const payload = (await resp.json()) as { + audio_base64?: string; + alignment?: Alignment; + }; + if (!payload.audio_base64) { + throw new Error("ElevenLabs response missing audio_base64"); } + const buf = Buffer.from(payload.audio_base64, "base64"); + writeFileSync(mp3Path, buf); + alignment = payload.alignment ?? null; - // Always (re)write the sidecar when missing — covers (a) brand-new - // synthesis and (b) pre-sidecar mp3s left over from an earlier render. + // Write the sidecar with alignment data. Legacy fields (voice_id, + // model, text, duration_sec, generated_at) stay so the audio library + // sync still parses them. writeFileSync( jsonPath, JSON.stringify( @@ -71,6 +146,7 @@ export async function synthesize(args: SynthesizeArgs): Promise { text: script, duration_sec: probeDurationSeconds(mp3Path), generated_at: new Date().toISOString(), + alignment, }, null, 2,