Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 29 additions & 2 deletions video-production/connect-videos/scripts/render.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { execSync } from "node:child_process";
import { loadProgramSpec } from "../src/lib/spec.node";
import { loadDefaults, resolveBeats, type ResolvedTimeline, type ResolvedBeat } from "../src/lib/beats.node";
import { resolveRun, specPath, outputPath } from "../src/lib/runs.node";
import { synthesize, synthesizePerBeat, type PerBeatNarration } from "../src/lib/voiceover";
import { synthesize, synthesizePerBeat, readAlignment, wordStartSeconds, type PerBeatNarration } from "../src/lib/voiceover";
import { estimateCaptionTimeline, captionsFromBeats } from "../src/lib/captions";
import { resolveAssetRefs, formatMissingError } from "../src/lib/asset-resolver.node";

Expand Down Expand Up @@ -227,7 +227,34 @@ async function main() {
for (const b of timeline.beats) {
beatOverrides[b.id] = { seconds: b.seconds };
}
const props = { programSlug: cli.program, specYaml, beatOverrides, captions };

// Extract cycle-step timestamps from the cycle beat's TTS alignment.
// ElevenLabs' /with-timestamps endpoint returns per-character start
// seconds in the synthesized audio. We look up where the four cycle
// keywords actually start being spoken (case-insensitive stem match:
// "verif" matches verify/verified; "paid" is tried first, then "pay" for pay/paying) and
// pass those as concrete numbers to the Cycle component. With these,
// the highlight transitions on the spoken word — not on a guessed
// proportional position.
const cyclePerBeat = perBeat.find((p) => p.beatId === "cycle");
let cycleStepStartSeconds:
| { learn?: number; deliver?: number; verify?: number; pay?: number }
| undefined;
if (cyclePerBeat) {
const sidecar = cyclePerBeat.audioPath.replace(/\.mp3$/, ".json");
const alignment = readAlignment(sidecar);
if (alignment) {
cycleStepStartSeconds = {
learn: wordStartSeconds(alignment, "learn") ?? undefined,
deliver: wordStartSeconds(alignment, "deliver") ?? undefined,
verify: wordStartSeconds(alignment, "verif") ?? undefined,
pay: wordStartSeconds(alignment, "paid") ?? wordStartSeconds(alignment, "pay") ?? undefined,
};
console.log("Cycle step timings (seconds into cycle audio):", cycleStepStartSeconds);
}
}

const props = { programSlug: cli.program, specYaml, beatOverrides, captions, cycleStepStartSeconds };
const tmpPropsFile = path.join(os.tmpdir(), `remotion-props-${Date.now()}.json`);
fs.writeFileSync(tmpPropsFile, JSON.stringify(props));
const propsArg = `--props=${JSON.stringify(tmpPropsFile)}`;
Expand Down
19 changes: 19 additions & 0 deletions video-production/connect-videos/src/Root.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ interface VideoProps {
*/
beatOverrides?: Record<string, { seconds?: number }>;
captions?: { startFrame: number; endFrame: number; text: string }[];
/**
* Exact seconds-into-cycle-audio for each cycle keyword, extracted
* from the ElevenLabs alignment data at render time. When present,
* the Intro/Cycle component switches the highlight on the spoken
* word; when absent, falls back to the word-index proportional
* estimate. Studio preview omits this (no audio synth).
*/
cycleStepStartSeconds?: {
learn?: number;
deliver?: number;
verify?: number;
pay?: number;
};
}

// Programs registered for Studio preview. Add new entries here as program
Expand Down Expand Up @@ -61,6 +74,7 @@ const ProgramVideo: React.FC<VideoProps> = ({
specYaml,
beatOverrides,
captions = [],
cycleStepStartSeconds,
}) => {
// Render-CLI path: spec passed verbatim via props. Studio-preview
// path: look up the slug in the bundled registry. The render CLI
Expand Down Expand Up @@ -101,7 +115,12 @@ const ProgramVideo: React.FC<VideoProps> = ({
// Cycle highlight syncs to the keyword positions in this
// beat's narration ("learn"/"deliver"/"verif"/"pay") so the
// ring lights up the right step as the voiceover names it.
// When cycleStepStartSeconds is provided (post-2026-05-19,
// from ElevenLabs alignment), Cycle uses the exact spoken
// timestamps; otherwise it falls back to a word-index
// proportional estimate parsed from the narration text.
cycleNarration={spec.narration?.by_beat?.cycle}
cycleStepStartSeconds={cycleStepStartSeconds}
/>
</Sequence>
<Sequence
Expand Down
105 changes: 80 additions & 25 deletions video-production/connect-videos/src/compositions/Intro.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,20 @@ interface Props {
programName: string;
brand: Brand;
beatFrames: { hook: number; cycle: number; handoff: number };
// Optional narration text for the cycle beat. When provided, the
// cycle highlight walks at the timestamps where "learn", "deliver",
// "verify(ied)" and "pay(ied/paid)" appear in the audio (estimated
// proportional to word position). Without it, falls back to evenly
// spaced quarters.
// Optional narration text for the cycle beat. Used only when
// cycleStepStartSeconds isn't provided — falls back to the
// word-index proportional estimate.
cycleNarration?: string;
// Exact seconds-into-cycle-audio at which each spoken keyword
// starts. Extracted from ElevenLabs' alignment data at render time
// (see voiceover.ts::wordStartSeconds). When provided, the highlight
// transitions on the spoken word — no estimation.
cycleStepStartSeconds?: {
learn?: number;
deliver?: number;
verify?: number;
pay?: number;
};
}

const Hook: React.FC<{ tagline: string }> = ({ tagline }) => {
Expand Down Expand Up @@ -110,27 +118,63 @@ const Cycle: React.FC<{
durationFrames: number;
steps: readonly [string, string, string, string];
narration?: string;
}> = ({ durationFrames, steps, narration }) => {
stepStartSeconds?: {
learn?: number;
deliver?: number;
verify?: number;
pay?: number;
};
}> = ({ durationFrames, steps, narration, stepStartSeconds }) => {
const frame = useCurrentFrame();
// Reserve the first 12 frames (0.4s @ 30fps) for the stagger-in.
const STAGGER = 12;
const walkBudget = durationFrames - STAGGER;
const positions = keywordPositions(narration);
let activeIndex: number;
if (positions) {
// The highlight switches to step i at positions[i] * walkBudget.
// We pick the highest i whose boundary is already past.
const t = (frame - STAGGER) / walkBudget; // 0..1
activeIndex = 0;
for (let i = 0; i < 4; i++) {
if (t >= positions[i]) activeIndex = i;
const { fps } = useVideoConfig();

// Three timing strategies, in order of preference:
// 1. Concrete second-offsets from ElevenLabs alignment data
// (stepStartSeconds). The highlight transitions on the spoken
// word — what the user wants.
// 2. Word-index proportional positions parsed from the narration
// text (positions[]). Reasonable estimate when alignment isn't
// available (e.g. Studio preview, or programs that haven't
// re-rendered post-alignment-switch).
// 3. Even quarters across the beat duration. Last-resort fallback
// when there's no narration text at all.
let activeIndex = 0;

if (stepStartSeconds && (
stepStartSeconds.learn !== undefined ||
stepStartSeconds.deliver !== undefined ||
stepStartSeconds.verify !== undefined ||
stepStartSeconds.pay !== undefined
)) {
// The cycle audio runs synchronously with the cycle beat (the mux
// step places the per-beat clip at beat.startFrame). So
// "seconds into cycle audio" == "seconds into this Sequence" ==
// frame / fps.
const t = frame / fps;
if (stepStartSeconds.pay !== undefined && t >= stepStartSeconds.pay) {
activeIndex = 3;
} else if (stepStartSeconds.verify !== undefined && t >= stepStartSeconds.verify) {
activeIndex = 2;
} else if (stepStartSeconds.deliver !== undefined && t >= stepStartSeconds.deliver) {
activeIndex = 1;
} else if (stepStartSeconds.learn !== undefined && t >= stepStartSeconds.learn) {
activeIndex = 0;
}
activeIndex = Math.min(3, Math.max(0, activeIndex));
} else {
// Fallback: even quarters (legacy behaviour for programs that
// don't provide a cycle narration string).
const stepDuration = walkBudget / 4;
activeIndex = Math.min(3, Math.max(0, Math.floor((frame - STAGGER) / stepDuration)));
// Reserve the first 12 frames for the stagger-in.
const STAGGER = 12;
const walkBudget = durationFrames - STAGGER;
const positions = keywordPositions(narration);
if (positions) {
const t = (frame - STAGGER) / walkBudget;
for (let i = 0; i < 4; i++) {
if (t >= positions[i]) activeIndex = i;
}
} else {
const stepDuration = walkBudget / 4;
activeIndex = Math.floor((frame - STAGGER) / stepDuration);
}
activeIndex = Math.min(3, Math.max(0, activeIndex));
}
return (
<AbsoluteFill
Expand Down Expand Up @@ -169,13 +213,24 @@ const Handoff: React.FC<{ programName: string }> = ({ programName }) => (
</AbsoluteFill>
);

export const Intro: React.FC<Props> = ({ programName, brand, beatFrames, cycleNarration }) => (
export const Intro: React.FC<Props> = ({
programName,
brand,
beatFrames,
cycleNarration,
cycleStepStartSeconds,
}) => (
<>
<Sequence durationInFrames={beatFrames.hook}>
<Hook tagline={brand.tagline} />
</Sequence>
<Sequence from={beatFrames.hook} durationInFrames={beatFrames.cycle}>
<Cycle durationFrames={beatFrames.cycle} steps={brand.cycleSteps} narration={cycleNarration} />
<Cycle
durationFrames={beatFrames.cycle}
steps={brand.cycleSteps}
narration={cycleNarration}
stepStartSeconds={cycleStepStartSeconds}
/>
</Sequence>
<Sequence from={beatFrames.hook + beatFrames.cycle} durationInFrames={beatFrames.handoff}>
<Handoff programName={programName} />
Expand Down
144 changes: 110 additions & 34 deletions video-production/connect-videos/src/lib/voiceover.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { createHash } from "node:crypto";
import { existsSync, mkdirSync, writeFileSync } from "node:fs";
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import path from "node:path";
import { probeDurationSeconds } from "./probe";

Expand All @@ -19,49 +19,124 @@ export interface SynthesizeArgs {
fetchImpl?: typeof fetch;
}

/**
 * Per-character alignment timings returned by ElevenLabs'
 * /with-timestamps endpoint. `characters[i]` is the literal character,
 * `*_seconds[i]` is when it starts/ends within the synthesized clip.
 * Used to compute exact moments for "learn"/"deliver"/"verify"/"paid"
 * so the cycle highlight transitions on the spoken word rather than
 * on a proportional word-index estimate.
 */
export interface Alignment {
  characters: string[];
  character_start_times_seconds: number[];
  character_end_times_seconds: number[];
}

/**
 * Find the start time (in seconds, relative to the clip) where a word
 * beginning with `wordStem` first appears in the alignment.
 *
 * Matching is a case-insensitive, literal prefix match anchored at a
 * word start (start of text, or preceded by a character outside a-z):
 * "verif" matches both "verify" and "verified"; "paid" matches "paid"
 * but not "pay"; "learn" does not match inside "relearn".
 *
 * @param alignment Per-character timings for the synthesized clip.
 * @param wordStem  Literal prefix to look for. Regex metacharacters are
 *                  matched verbatim, never interpreted as a pattern.
 * @returns The start time of the first character of the match, or
 *          `null` when the stem is empty, never appears, or has no
 *          numeric start time recorded for that character.
 */
export function wordStartSeconds(
  alignment: Alignment, wordStem: string,
): number | null {
  const stem = wordStem.toLowerCase();
  // An empty stem would "match" at offset 0 and report the clip start,
  // which is never a meaningful keyword position.
  if (stem.length === 0) return null;

  // Rebuild the spoken text while remembering which `characters[]`
  // entry produced each UTF-16 code unit. An entry can span more than
  // one code unit (emoji, astral characters, lowercasing that changes
  // length), so a string offset is not necessarily an array index.
  const entryAtOffset: number[] = [];
  let text = "";
  for (const [entryIndex, raw] of alignment.characters.entries()) {
    const lowered = raw.toLowerCase();
    text += lowered;
    for (let unit = 0; unit < lowered.length; unit++) {
      entryAtOffset.push(entryIndex);
    }
  }

  // Escape the stem so inputs such as "c++" or "(" are searched for
  // literally instead of producing an invalid or over-broad pattern.
  const literalStem = stem.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Word start: beginning of text, or preceded by a non-letter.
  const match = new RegExp(`(^|[^a-z])${literalStem}`).exec(text);
  if (!match) return null;

  // Skip the captured boundary character (if any) to land on the first
  // character of the word itself.
  const offset = match.index + (match[1]?.length ?? 0);
  const entryIndex = entryAtOffset[offset];
  if (entryIndex === undefined) return null;

  const start = alignment.character_start_times_seconds[entryIndex];
  return typeof start === "number" ? start : null;
}

/**
 * Read the alignment sidecar for an already-synthesized clip.
 *
 * Best-effort by design: any read or parse failure yields `null`
 * instead of throwing, so callers can treat "no usable alignment" as a
 * single case.
 *
 * @param jsonPath Path to the sidecar JSON file.
 * @returns The sidecar's `alignment` object, or `null` when the file is
 *          missing or unreadable, is not valid JSON, carries no
 *          `alignment` (legacy sidecars written before the timestamps
 *          endpoint switch), or carries an `alignment` whose three
 *          arrays are absent, mistyped, or of differing lengths.
 */
export function readAlignment(jsonPath: string): Alignment | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(jsonPath, "utf8"));
  } catch {
    // Missing file, unreadable file, or malformed JSON: no alignment.
    return null;
  }
  if (typeof parsed !== "object" || parsed === null) return null;

  const candidate = (parsed as { alignment?: unknown }).alignment;
  if (typeof candidate !== "object" || candidate === null) return null;

  const {
    characters,
    character_start_times_seconds: starts,
    character_end_times_seconds: ends,
  } = candidate as Partial<Record<keyof Alignment, unknown>>;

  // Validate the full shape rather than just `characters`: consumers
  // index the timing arrays by character position, so a partial or
  // ragged alignment must be rejected here instead of crashing later.
  const isNumberArray = (value: unknown): value is number[] =>
    Array.isArray(value) && value.every((item) => typeof item === "number");
  if (
    !Array.isArray(characters) ||
    !characters.every((item) => typeof item === "string") ||
    !isNumberArray(starts) ||
    !isNumberArray(ends) ||
    starts.length !== characters.length ||
    ends.length !== characters.length
  ) {
    return null;
  }

  return candidate as Alignment;
}

export async function synthesize(args: SynthesizeArgs): Promise<string> {
const { script, voiceId, model, cacheDir, apiKey } = args;
const key = cacheKey(script, voiceId, model);
mkdirSync(cacheDir, { recursive: true });
const mp3Path = path.join(cacheDir, `${key}.mp3`);
const jsonPath = path.join(cacheDir, `${key}.json`);
if (existsSync(mp3Path) && existsSync(jsonPath)) return mp3Path;

if (!existsSync(mp3Path)) {
const fetchImpl = args.fetchImpl ?? fetch;
const resp = await fetchImpl(
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}`,
{
method: "POST",
headers: {
"xi-api-key": apiKey,
"content-type": "application/json",
accept: "audio/mpeg",
// Cache hit only when BOTH mp3 and sidecar with alignment exist —
// if the sidecar predates the alignment switch we re-synth so the
// cycle's word-timing path can drive off real character timings.
if (existsSync(mp3Path) && existsSync(jsonPath)) {
const cached = readAlignment(jsonPath);
if (cached) return mp3Path;
}

let alignment: Alignment | null = null;

const fetchImpl = args.fetchImpl ?? fetch;
// /with-timestamps returns base64-encoded mp3 + per-character timing.
// Both arrive in one call so the audio bytes and the alignment can't
// disagree (which they would if we synthesized once for the mp3 and
// re-synthesized later just for timings — ElevenLabs samples a fresh
// delivery each time even with stability tuned high).
const resp = await fetchImpl(
`https://api.elevenlabs.io/v1/text-to-speech/${voiceId}/with-timestamps`,
{
method: "POST",
headers: {
"xi-api-key": apiKey,
"content-type": "application/json",
accept: "application/json",
},
body: JSON.stringify({
text: script,
model_id: model,
voice_settings: {
stability: 0.6,
similarity_boost: 0.45,
style: 0.2,
use_speaker_boost: true,
},
body: JSON.stringify({
text: script,
model_id: model,
// Softer, more documentary-style delivery: higher stability for
// calmer pacing, lower similarity_boost for a more natural read,
// small style nudge away from a flat baseline.
voice_settings: {
stability: 0.6,
similarity_boost: 0.45,
style: 0.2,
use_speaker_boost: true,
},
}),
}
);
if (!resp.ok) {
throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`);
}
const buf = Buffer.from(await resp.arrayBuffer());
writeFileSync(mp3Path, buf);
}),
},
);
if (!resp.ok) {
throw new Error(`ElevenLabs HTTP ${resp.status}: ${await safeText(resp)}`);
}
const payload = (await resp.json()) as {
audio_base64?: string;
alignment?: Alignment;
};
if (!payload.audio_base64) {
throw new Error("ElevenLabs response missing audio_base64");
}
const buf = Buffer.from(payload.audio_base64, "base64");
writeFileSync(mp3Path, buf);
alignment = payload.alignment ?? null;

// Always (re)write the sidecar when missing — covers (a) brand-new
// synthesis and (b) pre-sidecar mp3s left over from an earlier render.
// Write the sidecar with alignment data. Legacy fields (voice_id,
// model, text, duration_sec, generated_at) stay so the audio library
// sync still parses them.
writeFileSync(
jsonPath,
JSON.stringify(
Expand All @@ -71,6 +146,7 @@ export async function synthesize(args: SynthesizeArgs): Promise<string> {
text: script,
duration_sec: probeDurationSeconds(mp3Path),
generated_at: new Date().toISOString(),
alignment,
},
null,
2,
Expand Down
Loading