/**
* ElevenLabs Text-to-Speech Service
*
* Converts script text into MP3 audio using the ElevenLabs TTS API v1.
* Part of the CodingCat.dev automated video pipeline:
* script text → ElevenLabs TTS → MP3 audio → upload to GCS → Remotion render
*/
import {
aggregateToWordTimestamps,
type CharacterAlignment,
type WordTimestamp,
type SceneAudioResult,
} from "@/lib/utils/audio-timestamps";
import { getConfigValue } from "@/lib/config";
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";
/** Configuration for the ElevenLabs TTS service. */
export type ElevenLabsConfig = {
/** ElevenLabs API key for authentication. */
apiKey: string;
/** ElevenLabs voice ID to use for speech synthesis. */
voiceId: string;
};
/** Voice settings passed to the ElevenLabs API. */
interface VoiceSettings {
stability: number;
similarity_boost: number;
style: number;
}
/** Request body for the ElevenLabs TTS endpoint. */
interface TTSRequestBody {
text: string;
model_id: string;
voice_settings: VoiceSettings;
}
/** Shape of the script object used in the video pipeline. */
export interface VideoScript {
hook: string;
scenes: Array<{
sceneNumber?: number;
narration: string;
visualDescription?: string;
bRollKeywords?: string[];
durationEstimate?: number;
}>;
cta: string;
}
/** Response from the ElevenLabs /with-timestamps endpoint. */
interface TTSWithTimestampsResponse {
audio_base64: string;
alignment: CharacterAlignment;
}
/**
 * Resolves the ElevenLabs configuration.
 *
 * The API key is read from the ELEVENLABS_API_KEY environment variable; the
 * voice ID is read from the pipeline config store, falling back to the
 * ELEVENLABS_VOICE_ID environment variable or a default voice.
 *
 * @returns The resolved {@link ElevenLabsConfig}.
 * @throws {Error} If ELEVENLABS_API_KEY is missing.
 */
async function getElevenLabsConfig(): Promise<ElevenLabsConfig> {
const apiKey = process.env.ELEVENLABS_API_KEY;
const voiceId = await getConfigValue(
"pipeline_config", "elevenLabsVoiceId",
process.env.ELEVENLABS_VOICE_ID || "pNInz6obpgDQGcFmaJgB"
);
if (!apiKey) {
throw new Error(
"Missing ELEVENLABS_API_KEY environment variable. " +
"Set it in your .env.local or deployment environment."
);
}
return { apiKey, voiceId };
}
/**
* Generate speech audio from plain text using the ElevenLabs TTS API.
*
* Calls the ElevenLabs v1 text-to-speech endpoint with the
* `eleven_multilingual_v2` model and returns the resulting MP3 audio
* as a Node.js `Buffer`.
*
* @param text - The text to convert to speech.
* @returns A `Buffer` containing the MP3 audio data.
* @throws {Error} If the text is empty, env vars are missing, or the API request fails.
*
* @example
* ```ts
* import { generateSpeech } from "@/lib/services/elevenlabs";
*
* const mp3Buffer = await generateSpeech("Hello from CodingCat.dev!");
* ```
*/
export async function generateSpeech(text: string): Promise<Buffer> {
if (!text || text.trim().length === 0) {
throw new Error("Cannot generate speech from empty text.");
}
const { apiKey, voiceId } = await getElevenLabsConfig();
const url = `${ELEVENLABS_API_BASE}/text-to-speech/${voiceId}`;
const body: TTSRequestBody = {
text,
model_id: "eleven_multilingual_v2",
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
style: 0.5,
},
};
let response: Response;
try {
response = await fetch(url, {
method: "POST",
headers: {
Accept: "audio/mpeg",
"Content-Type": "application/json",
"xi-api-key": apiKey,
},
body: JSON.stringify(body),
});
} catch (error) {
throw new Error(
`ElevenLabs API request failed: ${error instanceof Error ? error.message : String(error)}`
);
}
if (!response.ok) {
let errorDetail: string;
try {
const errorBody = await response.json();
errorDetail =
errorBody?.detail?.message ||
errorBody?.detail ||
JSON.stringify(errorBody);
} catch {
errorDetail = response.statusText || "Unknown error";
}
throw new Error(
`ElevenLabs TTS API error (${response.status}): ${errorDetail}`
);
}
const arrayBuffer = await response.arrayBuffer();
if (arrayBuffer.byteLength === 0) {
throw new Error("ElevenLabs API returned an empty audio response.");
}
return Buffer.from(arrayBuffer);
}
/**
* Generate speech audio from a structured video script.
*
* Concatenates the script's hook, scene narrations, and call-to-action
* into a single text block (separated by pauses) and converts it to
* MP3 audio via {@link generateSpeech}.
*
* @param script - The video script containing a hook, scenes with narrations, and a CTA.
* @returns A `Buffer` containing the MP3 audio data.
* @throws {Error} If the script produces empty text or the TTS call fails.
*
* @example
* ```ts
* import { generateSpeechFromScript } from "@/lib/services/elevenlabs";
*
* const mp3Buffer = await generateSpeechFromScript({
* hook: "Did you know you can automate video creation?",
* scenes: [
* { narration: "First, we generate a script using AI." },
* { narration: "Then, we convert it to speech with ElevenLabs." },
* ],
* cta: "Subscribe to CodingCat.dev for more!",
* });
* ```
*/
export async function generateSpeechFromScript(
script: VideoScript
): Promise<Buffer> {
const sections: string[] = [];
if (script.hook?.trim()) {
sections.push(script.hook.trim());
}
if (script.scenes && Array.isArray(script.scenes)) {
for (const scene of script.scenes) {
if (scene.narration?.trim()) {
sections.push(scene.narration.trim());
}
}
}
if (script.cta?.trim()) {
sections.push(script.cta.trim());
}
if (sections.length === 0) {
throw new Error(
"Cannot generate speech from an empty script. " +
"Provide at least a hook, one scene narration, or a CTA."
);
}
  // Ensure each section ends with a period, then join with a space so the
  // sections are separated by ". ", giving the TTS natural pauses between parts.
const combinedText = sections
.map((s) => (s.endsWith(".") ? s : `${s}.`))
.join(" ");
return generateSpeech(combinedText);
}
/**
* Generate speech with word-level timestamps using the ElevenLabs
* `/text-to-speech/{voiceId}/with-timestamps` endpoint.
*
* Returns both the audio buffer and word-level timing data that can be
* used to sync Remotion visuals to the narration.
*
* @param text - The text to convert to speech.
* @returns Audio buffer + word-level timestamps.
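 * @throws {Error} If the text is empty, env vars are missing, or the API request fails.
 *
 * @example
 * ```ts
 * import { generateSpeechWithTimestamps } from "@/lib/services/elevenlabs";
 *
 * // A usage sketch: the final word's endMs roughly matches durationMs.
 * const { audioBuffer, wordTimestamps, durationMs } =
 *   await generateSpeechWithTimestamps("Sync these words to the visuals.");
 * const lastWordEndMs = wordTimestamps[wordTimestamps.length - 1]?.endMs;
 * console.log(`audio: ${audioBuffer.length} bytes, ~${durationMs} ms`);
 * ```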
*/
export async function generateSpeechWithTimestamps(
text: string
): Promise<SceneAudioResult> {
if (!text || text.trim().length === 0) {
throw new Error("Cannot generate speech from empty text.");
}
const { apiKey, voiceId } = await getElevenLabsConfig();
const url = `${ELEVENLABS_API_BASE}/text-to-speech/${voiceId}/with-timestamps`;
const body: TTSRequestBody = {
text,
model_id: "eleven_multilingual_v2",
voice_settings: {
stability: 0.5,
similarity_boost: 0.75,
style: 0.5,
},
};
let response: Response;
try {
response = await fetch(url, {
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": apiKey,
},
body: JSON.stringify(body),
});
} catch (error) {
throw new Error(
`ElevenLabs timestamps API request failed: ${error instanceof Error ? error.message : String(error)}`
);
}
if (!response.ok) {
let errorDetail: string;
try {
const errorBody = await response.json();
errorDetail =
errorBody?.detail?.message ||
errorBody?.detail ||
JSON.stringify(errorBody);
} catch {
errorDetail = response.statusText || "Unknown error";
}
throw new Error(
`ElevenLabs timestamps API error (${response.status}): ${errorDetail}`
);
}
const data = (await response.json()) as TTSWithTimestampsResponse;
if (!data.audio_base64) {
throw new Error("ElevenLabs timestamps API returned no audio data.");
}
const audioBuffer = Buffer.from(data.audio_base64, "base64");
const wordTimestamps = aggregateToWordTimestamps(data.alignment);
  // Use the last word's end time as the duration; if no word timestamps came
  // back, fall back to a rough estimate assuming ~256 kbps MP3 (32,000 bytes/s).
const durationMs =
wordTimestamps.length > 0
? wordTimestamps[wordTimestamps.length - 1].endMs
: Math.round((audioBuffer.length / 32000) * 1000); // rough estimate for MP3
return {
audioBase64: data.audio_base64,
audioBuffer,
wordTimestamps,
durationMs,
};
}
/**
* Generate per-scene audio with timestamps from a structured video script.
*
* Instead of concatenating everything into one blob, this generates
* separate audio for each section (hook, scenes, CTA) with word-level
* timestamps. This enables:
* - Precise scene boundary timing
* - Per-scene word timestamps for visual sync
* - Fault isolation (retry one scene instead of all)
*
 * @param script - The video script containing a hook, scenes with narrations, and a CTA.
 * @returns The hook result, per-scene results, the CTA result, and the total duration in ms.
 * @throws {Error} If the script is missing its hook, scenes, or CTA.
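 *
 * @example
 * ```ts
 * import { generatePerSceneAudio } from "@/lib/services/elevenlabs";
 *
 * // A usage sketch based on this module's VideoScript shape.
 * const { hook, scenes, cta, totalDurationMs } = await generatePerSceneAudio({
 *   hook: "Did you know you can automate video creation?",
 *   scenes: [{ narration: "First, we generate a script using AI." }],
 *   cta: "Subscribe to CodingCat.dev for more!",
 * });
 * console.log(`${scenes.length} scene(s), ${Math.round(totalDurationMs / 1000)}s total`);
 * ```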
*/
export async function generatePerSceneAudio(
script: VideoScript
): Promise<{
hook: SceneAudioResult;
scenes: SceneAudioResult[];
cta: SceneAudioResult;
totalDurationMs: number;
}> {
const sections: { label: string; text: string }[] = [];
if (script.hook?.trim()) {
sections.push({ label: "hook", text: script.hook.trim() });
} else {
throw new Error("Script must have a hook.");
}
if (!script.scenes?.length) {
throw new Error("Script must have at least one scene.");
}
for (const scene of script.scenes) {
if (scene.narration?.trim()) {
sections.push({
label: `scene-${scene.sceneNumber ?? sections.length}`,
text: scene.narration.trim(),
});
}
}
if (script.cta?.trim()) {
sections.push({ label: "cta", text: script.cta.trim() });
} else {
throw new Error("Script must have a CTA.");
}
console.log(
`[elevenlabs] Generating per-scene audio for ${sections.length} sections...`
);
  // Process sections in sequential batches of CONCURRENCY so that at most that
  // many TTS requests are in flight at once.
const CONCURRENCY = 3;
const results: SceneAudioResult[] = [];
for (let i = 0; i < sections.length; i += CONCURRENCY) {
const batch = sections.slice(i, i + CONCURRENCY);
const batchResults = await Promise.all(
batch.map(async (section) => {
console.log(
`[elevenlabs] Generating audio for ${section.label} (${section.text.length} chars)...`
);
return generateSpeechWithTimestamps(section.text);
})
);
results.push(...batchResults);
}
const totalDurationMs = results.reduce((sum, r) => sum + r.durationMs, 0);
console.log(
`[elevenlabs] Per-scene audio complete: ${results.length} sections, ${Math.round(totalDurationMs / 1000)}s total`
);
// Split results back into hook, scenes, CTA
const hookResult = results[0];
const sceneResults = results.slice(1, results.length - 1);
const ctaResult = results[results.length - 1];
return {
hook: hookResult,
scenes: sceneResults,
cta: ctaResult,
totalDurationMs,
};
}
// Re-export timestamp types for consumers
export type { WordTimestamp, SceneAudioResult, CharacterAlignment } from "@/lib/utils/audio-timestamps";