diff --git a/.changeset/elevenlabs-rest-adapters.md b/.changeset/elevenlabs-rest-adapters.md new file mode 100644 index 000000000..d5048bef3 --- /dev/null +++ b/.changeset/elevenlabs-rest-adapters.md @@ -0,0 +1,13 @@ +--- +'@tanstack/ai-elevenlabs': minor +--- + +feat: add REST adapters to @tanstack/ai-elevenlabs and migrate realtime to the renamed SDK + +Extends `@tanstack/ai-elevenlabs` (previously realtime-only) with three tree-shakeable REST adapters built on the official `@elevenlabs/elevenlabs-js` SDK (v2.44+): + +- `elevenlabsSpeech()` — text-to-speech on `eleven_v3`, `eleven_multilingual_v2`, `eleven_flash_*`, `eleven_turbo_*` +- `elevenlabsAudio()` — music (`music_v1`, with structured composition plans) and sound effects (`eleven_text_to_sound_v2`/`v1`) via a single adapter that dispatches by model +- `elevenlabsTranscription()` — Scribe v1/v2 speech-to-text with diarization, keyterm biasing, PII redaction, and word-level timestamps + +Also migrates the existing realtime adapter off the deprecated `@11labs/client` onto the renamed `@elevenlabs/client` package. diff --git a/examples/ts-react-chat/.env.example b/examples/ts-react-chat/.env.example index 2bdb43f49..609f12ce0 100644 --- a/examples/ts-react-chat/.env.example +++ b/examples/ts-react-chat/.env.example @@ -1,11 +1,35 @@ -# OpenAI API Key +# OpenAI API Key (chat, images, video, speech, transcription, summarize) # Get yours at: https://platform.openai.com/api-keys -OPENAI_API_KEY=sk-... 
+OPENAI_API_KEY= -# ElevenLabs API Key (for realtime voice) +# Anthropic API Key (chat) +# Get yours at: https://console.anthropic.com/settings/keys +ANTHROPIC_API_KEY= + +# Google Gemini API Key (chat, audio, speech) +# Get yours at: https://aistudio.google.com/apikey +GOOGLE_API_KEY= + +# xAI / Grok API Key (chat) +# Get yours at: https://console.x.ai +XAI_API_KEY= + +# Groq API Key (chat) +# Get yours at: https://console.groq.com/keys +GROQ_API_KEY= + +# OpenRouter API Key (chat, images) +# Get yours at: https://openrouter.ai/keys +OPENROUTER_API_KEY= + +# fal.ai Key (audio, speech, transcription) +# Get yours at: https://fal.ai/dashboard/keys +FAL_KEY= + +# ElevenLabs API Key (realtime voice) # Get yours at: https://elevenlabs.io/app/settings/api-keys -ELEVENLABS_API_KEY=xi-... +ELEVENLABS_API_KEY= -# ElevenLabs Agent ID (for realtime voice) +# ElevenLabs Agent ID (realtime voice) # Create an agent at: https://elevenlabs.io/app/conversational-ai -ELEVENLABS_AGENT_ID=... +ELEVENLABS_AGENT_ID= diff --git a/examples/ts-react-chat/src/lib/audio-providers.ts b/examples/ts-react-chat/src/lib/audio-providers.ts index 226aeb002..5ff72fae2 100644 --- a/examples/ts-react-chat/src/lib/audio-providers.ts +++ b/examples/ts-react-chat/src/lib/audio-providers.ts @@ -6,7 +6,12 @@ * and audio generation flows. 
*/ -export type SpeechProviderId = 'openai' | 'gemini' | 'fal' | 'grok' +export type SpeechProviderId = + | 'openai' + | 'gemini' + | 'fal' + | 'grok' + | 'elevenlabs' export interface SpeechProviderConfig { id: SpeechProviderId @@ -68,9 +73,21 @@ export const SPEECH_PROVIDERS: ReadonlyArray = [ ], placeholder: 'Enter text for Grok speech…', }, + { + id: 'elevenlabs', + label: 'ElevenLabs', + model: 'eleven_multilingual_v2', + voices: [ + { id: '21m00Tcm4TlvDq8ikWAM', label: 'Rachel' }, + { id: 'AZnzlk1XvdvUeBnXmlld', label: 'Domi' }, + { id: 'EXAVITQu4vr4xnSDxMaL', label: 'Bella' }, + { id: 'pNInz6obpgDQGcFmaJgB', label: 'Adam' }, + ], + placeholder: 'Enter text to synthesize with ElevenLabs…', + }, ] -export type TranscriptionProviderId = 'openai' | 'fal' | 'grok' +export type TranscriptionProviderId = 'openai' | 'fal' | 'grok' | 'elevenlabs' export interface TranscriptionProviderConfig { id: TranscriptionProviderId @@ -99,9 +116,21 @@ export const TRANSCRIPTION_PROVIDERS: ReadonlyArray model: 'grok-stt', description: 'xAI speech-to-text with word-level timestamps.', }, + { + id: 'elevenlabs', + label: 'ElevenLabs Scribe', + model: 'scribe_v1', + description: + 'ElevenLabs Scribe with diarization, keyterm biasing, and PII redaction.', + }, ] -export type AudioProviderId = 'gemini-lyria' | 'fal-audio' | 'fal-sfx' +export type AudioProviderId = + | 'gemini-lyria' + | 'fal-audio' + | 'fal-sfx' + | 'elevenlabs-music' + | 'elevenlabs-sfx' export interface AudioProviderConfig { id: AudioProviderId @@ -244,4 +273,70 @@ export const AUDIO_PROVIDERS: ReadonlyArray = [ }, ], }, + { + id: 'elevenlabs-music', + label: 'ElevenLabs Music', + model: 'music_v1', + models: [{ id: 'music_v1', label: 'Music v1' }], + description: + 'ElevenLabs Music — free-form prompts or structured composition plans.', + placeholder: 'An upbeat synthwave track with driving drums and arpeggios', + defaultDuration: 15, + samplePrompts: [ + { + label: 'Synthwave drive', + prompt: + 'An upbeat 
synthwave track with driving drums, warm analog pads, and glittering arpeggios.', + }, + { + label: 'Cinematic reveal', + prompt: + 'A cinematic reveal score with soaring strings, low brass stabs, and a sudden timpani hit.', + }, + { + label: 'Acoustic campfire', + prompt: + 'A gentle acoustic campfire tune: fingerpicked guitar, soft harmonica, and distant crickets.', + }, + { + label: 'Angry kazoo orchestra', + prompt: + 'A furious kazoo orchestra performing an operatic aria about overdue library books.', + }, + ], + }, + { + id: 'elevenlabs-sfx', + label: 'ElevenLabs SFX', + model: 'eleven_text_to_sound_v2', + models: [ + { id: 'eleven_text_to_sound_v2', label: 'Text-to-Sound v2' }, + { id: 'eleven_text_to_sound_v1', label: 'Text-to-Sound v1' }, + ], + description: + 'ElevenLabs text-to-sound for short effects, 0.5–30 seconds per clip.', + placeholder: 'A whoosh followed by a deep bass impact', + defaultDuration: 5, + samplePrompts: [ + { + label: 'Trailer whoosh', + prompt: 'A cinematic whoosh followed by a deep sub-bass impact.', + }, + { + label: 'Sword unsheathe', + prompt: + 'The crisp metallic ring of a sword being drawn from a leather scabbard.', + }, + { + label: 'UI confirmation', + prompt: + 'A short, satisfying UI confirmation tone with a subtle sparkle tail.', + }, + { + label: 'Anxious toaster', + prompt: + 'A small kitchen toaster having an anxiety attack: frantic clicks, steam, and a plaintive ding.', + }, + ], + }, ] diff --git a/examples/ts-react-chat/src/lib/server-audio-adapters.ts b/examples/ts-react-chat/src/lib/server-audio-adapters.ts index 77336629d..79250f48a 100644 --- a/examples/ts-react-chat/src/lib/server-audio-adapters.ts +++ b/examples/ts-react-chat/src/lib/server-audio-adapters.ts @@ -8,6 +8,11 @@ import { openaiSpeech, openaiTranscription } from '@tanstack/ai-openai' import { geminiAudio, geminiSpeech } from '@tanstack/ai-gemini' import { falAudio, falSpeech, falTranscription } from '@tanstack/ai-fal' +import { + elevenlabsAudio, 
+ elevenlabsSpeech, + elevenlabsTranscription, +} from '@tanstack/ai-elevenlabs' import { grokSpeech, grokTranscription } from '@tanstack/ai-grok' import type { AnyAudioAdapter, @@ -48,6 +53,8 @@ export function buildSpeechAdapter(provider: SpeechProviderId): AnyTTSAdapter { return falSpeech(config.model) case 'grok': return grokSpeech(config.model as 'grok-tts') + case 'elevenlabs': + return elevenlabsSpeech(config.model) } } @@ -62,6 +69,8 @@ export function buildTranscriptionAdapter( return falTranscription(config.model) case 'grok': return grokTranscription(config.model as 'grok-stt') + case 'elevenlabs': + return elevenlabsTranscription(config.model) } } @@ -79,6 +88,9 @@ export function buildAudioAdapter( case 'fal-audio': case 'fal-sfx': return falAudio(model) + case 'elevenlabs-music': + case 'elevenlabs-sfx': + return elevenlabsAudio(model) } } diff --git a/examples/ts-react-chat/src/lib/server-fns.ts b/examples/ts-react-chat/src/lib/server-fns.ts index b1e5d9e59..c64168d23 100644 --- a/examples/ts-react-chat/src/lib/server-fns.ts +++ b/examples/ts-react-chat/src/lib/server-fns.ts @@ -67,15 +67,21 @@ function rethrowAudioAdapterError(err: unknown): never { } const SPEECH_PROVIDER_SCHEMA = z - .enum(['openai', 'gemini', 'fal', 'grok']) + .enum(['openai', 'gemini', 'fal', 'grok', 'elevenlabs']) .optional() const TRANSCRIPTION_PROVIDER_SCHEMA = z - .enum(['openai', 'fal', 'grok']) + .enum(['openai', 'fal', 'grok', 'elevenlabs']) .optional() const AUDIO_PROVIDER_SCHEMA = z - .enum(['gemini-lyria', 'fal-audio', 'fal-sfx']) + .enum([ + 'gemini-lyria', + 'fal-audio', + 'fal-sfx', + 'elevenlabs-music', + 'elevenlabs-sfx', + ]) .optional() // ============================================================================= diff --git a/examples/ts-react-chat/src/lib/use-realtime.ts b/examples/ts-react-chat/src/lib/use-realtime.ts index 620c2804f..dbe60fc24 100644 --- a/examples/ts-react-chat/src/lib/use-realtime.ts +++ b/examples/ts-react-chat/src/lib/use-realtime.ts 
@@ -12,7 +12,7 @@ import { realtimeClientTools } from '@/lib/realtime-tools' type Provider = 'openai' | 'elevenlabs' | 'grok' const getRealtimeTokenFn = createServerFn({ method: 'POST' }) - .inputValidator((data: { provider: Provider; agentId?: string }) => { + .inputValidator((data: { provider: Provider; language?: string }) => { if (!data.provider) throw new Error('Provider is required') return data }) @@ -26,14 +26,10 @@ const getRealtimeTokenFn = createServerFn({ method: 'POST' }) } if (data.provider === 'elevenlabs') { - const agentId = data.agentId || process.env.ELEVENLABS_AGENT_ID - if (!agentId) { - throw new Error( - 'ElevenLabs agent ID is required. Set ELEVENLABS_AGENT_ID or pass agentId in request body.', - ) - } return realtimeToken({ - adapter: elevenlabsRealtimeToken({ agentId }), + adapter: elevenlabsRealtimeToken({ + ...(data.language ? { overrides: { language: data.language } } : {}), + }), }) } @@ -59,7 +55,7 @@ function adapterForProvider(provider: Provider) { export function useRealtime({ provider, - agentId, + language, voice, outputModalities, temperature, @@ -67,7 +63,7 @@ export function useRealtime({ semanticEagerness, }: { provider: Provider - agentId: string + language?: string voice?: string outputModalities?: Array<'audio' | 'text'> temperature?: number @@ -79,7 +75,7 @@ export function useRealtime({ getRealtimeTokenFn({ data: { provider, - ...(provider === 'elevenlabs' && agentId ? { agentId } : {}), + ...(provider === 'elevenlabs' && language ? 
{ language } : {}), }, }), adapter: adapterForProvider(provider), diff --git a/examples/ts-react-chat/src/routes/api.generate.audio.ts b/examples/ts-react-chat/src/routes/api.generate.audio.ts index aade04c8b..ab9820b55 100644 --- a/examples/ts-react-chat/src/routes/api.generate.audio.ts +++ b/examples/ts-react-chat/src/routes/api.generate.audio.ts @@ -8,7 +8,13 @@ import { } from '../lib/server-audio-adapters' const AUDIO_PROVIDER_SCHEMA = z - .enum(['gemini-lyria', 'fal-audio', 'fal-sfx']) + .enum([ + 'gemini-lyria', + 'fal-audio', + 'fal-sfx', + 'elevenlabs-music', + 'elevenlabs-sfx', + ]) .optional() const AUDIO_BODY_SCHEMA = z.object({ diff --git a/examples/ts-react-chat/src/routes/api.generate.speech.ts b/examples/ts-react-chat/src/routes/api.generate.speech.ts index 92057ad4f..d0f4240e6 100644 --- a/examples/ts-react-chat/src/routes/api.generate.speech.ts +++ b/examples/ts-react-chat/src/routes/api.generate.speech.ts @@ -8,7 +8,7 @@ import { } from '../lib/server-audio-adapters' const SPEECH_PROVIDER_SCHEMA = z - .enum(['openai', 'gemini', 'fal', 'grok']) + .enum(['openai', 'gemini', 'fal', 'grok', 'elevenlabs']) .optional() const SPEECH_BODY_SCHEMA = z.object({ diff --git a/examples/ts-react-chat/src/routes/api.transcribe.ts b/examples/ts-react-chat/src/routes/api.transcribe.ts index e6131ad32..b841ea904 100644 --- a/examples/ts-react-chat/src/routes/api.transcribe.ts +++ b/examples/ts-react-chat/src/routes/api.transcribe.ts @@ -8,7 +8,7 @@ import { } from '../lib/server-audio-adapters' const TRANSCRIPTION_PROVIDER_SCHEMA = z - .enum(['openai', 'fal', 'grok']) + .enum(['openai', 'fal', 'grok', 'elevenlabs']) .optional() const TRANSCRIBE_BODY_SCHEMA = z.object({ diff --git a/examples/ts-react-chat/src/routes/realtime.tsx b/examples/ts-react-chat/src/routes/realtime.tsx index bed289da1..5f75fd3e7 100644 --- a/examples/ts-react-chat/src/routes/realtime.tsx +++ b/examples/ts-react-chat/src/routes/realtime.tsx @@ -31,6 +31,18 @@ const OUTPUT_MODE_OPTIONS: 
Array<{ value: OutputMode; label: string }> = [ { value: 'audio-only', label: 'Audio Only' }, ] +const LANGUAGE_OPTIONS: Array<{ value: string; label: string }> = [ + { value: '', label: 'Agent default' }, + { value: 'en', label: 'English' }, + { value: 'es', label: 'Spanish' }, + { value: 'fr', label: 'French' }, + { value: 'de', label: 'German' }, + { value: 'it', label: 'Italian' }, + { value: 'pt', label: 'Portuguese' }, + { value: 'ja', label: 'Japanese' }, + { value: 'zh', label: 'Chinese' }, +] + function outputModeToModalities( mode: OutputMode, ): Array<'audio' | 'text'> | undefined { @@ -48,7 +60,7 @@ function outputModeToModalities( function RealtimePage() { const [provider, setProvider] = useState('openai') - const [agentId, setAgentId] = useState('') + const [language, setLanguage] = useState('') const [grokVoice, setGrokVoice] = useState('eve') const [textInput, setTextInput] = useState('') const [outputMode, setOutputMode] = useState('audio+text') @@ -77,7 +89,7 @@ function RealtimePage() { getOutputTimeDomainData, } = useRealtime({ provider, - agentId, + language: language || undefined, voice: provider === 'grok' ? grokVoice : undefined, outputModalities: outputModeToModalities(outputMode), temperature, @@ -232,20 +244,24 @@ function RealtimePage() { - {/* ElevenLabs Agent ID (conditional) */} + {/* Language override (ElevenLabs only) */} {provider === 'elevenlabs' && (
- setAgentId(e.target.value)} - placeholder="Your ElevenLabs Agent ID" +
)} diff --git a/examples/ts-react-chat/vite.config.ts b/examples/ts-react-chat/vite.config.ts index 809a10692..b9b166edb 100644 --- a/examples/ts-react-chat/vite.config.ts +++ b/examples/ts-react-chat/vite.config.ts @@ -7,9 +7,24 @@ import { nitroV2Plugin } from '@tanstack/nitro-v2-vite-plugin' import { devtools } from '@tanstack/devtools-vite' const config = defineConfig({ + // Server-side only fix. @elevenlabs/elevenlabs-js ships a top-level + // `function getHeader(…)` that collides with h3's auto-imported + // `getHeader` when vite inlines it into the SSR bundle. The SDK is + // only imported by server-side adapter factories (see + // `src/lib/server-audio-adapters.ts`), so tree-shaking already keeps + // it out of the client bundle — this option only affects the SSR + // build, where we want the SDK resolved at runtime via require() + // instead of inlined into the rollup chunk. + ssr: { + external: ['@elevenlabs/elevenlabs-js'], + }, plugins: [ devtools(), - nitroV2Plugin(), + nitroV2Plugin({ + externals: { + external: ['@elevenlabs/elevenlabs-js'], + }, + }), // this is the plugin that enables path aliases viteTsConfigPaths({ projects: ['./tsconfig.json'], diff --git a/package.json b/package.json index e418bfedd..d45a23b06 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,7 @@ "build:all": "nx run-many --targets=build --exclude=examples/**", "watch": "pnpm run build:all && env NX_DAEMON=true nx watch --all -- pnpm run build:all", "dev": "pnpm run watch", + "dev:chat": "pnpm --filter ts-react-chat dev", "format": "prettier --experimental-cli --ignore-unknown '**/*' --write", "generate-docs": "node scripts/generate-docs.ts && pnpm run copy:readme", "generate:models": "pnpm generate:models:fetch && pnpm regenerate:models && tsx scripts/sync-provider-models.ts && pnpm format", diff --git a/packages/typescript/ai-elevenlabs/package.json b/packages/typescript/ai-elevenlabs/package.json index d71f5595b..7f249e5ef 100644 --- 
a/packages/typescript/ai-elevenlabs/package.json +++ b/packages/typescript/ai-elevenlabs/package.json @@ -15,7 +15,14 @@ "voice", "realtime", "tanstack", - "adapter" + "adapter", + "tts", + "text-to-speech", + "audio-generation", + "music", + "sound-effects", + "transcription", + "speech-to-text" ], "type": "module", "module": "./dist/esm/index.js", @@ -41,7 +48,8 @@ "test:types": "tsc" }, "dependencies": { - "@11labs/client": "^0.2.0" + "@elevenlabs/client": "^1.3.1", + "@elevenlabs/elevenlabs-js": "^2.44.0" }, "peerDependencies": { "@tanstack/ai": "workspace:^", diff --git a/packages/typescript/ai-elevenlabs/src/adapters/audio.ts b/packages/typescript/ai-elevenlabs/src/adapters/audio.ts new file mode 100644 index 000000000..10b0ce535 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/adapters/audio.ts @@ -0,0 +1,257 @@ +import { BaseAudioAdapter } from '@tanstack/ai/adapters' +import { + arrayBufferToBase64, + createElevenLabsClient, + generateId, + parseOutputFormat, + readStreamToArrayBuffer, +} from '../utils/client' +import { + isElevenLabsMusicModel, + isElevenLabsSoundEffectsModel, +} from '../model-meta' +import type { ElevenLabsClient } from '@elevenlabs/elevenlabs-js' +import type { + AudioGenerationOptions, + AudioGenerationResult, +} from '@tanstack/ai' +import type { ElevenLabsClientConfig } from '../utils/client' +import type { + ElevenLabsAudioModel, + ElevenLabsMusicModel, + ElevenLabsOutputFormat, + ElevenLabsSoundEffectsModel, +} from '../model-meta' + +/** + * Structured composition plan for ElevenLabs music generation. Mutually + * exclusive with a free-form `prompt` on the `generateAudio()` call — when + * supplied, `prompt` is ignored by ElevenLabs. + * + * We mirror the SDK's camelCase naming. Lengths are in milliseconds. + * @see https://elevenlabs.io/docs/api-reference/music/compose + */ +export interface ElevenLabsMusicCompositionPlan { + /** Positive global style descriptors (mood, instruments, tempo, …). 
*/ + positiveGlobalStyles?: Array + /** Negative global style descriptors — styles to avoid. */ + negativeGlobalStyles?: Array + /** Section definitions (verse/chorus/bridge/…) with local style hints. */ + sections?: Array<{ + sectionName: string + positiveLocalStyles?: Array + negativeLocalStyles?: Array + durationMs?: number + lines?: Array + }> +} + +/** + * Provider options common to all ElevenLabs audio endpoints. + */ +interface CommonAudioOptions { + /** Output audio format. Defaults to `mp3_44100_128`. */ + outputFormat?: ElevenLabsOutputFormat +} + +/** + * Provider options for music generation (`music_v1`). + */ +export interface ElevenLabsMusicProviderOptions extends CommonAudioOptions { + /** Structured composition plan. Mutually exclusive with `prompt`/`duration`. */ + compositionPlan?: ElevenLabsMusicCompositionPlan + /** Deterministic sampling seed (incompatible with `prompt`). */ + seed?: number + /** Force the output to be purely instrumental (prompt-mode only). */ + forceInstrumental?: boolean + /** Strictly respect section durations in `compositionPlan`. */ + respectSectionsDurations?: boolean +} + +/** + * Provider options for sound-effect generation (`eleven_text_to_sound_v*`). + */ +export interface ElevenLabsSoundEffectsProviderOptions extends CommonAudioOptions { + /** Prompt influence, 0..1. Default 0.3. Higher = more prompt adherence. */ + promptInfluence?: number + /** Generate a loopable SFX (v2 only). */ + loop?: boolean +} + +/** + * Union of per-model provider options. We keep both branches on one type so + * the adapter stays tree-shakeable; callers narrow by model at the factory. + */ +export type ElevenLabsAudioProviderOptions = + | (ElevenLabsMusicProviderOptions & ElevenLabsSoundEffectsProviderOptions) + | ElevenLabsMusicProviderOptions + | ElevenLabsSoundEffectsProviderOptions + +/** + * ElevenLabs audio generation adapter. Dispatches to music or SFX endpoints + * based on the model id. 
Music → `client.music.compose`, SFX → + * `client.textToSoundEffects.convert`. + * + * @example + * ```ts + * const music = elevenlabsAudio('music_v1') + * await generateAudio({ adapter: music, prompt: 'lo-fi beat', duration: 15 }) + * + * const sfx = elevenlabsAudio('eleven_text_to_sound_v2') + * await generateAudio({ adapter: sfx, prompt: 'glass shattering', duration: 3 }) + * ``` + */ +export class ElevenLabsAudioAdapter< + TModel extends ElevenLabsAudioModel, +> extends BaseAudioAdapter { + readonly name = 'elevenlabs' as const + + private client: ElevenLabsClient + + constructor(model: TModel, config?: ElevenLabsClientConfig) { + super(model, config ?? {}) + this.client = createElevenLabsClient(config) + } + + async generateAudio( + options: AudioGenerationOptions, + ): Promise { + const { logger } = options + logger.request( + `activity=generateAudio provider=elevenlabs model=${this.model}`, + { provider: 'elevenlabs', model: this.model }, + ) + try { + if (isElevenLabsMusicModel(this.model)) { + return await this.runMusic(options) + } + if (isElevenLabsSoundEffectsModel(this.model)) { + return await this.runSoundEffects(options) + } + throw new Error( + `Unsupported ElevenLabs audio model "${this.model}". Expected one of: music_v1, eleven_text_to_sound_v2, eleven_text_to_sound_v1.`, + ) + } catch (error) { + logger.errors('elevenlabs.generateAudio fatal', { + error, + source: 'elevenlabs.generateAudio', + }) + throw error + } + } + + private async runMusic( + options: AudioGenerationOptions, + ): Promise { + // Gated by isElevenLabsMusicModel() in generateAudio(). + const modelId = this.model as ElevenLabsMusicModel + const music = (options.modelOptions ?? {}) as ElevenLabsMusicProviderOptions + const outputFormat = music.outputFormat + + const stream = await this.client.music.compose({ + modelId, + ...(options.prompt && !music.compositionPlan + ? { prompt: options.prompt } + : {}), + ...(music.compositionPlan + ? 
{ compositionPlan: toMusicPrompt(music.compositionPlan) } + : {}), + ...(options.duration != null && !music.compositionPlan + ? { musicLengthMs: Math.round(options.duration * 1000) } + : {}), + ...(outputFormat ? { outputFormat } : {}), + ...(music.seed != null ? { seed: music.seed } : {}), + ...(music.forceInstrumental != null + ? { forceInstrumental: music.forceInstrumental } + : {}), + ...(music.respectSectionsDurations != null + ? { respectSectionsDurations: music.respectSectionsDurations } + : {}), + }) + + return this.finalize(stream, outputFormat, options.duration) + } + + private async runSoundEffects( + options: AudioGenerationOptions, + ): Promise { + // Gated by isElevenLabsSoundEffectsModel() in generateAudio(). + const modelId = this.model as ElevenLabsSoundEffectsModel + const sfx = (options.modelOptions ?? + {}) as ElevenLabsSoundEffectsProviderOptions + const outputFormat = sfx.outputFormat + + const stream = await this.client.textToSoundEffects.convert({ + text: options.prompt, + modelId, + ...(options.duration != null + ? { durationSeconds: options.duration } + : {}), + ...(outputFormat ? { outputFormat } : {}), + ...(sfx.promptInfluence != null + ? { promptInfluence: sfx.promptInfluence } + : {}), + ...(sfx.loop != null ? { loop: sfx.loop } : {}), + }) + + return this.finalize(stream, outputFormat, options.duration) + } + + private async finalize( + stream: ReadableStream, + outputFormat: ElevenLabsOutputFormat | undefined, + duration: number | undefined, + ): Promise { + const buffer = await readStreamToArrayBuffer(stream) + const base64 = arrayBufferToBase64(buffer) + const { contentType } = parseOutputFormat(outputFormat) + return { + id: generateId(this.name), + model: this.model, + audio: { + b64Json: base64, + contentType, + ...(duration != null ? 
{ duration } : {}), + }, + } + } + + protected override generateId(): string { + return generateId(this.name) + } +} + +function toMusicPrompt(plan: ElevenLabsMusicCompositionPlan) { + return { + positiveGlobalStyles: plan.positiveGlobalStyles ?? [], + negativeGlobalStyles: plan.negativeGlobalStyles ?? [], + sections: (plan.sections ?? []).map((section) => ({ + sectionName: section.sectionName, + positiveLocalStyles: section.positiveLocalStyles ?? [], + negativeLocalStyles: section.negativeLocalStyles ?? [], + durationMs: section.durationMs ?? 10000, + lines: section.lines ?? [], + })), + } +} + +/** + * Create an ElevenLabs audio adapter using `ELEVENLABS_API_KEY` from env. + */ +export function elevenlabsAudio( + model: TModel, + config?: ElevenLabsClientConfig, +): ElevenLabsAudioAdapter { + return new ElevenLabsAudioAdapter(model, config) +} + +/** + * Create an ElevenLabs audio adapter with an explicit API key. + */ +export function createElevenLabsAudio( + model: TModel, + apiKey: string, + config?: Omit, +): ElevenLabsAudioAdapter { + return new ElevenLabsAudioAdapter(model, { apiKey, ...config }) +} diff --git a/packages/typescript/ai-elevenlabs/src/adapters/speech.ts b/packages/typescript/ai-elevenlabs/src/adapters/speech.ts new file mode 100644 index 000000000..746c96ff7 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/adapters/speech.ts @@ -0,0 +1,240 @@ +import { BaseTTSAdapter } from '@tanstack/ai/adapters' +import { + arrayBufferToBase64, + createElevenLabsClient, + generateId, + parseOutputFormat, + readStreamToArrayBuffer, +} from '../utils/client' +import type { ElevenLabsClient } from '@elevenlabs/elevenlabs-js' +import type { TTSOptions, TTSResult } from '@tanstack/ai' +import type { ElevenLabsClientConfig } from '../utils/client' +import type { ElevenLabsOutputFormat, ElevenLabsTTSModel } from '../model-meta' + +/** + * ElevenLabs voice settings overrides. 
All fields are optional — omitted + * values fall back to the voice's stored defaults. + * @see https://elevenlabs.io/docs/api-reference/text-to-speech/convert + */ +export interface ElevenLabsVoiceSettings { + /** Voice stability, 0..1. Default 0.5. */ + stability?: number + /** Similarity boost, 0..1. Default 0.75. */ + similarityBoost?: number + /** Style exaggeration, 0..1. Default 0. */ + style?: number + /** Playback speed. Default 1.0. */ + speed?: number + /** Clarity/presence boost. Default true. */ + useSpeakerBoost?: boolean +} + +/** + * Provider-specific TTS options. `voice` on `generateSpeech()` takes priority + * over `voiceId` here, but we expose the same field for callers that prefer + * to keep voice configuration inside the adapter config. + */ +export interface ElevenLabsSpeechProviderOptions { + /** ElevenLabs voice ID to synthesize. Required if `generateSpeech().voice` is not set. */ + voiceId?: string + /** Output audio format encoded as `codec_samplerate[_bitrate]`. Defaults to `mp3_44100_128`. */ + outputFormat?: ElevenLabsOutputFormat + /** Voice-settings overrides for this request only. */ + voiceSettings?: ElevenLabsVoiceSettings + /** ISO-639-1 language code to enforce (e.g. `'en'`, `'ja'`). */ + languageCode?: string + /** Deterministic sampling seed, 0..4294967295. */ + seed?: number + /** Previous text for stitching adjacent clips. */ + previousText?: string + /** Next text for stitching adjacent clips. */ + nextText?: string + /** Previous request IDs for stitching (max 3). */ + previousRequestIds?: Array + /** Next request IDs for stitching (max 3). */ + nextRequestIds?: Array + /** Text normalization toggle. Default `'auto'`. */ + applyTextNormalization?: 'auto' | 'on' | 'off' + /** Language-specific text normalization (currently Japanese only, adds latency). */ + applyLanguageTextNormalization?: boolean + /** Latency optimization level, 0..4. */ + optimizeStreamingLatency?: number + /** Enable logging. 
Set false for zero-retention mode (enterprise only). */ + enableLogging?: boolean +} + +/** + * ElevenLabs text-to-speech adapter built on the official + * `@elevenlabs/elevenlabs-js` SDK. + * + * @example + * ```ts + * const adapter = elevenlabsSpeech('eleven_multilingual_v2') + * const result = await generateSpeech({ + * adapter, + * text: 'Hello, world!', + * voice: '21m00Tcm4TlvDq8ikWAM', + * }) + * ``` + */ +export class ElevenLabsSpeechAdapter< + TModel extends ElevenLabsTTSModel, +> extends BaseTTSAdapter { + readonly name = 'elevenlabs' as const + + private client: ElevenLabsClient + + constructor(model: TModel, config?: ElevenLabsClientConfig) { + super(model, config ?? {}) + this.client = createElevenLabsClient(config) + } + + async generateSpeech( + options: TTSOptions, + ): Promise { + const { logger } = options + logger.request( + `activity=generateSpeech provider=elevenlabs model=${this.model}`, + { provider: 'elevenlabs', model: this.model }, + ) + try { + const voiceId = options.voice ?? options.modelOptions?.voiceId + if (!voiceId) { + throw new Error( + 'ElevenLabs TTS requires a voice. Pass `voice` on generateSpeech() or `voiceId` in modelOptions.', + ) + } + const { + outputFormat, + voiceSettings, + languageCode, + seed, + previousText, + nextText, + previousRequestIds, + nextRequestIds, + applyTextNormalization, + applyLanguageTextNormalization, + optimizeStreamingLatency, + enableLogging, + } = options.modelOptions ?? {} + const effectiveOutputFormat = + outputFormat ?? inferOutputFormatFromResponseFormat(options.format) + + const stream = await this.client.textToSpeech.convert(voiceId, { + text: options.text, + modelId: this.model, + ...(effectiveOutputFormat + ? { outputFormat: effectiveOutputFormat } + : {}), + ...(voiceSettings + ? { voiceSettings: mapVoiceSettings(voiceSettings, options.speed) } + : options.speed != null + ? { voiceSettings: { speed: options.speed } } + : {}), + ...(languageCode ? 
{ languageCode } : {}), + ...(seed != null ? { seed } : {}), + ...(previousText ? { previousText } : {}), + ...(nextText ? { nextText } : {}), + ...(previousRequestIds ? { previousRequestIds } : {}), + ...(nextRequestIds ? { nextRequestIds } : {}), + ...(applyTextNormalization ? { applyTextNormalization } : {}), + ...(applyLanguageTextNormalization != null + ? { applyLanguageTextNormalization } + : {}), + ...(optimizeStreamingLatency != null + ? { optimizeStreamingLatency } + : {}), + ...(enableLogging != null ? { enableLogging } : {}), + }) + + const buffer = await readStreamToArrayBuffer(stream) + const base64 = arrayBufferToBase64(buffer) + const { format, contentType } = parseOutputFormat(effectiveOutputFormat) + + return { + id: generateId(this.name), + model: this.model, + audio: base64, + format, + contentType, + } + } catch (error) { + logger.errors('elevenlabs.generateSpeech fatal', { + error, + source: 'elevenlabs.generateSpeech', + }) + throw error + } + } + + protected override generateId(): string { + return generateId(this.name) + } +} + +function mapVoiceSettings( + settings: ElevenLabsVoiceSettings, + speedOverride: number | undefined, +): Record { + return { + ...(settings.stability != null ? { stability: settings.stability } : {}), + ...(settings.similarityBoost != null + ? { similarityBoost: settings.similarityBoost } + : {}), + ...(settings.style != null ? { style: settings.style } : {}), + ...(speedOverride != null + ? { speed: speedOverride } + : settings.speed != null + ? { speed: settings.speed } + : {}), + ...(settings.useSpeakerBoost != null + ? { useSpeakerBoost: settings.useSpeakerBoost } + : {}), + } +} + +/** + * Map the standard TTSOptions `format` (mp3/opus/aac/flac/wav/pcm) to a + * reasonable ElevenLabs `outputFormat` so callers don't need to know the + * full codec/samplerate string for the common case. 
+ */ +function inferOutputFormatFromResponseFormat( + format: TTSOptions['format'] | undefined, +): ElevenLabsOutputFormat | undefined { + switch (format) { + case 'mp3': + return 'mp3_44100_128' + case 'pcm': + return 'pcm_44100' + case 'opus': + return 'opus_48000_128' + case undefined: + return undefined + default: + // `aac` / `flac` / `wav` are not native ElevenLabs formats — + // fall back to mp3 rather than blowing up mid-request. + return 'mp3_44100_128' + } +} + +/** + * Create an ElevenLabs speech adapter using `ELEVENLABS_API_KEY` from env. + */ +export function elevenlabsSpeech( + model: TModel, + config?: ElevenLabsClientConfig, +): ElevenLabsSpeechAdapter { + return new ElevenLabsSpeechAdapter(model, config) +} + +/** + * Create an ElevenLabs speech adapter with an explicit API key. + */ +export function createElevenLabsSpeech( + model: TModel, + apiKey: string, + config?: Omit, +): ElevenLabsSpeechAdapter { + return new ElevenLabsSpeechAdapter(model, { apiKey, ...config }) +} diff --git a/packages/typescript/ai-elevenlabs/src/adapters/transcription.ts b/packages/typescript/ai-elevenlabs/src/adapters/transcription.ts new file mode 100644 index 000000000..2b110d18d --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/adapters/transcription.ts @@ -0,0 +1,338 @@ +import { BaseTranscriptionAdapter } from '@tanstack/ai/adapters' +import { + createElevenLabsClient, + dataUrlToBlob, + generateId, +} from '../utils/client' +import type { ElevenLabsClient } from '@elevenlabs/elevenlabs-js' +import type { + TranscriptionOptions, + TranscriptionResult, + TranscriptionSegment, + TranscriptionWord, +} from '@tanstack/ai' +import type { ElevenLabsClientConfig } from '../utils/client' +import type { ElevenLabsTranscriptionModel } from '../model-meta' + +/** + * Provider-specific options for ElevenLabs Scribe transcription. Fields map + * 1:1 onto the SDK's `BodySpeechToTextV1SpeechToTextPost` — mirroring the + * names so documentation stays useful. 
+ * @see https://elevenlabs.io/docs/api-reference/speech-to-text/convert + */ +export interface ElevenLabsTranscriptionProviderOptions { + /** Annotate non-speech events like (laughter), (footsteps), …. */ + tagAudioEvents?: boolean + /** Maximum number of speakers in the audio (1..32). */ + numSpeakers?: number + /** Timestamp granularity for words. */ + timestampsGranularity?: 'word' | 'character' | 'none' + /** Enable speaker diarization. */ + diarize?: boolean + /** Diarization threshold (requires `diarize=true` and no `numSpeakers`). */ + diarizationThreshold?: number + /** Detect speaker roles (agent/customer). Requires diarize=true. */ + detectSpeakerRoles?: boolean + /** Bias the model towards these keyterms (max 1000). */ + keyterms?: Array + /** + * Entity detection: `'all'`, a category (`'pii'`, `'phi'`, `'pci'`, + * `'other'`, `'offensive_language'`), or a specific entity type. + */ + entityDetection?: string + /** Redact entities from the transcript text. Must be a subset of `entityDetection`. */ + entityRedaction?: string + /** How redacted entities are formatted. */ + entityRedactionMode?: string + /** Whether to skip filler words / non-speech sounds (scribe_v2 only). */ + noVerbatim?: boolean + /** Sampling temperature (0..2). */ + temperature?: number + /** Deterministic sampling seed (0..2147483647). */ + seed?: number + /** Use `false` for zero-retention mode (enterprise only). */ + enableLogging?: boolean + /** Multi-channel audio with one speaker per channel. Max 5 channels. */ + useMultiChannel?: boolean + /** + * Hint for audio format. Use `'pcm_s16le_16'` to skip encoding for 16-bit + * PCM @ 16kHz mono little-endian inputs (lower latency). + */ + fileFormat?: 'pcm_s16le_16' | 'other' +} + +/** + * ElevenLabs speech-to-text adapter built on the official SDK's Scribe family. 
+ * + * @example + * ```ts + * const adapter = elevenlabsTranscription('scribe_v1') + * const result = await generateTranscription({ + * adapter, + * audio: fileInput, + * language: 'en', + * }) + * ``` + */ +export class ElevenLabsTranscriptionAdapter< + TModel extends ElevenLabsTranscriptionModel, +> extends BaseTranscriptionAdapter< + TModel, + ElevenLabsTranscriptionProviderOptions +> { + readonly name = 'elevenlabs' as const + + private client: ElevenLabsClient + + constructor(model: TModel, config?: ElevenLabsClientConfig) { + super(model, config ?? {}) + this.client = createElevenLabsClient(config) + } + + async transcribe( + options: TranscriptionOptions, + ): Promise { + const { logger } = options + logger.request( + `activity=generateTranscription provider=elevenlabs model=${this.model}`, + { provider: 'elevenlabs', model: this.model }, + ) + try { + const modelOpts = options.modelOptions ?? {} + const audioInput = normalizeAudioInput(options.audio) + + const response = await this.client.speechToText.convert({ + modelId: this.model, + ...(audioInput.kind === 'file' + ? { file: audioInput.value } + : { cloudStorageUrl: audioInput.value }), + ...(options.language ? { languageCode: options.language } : {}), + ...(modelOpts.tagAudioEvents != null + ? { tagAudioEvents: modelOpts.tagAudioEvents } + : {}), + ...(modelOpts.numSpeakers != null + ? { numSpeakers: modelOpts.numSpeakers } + : {}), + ...(modelOpts.timestampsGranularity + ? { timestampsGranularity: modelOpts.timestampsGranularity } + : {}), + ...(modelOpts.diarize != null ? { diarize: modelOpts.diarize } : {}), + ...(modelOpts.diarizationThreshold != null + ? { diarizationThreshold: modelOpts.diarizationThreshold } + : {}), + ...(modelOpts.detectSpeakerRoles != null + ? { detectSpeakerRoles: modelOpts.detectSpeakerRoles } + : {}), + ...(modelOpts.keyterms ? { keyterms: modelOpts.keyterms } : {}), + ...(modelOpts.entityDetection + ? 
{ entityDetection: modelOpts.entityDetection } + : {}), + ...(modelOpts.entityRedaction + ? { entityRedaction: modelOpts.entityRedaction } + : {}), + ...(modelOpts.entityRedactionMode + ? { entityRedactionMode: modelOpts.entityRedactionMode } + : {}), + ...(modelOpts.noVerbatim != null + ? { noVerbatim: modelOpts.noVerbatim } + : {}), + ...(modelOpts.temperature != null + ? { temperature: modelOpts.temperature } + : {}), + ...(modelOpts.seed != null ? { seed: modelOpts.seed } : {}), + ...(modelOpts.enableLogging != null + ? { enableLogging: modelOpts.enableLogging } + : {}), + ...(modelOpts.useMultiChannel != null + ? { useMultiChannel: modelOpts.useMultiChannel } + : {}), + ...(modelOpts.fileFormat ? { fileFormat: modelOpts.fileFormat } : {}), + } as Parameters[0]) + + return this.transformResponse(response) + } catch (error) { + logger.errors('elevenlabs.generateTranscription fatal', { + error, + source: 'elevenlabs.generateTranscription', + }) + throw error + } + } + + private transformResponse( + response: Awaited>, + ): TranscriptionResult { + // The SDK types this as a union of single- and multi-channel responses. + // We treat multi-channel as "join the channel transcripts" — consumers + // who care about per-channel detail can re-parse from `modelOptions`. + const data = response as unknown as { + text?: string + languageCode?: string + languageProbability?: number + words?: Array<{ + text: string + start?: number + end?: number + type: string + speakerId?: string + }> + audioDurationSecs?: number + transcripts?: Array<{ + text?: string + languageCode?: string + words?: Array<{ + text: string + start?: number + end?: number + type: string + speakerId?: string + }> + audioDurationSecs?: number + }> + } + + if (data.transcripts) { + const joinedText = data.transcripts + .map((t) => t.text ?? '') + .filter(Boolean) + .join('\n') + const joinedWords = data.transcripts.flatMap((t) => t.words ?? 
[])
+      const duration = data.transcripts.reduce(
+        (max, t) => Math.max(max, t.audioDurationSecs ?? 0),
+        0,
+      )
+      const firstLang = data.transcripts.find(
+        (t) => t.languageCode,
+      )?.languageCode
+      return {
+        id: generateId(this.name),
+        model: this.model,
+        text: joinedText,
+        ...(firstLang ? { language: firstLang } : {}),
+        ...(duration ? { duration } : {}),
+        ...buildWordsAndSegments(joinedWords),
+      }
+    }
+
+    return {
+      id: generateId(this.name),
+      model: this.model,
+      text: data.text ?? '',
+      ...(data.languageCode ? { language: data.languageCode } : {}),
+      ...(data.audioDurationSecs ? { duration: data.audioDurationSecs } : {}),
+      ...buildWordsAndSegments(data.words ?? []),
+    }
+  }
+
+  protected override generateId(): string {
+    return generateId(this.name)
+  }
+}
+
+type NormalizedAudio =
+  | { kind: 'file'; value: Blob }
+  | { kind: 'url'; value: string }
+
+function normalizeAudioInput(
+  audio: TranscriptionOptions['audio'],
+): NormalizedAudio {
+  if (audio instanceof ArrayBuffer) {
+    return { kind: 'file', value: new Blob([audio]) }
+  }
+  if (typeof audio === 'string') {
+    const blob = dataUrlToBlob(audio)
+    if (blob) return { kind: 'file', value: blob }
+    return { kind: 'url', value: audio }
+  }
+  // Blob or File both fit the SDK's `Uploadable` contract.
+  return { kind: 'file', value: audio }
+}
+
+function buildWordsAndSegments(
+  words: Array<{
+    text: string
+    start?: number
+    end?: number
+    type: string
+    speakerId?: string
+  }>,
+): {
+  words?: Array<TranscriptionWord>
+  segments?: Array<TranscriptionSegment>
+} {
+  const timedWords = words.filter(
+    (w) =>
+      typeof w.start === 'number' &&
+      typeof w.end === 'number' &&
+      w.type !== 'spacing',
+  )
+  if (timedWords.length === 0) return {}
+
+  const outWords: Array<TranscriptionWord> = timedWords.map((w) => ({
+    word: w.text,
+    start: w.start!,
+    end: w.end!,
+  }))
+
+  // Group contiguous words that share a speaker into segments. If no speaker
+  // is ever set, all words collapse into one single segment.
+ const segments: Array = [] + let current: { + start: number + end: number + text: string + speaker?: string + } | null = null + + for (const w of timedWords) { + if (!current) { + current = { + start: w.start!, + end: w.end!, + text: w.text, + ...(w.speakerId ? { speaker: w.speakerId } : {}), + } + continue + } + if (w.speakerId && current.speaker !== w.speakerId) { + segments.push({ id: segments.length, ...current }) + current = { + start: w.start!, + end: w.end!, + text: w.text, + speaker: w.speakerId, + } + continue + } + current.end = w.end! + current.text = current.text ? `${current.text} ${w.text}` : w.text + } + if (current) segments.push({ id: segments.length, ...current }) + + return { words: outWords, segments } +} + +/** + * Create an ElevenLabs transcription adapter using `ELEVENLABS_API_KEY` from env. + */ +export function elevenlabsTranscription< + TModel extends ElevenLabsTranscriptionModel, +>( + model: TModel, + config?: ElevenLabsClientConfig, +): ElevenLabsTranscriptionAdapter { + return new ElevenLabsTranscriptionAdapter(model, config) +} + +/** + * Create an ElevenLabs transcription adapter with an explicit API key. 
+ */ +export function createElevenLabsTranscription< + TModel extends ElevenLabsTranscriptionModel, +>( + model: TModel, + apiKey: string, + config?: Omit, +): ElevenLabsTranscriptionAdapter { + return new ElevenLabsTranscriptionAdapter(model, { apiKey, ...config }) +} diff --git a/packages/typescript/ai-elevenlabs/src/index.ts b/packages/typescript/ai-elevenlabs/src/index.ts index 8f3789e84..8e5ffb7d4 100644 --- a/packages/typescript/ai-elevenlabs/src/index.ts +++ b/packages/typescript/ai-elevenlabs/src/index.ts @@ -11,3 +11,67 @@ export type { ElevenLabsVADConfig, ElevenLabsClientTool, } from './realtime/index' + +// ============================================================================ +// Speech (Text-to-Speech) Adapter +// ============================================================================ + +export { + ElevenLabsSpeechAdapter, + createElevenLabsSpeech, + elevenlabsSpeech, + type ElevenLabsSpeechProviderOptions, + type ElevenLabsVoiceSettings, +} from './adapters/speech' + +// ============================================================================ +// Audio (Music + Sound Effects) Adapter +// ============================================================================ + +export { + ElevenLabsAudioAdapter, + createElevenLabsAudio, + elevenlabsAudio, + type ElevenLabsAudioProviderOptions, + type ElevenLabsMusicProviderOptions, + type ElevenLabsSoundEffectsProviderOptions, + type ElevenLabsMusicCompositionPlan, +} from './adapters/audio' + +// ============================================================================ +// Transcription (Speech-to-Text) Adapter +// ============================================================================ + +export { + ElevenLabsTranscriptionAdapter, + createElevenLabsTranscription, + elevenlabsTranscription, + type ElevenLabsTranscriptionProviderOptions, +} from './adapters/transcription' + +// ============================================================================ +// Model Metadata +// 
============================================================================ + +export { + ELEVENLABS_TTS_MODELS, + ELEVENLABS_AUDIO_MODELS, + ELEVENLABS_TRANSCRIPTION_MODELS, + isElevenLabsMusicModel, + isElevenLabsSoundEffectsModel, + type ElevenLabsTTSModel, + type ElevenLabsAudioModel, + type ElevenLabsMusicModel, + type ElevenLabsSoundEffectsModel, + type ElevenLabsTranscriptionModel, + type ElevenLabsOutputFormat, +} from './model-meta' + +// ============================================================================ +// Utilities +// ============================================================================ + +export { + getElevenLabsApiKeyFromEnv, + type ElevenLabsClientConfig, +} from './utils/index' diff --git a/packages/typescript/ai-elevenlabs/src/model-meta.ts b/packages/typescript/ai-elevenlabs/src/model-meta.ts new file mode 100644 index 000000000..693d019f8 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/model-meta.ts @@ -0,0 +1,79 @@ +import type { ElevenLabs } from '@elevenlabs/elevenlabs-js' + +/** + * ElevenLabs model identifiers. The lists below are the source of truth — + * callers are blocked from passing unknown model IDs. Keep them in sync with + * the ElevenLabs SDK via the automated update pipeline. + */ + +/** + * Text-to-speech models. + * @see https://elevenlabs.io/docs/models + */ +export const ELEVENLABS_TTS_MODELS = [ + 'eleven_v3', + 'eleven_multilingual_v2', + 'eleven_flash_v2_5', + 'eleven_flash_v2', + 'eleven_turbo_v2_5', + 'eleven_turbo_v2', + 'eleven_monolingual_v1', +] as const + +export type ElevenLabsTTSModel = (typeof ELEVENLABS_TTS_MODELS)[number] + +/** + * Audio generation models — music (`music_v1`) + sound effects + * (`eleven_text_to_sound_v*`) share one `generateAudio` adapter. + * The adapter dispatches by model id so callers pick behavior via the model. 
+ * + * @see https://elevenlabs.io/docs/overview/capabilities/music + * @see https://elevenlabs.io/docs/overview/capabilities/sound-effects + */ +export const ELEVENLABS_AUDIO_MODELS = [ + 'music_v1', + 'eleven_text_to_sound_v2', + 'eleven_text_to_sound_v1', +] as const + +export type ElevenLabsAudioModel = (typeof ELEVENLABS_AUDIO_MODELS)[number] + +/** Music models within the audio family. */ +export type ElevenLabsMusicModel = 'music_v1' +/** SFX models within the audio family. */ +export type ElevenLabsSoundEffectsModel = + | 'eleven_text_to_sound_v2' + | 'eleven_text_to_sound_v1' + +export function isElevenLabsMusicModel( + model: string, +): model is ElevenLabsMusicModel { + return model === 'music_v1' +} + +export function isElevenLabsSoundEffectsModel( + model: string, +): model is ElevenLabsSoundEffectsModel { + return model.startsWith('eleven_text_to_sound_') +} + +/** + * Speech-to-text (transcription) models — Scribe family. + * @see https://elevenlabs.io/docs/overview/capabilities/speech-to-text + */ +export const ELEVENLABS_TRANSCRIPTION_MODELS = [ + 'scribe_v2', + 'scribe_v1', +] as const + +export type ElevenLabsTranscriptionModel = + (typeof ELEVENLABS_TRANSCRIPTION_MODELS)[number] + +/** + * Supported `output_format` strings, encoded as `codec_samplerate[_bitrate]`. + * Aliased to the SDK's `AllowedOutputFormats` so the list stays in sync + * automatically whenever the `@elevenlabs/elevenlabs-js` dependency is bumped. 
+ * + * @see https://elevenlabs.io/docs/api-reference/text-to-speech/convert + */ +export type ElevenLabsOutputFormat = ElevenLabs.AllowedOutputFormats diff --git a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts index a347d013c..9dbfabc31 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/adapter.ts @@ -1,4 +1,4 @@ -import { Conversation } from '@11labs/client' +import { Conversation } from '@elevenlabs/client' import { resolveDebugOption } from '@tanstack/ai/adapter-internals' import type { AnyClientTool, @@ -18,7 +18,7 @@ import type { ElevenLabsRealtimeOptions } from './types' /** * Creates an ElevenLabs realtime adapter for client-side use. * - * Wraps the @11labs/client SDK for voice conversations. + * Wraps the @elevenlabs/client SDK for voice conversations. * * @param options - Optional configuration * @returns A RealtimeAdapter for use with RealtimeClient @@ -91,7 +91,7 @@ async function createElevenLabsConnection( } // Convert TanStack tool definitions to ElevenLabs clientTools format. - // @11labs/client@0.2.0 expects plain async functions, not objects. + // @elevenlabs/client expects plain async functions, not objects. 
const elevenLabsClientTools: Record< string, (params: unknown) => Promise diff --git a/packages/typescript/ai-elevenlabs/src/realtime/token.ts b/packages/typescript/ai-elevenlabs/src/realtime/token.ts index 030d0c9a9..6a115fe13 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/token.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/token.ts @@ -1,40 +1,19 @@ +import { + createElevenLabsClient, + getElevenLabsAgentIdFromEnv, +} from '../utils/client' import type { RealtimeToken, RealtimeTokenAdapter } from '@tanstack/ai' import type { ElevenLabsRealtimeTokenOptions } from './types' -const ELEVENLABS_API_URL = 'https://api.elevenlabs.io/v1' - -/** - * Get ElevenLabs API key from environment - */ -function getElevenLabsApiKey(): string { - // Check process.env (Node.js) - if (typeof process !== 'undefined' && process.env.ELEVENLABS_API_KEY) { - return process.env.ELEVENLABS_API_KEY - } - - // Check window.env (Browser with injected env) - if ( - typeof window !== 'undefined' && - (window as unknown as { env?: { ELEVENLABS_API_KEY?: string } }).env - ?.ELEVENLABS_API_KEY - ) { - return (window as unknown as { env: { ELEVENLABS_API_KEY: string } }).env - .ELEVENLABS_API_KEY - } - - throw new Error( - 'ELEVENLABS_API_KEY not found in environment variables. ' + - 'Please set ELEVENLABS_API_KEY in your environment.', - ) -} - /** * Creates an ElevenLabs realtime token adapter. * - * This adapter generates signed URLs for client-side connections. - * The signed URL is valid for 30 minutes. + * Uses the official `@elevenlabs/elevenlabs-js` SDK to request a signed URL + * for client-side conversation connections. The signed URL is valid for + * 30 minutes. * - * @param options - Configuration options including agentId + * @param options - Configuration. `agentId` falls back to + * `ELEVENLABS_AGENT_ID` in the environment when omitted. 
* @returns A RealtimeTokenAdapter for use with realtimeToken() * * @example @@ -42,51 +21,37 @@ function getElevenLabsApiKey(): string { * import { realtimeToken } from '@tanstack/ai' * import { elevenlabsRealtimeToken } from '@tanstack/ai-elevenlabs' * + * // Reads ELEVENLABS_AGENT_ID from env: + * const token = await realtimeToken({ adapter: elevenlabsRealtimeToken() }) + * + * // Or pass explicitly: * const token = await realtimeToken({ - * adapter: elevenlabsRealtimeToken({ - * agentId: 'your-agent-id', - * }), + * adapter: elevenlabsRealtimeToken({ agentId: 'your-agent-id' }), * }) * ``` */ export function elevenlabsRealtimeToken( - options: ElevenLabsRealtimeTokenOptions, + options: ElevenLabsRealtimeTokenOptions = {}, ): RealtimeTokenAdapter { - const apiKey = getElevenLabsApiKey() + const client = createElevenLabsClient() return { provider: 'elevenlabs', async generateToken(): Promise { - const { agentId, overrides } = options + const { overrides } = options + const agentId = options.agentId ?? 
getElevenLabsAgentIdFromEnv() - // Get signed URL from ElevenLabs - const response = await fetch( - `${ELEVENLABS_API_URL}/convai/conversation/get_signed_url?agent_id=${agentId}`, - { - method: 'GET', - headers: { - 'xi-api-key': apiKey, - }, - }, + const response = await client.conversationalAi.conversations.getSignedUrl( + { agentId }, ) - if (!response.ok) { - const errorText = await response.text() - throw new Error( - `ElevenLabs signed URL request failed: ${response.status} ${errorText}`, - ) - } - - const data = await response.json() - const signedUrl = data.signed_url as string - // Signed URLs are valid for 30 minutes const expiresAt = Date.now() + 30 * 60 * 1000 return { provider: 'elevenlabs', - token: signedUrl, + token: response.signedUrl, expiresAt, config: { voice: overrides?.voiceId, diff --git a/packages/typescript/ai-elevenlabs/src/realtime/types.ts b/packages/typescript/ai-elevenlabs/src/realtime/types.ts index 12d7714f6..ee32ff917 100644 --- a/packages/typescript/ai-elevenlabs/src/realtime/types.ts +++ b/packages/typescript/ai-elevenlabs/src/realtime/types.ts @@ -4,8 +4,11 @@ import type { DebugOption } from '@tanstack/ai' * Options for the ElevenLabs realtime token adapter */ export interface ElevenLabsRealtimeTokenOptions { - /** Agent ID configured in ElevenLabs dashboard */ - agentId: string + /** + * Agent ID configured in ElevenLabs dashboard. Falls back to + * `ELEVENLABS_AGENT_ID` in the environment when omitted. 
+ */ + agentId?: string /** Optional override values for the agent */ overrides?: { /** Custom voice ID to use */ diff --git a/packages/typescript/ai-elevenlabs/src/utils/client.ts b/packages/typescript/ai-elevenlabs/src/utils/client.ts new file mode 100644 index 000000000..86e431d82 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/utils/client.ts @@ -0,0 +1,193 @@ +import { ElevenLabsClient } from '@elevenlabs/elevenlabs-js' +import type { ElevenLabsOutputFormat } from '../model-meta' + +/** + * Configuration for any ElevenLabs adapter. When `apiKey` is omitted we read + * `ELEVENLABS_API_KEY` from `process.env` / `window.env` to match the + * pattern the realtime adapters already use. + */ +export interface ElevenLabsClientConfig { + apiKey?: string + /** Override the API base URL — handy for tests + self-hosted proxies. */ + baseUrl?: string + /** Per-request timeout passed through to the SDK (ms). */ + timeoutInSeconds?: number + /** Override the number of SDK-level retries. */ + maxRetries?: number +} + +interface EnvObject { + ELEVENLABS_API_KEY?: string + ELEVENLABS_AGENT_ID?: string +} + +interface WindowWithEnv { + env?: EnvObject +} + +function getEnvironment(): EnvObject | undefined { + if (typeof globalThis !== 'undefined') { + const win = (globalThis as { window?: WindowWithEnv }).window + if (win?.env) return win.env + } + if (typeof process !== 'undefined') { + return process.env as EnvObject + } + return undefined +} + +export function getElevenLabsApiKeyFromEnv(): string { + const key = getEnvironment()?.ELEVENLABS_API_KEY + if (!key) { + throw new Error( + 'ELEVENLABS_API_KEY is required. Please set it in your environment ' + + 'variables or pass it explicitly to the adapter factory.', + ) + } + return key +} + +export function getElevenLabsAgentIdFromEnv(): string { + const id = getEnvironment()?.ELEVENLABS_AGENT_ID + if (!id) { + throw new Error( + 'ELEVENLABS_AGENT_ID is required. 
Please set it in your environment ' + + 'variables or pass `agentId` explicitly to elevenlabsRealtimeToken().', + ) + } + return id +} + +/** + * Build an `ElevenLabsClient` with env-based or explicit credentials. + * Each adapter calls this once at construction time so unit tests can + * pass in an explicit key without needing `process.env`. + */ +export function createElevenLabsClient( + config?: ElevenLabsClientConfig, +): ElevenLabsClient { + const apiKey = config?.apiKey ?? getElevenLabsApiKeyFromEnv() + return new ElevenLabsClient({ + apiKey, + ...(config?.baseUrl ? { baseUrl: config.baseUrl } : {}), + ...(config?.timeoutInSeconds != null + ? { timeoutInSeconds: config.timeoutInSeconds } + : {}), + ...(config?.maxRetries != null ? { maxRetries: config.maxRetries } : {}), + }) +} + +export function generateId(prefix: string): string { + return `${prefix}-${Date.now()}-${Math.random().toString(36).substring(2)}` +} + +/** + * Convert an ArrayBuffer to base64 in a cross-runtime way. + * + * The naive `btoa(String.fromCharCode(...bytes))` form blows up V8's argument + * limit (~65k) on realistic audio payloads, so we either use `Buffer` + * (Node / Bun) or walk the byte array in a single loop (browser). + */ +export function arrayBufferToBase64(buffer: ArrayBuffer): string { + if (typeof Buffer !== 'undefined' && typeof Buffer.from === 'function') { + return Buffer.from(buffer).toString('base64') + } + const view = new Uint8Array(buffer) + let binary = '' + for (let i = 0; i < view.byteLength; i += 1) { + binary += String.fromCharCode(view[i]!) + } + return btoa(binary) +} + +/** + * Drain a `ReadableStream` (what the ElevenLabs SDK returns for + * audio endpoints) into a single `Uint8Array`, then expose it as an + * `ArrayBuffer` slice. We concatenate ourselves rather than going through + * `new Response(stream).arrayBuffer()` so we stay runtime-agnostic. 
+ */ +export async function readStreamToArrayBuffer( + stream: ReadableStream, +): Promise { + const reader = stream.getReader() + const chunks: Array = [] + let total = 0 + try { + let result = await reader.read() + while (!result.done) { + chunks.push(result.value) + total += result.value.byteLength + result = await reader.read() + } + } finally { + reader.releaseLock() + } + const merged = new Uint8Array(total) + let offset = 0 + for (const chunk of chunks) { + merged.set(chunk, offset) + offset += chunk.byteLength + } + return merged.buffer.slice( + merged.byteOffset, + merged.byteOffset + merged.byteLength, + ) +} + +/** + * Decode a `data:` URL into a Blob for upload to the ElevenLabs STT API + * (which only accepts multipart files or https URLs). Supports base64 and + * URL-encoded payloads. Returns `undefined` for non-data-URL strings so the + * caller can fall through to treating the input as an https URL. + */ +export function dataUrlToBlob(value: string): Blob | undefined { + if (!value.startsWith('data:')) return undefined + const commaIndex = value.indexOf(',') + if (commaIndex === -1) return undefined + + const header = value.slice(5, commaIndex) + const payload = value.slice(commaIndex + 1) + const isBase64 = /;base64$/i.test(header) + const mimeType = header.split(';')[0] || 'application/octet-stream' + + if (isBase64) { + const binary = atob(payload) + const bytes = new Uint8Array(binary.length) + for (let i = 0; i < binary.length; i += 1) { + bytes[i] = binary.charCodeAt(i) + } + return new Blob([bytes], { type: mimeType }) + } + + return new Blob([decodeURIComponent(payload)], { type: mimeType }) +} + +/** + * Break an ElevenLabs `output_format` string (`mp3_44100_128`, + * `pcm_24000`, `opus_48000_64`, `ulaw_8000`, ...) into a file extension and + * content-type suitable for `TTSResult` / `AudioGenerationResult` consumers. 
+ * + * Unknown codecs fall back to `mp3` / `audio/mpeg` because the ElevenLabs + * default is `mp3_44100_128` — mispredicting on an exotic format is safer + * than throwing in the adapter. + */ +export function parseOutputFormat(fmt: ElevenLabsOutputFormat | undefined): { + format: string + contentType: string +} { + const codec = (fmt || 'mp3_44100_128').split('_')[0]?.toLowerCase() + switch (codec) { + case 'mp3': + return { format: 'mp3', contentType: 'audio/mpeg' } + case 'pcm': + return { format: 'pcm', contentType: 'audio/pcm' } + case 'opus': + return { format: 'opus', contentType: 'audio/opus' } + case 'ulaw': + return { format: 'ulaw', contentType: 'audio/basic' } + case 'alaw': + return { format: 'alaw', contentType: 'audio/x-alaw-basic' } + default: + return { format: 'mp3', contentType: 'audio/mpeg' } + } +} diff --git a/packages/typescript/ai-elevenlabs/src/utils/index.ts b/packages/typescript/ai-elevenlabs/src/utils/index.ts new file mode 100644 index 000000000..8d017e68c --- /dev/null +++ b/packages/typescript/ai-elevenlabs/src/utils/index.ts @@ -0,0 +1,10 @@ +export { + arrayBufferToBase64, + createElevenLabsClient, + dataUrlToBlob, + generateId, + getElevenLabsApiKeyFromEnv, + parseOutputFormat, + readStreamToArrayBuffer, + type ElevenLabsClientConfig, +} from './client' diff --git a/packages/typescript/ai-elevenlabs/tests/audio-adapter.test.ts b/packages/typescript/ai-elevenlabs/tests/audio-adapter.test.ts new file mode 100644 index 000000000..af4988251 --- /dev/null +++ b/packages/typescript/ai-elevenlabs/tests/audio-adapter.test.ts @@ -0,0 +1,165 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' +import type { AudioGenerationOptions } from '@tanstack/ai' + +const composeMock = vi.fn() +const sfxConvertMock = vi.fn() + +vi.mock('@elevenlabs/elevenlabs-js', () => ({ + ElevenLabsClient: class { + music = { compose: composeMock } + textToSoundEffects = { convert: sfxConvertMock } + }, +})) + +import { elevenlabsAudio } from 
'../src/adapters/audio' + +function makeLogger() { + return { + request: vi.fn(), + response: vi.fn(), + provider: vi.fn(), + errors: vi.fn(), + } as unknown as AudioGenerationOptions['logger'] +} + +function makeStream(bytes: Uint8Array): ReadableStream { + return new ReadableStream({ + start(controller) { + controller.enqueue(bytes) + controller.close() + }, + }) +} + +describe('elevenlabsAudio adapter — music_v1', () => { + beforeEach(() => { + composeMock.mockReset() + sfxConvertMock.mockReset() + }) + + it('calls client.music.compose with prompt + duration in ms', async () => { + composeMock.mockResolvedValue(makeStream(new Uint8Array([1, 2]))) + const adapter = elevenlabsAudio('music_v1', { apiKey: 'k' }) + + const result = await adapter.generateAudio({ + model: 'music_v1', + prompt: 'jazz trio', + duration: 15, + logger: makeLogger(), + }) + + expect(sfxConvertMock).not.toHaveBeenCalled() + expect(composeMock).toHaveBeenCalledTimes(1) + expect(composeMock.mock.calls[0]![0]).toMatchObject({ + modelId: 'music_v1', + prompt: 'jazz trio', + musicLengthMs: 15000, + }) + expect(result.audio.b64Json).toBe(Buffer.from([1, 2]).toString('base64')) + }) + + it('drops prompt + duration when compositionPlan is supplied', async () => { + composeMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsAudio('music_v1', { apiKey: 'k' }) + + await adapter.generateAudio({ + model: 'music_v1', + prompt: 'ignored', + duration: 20, + modelOptions: { + compositionPlan: { + positiveGlobalStyles: ['jazz'], + sections: [ + { + sectionName: 'verse', + durationMs: 8000, + lines: ['hello'], + }, + ], + }, + }, + logger: makeLogger(), + }) + + const body = composeMock.mock.calls[0]![0] + expect(body.prompt).toBeUndefined() + expect(body.musicLengthMs).toBeUndefined() + expect(body.compositionPlan).toMatchObject({ + positiveGlobalStyles: ['jazz'], + sections: [ + expect.objectContaining({ + sectionName: 'verse', + durationMs: 8000, + lines: ['hello'], + }), + ], + 
}) + }) +}) + +describe('elevenlabsAudio adapter — sound effects', () => { + beforeEach(() => { + composeMock.mockReset() + sfxConvertMock.mockReset() + }) + + it('calls client.textToSoundEffects.convert with text + duration', async () => { + sfxConvertMock.mockResolvedValue(makeStream(new Uint8Array([9]))) + const adapter = elevenlabsAudio('eleven_text_to_sound_v2', { + apiKey: 'k', + }) + + const result = await adapter.generateAudio({ + model: 'eleven_text_to_sound_v2', + prompt: 'glass breaking', + duration: 3, + modelOptions: { promptInfluence: 0.7, loop: true }, + logger: makeLogger(), + }) + + expect(composeMock).not.toHaveBeenCalled() + expect(sfxConvertMock).toHaveBeenCalledTimes(1) + expect(sfxConvertMock.mock.calls[0]![0]).toMatchObject({ + text: 'glass breaking', + modelId: 'eleven_text_to_sound_v2', + durationSeconds: 3, + promptInfluence: 0.7, + loop: true, + }) + expect(result.audio.b64Json).toBe(Buffer.from([9]).toString('base64')) + expect(result.audio.duration).toBe(3) + }) + + it('routes eleven_text_to_sound_v1 to the SFX endpoint too', async () => { + sfxConvertMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsAudio('eleven_text_to_sound_v1', { + apiKey: 'k', + }) + await adapter.generateAudio({ + model: 'eleven_text_to_sound_v1', + prompt: 'rain', + logger: makeLogger(), + }) + expect(sfxConvertMock).toHaveBeenCalled() + }) +}) + +describe('elevenlabsAudio adapter — unknown model', () => { + beforeEach(() => { + composeMock.mockReset() + sfxConvertMock.mockReset() + }) + + it('throws a helpful error for unrecognized models', async () => { + const adapter = elevenlabsAudio('not-a-real-model', { apiKey: 'k' }) + const logger = makeLogger() + await expect( + adapter.generateAudio({ + model: 'not-a-real-model', + prompt: 'x', + logger, + }), + ).rejects.toThrow(/Unsupported ElevenLabs audio model/i) + expect(logger.errors).toHaveBeenCalled() + }) +}) diff --git 
a/packages/typescript/ai-elevenlabs/tests/realtime-adapter.test.ts b/packages/typescript/ai-elevenlabs/tests/realtime-adapter.test.ts index 91605af1e..a4cbb8a4a 100644 --- a/packages/typescript/ai-elevenlabs/tests/realtime-adapter.test.ts +++ b/packages/typescript/ai-elevenlabs/tests/realtime-adapter.test.ts @@ -5,7 +5,7 @@ import type { AnyClientTool, RealtimeMessage } from '@tanstack/ai' // Capture the session options passed to Conversation.startSession let capturedSessionOptions: Record = {} -vi.mock('@11labs/client', () => ({ +vi.mock('@elevenlabs/client', () => ({ Conversation: { startSession: vi.fn(async (options: Record) => { capturedSessionOptions = options @@ -146,7 +146,7 @@ describe('elevenlabsRealtime adapter', () => { }) describe('clientTools registration', () => { - it('should pass client tools as plain functions to @11labs/client', async () => { + it('should pass client tools as plain functions to @elevenlabs/client', async () => { const mockTool: AnyClientTool = { name: 'get_weather', description: 'Get current weather', diff --git a/packages/typescript/ai-elevenlabs/tests/speech-adapter.test.ts b/packages/typescript/ai-elevenlabs/tests/speech-adapter.test.ts new file mode 100644 index 000000000..6c1041abf --- /dev/null +++ b/packages/typescript/ai-elevenlabs/tests/speech-adapter.test.ts @@ -0,0 +1,166 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' +import type { TTSOptions } from '@tanstack/ai' + +const convertMock = vi.fn() + +vi.mock('@elevenlabs/elevenlabs-js', () => ({ + ElevenLabsClient: class { + textToSpeech = { convert: convertMock } + }, +})) + +import { elevenlabsSpeech } from '../src/adapters/speech' + +function makeLogger() { + return { + request: vi.fn(), + response: vi.fn(), + provider: vi.fn(), + errors: vi.fn(), + } as unknown as TTSOptions['logger'] +} + +function makeStream(bytes: Uint8Array): ReadableStream { + return new ReadableStream({ + start(controller) { + controller.enqueue(bytes) + controller.close() + 
}, + }) +} + +describe('elevenlabsSpeech adapter', () => { + beforeEach(() => { + convertMock.mockReset() + }) + + it('forwards text + modelId + voiceId to the SDK and returns base64', async () => { + convertMock.mockResolvedValue(makeStream(new Uint8Array([1, 2, 3]))) + const adapter = elevenlabsSpeech('eleven_multilingual_v2', { + apiKey: 'test-key', + }) + + const result = await adapter.generateSpeech({ + model: 'eleven_multilingual_v2', + text: 'Hello there', + voice: 'voice-1', + logger: makeLogger(), + }) + + expect(convertMock).toHaveBeenCalledTimes(1) + const [voiceId, body] = convertMock.mock.calls[0]! + expect(voiceId).toBe('voice-1') + expect(body).toMatchObject({ + text: 'Hello there', + modelId: 'eleven_multilingual_v2', + }) + expect(result).toMatchObject({ + model: 'eleven_multilingual_v2', + audio: Buffer.from([1, 2, 3]).toString('base64'), + format: 'mp3', + contentType: 'audio/mpeg', + }) + expect(result.id).toMatch(/^elevenlabs-/) + }) + + it('prefers options.voice over modelOptions.voiceId', async () => { + convertMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + + await adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + voice: 'explicit-voice', + modelOptions: { voiceId: 'fallback-voice' }, + logger: makeLogger(), + }) + + expect(convertMock.mock.calls[0]![0]).toBe('explicit-voice') + }) + + it('falls back to modelOptions.voiceId when options.voice is missing', async () => { + convertMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + + await adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + modelOptions: { voiceId: 'fallback-voice' }, + logger: makeLogger(), + }) + + expect(convertMock.mock.calls[0]![0]).toBe('fallback-voice') + }) + + it('throws when no voice is provided', async () => { + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + const logger = makeLogger() + 
+ await expect( + adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + logger, + }), + ).rejects.toThrow(/requires a voice/i) + expect(logger.errors).toHaveBeenCalled() + }) + + it('translates TTSOptions.format to the closest ElevenLabs outputFormat', async () => { + convertMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + + const result = await adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + voice: 'v', + format: 'pcm', + logger: makeLogger(), + }) + + expect(convertMock.mock.calls[0]![1].outputFormat).toBe('pcm_44100') + expect(result.format).toBe('pcm') + expect(result.contentType).toBe('audio/pcm') + }) + + it('merges voiceSettings and promotes options.speed', async () => { + convertMock.mockResolvedValue(makeStream(new Uint8Array())) + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + + await adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + voice: 'v', + speed: 1.25, + modelOptions: { + voiceSettings: { stability: 0.4, similarityBoost: 0.6 }, + }, + logger: makeLogger(), + }) + + expect(convertMock.mock.calls[0]![1].voiceSettings).toEqual({ + stability: 0.4, + similarityBoost: 0.6, + speed: 1.25, + }) + }) + + it('reports SDK errors through logger.errors', async () => { + convertMock.mockRejectedValue(new Error('boom')) + const adapter = elevenlabsSpeech('eleven_v3', { apiKey: 'k' }) + const logger = makeLogger() + + await expect( + adapter.generateSpeech({ + model: 'eleven_v3', + text: 'hi', + voice: 'v', + logger, + }), + ).rejects.toThrow('boom') + expect(logger.errors).toHaveBeenCalledWith( + 'elevenlabs.generateSpeech fatal', + expect.objectContaining({ source: 'elevenlabs.generateSpeech' }), + ) + }) +}) diff --git a/packages/typescript/ai-elevenlabs/tests/transcription-adapter.test.ts b/packages/typescript/ai-elevenlabs/tests/transcription-adapter.test.ts new file mode 100644 index 000000000..d72e59e9f --- /dev/null +++ 
b/packages/typescript/ai-elevenlabs/tests/transcription-adapter.test.ts @@ -0,0 +1,163 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' +import type { TranscriptionOptions } from '@tanstack/ai' + +const convertMock = vi.fn() + +vi.mock('@elevenlabs/elevenlabs-js', () => ({ + ElevenLabsClient: class { + speechToText = { convert: convertMock } + }, +})) + +import { elevenlabsTranscription } from '../src/adapters/transcription' + +function makeLogger() { + return { + request: vi.fn(), + response: vi.fn(), + provider: vi.fn(), + errors: vi.fn(), + } as unknown as TranscriptionOptions['logger'] +} + +describe('elevenlabsTranscription adapter', () => { + beforeEach(() => { + convertMock.mockReset() + }) + + it('passes diarize + keyterms through to the SDK', async () => { + convertMock.mockResolvedValue({ + text: 'hello world', + languageCode: 'eng', + words: [], + audioDurationSecs: 1.2, + }) + const adapter = elevenlabsTranscription('scribe_v1', { apiKey: 'k' }) + + const result = await adapter.transcribe({ + model: 'scribe_v1', + audio: new Blob([new Uint8Array([1, 2, 3])]), + language: 'en', + modelOptions: { + diarize: true, + keyterms: ['foo', 'bar'], + timestampsGranularity: 'word', + }, + logger: makeLogger(), + }) + + expect(convertMock).toHaveBeenCalledTimes(1) + expect(convertMock.mock.calls[0]![0]).toMatchObject({ + modelId: 'scribe_v1', + languageCode: 'en', + diarize: true, + keyterms: ['foo', 'bar'], + timestampsGranularity: 'word', + }) + expect(result).toMatchObject({ + text: 'hello world', + language: 'eng', + duration: 1.2, + }) + }) + + it('decodes a data: URL audio input to a Blob file upload', async () => { + convertMock.mockResolvedValue({ text: '', words: [] }) + const adapter = elevenlabsTranscription('scribe_v1', { apiKey: 'k' }) + + const dataUrl = + 'data:audio/wav;base64,' + Buffer.from([10, 20, 30]).toString('base64') + + await adapter.transcribe({ + model: 'scribe_v1', + audio: dataUrl, + logger: makeLogger(), + }) + + const 
body = convertMock.mock.calls[0]![0] + expect(body.file).toBeInstanceOf(Blob) + expect(body.cloudStorageUrl).toBeUndefined() + }) + + it('treats a plain https string as cloudStorageUrl', async () => { + convertMock.mockResolvedValue({ text: '', words: [] }) + const adapter = elevenlabsTranscription('scribe_v1', { apiKey: 'k' }) + + await adapter.transcribe({ + model: 'scribe_v1', + audio: 'https://example.com/audio.mp3', + logger: makeLogger(), + }) + + const body = convertMock.mock.calls[0]![0] + expect(body.cloudStorageUrl).toBe('https://example.com/audio.mp3') + expect(body.file).toBeUndefined() + }) + + it('wraps ArrayBuffer inputs into a Blob file upload', async () => { + convertMock.mockResolvedValue({ text: '', words: [] }) + const adapter = elevenlabsTranscription('scribe_v1', { apiKey: 'k' }) + + const buffer = new Uint8Array([1, 2, 3]).buffer + + await adapter.transcribe({ + model: 'scribe_v1', + audio: buffer, + logger: makeLogger(), + }) + + const body = convertMock.mock.calls[0]![0] + expect(body.file).toBeInstanceOf(Blob) + }) + + it('builds word-level + diarized segments from the response', async () => { + convertMock.mockResolvedValue({ + text: 'hello world hi there', + languageCode: 'eng', + words: [ + { text: 'hello', start: 0, end: 0.4, type: 'word', speakerId: 's1' }, + { text: 'world', start: 0.4, end: 0.9, type: 'word', speakerId: 's1' }, + { text: ' ', start: 0.9, end: 1.0, type: 'spacing' }, + { text: 'hi', start: 1.0, end: 1.3, type: 'word', speakerId: 's2' }, + { text: 'there', start: 1.3, end: 1.8, type: 'word', speakerId: 's2' }, + ], + audioDurationSecs: 2, + }) + const adapter = elevenlabsTranscription('scribe_v2', { apiKey: 'k' }) + + const result = await adapter.transcribe({ + model: 'scribe_v2', + audio: new Blob(), + logger: makeLogger(), + }) + + expect(result.words).toHaveLength(4) + expect(result.segments).toHaveLength(2) + expect(result.segments?.[0]).toMatchObject({ + speaker: 's1', + text: 'hello world', + }) + 
expect(result.segments?.[1]).toMatchObject({ + speaker: 's2', + text: 'hi there', + }) + }) + + it('logs and rethrows SDK errors', async () => { + convertMock.mockRejectedValue(new Error('stt down')) + const adapter = elevenlabsTranscription('scribe_v1', { apiKey: 'k' }) + const logger = makeLogger() + + await expect( + adapter.transcribe({ + model: 'scribe_v1', + audio: new Blob(), + logger, + }), + ).rejects.toThrow('stt down') + expect(logger.errors).toHaveBeenCalledWith( + 'elevenlabs.generateTranscription fatal', + expect.objectContaining({ source: 'elevenlabs.generateTranscription' }), + ) + }) +}) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index eb41b0817..6d68fcccb 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1127,9 +1127,12 @@ importers: packages/typescript/ai-elevenlabs: dependencies: - '@11labs/client': - specifier: ^0.2.0 - version: 0.2.0(@types/dom-mediacapture-record@1.0.22) + '@elevenlabs/client': + specifier: ^1.3.1 + version: 1.3.1(@types/dom-mediacapture-record@1.0.22) + '@elevenlabs/elevenlabs-js': + specifier: ^2.44.0 + version: 2.44.0 devDependencies: '@tanstack/ai': specifier: workspace:* @@ -1671,7 +1674,7 @@ importers: dependencies: '@copilotkit/aimock': specifier: latest - version: 1.14.0 + version: 1.15.1(vitest@4.1.4(@types/node@24.10.3)(happy-dom@20.0.11)(jsdom@27.3.0(postcss@8.5.9))(vite@7.3.1(@types/node@24.10.3)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2))) '@tailwindcss/vite': specifier: ^4.1.18 version: 4.1.18(vite@7.3.1(@types/node@24.10.3)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) @@ -1684,6 +1687,9 @@ importers: '@tanstack/ai-client': specifier: workspace:* version: link:../../packages/typescript/ai-client + '@tanstack/ai-elevenlabs': + specifier: workspace:* + version: link:../../packages/typescript/ai-elevenlabs '@tanstack/ai-gemini': specifier: workspace:* version: link:../../packages/typescript/ai-gemini @@ -1884,10 +1890,6 @@ importers: packages: - 
'@11labs/client@0.2.0': - resolution: {integrity: sha512-GBplAV4WDbcoThsIzdSDPN3xbcitK0ZZ4iJfJZKfltqvgvS6Uw8GZxHwVgiPwnQoA3uosYyY3L9TuPwmel18xQ==} - deprecated: This package is no longer maintained. Please use @elevenlabs/client for the latest version - '@acemir/cssom@0.9.29': resolution: {integrity: sha512-G90x0VW+9nW4dFajtjCoT+NM0scAfH9Mb08IcjgFHYbfiL/lU04dTF9JuVOi3/OH+DJCQdcIseSXkdCB9Ky6JA==} @@ -2189,10 +2191,18 @@ packages: '@cloudflare/workers-types@4.20260317.1': resolution: {integrity: sha512-+G4eVwyCpm8Au1ex8vQBCuA9wnwqetz4tPNRoB/53qvktERWBRMQnrtvC1k584yRE3emMThtuY0gWshvSJ++PQ==} - '@copilotkit/aimock@1.14.0': - resolution: {integrity: sha512-1NqwWEameArC7HWT7UHBlkq3pNlCA0eHBocaeL6mS5CULolT9XFL27tC9jJ+OSmREzLwkKbFYaAl2SssaXexVA==} - engines: {node: '>=20.15.0'} + '@copilotkit/aimock@1.15.1': + resolution: {integrity: sha512-DG9p6fKdYmuTW0zaUe9iDbgB/CM3SWhpdhVBrszQ6+L2UW4+DZB0gvICFQXRWhVXMpqxEkI9Pqhm/MtMb8li9A==} + engines: {node: '>=24.0.0'} hasBin: true + peerDependencies: + jest: '>=29' + vitest: '>=3' + peerDependenciesMeta: + jest: + optional: true + vitest: + optional: true '@crazydos/vue-markdown@1.1.4': resolution: {integrity: sha512-0I1QMP59LJ3aEjE7bolgvPU4JAFt+pykdDo5674CbsCwFo7OVFos50+MPhGdWflCz1mac5t152lB1qvV/tR/rw==} @@ -2247,6 +2257,16 @@ packages: '@deno/shim-deno@0.19.2': resolution: {integrity: sha512-q3VTHl44ad8T2Tw2SpeAvghdGOjlnLPDNO2cpOxwMrBE/PVas6geWpbpIgrM+czOCH0yejp0yi8OaTuB+NU40Q==} + '@elevenlabs/client@1.3.1': + resolution: {integrity: sha512-bQUxA/X7TZRSSZ6UM6a6A+1qQy5Wh7vMn+zbZP6Yl1WrupxHL4M0XMnl/n9+fsol1Ib4tN/2Nhx1E5JDS7QdKw==} + + '@elevenlabs/elevenlabs-js@2.44.0': + resolution: {integrity: sha512-qmggk9IvAQTkepA+A1d2VBIIiYXszrUaCDx0dJk2KeCMABFHOo7iHzLCG0bjzXO7YwF4ksc44ubkwMKYsqn26A==} + engines: {node: '>=18.0.0'} + + '@elevenlabs/types@0.9.1': + resolution: {integrity: sha512-lkWAMaFJLsGNWcblryBRHbtozMh7wHy0YsqURLwQwjhpvycuFN5qUa94CZtVU01jFwrfr5h1ca24+nYEkKf0Ew==} + '@emnapi/core@1.10.0': resolution: {integrity: 
sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==} @@ -7234,6 +7254,9 @@ packages: comma-separated-tokens@2.0.3: resolution: {integrity: sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==} + command-exists@1.2.9: + resolution: {integrity: sha512-LTQ/SGc+s0Xc0Fu5WaKnR0YiygZkm9eKFvyS+fRsU7/ZWFF8ykFM6Pc9aCVf1+xasOOZpO3BAVgVrKvsqKHV7w==} + commander@10.0.1: resolution: {integrity: sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug==} engines: {node: '>=14'} @@ -12001,12 +12024,6 @@ packages: snapshots: - '@11labs/client@0.2.0(@types/dom-mediacapture-record@1.0.22)': - dependencies: - livekit-client: 2.17.2(@types/dom-mediacapture-record@1.0.22) - transitivePeerDependencies: - - '@types/dom-mediacapture-record' - '@acemir/cssom@0.9.29': {} '@ag-ui/core@0.0.49': @@ -12437,7 +12454,9 @@ snapshots: '@cloudflare/workers-types@4.20260317.1': {} - '@copilotkit/aimock@1.14.0': {} + '@copilotkit/aimock@1.15.1(vitest@4.1.4(@types/node@24.10.3)(happy-dom@20.0.11)(jsdom@27.3.0(postcss@8.5.9))(vite@7.3.1(@types/node@24.10.3)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)))': + optionalDependencies: + vitest: 4.1.4(@types/node@24.10.3)(happy-dom@20.0.11)(jsdom@27.3.0(postcss@8.5.9))(vite@7.3.1(@types/node@24.10.3)(jiti@2.6.1)(lightningcss@1.30.2)(terser@5.44.1)(tsx@4.21.0)(yaml@2.8.2)) '@crazydos/vue-markdown@1.1.4(vue@3.5.25(typescript@5.9.3))': dependencies: @@ -12488,6 +12507,25 @@ snapshots: '@deno/shim-deno-test': 0.5.0 which: 4.0.0 + '@elevenlabs/client@1.3.1(@types/dom-mediacapture-record@1.0.22)': + dependencies: + '@elevenlabs/types': 0.9.1 + livekit-client: 2.17.2(@types/dom-mediacapture-record@1.0.22) + transitivePeerDependencies: + - '@types/dom-mediacapture-record' + + '@elevenlabs/elevenlabs-js@2.44.0': + dependencies: + command-exists: 1.2.9 + node-fetch: 2.7.0 + ws: 8.19.0 + transitivePeerDependencies: + - 
bufferutil + - encoding + - utf-8-validate + + '@elevenlabs/types@0.9.1': {} + '@emnapi/core@1.10.0': dependencies: '@emnapi/wasi-threads': 1.2.1 @@ -15296,14 +15334,14 @@ snapshots: '@tanstack/devtools-event-bus@0.3.3': dependencies: - ws: 8.18.3 + ws: 8.19.0 transitivePeerDependencies: - bufferutil - utf-8-validate '@tanstack/devtools-event-bus@0.4.1': dependencies: - ws: 8.18.3 + ws: 8.19.0 transitivePeerDependencies: - bufferutil - utf-8-validate @@ -18004,6 +18042,8 @@ snapshots: comma-separated-tokens@2.0.3: {} + command-exists@1.2.9: {} + commander@10.0.1: {} commander@13.1.0: {} diff --git a/testing/e2e/package.json b/testing/e2e/package.json index 0dc700b0d..89b668d15 100644 --- a/testing/e2e/package.json +++ b/testing/e2e/package.json @@ -16,6 +16,7 @@ "@tanstack/ai": "workspace:*", "@tanstack/ai-anthropic": "workspace:*", "@tanstack/ai-client": "workspace:*", + "@tanstack/ai-elevenlabs": "workspace:*", "@tanstack/ai-gemini": "workspace:*", "@tanstack/ai-grok": "workspace:*", "@tanstack/ai-groq": "workspace:*", diff --git a/testing/e2e/src/lib/feature-support.ts b/testing/e2e/src/lib/feature-support.ts index db0696e4e..a6abbe034 100644 --- a/testing/e2e/src/lib/feature-support.ts +++ b/testing/e2e/src/lib/feature-support.ts @@ -121,6 +121,9 @@ export const matrix: Record> = { ]), // Gemini excluded: aimock doesn't mock Gemini's Imagen predict endpoint format 'image-gen': new Set(['openai', 'grok']), + // ElevenLabs factories exist in media-providers.ts but aimock doesn't + // mock api.elevenlabs.io yet, so we keep it out of the live matrix for + // now. Re-add once aimock ships ElevenLabs stubs. 
tts: new Set(['openai', 'grok']), transcription: new Set(['openai', 'grok']), 'video-gen': new Set(['openai']), diff --git a/testing/e2e/src/lib/media-providers.ts b/testing/e2e/src/lib/media-providers.ts index 6887660b2..aa166fb41 100644 --- a/testing/e2e/src/lib/media-providers.ts +++ b/testing/e2e/src/lib/media-providers.ts @@ -10,6 +10,10 @@ import { createGrokSpeech, createGrokTranscription, } from '@tanstack/ai-grok' +import { + createElevenLabsSpeech, + createElevenLabsTranscription, +} from '@tanstack/ai-elevenlabs' import type { Provider } from '@/lib/types' const LLMOCK_DEFAULT_BASE = process.env.LLMOCK_URL || 'http://127.0.0.1:4010' @@ -72,6 +76,10 @@ export function createTTSAdapter( baseURL: openaiUrl(aimockPort), defaultHeaders: headers, }), + elevenlabs: () => + createElevenLabsSpeech('eleven_multilingual_v2', DUMMY_KEY, { + baseUrl: llmockBase(aimockPort), + }), } const factory = factories[provider] if (!factory) throw new Error(`No TTS adapter for provider: ${provider}`) @@ -95,6 +103,10 @@ export function createTranscriptionAdapter( baseURL: openaiUrl(aimockPort), defaultHeaders: headers, }), + elevenlabs: () => + createElevenLabsTranscription('scribe_v1', DUMMY_KEY, { + baseUrl: llmockBase(aimockPort), + }), } const factory = factories[provider] if (!factory) diff --git a/testing/e2e/src/lib/providers.ts b/testing/e2e/src/lib/providers.ts index 35b720b61..d2051ef25 100644 --- a/testing/e2e/src/lib/providers.ts +++ b/testing/e2e/src/lib/providers.ts @@ -20,6 +20,10 @@ const defaultModels: Record = { groq: 'llama-3.3-70b-versatile', grok: 'grok-3', openrouter: 'openai/gpt-4o', + // ElevenLabs has no chat/text model — the support matrix already filters + // it out of text features, but we still need an entry to satisfy the + // Record constraint. 
+ elevenlabs: '', } export function createTextAdapter( @@ -92,6 +96,11 @@ export function createTextAdapter( : openaiUrl, }), }), + elevenlabs: () => { + throw new Error( + 'ElevenLabs has no text/chat adapter — use createTTSAdapter or createTranscriptionAdapter.', + ) + }, } return factories[provider]() diff --git a/testing/e2e/src/lib/types.ts b/testing/e2e/src/lib/types.ts index 00c848157..cf578217e 100644 --- a/testing/e2e/src/lib/types.ts +++ b/testing/e2e/src/lib/types.ts @@ -8,6 +8,7 @@ export type Provider = | 'grok' | 'groq' | 'openrouter' + | 'elevenlabs' export type Feature = | 'chat' @@ -37,6 +38,7 @@ export const ALL_PROVIDERS: Provider[] = [ 'grok', 'groq', 'openrouter', + 'elevenlabs', ] export const ALL_FEATURES: Feature[] = [ diff --git a/testing/e2e/tests/test-matrix.ts b/testing/e2e/tests/test-matrix.ts index 425b49518..fea85dc59 100644 --- a/testing/e2e/tests/test-matrix.ts +++ b/testing/e2e/tests/test-matrix.ts @@ -21,6 +21,7 @@ export const providers: Provider[] = [ 'groq', 'grok', 'openrouter', + 'elevenlabs', ] export { isSupported } diff --git a/testing/e2e/vite.config.ts b/testing/e2e/vite.config.ts index 734b135c8..e522fe93b 100644 --- a/testing/e2e/vite.config.ts +++ b/testing/e2e/vite.config.ts @@ -6,8 +6,22 @@ import tailwindcss from '@tailwindcss/vite' import { nitroV2Plugin } from '@tanstack/nitro-v2-vite-plugin' const config = defineConfig({ + // Server-side only fix. @elevenlabs/elevenlabs-js ships a top-level + // `function getHeader(…)` that collides with h3's auto-imported + // `getHeader` when vite inlines it into the SSR bundle. The SDK is + // only imported by server routes (api.tts*.ts, api.transcription*.ts), + // so tree-shaking already keeps it out of the client bundle — this + // option only affects the SSR build, where we want the SDK resolved at + // runtime via require() instead of inlined into the rollup chunk. 
+ ssr: { + external: ['@elevenlabs/elevenlabs-js'], + }, plugins: [ - nitroV2Plugin(), + nitroV2Plugin({ + externals: { + external: ['@elevenlabs/elevenlabs-js'], + }, + }), viteTsConfigPaths({ projects: ['./tsconfig.json'], }),