Skip to content

Commit efb582e

Browse files
feat(voice): voice input migration to eleven labs (#4041)
* feat(speech): unified voice interface * add metering for voice input usage * ip key * use shared getclientip helper, fix deployed chat * cleanup code * prep merge * merge staging in * add billing check * add voice input section * remove skip billing * address comments
1 parent 3c7bfa7 commit efb582e

File tree

27 files changed

+15649
-453
lines changed

27 files changed

+15649
-453
lines changed

apps/docs/content/docs/en/execution/costs.mdx

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,21 @@ Use your own API keys for AI model providers instead of Sim's hosted keys to pay
135135

136136
When configured, workflows use your key instead of Sim's hosted keys. If removed, workflows automatically fall back to hosted keys with the multiplier.
137137

138+
## Voice Input
139+
140+
Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.
141+
142+
| Context | Cost per session | Max duration |
143+
|---------|-----------------|--------------|
144+
| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
145+
| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |
146+
147+
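Each voice session is billed a flat fee when it starts, based on the maximum session duration for its context rather than on the time actually spoken. In deployed chat voice mode, each conversation turn (speak → agent responds → speak again) is a separate session, so multi-turn conversations are billed per turn.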
148+
149+
<Callout type="info">
150+
Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.
151+
</Callout>
152+
138153
## Plans
139154

140155
Sim has two paid plan tiers — **Pro** and **Max**. Either can be used individually or with a team. Team plans pool credits across all seats in the organization.

apps/sim/app/api/a2a/serve/[agentId]/route.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import {
1515
import { type AuthResult, AuthType, checkHybridAuth } from '@/lib/auth/hybrid'
1616
import { acquireLock, getRedisClient, releaseLock } from '@/lib/core/config/redis'
1717
import { validateUrlWithDNS } from '@/lib/core/security/input-validation.server'
18+
import { getClientIp } from '@/lib/core/utils/request'
1819
import { SSE_HEADERS } from '@/lib/core/utils/sse'
1920
import { getBaseUrl } from '@/lib/core/utils/urls'
2021
import { generateId } from '@/lib/core/utils/uuid'
@@ -52,10 +53,9 @@ function getCallerFingerprint(request: NextRequest, userId?: string | null): str
5253
return `user:${userId}`
5354
}
5455

55-
const forwardedFor = request.headers.get('x-forwarded-for')?.split(',')[0]?.trim()
56-
const realIp = request.headers.get('x-real-ip')?.trim()
56+
const clientIp = getClientIp(request)
5757
const userAgent = request.headers.get('user-agent')?.trim() || 'unknown'
58-
return `public:${forwardedFor || realIp || 'unknown'}:${userAgent}`
58+
return `public:${clientIp}:${userAgent}`
5959
}
6060

6161
function hasCallerAccessToTask(

apps/sim/app/api/demo-requests/route.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { type NextRequest, NextResponse } from 'next/server'
33
import { env } from '@/lib/core/config/env'
44
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
55
import { RateLimiter } from '@/lib/core/rate-limiter'
6-
import { generateRequestId } from '@/lib/core/utils/request'
6+
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
77
import { getEmailDomain } from '@/lib/core/utils/urls'
88
import { sendEmail } from '@/lib/messaging/email/mailer'
99
import { getFromEmailAddress } from '@/lib/messaging/email/utils'
@@ -25,7 +25,7 @@ export async function POST(req: NextRequest) {
2525
const requestId = generateRequestId()
2626

2727
try {
28-
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
28+
const ip = getClientIp(req)
2929
const storageKey = `public:demo-request:${ip}`
3030

3131
const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(

apps/sim/app/api/help/integration-request/route.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ import { z } from 'zod'
44
import { env } from '@/lib/core/config/env'
55
import type { TokenBucketConfig } from '@/lib/core/rate-limiter'
66
import { RateLimiter } from '@/lib/core/rate-limiter'
7-
import { generateRequestId } from '@/lib/core/utils/request'
7+
import { generateRequestId, getClientIp } from '@/lib/core/utils/request'
88
import { getEmailDomain } from '@/lib/core/utils/urls'
99
import { sendEmail } from '@/lib/messaging/email/mailer'
1010
import {
@@ -37,7 +37,7 @@ export async function POST(req: NextRequest) {
3737
const requestId = generateRequestId()
3838

3939
try {
40-
const ip = req.headers.get('x-forwarded-for')?.split(',')[0]?.trim() ?? 'unknown'
40+
const ip = getClientIp(req)
4141
const storageKey = `public:integration-request:${ip}`
4242

4343
const { allowed, remaining, resetAt } = await rateLimiter.checkRateLimitDirect(
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
import { NextResponse } from 'next/server'
2+
import { hasSTTService } from '@/lib/speech/config'
3+
4+
/**
 * Reports whether server-side speech-to-text (STT) is configured.
 *
 * Deliberately unauthenticated: the payload is one boolean flag, which is not
 * sensitive, and visitors of deployed chats need to be able to read it.
 *
 * @returns A JSON response of the form `{ sttAvailable: boolean }`.
 */
export async function GET() {
  const sttAvailable = hasSTTService()
  return NextResponse.json({ sttAvailable })
}
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import { db } from '@sim/db'
2+
import { chat } from '@sim/db/schema'
3+
import { createLogger } from '@sim/logger'
4+
import { eq } from 'drizzle-orm'
5+
import { type NextRequest, NextResponse } from 'next/server'
6+
import { getSession } from '@/lib/auth'
7+
import { hasExceededCostLimit } from '@/lib/billing/core/subscription'
8+
import { recordUsage } from '@/lib/billing/core/usage-log'
9+
import { env } from '@/lib/core/config/env'
10+
import { getCostMultiplier, isBillingEnabled } from '@/lib/core/config/feature-flags'
11+
import { RateLimiter } from '@/lib/core/rate-limiter'
12+
import { validateAuthToken } from '@/lib/core/security/deployment'
13+
import { getClientIp } from '@/lib/core/utils/request'
14+
15+
// Logger for this route, tagged 'SpeechTokenAPI'.
const logger = createLogger('SpeechTokenAPI')
16+
17+
// Next.js route segment config: always evaluate this handler per request.
export const dynamic = 'force-dynamic'
18+
19+
// ElevenLabs endpoint the POST handler calls to mint a single-use realtime Scribe token.
const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe'
20+
21+
// Per-minute rate used to price a voice session (presumably USD — confirm against billing).
const VOICE_SESSION_COST_PER_MIN = 0.008
22+
// Minutes billed for a workspace (no chatId) session.
const WORKSPACE_SESSION_MAX_MINUTES = 3
23+
// Minutes billed for a deployed-chat (chatId present) session.
const CHAT_SESSION_MAX_MINUTES = 1
24+
25+
// Limits passed to rateLimiter.checkRateLimitDirect for token requests.
// NOTE(review): looks like a token bucket (capacity 30, 3 tokens refilled every 72s) — confirm against RateLimiter.
const STT_TOKEN_RATE_LIMIT = {
26+
maxTokens: 30,
27+
refillRate: 3,
28+
refillIntervalMs: 72 * 1000,
29+
} as const
30+
31+
// Shared limiter instance used by the POST handler.
const rateLimiter = new RateLimiter()
32+
33+
/**
 * Decides whether the caller may request a speech token for a deployed chat.
 *
 * Access is granted when the chat exists, is active, and either has
 * `authType === 'public'` or the request carries a `chat_auth_<chatId>` cookie
 * that `validateAuthToken` accepts for this chat.
 *
 * @param request - Incoming request; only its cookies are read.
 * @param chatId - Identifier of the deployed chat to look up.
 * @returns `{ valid: true, ownerId }` (the chat's `userId`) on success,
 *   otherwise `{ valid: false }`. Lookup or validation errors are logged and
 *   also reported as `{ valid: false }`.
 */
async function validateChatAuth(
  request: NextRequest,
  chatId: string
): Promise<{ valid: boolean; ownerId?: string }> {
  try {
    const [chatData] = await db
      .select({
        id: chat.id,
        userId: chat.userId,
        isActive: chat.isActive,
        authType: chat.authType,
        password: chat.password,
      })
      .from(chat)
      .where(eq(chat.id, chatId))
      .limit(1)

    // Unknown or deactivated chats never get a token.
    if (!chatData || !chatData.isActive) {
      return { valid: false }
    }

    // Non-public chats must present a valid per-chat auth cookie.
    if (chatData.authType !== 'public') {
      const authCookie = request.cookies.get(`chat_auth_${chatId}`)
      const hasValidCookie = authCookie
        ? validateAuthToken(authCookie.value, chatId, chatData.password)
        : false
      if (!hasValidCookie) {
        return { valid: false }
      }
    }

    return { valid: true, ownerId: chatData.userId }
  } catch (error) {
    logger.error('Error validating chat auth for STT:', error)
    return { valid: false }
  }
}
72+
73+
/**
 * Issues a single-use ElevenLabs realtime speech-to-text token.
 *
 * Two caller modes, selected by the optional `chatId` field of the JSON body:
 * - Deployed chat (`chatId` present): access is checked by `validateChatAuth`,
 *   usage is billed to the chat owner, and the rate limit is keyed by chat and
 *   client IP.
 * - Workspace (no `chatId`): requires a signed-in session; usage is billed to
 *   and rate-limited per that user.
 *
 * A flat usage entry covering the context's maximum session length is recorded
 * once a token has been obtained. A failure to record usage is logged and does
 * not block the response.
 *
 * Responses:
 * - 200 `{ token }` on success
 * - 400 when `chatId` is present but not a string
 * - 401 when the caller is not authorised
 * - 429 when the rate limit is exceeded (with `Retry-After`)
 * - 402 when the billed user has exceeded their cost limit
 * - 503 when `ELEVENLABS_API_KEY` is not configured
 * - 502 when ElevenLabs rejects the request or returns no usable token
 * - 500 on any other error
 */
export async function POST(request: NextRequest) {
  try {
    // A missing or malformed JSON body is treated as an empty object.
    const body = await request.json().catch(() => ({}))

    // The body is untrusted input: verify chatId at runtime rather than asserting its type.
    const rawChatId: unknown = body?.chatId
    if (rawChatId != null && typeof rawChatId !== 'string') {
      return NextResponse.json({ error: 'Invalid chatId' }, { status: 400 })
    }
    // An empty string is treated like an absent chatId (workspace mode).
    const chatId = typeof rawChatId === 'string' && rawChatId.length > 0 ? rawChatId : undefined

    let billingUserId: string | undefined

    if (chatId) {
      const chatAuth = await validateChatAuth(request, chatId)
      if (!chatAuth.valid) {
        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
      }
      billingUserId = chatAuth.ownerId
    } else {
      const session = await getSession()
      if (!session?.user?.id) {
        return NextResponse.json({ error: 'Unauthorized' }, { status: 401 })
      }
      billingUserId = session.user.id
    }

    // Rate limiting is only applied when billing is enabled.
    if (isBillingEnabled) {
      const rateLimitKey = chatId
        ? `stt-token:chat:${chatId}:${getClientIp(request)}`
        : `stt-token:user:${billingUserId}`

      const rateCheck = await rateLimiter.checkRateLimitDirect(rateLimitKey, STT_TOKEN_RATE_LIMIT)
      if (!rateCheck.allowed) {
        return NextResponse.json(
          { error: 'Voice input rate limit exceeded. Please try again later.' },
          {
            status: 429,
            headers: {
              'Retry-After': String(Math.ceil((rateCheck.retryAfterMs ?? 60000) / 1000)),
            },
          }
        )
      }
    }

    if (billingUserId && isBillingEnabled) {
      const exceeded = await hasExceededCostLimit(billingUserId)
      if (exceeded) {
        return NextResponse.json(
          { error: 'Usage limit exceeded. Please upgrade your plan to continue.' },
          { status: 402 }
        )
      }
    }

    const apiKey = env.ELEVENLABS_API_KEY
    if (!apiKey?.trim()) {
      return NextResponse.json(
        { error: 'Speech-to-text service is not configured' },
        { status: 503 }
      )
    }

    const response = await fetch(ELEVENLABS_TOKEN_URL, {
      method: 'POST',
      headers: { 'xi-api-key': apiKey },
    })

    if (!response.ok) {
      const errBody: unknown = await response.json().catch(() => ({}))
      // Always hand the client a string, even when the upstream `detail` is structured.
      const message =
        extractUpstreamErrorMessage(errBody) ?? `Token request failed (${response.status})`
      logger.error('ElevenLabs token request failed', { status: response.status, message })
      return NextResponse.json({ error: message }, { status: 502 })
    }

    const data = await response.json().catch(() => null)
    const token: unknown = data?.token

    // Never bill for, or return, a session that has no usable token.
    if (typeof token !== 'string' || token.length === 0) {
      logger.error('ElevenLabs token response did not include a token', {
        status: response.status,
      })
      return NextResponse.json(
        { error: 'Speech-to-text service returned an invalid token response' },
        { status: 502 }
      )
    }

    if (billingUserId) {
      // Flat per-session charge: per-minute rate times the context's maximum duration.
      const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
      const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes

      await recordUsage({
        userId: billingUserId,
        entries: [
          {
            category: 'fixed',
            source: 'voice-input',
            description: `Voice input session (${maxMinutes} min)`,
            cost: sessionCost * getCostMultiplier(),
          },
        ],
      }).catch((err) => {
        // Best effort: a metering failure must not block voice input.
        logger.warn('Failed to record voice input usage, continuing:', err)
      })
    }

    return NextResponse.json({ token })
  } catch (error) {
    const message = error instanceof Error ? error.message : 'Failed to generate speech token'
    logger.error('Speech token error:', error)
    return NextResponse.json({ error: message }, { status: 500 })
  }
}

/**
 * Extracts a human-readable message from an upstream error payload.
 *
 * Handles `detail` as a non-empty string, `detail` as an object carrying a
 * string `message`, and a top-level string `message`. Any other shape yields
 * `undefined` so the caller can fall back to a generic message.
 */
function extractUpstreamErrorMessage(errBody: unknown): string | undefined {
  if (typeof errBody !== 'object' || errBody === null) return undefined

  const { detail, message } = errBody as { detail?: unknown; message?: unknown }

  if (typeof detail === 'string' && detail.length > 0) return detail

  if (typeof detail === 'object' && detail !== null) {
    const nested = (detail as { message?: unknown }).message
    if (typeof nested === 'string' && nested.length > 0) return nested
  }

  if (typeof message === 'string' && message.length > 0) return message

  return undefined
}

apps/sim/app/chat/[identifier]/chat.tsx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,14 @@ export default function ChatClient({ identifier }: { identifier: string }) {
127127
const [authRequired, setAuthRequired] = useState<'password' | 'email' | 'sso' | null>(null)
128128

129129
const [isVoiceFirstMode, setIsVoiceFirstMode] = useState(false)
130+
const [sttAvailable, setSttAvailable] = useState(false)
131+
132+
useEffect(() => {
133+
fetch('/api/settings/voice')
134+
.then((r) => (r.ok ? r.json() : { sttAvailable: false }))
135+
.then((data) => setSttAvailable(data.sttAvailable === true))
136+
.catch(() => setSttAvailable(false))
137+
}, [])
130138
const { isStreamingResponse, abortControllerRef, stopStreaming, handleStreamedResponse } =
131139
useChatStreaming()
132140
const audioContextRef = useRef<AudioContext | null>(null)
@@ -443,8 +451,9 @@ export default function ChatClient({ identifier }: { identifier: string }) {
443451
}, [isStreamingResponse, stopStreaming, setMessages, stopAudio])
444452

445453
const handleVoiceStart = useCallback(() => {
454+
if (!sttAvailable) return
446455
setIsVoiceFirstMode(true)
447-
}, [])
456+
}, [sttAvailable])
448457

449458
const handleExitVoiceMode = useCallback(() => {
450459
setIsVoiceFirstMode(false)
@@ -494,6 +503,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
494503
isStreaming={isStreamingResponse}
495504
isPlayingAudio={isPlayingAudio}
496505
audioContextRef={audioContextRef}
506+
chatId={chatConfig?.id}
497507
messages={messages.map((msg) => ({
498508
content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content),
499509
type: msg.type,
@@ -529,6 +539,7 @@ export default function ChatClient({ identifier }: { identifier: string }) {
529539
isStreaming={isStreamingResponse}
530540
onStopStreaming={() => stopStreaming(setMessages)}
531541
onVoiceStart={handleVoiceStart}
542+
sttAvailable={sttAvailable}
532543
/>
533544
</div>
534545
</div>

apps/sim/app/chat/components/input/input.tsx

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,6 @@ const logger = createLogger('ChatInput')
1414

1515
const MAX_TEXTAREA_HEIGHT = 200
1616

17-
const IS_STT_AVAILABLE =
18-
typeof window !== 'undefined' &&
19-
!!(
20-
(window as Window & { SpeechRecognition?: unknown; webkitSpeechRecognition?: unknown })
21-
.SpeechRecognition ||
22-
(window as Window & { webkitSpeechRecognition?: unknown }).webkitSpeechRecognition
23-
)
24-
2517
interface AttachedFile {
2618
id: string
2719
name: string
@@ -37,7 +29,15 @@ export const ChatInput: React.FC<{
3729
onStopStreaming?: () => void
3830
onVoiceStart?: () => void
3931
voiceOnly?: boolean
40-
}> = ({ onSubmit, isStreaming = false, onStopStreaming, onVoiceStart, voiceOnly = false }) => {
32+
sttAvailable?: boolean
33+
}> = ({
34+
onSubmit,
35+
isStreaming = false,
36+
onStopStreaming,
37+
onVoiceStart,
38+
voiceOnly = false,
39+
sttAvailable = false,
40+
}) => {
4141
const fileInputRef = useRef<HTMLInputElement>(null)
4242
const textareaRef = useRef<HTMLTextAreaElement>(null)
4343
const [inputValue, setInputValue] = useState('')
@@ -142,7 +142,7 @@ export const ChatInput: React.FC<{
142142
return (
143143
<Tooltip.Provider>
144144
<div className='flex items-center justify-center'>
145-
{IS_STT_AVAILABLE && (
145+
{sttAvailable && (
146146
<Tooltip.Root>
147147
<Tooltip.Trigger asChild>
148148
<div>
@@ -295,7 +295,7 @@ export const ChatInput: React.FC<{
295295

296296
{/* Right: mic + send */}
297297
<div className='flex items-center gap-1.5'>
298-
{IS_STT_AVAILABLE && (
298+
{sttAvailable && (
299299
<Tooltip.Root>
300300
<Tooltip.Trigger asChild>
301301
<button

0 commit comments

Comments
 (0)