Skip to content

Commit 7f1a16c

Browse files
committed
address comments
1 parent c079326 commit 7f1a16c

File tree

6 files changed

+38
-18
lines changed

6 files changed

+38
-18
lines changed

apps/docs/content/docs/en/execution/costs.mdx

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,12 @@ When configured, workflows use your key instead of Sim's hosted keys. If removed
139139

140140
Voice input uses ElevenLabs Scribe v2 Realtime for speech-to-text transcription. It is available in the Mothership chat and in deployed chat voice mode.
141141

142-
| Item | Cost | Notes |
143-
|------|------|-------|
144-
| Voice session | ~5 credits ($0.024) | Flat charge per session (up to 3 minutes) |
145-
| Session limit | 3 minutes | After 3 minutes, the session ends automatically. Start a new session to continue. |
142+
| Context | Cost per session | Max duration |
143+
|---------|-----------------|--------------|
144+
| Mothership (workspace) | ~5 credits ($0.024) | 3 minutes |
145+
| Deployed chat (voice mode) | ~2 credits ($0.008) | 1 minute |
146146

147-
Voice sessions are billed when the session starts (when you click the mic button or enter voice mode). Multi-turn voice conversations in deployed chat reuse the same session billing — reconnections within a single voice call are not charged again.
147+
Each voice session is billed when it starts. In deployed chat voice mode, each conversation turn (you speak, then the agent responds) is a separate session, so a multi-turn conversation is billed once per turn.
148148

149149
<Callout type="info">
150150
Voice input requires `ELEVENLABS_API_KEY` to be configured. When the key is not set, voice input controls are hidden.

apps/sim/app/api/speech/token/route.ts

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ export const dynamic = 'force-dynamic'
1919
const ELEVENLABS_TOKEN_URL = 'https://api.elevenlabs.io/v1/single-use-token/realtime_scribe'
2020

2121
const VOICE_SESSION_COST_PER_MIN = 0.008
22-
const VOICE_SESSION_MAX_MINUTES = 3
23-
const VOICE_SESSION_COST = VOICE_SESSION_COST_PER_MIN * VOICE_SESSION_MAX_MINUTES
22+
const WORKSPACE_SESSION_MAX_MINUTES = 3
23+
const CHAT_SESSION_MAX_MINUTES = 1
2424

2525
const STT_TOKEN_RATE_LIMIT = {
2626
maxTokens: 30,
@@ -144,14 +144,17 @@ export async function POST(request: NextRequest) {
144144
const data = await response.json()
145145

146146
if (billingUserId) {
147+
const maxMinutes = chatId ? CHAT_SESSION_MAX_MINUTES : WORKSPACE_SESSION_MAX_MINUTES
148+
const sessionCost = VOICE_SESSION_COST_PER_MIN * maxMinutes
149+
147150
await recordUsage({
148151
userId: billingUserId,
149152
entries: [
150153
{
151154
category: 'fixed',
152155
source: 'voice-input',
153-
description: `Voice input session (${VOICE_SESSION_MAX_MINUTES} min)`,
154-
cost: VOICE_SESSION_COST * getCostMultiplier(),
156+
description: `Voice input session (${maxMinutes} min)`,
157+
cost: sessionCost * getCostMultiplier(),
155158
},
156159
],
157160
}).catch((err) => {

apps/sim/app/chat/components/voice-interface/voice-interface.tsx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { arrayBufferToBase64, floatTo16BitPCM } from '@/lib/speech/audio'
1010
import {
1111
CHUNK_SEND_INTERVAL_MS,
1212
ELEVENLABS_WS_URL,
13-
MAX_SESSION_MS,
13+
MAX_CHAT_SESSION_MS,
1414
SAMPLE_RATE,
1515
} from '@/lib/speech/config'
1616

@@ -321,7 +321,7 @@ export function VoiceInterface({
321321
stopSendingAudio()
322322
closeWebSocket()
323323
updateState('idle')
324-
}, MAX_SESSION_MS)
324+
}, MAX_CHAT_SESSION_MS)
325325
}, [connectWebSocket, updateState, startSendingAudio, stopSendingAudio, closeWebSocket])
326326

327327
const stopListening = useCallback(() => {

apps/sim/app/workspace/[workspaceId]/home/components/user-input/user-input.tsx

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ export function UserInput({
243243
isListening,
244244
isSupported: isSttSupported,
245245
toggleListening: rawToggle,
246+
resetTranscript,
246247
} = useSpeechToText({ onTranscript: handleTranscript })
247248

248249
const toggleListening = useCallback(() => {
@@ -432,13 +433,15 @@ export function UserInput({
432433
)
433434
setValue('')
434435
valueRef.current = ''
436+
sttPrefixRef.current = ''
437+
resetTranscript()
435438
currentFiles.clearAttachedFiles()
436439
currentContext.clearContexts()
437440

438441
if (textareaRef.current) {
439442
textareaRef.current.style.height = 'auto'
440443
}
441-
}, [onSubmit, textareaRef])
444+
}, [onSubmit, textareaRef, resetTranscript])
442445

443446
const handleKeyDown = useCallback(
444447
(e: React.KeyboardEvent<HTMLTextAreaElement>) => {

apps/sim/hooks/use-speech-to-text.ts

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ interface UseSpeechToTextReturn {
2626
isSupported: boolean
2727
permissionState: PermissionState
2828
toggleListening: () => void
29+
resetTranscript: () => void
2930
}
3031

3132
export function useSpeechToText({
@@ -334,18 +335,29 @@ export function useSpeechToText({
334335
streamRef.current = null
335336
}
336337

337-
setTimeout(() => {
338-
if (wsRef.current) {
339-
wsRef.current.close()
340-
wsRef.current = null
341-
}
342-
}, 2000)
338+
const wsToClose = wsRef.current
339+
wsRef.current = null
340+
if (wsToClose) {
341+
setTimeout(() => {
342+
if (
343+
wsToClose.readyState === WebSocket.OPEN ||
344+
wsToClose.readyState === WebSocket.CONNECTING
345+
) {
346+
wsToClose.close()
347+
}
348+
}, 2000)
349+
}
343350

344351
setIsListening(false)
345352
}, [flushAudioBuffer])
346353

347354
stopStreamingRef.current = stopStreaming
348355

356+
const resetTranscript = useCallback(() => {
357+
committedTextRef.current = ''
358+
isFirstChunkRef.current = true
359+
}, [])
360+
349361
const toggleListening = useCallback(() => {
350362
if (isListening) {
351363
stopStreaming()
@@ -367,5 +379,6 @@ export function useSpeechToText({
367379
isSupported,
368380
permissionState,
369381
toggleListening,
382+
resetTranscript,
370383
}
371384
}

apps/sim/lib/speech/config.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ export const ELEVENLABS_WS_URL = 'wss://api.elevenlabs.io/v1/speech-to-text/real
44
export const SAMPLE_RATE = 16000
55
export const CHUNK_SEND_INTERVAL_MS = 250
66
export const MAX_SESSION_MS = 3 * 60 * 1000
7+
export const MAX_CHAT_SESSION_MS = 1 * 60 * 1000
78

89
/**
910
* Whether a speech-to-text provider is configured.

0 commit comments

Comments
 (0)