# stackchan_server/speak.py, from the 74th/websocket-control-stackchan repository (main branch).

from __future__ import annotations

import asyncio
import io
import wave
from datetime import UTC, datetime
from logging import getLogger
from pathlib import Path
from typing import Awaitable, Callable

from fastapi import WebSocket, WebSocketDisconnect

from .listen import TimeoutError
from .protobuf_ws import (
    encode_audio_wav_data_message,
    encode_audio_wav_end_message,
    encode_audio_wav_start_message,
)
from .types import AudioFormat, SpeechSynthesizer, StreamingSpeechSynthesizer

logger = getLogger(__name__)


class SpeakHandler:
    def __init__(
        self,
        *,
        websocket: WebSocket,
        down_wav_chunk: int,
        down_segment_millis: int,
        down_segment_stagger_millis: int,
        sample_width: int,
        speech_synthesizer: SpeechSynthesizer,
        recordings_dir: Path,
        debug_recording: bool,
    ) -> None:
        self.ws = websocket
        self.down_wav_chunk = down_wav_chunk
        self.down_segment_millis = down_segment_millis
        self.down_segment_stagger_millis = down_segment_stagger_millis
        self.sample_width = sample_width
        self.speech_synthesizer = speech_synthesizer
        self.recordings_dir = recordings_dir
        self.debug_recording = debug_recording

        self._speaking = False
        self._speak_finished_counter = 0

    @property
    def speaking(self) -> bool:
        return self._speaking

    def handle_speak_done_event(self) -> None:
        self._speak_finished_counter += 1
        self._speaking = False
        logger.info("Received speak done event")

    async def speak(
        self,
        text: str,
        *,
        next_seq: Callable[[], int],
        send_state_command: Callable[[int], Awaitable[None]],
        idle_state: int,
        is_closed: Callable[[], bool],
    ) -> None:
        start_counter = self._speak_finished_counter
        await self._start_talking_stream(text, next_seq=next_seq)
        if not self._speaking:
            return
        await self._wait_for_speaking_finished(
            min_counter=start_counter + 1,
            timeout_seconds=120.0,
            is_closed=is_closed,
        )
        if not is_closed():
            await send_state_command(idle_state)

    async def _wait_for_speaking_finished(
        self,
        *,
        min_counter: int,
        timeout_seconds: float | None,
        is_closed: Callable[[], bool],
    ) -> None:
        loop = asyncio.get_running_loop()
        deadline = (loop.time() + timeout_seconds) if timeout_seconds else None
        while True:
            if self._speak_finished_counter >= min_counter:
                return
            if is_closed():
                raise WebSocketDisconnect()
            if deadline and loop.time() >= deadline:
                raise TimeoutError("Timed out waiting for speaking finished event")
            await asyncio.sleep(0.05)

    async def _start_talking_stream(self, text: str, *, next_seq: Callable[[], int]) -> None:
        """Synthesize *text* and send the audio to the client as paced segments.

        Raises the speaking flag first. A ``StreamingSpeechSynthesizer`` is
        delegated to ``_start_talking_streaming``; otherwise the whole WAV is
        synthesized, validated, optionally saved for debugging, and sent via
        ``_send_segments``.

        The speaking flag is lowered again when the audio is empty, the
        sample width does not match ``self.sample_width``, the computed
        segment size is not positive, or any exception is raised. Exceptions
        are logged and reported to the client as a JSON error rather than
        propagated.

        Args:
            text: Text to synthesize.
            next_seq: Returns the sequence number for the next outgoing message.
        """
        self._speaking = True
        try:
            if isinstance(self.speech_synthesizer, StreamingSpeechSynthesizer):
                await self._start_talking_streaming(
                    text,
                    self.speech_synthesizer,
                    next_seq=next_seq,
                )
                return
            wav_bytes = await self.speech_synthesizer.synthesize(text)
            logger.info("Synthesized wav_bytes=%d text_chars=%d", len(wav_bytes), len(text))
            pcm_bytes, tts_sample_rate, tts_channels, tts_sample_width = self._extract_pcm(wav_bytes)
            logger.info(
                "Synthesized audio sample_rate=%d channels=%d sample_width=%d pcm_bytes=%d",
                tts_sample_rate,
                tts_channels,
                tts_sample_width,
                len(pcm_bytes),
            )
            if not pcm_bytes:
                logger.warning("Synthesized audio is empty")
                self._speaking = False
                return

            if tts_sample_width != self.sample_width:
                await self.ws.send_json({"error": f"unsupported sample width {tts_sample_width}"})
                self._speaking = False
                return

            if self.debug_recording:
                _, filename = self._save_wav(wav_bytes)
                logger.info("Saved synthesized WAV: %s", filename)
                await self.ws.send_json({"tts_debug_path": f"recordings/{filename}", "tts_debug_bytes": len(wav_bytes)})

            # Size segments with integer arithmetic and round down to a whole
            # number of frames: every segment is sent as its own WAV
            # start/data/end sequence, so a boundary inside a frame would
            # split a sample between two segments.
            frame_bytes = tts_channels * tts_sample_width
            bytes_per_second = tts_sample_rate * frame_bytes
            segment_bytes = int(bytes_per_second * self.down_segment_millis // 1000)
            if frame_bytes > 0:
                segment_bytes -= segment_bytes % frame_bytes

            if segment_bytes <= 0:
                await self.ws.send_json({"error": "invalid segment size computed"})
                self._speaking = False
                return

            await self._send_segments(
                pcm_bytes,
                tts_sample_rate,
                tts_channels,
                segment_bytes,
                next_seq=next_seq,
            )
        except Exception as exc:  # pragma: no cover
            self._speaking = False
            logger.exception("Speech synthesis failed")
            await self.ws.send_json({"error": f"speech synthesis failed: {exc}"})

    async def _start_talking_streaming(
        self,
        text: str,
        speech_synthesizer: StreamingSpeechSynthesizer,
        *,
        next_seq: Callable[[], int],
    ) -> None:
        output_format = speech_synthesizer.output_format
        logger.info(
            "Streaming synthesized audio sample_rate=%d channels=%d sample_width=%d",
            output_format.sample_rate_hz,
            output_format.channels,
            output_format.sample_width,
        )
        if output_format.sample_width != self.sample_width:
            await self.ws.send_json({"error": f"unsupported sample width {output_format.sample_width}"})
            self._speaking = False
            return

        bytes_per_second = (
            output_format.sample_rate_hz * output_format.channels * output_format.sample_width
        )
        segment_bytes = int(bytes_per_second * (self.down_segment_millis / 1000))
        if segment_bytes <= 0:
            await self.ws.send_json({"error": "invalid segment size computed"})
            self._speaking = False
            return

        pending = bytearray()
        saved_pcm = bytearray()
        segment_count = 0
        base_time: float | None = None
        async for chunk in speech_synthesizer.synthesize_stream(text):
            pending.extend(chunk)
            if self.debug_recording:
                saved_pcm.extend(chunk)
            while len(pending) >= segment_bytes:
                segment = bytes(pending[:segment_bytes])
                del pending[:segment_bytes]
                base_time = await self._wait_for_segment_slot(segment_count, base_time=base_time)
                await self._send_segment(
                    segment,
                    output_format.sample_rate_hz,
                    output_format.channels,
                    next_seq=next_seq,
                )
                segment_count += 1
        if pending:
            base_time = await self._wait_for_segment_slot(segment_count, base_time=base_time)
            await self._send_segment(
                bytes(pending),
                output_format.sample_rate_hz,
                output_format.channels,
                next_seq=next_seq,
            )
            segment_count += 1
        logger.info("Prepared %d playback segments from streaming TTS", segment_count)

        if self.debug_recording and saved_pcm:
            wav_bytes = self._wrap_pcm_as_wav(bytes(saved_pcm), output_format)
            filepath, filename = self._save_wav(wav_bytes)
            logger.info("Saved synthesized WAV: %s", filename)
            await self.ws.send_json({"tts_debug_path": f"recordings/{filename}", "tts_debug_bytes": len(wav_bytes)})

        if segment_count == 0:
            logger.warning("Synthesized audio is empty")
            self._speaking = False

    def _extract_pcm(self, wav_bytes: bytes) -> tuple[bytes, int, int, int]:
        with wave.open(io.BytesIO(wav_bytes), "rb") as wf:
            pcm_bytes = wf.readframes(wf.getnframes())
            tts_sample_rate = wf.getframerate()
            tts_channels = wf.getnchannels()
            tts_sample_width = wf.getsampwidth()
        return pcm_bytes, tts_sample_rate, tts_channels, tts_sample_width

    def _save_wav(self, wav_bytes: bytes) -> tuple[Path, str]:
        timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S_%f")
        filename = f"tts_ws_{timestamp}.wav"
        filepath = self.recordings_dir / filename
        filepath.write_bytes(wav_bytes)
        return filepath, filename

    def _wrap_pcm_as_wav(self, pcm_bytes: bytes, audio_format: AudioFormat) -> bytes:
        with io.BytesIO() as buffer:
            with wave.open(buffer, "wb") as wav_fp:
                wav_fp.setnchannels(audio_format.channels)
                wav_fp.setsampwidth(audio_format.sample_width)
                wav_fp.setframerate(audio_format.sample_rate_hz)
                wav_fp.writeframes(pcm_bytes)
            return buffer.getvalue()

    async def _wait_for_segment_slot(self, segment_index: int, *, base_time: float | None) -> float:
        loop = asyncio.get_running_loop()
        if base_time is None:
            return loop.time()

        if segment_index == 0:
            target_ms = 0
        elif segment_index == 1:
            target_ms = self.down_segment_stagger_millis
        else:
            target_ms = self.down_segment_stagger_millis + (segment_index - 1) * self.down_segment_millis

        target_time = base_time + target_ms / 1000
        now = loop.time()
        if target_time > now:
            await asyncio.sleep(target_time - now)
        return base_time

    async def _send_segments(
        self,
        pcm_bytes: bytes,
        tts_sample_rate: int,
        tts_channels: int,
        segment_bytes: int,
        *,
        next_seq: Callable[[], int],
    ) -> None:
        segments: list[bytes] = []
        offset = 0
        total = len(pcm_bytes)
        while offset < total:
            segments.append(pcm_bytes[offset : offset + segment_bytes])
            offset += segment_bytes
        logger.info("Prepared %d playback segments", len(segments))

        loop = asyncio.get_running_loop()
        base_time = loop.time()

        for idx, segment in enumerate(segments):
            if idx == 0:
                target_ms = 0
            elif idx == 1:
                target_ms = self.down_segment_stagger_millis
            else:
                target_ms = self.down_segment_stagger_millis + (idx - 1) * self.down_segment_millis

            target_time = base_time + target_ms / 1000
            now = loop.time()
            if target_time > now:
                await asyncio.sleep(target_time - now)

            await self._send_segment(segment, tts_sample_rate, tts_channels, next_seq=next_seq)

    async def _send_segment(
        self,
        segment_pcm: bytes,
        tts_sample_rate: int,
        tts_channels: int,
        *,
        next_seq: Callable[[], int],
    ) -> None:
        """Send one segment as a start message, data messages and an end message.

        The PCM payload is split into slices of at most ``down_wav_chunk``
        bytes, one per data message. Every message takes a fresh sequence
        number from *next_seq*.
        """
        logger.info("Sending segment bytes=%d", len(segment_pcm))

        start_message = encode_audio_wav_start_message(
            next_seq(),
            sample_rate=tts_sample_rate,
            channels=tts_channels,
        )
        await self.ws.send_bytes(start_message)

        cursor = 0
        payload_size = len(segment_pcm)
        while cursor < payload_size:
            piece = segment_pcm[cursor : cursor + self.down_wav_chunk]
            data_message = encode_audio_wav_data_message(next_seq(), piece)
            await self.ws.send_bytes(data_message)
            # Advance by what was actually sliced; the last piece may be short.
            cursor += len(piece)

        end_message = encode_audio_wav_end_message(next_seq())
        await self.ws.send_bytes(end_message)

__all__ = ["SpeakHandler"]