Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 17 additions & 5 deletions astrbot/core/astr_agent_run_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
)
from astrbot.core.provider.entities import LLMResponse
from astrbot.core.provider.provider import TTSProvider
from astrbot.core.utils.tts_text_filter import FilteredQueue

AgentRunner = ToolLoopAgentRunner[AstrAgentContext]

Expand Down Expand Up @@ -354,6 +355,8 @@ async def run_live_agent(
show_tool_use: bool = True,
show_tool_call_result: bool = False,
show_reasoning: bool = False,
tts_filter_enable: bool = False,
tts_filter_custom_rules: list[str] | None = None,
buffer_intermediate_messages: bool = False,
) -> AsyncGenerator[MessageChain | None, None]:
"""Live Mode 的 Agent 运行器,支持流式 TTS
Expand All @@ -365,6 +368,8 @@ async def run_live_agent(
show_tool_use: 是否显示工具使用
show_tool_call_result: 是否显示工具返回结果
show_reasoning: 是否显示推理过程
tts_filter_enable: 是否启用 TTS 文本过滤
tts_filter_custom_rules: 自定义 TTS 过滤正则规则

Yields:
MessageChain: 包含文本或音频数据的消息链
Expand Down Expand Up @@ -398,15 +403,22 @@ async def run_live_agent(
first_chunk_received = False

# 创建队列
text_queue: asyncio.Queue[str | None] = asyncio.Queue()
raw_text_queue: asyncio.Queue[str | None] = asyncio.Queue()
# audio_queue stores either raw bytes or (text, bytes) tuples
audio_queue: asyncio.Queue[bytes | tuple[str, bytes] | None] = asyncio.Queue()

# 为 TTS 创建过滤队列(Feeder 写入原始文本,TTS 读取过滤后文本)
tts_text_queue: asyncio.Queue[str | None] | FilteredQueue = (
FilteredQueue(raw_text_queue, tts_filter_custom_rules)
if tts_filter_enable
else raw_text_queue
)

# 1. Start the Agent Feeder task: runs the agent and feeds sentence-split text into raw_text_queue
feeder_task = asyncio.create_task(
_run_agent_feeder(
agent_runner,
text_queue,
raw_text_queue,
max_step,
show_tool_use,
show_tool_call_result,
Expand All @@ -415,14 +427,14 @@ async def run_live_agent(
)
)

# 2. 启动 TTS 任务:负责从 text_queue 读取文本并生成音频到 audio_queue
# 2. 启动 TTS 任务:负责从 tts_text_queue 读取文本并生成音频到 audio_queue
if support_stream:
tts_task = asyncio.create_task(
_safe_tts_stream_wrapper(tts_provider, text_queue, audio_queue)
_safe_tts_stream_wrapper(tts_provider, tts_text_queue, audio_queue)
)
else:
tts_task = asyncio.create_task(
_simulated_stream_tts(tts_provider, text_queue, audio_queue)
_simulated_stream_tts(tts_provider, tts_text_queue, audio_queue)
)

# 3. 主循环:从 audio_queue 读取音频并 yield
Expand Down
115 changes: 115 additions & 0 deletions astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,10 @@
"dual_output": False,
"use_file_service": False,
"trigger_probability": 1.0,
"tts_text_filter": {
"enable": False,
"custom_rules": [],
},
},
"provider_ltm_settings": {
"group_icl_enable": False,
Expand Down Expand Up @@ -1782,6 +1786,40 @@ class ChatProviderTemplate(TypedDict):
"gemini_tts_voice_name": "Leda",
"proxy": "",
},
"Qwen TTS Realtime(API)": {
"id": "qwen_tts_realtime",
"type": "qwen_tts_realtime",
"provider": "qwen",
"provider_type": "text_to_speech",
"hint": "千问实时语音合成,支持流式输入输出、低延迟响应。模型可选 qwen3-tts-flash-realtime、qwen3-tts-instruct-flash-realtime、qwen-tts-realtime 等。API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取",
"enable": False,
"api_key": "",
"model": "qwen3-tts-flash-realtime",
"qwen_tts_voice": "Cherry",
"qwen_tts_instructions": "",
"qwen_tts_optimize_instructions": False,
"qwen_tts_speech_rate": 1.0,
"qwen_tts_volume": 1.0,
"qwen_tts_pitch_rate": 1.0,
"qwen_tts_url": "wss://dashscope.aliyuncs.com/api-ws/v1/realtime",
"timeout": "30",
},
"CosyVoice TTS(API)": {
"id": "cosyvoice_tts",
"type": "cosyvoice_tts",
"provider": "cosyvoice",
"provider_type": "text_to_speech",
"hint": "CosyVoice 语音合成,支持多种系统音色和复刻音色。模型可选 cosyvoice-v3.5-plus、cosyvoice-v3.5-flash、cosyvoice-v3-plus、cosyvoice-v3-flash、cosyvoice-v2 等。API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取",
"enable": False,
"api_key": "",
"model": "cosyvoice-v3-flash",
"cosyvoice_voice": "longanyang",
"cosyvoice_speech_rate": 1.0,
"cosyvoice_volume": 1.0,
"cosyvoice_pitch_rate": 1.0,
"cosyvoice_base_url": "wss://dashscope.aliyuncs.com/api-ws/v1/inference",
"timeout": "20",
},
"OpenAI Embedding": {
"id": "openai_embedding",
"type": "openai_embedding",
Expand Down Expand Up @@ -2249,6 +2287,66 @@ class ChatProviderTemplate(TypedDict):
"hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)",
},
"dashscope_tts_voice": {"description": "音色", "type": "string"},
"qwen_tts_voice": {
"description": "音色",
"type": "string",
"hint": "Qwen TTS Realtime 音色名称,可选: Cherry(芊悦)、Serena(苏瑶)、Ethan(晨煦) 等。详见 https://help.aliyun.com/zh/model-studio/qwen-tts-realtime-api-reference",
},
"qwen_tts_instructions": {
"description": "指令控制",
"type": "string",
"hint": "通过自然语言描述控制语音表达效果,如'语速较快,带有明显的上扬语调'。仅 qwen3-tts-instruct-flash-realtime 模型支持。长度不超过 1600 Token。",
},
"qwen_tts_optimize_instructions": {
"description": "优化指令",
"type": "bool",
"hint": "启用后模型会自动优化指令描述以获得更好的效果。仅 qwen3-tts-instruct-flash-realtime 模型支持。",
},
"qwen_tts_speech_rate": {
"description": "语速",
"type": "number",
"hint": "语速调节比例,1.0 为正常语速,大于 1.0 加快,小于 1.0 减慢。",
},
"qwen_tts_volume": {
"description": "音量",
"type": "number",
"hint": "音量调节比例,1.0 为正常音量。",
},
"qwen_tts_pitch_rate": {
"description": "音调",
"type": "number",
"hint": "音调调节比例,1.0 为正常音调。",
},
"qwen_tts_url": {
"description": "WebSocket 地址",
"type": "string",
"hint": "Qwen TTS Realtime WebSocket 地址。北京: wss://dashscope.aliyuncs.com/api-ws/v1/realtime;新加坡: wss://dashscope-intl.aliyuncs.com/api-ws/v1/realtime",
},
"cosyvoice_voice": {
"description": "音色",
"type": "string",
"hint": "CosyVoice 音色名称,可选: longanyang、longxiaochun_v2 等。不同模型版本需使用对应版本的音色。详见 https://help.aliyun.com/zh/model-studio/cosyvoice-voice-list",
},
"cosyvoice_speech_rate": {
"description": "语速",
"type": "number",
"hint": "语速调节比例,1.0 为正常语速。仅部分模型支持。",
},
"cosyvoice_volume": {
"description": "音量",
"type": "number",
"hint": "音量调节比例,1.0 为正常音量。仅部分模型支持。",
},
"cosyvoice_pitch_rate": {
"description": "音调",
"type": "number",
"hint": "音调(音高)调节比例,1.0 为正常音调。仅部分模型支持。",
},
"cosyvoice_base_url": {
"description": "WebSocket 地址",
"type": "string",
"hint": "CosyVoice WebSocket 地址。北京: wss://dashscope.aliyuncs.com/api-ws/v1/inference;新加坡: wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference",
},
"gm_resp_image_modal": {
"description": "启用图片模态",
"type": "bool",
Expand Down Expand Up @@ -3127,6 +3225,23 @@ class ChatProviderTemplate(TypedDict):
"provider_tts_settings.enable": True,
},
},
"provider_tts_settings.tts_text_filter.enable": {
"description": "过滤 TTS 文本中的括号内容",
"type": "bool",
"hint": "开启后将自动去除 *文字*、【文字】、(文字) 等括号/标记内容,避免 TTS 朗读情绪标记",
"condition": {
"provider_tts_settings.enable": True,
},
},
"provider_tts_settings.tts_text_filter.custom_rules": {
"description": "自定义 TTS 过滤正则",
"type": "list",
"items": {"type": "string"},
"hint": "每行一条正则表达式,将匹配到的内容从 TTS 文本中移除",
"condition": {
"provider_tts_settings.enable": True,
},
},
"provider_settings.image_caption_prompt": {
"description": "图片转述提示词",
"type": "text",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,13 @@ async def process(
"[Live Mode] TTS Provider 未配置,将使用普通流式模式"
)

# 获取 TTS 文本过滤配置
tts_filter_cfg = self.ctx.astrbot_config.get(
"provider_tts_settings", {}
).get("tts_text_filter", {})
tts_filter_enable = tts_filter_cfg.get("enable", False)
tts_filter_rules = tts_filter_cfg.get("custom_rules", [])

# 使用 run_live_agent,总是使用流式响应
event.set_result(
MessageEventResult()
Expand All @@ -289,6 +296,8 @@ async def process(
self.show_tool_use,
self.show_tool_call_result,
show_reasoning=self.show_reasoning,
tts_filter_enable=tts_filter_enable,
tts_filter_custom_rules=tts_filter_rules,
buffer_intermediate_messages=self.buffer_intermediate_messages,
),
),
Expand Down
24 changes: 22 additions & 2 deletions astrbot/core/pipeline/result_decorate/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from astrbot.core.star.session_llm_manager import SessionServiceManager
from astrbot.core.star.star import star_map
from astrbot.core.star.star_handler import EventType, star_handlers_registry
from astrbot.core.utils.tts_text_filter import TTSTextFilter

from ..context import PipelineContext
from ..stage import Stage, register_stage, registered_stages
Expand Down Expand Up @@ -296,8 +297,27 @@ async def process(
for comp in result.chain:
if isinstance(comp, Plain) and len(comp.text) > 1:
try:
logger.info(f"TTS 请求: {comp.text}")
audio_path = await tts_provider.get_audio(comp.text)
# 应用 TTS 文本过滤
tts_filter_config = self.ctx.astrbot_config[
"provider_tts_settings"
].get("tts_text_filter", {})
tts_filter_enable = tts_filter_config.get("enable", False)
tts_custom_rules = tts_filter_config.get("custom_rules", [])
Comment thread
yuxwd marked this conversation as resolved.

if tts_filter_enable:
tts_text = TTSTextFilter.apply(
comp.text, tts_custom_rules
)
else:
tts_text = comp.text
Comment thread
yuxwd marked this conversation as resolved.

if not tts_text:
# 过滤后为空,跳过 TTS
new_chain.append(comp)
continue

logger.info(f"TTS 请求: {tts_text}")
audio_path = await tts_provider.get_audio(tts_text)
logger.info(f"TTS 结果: {audio_path}")
if not audio_path:
logger.error(
Expand Down
8 changes: 8 additions & 0 deletions astrbot/core/provider/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,14 @@ def dynamic_import_provider(self, type: str) -> None:
from .sources.gemini_tts_source import (
ProviderGeminiTTSAPI as ProviderGeminiTTSAPI,
)
case "qwen_tts_realtime":
from .sources.qwen_tts_realtime_source import (
ProviderQwenTTSRealtime as ProviderQwenTTSRealtime,
)
case "cosyvoice_tts":
from .sources.cosyvoice_tts_source import (
ProviderCosyVoiceTTS as ProviderCosyVoiceTTS,
)
case "openai_embedding":
from .sources.openai_embedding_source import (
OpenAIEmbeddingProvider as OpenAIEmbeddingProvider,
Expand Down
96 changes: 96 additions & 0 deletions astrbot/core/provider/sources/cosyvoice_tts_source.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""CosyVoice TTS provider using DashScope API.

Supports models:
- cosyvoice-v3.5-plus, cosyvoice-v3.5-flash
- cosyvoice-v3-plus, cosyvoice-v3-flash
- cosyvoice-v2, cosyvoice-v1
- sambert-* models

Uses dashscope.audio.tts_v2.SpeechSynthesizer for non-streaming TTS.
"""

import asyncio
import os
import uuid

from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue (bug_risk): Guard the CosyVoice DashScope import similarly to the Qwen provider to avoid hard ImportError.

This provider imports SpeechSynthesizer at module import time without a try/except, so if the DashScope TTS v2 API or audio.tts_v2 isn’t available, importing this module will raise and can break provider loading.

To align with Qwen and fail gracefully, wrap the import in a try/except ImportError, set AudioFormat and SpeechSynthesizer to None on failure, and then have _synthesize / get_audio raise a clear RuntimeError when the SDK capability is missing.


from astrbot.core import logger
from astrbot.core.utils.astrbot_path import get_astrbot_temp_path

from ..entities import ProviderType
from ..provider import TTSProvider
from ..register import register_provider_adapter


@register_provider_adapter(
    "cosyvoice_tts",
    "CosyVoice TTS (DashScope)",
    provider_type=ProviderType.TEXT_TO_SPEECH,
)
class ProviderCosyVoiceTTS(TTSProvider):
    """Non-streaming CosyVoice text-to-speech provider built on DashScope.

    Each call to :meth:`get_audio` creates a fresh ``SpeechSynthesizer``,
    runs the blocking SDK call in the default executor and writes the
    returned bytes to a ``.wav`` file in the AstrBot temp directory.
    """

    def __init__(
        self,
        provider_config: dict,
        provider_settings: dict,
    ) -> None:
        """Read the CosyVoice settings from ``provider_config``.

        Args:
            provider_config: Provider entry; the keys read here are
                ``api_key``, ``cosyvoice_voice``, ``cosyvoice_speech_rate``,
                ``cosyvoice_volume``, ``cosyvoice_pitch_rate``, ``timeout``
                (seconds), ``cosyvoice_base_url`` and ``model``.
            provider_settings: Forwarded unchanged to ``TTSProvider``.
        """
        super().__init__(provider_config, provider_settings)
        self.chosen_api_key: str = provider_config.get("api_key", "")
        self.voice: str = provider_config.get("cosyvoice_voice", "longanyang")
        self.speech_rate: float = provider_config.get("cosyvoice_speech_rate", 1.0)
        self.volume: float = provider_config.get("cosyvoice_volume", 1.0)
        self.pitch_rate: float = provider_config.get("cosyvoice_pitch_rate", 1.0)
        # The config stores the timeout in seconds (possibly as a string);
        # it is handed to the SDK call in milliseconds.
        self.timeout_ms: float = float(provider_config.get("timeout", 20)) * 1000
        self.base_url: str = provider_config.get(
            "cosyvoice_base_url",
            "wss://dashscope.aliyuncs.com/api-ws/v1/inference",
        )

        model = provider_config.get("model", "cosyvoice-v3-flash")
        self.set_model(model)

        # Only warn: a non-TLS endpoint is suspicious but not rejected.
        if not self.base_url.startswith("wss://"):
            logger.warning(
                f"[CosyVoice TTS] WebSocket URL 未使用 wss:// 协议: {self.base_url}"
            )

    async def get_audio(self, text: str) -> str:
        """Synthesize ``text`` and return the path of the written audio file.

        Args:
            text: Text to convert to speech.

        Returns:
            Path of a newly created ``cosyvoice_tts_<uuid>.wav`` file inside
            the AstrBot temp directory.

        Raises:
            RuntimeError: If the synthesizer returned no audio data.
        """
        temp_dir = get_astrbot_temp_path()
        os.makedirs(temp_dir, exist_ok=True)

        audio_bytes = await self._synthesize(text)
        if not audio_bytes:
            raise RuntimeError(
                f"Audio synthesis failed for model '{self.get_model()}'. "
                "The model may not be supported or the service is unavailable.",
            )

        # A random UUID keeps concurrent requests from overwriting each other.
        path = os.path.join(temp_dir, f"cosyvoice_tts_{uuid.uuid4()}.wav")
        with open(path, "wb") as f:
            f.write(audio_bytes)
        return path

    async def _synthesize(self, text: str) -> bytes | None:
        """Run one blocking CosyVoice synthesis call in the default executor.

        The configured speech rate, pitch rate and volume are forwarded to
        the synthesizer; previously they were read from the config but never
        applied.

        Args:
            text: Text to convert to speech.

        Returns:
            Whatever ``SpeechSynthesizer.call`` returns: the audio bytes, or
            a falsy value when no audio was produced.
        """
        loop = asyncio.get_running_loop()

        model = self.get_model()
        fmt = AudioFormat.WAV_24000HZ_MONO_16BIT

        # The config expresses volume as a ratio (1.0 = normal).
        # NOTE(review): the DashScope SDK documents ``volume`` as an integer
        # in [0, 100] with 50 as the default, so 1.0 is mapped onto 50 and the
        # result is clamped. Confirm against the installed SDK version.
        sdk_volume = max(0, min(100, round(float(self.volume) * 50)))

        synthesizer = SpeechSynthesizer(
            model=model,
            voice=self.voice,
            format=fmt,
            volume=sdk_volume,
            speech_rate=float(self.speech_rate),
            pitch_rate=float(self.pitch_rate),
            api_key=self.chosen_api_key,
            url=self.base_url,
        )

        # ``call`` blocks on network I/O, so keep it off the event loop.
        audio_bytes = await loop.run_in_executor(
            None,
            synthesizer.call,
            text,
            self.timeout_ms,
        )

        return audio_bytes
Loading