diff --git a/.gitignore b/.gitignore index 4c5e972..348b73c 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,15 @@ venv/ __pycache__ __pycache__/ *output/ +*.wav +*.pkf +*.mp3 +*.flac +*.opus +*.ogg +*.aac +*.m4a + + +*.md +!README.md diff --git a/README.md b/README.md index f63d5d9..5cc37f3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ A modular command-line interface for text-to-speech synthesis, supporting multip ## Features -- Supports multiple TTS engines (currently OpenAI and Kokoro) +- Supports multiple TTS engines (OpenAI, Kokoro, Edge TTS, and Qwen3-TTS) - Automatic text chunking with configurable chunk sizes - Parallel processing with multiple workers - Cost estimation and confirmation for paid services @@ -22,6 +22,8 @@ source venv/bin/activate pip install -r requirements.txt ``` +*Note: Qwen3-TTS requires a CUDA-capable GPU (the engine loads the model on the `cuda` device) and installs `torch` as a dependency.* + ### Environment Setup For OpenAI TTS, you'll need to set your API key: @@ -47,6 +49,22 @@ Convert text to speech using default settings (Kokoro engine): python cli.py input.txt --output-dir ./output_audio ``` +### Using Edge TTS (Free, High Quality) + +```bash +python cli.py input.txt --engine edge-tts --voice en-US-ChristopherNeural +``` + +### Using Qwen3-TTS (Local Voice Cloning) + +```bash +# Voice Cloning Mode +python cli.py input.txt --engine qwen3-tts --mode clone --ref-audio sample.wav --ref-text "Text from sample audio." + +# Voice Design Mode +python cli.py input.txt --engine qwen3-tts --mode design --instruct "A calm, deep male voice with a slight British accent." +``` + ### Using Kokoro Engine with Custom Voice ```bash @@ -69,13 +87,34 @@ Do you want to proceed? 
(y/N): y | Parameter | Description | Default | |-----------|-------------|---------| | `--output-dir` | Output directory for audio files | `output` | -| `--chunk-size` | Maximum characters per chunk | 4000 | | `--max-workers` | Number of parallel workers | 4 | +| `--chunk-size` | Maximum characters per chunk | Engine-dependent (see below) | + +### Edge TTS Engine Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--voice` | Voice to use (e.g., `en-US-ChristopherNeural`, `en-GB-SoniaNeural`) | `en-US-ChristopherNeural` | +| `--chunk-size` | Maximum characters per chunk | 4000 | + +### Qwen3-TTS Engine Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 2000 | +| `--mode` | Generation mode: `clone`, `design`, or `custom` | `clone` | +| `--model` | HuggingFace model ID | `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | +| `--language` | Language for synthesis | `English` | +| `--ref-audio` | Path to reference audio for voice cloning | `""` | +| `--ref-text` | Transcript of reference audio for voice cloning | `""` | +| `--instruct` | Voice design instruction (design mode) | `Warm, clear narrator voice.` | +| `--speaker` | Speaker ID (custom mode) | `aiden` | ### Kokoro Engine Options | Parameter | Description | Default | |-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 4000 | | `--lang-code` | Language code for synthesis | "a" | | `--speed` | Speech speed multiplier | 1.0 | | `--voice` | Voice to use `af_bella`, `af_nicole`, `af_sarah`, `af_sky`, `bf_emma`, `bf_isabella`, `am_adam`, `am_michael`, `bm_george`, `bm_lewis`| "am_michael" | @@ -85,6 +124,7 @@ Do you want to proceed? 
(y/N): y | Parameter | Description | Default | |-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 4000 | | `--model` | OpenAI TTS model | "tts-1-hd" | | `--voice` | Voice to use: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`| "alloy" | | `--response-format` | Audio format for output | "wav" | @@ -94,10 +134,12 @@ Available OpenAI voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova ## Full Usage ```bash -usage: cli.py [-h] [--engine {kokoro,openai}] [--output-dir OUTPUT_DIR] - [--chunk-size CHUNK_SIZE] [--max-workers MAX_WORKERS] - [--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE] - [--model MODEL] [--response-format RESPONSE_FORMAT] +usage: cli.py [-h] [--engine {kokoro,openai,edge-tts,qwen3-tts}] [--output-dir OUTPUT_DIR] + [--max-workers MAX_WORKERS] [--chunk-size CHUNK_SIZE] + [--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE] + [--model MODEL] [--response-format RESPONSE_FORMAT] + [--mode MODE] [--language LANGUAGE] [--ref-audio REF_AUDIO] + [--ref-text REF_TEXT] [--instruct INSTRUCT] [--speaker SPEAKER] [--thread-benchmark] input_file ``` @@ -119,4 +161,5 @@ The project is designed to be easily extensible. 
To add a new TTS engine: - NLTK for text chunking - SoundFile for audio processing - Pydantic for configuration management -- OpenAI and Kokoro SDKs for respective engines +- OpenAI, Kokoro, Edge TTS, and Faster-Qwen3-TTS SDKs +- PyTorch (for Qwen3-TTS) diff --git a/cli.py b/cli.py index e509568..4c14b2a 100644 --- a/cli.py +++ b/cli.py @@ -1,9 +1,12 @@ # cli.py import argparse -import concurrent.futures -from concurrent.futures import ThreadPoolExecutor +import tempfile +import time +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path +from tqdm import tqdm + from tts_engine.config import TTSConfig from tts_engine.registry import TTS_REGISTRY from utils import ( @@ -66,9 +69,67 @@ def create_arg_parser(): help=f"({engines_str}) {info['description']}", ) + # Add benchmark flag + parser.add_argument( + "--thread-benchmark", + action="store_true", + default=False, + help="Benchmark thread counts (1,2,4,6,8) on a sample of chunks, then exit", + ) + return parser +def _run_benchmark(engine, chunks, output_dir): + """Benchmark different thread counts and report results.""" + sample = chunks[:3] if len(chunks) >= 3 else chunks + thread_counts = [1, 2, 4, 6, 8] + results = [] + best_time = float("inf") + consecutive_worse = 0 + + print(f"\nBenchmarking with {len(sample)} sample chunk(s)...\n") + print(f"{'Threads':>8} {'Time (s)':>9} {'vs best':>8} Note") + print("-" * 45) + + for n_threads in thread_counts: + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + start = time.perf_counter() + with ThreadPoolExecutor(max_workers=n_threads) as executor: + futures = [ + executor.submit( + engine.synthesize, + chunk, + tmp_path / f"bench_{i:04d}.wav", + chunk_index=i + 1, + ) + for i, chunk in enumerate(sample) + ] + for future in as_completed(futures): + future.result() + elapsed = time.perf_counter() - start + + note = "" + if elapsed < best_time: + best_time = elapsed + best_threads = n_threads + consecutive_worse 
= 0 + note = "<-- best so far" + else: + consecutive_worse += 1 + + results.append((n_threads, elapsed)) + ratio = elapsed / best_time + print(f"{n_threads:>8} {elapsed:>9.2f} {ratio:>7.2f}x {note}") + + if consecutive_worse >= 2: + print("\nStopping early: 2 consecutive results worse than best.") + break + + print(f"\nRecommended: --max-workers {best_threads}") + + def main(): parser = create_arg_parser() args = parser.parse_args() @@ -83,7 +144,7 @@ def main(): with open(args.input_file) as f: input_text = f.read() - total_chars = calculate_total_characters(input_text, config.chunk_size) + total_chars = calculate_total_characters(input_text, config.engine_config.chunk_size) # Calculate and confirm costs if necessary total_cost = calculate_cost(total_chars, config.engine_config.cost_per_char) @@ -96,9 +157,14 @@ def main(): engine = TTS_REGISTRY[args.engine]["engine"](config.engine_config) # Process text - chunker = TextChunker(config.chunk_size) + chunker = TextChunker(config.engine_config.chunk_size) chunks = chunker.process(input_text) + # Run thread benchmark if requested + if args.thread_benchmark: + _run_benchmark(engine, chunks, config.output_dir) + return + # Create output directory FileManager.create_output_dir(config.output_dir) @@ -114,9 +180,10 @@ def main(): for i, chunk in enumerate(chunks) ] - for future in concurrent.futures.as_completed(futures): - result = future.result() - print(f"Generated: {result.output_file}") + with tqdm(total=len(chunks), desc="Synthesizing", unit="chunk") as pbar: + for future in as_completed(futures): + future.result() + pbar.update(1) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index 3c513d7..70f75e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ pydantic==2.10.6 soundfile==0.13.1 openai==1.61.1 edge-tts==6.1.9 +faster-qwen3-tts +tqdm diff --git a/tts_engine/__init__.py b/tts_engine/__init__.py index b90e70f..4cbb7b0 100644 --- a/tts_engine/__init__.py +++ 
b/tts_engine/__init__.py @@ -1,3 +1,4 @@ from .kokoro import KokoroEngine from .openai import OpenAIEngine from .edgetts import EdgeTTSEngine +from .qwen3tts import Qwen3TTSEngine diff --git a/tts_engine/config.py b/tts_engine/config.py index 9675fe2..31aed29 100644 --- a/tts_engine/config.py +++ b/tts_engine/config.py @@ -21,6 +21,7 @@ class TTSEngineConfig(BaseConfig): engine_name: str = Field(..., frozen=True) cost_per_char: float = Field(default=0.0, description="Cost per character in USD") + chunk_size: int = Field(default=4000, description="Maximum characters per chunk") class KokoroConfig(TTSEngineConfig): @@ -47,6 +48,19 @@ class EdgeTTSConfig(TTSEngineConfig): cost_per_char: float = Field(default=0.0, description="Cost per character in USD") +class Qwen3TTSConfig(TTSEngineConfig): + engine_name: Literal["qwen3-tts"] = "qwen3-tts" + mode: str = Field(default="clone", description="Generation mode: clone, design, or custom") + model: str = Field(default="Qwen/Qwen3-TTS-12Hz-1.7B-Base", description="HuggingFace model ID") + language: str = Field(default="English", description="Language for synthesis") + ref_audio: str = Field(default="", description="Path to reference audio for voice cloning") + ref_text: str = Field(default="", description="Transcript of reference audio for voice cloning") + instruct: str = Field(default="Warm, clear narrator voice.", description="Voice design instruction (design mode)") + speaker: str = Field(default="aiden", description="Speaker ID (custom mode)") + chunk_size: int = Field(default=2000, description="Maximum characters per chunk") + cost_per_char: float = Field(default=0.0, description="Cost per character in USD") + + class TTSConfig(BaseConfig): """Main configuration supporting multiple engines""" @@ -54,7 +68,6 @@ class TTSConfig(BaseConfig): output_dir: Path = Field( default=Path("output"), description="Output directory for audio files" ) - chunk_size: int = Field(default=4000, description="Maximum characters per chunk") 
max_workers: int = Field(default=4, description="Number of parallel workers") @classmethod diff --git a/tts_engine/edgetts.py b/tts_engine/edgetts.py index cae754c..d258e12 100644 --- a/tts_engine/edgetts.py +++ b/tts_engine/edgetts.py @@ -24,12 +24,8 @@ def synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: """Synthesize text to speech using Edge TTS""" - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") - try: - print(f" 🔊 Generating audio with Edge TTS...") asyncio.run(self._synthesize_async(text, output_path)) - print(f" 💾 Saved audio to {output_path}") return SynthesisResult( output_file=output_path, diff --git a/tts_engine/kokoro.py b/tts_engine/kokoro.py index 78c75fd..0e7c40a 100644 --- a/tts_engine/kokoro.py +++ b/tts_engine/kokoro.py @@ -18,7 +18,6 @@ def __init__(self, config: KokoroConfig): def synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") try: generator = self.pipeline( text, @@ -28,7 +27,6 @@ def synthesize( ) results = [] for segment_number, (_, _, audio) in enumerate(generator, start=1): - print(f" 🔊 Audio segment {chunk_index}.{segment_number}") filename = output_path.with_name( f"{output_path.stem}_s{segment_number:03d}.wav" ) diff --git a/tts_engine/openai.py b/tts_engine/openai.py index 2e34653..4348366 100644 --- a/tts_engine/openai.py +++ b/tts_engine/openai.py @@ -2,7 +2,6 @@ from tts_engine.config import OpenAIConfig from pathlib import Path from openai import OpenAI -import time class OpenAIEngine(TTSEngine): @@ -13,12 +12,7 @@ def __init__(self, config: OpenAIConfig): def synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") - try: - print(f" 🔊 Requesting audio from OpenAI API...") - start_time = time.time() - response = 
self.client.audio.speech.create( model=self.config.model, voice=self.config.voice, @@ -26,12 +20,8 @@ def synthesize( response_format=self.config.response_format, ) - print(f" 💾 Saving audio to {output_path}...") response.stream_to_file(str(output_path)) - processing_time = time.time() - start_time - print(f" ✅ Chunk {chunk_index} completed in {processing_time:.1f}s") - return SynthesisResult( output_file=output_path, character_count=len(text), diff --git a/tts_engine/qwen3tts.py b/tts_engine/qwen3tts.py new file mode 100644 index 0000000..3c73c8d --- /dev/null +++ b/tts_engine/qwen3tts.py @@ -0,0 +1,53 @@ +# tts_engine/qwen3tts.py +from pathlib import Path +from .base import TTSEngine, SynthesisResult +from tts_engine.config import Qwen3TTSConfig +from utils.file_manager import FileManager + + +class Qwen3TTSEngine(TTSEngine): + def __init__(self, config: Qwen3TTSConfig): + super().__init__(config) + + import torch + from faster_qwen3_tts import FasterQwen3TTS + + print(f"Loading Qwen3-TTS model: {config.model} (mode: {config.mode})") + self.model = FasterQwen3TTS.from_pretrained(config.model, device="cuda", dtype=torch.float16) + + def synthesize(self, text: str, output_path: Path, chunk_index: int = 1) -> SynthesisResult: + try: + mode = self.config.mode + if mode == "clone": + if not self.config.ref_audio or not self.config.ref_text: # empty string = not provided + raise ValueError("Clone mode requires --ref-audio and --ref-text") + wavs, sr = self.model.generate_voice_clone( + text=text, + language=self.config.language, + ref_audio=self.config.ref_audio, + ref_text=self.config.ref_text, + ) + elif mode == "design": + wavs, sr = self.model.generate_voice_design( + text=text, + instruct=self.config.instruct, + language=self.config.language, + ) + elif mode == "custom": + wavs, sr = self.model.generate_custom_voice( + text=text, + speaker=self.config.speaker, + language=self.config.language, + ) + else: + raise ValueError(f"Unknown mode: {mode}. 
Use clone, design, or custom.") + + FileManager.safe_write_audio(output_path, wavs[0], sr) + + return SynthesisResult( + output_file=output_path, + character_count=len(text), + ) + except Exception as e: + print(f"Synthesis failed: {str(e)}") + raise diff --git a/tts_engine/registry.py b/tts_engine/registry.py index a318f62..1202edc 100644 --- a/tts_engine/registry.py +++ b/tts_engine/registry.py @@ -2,7 +2,8 @@ from .kokoro import KokoroEngine from .openai import OpenAIEngine from .edgetts import EdgeTTSEngine -from .config import KokoroConfig, OpenAIConfig, EdgeTTSConfig +from .qwen3tts import Qwen3TTSEngine +from .config import KokoroConfig, OpenAIConfig, EdgeTTSConfig, Qwen3TTSConfig TTS_REGISTRY = { "kokoro": { @@ -17,4 +18,8 @@ "engine": EdgeTTSEngine, "config": EdgeTTSConfig, }, + "qwen3-tts": { + "engine": Qwen3TTSEngine, + "config": Qwen3TTSConfig, + }, }