12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -3,3 +3,15 @@ venv/
__pycache__
__pycache__/
*output/
*.wav
*.pkf
*.mp3
*.flac
*.opus
*.ogg
*.aac
*.m4a


*.md
!README.md
57 changes: 50 additions & 7 deletions README.md
@@ -4,7 +4,7 @@ A modular command-line interface for text-to-speech synthesis, supporting multip

## Features

- Supports multiple TTS engines (currently OpenAI and Kokoro)
- Supports multiple TTS engines (OpenAI, Kokoro, Edge TTS, and Qwen3-TTS)
- Automatic text chunking with configurable chunk sizes
- Parallel processing with multiple workers
- Cost estimation and confirmation for paid services
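
The chunking behavior listed above can be pictured with a minimal sketch. This is an illustrative stand-in, not the repo's actual `TextChunker` (which uses NLTK): split on sentence boundaries, then greedily pack sentences into chunks under the character limit.

```python
import re

def chunk_text(text: str, chunk_size: int = 4000) -> list[str]:
    """Greedy sentence-aware chunking (simplified sketch)."""
    # Naive sentence split on terminal punctuation followed by whitespace.
    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    chunks, current = [], ""
    for sentence in sentences:
        # Start a new chunk if adding this sentence would exceed the limit.
        if current and len(current) + 1 + len(sentence) > chunk_size:
            chunks.append(current)
            current = sentence
        else:
            current = f"{current} {sentence}".strip()
    if current:
        chunks.append(current)
    return chunks
```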
@@ -22,6 +22,8 @@ source venv/bin/activate
pip install -r requirements.txt
```

*Note: Qwen3-TTS requires a GPU with CUDA support for optimal performance and will install `torch` as a dependency.*

### Environment Setup

For OpenAI TTS, you'll need to set your API key:
@@ -47,6 +49,22 @@ Convert text to speech using default settings (Kokoro engine):
python cli.py input.txt --output-dir ./output_audio
```

### Using Edge TTS (Free, High Quality)

```bash
python cli.py input.txt --engine edge-tts --voice en-US-ChristopherNeural
```

### Using Qwen3-TTS (Local Voice Cloning)

```bash
# Voice Cloning Mode
python cli.py input.txt --engine qwen3-tts --mode clone --ref-audio sample.wav --ref-text "Text from sample audio."

# Voice Design Mode
python cli.py input.txt --engine qwen3-tts --mode design --instruct "A calm, deep male voice with a slight British accent."
```

### Using Kokoro Engine with Custom Voice

```bash
@@ -69,13 +87,34 @@ Do you want to proceed? (y/N): y
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--output-dir` | Output directory for audio files | `output` |
| `--chunk-size` | Maximum characters per chunk | 4000 |
| `--max-workers` | Number of parallel workers | 4 |
| `--chunk-size` | Maximum characters per chunk | Engine-dependent (see below) |

### Edge TTS Engine Options

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--voice` | Voice to use (e.g., `en-US-ChristopherNeural`, `en-GB-SoniaNeural`) | `en-US-ChristopherNeural` |
| `--chunk-size` | Maximum characters per chunk | 4000 |

### Qwen3-TTS Engine Options

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--chunk-size` | Maximum characters per chunk | 2000 |
| `--mode` | Generation mode: `clone`, `design`, or `custom` | `clone` |
| `--model` | HuggingFace model ID | `Qwen/Qwen3-TTS-12Hz-1.7B-Base` |
| `--language` | Language for synthesis | `English` |
| `--ref-audio` | Path to reference audio for voice cloning | `""` |
| `--ref-text` | Transcript of reference audio for voice cloning | `""` |
| `--instruct` | Voice design instruction (design mode) | `Warm, clear narrator voice.` |
| `--speaker` | Speaker ID (custom mode) | `aiden` |
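
The table's `custom` mode (preset speaker IDs) has no example above; a usage sketch based on the listed defaults (the `aiden` speaker ID is simply the default from the table):

```bash
# Custom Mode (preset speaker)
python cli.py input.txt --engine qwen3-tts --mode custom --speaker aiden
```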

### Kokoro Engine Options

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--chunk-size` | Maximum characters per chunk | 4000 |
| `--lang-code` | Language code for synthesis | "a" |
| `--speed` | Speech speed multiplier | 1.0 |
| `--voice` | Voice to use: `af_bella`, `af_nicole`, `af_sarah`, `af_sky`, `bf_emma`, `bf_isabella`, `am_adam`, `am_michael`, `bm_george`, `bm_lewis` | "am_michael" |
@@ -85,6 +124,7 @@

| Parameter | Description | Default |
|-----------|-------------|---------|
| `--chunk-size` | Maximum characters per chunk | 4000 |
| `--model` | OpenAI TTS model | "tts-1-hd" |
| `--voice` | Voice to use: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`| "alloy" |
| `--response-format` | Audio format for output | "wav" |
@@ -94,10 +134,12 @@ Available OpenAI voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova
## Full Usage

```bash
usage: cli.py [-h] [--engine {kokoro,openai}] [--output-dir OUTPUT_DIR]
[--chunk-size CHUNK_SIZE] [--max-workers MAX_WORKERS]
[--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE]
[--model MODEL] [--response-format RESPONSE_FORMAT]
usage: cli.py [-h] [--engine {kokoro,openai,edge-tts,qwen3-tts}] [--output-dir OUTPUT_DIR]
[--max-workers MAX_WORKERS] [--chunk-size CHUNK_SIZE]
[--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE]
[--model MODEL] [--response-format RESPONSE_FORMAT]
[--mode MODE] [--language LANGUAGE] [--ref-audio REF_AUDIO]
[--ref-text REF_TEXT] [--instruct INSTRUCT] [--speaker SPEAKER]
input_file
```

@@ -119,4 +161,5 @@ The project is designed to be easily extensible. To add a new TTS engine:
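
Judging from the engine classes visible in this diff, a new engine is a subclass that implements `synthesize`. A minimal self-contained sketch — the stub base types below stand in for `tts_engine.base`, whose exact contents are not shown in this diff:

```python
from dataclasses import dataclass
from pathlib import Path

# Assumed shapes, inferred from the engines in this diff — not the real base module.
@dataclass
class SynthesisResult:
    output_file: Path
    character_count: int

class TTSEngine:
    def __init__(self, config):
        self.config = config

    def synthesize(self, text: str, output_path: Path, chunk_index: int = 1) -> SynthesisResult:
        raise NotImplementedError

class EchoEngine(TTSEngine):
    """Toy engine: writes the text itself instead of synthesizing audio."""

    def synthesize(self, text, output_path, chunk_index=1):
        output_path.write_text(text)
        return SynthesisResult(output_file=output_path, character_count=len(text))
```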
- NLTK for text chunking
- SoundFile for audio processing
- Pydantic for configuration management
- OpenAI and Kokoro SDKs for respective engines
- OpenAI, Kokoro, Edge TTS, and Faster-Qwen3-TTS SDKs
- PyTorch (for Qwen3-TTS)
81 changes: 74 additions & 7 deletions cli.py
@@ -1,9 +1,12 @@
# cli.py
import argparse
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from tqdm import tqdm

from tts_engine.config import TTSConfig
from tts_engine.registry import TTS_REGISTRY
from utils import (
@@ -66,9 +69,67 @@ def create_arg_parser():
help=f"({engines_str}) {info['description']}",
)

# Add benchmark flag
parser.add_argument(
"--thread-benchmark",
action="store_true",
default=False,
help="Benchmark thread counts (1,2,4,6,8) on a sample of chunks, then exit",
)

return parser


def _run_benchmark(engine, chunks, output_dir):
"""Benchmark different thread counts and report results."""
sample = chunks[:3]  # slicing already handles inputs with fewer than 3 chunks
thread_counts = [1, 2, 4, 6, 8]
results = []
best_time = float("inf")
consecutive_worse = 0

print(f"\nBenchmarking with {len(sample)} sample chunk(s)...\n")
print(f"{'Threads':>8} {'Time (s)':>9} {'vs best':>8} Note")
print("-" * 45)

for n_threads in thread_counts:
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_path = Path(tmp_dir)
start = time.perf_counter()
with ThreadPoolExecutor(max_workers=n_threads) as executor:
futures = [
executor.submit(
engine.synthesize,
chunk,
tmp_path / f"bench_{i:04d}.wav",
chunk_index=i + 1,
)
for i, chunk in enumerate(sample)
]
for future in as_completed(futures):
future.result()
elapsed = time.perf_counter() - start

note = ""
if elapsed < best_time:
best_time = elapsed
best_threads = n_threads
consecutive_worse = 0
note = "<-- best so far"
else:
consecutive_worse += 1

results.append((n_threads, elapsed))
ratio = elapsed / best_time
print(f"{n_threads:>8} {elapsed:>9.2f} {ratio:>7.2f}x {note}")

if consecutive_worse >= 2:
print("\nStopping early: 2 consecutive results worse than best.")
break

print(f"\nRecommended: --max-workers {best_threads}")


def main():
parser = create_arg_parser()
args = parser.parse_args()
@@ -83,7 +144,7 @@ def main():
with open(args.input_file) as f:
input_text = f.read()

total_chars = calculate_total_characters(input_text, config.chunk_size)
total_chars = calculate_total_characters(input_text, config.engine_config.chunk_size)

# Calculate and confirm costs if necessary
total_cost = calculate_cost(total_chars, config.engine_config.cost_per_char)
@@ -96,9 +157,14 @@
engine = TTS_REGISTRY[args.engine]["engine"](config.engine_config)

# Process text
chunker = TextChunker(config.chunk_size)
chunker = TextChunker(config.engine_config.chunk_size)
chunks = chunker.process(input_text)

# Run thread benchmark if requested
if args.thread_benchmark:
_run_benchmark(engine, chunks, config.output_dir)
return

# Create output directory
FileManager.create_output_dir(config.output_dir)

@@ -114,9 +180,10 @@
for i, chunk in enumerate(chunks)
]

for future in concurrent.futures.as_completed(futures):
result = future.result()
print(f"Generated: {result.output_file}")
with tqdm(total=len(chunks), desc="Synthesizing", unit="chunk") as pbar:
for future in as_completed(futures):
future.result()
pbar.update(1)


if __name__ == "__main__":
2 changes: 2 additions & 0 deletions requirements.txt
@@ -3,3 +3,5 @@ pydantic==2.10.6
soundfile==0.13.1
openai==1.61.1
edge-tts==6.1.9
faster-qwen3-tts
tqdm
1 change: 1 addition & 0 deletions tts_engine/__init__.py
@@ -1,3 +1,4 @@
from .kokoro import KokoroEngine
from .openai import OpenAIEngine
from .edgetts import EdgeTTSEngine
from .qwen3tts import Qwen3TTSEngine
15 changes: 14 additions & 1 deletion tts_engine/config.py
@@ -21,6 +21,7 @@ class TTSEngineConfig(BaseConfig):

engine_name: str = Field(..., frozen=True)
cost_per_char: float = Field(default=0.0, description="Cost per character in USD")
chunk_size: int = Field(default=4000, description="Maximum characters per chunk")


class KokoroConfig(TTSEngineConfig):
@@ -47,14 +48,26 @@ class EdgeTTSConfig(TTSEngineConfig):
cost_per_char: float = Field(default=0.0, description="Cost per character in USD")


class Qwen3TTSConfig(TTSEngineConfig):
engine_name: Literal["qwen3-tts"] = "qwen3-tts"
mode: str = Field(default="clone", description="Generation mode: clone, design, or custom")
model: str = Field(default="Qwen/Qwen3-TTS-12Hz-1.7B-Base", description="HuggingFace model ID")
language: str = Field(default="English", description="Language for synthesis")
ref_audio: str = Field(default="", description="Path to reference audio for voice cloning")
ref_text: str = Field(default="", description="Transcript of reference audio for voice cloning")
instruct: str = Field(default="Warm, clear narrator voice.", description="Voice design instruction (design mode)")
speaker: str = Field(default="aiden", description="Speaker ID (custom mode)")
chunk_size: int = Field(default=2000, description="Maximum characters per chunk")
cost_per_char: float = Field(default=0.0, description="Cost per character in USD")


class TTSConfig(BaseConfig):
"""Main configuration supporting multiple engines"""

engine_config: TTSEngineConfig
output_dir: Path = Field(
default=Path("output"), description="Output directory for audio files"
)
chunk_size: int = Field(default=4000, description="Maximum characters per chunk")
max_workers: int = Field(default=4, description="Number of parallel workers")

@classmethod
4 changes: 0 additions & 4 deletions tts_engine/edgetts.py
@@ -24,12 +24,8 @@ def synthesize(
self, text: str, output_path: Path, chunk_index: int = 1
) -> SynthesisResult:
"""Synthesize text to speech using Edge TTS"""
print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)")

try:
print(f" 🔊 Generating audio with Edge TTS...")
asyncio.run(self._synthesize_async(text, output_path))
print(f" 💾 Saved audio to {output_path}")

return SynthesisResult(
output_file=output_path,
2 changes: 0 additions & 2 deletions tts_engine/kokoro.py
@@ -18,7 +18,6 @@ def __init__(self, config: KokoroConfig):
def synthesize(
self, text: str, output_path: Path, chunk_index: int = 1
) -> SynthesisResult:
print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)")
try:
generator = self.pipeline(
text,
@@ -28,7 +27,6 @@ def synthesize(
)
results = []
for segment_number, (_, _, audio) in enumerate(generator, start=1):
print(f" 🔊 Audio segment {chunk_index}.{segment_number}")
filename = output_path.with_name(
f"{output_path.stem}_s{segment_number:03d}.wav"
)
10 changes: 0 additions & 10 deletions tts_engine/openai.py
@@ -2,7 +2,6 @@
from tts_engine.config import OpenAIConfig
from pathlib import Path
from openai import OpenAI
import time


class OpenAIEngine(TTSEngine):
@@ -13,25 +12,16 @@ def __init__(self, config: OpenAIConfig):
def synthesize(
self, text: str, output_path: Path, chunk_index: int = 1
) -> SynthesisResult:
print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)")

try:
print(f" 🔊 Requesting audio from OpenAI API...")
start_time = time.time()

response = self.client.audio.speech.create(
model=self.config.model,
voice=self.config.voice,
input=text,
response_format=self.config.response_format,
)

print(f" 💾 Saving audio to {output_path}...")
response.stream_to_file(str(output_path))

processing_time = time.time() - start_time
print(f" ✅ Chunk {chunk_index} completed in {processing_time:.1f}s")

return SynthesisResult(
output_file=output_path,
character_count=len(text),