+*Note: Qwen3-TTS loads its model with `device="cuda"`, so a CUDA-capable GPU is required (there is no CPU fallback), and it will install `torch` as a dependency.*
python cli.py input.txt --output-dir ./output_audio ``` +### Using Edge TTS (Free, High Quality) + +```bash +python cli.py input.txt --engine edge-tts --voice en-US-ChristopherNeural +``` + +### Using Qwen3-TTS (Local Voice Cloning) + +```bash +# Voice Cloning Mode +python cli.py input.txt --engine qwen3-tts --mode clone --ref-audio sample.wav --ref-text "Text from sample audio." + +# Voice Design Mode +python cli.py input.txt --engine qwen3-tts --mode design --instruct "A calm, deep male voice with a slight British accent." +``` + ### Using Kokoro Engine with Custom Voice ```bash @@ -72,6 +90,24 @@ Do you want to proceed? (y/N): y | `--chunk-size` | Maximum characters per chunk | 4000 | | `--max-workers` | Number of parallel workers | 4 | +### Edge TTS Engine Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--voice` | Voice to use (e.g., `en-US-ChristopherNeural`, `en-GB-SoniaNeural`) | `en-US-ChristopherNeural` | + +### Qwen3-TTS Engine Options + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--mode` | Generation mode: `clone`, `design`, or `custom` | `clone` | +| `--model` | HuggingFace model ID | `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | +| `--language` | Language for synthesis | `English` | +| `--ref-audio` | Path to reference audio for voice cloning | `""` | +| `--ref-text` | Transcript of reference audio for voice cloning | `""` | +| `--instruct` | Voice design instruction (design mode) | `Warm, clear narrator voice.` | +| `--speaker` | Speaker ID (custom mode) | `aiden` | + ### Kokoro Engine Options | Parameter | Description | Default | @@ -94,10 +130,12 @@ Available OpenAI voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova ## Full Usage ```bash -usage: cli.py [-h] [--engine {kokoro,openai}] [--output-dir OUTPUT_DIR] +usage: cli.py [-h] [--engine {kokoro,openai,edge-tts,qwen3-tts}] [--output-dir OUTPUT_DIR] [--chunk-size CHUNK_SIZE] [--max-workers MAX_WORKERS] [--lang-code 
LANG_CODE] [--speed SPEED] [--voice VOICE] - [--model MODEL] [--response-format RESPONSE_FORMAT] + [--model MODEL] [--response-format RESPONSE_FORMAT] + [--mode MODE] [--language LANGUAGE] [--ref-audio REF_AUDIO] + [--ref-text REF_TEXT] [--instruct INSTRUCT] [--speaker SPEAKER] input_file ``` @@ -119,4 +157,5 @@ The project is designed to be easily extensible. To add a new TTS engine: - NLTK for text chunking - SoundFile for audio processing - Pydantic for configuration management -- OpenAI and Kokoro SDKs for respective engines +- OpenAI, Kokoro, Edge TTS, and Faster-Qwen3-TTS SDKs +- PyTorch (for Qwen3-TTS) diff --git a/requirements.txt b/requirements.txt index 3c513d7..f375457 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,4 @@ pydantic==2.10.6 soundfile==0.13.1 openai==1.61.1 edge-tts==6.1.9 +faster-qwen3-tts diff --git a/tts_engine/__init__.py b/tts_engine/__init__.py index b90e70f..4cbb7b0 100644 --- a/tts_engine/__init__.py +++ b/tts_engine/__init__.py @@ -1,3 +1,4 @@ from .kokoro import KokoroEngine from .openai import OpenAIEngine from .edgetts import EdgeTTSEngine +from .qwen3tts import Qwen3TTSEngine diff --git a/tts_engine/config.py b/tts_engine/config.py index 9675fe2..5b5dda3 100644 --- a/tts_engine/config.py +++ b/tts_engine/config.py @@ -47,6 +47,18 @@ class EdgeTTSConfig(TTSEngineConfig): cost_per_char: float = Field(default=0.0, description="Cost per character in USD") +class Qwen3TTSConfig(TTSEngineConfig): + engine_name: Literal["qwen3-tts"] = "qwen3-tts" + mode: str = Field(default="clone", description="Generation mode: clone, design, or custom") + model: str = Field(default="Qwen/Qwen3-TTS-12Hz-1.7B-Base", description="HuggingFace model ID") + language: str = Field(default="English", description="Language for synthesis") + ref_audio: str = Field(default="", description="Path to reference audio for voice cloning") + ref_text: str = Field(default="", description="Transcript of reference audio for voice cloning") + instruct: str 
= Field(default="Warm, clear narrator voice.", description="Voice design instruction (design mode)") + speaker: str = Field(default="aiden", description="Speaker ID (custom mode)") + cost_per_char: float = Field(default=0.0, description="Cost per character in USD") + + class TTSConfig(BaseConfig): """Main configuration supporting multiple engines""" diff --git a/tts_engine/qwen3tts.py b/tts_engine/qwen3tts.py new file mode 100644 index 0000000..ffa9475 --- /dev/null +++ b/tts_engine/qwen3tts.py @@ -0,0 +1,55 @@ +# tts_engine/qwen3tts.py +from pathlib import Path +from .base import TTSEngine, SynthesisResult +from tts_engine.config import Qwen3TTSConfig +from utils.file_manager import FileManager + + +class Qwen3TTSEngine(TTSEngine): + def __init__(self, config: Qwen3TTSConfig): + super().__init__(config) + + import torch + from faster_qwen3_tts import FasterQwen3TTS + + print(f"Loading Qwen3-TTS model: {config.model} (mode: {config.mode})") + self.model = FasterQwen3TTS.from_pretrained(config.model, device="cuda", dtype=torch.float16) + + def synthesize(self, text: str, output_path: Path, chunk_index: int = 1) -> SynthesisResult: + print(f"Processing text chunk {chunk_index} ({len(text)} characters)") + try: + mode = self.config.mode + if mode == "clone": + if not self.config.ref_audio or not self.config.ref_text: # empty string = not provided + raise ValueError("Clone mode requires --ref-audio and --ref-text") + wavs, sr = self.model.generate_voice_clone( + text=text, + language=self.config.language, + ref_audio=self.config.ref_audio, + ref_text=self.config.ref_text, + ) + elif mode == "design": + wavs, sr = self.model.generate_voice_design( + text=text, + instruct=self.config.instruct, + language=self.config.language, + ) + elif mode == "custom": + wavs, sr = self.model.generate_custom_voice( + text=text, + speaker=self.config.speaker, + language=self.config.language, + ) + else: + raise ValueError(f"Unknown mode: {mode}. 
Use clone, design, or custom.") + + FileManager.safe_write_audio(output_path, wavs[0], sr) + print(f" Saved chunk {chunk_index} to {output_path}") + + return SynthesisResult( + output_file=output_path, + character_count=len(text), + ) + except Exception as e: + print(f"Synthesis failed: {str(e)}") + raise diff --git a/tts_engine/registry.py b/tts_engine/registry.py index a318f62..1202edc 100644 --- a/tts_engine/registry.py +++ b/tts_engine/registry.py @@ -2,7 +2,8 @@ from .kokoro import KokoroEngine from .openai import OpenAIEngine from .edgetts import EdgeTTSEngine -from .config import KokoroConfig, OpenAIConfig, EdgeTTSConfig +from .qwen3tts import Qwen3TTSEngine +from .config import KokoroConfig, OpenAIConfig, EdgeTTSConfig, Qwen3TTSConfig TTS_REGISTRY = { "kokoro": { @@ -17,4 +18,8 @@ "engine": EdgeTTSEngine, "config": EdgeTTSConfig, }, + "qwen3-tts": { + "engine": Qwen3TTSEngine, + "config": Qwen3TTSConfig, + }, } From f0a72374ed60b10183a46b768e8d19fdd7a3bf3b Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Mon, 2 Mar 2026 12:27:11 -0500 Subject: [PATCH 2/5] feat: implement engine-specific chunk sizes - Move chunk_size from global TTSConfig to engine-specific TTSEngineConfig - Set Qwen3-TTS default chunk size to 2000 for better stability - Update cli.py to use engine-specific chunk_size during processing - Update README.md tables to reflect per-engine chunking defaults --- README.md | 12 ++++++++---- cli.py | 4 ++-- tts_engine/config.py | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index cb03f78..5cc37f3 100644 --- a/README.md +++ b/README.md @@ -87,19 +87,21 @@ Do you want to proceed? 
(y/N): y | Parameter | Description | Default | |-----------|-------------|---------| | `--output-dir` | Output directory for audio files | `output` | -| `--chunk-size` | Maximum characters per chunk | 4000 | | `--max-workers` | Number of parallel workers | 4 | +| `--chunk-size` | Maximum characters per chunk | Engine-dependent (see below) | ### Edge TTS Engine Options | Parameter | Description | Default | |-----------|-------------|---------| | `--voice` | Voice to use (e.g., `en-US-ChristopherNeural`, `en-GB-SoniaNeural`) | `en-US-ChristopherNeural` | +| `--chunk-size` | Maximum characters per chunk | 4000 | ### Qwen3-TTS Engine Options | Parameter | Description | Default | |-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 2000 | | `--mode` | Generation mode: `clone`, `design`, or `custom` | `clone` | | `--model` | HuggingFace model ID | `Qwen/Qwen3-TTS-12Hz-1.7B-Base` | | `--language` | Language for synthesis | `English` | @@ -112,6 +114,7 @@ Do you want to proceed? (y/N): y | Parameter | Description | Default | |-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 4000 | | `--lang-code` | Language code for synthesis | "a" | | `--speed` | Speech speed multiplier | 1.0 | | `--voice` | Voice to use `af_bella`, `af_nicole`, `af_sarah`, `af_sky`, `bf_emma`, `bf_isabella`, `am_adam`, `am_michael`, `bm_george`, `bm_lewis`| "am_michael" | @@ -121,6 +124,7 @@ Do you want to proceed? 
(y/N): y | Parameter | Description | Default | |-----------|-------------|---------| +| `--chunk-size` | Maximum characters per chunk | 4000 | | `--model` | OpenAI TTS model | "tts-1-hd" | | `--voice` | Voice to use: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova`, `sage`, `shimmer`| "alloy" | | `--response-format` | Audio format for output | "wav" | @@ -130,9 +134,9 @@ Available OpenAI voices: `alloy`, `ash`, `coral`, `echo`, `fable`, `onyx`, `nova ## Full Usage ```bash -usage: cli.py [-h] [--engine {kokoro,openai,edge-tts,qwen3-tts}] [--output-dir OUTPUT_DIR] - [--chunk-size CHUNK_SIZE] [--max-workers MAX_WORKERS] - [--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE] +usage: cli.py [-h] [--engine {kokoro,openai,edge-tts,qwen3-tts}] [--output-dir OUTPUT_DIR] + [--max-workers MAX_WORKERS] [--chunk-size CHUNK_SIZE] + [--lang-code LANG_CODE] [--speed SPEED] [--voice VOICE] [--model MODEL] [--response-format RESPONSE_FORMAT] [--mode MODE] [--language LANGUAGE] [--ref-audio REF_AUDIO] [--ref-text REF_TEXT] [--instruct INSTRUCT] [--speaker SPEAKER] diff --git a/cli.py b/cli.py index e509568..6300991 100644 --- a/cli.py +++ b/cli.py @@ -83,7 +83,7 @@ def main(): with open(args.input_file) as f: input_text = f.read() - total_chars = calculate_total_characters(input_text, config.chunk_size) + total_chars = calculate_total_characters(input_text, config.engine_config.chunk_size) # Calculate and confirm costs if necessary total_cost = calculate_cost(total_chars, config.engine_config.cost_per_char) @@ -96,7 +96,7 @@ def main(): engine = TTS_REGISTRY[args.engine]["engine"](config.engine_config) # Process text - chunker = TextChunker(config.chunk_size) + chunker = TextChunker(config.engine_config.chunk_size) chunks = chunker.process(input_text) # Create output directory diff --git a/tts_engine/config.py b/tts_engine/config.py index 5b5dda3..31aed29 100644 --- a/tts_engine/config.py +++ b/tts_engine/config.py @@ -21,6 +21,7 @@ class TTSEngineConfig(BaseConfig): 
Subject: [PATCH 3/5] feat: add tqdm progress bar and clean up engine logging

NOTE(review): as currently split, this commit leaves cli.py broken until
PATCH 4/5 applies — it introduces an empty `parser.add_argument()` call and a
bodyless `if args.thread_benchmark:` block, and `main()` is only restored in
the next commit. Consider squashing with PATCH 4/5 to keep the series
bisectable.
create_arg_parser(): help=f"({engines_str}) {info['description']}", ) - return parser - - -def main(): - parser = create_arg_parser() - args = parser.parse_args() - - # Convert args to dict, only including non-None values - cli_args = {k: v for k, v in vars(args).items() if v is not None} - - # Create config using factory method - config = TTSConfig.create(args.engine, cli_args) - - # Read input text and calculate total characters - with open(args.input_file) as f: - input_text = f.read() + # Add benchmark flag + parser.add_argument( + ) - total_chars = calculate_total_characters(input_text, config.engine_config.chunk_size) + return parser - # Calculate and confirm costs if necessary - total_cost = calculate_cost(total_chars, config.engine_config.cost_per_char) - if not get_user_confirmation(total_cost): - print("Operation cancelled by user.") - return # Create engine instance from registry engine = TTS_REGISTRY[args.engine]["engine"](config.engine_config) @@ -99,6 +82,9 @@ def main(): chunker = TextChunker(config.engine_config.chunk_size) chunks = chunker.process(input_text) + # Run thread benchmark if requested + if args.thread_benchmark: + # Create output directory FileManager.create_output_dir(config.output_dir) @@ -114,9 +100,10 @@ def main(): for i, chunk in enumerate(chunks) ] - for future in concurrent.futures.as_completed(futures): - result = future.result() - print(f"Generated: {result.output_file}") + with tqdm(total=len(chunks), desc="Synthesizing", unit="chunk") as pbar: + for future in as_completed(futures): + future.result() + pbar.update(1) if __name__ == "__main__": diff --git a/requirements.txt b/requirements.txt index f375457..70f75e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ soundfile==0.13.1 openai==1.61.1 edge-tts==6.1.9 faster-qwen3-tts +tqdm diff --git a/tts_engine/edgetts.py b/tts_engine/edgetts.py index cae754c..d258e12 100644 --- a/tts_engine/edgetts.py +++ b/tts_engine/edgetts.py @@ -24,12 +24,8 @@ def 
synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: """Synthesize text to speech using Edge TTS""" - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") - try: - print(f" 🔊 Generating audio with Edge TTS...") asyncio.run(self._synthesize_async(text, output_path)) - print(f" 💾 Saved audio to {output_path}") return SynthesisResult( output_file=output_path, diff --git a/tts_engine/kokoro.py b/tts_engine/kokoro.py index 78c75fd..0e7c40a 100644 --- a/tts_engine/kokoro.py +++ b/tts_engine/kokoro.py @@ -18,7 +18,6 @@ def __init__(self, config: KokoroConfig): def synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") try: generator = self.pipeline( text, @@ -28,7 +27,6 @@ def synthesize( ) results = [] for segment_number, (_, _, audio) in enumerate(generator, start=1): - print(f" 🔊 Audio segment {chunk_index}.{segment_number}") filename = output_path.with_name( f"{output_path.stem}_s{segment_number:03d}.wav" ) diff --git a/tts_engine/openai.py b/tts_engine/openai.py index 2e34653..4348366 100644 --- a/tts_engine/openai.py +++ b/tts_engine/openai.py @@ -2,7 +2,6 @@ from tts_engine.config import OpenAIConfig from pathlib import Path from openai import OpenAI -import time class OpenAIEngine(TTSEngine): @@ -13,12 +12,7 @@ def __init__(self, config: OpenAIConfig): def synthesize( self, text: str, output_path: Path, chunk_index: int = 1 ) -> SynthesisResult: - print(f"📦 Processing text chunk {chunk_index} ({len(text)} characters)") - try: - print(f" 🔊 Requesting audio from OpenAI API...") - start_time = time.time() - response = self.client.audio.speech.create( model=self.config.model, voice=self.config.voice, @@ -26,12 +20,8 @@ def synthesize( response_format=self.config.response_format, ) - print(f" 💾 Saving audio to {output_path}...") response.stream_to_file(str(output_path)) - processing_time = time.time() - 
+def _run_benchmark(engine, chunks, output_dir):  # NOTE(review): output_dir is never used — benchmark audio is written to a temp dir
+    """Benchmark different thread counts and report results."""
+    sample = chunks[:3] if len(chunks) >= 3 else chunks  # NOTE(review): plain chunks[:3] already handles short lists
+    results = []  # NOTE(review): collected but never read after the loop — drop it or use it in the report
def create_arg_parser(): # Run thread benchmark if requested if args.thread_benchmark: + _run_benchmark(engine, chunks, config.output_dir) + return # Create output directory FileManager.create_output_dir(config.output_dir) From 8d7fa6ecbdcf594f9447c1cd38257e14f7886c6d Mon Sep 17 00:00:00 2001 From: Greg Randall Date: Mon, 2 Mar 2026 13:20:22 -0500 Subject: [PATCH 5/5] chore: ignore peak files (*.pkf) --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 9f1f178..348b73c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__ __pycache__/ *output/ *.wav +*.pkf *.mp3 *.flac *.opus