From fcfa28150c730632fc9b6793c715eafe29f522e5 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 13:12:26 +0300 Subject: [PATCH 1/3] Add basic CLI --- README.md | 8 ++++++- kittentts/__main__.py | 51 +++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 3 +++ setup.py | 5 +++++ 4 files changed, 66 insertions(+), 1 deletion(-) create mode 100644 kittentts/__main__.py diff --git a/README.md b/README.md index 88fa577..9e2569f 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Try Kitten TTS directly in your browser on [Hugging Face Spaces](https://hugging pip install https://github.com/KittenML/KittenTTS/releases/download/0.8.1/kittentts-0.8.1-py3-none-any.whl ``` -### Basic Usage +### API Usage ```python from kittentts import KittenTTS @@ -85,6 +85,12 @@ import soundfile as sf sf.write("output.wav", audio, 24000) ``` +### CLI usage + +``` +kittentts --output output.wav --text "This high quality TTS model works without a GPU" +``` + ### Advanced Usage ```python diff --git a/kittentts/__main__.py b/kittentts/__main__.py new file mode 100644 index 0000000..bcbd842 --- /dev/null +++ b/kittentts/__main__.py @@ -0,0 +1,51 @@ +import argparse +import datetime + +voices = [ + "expr-voice-2-m", + "expr-voice-2-f", + "expr-voice-3-m", + "expr-voice-3-f", + "expr-voice-4-m", + "expr-voice-4-f", + "expr-voice-5-m", + "expr-voice-5-f", +] + + +def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta: + from kittentts import KittenTTS + import soundfile as sf + + m = KittenTTS(model) + t0 = datetime.datetime.now() + audio = m.generate(text, voice=voice) + sf.write(output, audio, 24000) + t1 = datetime.datetime.now() + return t1 - t0 + + +def main() -> None: + ap = argparse.ArgumentParser(prog="kittentts", description="Run Kitten TTS model") + ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") + ap.add_argument("--text", required=True, help="Text to synthesize") + ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) + ap.add_argument("--output", help="Output audio file") + + args = ap.parse_args() + + if not args.output: + ts = datetime.datetime.now().isoformat(timespec="seconds").replace(":", "-") + args.output = f"{args.voice}-{ts}.wav" + + gen_time = run( + model=args.model, + voice=args.voice, + output=args.output, + text=args.text, + ) + print(f"Generated audio in {gen_time}, saved to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 8a60547..6bcf4fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,9 @@ dependencies = [ "huggingface_hub", ] +[project.scripts] +kittentts = "kittentts.__main__:main" + [project.urls] Homepage = "https://github.com/kittenml/kittentts" Repository = "https://github.com/kittenml/kittentts" diff --git a/setup.py b/setup.py index 5ff10c6..e870b24 100644 --- a/setup.py +++ b/setup.py @@ -43,4 +43,9 @@ "Bug Reports": "https://github.com/kittenml/kittentts/issues", "Source": "https://github.com/kittenml/kittentts", }, + entry_points={ + "console_scripts": [ + "kittentts=kittentts.__main__:main", + ], + }, ) From 6ddf3670f2de51900bedc0ba11d7ae1962ce2c96 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 15:19:39 +0300 Subject: [PATCH 2/3] Add support for stdout output --- kittentts/__main__.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/kittentts/__main__.py b/kittentts/__main__.py index bcbd842..a694839 100644 --- a/kittentts/__main__.py +++ b/kittentts/__main__.py @@ -1,5 +1,7 @@ import argparse import datetime +import io +import sys voices = [ "expr-voice-2-m", @@ -20,7 +22,13 @@ def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta m = KittenTTS(model) t0 = datetime.datetime.now() audio = m.generate(text, voice=voice) - sf.write(output, audio, 24000) + if output == "-": + # sf requires a seekable buffer for writing. + bio = io.BytesIO() + sf.write(bio, audio, 24000, format="WAV", subtype="PCM_16") + sys.stdout.buffer.write(bio.getvalue()) + else: + sf.write(output, audio, 24000) t1 = datetime.datetime.now() return t1 - t0 @@ -30,7 +38,7 @@ def main() -> None: ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") ap.add_argument("--text", required=True, help="Text to synthesize") ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) - ap.add_argument("--output", help="Output audio file") + ap.add_argument("--output", help="Output audio file (- for stdout; use with care)") args = ap.parse_args() @@ -44,7 +52,7 @@ def main() -> None: output=args.output, text=args.text, ) - print(f"Generated audio in {gen_time}, saved to {args.output}") + print(f"Generated audio in {gen_time}, saved to {args.output}", file=sys.stderr) if __name__ == "__main__": From d21858f459ffd66fc8f562e2481e77436618ed89 Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Wed, 6 Aug 2025 17:14:41 +0300 Subject: [PATCH 3/3] Add optional `--speed` --- kittentts/__main__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kittentts/__main__.py b/kittentts/__main__.py index a694839..73c7da6 100644 --- a/kittentts/__main__.py +++ b/kittentts/__main__.py @@ -15,13 +15,13 @@ ] -def run(*, model: str, voice: str, output: str, text: str) -> datetime.timedelta: +def run(*, model: str, voice: str, output: str, text: str, speed: float=1.0) -> datetime.timedelta: from kittentts import KittenTTS import soundfile as sf m = KittenTTS(model) t0 = datetime.datetime.now() - audio = m.generate(text, voice=voice) + audio = m.generate(text, voice=voice, speed=speed) if output == "-": # sf requires a seekable buffer for writing. bio = io.BytesIO() @@ -38,6 +38,7 @@ def main() -> None: ap.add_argument("--model", default="KittenML/kitten-tts-nano-0.1", help="Model to use") ap.add_argument("--text", required=True, help="Text to synthesize") ap.add_argument("--voice", default="expr-voice-2-f", help="Voice to use", choices=voices) + ap.add_argument("--speed", type=float, default=1.0, help="Speech speed (1.0 = normal)") ap.add_argument("--output", help="Output audio file (- for stdout; use with care)") args = ap.parse_args() @@ -51,6 +52,7 @@ def main() -> None: voice=args.voice, output=args.output, text=args.text, + speed=args.speed, ) print(f"Generated audio in {gen_time}, saved to {args.output}", file=sys.stderr)