## Introduction
About a month ago, UtterTune, which builds on CosyVoice, was released.
The repository is here: https://github.com/shuheikatoinfo/UtterTune
It uses LoRA to make accent and pitch pronunciation more accurate.
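Pronunciation is controlled inline with kana wrapped in phoneme tags, where `'` marks the accent nucleus (the high-to-low pitch transition) and `/` marks a morpheme boundary, as explained in the WebUI's usage notes later in this post. For example:

```
魑魅魍魎が跋扈する。
↓ with pronunciation control
<PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。
```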
The upstream repository only provides a CLI, so I'll make it usable from a WebUI as well, and then actually run inference.
## Development environment
- Windows 11
- uv 0.9.x
## Setup
I'll follow the README.
```bash
git clone https://github.com/shuheikatoinfo/UtterTune.git
cd UtterTune
git submodule update --init --recursive
```
Download the models.
```bash
mkdir -p pretrained_models

# Download CosyVoice2-0.5B
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B

# Download LoRA weights
git lfs install
git clone https://huggingface.co/shuheikatoinfo/UtterTune-CosyVoice2-ja-JSUTJVS lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS
```
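If the clones succeed, the weights should be laid out as the scripts below expect. Here is a minimal sanity check (the file names are the ones loaded by the WebUI script later in this post; adjust if your layout differs):

```python
# Optional: verify the downloaded weights are where the scripts expect them.
from pathlib import Path

expected = [
    "pretrained_models/CosyVoice2-0.5B",
    "lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS/adapter_config.json",
    "lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS/adapter_model.safetensors",
    "lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS/embed_patch.safetensors",
]
for p in expected:
    print(f"{p}: {'OK' if Path(p).exists() else 'MISSING'}")
```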
Set up the Python environment and wire up the submodule paths. Note that although I'm on Windows 11, the commands below assume a bash-compatible shell (e.g., Git Bash or WSL) because of the heredoc; in PowerShell, activate the venv with `.venv\Scripts\activate` instead.
```bash
uv venv -p 3.10
source .venv/bin/activate
uv pip install -r submodules/CosyVoice/requirements.txt -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

# Register the submodule paths in site-packages via a .pth file
python - <<'PY'
import site, os

sp = next(p for p in site.getsitepackages() if p.endswith("site-packages"))
pth = os.path.join(sp, "cosyvoice_submodule.pth")
with open(pth, "w", encoding="utf-8") as f:
    f.write(os.path.abspath("submodules/CosyVoice") + "\n")
    f.write(os.path.abspath("submodules/CosyVoice/third_party/Matcha-TTS") + "\n")
print("Wrote:", pth)
PY
```
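To confirm the `.pth` file is picked up, you can try importing the two packages it exposes. This assumes the submodules' top-level packages are `cosyvoice` and `matcha`, as in the upstream repositories:

```python
# Quick import check for the paths registered in cosyvoice_submodule.pth
import cosyvoice  # from submodules/CosyVoice
import matcha     # from submodules/CosyVoice/third_party/Matcha-TTS

print("Submodule imports OK")
```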
## Inference from the CLI
First, let's run inference from the CLI. Note that `--texts` is given two variants of the same sentence separated by `|`: one plain, and one with phoneme markup, so the outputs can be compared.
```bash
python -m scripts.cv2.infer \
  --base_model pretrained_models/CosyVoice2-0.5B \
  --lora_dir lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS \
  --texts "魑魅魍魎が跋扈する。|<PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。" \
  --prompt_wav prompts/wav/common_voice_ja_41758953.wav \
  --prompt_text prompts/trans/common_voice_ja_41758953.txt \
  --out_dir wavs_out
```
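If everything is set up correctly, the synthesized wav files should appear under `wavs_out/`.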
## Building and running the WebUI
Next, I want a screen where I can run inference from a GUI and check the results, so let's build one. It will look like this:

The WebUI code is below.
```python
#!/usr/bin/env python3
"""
UtterTune WebUI - Gradio interface for UtterTune TTS
"""
import argparse
import json
import random
import time
from pathlib import Path

import gradio as gr
import numpy as np
import safetensors.torch as st
import torch
import torchaudio
from peft import LoraConfig, get_peft_model

from scripts.cv2.patch import apply_patch

# Patch CosyVoice before importing it
apply_patch()

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.tokenizer.tokenizer import get_qwen_tokenizer

# Global variables
cosyvoice = None
lora_model = None
tok = None
new_ids = None
device = None


def load_models(base_model_path, lora_path=None):
    """Load CosyVoice2 base model and optionally LoRA weights"""
    global cosyvoice, lora_model, tok, new_ids, device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Loading base model from {base_model_path}...")
    cosyvoice = CosyVoice2(model_dir=base_model_path, fp16=False)

    if lora_path is not None:
        print(f"Loading LoRA from {lora_path}...")

        # Expand vocabulary
        tok = get_qwen_tokenizer(
            token_path=f"{base_model_path}/CosyVoice-BlankEN",
            skip_special_tokens=True,
        )

        # Register new special tokens
        new_tokens = ["<PHON_START>", "<PHON_END>"]
        tok.tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
        tok.special_tokens["additional_special_tokens"].extend(
            [t for t in new_tokens if t not in tok.special_tokens["additional_special_tokens"]]
        )

        base_model = cosyvoice.model.llm
        base_model.llm.model.resize_token_embeddings(len(tok.tokenizer))
        new_ids = tok.tokenizer.convert_tokens_to_ids(new_tokens)

        # Load LoRA config
        with open(Path(lora_path) / "adapter_config.json") as f:
            adapter_config = json.load(f)

        lora_cfg = LoraConfig(
            r=adapter_config["r"],
            lora_alpha=adapter_config["lora_alpha"],
            lora_dropout=adapter_config["lora_dropout"],
            bias=adapter_config["bias"],
            target_modules=adapter_config["target_modules"],
        )
        lora_model = get_peft_model(base_model, lora_cfg)

        # Load LoRA weights
        lora_weights = st.load_file(Path(lora_path) / "adapter_model.safetensors")
        lora_model.load_state_dict(lora_weights, strict=False)
        lora_model.to(device).eval()

        # Load embeddings of the new tokens
        rows = st.load_file(Path(lora_path) / "embed_patch.safetensors")["embed_rows"].to(device)
        with torch.no_grad():
            lora_model.base_model.llm.model.get_input_embeddings().weight[new_ids] = rows

        print(f"LoRA loaded successfully. New token IDs: {new_ids}")

    print("Models loaded successfully!")


def trim_wav(wav: torch.Tensor, sr: int, trigger_level: float = 7.0) -> torch.Tensor:
    """Trim silence from both ends of the audio using VAD"""
    trimmed = torchaudio.functional.vad(wav, sr, trigger_level=trigger_level)
    if trimmed.shape[-1] > 0:
        trimmed_rev = torchaudio.functional.vad(
            trimmed.flip(-1), sr, trigger_level=trigger_level
        )
        trimmed = trimmed_rev.flip(-1)
    return trimmed


def generate_seed():
    """Generate a random seed"""
    return random.randint(1, 100000000)


def synthesize(
    text: str,
    use_lora: bool,
    prompt_audio,
    prompt_text: str,
    seed: int,
    trim_output: bool,
    progress=gr.Progress(),
):
    """Synthesize speech from text"""
    global cosyvoice, lora_model, tok

    if not text or text.strip() == "":
        return None, "エラー: テキストを入力してください"
    if prompt_audio is None:
        return None, "エラー: プロンプト音声をアップロードしてください"
    if not prompt_text or prompt_text.strip() == "":
        return None, "エラー: プロンプトテキストを入力してください"

    try:
        # Set random seed
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Switch between the LoRA and base model. get_peft_model() injects
        # the LoRA layers into the LLM in place, so we toggle the adapter
        # layers instead of swapping module references.
        if use_lora and lora_model is not None:
            lora_model.enable_adapter_layers()
            cosyvoice.model.llm = lora_model
            cosyvoice.frontend.tokenizer = tok
            status = "UtterTune LoRA使用中"
        else:
            if lora_model is not None:
                lora_model.disable_adapter_layers()
            status = "ベースモデル使用中"

        # Load and process prompt audio
        progress(0.2, desc="プロンプト音声を処理中...")
        prompt_wav, sr = torchaudio.load(prompt_audio)
        if sr != 16000:
            prompt_wav = torchaudio.functional.resample(prompt_wav, sr, 16000)
        if prompt_wav.shape[0] > 1:
            prompt_wav = prompt_wav.mean(dim=0, keepdim=True)
        prompt_speech_16k = trim_wav(prompt_wav, 16000)

        # Synthesize
        progress(0.5, desc="音声を合成中...")
        t0 = time.perf_counter()
        wav_iter = cosyvoice.inference_zero_shot(
            tts_text=text,
            prompt_text=prompt_text,
            prompt_speech_16k=prompt_speech_16k,
        )
        wav_dict = next(wav_iter)
        dt = time.perf_counter() - t0
        wav = wav_dict["tts_speech"]

        # Trim output if requested
        if trim_output:
            progress(0.9, desc="出力音声をトリミング中...")
            wav = trim_wav(wav, cosyvoice.sample_rate)

        # Convert to numpy for Gradio
        audio_np = wav.cpu().numpy().flatten()
        duration = len(audio_np) / cosyvoice.sample_rate
        info = f"{status}\n合成時間: {dt:.2f}秒\n音声長: {duration:.2f}秒\nRTF: {dt/duration:.2f}"

        progress(1.0, desc="完了!")
        return (cosyvoice.sample_rate, audio_np), info

    except Exception as e:
        import traceback
        error_msg = f"エラーが発生しました:\n{str(e)}\n\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


def create_ui(args):
    """Create Gradio UI"""
    with gr.Blocks(title="UtterTune WebUI") as demo:
        gr.Markdown("""
        # 🎛️ UtterTune WebUI
        **LoRAベースの音素レベル発音・韻律制御TTS**

        - 📄 [論文](https://www.arxiv.org/abs/2508.09767)
        - 🤗 [モデル](https://huggingface.co/shuheikatoinfo/UtterTune-CosyVoice2-ja-JSUTJVS)
        - 💻 [GitHub](https://github.com/shuheikatoinfo/UtterTune)
        """)

        gr.Markdown("## 使い方")
        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                ### 発音制御の記法
                - `<PHON_START>カナ表記<PHON_END>` - 発音を明示的に指定
                - `'` - アクセント核(高音→低音への遷移点)
                - `/` - 形態素境界

                ### 例
                ```
                魑魅魍魎が跋扈する。
                ↓ 発音を制御
                <PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。
                ```
                """)
            with gr.Column():
                use_lora = gr.Checkbox(
                    label="UtterTune LoRAを使用(発音制御を有効にする場合はチェック)",
                    value=bool(args.lora_dir),
                    interactive=bool(args.lora_dir),
                )
                if not args.lora_dir:
                    gr.Markdown("⚠️ LoRAが読み込まれていません。`--lora_dir`を指定してください。")

        gr.Markdown("---")
        gr.Markdown("## 音声合成")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="合成テキスト",
                    lines=3,
                    placeholder="ここにテキストを入力してください...\n発音制御を使う場合は<PHON_START>カナ<PHON_END>を使用",
                    value="魑魅魍魎が跋扈する。",
                )
                with gr.Row():
                    seed_btn = gr.Button("🎲 ランダムシード", size="sm")
                    seed_input = gr.Number(
                        label="シード値",
                        value=42,
                        precision=0,
                    )
                trim_output = gr.Checkbox(
                    label="出力音声の前後をトリミング",
                    value=False,
                )
            with gr.Column():
                gr.Markdown("**プロンプト音声(参照音声)** - 4秒以下の音声を推奨")
                prompt_audio = gr.Audio(
                    label="プロンプト音声",
                    type="filepath",
                )
                prompt_text_input = gr.Textbox(
                    label="プロンプトテキスト(参照音声の書き起こし)",
                    lines=2,
                    placeholder="プロンプト音声の内容を入力...",
                )

        generate_btn = gr.Button("🎤 音声を合成", variant="primary", size="lg")

        with gr.Row():
            audio_output = gr.Audio(
                label="合成音声",
                type="numpy",
            )
            info_output = gr.Textbox(
                label="情報",
                lines=5,
            )

        # Examples
        gr.Markdown("---")
        gr.Markdown("## サンプル")
        gr.Examples(
            examples=[
                [
                    "魑魅魍魎が跋扈する。",
                    True,
                    "prompts/wav/common_voice_ja_41758953.wav",
                    "こんにちは。",
                    42,
                    False,
                ],
                [
                    "<PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。",
                    True,
                    "prompts/wav/common_voice_ja_41758953.wav",
                    "こんにちは。",
                    42,
                    False,
                ],
                [
                    "午後に甘いレモンティーを友達と静かに味わった。",
                    True,
                    "prompts/wav/common_voice_ja_36360364.wav",
                    "パンダの赤ちゃんが元気に育っています。",
                    42,
                    False,
                ],
                [
                    "午後に甘い<PHON_START>レモ'ンティー<PHON_END>を友達と静かに味わった。",
                    True,
                    "prompts/wav/common_voice_ja_36360364.wav",
                    "パンダの赤ちゃんが元気に育っています。",
                    42,
                    False,
                ],
            ],
            inputs=[text_input, use_lora, prompt_audio, prompt_text_input, seed_input, trim_output],
            label="クリックして試してみてください",
        )

        # Event handlers
        seed_btn.click(
            fn=generate_seed,
            outputs=seed_input,
        )
        generate_btn.click(
            fn=synthesize,
            inputs=[text_input, use_lora, prompt_audio, prompt_text_input, seed_input, trim_output],
            outputs=[audio_output, info_output],
        )

    return demo


def main():
    parser = argparse.ArgumentParser(description="UtterTune WebUI")
    parser.add_argument(
        "--base_model",
        type=str,
        default="pretrained_models/CosyVoice2-0.5B",
        help="Path to CosyVoice2 base model",
    )
    parser.add_argument(
        "--lora_dir",
        type=str,
        default=None,
        help="Path to UtterTune LoRA weights",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port for web interface",
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public share link",
    )
    args = parser.parse_args()

    # Load models
    load_models(args.base_model, args.lora_dir)

    # Create and launch UI
    demo = create_ui(args)
    demo.queue(max_size=4)
    demo.launch(
        server_name="0.0.0.0",
        server_port=args.port,
        share=args.share,
    )


if __name__ == "__main__":
    main()
```
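One note on the LoRA checkbox: because `get_peft_model` injects the adapter layers into the LLM module in place, swapping `cosyvoice.model.llm` back to a saved reference would not actually remove the LoRA deltas. The script therefore toggles PEFT's `enable_adapter_layers()` / `disable_adapter_layers()` instead, which also avoids keeping a second copy of the LLM in memory.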
Launch the WebUI with the following command:
```bash
uv run python webui.py --base_model pretrained_models/CosyVoice2-0.5B --lora_dir lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS --port 7860
```
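Once it starts, open http://localhost:7860 in a browser. Since the script listens on `0.0.0.0`, the UI is also reachable from other machines on the same network.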