Running UtterTune, LoRA-Adapter-Based Phoneme-Level Pronunciation and Prosody Control for CosyVoice2, from the CLI and a WebUI

Introduction

About a month ago, UtterTune, which builds on CosyVoice2, was released.

The repository is here:

https://github.com/shuheikatoinfo/UtterTune

It uses LoRA so that accent and pitch can be produced more accurately.

The upstream repository only runs from the CLI, so I'll get it working in a WebUI as well and then actually run some inference.

Development Environment

Environment Setup

I'll follow the README.

git clone https://github.com/shuheikatoinfo/UtterTune.git
cd UtterTune
git submodule update --init --recursive

Download the models:

mkdir -p pretrained_models

# Download CosyVoice2-0.5B
git clone https://www.modelscope.cn/iic/CosyVoice2-0.5B.git pretrained_models/CosyVoice2-0.5B

# Download LoRA weights
git lfs install
git clone https://huggingface.co/shuheikatoinfo/UtterTune-CosyVoice2-ja-JSUTJVS lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS
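Before moving on, you can quickly confirm the files landed where the later steps expect them. This is just a sanity-check snippet of mine; the adapter file names (adapter_config.json, adapter_model.safetensors, embed_patch.safetensors) are the ones the loading code shown later in this post reads.

python - <<'PY'
from pathlib import Path

base = Path("pretrained_models/CosyVoice2-0.5B")
lora = Path("lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS")

print(base, "OK" if base.is_dir() else "MISSING")
for name in ["adapter_config.json", "adapter_model.safetensors", "embed_patch.safetensors"]:
    f = lora / name
    print(f, "OK" if f.exists() else "MISSING")
PY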

Set up the virtual environment and install the dependencies:

uv venv -p 3.10
source .venv/bin/activate
uv pip install -r submodules/CosyVoice/requirements.txt  -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com

# Register the submodule paths with the venv via a .pth file
python - <<'PY'
import site, os
sp = next(p for p in site.getsitepackages() if p.endswith("site-packages"))
pth = os.path.join(sp, "cosyvoice_submodule.pth")
with open(pth, "w", encoding="utf-8") as f:
    f.write(os.path.abspath("submodules/CosyVoice") + "\n")
    f.write(os.path.abspath("submodules/CosyVoice/third_party/Matcha-TTS") + "\n")
print("Wrote:", pth)
PY
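If the .pth file was written correctly, the submodules should now be importable from the venv. A quick check (my own snippet, not from the README; both package names come from the paths registered above):

python - <<'PY'
import cosyvoice
import matcha  # provided by third_party/Matcha-TTS
print("cosyvoice:", cosyvoice.__file__)
PY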

Inference from the CLI

First, let's run inference from the CLI. Note that --texts here contains two entries separated by "|": the plain sentence and a phoneme-annotated version of the same sentence, so the two outputs can be compared.

python -m scripts.cv2.infer \
    --base_model pretrained_models/CosyVoice2-0.5B \
    --lora_dir lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS \
    --texts "魑魅魍魎が跋扈する。|<PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。" \
    --prompt_wav prompts/wav/common_voice_ja_41758953.wav \
    --prompt_text prompts/trans/common_voice_ja_41758953.txt \
    --out_dir wavs_out
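In the annotated entry, the reading goes between <PHON_START> and <PHON_END> in katakana, with ' marking the accent nucleus and / a morpheme boundary (the WebUI help text below documents the same notation). If you build these strings often, a tiny helper like the following hypothetical one keeps the tags out of the way:

python - <<'PY'
def phon(kana: str) -> str:
    # Wrap a katakana reading in UtterTune's pronunciation-control tags
    return "<PHON_START>" + kana + "<PHON_END>"

print(phon("チ'ミ/モーリョー") + "が" + phon("バ'ッコ") + "する。")
PY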

Building and Running the WebUI

Next, I want a GUI where I can run inference and check the results, so let's build one.

The WebUI code is as follows:

#!/usr/bin/env python3
"""
UtterTune WebUI - Gradio interface for UtterTune TTS
"""

import argparse
import json
import random
import time
from pathlib import Path

import gradio as gr
import numpy as np
import safetensors.torch as st
import torch
import torchaudio
from peft import LoraConfig, get_peft_model

from scripts.cv2.patch import apply_patch

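# Apply UtterTune's patch to CosyVoice before importing it, matching the
# order used by the upstream CLI script (scripts/cv2/infer)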
apply_patch()

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.tokenizer.tokenizer import get_qwen_tokenizer

# Global variables
cosyvoice = None
lora_model = None
tok = None
new_ids = None
device = None


def load_models(base_model_path, lora_path=None):
    """Load CosyVoice2 base model and optionally LoRA weights"""
    global cosyvoice, lora_model, tok, new_ids, device

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print(f"Loading base model from {base_model_path}...")
    cosyvoice = CosyVoice2(model_dir=base_model_path, fp16=False)

    if lora_path is not None:
        print(f"Loading LoRA from {lora_path}...")

        # Expand vocabulary
        tok = get_qwen_tokenizer(
            token_path=f"{base_model_path}/CosyVoice-BlankEN",
            skip_special_tokens=True
        )

        # Register new special tokens
        new_tokens = ["<PHON_START>", "<PHON_END>"]
        tok.tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
        tok.special_tokens["additional_special_tokens"].extend(
            [t for t in new_tokens if t not in tok.special_tokens["additional_special_tokens"]]
        )

        base_model = cosyvoice.model.llm
        # Stash the original LLM and tokenizer so the UI can switch back later
        cosyvoice.model._original_llm = base_model
        cosyvoice.frontend._original_tokenizer = cosyvoice.frontend.tokenizer
        base_model.llm.model.resize_token_embeddings(len(tok.tokenizer))
        new_ids = tok.tokenizer.convert_tokens_to_ids(new_tokens)

        # Load LoRA config
        with open(Path(lora_path) / "adapter_config.json") as f:
            adapter_config = json.load(f)

        lora_cfg = LoraConfig(
            r=adapter_config["r"],
            lora_alpha=adapter_config["lora_alpha"],
            lora_dropout=adapter_config["lora_dropout"],
            bias=adapter_config["bias"],
            target_modules=adapter_config["target_modules"],
        )
        lora_model = get_peft_model(base_model, lora_cfg)

        # Load LoRA weights
        lora_weights = st.load_file(Path(lora_path) / "adapter_model.safetensors")
        lora_model.load_state_dict(lora_weights, strict=False)
        lora_model.to(device).eval()

        # Load embeddings of the new tokens
        rows = st.load_file(Path(lora_path) / "embed_patch.safetensors")["embed_rows"].to(device)

        with torch.no_grad():
            lora_model.base_model.llm.model.get_input_embeddings().weight[new_ids] = rows

        print(f"LoRA loaded successfully. New token IDs: {new_ids}")

    print("Models loaded successfully!")


def trim_wav(wav: torch.Tensor, sr: int, trigger_level: float = 7.0) -> torch.Tensor:
    """Trim silence from audio using VAD"""
    trimmed = torchaudio.functional.vad(wav, sr, trigger_level=trigger_level)

    if trimmed.shape[-1] > 0:
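        # vad() only strips leading silence, so flip the signal, trim again,
        # and flip back to strip the trailing silence as well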
        trimmed_rev = torchaudio.functional.vad(
            trimmed.flip(-1), sr, trigger_level=trigger_level
        )
        trimmed = trimmed_rev.flip(-1)

    return trimmed


def generate_seed():
    """Generate a random seed"""
    return random.randint(1, 100000000)


def synthesize(
    text: str,
    use_lora: bool,
    prompt_audio,
    prompt_text: str,
    seed: int,
    trim_output: bool,
    progress=gr.Progress()
):
    """Synthesize speech from text"""
    global cosyvoice, lora_model, tok

    if not text or text.strip() == "":
        return None, "エラー: テキストを入力してください"

    if prompt_audio is None:
        return None, "エラー: プロンプト音声をアップロードしてください"

    if not prompt_text or prompt_text.strip() == "":
        return None, "エラー: プロンプトテキストを入力してください"

    try:
        # Set random seed
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Switch between the LoRA-adapted LLM and the plain base model
        if use_lora and lora_model is not None:
            lora_model.base_model.enable_adapter_layers()
            cosyvoice.model.llm = lora_model
            cosyvoice.frontend.tokenizer = tok
            status = "Using UtterTune LoRA"
        else:
            # PEFT injects LoRA layers into the base model in place, so they
            # must be disabled as well as swapping the module references back
            if lora_model is not None:
                lora_model.base_model.disable_adapter_layers()
            if hasattr(cosyvoice.model, "_original_llm"):
                cosyvoice.model.llm = cosyvoice.model._original_llm
            if hasattr(cosyvoice.frontend, "_original_tokenizer"):
                cosyvoice.frontend.tokenizer = cosyvoice.frontend._original_tokenizer
            status = "Using base model"

        # Load and process prompt audio
        progress(0.2, desc="Processing prompt audio...")
        prompt_wav, sr = torchaudio.load(prompt_audio)

        if sr != 16000:
            prompt_wav = torchaudio.functional.resample(prompt_wav, sr, 16000)

        if prompt_wav.shape[0] > 1:
            prompt_wav = prompt_wav.mean(dim=0, keepdim=True)

        prompt_speech_16k = trim_wav(prompt_wav, 16000)

        # Synthesize
        progress(0.5, desc="Synthesizing speech...")
        t0 = time.perf_counter()

        wav_iter = cosyvoice.inference_zero_shot(
            tts_text=text,
            prompt_text=prompt_text,
            prompt_speech_16k=prompt_speech_16k,
        )

        # inference_zero_shot yields audio chunk by chunk; collect every chunk
        # instead of keeping only the first one
        wav = torch.cat([chunk["tts_speech"] for chunk in wav_iter], dim=-1)
        dt = time.perf_counter() - t0

        # Trim output if requested
        if trim_output:
            progress(0.9, desc="Trimming output audio...")
            wav = trim_wav(wav, cosyvoice.sample_rate)

        # Convert to numpy for Gradio
        audio_np = wav.cpu().numpy().flatten()
        duration = len(audio_np) / cosyvoice.sample_rate

        info = f"{status}\nSynthesis time: {dt:.2f} s\nAudio length: {duration:.2f} s\nRTF: {dt/duration:.2f}"

        progress(1.0, desc="Done!")

        return (cosyvoice.sample_rate, audio_np), info

    except Exception as e:
        import traceback
        error_msg = f"エラーが発生しました:\n{str(e)}\n\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


def create_ui(args):
    """Create Gradio UI"""

    with gr.Blocks(title="UtterTune WebUI") as demo:
        gr.Markdown("""
        # 🎛️ UtterTune WebUI

        **LoRA-based TTS with phoneme-level pronunciation and prosody control**

        - 📄 [Paper](https://www.arxiv.org/abs/2508.09767)
        - 🤗 [Model](https://huggingface.co/shuheikatoinfo/UtterTune-CosyVoice2-ja-JSUTJVS)
        - 💻 [GitHub](https://github.com/shuheikatoinfo/UtterTune)
        """)

        gr.Markdown("## 使い方")

        with gr.Row():
            with gr.Column():
                gr.Markdown("""
                ### 発音制御の記法
                - `<PHON_START>カナ表記<PHON_END>` - 発音を明示的に指定
                - `'` - アクセント核(高音→低音への遷移点)
                - `/` - 形態素境界

                ### 例
                ```
                魑魅魍魎が跋扈する。
                ↓ 発音を制御
                <PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。
                ```
                """)

            with gr.Column():
                use_lora = gr.Checkbox(
                    label="UtterTune LoRAを使用(発音制御を有効にする場合はチェック)",
                    value=True if args.lora_dir else False,
                    interactive=bool(args.lora_dir)
                )

                if not args.lora_dir:
                    gr.Markdown("⚠️ LoRAが読み込まれていません。`--lora_dir`を指定してください。")

        gr.Markdown("---")
        gr.Markdown("## 音声合成")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(
                    label="合成テキスト",
                    lines=3,
                    placeholder="ここにテキストを入力してください...\n発音制御を使う場合は<PHON_START>カナ<PHON_END>を使用",
                    value="魑魅魍魎が跋扈する。"
                )

                with gr.Row():
                    seed_btn = gr.Button("🎲 Random seed", size="sm")
                    seed_input = gr.Number(
                        label="シード値",
                        value=42,
                        precision=0
                    )

                trim_output = gr.Checkbox(
                    label="出力音声の前後をトリミング",
                    value=False
                )

            with gr.Column():
                gr.Markdown("**プロンプト音声(参照音声)** - 4秒以下の音声を推奨")
                prompt_audio = gr.Audio(
                    label="プロンプト音声",
                    type="filepath"
                )

                prompt_text_input = gr.Textbox(
                    label="プロンプトテキスト(参照音声の書き起こし)",
                    lines=2,
                    placeholder="プロンプト音声の内容を入力...",
                )

        generate_btn = gr.Button("🎤 Synthesize", variant="primary", size="lg")

        with gr.Row():
            audio_output = gr.Audio(
                label="合成音声",
                type="numpy"
            )
            info_output = gr.Textbox(
                label="情報",
                lines=5
            )

        # Examples
        gr.Markdown("---")
        gr.Markdown("## サンプル")

        gr.Examples(
            examples=[
                [
                    "魑魅魍魎が跋扈する。",
                    True,
                    "prompts/wav/common_voice_ja_41758953.wav",
                    "こんにちは。",
                    42,
                    False
                ],
                [
                    "<PHON_START>チ'ミ/モーリョー<PHON_END>が<PHON_START>バ'ッコ<PHON_END>する。",
                    True,
                    "prompts/wav/common_voice_ja_41758953.wav",
                    "こんにちは。",
                    42,
                    False
                ],
                [
                    "午後に甘いレモンティーを友達と静かに味わった。",
                    True,
                    "prompts/wav/common_voice_ja_36360364.wav",
                    "パンダの赤ちゃんが元気に育っています。",
                    42,
                    False
                ],
                [
                    "午後に甘い<PHON_START>レモ'ンティー<PHON_END>を友達と静かに味わった。",
                    True,
                    "prompts/wav/common_voice_ja_36360364.wav",
                    "パンダの赤ちゃんが元気に育っています。",
                    42,
                    False
                ],
            ],
            inputs=[text_input, use_lora, prompt_audio, prompt_text_input, seed_input, trim_output],
            label="クリックして試してみてください"
        )

        # Event handlers
        seed_btn.click(
            fn=generate_seed,
            outputs=seed_input
        )

        generate_btn.click(
            fn=synthesize,
            inputs=[text_input, use_lora, prompt_audio, prompt_text_input, seed_input, trim_output],
            outputs=[audio_output, info_output]
        )

    return demo


def main():
    parser = argparse.ArgumentParser(description="UtterTune WebUI")
    parser.add_argument(
        "--base_model",
        type=str,
        default="pretrained_models/CosyVoice2-0.5B",
        help="Path to CosyVoice2 base model"
    )
    parser.add_argument(
        "--lora_dir",
        type=str,
        default=None,
        help="Path to UtterTune LoRA weights"
    )
    parser.add_argument(
        "--port",
        type=int,
        default=7860,
        help="Port for web interface"
    )
    parser.add_argument(
        "--share",
        action="store_true",
        help="Create a public share link"
    )
    args = parser.parse_args()

    # Load models
    load_models(args.base_model, args.lora_dir)

    # Create and launch UI
    demo = create_ui(args)
    demo.queue(max_size=4)
    demo.launch(
        server_name="0.0.0.0",
        server_port=args.port,
        share=args.share
    )


if __name__ == "__main__":
    main()

Launch the WebUI with the following command:

uv run python webui.py --base_model pretrained_models/CosyVoice2-0.5B --lora_dir lora_weights/UtterTune-CosyVoice2-ja-JSUTJVS --port 7860
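Since the server binds to 0.0.0.0, you can open http://localhost:7860 in a browser once it starts (or reach it from another machine on the same network). Passing --share additionally creates a public Gradio link.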