tacotron2をWindowsで動かす

初めに

いまさらですが、tacotron2を触ってみます

以下に現在の環境に合わせてuvで環境構築をしたRepositoryを作成しました

github.com

tacotron2の構成と課題

自己回帰 + Attentionのアーキテクチャになっているが以下のような課題があった

  • 推論に時間がかかる
  • Attention部分で確率の計算をしているためスキップや連続の音が間違えることがあった

開発環境

  • Windows 11
  • cuda 12.6

環境構築

uv環境に切り替える前提で進めていきます

まずは3.10の環境構築を行います

uv python pin 3.10

次に以下のような pyproject.toml を作成します。

[project]
name = "tacotron2"
version = "0.1.0"
description = "Tacotron2 Text-to-Speech"
# Upper bound <3.12 — presumably because the NVIDIA Tacotron2 hub code
# targets older Python; confirm before loosening.
requires-python = ">=3.10,<3.12"
dependencies = [
    "gdown>=5.2.0",
    "inflect>=7.5.0",
    "librosa>=0.11.0",
    "matplotlib>=3.10.8",
    "scipy>=1.15.3",
    "torch>=2.6.0",
    "torchaudio>=2.6.0",
    "torchvision>=0.21.0",
    "unidecode>=1.4.0",
]

# Extra package index serving CUDA 12.4 builds of PyTorch.
# explicit = true keeps uv from using this index for anything except the
# packages explicitly routed to it in [tool.uv.sources] below.
[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true

# Route the PyTorch family to the CUDA index instead of PyPI's CPU wheels.
[tool.uv.sources]
torch = { index = "pytorch-cu124" }
torchvision = { index = "pytorch-cu124" }
torchaudio = { index = "pytorch-cu124" }

これをもとに uv sync を行い依存関係をインストールします。なお、モデル本体は uv sync 時ではなく、初回実行時に PyTorch Hub からダウンロードされます

実行

環境が作成できたので、推論コードを作成して実行していきます。

import argparse
import warnings
import torch
from scipy.io.wavfile import write


def load_models(device: str = "cuda"):
    """Fetch Tacotron2, WaveGlow, and the text utilities from PyTorch Hub.

    Args:
        device: Target device for both models ("cuda" or "cpu").

    Returns:
        Tuple of (tacotron2, waveglow, utils) with both models moved to
        ``device`` and switched to eval mode.
    """
    hub_repo = 'NVIDIA/DeepLearningExamples:torchhub'

    print("Loading Tacotron2...")
    tacotron2 = torch.hub.load(
        hub_repo, 'nvidia_tacotron2', model_math='fp32', trust_repo=True
    )
    tacotron2 = tacotron2.to(device).eval()

    print("Loading WaveGlow...")
    waveglow = torch.hub.load(
        hub_repo, 'nvidia_waveglow', model_math='fp32', trust_repo=True
    )
    # remove_weightnorm prepares the checkpoint for inference, following
    # NVIDIA's published usage example for this hub entry.
    waveglow = waveglow.remove_weightnorm(waveglow)
    waveglow = waveglow.to(device).eval()

    print("Loading text utilities...")
    utils = torch.hub.load(hub_repo, 'nvidia_tts_utils', trust_repo=True)

    return tacotron2, waveglow, utils


def synthesize(text: str, tacotron2, waveglow, utils, device: str = "cuda", sigma: float = 0.666):
    """Run TTS inference: text -> mel spectrogram -> waveform.

    Args:
        text: Sentence to synthesize.
        tacotron2: Acoustic model exposing ``infer(sequences, lengths)``.
        waveglow: Vocoder exposing ``infer(mel, sigma=...)``.
        utils: Helper exposing ``prepare_input_sequence([text])``.
        device: Device the models live on.
        sigma: WaveGlow sampling noise scale.

    Returns:
        1-D numpy array of synthesized audio samples.
    """
    # Encode the text into a padded id sequence plus its length.
    seqs, seq_lens = utils.prepare_input_sequence([text])
    seqs, seq_lens = seqs.to(device), seq_lens.to(device)

    # Inference only — no autograd bookkeeping needed.
    with torch.no_grad():
        mel_spec, _, _ = tacotron2.infer(seqs, seq_lens)
        waveform = waveglow.infer(mel_spec, sigma=sigma)

    return waveform.squeeze().cpu().numpy()


def main():
    """CLI entry point: parse arguments, load models, synthesize, save a WAV.

    Falls back to CPU when CUDA is requested but unavailable. Writes a
    16-bit PCM WAV at 22050 Hz (the sample rate used by NVIDIA's
    Tacotron2/WaveGlow hub models).
    """
    parser = argparse.ArgumentParser(
        description='Text-to-Speech using Tacotron2 + WaveGlow',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    uv run python inference_cli.py --text "Hello, this is a test."
    uv run python inference_cli.py --text "The quick brown fox." --output fox.wav
    uv run python inference_cli.py --text "Hello" --sigma 0.8
        """
    )
    parser.add_argument('--text', type=str, required=True, help='Text to synthesize')
    parser.add_argument('--output', type=str, default='output.wav', help='Output WAV file')
    parser.add_argument('--sigma', type=float, default=0.666,
                        help='WaveGlow sigma (0.666 recommended, higher=more variation)')
    parser.add_argument('--device', type=str, default='cuda',
                        choices=['cuda', 'cpu'], help='Device to use')
    args = parser.parse_args()

    # Suppress noisy deprecation chatter from torch.hub model code.
    warnings.filterwarnings('ignore', category=UserWarning)
    warnings.filterwarnings('ignore', category=FutureWarning)

    # Check CUDA and fall back gracefully instead of crashing later.
    if args.device == 'cuda' and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        args.device = 'cpu'

    print(f"Device: {args.device}")
    print(f"Input text: {args.text}")
    print(f"Output file: {args.output}")
    print(f"Sigma: {args.sigma}")
    print()

    # Load models
    tacotron2, waveglow, utils = load_models(args.device)

    # Synthesize
    print("\nSynthesizing...")
    audio = synthesize(args.text, tacotron2, waveglow, utils, args.device, args.sigma)

    # Normalize to [-1, 1]. Guard against an all-zero (silent) waveform:
    # dividing by a zero peak would produce NaNs that become garbage int16
    # samples instead of silence.
    peak = max(abs(audio.max()), abs(audio.min()))
    if peak > 0:
        audio = audio / peak
    audio_int16 = (audio * 32767).astype('int16')
    write(args.output, 22050, audio_int16)

    print(f"\nSaved: {args.output}")
    print(f"Duration: {len(audio) / 22050:.2f} seconds")


if __name__ == '__main__':
    main()

以下で実行すると、英語の音声ファイルが生成されます

uv run python inference_cli.py --text "Hello, this is a test."