初めに
いまさらですが、tacotron2を触ってみます
以下に現在の環境に合わせてuvで環境構築をしたRepositoryを作成しました
tacotron2の構成と課題
自己回帰 + Attentionのアーキテクチャになっているが以下のような課題があった
- 推論に時間がかかる
- Attention部分で確率の計算をしているためスキップや連続の音が間違えることがあった
開発環境
- Windows 11
- cuda 12.6
環境構築
uv環境に切り替える前提で進めていきます
まずは3.10の環境構築を行います
uv python pin 3.10
次に以下のようなpyproject.toml を作成します。
[project]
name = "tacotron2"
version = "0.1.0"
description = "Tacotron2 Text-to-Speech"
requires-python = ">=3.10,<3.12"
dependencies = [
    "gdown>=5.2.0",
    "inflect>=7.5.0",
    "librosa>=0.11.0",
    "matplotlib>=3.10.8",
    "scipy>=1.15.3",
    "torch>=2.6.0",
    "torchaudio>=2.6.0",
    "torchvision>=0.21.0",
    "unidecode>=1.4.0",
]

# Extra wheel index with CUDA 12.4 builds of the PyTorch packages.
# `explicit = true` means packages are fetched from here only when
# routed via [tool.uv.sources] below — everything else stays on PyPI.
[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true

# Route the torch family to the CUDA index.
[tool.uv.sources]
torch = { index = "pytorch-cu124" }
torchvision = { index = "pytorch-cu124" }
torchaudio = { index = "pytorch-cu124" }
これをもとに uv sync を行い依存関係をインストールします。学習済みモデル自体は、推論スクリプトの初回実行時に PyTorch Hub からダウンロードされます
実行
環境が作成できたので、推論コードを作成して実行していきます。
import argparse import warnings import torch from scipy.io.wavfile import write def load_models(device: str = "cuda"): """Load Tacotron2 and WaveGlow models from PyTorch Hub.""" print("Loading Tacotron2...") tacotron2 = torch.hub.load( 'NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp32', trust_repo=True ) tacotron2 = tacotron2.to(device).eval() print("Loading WaveGlow...") waveglow = torch.hub.load( 'NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp32', trust_repo=True ) waveglow = waveglow.remove_weightnorm(waveglow) waveglow = waveglow.to(device).eval() print("Loading text utilities...") utils = torch.hub.load( 'NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils', trust_repo=True ) return tacotron2, waveglow, utils def synthesize(text: str, tacotron2, waveglow, utils, device: str = "cuda", sigma: float = 0.666): """Synthesize speech from text.""" # Prepare text sequences, lengths = utils.prepare_input_sequence([text]) sequences = sequences.to(device) lengths = lengths.to(device) # Generate mel spectrogram with torch.no_grad(): mel, _, _ = tacotron2.infer(sequences, lengths) # Generate audio audio = waveglow.infer(mel, sigma=sigma) # Convert to numpy audio = audio.squeeze().cpu().numpy() return audio def main(): parser = argparse.ArgumentParser( description='Text-to-Speech using Tacotron2 + WaveGlow', formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: uv run python inference_cli.py --text "Hello, this is a test." uv run python inference_cli.py --text "The quick brown fox." 
--output fox.wav uv run python inference_cli.py --text "Hello" --sigma 0.8 """ ) parser.add_argument('--text', type=str, required=True, help='Text to synthesize') parser.add_argument('--output', type=str, default='output.wav', help='Output WAV file') parser.add_argument('--sigma', type=float, default=0.666, help='WaveGlow sigma (0.666 recommended, higher=more variation)') parser.add_argument('--device', type=str, default='cuda', choices=['cuda', 'cpu'], help='Device to use') args = parser.parse_args() # Suppress warnings warnings.filterwarnings('ignore', category=UserWarning) warnings.filterwarnings('ignore', category=FutureWarning) # Check CUDA if args.device == 'cuda' and not torch.cuda.is_available(): print("CUDA not available, falling back to CPU") args.device = 'cpu' print(f"Device: {args.device}") print(f"Input text: {args.text}") print(f"Output file: {args.output}") print(f"Sigma: {args.sigma}") print() # Load models tacotron2, waveglow, utils = load_models(args.device) # Synthesize print("\nSynthesizing...") audio = synthesize(args.text, tacotron2, waveglow, utils, args.device, args.sigma) # Normalize and save audio = audio / max(abs(audio.max()), abs(audio.min())) # Normalize to [-1, 1] audio_int16 = (audio * 32767).astype('int16') write(args.output, 22050, audio_int16) print(f"\nSaved: {args.output}") print(f"Duration: {len(audio) / 22050:.2f} seconds") if __name__ == '__main__': main()
以下で実行すると英語の音声ファイルが生成されます
uv run python inference_cli.py --text "Hello, this is a test."









