拡散モデルベースのTTSで、(一応)日本語にも対応しているライブラリ StableTTS を触っていきます
- L4 GPU
- ubuntu22.04
sudo apt update
sudo apt install ffmpeg
次に requirements.txt のパッケージをインストールするのですが、numpy 2.0.0 が公開された影響でそのままではエラーになるため、requirements.txt を変更して numpy<2 に固定します
以下が変更した requirements.txt
torch
torchaudio
matplotlib
numpy<2
tensorboard
pypinyin
jieba
eng_to_ipa
unidecode
inflect
pyopenjtalk-prebuilt
numba
tqdm
IPython
gradio
soundfile
上記のURLから vocoder.pt
をダウンロードして、checkpoints/ 以下に配置します
python webui.py
- Input text(合成するテキスト文)
- Reference Speaker(参考にする音声)
- Language(言語設定)
import time


def _timestamp() -> float:
    """Return a monotonic clock reading suitable for measuring intervals.

    CUDA kernels are launched asynchronously, so when the configured device
    is a GPU the pending work is drained first. Without this, GPU time would
    be billed to whichever later stage happens to block on the result
    (e.g. the ``.cpu()`` call in post-processing).
    """
    if torch.device(device).type == 'cuda':
        torch.cuda.synchronize()
    return time.perf_counter()


def inference(text: str, ref_audio: str, language: str, checkpoint_path: str, step: int = 10) -> tuple:
    """Synthesise speech for ``text`` and print how long each stage took.

    Args:
        text: Text to synthesise.
        ref_audio: Reference speaker audio; passed straight to
            ``torchaudio.load`` (presumably a file path from the UI -- confirm
            against the caller).
        language: Key into ``g2p_mapping`` selecting the phonemizer.
        checkpoint_path: TTS checkpoint to use. The weights are reloaded only
            when this differs from the previously used path.
        step: Value forwarded as the third positional argument of
            ``tts_model.synthesise``.

    Returns:
        ``(audio_output, mel_output)`` where ``audio_output`` is
        ``(sample_rate, int16 numpy array)`` for ``gr.Audio`` and
        ``mel_output`` is the result of ``plot_mel_spectrogram``.

    Raises:
        ValueError: If ``language`` has no entry in ``g2p_mapping``.
    """
    global last_checkpoint_path

    start_time = _timestamp()  # start of the whole call

    if checkpoint_path != last_checkpoint_path:
        load_start = _timestamp()
        # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
        tts_model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))
        last_checkpoint_path = checkpoint_path
        load_end = _timestamp()
        print(f"Model loading time: {load_end - load_start:.2f} seconds")

    phonemizer = g2p_mapping.get(language)
    if phonemizer is None:
        # Fail with a clear message instead of "'NoneType' object is not callable".
        raise ValueError(f"Unsupported language: {language!r}")

    # prepare input for tts model
    prep_start = _timestamp()
    token_ids = intersperse(cleaned_text_to_sequence(phonemizer(text)), item=0)
    x = torch.tensor(token_ids, dtype=torch.long, device=device).unsqueeze(0)
    x_len = torch.tensor([x.size(-1)], dtype=torch.long, device=device)
    waveform, sr = torchaudio.load(ref_audio)
    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
    y = mel_extractor(waveform).to(device)
    prep_end = _timestamp()
    print(f"Input preparation time: {prep_end - prep_start:.2f} seconds")

    # inference
    inference_start = _timestamp()
    # No gradients are needed here; disabling autograd also keeps the later
    # .numpy() conversion valid if the model parameters require grad.
    with torch.inference_mode():
        mel = tts_model.synthesise(x, x_len, step, y=y, temperature=1, length_scale=1)['decoder_outputs']
        audio = vocoder(mel)
    inference_end = _timestamp()
    print(f"Inference time: {inference_end - inference_start:.2f} seconds")

    # process output for gradio
    post_start = _timestamp()
    # (samplerate, int16 audio) for gr.Audio
    audio_output = (sample_rate, (audio.cpu().squeeze(0).numpy() * 32767).astype(np.int16))
    mel_output = plot_mel_spectrogram(mel.cpu().squeeze(0).numpy())  # get the plot of mel
    post_end = _timestamp()
    print(f"Post-processing time: {post_end - post_start:.2f} seconds")

    end_time = _timestamp()  # end of the whole call
    total_time = end_time - start_time
    print(f"Total execution time: {total_time:.2f} seconds")

    return audio_output, mel_output
Input preparation time: 1.10 seconds
Inference time: 1.88 seconds
Post-processing time: 0.04 seconds
Total execution time: 3.08 seconds

Input preparation time: 0.08 seconds
Inference time: 0.23 seconds
Post-processing time: 0.03 seconds
Total execution time: 0.34 seconds
