初めに
音声から mel 情報（メルスペクトログラム）を計算し、そこから再度音声を再構築するライブラリ「BigVGAN」を動かしていきます。デモ用の Gradio アプリはありますが、あえて自分でコードを書いていきます。
開発環境
準備
ライブラリの実行環境は、README の通りに Anaconda を使用して構築していきます。
# Create a conda environment named "bigvgan" with Python 3.10 and PyTorch
# (torchvision / torchaudio included) built against CUDA 12.1.
conda create -n bigvgan python=3.10 pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia

# Switch the current shell into the newly created environment.
conda activate bigvgan
音声情報を再構築
以下のコードは README のサンプルコードに続きを追加したもので、mel 情報から再構築した音声を音声ファイルとして保存しています。
# Round-trip an audio file through BigVGAN: wav -> mel spectrogram -> wav.
#
# Reads 'test.wav', computes its mel spectrogram using the model's own
# hyperparameters (model.h), regenerates the waveform with the pretrained
# vocoder, and writes the result twice: once as a 32-bit float WAV and once
# as a 16-bit PCM WAV.

import torch
import bigvgan
import librosa
from meldataset import get_mel_spectrogram
import soundfile as sf

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate the model. You can optionally set use_cuda_kernel=True for
# faster inference.
model = bigvgan.BigVGAN.from_pretrained(
    'nvidia/bigvgan_v2_24khz_100band_256x',
    use_cuda_kernel=False,
)

# Remove weight norm in the model and set it to eval mode.
model.remove_weight_norm()
model = model.eval().to(device)

# Load the wav file, resampled to the model's sampling rate and downmixed
# to mono.
wav_path = 'test.wav'
# wav is np.ndarray with shape [T_time] and values in [-1, 1]
wav, sr = librosa.load(wav_path, sr=model.h.sampling_rate, mono=True)
# wav is FloatTensor with shape [B(1), T_time]
wav = torch.FloatTensor(wav).unsqueeze(0)

# Compute the mel spectrogram from the ground-truth audio.
# mel is FloatTensor with shape [B(1), C_mel, T_frame]
mel = get_mel_spectrogram(wav, model.h).to(device)

# Generate the waveform from the mel spectrogram.
with torch.inference_mode():
    # wav_gen is FloatTensor with shape [B(1), 1, T_time] and values in [-1, 1]
    wav_gen = model(mel)

# Drop the batch dimension and move to CPU.
# wav_gen_float is FloatTensor with shape [1, T_time]
wav_gen_float = wav_gen.squeeze(0).cpu()

# Convert the generated waveform to 16-bit linear PCM.
# Clamp first: casting a float outside the int16 range would wrap around and
# produce loud clicks, so any sample outside [-1, 1] is pinned to full scale.
# wav_gen_int16 is np.ndarray with shape [1, T_time] and int16 dtype
wav_gen_int16 = (
    (wav_gen_float.clamp(-1.0, 1.0) * 32767.0).numpy().astype('int16')
)

# Save the generated audio (float format).
# soundfile's default subtype for WAV is PCM_16, so the subtype must be given
# explicitly for the samples to actually be stored as 32-bit float.
output_path = 'generated_audio.wav'
sf.write(
    output_path,
    wav_gen_float[0].numpy(),
    model.h.sampling_rate,
    subtype='FLOAT',
)
print(f"Generated audio saved to: {output_path}")

# Save the generated audio (int16 format).
output_path_int16 = 'generated_audio_int16.wav'
sf.write(
    output_path_int16,
    wav_gen_int16[0],
    model.h.sampling_rate,
    subtype='PCM_16',
)
print(f"Generated audio (int16) saved to: {output_path_int16}")