Development environment
| Item | Version / details | Notes |
|---|---|---|
| OS / image | Ubuntu 22.04 (JupyterLab container) | checked with uname -a |
| Python | 3.12.11 | python --version |
| CUDA driver / runtime | 12.x / 12.x | checked with nvidia-smi |
| GPU | 4 × NVIDIA T4 | IDs = 0 1 2 3 |
| PyTorch | 2.3.1 + cu12 | pip show torch |
| transformers | 4.42.1 | pip show transformers |
| utmosv2 | 0.5.1 | MOS inference model |
| librosa | 0.10.2 | wav loading |
| numpy | 1.26.x | numerical computation |
| tqdm | 4.66.x | progress bars |
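For reference, the versions in the table can be dumped from inside the container with a short script. This is a minimal sketch; it assumes the PyPI distribution names passed to importlib.metadata match the import names above (e.g. "utmosv2").

```python
# Quick environment check: prints the versions listed in the table above.
# Assumes the PyPI distribution names match the import names (e.g. "utmosv2").
import sys
import importlib.metadata as md

import torch

print("Python    :", sys.version.split()[0])
print("PyTorch   :", torch.__version__, "| CUDA runtime:", torch.version.cuda)
print("GPU count :", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"  cuda:{i} ->", torch.cuda.get_device_name(i))
for pkg in ("transformers", "utmosv2", "librosa", "numpy", "tqdm"):
    print(f"{pkg:12}:", md.version(pkg))
```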
Implementation
```python
import argparse, json, os, time, multiprocessing as mp
from pathlib import Path

import soundfile as sf   # faster than librosa for getting durations
import torch, utmosv2
from tqdm import tqdm


# ---------- helpers -------------------------------------------------
def get_duration(path: Path) -> float:
    """Return the wav length in seconds without decoding the audio."""
    info = sf.info(str(path))
    return info.frames / info.samplerate


def update_json(json_path: Path, duration: float, mos: float):
    data = {}
    if json_path.exists():
        try:
            data = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception:
            pass
    data.setdefault("parakeet_jp_transcription", "")
    data.setdefault("anime_whisper_transcription", "")
    data["duration"] = duration
    data["speechMOS"] = round(float(mos), 6)
    json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2),
                         encoding="utf-8")


# ---------- 1 GPU worker --------------------------------------------
def gpu_worker(rank: int, gpu_id: int, fp16: bool,
               wav_list: list[Path], skip_existing: bool):
    torch.cuda.set_device(gpu_id)
    model = utmosv2.create_model(pretrained=True, device=f"cuda:{gpu_id}")
    if fp16:
        model.half()
    model.eval()

    bar = tqdm(total=len(wav_list), position=rank, ascii=True,
               desc=f"[GPU{gpu_id}]", ncols=80, leave=False)
    for wav in wav_list:
        try:
            jpath = wav.with_suffix(".json")
            if skip_existing and jpath.exists():
                try:
                    if "speechMOS" in json.loads(jpath.read_text()):
                        continue  # the finally clause below still advances the bar
                except Exception:
                    pass
            dur = get_duration(wav)
            mos = model.predict(input_path=str(wav))
            update_json(jpath, dur, mos)
        except Exception as e:
            # print the exception and keep going
            print(f"[GPU{gpu_id}] {wav} … {e}", flush=True)
        finally:
            bar.update()
    bar.close()


# ---------- main ----------------------------------------------------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", required=True,
                    help="wav root ( .../speaker_id/wav/*.wav )")
    ap.add_argument("--gpus", nargs="+", type=int, default=[0],
                    help="GPU id list (e.g. 0 1 2 3)")
    ap.add_argument("--fp16", action="store_true",
                    help="load model in FP16 to save VRAM")
    ap.add_argument("--skip-existing", action="store_true",
                    help="skip wavs whose JSON already contains speechMOS")
    cfg = ap.parse_args()

    root = Path(cfg.root).expanduser()
    wav_files = list(root.glob("*/wav/*.wav"))
    if not wav_files:
        print("💡 No wav files found. Check the --root path.")
        return

    ngpu = len(cfg.gpus)
    print(f"Processing {len(wav_files):,} wav files on {ngpu} GPU(s) …")

    # distribute files across the GPUs round-robin
    buckets = [[] for _ in range(ngpu)]
    for idx, w in enumerate(wav_files):
        buckets[idx % ngpu].append(w)

    procs = []
    for rank, (gpu_id, lst) in enumerate(zip(cfg.gpus, buckets)):
        p = mp.Process(target=gpu_worker,
                       args=(rank, gpu_id, cfg.fp16, lst, cfg.skip_existing),
                       daemon=False)
        p.start()
        procs.append(p)
    for p in procs:
        p.join()


if __name__ == "__main__":
    # force safetensors -- mitigation for the torch.load vulnerability CVE-2025-32434
    os.environ["TRANSFORMERS_USE_SAFETENSORS"] = "1"
    from transformers import AutoModel
    _orig = AutoModel.from_pretrained
    AutoModel.from_pretrained = \
        lambda name, *a, **k: _orig(name, *a, use_safetensors=True, **k)

    t0 = time.time()
    main()
    print(f"Done! Elapsed {time.time()-t0:.1f} s")
```
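Before launching the full multi-GPU job, it can help to score a single file first and confirm the model loads and returns a plausible MOS. A minimal sketch using the same utmosv2 calls as the script above; the wav path is a placeholder.

```python
# Single-file smoke test (the wav path is a placeholder -- point it at a real file).
import soundfile as sf
import utmosv2

wav_path = "/data/moe-speech-plus/data/SPEAKER_ID/wav/sample.wav"  # placeholder

info = sf.info(wav_path)
print("duration [s]:", info.frames / info.samplerate)

model = utmosv2.create_model(pretrained=True, device="cuda:0")
print("speechMOS   :", model.predict(input_path=wav_path))
```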
How to run
```bash
python make_mos_jsons.py \
    --root /data/moe-speech-plus/data \
    --gpus 0 1 2 3 \
    --fp16
```
The run over roughly 300,000 wav files finished in about 1-2 minutes.
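After the run, the generated sidecar JSONs can be scanned to confirm coverage and get a feel for the scores. A sketch along these lines, assuming the same --root layout as above:

```python
# Post-run sanity check: count sidecar JSONs with a speechMOS and summarize the scores.
import json
from pathlib import Path

root = Path("/data/moe-speech-plus/data")
scores = []
for jpath in root.glob("*/wav/*.json"):
    try:
        data = json.loads(jpath.read_text(encoding="utf-8"))
    except Exception:
        continue
    if "speechMOS" in data:
        scores.append(float(data["speechMOS"]))

print(f"JSONs with speechMOS: {len(scores):,}")
if scores:
    print(f"mean MOS : {sum(scores) / len(scores):.3f}")
    print(f"min / max: {min(scores):.3f} / {max(scores):.3f}")
```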
Implementation notes
Forcing safetensors loading
```python
import os

from transformers import AutoModel

_orig = AutoModel.from_pretrained
AutoModel.from_pretrained = lambda n, *a, **k: _orig(n, *a, use_safetensors=True, **k)

os.environ["TRANSFORMERS_USE_SAFETENSORS"] = "1"
```
One process per GPU
```python
p = mp.Process(target=gpu_worker,
               args=(rank, gpu_id, cfg.fp16, lst, cfg.skip_existing),
               daemon=False)
```
Because the child processes never use a DataLoader, the "daemonic processes are not allowed to have children" error is avoided entirely. With each worker pinned to its own GPU and running FP16 inference, processing takes roughly 15 s per 512 kB WAV.
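For reference, the round-robin bucketing that main() uses to hand files to these per-GPU processes behaves like this toy example (file names are made up):

```python
# Toy illustration of the round-robin split in main(); file names are hypothetical.
files = [f"clip_{i:03d}.wav" for i in range(10)]
gpus = [0, 1, 2, 3]

buckets = [[] for _ in gpus]
for idx, f in enumerate(files):
    buckets[idx % len(gpus)].append(f)

for gpu_id, bucket in zip(gpus, buckets):
    print(f"GPU{gpu_id}: {bucket}")
# GPU0: ['clip_000.wav', 'clip_004.wav', 'clip_008.wav']
# GPU1: ['clip_001.wav', 'clip_005.wav', 'clip_009.wav']
# GPU2: ['clip_002.wav', 'clip_006.wav']
# GPU3: ['clip_003.wav', 'clip_007.wav']
```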