初めに

unity sentisを使ってtext to speechを実現する方法として、以下の二つがあります。

jets( unity/sentis-jets-text-to-speech)
piper.unity

今回は jets ではなく、piper.unity を動かしていきます。しかし、以下のリポジトリは sentis 1.3-pre で動いており、2.x 系で動かすには書き換える必要があります。今回はこのリポジトリを 2.x 系に対応させて動かしていきます。

以下のリポジトリにて記事の内容で対応したものを公開しています。

github.com

開発環境

unity 6000.0.38f1
sentis 2.1.2

セットアップ

まずは既存のリポジトリをcloneします。

git clone https://github.com/Macoron/piper.unity.git

unity hubからプロジェクトを開きます。

また以下のURLから英語対応のonnxをダウンロードします。

en_US-lessac-medium.onnx

sentis2.1に対応

sentis 1.xから2.xで大きく変わったことは以下になります。

IBackend APIの削除 → Functional APIへの移行
IWorkerの変更 → Worker クラスを使用し、Execute() を Schedule() に変更
テンソルの更新 → TensorFloat や TensorInt から Tensor<float> や Tensor<int> に変更
テンソルデータの取得方法の変更 → ToReadOnlyArray() を DownloadToArray() に変更

これらを対応して、それぞれのクラスを以下のように書き換えます。

PiperSample.cs

using UnityEngine;
using UnityEngine.UI;
using System.Threading.Tasks;

namespace Piper
{
    /// <summary>
    /// Sample UI glue: reads text from an <see cref="InputField"/>, asks
    /// <see cref="PiperManager"/> to synthesize it, and plays the resulting clip
    /// through an <see cref="AudioSource"/>.
    /// </summary>
    public class PiperSample : MonoBehaviour
    {
        public PiperManager piper;
        public InputField input;
        public Button submitButton;
        public AudioSource source;

        private void Awake()
        {
            submitButton.onClick.AddListener(OnButtonPressed);
        }

        private void OnDestroy()
        {
            // Unsubscribe so the button does not keep a reference to a destroyed component.
            if (submitButton != null)
            {
                submitButton.onClick.RemoveListener(OnButtonPressed);
            }
        }

        /// <summary>
        /// Click handler: stops current playback, synthesizes the input text and plays it.
        /// It is <c>async void</c> because it is a UI event handler, so every exception
        /// is caught and logged here instead of escaping.
        /// </summary>
        private async void OnButtonPressed()
        {
            string text = input.text;

            // 1. Stop playback and release the previous clip. The reference is cleared
            //    first so the AudioSource never points at a destroyed clip while waiting.
            if (source.isPlaying)
            {
                source.Stop();
            }

            AudioClip previousClip = source.clip;
            source.clip = null;
            if (previousClip != null)
            {
                Destroy(previousClip);
            }

            // Block further clicks while a synthesis is running: PiperManager uses a
            // single worker, so overlapping requests would interfere with each other.
            submitButton.interactable = false;
            try
            {
                // 2. Run TTS asynchronously.
                AudioClip clip = await piper.TextToSpeechAsync(text);

                // This component may have been destroyed while waiting.
                if (this == null)
                {
                    if (clip != null)
                    {
                        Destroy(clip);
                    }
                    return;
                }

                if (clip == null)
                {
                    // Nothing was synthesized (e.g. empty input).
                    return;
                }

                // 3. Play the result.
                source.clip = clip;
                source.Play();
            }
            catch (System.Exception ex)
            {
                Debug.LogException(ex, this);
            }
            finally
            {
                if (submitButton != null)
                {
                    submitButton.interactable = true;
                }
            }
        }
    }
}

PiperManager.cs

using UnityEngine;
using Unity.Sentis;
using System.Collections.Generic;
using System.IO;
using System.Threading.Tasks;

namespace Piper
{
    /// <summary>
    /// Owns the Sentis <see cref="Worker"/> for a Piper model and turns text into an
    /// <see cref="AudioClip"/>: the text is phonemized by <c>PiperWrapper</c>, each
    /// sentence is run through the model, and the generated samples are concatenated.
    /// </summary>
    public class PiperManager : MonoBehaviour
    {
        public ModelAsset modelAsset;
        public int sampleRate = 22050;

        // Values passed to the model's third input as a 3-element float tensor,
        // in this order.
        public float scaleSpeed   = 1.0f;
        public float scalePitch   = 1.0f;
        public float scaleGlottal = 0.8f;

        // espeak-ng data folder, relative to StreamingAssets.
        public string espeakNgRelativePath = "espeak-ng-data";
        public string voice = "en-us";

        // Number of ScheduleIterable steps executed before yielding once.
        // Values below 1 are treated as 1 (yield after every step).
        public int layersPerFrame = 20;

        private Model runtimeModel;
        private Worker worker;

        [SerializeField] private BackendType backendType = BackendType.GPUCompute;

        private void Awake()
        {
            // 1. Initialize PiperWrapper with the espeak-ng data directory.
            string espeakPath = Path.Combine(Application.streamingAssetsPath, espeakNgRelativePath);
            PiperWrapper.InitPiper(espeakPath);

            // 2. Load the Sentis model and create the worker.
            runtimeModel = ModelLoader.Load(modelAsset);
            worker = new Worker(runtimeModel, backendType);
        }

        /// <summary>
        /// Synthesizes <paramref name="text"/> and returns it as a mono
        /// <see cref="AudioClip"/> at <see cref="sampleRate"/>.
        /// Inference is advanced with <c>ScheduleIterable</c>, yielding via
        /// <c>Task.Yield()</c> every <see cref="layersPerFrame"/> steps.
        /// </summary>
        /// <param name="text">Text to speak.</param>
        /// <returns>
        /// The generated clip, or <c>null</c> when there is nothing to play (empty
        /// text, no samples produced) or this component was destroyed mid-synthesis.
        /// </returns>
        /// <exception cref="System.InvalidOperationException">
        /// The worker has not been created, or the model output is not a float tensor.
        /// </exception>
        public async Task<AudioClip> TextToSpeechAsync(string text)
        {
            if (worker == null)
            {
                throw new System.InvalidOperationException("PiperManager worker is not initialized.");
            }

            if (string.IsNullOrWhiteSpace(text))
            {
                return null;
            }

            // 3. Phonemize the text with PiperWrapper.
            var phonemeResult = PiperWrapper.ProcessText(text, voice);
            var allSamples = new List<float>();

            // Input names are taken from the model by position
            // (e.g. 0=input, 1=input_lengths, 2=scales). They do not change per sentence.
            string inputName        = runtimeModel.inputs[0].name;
            string inputLengthsName = runtimeModel.inputs[1].name;
            string scalesName       = runtimeModel.inputs[2].name;

            int stepsPerYield = Mathf.Max(1, layersPerFrame);

            // 4. Run inference per sentence and concatenate the results.
            foreach (var sentence in phonemeResult.Sentences)
            {
                int[] phonemeIds = sentence.PhonemesIds;
                if (phonemeIds == null || phonemeIds.Length == 0)
                {
                    continue;
                }

                // Input tensors; disposed at the end of each iteration.
                using var inputTensor = new Tensor<int>(new TensorShape(1, phonemeIds.Length), phonemeIds);
                using var inputLengthsTensor = new Tensor<int>(new TensorShape(1), new int[] { phonemeIds.Length });
                using var scalesTensor = new Tensor<float>(
                    new TensorShape(3),
                    new float[] { scaleSpeed, scalePitch, scaleGlottal }
                );

                worker.SetInput(inputName,         inputTensor);
                worker.SetInput(inputLengthsName,  inputLengthsTensor);
                worker.SetInput(scalesName,        scalesTensor);

                // 4-1. Execute the model in slices. ScheduleIterable alone drives the
                //      whole execution; also calling Schedule() would run the model twice.
                var enumerator = worker.ScheduleIterable();
                int executedSteps = 0;
                while (enumerator.MoveNext())
                {
                    executedSteps++;
                    if (executedSteps % stepsPerYield == 0)
                    {
                        await Task.Yield();

                        // OnDestroy may have disposed the worker while we were suspended.
                        if (worker == null)
                        {
                            return null;
                        }
                    }
                }

                // 4-2. Read the output back to the CPU.
                var outputTensor = worker.PeekOutput() as Tensor<float>;
                if (outputTensor == null)
                {
                    throw new System.InvalidOperationException("Piper model output is not a float tensor.");
                }

                float[] sentenceSamples = outputTensor.DownloadToArray();
                allSamples.AddRange(sentenceSamples);
            }

            // 5. Build one AudioClip from all samples. A zero-length clip cannot be created.
            if (allSamples.Count == 0)
            {
                Debug.LogWarning("PiperManager: no audio samples were generated.", this);
                return null;
            }

            AudioClip clip = AudioClip.Create("PiperTTS", allSamples.Count, 1, sampleRate, false);
            clip.SetData(allSamples.ToArray(), 0);

            return clip;
        }

        private void OnDestroy()
        {
            // Dispose the worker first so it is released even if FreePiper throws,
            // and clear the field so an in-flight TextToSpeechAsync can detect it.
            if (worker != null)
            {
                worker.Dispose();
                worker = null;
            }

            PiperWrapper.FreePiper();
        }
    }
}