S3Tokenizerを動かす

初めに

CosyVoiceの教師あり音声トークナイザーに関するコードが公開されていなかったため(issue参照)、以下のRepositoryにて再現実装が行われました。こちらを動かしていきます。

github.com

以下でライブラリのバージョンを固定したRepositoryを公開しています。

github.com

開発環境

セットアップ

uvの環境を作ります

uv venv -p 3.11
.venv\Scripts\activate

必要なライブラリをインストールします

uv pip install -r requirements.txt 
uv pip install torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121 --force-reinstall
uv pip install onnxruntime soundfile

インストールが終わったら以下のようなライブラリが入っています

cffi==1.17.1
colorama==0.4.6
coloredlogs==15.0.1
einops==0.8.0
filelock==3.13.1
flatbuffers==24.3.25
fsspec==2024.2.0
humanfriendly==10.0
jinja2==3.1.3
markupsafe==2.1.5
mpmath==1.3.0
networkx==3.2.1
numpy==2.2.0
onnx==1.17.0
onnxruntime==1.20.1
packaging==24.2
protobuf==5.29.1
pycparser==2.22
pyreadline3==3.5.4
s3tokenizer==0.0.8
soundfile==0.12.1
sympy==1.13.1
torch==2.5.1+cu121
tqdm==4.67.1
typing-extensions==4.9.0

実行

サンプルの音声が入っているため、こちらを使って動かしていきます

python .\test\test_onnx.py 

結果は以下のようになります。wav[0]はtorchとonnxの出力が完全に一致していますが、wav[1]は末尾付近の5トークン(206トークン中)が異なり、miss rateは約2.4%となっています。

=========torch=============
mels.size: torch.Size([2, 128, 420]), mels_lens: tensor([420, 411], dtype=torch.int32)
codes.size: torch.Size([2, 210]), codes_lens: tensor([210, 206], dtype=torch.int32)
wav[0]
tensor([ 143,  602,  702,  473, 2530, 2530,  646,  646,  646,  279, 1145,  440,
         279,  279, 2530, 2530, 2530, 2530, 1019,  501,  501,  631,  287,  465,
         465,  478, 3766,  183,  391,  621,   11, 3480, 3480,   16,  402, 1037,
         253,   55,  720,  227,  227,  159,  110,  103,  103,  103,  110,  110,
        2386,   28,  704,  143,  371,  371,  224,  389,  295,  295,  323,  557,
         681,  455,    4,  386,  403,  629, 2989, 2782,  507, 3290,   57,  215,
         626,   55,  342,  375,  596,  502,   12,   39,   90,  344, 1593, 1593,
         170,  471,  170, 2299, 2299,  112,   94,  629,   33,   33,   33,  348,
         348,  199,  137,  715,  750,  119,  490,  357,    4,  648, 1700,  468,
         468,  422,   47,  732,  224,  224,  224, 2691,  389,  174,  174, 1381,
        1381,   74,  100,    6,  584,  363,  225,  214,  197,  621,  209,   59,
         284,   17,  538,  460,   87,  720,  227,  618,  594,  151,  173,  540,
        1923,  361,  361,   96,  355,  223,  341,  297,    1,  137,  123,  662,
           4,  386,  386,  225,  214,  427,  215,  477, 1405, 3108,  122,  122,
         310,  149, 1479,  314,  486,  946, 3945,  202, 3539, 3539, 3539,  568,
         312,  312,  121,  139,  530,  395,  607,  443,  443,  367,  367,   81,
         367,  367,  367,  367,  727,  367,  221,  508,  508,  508, 1145,  691,
         367,  367,  573,  221,  221, 2530])
wav[1]
tensor([ 644,  404,  329,  329, 1145, 1145, 1145, 1145, 1145, 1145, 1145,  221,
         508,  221,  221,    8,  650,   36,  501,  573,  443,  443, 1849,  435,
          19,  293,   27,  468,  468,   27,   27,  468,  247,  179,  184,  580,
        1593, 1593, 3539, 2299,  266,  344,  606, 2330,  401,   90,   95,  716,
           4, 1935,    4,    4,  160,  495, 2989, 2782, 2782, 2782,    7,    7,
          84,  342,  375,  584,  227,  452,  141,  488, 4013,  568,  312,  312,
         314,  312,   17,  595,   52,  465,  465,  483,  483,  173,  173,  628,
         628,   96,  455,  688,  153,  153,  527,  628, 1529,  612,  514,  304,
        2579, 2579,  173,   20, 2579,   20,  562,   15, 2828, 2828,  620,  261,
        3480,  188,   88,   33,   33,   27,   39,   39, 2031,  734,  158,   50,
          50,  661,  309,  704,  143,  371,  371,  224,  563,  563,  563,  277,
         300,  159,  445,  133,  343,  343, 2514,  318,  377,  584,  386,  448,
         366,  427,  562, 2828, 1006, 1006, 2185, 2299,  732,  100,  211,  504,
         100,  457,  570,  349,  349,  523,  523,   84,  342,  482,  704, 2386,
         498,  277,   92, 2185,  579,  579,  579,   41,   41,   41,  406, 1104,
         236,  607,  395,  395,  642,  607,  642,  607,  607,  368,  368, 3158,
         573, 1145,  367,  367, 1145,  367,  636,  367,  367,  367,  607, 1145,
         636, 2189])
=========onnx===============
wav[0]
tensor([ 143,  602,  702,  473, 2530, 2530,  646,  646,  646,  279, 1145,  440,
         279,  279, 2530, 2530, 2530, 2530, 1019,  501,  501,  631,  287,  465,
         465,  478, 3766,  183,  391,  621,   11, 3480, 3480,   16,  402, 1037,
         253,   55,  720,  227,  227,  159,  110,  103,  103,  103,  110,  110,
        2386,   28,  704,  143,  371,  371,  224,  389,  295,  295,  323,  557,
         681,  455,    4,  386,  403,  629, 2989, 2782,  507, 3290,   57,  215,
         626,   55,  342,  375,  596,  502,   12,   39,   90,  344, 1593, 1593,
         170,  471,  170, 2299, 2299,  112,   94,  629,   33,   33,   33,  348,
         348,  199,  137,  715,  750,  119,  490,  357,    4,  648, 1700,  468,
         468,  422,   47,  732,  224,  224,  224, 2691,  389,  174,  174, 1381,
        1381,   74,  100,    6,  584,  363,  225,  214,  197,  621,  209,   59,
         284,   17,  538,  460,   87,  720,  227,  618,  594,  151,  173,  540,
        1923,  361,  361,   96,  355,  223,  341,  297,    1,  137,  123,  662,
           4,  386,  386,  225,  214,  427,  215,  477, 1405, 3108,  122,  122,
         310,  149, 1479,  314,  486,  946, 3945,  202, 3539, 3539, 3539,  568,
         312,  312,  121,  139,  530,  395,  607,  443,  443,  367,  367,   81,
         367,  367,  367,  367,  727,  367,  221,  508,  508,  508, 1145,  691,
         367,  367,  573,  221,  221, 2530])
all equal: True
miss rate: 0.0%
wav[1]
tensor([ 644,  404,  329,  329, 1145, 1145, 1145, 1145, 1145, 1145, 1145,  221,
         508,  221,  221,    8,  650,   36,  501,  573,  443,  443, 1849,  435,
          19,  293,   27,  468,  468,   27,   27,  468,  247,  179,  184,  580,
        1593, 1593, 3539, 2299,  266,  344,  606, 2330,  401,   90,   95,  716,
           4, 1935,    4,    4,  160,  495, 2989, 2782, 2782, 2782,    7,    7,
          84,  342,  375,  584,  227,  452,  141,  488, 4013,  568,  312,  312,
         314,  312,   17,  595,   52,  465,  465,  483,  483,  173,  173,  628,
         628,   96,  455,  688,  153,  153,  527,  628, 1529,  612,  514,  304,
        2579, 2579,  173,   20, 2579,   20,  562,   15, 2828, 2828,  620,  261,
        3480,  188,   88,   33,   33,   27,   39,   39, 2031,  734,  158,   50,
          50,  661,  309,  704,  143,  371,  371,  224,  563,  563,  563,  277,
         300,  159,  445,  133,  343,  343, 2514,  318,  377,  584,  386,  448,
         366,  427,  562, 2828, 1006, 1006, 2185, 2299,  732,  100,  211,  504,
         100,  457,  570,  349,  349,  523,  523,   84,  342,  482,  704, 2386,
         498,  277,   92, 2185,  579,  579,  579,   41,   41,   41,  406, 1104,
         236,  607,  395,  395,  642,  607,  642,  607,  607,  368,  368, 3158,
         573, 1145,  367,  367, 1145,  367, 1145,  367,  367,  367,  367,  367,
         692,  404])
all equal: False
miss rate: 2.427184581756592%