Adrian Lyjak committed
Commit 41d8ad0
1 Parent(s): 81f03f2

Add ONNX models and config

README.md CHANGED
@@ -1,3 +1,90 @@
- ---
- license: apache-2.0
- ---
+ ---
+ license: apache-2.0
+ library_name: transformers.js
+ language:
+ - en
+ base_model:
+ - hexgrad/Kokoro-82M
+ pipeline_tag: text-to-speech
+ ---
+
+ # Kokoro TTS
+
+ Kokoro is a frontier TTS model for its size of 82 million parameters (text in/audio out). These ONNX models have been exported from the original [Hugging Face](https://huggingface.co/hexgrad/Kokoro-82M) model via the [kokoro-onnx](https://github.com/adrianlyjak/kokoro-onnx-export) scripts.
+
+ ## Table of contents
+
+ - [Usage](#usage)
+   - [JavaScript](#javascript)
+   - [Python](#python)
+ - [Voices/Samples](#voicessamples)
+ - [Quantizations](#quantizations)
+
+
+ ## Usage
+
+ ### JavaScript
+
+ First, install the `kokoro-js` library from [NPM](https://npmjs.com/package/kokoro-js) using:
+ ```bash
+ npm i kokoro-js
+ ```
+
+ You can then generate speech as follows:
+
+ ```js
+ import { KokoroTTS } from "kokoro-js";
+
+ const model_id = "adrianlyjak/kokoro-onnx";
+ const tts = await KokoroTTS.from_pretrained(model_id, {
+   dtype: "q8", // Options: "fp32", "fp16", "q8", "q4", "q4f16"
+ });
+
+ const text = "Life is like a box of chocolates. You never know what you're gonna get.";
+ const audio = await tts.generate(text, {
+   // Use `tts.list_voices()` to list all available voices
+   voice: "af_heart",
+ });
+ audio.save("audio.wav");
+ ```
+
+
+ ### Python
+
+ ```python
+ import os
+ import numpy as np
+ from onnxruntime import InferenceSession
+
+ # You can generate token ids as follows:
+ # 1. Convert input text to phonemes using https://github.com/hexgrad/misaki
+ # 2. Map phonemes to ids using https://huggingface.co/hexgrad/Kokoro-82M/blob/785407d1adfa7ae8fbef8ffd85f34ca127da3039/config.json#L34-L148
+ tokens = [50, 157, 43, 135, 16, 53, 135, 46, 16, 43, 102, 16, 56, 156, 57, 135, 6, 16, 102, 62, 61, 16, 70, 56, 16, 138, 56, 156, 72, 56, 61, 85, 123, 83, 44, 83, 54, 16, 53, 65, 156, 86, 61, 62, 131, 83, 56, 4, 16, 54, 156, 43, 102, 53, 16, 156, 72, 61, 53, 102, 112, 16, 70, 56, 16, 138, 56, 44, 156, 76, 158, 123, 56, 16, 62, 131, 156, 43, 102, 54, 46, 16, 102, 48, 16, 81, 47, 102, 54, 16, 54, 156, 51, 158, 46, 16, 70, 16, 92, 156, 135, 46, 16, 54, 156, 43, 102, 48, 4, 16, 81, 47, 102, 16, 50, 156, 72, 64, 83, 56, 62, 16, 156, 51, 158, 64, 83, 56, 16, 44, 157, 102, 56, 16, 44, 156, 76, 158, 123, 56, 4]
+
+ # The context length is 512, but leave room for the pad token 0 at the start and end
+ assert len(tokens) <= 510, len(tokens)
+
+ # Select the style vector based on len(tokens); ref_s has shape (1, 256)
+ voices = np.fromfile('./voices/af_heart.bin', dtype=np.float32).reshape(-1, 1, 256)
+ ref_s = voices[len(tokens)]
+
+ # Add the pad ids and reshape tokens; the shape is now (1, <=512)
+ tokens = [[0, *tokens, 0]]
+
+ model_name = 'model.onnx'  # Options: model.onnx, model_fp16.onnx, model_quantized.onnx, model_q8f16.onnx, model_uint8.onnx, model_uint8f16.onnx, model_q4.onnx, model_q4f16.onnx
+ sess = InferenceSession(os.path.join('onnx', model_name))
+
+ audio = sess.run(None, dict(
+     input_ids=tokens,
+     style=ref_s,
+     speed=np.ones(1, dtype=np.float32),
+ ))[0]
+ ```
+
+ Optionally, save the audio to a file:
+ ```py
+ import scipy.io.wavfile as wavfile
+ wavfile.write('audio.wav', 24000, audio[0])
+ ```
+
+
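An aside on the token-id comments in the Python example above (not part of this commit): once a G2P step such as misaki has produced a phoneme string, the ids can be looked up in the character-level vocab of the `tokenizer.json` added below. A minimal sketch, with a made-up phoneme string:

```python
import json

# Illustrative only: look up Kokoro token ids for a phoneme string using the
# vocab from this repo's tokenizer.json. The phoneme string is a placeholder;
# a real one would come from a G2P step such as misaki.
with open("tokenizer.json") as f:
    vocab = json.load(f)["model"]["vocab"]

phonemes = "kəkˈOɹO"  # hypothetical phonemization of "Kokoro"
tokens = [vocab[c] for c in phonemes if c in vocab]  # characters outside the vocab are skipped
print(tokens)
```

Note that `tokenizer.json` also defines a post-processor that wraps each sequence with the `$` pad token (id 0), which is what the manual `[[0, *tokens, 0]]` step in the README reproduces.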
config.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "model_type": "style_text_to_speech_2"
+ }
onnx/model.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1404f5a5e1a6bf0f6e83f43b129a016b7828755d14b94030ce82ce26ec1a21f2
+ size 325474826
onnx/model_fp16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c0b9ac5693e7f3686471ac0816b703174368eb8a80b2f7ff806002b96c31ecd5
+ size 163756843
onnx/model_q8f16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bed76326e05744fe7c56e20044578fa2273f6da18e21e90ca3fd29b5f7f95f43
+ size 86439949
onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d4dca4381ec375d758c1de6b1e5d422e785d24422c79b0fb8e719d857aebf18
+ size 92494425
onnx/model_uint8.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3da49c6b31946fbbfd2beffd63447c51034959c23900c7ffb9ea3e9d53690c65
+ size 172120428
onnx/model_uint8f16.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:df32d80f238da5c69cf5a3d381da5611a48ee00be32066c7263ea713af23de00
+ size 112872273
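The `.onnx` entries above are Git LFS pointers, so the diff only records a hash and size. As a minimal sketch (not part of this commit), one variant can be fetched with `huggingface_hub`:

```python
from huggingface_hub import hf_hub_download

# Resolve the LFS pointer for one quantized variant to the actual weights file
# (~86 MB for model_q8f16.onnx, per the size recorded above).
path = hf_hub_download(repo_id="adrianlyjak/kokoro-onnx", filename="onnx/model_q8f16.onnx")
print(path)
```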
tokenizer.json ADDED
@@ -0,0 +1,175 @@
+ {
+   "version": "1.0",
+   "truncation": null,
+   "padding": null,
+   "added_tokens": [],
+   "normalizer": {
+     "type": "Replace",
+     "pattern": {
+       "Regex": "[^$;:,.!?\u2014\u2026\"()\u201c\u201d \u0303\u02a3\u02a5\u02a6\u02a8\u1d5d\uab67AIOQSTWY\u1d4aabcdefhijklmnopqrstuvwxyz\u0251\u0250\u0252\u00e6\u03b2\u0254\u0255\u00e7\u0256\u00f0\u02a4\u0259\u025a\u025b\u025c\u025f\u0261\u0265\u0268\u026a\u029d\u026f\u0270\u014b\u0273\u0272\u0274\u00f8\u0278\u03b8\u0153\u0279\u027e\u027b\u0281\u027d\u0282\u0283\u0288\u02a7\u028a\u028b\u028c\u0263\u0264\u03c7\u028e\u0292\u0294\u02c8\u02cc\u02d0\u02b0\u02b2\u2193\u2192\u2197\u2198\u1d7b]"
+     },
+     "content": ""
+   },
+   "pre_tokenizer": {
+     "type": "Split",
+     "pattern": {
+       "Regex": ""
+     },
+     "behavior": "Isolated",
+     "invert": false
+   },
+   "post_processor": {
+     "type": "TemplateProcessing",
+     "single": [
+       {
+         "SpecialToken": {
+           "id": "$",
+           "type_id": 0
+         }
+       },
+       {
+         "Sequence": {
+           "id": "A",
+           "type_id": 0
+         }
+       },
+       {
+         "SpecialToken": {
+           "id": "$",
+           "type_id": 0
+         }
+       }
+     ],
+     "special_tokens": {
+       "$": {
+         "id": "$",
+         "ids": [
+           0
+         ],
+         "tokens": [
+           "$"
+         ]
+       }
+     }
+   },
+   "decoder": null,
+   "model": {
+     "vocab": {
+       "$": 0,
+       ";": 1,
+       ":": 2,
+       ",": 3,
+       ".": 4,
+       "!": 5,
+       "?": 6,
+       "\u2014": 9,
+       "\u2026": 10,
+       "\"": 11,
+       "(": 12,
+       ")": 13,
+       "\u201c": 14,
+       "\u201d": 15,
+       " ": 16,
+       "\u0303": 17,
+       "\u02a3": 18,
+       "\u02a5": 19,
+       "\u02a6": 20,
+       "\u02a8": 21,
+       "\u1d5d": 22,
+       "\uab67": 23,
+       "A": 24,
+       "I": 25,
+       "O": 31,
+       "Q": 33,
+       "S": 35,
+       "T": 36,
+       "W": 39,
+       "Y": 41,
+       "\u1d4a": 42,
+       "a": 43,
+       "b": 44,
+       "c": 45,
+       "d": 46,
+       "e": 47,
+       "f": 48,
+       "h": 50,
+       "i": 51,
+       "j": 52,
+       "k": 53,
+       "l": 54,
+       "m": 55,
+       "n": 56,
+       "o": 57,
+       "p": 58,
+       "q": 59,
+       "r": 60,
+       "s": 61,
+       "t": 62,
+       "u": 63,
+       "v": 64,
+       "w": 65,
+       "x": 66,
+       "y": 67,
+       "z": 68,
+       "\u0251": 69,
+       "\u0250": 70,
+       "\u0252": 71,
+       "\u00e6": 72,
+       "\u03b2": 75,
+       "\u0254": 76,
+       "\u0255": 77,
+       "\u00e7": 78,
+       "\u0256": 80,
+       "\u00f0": 81,
+       "\u02a4": 82,
+       "\u0259": 83,
+       "\u025a": 85,
+       "\u025b": 86,
+       "\u025c": 87,
+       "\u025f": 90,
+       "\u0261": 92,
+       "\u0265": 99,
+       "\u0268": 101,
+       "\u026a": 102,
+       "\u029d": 103,
+       "\u026f": 110,
+       "\u0270": 111,
+       "\u014b": 112,
+       "\u0273": 113,
+       "\u0272": 114,
+       "\u0274": 115,
+       "\u00f8": 116,
+       "\u0278": 118,
+       "\u03b8": 119,
+       "\u0153": 120,
+       "\u0279": 123,
+       "\u027e": 125,
+       "\u027b": 126,
+       "\u0281": 128,
+       "\u027d": 129,
+       "\u0282": 130,
+       "\u0283": 131,
+       "\u0288": 132,
+       "\u02a7": 133,
+       "\u028a": 135,
+       "\u028b": 136,
+       "\u028c": 138,
+       "\u0263": 139,
+       "\u0264": 140,
+       "\u03c7": 142,
+       "\u028e": 143,
+       "\u0292": 147,
+       "\u0294": 148,
+       "\u02c8": 156,
+       "\u02cc": 157,
+       "\u02d0": 158,
+       "\u02b0": 162,
+       "\u02b2": 164,
+       "\u2193": 169,
+       "\u2192": 171,
+       "\u2197": 172,
+       "\u2198": 173,
+       "\u1d7b": 177
+     }
+   }
+ }
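For reference, a small sketch (not part of this commit) of what the `Replace` normalizer above does: its regex is a negated whitelist, so any character outside the supported phoneme and punctuation set is removed before the character-level split and `$` padding:

```python
import json
import re

# Apply the normalizer pattern from tokenizer.json the way the Replace normalizer
# does: every match (i.e. any non-whitelisted character) is replaced with "".
with open("tokenizer.json") as f:
    normalizer = json.load(f)["normalizer"]

pattern = re.compile(normalizer["pattern"]["Regex"])
print(pattern.sub(normalizer["content"], "həlˈO🙂"))  # prints "həlˈO"; the emoji is stripped
```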
tokenizer_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "model_max_length": 512,
+   "pad_token": "$",
+   "tokenizer_class": "PreTrainedTokenizer",
+   "unk_token": "$"
+ }
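A closing note tying this config to the Python example earlier: `model_max_length` is 512, and every sequence is wrapped with the `$` pad token (id 0) at both ends, which is why the README asserts at most 510 content tokens. A trivial sketch of that bound (the helper name is made up):

```python
MODEL_MAX_LENGTH = 512  # from tokenizer_config.json

def fits_context(token_ids: list[int]) -> bool:
    # +2 accounts for the leading and trailing "$" pad id (0)
    return len(token_ids) + 2 <= MODEL_MAX_LENGTH
```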