Sync from GitHub repo

This Space is synced from the GitHub repo: https://github.com/SWivid/F5-TTS. Please submit contributions there.
README_REPO.md
CHANGED
@@ -112,7 +112,7 @@ docker container run --rm -it --gpus=all --mount 'type=volume,source=f5-tts,targ
 Deployment solution with Triton and TensorRT-LLM.
 
 #### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
 
 | Model | Concurrency | Avg Latency | RTF | Mode |
 |---------------------|----------------|-------------|--------|-----------------|
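A note on the metrics in this table: RTF (real-time factor) is processing time divided by the duration of the synthesized audio, and NFE is the number of function evaluations the flow-matching sampler performs, so the added "16 NFE" qualifier pins down how much denoising compute the latency numbers include. A minimal sketch of the RTF arithmetic (variable names here are illustrative, not taken from the repo's benchmark code):

```python
# Minimal sketch of the real-time factor (RTF) reported in the benchmark
# table; names are illustrative, not the repo's actual benchmark code.
def real_time_factor(processing_seconds: float, audio_seconds: float) -> float:
    """RTF < 1 means synthesis runs faster than real time."""
    return processing_seconds / audio_seconds

print(real_time_factor(processing_seconds=0.5, audio_seconds=5.0))  # 0.1
```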
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "f5-tts"
-version = "1.1.
+version = "1.1.2"
 description = "F5-TTS: A Fairytaler that Fakes Fluent and Faithful Speech with Flow Matching"
 readme = "README.md"
 license = {text = "MIT License"}
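The version bump is what packaging tools will report for an installed copy. A quick standard-library check (nothing repo-specific beyond the package name from this pyproject.toml):

```python
# Print the installed package version; expected to show "1.1.2"
# once this release is installed.
from importlib.metadata import version

print(version("f5-tts"))
```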
src/f5_tts/runtime/triton_trtllm/README.md
CHANGED
@@ -57,7 +57,7 @@ benchmark.py --output-dir $log_dir \
 ```
 
 ### Benchmark Results
-Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs.
+Decoding on a single L20 GPU, using 26 different prompt_audio & target_text pairs, 16 NFE.
 
 | Model | Concurrency | Avg Latency | RTF | Mode |
 |---------------------|----------------|-------------|--------|-----------------|
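The NFE count matters because flow-matching synthesis integrates an ODE with a fixed number of network evaluations, so latency scales roughly linearly with it. A schematic fixed-step Euler sampler shows why (a sketch only, not F5-TTS's actual sampler, which also supports sway sampling and other solver settings):

```python
import torch

# Schematic fixed-step Euler sampler: each of the `nfe` steps is one
# forward pass through the velocity-field network, so per-utterance
# compute grows linearly with NFE. A sketch, not the repo's sampler.
def euler_sample(velocity_fn, x: torch.Tensor, nfe: int = 16) -> torch.Tensor:
    dt = 1.0 / nfe
    t = 0.0
    for _ in range(nfe):
        x = x + dt * velocity_fn(x, t)  # one "function evaluation"
        t += dt
    return x
```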
src/f5_tts/runtime/triton_trtllm/benchmark.py
CHANGED
@@ -168,7 +168,9 @@ def data_collator(batch, vocab_char_map, device="cuda", use_perf=False):
         ref_mel_list.append(ref_mel)
         ref_mel_len_list.append(ref_mel_len)
 
-        estimated_reference_target_mel_len.append(
+        estimated_reference_target_mel_len.append(
+            int(ref_mel.shape[0] * (1 + len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))))
+        )
 
     max_seq_len = max(estimated_reference_target_mel_len)
     ref_mel_batch = padded_mel_batch(ref_mel_list, max_seq_len)
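What the added lines do: the expected output mel length is estimated by scaling the reference mel length with the ratio of target-text bytes to prompt-text bytes, on the assumption that speech duration grows roughly proportionally with text length. Measuring in UTF-8 bytes rather than characters keeps the estimate consistent for multi-byte scripts. The same arithmetic as a standalone helper (hypothetical name, assumes a non-empty prompt_text):

```python
# Hypothetical helper mirroring the diff's arithmetic: the target speech
# is assumed to need mel frames proportional to its text length, measured
# in UTF-8 bytes so multi-byte scripts are not undercounted vs. ASCII.
def estimate_total_mel_len(ref_mel_len: int, prompt_text: str, target_text: str) -> int:
    ratio = len(target_text.encode("utf-8")) / len(prompt_text.encode("utf-8"))
    return int(ref_mel_len * (1 + ratio))
```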
src/f5_tts/runtime/triton_trtllm/model_repo_f5_tts/f5_tts/1/model.py
CHANGED
@@ -219,7 +219,9 @@ class TritonPythonModel:
 
             reference_mel_len.append(mel_features.shape[1])
             estimated_reference_target_mel_len.append(
-                int(
+                int(
+                    mel_features.shape[1] * (1 + len(target_text.encode("utf-8")) / len(reference_text.encode("utf-8")))
+                )
             )
 
         max_seq_len = min(max(estimated_reference_target_mel_len), self.max_mel_len)
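This is the same estimator on the serving side, with one difference worth noting: model.py clamps the result to self.max_mel_len, so an unusually long target_text cannot inflate the padded batch beyond the configured maximum. For intuition on why byte counts are the better length proxy here, compare character and byte lengths for CJK text:

```python
text = "你好世界"                     # 4 characters of Chinese
print(len(text))                      # 4
print(len(text.encode("utf-8")))      # 12 -- each CJK character is 3 UTF-8 bytes
```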