Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +34 -0
- trained_190k_steps/.gitignore +16 -0
- trained_190k_steps/.vscode/launch.json +17 -0
- trained_190k_steps/BATCH_INFERENCE_README.md +30 -0
- trained_190k_steps/BiCodec/config.yaml +60 -0
- trained_190k_steps/BiCodec/model.safetensors +3 -0
- trained_190k_steps/Readme.md +130 -0
- trained_190k_steps/Readme_zh.md +130 -0
- trained_190k_steps/__init__.py +0 -0
- trained_190k_steps/__pycache__/spark_llm.cpython-311.pyc +0 -0
- trained_190k_steps/__pycache__/utilities.cpython-311.pyc +0 -0
- trained_190k_steps/added_tokens.json +3 -0
- trained_190k_steps/config.json +66 -0
- trained_190k_steps/config.yaml +7 -0
- trained_190k_steps/configuration_rwkv7.py +91 -0
- trained_190k_steps/generation_config.json +6 -0
- trained_190k_steps/hf_rwkv_tokenizer.py +280 -0
- trained_190k_steps/kafka.wav +3 -0
- trained_190k_steps/model.safetensors +3 -0
- trained_190k_steps/modeling_rwkvspeech.py +6 -0
- trained_190k_steps/output.wav +3 -0
- trained_190k_steps/output_0.wav +3 -0
- trained_190k_steps/output_1.wav +3 -0
- trained_190k_steps/output_10.wav +3 -0
- trained_190k_steps/output_2.wav +3 -0
- trained_190k_steps/output_3.wav +3 -0
- trained_190k_steps/output_4.wav +3 -0
- trained_190k_steps/output_5.wav +3 -0
- trained_190k_steps/output_6.wav +3 -0
- trained_190k_steps/output_7.wav +3 -0
- trained_190k_steps/output_8.wav +3 -0
- trained_190k_steps/output_9.wav +3 -0
- trained_190k_steps/rtf_test_results/rtf_test_results_20250714_103617.json +206 -0
- trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104243.json +206 -0
- trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104655.json +206 -0
- trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104853.json +206 -0
- trained_190k_steps/rtf_test_results/test_001.wav +3 -0
- trained_190k_steps/rtf_test_results/test_002.wav +3 -0
- trained_190k_steps/rtf_test_results/test_003.wav +3 -0
- trained_190k_steps/rtf_test_results/test_004.wav +3 -0
- trained_190k_steps/rtf_test_results/test_005.wav +3 -0
- trained_190k_steps/rtf_test_results/test_006.wav +3 -0
- trained_190k_steps/rtf_test_results/test_007.wav +3 -0
- trained_190k_steps/rtf_test_results/test_008.wav +3 -0
- trained_190k_steps/rtf_test_results/test_009.wav +3 -0
- trained_190k_steps/rtf_test_results/test_010.wav +3 -0
- trained_190k_steps/rtf_test_results/test_011.wav +3 -0
- trained_190k_steps/rtf_test_results/test_012.wav +3 -0
- trained_190k_steps/rtf_test_results/test_013.wav +3 -0
- trained_190k_steps/rtf_test_results/test_014.wav +3 -0
.gitattributes
CHANGED
@@ -48,3 +48,37 @@ trained_50_percents/output_6.wav filter=lfs diff=lfs merge=lfs -text
|
|
48 |
trained_50_percents/output_7.wav filter=lfs diff=lfs merge=lfs -text
|
49 |
trained_50_percents/output_8.wav filter=lfs diff=lfs merge=lfs -text
|
50 |
trained_50_percents/output_9.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
trained_50_percents/output_7.wav filter=lfs diff=lfs merge=lfs -text
|
49 |
trained_50_percents/output_8.wav filter=lfs diff=lfs merge=lfs -text
|
50 |
trained_50_percents/output_9.wav filter=lfs diff=lfs merge=lfs -text
|
51 |
+
trained_190k_steps/kafka.wav filter=lfs diff=lfs merge=lfs -text
|
52 |
+
trained_190k_steps/output.wav filter=lfs diff=lfs merge=lfs -text
|
53 |
+
trained_190k_steps/output_0.wav filter=lfs diff=lfs merge=lfs -text
|
54 |
+
trained_190k_steps/output_1.wav filter=lfs diff=lfs merge=lfs -text
|
55 |
+
trained_190k_steps/output_10.wav filter=lfs diff=lfs merge=lfs -text
|
56 |
+
trained_190k_steps/output_2.wav filter=lfs diff=lfs merge=lfs -text
|
57 |
+
trained_190k_steps/output_3.wav filter=lfs diff=lfs merge=lfs -text
|
58 |
+
trained_190k_steps/output_4.wav filter=lfs diff=lfs merge=lfs -text
|
59 |
+
trained_190k_steps/output_5.wav filter=lfs diff=lfs merge=lfs -text
|
60 |
+
trained_190k_steps/output_6.wav filter=lfs diff=lfs merge=lfs -text
|
61 |
+
trained_190k_steps/output_7.wav filter=lfs diff=lfs merge=lfs -text
|
62 |
+
trained_190k_steps/output_8.wav filter=lfs diff=lfs merge=lfs -text
|
63 |
+
trained_190k_steps/output_9.wav filter=lfs diff=lfs merge=lfs -text
|
64 |
+
trained_190k_steps/rtf_test_results/test_001.wav filter=lfs diff=lfs merge=lfs -text
|
65 |
+
trained_190k_steps/rtf_test_results/test_002.wav filter=lfs diff=lfs merge=lfs -text
|
66 |
+
trained_190k_steps/rtf_test_results/test_003.wav filter=lfs diff=lfs merge=lfs -text
|
67 |
+
trained_190k_steps/rtf_test_results/test_004.wav filter=lfs diff=lfs merge=lfs -text
|
68 |
+
trained_190k_steps/rtf_test_results/test_005.wav filter=lfs diff=lfs merge=lfs -text
|
69 |
+
trained_190k_steps/rtf_test_results/test_006.wav filter=lfs diff=lfs merge=lfs -text
|
70 |
+
trained_190k_steps/rtf_test_results/test_007.wav filter=lfs diff=lfs merge=lfs -text
|
71 |
+
trained_190k_steps/rtf_test_results/test_008.wav filter=lfs diff=lfs merge=lfs -text
|
72 |
+
trained_190k_steps/rtf_test_results/test_009.wav filter=lfs diff=lfs merge=lfs -text
|
73 |
+
trained_190k_steps/rtf_test_results/test_010.wav filter=lfs diff=lfs merge=lfs -text
|
74 |
+
trained_190k_steps/rtf_test_results/test_011.wav filter=lfs diff=lfs merge=lfs -text
|
75 |
+
trained_190k_steps/rtf_test_results/test_012.wav filter=lfs diff=lfs merge=lfs -text
|
76 |
+
trained_190k_steps/rtf_test_results/test_013.wav filter=lfs diff=lfs merge=lfs -text
|
77 |
+
trained_190k_steps/rtf_test_results/test_014.wav filter=lfs diff=lfs merge=lfs -text
|
78 |
+
trained_190k_steps/rtf_test_results/test_015.wav filter=lfs diff=lfs merge=lfs -text
|
79 |
+
trained_190k_steps/rtf_test_results/test_016.wav filter=lfs diff=lfs merge=lfs -text
|
80 |
+
trained_190k_steps/rtf_test_results/test_017.wav filter=lfs diff=lfs merge=lfs -text
|
81 |
+
trained_190k_steps/rtf_test_results/test_018.wav filter=lfs diff=lfs merge=lfs -text
|
82 |
+
trained_190k_steps/rtf_test_results/test_019.wav filter=lfs diff=lfs merge=lfs -text
|
83 |
+
trained_190k_steps/rtf_test_results/test_020.wav filter=lfs diff=lfs merge=lfs -text
|
84 |
+
trained_190k_steps/test.wav filter=lfs diff=lfs merge=lfs -text
|
trained_190k_steps/.gitignore
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Python build artifacts
|
2 |
+
__pycache__/
|
3 |
+
*.pyc
|
4 |
+
|
5 |
+
# Environment variables
|
6 |
+
.env
|
7 |
+
|
8 |
+
# Virtual environment
|
9 |
+
venv/
|
10 |
+
|
11 |
+
# Model backups and outputs
|
12 |
+
model.fp32.safetensors
|
13 |
+
output.wav
|
14 |
+
|
15 |
+
# Temporary scripts
|
16 |
+
check_dtype.py
|
trained_190k_steps/.vscode/launch.json
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
// 使用 IntelliSense 了解相关属性。
|
3 |
+
// 悬停以查看现有属性的描述。
|
4 |
+
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
5 |
+
"version": "0.2.0",
|
6 |
+
"configurations": [
|
7 |
+
|
8 |
+
{
|
9 |
+
"name": "Python 调试程序: 当前文件",
|
10 |
+
"type": "debugpy",
|
11 |
+
"request": "launch",
|
12 |
+
"program": "${file}",
|
13 |
+
"console": "integratedTerminal",
|
14 |
+
"justMyCode": false
|
15 |
+
}
|
16 |
+
]
|
17 |
+
}
|
trained_190k_steps/BATCH_INFERENCE_README.md
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 批量推理功能说明
|
2 |
+
|
3 |
+
本文档介绍了 ReSpark TTS 模型的批量推理功能,该功能可以显著提高多个文本的语音合成效率。
|
4 |
+
|
5 |
+
## 使用方法
|
6 |
+
|
7 |
+
### 基本批量推理
|
8 |
+
```python
|
9 |
+
from utilities import generate_embeddings_batch
|
10 |
+
from tts_batch_infer import generate_speech_batch
|
11 |
+
|
12 |
+
# 准备文本列表
|
13 |
+
texts = [
|
14 |
+
"第一个要合成的文本。",
|
15 |
+
"第二个要合成的文本。",
|
16 |
+
"第三个要合成的文本。"
|
17 |
+
]
|
18 |
+
|
19 |
+
# 批量生成语音
|
20 |
+
wavs = generate_speech_batch(
|
21 |
+
model, tokenizer, texts, audio_tokenizer,
|
22 |
+
prompt_text="提示文本",
|
23 |
+
prompt_audio=prompt_audio,
|
24 |
+
device=device
|
25 |
+
)
|
26 |
+
|
27 |
+
# 保存音频文件
|
28 |
+
for i, wav in enumerate(wavs):
|
29 |
+
sf.write(f'output_{i}.wav', wav, sample_rate)
|
30 |
+
```
|
trained_190k_steps/BiCodec/config.yaml
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
audio_tokenizer:
|
2 |
+
mel_params:
|
3 |
+
sample_rate: 16000
|
4 |
+
n_fft: 1024
|
5 |
+
win_length: 640
|
6 |
+
hop_length: 320
|
7 |
+
mel_fmin: 10
|
8 |
+
mel_fmax: null
|
9 |
+
num_mels: 128
|
10 |
+
|
11 |
+
encoder:
|
12 |
+
input_channels: 1024
|
13 |
+
vocos_dim: 384
|
14 |
+
vocos_intermediate_dim: 2048
|
15 |
+
vocos_num_layers: 12
|
16 |
+
out_channels: 1024
|
17 |
+
sample_ratios: [1,1]
|
18 |
+
|
19 |
+
decoder:
|
20 |
+
input_channel: 1024
|
21 |
+
channels: 1536
|
22 |
+
rates: [8, 5, 4, 2]
|
23 |
+
kernel_sizes: [16,11,8,4]
|
24 |
+
|
25 |
+
quantizer:
|
26 |
+
input_dim: 1024
|
27 |
+
codebook_size: 8192
|
28 |
+
codebook_dim: 8
|
29 |
+
commitment: 0.25
|
30 |
+
codebook_loss_weight: 2.0
|
31 |
+
use_l2_normlize: True
|
32 |
+
threshold_ema_dead_code: 0.2
|
33 |
+
|
34 |
+
speaker_encoder:
|
35 |
+
input_dim: 128
|
36 |
+
out_dim: 1024
|
37 |
+
latent_dim: 128
|
38 |
+
token_num: 32
|
39 |
+
fsq_levels: [4, 4, 4, 4, 4, 4]
|
40 |
+
fsq_num_quantizers: 1
|
41 |
+
|
42 |
+
prenet:
|
43 |
+
input_channels: 1024
|
44 |
+
vocos_dim: 384
|
45 |
+
vocos_intermediate_dim: 2048
|
46 |
+
vocos_num_layers: 12
|
47 |
+
out_channels: 1024
|
48 |
+
condition_dim: 1024
|
49 |
+
sample_ratios: [1,1]
|
50 |
+
use_tanh_at_final: False
|
51 |
+
|
52 |
+
postnet:
|
53 |
+
input_channels: 1024
|
54 |
+
vocos_dim: 384
|
55 |
+
vocos_intermediate_dim: 2048
|
56 |
+
vocos_num_layers: 6
|
57 |
+
out_channels: 1024
|
58 |
+
use_tanh_at_final: False
|
59 |
+
|
60 |
+
|
trained_190k_steps/BiCodec/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
|
3 |
+
size 625518756
|
trained_190k_steps/Readme.md
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
---
|
4 |
+
|
5 |
+
# ReSpark TTS Model
|
6 |
+
|
7 |
+
This repository contains the ReSpark Text-to-Speech (TTS) model, a powerful and efficient model for generating high-quality speech from text. It is based on the RWKV architecture and utilizes the BiCodec tokenizer for audio processing.
|
8 |
+
|
9 |
+
## Installation
|
10 |
+
|
11 |
+
First, install the required dependencies:
|
12 |
+
|
13 |
+
```bash
|
14 |
+
pip install transformers rwkv-fla torch torchaudio torchvision transformers soundfile numpy librosa omegaconf soxr soundfile einx librosa
|
15 |
+
```
|
16 |
+
|
17 |
+
## Usage
|
18 |
+
|
19 |
+
The `tts.py` script provides a complete example of how to use this model for text-to-speech synthesis with voice cloning.
|
20 |
+
|
21 |
+
### Running the Test Script
|
22 |
+
|
23 |
+
To generate speech, simply run the script:
|
24 |
+
|
25 |
+
```bash
|
26 |
+
python tts.py
|
27 |
+
```
|
28 |
+
|
29 |
+
### How it Works
|
30 |
+
|
31 |
+
The script performs the following steps:
|
32 |
+
1. Loads the pre-trained `AutoModelForCausalLM` and `AutoTokenizer` from the current directory.
|
33 |
+
2. Initializes the `BiCodecTokenizer` for audio encoding and decoding.
|
34 |
+
3. Loads a reference audio file (`kafka.wav`) and its corresponding transcript (`prompt_text`) to provide a voice prompt.
|
35 |
+
4. Resamples the reference audio to match the model's expected sample rate (24000 Hz).
|
36 |
+
5. Takes a target text (`text`) to be synthesized.
|
37 |
+
6. Calls the `generate_speech` function, which generates audio based on the target text and the voice from the reference audio.
|
38 |
+
7. Saves the generated audio to `output.wav`.
|
39 |
+
|
40 |
+
You can modify the `prompt_text`, `prompt_audio_file`, and `text` variables in `tts.py` to synthesize different text with different voices.
|
41 |
+
|
42 |
+
### Example Code (`tts.py`)
|
43 |
+
|
44 |
+
```python
|
45 |
+
import os
|
46 |
+
import sys
|
47 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
48 |
+
print('add current dir to sys.path', current_dir)
|
49 |
+
sys.path.append(current_dir)
|
50 |
+
from sparktts.models.audio_tokenizer import BiCodecTokenizer
|
51 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
52 |
+
import soundfile as sf
|
53 |
+
import numpy as np
|
54 |
+
import torch
|
55 |
+
from utilities import generate_embeddings
|
56 |
+
|
57 |
+
def generate_speech(model, tokenizer, text, bicodec, prompt_text=None, prompt_audio=None,
|
58 |
+
max_new_tokens=3000, do_sample=True, top_k=50, top_p=0.95,
|
59 |
+
temperature=1.0, device="cuda:0"):
|
60 |
+
"""
|
61 |
+
Function to generate speech.
|
62 |
+
"""
|
63 |
+
eos_token_id = model.config.vocab_size - 1
|
64 |
+
|
65 |
+
embeddings = generate_embeddings(
|
66 |
+
model=model,
|
67 |
+
tokenizer=tokenizer,
|
68 |
+
text=text,
|
69 |
+
bicodec=bicodec,
|
70 |
+
prompt_text=prompt_text,
|
71 |
+
prompt_audio=prompt_audio
|
72 |
+
)
|
73 |
+
|
74 |
+
global_tokens = embeddings['global_tokens'].unsqueeze(0)
|
75 |
+
model.eval()
|
76 |
+
|
77 |
+
with torch.no_grad():
|
78 |
+
generated_outputs = model.generate(
|
79 |
+
inputs_embeds=embeddings['input_embs'],
|
80 |
+
attention_mask=torch.ones((1, embeddings['input_embs'].shape[1]),dtype=torch.long,device=device),
|
81 |
+
max_new_tokens=max_new_tokens,
|
82 |
+
do_sample=do_sample,
|
83 |
+
top_k=top_k,
|
84 |
+
top_p=top_p,
|
85 |
+
temperature=temperature,
|
86 |
+
eos_token_id=eos_token_id,
|
87 |
+
pad_token_id=tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
|
88 |
+
use_cache=True
|
89 |
+
)
|
90 |
+
|
91 |
+
semantic_tokens_tensor = generated_outputs[:,:-1]
|
92 |
+
|
93 |
+
with torch.no_grad():
|
94 |
+
wav = bicodec.detokenize(global_tokens, semantic_tokens_tensor)
|
95 |
+
|
96 |
+
return wav
|
97 |
+
|
98 |
+
# --- Main execution ---
|
99 |
+
device = 'cuda:0'
|
100 |
+
|
101 |
+
# Initialize tokenizers and model
|
102 |
+
audio_tokenizer = BiCodecTokenizer(model_dir=current_dir, device=device)
|
103 |
+
tokenizer = AutoTokenizer.from_pretrained(current_dir, trust_remote_code=True)
|
104 |
+
model = AutoModelForCausalLM.from_pretrained(current_dir, trust_remote_code=True)
|
105 |
+
|
106 |
+
model = model.bfloat16().to(device)
|
107 |
+
model.eval()
|
108 |
+
|
109 |
+
# Prepare prompt audio and text for voice cloning
|
110 |
+
prompt_text = "我们并不是通过物理移动手段找到星河的。"
|
111 |
+
prompt_audio_file = os.path.join(current_dir, 'kafka.wav')
|
112 |
+
prompt_audio, sampling_rate = sf.read(prompt_audio_file)
|
113 |
+
|
114 |
+
# Resample audio if necessary
|
115 |
+
target_sample_rate = audio_tokenizer.config['sample_rate']
|
116 |
+
if sampling_rate != target_sample_rate:
|
117 |
+
from librosa import resample
|
118 |
+
prompt_audio = resample(prompt_audio, orig_sr=sampling_rate, target_sr=target_sample_rate)
|
119 |
+
prompt_audio = np.array(prompt_audio, dtype=np.float32)
|
120 |
+
|
121 |
+
# Text to synthesize
|
122 |
+
text = "科学技术是第一生产力,最近 AI的迅猛发展让我们看到了迈向星辰大海的希望。"
|
123 |
+
|
124 |
+
# Generate speech
|
125 |
+
wav = generate_speech(model, tokenizer, text, audio_tokenizer, prompt_audio=prompt_audio, device=device)
|
126 |
+
|
127 |
+
# Save the output
|
128 |
+
sf.write('output.wav', wav, target_sample_rate)
|
129 |
+
print("Generated audio saved to output.wav")
|
130 |
+
```
|
trained_190k_steps/Readme_zh.md
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
---
|
4 |
+
|
5 |
+
# ReSpark TTS 模型
|
6 |
+
|
7 |
+
本仓库包含 ReSpark 文本转语音 (TTS) 模型,这是一个强大而高效的模型,可以从文本生成高质量的语音。它基于 RWKV 架构,并利用 BiCodec-Tokenizer 进行音频处理。
|
8 |
+
|
9 |
+
## 安装
|
10 |
+
|
11 |
+
首先,请安装所需的依赖库:
|
12 |
+
|
13 |
+
```bash
|
14 |
+
pip install transformers rwkv-fla torch torchaudio torchvision transformers soundfile numpy librosa omegaconf soxr soundfile einx librosa
|
15 |
+
```
|
16 |
+
|
17 |
+
## 使用方法
|
18 |
+
|
19 |
+
`tts.py` 脚本提供了一个完整的使用该模型进行文本转语音合成(带声音克隆功能)的示例。
|
20 |
+
|
21 |
+
### 运行测试脚本
|
22 |
+
|
23 |
+
要生成语音,只需运行以下脚本:
|
24 |
+
|
25 |
+
```bash
|
26 |
+
python tts.py
|
27 |
+
```
|
28 |
+
|
29 |
+
### 工作原理
|
30 |
+
|
31 |
+
该脚本执行以下步骤:
|
32 |
+
1. 从当前目录加载预训练的 `AutoModelForCausalLM` 和 `AutoTokenizer`。
|
33 |
+
2. 初始化用于音频编码和解码的 `BiCodecTokenizer`。
|
34 |
+
3. 加载一个参考音频文件 (`kafka.wav`) 及其对应的文本 (`prompt_text`) 以提供声音提示(voice prompt)。
|
35 |
+
4. 如果需要,将参考音频重采样以匹配模型期望的采样率 (24000 Hz)。
|
36 |
+
5. 指定一个需要被合成的目标文本 (`text`)。
|
37 |
+
6. 调用 `generate_speech` 函数,该函数会根据目标文本和参考音频中的声音生成音频。
|
38 |
+
7. 将生成的音频保存到 `output.wav`。
|
39 |
+
|
40 |
+
您可以修改 `tts.py` 文件中的 `prompt_text`、`prompt_audio_file` 和 `text` 变量,以使用不同的声音合成不同的文本。
|
41 |
+
|
42 |
+
### 示例代码 (`tts.py`)
|
43 |
+
|
44 |
+
```python
|
45 |
+
import os
|
46 |
+
import sys
|
47 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
48 |
+
print('add current dir to sys.path', current_dir)
|
49 |
+
sys.path.append(current_dir)
|
50 |
+
from sparktts.models.audio_tokenizer import BiCodecTokenizer
|
51 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
52 |
+
import soundfile as sf
|
53 |
+
import numpy as np
|
54 |
+
import torch
|
55 |
+
from utilities import generate_embeddings
|
56 |
+
|
57 |
+
def generate_speech(model, tokenizer, text, bicodec, prompt_text=None, prompt_audio=None,
|
58 |
+
max_new_tokens=3000, do_sample=True, top_k=50, top_p=0.95,
|
59 |
+
temperature=1.0, device="cuda:0"):
|
60 |
+
"""
|
61 |
+
生成语音的函数
|
62 |
+
"""
|
63 |
+
eos_token_id = model.config.vocab_size - 1
|
64 |
+
|
65 |
+
embeddings = generate_embeddings(
|
66 |
+
model=model,
|
67 |
+
tokenizer=tokenizer,
|
68 |
+
text=text,
|
69 |
+
bicodec=bicodec,
|
70 |
+
prompt_text=prompt_text,
|
71 |
+
prompt_audio=prompt_audio
|
72 |
+
)
|
73 |
+
|
74 |
+
global_tokens = embeddings['global_tokens'].unsqueeze(0)
|
75 |
+
model.eval()
|
76 |
+
|
77 |
+
with torch.no_grad():
|
78 |
+
generated_outputs = model.generate(
|
79 |
+
inputs_embeds=embeddings['input_embs'],
|
80 |
+
attention_mask=torch.ones((1, embeddings['input_embs'].shape[1]),dtype=torch.long,device=device),
|
81 |
+
max_new_tokens=max_new_tokens,
|
82 |
+
do_sample=do_sample,
|
83 |
+
top_k=top_k,
|
84 |
+
top_p=top_p,
|
85 |
+
temperature=temperature,
|
86 |
+
eos_token_id=eos_token_id,
|
87 |
+
pad_token_id=tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
|
88 |
+
use_cache=True
|
89 |
+
)
|
90 |
+
|
91 |
+
semantic_tokens_tensor = generated_outputs[:,:-1]
|
92 |
+
|
93 |
+
with torch.no_grad():
|
94 |
+
wav = bicodec.detokenize(global_tokens, semantic_tokens_tensor)
|
95 |
+
|
96 |
+
return wav
|
97 |
+
|
98 |
+
# --- 主程序 ---
|
99 |
+
device = 'cuda:0'
|
100 |
+
|
101 |
+
# 初始化分词器和模型
|
102 |
+
audio_tokenizer = BiCodecTokenizer(model_dir=current_dir, device=device)
|
103 |
+
tokenizer = AutoTokenizer.from_pretrained(current_dir, trust_remote_code=True)
|
104 |
+
model = AutoModelForCausalLM.from_pretrained(current_dir, trust_remote_code=True)
|
105 |
+
|
106 |
+
model = model.bfloat16().to(device)
|
107 |
+
model.eval()
|
108 |
+
|
109 |
+
# 准备用于声音克隆的提示音频和文本
|
110 |
+
prompt_text = "我们并不是通过物理移动手段找到星河的。"
|
111 |
+
prompt_audio_file = os.path.join(current_dir, 'kafka.wav')
|
112 |
+
prompt_audio, sampling_rate = sf.read(prompt_audio_file)
|
113 |
+
|
114 |
+
# 如果需要,重采样音频
|
115 |
+
target_sample_rate = audio_tokenizer.config['sample_rate']
|
116 |
+
if sampling_rate != target_sample_rate:
|
117 |
+
from librosa import resample
|
118 |
+
prompt_audio = resample(prompt_audio, orig_sr=sampling_rate, target_sr=target_sample_rate)
|
119 |
+
prompt_audio = np.array(prompt_audio, dtype=np.float32)
|
120 |
+
|
121 |
+
# 要合成的文本
|
122 |
+
text = "科学技术是第一生产力,最近 AI的迅猛发展让我们看到了迈向星辰大海的希望。"
|
123 |
+
|
124 |
+
# 生成语音
|
125 |
+
wav = generate_speech(model, tokenizer, text, audio_tokenizer, prompt_audio=prompt_audio, device=device)
|
126 |
+
|
127 |
+
# 保存输出
|
128 |
+
sf.write('output.wav', wav, target_sample_rate)
|
129 |
+
print("生成的音频已保存到 output.wav")
|
130 |
+
```
|
trained_190k_steps/__init__.py
ADDED
File without changes
|
trained_190k_steps/__pycache__/spark_llm.cpython-311.pyc
ADDED
Binary file (10.6 kB). View file
|
|
trained_190k_steps/__pycache__/utilities.cpython-311.pyc
ADDED
Binary file (20.5 kB). View file
|
|
trained_190k_steps/added_tokens.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"<|rwkv_tokenizer_end_of_text|>": 0
|
3 |
+
}
|
trained_190k_steps/config.json
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"a_low_rank_dim": 64,
|
3 |
+
"architectures": [
|
4 |
+
"RWKV7ForSpeech"
|
5 |
+
],
|
6 |
+
"attn": null,
|
7 |
+
"attn_mode": "chunk",
|
8 |
+
"audio_global_vocab_size": 4096,
|
9 |
+
"auto_map": {
|
10 |
+
"AutoConfig": "modeling_rwkvspeech.RWKV7SpeechConfig",
|
11 |
+
"AutoModel": "modeling_rwkvspeech.RWKV7Model",
|
12 |
+
"AutoModelForCausalLM": "modeling_rwkvspeech.RWKV7ForSpeech"
|
13 |
+
},
|
14 |
+
"bos_token_id": 0,
|
15 |
+
"decay_low_rank_dim": 64,
|
16 |
+
"eos_token_id": 0,
|
17 |
+
"fuse_cross_entropy": true,
|
18 |
+
"fuse_norm": false,
|
19 |
+
"gate_low_rank_dim": 128,
|
20 |
+
"head_dim": 64,
|
21 |
+
"hidden_act": "sqrelu",
|
22 |
+
"hidden_ratio": 4.0,
|
23 |
+
"hidden_size": 1024,
|
24 |
+
"initializer_range": 0.006,
|
25 |
+
"intermediate_size": 4096,
|
26 |
+
"max_position_embeddings": 2048,
|
27 |
+
"model_type": "rwkv7",
|
28 |
+
"norm_bias": true,
|
29 |
+
"norm_eps": 1e-05,
|
30 |
+
"norm_first": true,
|
31 |
+
"num_heads": 32,
|
32 |
+
"num_hidden_layers": 24,
|
33 |
+
"text_vocab_size": 65536,
|
34 |
+
"tie_word_embeddings": false,
|
35 |
+
"torch_dtype": "float32",
|
36 |
+
"transformers_version": "4.52.4",
|
37 |
+
"use_cache": true,
|
38 |
+
"v_low_rank_dim": 32,
|
39 |
+
"value_dim": [
|
40 |
+
1024,
|
41 |
+
1024,
|
42 |
+
1024,
|
43 |
+
1024,
|
44 |
+
1024,
|
45 |
+
1024,
|
46 |
+
1024,
|
47 |
+
1024,
|
48 |
+
1024,
|
49 |
+
1024,
|
50 |
+
1024,
|
51 |
+
1024,
|
52 |
+
1024,
|
53 |
+
1024,
|
54 |
+
1024,
|
55 |
+
1024,
|
56 |
+
1024,
|
57 |
+
1024,
|
58 |
+
1024,
|
59 |
+
1024,
|
60 |
+
1024,
|
61 |
+
1024,
|
62 |
+
1024,
|
63 |
+
1024
|
64 |
+
],
|
65 |
+
"vocab_size": 8193
|
66 |
+
}
|
trained_190k_steps/config.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
highpass_cutoff_freq: 40
|
2 |
+
sample_rate: 16000
|
3 |
+
segment_duration: 2.4 # (s)
|
4 |
+
max_val_duration: 12 # (s)
|
5 |
+
latent_hop_length: 320
|
6 |
+
ref_segment_duration: 6
|
7 |
+
volume_normalize: true
|
trained_190k_steps/configuration_rwkv7.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
from typing import Dict, Optional
|
4 |
+
|
5 |
+
from transformers.configuration_utils import PretrainedConfig
|
6 |
+
|
7 |
+
|
8 |
+
class RWKV7Config(PretrainedConfig):
|
9 |
+
|
10 |
+
model_type = 'rwkv7'
|
11 |
+
keys_to_ignore_at_inference = ['past_key_values']
|
12 |
+
|
13 |
+
def __init__(
|
14 |
+
self,
|
15 |
+
attn_mode: str = "chunk",
|
16 |
+
hidden_size: int = 2048,
|
17 |
+
hidden_ratio: Optional[int] = 4,
|
18 |
+
intermediate_size: Optional[int] = None,
|
19 |
+
num_hidden_layers: int = 24,
|
20 |
+
head_dim: Optional[int] = 64,
|
21 |
+
num_heads: Optional[int] = None,
|
22 |
+
decay_low_rank_dim: int = 64,
|
23 |
+
gate_low_rank_dim: int = 128,
|
24 |
+
a_low_rank_dim: int = 64,
|
25 |
+
v_low_rank_dim: int = 16,
|
26 |
+
hidden_act: str = "sqrelu",
|
27 |
+
max_position_embeddings: int = 2048,
|
28 |
+
norm_first: bool = True,
|
29 |
+
norm_bias: bool = True,
|
30 |
+
norm_eps: float = 1e-5,
|
31 |
+
attn: Optional[Dict] = None,
|
32 |
+
use_cache: bool = True,
|
33 |
+
pad_token_id: int = None,
|
34 |
+
bos_token_id: int = 1,
|
35 |
+
eos_token_id: int = 2,
|
36 |
+
tie_word_embeddings: bool = False,
|
37 |
+
initializer_range: float = 0.006,
|
38 |
+
fuse_norm: bool = True,
|
39 |
+
fuse_cross_entropy: bool = True,
|
40 |
+
vocab_size: int = 32000,
|
41 |
+
**kwargs
|
42 |
+
):
|
43 |
+
self.attn_mode = attn_mode
|
44 |
+
self.hidden_size = hidden_size
|
45 |
+
self.hidden_ratio = hidden_ratio
|
46 |
+
self.intermediate_size = intermediate_size
|
47 |
+
self.norm_first = norm_first
|
48 |
+
self.num_hidden_layers = num_hidden_layers
|
49 |
+
|
50 |
+
if head_dim is None and num_heads is not None:
|
51 |
+
head_dim = int(hidden_size // num_heads)
|
52 |
+
elif head_dim is not None and num_heads is None:
|
53 |
+
num_heads = int(hidden_size // head_dim)
|
54 |
+
|
55 |
+
self.head_dim = head_dim
|
56 |
+
self.num_heads = num_heads
|
57 |
+
|
58 |
+
self.decay_low_rank_dim = decay_low_rank_dim
|
59 |
+
self.gate_low_rank_dim = gate_low_rank_dim
|
60 |
+
self.a_low_rank_dim = a_low_rank_dim
|
61 |
+
self.v_low_rank_dim = v_low_rank_dim
|
62 |
+
self.hidden_act = hidden_act
|
63 |
+
self.max_position_embeddings = max_position_embeddings
|
64 |
+
self.norm_bias = norm_bias
|
65 |
+
self.norm_eps = norm_eps
|
66 |
+
self.attn = attn
|
67 |
+
self.use_cache = use_cache
|
68 |
+
self.initializer_range = initializer_range
|
69 |
+
self.fuse_norm = fuse_norm
|
70 |
+
self.fuse_cross_entropy = fuse_cross_entropy
|
71 |
+
self.vocab_size = vocab_size
|
72 |
+
|
73 |
+
if attn is not None:
|
74 |
+
if not isinstance(attn, Dict):
|
75 |
+
raise ValueError("attn must be a dictionary")
|
76 |
+
if 'layers' not in attn:
|
77 |
+
raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
|
78 |
+
if 'num_heads' not in attn:
|
79 |
+
raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
|
80 |
+
attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
|
81 |
+
attn['qkv_bias'] = attn.get('qkv_bias', False)
|
82 |
+
attn['window_size'] = attn.get('window_size', None)
|
83 |
+
attn['rope_theta'] = attn.get('rope_theta', 10000.)
|
84 |
+
|
85 |
+
super().__init__(
|
86 |
+
pad_token_id=pad_token_id,
|
87 |
+
bos_token_id=bos_token_id,
|
88 |
+
eos_token_id=eos_token_id,
|
89 |
+
tie_word_embeddings=tie_word_embeddings,
|
90 |
+
**kwargs,
|
91 |
+
)
|
trained_190k_steps/generation_config.json
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_from_model_config": true,
|
3 |
+
"bos_token_id": 0,
|
4 |
+
"eos_token_id": 0,
|
5 |
+
"transformers_version": "4.52.4"
|
6 |
+
}
|
trained_190k_steps/hf_rwkv_tokenizer.py
ADDED
@@ -0,0 +1,280 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 The HuggingFace Inc. team.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""Tokenization classes for RWKV."""
|
16 |
+
|
17 |
+
import os
|
18 |
+
import re
|
19 |
+
from typing import TYPE_CHECKING, List, Optional, Tuple
|
20 |
+
|
21 |
+
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
|
22 |
+
from transformers.utils import logging
|
23 |
+
|
24 |
+
|
25 |
+
if TYPE_CHECKING:
|
26 |
+
pass
|
27 |
+
|
28 |
+
logger = logging.get_logger(__name__)
|
29 |
+
|
30 |
+
|
31 |
+
VOCAB_FILES_NAMES = {
|
32 |
+
"vocab_file": "rwkv_vocab_v20230424.txt",
|
33 |
+
}
|
34 |
+
|
35 |
+
class TRIE:
|
36 |
+
__slots__ = tuple("ch,to,values,front".split(","))
|
37 |
+
to: list
|
38 |
+
values: set
|
39 |
+
|
40 |
+
def __init__(self, front=None, ch=None):
|
41 |
+
self.ch = ch
|
42 |
+
self.to = [None for ch in range(256)]
|
43 |
+
self.values = set()
|
44 |
+
self.front = front
|
45 |
+
|
46 |
+
def __repr__(self):
|
47 |
+
fr = self
|
48 |
+
ret = []
|
49 |
+
while fr != None:
|
50 |
+
if fr.ch != None:
|
51 |
+
ret.append(fr.ch)
|
52 |
+
fr = fr.front
|
53 |
+
return "<TRIE %s %s>" % (ret[::-1], self.values)
|
54 |
+
|
55 |
+
def add(self, key: bytes, idx: int = 0, val=None):
|
56 |
+
if idx == len(key):
|
57 |
+
if val is None:
|
58 |
+
val = key
|
59 |
+
self.values.add(val)
|
60 |
+
return self
|
61 |
+
ch = key[idx]
|
62 |
+
if self.to[ch] is None:
|
63 |
+
self.to[ch] = TRIE(front=self, ch=ch)
|
64 |
+
return self.to[ch].add(key, idx=idx + 1, val=val)
|
65 |
+
|
66 |
+
def find_longest(self, key: bytes, idx: int = 0):
|
67 |
+
u: TRIE = self
|
68 |
+
ch: int = key[idx]
|
69 |
+
|
70 |
+
while u.to[ch] is not None:
|
71 |
+
u = u.to[ch]
|
72 |
+
idx += 1
|
73 |
+
if u.values:
|
74 |
+
ret = idx, u, u.values
|
75 |
+
if idx == len(key):
|
76 |
+
break
|
77 |
+
ch = key[idx]
|
78 |
+
return ret
|
79 |
+
|
80 |
+
|
81 |
+
class RWKV_TOKENIZER:
    """Greedy longest-match byte tokenizer driven by a TRIE built from a
    plain-text vocab file.

    Each vocab line has the form ``<idx> <repr-of-token> <byte-length>``,
    where the token field is the Python repr of a str or bytes literal.
    Lines are assumed to be pre-sorted by the vocab producer.
    """

    def __init__(self, file_name):
        # Local import: only needed here to parse the repr'd token literals.
        import ast

        self.idx2token = {}
        with open(file_name, "r", encoding="utf-8") as f:
            lines = f.readlines()
        for l in lines:
            idx = int(l[: l.index(" ")])
            # Safely evaluate the token's repr (str or bytes literal);
            # literal_eval accepts only literals, unlike the unsafe eval().
            x = ast.literal_eval(l[l.index(" ") : l.rindex(" ")])
            x = x.encode("utf-8") if isinstance(x, str) else x
            assert isinstance(x, bytes)
            # The trailing field is the token's byte length; sanity-check it.
            assert len(x) == int(l[l.rindex(" ") :])
            self.idx2token[idx] = x

        self.token2idx = {}
        for k, v in self.idx2token.items():
            self.token2idx[v] = int(k)

        # Build the byte trie used for greedy longest-match encoding.
        self.root = TRIE()
        for t, i in self.token2idx.items():
            _ = self.root.add(t, val=(t, i))

    def encodeBytes(self, src: bytes):
        """Encode raw bytes into a list of token ids via greedy longest match."""
        idx: int = 0
        tokens = []
        while idx < len(src):
            _idx: int = idx
            idx, _, values = self.root.find_longest(src, idx)
            assert idx != _idx  # every step must consume at least one byte
            _, token = next(iter(values))
            tokens.append(token)
        return tokens

    def decodeBytes(self, tokens):
        """Map token ids back to their byte sequences and concatenate them."""
        return b"".join(map(lambda i: self.idx2token[i], tokens))

    def encode(self, src):
        """Encode a string or a list of strings.

        Always returns a list of token-id lists (batch shape), matching the
        original behavior for valid inputs; unsupported input types now
        raise instead of silently returning None.
        """
        if isinstance(src, str):
            return [self.encodeBytes(src.encode("utf-8"))]
        elif isinstance(src, list):
            return [self.encodeBytes(s.encode("utf-8")) for s in src]
        raise TypeError("encode expects a str or a list of str")

    def decode(self, tokens):
        """Decode a batch (list) of token-id lists back into strings."""
        return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]

    def printTokens(self, tokens):
        """Debug helper: print each token's repr followed by its id."""
        for i in tokens:
            s = self.idx2token[i]
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                pass  # token is not valid UTF-8 on its own; show raw bytes
            print(f"{repr(s)}{i}", end=" ")
        print()
143 |
+
class RwkvTokenizer(PreTrainedTokenizer):
    """Hugging Face tokenizer wrapper around the trie-based RWKV_TOKENIZER.

    Token "strings" handled by this class are raw bytes from the RWKV vocab;
    `_convert_token_to_id` is the identity because `_tokenize` already
    produces integer ids.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<|rwkv_tokenizer_end_of_text|>",
        eos_token="<|rwkv_tokenizer_end_of_text|>",
        unk_token="<|rwkv_tokenizer_end_of_text|>",
        **kwargs,
    ):
        if not os.path.isfile(vocab_file):
            raise ValueError(
                f"Can't find a vocabulary file at path '{vocab_file}'."
            )

        # Optional flag controlling whether a BOS token is prepended.
        if "add_bos_token" in kwargs:
            self.add_bos_token = kwargs["add_bos_token"]
        else:
            self.add_bos_token = False
        # The trie tokenizer parses the vocab file itself; no need to read
        # the file a second time here.
        self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
        vocab = self.trie_tokenizer.token2idx
        self.encoder = vocab
        self.decoder = {v: k for k, v in vocab.items()}
        # Id 0 is reserved for the BOS/EOS/unk sentinel token.
        self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
        super().__init__(
            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        # Copy first so merging added tokens does not mutate the shared
        # encoder dict (the original updated self.encoder in place).
        vocab = dict(self.encoder)
        vocab.update(self.added_tokens_encoder)
        vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))
        return vocab

    def _tokenize(self, text, split_special_tokens=False):
        # The trie encoder returns a batch; take the single element.
        return self.trie_tokenizer.encode(text)[0]

    def _convert_token_to_id(self, token):
        # _tokenize already yields integer ids, so this is the identity.
        return token

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (byte) using the vocab."""
        token = self.decoder.get(index, self.unk_token)
        if isinstance(token, (bytes)):
            token = token.decode("utf-8", errors="replace")
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (bytes) in a single string. Additional tokens are encoded to bytes"""
        out_string = b"".join(
            [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
        ).decode("utf-8")
        return out_string

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        """Write the vocabulary, one token per line ordered by id, and return
        the written file path.

        NOTE(review): bytes tokens are written via str(), producing "b'...'"
        lines, which is not the loadable vocab format — preserved as-is.
        """
        index = 0
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
            )
        else:
            vocab_file = (
                filename_prefix + "-" if filename_prefix else ""
            ) + save_directory
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(
                self.encoder.items(), key=lambda kv: kv[1]
            ):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!"
                    )
                    index = token_index
                writer.write(str(token) + "\n")
                index += 1
        return (vocab_file,)

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        """Prepend the BOS id (if enabled) to one or two sequences."""
        if self.add_bos_token:
            bos_token_ids = [self.bos_token_id]
        else:
            bos_token_ids = []

        output = bos_token_ids + token_ids_0

        if token_ids_1 is None:
            return output

        return output + bos_token_ids + token_ids_1

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        if not self.add_bos_token:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=False,
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0))
        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
280 |
+
|
trained_190k_steps/kafka.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b7928aeaf90600d6a014a5fececdc59cdf0e2971db327a0cf56b922b7cd8f8a7
|
3 |
+
size 265524
|
trained_190k_steps/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:77c5578b1aaab351a1c89b8695ec465456268a5586020c5046fcd8544328a002
|
3 |
+
size 809355976
|
trained_190k_steps/modeling_rwkvspeech.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Re-export the RWKV-7 speech classes under the generic names that the
# Transformers auto-class machinery looks up for this architecture.
from spark_llm import RWKV7SpeechConfig, RWKV7ForSpeech
from rwkvfla.models.rwkv7 import RWKV7Model

# Alias the speech-specialized config/model to the standard RWKV-7 names.
RWKV7Config = RWKV7SpeechConfig
RWKV7ForCausalLM = RWKV7ForSpeech
RWKV7Model = RWKV7Model  # re-exported unchanged
|
trained_190k_steps/output.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1b56b3b68f11fdb8539634bb27312f1346b3876ede818d311f6c89dd8b8e94dd
|
3 |
+
size 499244
|
trained_190k_steps/output_0.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:559514056f3c45e9362937acedf1a6a4def27f14443842ff1193dad9c6a274a3
|
3 |
+
size 439724
|
trained_190k_steps/output_1.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7e4afaaa5498043eff67996feb3310b0b02d1fb62cdf52c7c93fcbd6c936be9d
|
3 |
+
size 228524
|
trained_190k_steps/output_10.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e28726c1f0d4199c061bebb5770c14f448f177fe1d6d9341596bd112b5b0fc9f
|
3 |
+
size 133804
|
trained_190k_steps/output_2.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b80e10f88d2214da41180723783bed9bdfced387834377b92e6dad064174b0b1
|
3 |
+
size 150444
|
trained_190k_steps/output_3.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbbef7cad347491d556cf82287293c353588ec2bc0c2247110766004d6c1dd2e
|
3 |
+
size 586284
|
trained_190k_steps/output_4.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d00b4160fc563d9149a8bc834186542807f853d270db3b27a2d2c39a004edc4
|
3 |
+
size 209964
|
trained_190k_steps/output_5.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1049970bb90cf801a9e2ffac5f66715a277928b495900e4a7ce940f0c18f49c8
|
3 |
+
size 256044
|
trained_190k_steps/output_6.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c46bea5a3c52f6832612f92f0f89c8aa8ea9f9b15b0d4afcd3152c24037692a
|
3 |
+
size 184364
|
trained_190k_steps/output_7.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:628147e65cdd71c534ff65c47d4d80a21588522c78a1ddfae2ec1c1d0d83b103
|
3 |
+
size 307244
|
trained_190k_steps/output_8.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:040f941a4a1962d4c8b8cad6d1b0b6b0db3b36a955d2801cd1c59992fb6053c9
|
3 |
+
size 171564
|
trained_190k_steps/output_9.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f87822f47fe7cf8626bb0e9de61455e862063cfdbfc8443e7e2f7aee93d2b169
|
3 |
+
size 241964
|
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_103617.json
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"test_info": {
|
3 |
+
"timestamp": "20250714_103617",
|
4 |
+
"device": "cuda:2",
|
5 |
+
"model_path": "/home/yueyulin/tmp/respark",
|
6 |
+
"batch_size": 4
|
7 |
+
},
|
8 |
+
"statistics": {
|
9 |
+
"total_tests": 20,
|
10 |
+
"successful_tests": 20,
|
11 |
+
"failed_tests": 0,
|
12 |
+
"batch_size": 4,
|
13 |
+
"total_batches": 5,
|
14 |
+
"total_processing_time": 100.95156717300415,
|
15 |
+
"total_audio_length": 123.94000000000001,
|
16 |
+
"total_rtf": 1.227717443827294,
|
17 |
+
"avg_rtf": 1.4573588824585872,
|
18 |
+
"avg_processing_time": 5.047578358650208,
|
19 |
+
"avg_audio_length": 6.197000000000001,
|
20 |
+
"min_rtf": 0.3893700157312124,
|
21 |
+
"max_rtf": 2.0969803734054207,
|
22 |
+
"std_rtf": 0.48215195815676065
|
23 |
+
},
|
24 |
+
"detailed_results": [
|
25 |
+
{
|
26 |
+
"index": 1,
|
27 |
+
"batch": 1,
|
28 |
+
"text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
|
29 |
+
"processing_time": 11.5057652592659,
|
30 |
+
"audio_length": 14.06,
|
31 |
+
"rtf": 1.2219960761564388,
|
32 |
+
"output_file": "rtf_test_results/test_001.wav"
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"index": 2,
|
36 |
+
"batch": 1,
|
37 |
+
"text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
|
38 |
+
"processing_time": 11.5057652592659,
|
39 |
+
"audio_length": 6.16,
|
40 |
+
"rtf": 0.5353837716304171,
|
41 |
+
"output_file": "rtf_test_results/test_002.wav"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"index": 3,
|
45 |
+
"batch": 1,
|
46 |
+
"text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
|
47 |
+
"processing_time": 11.5057652592659,
|
48 |
+
"audio_length": 4.48,
|
49 |
+
"rtf": 0.3893700157312124,
|
50 |
+
"output_file": "rtf_test_results/test_003.wav"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"index": 4,
|
54 |
+
"batch": 1,
|
55 |
+
"text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
|
56 |
+
"processing_time": 11.5057652592659,
|
57 |
+
"audio_length": 13.54,
|
58 |
+
"rtf": 1.1768013421876373,
|
59 |
+
"output_file": "rtf_test_results/test_004.wav"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"index": 5,
|
63 |
+
"batch": 2,
|
64 |
+
"text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
|
65 |
+
"processing_time": 4.353760182857513,
|
66 |
+
"audio_length": 4.82,
|
67 |
+
"rtf": 1.1070889983739247,
|
68 |
+
"output_file": "rtf_test_results/test_005.wav"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"index": 6,
|
72 |
+
"batch": 2,
|
73 |
+
"text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
|
74 |
+
"processing_time": 4.353760182857513,
|
75 |
+
"audio_length": 5.9,
|
76 |
+
"rtf": 1.3551504336942233,
|
77 |
+
"output_file": "rtf_test_results/test_006.wav"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"index": 7,
|
81 |
+
"batch": 2,
|
82 |
+
"text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
|
83 |
+
"processing_time": 4.353760182857513,
|
84 |
+
"audio_length": 9.02,
|
85 |
+
"rtf": 2.071772357952863,
|
86 |
+
"output_file": "rtf_test_results/test_007.wav"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"index": 8,
|
90 |
+
"batch": 2,
|
91 |
+
"text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
|
92 |
+
"processing_time": 4.353760182857513,
|
93 |
+
"audio_length": 6.0,
|
94 |
+
"rtf": 1.3781190851127694,
|
95 |
+
"output_file": "rtf_test_results/test_008.wav"
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"index": 9,
|
99 |
+
"batch": 3,
|
100 |
+
"text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
|
101 |
+
"processing_time": 4.711536705493927,
|
102 |
+
"audio_length": 9.88,
|
103 |
+
"rtf": 2.0969803734054207,
|
104 |
+
"output_file": "rtf_test_results/test_009.wav"
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"index": 10,
|
108 |
+
"batch": 3,
|
109 |
+
"text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
|
110 |
+
"processing_time": 4.711536705493927,
|
111 |
+
"audio_length": 5.8,
|
112 |
+
"rtf": 1.2310208669788905,
|
113 |
+
"output_file": "rtf_test_results/test_010.wav"
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"index": 11,
|
117 |
+
"batch": 3,
|
118 |
+
"text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
|
119 |
+
"processing_time": 4.711536705493927,
|
120 |
+
"audio_length": 6.9,
|
121 |
+
"rtf": 1.4644903417507493,
|
122 |
+
"output_file": "rtf_test_results/test_011.wav"
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"index": 12,
|
126 |
+
"batch": 3,
|
127 |
+
"text": "数智技术服务港,让科技触手可及,让创新无处不在。",
|
128 |
+
"processing_time": 4.711536705493927,
|
129 |
+
"audio_length": 3.86,
|
130 |
+
"rtf": 0.8192656114721582,
|
131 |
+
"output_file": "rtf_test_results/test_012.wav"
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"index": 13,
|
135 |
+
"batch": 4,
|
136 |
+
"text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
|
137 |
+
"processing_time": 2.7456793189048767,
|
138 |
+
"audio_length": 4.4,
|
139 |
+
"rtf": 1.602517806687984,
|
140 |
+
"output_file": "rtf_test_results/test_013.wav"
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"index": 14,
|
144 |
+
"batch": 4,
|
145 |
+
"text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
|
146 |
+
"processing_time": 2.7456793189048767,
|
147 |
+
"audio_length": 5.68,
|
148 |
+
"rtf": 2.0687048049972154,
|
149 |
+
"output_file": "rtf_test_results/test_014.wav"
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"index": 15,
|
153 |
+
"batch": 4,
|
154 |
+
"text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
|
155 |
+
"processing_time": 2.7456793189048767,
|
156 |
+
"audio_length": 5.2,
|
157 |
+
"rtf": 1.8938846806312535,
|
158 |
+
"output_file": "rtf_test_results/test_015.wav"
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"index": 16,
|
162 |
+
"batch": 4,
|
163 |
+
"text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
|
164 |
+
"processing_time": 2.7456793189048767,
|
165 |
+
"audio_length": 4.86,
|
166 |
+
"rtf": 1.770053759205364,
|
167 |
+
"output_file": "rtf_test_results/test_016.wav"
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"index": 17,
|
171 |
+
"batch": 5,
|
172 |
+
"text": "计算机视觉技术让机器能够理解和分析图像内容。",
|
173 |
+
"processing_time": 1.9211503267288208,
|
174 |
+
"audio_length": 3.4,
|
175 |
+
"rtf": 1.7697730118752573,
|
176 |
+
"output_file": "rtf_test_results/test_017.wav"
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"index": 18,
|
180 |
+
"batch": 5,
|
181 |
+
"text": "自然语言处理技术使计算机能够理解和生成人类语言。",
|
182 |
+
"processing_time": 1.9211503267288208,
|
183 |
+
"audio_length": 3.9,
|
184 |
+
"rtf": 2.0300337489157365,
|
185 |
+
"output_file": "rtf_test_results/test_018.wav"
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"index": 19,
|
189 |
+
"batch": 5,
|
190 |
+
"text": "语音合成技术将文本转换为自然的语音输出。",
|
191 |
+
"processing_time": 1.9211503267288208,
|
192 |
+
"audio_length": 3.06,
|
193 |
+
"rtf": 1.5927957106877317,
|
194 |
+
"output_file": "rtf_test_results/test_019.wav"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"index": 20,
|
198 |
+
"batch": 5,
|
199 |
+
"text": "大数据分析帮助企业发现隐藏的模式和趋势。",
|
200 |
+
"processing_time": 1.9211503267288208,
|
201 |
+
"audio_length": 3.02,
|
202 |
+
"rtf": 1.5719748517244934,
|
203 |
+
"output_file": "rtf_test_results/test_020.wav"
|
204 |
+
}
|
205 |
+
]
|
206 |
+
}
|
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104243.json
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"test_info": {
|
3 |
+
"timestamp": "20250714_104243",
|
4 |
+
"device": "cuda:2",
|
5 |
+
"model_path": "/home/yueyulin/tmp/respark",
|
6 |
+
"batch_size": 4
|
7 |
+
},
|
8 |
+
"statistics": {
|
9 |
+
"total_tests": 20,
|
10 |
+
"successful_tests": 20,
|
11 |
+
"failed_tests": 0,
|
12 |
+
"batch_size": 4,
|
13 |
+
"total_batches": 5,
|
14 |
+
"total_processing_time": 97.51047849655151,
|
15 |
+
"total_audio_length": 127.58000000000001,
|
16 |
+
"total_rtf": 1.3083722074496016,
|
17 |
+
"avg_batch_rtf": 1.5580788079346588,
|
18 |
+
"avg_batch_processing_time": 19.502095699310303,
|
19 |
+
"avg_audio_length": 6.3790000000000004,
|
20 |
+
"min_batch_rtf": 0.9216863966136871,
|
21 |
+
"max_batch_rtf": 1.9146887765353369,
|
22 |
+
"std_batch_rtf": 0.35584074129598514
|
23 |
+
},
|
24 |
+
"detailed_results": [
|
25 |
+
{
|
26 |
+
"index": 1,
|
27 |
+
"batch": 1,
|
28 |
+
"text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
|
29 |
+
"batch_processing_time": 46.76210927963257,
|
30 |
+
"audio_length": 15.72,
|
31 |
+
"batch_rtf": 0.9216863966136871,
|
32 |
+
"output_file": "rtf_test_results/test_001.wav"
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"index": 2,
|
36 |
+
"batch": 1,
|
37 |
+
"text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
|
38 |
+
"batch_processing_time": 46.76210927963257,
|
39 |
+
"audio_length": 6.66,
|
40 |
+
"batch_rtf": 0.9216863966136871,
|
41 |
+
"output_file": "rtf_test_results/test_002.wav"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"index": 3,
|
45 |
+
"batch": 1,
|
46 |
+
"text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
|
47 |
+
"batch_processing_time": 46.76210927963257,
|
48 |
+
"audio_length": 4.82,
|
49 |
+
"batch_rtf": 0.9216863966136871,
|
50 |
+
"output_file": "rtf_test_results/test_003.wav"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"index": 4,
|
54 |
+
"batch": 1,
|
55 |
+
"text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
|
56 |
+
"batch_processing_time": 46.76210927963257,
|
57 |
+
"audio_length": 15.9,
|
58 |
+
"batch_rtf": 0.9216863966136871,
|
59 |
+
"output_file": "rtf_test_results/test_004.wav"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"index": 5,
|
63 |
+
"batch": 2,
|
64 |
+
"text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
|
65 |
+
"batch_processing_time": 15.214709043502808,
|
66 |
+
"audio_length": 4.66,
|
67 |
+
"batch_rtf": 1.6300015944498414,
|
68 |
+
"output_file": "rtf_test_results/test_005.wav"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"index": 6,
|
72 |
+
"batch": 2,
|
73 |
+
"text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
|
74 |
+
"batch_processing_time": 15.214709043502808,
|
75 |
+
"audio_length": 5.86,
|
76 |
+
"batch_rtf": 1.6300015944498414,
|
77 |
+
"output_file": "rtf_test_results/test_006.wav"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"index": 7,
|
81 |
+
"batch": 2,
|
82 |
+
"text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
|
83 |
+
"batch_processing_time": 15.214709043502808,
|
84 |
+
"audio_length": 8.06,
|
85 |
+
"batch_rtf": 1.6300015944498414,
|
86 |
+
"output_file": "rtf_test_results/test_007.wav"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"index": 8,
|
90 |
+
"batch": 2,
|
91 |
+
"text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
|
92 |
+
"batch_processing_time": 15.214709043502808,
|
93 |
+
"audio_length": 6.22,
|
94 |
+
"batch_rtf": 1.6300015944498414,
|
95 |
+
"output_file": "rtf_test_results/test_008.wav"
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"index": 9,
|
99 |
+
"batch": 3,
|
100 |
+
"text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
|
101 |
+
"batch_processing_time": 17.258368730545044,
|
102 |
+
"audio_length": 8.92,
|
103 |
+
"batch_rtf": 1.4694320416921058,
|
104 |
+
"output_file": "rtf_test_results/test_009.wav"
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"index": 10,
|
108 |
+
"batch": 3,
|
109 |
+
"text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
|
110 |
+
"batch_processing_time": 17.258368730545044,
|
111 |
+
"audio_length": 5.68,
|
112 |
+
"batch_rtf": 1.4694320416921058,
|
113 |
+
"output_file": "rtf_test_results/test_010.wav"
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"index": 11,
|
117 |
+
"batch": 3,
|
118 |
+
"text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
|
119 |
+
"batch_processing_time": 17.258368730545044,
|
120 |
+
"audio_length": 6.86,
|
121 |
+
"batch_rtf": 1.4694320416921058,
|
122 |
+
"output_file": "rtf_test_results/test_011.wav"
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"index": 12,
|
126 |
+
"batch": 3,
|
127 |
+
"text": "数智技术服务港,让科技触手可及,让创新无处不在。",
|
128 |
+
"batch_processing_time": 17.258368730545044,
|
129 |
+
"audio_length": 3.9,
|
130 |
+
"batch_rtf": 1.4694320416921058,
|
131 |
+
"output_file": "rtf_test_results/test_012.wav"
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"index": 13,
|
135 |
+
"batch": 4,
|
136 |
+
"text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
|
137 |
+
"batch_processing_time": 11.172309398651123,
|
138 |
+
"audio_length": 4.96,
|
139 |
+
"batch_rtf": 1.8545852303823243,
|
140 |
+
"output_file": "rtf_test_results/test_013.wav"
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"index": 14,
|
144 |
+
"batch": 4,
|
145 |
+
"text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
|
146 |
+
"batch_processing_time": 11.172309398651123,
|
147 |
+
"audio_length": 5.72,
|
148 |
+
"batch_rtf": 1.8545852303823243,
|
149 |
+
"output_file": "rtf_test_results/test_014.wav"
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"index": 15,
|
153 |
+
"batch": 4,
|
154 |
+
"text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
|
155 |
+
"batch_processing_time": 11.172309398651123,
|
156 |
+
"audio_length": 5.5,
|
157 |
+
"batch_rtf": 1.8545852303823243,
|
158 |
+
"output_file": "rtf_test_results/test_015.wav"
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"index": 16,
|
162 |
+
"batch": 4,
|
163 |
+
"text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
|
164 |
+
"batch_processing_time": 11.172309398651123,
|
165 |
+
"audio_length": 4.54,
|
166 |
+
"batch_rtf": 1.8545852303823243,
|
167 |
+
"output_file": "rtf_test_results/test_016.wav"
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"index": 17,
|
171 |
+
"batch": 5,
|
172 |
+
"text": "计算机视觉技术让机器能够理解和分析图像内容。",
|
173 |
+
"batch_processing_time": 7.102982044219971,
|
174 |
+
"audio_length": 3.6,
|
175 |
+
"batch_rtf": 1.9146887765353369,
|
176 |
+
"output_file": "rtf_test_results/test_017.wav"
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"index": 18,
|
180 |
+
"batch": 5,
|
181 |
+
"text": "自然语言处理技术使计算机能够理解和生成人类语言。",
|
182 |
+
"batch_processing_time": 7.102982044219971,
|
183 |
+
"audio_length": 3.44,
|
184 |
+
"batch_rtf": 1.9146887765353369,
|
185 |
+
"output_file": "rtf_test_results/test_018.wav"
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"index": 19,
|
189 |
+
"batch": 5,
|
190 |
+
"text": "语音合成技术将文本转换为自然的语音输出。",
|
191 |
+
"batch_processing_time": 7.102982044219971,
|
192 |
+
"audio_length": 3.42,
|
193 |
+
"batch_rtf": 1.9146887765353369,
|
194 |
+
"output_file": "rtf_test_results/test_019.wav"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"index": 20,
|
198 |
+
"batch": 5,
|
199 |
+
"text": "大数据分析帮助企业发现隐藏的模式和趋势。",
|
200 |
+
"batch_processing_time": 7.102982044219971,
|
201 |
+
"audio_length": 3.14,
|
202 |
+
"batch_rtf": 1.9146887765353369,
|
203 |
+
"output_file": "rtf_test_results/test_020.wav"
|
204 |
+
}
|
205 |
+
]
|
206 |
+
}
|
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104655.json
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"test_info": {
|
3 |
+
"timestamp": "20250714_104655",
|
4 |
+
"device": "cuda:2",
|
5 |
+
"model_path": "/home/yueyulin/tmp/respark",
|
6 |
+
"batch_size": 4
|
7 |
+
},
|
8 |
+
"statistics": {
|
9 |
+
"total_tests": 20,
|
10 |
+
"successful_tests": 20,
|
11 |
+
"failed_tests": 0,
|
12 |
+
"batch_size": 4,
|
13 |
+
"total_batches": 5,
|
14 |
+
"total_processing_time": 97.07794642448425,
|
15 |
+
"total_audio_length": 126.58,
|
16 |
+
"total_rtf": 0.7669295814858924,
|
17 |
+
"avg_batch_rtf": 0.6971107903490739,
|
18 |
+
"avg_batch_processing_time": 19.41558928489685,
|
19 |
+
"avg_audio_length": 6.328999999999999,
|
20 |
+
"min_batch_rtf": 0.5344265722669661,
|
21 |
+
"max_batch_rtf": 1.088588070754268,
|
22 |
+
"std_batch_rtf": 0.20383940965077615
|
23 |
+
},
|
24 |
+
"detailed_results": [
|
25 |
+
{
|
26 |
+
"index": 1,
|
27 |
+
"batch": 1,
|
28 |
+
"text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
|
29 |
+
"batch_processing_time": 45.067546129226685,
|
30 |
+
"audio_length": 14.48,
|
31 |
+
"batch_rtf": 1.088588070754268,
|
32 |
+
"output_file": "rtf_test_results/test_001.wav"
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"index": 2,
|
36 |
+
"batch": 1,
|
37 |
+
"text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
|
38 |
+
"batch_processing_time": 45.067546129226685,
|
39 |
+
"audio_length": 6.24,
|
40 |
+
"batch_rtf": 1.088588070754268,
|
41 |
+
"output_file": "rtf_test_results/test_002.wav"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"index": 3,
|
45 |
+
"batch": 1,
|
46 |
+
"text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
|
47 |
+
"batch_processing_time": 45.067546129226685,
|
48 |
+
"audio_length": 5.6,
|
49 |
+
"batch_rtf": 1.088588070754268,
|
50 |
+
"output_file": "rtf_test_results/test_003.wav"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"index": 4,
|
54 |
+
"batch": 1,
|
55 |
+
"text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
|
56 |
+
"batch_processing_time": 45.067546129226685,
|
57 |
+
"audio_length": 15.08,
|
58 |
+
"batch_rtf": 1.088588070754268,
|
59 |
+
"output_file": "rtf_test_results/test_004.wav"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"index": 5,
|
63 |
+
"batch": 2,
|
64 |
+
"text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
|
65 |
+
"batch_processing_time": 16.357284545898438,
|
66 |
+
"audio_length": 4.8,
|
67 |
+
"batch_rtf": 0.6558654589373872,
|
68 |
+
"output_file": "rtf_test_results/test_005.wav"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"index": 6,
|
72 |
+
"batch": 2,
|
73 |
+
"text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
|
74 |
+
"batch_processing_time": 16.357284545898438,
|
75 |
+
"audio_length": 5.84,
|
76 |
+
"batch_rtf": 0.6558654589373872,
|
77 |
+
"output_file": "rtf_test_results/test_006.wav"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"index": 7,
|
81 |
+
"batch": 2,
|
82 |
+
"text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
|
83 |
+
"batch_processing_time": 16.357284545898438,
|
84 |
+
"audio_length": 8.6,
|
85 |
+
"batch_rtf": 0.6558654589373872,
|
86 |
+
"output_file": "rtf_test_results/test_007.wav"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"index": 8,
|
90 |
+
"batch": 2,
|
91 |
+
"text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
|
92 |
+
"batch_processing_time": 16.357284545898438,
|
93 |
+
"audio_length": 5.7,
|
94 |
+
"batch_rtf": 0.6558654589373872,
|
95 |
+
"output_file": "rtf_test_results/test_008.wav"
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"index": 9,
|
99 |
+
"batch": 3,
|
100 |
+
"text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
|
101 |
+
"batch_processing_time": 16.95064425468445,
|
102 |
+
"audio_length": 9.0,
|
103 |
+
"batch_rtf": 0.6694567241186591,
|
104 |
+
"output_file": "rtf_test_results/test_009.wav"
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"index": 10,
|
108 |
+
"batch": 3,
|
109 |
+
"text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
|
110 |
+
"batch_processing_time": 16.95064425468445,
|
111 |
+
"audio_length": 5.72,
|
112 |
+
"batch_rtf": 0.6694567241186591,
|
113 |
+
"output_file": "rtf_test_results/test_010.wav"
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"index": 11,
|
117 |
+
"batch": 3,
|
118 |
+
"text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
|
119 |
+
"batch_processing_time": 16.95064425468445,
|
120 |
+
"audio_length": 6.8,
|
121 |
+
"batch_rtf": 0.6694567241186591,
|
122 |
+
"output_file": "rtf_test_results/test_011.wav"
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"index": 12,
|
126 |
+
"batch": 3,
|
127 |
+
"text": "数智技术服务港,让科技触手可及,让创新无处不在。",
|
128 |
+
"batch_processing_time": 16.95064425468445,
|
129 |
+
"audio_length": 3.8,
|
130 |
+
"batch_rtf": 0.6694567241186591,
|
131 |
+
"output_file": "rtf_test_results/test_012.wav"
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"index": 13,
|
135 |
+
"batch": 4,
|
136 |
+
"text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
|
137 |
+
"batch_processing_time": 10.945056200027466,
|
138 |
+
"audio_length": 4.72,
|
139 |
+
"batch_rtf": 0.5344265722669661,
|
140 |
+
"output_file": "rtf_test_results/test_013.wav"
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"index": 14,
|
144 |
+
"batch": 4,
|
145 |
+
"text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
|
146 |
+
"batch_processing_time": 10.945056200027466,
|
147 |
+
"audio_length": 5.76,
|
148 |
+
"batch_rtf": 0.5344265722669661,
|
149 |
+
"output_file": "rtf_test_results/test_014.wav"
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"index": 15,
|
153 |
+
"batch": 4,
|
154 |
+
"text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
|
155 |
+
"batch_processing_time": 10.945056200027466,
|
156 |
+
"audio_length": 4.98,
|
157 |
+
"batch_rtf": 0.5344265722669661,
|
158 |
+
"output_file": "rtf_test_results/test_015.wav"
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"index": 16,
|
162 |
+
"batch": 4,
|
163 |
+
"text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
|
164 |
+
"batch_processing_time": 10.945056200027466,
|
165 |
+
"audio_length": 5.02,
|
166 |
+
"batch_rtf": 0.5344265722669661,
|
167 |
+
"output_file": "rtf_test_results/test_016.wav"
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"index": 17,
|
171 |
+
"batch": 5,
|
172 |
+
"text": "计算机视觉技术让机器能够理解和分析图像内容。",
|
173 |
+
"batch_processing_time": 7.757415294647217,
|
174 |
+
"audio_length": 3.82,
|
175 |
+
"batch_rtf": 0.5372171256680899,
|
176 |
+
"output_file": "rtf_test_results/test_017.wav"
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"index": 18,
|
180 |
+
"batch": 5,
|
181 |
+
"text": "自然语言处理技术使计算机能够理解和生成人类语言。",
|
182 |
+
"batch_processing_time": 7.757415294647217,
|
183 |
+
"audio_length": 3.68,
|
184 |
+
"batch_rtf": 0.5372171256680899,
|
185 |
+
"output_file": "rtf_test_results/test_018.wav"
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"index": 19,
|
189 |
+
"batch": 5,
|
190 |
+
"text": "语音合成技术将文本转换为自然的语音输出。",
|
191 |
+
"batch_processing_time": 7.757415294647217,
|
192 |
+
"audio_length": 3.34,
|
193 |
+
"batch_rtf": 0.5372171256680899,
|
194 |
+
"output_file": "rtf_test_results/test_019.wav"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"index": 20,
|
198 |
+
"batch": 5,
|
199 |
+
"text": "大数据分析帮助企业发现隐藏的模式和趋势。",
|
200 |
+
"batch_processing_time": 7.757415294647217,
|
201 |
+
"audio_length": 3.6,
|
202 |
+
"batch_rtf": 0.5372171256680899,
|
203 |
+
"output_file": "rtf_test_results/test_020.wav"
|
204 |
+
}
|
205 |
+
]
|
206 |
+
}
|
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104853.json
ADDED
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"test_info": {
|
3 |
+
"timestamp": "20250714_104853",
|
4 |
+
"device": "cuda:2",
|
5 |
+
"model_path": "/home/yueyulin/tmp/respark",
|
6 |
+
"batch_size": 8
|
7 |
+
},
|
8 |
+
"statistics": {
|
9 |
+
"total_tests": 20,
|
10 |
+
"successful_tests": 20,
|
11 |
+
"failed_tests": 0,
|
12 |
+
"batch_size": 8,
|
13 |
+
"total_batches": 3,
|
14 |
+
"total_processing_time": 70.2043297290802,
|
15 |
+
"total_audio_length": 124.24,
|
16 |
+
"total_rtf": 0.5650702650441098,
|
17 |
+
"avg_batch_rtf": 0.5415599169688988,
|
18 |
+
"avg_batch_processing_time": 26.568262767791747,
|
19 |
+
"avg_audio_length": 6.212,
|
20 |
+
"min_batch_rtf": 0.3969690210187983,
|
21 |
+
"max_batch_rtf": 0.6930763216404902,
|
22 |
+
"std_batch_rtf": 0.13260418306880234
|
23 |
+
},
|
24 |
+
"detailed_results": [
|
25 |
+
{
|
26 |
+
"index": 1,
|
27 |
+
"batch": 1,
|
28 |
+
"text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
|
29 |
+
"batch_processing_time": 44.49549984931946,
|
30 |
+
"audio_length": 14.36,
|
31 |
+
"batch_rtf": 0.6930763216404902,
|
32 |
+
"output_file": "rtf_test_results/test_001.wav"
|
33 |
+
},
|
34 |
+
{
|
35 |
+
"index": 2,
|
36 |
+
"batch": 1,
|
37 |
+
"text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
|
38 |
+
"batch_processing_time": 44.49549984931946,
|
39 |
+
"audio_length": 5.54,
|
40 |
+
"batch_rtf": 0.6930763216404902,
|
41 |
+
"output_file": "rtf_test_results/test_002.wav"
|
42 |
+
},
|
43 |
+
{
|
44 |
+
"index": 3,
|
45 |
+
"batch": 1,
|
46 |
+
"text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
|
47 |
+
"batch_processing_time": 44.49549984931946,
|
48 |
+
"audio_length": 4.2,
|
49 |
+
"batch_rtf": 0.6930763216404902,
|
50 |
+
"output_file": "rtf_test_results/test_003.wav"
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"index": 4,
|
54 |
+
"batch": 1,
|
55 |
+
"text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
|
56 |
+
"batch_processing_time": 44.49549984931946,
|
57 |
+
"audio_length": 14.58,
|
58 |
+
"batch_rtf": 0.6930763216404902,
|
59 |
+
"output_file": "rtf_test_results/test_004.wav"
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"index": 5,
|
63 |
+
"batch": 1,
|
64 |
+
"text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
|
65 |
+
"batch_processing_time": 44.49549984931946,
|
66 |
+
"audio_length": 4.62,
|
67 |
+
"batch_rtf": 0.6930763216404902,
|
68 |
+
"output_file": "rtf_test_results/test_005.wav"
|
69 |
+
},
|
70 |
+
{
|
71 |
+
"index": 6,
|
72 |
+
"batch": 1,
|
73 |
+
"text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
|
74 |
+
"batch_processing_time": 44.49549984931946,
|
75 |
+
"audio_length": 7.0,
|
76 |
+
"batch_rtf": 0.6930763216404902,
|
77 |
+
"output_file": "rtf_test_results/test_006.wav"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"index": 7,
|
81 |
+
"batch": 1,
|
82 |
+
"text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
|
83 |
+
"batch_processing_time": 44.49549984931946,
|
84 |
+
"audio_length": 8.26,
|
85 |
+
"batch_rtf": 0.6930763216404902,
|
86 |
+
"output_file": "rtf_test_results/test_007.wav"
|
87 |
+
},
|
88 |
+
{
|
89 |
+
"index": 8,
|
90 |
+
"batch": 1,
|
91 |
+
"text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
|
92 |
+
"batch_processing_time": 44.49549984931946,
|
93 |
+
"audio_length": 5.64,
|
94 |
+
"batch_rtf": 0.6930763216404902,
|
95 |
+
"output_file": "rtf_test_results/test_008.wav"
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"index": 9,
|
99 |
+
"batch": 2,
|
100 |
+
"text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
|
101 |
+
"batch_processing_time": 18.141484260559082,
|
102 |
+
"audio_length": 9.36,
|
103 |
+
"batch_rtf": 0.3969690210187983,
|
104 |
+
"output_file": "rtf_test_results/test_009.wav"
|
105 |
+
},
|
106 |
+
{
|
107 |
+
"index": 10,
|
108 |
+
"batch": 2,
|
109 |
+
"text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
|
110 |
+
"batch_processing_time": 18.141484260559082,
|
111 |
+
"audio_length": 5.6,
|
112 |
+
"batch_rtf": 0.3969690210187983,
|
113 |
+
"output_file": "rtf_test_results/test_010.wav"
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"index": 11,
|
117 |
+
"batch": 2,
|
118 |
+
"text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
|
119 |
+
"batch_processing_time": 18.141484260559082,
|
120 |
+
"audio_length": 6.96,
|
121 |
+
"batch_rtf": 0.3969690210187983,
|
122 |
+
"output_file": "rtf_test_results/test_011.wav"
|
123 |
+
},
|
124 |
+
{
|
125 |
+
"index": 12,
|
126 |
+
"batch": 2,
|
127 |
+
"text": "数智技术服务港,让科技触手可及,让创新无处不在。",
|
128 |
+
"batch_processing_time": 18.141484260559082,
|
129 |
+
"audio_length": 3.88,
|
130 |
+
"batch_rtf": 0.3969690210187983,
|
131 |
+
"output_file": "rtf_test_results/test_012.wav"
|
132 |
+
},
|
133 |
+
{
|
134 |
+
"index": 13,
|
135 |
+
"batch": 2,
|
136 |
+
"text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
|
137 |
+
"batch_processing_time": 18.141484260559082,
|
138 |
+
"audio_length": 4.86,
|
139 |
+
"batch_rtf": 0.3969690210187983,
|
140 |
+
"output_file": "rtf_test_results/test_013.wav"
|
141 |
+
},
|
142 |
+
{
|
143 |
+
"index": 14,
|
144 |
+
"batch": 2,
|
145 |
+
"text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
|
146 |
+
"batch_processing_time": 18.141484260559082,
|
147 |
+
"audio_length": 4.92,
|
148 |
+
"batch_rtf": 0.3969690210187983,
|
149 |
+
"output_file": "rtf_test_results/test_014.wav"
|
150 |
+
},
|
151 |
+
{
|
152 |
+
"index": 15,
|
153 |
+
"batch": 2,
|
154 |
+
"text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
|
155 |
+
"batch_processing_time": 18.141484260559082,
|
156 |
+
"audio_length": 5.7,
|
157 |
+
"batch_rtf": 0.3969690210187983,
|
158 |
+
"output_file": "rtf_test_results/test_015.wav"
|
159 |
+
},
|
160 |
+
{
|
161 |
+
"index": 16,
|
162 |
+
"batch": 2,
|
163 |
+
"text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
|
164 |
+
"batch_processing_time": 18.141484260559082,
|
165 |
+
"audio_length": 4.42,
|
166 |
+
"batch_rtf": 0.3969690210187983,
|
167 |
+
"output_file": "rtf_test_results/test_016.wav"
|
168 |
+
},
|
169 |
+
{
|
170 |
+
"index": 17,
|
171 |
+
"batch": 3,
|
172 |
+
"text": "计算机视觉技术让机器能够理解和分析图像内容。",
|
173 |
+
"batch_processing_time": 7.56734561920166,
|
174 |
+
"audio_length": 3.74,
|
175 |
+
"batch_rtf": 0.5277088995259177,
|
176 |
+
"output_file": "rtf_test_results/test_017.wav"
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"index": 18,
|
180 |
+
"batch": 3,
|
181 |
+
"text": "自然语言处理技术使计算机能够理解和生成人类语言。",
|
182 |
+
"batch_processing_time": 7.56734561920166,
|
183 |
+
"audio_length": 3.86,
|
184 |
+
"batch_rtf": 0.5277088995259177,
|
185 |
+
"output_file": "rtf_test_results/test_018.wav"
|
186 |
+
},
|
187 |
+
{
|
188 |
+
"index": 19,
|
189 |
+
"batch": 3,
|
190 |
+
"text": "语音合成技术将文本转换为自然的语音输出。",
|
191 |
+
"batch_processing_time": 7.56734561920166,
|
192 |
+
"audio_length": 3.38,
|
193 |
+
"batch_rtf": 0.5277088995259177,
|
194 |
+
"output_file": "rtf_test_results/test_019.wav"
|
195 |
+
},
|
196 |
+
{
|
197 |
+
"index": 20,
|
198 |
+
"batch": 3,
|
199 |
+
"text": "大数据分析帮助企业发现隐藏的模式和趋势。",
|
200 |
+
"batch_processing_time": 7.56734561920166,
|
201 |
+
"audio_length": 3.36,
|
202 |
+
"batch_rtf": 0.5277088995259177,
|
203 |
+
"output_file": "rtf_test_results/test_020.wav"
|
204 |
+
}
|
205 |
+
]
|
206 |
+
}
|
trained_190k_steps/rtf_test_results/test_001.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:acddee462cbe93cfae2db94a0987ab88ed594188ca1bc4ac816c33e91cc9e13d
|
3 |
+
size 459564
|
trained_190k_steps/rtf_test_results/test_002.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6361fee9642fd977e0981276d86d5d58f87100d7309a595035a70731d1671589
|
3 |
+
size 177324
|
trained_190k_steps/rtf_test_results/test_003.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c4feb2c3b9a8e330a0300fbaeef3f6c4a8d7adb39ac22104808a83a4f51e57f2
|
3 |
+
size 134444
|
trained_190k_steps/rtf_test_results/test_004.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f3689553a60a8153d5c34c14701905862c79744a10f8e0e1a12866b08121116
|
3 |
+
size 466604
|
trained_190k_steps/rtf_test_results/test_005.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:caa1ab23c8dc026655270e0a8132e6528794f28eab7416d55f7f915608c16e92
|
3 |
+
size 147884
|
trained_190k_steps/rtf_test_results/test_006.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ec9ee466a4439eff14acab039c2aa5a9effa4743a0846c7bdcb79d33f5358712
|
3 |
+
size 224044
|
trained_190k_steps/rtf_test_results/test_007.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5ff6890b46d01626477bdc6a9908e0bd9647978d76278374391423728299d77
|
3 |
+
size 264364
|
trained_190k_steps/rtf_test_results/test_008.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b79aacacde6ecc15b85a8c98f9a8fcaac3ad0a57a64ec5a7e8d7e18461468852
|
3 |
+
size 180524
|
trained_190k_steps/rtf_test_results/test_009.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:664f58297326d99c6c4c74601674f825623bb854970ade90e1aa76ace913fa0b
|
3 |
+
size 299564
|
trained_190k_steps/rtf_test_results/test_010.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7fe5c8d90a944d990a84dc0488cbc6f5f182f8c92c73487c6e7564956b87da72
|
3 |
+
size 179244
|
trained_190k_steps/rtf_test_results/test_011.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:decd4ce16f819b4f332aeb5abd7eb5920dc09ef1b7760652b59a028657487913
|
3 |
+
size 222764
|
trained_190k_steps/rtf_test_results/test_012.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8ad1cb092212b7ef32d1c801a5432383c5e1b5b9ac56df997c8c3089884cd5c9
|
3 |
+
size 124204
|
trained_190k_steps/rtf_test_results/test_013.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:40cf22639096c82432678578973352c31e07f662f7f4aace8bffd81ecca3559f
|
3 |
+
size 155564
|
trained_190k_steps/rtf_test_results/test_014.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:19b3793fb37bd806ca97ef02463a6c0870cb3e5bb42a6df9815c558ed7a117bb
|
3 |
+
size 157484
|