yueyulin commited on
Commit
fc99023
·
verified ·
1 Parent(s): ee3b868

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +34 -0
  2. trained_190k_steps/.gitignore +16 -0
  3. trained_190k_steps/.vscode/launch.json +17 -0
  4. trained_190k_steps/BATCH_INFERENCE_README.md +30 -0
  5. trained_190k_steps/BiCodec/config.yaml +60 -0
  6. trained_190k_steps/BiCodec/model.safetensors +3 -0
  7. trained_190k_steps/Readme.md +130 -0
  8. trained_190k_steps/Readme_zh.md +130 -0
  9. trained_190k_steps/__init__.py +0 -0
  10. trained_190k_steps/__pycache__/spark_llm.cpython-311.pyc +0 -0
  11. trained_190k_steps/__pycache__/utilities.cpython-311.pyc +0 -0
  12. trained_190k_steps/added_tokens.json +3 -0
  13. trained_190k_steps/config.json +66 -0
  14. trained_190k_steps/config.yaml +7 -0
  15. trained_190k_steps/configuration_rwkv7.py +91 -0
  16. trained_190k_steps/generation_config.json +6 -0
  17. trained_190k_steps/hf_rwkv_tokenizer.py +280 -0
  18. trained_190k_steps/kafka.wav +3 -0
  19. trained_190k_steps/model.safetensors +3 -0
  20. trained_190k_steps/modeling_rwkvspeech.py +6 -0
  21. trained_190k_steps/output.wav +3 -0
  22. trained_190k_steps/output_0.wav +3 -0
  23. trained_190k_steps/output_1.wav +3 -0
  24. trained_190k_steps/output_10.wav +3 -0
  25. trained_190k_steps/output_2.wav +3 -0
  26. trained_190k_steps/output_3.wav +3 -0
  27. trained_190k_steps/output_4.wav +3 -0
  28. trained_190k_steps/output_5.wav +3 -0
  29. trained_190k_steps/output_6.wav +3 -0
  30. trained_190k_steps/output_7.wav +3 -0
  31. trained_190k_steps/output_8.wav +3 -0
  32. trained_190k_steps/output_9.wav +3 -0
  33. trained_190k_steps/rtf_test_results/rtf_test_results_20250714_103617.json +206 -0
  34. trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104243.json +206 -0
  35. trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104655.json +206 -0
  36. trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104853.json +206 -0
  37. trained_190k_steps/rtf_test_results/test_001.wav +3 -0
  38. trained_190k_steps/rtf_test_results/test_002.wav +3 -0
  39. trained_190k_steps/rtf_test_results/test_003.wav +3 -0
  40. trained_190k_steps/rtf_test_results/test_004.wav +3 -0
  41. trained_190k_steps/rtf_test_results/test_005.wav +3 -0
  42. trained_190k_steps/rtf_test_results/test_006.wav +3 -0
  43. trained_190k_steps/rtf_test_results/test_007.wav +3 -0
  44. trained_190k_steps/rtf_test_results/test_008.wav +3 -0
  45. trained_190k_steps/rtf_test_results/test_009.wav +3 -0
  46. trained_190k_steps/rtf_test_results/test_010.wav +3 -0
  47. trained_190k_steps/rtf_test_results/test_011.wav +3 -0
  48. trained_190k_steps/rtf_test_results/test_012.wav +3 -0
  49. trained_190k_steps/rtf_test_results/test_013.wav +3 -0
  50. trained_190k_steps/rtf_test_results/test_014.wav +3 -0
.gitattributes CHANGED
@@ -48,3 +48,37 @@ trained_50_percents/output_6.wav filter=lfs diff=lfs merge=lfs -text
48
  trained_50_percents/output_7.wav filter=lfs diff=lfs merge=lfs -text
49
  trained_50_percents/output_8.wav filter=lfs diff=lfs merge=lfs -text
50
  trained_50_percents/output_9.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  trained_50_percents/output_7.wav filter=lfs diff=lfs merge=lfs -text
49
  trained_50_percents/output_8.wav filter=lfs diff=lfs merge=lfs -text
50
  trained_50_percents/output_9.wav filter=lfs diff=lfs merge=lfs -text
51
+ trained_190k_steps/kafka.wav filter=lfs diff=lfs merge=lfs -text
52
+ trained_190k_steps/output.wav filter=lfs diff=lfs merge=lfs -text
53
+ trained_190k_steps/output_0.wav filter=lfs diff=lfs merge=lfs -text
54
+ trained_190k_steps/output_1.wav filter=lfs diff=lfs merge=lfs -text
55
+ trained_190k_steps/output_10.wav filter=lfs diff=lfs merge=lfs -text
56
+ trained_190k_steps/output_2.wav filter=lfs diff=lfs merge=lfs -text
57
+ trained_190k_steps/output_3.wav filter=lfs diff=lfs merge=lfs -text
58
+ trained_190k_steps/output_4.wav filter=lfs diff=lfs merge=lfs -text
59
+ trained_190k_steps/output_5.wav filter=lfs diff=lfs merge=lfs -text
60
+ trained_190k_steps/output_6.wav filter=lfs diff=lfs merge=lfs -text
61
+ trained_190k_steps/output_7.wav filter=lfs diff=lfs merge=lfs -text
62
+ trained_190k_steps/output_8.wav filter=lfs diff=lfs merge=lfs -text
63
+ trained_190k_steps/output_9.wav filter=lfs diff=lfs merge=lfs -text
64
+ trained_190k_steps/rtf_test_results/test_001.wav filter=lfs diff=lfs merge=lfs -text
65
+ trained_190k_steps/rtf_test_results/test_002.wav filter=lfs diff=lfs merge=lfs -text
66
+ trained_190k_steps/rtf_test_results/test_003.wav filter=lfs diff=lfs merge=lfs -text
67
+ trained_190k_steps/rtf_test_results/test_004.wav filter=lfs diff=lfs merge=lfs -text
68
+ trained_190k_steps/rtf_test_results/test_005.wav filter=lfs diff=lfs merge=lfs -text
69
+ trained_190k_steps/rtf_test_results/test_006.wav filter=lfs diff=lfs merge=lfs -text
70
+ trained_190k_steps/rtf_test_results/test_007.wav filter=lfs diff=lfs merge=lfs -text
71
+ trained_190k_steps/rtf_test_results/test_008.wav filter=lfs diff=lfs merge=lfs -text
72
+ trained_190k_steps/rtf_test_results/test_009.wav filter=lfs diff=lfs merge=lfs -text
73
+ trained_190k_steps/rtf_test_results/test_010.wav filter=lfs diff=lfs merge=lfs -text
74
+ trained_190k_steps/rtf_test_results/test_011.wav filter=lfs diff=lfs merge=lfs -text
75
+ trained_190k_steps/rtf_test_results/test_012.wav filter=lfs diff=lfs merge=lfs -text
76
+ trained_190k_steps/rtf_test_results/test_013.wav filter=lfs diff=lfs merge=lfs -text
77
+ trained_190k_steps/rtf_test_results/test_014.wav filter=lfs diff=lfs merge=lfs -text
78
+ trained_190k_steps/rtf_test_results/test_015.wav filter=lfs diff=lfs merge=lfs -text
79
+ trained_190k_steps/rtf_test_results/test_016.wav filter=lfs diff=lfs merge=lfs -text
80
+ trained_190k_steps/rtf_test_results/test_017.wav filter=lfs diff=lfs merge=lfs -text
81
+ trained_190k_steps/rtf_test_results/test_018.wav filter=lfs diff=lfs merge=lfs -text
82
+ trained_190k_steps/rtf_test_results/test_019.wav filter=lfs diff=lfs merge=lfs -text
83
+ trained_190k_steps/rtf_test_results/test_020.wav filter=lfs diff=lfs merge=lfs -text
84
+ trained_190k_steps/test.wav filter=lfs diff=lfs merge=lfs -text
trained_190k_steps/.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python build artifacts
2
+ __pycache__/
3
+ *.pyc
4
+
5
+ # Environment variables
6
+ .env
7
+
8
+ # Virtual environment
9
+ venv/
10
+
11
+ # Model backups and outputs
12
+ model.fp32.safetensors
13
+ output.wav
14
+
15
+ # Temporary scripts
16
+ check_dtype.py
trained_190k_steps/.vscode/launch.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ // 使用 IntelliSense 了解相关属性。
3
+ // 悬停以查看现有属性的描述。
4
+ // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+
8
+ {
9
+ "name": "Python 调试程序: 当前文件",
10
+ "type": "debugpy",
11
+ "request": "launch",
12
+ "program": "${file}",
13
+ "console": "integratedTerminal",
14
+ "justMyCode": false
15
+ }
16
+ ]
17
+ }
trained_190k_steps/BATCH_INFERENCE_README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 批量推理功能说明
2
+
3
+ 本文档介绍了 ReSpark TTS 模型的批量推理功能,该功能可以显著提高多个文本的语音合成效率。
4
+
5
+ ## 使用方法
6
+
7
+ ### 基本批量推理
8
+ ```python
9
+ from utilities import generate_embeddings_batch
10
+ from tts_batch_infer import generate_speech_batch
11
+
12
+ # 准备文本列表
13
+ texts = [
14
+ "第一个要合成的文本。",
15
+ "第二个要合成的文本。",
16
+ "第三个要合成的文本。"
17
+ ]
18
+
19
+ # 批量生成语音
20
+ wavs = generate_speech_batch(
21
+ model, tokenizer, texts, audio_tokenizer,
22
+ prompt_text="提示文本",
23
+ prompt_audio=prompt_audio,
24
+ device=device
25
+ )
26
+
27
+ # 保存音频文件
28
+ for i, wav in enumerate(wavs):
29
+ sf.write(f'output_{i}.wav', wav, sample_rate)
30
+ ```
trained_190k_steps/BiCodec/config.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio_tokenizer:
2
+ mel_params:
3
+ sample_rate: 16000
4
+ n_fft: 1024
5
+ win_length: 640
6
+ hop_length: 320
7
+ mel_fmin: 10
8
+ mel_fmax: null
9
+ num_mels: 128
10
+
11
+ encoder:
12
+ input_channels: 1024
13
+ vocos_dim: 384
14
+ vocos_intermediate_dim: 2048
15
+ vocos_num_layers: 12
16
+ out_channels: 1024
17
+ sample_ratios: [1,1]
18
+
19
+ decoder:
20
+ input_channel: 1024
21
+ channels: 1536
22
+ rates: [8, 5, 4, 2]
23
+ kernel_sizes: [16,11,8,4]
24
+
25
+ quantizer:
26
+ input_dim: 1024
27
+ codebook_size: 8192
28
+ codebook_dim: 8
29
+ commitment: 0.25
30
+ codebook_loss_weight: 2.0
31
+ use_l2_normlize: True
32
+ threshold_ema_dead_code: 0.2
33
+
34
+ speaker_encoder:
35
+ input_dim: 128
36
+ out_dim: 1024
37
+ latent_dim: 128
38
+ token_num: 32
39
+ fsq_levels: [4, 4, 4, 4, 4, 4]
40
+ fsq_num_quantizers: 1
41
+
42
+ prenet:
43
+ input_channels: 1024
44
+ vocos_dim: 384
45
+ vocos_intermediate_dim: 2048
46
+ vocos_num_layers: 12
47
+ out_channels: 1024
48
+ condition_dim: 1024
49
+ sample_ratios: [1,1]
50
+ use_tanh_at_final: False
51
+
52
+ postnet:
53
+ input_channels: 1024
54
+ vocos_dim: 384
55
+ vocos_intermediate_dim: 2048
56
+ vocos_num_layers: 6
57
+ out_channels: 1024
58
+ use_tanh_at_final: False
59
+
60
+
trained_190k_steps/BiCodec/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9940cd48d4446e4340ced82d234bf5618350dd9f5db900ebe47a4fdb03867ec
3
+ size 625518756
trained_190k_steps/Readme.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # ReSpark TTS Model
6
+
7
+ This repository contains the ReSpark Text-to-Speech (TTS) model, a powerful and efficient model for generating high-quality speech from text. It is based on the RWKV architecture and utilizes the BiCodec tokenizer for audio processing.
8
+
9
+ ## Installation
10
+
11
+ First, install the required dependencies:
12
+
13
+ ```bash
14
+ pip install transformers rwkv-fla torch torchaudio torchvision soundfile numpy librosa omegaconf soxr einx
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ The `tts.py` script provides a complete example of how to use this model for text-to-speech synthesis with voice cloning.
20
+
21
+ ### Running the Test Script
22
+
23
+ To generate speech, simply run the script:
24
+
25
+ ```bash
26
+ python tts.py
27
+ ```
28
+
29
+ ### How it Works
30
+
31
+ The script performs the following steps:
32
+ 1. Loads the pre-trained `AutoModelForCausalLM` and `AutoTokenizer` from the current directory.
33
+ 2. Initializes the `BiCodecTokenizer` for audio encoding and decoding.
34
+ 3. Loads a reference audio file (`kafka.wav`) and its corresponding transcript (`prompt_text`) to provide a voice prompt.
35
+ 4. Resamples the reference audio to match the model's expected sample rate (24000 Hz).
36
+ 5. Takes a target text (`text`) to be synthesized.
37
+ 6. Calls the `generate_speech` function, which generates audio based on the target text and the voice from the reference audio.
38
+ 7. Saves the generated audio to `output.wav`.
39
+
40
+ You can modify the `prompt_text`, `prompt_audio_file`, and `text` variables in `tts.py` to synthesize different text with different voices.
41
+
42
+ ### Example Code (`tts.py`)
43
+
44
+ ```python
45
+ import os
46
+ import sys
47
+ current_dir = os.path.dirname(os.path.abspath(__file__))
48
+ print('add current dir to sys.path', current_dir)
49
+ sys.path.append(current_dir)
50
+ from sparktts.models.audio_tokenizer import BiCodecTokenizer
51
+ from transformers import AutoTokenizer, AutoModelForCausalLM
52
+ import soundfile as sf
53
+ import numpy as np
54
+ import torch
55
+ from utilities import generate_embeddings
56
+
57
+ def generate_speech(model, tokenizer, text, bicodec, prompt_text=None, prompt_audio=None,
58
+ max_new_tokens=3000, do_sample=True, top_k=50, top_p=0.95,
59
+ temperature=1.0, device="cuda:0"):
60
+ """
61
+ Function to generate speech.
62
+ """
63
+ eos_token_id = model.config.vocab_size - 1
64
+
65
+ embeddings = generate_embeddings(
66
+ model=model,
67
+ tokenizer=tokenizer,
68
+ text=text,
69
+ bicodec=bicodec,
70
+ prompt_text=prompt_text,
71
+ prompt_audio=prompt_audio
72
+ )
73
+
74
+ global_tokens = embeddings['global_tokens'].unsqueeze(0)
75
+ model.eval()
76
+
77
+ with torch.no_grad():
78
+ generated_outputs = model.generate(
79
+ inputs_embeds=embeddings['input_embs'],
80
+ attention_mask=torch.ones((1, embeddings['input_embs'].shape[1]),dtype=torch.long,device=device),
81
+ max_new_tokens=max_new_tokens,
82
+ do_sample=do_sample,
83
+ top_k=top_k,
84
+ top_p=top_p,
85
+ temperature=temperature,
86
+ eos_token_id=eos_token_id,
87
+ pad_token_id=tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
88
+ use_cache=True
89
+ )
90
+
91
+ semantic_tokens_tensor = generated_outputs[:,:-1]
92
+
93
+ with torch.no_grad():
94
+ wav = bicodec.detokenize(global_tokens, semantic_tokens_tensor)
95
+
96
+ return wav
97
+
98
+ # --- Main execution ---
99
+ device = 'cuda:0'
100
+
101
+ # Initialize tokenizers and model
102
+ audio_tokenizer = BiCodecTokenizer(model_dir=current_dir, device=device)
103
+ tokenizer = AutoTokenizer.from_pretrained(current_dir, trust_remote_code=True)
104
+ model = AutoModelForCausalLM.from_pretrained(current_dir, trust_remote_code=True)
105
+
106
+ model = model.bfloat16().to(device)
107
+ model.eval()
108
+
109
+ # Prepare prompt audio and text for voice cloning
110
+ prompt_text = "我们并不是通过物理移动手段找到星河的。"
111
+ prompt_audio_file = os.path.join(current_dir, 'kafka.wav')
112
+ prompt_audio, sampling_rate = sf.read(prompt_audio_file)
113
+
114
+ # Resample audio if necessary
115
+ target_sample_rate = audio_tokenizer.config['sample_rate']
116
+ if sampling_rate != target_sample_rate:
117
+ from librosa import resample
118
+ prompt_audio = resample(prompt_audio, orig_sr=sampling_rate, target_sr=target_sample_rate)
119
+ prompt_audio = np.array(prompt_audio, dtype=np.float32)
120
+
121
+ # Text to synthesize
122
+ text = "科学技术是第一生产力,最近 AI的迅猛发展让我们看到了迈向星辰大海的希望。"
123
+
124
+ # Generate speech
125
+ wav = generate_speech(model, tokenizer, text, audio_tokenizer, prompt_audio=prompt_audio, device=device)
126
+
127
+ # Save the output
128
+ sf.write('output.wav', wav, target_sample_rate)
129
+ print("Generated audio saved to output.wav")
130
+ ```
trained_190k_steps/Readme_zh.md ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+
5
+ # ReSpark TTS 模型
6
+
7
+ 本仓库包含 ReSpark 文本转语音 (TTS) 模型,这是一个强大而高效的模型,可以从文本生成高质量的语音。它基于 RWKV 架构,并利用 BiCodec-Tokenizer 进行音频处理。
8
+
9
+ ## 安装
10
+
11
+ 首先,请安装所需的依赖库:
12
+
13
+ ```bash
14
+ pip install transformers rwkv-fla torch torchaudio torchvision soundfile numpy librosa omegaconf soxr einx
15
+ ```
16
+
17
+ ## 使用方法
18
+
19
+ `tts.py` 脚本提供了一个完整的使用该模型进行文本转语音合成(带声音克隆功能)的示例。
20
+
21
+ ### 运行测试脚本
22
+
23
+ 要生成语音,只需运行以下脚本:
24
+
25
+ ```bash
26
+ python tts.py
27
+ ```
28
+
29
+ ### 工作原理
30
+
31
+ 该脚本执行以下步骤:
32
+ 1. 从当前目录加载预训练的 `AutoModelForCausalLM` 和 `AutoTokenizer`。
33
+ 2. 初始化用于音频编码和解码的 `BiCodecTokenizer`。
34
+ 3. 加载一个参考音频文件 (`kafka.wav`) 及其对应的文本 (`prompt_text`) 以提供声音提示(voice prompt)。
35
+ 4. 如果需要,将参考音频重采样以匹配模型期望的采样率 (24000 Hz)。
36
+ 5. 指定一个需要被合成的目标文本 (`text`)。
37
+ 6. 调用 `generate_speech` 函数,该函数会根据目标文本和参考音频中的声音生成音频。
38
+ 7. 将生成的音频保存到 `output.wav`。
39
+
40
+ 您可以修改 `tts.py` 文件中的 `prompt_text`、`prompt_audio_file` 和 `text` 变量,以使用不同的声音合成不同的文本。
41
+
42
+ ### 示例代码 (`tts.py`)
43
+
44
+ ```python
45
+ import os
46
+ import sys
47
+ current_dir = os.path.dirname(os.path.abspath(__file__))
48
+ print('add current dir to sys.path', current_dir)
49
+ sys.path.append(current_dir)
50
+ from sparktts.models.audio_tokenizer import BiCodecTokenizer
51
+ from transformers import AutoTokenizer, AutoModelForCausalLM
52
+ import soundfile as sf
53
+ import numpy as np
54
+ import torch
55
+ from utilities import generate_embeddings
56
+
57
+ def generate_speech(model, tokenizer, text, bicodec, prompt_text=None, prompt_audio=None,
58
+ max_new_tokens=3000, do_sample=True, top_k=50, top_p=0.95,
59
+ temperature=1.0, device="cuda:0"):
60
+ """
61
+ 生成语音的函数
62
+ """
63
+ eos_token_id = model.config.vocab_size - 1
64
+
65
+ embeddings = generate_embeddings(
66
+ model=model,
67
+ tokenizer=tokenizer,
68
+ text=text,
69
+ bicodec=bicodec,
70
+ prompt_text=prompt_text,
71
+ prompt_audio=prompt_audio
72
+ )
73
+
74
+ global_tokens = embeddings['global_tokens'].unsqueeze(0)
75
+ model.eval()
76
+
77
+ with torch.no_grad():
78
+ generated_outputs = model.generate(
79
+ inputs_embeds=embeddings['input_embs'],
80
+ attention_mask=torch.ones((1, embeddings['input_embs'].shape[1]),dtype=torch.long,device=device),
81
+ max_new_tokens=max_new_tokens,
82
+ do_sample=do_sample,
83
+ top_k=top_k,
84
+ top_p=top_p,
85
+ temperature=temperature,
86
+ eos_token_id=eos_token_id,
87
+ pad_token_id=tokenizer.pad_token_id if hasattr(tokenizer, 'pad_token_id') else tokenizer.eos_token_id,
88
+ use_cache=True
89
+ )
90
+
91
+ semantic_tokens_tensor = generated_outputs[:,:-1]
92
+
93
+ with torch.no_grad():
94
+ wav = bicodec.detokenize(global_tokens, semantic_tokens_tensor)
95
+
96
+ return wav
97
+
98
+ # --- 主程序 ---
99
+ device = 'cuda:0'
100
+
101
+ # 初始化分词器和模型
102
+ audio_tokenizer = BiCodecTokenizer(model_dir=current_dir, device=device)
103
+ tokenizer = AutoTokenizer.from_pretrained(current_dir, trust_remote_code=True)
104
+ model = AutoModelForCausalLM.from_pretrained(current_dir, trust_remote_code=True)
105
+
106
+ model = model.bfloat16().to(device)
107
+ model.eval()
108
+
109
+ # 准备用于声音克隆的提示音频和文本
110
+ prompt_text = "我们并不是通过物理移动手段找到星河的。"
111
+ prompt_audio_file = os.path.join(current_dir, 'kafka.wav')
112
+ prompt_audio, sampling_rate = sf.read(prompt_audio_file)
113
+
114
+ # 如果需要,重采样音频
115
+ target_sample_rate = audio_tokenizer.config['sample_rate']
116
+ if sampling_rate != target_sample_rate:
117
+ from librosa import resample
118
+ prompt_audio = resample(prompt_audio, orig_sr=sampling_rate, target_sr=target_sample_rate)
119
+ prompt_audio = np.array(prompt_audio, dtype=np.float32)
120
+
121
+ # 要合成的文本
122
+ text = "科学技术是第一生产力,最近 AI的迅猛发展让我们看到了迈向星辰大海的希望。"
123
+
124
+ # 生成语音
125
+ wav = generate_speech(model, tokenizer, text, audio_tokenizer, prompt_audio=prompt_audio, device=device)
126
+
127
+ # 保存输出
128
+ sf.write('output.wav', wav, target_sample_rate)
129
+ print("生成的音频已保存到 output.wav")
130
+ ```
trained_190k_steps/__init__.py ADDED
File without changes
trained_190k_steps/__pycache__/spark_llm.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
trained_190k_steps/__pycache__/utilities.cpython-311.pyc ADDED
Binary file (20.5 kB). View file
 
trained_190k_steps/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<|rwkv_tokenizer_end_of_text|>": 0
3
+ }
trained_190k_steps/config.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "a_low_rank_dim": 64,
3
+ "architectures": [
4
+ "RWKV7ForSpeech"
5
+ ],
6
+ "attn": null,
7
+ "attn_mode": "chunk",
8
+ "audio_global_vocab_size": 4096,
9
+ "auto_map": {
10
+ "AutoConfig": "modeling_rwkvspeech.RWKV7SpeechConfig",
11
+ "AutoModel": "modeling_rwkvspeech.RWKV7Model",
12
+ "AutoModelForCausalLM": "modeling_rwkvspeech.RWKV7ForSpeech"
13
+ },
14
+ "bos_token_id": 0,
15
+ "decay_low_rank_dim": 64,
16
+ "eos_token_id": 0,
17
+ "fuse_cross_entropy": true,
18
+ "fuse_norm": false,
19
+ "gate_low_rank_dim": 128,
20
+ "head_dim": 64,
21
+ "hidden_act": "sqrelu",
22
+ "hidden_ratio": 4.0,
23
+ "hidden_size": 1024,
24
+ "initializer_range": 0.006,
25
+ "intermediate_size": 4096,
26
+ "max_position_embeddings": 2048,
27
+ "model_type": "rwkv7",
28
+ "norm_bias": true,
29
+ "norm_eps": 1e-05,
30
+ "norm_first": true,
31
+ "num_heads": 32,
32
+ "num_hidden_layers": 24,
33
+ "text_vocab_size": 65536,
34
+ "tie_word_embeddings": false,
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.52.4",
37
+ "use_cache": true,
38
+ "v_low_rank_dim": 32,
39
+ "value_dim": [
40
+ 1024,
41
+ 1024,
42
+ 1024,
43
+ 1024,
44
+ 1024,
45
+ 1024,
46
+ 1024,
47
+ 1024,
48
+ 1024,
49
+ 1024,
50
+ 1024,
51
+ 1024,
52
+ 1024,
53
+ 1024,
54
+ 1024,
55
+ 1024,
56
+ 1024,
57
+ 1024,
58
+ 1024,
59
+ 1024,
60
+ 1024,
61
+ 1024,
62
+ 1024,
63
+ 1024
64
+ ],
65
+ "vocab_size": 8193
66
+ }
trained_190k_steps/config.yaml ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ highpass_cutoff_freq: 40
2
+ sample_rate: 16000
3
+ segment_duration: 2.4 # (s)
4
+ max_val_duration: 12 # (s)
5
+ latent_hop_length: 320
6
+ ref_segment_duration: 6
7
+ volume_normalize: true
trained_190k_steps/configuration_rwkv7.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Dict, Optional
4
+
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+
8
+ class RWKV7Config(PretrainedConfig):
9
+
10
+ model_type = 'rwkv7'
11
+ keys_to_ignore_at_inference = ['past_key_values']
12
+
13
+ def __init__(
14
+ self,
15
+ attn_mode: str = "chunk",
16
+ hidden_size: int = 2048,
17
+ hidden_ratio: Optional[int] = 4,
18
+ intermediate_size: Optional[int] = None,
19
+ num_hidden_layers: int = 24,
20
+ head_dim: Optional[int] = 64,
21
+ num_heads: Optional[int] = None,
22
+ decay_low_rank_dim: int = 64,
23
+ gate_low_rank_dim: int = 128,
24
+ a_low_rank_dim: int = 64,
25
+ v_low_rank_dim: int = 16,
26
+ hidden_act: str = "sqrelu",
27
+ max_position_embeddings: int = 2048,
28
+ norm_first: bool = True,
29
+ norm_bias: bool = True,
30
+ norm_eps: float = 1e-5,
31
+ attn: Optional[Dict] = None,
32
+ use_cache: bool = True,
33
+ pad_token_id: int = None,
34
+ bos_token_id: int = 1,
35
+ eos_token_id: int = 2,
36
+ tie_word_embeddings: bool = False,
37
+ initializer_range: float = 0.006,
38
+ fuse_norm: bool = True,
39
+ fuse_cross_entropy: bool = True,
40
+ vocab_size: int = 32000,
41
+ **kwargs
42
+ ):
43
+ self.attn_mode = attn_mode
44
+ self.hidden_size = hidden_size
45
+ self.hidden_ratio = hidden_ratio
46
+ self.intermediate_size = intermediate_size
47
+ self.norm_first = norm_first
48
+ self.num_hidden_layers = num_hidden_layers
49
+
50
+ if head_dim is None and num_heads is not None:
51
+ head_dim = int(hidden_size // num_heads)
52
+ elif head_dim is not None and num_heads is None:
53
+ num_heads = int(hidden_size // head_dim)
54
+
55
+ self.head_dim = head_dim
56
+ self.num_heads = num_heads
57
+
58
+ self.decay_low_rank_dim = decay_low_rank_dim
59
+ self.gate_low_rank_dim = gate_low_rank_dim
60
+ self.a_low_rank_dim = a_low_rank_dim
61
+ self.v_low_rank_dim = v_low_rank_dim
62
+ self.hidden_act = hidden_act
63
+ self.max_position_embeddings = max_position_embeddings
64
+ self.norm_bias = norm_bias
65
+ self.norm_eps = norm_eps
66
+ self.attn = attn
67
+ self.use_cache = use_cache
68
+ self.initializer_range = initializer_range
69
+ self.fuse_norm = fuse_norm
70
+ self.fuse_cross_entropy = fuse_cross_entropy
71
+ self.vocab_size = vocab_size
72
+
73
+ if attn is not None:
74
+ if not isinstance(attn, Dict):
75
+ raise ValueError("attn must be a dictionary")
76
+ if 'layers' not in attn:
77
+ raise ValueError("Layer indices must be provided to initialize hybrid attention layers")
78
+ if 'num_heads' not in attn:
79
+ raise ValueError("Number of heads must be provided to initialize hybrid attention layers")
80
+ attn['num_kv_heads'] = attn.get('num_kv_heads', attn['num_heads'])
81
+ attn['qkv_bias'] = attn.get('qkv_bias', False)
82
+ attn['window_size'] = attn.get('window_size', None)
83
+ attn['rope_theta'] = attn.get('rope_theta', 10000.)
84
+
85
+ super().__init__(
86
+ pad_token_id=pad_token_id,
87
+ bos_token_id=bos_token_id,
88
+ eos_token_id=eos_token_id,
89
+ tie_word_embeddings=tie_word_embeddings,
90
+ **kwargs,
91
+ )
trained_190k_steps/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "transformers_version": "4.52.4"
6
+ }
trained_190k_steps/hf_rwkv_tokenizer.py ADDED
@@ -0,0 +1,280 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes for RWKV."""
16
+
17
+ import os
18
+ import re
19
+ from typing import TYPE_CHECKING, List, Optional, Tuple
20
+
21
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
22
+ from transformers.utils import logging
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ pass
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
+ VOCAB_FILES_NAMES = {
32
+ "vocab_file": "rwkv_vocab_v20230424.txt",
33
+ }
34
+
35
+ class TRIE:
36
+ __slots__ = tuple("ch,to,values,front".split(","))
37
+ to: list
38
+ values: set
39
+
40
+ def __init__(self, front=None, ch=None):
41
+ self.ch = ch
42
+ self.to = [None for ch in range(256)]
43
+ self.values = set()
44
+ self.front = front
45
+
46
+ def __repr__(self):
47
+ fr = self
48
+ ret = []
49
+ while fr != None:
50
+ if fr.ch != None:
51
+ ret.append(fr.ch)
52
+ fr = fr.front
53
+ return "<TRIE %s %s>" % (ret[::-1], self.values)
54
+
55
+ def add(self, key: bytes, idx: int = 0, val=None):
56
+ if idx == len(key):
57
+ if val is None:
58
+ val = key
59
+ self.values.add(val)
60
+ return self
61
+ ch = key[idx]
62
+ if self.to[ch] is None:
63
+ self.to[ch] = TRIE(front=self, ch=ch)
64
+ return self.to[ch].add(key, idx=idx + 1, val=val)
65
+
66
+ def find_longest(self, key: bytes, idx: int = 0):
67
+ u: TRIE = self
68
+ ch: int = key[idx]
69
+
70
+ while u.to[ch] is not None:
71
+ u = u.to[ch]
72
+ idx += 1
73
+ if u.values:
74
+ ret = idx, u, u.values
75
+ if idx == len(key):
76
+ break
77
+ ch = key[idx]
78
+ return ret
79
+
80
+
81
+ class RWKV_TOKENIZER:
82
+ def __init__(self, file_name):
83
+ self.idx2token = {}
84
+ sorted = [] # must be already sorted
85
+ with open(file_name, "r", encoding="utf-8") as f:
86
+ lines = f.readlines()
87
+ for l in lines:
88
+ idx = int(l[: l.index(" ")])
89
+ x = eval(l[l.index(" ") : l.rindex(" ")])
90
+ x = x.encode("utf-8") if isinstance(x, str) else x
91
+ assert isinstance(x, bytes)
92
+
93
+ assert len(x) == int(l[l.rindex(" ") :])
94
+ sorted += [x]
95
+ self.idx2token[idx] = x
96
+
97
+ self.token2idx = {}
98
+ for k, v in self.idx2token.items():
99
+ self.token2idx[v] = int(k)
100
+
101
+ self.root = TRIE()
102
+ for t, i in self.token2idx.items():
103
+ _ = self.root.add(t, val=(t, i))
104
+
105
+ def encodeBytes(self, src: bytes):
106
+ idx: int = 0
107
+ tokens = []
108
+ while idx < len(src):
109
+ _idx: int = idx
110
+ idx, _, values = self.root.find_longest(src, idx)
111
+ assert idx != _idx
112
+ _, token = next(iter(values))
113
+ tokens.append(token)
114
+ return tokens
115
+
116
+ def decodeBytes(self, tokens):
117
+ return b"".join(map(lambda i: self.idx2token[i], tokens))
118
+
119
+ def encode(self, src):
120
+ if isinstance(src, str):
121
+ return [self.encodeBytes(src.encode("utf-8"))]
122
+ elif isinstance(src, list):
123
+ return [self.encodeBytes(s.encode("utf-8")) for s in src]
124
+
125
+ def decode(self, tokens):
126
+ return [self.decodeBytes(batch).decode("utf-8") for batch in tokens]
127
+ # try:
128
+ # return self.decodeBytes(tokens).decode('utf-8')
129
+ # except:
130
+ # return '\ufffd' # bad utf-8
131
+
132
+ def printTokens(self, tokens):
133
+ for i in tokens:
134
+ s = self.idx2token[i]
135
+ try:
136
+ s = s.decode("utf-8")
137
+ except:
138
+ pass
139
+ print(f"{repr(s)}{i}", end=" ")
140
+ print()
141
+
142
+
143
+ class RwkvTokenizer(PreTrainedTokenizer):
144
+ vocab_files_names = VOCAB_FILES_NAMES
145
+ model_input_names = ["input_ids", "attention_mask"]
146
+
147
+ def __init__(
148
+ self, vocab_file, bos_token="<|rwkv_tokenizer_end_of_text|>", eos_token="<|rwkv_tokenizer_end_of_text|>", unk_token="<|rwkv_tokenizer_end_of_text|>", **kwargs
149
+ ):
150
+ if not os.path.isfile(vocab_file):
151
+ raise ValueError(
152
+ f"Can't find a vocabulary file at path '{vocab_file}'."
153
+ )
154
+
155
+ with open(vocab_file, "r", encoding="utf-8") as reader:
156
+ tokens = reader.readlines()
157
+
158
+ if "add_bos_token" in kwargs:
159
+ self.add_bos_token = kwargs["add_bos_token"]
160
+ else:
161
+ self.add_bos_token = False
162
+ self.trie_tokenizer = RWKV_TOKENIZER(vocab_file)
163
+ vocab = self.trie_tokenizer.token2idx
164
+ self.encoder = vocab
165
+ self.decoder = {v: k for k, v in vocab.items()}
166
+ self._added_tokens_decoder = {0: AddedToken(str(bos_token))}
167
+ super().__init__(
168
+ bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
169
+ )
170
+
171
+ @property
172
+ def vocab_size(self):
173
+ return len(self.encoder)
174
+
175
+ def get_vocab(self):
176
+ vocab = self.encoder
177
+ vocab.update(self.added_tokens_encoder)
178
+ vocab = dict(sorted(vocab.items(), key=lambda item: item[1]))
179
+ return vocab
180
+
181
+ def _tokenize(self, text, split_special_tokens=False):
182
+ # return self.wordpiece_tokenizer.tokenize(text.encode("utf-8"))
183
+ return self.trie_tokenizer.encode(text)[0]
184
+
185
+ def _convert_token_to_id(self, token):
186
+ return token
187
+
188
+ def _convert_id_to_token(self, index):
189
+ """Converts an index (integer) in a token (byte) using the vocab."""
190
+ token = self.decoder.get(index, self.unk_token)
191
+ if isinstance(token, (bytes)):
192
+ token = token.decode("utf-8", errors="replace")
193
+ return token
194
+
195
+ def convert_tokens_to_string(self, tokens):
196
+ """Converts a sequence of tokens (bytes) in a single string. Additional tokens are encoded to bytes"""
197
+ out_string = b"".join(
198
+ [k.encode(errors="replace") if isinstance(k, str) else k for k in tokens]
199
+ ).decode("utf-8")
200
+ return out_string
201
+
202
+ def save_vocabulary(
203
+ self, save_directory: str, filename_prefix: Optional[str] = None
204
+ ) -> Tuple[str]:
205
+ index = 0
206
+ if os.path.isdir(save_directory):
207
+ vocab_file = os.path.join(
208
+ save_directory,
209
+ (filename_prefix + "-" if filename_prefix else "") + "vocab.txt",
210
+ )
211
+ else:
212
+ vocab_file = (
213
+ filename_prefix + "-" if filename_prefix else ""
214
+ ) + save_directory
215
+ with open(vocab_file, "w", encoding="utf-8") as writer:
216
+ for token, token_index in sorted(
217
+ self.encoder.items(), key=lambda kv: kv[1]
218
+ ):
219
+ if index != token_index:
220
+ logger.warning(
221
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
222
+ " Please check that the vocabulary is not corrupted!"
223
+ )
224
+ index = token_index
225
+ writer.write(str(token) + "\n")
226
+ index += 1
227
+ return (vocab_file,)
228
+
229
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
230
+ if self.add_bos_token:
231
+ bos_token_ids = [self.bos_token_id]
232
+ else:
233
+ bos_token_ids = []
234
+
235
+ output = bos_token_ids + token_ids_0
236
+
237
+ if token_ids_1 is None:
238
+ return output
239
+
240
+ return output + bos_token_ids + token_ids_1
241
+
242
+ def get_special_tokens_mask(
243
+ self,
244
+ token_ids_0: List[int],
245
+ token_ids_1: Optional[List[int]] = None,
246
+ already_has_special_tokens: bool = False,
247
+ ) -> List[int]:
248
+ """
249
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
250
+ special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
251
+
252
+ Args:
253
+ token_ids_0 (`List[int]`):
254
+ List of IDs.
255
+ token_ids_1 (`List[int]`, *optional*):
256
+ Optional second list of IDs for sequence pairs.
257
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
258
+ Whether or not the token list is already formatted with special tokens for the model.
259
+
260
+ Returns:
261
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
262
+ """
263
+ if already_has_special_tokens:
264
+ return super().get_special_tokens_mask(
265
+ token_ids_0=token_ids_0,
266
+ token_ids_1=token_ids_1,
267
+ already_has_special_tokens=True,
268
+ )
269
+
270
+ if not self.add_bos_token:
271
+ return super().get_special_tokens_mask(
272
+ token_ids_0=token_ids_0,
273
+ token_ids_1=token_ids_1,
274
+ already_has_special_tokens=False,
275
+ )
276
+
277
+ if token_ids_1 is None:
278
+ return [1] + ([0] * len(token_ids_0))
279
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
280
+
trained_190k_steps/kafka.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7928aeaf90600d6a014a5fececdc59cdf0e2971db327a0cf56b922b7cd8f8a7
3
+ size 265524
trained_190k_steps/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77c5578b1aaab351a1c89b8695ec465456268a5586020c5046fcd8544328a002
3
+ size 809355976
trained_190k_steps/modeling_rwkvspeech.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from spark_llm import RWKV7SpeechConfig,RWKV7ForSpeech
2
+ from rwkvfla.models.rwkv7 import RWKV7Model
3
+
4
+ RWKV7ForCausalLM = RWKV7ForSpeech
5
+ RWKV7Model = RWKV7Model
6
+ RWKV7Config = RWKV7SpeechConfig
trained_190k_steps/output.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b56b3b68f11fdb8539634bb27312f1346b3876ede818d311f6c89dd8b8e94dd
3
+ size 499244
trained_190k_steps/output_0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:559514056f3c45e9362937acedf1a6a4def27f14443842ff1193dad9c6a274a3
3
+ size 439724
trained_190k_steps/output_1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e4afaaa5498043eff67996feb3310b0b02d1fb62cdf52c7c93fcbd6c936be9d
3
+ size 228524
trained_190k_steps/output_10.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e28726c1f0d4199c061bebb5770c14f448f177fe1d6d9341596bd112b5b0fc9f
3
+ size 133804
trained_190k_steps/output_2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b80e10f88d2214da41180723783bed9bdfced387834377b92e6dad064174b0b1
3
+ size 150444
trained_190k_steps/output_3.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbbef7cad347491d556cf82287293c353588ec2bc0c2247110766004d6c1dd2e
3
+ size 586284
trained_190k_steps/output_4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d00b4160fc563d9149a8bc834186542807f853d270db3b27a2d2c39a004edc4
3
+ size 209964
trained_190k_steps/output_5.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1049970bb90cf801a9e2ffac5f66715a277928b495900e4a7ce940f0c18f49c8
3
+ size 256044
trained_190k_steps/output_6.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c46bea5a3c52f6832612f92f0f89c8aa8ea9f9b15b0d4afcd3152c24037692a
3
+ size 184364
trained_190k_steps/output_7.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:628147e65cdd71c534ff65c47d4d80a21588522c78a1ddfae2ec1c1d0d83b103
3
+ size 307244
trained_190k_steps/output_8.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:040f941a4a1962d4c8b8cad6d1b0b6b0db3b36a955d2801cd1c59992fb6053c9
3
+ size 171564
trained_190k_steps/output_9.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f87822f47fe7cf8626bb0e9de61455e862063cfdbfc8443e7e2f7aee93d2b169
3
+ size 241964
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_103617.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_info": {
3
+ "timestamp": "20250714_103617",
4
+ "device": "cuda:2",
5
+ "model_path": "/home/yueyulin/tmp/respark",
6
+ "batch_size": 4
7
+ },
8
+ "statistics": {
9
+ "total_tests": 20,
10
+ "successful_tests": 20,
11
+ "failed_tests": 0,
12
+ "batch_size": 4,
13
+ "total_batches": 5,
14
+ "total_processing_time": 100.95156717300415,
15
+ "total_audio_length": 123.94000000000001,
16
+ "total_rtf": 1.227717443827294,
17
+ "avg_rtf": 1.4573588824585872,
18
+ "avg_processing_time": 5.047578358650208,
19
+ "avg_audio_length": 6.197000000000001,
20
+ "min_rtf": 0.3893700157312124,
21
+ "max_rtf": 2.0969803734054207,
22
+ "std_rtf": 0.48215195815676065
23
+ },
24
+ "detailed_results": [
25
+ {
26
+ "index": 1,
27
+ "batch": 1,
28
+ "text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
29
+ "processing_time": 11.5057652592659,
30
+ "audio_length": 14.06,
31
+ "rtf": 1.2219960761564388,
32
+ "output_file": "rtf_test_results/test_001.wav"
33
+ },
34
+ {
35
+ "index": 2,
36
+ "batch": 1,
37
+ "text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
38
+ "processing_time": 11.5057652592659,
39
+ "audio_length": 6.16,
40
+ "rtf": 0.5353837716304171,
41
+ "output_file": "rtf_test_results/test_002.wav"
42
+ },
43
+ {
44
+ "index": 3,
45
+ "batch": 1,
46
+ "text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
47
+ "processing_time": 11.5057652592659,
48
+ "audio_length": 4.48,
49
+ "rtf": 0.3893700157312124,
50
+ "output_file": "rtf_test_results/test_003.wav"
51
+ },
52
+ {
53
+ "index": 4,
54
+ "batch": 1,
55
+ "text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
56
+ "processing_time": 11.5057652592659,
57
+ "audio_length": 13.54,
58
+ "rtf": 1.1768013421876373,
59
+ "output_file": "rtf_test_results/test_004.wav"
60
+ },
61
+ {
62
+ "index": 5,
63
+ "batch": 2,
64
+ "text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
65
+ "processing_time": 4.353760182857513,
66
+ "audio_length": 4.82,
67
+ "rtf": 1.1070889983739247,
68
+ "output_file": "rtf_test_results/test_005.wav"
69
+ },
70
+ {
71
+ "index": 6,
72
+ "batch": 2,
73
+ "text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
74
+ "processing_time": 4.353760182857513,
75
+ "audio_length": 5.9,
76
+ "rtf": 1.3551504336942233,
77
+ "output_file": "rtf_test_results/test_006.wav"
78
+ },
79
+ {
80
+ "index": 7,
81
+ "batch": 2,
82
+ "text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
83
+ "processing_time": 4.353760182857513,
84
+ "audio_length": 9.02,
85
+ "rtf": 2.071772357952863,
86
+ "output_file": "rtf_test_results/test_007.wav"
87
+ },
88
+ {
89
+ "index": 8,
90
+ "batch": 2,
91
+ "text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
92
+ "processing_time": 4.353760182857513,
93
+ "audio_length": 6.0,
94
+ "rtf": 1.3781190851127694,
95
+ "output_file": "rtf_test_results/test_008.wav"
96
+ },
97
+ {
98
+ "index": 9,
99
+ "batch": 3,
100
+ "text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
101
+ "processing_time": 4.711536705493927,
102
+ "audio_length": 9.88,
103
+ "rtf": 2.0969803734054207,
104
+ "output_file": "rtf_test_results/test_009.wav"
105
+ },
106
+ {
107
+ "index": 10,
108
+ "batch": 3,
109
+ "text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
110
+ "processing_time": 4.711536705493927,
111
+ "audio_length": 5.8,
112
+ "rtf": 1.2310208669788905,
113
+ "output_file": "rtf_test_results/test_010.wav"
114
+ },
115
+ {
116
+ "index": 11,
117
+ "batch": 3,
118
+ "text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
119
+ "processing_time": 4.711536705493927,
120
+ "audio_length": 6.9,
121
+ "rtf": 1.4644903417507493,
122
+ "output_file": "rtf_test_results/test_011.wav"
123
+ },
124
+ {
125
+ "index": 12,
126
+ "batch": 3,
127
+ "text": "数智技术服务港,让科技触手可及,让创新无处不在。",
128
+ "processing_time": 4.711536705493927,
129
+ "audio_length": 3.86,
130
+ "rtf": 0.8192656114721582,
131
+ "output_file": "rtf_test_results/test_012.wav"
132
+ },
133
+ {
134
+ "index": 13,
135
+ "batch": 4,
136
+ "text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
137
+ "processing_time": 2.7456793189048767,
138
+ "audio_length": 4.4,
139
+ "rtf": 1.602517806687984,
140
+ "output_file": "rtf_test_results/test_013.wav"
141
+ },
142
+ {
143
+ "index": 14,
144
+ "batch": 4,
145
+ "text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
146
+ "processing_time": 2.7456793189048767,
147
+ "audio_length": 5.68,
148
+ "rtf": 2.0687048049972154,
149
+ "output_file": "rtf_test_results/test_014.wav"
150
+ },
151
+ {
152
+ "index": 15,
153
+ "batch": 4,
154
+ "text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
155
+ "processing_time": 2.7456793189048767,
156
+ "audio_length": 5.2,
157
+ "rtf": 1.8938846806312535,
158
+ "output_file": "rtf_test_results/test_015.wav"
159
+ },
160
+ {
161
+ "index": 16,
162
+ "batch": 4,
163
+ "text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
164
+ "processing_time": 2.7456793189048767,
165
+ "audio_length": 4.86,
166
+ "rtf": 1.770053759205364,
167
+ "output_file": "rtf_test_results/test_016.wav"
168
+ },
169
+ {
170
+ "index": 17,
171
+ "batch": 5,
172
+ "text": "计算机视觉技术让机器能够理解和分析图像内容。",
173
+ "processing_time": 1.9211503267288208,
174
+ "audio_length": 3.4,
175
+ "rtf": 1.7697730118752573,
176
+ "output_file": "rtf_test_results/test_017.wav"
177
+ },
178
+ {
179
+ "index": 18,
180
+ "batch": 5,
181
+ "text": "自然语言处理技术使计算机能够理解和生成人类语言。",
182
+ "processing_time": 1.9211503267288208,
183
+ "audio_length": 3.9,
184
+ "rtf": 2.0300337489157365,
185
+ "output_file": "rtf_test_results/test_018.wav"
186
+ },
187
+ {
188
+ "index": 19,
189
+ "batch": 5,
190
+ "text": "语音合成技术将文本转换为自然的语音输出。",
191
+ "processing_time": 1.9211503267288208,
192
+ "audio_length": 3.06,
193
+ "rtf": 1.5927957106877317,
194
+ "output_file": "rtf_test_results/test_019.wav"
195
+ },
196
+ {
197
+ "index": 20,
198
+ "batch": 5,
199
+ "text": "大数据分析帮助企业发现隐藏的模式和趋势。",
200
+ "processing_time": 1.9211503267288208,
201
+ "audio_length": 3.02,
202
+ "rtf": 1.5719748517244934,
203
+ "output_file": "rtf_test_results/test_020.wav"
204
+ }
205
+ ]
206
+ }
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104243.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_info": {
3
+ "timestamp": "20250714_104243",
4
+ "device": "cuda:2",
5
+ "model_path": "/home/yueyulin/tmp/respark",
6
+ "batch_size": 4
7
+ },
8
+ "statistics": {
9
+ "total_tests": 20,
10
+ "successful_tests": 20,
11
+ "failed_tests": 0,
12
+ "batch_size": 4,
13
+ "total_batches": 5,
14
+ "total_processing_time": 97.51047849655151,
15
+ "total_audio_length": 127.58000000000001,
16
+ "total_rtf": 1.3083722074496016,
17
+ "avg_batch_rtf": 1.5580788079346588,
18
+ "avg_batch_processing_time": 19.502095699310303,
19
+ "avg_audio_length": 6.3790000000000004,
20
+ "min_batch_rtf": 0.9216863966136871,
21
+ "max_batch_rtf": 1.9146887765353369,
22
+ "std_batch_rtf": 0.35584074129598514
23
+ },
24
+ "detailed_results": [
25
+ {
26
+ "index": 1,
27
+ "batch": 1,
28
+ "text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
29
+ "batch_processing_time": 46.76210927963257,
30
+ "audio_length": 15.72,
31
+ "batch_rtf": 0.9216863966136871,
32
+ "output_file": "rtf_test_results/test_001.wav"
33
+ },
34
+ {
35
+ "index": 2,
36
+ "batch": 1,
37
+ "text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
38
+ "batch_processing_time": 46.76210927963257,
39
+ "audio_length": 6.66,
40
+ "batch_rtf": 0.9216863966136871,
41
+ "output_file": "rtf_test_results/test_002.wav"
42
+ },
43
+ {
44
+ "index": 3,
45
+ "batch": 1,
46
+ "text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
47
+ "batch_processing_time": 46.76210927963257,
48
+ "audio_length": 4.82,
49
+ "batch_rtf": 0.9216863966136871,
50
+ "output_file": "rtf_test_results/test_003.wav"
51
+ },
52
+ {
53
+ "index": 4,
54
+ "batch": 1,
55
+ "text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
56
+ "batch_processing_time": 46.76210927963257,
57
+ "audio_length": 15.9,
58
+ "batch_rtf": 0.9216863966136871,
59
+ "output_file": "rtf_test_results/test_004.wav"
60
+ },
61
+ {
62
+ "index": 5,
63
+ "batch": 2,
64
+ "text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
65
+ "batch_processing_time": 15.214709043502808,
66
+ "audio_length": 4.66,
67
+ "batch_rtf": 1.6300015944498414,
68
+ "output_file": "rtf_test_results/test_005.wav"
69
+ },
70
+ {
71
+ "index": 6,
72
+ "batch": 2,
73
+ "text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
74
+ "batch_processing_time": 15.214709043502808,
75
+ "audio_length": 5.86,
76
+ "batch_rtf": 1.6300015944498414,
77
+ "output_file": "rtf_test_results/test_006.wav"
78
+ },
79
+ {
80
+ "index": 7,
81
+ "batch": 2,
82
+ "text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
83
+ "batch_processing_time": 15.214709043502808,
84
+ "audio_length": 8.06,
85
+ "batch_rtf": 1.6300015944498414,
86
+ "output_file": "rtf_test_results/test_007.wav"
87
+ },
88
+ {
89
+ "index": 8,
90
+ "batch": 2,
91
+ "text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
92
+ "batch_processing_time": 15.214709043502808,
93
+ "audio_length": 6.22,
94
+ "batch_rtf": 1.6300015944498414,
95
+ "output_file": "rtf_test_results/test_008.wav"
96
+ },
97
+ {
98
+ "index": 9,
99
+ "batch": 3,
100
+ "text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
101
+ "batch_processing_time": 17.258368730545044,
102
+ "audio_length": 8.92,
103
+ "batch_rtf": 1.4694320416921058,
104
+ "output_file": "rtf_test_results/test_009.wav"
105
+ },
106
+ {
107
+ "index": 10,
108
+ "batch": 3,
109
+ "text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
110
+ "batch_processing_time": 17.258368730545044,
111
+ "audio_length": 5.68,
112
+ "batch_rtf": 1.4694320416921058,
113
+ "output_file": "rtf_test_results/test_010.wav"
114
+ },
115
+ {
116
+ "index": 11,
117
+ "batch": 3,
118
+ "text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
119
+ "batch_processing_time": 17.258368730545044,
120
+ "audio_length": 6.86,
121
+ "batch_rtf": 1.4694320416921058,
122
+ "output_file": "rtf_test_results/test_011.wav"
123
+ },
124
+ {
125
+ "index": 12,
126
+ "batch": 3,
127
+ "text": "数智技术服务港,让科技触手可及,让创新无处不在。",
128
+ "batch_processing_time": 17.258368730545044,
129
+ "audio_length": 3.9,
130
+ "batch_rtf": 1.4694320416921058,
131
+ "output_file": "rtf_test_results/test_012.wav"
132
+ },
133
+ {
134
+ "index": 13,
135
+ "batch": 4,
136
+ "text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
137
+ "batch_processing_time": 11.172309398651123,
138
+ "audio_length": 4.96,
139
+ "batch_rtf": 1.8545852303823243,
140
+ "output_file": "rtf_test_results/test_013.wav"
141
+ },
142
+ {
143
+ "index": 14,
144
+ "batch": 4,
145
+ "text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
146
+ "batch_processing_time": 11.172309398651123,
147
+ "audio_length": 5.72,
148
+ "batch_rtf": 1.8545852303823243,
149
+ "output_file": "rtf_test_results/test_014.wav"
150
+ },
151
+ {
152
+ "index": 15,
153
+ "batch": 4,
154
+ "text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
155
+ "batch_processing_time": 11.172309398651123,
156
+ "audio_length": 5.5,
157
+ "batch_rtf": 1.8545852303823243,
158
+ "output_file": "rtf_test_results/test_015.wav"
159
+ },
160
+ {
161
+ "index": 16,
162
+ "batch": 4,
163
+ "text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
164
+ "batch_processing_time": 11.172309398651123,
165
+ "audio_length": 4.54,
166
+ "batch_rtf": 1.8545852303823243,
167
+ "output_file": "rtf_test_results/test_016.wav"
168
+ },
169
+ {
170
+ "index": 17,
171
+ "batch": 5,
172
+ "text": "计算机视觉技术让机器能够理解和分析图像内容。",
173
+ "batch_processing_time": 7.102982044219971,
174
+ "audio_length": 3.6,
175
+ "batch_rtf": 1.9146887765353369,
176
+ "output_file": "rtf_test_results/test_017.wav"
177
+ },
178
+ {
179
+ "index": 18,
180
+ "batch": 5,
181
+ "text": "自然语言处理技术使计算机能够理解和生成人类语言。",
182
+ "batch_processing_time": 7.102982044219971,
183
+ "audio_length": 3.44,
184
+ "batch_rtf": 1.9146887765353369,
185
+ "output_file": "rtf_test_results/test_018.wav"
186
+ },
187
+ {
188
+ "index": 19,
189
+ "batch": 5,
190
+ "text": "语音合成技术将文本转换为自然的语音输出。",
191
+ "batch_processing_time": 7.102982044219971,
192
+ "audio_length": 3.42,
193
+ "batch_rtf": 1.9146887765353369,
194
+ "output_file": "rtf_test_results/test_019.wav"
195
+ },
196
+ {
197
+ "index": 20,
198
+ "batch": 5,
199
+ "text": "大数据分析帮助企业发现隐藏的模式和趋势。",
200
+ "batch_processing_time": 7.102982044219971,
201
+ "audio_length": 3.14,
202
+ "batch_rtf": 1.9146887765353369,
203
+ "output_file": "rtf_test_results/test_020.wav"
204
+ }
205
+ ]
206
+ }
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104655.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_info": {
3
+ "timestamp": "20250714_104655",
4
+ "device": "cuda:2",
5
+ "model_path": "/home/yueyulin/tmp/respark",
6
+ "batch_size": 4
7
+ },
8
+ "statistics": {
9
+ "total_tests": 20,
10
+ "successful_tests": 20,
11
+ "failed_tests": 0,
12
+ "batch_size": 4,
13
+ "total_batches": 5,
14
+ "total_processing_time": 97.07794642448425,
15
+ "total_audio_length": 126.58,
16
+ "total_rtf": 0.7669295814858924,
17
+ "avg_batch_rtf": 0.6971107903490739,
18
+ "avg_batch_processing_time": 19.41558928489685,
19
+ "avg_audio_length": 6.328999999999999,
20
+ "min_batch_rtf": 0.5344265722669661,
21
+ "max_batch_rtf": 1.088588070754268,
22
+ "std_batch_rtf": 0.20383940965077615
23
+ },
24
+ "detailed_results": [
25
+ {
26
+ "index": 1,
27
+ "batch": 1,
28
+ "text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
29
+ "batch_processing_time": 45.067546129226685,
30
+ "audio_length": 14.48,
31
+ "batch_rtf": 1.088588070754268,
32
+ "output_file": "rtf_test_results/test_001.wav"
33
+ },
34
+ {
35
+ "index": 2,
36
+ "batch": 1,
37
+ "text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
38
+ "batch_processing_time": 45.067546129226685,
39
+ "audio_length": 6.24,
40
+ "batch_rtf": 1.088588070754268,
41
+ "output_file": "rtf_test_results/test_002.wav"
42
+ },
43
+ {
44
+ "index": 3,
45
+ "batch": 1,
46
+ "text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
47
+ "batch_processing_time": 45.067546129226685,
48
+ "audio_length": 5.6,
49
+ "batch_rtf": 1.088588070754268,
50
+ "output_file": "rtf_test_results/test_003.wav"
51
+ },
52
+ {
53
+ "index": 4,
54
+ "batch": 1,
55
+ "text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
56
+ "batch_processing_time": 45.067546129226685,
57
+ "audio_length": 15.08,
58
+ "batch_rtf": 1.088588070754268,
59
+ "output_file": "rtf_test_results/test_004.wav"
60
+ },
61
+ {
62
+ "index": 5,
63
+ "batch": 2,
64
+ "text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
65
+ "batch_processing_time": 16.357284545898438,
66
+ "audio_length": 4.8,
67
+ "batch_rtf": 0.6558654589373872,
68
+ "output_file": "rtf_test_results/test_005.wav"
69
+ },
70
+ {
71
+ "index": 6,
72
+ "batch": 2,
73
+ "text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
74
+ "batch_processing_time": 16.357284545898438,
75
+ "audio_length": 5.84,
76
+ "batch_rtf": 0.6558654589373872,
77
+ "output_file": "rtf_test_results/test_006.wav"
78
+ },
79
+ {
80
+ "index": 7,
81
+ "batch": 2,
82
+ "text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
83
+ "batch_processing_time": 16.357284545898438,
84
+ "audio_length": 8.6,
85
+ "batch_rtf": 0.6558654589373872,
86
+ "output_file": "rtf_test_results/test_007.wav"
87
+ },
88
+ {
89
+ "index": 8,
90
+ "batch": 2,
91
+ "text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
92
+ "batch_processing_time": 16.357284545898438,
93
+ "audio_length": 5.7,
94
+ "batch_rtf": 0.6558654589373872,
95
+ "output_file": "rtf_test_results/test_008.wav"
96
+ },
97
+ {
98
+ "index": 9,
99
+ "batch": 3,
100
+ "text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
101
+ "batch_processing_time": 16.95064425468445,
102
+ "audio_length": 9.0,
103
+ "batch_rtf": 0.6694567241186591,
104
+ "output_file": "rtf_test_results/test_009.wav"
105
+ },
106
+ {
107
+ "index": 10,
108
+ "batch": 3,
109
+ "text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
110
+ "batch_processing_time": 16.95064425468445,
111
+ "audio_length": 5.72,
112
+ "batch_rtf": 0.6694567241186591,
113
+ "output_file": "rtf_test_results/test_010.wav"
114
+ },
115
+ {
116
+ "index": 11,
117
+ "batch": 3,
118
+ "text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
119
+ "batch_processing_time": 16.95064425468445,
120
+ "audio_length": 6.8,
121
+ "batch_rtf": 0.6694567241186591,
122
+ "output_file": "rtf_test_results/test_011.wav"
123
+ },
124
+ {
125
+ "index": 12,
126
+ "batch": 3,
127
+ "text": "数智技术服务港,让科技触手可及,让创新无处不在。",
128
+ "batch_processing_time": 16.95064425468445,
129
+ "audio_length": 3.8,
130
+ "batch_rtf": 0.6694567241186591,
131
+ "output_file": "rtf_test_results/test_012.wav"
132
+ },
133
+ {
134
+ "index": 13,
135
+ "batch": 4,
136
+ "text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
137
+ "batch_processing_time": 10.945056200027466,
138
+ "audio_length": 4.72,
139
+ "batch_rtf": 0.5344265722669661,
140
+ "output_file": "rtf_test_results/test_013.wav"
141
+ },
142
+ {
143
+ "index": 14,
144
+ "batch": 4,
145
+ "text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
146
+ "batch_processing_time": 10.945056200027466,
147
+ "audio_length": 5.76,
148
+ "batch_rtf": 0.5344265722669661,
149
+ "output_file": "rtf_test_results/test_014.wav"
150
+ },
151
+ {
152
+ "index": 15,
153
+ "batch": 4,
154
+ "text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
155
+ "batch_processing_time": 10.945056200027466,
156
+ "audio_length": 4.98,
157
+ "batch_rtf": 0.5344265722669661,
158
+ "output_file": "rtf_test_results/test_015.wav"
159
+ },
160
+ {
161
+ "index": 16,
162
+ "batch": 4,
163
+ "text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
164
+ "batch_processing_time": 10.945056200027466,
165
+ "audio_length": 5.02,
166
+ "batch_rtf": 0.5344265722669661,
167
+ "output_file": "rtf_test_results/test_016.wav"
168
+ },
169
+ {
170
+ "index": 17,
171
+ "batch": 5,
172
+ "text": "计算机视觉技术让机器能够理解和分析图像内容。",
173
+ "batch_processing_time": 7.757415294647217,
174
+ "audio_length": 3.82,
175
+ "batch_rtf": 0.5372171256680899,
176
+ "output_file": "rtf_test_results/test_017.wav"
177
+ },
178
+ {
179
+ "index": 18,
180
+ "batch": 5,
181
+ "text": "自然语言处理技术使计算机能够理解和生成人类语言。",
182
+ "batch_processing_time": 7.757415294647217,
183
+ "audio_length": 3.68,
184
+ "batch_rtf": 0.5372171256680899,
185
+ "output_file": "rtf_test_results/test_018.wav"
186
+ },
187
+ {
188
+ "index": 19,
189
+ "batch": 5,
190
+ "text": "语音合成技术将文本转换为自然的语音输出。",
191
+ "batch_processing_time": 7.757415294647217,
192
+ "audio_length": 3.34,
193
+ "batch_rtf": 0.5372171256680899,
194
+ "output_file": "rtf_test_results/test_019.wav"
195
+ },
196
+ {
197
+ "index": 20,
198
+ "batch": 5,
199
+ "text": "大数据分析帮助企业发现隐藏的模式和趋势。",
200
+ "batch_processing_time": 7.757415294647217,
201
+ "audio_length": 3.6,
202
+ "batch_rtf": 0.5372171256680899,
203
+ "output_file": "rtf_test_results/test_020.wav"
204
+ }
205
+ ]
206
+ }
trained_190k_steps/rtf_test_results/rtf_test_results_20250714_104853.json ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "test_info": {
3
+ "timestamp": "20250714_104853",
4
+ "device": "cuda:2",
5
+ "model_path": "/home/yueyulin/tmp/respark",
6
+ "batch_size": 8
7
+ },
8
+ "statistics": {
9
+ "total_tests": 20,
10
+ "successful_tests": 20,
11
+ "failed_tests": 0,
12
+ "batch_size": 8,
13
+ "total_batches": 3,
14
+ "total_processing_time": 70.2043297290802,
15
+ "total_audio_length": 124.24,
16
+ "total_rtf": 0.5650702650441098,
17
+ "avg_batch_rtf": 0.5415599169688988,
18
+ "avg_batch_processing_time": 26.568262767791747,
19
+ "avg_audio_length": 6.212,
20
+ "min_batch_rtf": 0.3969690210187983,
21
+ "max_batch_rtf": 0.6930763216404902,
22
+ "std_batch_rtf": 0.13260418306880234
23
+ },
24
+ "detailed_results": [
25
+ {
26
+ "index": 1,
27
+ "batch": 1,
28
+ "text": "一九五二年二月十日,志愿军大英雄张积慧击落美军双料王牌飞行员戴维斯,在自己飞机坠毁处距离戴维斯坠机处不足五百米的情况下,取得了世界空战史不可能复制的奇迹。伟大的张积慧。",
29
+ "batch_processing_time": 44.49549984931946,
30
+ "audio_length": 14.36,
31
+ "batch_rtf": 0.6930763216404902,
32
+ "output_file": "rtf_test_results/test_001.wav"
33
+ },
34
+ {
35
+ "index": 2,
36
+ "batch": 1,
37
+ "text": "在数字浪潮汹涌的今天,数智技术正以前所未有的力量重塑着社会的每一个角落。",
38
+ "batch_processing_time": 44.49549984931946,
39
+ "audio_length": 5.54,
40
+ "batch_rtf": 0.6930763216404902,
41
+ "output_file": "rtf_test_results/test_002.wav"
42
+ },
43
+ {
44
+ "index": 3,
45
+ "batch": 1,
46
+ "text": "为了点燃青少年对科技的热情,培养他们的创新思维与动手能力",
47
+ "batch_processing_time": 44.49549984931946,
48
+ "audio_length": 4.2,
49
+ "batch_rtf": 0.6930763216404902,
50
+ "output_file": "rtf_test_results/test_003.wav"
51
+ },
52
+ {
53
+ "index": 4,
54
+ "batch": 1,
55
+ "text": "杏花岭区巨轮街道社区教育学校携手中车社区教育分校,与太原市科学技术协会联手,于暑期精心策划了一场别开生面的青少年数智技术服务港探索之旅,吸引了众多社区青少年的积极参与。",
56
+ "batch_processing_time": 44.49549984931946,
57
+ "audio_length": 14.58,
58
+ "batch_rtf": 0.6930763216404902,
59
+ "output_file": "rtf_test_results/test_004.wav"
60
+ },
61
+ {
62
+ "index": 5,
63
+ "batch": 1,
64
+ "text": "一踏入数智技术服务港的大门,一股浓厚的科技气息便扑面而来。",
65
+ "batch_processing_time": 44.49549984931946,
66
+ "audio_length": 4.62,
67
+ "batch_rtf": 0.6930763216404902,
68
+ "output_file": "rtf_test_results/test_005.wav"
69
+ },
70
+ {
71
+ "index": 6,
72
+ "batch": 1,
73
+ "text": "科普课堂上,“简易红绿灯”科学实验更是将抽象的电路原理与日常生活紧密相连。",
74
+ "batch_processing_time": 44.49549984931946,
75
+ "audio_length": 7.0,
76
+ "batch_rtf": 0.6930763216404902,
77
+ "output_file": "rtf_test_results/test_006.wav"
78
+ },
79
+ {
80
+ "index": 7,
81
+ "batch": 1,
82
+ "text": "实验开始前,老师生动地介绍了实验物品,并引导青少年思考红绿灯的工作原理,激发了他们浓厚的探索兴趣。",
83
+ "batch_processing_time": 44.49549984931946,
84
+ "audio_length": 8.26,
85
+ "batch_rtf": 0.6930763216404902,
86
+ "output_file": "rtf_test_results/test_007.wav"
87
+ },
88
+ {
89
+ "index": 8,
90
+ "batch": 1,
91
+ "text": "在老师的指导下,青少年们开始动手组装电路,将红绿灯的各个部件连接起来。",
92
+ "batch_processing_time": 44.49549984931946,
93
+ "audio_length": 5.64,
94
+ "batch_rtf": 0.6930763216404902,
95
+ "output_file": "rtf_test_results/test_008.wav"
96
+ },
97
+ {
98
+ "index": 9,
99
+ "batch": 2,
100
+ "text": "他们小心翼翼地调整电路,确保每个部件都正确连接,红灯、绿灯、黄灯依次亮起,仿佛在讲述一个关于交通规则的故事。",
101
+ "batch_processing_time": 18.141484260559082,
102
+ "audio_length": 9.36,
103
+ "batch_rtf": 0.3969690210187983,
104
+ "output_file": "rtf_test_results/test_009.wav"
105
+ },
106
+ {
107
+ "index": 10,
108
+ "batch": 2,
109
+ "text": "实验过程中,青少年们不仅学到了电路知识,还体验到了动手实践的乐趣。",
110
+ "batch_processing_time": 18.141484260559082,
111
+ "audio_length": 5.6,
112
+ "batch_rtf": 0.3969690210187983,
113
+ "output_file": "rtf_test_results/test_010.wav"
114
+ },
115
+ {
116
+ "index": 11,
117
+ "batch": 2,
118
+ "text": "他们纷纷表示,这次实验不仅让他们对科技有了更深的理解,还培养了他们的创新思维和动手能力。",
119
+ "batch_processing_time": 18.141484260559082,
120
+ "audio_length": 6.96,
121
+ "batch_rtf": 0.3969690210187983,
122
+ "output_file": "rtf_test_results/test_011.wav"
123
+ },
124
+ {
125
+ "index": 12,
126
+ "batch": 2,
127
+ "text": "数智技术服务港,让科技触手可及,让创新无处不在。",
128
+ "batch_processing_time": 18.141484260559082,
129
+ "audio_length": 3.88,
130
+ "batch_rtf": 0.3969690210187983,
131
+ "output_file": "rtf_test_results/test_012.wav"
132
+ },
133
+ {
134
+ "index": 13,
135
+ "batch": 2,
136
+ "text": "人工智能技术正在快速发展,为各行各业带来了革命性的变化。",
137
+ "batch_processing_time": 18.141484260559082,
138
+ "audio_length": 4.86,
139
+ "batch_rtf": 0.3969690210187983,
140
+ "output_file": "rtf_test_results/test_013.wav"
141
+ },
142
+ {
143
+ "index": 14,
144
+ "batch": 2,
145
+ "text": "深度学习模型在语音识别、图像处理、自然语言处理等领域取得了突破性进展。",
146
+ "batch_processing_time": 18.141484260559082,
147
+ "audio_length": 4.92,
148
+ "batch_rtf": 0.3969690210187983,
149
+ "output_file": "rtf_test_results/test_014.wav"
150
+ },
151
+ {
152
+ "index": 15,
153
+ "batch": 2,
154
+ "text": "机器学习算法能够从大量数据中学习模式,并做出准确的预测和决策。",
155
+ "batch_processing_time": 18.141484260559082,
156
+ "audio_length": 5.7,
157
+ "batch_rtf": 0.3969690210187983,
158
+ "output_file": "rtf_test_results/test_015.wav"
159
+ },
160
+ {
161
+ "index": 16,
162
+ "batch": 2,
163
+ "text": "神经网络模拟人脑的工作方式,通过多层神经元处理复杂的信息。",
164
+ "batch_processing_time": 18.141484260559082,
165
+ "audio_length": 4.42,
166
+ "batch_rtf": 0.3969690210187983,
167
+ "output_file": "rtf_test_results/test_016.wav"
168
+ },
169
+ {
170
+ "index": 17,
171
+ "batch": 3,
172
+ "text": "计算机视觉技术让机器能够理解和分析图像内容。",
173
+ "batch_processing_time": 7.56734561920166,
174
+ "audio_length": 3.74,
175
+ "batch_rtf": 0.5277088995259177,
176
+ "output_file": "rtf_test_results/test_017.wav"
177
+ },
178
+ {
179
+ "index": 18,
180
+ "batch": 3,
181
+ "text": "自然语言处理技术使计算机能够理解和生成人类语言。",
182
+ "batch_processing_time": 7.56734561920166,
183
+ "audio_length": 3.86,
184
+ "batch_rtf": 0.5277088995259177,
185
+ "output_file": "rtf_test_results/test_018.wav"
186
+ },
187
+ {
188
+ "index": 19,
189
+ "batch": 3,
190
+ "text": "语音合成技术将文本转换为自然的语音输出。",
191
+ "batch_processing_time": 7.56734561920166,
192
+ "audio_length": 3.38,
193
+ "batch_rtf": 0.5277088995259177,
194
+ "output_file": "rtf_test_results/test_019.wav"
195
+ },
196
+ {
197
+ "index": 20,
198
+ "batch": 3,
199
+ "text": "大数据分析帮助企业发现隐藏的模式和趋势。",
200
+ "batch_processing_time": 7.56734561920166,
201
+ "audio_length": 3.36,
202
+ "batch_rtf": 0.5277088995259177,
203
+ "output_file": "rtf_test_results/test_020.wav"
204
+ }
205
+ ]
206
+ }
trained_190k_steps/rtf_test_results/test_001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acddee462cbe93cfae2db94a0987ab88ed594188ca1bc4ac816c33e91cc9e13d
3
+ size 459564
trained_190k_steps/rtf_test_results/test_002.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6361fee9642fd977e0981276d86d5d58f87100d7309a595035a70731d1671589
3
+ size 177324
trained_190k_steps/rtf_test_results/test_003.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4feb2c3b9a8e330a0300fbaeef3f6c4a8d7adb39ac22104808a83a4f51e57f2
3
+ size 134444
trained_190k_steps/rtf_test_results/test_004.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f3689553a60a8153d5c34c14701905862c79744a10f8e0e1a12866b08121116
3
+ size 466604
trained_190k_steps/rtf_test_results/test_005.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:caa1ab23c8dc026655270e0a8132e6528794f28eab7416d55f7f915608c16e92
3
+ size 147884
trained_190k_steps/rtf_test_results/test_006.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec9ee466a4439eff14acab039c2aa5a9effa4743a0846c7bdcb79d33f5358712
3
+ size 224044
trained_190k_steps/rtf_test_results/test_007.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5ff6890b46d01626477bdc6a9908e0bd9647978d76278374391423728299d77
3
+ size 264364
trained_190k_steps/rtf_test_results/test_008.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79aacacde6ecc15b85a8c98f9a8fcaac3ad0a57a64ec5a7e8d7e18461468852
3
+ size 180524
trained_190k_steps/rtf_test_results/test_009.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:664f58297326d99c6c4c74601674f825623bb854970ade90e1aa76ace913fa0b
3
+ size 299564
trained_190k_steps/rtf_test_results/test_010.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7fe5c8d90a944d990a84dc0488cbc6f5f182f8c92c73487c6e7564956b87da72
3
+ size 179244
trained_190k_steps/rtf_test_results/test_011.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:decd4ce16f819b4f332aeb5abd7eb5920dc09ef1b7760652b59a028657487913
3
+ size 222764
trained_190k_steps/rtf_test_results/test_012.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ad1cb092212b7ef32d1c801a5432383c5e1b5b9ac56df997c8c3089884cd5c9
3
+ size 124204
trained_190k_steps/rtf_test_results/test_013.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cf22639096c82432678578973352c31e07f662f7f4aace8bffd81ecca3559f
3
+ size 155564
trained_190k_steps/rtf_test_results/test_014.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b3793fb37bd806ca97ef02463a6c0870cb3e5bb42a6df9815c558ed7a117bb
3
+ size 157484