|
--- |
|
license: apache-2.0 |
|
--- |
|
## Model Details
|
|
|
* Model type: |
|
RWKV7-0.4B-wavlmLarge-ENASR-demo is an open-source English ASR model built on the RWKV7 architecture with a WavLM-Large encoder.
|
|
|
* Model date: March 2025
|
|
|
* Paper or resources for more information: https://github.com/JL-er/WorldRWKV |
|
|
|
* Where to send questions or comments about the model: https://github.com/JL-er/WorldRWKV/issues |
|
|
|
## Training Time
|
1 hour on 4× RTX 4090 GPUs
|
|
|
|
|
|
|
|
|
## Inference
|
|
|
```python
|
import librosa
import numpy as np
import soundfile as sf

from infer.worldmodel import Worldinfer

# Sample rate (Hz) the audio is resampled to before it is passed to the model.
TARGET_SAMPLE_RATE = 16000

# Model paths: the RWKV7 checkpoint and the speech encoder directory.
llm_path = '/home/rwkv/model/rwkv7-0.4b-wavlmlarge-enasr-demo.pth'
# NOTE(review): this path names a HuBERT checkpoint although the model card
# describes a WavLM-Large encoder -- confirm which encoder the checkpoint expects.
encoder_path = '/home/rwkv/model/facebookhubert-large-ls960-ft'
encoder_type = 'speech'

# Initialize the model.
model = Worldinfer(model_path=llm_path, encoder_type=encoder_type, encoder_path=encoder_path)

# Load the audio file.
audio_path = './test_audio.wav'
audio_data, sample_rate = sf.read(audio_path)

# Make sure the audio is mono: keep only the first channel of multi-channel input.
if audio_data.ndim > 1:
    audio_data = audio_data[:, 0]

# Convert integer samples to float32, scaled by the integer type's maximum value.
# np.iinfo only accepts integer dtypes, so the check is restricted to those.
if np.issubdtype(audio_data.dtype, np.integer):
    audio_data = audio_data.astype(np.float32) / np.iinfo(audio_data.dtype).max

# Resample to 16000 Hz.
resampled_audio = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=TARGET_SAMPLE_RATE)

# Build the prompt text.
text = '\x16Assistant:'

# Run generation on the prompt and audio; the second returned value is not used here.
result, _ = model.generate(text, resampled_audio)

print(result)
|
|
|
```