Spaces:
Running
Running
File size: 1,307 Bytes
1c817fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import os
import uuid
from flask import jsonify, send_file, request
from main import *
import torch
import torchaudio
def speech_to_text_func(audio_path, output_path="output_stt.txt"):
    """Transcribe an audio file and write the transcript to a text file.

    Relies on the module-level names ``stt_model`` and ``device``
    (presumably provided by ``from main import *`` -- confirm against
    ``main``).

    Args:
        audio_path: Path of the audio file, loaded with ``torchaudio.load``.
        output_path: Path of the text file that receives the transcript.

    Returns:
        ``output_path`` on success, or the literal string
        ``"STT model not initialized."`` when ``stt_model`` is ``None``.
        Callers compare against that exact string, so it must not change.
    """
    if stt_model is None:
        return "STT model not initialized."

    # The sample rate is not used below.
    # NOTE(review): no resampling happens here, so the model presumably
    # expects audio at its native rate -- confirm.
    waveform, _sample_rate = torchaudio.load(audio_path)

    # Average over dim 0 while keeping that dimension (size 1 afterwards).
    # Assumes dim 0 is the channel axis, i.e. this is a mono down-mix --
    # TODO confirm against the torchaudio.load documentation.
    if waveform.ndim > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    waveform = waveform.to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = stt_model(waveform)
        predicted_ids = torch.argmax(logits, dim=-1)
        # Decode the first entry along the leading dimension.
        transcription = stt_model.tokenizer.decode(
            predicted_ids[0].cpu().tolist()
        )

    # Explicit UTF-8 so non-ASCII transcripts do not depend on (or fail
    # under) the platform's default locale encoding.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(transcription)
    return output_path
def stt_api():
    """Flask view: transcribe an uploaded audio file and return the text.

    Reads the upload from the ``audio`` field of ``request.files``, saves it
    to a uniquely named temporary file, runs ``speech_to_text_func`` on it
    and returns the transcript as a ``text/plain`` attachment named
    ``output.txt``.

    Returns:
        * ``({"error": "Audio file is required"}, 400)`` when the request
          has no ``audio`` file part.
        * ``({"error": "STT failed"}, 500)`` when ``speech_to_text_func``
          reports that the model is not initialized.
        * Otherwise the ``send_file`` response carrying the transcript.

    Any exception raised while saving or transcribing propagates to Flask
    unchanged; the per-request files are removed in every case.
    """
    # Function-scope import keeps this block self-contained.
    import io

    if 'audio' not in request.files:
        return jsonify({"error": "Audio file is required"}), 400

    audio_file = request.files['audio']

    # Per-request names for BOTH files: the default shared output path would
    # let concurrent requests overwrite each other's transcript.
    request_id = uuid.uuid4()
    temp_audio_path = f"temp_audio_{request_id}.wav"
    output_path = f"output_stt_{request_id}.txt"

    try:
        audio_file.save(temp_audio_path)
        result = speech_to_text_func(temp_audio_path, output_path)
        if result == "STT model not initialized.":
            return jsonify({"error": "STT failed"}), 500
        # Load the transcript into memory so the file on disk can be
        # deleted before the response is streamed.
        with open(result, "rb") as transcript_file:
            payload = io.BytesIO(transcript_file.read())
    finally:
        # Always clean up, even when saving or transcription raised.
        for path in (temp_audio_path, output_path):
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # never created (e.g. failure before it was written)

    return send_file(
        payload,
        mimetype="text/plain",
        as_attachment=True,
        download_name="output.txt",
    )
|