"""Gradio demo: Arabic speech -> syllables (Wav2Vec2) -> diacritized text (MT5)."""

import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Load all models once at import time so every request reuses them.
Arabic_speech_Syllables_recognition = pipeline(
    "automatic-speech-recognition",
    model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2",
)
Arabic_Syllables_to_text_model = AutoModelForSeq2SeqLM.from_pretrained(
    "IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5"
)
Arabic_Syllables_to_text_tokenizer = AutoTokenizer.from_pretrained(
    "IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5"
)
Arabic_Syllables_to_text_model.eval()  # inference only; disables dropout etc.


def _pre_process_input(syllables):
    """Reformat space-separated syllables into the '|'-delimited,
    '.'-terminated form the MT5 converter expects as input."""
    return "|" + syllables.replace(" ", "|") + "."


def get_and_process_syllabels(audio):
    """Transcribe an audio clip to Arabic text with diacritics.

    Pipeline: ASR model emits a syllable sequence, which is reformatted and
    fed to the seq2seq syllables-to-text converter.

    Parameters
    ----------
    audio : str | None
        Path to the recorded/uploaded audio file (Gradio ``type="filepath"``).
        ``None`` when the user submits without providing any audio.

    Returns
    -------
    str
        The recovered Arabic text (everything before the first '.'),
        or an empty string when no audio was supplied.
    """
    if audio is None:
        # Gradio passes None on an empty submission; avoid crashing the app.
        return ""

    syllables = Arabic_speech_Syllables_recognition(audio)["text"]
    preprocessed_input = _pre_process_input(syllables)

    input_ids = Arabic_Syllables_to_text_tokenizer.encode(
        preprocessed_input, return_tensors="pt"
    )
    # NOTE(review): early_stopping only affects beam search; with the default
    # greedy decoding (num_beams=1) it is inert. Kept as-is to preserve the
    # current decoding behavior — confirm whether num_beams>1 was intended.
    output_ids = Arabic_Syllables_to_text_model.generate(
        input_ids,
        max_length=100,
        early_stopping=True,
        pad_token_id=Arabic_Syllables_to_text_tokenizer.pad_token_id,
        bos_token_id=Arabic_Syllables_to_text_tokenizer.bos_token_id,
        eos_token_id=Arabic_Syllables_to_text_tokenizer.eos_token_id,
    )

    # Skip the leading decoder-start token, then strip remaining special tokens.
    output_text = Arabic_Syllables_to_text_tokenizer.decode(
        output_ids[0][1:], skip_special_tokens=True
    )
    # The model terminates sentences with '.'; keep only the first sentence.
    return output_text.split(".")[0]


# Create the web UI.
demo = gr.Interface(
    fn=get_and_process_syllabels,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Audio Transcription",
    description="Speak or upload an audio file to see the transcribed text with diacritics.",
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    demo.launch(share=True)