"""Gradio demo: Arabic speech -> syllables (Wav2Vec2) -> diacritized text (MT5)."""

import gradio as gr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

# Load all models once at import time so every request reuses them.
Arabic_speech_Syllables_recognition = pipeline(
    "automatic-speech-recognition",
    model="IbrahimSalah/Arabic_speech_Syllables_recognition_Using_Wav2vec2",
)
Arabic_Syllables_to_text_model = AutoModelForSeq2SeqLM.from_pretrained(
    "IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5"
)
Arabic_Syllables_to_text_tokenizer = AutoTokenizer.from_pretrained(
    "IbrahimSalah/Arabic_Syllables_to_text_Converter_Using_MT5"
)
Arabic_Syllables_to_text_model.eval()  # inference only; disables dropout etc.


def _pre_process_input(syllables):
    """Reformat space-separated syllables into the '|'-delimited,
    '.'-terminated form the MT5 converter expects as input."""
    return "|" + syllables.replace(" ", "|") + "."


def get_and_process_syllabels(audio):
    """Transcribe an audio clip to Arabic text with diacritics.

    Pipeline: ASR model emits a syllable sequence, which is reformatted and
    fed to the seq2seq syllables-to-text converter.

    Parameters
    ----------
    audio : str | None
        Path to the recorded/uploaded audio file (Gradio ``type="filepath"``).
        ``None`` when the user submits without providing any audio.

    Returns
    -------
    str
        The recovered Arabic text (everything before the first '.'),
        or an empty string when no audio was supplied.
    """
    if audio is None:
        # Gradio passes None on an empty submission; avoid crashing the app.
        return ""

    syllables = Arabic_speech_Syllables_recognition(audio)["text"]
    preprocessed_input = _pre_process_input(syllables)

    input_ids = Arabic_Syllables_to_text_tokenizer.encode(
        preprocessed_input, return_tensors="pt"
    )
    # NOTE(review): early_stopping only affects beam search; with the default
    # greedy decoding (num_beams=1) it is inert. Kept as-is to preserve the
    # current decoding behavior — confirm whether num_beams>1 was intended.
    output_ids = Arabic_Syllables_to_text_model.generate(
        input_ids,
        max_length=100,
        early_stopping=True,
        pad_token_id=Arabic_Syllables_to_text_tokenizer.pad_token_id,
        bos_token_id=Arabic_Syllables_to_text_tokenizer.bos_token_id,
        eos_token_id=Arabic_Syllables_to_text_tokenizer.eos_token_id,
    )

    # Skip the leading decoder-start token, then strip remaining special tokens.
    output_text = Arabic_Syllables_to_text_tokenizer.decode(
        output_ids[0][1:], skip_special_tokens=True
    )
    # The model terminates sentences with '.'; keep only the first sentence.
    return output_text.split(".")[0]


# Create the web UI.
demo = gr.Interface(
    fn=get_and_process_syllabels,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="Audio Transcription",
    description="Speak or upload an audio file to see the transcribed text with diacritics.",
)

if __name__ == "__main__":
    # Guard the launch so importing this module does not start a server.
    demo.launch(share=True)