# Addaci's picture
# Update app.py
# bdbc86a verified
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load model and tokenizer.
# This runs once at import time; the resulting module-level `tokenizer` and
# `model` objects are the ones correct_transcription() uses on every call.
# NOTE(review): the identifier below is presumably a Hugging Face Hub repo id,
# so the first start-up would need network access to download it — confirm.
model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Define the correction function
def correct_transcription(input_text):
    """Return the model's corrected version of a raw Yiddish HTR transcription.

    The input is prefixed with a fixed task instruction, tokenized with the
    module-level ``tokenizer``, passed through ``model.generate`` and decoded
    back to text with special tokens stripped.

    Args:
        input_text: Raw transcription text to correct. ``None``, an empty
            string, or a whitespace-only string is treated as "nothing to
            correct".

    Returns:
        The decoded model output, or ``""`` when there is no input text.
    """
    # Without this guard the model would be run on the instruction prompt
    # alone and its output would be shown to the user as a "correction".
    if input_text is None or not input_text.strip():
        return ""

    # Add task instruction to the input
    prompt = "Correct the following raw Yiddish HTR output from Transkribus into authentic Yiddish correspondence. Keep period-specific spellings, regional variations, and Hebrew-origin words as they were historically used. Fix only clear OCR errors, such as broken words or nonsensical combinations. Retain proper nouns, place names, abbreviations, and informal language. Maintain original line breaks and formatting:\n"

    # NOTE(review): truncation=True is given without an explicit max_length, so
    # the cut-off is whatever the tokenizer is configured with. Because the
    # prompt comes first, any truncation removes the end of the user's text,
    # not the instruction — confirm the limit leaves room for real input.
    input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids

    # max_length=512 bounds the length of the generated sequence in tokens.
    output_ids = model.generate(input_ids, max_length=512)

    # generate() returns a batch; only one sequence was submitted.
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
# Gradio UI: a heading, two right-to-left text boxes, and a button that feeds
# the first box through correct_transcription and writes the result to the second.
# NOTE(review): the original indentation was lost; placing both text boxes
# inside the Row (and the button below it) is a reconstruction — confirm.
with gr.Blocks() as interface:
    gr.Markdown("### Yiddish Transcription Correction")

    with gr.Row():
        input_box = gr.Textbox(
            label="Raw Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="input_box",
        )
        output_box = gr.Textbox(
            label="Corrected Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="output_box",
        )

    submit_button = gr.Button("Correct")
    # Wire the button: input_box -> correct_transcription -> output_box.
    submit_button.click(
        correct_transcription,
        inputs=[input_box],
        outputs=[output_box],
    )

# Start the app.
interface.launch()