|
import gradio as gr |
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
|
|
# Identifier of the fine-tuned checkpoint that both loaders below resolve.
model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"

# Tokenizer and model are created once at import time and reused by every
# request handled by the app.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
|
|
|
|
|
def correct_transcription(input_text):
    """Correct a raw Yiddish HTR transcription with the loaded seq2seq model.

    The fixed instruction prompt is prepended to ``input_text``; the combined
    string is tokenized, passed to ``model.generate`` and the first generated
    sequence is decoded back to text.

    Args:
        input_text: Raw transcription text to correct. ``None``, an empty
            string, or whitespace-only text is treated as "nothing to
            correct".

    Returns:
        The decoded model output with special tokens stripped, or ``""`` when
        there is no input text.
    """
    # Guard: None would raise TypeError on the concatenation below, and blank
    # input would make the model generate from the instruction prompt alone.
    if input_text is None or not input_text.strip():
        return ""

    # Implicit concatenation keeps the runtime prompt identical while keeping
    # source lines readable.
    prompt = (
        "Correct the following raw Yiddish HTR output from Transkribus into "
        "authentic Yiddish correspondence. Keep period-specific spellings, "
        "regional variations, and Hebrew-origin words as they were "
        "historically used. Fix only clear OCR errors, such as broken words "
        "or nonsensical combinations. Retain proper nouns, place names, "
        "abbreviations, and informal language. Maintain original line breaks "
        "and formatting:\n"
    )

    encoded = tokenizer(prompt + input_text, return_tensors="pt", truncation=True)

    # Hand the attention mask to generate() together with the ids instead of
    # discarding it and letting generate() infer one.
    # NOTE(review): max_length=512 caps the generated sequence in tokens; the
    # checkpoint name suggests a byte-level ByT5 model, so this may be far
    # fewer characters for Hebrew-script text -- confirm the intended limit.
    output_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=512,
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
|
|
|
|
# Assemble the page: a heading, one row holding the raw-input and
# corrected-output textboxes, and a button that runs the correction.
with gr.Blocks() as interface:
    gr.Markdown("### Yiddish Transcription Correction")

    with gr.Row():
        # Both textboxes are rendered right-to-left (rtl=True).
        input_box = gr.Textbox(
            label="Raw Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="input_box",
        )
        output_box = gr.Textbox(
            label="Corrected Transcription (Hebrew Script)",
            lines=1,
            rtl=True,
            elem_id="output_box",
        )

    submit_button = gr.Button("Correct")

    # Clicking the button feeds the input box to correct_transcription and
    # writes its return value into the output box.
    submit_button.click(
        fn=correct_transcription,
        inputs=[input_box],
        outputs=[output_box],
    )

# Start the Gradio server as soon as this module is executed.
interface.launch()