Spaces:

MarineLives
/

yiddish-transcription-correction

Sleeping

App Files Files Community

yiddish-transcription-correction / app.py

Addaci

Update app.py

bdbc86a verified 7 months ago

raw

history blame contribute delete

1.63 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

	# Load the fine-tuned seq2seq checkpoint and its matching tokenizer once,
	# at module import time. correct_transcription() below reads `tokenizer`
	# and `model` as module globals, so every request reuses these objects
	# instead of reloading them.
	# NOTE(review): the identifier suggests a ByT5-small fine-tune for Yiddish;
	# confirm the architecture and training setup against the model card.
	model_name = "Addaci/byt5-small-finetuned-yiddish-experiment-10"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

	# Define the correction function
	def correct_transcription(input_text):
	    """Return the model's corrected version of a raw Yiddish transcription.

	    The task instruction is prepended to ``input_text``, the combined string
	    is tokenized (with truncation enabled) into PyTorch tensors, the module
	    level ``model`` generates an output sequence capped at ``max_length=512``
	    tokens, and the first generated sequence is decoded back to text with
	    special tokens removed.

	    Args:
	        input_text: Raw transcription text as entered in the UI. ``None``,
	            an empty string, or whitespace-only text is treated as "nothing
	            to correct".

	    Returns:
	        The decoded model output as a string, or ``""`` when there is no
	        input text to correct.
	    """
	    # Without this guard the model would be asked to "correct" the bare
	    # instruction prompt and return arbitrary text for an empty box.
	    if not input_text or not input_text.strip():
	        return ""

	    # Add task instruction to the input
	    prompt = "Correct the following raw Yiddish HTR output from Transkribus into authentic Yiddish correspondence. Keep period-specific spellings, regional variations, and Hebrew-origin words as they were historically used. Fix only clear OCR errors, such as broken words or nonsensical combinations. Retain proper nouns, place names, abbreviations, and informal language. Maintain original line breaks and formatting:\n"
	    input_ids = tokenizer(prompt + input_text, return_tensors="pt", truncation=True).input_ids
	    output_ids = model.generate(input_ids, max_length=512)
	    # generate() returns a batch; only one sequence was submitted.
	    corrected_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
	    return corrected_text

	# Gradio Interface
	# Components and event listeners are created inside the Blocks context so
	# that they are registered on `interface`.
	# NOTE(review): the nesting below was reconstructed from a flattened copy
	# of this file (textboxes side by side in one Row, button beneath it);
	# confirm the intended layout.
	with gr.Blocks() as interface:
	    gr.Markdown("### Yiddish Transcription Correction")

	    # Input and output side by side; rtl=True renders Hebrew script
	    # right-to-left.
	    with gr.Row():
	        input_box = gr.Textbox(label="Raw Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="input_box")
	        output_box = gr.Textbox(label="Corrected Transcription (Hebrew Script)", lines=1, rtl=True, elem_id="output_box")

	    # Clicking the button sends the input box's text through
	    # correct_transcription() and writes the result to the output box.
	    submit_button = gr.Button("Correct")
	    submit_button.click(correct_transcription, inputs=[input_box], outputs=[output_box])

	# Launch the interface
	interface.launch()