import gradio as gr
from google import genai
from google.genai import types
import re
import os
import time
import io

import numpy as np
from pydub import AudioSegment
import soundfile as sf

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))


def transcribe_audio(audio_path):
    """Send a WAV file to Gemini and return the raw transcription text."""
    try:
        with open(audio_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        print(f"Transcribing: {audio_path}")
        prompt = """
        Transcribe the provided audio in its original spoken language and return only the transcribed text.
        Instructions:
        1. Do not translate; only transcribe in the original language.
        2. If no speech is detected, return an empty string.
        3. Do not generate text yourself; return only the transcription.
        4. Ignore background noise and focus only on the speaker.
        5. Handle incomplete sentences or short clips by transcribing all audible words."""
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=[
                prompt,
                types.Part.from_bytes(
                    data=audio_bytes,
                    mime_type='audio/wav'
                )
            ]
        )
        if response is None or response.text is None:
            print(f"No transcription returned for {audio_path}")
            return None
        response_text = response.text.strip()
        print(f"Transcription for {audio_path}: {response_text}")
        return response_text
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None


def translate_text(text, target_language):
    """Translate text into target_language; return the input unchanged on failure."""
    if not text or text == "":
        return text
    try:
        prompt = f"""
        Translate the following text to {target_language}. Return only the translated text.
        Text: {text}
        Instructions:
        1. Preserve the meaning and context of the original text.
        2. Do not add any additional text or explanations.
        3. If the text is already in {target_language}, return it unchanged."""
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )
        if response is None or response.text is None:
            print(f"No translation returned for text: {text}")
            return text
        translated_text = response.text.strip()
        print(f"Translated to {target_language}: {translated_text}")
        return translated_text
    except Exception as e:
        print(f"Error translating text: {e}")
        return text


def save_transcription(transcription, filename="transcription.txt"):
    """Append the transcription to a text file."""
    try:
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(transcription + "\n")
        print(f"Saved transcription to {filename}")
    except Exception as e:
        print(f"Error saving transcription to {filename}: {e}")


def summarize_transcription(state):
    """Read the transcription file, generate a summary, and delete the file."""
    try:
        # The click handler passes the whole session state dict, so pull the
        # selected language out of it rather than taking it as a bare string.
        selected_language = state["selected_language"]
        transcription_file = "transcription.txt"
        if not os.path.exists(transcription_file):
            return f"No transcription file available to summarize in {selected_language}."
        with open(transcription_file, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        if not text:
            return f"No transcription available to summarize in {selected_language}."
        prompt = f"""
        The following text is in {selected_language}. Perform the following:
        1. Identify all tasks mentioned in the transcription.
        2. List the tasks in bullet points in {selected_language}.
        3. Provide a concise summary of the transcription in {selected_language}.
        Transcription: {text}
        """
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt
        )
        if response is None or response.text is None:
            return "Failed to generate summary."
        summary_text = response.text.strip()
        try:
            os.remove(transcription_file)
            print(f"Deleted transcription file: {transcription_file}")
        except Exception as e:
            print(f"Error deleting transcription file: {e}")
        return summary_text
    except Exception as e:
        return f"Error summarizing transcription: {e}"


def init_state():
    """Fresh per-session state for the Gradio app."""
    return {
        "transcription": "",
        "audio_data": None,
        "sample_rate": None,
        "last_audio_path": None,
        "processed_samples": 0,
        "last_transcription": "",
        "selected_language": "Uzbek"  # Default language
    }


def deduplicate_transcriptions(prev_transcription, current_transcription):
    """Drop up to three words repeated across the window overlap boundary."""
    if not prev_transcription or not current_transcription:
        return current_transcription
    prev_words = prev_transcription.split()
    curr_words = current_transcription.split()
    overlap_len = min(len(prev_words), len(curr_words), 3)
    for i in range(overlap_len, 0, -1):
        if prev_words[-i:] == curr_words[:i]:
            deduped = " ".join(curr_words[i:]) if curr_words[i:] else ""
            print(f"Deduplicated: Removed {prev_words[-i:]} from {current_transcription}")
            return deduped
    return current_transcription


def get_last_two_sentences(text):
    """Return the last two sentences of text for the compact live display."""
    sentences = [s.strip() for s in re.split(r'[.!?]+', text.strip()) if s.strip()]
    if len(sentences) >= 2:
        return " ".join(sentences[-2:])
    return " ".join(sentences) if sentences else ""


def process_audio_chunk(new_chunk, state):
    """Buffer streamed microphone audio and transcribe 5 s windows with 0.2 s overlap."""
    chunk_duration = 5      # seconds of audio per transcription window
    overlap_duration = 0.2  # seconds shared between consecutive windows
    if new_chunk is None:   # guard against an empty stream event
        return state, get_last_two_sentences(state["transcription"])
    sample_rate, audio_data = new_chunk
    selected_language = state["selected_language"]
    # Accumulate incoming samples in the session buffer.
    if state["audio_data"] is None:
        state["audio_data"] = audio_data
        state["sample_rate"] = sample_rate
    else:
        state["audio_data"] = np.concatenate([state["audio_data"], audio_data])
    total_samples = len(state["audio_data"])
    chunk_samples = int(chunk_duration * sample_rate)
    overlap_samples = int(overlap_duration * sample_rate)
    step_samples = chunk_samples - overlap_samples
    transcriptions = []
    while total_samples - state["processed_samples"] >= chunk_samples:
        start_sample = state["processed_samples"]
        end_sample = start_sample + chunk_samples
        chunk_data = state["audio_data"][start_sample:end_sample]
        chunk_filename = f"chunk_{int(time.time())}_{start_sample}.wav"
        # Round-trip through an in-memory WAV so pydub writes a clean file.
        with io.BytesIO() as f:
            sf.write(f, chunk_data, sample_rate, format="wav")
            f.seek(0)
            audio_segment = AudioSegment.from_file(f, format="wav")
            audio_segment.export(chunk_filename, format="wav")
        if chunk_filename != state["last_audio_path"]:
            state["last_audio_path"] = chunk_filename
            raw_transcription = transcribe_audio(chunk_filename)
            if raw_transcription and raw_transcription != "":
                transcription = translate_text(raw_transcription, selected_language)
                deduped_transcription = deduplicate_transcriptions(
                    state["last_transcription"], transcription)
                if deduped_transcription:
                    transcriptions.append(deduped_transcription)
                    save_transcription(deduped_transcription)
                state["last_transcription"] = transcription
            else:
                transcriptions.append("")
                state["last_transcription"] = ""
            try:
                if os.path.exists(chunk_filename):
                    os.remove(chunk_filename)
                    print(f"Deleted audio chunk: {chunk_filename}")
            except Exception as e:
                print(f"Error deleting audio chunk {chunk_filename}: {e}")
        state["processed_samples"] += step_samples
    if transcriptions:
        state["transcription"] += " ".join(t for t in transcriptions if t) + " "
    # Trim consumed samples, keeping the overlap tail for the next window.
    if state["processed_samples"] > 0:
        state["audio_data"] = state["audio_data"][state["processed_samples"]:]
        state["processed_samples"] = 0
    short_transcription = get_last_two_sentences(state["transcription"])
    return state, short_transcription
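# Window arithmetic in process_audio_chunk above, for illustration (assuming a
# hypothetical 16 kHz microphone stream; the actual rate comes from Gradio):
#   chunk_samples   = 5.0 * 16000 = 80000
#   overlap_samples = 0.2 * 16000 = 3200
#   step_samples    = 80000 - 3200 = 76800
# Consecutive windows therefore share the last 0.2 s of audio, which is the
# repetition that deduplicate_transcriptions() removes on the text side.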
def update_language(language, state):
    """Store the dropdown selection in the session state."""
    state["selected_language"] = language
    return state


custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Calibri');
#output_textbox textarea {
    font-size: 24px !important;
    font-family: Calibri, sans-serif !important;
    padding: 15px;
}
#summary_textbox {
    font-size: 18px !important;
    font-family: Calibri, sans-serif !important;
    padding: 10px;
}
#send_button {
    width: 100px !important;
    font-size: 14px !important;
    font-family: Calibri, sans-serif !important;
}
#language_dropdown {
    font-family: Calibri, sans-serif !important;
}
"""

enhanced_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="gray"
)

with gr.Blocks(
    css=custom_css,
    theme=enhanced_theme,
    title='Live Transcription & Analysis'
) as demo:
    state = gr.State(value=init_state())
    gr.Markdown("## 🎙️ Live Transcription & Analysis")
    with gr.Row():
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=["Uzbek", "English", "Russian"],
                label="🌐 Select Language",
                value="Uzbek",
                elem_id="language_dropdown"
            )
            mic = gr.Audio(
                sources=["microphone"],
                streaming=True,
                label="🎤 Audio Input"
            )
            output = gr.Textbox(
                label="📝 Live Transcription",
                elem_id="output_textbox",
                lines=2,
                max_lines=3,
                placeholder="Start speaking to see transcription here..."
            )
        with gr.Column(scale=1):
            summarize_button = gr.Button(
                "✨ Summarize",
                elem_id='send_button',
                variant="primary"
            )
            summary_output = gr.Markdown(
                label="📋 Summary",
                elem_id="summary_textbox",
                value="*Summary will appear here after clicking Summarize*"
            )

    language_dropdown.change(
        fn=update_language,
        inputs=[language_dropdown, state],
        outputs=[state]
    )
    mic.stream(
        fn=process_audio_chunk,
        inputs=[mic, state],
        outputs=[state, output],
        stream_every=0.5
    )
    summarize_button.click(
        fn=summarize_transcription,
        inputs=[state],
        outputs=[summary_output]
    )

demo.launch()
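# Usage note (a sketch: package names are inferred from the imports above and
# the script filename is hypothetical):
#   pip install gradio google-genai numpy pydub soundfile
#   export GEMINI_API_KEY="your-api-key"
#   python app.py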