import gradio as gr
from google import genai
from google.genai import types
import re
import os
import time
import io

import numpy as np
from pydub import AudioSegment
import soundfile as sf

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))


def transcribe_audio(audio_path):
    """Send a WAV file to Gemini and return the raw transcription text."""
    try:
        with open(audio_path, 'rb') as audio_file:
            audio_bytes = audio_file.read()
        print(f"Transcribing: {audio_path}")
        prompt = """
        Transcribe the provided audio in its original spoken language and return only the transcribed text.
        Instructions:
        1. Do not translate; only transcribe in the original language.
        2. If no speech is detected, return an empty string.
        3. Do not generate text yourself; return only the transcription.
        4. Ignore background noise and focus only on the speaker.
        5. Handle incomplete sentences or short clips by transcribing all audible words."""
        response = client.models.generate_content(
            model="gemini-2.5-pro",
            contents=[
                prompt,
                types.Part.from_bytes(
                    data=audio_bytes,
                    mime_type='audio/wav'
                )
            ]
        )
        if response is None or response.text is None:
            print(f"No transcription returned for {audio_path}")
            return None
        response_text = response.text.strip()
        print(f"Transcription for {audio_path}: {response_text}")
        return response_text
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None


def translate_text(text, target_language):
    """Translate text into target_language; return the input unchanged on failure."""
    if not text or text == "":
        return text
    try:
        prompt = f"""
        Translate the following text to {target_language}. Return only the translated text.
        Text: {text}
        Instructions:
        1. Preserve the meaning and context of the original text.
        2. Do not add any additional text or explanations.
        3. If the text is already in {target_language}, return it unchanged."""
        response = client.models.generate_content(
            model="gemini-2.0-flash",
            contents=prompt
        )
        if response is None or response.text is None:
            print(f"No translation returned for text: {text}")
            return text
        translated_text = response.text.strip()
        print(f"Translated to {target_language}: {translated_text}")
        return translated_text
    except Exception as e:
        print(f"Error translating text: {e}")
        return text


def save_transcription(transcription, filename="transcription.txt"):
    """Append the transcription to a text file."""
    try:
        with open(filename, 'a', encoding='utf-8') as f:
            f.write(transcription + "\n")
        print(f"Saved transcription to {filename}")
    except Exception as e:
        print(f"Error saving transcription to {filename}: {e}")


def summarize_transcription(state):
    """Read the transcription file, generate a summary, and delete the file."""
    try:
        # The click handler passes the whole session state dict, so pull the
        # selected language out of it rather than taking it as a bare string.
        selected_language = state["selected_language"]
        transcription_file = "transcription.txt"
        if not os.path.exists(transcription_file):
            return f"No transcription file available to summarize in {selected_language}."
        with open(transcription_file, 'r', encoding='utf-8') as f:
            text = f.read().strip()
        if not text:
            return f"No transcription available to summarize in {selected_language}."
        prompt = f"""
        The following text is in {selected_language}. Perform the following:
        1. Identify all tasks mentioned in the transcription.
        2. List the tasks in bullet points in {selected_language}.
        3. Provide a concise summary of the transcription in {selected_language}.
        Transcription: {text}
        """
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt
        )
        if response is None or response.text is None:
            return "Failed to generate summary."
        summary_text = response.text.strip()
        try:
            os.remove(transcription_file)
            print(f"Deleted transcription file: {transcription_file}")
        except Exception as e:
            print(f"Error deleting transcription file: {e}")
        return summary_text
    except Exception as e:
        return f"Error summarizing transcription: {e}"


def init_state():
    """Fresh per-session state for the Gradio app."""
    return {
        "transcription": "",
        "audio_data": None,
        "sample_rate": None,
        "last_audio_path": None,
        "processed_samples": 0,
        "last_transcription": "",
        "selected_language": "Uzbek"  # Default language
    }


def deduplicate_transcriptions(prev_transcription, current_transcription):
    """Drop up to three words repeated across the window overlap boundary."""
    if not prev_transcription or not current_transcription:
        return current_transcription
    prev_words = prev_transcription.split()
    curr_words = current_transcription.split()
    overlap_len = min(len(prev_words), len(curr_words), 3)
    for i in range(overlap_len, 0, -1):
        if prev_words[-i:] == curr_words[:i]:
            deduped = " ".join(curr_words[i:]) if curr_words[i:] else ""
            print(f"Deduplicated: Removed {prev_words[-i:]} from {current_transcription}")
            return deduped
    return current_transcription


def get_last_two_sentences(text):
    """Return the last two sentences of text for the compact live display."""
    sentences = [s.strip() for s in re.split(r'[.!?]+', text.strip()) if s.strip()]
    if len(sentences) >= 2:
        return " ".join(sentences[-2:])
    return " ".join(sentences) if sentences else ""


def process_audio_chunk(new_chunk, state):
    """Buffer streamed microphone audio and transcribe 5 s windows with 0.2 s overlap."""
    chunk_duration = 5      # seconds of audio per transcription window
    overlap_duration = 0.2  # seconds shared between consecutive windows
    if new_chunk is None:   # guard against an empty stream event
        return state, get_last_two_sentences(state["transcription"])
    sample_rate, audio_data = new_chunk
    selected_language = state["selected_language"]
    # Accumulate incoming samples in the session buffer.
    if state["audio_data"] is None:
        state["audio_data"] = audio_data
        state["sample_rate"] = sample_rate
    else:
        state["audio_data"] = np.concatenate([state["audio_data"], audio_data])
    total_samples = len(state["audio_data"])
    chunk_samples = int(chunk_duration * sample_rate)
    overlap_samples = int(overlap_duration * sample_rate)
    step_samples = chunk_samples - overlap_samples
    transcriptions = []
    while total_samples - state["processed_samples"] >= chunk_samples:
        start_sample = state["processed_samples"]
        end_sample = start_sample + chunk_samples
        chunk_data = state["audio_data"][start_sample:end_sample]
        chunk_filename = f"chunk_{int(time.time())}_{start_sample}.wav"
        # Round-trip through an in-memory WAV so pydub writes a clean file.
        with io.BytesIO() as f:
            sf.write(f, chunk_data, sample_rate, format="wav")
            f.seek(0)
            audio_segment = AudioSegment.from_file(f, format="wav")
            audio_segment.export(chunk_filename, format="wav")
        if chunk_filename != state["last_audio_path"]:
            state["last_audio_path"] = chunk_filename
            raw_transcription = transcribe_audio(chunk_filename)
            if raw_transcription and raw_transcription != "":
                transcription = translate_text(raw_transcription, selected_language)
                deduped_transcription = deduplicate_transcriptions(
                    state["last_transcription"], transcription)
                if deduped_transcription:
                    transcriptions.append(deduped_transcription)
                    save_transcription(deduped_transcription)
                state["last_transcription"] = transcription
            else:
                transcriptions.append("")
                state["last_transcription"] = ""
            try:
                if os.path.exists(chunk_filename):
                    os.remove(chunk_filename)
                    print(f"Deleted audio chunk: {chunk_filename}")
            except Exception as e:
                print(f"Error deleting audio chunk {chunk_filename}: {e}")
        state["processed_samples"] += step_samples
    if transcriptions:
        state["transcription"] += " ".join(t for t in transcriptions if t) + " "
    # Trim consumed samples, keeping the overlap tail for the next window.
    if state["processed_samples"] > 0:
        state["audio_data"] = state["audio_data"][state["processed_samples"]:]
        state["processed_samples"] = 0
    short_transcription = get_last_two_sentences(state["transcription"])
    return state, short_transcription
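# Window arithmetic in process_audio_chunk above, for illustration (assuming a
# hypothetical 16 kHz microphone stream; the actual rate comes from Gradio):
#   chunk_samples   = 5.0 * 16000 = 80000
#   overlap_samples = 0.2 * 16000 = 3200
#   step_samples    = 80000 - 3200 = 76800
# Consecutive windows therefore share the last 0.2 s of audio, which is the
# repetition that deduplicate_transcriptions() removes on the text side.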
def update_language(language, state):
    """Store the dropdown selection in the session state."""
    state["selected_language"] = language
    return state


custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Calibri');
#output_textbox textarea {
    font-size: 24px !important;
    font-family: Calibri, sans-serif !important;
    padding: 15px;
}
#summary_textbox {
    font-size: 18px !important;
    font-family: Calibri, sans-serif !important;
    padding: 10px;
}
#send_button {
    width: 100px !important;
    font-size: 14px !important;
    font-family: Calibri, sans-serif !important;
}
#language_dropdown {
    font-family: Calibri, sans-serif !important;
}
"""

enhanced_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="slate",
    neutral_hue="gray"
)

with gr.Blocks(
    css=custom_css,
    theme=enhanced_theme,
    title='Live Transcription & Analysis'
) as demo:
    state = gr.State(value=init_state())
    gr.Markdown("## 🎙️ Live Transcription & Analysis")
    with gr.Row():
        with gr.Column(scale=2):
            language_dropdown = gr.Dropdown(
                choices=["Uzbek", "English", "Russian"],
                label="🌐 Select Language",
                value="Uzbek",
                elem_id="language_dropdown"
            )
            mic = gr.Audio(
                sources=["microphone"],
                streaming=True,
                label="🎤 Audio Input"
            )
            output = gr.Textbox(
                label="📝 Live Transcription",
                elem_id="output_textbox",
                lines=2,
                max_lines=3,
                placeholder="Start speaking to see transcription here..."
            )
        with gr.Column(scale=1):
            summarize_button = gr.Button(
                "✨ Summarize",
                elem_id='send_button',
                variant="primary"
            )
            summary_output = gr.Markdown(
                label="📋 Summary",
                elem_id="summary_textbox",
                value="*Summary will appear here after clicking Summarize*"
            )

    language_dropdown.change(
        fn=update_language,
        inputs=[language_dropdown, state],
        outputs=[state]
    )
    mic.stream(
        fn=process_audio_chunk,
        inputs=[mic, state],
        outputs=[state, output],
        stream_every=0.5
    )
    summarize_button.click(
        fn=summarize_transcription,
        inputs=[state],
        outputs=[summary_output]
    )

demo.launch()
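# Usage note (a sketch: package names are inferred from the imports above and
# the script filename is hypothetical):
#   pip install gradio google-genai numpy pydub soundfile
#   export GEMINI_API_KEY="your-api-key"
#   python app.py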