import sys
import time

try:
    import spaces
except ImportError:
    print("ZeroGPU is not available, skipping...")

    # Fallback stub (not part of the `spaces` package): make `@spaces.GPU` a no-op
    # so the decorated functions below still work when the package is unavailable,
    # e.g. when running outside a ZeroGPU Space.
    class spaces:
        @staticmethod
        def GPU(func):
            return func

import torch
import torchaudio
import gradio as gr
import torchaudio.transforms as T
import polars as pl

from importlib.metadata import version

from gradio.utils import is_zero_gpu_space
from gradio.themes import Base
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoProcessor,
    MoonshineForConditionalGeneration,
)
from doctr.io import DocumentFile
from doctr.models import ocr_predictor

use_zero_gpu = is_zero_gpu_space()
use_cuda = torch.cuda.is_available()

if use_zero_gpu:
    spaces_version = version("spaces")
    print("ZeroGPU is available, changing inference call.")
else:
    spaces_version = "N/A"
    print("ZeroGPU is not available, skipping...")

print(f"Spaces version: {spaces_version}")

if use_cuda:
    print("CUDA is available, setting correct `device` variable.")
    device = "cuda"
    torch_dtype = torch.bfloat16
else:
    device = "cpu"
    torch_dtype = torch.bfloat16

# Config
model_name = "Yehor/kulyk-en-uk"
concurrency_limit = 5
current_theme = Base()

# Load the translation model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
)
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load ASR
audio_processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-base")
audio_model = MoonshineForConditionalGeneration.from_pretrained(
    "UsefulSensors/moonshine-base", attn_implementation="sdpa"
)
audio_model.to(device)
audio_model.to(torch_dtype)

# Load OCR
ocr_model = ocr_predictor(pretrained=True)
ocr_model.to(device)

# Examples
examples_text = [
    "WP: F-16s are unlikely to make a significant difference on the battlefield",
    "Missile and 7 of 8 Shaheds shot down over Ukraine",
    "Olympic Games 2024. Schedule of competitions for Ukrainian athletes on 28 July",
    "Harris' campaign raised more than $200 million in less than a week",
    "Over the week, the NBU sold almost $800 million on the interbank market",
    "Paris 2024. Day 2: Text broadcast",
]

examples_audio = [
    "example_1.wav",
    "example_2.wav",
    "example_3.wav",
    "example_4.wav",
    "example_5.wav",
    "example_6.wav",
    "example_7.wav",
]

examples_image = [
    "example_1.jpg",
    "example_2.jpg",
    "example_3.jpg",
    "example_4.jpg",
    "example_5.jpg",
    "example_6.jpg",
]

title = "EN-UK Translator"

authors_table = """
## Authors

Follow them on social networks and **contact** them if you need any help or have any questions:

| **Yehor Smoliakov**                          |
|----------------------------------------------|
| https://t.me/smlkw in Telegram               |
| https://x.com/yehor_smoliakov at X           |
| https://github.com/egorsmkv at GitHub        |
| https://huggingface.co/Yehor at Hugging Face |
| or use egorsmkv@gmail.com                    |
""".strip()

description_head = f"""
# {title}

This space translates your text, audio, or images from English to Ukrainian using the [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk) model. Also, check out the [UK-EN Translator](https://huggingface.co/spaces/Yehor/uk-en-translator).
""".strip()

tech_env = f"""
#### Environment

- Python: {sys.version}

#### Models

- [kulyk-en-uk](https://huggingface.co/Yehor/kulyk-en-uk)
- [moonshine-base](https://huggingface.co/UsefulSensors/moonshine-base)
- [doctr](https://github.com/mindee/doctr)
""".strip()

tech_libraries = f"""
#### Libraries

- torch: {version("torch")}
- gradio: {version("gradio")}
- transformers: {version("transformers")}
""".strip()


def translate(text: str) -> str:
    prompt = "Translate the text to Ukrainian:\n" + text

    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)

    output = model.generate(
        input_ids,
        max_new_tokens=2048,
        # Greedy Search
        do_sample=False,
        repetition_penalty=1.05,
        # Sampling
        # do_sample=True,
        # temperature=0.1,
        # # top_k=1,
        # min_p=0.9,
        # repetition_penalty=1.05,
    )

    # Decode only the newly generated tokens, skipping the prompt
    prompt_len = input_ids.shape[1]
    generated_tokens = output[:, prompt_len:]
    translated_text = tokenizer.batch_decode(
        generated_tokens, skip_special_tokens=True
    )[0]

    return translated_text.strip()


@spaces.GPU
def inference_text(text, progress=gr.Progress()):
    if not text:
        raise gr.Error("Please paste your text.")

    progress(0, desc="Translating...")

    results = []

    sentences = text.split("\n")

    non_empty_sentences = []
    for sentence in sentences:
        s = sentence.strip()
        if len(s) != 0:
            non_empty_sentences.append(s)

    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        translated_text = translated_text.strip()

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


@spaces.GPU
def inference_audio(audio, progress=gr.Progress()):
    if not audio:
        raise gr.Error("Please upload an audio file.")

    progress(0, desc="Translating...")

    if isinstance(audio, str):
        audio_array, sr = torchaudio.load(audio)
        audio_array = audio_array.squeeze()
    else:
        audio_array, sr = audio

    r_sr = audio_processor.feature_extractor.sampling_rate

    print("Audio processor SR:", r_sr)
    print("Audio file SR:", sr)

    if r_sr != sr:
        print("Resampling...")
        resampler = T.Resample(orig_freq=sr, new_freq=r_sr)
        audio_array = resampler(audio_array)

    inputs = audio_processor(audio_array, return_tensors="pt", sampling_rate=r_sr)
    inputs = inputs.to(device, dtype=torch_dtype)

    # To avoid hallucination loops, limit the maximum length of the generated text
    # based on the expected number of tokens per second.
    token_limit_factor = (
        6.5 / audio_processor.feature_extractor.sampling_rate
    )  # Maximum of 6.5 tokens per second
    seq_lens = inputs.attention_mask.sum(dim=-1)
    max_length = int((seq_lens * token_limit_factor).max().item())

    generated_ids = audio_model.generate(**inputs, max_length=max_length)

    predictions = audio_processor.batch_decode(generated_ids, skip_special_tokens=True)
    print("Predictions:", predictions)

    text = predictions[0]
    print("Text:", text)

    results = []

    sentences = text.split("\n")

    non_empty_sentences = []
    for sentence in sentences:
        s = sentence.strip()
        if len(s) != 0:
            non_empty_sentences.append(s)

    for sentence in progress.tqdm(
        non_empty_sentences, desc="Translating...", unit="sentence"
    ):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)

@spaces.GPU
def inference_image(image, progress=gr.Progress()):
    if not image:
        raise gr.Error("Please upload an image file.")

    progress(0, desc="Translating...")

    if isinstance(image, str):
        doc = DocumentFile.from_images(image)
    else:
        raise gr.Error("Please upload an image file.")

    # Recognize text in the image before translating it
    result = ocr_model(doc)
    text = result.render()
    print("Text:", text)

    results = []

    sentences = [text.replace("\n", " ")]

    for sentence in progress.tqdm(sentences, desc="Translating...", unit="sentence"):
        t0 = time.time()

        translated_text = translate(sentence)

        elapsed_time = round(time.time() - t0, 2)

        results.append(
            {
                "sentence": sentence,
                "translated_text": translated_text,
                "elapsed_time": elapsed_time,
            }
        )

    gr.Info("Finished!", duration=2)

    return pl.DataFrame(results)


def create_app():
    tab = gr.Blocks(
        title=title,
        analytics_enabled=False,
        theme=current_theme,
    )

    with tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        text = gr.Textbox(label="Text", autofocus=True, lines=5)

        gr.Button("Translate").click(
            inference_text,
            concurrency_limit=concurrency_limit,
            inputs=text,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(label="Choose an example", inputs=text, examples=examples_text)

    return tab


def create_audio_app():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        audio = gr.Audio(label="Audio file", sources="upload", type="filepath")

        gr.Button("Translate").click(
            inference_audio,
            concurrency_limit=concurrency_limit,
            inputs=audio,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example", inputs=audio, examples=examples_audio
            )

    return tab


def create_image_app():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(description_head)

        gr.Markdown("## Usage")

        translated_text = gr.DataFrame(
            label="Translated text",
        )

        image = gr.Image(label="Image file", sources="upload", type="filepath")

        gr.Button("Translate").click(
            inference_image,
            concurrency_limit=concurrency_limit,
            inputs=image,
            outputs=translated_text,
        )

        with gr.Row():
            gr.Examples(
                label="Choose an example", inputs=image, examples=examples_image
            )

    return tab


def create_env():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(tech_env)
        gr.Markdown(tech_libraries)

    return tab


def create_authors():
    with gr.Blocks(theme=current_theme) as tab:
        gr.Markdown(authors_table)

    return tab


def create_demo():
    app_tab = create_app()
    app_audio_tab = create_audio_app()
    app_image_tab = create_image_app()
    authors_tab = create_authors()
    env_tab = create_env()

    return gr.TabbedInterface(
        [app_tab, app_audio_tab, app_image_tab, authors_tab, env_tab],
        tab_names=[
            "✍️ Text",
            "🔊 Audio",
            "👀 Image",
            "👥 Authors",
            "📦 Environment, Models, and Libraries",
        ],
    )


if __name__ == "__main__":
    demo = create_demo()
    demo.queue()
    demo.launch()