Spaces:

prithivMLmods
/

Doc-VLMs-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Mar 20

Commit

8ec6920

verified ·

1 Parent(s): 0d5b113

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -35

app.py CHANGED Viewed

@@ -83,12 +83,12 @@ orpheus_tts_model.to(tts_device)
 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")
-# Global parameters for chat responses
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-# (Image generation code has been removed.)
 MAX_SEED = np.iinfo(np.int32).max
@@ -200,7 +200,7 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
     if not text.strip():
         return None
     try:
-        # Generate speech without internal progress calls (UI progress is handled externally)
         input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
         with torch.no_grad():
             generated_ids = orpheus_tts_model.generate(
@@ -233,7 +233,7 @@ def generate(
     repetition_penalty: float = 1.2,
 ):
     """
-    Generates chatbot responses with support for video processing,
     TTS, and LLM-augmented TTS.
     Trigger commands:
@@ -335,39 +335,64 @@ def generate(
             yield gr.Audio(audio_output, autoplay=True)
             return
-    # Default branch for regular chat (text without explicit TTS trigger)
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
-    # Process using the DeepHermes LLM
-    input_ids = hermes_llm_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(hermes_llm_model.device)
-    streamer = TextIteratorStreamer(hermes_llm_tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        "input_ids": input_ids,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "top_p": top_p,
-        "top_k": top_k,
-        "temperature": temperature,
-        "num_beams": 1,
-        "repetition_penalty": repetition_penalty,
-    }
-    t = Thread(target=hermes_llm_model.generate, kwargs=generation_kwargs)
-    t.start()
-    outputs = []
-    yield progress_bar_html("Processing with DeepHermes LLM")
-    for new_text in streamer:
-        outputs.append(new_text)
-        yield "".join(outputs)
-    final_response = "".join(outputs)
-    yield final_response
-    # Also convert the final response to speech using a default voice ("tara")
-    audio_output = generate_speech(final_response, "tara", temperature, top_p, repetition_penalty, max_new_tokens)
-    yield gr.Audio(audio_output, autoplay=True)
 # Gradio Interface
 demo = gr.ChatInterface(
@@ -386,6 +411,7 @@ demo = gr.ChatInterface(
         ["@josh-llm What causes rainbows to form?"],
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],

 orpheus_tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_name)
 print(f"Orpheus TTS model loaded to {tts_device}")
+# Some global parameters for chat responses
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+# (Image generation related code has been fully removed.)
 MAX_SEED = np.iinfo(np.int32).max
     if not text.strip():
         return None
     try:
+        # Removed in-function progress calls to maintain UI consistency.
         input_ids, attention_mask = process_prompt(text, voice, orpheus_tts_tokenizer, tts_device)
         with torch.no_grad():
             generated_ids = orpheus_tts_model.generate(
     repetition_penalty: float = 1.2,
 ):
     """
+    Generates chatbot responses with support for multimodal input, video processing,
     TTS, and LLM-augmented TTS.
     Trigger commands:
             yield gr.Audio(audio_output, autoplay=True)
             return
+    # Default branch for regular chat (text and multimodal without TTS).
     conversation = clean_chat_history(chat_history)
     conversation.append({"role": "user", "content": text})
+    # If files are provided, each one is loaded as an image and the prompt is processed via Qwen2VL.
+    if files:
+        # Process files using the processor (this branch no longer handles image generation)
+        if len(files) > 1:
+            inputs_list = [load_image(image) for image in files]
+        elif len(files) == 1:
+            inputs_list = [load_image(files[0])]
+        else:
+            inputs_list = []
+        messages = [{
+            "role": "user",
+            "content": [
+                *[{"type": "image", "image": img} for img in inputs_list],
+                {"type": "text", "text": text},
+            ]
+        }]
+        prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=[prompt_full], images=inputs_list, return_tensors="pt", padding=True).to("cuda")
+        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
+        thread = Thread(target=model_m.generate, kwargs=generation_kwargs)
+        thread.start()
+        buffer = ""
+        yield progress_bar_html("Processing with Qwen2VL")
+        for new_text in streamer:
+            buffer += new_text.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer
+    else:
+        input_ids = hermes_llm_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+        if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+            input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+            gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+        input_ids = input_ids.to(hermes_llm_model.device)
+        streamer = TextIteratorStreamer(hermes_llm_tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = {
+            "input_ids": input_ids,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "top_p": top_p,
+            "top_k": top_k,
+            "temperature": temperature,
+            "num_beams": 1,
+            "repetition_penalty": repetition_penalty,
+        }
+        t = Thread(target=hermes_llm_model.generate, kwargs=generation_kwargs)
+        t.start()
+        outputs = []
+        yield progress_bar_html("Processing with DeepHermes LLM")
+        for new_text in streamer:
+            outputs.append(new_text)
+            yield "".join(outputs)
+        final_response = "".join(outputs)
+        yield final_response
 # Gradio Interface
 demo = gr.ChatInterface(
         ["@josh-llm What causes rainbows to form?"],
         ["@dan-tts Yo, I’m Dan, [groan] and yes, I can even sound annoyed if I have to."],
         ["Write python program for array rotation"],
+        [{"text": "summarize the letter", "files": ["examples/1.png"]}],
         ["@tara-tts Hey there, my name is Tara, [laugh] and I’m a speech generation model that can sound just like you!"],
         ["@tara-llm Who is Nikola Tesla, and why did he die?"],
         ["@emma-llm Explain the causes of rainbows"],