Upload app.py

app.py CHANGED
@@ -11,7 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image, ImageOps
-import cv2
+#import cv2
 
 from transformers import (
     Qwen2VLForConditionalGeneration,
@@ -29,7 +29,7 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS
+MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
@@ -191,8 +191,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
-                   temperature: float = 0.
-                   top_p: float = 0.
+                   temperature: float = 0.1,
+                   top_p: float = 0.6,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
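The new sampling defaults (temperature 0.1, top_p 0.6) only matter where generate_video forwards them to the model, which this diff does not show. A minimal sketch of the usual transformers call, assuming `model` and `inputs` are the Qwen2-VL-style model and preprocessed tensors that app.py builds elsewhere:

    # Sketch only, not the exact app.py code: forwarding the UI parameters
    # to transformers' generate(). `model` and `inputs` are assumed.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=True,          # temperature/top_p/top_k only apply when sampling
        temperature=0.1,         # new default in this commit
        top_p=0.6,               # new default in this commit
        top_k=50,
        repetition_penalty=1.2,
    )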
@@ -277,20 +277,20 @@ def generate_video(model_name: str, text: str, video_path: str,
     # yield cleaned_output
 
 # Define examples for image and video inference
-image_examples = [
-    ["OCR the image", "images/2.jpg"],
-    ["Convert this page to docling", "images/1.png"],
-    ["Convert this page to docling", "images/3.png"],
-    ["Convert chart to OTSL.", "images/4.png"],
-    ["Convert code to text", "images/5.jpg"],
-    ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
-]
-
-video_examples = [
-    ["Explain the video in detail.", "videos/1.mp4"],
-    ["Explain the video in detail.", "videos/2.mp4"]
-]
+# image_examples = [
+#     ["OCR the image", "images/2.jpg"],
+#     ["Convert this page to docling", "images/1.png"],
+#     ["Convert this page to docling", "images/3.png"],
+#     ["Convert chart to OTSL.", "images/4.png"],
+#     ["Convert code to text", "images/5.jpg"],
+#     ["Convert this table to OTSL.", "images/6.jpg"],
+#     ["Convert formula to late.", "images/7.jpg"],
+# ]
+
+# video_examples = [
+#     ["Explain the video in detail.", "videos/1.mp4"],
+#     ["Explain the video in detail.", "videos/2.mp4"]
+# ]
 
 css = """
 .submit-btn {
@@ -304,7 +304,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[
+    gr.Markdown("# **[Typhoon OCR Finetuned](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -312,22 +312,22 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=image_examples,
-                        inputs=[image_query, image_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=image_examples,
+                    #     inputs=[image_query, image_upload]
+                    # )
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=video_examples,
-                        inputs=[video_query, video_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=video_examples,
+                    #     inputs=[video_query, video_upload]
+                    # )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.
-                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.1)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.6)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
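Both gr.Examples blocks are disabled the same way as the example lists in the hunk at line 277; restoring them just means uncommenting both sides. A sketch of the image tab's wiring, assuming the files under images/ exist in the repo:

    # Sketch: the example widget as it would look re-enabled. Needs the
    # image_examples list (hunk at line 277) uncommented too.
    gr.Examples(
        examples=image_examples,             # [["OCR the image", "images/2.jpg"], ...]
        inputs=[image_query, image_upload],  # clicking an example fills both inputs
    )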
@@ -335,15 +335,15 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             model_choice = gr.Radio(
                 choices=["Adun/typhoon_ocr-7B-v1.4", "Typhoon-OCR-7B"],
                 label="Select Model",
-                value="
+                value="Adun/typhoon_ocr-7B-v1.4"
             )
 
-
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/Adun/Multimodal-OCR2/discussions)")
 
             # gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
             # gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             # gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by
+            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by Adun](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4): Finetuned Typhoon-OCR to understand Thai-language instructions even better")
             gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
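The .click() wiring that connects the Submit buttons to generate_image and generate_video is outside this diff. In Gradio Blocks apps of this shape it typically looks like the sketch below; `output` stands in for whatever result component app.py actually defines:

    # Sketch of typical wiring, not taken from app.py. `output` is a placeholder
    # for the app's real result component; the input order matches the
    # generate_image signature shown in the hunk headers above.
    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload,
                max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=output,
    )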