Upload app.py

app.py CHANGED
@@ -11,7 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image, ImageOps
-import cv2
+#import cv2
 
 from transformers import (
     Qwen2VLForConditionalGeneration,
@@ -29,7 +29,7 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS
+MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
@@ -191,8 +191,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
-                   temperature: float = 0.
-                   top_p: float = 0.
+                   temperature: float = 0.1,
+                   top_p: float = 0.6,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
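The new sampling defaults (temperature 0.1, top_p 0.6) only matter where generate_video forwards them to the model, which this diff does not show. A minimal sketch of the usual transformers call, assuming `model` and `inputs` are the Qwen2-VL-style model and preprocessed tensors that app.py builds elsewhere:

    # Sketch only, not the exact app.py code: forwarding the UI parameters
    # to transformers' generate(). `model` and `inputs` are assumed.
    output_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        do_sample=True,          # temperature/top_p/top_k only apply when sampling
        temperature=0.1,         # new default in this commit
        top_p=0.6,               # new default in this commit
        top_k=50,
        repetition_penalty=1.2,
    )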
@@ -277,20 +277,20 @@ def generate_video(model_name: str, text: str, video_path: str,
     # yield cleaned_output
 
 # Define examples for image and video inference
-image_examples = [
-    ["OCR the image", "images/2.jpg"],
-    ["Convert this page to docling", "images/1.png"],
-    ["Convert this page to docling", "images/3.png"],
-    ["Convert chart to OTSL.", "images/4.png"],
-    ["Convert code to text", "images/5.jpg"],
-    ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
-]
-
-video_examples = [
-    ["Explain the video in detail.", "videos/1.mp4"],
-    ["Explain the video in detail.", "videos/2.mp4"]
-]
+# image_examples = [
+#     ["OCR the image", "images/2.jpg"],
+#     ["Convert this page to docling", "images/1.png"],
+#     ["Convert this page to docling", "images/3.png"],
+#     ["Convert chart to OTSL.", "images/4.png"],
+#     ["Convert code to text", "images/5.jpg"],
+#     ["Convert this table to OTSL.", "images/6.jpg"],
+#     ["Convert formula to late.", "images/7.jpg"],
+# ]
+
+# video_examples = [
+#     ["Explain the video in detail.", "videos/1.mp4"],
+#     ["Explain the video in detail.", "videos/2.mp4"]
+# ]
 
 css = """
 .submit-btn {
@@ -304,7 +304,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[
+    gr.Markdown("# **[Typhoon OCR Finetuned](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -312,22 +312,22 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=image_examples,
-                        inputs=[image_query, image_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=image_examples,
+                    #     inputs=[image_query, image_upload]
+                    # )
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=video_examples,
-                        inputs=[video_query, video_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=video_examples,
+                    #     inputs=[video_query, video_upload]
+                    # )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.
-                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.1)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.6)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
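Both gr.Examples blocks are disabled the same way as the example lists in the hunk at line 277; restoring them just means uncommenting both sides. A sketch of the image tab's wiring, assuming the files under images/ exist in the repo:

    # Sketch: the example widget as it would look re-enabled. Needs the
    # image_examples list (hunk at line 277) uncommented too.
    gr.Examples(
        examples=image_examples,             # [["OCR the image", "images/2.jpg"], ...]
        inputs=[image_query, image_upload],  # clicking an example fills both inputs
    )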
@@ -335,15 +335,15 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             model_choice = gr.Radio(
                 choices=["Adun/typhoon_ocr-7B-v1.4", "Typhoon-OCR-7B"],
                 label="Select Model",
-                value="
+                value="Adun/typhoon_ocr-7B-v1.4"
             )
 
-
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/Adun/Multimodal-OCR2/discussions)")
 
             # gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
             # gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             # gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by
+            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by Adun](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4): Finetuned Typhoon-OCR to understand Thai-language instructions even better")
             gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
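The .click() wiring that connects the Submit buttons to generate_image and generate_video is outside this diff. In Gradio Blocks apps of this shape it typically looks like the sketch below; `output` stands in for whatever result component app.py actually defines:

    # Sketch of typical wiring, not taken from app.py. `output` is a placeholder
    # for the app's real result component; the input order matches the
    # generate_image signature shown in the hunk headers above.
    image_submit.click(
        fn=generate_image,
        inputs=[model_choice, image_query, image_upload,
                max_new_tokens, temperature, top_p, top_k, repetition_penalty],
        outputs=output,
    )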