Adun committed on
Commit 555b1ac · verified · 1 Parent(s): 979a028

Upload app.py

Files changed (1):
  1. app.py +32 -32
app.py CHANGED
@@ -11,7 +11,7 @@ import spaces
 import torch
 import numpy as np
 from PIL import Image, ImageOps
-import cv2
+#import cv2
 
 from transformers import (
     Qwen2VLForConditionalGeneration,
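
Note: the commit comments out the OpenCV import rather than deleting it. If any remaining code path still references cv2, a guarded import avoids a crash at startup; this is a hypothetical sketch, not part of the commit:

# Hypothetical guard (not in this commit): tolerate a missing OpenCV install.
try:
    import cv2  # only needed if video frames are decoded with OpenCV
except ImportError:
    cv2 = None  # callers can check `cv2 is None` and fall back to PIL-only handling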
@@ -29,7 +29,7 @@ import ast
 import html
 
 # Constants for text generation
-MAX_MAX_NEW_TOKENS = 2048
+MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
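
Note: these hunks do not show where MAX_INPUT_TOKEN_LENGTH is enforced. In Spaces built on this template it is typically applied by truncating the tokenized prompt before generation; a minimal sketch, assuming a `tokenizer` and a `prompt_text` string (both names hypothetical here):

# Sketch of the assumed usage: clamp the prompt to MAX_INPUT_TOKEN_LENGTH tokens.
input_ids = tokenizer(prompt_text, return_tensors="pt").input_ids
if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
    input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]  # keep only the trailing tokens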
@@ -191,8 +191,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 @spaces.GPU
 def generate_video(model_name: str, text: str, video_path: str,
                    max_new_tokens: int = 1024,
-                   temperature: float = 0.6,
-                   top_p: float = 0.9,
+                   temperature: float = 0.1,
+                   top_p: float = 0.6,
                    top_k: int = 50,
                    repetition_penalty: float = 1.2):
     """Generate responses for video input using the selected model."""
@@ -277,20 +277,20 @@ def generate_video(model_name: str, text: str, video_path: str,
         # yield cleaned_output
 
 # Define examples for image and video inference
-image_examples = [
-    ["OCR the image", "images/2.jpg"],
-    ["Convert this page to docling", "images/1.png"],
-    ["Convert this page to docling", "images/3.png"],
-    ["Convert chart to OTSL.", "images/4.png"],
-    ["Convert code to text", "images/5.jpg"],
-    ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to late.", "images/7.jpg"],
-]
-
-video_examples = [
-    ["Explain the video in detail.", "videos/1.mp4"],
-    ["Explain the video in detail.", "videos/2.mp4"]
-]
+# image_examples = [
+#     ["OCR the image", "images/2.jpg"],
+#     ["Convert this page to docling", "images/1.png"],
+#     ["Convert this page to docling", "images/3.png"],
+#     ["Convert chart to OTSL.", "images/4.png"],
+#     ["Convert code to text", "images/5.jpg"],
+#     ["Convert this table to OTSL.", "images/6.jpg"],
+#     ["Convert formula to late.", "images/7.jpg"],
+# ]
+
+# video_examples = [
+#     ["Explain the video in detail.", "videos/1.mp4"],
+#     ["Explain the video in detail.", "videos/2.mp4"]
+# ]
 
 css = """
 .submit-btn {
@@ -304,7 +304,7 @@ css = """
 
 # Create the Gradio Interface
 with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-    gr.Markdown("# **[Multimodal OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+    gr.Markdown("# **[Typhoon OCR Finetuned](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4)**")
     with gr.Row():
         with gr.Column():
             with gr.Tabs():
@@ -312,22 +312,22 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     image_upload = gr.Image(type="pil", label="Image")
                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=image_examples,
-                        inputs=[image_query, image_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=image_examples,
+                    #     inputs=[image_query, image_upload]
+                    # )
                 with gr.TabItem("Video Inference"):
                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
                     video_upload = gr.Video(label="Video")
                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
-                    gr.Examples(
-                        examples=video_examples,
-                        inputs=[video_query, video_upload]
-                    )
+                    # gr.Examples(
+                    #     examples=video_examples,
+                    #     inputs=[video_query, video_upload]
+                    # )
             with gr.Accordion("Advanced options", open=False):
                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
-                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.1)
+                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.6)
                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
         with gr.Column():
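
Note: the event wiring sits outside the shown hunks. If the images/ and videos/ assets are restored to the repo, the commented gr.Examples blocks can be re-enabled as-is; the sliders reach the model through the submit handlers, roughly like this sketch (the output component name is hypothetical):

# Assumed wiring (not shown in this diff): sliders feed the generate functions.
image_submit.click(
    fn=generate_image,
    inputs=[model_choice, image_query, image_upload,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty],
    outputs=output,  # hypothetical output component
)
video_submit.click(
    fn=generate_video,
    inputs=[model_choice, video_query, video_upload,
            max_new_tokens, temperature, top_p, top_k, repetition_penalty],
    outputs=output,
)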
@@ -335,15 +335,15 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
             model_choice = gr.Radio(
                 choices=["Adun/typhoon_ocr-7B-v1.4", "Typhoon-OCR-7B"],
                 label="Select Model",
-                value="Nanonets-OCR-s"
+                value="Adun/typhoon_ocr-7B-v1.4"
             )
 
-            #gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
+            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/Adun/Multimodal-OCR2/discussions)")
 
             # gr.Markdown("> [SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview): SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.")
             # gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
             # gr.Markdown("> [MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR): MonkeyOCR adopts a Structure-Recognition-Relation (SRR) triplet paradigm, which simplifies the multi-tool pipeline of modular approaches while avoiding the inefficiency of using large multimodal models for full-page document processing.")
-            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by Aun](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
+            gr.Markdown("> [Typhoon-OCR-7B-1.4 finetuned by Adun](https://huggingface.co/Adun/typhoon_ocr-7B-v1.4): Finetuned Typhoon-OCR ให้เข้าใจคำสั่งภาษาไทยได้ดียิ่งขึ้น")
             gr.Markdown("> [Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b): A bilingual document parsing model built specifically for real-world documents in Thai and English inspired by models like olmOCR based on Qwen2.5-VL-Instruction. Extracts and interprets embedded text (e.g., chart labels, captions) in Thai or English.")
             gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
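
Note: the radio now defaults to the finetuned checkpoint (the old default, "Nanonets-OCR-s", was not even among the choices), and the new description says, in Thai, that the finetune makes Typhoon-OCR understand Thai-language instructions better. Model loading is outside the shown hunks; a minimal sketch using the Qwen2VLForConditionalGeneration import visible in the first hunk (Typhoon-OCR is Qwen2.5-VL-based, so the actual file may use the Qwen2.5-VL class instead):

import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

MODEL_ID = "Adun/typhoon_ocr-7B-v1.4"  # the new default selection
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.float16
).to("cuda").eval()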