Spaces:

vinhtruong3
/

florence-captioning-2-txt

Running on Zero

App Files Files Community

vinhtruong3 committed on Mar 27

Commit

f18cbce

verified ·

1 Parent(s): 8e93e59

Update app.py

Browse files

Files changed (1) hide show

app.py +149 -87

app.py CHANGED Viewed

@@ -16,17 +16,16 @@ print(f"Using device: {device}")
 model_configs = {
     'gokaygokay/Florence-2-Flux': "<DESCRIPTION>",
     'gokaygokay/Florence-2-Flux-Large': "<DESCRIPTION>",
-    'yayayaaa/florence-2-large-ft-moredetailed': "<MORE_DETAILED_CAPTION>"
-    # Temporarily removed MiaoshouAI model due to compatibility issues
-    # 'MiaoshouAI/Florence-2-large-PromptGen-v2.0': "<MORE_DETAILED_CAPTION>"
 }
 # Define a description for each model to be shown in UI
 model_descriptions = {
     'gokaygokay/Florence-2-Flux': "Faster version with good quality captions",
     'gokaygokay/Florence-2-Flux-Large': "Provides detailed captions with better image understanding",
-    'yayayaaa/florence-2-large-ft-moredetailed': "Fine-tuned specifically for more detailed captions"
-    # 'MiaoshouAI/Florence-2-large-PromptGen-v2.0': "Memory efficient model with high quality detailed captions"
 }
 # Load a single model to start with
@@ -56,6 +55,7 @@ title = """<h1 align="center">Florence-2 Caption Dataset Creator</h1>
 <a href="https://huggingface.co/gokaygokay/Florence-2-Flux-Large" target="_blank">[Florence-2 Flux Large]</a>
 <a href="https://huggingface.co/gokaygokay/Florence-2-Flux" target="_blank">[Florence-2 Flux Base]</a>
 <a href="https://huggingface.co/yayayaaa/florence-2-large-ft-moredetailed" target="_blank">[Florence-2 More Detailed]</a>
 </center></p>"""
 # Function to clean caption text
@@ -100,6 +100,50 @@ def load_model(selected_model_name):
     return "Model loaded successfully"
 # Function to generate a caption for a single image
 @spaces.GPU
 def generate_caption(image, selected_model_name):
@@ -124,47 +168,54 @@ def generate_caption(image, selected_model_name):
     if image.mode != "RGB":
         image = image.convert("RGB")
-    # Create an appropriate prompt based on the model
-    prompt = task_prompt
     try:
-        # Process the image
-        inputs = processor(text=prompt, images=image, return_tensors="pt")
-        # Move inputs to the same device as the model
-        for key in inputs:
-            if isinstance(inputs[key], torch.Tensor):
-                inputs[key] = inputs[key].to(device)
-        # Generate the caption
-        with torch.no_grad():
-            generated_ids = model.generate(
-                **inputs,
-                max_new_tokens=512,  # Reduced for better memory usage
-                num_beams=3,
-                repetition_penalty=1.10,
-            )
-        # Decode the generated text
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # Handle post-processing for different models
-        if task_prompt == "<DESCRIPTION>":
-            # Use the post processing for Florence-2-Flux models
-            try:
-                decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-                parsed_answer = processor.post_process_generation(
-                    decoded_text,
-                    task=task_prompt,
-                    image_size=(image.width, image.height)
-                )
-                caption = parsed_answer[task_prompt]
-            except Exception as e:
-                print(f"Error in post processing: {str(e)}")
-                caption = generated_text  # Fallback to direct output
         else:
-            # For other models, use the generated text directly
-            caption = generated_text
         # Clean the caption to remove padding tokens
         clean_text = clean_caption(caption)
@@ -211,51 +262,59 @@ def process_images(images, selected_model_name, add_trigger=True, trigger_word="
                         results.append(f"⚠️ Skipped {base_name}: Unsupported format (only jpg, jpeg, png supported)")
                         continue
-                    # Generate caption for this specific image
                     image = Image.open(img_path)
                     if image.mode != "RGB":
                         image = image.convert("RGB")
-                    # Use the task prompt for the selected model
-                    prompt = task_prompt
-                    # Process the image
-                    inputs = processor(text=prompt, images=image, return_tensors="pt")
-                    # Move inputs to the same device as the model
-                    for key in inputs:
-                        if isinstance(inputs[key], torch.Tensor):
-                            inputs[key] = inputs[key].to(device)
-                    # Generate the caption
-                    with torch.no_grad():
-                        generated_ids = model.generate(
-                            **inputs,
-                            max_new_tokens=512,  # Reduced for better memory usage
-                            num_beams=3,
-                            repetition_penalty=1.10,
-                        )
-                    # Decode the generated text
-                    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-                    # Handle post-processing for different models
-                    if task_prompt == "<DESCRIPTION>":
-                        # Use the post processing for Florence-2-Flux models
-                        try:
-                            decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
-                            parsed_answer = processor.post_process_generation(
-                                decoded_text,
-                                task=task_prompt,
-                                image_size=(image.width, image.height)
-                            )
-                            caption = parsed_answer[task_prompt]
-                        except Exception as e:
-                            print(f"Error in post processing: {str(e)}")
-                            caption = generated_text  # Fallback to direct output
                     else:
-                        # For other models, use the generated text directly
-                        caption = generated_text
                     # Clean caption and add trigger if needed
                     caption = clean_caption(caption)
@@ -377,10 +436,13 @@ with gr.Blocks() as demo:
         gr.Markdown(model_md)
-        # Add note about MiaoshouAI model
         gr.Markdown("""
-        ### Note:
-        The MiaoshouAI/Florence-2-large-PromptGen-v2.0 model has been temporarily removed due to compatibility issues with the current setup. The available models still provide excellent captioning capabilities.
         Supported image formats: JPG, JPEG, PNG
         """)

 # Maps each selectable model id (Hugging Face repo name) to a task prompt token.
 # NOTE(review): presumably the model loader copies the selected entry into the
 # `task_prompt` name used by the caption functions — confirm against load_model.
 model_configs = {
     'gokaygokay/Florence-2-Flux': "<DESCRIPTION>",
     'gokaygokay/Florence-2-Flux-Large': "<DESCRIPTION>",
+    'yayayaaa/florence-2-large-ft-moredetailed': "<MORE_DETAILED_CAPTION>",
+    'MiaoshouAI/Florence-2-large-PromptGen-v2.0': "<MORE_DETAILED_CAPTION>"
 }
 # Define a description for each model to be shown in UI.
 # Keys are the same model ids used as keys in model_configs; values are short
 # human-readable blurbs.
 model_descriptions = {
     'gokaygokay/Florence-2-Flux': "Faster version with good quality captions",
     'gokaygokay/Florence-2-Flux-Large': "Provides detailed captions with better image understanding",
+    'yayayaaa/florence-2-large-ft-moredetailed': "Fine-tuned specifically for more detailed captions",
+    'MiaoshouAI/Florence-2-large-PromptGen-v2.0': "Memory efficient model with high quality detailed captions"
 }
 # Load a single model to start with
 <a href="https://huggingface.co/gokaygokay/Florence-2-Flux-Large" target="_blank">[Florence-2 Flux Large]</a>
 <a href="https://huggingface.co/gokaygokay/Florence-2-Flux" target="_blank">[Florence-2 Flux Base]</a>
 <a href="https://huggingface.co/yayayaaa/florence-2-large-ft-moredetailed" target="_blank">[Florence-2 More Detailed]</a>
+<a href="https://huggingface.co/MiaoshouAI/Florence-2-large-PromptGen-v2.0" target="_blank">[MiaoshouAI PromptGen v2.0]</a>
 </center></p>"""
 # Function to clean caption text
     return "Model loaded successfully"
+# Special function for MiaoshouAI model
+def generate_miaoshou_caption(image):
+    """Special handling for MiaoshouAI model"""
+    # Create inputs for MiaoshouAI model
+    inputs = processor(
+        text=task_prompt,
+        images=image,
+        return_tensors="pt"
+    )
+    # Move inputs to device
+    for key in inputs:
+        if isinstance(inputs[key], torch.Tensor):
+            inputs[key] = inputs[key].to(device)
+    # Generate using only input_ids and pixel_values
+    generated_ids = model.generate(
+        input_ids=inputs["input_ids"],
+        pixel_values=inputs["pixel_values"],
+        max_new_tokens=512,
+        do_sample=False,
+        num_beams=3
+    )
+    # Decode the generated text
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    # Use the model's post-processing
+    try:
+        parsed_answer = processor.post_process_generation(
+            generated_text,
+            task=task_prompt,
+            image_size=(image.width, image.height)
+        )
+        # Get the generated text from parsed answer
+        if isinstance(parsed_answer, dict) and task_prompt in parsed_answer:
+            return parsed_answer[task_prompt]
+        else:
+            return str(parsed_answer)
+    except Exception as e:
+        print(f"Post-processing error: {str(e)}")
+        # Fallback to regular decoding if post-processing fails
+        return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 # Function to generate a caption for a single image
 @spaces.GPU
 def generate_caption(image, selected_model_name):
     if image.mode != "RGB":
         image = image.convert("RGB")
     try:
+        # Special handling for MiaoshouAI model
+        if model_name == 'MiaoshouAI/Florence-2-large-PromptGen-v2.0':
+            caption = generate_miaoshou_caption(image)
         else:
+            # Regular processing for other models
+            # Create an appropriate prompt based on the model
+            prompt = task_prompt
+            if prompt == "<DESCRIPTION>":
+                prompt = prompt + "Describe this image in great detail."
+            # Process the image
+            inputs = processor(text=prompt, images=image, return_tensors="pt")
+            # Move inputs to the same device as the model
+            for key in inputs:
+                if isinstance(inputs[key], torch.Tensor):
+                    inputs[key] = inputs[key].to(device)
+            # Generate the caption
+            with torch.no_grad():
+                generated_ids = model.generate(
+                    **inputs,
+                    max_new_tokens=512,
+                    num_beams=3,
+                    repetition_penalty=1.10,
+                )
+            # Decode the generated text
+            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+            # Handle post-processing for different models
+            if task_prompt == "<DESCRIPTION>":
+                # Use the post processing for Florence-2-Flux models
+                try:
+                    decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+                    parsed_answer = processor.post_process_generation(
+                        decoded_text,
+                        task=task_prompt,
+                        image_size=(image.width, image.height)
+                    )
+                    caption = parsed_answer[task_prompt]
+                except Exception as e:
+                    print(f"Error in post processing: {str(e)}")
+                    caption = generated_text  # Fallback to direct output
+            else:
+                # For other models, use the generated text directly
+                caption = generated_text
         # Clean the caption to remove padding tokens
         clean_text = clean_caption(caption)
                         results.append(f"⚠️ Skipped {base_name}: Unsupported format (only jpg, jpeg, png supported)")
                         continue
+                    # Generate caption
+                    # Open the image once
                     image = Image.open(img_path)
                     if image.mode != "RGB":
                         image = image.convert("RGB")
+                    # Use the same caption generation logic as in generate_caption
+                    if model_name == 'MiaoshouAI/Florence-2-large-PromptGen-v2.0':
+                        caption = generate_miaoshou_caption(image)
                     else:
+                        # Regular processing for other models
+                        # Create an appropriate prompt based on the model
+                        prompt = task_prompt
+                        if prompt == "<DESCRIPTION>":
+                            prompt = prompt + "Describe this image in great detail."
+                        # Process the image
+                        inputs = processor(text=prompt, images=image, return_tensors="pt")
+                        # Move inputs to the same device as the model
+                        for key in inputs:
+                            if isinstance(inputs[key], torch.Tensor):
+                                inputs[key] = inputs[key].to(device)
+                        # Generate the caption
+                        with torch.no_grad():
+                            generated_ids = model.generate(
+                                **inputs,
+                                max_new_tokens=512,
+                                num_beams=3,
+                                repetition_penalty=1.10,
+                            )
+                        # Decode the generated text
+                        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                        # Handle post-processing for different models
+                        if task_prompt == "<DESCRIPTION>":
+                            # Use the post processing for Florence-2-Flux models
+                            try:
+                                decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+                                parsed_answer = processor.post_process_generation(
+                                    decoded_text,
+                                    task=task_prompt,
+                                    image_size=(image.width, image.height)
+                                )
+                                caption = parsed_answer[task_prompt]
+                            except Exception as e:
+                                print(f"Error in post processing: {str(e)}")
+                                caption = generated_text  # Fallback to direct output
+                        else:
+                            # For other models, use the generated text directly
+                            caption = generated_text
                     # Clean caption and add trigger if needed
                     caption = clean_caption(caption)
         gr.Markdown(model_md)
+        # Add special note for MiaoshouAI model
         gr.Markdown("""
+        ### MiaoshouAI/Florence-2-large-PromptGen-v2.0 Features
+        - Improved caption quality for detailed captions
+        - Memory efficient (requires only ~1GB VRAM)
+        - Fast generation while maintaining high quality
+        - Supports multiple caption formats including detailed captions, tags, and analysis
         Supported image formats: JPG, JPEG, PNG
         """)