Ghibli-Multilingual-Text-rendering

Running on Zero

App Files Files Community

seawolf2357 committed on Apr 3

Commit

b35c0f1

verified ·

1 Parent(s): 776d15b

Update app.py

Browse files

Files changed (1) hide show

app.py +312 -90

app.py CHANGED Viewed

@@ -1,8 +1,16 @@
 import spaces
 import os
 import json
 import time
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from tqdm import tqdm
 import gradio as gr
@@ -12,6 +20,10 @@ from src.pipeline import FluxPipeline
 from src.transformer_flux import FluxTransformer2DModel
 from src.lora_helper import set_single_lora, set_multi_lora, unset_lora
 # Initialize the image processor
 base_path = "black-forest-labs/FLUX.1-dev"
 lora_base_path = "./models"
@@ -19,6 +31,13 @@ lora_base_path = "./models"
 # System prompt that will be hidden from users but automatically added to their input
 SYSTEM_PROMPT = "Ghibli Studio style, Charming hand-drawn anime-style illustration"
 pipe = FluxPipeline.from_pretrained(base_path, torch_dtype=torch.bfloat16)
 transformer = FluxTransformer2DModel.from_pretrained(base_path, subfolder="transformer", torch_dtype=torch.bfloat16)
 pipe.transformer = transformer
@@ -28,7 +47,182 @@ def clear_cache(transformer):
     for name, attn_processor in transformer.attn_processors.items():
         attn_processor.bank_kv.clear()
-# Define the Gradio interface
 @spaces.GPU()
 def single_condition_generate_image(user_prompt, spatial_img, height, width, seed):
     # Combine the system prompt with user prompt
@@ -55,90 +249,72 @@ def single_condition_generate_image(user_prompt, spatial_img, height, width, see
     clear_cache(pipe.transformer)
     return image
-# New function for multilingual text rendering
 @spaces.GPU()
 def text_rendering_generate_image(user_prompt, input_text, text_color, text_size, text_position, spatial_img, height, width, seed):
-    # Combine the system prompt with user prompt
-    full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT
-    # Set the Ghibli LoRA
-    lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
-    set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)
-    # Process the image
-    spatial_imgs = [spatial_img] if spatial_img else []
-    image = pipe(
-        full_prompt,
-        height=int(height),
-        width=int(width),
-        guidance_scale=3.5,
-        num_inference_steps=25,
-        max_sequence_length=512,
-        generator=torch.Generator("cpu").manual_seed(seed),
-        subject_images=[],
-        spatial_images=spatial_imgs,
-        cond_size=512,
-    ).images[0]
-    # Add text to the generated image if text is provided
-    if input_text:
-        # Convert to PIL Image if needed
-        if not isinstance(image, Image.Image):
-            image = Image.fromarray(image)
-        # Create a drawing context
-        draw = ImageDraw.Draw(image)
-        # Try to load a font that supports multilingual text
-        try:
-            # Attempt to load a system font that supports multilingual text
-            # Scale up the text size significantly to make it more visible
-            actual_text_size = text_size * 3  # Multiply the size by 3 for better visibility
-            font = ImageFont.truetype("Arial Unicode.ttf", actual_text_size)
-        except IOError:
-            try:
-                # Try another common font if Arial Unicode is not available
-                actual_text_size = text_size * 3
-                font = ImageFont.truetype("DejaVuSans.ttf", actual_text_size)
-            except IOError:
-                # Final fallback to default font with increased size
-                font = ImageFont.load_default()
-        # Parse position (top, center, bottom)
-        # Use actual_text_size for position calculations to maintain proper spacing
         if text_position == "top":
-            position = (width // 2, actual_text_size + 30)  # More padding from the top
         elif text_position == "bottom":
-            position = (width // 2, height - actual_text_size - 30)  # More padding from the bottom
         else:  # center
-            position = (width // 2, height // 2)
-        # Add text with outline for better visibility
-        # Draw text outline (shadow) with larger offset for better visibility
-        outline_size = max(3, actual_text_size // 15)  # Scale outline size with text size
-        for offset_x in range(-outline_size, outline_size + 1, outline_size):
-            for offset_y in range(-outline_size, outline_size + 1, outline_size):
-                if offset_x == 0 and offset_y == 0:
-                    continue  # Skip the center position (will be drawn as main text)
-                draw.text(
-                    (position[0] + offset_x, position[1] + offset_y),
-                    input_text,
-                    fill="black",
-                    font=font,
-                    anchor="mm"  # Center align the text
-                )
-        # Draw the main text
-        draw.text(
-            position,
-            input_text,
-            fill=text_color,
-            font=font,
-            anchor="mm"  # Center align the text
-        )
-    clear_cache(pipe.transformer)
-    return image
 # Load example images
 def load_examples():
@@ -386,6 +562,21 @@ body {
     border-radius: var(--border-radius);
     margin-top: 16px;
 }
 """
 # Create the Gradio Blocks interface
@@ -393,10 +584,16 @@ with gr.Blocks(css=css) as demo:
     gr.HTML("""
     <div class="gr-header">
         <h1>✨ Ghibli Multilingual Text-Rendering ✨</h1>
-        <p>Transform your ideas into magical Ghibli-inspired artwork</p>
     </div>
     """)
     with gr.Tabs():
         with gr.Tab("Create Ghibli Art"):
             with gr.Row():
@@ -428,7 +625,7 @@ with gr.Blocks(css=css) as demo:
                         seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                         info="Change for different variations")
-                    generate_btn = gr.Button("✨ Generate Ghibli Art", elem_classes="gr-button")
                 with gr.Column(scale=1):
                     gr.HTML("""
@@ -464,14 +661,14 @@ with gr.Blocks(css=css) as demo:
                 outputs=output_image
             )
-        # Second tab for Image & Multilingual Text Rendering
         with gr.Tab("Image & Multilingual Text Rendering"):
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.HTML("""
                     <div class="gr-box">
-                        <h3>🌈 Art with Text</h3>
-                        <p>Create Ghibli-style images with beautiful text in any language</p>
                     </div>
                     """)
@@ -483,8 +680,8 @@ with gr.Blocks(css=css) as demo:
                     with gr.Group(elem_classes="text-rendering-options"):
                         input_text = gr.Textbox(
-                            label="Overlay Text",
-                            placeholder="Enter text in any language",
                             lines=1
                         )
@@ -522,20 +719,31 @@ with gr.Blocks(css=css) as demo:
                         text_seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                            info="Change for different variations")
-                    text_generate_btn = gr.Button("✨ Generate Art with Text", elem_classes="gr-button")
                 with gr.Column(scale=1):
                     gr.HTML("""
                     <div class="gr-box">
-                        <h3>✨ Your Text Creation</h3>
-                        <p>Your Ghibli-inspired artwork with text will appear here</p>
                     </div>
                     """)
-                    text_output_image = gr.Image(label="Generated Image with Text", elem_classes="gr-output-image")
             gr.HTML("""
             <div class="gr-box gr-examples-gallery">
-                <h3>✨ Text Rendering Examples</h3>
                 <p>Click on any example to try it out</p>
             </div>
             """)
@@ -546,7 +754,7 @@ with gr.Blocks(css=css) as demo:
                 examples=text_examples,
                 inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                         text_spatial_img, text_height, text_width, text_seed],
-                outputs=text_output_image,
                 fn=text_rendering_generate_image,
                 cache_examples=False,
                 examples_per_page=3
@@ -557,14 +765,28 @@ with gr.Blocks(css=css) as demo:
                 text_rendering_generate_image,
                 inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                         text_spatial_img, text_height, text_width, text_seed],
-                outputs=text_output_image
             )
     gr.HTML("""
     <div class="gr-footer">
-        <p>Powered by FLUX.1 and Ghibli LoRA • Created with ❤️</p>
     </div>
     """)
 # Launch the Gradio app
 demo.queue().launch()

 import spaces
 import os
+import re
 import json
 import time
 import torch
+import tempfile
+import io
+import random
+import string
+import logging
+from typing import Tuple, Optional, List, Dict, Any, Union
 from PIL import Image, ImageDraw, ImageFont
 from tqdm import tqdm
 import gradio as gr
 from src.transformer_flux import FluxTransformer2DModel
 from src.lora_helper import set_single_lora, set_multi_lora, unset_lora
+# Google Gemini API 추가
+from google import genai
+from google.genai import types
 # Initialize the image processor
 base_path = "black-forest-labs/FLUX.1-dev"
 lora_base_path = "./models"
 # System prompt that will be hidden from users but automatically added to their input
 SYSTEM_PROMPT = "Ghibli Studio style, Charming hand-drawn anime-style illustration"
+# 로깅 설정
+logging.basicConfig(
+    level=logging.DEBUG,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+# Load the model
 pipe = FluxPipeline.from_pretrained(base_path, torch_dtype=torch.bfloat16)
 transformer = FluxTransformer2DModel.from_pretrained(base_path, subfolder="transformer", torch_dtype=torch.bfloat16)
 pipe.transformer = transformer
     for name, attn_processor in transformer.attn_processors.items():
         attn_processor.bank_kv.clear()
+#######################################
+# Utility Functions
+#######################################
+# Simple Timer Class
class timer:
    """Context manager that prints how long the wrapped block took.

    Example::

        with timer("load model") as t:
            ...
        print(t.elapsed)

    Attributes:
        method: Label used in the start/finish messages.
        start: ``time.time()`` reading taken on entry (``None`` before entry).
        elapsed: Seconds spent inside the block (``None`` until the block exits).
    """

    def __init__(self, method_name="timed process"):
        self.method = method_name
        self.start = None
        self.elapsed = None

    def __enter__(self):
        self.start = time.time()
        print(f"[TIMER] {self.method} starts")
        # Return the instance so `with timer(...) as t` binds something usable.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.elapsed = time.time() - self.start
        print(f"[TIMER] {self.method} took {round(self.elapsed, 2)}s")
        # Falsy return value: an exception raised inside the block still propagates.
        return False
# Minimal phrase substitution (Korean -> English)
def maybe_translate_to_english(text: str) -> str:
    """Replace known Korean phrases in *text* with their English equivalents.

    Text that is empty or contains no Hangul syllables is returned unchanged.
    Only the phrases in the built-in table are substituted; any other Korean
    text is left as it is, so this is a phrase lookup, not a real translation.

    Args:
        text: Input string (may be empty or ``None``).

    Returns:
        The text with every known phrase replaced.
    """
    if not text or not re.search(r"[가-힣]", text):
        return text

    # Small hard-coded table; a translation API is needed for general input.
    translations = {
        "안녕하세요": "Hello",
        "환영합니다": "Welcome",
        "아름다운 당신": "Beautiful You",
        "안녕": "Hello",
        "고양이": "Cat",
        "배너": "Banner",
        "썬글라스": "Sunglasses",
        "착용한": "wearing",
        "흰색": "white",
    }

    # Longest phrases first, so a short key ("안녕") can never eat part of a
    # longer one ("안녕하세요") no matter how the table is ordered.
    for korean in sorted(translations, key=len, reverse=True):
        text = text.replace(korean, translations[korean])

    try:
        print(f"[TRANSLATE] Translated Korean text: '{text}'")
    except UnicodeEncodeError:
        # stdout may be unable to encode leftover Hangul; the log line is
        # optional and must not prevent the result from being returned.
        pass
    return text
def save_binary_file(file_name, data):
    """Write *data* to *file_name* in binary mode, truncating any existing file."""
    with open(file_name, mode="wb") as sink:
        sink.write(data)
+#######################################
+# Gemini API Functions
+#######################################
def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """Send an image plus an instruction to Gemini and collect the reply.

    The file at *file_name* is uploaded and sent together with *text* in a
    single user message. The streamed response is read until the first image
    part arrives; text received before that is accumulated.

    Args:
        text: Instruction forwarded to the model alongside the image.
        file_name: Path of the image file to upload.
        model: Model identifier passed to the streaming call.

    Returns:
        ``(image_path, text_response)``. ``image_path`` is the path of a
        temporary ``.png`` file holding the returned image, or ``None`` when
        the model sent no image. The caller owns that file and should delete
        it when done. ``text_response`` is the accumulated text, one chunk per
        line.

    Raises:
        ValueError: If the GAPI_TOKEN environment variable is unset or empty.
    """
    # The API key is read from the GAPI_TOKEN environment variable.
    api_key = os.getenv("GAPI_TOKEN")
    if not api_key:
        raise ValueError("GAPI_TOKEN is missing. Please set an API key.")

    client = genai.Client(api_key=api_key)
    uploaded = client.files.upload(file=file_name)
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=uploaded.uri,
                    mime_type=uploaded.mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_chunks = []
    image_path = None
    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
            continue

        # Look at every part, not just the first: the image may follow a text part.
        parts = chunk.candidates[0].content.parts
        image_part = next((part for part in parts if part.inline_data), None)
        if image_part is not None:
            # Create the temp file only once there is data for it, so a
            # text-only reply leaves no stray empty file behind.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                tmp.write(image_part.inline_data.data)
                image_path = tmp.name
            print(f"File of mime type {image_part.inline_data.mime_type} saved to: {image_path}")
            # One image is enough; stop reading the stream.
            break

        # No image in this chunk: treat it as text. `chunk.text` can be empty
        # or None, which must not be concatenated.
        if chunk.text:
            text_chunks.append(chunk.text)

    text_response = "".join(f"{piece}\n" for piece in text_chunks)
    return image_path, text_response
def change_text_in_image_two_times(original_image, instruction):
    """Ask Gemini for two text-edited variations of *original_image*.

    The instruction is sent twice, suffixed with "(A)" and "(B)". Any failure
    for a variation (unexpected input type, API error, no image in the reply,
    undecodable image) is logged and the original image is used in its place,
    so the result always has two entries.

    Args:
        original_image: Image to edit; expected to be a PIL image.
        instruction: Edit instruction forwarded to generate_by_google_genai.

    Returns:
        A list with two images, one per variation.

    Raises:
        gr.Error: If *original_image* is ``None``.
    """
    if original_image is None:
        raise gr.Error("처리할 이미지가 없습니다. 먼저 이미지를 생성해주세요.")

    results = []
    for version_tag in ["(A)", "(B)"]:
        mod_instruction = f"{instruction} {version_tag}"
        original_path = None
        image_path = None
        try:
            print(f"[DEBUG] Saving image of type {type(original_image)} to temporary file")
            if not isinstance(original_image, Image.Image):
                # Raised inside the try on purpose: it is handled like any
                # other per-variation failure (fall back to the input).
                raise gr.Error(f"예상된 PIL Image가 아닌 {type(original_image)} 타입이 제공되었습니다.")

            # Reserve a path, then close the handle before PIL writes to it;
            # writing by name while the handle is open is not portable.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                original_path = tmp.name
            original_image.save(original_path, format="PNG")
            print(f"[DEBUG] Saved image to temporary file: {original_path}")

            print(f"[DEBUG] Google Gemini API에 보내는 지시사항: {mod_instruction}")
            image_path, text_response = generate_by_google_genai(
                text=mod_instruction,
                file_name=original_path
            )

            if image_path:
                print(f"[DEBUG] Received image from Gemini API: {image_path}")
                try:
                    with open(image_path, "rb") as f:
                        image_data = f.read()
                    new_img = Image.open(io.BytesIO(image_data))
                    # Decode now so a corrupt payload is caught here instead
                    # of later, when the image is displayed.
                    new_img.load()
                    results.append(new_img)
                except Exception as img_err:
                    print(f"[ERROR] Failed to process Gemini image: {img_err}")
                    results.append(original_image)
            else:
                # No image came back, only text.
                print(f"[WARNING] 이미지가 반환되지 않았습니다. 텍스트 응답: {text_response}")
                results.append(original_image)
        except Exception as e:
            logging.exception(f"Text modification error: {e}")
            # Even on error, hand back at least the original image.
            print(f"[ERROR] 텍스트 수정 중 오류 발생: {e}")
            results.append(original_image)
        finally:
            # Both files were created with delete=False; their contents are
            # already in memory, so remove them to avoid filling the temp dir.
            for leftover in (original_path, image_path):
                if leftover:
                    try:
                        os.remove(leftover)
                    except OSError:
                        pass
    return results
+#######################################
+# Image Generation Functions
+#######################################
 @spaces.GPU()
 def single_condition_generate_image(user_prompt, spatial_img, height, width, seed):
     # Combine the system prompt with user prompt
     clear_cache(pipe.transformer)
     return image
 @spaces.GPU()
 def text_rendering_generate_image(user_prompt, input_text, text_color, text_size, text_position, spatial_img, height, width, seed):
+    """
+    Generate image with Ghibli style and then send to Gemini API for multilingual text rendering
+    """
+    try:
+        # Step 1: Generate the base image using FLUX
+        print(f"[DEBUG] Generating base image with FLUX")
+        full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT
+        # Set the Ghibli LoRA
+        lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
+        set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)
+        # Process the image
+        spatial_imgs = [spatial_img] if spatial_img else []
+        base_image = pipe(
+            full_prompt,
+            height=int(height),
+            width=int(width),
+            guidance_scale=3.5,
+            num_inference_steps=25,
+            max_sequence_length=512,
+            generator=torch.Generator("cpu").manual_seed(seed),
+            subject_images=[],
+            spatial_images=spatial_imgs,
+            cond_size=512,
+        ).images[0]
+        clear_cache(pipe.transformer)
+        # If no text is provided, return the base image
+        if not input_text or not input_text.strip():
+            return [base_image, base_image]
+        # Step 2: Build the instruction for Gemini API
+        instruction = f"Add the text '{input_text}' to this image in {text_color} color"
+        # Add position information
         if text_position == "top":
+            instruction += " at the top of the image"
         elif text_position == "bottom":
+            instruction += " at the bottom of the image"
         else:  # center
+            instruction += " at the center of the image"
+        # Add size information
+        if text_size <= 40:
+            instruction += " in small size"
+        elif text_size <= 120:
+            instruction += " in medium size"
+        else:
+            instruction += " in large size"
+        instruction += ". Make sure the text is clearly visible and readable."
+        # Step 3: Call Gemini API to generate two variations
+        print(f"[DEBUG] Sending to Gemini API with instruction: {instruction}")
+        return change_text_in_image_two_times(base_image, instruction)
+    except Exception as e:
+        logging.exception(f"Text rendering error: {e}")
+        # Create a dummy image in case of error
+        dummy_img = Image.new('RGB', (width, height), color=(255, 200, 200))
+        draw = ImageDraw.Draw(dummy_img)
+        draw.text((width//2, height//2), f"Error: {str(e)}", fill="black", anchor="mm")
+        return [dummy_img, dummy_img]
 # Load example images
 def load_examples():
     border-radius: var(--border-radius);
     margin-top: 16px;
 }
+.api-status {
+    font-size: 14px;
+    color: #666;
+    text-align: center;
+    margin-bottom: 10px;
+}
+.api-connected {
+    color: green;
+}
+.api-disconnected {
+    color: red;
+}
 """
 # Create the Gradio Blocks interface
     gr.HTML("""
     <div class="gr-header">
         <h1>✨ Ghibli Multilingual Text-Rendering ✨</h1>
+        <p>Transform your ideas into magical Ghibli-inspired artwork with multilingual text</p>
     </div>
     """)
+    # API Status
+    api_status = gr.Markdown(
+        """<div class="api-status api-connected">✓ Connected to image generation and Gemini API</div>""",
+        visible=True
+    )
     with gr.Tabs():
         with gr.Tab("Create Ghibli Art"):
             with gr.Row():
                         seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                         info="Change for different variations")
+                    generate_btn = gr.Button("✨ Generate Ghibli Art", variant="primary", elem_classes=["generate-btn"])
                 with gr.Column(scale=1):
                     gr.HTML("""
                 outputs=output_image
             )
+        # Second tab for Image & Multilingual Text Rendering with Gemini API
         with gr.Tab("Image & Multilingual Text Rendering"):
             with gr.Row():
                 with gr.Column(scale=1):
                     gr.HTML("""
                     <div class="gr-box">
+                        <h3>🌈 Art with Multilingual Text</h3>
+                        <p>Create Ghibli-style images with beautiful text in any language using Gemini AI</p>
                     </div>
                     """)
                     with gr.Group(elem_classes="text-rendering-options"):
                         input_text = gr.Textbox(
+                            label="Multilingual Text to Add",
+                            placeholder="Enter text in any language (Korean, Japanese, English, etc.)",
                             lines=1
                         )
                         text_seed = gr.Slider(minimum=1, maximum=9999, step=1, label="Seed", value=42,
                                            info="Change for different variations")
+                    text_generate_btn = gr.Button("✨ Generate Art with Multilingual Text", variant="primary", elem_classes=["generate-btn"])
                 with gr.Column(scale=1):
                     gr.HTML("""
                     <div class="gr-box">
+                        <h3>✨ Your Text Creations (Two Variations)</h3>
+                        <p>Two versions of your Ghibli-inspired artwork with text will appear here</p>
                     </div>
                     """)
+                    with gr.Row():
+                        text_output_image1 = gr.Image(
+                            label="Variation A",
+                            type="pil",
+                            elem_classes="gr-output-image"
+                        )
+                        text_output_image2 = gr.Image(
+                            label="Variation B",
+                            type="pil",
+                            elem_classes="gr-output-image"
+                        )
             gr.HTML("""
             <div class="gr-box gr-examples-gallery">
+                <h3>✨ Multilingual Text Examples</h3>
                 <p>Click on any example to try it out</p>
             </div>
             """)
                 examples=text_examples,
                 inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                         text_spatial_img, text_height, text_width, text_seed],
+                outputs=[text_output_image1, text_output_image2],
                 fn=text_rendering_generate_image,
                 cache_examples=False,
                 examples_per_page=3
                 text_rendering_generate_image,
                 inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                         text_spatial_img, text_height, text_width, text_seed],
+                outputs=[text_output_image1, text_output_image2]
             )
     gr.HTML("""
     <div class="gr-footer">
+        <p>Powered by FLUX.1, Ghibli LoRA, and Google Gemini API • Created with ❤️</p>
     </div>
     """)
+# Function to check API availability
def check_api_status():
    """Return the HTML status snippet for the Gemini API key.

    Only the presence of a non-empty GAPI_TOKEN environment variable is
    checked; no request is sent to the API.

    Returns:
        The "connected" snippet when a token is set, otherwise the
        "disconnected" snippet.
    """
    api_key = os.getenv("GAPI_TOKEN")
    # An empty token counts as missing, matching the `if not api_key` check
    # in generate_by_google_genai; `is not None` reported it as connected.
    gemini_available = bool(api_key)
    if gemini_available:
        return """<div class="api-status api-connected">✓ Connected to FLUX.1 and Gemini API</div>"""
    else:
        return """<div class="api-status api-disconnected">✗ Gemini API connection issue. Please check GAPI_TOKEN environment variable.</div>"""
+# Update API status on page load
+demo.load(fn=check_api_status, outputs=api_status)
 # Launch the Gradio app
 demo.queue().launch()