import spaces
import os
import re
import json
import time
import torch
import tempfile
import io
import random
import string
import logging
from typing import Tuple, Optional, List, Dict, Any, Union
from PIL import Image, ImageDraw, ImageFont
from tqdm import tqdm
import gradio as gr
from safetensors.torch import save_file

from src.pipeline import FluxPipeline
from src.transformer_flux import FluxTransformer2DModel
from src.lora_helper import set_single_lora, set_multi_lora, unset_lora

# Google Gemini API
from google import genai
from google.genai import types

# Initialize the image processor
base_path = "black-forest-labs/FLUX.1-dev"
lora_base_path = "./models"

# System prompt that will be hidden from users but automatically added to their input
SYSTEM_PROMPT = "Ghibli Studio style, Charming hand-drawn anime-style illustration"

# Logging configuration
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Load the model
pipe = FluxPipeline.from_pretrained(base_path, torch_dtype=torch.bfloat16)
transformer = FluxTransformer2DModel.from_pretrained(
    base_path, subfolder="transformer", torch_dtype=torch.bfloat16
)
pipe.transformer = transformer
pipe.to("cuda")


def clear_cache(transformer):
    for name, attn_processor in transformer.attn_processors.items():
        attn_processor.bank_kv.clear()


#######################################
# Utility Functions
#######################################

# Simple timer context manager
class timer:
    def __init__(self, method_name="timed process"):
        self.method = method_name

    def __enter__(self):
        self.start = time.time()
        print(f"[TIMER] {self.method} starts")

    def __exit__(self, exc_type, exc_val, exc_tb):
        end = time.time()
        print(f"[TIMER] {self.method} took {round(end - self.start, 2)}s")


# Simple translation helper (Korean -> English)
def maybe_translate_to_english(text: str) -> str:
    """
    If the text contains Korean characters, translate it to English;
    otherwise return it unchanged.
    """
    if not text or not re.search("[가-힣]", text):
        return text
    try:
        # Simple dictionary rules (for production, using a translation API is recommended)
        translations = {
            "안녕하세요": "Hello",
            "환영합니다": "Welcome",
            "아름다운 당신": "Beautiful You",
            "안녕": "Hello",
            "고양이": "Cat",
            "배너": "Banner",
            "썬글라스": "Sunglasses",
            "착용한": "wearing",
            "흰색": "white"
        }
        # Rough phrase-by-phrase replacement over the whole sentence
        for kr, en in translations.items():
            if kr in text:
                text = text.replace(kr, en)
        print(f"[TRANSLATE] Translated Korean text: '{text}'")
        return text
    except Exception as e:
        print(f"[WARNING] Translation failed: {e}")
        return text


def save_binary_file(file_name, data):
    with open(file_name, "wb") as f:
        f.write(data)


#######################################
# Gemini API Functions
#######################################

def generate_by_google_genai(text, file_name, model="gemini-2.0-flash-exp"):
    """
    Send an image plus an editing instruction to Gemini to perform
    image-based editing. If the response is an image, save it; if it is
    text, accumulate it and return it.
    """
    # Read the API key from the GAPI_TOKEN environment variable
    api_key = os.getenv("GAPI_TOKEN", None)
    if not api_key:
        raise ValueError("GAPI_TOKEN is missing. Please set an API key.")

    client = genai.Client(api_key=api_key)
    files = [client.files.upload(file=file_name)]

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_uri(
                    file_uri=files[0].uri,
                    mime_type=files[0].mime_type,
                ),
                types.Part.from_text(text=text),
            ],
        ),
    ]
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        top_k=40,
        max_output_tokens=8192,
        response_modalities=["image", "text"],
        response_mime_type="text/plain",
    )

    text_response = ""
    image_path = None

    # Prepare a temporary file so the returned image can be saved
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        temp_path = tmp.name

        for chunk in client.models.generate_content_stream(
            model=model,
            contents=contents,
            config=generate_content_config,
        ):
            if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
                continue

            candidate = chunk.candidates[0].content.parts[0]
            # If inline_data (image data) is present -> actual edited image
            if candidate.inline_data:
                save_binary_file(temp_path, candidate.inline_data.data)
                print(f"File of mime type {candidate.inline_data.mime_type} saved to: {temp_path}")
                image_path = temp_path
                # Stop once a single image has been captured
                break
            else:
                # No inline_data means this chunk is text, so accumulate it
                text_response += chunk.text + "\n"

    del files
    return image_path, text_response


def change_text_in_image_two_times(original_image, instruction):
    """
    Call the text-modification API twice (Google Gemini), returning 2 final variations.
    """
    if original_image is None:
        raise gr.Error("There is no image to process. Please generate an image first.")

    results = []
    for version_tag in ["(A)", "(B)"]:
        mod_instruction = f"{instruction} {version_tag}"
        try:
            # Create a temporary file for the image
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                original_path = tmp.name

                # Save the image if it is a PIL Image object
                print(f"[DEBUG] Saving image of type {type(original_image)} to temporary file")
                if isinstance(original_image, Image.Image):
                    original_image.save(original_path, format="PNG")
                    print(f"[DEBUG] Saved image to temporary file: {original_path}")
                else:
                    raise gr.Error(f"Expected a PIL Image, but received type {type(original_image)}.")

            print(f"[DEBUG] Instruction sent to the Google Gemini API: {mod_instruction}")
            image_path, text_response = generate_by_google_genai(
                text=mod_instruction,
                file_name=original_path
            )

            if image_path:
                print(f"[DEBUG] Received image from Gemini API: {image_path}")
                try:
                    with open(image_path, "rb") as f:
                        image_data = f.read()
                    new_img = Image.open(io.BytesIO(image_data))
                    results.append(new_img)
                except Exception as img_err:
                    print(f"[ERROR] Failed to process Gemini image: {img_err}")
                    results.append(original_image)
            else:
                # If no image came back and only text was returned
                print(f"[WARNING] No image was returned. Text response: {text_response}")
                results.append(original_image)
        except Exception as e:
            logging.exception(f"Text modification error: {e}")
            # Even on error, return at least the original image
            print(f"[ERROR] Error during text modification: {e}")
            results.append(original_image)

    return results


#######################################
# Image Generation Functions
#######################################

@spaces.GPU()
def single_condition_generate_image(user_prompt, spatial_img, height, width, seed):
    # Combine the system prompt with the user prompt
    full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT

    # Set the Ghibli LoRA
    lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
    set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)

    # Process the image
    spatial_imgs = [spatial_img] if spatial_img else []
    image = pipe(
        full_prompt,
        height=int(height),
        width=int(width),
        guidance_scale=3.5,
        num_inference_steps=25,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(seed),
        subject_images=[],
        spatial_images=spatial_imgs,
        cond_size=512,
    ).images[0]
    clear_cache(pipe.transformer)
    return image


@spaces.GPU()
def text_rendering_generate_image(user_prompt, input_text, text_color, text_size, text_position, spatial_img, height, width, seed):
    """
    Generate an image in Ghibli style, then send it to the Gemini API
    for multilingual text rendering.
    """
    try:
        # Step 1: Generate the base image using FLUX
        print(f"[DEBUG] Generating base image with FLUX")
        full_prompt = f"{SYSTEM_PROMPT}, {user_prompt}" if user_prompt else SYSTEM_PROMPT

        # Set the Ghibli LoRA
        lora_path = os.path.join(lora_base_path, "Ghibli.safetensors")
        set_single_lora(pipe.transformer, lora_path, lora_weights=[1], cond_size=512)

        # Process the image
        spatial_imgs = [spatial_img] if spatial_img else []
        base_image = pipe(
            full_prompt,
            height=int(height),
            width=int(width),
            guidance_scale=3.5,
            num_inference_steps=25,
            max_sequence_length=512,
            generator=torch.Generator("cpu").manual_seed(seed),
            subject_images=[],
            spatial_images=spatial_imgs,
            cond_size=512,
        ).images[0]
        clear_cache(pipe.transformer)

        # If no text is provided, return the base image
        if not input_text or not input_text.strip():
            return [base_image, base_image]

        # Step 2: Build the instruction for the Gemini API
        instruction = f"Add the text '{input_text}' to this image in {text_color} color"

        # Add position information
        if text_position == "top":
            instruction += " at the top of the image"
        elif text_position == "bottom":
            instruction += " at the bottom of the image"
        else:  # center
            instruction += " at the center of the image"

        # Add size information
        if text_size <= 40:
            instruction += " in small size"
        elif text_size <= 120:
            instruction += " in medium size"
        else:
            instruction += " in large size"

        instruction += ". Make sure the text is clearly visible and readable."
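        # Illustrative example (not from the source): with input_text="Hello",
        # text_color="#ffffff", text_position="top" and text_size=72, the
        # instruction built above reads:
        #   "Add the text 'Hello' to this image in #ffffff color at the top
        #    of the image in medium size. Make sure the text is clearly
        #    visible and readable."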
        # Step 3: Call the Gemini API to generate two variations
        print(f"[DEBUG] Sending to Gemini API with instruction: {instruction}")
        return change_text_in_image_two_times(base_image, instruction)

    except Exception as e:
        logging.exception(f"Text rendering error: {e}")
        # Create a dummy image in case of error (cast dimensions to int in
        # case the sliders deliver floats)
        dummy_img = Image.new('RGB', (int(width), int(height)), color=(255, 200, 200))
        draw = ImageDraw.Draw(dummy_img)
        draw.text((int(width) // 2, int(height) // 2), f"Error: {str(e)}", fill="black", anchor="mm")
        return [dummy_img, dummy_img]


# Load example images
def load_examples():
    examples = []
    test_img_dir = "./test_imgs"
    example_prompts = [
        " ",
        "saying 'HELLO' in 'speech bubble'",
        "background 'alps'"
    ]

    for i, filename in enumerate(["00.jpg", "02.jpg", "03.jpg"]):
        img_path = os.path.join(test_img_dir, filename)
        if os.path.exists(img_path):
            # Use dimensions from original code for each specific example
            if filename == "00.jpg":
                height, width = 680, 1024
            elif filename == "02.jpg":
                height, width = 560, 1024
            elif filename == "03.jpg":
                height, width = 1024, 768
            else:
                height, width = 768, 768

            examples.append([
                example_prompts[i % len(example_prompts)],  # User prompt (without system prompt)
                Image.open(img_path),                       # Reference image
                height,                                     # Height
                width,                                      # Width
                i + 1                                       # Seed
            ])

    return examples


# Load examples for text rendering tab
def load_text_examples():
    examples = []
    test_img_dir = "./test_imgs"

    example_data = [
        {
            "prompt": "cute character with speech bubble",
            "text": "Hello World!",
            "color": "#ffffff",
            "size": 72,
            "position": "center",
            "filename": "00.jpg",
            "height": 680,
            "width": 1024,
            "seed": 123
        },
        {
            "prompt": "landscape with message",
            "text": "안녕하세요!",
            "color": "#ffff00",
            "size": 100,
            "position": "top",
            "filename": "03.jpg",
            "height": 1024,
            "width": 768,
            "seed": 456
        },
        {
            "prompt": "character with subtitles",
            "text": "こんにちは世界!",
            "color": "#00ffff",
            "size": 90,
            "position": "bottom",
            "filename": "02.jpg",
            "height": 560,
            "width": 1024,
            "seed": 789
        }
    ]

    for example in example_data:
        img_path = os.path.join(test_img_dir, example["filename"])
        if os.path.exists(img_path):
            examples.append([
                example["prompt"],
                example["text"],
                example["color"],
                example["size"],
                example["position"],
                Image.open(img_path),
                example["height"],
                example["width"],
                example["seed"]
            ])

    return examples


# CSS for improved UI
css = """
:root {
    --primary-color: #4a6670;
    --accent-color: #ff8a65;
    --background-color: #f5f5f5;
    --card-background: #ffffff;
    --text-color: #333333;
    --border-radius: 10px;
    --shadow: 0 4px 6px rgba(0,0,0,0.1);
}

body {
    background-color: var(--background-color);
    color: var(--text-color);
    font-family: 'Helvetica Neue', Arial, sans-serif;
}

.container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 20px;
}

.gr-header {
    background: linear-gradient(135deg, #668796 0%, #4a6670 100%);
    padding: 24px;
    border-radius: var(--border-radius);
    margin-bottom: 24px;
    box-shadow: var(--shadow);
    text-align: center;
}

.gr-header h1 {
    color: white;
    font-size: 2.5rem;
    margin: 0;
    font-weight: 700;
}

.gr-header p {
    color: rgba(255, 255, 255, 0.9);
    font-size: 1.1rem;
    margin-top: 8px;
}

.gr-panel {
    background-color: var(--card-background);
    border-radius: var(--border-radius);
    padding: 16px;
    box-shadow: var(--shadow);
}

.gr-button {
    background-color: var(--accent-color);
    border: none;
    color: white;
    padding: 10px 20px;
    border-radius: 5px;
    font-size: 16px;
    font-weight: bold;
    cursor: pointer;
    transition: transform 0.1s, background-color 0.3s;
}

.gr-button:hover {
    background-color: #ff7043;
    transform: translateY(-2px);
}

.gr-input, .gr-select {
    border-radius: 5px;
    border: 1px solid #ddd;
    padding: 10px;
    width: 100%;
}

.gr-form {
    display: grid;
    gap: 16px;
}

.gr-box {
    background-color: var(--card-background);
    border-radius: var(--border-radius);
    padding: 20px;
    box-shadow: var(--shadow);
    margin-bottom: 20px;
}

.gr-gallery {
    display: grid;
    grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
    gap: 16px;
}

.gr-gallery-item {
    overflow: hidden;
    border-radius: var(--border-radius);
    box-shadow: var(--shadow);
    transition: transform 0.3s;
}

.gr-gallery-item:hover {
    transform: scale(1.02);
}

.gr-image {
    width: 100%;
    height: auto;
    object-fit: cover;
}

.gr-footer {
    text-align: center;
    margin-top: 40px;
    padding: 20px;
    color: #666;
    font-size: 14px;
}

.gr-examples-gallery {
    margin-top: 20px;
}

/* Responsive adjustments */
@media (max-width: 768px) {
    .gr-header h1 {
        font-size: 1.8rem;
    }
    .gr-panel {
        padding: 12px;
    }
}

/* Ghibli-inspired accent colors */
.gr-accent-1 {
    background-color: #95ccd9;
}

.gr-accent-2 {
    background-color: #74ad8c;
}

.gr-accent-3 {
    background-color: #f9c06b;
}

.text-rendering-options {
    background-color: #f0f8ff;
    padding: 16px;
    border-radius: var(--border-radius);
    margin-top: 16px;
}

.api-status {
    font-size: 14px;
    color: #666;
    text-align: center;
    margin-bottom: 10px;
}

.api-connected {
    color: green;
}

.api-disconnected {
    color: red;
}
"""

# Create the Gradio Blocks interface.
# NOTE: only the display strings of the original layout survived; the
# component arrangement, tab names, and slider defaults below are a minimal
# reconstruction wired to the functions defined above. The original header
# title text is not recoverable, so only the subtitle is shown.
with gr.Blocks(css=css) as demo:
    gr.HTML("""
    <div class="gr-header">
        <p>Transform your ideas into magical Ghibli-inspired artwork with multilingual text</p>
    </div>
    """)

    with gr.Tabs():
        # Tab 1: Ghibli-style image generation
        with gr.TabItem("Image Generation"):
            with gr.Row():
                with gr.Column():
                    user_prompt = gr.Textbox(
                        label="Prompt",
                        info="Describe what you want to see in your Ghibli-inspired image",
                    )
                    spatial_img = gr.Image(label="Reference Image", type="pil")
                    height = gr.Slider(256, 1024, step=64, value=768, label="Height")
                    width = gr.Slider(256, 1024, step=64, value=768, label="Width")
                    seed = gr.Slider(1, 10000, step=1, value=42, label="Seed")
                    generate_btn = gr.Button("Generate")
                with gr.Column():
                    output_image = gr.Image(
                        label="Your Ghibli-inspired artwork will appear here",
                        type="pil",
                    )

            generate_btn.click(
                single_condition_generate_image,
                inputs=[user_prompt, spatial_img, height, width, seed],
                outputs=output_image,
            )
            gr.Examples(
                examples=load_examples(),
                inputs=[user_prompt, spatial_img, height, width, seed],
                label="Click on any example to try it out",
            )

        # Tab 2: multilingual text rendering via Gemini
        with gr.TabItem("Text Rendering"):
            gr.HTML("""
            <p>Create Ghibli-style images with beautiful text in any language using Gemini AI</p>
            """)
            with gr.Row():
                with gr.Column():
                    text_user_prompt = gr.Textbox(label="Prompt")
                    input_text = gr.Textbox(label="Text to Render")
                    text_color = gr.ColorPicker(label="Text Color", value="#ffffff")
                    text_size = gr.Slider(20, 200, step=2, value=72, label="Text Size")
                    text_position = gr.Radio(
                        ["top", "center", "bottom"], value="center", label="Text Position"
                    )
                    text_spatial_img = gr.Image(label="Reference Image", type="pil")
                    text_height = gr.Slider(256, 1024, step=64, value=768, label="Height")
                    text_width = gr.Slider(256, 1024, step=64, value=768, label="Width")
                    text_seed = gr.Slider(1, 10000, step=1, value=42, label="Seed")
                    text_generate_btn = gr.Button("Generate")
                with gr.Column():
                    text_output_gallery = gr.Gallery(
                        label="Two versions of your Ghibli-inspired artwork with text will appear here",
                        columns=2,
                    )

            text_generate_btn.click(
                text_rendering_generate_image,
                inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                        text_spatial_img, text_height, text_width, text_seed],
                outputs=text_output_gallery,
            )
            gr.Examples(
                examples=load_text_examples(),
                inputs=[text_user_prompt, input_text, text_color, text_size, text_position,
                        text_spatial_img, text_height, text_width, text_seed],
                label="Click on any example to try it out",
            )
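# Assumed entry point: the launch call does not appear in this section of the
# source, so this is a minimal sketch of how the demo would typically be started.
if __name__ == "__main__":
    demo.launch()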