Zoro-chi committed on
Commit b26638e · 1 Parent(s): 6c28b5c

Switch to google/flan-t5-base and improve prompt expansion for better quality

Files changed (2):
  1. app.py +4 -4
  2. app/llm/model.py +46 -39
app.py CHANGED
@@ -28,15 +28,15 @@ os.environ["HF_SPACES"] = "1"  # Flag to indicate we're running in Spaces
 
 # Set model environment variables explicitly for Hugging Face Spaces
 # These will override any variables loaded from .env.spaces
-os.environ["MODEL_ID"] = (
-    "distilgpt2"  # Use DistilGPT2 model which is publicly available
-)
+os.environ["MODEL_ID"] = "google/flan-t5-base"  # Use flan-t5-base model
 os.environ["USE_LOCAL_MODEL"] = "true"
 os.environ["MODEL_TYPE"] = "transformers"
 os.environ["MODEL_QUANTIZED"] = (
     "false"  # Disable quantization to avoid bitsandbytes dependency
 )
-os.environ["MODEL_ARCHITECTURE"] = "causal"  # GPT2 is a causal language model
+os.environ["MODEL_ARCHITECTURE"] = (
+    "seq2seq"  # T5 models are sequence-to-sequence models
+)
 
 # Import UI module directly
 try:
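The loader that consumes these variables lives elsewhere in the repo and is not shown in this commit. As a rough sketch of why MODEL_ARCHITECTURE has to change along with MODEL_ID (load_pipeline is a hypothetical helper, not the repo's actual function): T5-family models are encoder-decoder and are served through the transformers text2text-generation task, while GPT-2-style decoder-only models use text-generation.

# Hypothetical sketch of how MODEL_ID / MODEL_ARCHITECTURE are presumably
# consumed downstream; not code from this commit.
import os

from transformers import pipeline


def load_pipeline():  # hypothetical helper name
    model_id = os.environ["MODEL_ID"]  # e.g. "google/flan-t5-base"
    architecture = os.environ.get("MODEL_ARCHITECTURE", "causal")

    # Encoder-decoder (seq2seq) models like T5 need the text2text-generation
    # task; decoder-only models like GPT-2 need text-generation.
    task = "text2text-generation" if architecture == "seq2seq" else "text-generation"
    return pipeline(task, model=model_id)

Loading a T5 checkpoint under the causal-LM head would typically fail outright, which is why the two variables are switched together here.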
app/llm/model.py CHANGED
@@ -455,54 +455,61 @@ class LocalLLM:
         """
         # For seq2seq models like T5, use a format that works better with their training
         if self.model_architecture == "seq2seq":
-            # Special formatting for T5 models which work better with task-specific prefixes
-            if "flan" in self.model_path.lower():
-                task_prompt = (
-                    f"Enhance this image prompt with artistic details: {prompt}"
-                )
-
-            # Generate with higher max tokens for T5 models
+            # Special handling for FLAN-T5 models
+            if "flan-t5" in self.model_path.lower():
                 try:
                     logger.info(
-                        f"Using seq2seq format for prompt expansion with model: {self.model_path}"
-                    )
-                    expanded = self.generate(
-                        prompt=task_prompt,
-                        system_prompt=None,  # T5 doesn't use system prompts the same way
-                        max_tokens=512,
-                        temperature=0.9,  # Higher temperature for more creative outputs
+                        f"Using optimized T5 format for prompt expansion with {self.model_path}"
                     )
 
-                    # If the model returns the input, try a different approach
-                    if (
-                        expanded.strip() == task_prompt.strip()
-                        or expanded.strip() == prompt.strip()
-                    ):
+                    # Try different instruction formats that work well with FLAN-T5
+                    prompts_to_try = [
+                        f"Create a detailed, professional-quality image description for: {prompt}",
+                        f"Turn this simple prompt into a detailed, vivid scene description: {prompt}",
+                        f"Enhance this image prompt with artistic details, lighting, and style: {prompt}",
+                    ]
+
+                    # Try each prompt format until we get a good result
+                    for task_prompt in prompts_to_try:
                         expanded = self.pipe(
-                            f"Generate a detailed visual description of: {prompt}",
-                            max_length=256,
+                            task_prompt,
+                            max_length=150,  # Allow longer expansions
                             do_sample=True,
-                            temperature=0.9,
+                            temperature=0.8,  # Slightly more focused than previous attempts
                             top_p=0.92,
-                        )[0]["generated_text"]
+                            repetition_penalty=1.2,  # Discourage repetition
+                        )[0]["generated_text"].strip()
+
+                        # Check if the result is good
+                        if expanded and len(expanded) > len(prompt) + 10:
+                            logger.info(f"Expanded prompt: {expanded[:100]}...")
+
+                            # For longer generations, check if we need to clean it up
+                            if len(expanded) > 200:
+                                sentences = expanded.split(".")
+                                # Keep first 3-4 meaningful sentences
+                                meaningful_sentences = [
+                                    s for s in sentences if len(s.strip()) > 5
+                                ][:4]
+                                expanded = ". ".join(meaningful_sentences)
+                                if not expanded.endswith("."):
+                                    expanded += "."
+
+                            return expanded
+
+                    # If all attempts failed, use a template-based expansion
+                    fallback = f"{prompt}, high resolution, professional photography, detailed, vivid colors, dramatic lighting"
+                    logger.info(f"Using template fallback: {fallback}")
+                    return fallback
 
-                    logger.info(f"Expanded prompt: {expanded[:100]}...")
-                    return expanded
                 except Exception as e:
-                    logger.error(f"Error expanding prompt with T5: {str(e)}")
-                    # Fall back to original prompt with some basic additions
-                    adjectives = [
-                        "vibrant",
-                        "detailed",
-                        "high-quality",
-                        "stunning",
-                        "professional",
-                    ]
-                    import random
-
-                    enhanced = f"{random.choice(adjectives)} {prompt}, {random.choice(adjectives)} artwork, highly detailed"
-                    logger.info(f"Fallback prompt expansion: {enhanced}")
-                    return enhanced
+                    logger.error(f"Error expanding prompt with FLAN-T5: {str(e)}")
+                    # Fall back to original prompt with enhancements
+                    fallback = (
+                        f"{prompt}, high quality, detailed, 4k, professional, artistic"
+                    )
+                    logger.info(f"Using error fallback: {fallback}")
+                    return fallback
 
         # Standard approach for causal LMs like GPT-2 or Llama
         system_prompt = """You are a creative assistant specializing in enhancing text prompts for image and 3D model generation.