cleaned code

Files changed:
- app/llm/client.py   +18 -6
- app/llm/model.py    +79 -602
- app/llm/service.py  +62 -151

app/llm/client.py  CHANGED
@@ -38,6 +38,7 @@ class LLMClient:
    """
    Client for interacting with the LLM service or direct model access.
    Provides methods to generate text and expand creative prompts.
+    Uses TinyLlama model for efficient prompt expansion.
    """

    def __init__(self, base_url: str = None):
@@ -56,7 +57,9 @@
        if self.spaces_mode or not self.base_url:
            if MODEL_SUPPORT:
                try:
-                    logger.info(
+                    logger.info(
+                        "Running in Spaces mode, initializing TinyLlama model..."
+                    )
                    self.local_model = get_llm_instance()
                    logger.info(f"Local model initialized successfully")
                except Exception as e:
@@ -122,14 +125,16 @@
        try:
            response = self.session.post(f"{self.base_url}/generate", json=payload)
            response.raise_for_status()
-            return response.json()[
+            return response.json()[
+                "result"
+            ]  # Updated to match service.py response format
        except requests.RequestException as e:
            logger.error(f"Failed to generate text: {str(e)}")
            return prompt

    def expand_prompt(self, prompt: str) -> str:
        """
-        Expand a creative prompt with rich details.
+        Expand a creative prompt with rich details using TinyLlama.

        Args:
            prompt: The user's original prompt
@@ -152,10 +157,13 @@

        try:
            response = self.session.post(
-                f"{self.base_url}/expand",
+                f"{self.base_url}/expand-prompt",
+                json={"prompt": prompt},  # Updated endpoint to match service.py
            )
            response.raise_for_status()
-            return response.json()[
+            return response.json()[
+                "expanded_prompt"
+            ]  # Updated to match service.py response format
        except requests.RequestException as e:
            logger.error(f"Failed to expand prompt: {str(e)}")
            return prompt
@@ -168,7 +176,11 @@
            Health status information
        """
        if self.local_model:
-            return {
+            return {
+                "status": "healthy",
+                "mode": "direct_model",
+                "model": "TinyLlama-1.1B-Chat-v1.0",
+            }

        if not self.base_url:
            return {"status": "unavailable", "reason": "no_service_url"}
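After this change the client posts to /expand-prompt and reads the "expanded_prompt" key from the JSON response. A minimal usage sketch, assuming the import path matches the repository layout and using an illustrative base URL (only expand_prompt is named in the visible hunks, so no other method names are assumed):

```python
# Hypothetical usage of the updated LLMClient; base_url value is an example.
from app.llm.client import LLMClient

client = LLMClient(base_url="http://localhost:8000")
expanded = client.expand_prompt("a lighthouse at dusk")
print(expanded)  # falls back to the original prompt if the service call fails
```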
app/llm/model.py  CHANGED
@@ -1,251 +1,62 @@
 import os
 import logging
-import torch
-import re
-from typing import Dict, List, Optional, Union
 from pathlib import Path
-import
-import

 logger = logging.getLogger(__name__)

-#
-
-    from transformers import (
-        AutoTokenizer,
-        AutoModelForCausalLM,
-        AutoModelForSeq2SeqLM,
-        pipeline,
-        AutoConfig,
-    )

-
-
-
-
-
-    )

-#
-
-    from ctransformers import AutoModelForCausalLM as CTAutoModelForCausalLM

-
-except ImportError:
-    HAS_CTRANSFORMERS = False
-    logger.warning("CTransformers library not found. GGUF models won't be available.")


 class LocalLLM:
    """
-    A
-
    """

-    def __init__(
-        self,
-        model_path: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
-        model_file: str = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-        model_type: str = "gguf",
-        model_architecture: str = "causal",
-        device_map: str = "auto",
-        torch_dtype=None,
-        use_quantization: bool = False,
-    ):
        """
        Initialize the local LLM.

        Args:
-
-            model_file: Specific model file to load (for GGUF models)
-            model_type: Type of model ('transformers' or 'gguf')
-            model_architecture: Architecture type ('causal' or 'seq2seq')
-            device_map: Device mapping strategy (default: "auto")
-            torch_dtype: Torch data type (default: float16)
-            use_quantization: Whether to use 8-bit quantization to reduce memory usage
        """
-        self.
-        self.
-
-
-
-        self.
-
-
-
-
-        # Set torch dtype if using transformers models
-        if torch_dtype is None and self.model_type != "gguf":
-            self.torch_dtype = torch.float16
-        else:
-            self.torch_dtype = torch_dtype
-
-        logger.info(f"Loading LLM from {model_path}")
-        logger.info(
-            f"Model type: {model_type}, architecture: {model_architecture}, model file: {model_file}"
-        )
-
-        # Various loading strategies based on model type
-        if self.model_type == "gguf":
-            self._load_gguf_model()
-        else:
-            self._load_transformers_model()
-
-    def _load_gguf_model(self):
-        """Load a GGUF model using CTransformers"""
-        if not HAS_CTRANSFORMERS:
-            raise ImportError(
-                "CTransformers library not found but required for GGUF models"
            )

-
-
-
-
-
-
-        if
-
-
-        else:
-            context_length = 2048  # Standard context length
-
-        logger.info(f"Using context length: {context_length}, threads: {threads}")
-
-        # If we have a model file specified, use it directly
-        if self.model_file and "/" in self.model_path:
-            logger.info(
-                f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
-            )
-
-            # Using the exact pattern from the example
-            self.model = CTAutoModelForCausalLM.from_pretrained(
-                self.model_path,
-                model_file=self.model_file,
-                model_type="llama",  # required for Llama/TinyLlama models
-                max_new_tokens=256,
-                context_length=context_length,
-                temperature=0.7,
-                top_p=0.95,
-                repetition_penalty=1.1,
-                threads=threads,  # CPU threads
-            )
-        else:
-            # Local path with model
-            logger.info(f"Loading local GGUF model: {self.model_path}")
-            self.model = CTAutoModelForCausalLM.from_pretrained(
-                self.model_path,
-                model_type="llama",
-            )
-
-            logger.info("GGUF model loaded successfully")
-
-        except Exception as e:
-            logger.error(f"Failed to load GGUF model: {str(e)}")
-            raise
-
-    def _load_transformers_model(self):
-        """Load a model using Hugging Face transformers"""
-        if not HAS_TRANSFORMERS:
-            raise ImportError(
-                "Transformers library not found but required for standard models"
-            )
-
-        try:
-            # When running in Spaces, we need more conservative settings
-            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
-
-            # Prepare model loading arguments
-            load_kwargs = {
-                "torch_dtype": self.torch_dtype,
-            }
-
-            # Add quantization for memory savings if requested
-            if self.use_quantization:
-                logger.info("Using 8-bit quantization for memory efficiency")
-                load_kwargs.update(
-                    {
-                        "load_in_8bit": True,
-                        "device_map": "auto",  # Force auto when using quantization
-                    }
-                )
-            else:
-                load_kwargs["device_map"] = self.device_map
-
-            # In Spaces, use more conservative loading options
-            if spaces_mode:
-                logger.info(
-                    "Running in Hugging Face Spaces, using minimal memory settings"
-                )
-                load_kwargs.update(
-                    {
-                        "low_cpu_mem_usage": True,
-                    }
-                )
-
-            # Load the tokenizer first - common to both architectures
-            tokenizer = AutoTokenizer.from_pretrained(self.model_path)
-
-            # Load the model based on architecture
-            if self.model_architecture == "seq2seq":
-                logger.info("Loading sequence-to-sequence model architecture")
-                model = AutoModelForSeq2SeqLM.from_pretrained(
-                    self.model_path, **load_kwargs
-                )
-                self.pipe = pipeline(
-                    "text2text-generation",
-                    model=model,
-                    tokenizer=tokenizer,
-                    framework="pt",
-                )
-            else:
-                # Standard causal language model
-                logger.info("Loading causal language model architecture")
-                # Skip the custom config handling for Spaces mode or small models
-                if (
-                    spaces_mode
-                    or "phi" in self.model_path.lower()
-                    or "tiny" in self.model_path.lower()
-                ):
-                    model = AutoModelForCausalLM.from_pretrained(
-                        self.model_path, **load_kwargs
-                    )
-                else:
-                    # Standard local loading with our custom config handling
-                    config = AutoConfig.from_pretrained(self.model_path)
-
-                    # Fix the rope_scaling issue for Llama models
-                    if hasattr(config, "rope_scaling") and isinstance(
-                        config.rope_scaling, dict
-                    ):
-                        config.rope_scaling["type"] = "linear"
-                        logger.info("Fixed rope_scaling configuration with type=linear")
-                    elif (
-                        not hasattr(config, "rope_scaling")
-                        and "llama" in self.model_path.lower()
-                    ):
-                        config.rope_scaling = {"type": "linear", "factor": 1.0}
-                        logger.info("Added default rope_scaling configuration")
-
-                    # Load the model with our fixed config
-                    model = AutoModelForCausalLM.from_pretrained(
-                        self.model_path, config=config, **load_kwargs
-                    )
-
-                # Create text generation pipeline for causal LM
-                self.pipe = pipeline(
-                    "text-generation", model=model, tokenizer=tokenizer, framework="pt"
-                )
-
-            # Store the model and tokenizer reference
-            self.model = model
-            self.tokenizer = tokenizer
-
-            logger.info("Transformers model loaded successfully")

-
-            logger.error(f"Failed to load transformers model: {str(e)}")
-            raise

    def generate(
        self,
@@ -256,7 +67,7 @@ class LocalLLM:
        top_p: float = 0.9,
    ) -> str:
        """
-        Generate text based on a prompt

        Args:
            prompt: The user prompt to generate from
@@ -268,400 +79,66 @@ class LocalLLM:
        Returns:
            The generated text
        """
-
-        if self.model_type == "gguf":
-            return self._generate_with_gguf(
-                prompt, system_prompt, max_tokens, temperature, top_p
-            )
-        else:
-            return self._generate_with_transformers(
-                prompt, system_prompt, max_tokens, temperature, top_p
-            )
-
-    def _generate_with_gguf(
-        self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        max_tokens: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.9,
-    ) -> str:
-        """Generate text using GGUF model"""
-        try:
-            # Format prompt for chat completion
-            formatted_prompt = prompt
-            if system_prompt:
-                # Format system and user prompts for chat
-                formatted_prompt = (
-                    f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>\n"
-                )
-
-            # Generate from the GGUF model
-            # Use a slightly more conservative max_new_tokens for spaces
-            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
-            if spaces_mode:
-                max_tokens = min(max_tokens, 256)  # Cap at 256 for faster responses
-
-            start_time = os.times().user
-            response = self.model(
-                formatted_prompt,
-                max_new_tokens=max_tokens,
-                temperature=temperature,
-                top_p=top_p,
-                stop=["<|user|>", "<|system|>", "<|end|>"],
-            )
-            end_time = os.times().user
-            generation_time = end_time - start_time
-            logger.info(f"GGUF generation completed in {generation_time:.2f}s")
-
-            return response
-
-        except Exception as e:
-            logger.error(f"Error during GGUF generation: {str(e)}")
-            return ""
-
-    def _generate_with_transformers(
-        self,
-        prompt: str,
-        system_prompt: Optional[str] = None,
-        max_tokens: int = 512,
-        temperature: float = 0.7,
-        top_p: float = 0.9,
-    ) -> str:
-        """Generate text using transformers pipeline"""
-        try:
-            # Handle seq2seq models (like T5)
-            if self.model_architecture == "seq2seq":
-                logger.debug(f"Generating with seq2seq model: {self.model_path}")
-
-                # Format prompt for seq2seq models
-                formatted_prompt = prompt
-                if system_prompt:
-                    formatted_prompt = f"{system_prompt}\n\nQuery: {prompt}"
-
-                # T5 models work best with specific task prefixes
-                if (
-                    "flan" in self.model_path.lower()
-                    and not formatted_prompt.startswith("enhance:")
-                ):
-                    formatted_prompt = f"enhance: {formatted_prompt}"

-
-
-
-                    max_length=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=True,
-                )

-
-
-                if "generated_text" in outputs[0]:
-                    return outputs[0]["generated_text"].strip()

-
-
-
-
-            has_chat_template = (
-                hasattr(self.tokenizer, "chat_template")
-                and self.tokenizer.chat_template is not None
-            )
-
-            # For models that support chat templates
-            if has_chat_template:
-                # Format messages for chat-style models
-                messages = []
-                # Add system prompt if provided
-                if system_prompt:
-                    messages.append({"role": "system", "content": system_prompt})
-                # Add user prompt
-                messages.append({"role": "user", "content": prompt})
-
-                logger.debug(f"Generating with chat messages: {prompt[:100]}...")
-
-                # Generate response using the pipeline
-                outputs = self.pipe(
-                    messages,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=True,
-                )
-
-                # Extract the assistant's response
-                response = outputs[0]["generated_text"][-1]["content"]
-                return response
-
-            # For non-chat models (like DistilGPT2)
-            else:
-                logger.debug(f"Using non-chat model format for: {self.model_path}")
-
-                # Format prompt directly for non-chat models
-                formatted_prompt = prompt
-                if system_prompt:
-                    formatted_prompt = (
-                        f"{system_prompt}\n\nUser: {prompt}\n\nAssistant:"
-                    )
-
-                outputs = self.pipe(
-                    formatted_prompt,
-                    max_new_tokens=max_tokens,
-                    temperature=temperature,
-                    top_p=top_p,
-                    do_sample=True,
-                    return_full_text=False,  # Only return the generated text, not the prompt
-                )

-
-
-
-

-
-

-
-
-
-            return prompt

-    def expand_creative_prompt(self,
        """
-
-        creative description suitable for image generation.

        Args:
-

        Returns:
            An expanded, detailed creative prompt
        """
-        #
-
-
-
-
-
-
-
-
-
-                "vivid colors",
-            ]
-            import random
-
-            # Select 2-3 random expansions
-            selected = random.sample(expansions, k=min(3, len(expansions)))
-            return f"{prompt}, {', '.join(selected)}"
-
-        # For GGUF models like TinyLlama, use a very specific chat format
-        if self.model_type == "gguf":
-            # This system prompt is now much more direct and explicit
-            system_prompt = "You enhance image generation prompts by adding style and quality descriptors."
-
-            user_prompt = f'Transform: "{prompt}" into "{prompt}, [artistic style], [quality details]". Max 40 words. No explanations.'
-
-            # Generate the expanded prompt
-            expanded = self.generate(
-                prompt=user_prompt,
-                system_prompt=system_prompt,
-                max_tokens=100,
-                temperature=0.7,
-            )
-
-            # Post-process the response
-            expanded = self._clean_expansion(prompt, expanded)
-            return expanded
-
-        # For Transformers models
-        else:
-            # Standard approach for causal LMs like GPT-2 or Llama
-            system_prompt = (
-                "You are a prompt engineer that enhances image generation prompts."
-            )
-
-            user_prompt = f'Enhance this prompt for image generation: "{prompt}"\n\nOutput format: "{prompt}, [style], [quality]"\n\nKeep it under 40 words.'
-
-            # Generate the expanded prompt
-            expanded = self.generate(
-                prompt=user_prompt,
-                system_prompt=system_prompt,
-                max_tokens=100,
-                temperature=0.7,
-            )
-
-            # Post-process the response
-            expanded = self._clean_expansion(prompt, expanded)
-            return expanded
-
-    def _clean_expansion(self, original_prompt: str, expanded_text: str) -> str:
-        """
-        Clean up the expanded prompt text to ensure proper formatting.
-
-        Args:
-            original_prompt: The original prompt for reference
-            expanded_text: The raw expanded text from the model
-
-        Returns:
-            Cleaned and properly formatted prompt
-        """
-        import re
-
-        # First, handle the common case where TinyLlama outputs multiple variations
-        # Split by instances of the original prompt
-        if expanded_text.lower().count(original_prompt.lower()) > 1:
-            # Multiple variations detected - just use the first one
-            parts = expanded_text.lower().split(original_prompt.lower(), 1)
-            if len(parts) > 1:
-                # Take just the first expansion
-                expanded_text = original_prompt + parts[1]
-                # Find the next occurrence of the prompt and cut everything after it
-                next_prompt_pos = expanded_text.lower().find(
-                    original_prompt.lower(), len(original_prompt)
-                )
-                if next_prompt_pos > 0:
-                    expanded_text = expanded_text[:next_prompt_pos].strip()
-
-        # First pass: remove obvious instruction text
-        patterns_to_remove = [
-            r"(?i)^\s*(?:output|enhanced prompt|result):\s*",  # Remove prefixes like "Output:" or "Enhanced prompt:"
-            r"(?i)\b(?:original prompt|start with|add|use|format|rule|follow|example)\b.*$",  # Remove instructions
-            r"^\s*\d+\.?\s*",  # Remove numbered list markers
-            r'^["\'](.*)["\']$',  # Remove quotes surrounding the entire text
-        ]
-
-        for pattern in patterns_to_remove:
-            expanded_text = re.sub(pattern, "", expanded_text, flags=re.MULTILINE)
-
-        # Normalize whitespace
-        expanded_text = " ".join(expanded_text.split())
-
-        # If the expansion doesn't start with the original prompt, add it
-        if not expanded_text.lower().startswith(original_prompt.lower()):
-            if "," in expanded_text and not expanded_text.startswith(","):
-                # Try to find where the original prompt might appear
-                parts = expanded_text.split(",", 1)
-                if original_prompt.lower() in parts[0].lower():
-                    # The first part contains the original prompt but with modifications
-                    expanded_text = f"{original_prompt}, {parts[1].strip()}"
-                else:
-                    expanded_text = f"{original_prompt}, {expanded_text}"
-            else:
-                expanded_text = f"{original_prompt}, {expanded_text}"
-
-        # Remove any duplicated commas
-        expanded_text = re.sub(r",\s*,", ",", expanded_text)
-
-        # Strict length control - limit expansion to approximately 40 words
-        # Count words in the expansion
-        words = expanded_text.split()
-        if len(words) > 40:
-            # Keep original prompt and just enough words to stay under 40
-            prompt_words = len(original_prompt.split())
-            # We need to keep the original prompt and stay under 40 total words
-            allowed_extra_words = 40 - prompt_words
-            # Join the original prompt with the allowed number of additional words
-            expanded_text = " ".join(words[: prompt_words + allowed_extra_words])
-
-        # Check if the expansion still contains instruction-like text or is too repetitive
-        instruction_indicators = [
-            "original prompt",
-            "add only",
-            "rule",
-            "format as",
-            "example",
-            "enhancement:",
-        ]
-        if any(
-            indicator in expanded_text.lower() for indicator in instruction_indicators
-        ):
-            # Emergency fallback - use hardcoded expansion phrases
-            expansions = [
-                "cinematic lighting",
-                "professional photography",
-                "8k resolution",
-                "dramatic angle",
-                "photorealistic",
-                "highly detailed",
-                "vivid colors",
-                "stunning detail",
-                "artistically composed",
-                "sharp focus",
-            ]
-            import random
-
-            # Select 2-3 random expansions
-            selected = random.sample(expansions, k=min(3, len(expansions)))
-            expanded_text = f"{original_prompt}, {', '.join(selected)}"

-
-        return expanded_text


-def get_llm_instance(
    """
-
-    Returns None if model loading fails, allowing graceful fallback.
-
-    Args:
-        model_path: Optional path to model or HuggingFace model ID

    Returns:
-
    """
-
-    if not use_local_model:
-        logger.info("Local model usage is disabled by environment setting")
-        return None
-
-    # Default to environment settings with fallbacks
-    if not model_path:
-        model_path = os.environ.get("MODEL_PATH") or os.environ.get(
-            "MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
-        )
-
-    # Get model file for GGUF models
-    model_file = os.environ.get("MODEL_FILENAME")
-
-    # Check model architecture - T5 models use seq2seq, others use causal LM
-    model_architecture = os.environ.get("MODEL_ARCHITECTURE", "causal").lower()
-
-    # Check model type - prefer GGUF for speed in resource-constrained environments
-    model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
-
-    # Check if quantization is enabled
-    use_quantization = os.environ.get("MODEL_QUANTIZED", "false").lower() == "true"
-
-    try:
-        # Check if the provided path is a local directory
-        if os.path.isdir(model_path):
-            logger.info(f"Using local model directory: {model_path}")
-        else:
-            logger.info(f"Using model ID from Hugging Face: {model_path}")
-
-        # Check available device backends
-        device_map = "auto"
-        torch_dtype = None
-
-        # For Hugging Face Spaces, be more careful about memory usage
-        spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
-        if spaces_mode and model_type != "gguf":
-            logger.info("Running in Hugging Face Spaces, using CPU for stability")
-            # Force CPU for Spaces with transformers models
-            device_map = "cpu" if not use_quantization else "auto"
-
-        # Create the LLM instance with appropriate settings
-        return LocalLLM(
-            model_path=model_path,
-            model_file=model_file,
-            model_type=model_type,
-            model_architecture=model_architecture,
-            device_map=device_map,
-            torch_dtype=torch_dtype,
-            use_quantization=use_quantization,
-        )
-    except Exception as e:
-        logger.error(f"Failed to create LLM instance: {e}")
-        return None
 import os
 import logging
 from pathlib import Path
+import torch
+from typing import Optional, Dict, Any, Union, List
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

 logger = logging.getLogger(__name__)

+# Constants for TinyLlama model
+MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

+# Default system prompt for creative expansion
+DEFAULT_CREATIVE_SYSTEM_PROMPT = """You are an expert assistant that helps expand creative prompts for image generation.
+Your goal is to enrich the original prompt with vivid visual details, artistic style suggestions, and composition elements.
+Focus on visual enhancement only. Keep your response concise (under 100 words) and focus entirely on the expanded prompt.
+Do not include explanations or comments - only return the enhanced prompt text."""

+# Default user template for creative expansion
+DEFAULT_CREATIVE_USER_TEMPLATE = """Original prompt: {original_prompt}

+Please expand this into a detailed, vivid prompt for image generation with rich visual elements, mood, and style."""


 class LocalLLM:
    """
+    A local LLM implementation using TinyLlama-1.1B-Chat.
+    Provides methods to generate text and expand creative prompts.
    """

+    def __init__(self, model_id: str = MODEL_ID):
        """
        Initialize the local LLM.

        Args:
+            model_id: The model ID to load
        """
+        self.model_id = model_id
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        # Configure quantization for efficient memory usage
+        quantization_config = None
+        if self.device == "cuda":
+            quantization_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_compute_dtype=torch.float16,
            )

+        # Load model and tokenizer
+        logger.info(f"Loading TinyLlama model on {self.device}...")
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            quantization_config=quantization_config,
+            device_map="auto" if self.device == "cuda" else None,
+            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
+        )

+        logger.info(f"TinyLlama model loaded successfully")

    def generate(
        self,
        top_p: float = 0.9,
    ) -> str:
        """
+        Generate text based on a prompt.

        Args:
            prompt: The user prompt to generate from

        Returns:
            The generated text
        """
+        messages = []

+        # Add system message if provided
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})

+        # Add user message
+        messages.append({"role": "user", "content": prompt})

+        # Format messages for the model
+        prompt_text = self.tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )

+        # Generate response
+        inputs = self.tokenizer(prompt_text, return_tensors="pt").to(self.device)
+        outputs = self.model.generate(
+            **inputs,
+            max_new_tokens=max_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_p=top_p,
+        )

+        # Decode the response and extract the assistant's message
+        full_output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

+        # Extract just the assistant's response
+        assistant_response = full_output[len(prompt_text) :].strip()
+        return assistant_response

+    def expand_creative_prompt(self, original_prompt: str) -> str:
        """
+        Expand a creative prompt with rich details for better image generation.

        Args:
+            original_prompt: The user's original prompt

        Returns:
            An expanded, detailed creative prompt
        """
+        # Format the prompt for creative expansion
+        prompt = DEFAULT_CREATIVE_USER_TEMPLATE.format(original_prompt=original_prompt)
+
+        # Generate expanded prompt
+        expanded = self.generate(
+            prompt=prompt,
+            system_prompt=DEFAULT_CREATIVE_SYSTEM_PROMPT,
+            max_tokens=150,  # Limit to ensure concise responses
+            temperature=0.7,  # Balanced creativity
+        )

+        return expanded


+def get_llm_instance() -> LocalLLM:
    """
+    Get an instance of the local LLM.

    Returns:
+        An initialized LocalLLM instance
    """
+    return LocalLLM(model_id=MODEL_ID)
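The rewritten model.py exposes only the TinyLlama-backed wrapper and a parameterless factory. A minimal sketch of exercising it directly, assuming the import path matches the repository layout (running it downloads TinyLlama and requires transformers, torch, and bitsandbytes on CUDA):

```python
# Sketch: using the new TinyLlama wrapper from app/llm/model.py.
from app.llm.model import get_llm_instance

llm = get_llm_instance()  # loads TinyLlama/TinyLlama-1.1B-Chat-v1.0, 4-bit on CUDA
text = llm.generate(
    prompt="Describe a foggy harbor at dawn.",
    system_prompt="You are a concise creative writer.",
    max_tokens=64,
)
expanded = llm.expand_creative_prompt("a foggy harbor at dawn")
print(text)
print(expanded)
```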
app/llm/service.py  CHANGED
@@ -1,139 +1,78 @@
-import os
 import logging
-import
 import sys
-from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
-from typing import Optional
-import psutil
-import uvicorn
-from dotenv import load_dotenv
 from pathlib import Path
-

-# Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
-        logging.FileHandler(
    ],
 )
-logger = logging.getLogger(

-#
-
-if env_path.exists():
-    load_dotenv(dotenv_path=env_path)
-    logger.info(f"Loaded environment variables from {env_path}")


-#
-app = FastAPI(

-#
 app.add_middleware(
    CORSMiddleware,
-    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )


-
-@app.middleware("http")
-async def log_requests(request: Request, call_next):
-    start_time = time.time()
-    logger.info(f"Request started: {request.method} {request.url.path}")
-
-    response = await call_next(request)
-
-    process_time = time.time() - start_time
-    logger.info(
-        f"Request completed: {request.method} {request.url.path} - Status: {response.status_code} - Duration: {process_time:.4f}s"
-    )
-
-    return response
-
-
-# Model request and response classes
-class PromptRequest(BaseModel):
    prompt: str
    system_prompt: Optional[str] = None
-    max_tokens: int = 512
-    temperature: float = 0.7
-    top_p: float = 0.9


-class
    prompt: str


-
-
-

-# Global LLM instance
-llm = None


-@app.on_event("startup")
-async def startup_event():
-    """Initialize the LLM on startup"""
-    global llm
-    logger.info("Starting LLM service initialization...")
-
-    # First check for MODEL_PATH (local model), then fall back to MODEL_ID
-    model_path = os.environ.get("MODEL_PATH")
-    if model_path and os.path.isdir(model_path):
-        logger.info(f"Using local model from MODEL_PATH: {model_path}")
-    else:
-        # Fall back to MODEL_ID if MODEL_PATH isn't set or doesn't exist
-        model_path = os.environ.get("MODEL_ID", "meta-llama/Llama-3.2-3B-Instruct")
-        logger.info(f"Using model ID from Hugging Face: {model_path}")
-
-    try:
-        start_time = time.time()
-        llm = get_llm_instance(model_path)
-        init_time = time.time() - start_time
-
-        logger.info(
-            f"LLM initialized successfully with model: {model_path} in {init_time:.2f} seconds"
-        )
-
-        memory = psutil.virtual_memory()
-        logger.info(
-            f"System memory: {memory.percent}% used ({memory.used / (1024**3):.1f}GB / {memory.total / (1024**3):.1f}GB)"
-        )
-
-    except Exception as e:
-        logger.error(f"Failed to initialize LLM: {str(e)}", exc_info=True)
-        raise
-
-
-@app.post("/generate", response_model=LLMResponse)
-async def generate_text(request: PromptRequest):
-    """Generate text based on a prompt"""
-    logger.info(
-        f"Received text generation request, prompt length: {len(request.prompt)} chars"
-    )
-    logger.debug(f"Prompt: {request.prompt[:50]}...")
-
-    if not llm:
-        logger.error("LLM service not initialized when generate endpoint was called")
-        raise HTTPException(status_code=503, detail="LLM service not initialized")

    try:
        start_time = time.time()

-
-            f"Generation parameters: max_tokens={request.max_tokens}, temperature={request.temperature}, top_p={request.top_p}"
-        )
-
-        response = llm.generate(
            prompt=request.prompt,
            system_prompt=request.system_prompt,
            max_tokens=request.max_tokens,
@@ -141,73 +80,45 @@ async def generate_text(request: PromptRequest):
            top_p=request.top_p,
        )

-
-

-
-
-
-
-
-        return LLMResponse(text=response)
    except Exception as e:
-        logger.error(f"Error
        raise HTTPException(status_code=500, detail=str(e))


-@app.post("/expand"
-
-    """Expand a creative prompt with
-    logger.info(f"Received prompt expansion request, prompt: '{request.prompt}'")
-
-    if not llm:
-        logger.error("LLM service not initialized when expand endpoint was called")
-        raise HTTPException(status_code=503, detail="LLM service not initialized")
-
    try:
        start_time = time.time()

-

-
-

-
-
-
-
-
-
-        return LLMResponse(text=expanded)
    except Exception as e:
-        logger.error(f"Error
        raise HTTPException(status_code=500, detail=str(e))


-
-async def health_check():
-    """Health check endpoint"""
-    logger.debug("Health check endpoint called")
-
-    if llm:
-        logger.info(f"Health check: LLM service is healthy, model: {llm.model_path}")
-        return {"status": "healthy", "model": llm.model_path}
-
-    logger.warning("Health check: LLM service is still initializing")
-    return {"status": "initializing"}
-
-
-# Start the service if run directly
 if __name__ == "__main__":

-
-
-
-
-        logger.warning(
-            "psutil not installed. Some system resource metrics will not be available."
-        )
-        logger.warning("Install with: pip install psutil")
-
-    logger.info("Starting LLM service server")
-    uvicorn.run(app, host="0.0.0.0", port=8001)
 import logging
+import os
 import sys
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel
+from typing import Dict, Any, List, Optional
 from pathlib import Path
+import time

+# Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(sys.stdout),
+        logging.FileHandler("llm_service.log", mode="a"),
    ],
 )
+logger = logging.getLogger(__name__)

+# Import the model
+from .model import get_llm_instance

+# Initialize model
+llm = get_llm_instance()

+# Create FastAPI app
+app = FastAPI(
+    title="LLM Service API",
+    description="API for interacting with the local LLM",
+    version="1.0.0",
+)

+# Configure CORS
 app.add_middleware(
    CORSMiddleware,
+    allow_origins=["*"],  # In production, specify actual origins
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )


+class GenerateRequest(BaseModel):
    prompt: str
    system_prompt: Optional[str] = None
+    max_tokens: Optional[int] = 512
+    temperature: Optional[float] = 0.7
+    top_p: Optional[float] = 0.9


+class ExpandPromptRequest(BaseModel):
    prompt: str


+@app.get("/")
+def read_root():
+    return {"status": "ok", "message": "LLM Service is running"}


+@app.get("/health")
+def health_check():
+    """Health check endpoint"""
+    return {"status": "healthy", "model": "TinyLlama-1.1B-Chat-v1.0"}


+@app.post("/generate")
+def generate_text(request: GenerateRequest):
+    """Generate text from a prompt"""
    try:
        start_time = time.time()
+        logger.info(f"Generating text for prompt: {request.prompt[:50]}...")

+        result = llm.generate(
            prompt=request.prompt,
            system_prompt=request.system_prompt,
            max_tokens=request.max_tokens,
            top_p=request.top_p,
        )

+        elapsed = time.time() - start_time
+        logger.info(f"Generation completed in {elapsed:.2f} seconds")

+        return {
+            "result": result,
+            "elapsed_seconds": elapsed,
+        }
    except Exception as e:
+        logger.error(f"Error during text generation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


+@app.post("/expand-prompt")
+def expand_prompt(request: ExpandPromptRequest):
+    """Expand a creative prompt with more detail"""
    try:
        start_time = time.time()
+        logger.info(f"Expanding prompt: {request.prompt[:50]}...")

+        expanded_prompt = llm.expand_creative_prompt(request.prompt)

+        elapsed = time.time() - start_time
+        logger.info(f"Prompt expansion completed in {elapsed:.2f} seconds")

+        return {
+            "original_prompt": request.prompt,
+            "expanded_prompt": expanded_prompt,
+            "elapsed_seconds": elapsed,
+        }
    except Exception as e:
+        logger.error(f"Error during prompt expansion: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


+# Run the service when executed directly
 if __name__ == "__main__":
+    import uvicorn

+    port = int(os.environ.get("PORT", 8000))
+    host = os.environ.get("HOST", "0.0.0.0")
+    logger.info(f"Starting LLM service on {host}:{port}")
+    uvicorn.run(app, host=host, port=port)
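A minimal sketch of calling the rewritten service once it is running, assuming the default port 8000 from the code above; the payloads and response keys mirror GenerateRequest, ExpandPromptRequest, and the dictionaries returned by the endpoints:

```python
# Sketch: exercising the FastAPI endpoints defined in app/llm/service.py.
import requests

BASE = "http://localhost:8000"  # default PORT used by the service

print(requests.get(f"{BASE}/health").json())  # {"status": "healthy", "model": ...}

gen = requests.post(
    f"{BASE}/generate",
    json={"prompt": "Write a haiku about rain.", "max_tokens": 64},
).json()
print(gen["result"], gen["elapsed_seconds"])

exp = requests.post(
    f"{BASE}/expand-prompt",
    json={"prompt": "a castle in the clouds"},
).json()
print(exp["expanded_prompt"])
```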