Zoro-chi committed
Commit 37ed2a0 · 1 Parent(s): f1cc7f4

Switch to TinyLlama GGUF model for much faster inference in Hugging Face Spaces

Files changed (3)
  1. .env.spaces +5 -2
  2. app/llm/model.py +220 -59
  3. requirements-hf.txt +1 -0
.env.spaces CHANGED
@@ -7,10 +7,13 @@ HF_SPACES=1
 TEXT_TO_IMAGE_APP_ID=c25dcd829d134ea98f5ae4dd311d13bc.node3.openfabric.network
 IMAGE_TO_3D_APP_ID=f0b5f319156c4819b9827000b17e511a.node3.openfabric.network
 
-# LLM Configuration for Spaces - use a tiny model that can run in limited memory
-MODEL_ID=microsoft/phi-1_5
+# LLM Configuration for Spaces - use a very fast model optimized for efficiency
+MODEL_ID=TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
 USE_LOCAL_MODEL=true
 MODEL_QUANTIZED=true
+MODEL_TYPE=gguf
+MODEL_REVISION=main
+MODEL_FILENAME=tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
 
 # Data Directories (Spaces-friendly paths)
 IMAGE_OUTPUT_DIR=/tmp/data/images
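
As a quick orientation (not part of the commit), a minimal sketch of how these Spaces settings might be read at startup; python-dotenv and the explicit load_dotenv(".env.spaces") call are assumptions, only the variable names and values come from the file above.

# Minimal sketch; assumes python-dotenv is installed and .env.spaces is the active env file
import os
from dotenv import load_dotenv

load_dotenv(".env.spaces")

model_id = os.environ.get("MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
model_file = os.environ.get("MODEL_FILENAME")  # e.g. the Q4_K_M GGUF file above

print(f"LLM config: {model_id} ({model_type}, file={model_file})")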
app/llm/model.py CHANGED
@@ -2,21 +2,44 @@ import os
 from typing import Dict, List, Optional, Union
 import logging
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
 from pathlib import Path
+import json
+import tempfile
 
 logger = logging.getLogger(__name__)
 
+# Try to import transformers and ctransformers
+try:
+    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
+
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
+    logger.warning(
+        "Transformers library not found. Standard models won't be available."
+    )
+
+# Try to import ctransformers for GGUF support
+try:
+    from ctransformers import AutoModelForCausalLM as CTAutoModelForCausalLM
+
+    HAS_CTRANSFORMERS = True
+except ImportError:
+    HAS_CTRANSFORMERS = False
+    logger.warning("CTransformers library not found. GGUF models won't be available.")
+
 
 class LocalLLM:
     """
-    A wrapper for running local LLMs using the Hugging Face Transformers library.
+    A wrapper for running local LLMs using either Hugging Face Transformers or CTransformers.
     Optimized for creative prompt expansion and interpretation.
     """
 
     def __init__(
         self,
-        model_path: str = "microsoft/phi-1_5",  # Changed default to a much smaller model
+        model_path: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+        model_file: str = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+        model_type: str = "gguf",
         device_map: str = "auto",
         torch_dtype=None,
         use_quantization: bool = False,
@@ -26,34 +49,111 @@ class LocalLLM:
 
         Args:
             model_path: Path to model or HuggingFace model ID
+            model_file: Specific model file to load (for GGUF models)
+            model_type: Type of model ('transformers' or 'gguf')
             device_map: Device mapping strategy (default: "auto")
-            torch_dtype: Torch data type (default: bfloat16 if available, otherwise float16)
+            torch_dtype: Torch data type (default: float16)
             use_quantization: Whether to use 8-bit quantization to reduce memory usage
         """
         self.model_path = model_path
+        self.model_file = model_file
+        self.model_type = model_type.lower()
         self.device_map = device_map
         self.use_quantization = use_quantization
+        self.pipe = None
+        self.model = None
+        self.tokenizer = None
 
-        if torch_dtype is None:
-            # Set default dtype based on device
-            if device_map == "mps":
-                # Apple Silicon uses float16
-                self.torch_dtype = torch.float16
-            elif (
-                torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-            ):
-                # Modern NVIDIA GPUs use bfloat16
-                self.torch_dtype = torch.bfloat16
-            else:
-                # Default to float16 for other cases
-                self.torch_dtype = torch.float16
+        # Set torch dtype if using transformers models
+        if torch_dtype is None and self.model_type != "gguf":
+            self.torch_dtype = torch.float16
         else:
             self.torch_dtype = torch_dtype
 
         logger.info(f"Loading LLM from {model_path}")
-        logger.info(
-            f"Using device: {device_map}, dtype: {self.torch_dtype}, quantization: {use_quantization}"
-        )
+        logger.info(f"Model type: {model_type}, model file: {model_file}")
+
+        # Various loading strategies based on model type
+        if self.model_type == "gguf":
+            self._load_gguf_model()
+        else:
+            self._load_transformers_model()
+
+    def _load_gguf_model(self):
+        """Load a GGUF model using CTransformers"""
+        if not HAS_CTRANSFORMERS:
+            raise ImportError(
+                "CTransformers library not found but required for GGUF models"
+            )
+
+        try:
+            # Handle spaces and CPU constraints
+            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
+
+            # Determine model file - either specific file or default
+            if self.model_file:
+                model_file = self.model_file
+            else:
+                model_file = None  # Let ctransformers choose default
+
+            # For Hugging Face models with specific files
+            if "/" in self.model_path and self.model_file:
+                logger.info(
+                    f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
+                )
+
+                # CPU threads based on environment or default to 4
+                cpu_threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))
+
+                # Very optimized settings for spaces
+                if spaces_mode:
+                    logger.info("Using optimized settings for Spaces environment")
+                    # Use context length of 512 for faster responses
+                    context_length = 512
+                    # Batch size 512 is good balance for small models
+                    batch_size = 512
+                else:
+                    # Standard settings for more powerful environments
+                    context_length = 2048
+                    batch_size = 1024
+
+                logger.info(
+                    f"Using context length: {context_length}, batch size: {batch_size}, CPU threads: {cpu_threads}"
+                )
+
+                # Create the model with optimized parameters
+                self.model = CTAutoModelForCausalLM.from_pretrained(
+                    self.model_path,
+                    model_file=self.model_file,
+                    model_type="llama",
+                    context_length=context_length,
+                    batch_size=batch_size,
+                    cpu_threads=cpu_threads,
+                    # Add streaming options for better memory usage and fast first token
+                    stream=True,
+                    reset=True,
+                )
+
+            else:
+                # Local path with model
+                logger.info(f"Loading local GGUF model: {self.model_path}")
+                self.model = CTAutoModelForCausalLM.from_pretrained(
+                    self.model_path,
+                    model_type="llama",
+                )
+
+            logger.info("GGUF model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to load GGUF model: {str(e)}")
+            raise
+
+    def _load_transformers_model(self):
+        """Load a model using Hugging Face transformers"""
+        if not HAS_TRANSFORMERS:
+            raise ImportError(
+                "Transformers library not found but required for standard models"
+            )
 
         try:
             # When running in Spaces, we need more conservative settings
@@ -74,7 +174,7 @@ class LocalLLM:
                     }
                 )
             else:
-                load_kwargs["device_map"] = device_map
+                load_kwargs["device_map"] = self.device_map
 
             # In Spaces, use more conservative loading options
             if spaces_mode:
@@ -89,21 +189,19 @@ class LocalLLM:
                     }
                 )
 
-            # For Phi models, use even more conservative settings
-            if "phi" in model_path.lower():
-                load_kwargs.update(
-                    {
-                        "torch_dtype": torch.float16,  # Force float16 for Phi model
-                    }
-                )
-
             # Skip the custom config handling for Spaces mode or small models
-            if spaces_mode or "phi" in model_path.lower():
-                model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
-                tokenizer = AutoTokenizer.from_pretrained(model_path)
+            if (
+                spaces_mode
+                or "phi" in self.model_path.lower()
+                or "tiny" in self.model_path.lower()
+            ):
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, **load_kwargs
+                )
+                tokenizer = AutoTokenizer.from_pretrained(self.model_path)
             else:
                 # Standard local loading with our custom config handling
-                config = AutoConfig.from_pretrained(model_path)
+                config = AutoConfig.from_pretrained(self.model_path)
 
                 # Fix the rope_scaling issue for Llama models
                 if hasattr(config, "rope_scaling") and isinstance(
@@ -113,35 +211,30 @@ class LocalLLM:
                     logger.info("Fixed rope_scaling configuration with type=linear")
                 elif (
                     not hasattr(config, "rope_scaling")
-                    and "llama" in model_path.lower()
+                    and "llama" in self.model_path.lower()
                 ):
                     config.rope_scaling = {"type": "linear", "factor": 1.0}
                     logger.info("Added default rope_scaling configuration")
 
                 # Load the tokenizer
-                tokenizer = AutoTokenizer.from_pretrained(model_path)
+                tokenizer = AutoTokenizer.from_pretrained(self.model_path)
 
                 # Load the model with our fixed config
-                if device_map == "mps":
-                    # For Apple Silicon, load to device directly
-                    model = AutoModelForCausalLM.from_pretrained(
-                        model_path, config=config, **load_kwargs
-                    )
-                else:
-                    # For other devices, use the device_map parameter
-                    model = AutoModelForCausalLM.from_pretrained(
-                        model_path, config=config, **load_kwargs
-                    )
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, config=config, **load_kwargs
+                )
 
             # Create the pipeline with our pre-loaded model and tokenizer
             self.pipe = pipeline(
                 "text-generation", model=model, tokenizer=tokenizer, framework="pt"
             )
+            self.model = model
+            self.tokenizer = tokenizer
 
-            logger.info("LLM loaded successfully")
+            logger.info("Transformers model loaded successfully")
 
         except Exception as e:
-            logger.error(f"Failed to load model: {str(e)}")
+            logger.error(f"Failed to load transformers model: {str(e)}")
             raise
 
     def generate(
@@ -165,6 +258,67 @@ class LocalLLM:
         Returns:
             The generated text
         """
+        # Different handling based on model type
+        if self.model_type == "gguf":
+            return self._generate_with_gguf(
+                prompt, system_prompt, max_tokens, temperature, top_p
+            )
+        else:
+            return self._generate_with_transformers(
+                prompt, system_prompt, max_tokens, temperature, top_p
+            )
+
+    def _generate_with_gguf(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+    ) -> str:
+        """Generate text using GGUF model"""
+        try:
+            # Format prompt for chat completion
+            formatted_prompt = prompt
+            if system_prompt:
+                # Format system and user prompts for chat
+                formatted_prompt = (
+                    f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>\n"
+                )
+
+            # Generate from the GGUF model
+            # Use a slightly more conservative max_new_tokens for spaces
+            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
+            if spaces_mode:
+                max_tokens = min(max_tokens, 256)  # Cap at 256 for faster responses
+
+            start_time = os.times().user
+            response = self.model(
+                formatted_prompt,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                stop=["<|user|>", "<|system|>", "<|end|>"],
+            )
+            end_time = os.times().user
+            generation_time = end_time - start_time
+            logger.info(f"GGUF generation completed in {generation_time:.2f}s")
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Error during GGUF generation: {str(e)}")
+            return ""
+
+    def _generate_with_transformers(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+    ) -> str:
+        """Generate text using transformers pipeline"""
         # Format messages for chat-style models
         messages = []
 
@@ -192,7 +346,7 @@ class LocalLLM:
             return response
 
         except Exception as e:
-            logger.error(f"Error during generation: {str(e)}")
+            logger.error(f"Error during transformers generation: {str(e)}")
             return ""
 
     def expand_creative_prompt(self, prompt: str) -> str:
@@ -240,18 +394,23 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
     Returns:
         A LocalLLM instance or None if model loading fails
    """
-    # If model path not provided, first check for MODEL_PATH, then MODEL_ID from environment
-    if not model_path:
-        model_path = os.environ.get("MODEL_PATH") or os.environ.get(
-            "MODEL_ID", "microsoft/phi-1_5"  # Changed default to a smaller model
-        )
-
-    # Check if local models should be disabled (useful in restricted environments)
     use_local_model = os.environ.get("USE_LOCAL_MODEL", "true").lower() != "false"
     if not use_local_model:
         logger.info("Local model usage is disabled by environment setting")
         return None
 
+    # Default to environment settings with fallbacks
+    if not model_path:
+        model_path = os.environ.get("MODEL_PATH") or os.environ.get(
+            "MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+        )
+
+    # Get model file for GGUF models
+    model_file = os.environ.get("MODEL_FILENAME")
+
+    # Check model type - prefer GGUF for speed in resource-constrained environments
+    model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
+
     # Check if quantization is enabled
     use_quantization = os.environ.get("MODEL_QUANTIZED", "false").lower() == "true"
 
@@ -266,16 +425,18 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
     device_map = "auto"
     torch_dtype = None
 
-    # For Hugging Face Spaces, we need to be more careful about memory usage
+    # For Hugging Face Spaces, be more careful about memory usage
     spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
-    if spaces_mode:
+    if spaces_mode and model_type != "gguf":
         logger.info("Running in Hugging Face Spaces, using CPU for stability")
-        # Force CPU for Spaces (most Spaces have very limited GPU resources)
+        # Force CPU for Spaces with transformers models
        device_map = "cpu" if not use_quantization else "auto"
 
-    # Create the LLM instance
+    # Create the LLM instance with appropriate settings
     return LocalLLM(
         model_path=model_path,
+        model_file=model_file,
+        model_type=model_type,
         device_map=device_map,
         torch_dtype=torch_dtype,
         use_quantization=use_quantization,
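
As a rough usage sketch (not part of the commit), the new GGUF path could be exercised as below; get_llm_instance() and the generate() parameters come from the diff above, while the package import path, environment setup, and prompt text are assumptions.

# Usage sketch; assumes the repo root is on PYTHONPATH so app/ imports as a package
import os

os.environ.setdefault("HF_SPACES", "1")
os.environ.setdefault("MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
os.environ.setdefault("MODEL_TYPE", "gguf")
os.environ.setdefault("MODEL_FILENAME", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")

from app.llm.model import get_llm_instance

llm = get_llm_instance()
if llm is not None:
    # generate() routes to _generate_with_gguf() because MODEL_TYPE=gguf
    text = llm.generate(
        "A cozy cabin in a snowy forest",  # illustrative prompt
        system_prompt="Expand the prompt with vivid visual detail.",
        max_tokens=128,
    )
    print(text)

With HF_SPACES=1 and MODEL_TYPE=gguf, the loader above uses a 512-token context and caps max_new_tokens at 256, which is where the faster inference in Spaces comes from.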
requirements-hf.txt CHANGED
@@ -20,6 +20,7 @@ transformers>=4.43.0
 torch>=2.0.0
 huggingface_hub>=0.16.0
 accelerate>=0.21.0
+ctransformers>=0.2.24  # For GGUF model support
 
 # API and utilities
 fastapi>=0.100.0