Add support for google/flan-t5-small with proper sequence-to-sequence model handling
Files changed:
- app.py (+7 -2)
- app/llm/model.py (+98 -36)
app.py CHANGED

@@ -28,10 +28,15 @@ os.environ["HF_SPACES"] = "1"  # Flag to indicate we're running in Spaces
 
 # Set model environment variables explicitly for Hugging Face Spaces
 # These will override any variables loaded from .env.spaces
-os.environ["MODEL_ID"] = "
+os.environ["MODEL_ID"] = "google/flan-t5-small"  # Use flan-t5-small model
 os.environ["USE_LOCAL_MODEL"] = "true"
 os.environ["MODEL_TYPE"] = "transformers"
-os.environ["MODEL_QUANTIZED"] =
+os.environ["MODEL_QUANTIZED"] = (
+    "false"  # Disable quantization to avoid bitsandbytes dependency
+)
+os.environ["MODEL_ARCHITECTURE"] = (
+    "seq2seq"  # T5 models are sequence-to-sequence, not causal LM
+)
 
 # Import UI module directly
 try:
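The block above only sets plain environment variables; everything downstream reads them back with `os.environ.get` and a default. A minimal sketch of that read-back, assuming the same keys (the boolean parsing of `MODEL_QUANTIZED` is illustrative, not copied from the repository):

```python
import os

# Values written by app.py for the Spaces deployment.
os.environ["MODEL_ID"] = "google/flan-t5-small"
os.environ["MODEL_ARCHITECTURE"] = "seq2seq"
os.environ["MODEL_QUANTIZED"] = "false"

# Read-back with defaults, in the same style get_llm_instance() uses below.
model_id = os.environ.get("MODEL_ID", "")
architecture = os.environ.get("MODEL_ARCHITECTURE", "causal").lower()
quantized = os.environ.get("MODEL_QUANTIZED", "false").lower() == "true"  # illustrative parsing

print(model_id, architecture, quantized)  # google/flan-t5-small seq2seq False
```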
app/llm/model.py CHANGED

@@ -10,7 +10,13 @@ logger = logging.getLogger(__name__)
 
 # Try to import transformers and ctransformers
 try:
-    from transformers import
+    from transformers import (
+        AutoTokenizer,
+        AutoModelForCausalLM,
+        AutoModelForSeq2SeqLM,
+        pipeline,
+        AutoConfig,
+    )
 
     HAS_TRANSFORMERS = True
 except ImportError:
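The new imports pull in both auto classes so the loader can pick one at runtime. As a side note, the same causal/seq2seq split could be detected from the checkpoint itself; the helper below is hypothetical (not part of this commit) and relies on the standard `is_encoder_decoder` flag, which T5-family configs such as flan-t5-small set to `True`:

```python
from transformers import AutoConfig

def infer_architecture(model_id: str) -> str:
    """Hypothetical helper: map a model ID to 'seq2seq' or 'causal'."""
    config = AutoConfig.from_pretrained(model_id)  # fetches only the config JSON
    return "seq2seq" if getattr(config, "is_encoder_decoder", False) else "causal"

print(infer_architecture("google/flan-t5-small"))  # seq2seq
```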
@@ -40,6 +46,7 @@ class LocalLLM:
         model_path: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
         model_file: str = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
         model_type: str = "gguf",
+        model_architecture: str = "causal",
         device_map: str = "auto",
         torch_dtype=None,
         use_quantization: bool = False,
@@ -51,6 +58,7 @@ class LocalLLM:
             model_path: Path to model or HuggingFace model ID
             model_file: Specific model file to load (for GGUF models)
             model_type: Type of model ('transformers' or 'gguf')
+            model_architecture: Architecture type ('causal' or 'seq2seq')
             device_map: Device mapping strategy (default: "auto")
             torch_dtype: Torch data type (default: float16)
             use_quantization: Whether to use 8-bit quantization to reduce memory usage
@@ -58,6 +66,7 @@ class LocalLLM:
         self.model_path = model_path
         self.model_file = model_file
         self.model_type = model_type.lower()
+        self.model_architecture = model_architecture.lower()
         self.device_map = device_map
         self.use_quantization = use_quantization
         self.pipe = None
@@ -71,7 +80,9 @@ class LocalLLM:
         self.torch_dtype = torch_dtype
 
         logger.info(f"Loading LLM from {model_path}")
-        logger.info(
+        logger.info(
+            f"Model type: {model_type}, architecture: {model_architecture}, model file: {model_file}"
+        )
 
         # Various loading strategies based on model type
         if self.model_type == "gguf":
@@ -184,50 +195,64 @@ class LocalLLM:
         load_kwargs.update(
             {
                 "low_cpu_mem_usage": True,
-                "offload_folder": "/tmp/offload",
-                "offload_state_dict": True,
             }
         )
 
-        #
-
-
-
-
-
-        model =
+        # Load the tokenizer first - common to both architectures
+        tokenizer = AutoTokenizer.from_pretrained(self.model_path)
+
+        # Load the model based on architecture
+        if self.model_architecture == "seq2seq":
+            logger.info("Loading sequence-to-sequence model architecture")
+            model = AutoModelForSeq2SeqLM.from_pretrained(
                 self.model_path, **load_kwargs
             )
-
+            self.pipe = pipeline(
+                "text2text-generation",
+                model=model,
+                tokenizer=tokenizer,
+                framework="pt",
+            )
         else:
-            # Standard
-
-
-
-
-
-
-                config.rope_scaling["type"] = "linear"
-                logger.info("Fixed rope_scaling configuration with type=linear")
-            elif (
-                not hasattr(config, "rope_scaling")
-                and "llama" in self.model_path.lower()
+            # Standard causal language model
+            logger.info("Loading causal language model architecture")
+            # Skip the custom config handling for Spaces mode or small models
+            if (
+                spaces_mode
+                or "phi" in self.model_path.lower()
+                or "tiny" in self.model_path.lower()
             ):
-
-
-
-
-
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, **load_kwargs
+                )
+            else:
+                # Standard local loading with our custom config handling
+                config = AutoConfig.from_pretrained(self.model_path)
+
+                # Fix the rope_scaling issue for Llama models
+                if hasattr(config, "rope_scaling") and isinstance(
+                    config.rope_scaling, dict
+                ):
+                    config.rope_scaling["type"] = "linear"
+                    logger.info("Fixed rope_scaling configuration with type=linear")
+                elif (
+                    not hasattr(config, "rope_scaling")
+                    and "llama" in self.model_path.lower()
+                ):
+                    config.rope_scaling = {"type": "linear", "factor": 1.0}
+                    logger.info("Added default rope_scaling configuration")
+
+                # Load the model with our fixed config
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, config=config, **load_kwargs
+                )
 
-        #
-
-
+            # Create text generation pipeline for causal LM
+            self.pipe = pipeline(
+                "text-generation", model=model, tokenizer=tokenizer, framework="pt"
             )
 
-        #
-        self.pipe = pipeline(
-            "text-generation", model=model, tokenizer=tokenizer, framework="pt"
-        )
+        # Store the model and tokenizer reference
         self.model = model
         self.tokenizer = tokenizer
 
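Stripped of the class plumbing, the seq2seq branch in the hunk above is the standard Transformers pattern for T5-style checkpoints. A minimal sketch (default CPU placement, no quantization; the extra `load_kwargs` such as `low_cpu_mem_usage` are omitted):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Encoder-decoder models use the text2text-generation task, not text-generation.
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, framework="pt")

print(pipe("enhance: make this error message clearer", max_length=64))
```

Loading the same checkpoint through `AutoModelForCausalLM` raises an error because the T5 config is not registered for that auto class, which is exactly the failure the new `model_architecture` switch avoids.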
@@ -320,6 +345,39 @@
     ) -> str:
         """Generate text using transformers pipeline"""
         try:
+            # Handle seq2seq models (like T5)
+            if self.model_architecture == "seq2seq":
+                logger.debug(f"Generating with seq2seq model: {self.model_path}")
+
+                # Format prompt for seq2seq models
+                formatted_prompt = prompt
+                if system_prompt:
+                    formatted_prompt = f"{system_prompt}\n\nQuery: {prompt}"
+
+                # T5 models work best with specific task prefixes
+                if (
+                    "flan" in self.model_path.lower()
+                    and not formatted_prompt.startswith("enhance:")
+                ):
+                    formatted_prompt = f"enhance: {formatted_prompt}"
+
+                # Generate with seq2seq model
+                outputs = self.pipe(
+                    formatted_prompt,
+                    max_length=max_tokens,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True,
+                )
+
+                # Extract the generated text
+                if isinstance(outputs, list) and len(outputs) > 0:
+                    if "generated_text" in outputs[0]:
+                        return outputs[0]["generated_text"].strip()
+
+                # Fallback extraction
+                return str(outputs).strip()
+
             # Check if the model can handle chat templates
             has_chat_template = (
                 hasattr(self.tokenizer, "chat_template")
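The prompt shaping in the seq2seq path is plain string assembly; a short sketch with illustrative strings (the `enhance:` prefix and the `Query:` layout come from the hunk above):

```python
system_prompt = "Rewrite the user's request so it is specific and unambiguous."
prompt = "make my API faster"

# Combine system and user prompt, then add the task prefix for FLAN models.
formatted_prompt = f"{system_prompt}\n\nQuery: {prompt}" if system_prompt else prompt
if not formatted_prompt.startswith("enhance:"):
    formatted_prompt = f"enhance: {formatted_prompt}"

print(formatted_prompt)
```

The extraction step that follows relies on the text2text-generation pipeline returning a list of dicts keyed by `generated_text`, with `str(outputs)` kept only as a fallback.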
@@ -443,6 +501,9 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
     # Get model file for GGUF models
     model_file = os.environ.get("MODEL_FILENAME")
 
+    # Check model architecture - T5 models use seq2seq, others use causal LM
+    model_architecture = os.environ.get("MODEL_ARCHITECTURE", "causal").lower()
+
     # Check model type - prefer GGUF for speed in resource-constrained environments
     model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
 
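Because the lookup defaults to `"causal"`, the new variable is opt-in: deployments that never set `MODEL_ARCHITECTURE` keep the previous behaviour. A two-line check of that default:

```python
import os

os.environ.pop("MODEL_ARCHITECTURE", None)  # simulate a deployment without the new variable
assert os.environ.get("MODEL_ARCHITECTURE", "causal").lower() == "causal"
```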
@@ -472,6 +533,7 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
         model_path=model_path,
         model_file=model_file,
         model_type=model_type,
+        model_architecture=model_architecture,
         device_map=device_map,
         torch_dtype=torch_dtype,
         use_quantization=use_quantization,
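Putting the two files together, a hypothetical end-to-end check; the `app.llm.model` import path and the assumption that constructing the instance loads the checkpoint immediately are inferred from the diff, not verified:

```python
import os

os.environ.update(
    {
        "MODEL_ID": "google/flan-t5-small",
        "USE_LOCAL_MODEL": "true",
        "MODEL_TYPE": "transformers",
        "MODEL_QUANTIZED": "false",
        "MODEL_ARCHITECTURE": "seq2seq",
    }
)

from app.llm.model import get_llm_instance  # assumed import path

llm = get_llm_instance()
assert llm is not None and llm.model_architecture == "seq2seq"
assert llm.pipe is not None and llm.pipe.task == "text2text-generation"
```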