CamiloVega committed
Commit fb3575f · verified · 1 Parent(s): 9dd3257

Update app.py

Files changed (1): app.py (+59, −37)
app.py CHANGED
@@ -52,7 +52,7 @@ class ModelManager:
 
     @spaces.GPU()
     def initialize_llm(self):
-        """Initialize LLM model with unsloth optimization"""
+        """Initialize LLM model with optimization"""
         try:
             MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
 
@@ -64,36 +64,41 @@
             )
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            # Configure 4-bit quantization
-            bnb_config = BitsAndBytesConfig(
-                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_compute_dtype=torch.float16,
-                bnb_4bit_use_double_quant=True
-            )
-
-            logger.info("Loading and optimizing model with unsloth...")
-            # Use unsloth to load and optimize the model
-            self.model, self.tokenizer = FastLanguageModel.from_pretrained(
-                model_name=MODEL_NAME,
-                token=HUGGINGFACE_TOKEN,
-                quantization_config=bnb_config,
-                max_seq_length=2048,
-                device_map="auto"
-            )
-
-            # Optimize with unsloth
-            self.model = FastLanguageModel.get_peft_model(
-                self.model,
-                r=16,
-                target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
-                                "gate_proj", "up_proj", "down_proj"],
-                lora_alpha=16,
-                lora_dropout=0,
-                bias="none",
-                use_gradient_checkpointing=True,
-                random_state=3407
-            )
+            try:
+                # Try with unsloth first
+                logger.info("Attempting to load model with unsloth optimization...")
+                self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+                    model_name=MODEL_NAME,
+                    token=HUGGINGFACE_TOKEN,
+                    load_in_8bit=True,
+                    max_seq_length=2048,
+                    device_map="auto"
+                )
+
+                # Optimize with unsloth
+                self.model = FastLanguageModel.get_peft_model(
+                    self.model,
+                    r=8,
+                    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+                    lora_alpha=8,
+                    bias="none"
+                )
+
+                logger.info("Model loaded successfully with unsloth")
+
+            except Exception as unsloth_error:
+                # Fallback to standard transformers
+                logger.warning(f"Unsloth optimization failed: {str(unsloth_error)}. Falling back to standard model.")
+                from transformers import AutoModelForCausalLM
+
+                self.model = AutoModelForCausalLM.from_pretrained(
+                    MODEL_NAME,
+                    token=HUGGINGFACE_TOKEN,
+                    device_map="auto",
+                    torch_dtype=torch.float16,
+                    load_in_8bit=True
+                )
+                logger.info("Model loaded with standard transformers")
 
             logger.info("LLM initialized successfully")
             self.last_used = time.time()
@@ -492,14 +497,17 @@ Follow these requirements:
    with torch.inference_mode():
        try:
            logger.info("Generating news article...")
-            # Use unsloth's optimized generate method
+            # Check if we're using unsloth or standard model
+            is_unsloth = hasattr(model_manager.model, 'unsloth_module') if hasattr(model_manager.model, 'unsloth_module') else False
+
+            # Prepare inputs
            inputs = model_manager.tokenizer(
                prompt,
                return_tensors="pt",
                add_special_tokens=False
            ).to(model_manager.model.device)
 
-            # Generate with optimized settings
+            # Generate with appropriate settings
            outputs = model_manager.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
@@ -512,10 +520,24 @@
            )
 
            # Decode the generated text
-            generated_text = model_manager.tokenizer.decode(
-                outputs[0][inputs.input_ids.shape[1]:],
-                skip_special_tokens=True
-            )
+            if is_unsloth:
+                # Unsloth specific decoding
+                generated_text = model_manager.tokenizer.decode(
+                    outputs[0][inputs.input_ids.shape[1]:],
+                    skip_special_tokens=True
+                )
+            else:
+                # Standard transformers decoding
+                generated_text = model_manager.tokenizer.decode(
+                    outputs[0],
+                    skip_special_tokens=True
+                )
+                # Remove the prompt from the generated text
+                prompt_text = model_manager.tokenizer.decode(
+                    inputs.input_ids[0],
+                    skip_special_tokens=True
+                )
+                generated_text = generated_text.replace(prompt_text, "")
 
            # Clean up the generated text
            news_article = generated_text.strip()
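
For reference, a minimal standalone sketch of the load-with-fallback pattern that initialize_llm uses above: try unsloth's FastLanguageModel first, and fall back to a plain transformers AutoModelForCausalLM if that raises. The function name load_model_with_fallback is hypothetical, the unsloth keyword arguments (load_in_8bit, max_seq_length, device_map) are copied from the diff rather than independently verified, and the fallback omits 8-bit loading so the sketch does not require bitsandbytes.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model_with_fallback(model_name: str, hf_token: str):
    # Tokenizer is loaded up front either way, as in app.py.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    tokenizer.pad_token = tokenizer.eos_token
    try:
        # Preferred path: unsloth's optimized loader (arguments mirror the commit).
        from unsloth import FastLanguageModel
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=model_name,
            token=hf_token,
            load_in_8bit=True,
            max_seq_length=2048,
            device_map="auto",
        )
    except Exception as unsloth_error:
        # Fallback path: plain transformers with fp16 weights placed by accelerate.
        print(f"unsloth unavailable ({unsloth_error}); using transformers")
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            device_map="auto",
            torch_dtype=torch.float16,
        )
    return model, tokenizer

# Usage (token handling as in the Space):
# model, tokenizer = load_model_with_fallback("meta-llama/Llama-2-7b-chat-hf", HUGGINGFACE_TOKEN)

The two loading paths are also why the generation code branches on is_unsloth: the unsloth path slices the output tensor at the prompt boundary before decoding, while the transformers path decodes the full sequence and strips the prompt text afterwards.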