Fix GGUF model loading with proper config parameter structure
- app.py +1 -1
- app/llm/model.py +19 -11
app.py CHANGED
@@ -32,7 +32,7 @@ os.environ["MODEL_ID"] = (
     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # TinyLlama chat model
 )
 os.environ["MODEL_FILENAME"] = (
-    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  #
+    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Correct model filename
 )
 os.environ["USE_LOCAL_MODEL"] = "true"
 os.environ["MODEL_TYPE"] = "gguf"  # Use ctransformers
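The block above only names a Hub repo and a file inside it. As a quick sanity check, independent of this commit, the pair can be resolved with huggingface_hub; a minimal sketch, with the variable name being illustrative:

# Sketch: verify the repo/filename pair set in app.py actually resolves on the Hub.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)
print(gguf_path)  # local cache path of the downloaded GGUF file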
app/llm/model.py CHANGED
@@ -113,8 +113,8 @@ class LocalLLM:
                 f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
             )
 
-            # CPU threads
-
+            # Get CPU threads from environment or default to 4
+            threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))
 
             # Very optimized settings for spaces
             if spaces_mode:
@@ -129,20 +129,28 @@ class LocalLLM:
                 batch_size = 1024
 
             logger.info(
-                f"Using context length: {context_length}, batch size: {batch_size},
+                f"Using context length: {context_length}, batch size: {batch_size}, threads: {threads}"
             )
 
-            # Create
+            # Create configuration dictionary with all parameters
+            config = {
+                "context_length": context_length,
+                "batch_size": batch_size,
+                "threads": threads,  # Correct parameter for CPU threads
+                "stream": True,  # For better memory usage and fast first token
+                "reset": True,  # Reset context between generations
+                "max_new_tokens": 256,
+                "temperature": 0.7,
+                "top_p": 0.95,
+                "repetition_penalty": 1.1,
+            }
+
+            # Create the model with the config dictionary
             self.model = CTAutoModelForCausalLM.from_pretrained(
                 self.model_path,
                 model_file=self.model_file,
-                model_type="llama",
-
-                batch_size=batch_size,
-                cpu_threads=cpu_threads,
-                # Add streaming options for better memory usage and fast first token
-                stream=True,
-                reset=True,
+                model_type="llama",  # required for Llama/TinyLlama models
+                config=config,
             )
 
         else:
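For context, the settings collected in the config dictionary above (context_length, batch_size, threads, stream, reset, max_new_tokens, temperature, top_p, repetition_penalty) are ctransformers Config fields, which the library also accepts directly as keyword arguments to from_pretrained. A minimal standalone sketch, not the Space's own code, with illustrative values:

# Sketch: load the same GGUF model with ctransformers and stream a reply.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    model_type="llama",    # needed so ctransformers picks the llama backend
    context_length=2048,   # illustrative; the Space computes this per mode
    batch_size=1024,
    threads=4,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

# Streaming keeps memory usage flat and returns the first token quickly.
for piece in llm("Explain GGUF quantization in one sentence:", stream=True):
    print(piece, end="", flush=True)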