Commit 62f25ac · Parent: 94a2d06
Committed by Zoro-chi

Fix GGUF model loading with proper config parameter structure

Files changed (2):
  1. app.py (+1, -1)
  2. app/llm/model.py (+19, -11)
app.py CHANGED
@@ -32,7 +32,7 @@ os.environ["MODEL_ID"] = (
     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # TinyLlama chat model
 )
 os.environ["MODEL_FILENAME"] = (
-    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Specific quantized file
+    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Correct model filename
 )
 os.environ["USE_LOCAL_MODEL"] = "true"
 os.environ["MODEL_TYPE"] = "gguf"  # Use ctransformers
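
For orientation, here is a minimal sketch of how a loader could consume the variables exported above. The variable names (MODEL_ID, MODEL_FILENAME, USE_LOCAL_MODEL, MODEL_TYPE) come from the diff; the read_model_settings helper and its fallback defaults are hypothetical, not code from this repo.

    import os

    # Hypothetical helper (not in this repo): collect the settings that
    # app.py exports via os.environ. Variable names match the diff above;
    # the fallback defaults are illustrative assumptions.
    def read_model_settings() -> dict:
        return {
            "model_id": os.environ.get("MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"),
            "model_file": os.environ.get("MODEL_FILENAME", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"),
            "use_local": os.environ.get("USE_LOCAL_MODEL", "false").lower() == "true",
            "model_type": os.environ.get("MODEL_TYPE", "gguf"),
        }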
app/llm/model.py CHANGED
@@ -113,8 +113,8 @@ class LocalLLM:
             f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
         )

-        # CPU threads based on environment or default to 4
-        cpu_threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))
+        # Get CPU threads from environment or default to 4
+        threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))

         # Very optimized settings for spaces
         if spaces_mode:
@@ -129,20 +129,28 @@ class LocalLLM:
             batch_size = 1024

         logger.info(
-            f"Using context length: {context_length}, batch size: {batch_size}, CPU threads: {cpu_threads}"
+            f"Using context length: {context_length}, batch size: {batch_size}, threads: {threads}"
         )

-        # Create the model with optimized parameters
+        # Create configuration dictionary with all parameters
+        config = {
+            "context_length": context_length,
+            "batch_size": batch_size,
+            "threads": threads,  # correct parameter name for CPU threads
+            "stream": True,  # for better memory usage and fast first token
+            "reset": True,  # reset context between generations
+            "max_new_tokens": 256,
+            "temperature": 0.7,
+            "top_p": 0.95,
+            "repetition_penalty": 1.1,
+        }
+
+        # Create the model with the config dictionary
         self.model = CTAutoModelForCausalLM.from_pretrained(
             self.model_path,
             model_file=self.model_file,
-            model_type="llama",
-            context_length=context_length,
-            batch_size=batch_size,
-            cpu_threads=cpu_threads,
-            # Add streaming options for better memory usage and fast first token
-            stream=True,
-            reset=True,
+            model_type="llama",  # required for Llama/TinyLlama models
+            config=config,
         )

    else:
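
As a standalone sanity check of the new loading path, the sketch below mirrors the config-dict call from the diff and streams a short completion. It assumes the ctransformers package is installed and accepts a plain dict for config, as this commit does; the fixed context_length, thread count, and prompt are illustrative stand-ins for values the real code computes at runtime.

    from ctransformers import AutoModelForCausalLM

    # Mirror the config dict built in app/llm/model.py; the concrete numbers
    # here are assumptions standing in for values the real code derives.
    config = {
        "context_length": 2048,
        "batch_size": 1024,
        "threads": 4,
        "stream": True,
        "reset": True,
        "max_new_tokens": 256,
        "temperature": 0.7,
        "top_p": 0.95,
        "repetition_penalty": 1.1,
    }

    model = AutoModelForCausalLM.from_pretrained(
        "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
        model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        model_type="llama",  # required for Llama/TinyLlama GGUF weights
        config=config,
    )

    # ctransformers models are callable; with stream=True they yield tokens
    # one at a time, which gives a fast first token.
    for token in model("Q: What is GGUF?\nA:", stream=True):
        print(token, end="", flush=True)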