Fix GGUF model loading with proper config parameter structure
- app.py +1 -1
- app/llm/model.py +19 -11
app.py CHANGED
@@ -32,7 +32,7 @@ os.environ["MODEL_ID"] = (
     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"  # TinyLlama chat model
 )
 os.environ["MODEL_FILENAME"] = (
-    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  #
+    "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"  # Correct model filename
 )
 os.environ["USE_LOCAL_MODEL"] = "true"
 os.environ["MODEL_TYPE"] = "gguf"  # Use ctransformers
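The block above only names a Hub repo and a file inside it. As a quick sanity check, independent of this commit, the pair can be resolved with huggingface_hub; a minimal sketch, with the variable name being illustrative:

# Sketch: verify the repo/filename pair set in app.py actually resolves on the Hub.
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
)
print(gguf_path)  # local cache path of the downloaded GGUF file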
app/llm/model.py CHANGED
@@ -113,8 +113,8 @@ class LocalLLM:
                 f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
             )
 
-            # CPU threads
-
+            # Get CPU threads from environment or default to 4
+            threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))
 
             # Very optimized settings for spaces
             if spaces_mode:
@@ -129,20 +129,28 @@ class LocalLLM:
                 batch_size = 1024
 
             logger.info(
-                f"Using context length: {context_length}, batch size: {batch_size},
+                f"Using context length: {context_length}, batch size: {batch_size}, threads: {threads}"
             )
 
-            # Create
+            # Create configuration dictionary with all parameters
+            config = {
+                "context_length": context_length,
+                "batch_size": batch_size,
+                "threads": threads,  # Correct parameter for CPU threads
+                "stream": True,  # For better memory usage and fast first token
+                "reset": True,  # Reset context between generations
+                "max_new_tokens": 256,
+                "temperature": 0.7,
+                "top_p": 0.95,
+                "repetition_penalty": 1.1,
+            }
+
+            # Create the model with the config dictionary
             self.model = CTAutoModelForCausalLM.from_pretrained(
                 self.model_path,
                 model_file=self.model_file,
-                model_type="llama",
-
-                batch_size=batch_size,
-                cpu_threads=cpu_threads,
-                # Add streaming options for better memory usage and fast first token
-                stream=True,
-                reset=True,
+                model_type="llama",  # required for Llama/TinyLlama models
+                config=config,
             )
 
         else:
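For context, the settings collected in the config dictionary above (context_length, batch_size, threads, stream, reset, max_new_tokens, temperature, top_p, repetition_penalty) are ctransformers Config fields, which the library also accepts directly as keyword arguments to from_pretrained. A minimal standalone sketch, not the Space's own code, with illustrative values:

# Sketch: load the same GGUF model with ctransformers and stream a reply.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
    model_file="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    model_type="llama",    # needed so ctransformers picks the llama backend
    context_length=2048,   # illustrative; the Space computes this per mode
    batch_size=1024,
    threads=4,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

# Streaming keeps memory usage flat and returns the first token quickly.
for piece in llm("Explain GGUF quantization in one sentence:", stream=True):
    print(piece, end="", flush=True)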