Zoro-chi committed
Commit 37ed2a0 · 1 Parent(s): f1cc7f4

Switch to TinyLlama GGUF model for much faster inference in Hugging Face Spaces

Files changed (3)
  1. .env.spaces +5 -2
  2. app/llm/model.py +220 -59
  3. requirements-hf.txt +1 -0
.env.spaces CHANGED
@@ -7,10 +7,13 @@ HF_SPACES=1
 TEXT_TO_IMAGE_APP_ID=c25dcd829d134ea98f5ae4dd311d13bc.node3.openfabric.network
 IMAGE_TO_3D_APP_ID=f0b5f319156c4819b9827000b17e511a.node3.openfabric.network
 
-# LLM Configuration for Spaces - use a tiny model that can run in limited memory
-MODEL_ID=microsoft/phi-1_5
+# LLM Configuration for Spaces - use a very fast model optimized for efficiency
+MODEL_ID=TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF
 USE_LOCAL_MODEL=true
 MODEL_QUANTIZED=true
+MODEL_TYPE=gguf
+MODEL_REVISION=main
+MODEL_FILENAME=tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
 
 # Data Directories (Spaces-friendly paths)
 IMAGE_OUTPUT_DIR=/tmp/data/images
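
As a quick orientation (not part of the commit), a minimal sketch of how these Spaces settings might be read at startup; python-dotenv and the explicit load_dotenv(".env.spaces") call are assumptions, only the variable names and values come from the file above.

# Minimal sketch; assumes python-dotenv is installed and .env.spaces is the active env file
import os
from dotenv import load_dotenv

load_dotenv(".env.spaces")

model_id = os.environ.get("MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
model_file = os.environ.get("MODEL_FILENAME")  # e.g. the Q4_K_M GGUF file above

print(f"LLM config: {model_id} ({model_type}, file={model_file})")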
app/llm/model.py CHANGED
@@ -2,21 +2,44 @@ import os
 from typing import Dict, List, Optional, Union
 import logging
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
 from pathlib import Path
+import json
+import tempfile
 
 logger = logging.getLogger(__name__)
 
+# Try to import transformers and ctransformers
+try:
+    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoConfig
+
+    HAS_TRANSFORMERS = True
+except ImportError:
+    HAS_TRANSFORMERS = False
+    logger.warning(
+        "Transformers library not found. Standard models won't be available."
+    )
+
+# Try to import ctransformers for GGUF support
+try:
+    from ctransformers import AutoModelForCausalLM as CTAutoModelForCausalLM
+
+    HAS_CTRANSFORMERS = True
+except ImportError:
+    HAS_CTRANSFORMERS = False
+    logger.warning("CTransformers library not found. GGUF models won't be available.")
+
 
 class LocalLLM:
     """
-    A wrapper for running local LLMs using the Hugging Face Transformers library.
+    A wrapper for running local LLMs using either Hugging Face Transformers or CTransformers.
     Optimized for creative prompt expansion and interpretation.
     """
 
     def __init__(
         self,
-        model_path: str = "microsoft/phi-1_5",  # Changed default to a much smaller model
+        model_path: str = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+        model_file: str = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+        model_type: str = "gguf",
         device_map: str = "auto",
         torch_dtype=None,
         use_quantization: bool = False,
@@ -26,34 +49,111 @@ class LocalLLM:
 
         Args:
             model_path: Path to model or HuggingFace model ID
+            model_file: Specific model file to load (for GGUF models)
+            model_type: Type of model ('transformers' or 'gguf')
             device_map: Device mapping strategy (default: "auto")
-            torch_dtype: Torch data type (default: bfloat16 if available, otherwise float16)
+            torch_dtype: Torch data type (default: float16)
             use_quantization: Whether to use 8-bit quantization to reduce memory usage
         """
         self.model_path = model_path
+        self.model_file = model_file
+        self.model_type = model_type.lower()
         self.device_map = device_map
         self.use_quantization = use_quantization
+        self.pipe = None
+        self.model = None
+        self.tokenizer = None
 
-        if torch_dtype is None:
-            # Set default dtype based on device
-            if device_map == "mps":
-                # Apple Silicon uses float16
-                self.torch_dtype = torch.float16
-            elif (
-                torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-            ):
-                # Modern NVIDIA GPUs use bfloat16
-                self.torch_dtype = torch.bfloat16
-            else:
-                # Default to float16 for other cases
-                self.torch_dtype = torch.float16
+        # Set torch dtype if using transformers models
+        if torch_dtype is None and self.model_type != "gguf":
+            self.torch_dtype = torch.float16
         else:
             self.torch_dtype = torch_dtype
 
         logger.info(f"Loading LLM from {model_path}")
-        logger.info(
-            f"Using device: {device_map}, dtype: {self.torch_dtype}, quantization: {use_quantization}"
-        )
+        logger.info(f"Model type: {model_type}, model file: {model_file}")
+
+        # Various loading strategies based on model type
+        if self.model_type == "gguf":
+            self._load_gguf_model()
+        else:
+            self._load_transformers_model()
+
+    def _load_gguf_model(self):
+        """Load a GGUF model using CTransformers"""
+        if not HAS_CTRANSFORMERS:
+            raise ImportError(
+                "CTransformers library not found but required for GGUF models"
+            )
+
+        try:
+            # Handle spaces and CPU constraints
+            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
+
+            # Determine model file - either specific file or default
+            if self.model_file:
+                model_file = self.model_file
+            else:
+                model_file = None  # Let ctransformers choose default
+
+            # For Hugging Face models with specific files
+            if "/" in self.model_path and self.model_file:
+                logger.info(
+                    f"Loading GGUF model from Hugging Face: {self.model_path}/{self.model_file}"
+                )
+
+                # CPU threads based on environment or default to 4
+                cpu_threads = int(os.environ.get("MODEL_CPU_THREADS", "4"))
+
+                # Very optimized settings for spaces
+                if spaces_mode:
+                    logger.info("Using optimized settings for Spaces environment")
+                    # Use context length of 512 for faster responses
+                    context_length = 512
+                    # Batch size 512 is good balance for small models
+                    batch_size = 512
+                else:
+                    # Standard settings for more powerful environments
+                    context_length = 2048
+                    batch_size = 1024
+
+                logger.info(
+                    f"Using context length: {context_length}, batch size: {batch_size}, CPU threads: {cpu_threads}"
+                )
+
+                # Create the model with optimized parameters
+                self.model = CTAutoModelForCausalLM.from_pretrained(
+                    self.model_path,
+                    model_file=self.model_file,
+                    model_type="llama",
+                    context_length=context_length,
+                    batch_size=batch_size,
+                    cpu_threads=cpu_threads,
+                    # Add streaming options for better memory usage and fast first token
+                    stream=True,
+                    reset=True,
+                )
+
+            else:
+                # Local path with model
+                logger.info(f"Loading local GGUF model: {self.model_path}")
+                self.model = CTAutoModelForCausalLM.from_pretrained(
+                    self.model_path,
+                    model_type="llama",
+                )
+
+            logger.info("GGUF model loaded successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to load GGUF model: {str(e)}")
+            raise
+
+    def _load_transformers_model(self):
+        """Load a model using Hugging Face transformers"""
+        if not HAS_TRANSFORMERS:
+            raise ImportError(
+                "Transformers library not found but required for standard models"
+            )
 
         try:
             # When running in Spaces, we need more conservative settings
@@ -74,7 +174,7 @@ class LocalLLM:
                     }
                 )
             else:
-                load_kwargs["device_map"] = device_map
+                load_kwargs["device_map"] = self.device_map
 
             # In Spaces, use more conservative loading options
             if spaces_mode:
@@ -89,21 +189,19 @@ class LocalLLM:
                     }
                 )
 
-            # For Phi models, use even more conservative settings
-            if "phi" in model_path.lower():
-                load_kwargs.update(
-                    {
-                        "torch_dtype": torch.float16,  # Force float16 for Phi model
-                    }
-                )
-
             # Skip the custom config handling for Spaces mode or small models
-            if spaces_mode or "phi" in model_path.lower():
-                model = AutoModelForCausalLM.from_pretrained(model_path, **load_kwargs)
-                tokenizer = AutoTokenizer.from_pretrained(model_path)
+            if (
+                spaces_mode
+                or "phi" in self.model_path.lower()
+                or "tiny" in self.model_path.lower()
+            ):
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, **load_kwargs
+                )
+                tokenizer = AutoTokenizer.from_pretrained(self.model_path)
             else:
                 # Standard local loading with our custom config handling
-                config = AutoConfig.from_pretrained(model_path)
+                config = AutoConfig.from_pretrained(self.model_path)
 
                 # Fix the rope_scaling issue for Llama models
                 if hasattr(config, "rope_scaling") and isinstance(
@@ -113,35 +211,30 @@ class LocalLLM:
                     logger.info("Fixed rope_scaling configuration with type=linear")
                 elif (
                     not hasattr(config, "rope_scaling")
-                    and "llama" in model_path.lower()
+                    and "llama" in self.model_path.lower()
                 ):
                     config.rope_scaling = {"type": "linear", "factor": 1.0}
                     logger.info("Added default rope_scaling configuration")
 
                 # Load the tokenizer
-                tokenizer = AutoTokenizer.from_pretrained(model_path)
+                tokenizer = AutoTokenizer.from_pretrained(self.model_path)
 
                 # Load the model with our fixed config
-                if device_map == "mps":
-                    # For Apple Silicon, load to device directly
-                    model = AutoModelForCausalLM.from_pretrained(
-                        model_path, config=config, **load_kwargs
-                    )
-                else:
-                    # For other devices, use the device_map parameter
-                    model = AutoModelForCausalLM.from_pretrained(
-                        model_path, config=config, **load_kwargs
-                    )
+                model = AutoModelForCausalLM.from_pretrained(
+                    self.model_path, config=config, **load_kwargs
+                )
 
             # Create the pipeline with our pre-loaded model and tokenizer
             self.pipe = pipeline(
                 "text-generation", model=model, tokenizer=tokenizer, framework="pt"
             )
+            self.model = model
+            self.tokenizer = tokenizer
 
-            logger.info("LLM loaded successfully")
+            logger.info("Transformers model loaded successfully")
 
         except Exception as e:
-            logger.error(f"Failed to load model: {str(e)}")
+            logger.error(f"Failed to load transformers model: {str(e)}")
             raise
 
     def generate(
@@ -165,6 +258,67 @@ class LocalLLM:
         Returns:
             The generated text
         """
+        # Different handling based on model type
+        if self.model_type == "gguf":
+            return self._generate_with_gguf(
+                prompt, system_prompt, max_tokens, temperature, top_p
+            )
+        else:
+            return self._generate_with_transformers(
+                prompt, system_prompt, max_tokens, temperature, top_p
+            )
+
+    def _generate_with_gguf(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+    ) -> str:
+        """Generate text using GGUF model"""
+        try:
+            # Format prompt for chat completion
+            formatted_prompt = prompt
+            if system_prompt:
+                # Format system and user prompts for chat
+                formatted_prompt = (
+                    f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>\n"
+                )
+
+            # Generate from the GGUF model
+            # Use a slightly more conservative max_new_tokens for spaces
+            spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
+            if spaces_mode:
+                max_tokens = min(max_tokens, 256)  # Cap at 256 for faster responses
+
+            start_time = os.times().user
+            response = self.model(
+                formatted_prompt,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                stop=["<|user|>", "<|system|>", "<|end|>"],
+            )
+            end_time = os.times().user
+            generation_time = end_time - start_time
+            logger.info(f"GGUF generation completed in {generation_time:.2f}s")
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Error during GGUF generation: {str(e)}")
+            return ""
+
+    def _generate_with_transformers(
+        self,
+        prompt: str,
+        system_prompt: Optional[str] = None,
+        max_tokens: int = 512,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+    ) -> str:
+        """Generate text using transformers pipeline"""
         # Format messages for chat-style models
         messages = []
 
@@ -192,7 +346,7 @@ class LocalLLM:
             return response
 
         except Exception as e:
-            logger.error(f"Error during generation: {str(e)}")
+            logger.error(f"Error during transformers generation: {str(e)}")
             return ""
 
     def expand_creative_prompt(self, prompt: str) -> str:
@@ -240,18 +394,23 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
     Returns:
         A LocalLLM instance or None if model loading fails
    """
-    # If model path not provided, first check for MODEL_PATH, then MODEL_ID from environment
-    if not model_path:
-        model_path = os.environ.get("MODEL_PATH") or os.environ.get(
-            "MODEL_ID", "microsoft/phi-1_5"  # Changed default to a smaller model
-        )
-
-    # Check if local models should be disabled (useful in restricted environments)
     use_local_model = os.environ.get("USE_LOCAL_MODEL", "true").lower() != "false"
     if not use_local_model:
         logger.info("Local model usage is disabled by environment setting")
         return None
 
+    # Default to environment settings with fallbacks
+    if not model_path:
+        model_path = os.environ.get("MODEL_PATH") or os.environ.get(
+            "MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
+        )
+
+    # Get model file for GGUF models
+    model_file = os.environ.get("MODEL_FILENAME")
+
+    # Check model type - prefer GGUF for speed in resource-constrained environments
+    model_type = os.environ.get("MODEL_TYPE", "transformers").lower()
+
     # Check if quantization is enabled
     use_quantization = os.environ.get("MODEL_QUANTIZED", "false").lower() == "true"
 
@@ -266,16 +425,18 @@ def get_llm_instance(model_path: Optional[str] = None) -> Optional[LocalLLM]:
     device_map = "auto"
     torch_dtype = None
 
-    # For Hugging Face Spaces, we need to be more careful about memory usage
+    # For Hugging Face Spaces, be more careful about memory usage
     spaces_mode = os.environ.get("HF_SPACES", "0") == "1"
-    if spaces_mode:
+    if spaces_mode and model_type != "gguf":
         logger.info("Running in Hugging Face Spaces, using CPU for stability")
-        # Force CPU for Spaces (most Spaces have very limited GPU resources)
+        # Force CPU for Spaces with transformers models
        device_map = "cpu" if not use_quantization else "auto"
 
-    # Create the LLM instance
+    # Create the LLM instance with appropriate settings
     return LocalLLM(
         model_path=model_path,
+        model_file=model_file,
+        model_type=model_type,
         device_map=device_map,
         torch_dtype=torch_dtype,
         use_quantization=use_quantization,
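
As a rough usage sketch (not part of the commit), the new GGUF path could be exercised as below; get_llm_instance() and the generate() parameters come from the diff above, while the package import path, environment setup, and prompt text are assumptions.

# Usage sketch; assumes the repo root is on PYTHONPATH so app/ imports as a package
import os

os.environ.setdefault("HF_SPACES", "1")
os.environ.setdefault("MODEL_ID", "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
os.environ.setdefault("MODEL_TYPE", "gguf")
os.environ.setdefault("MODEL_FILENAME", "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")

from app.llm.model import get_llm_instance

llm = get_llm_instance()
if llm is not None:
    # generate() routes to _generate_with_gguf() because MODEL_TYPE=gguf
    text = llm.generate(
        "A cozy cabin in a snowy forest",  # illustrative prompt
        system_prompt="Expand the prompt with vivid visual detail.",
        max_tokens=128,
    )
    print(text)

With HF_SPACES=1 and MODEL_TYPE=gguf, the loader above uses a 512-token context and caps max_new_tokens at 256, which is where the faster inference in Spaces comes from.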
requirements-hf.txt CHANGED
@@ -20,6 +20,7 @@ transformers>=4.43.0
 torch>=2.0.0
 huggingface_hub>=0.16.0
 accelerate>=0.21.0
+ctransformers>=0.2.24  # For GGUF model support
 
 # API and utilities
 fastapi>=0.100.0