CamiloVega committed
Commit d8c6271 · verified · 1 Parent(s): cc6d9fd

Update app.py

Files changed (1): app.py +25 -32
app.py CHANGED
@@ -46,6 +46,9 @@ class ModelManager:
     def initialize_models(self):
         """Initialize models with optimized settings"""
         try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
             HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
             if not HUGGINGFACE_TOKEN:
                 raise ValueError("HUGGINGFACE_TOKEN environment variable not set")
@@ -53,6 +56,14 @@ class ModelManager:
             logger.info("Starting model initialization...")
             model_name = "meta-llama/Llama-2-7b-chat-hf"
 
+            # Configure 4-bit quantization
+            bnb_config = BitsAndBytesConfig(
+                load_in_4bit=True,
+                bnb_4bit_use_double_quant=True,
+                bnb_4bit_quant_type="nf4",
+                bnb_4bit_compute_dtype=torch.bfloat16
+            )
+
             # Load tokenizer with optimized settings
             logger.info("Loading tokenizer...")
             self.tokenizer = AutoTokenizer.from_pretrained(
@@ -63,43 +74,25 @@ class ModelManager:
             )
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
-            # Initialize model with Unsloth optimizations
-            logger.info("Loading model with Unsloth optimizations...")
-            model, tokenizer = FastLanguageModel.from_pretrained(
-                model_name=model_name,
+            # Initialize model with optimized settings
+            logger.info("Loading model...")
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_name,
                 token=HUGGINGFACE_TOKEN,
-                max_seq_length=512,
-                dtype="float16",
-                load_in_4bit=True,  # Use 4-bit quantization
-                device_map="auto",  # Automatically handle device mapping
-                kwargs=dict(
-                    use_gradient_checkpointing=True,
-                    use_flash_attention_2=True,
-                    use_merged_kernels=True,
-                )
-            )
-
-            # Apply additional optimizations
-            model = FastLanguageModel.get_peft_model(
-                model,
-                r=16,
-                target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
-                modules_to_save=None,
-                lora_alpha=16,
-                lora_dropout=0.05,
-                bias="none",
-                use_gradient_checkpointing=True,
-                random_state=42,
-                use_rslora=False,
-                use_dora=False,
+                device_map="auto",
+                torch_dtype=torch.float16,
+                quantization_config=bnb_config,
+                use_flash_attention_2=True,
+                use_cache=True,
+                attn_implementation="flash_attention_2",
+                low_cpu_mem_usage=True,
             )
 
-            self.model = model
-            logger.info("Model loaded successfully with Unsloth optimizations")
-
             # Create optimized pipeline
             logger.info("Creating pipeline...")
-            self.news_generator = FastLanguageModel.get_pipeline(
+            from transformers import pipeline
+            self.news_generator = pipeline(
+                "text-generation",
                 model=self.model,
                 tokenizer=self.tokenizer,
                 device_map="auto",
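
For reference, a minimal self-contained sketch of the loading path this commit switches to. Assumptions not shown in the hunks: the surrounding ModelManager class, `os`, and `logger` come from the rest of app.py, so they are stubbed here as module-level code; the pipeline call is truncated in the diff, so it is closed with only the arguments the hunk shows; and the commit's deprecated `use_flash_attention_2=True` spelling is dropped in favor of the equivalent `attn_implementation="flash_attention_2"` it also passes (flash attention additionally requires the flash-attn package to be installed).

import os
import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

logger = logging.getLogger(__name__)


def initialize_models():
    token = os.environ.get("HUGGINGFACE_TOKEN")
    if not token:
        raise ValueError("HUGGINGFACE_TOKEN environment variable not set")

    model_name = "meta-llama/Llama-2-7b-chat-hf"

    # 4-bit NF4 quantization with double quantization, as configured in the commit
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    logger.info("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
    tokenizer.pad_token = tokenizer.eos_token  # Llama-2 ships without a pad token

    logger.info("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=token,
        device_map="auto",
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        attn_implementation="flash_attention_2",
        low_cpu_mem_usage=True,
    )

    logger.info("Creating pipeline...")
    news_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device_map="auto",
    )
    return news_generator

Under these assumptions, generation through the resulting pipeline would look like `initialize_models()("Breaking news:", max_new_tokens=100)`; any generation parameters beyond what the diff shows are a guess, not part of the commit.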