darrenphodgson76 committed
Commit f2467aa · verified · 1 Parent(s): 3c0a089

Update train.py

Files changed (1)
  1. train.py +2 -15
train.py CHANGED
@@ -11,7 +11,7 @@ from trl import SFTTrainer
 
 
 def main():
-    # 1) Load 4-bit model + tokenizer
+    # 1) Load 4-bit model + tokenizer (SmolLM already chat-formatted)
     model_name = "HuggingFaceTB/SmolLM2-1.7B"
     max_seq_length = 768
     dtype = torch.float16
@@ -24,20 +24,7 @@ def main():
         load_in_4bit=load_in_4bit,
     )
 
-    # 2) Configure special tokens for chat format
-    special_tokens = {
-        "bos_token": "<|im_start|>user\n",
-        "eos_token": "<|im_end|>",
-        "pad_token": "<|im_end|>",
-        "additional_special_tokens": [
-            "<|im_start|>assistant\n"
-        ]
-    }
-    tokenizer.add_special_tokens(special_tokens)
-    model.resize_token_embeddings(len(tokenizer))
-    model.config.bos_token_id = tokenizer.bos_token_id
-    model.config.eos_token_id = tokenizer.eos_token_id
-    model.config.pad_token_id = tokenizer.pad_token_id
+    # 2) NO manual special-tokens injection or resizing — base model tokenizer already includes chat markers
 
     # 3) Load chat-formatted dataset
     df = pd.read_json("data.jsonl", lines=True)
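
The removed block hand-registered the ChatML markers and resized the embedding matrix; the commit instead relies on the base SmolLM2 tokenizer already shipping those markers. A minimal sanity-check sketch, assuming the standard transformers AutoTokenizer API and the same model name used in train.py, to confirm the markers resolve to real token ids before training:

from transformers import AutoTokenizer

# Sketch only: verify the chat markers the old code injected are already known
# to the stock tokenizer, so add_special_tokens / resize_token_embeddings is unnecessary.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B")

for marker in ("<|im_start|>", "<|im_end|>"):
    token_id = tokenizer.convert_tokens_to_ids(marker)
    known = token_id is not None and token_id != tokenizer.unk_token_id
    print(f"{marker!r} -> id {token_id} (known: {known})")

# eos/pad should also be usable as-is; pad may fall back to eos during training
print("eos:", tokenizer.eos_token, "| pad:", tokenizer.pad_token)

If either marker came back unknown, the deleted special-tokens block would still be needed.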