Spaces:

darrenphodgson76
/

SmolLM2-1.7B-Instruct-Bussiness-Analysis

Paused

App Files Files Community

darrenphodgson76 committed on Apr 18

Commit

8ba34e8

verified ·

1 Parent(s): b6a51b0

Update train.py

Browse files

Files changed (1) hide show

train.py +56 -39

train.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# ✅ Final train.py with ZIP logic added
 import unsloth  # must be first
 import pandas as pd
 import torch
@@ -7,75 +7,92 @@ from transformers import TrainingArguments
 from unsloth import FastLanguageModel
 from trl import SFTTrainer
 import os
-import shutil
 import zipfile
-# Load Unsloth model
 model, tokenizer = FastLanguageModel.from_pretrained(
-    model_name = "HuggingFaceTB/SmolLM2-1.7B",
-    max_seq_length = 2048,
-    dtype = torch.float16,
-    load_in_4bit = True,
 )
-# Get eos_token after tokenizer is loaded
-eos_token = tokenizer.eos_token or "</s>"# Fallback if eos_token is None
-# Load and format your dataset
 df = pd.read_csv("data.csv")
-df["text"] = df.apply(lambda row: f"### Instruction:\n{row['instruction']}\n\n### Response:\n{row['response']} {eos_token}", axis=1)
 dataset = Dataset.from_pandas(df[["text"]])
-# Apply LoRA without task_type
 model = FastLanguageModel.get_peft_model(
     model,
-    r = 8,
-    lora_alpha = 32,
-    lora_dropout = 0.05,
-    bias = "none",
 )
-# Tokenize text
 def tokenize(example):
-    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)
 tokenized_dataset = dataset.map(tokenize, batched=True)
-# Set up training
 training_args = TrainingArguments(
-    output_dir = "./output_model",
-    per_device_train_batch_size = 2,
-    num_train_epochs = 3,
-    learning_rate = 2e-4,
-    logging_steps = 10,
-    save_steps = 100,
-    fp16 = True,
 )
-# Train
 trainer = SFTTrainer(
-    model = model,
-    tokenizer = tokenizer,
-    args = training_args,
-    train_dataset = tokenized_dataset,
 )
 trainer.train()
-# Save the fine-tuned LoRA adapter
 output_dir = "./output_model"
 os.makedirs(output_dir, exist_ok=True)
 model.save_pretrained(output_dir)
-# ✅ Zip it for download
 zip_path = "/home/user/app/model.zip"
 try:
-    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
         for root, _, files in os.walk(output_dir):
-            for file in files:
-                full_path = os.path.join(root, file)
-                rel_path = os.path.relpath(full_path, output_dir)
-                zipf.write(full_path, rel_path)
     print(f"✅ Zipped model to {zip_path}")
 except Exception as e:
-    print(f"❌ Failed to zip model: {e}")

+# ✅ Final train.py with EOS-as-pad and stop_sequences
 import unsloth  # must be first
 import pandas as pd
 import torch
 from unsloth import FastLanguageModel
 from trl import SFTTrainer
 import os
 import zipfile
+# 1) Load Unsloth model + tokenizer
 model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name="HuggingFaceTB/SmolLM2-1.7B",
+    max_seq_length=2048,
+    dtype=torch.float16,
+    load_in_4bit=True,
 )
+# 2) Reuse the existing eos_token as pad_token so generation will stop
+eos_token = tokenizer.eos_token  # should be "<|endoftext|>"
+tokenizer.pad_token = eos_token
+model.config.pad_token_id = tokenizer.eos_token_id
+# 3) Load & format your dataset, always ending responses with the EOS token
 df = pd.read_csv("data.csv")
+df["text"] = df.apply(
+    lambda row: (
+        "### Instruction:\n"
+        + row["instruction"].strip()
+        + "\n\n### Response:\n"
+        + row["response"].strip()
+        + eos_token
+    ),
+    axis=1
+)
 dataset = Dataset.from_pandas(df[["text"]])
+# 4) Apply LoRA without task_type
 model = FastLanguageModel.get_peft_model(
     model,
+    r=8,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
 )
+# 5) Tokenize
 def tokenize(example):
+    return tokenizer(
+        example["text"],
+        truncation=True,
+        padding="max_length",
+        max_length=512,
+    )
 tokenized_dataset = dataset.map(tokenize, batched=True)
+# 6) Set up training arguments
 training_args = TrainingArguments(
+    output_dir="./output_model",
+    per_device_train_batch_size=2,
+    num_train_epochs=3,
+    learning_rate=2e-4,
+    logging_steps=10,
+    save_steps=100,
+    fp16=True,
 )
+# 7) Initialize SFTTrainer with stop_sequences
 trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    args=training_args,
+    train_dataset=tokenized_dataset,
+    stop_sequences=[eos_token],
 )
+# 8) Train!
 trainer.train()
+# 9) Save the fine‑tuned LoRA adapter
 output_dir = "./output_model"
 os.makedirs(output_dir, exist_ok=True)
 model.save_pretrained(output_dir)
+# 🔧 Zip the model for download
 zip_path = "/home/user/app/model.zip"
 try:
+    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zipf:
         for root, _, files in os.walk(output_dir):
+            for fname in files:
+                full = os.path.join(root, fname)
+                rel = os.path.relpath(full, output_dir)
+                zipf.write(full, rel)
     print(f"✅ Zipped model to {zip_path}")
 except Exception as e:
+    print(f"❌ Failed to zip model: {e}")