vietnamese_hate_speech_detection

Sleeping

App Files Files Community

jesse-tong committed on Apr 11

Commit

99575b1

1 Parent(s): 95b94fd

Increase threshold

Browse files

Files changed (7) hide show

api.py +17 -5
example_uses.md +3 -3
inference_example.py +1 -1
inference_lstm.py +1 -1
knowledge_distillation.py +5 -3
trainer.py +2 -2
utils/convert_vihsd_gemini.py +4 -4

api.py CHANGED Viewed

@@ -40,7 +40,7 @@ def load_model_lstm():
     model = model.to(device)
     return model, device
-def inference(model, device, comments: str | list, threshold: float = 0.5):
     if isinstance(comments, str):
         comments = [comments]
     elif not isinstance(comments, list):
@@ -73,6 +73,7 @@ def inference(model, device, comments: str | list, threshold: float = 0.5):
             # Keep only the probs that are above the threshold (to prevent false positive), else set it to 0 (NORMAL, in this case unconclusive)
             probs = torch.where(probs > threshold, probs, 0.0)
             # Argmax over each group of classes_per_group
             predictions = probs.argmax(dim=-1)
         else:
@@ -95,11 +96,22 @@ def inference(model, device, comments: str | list, threshold: float = 0.5):
 if __name__ == "__main__":
     model, device = load_model_bert()
     comments = [
-        "Để avata bít ngay là ngu hơn chó",
-        "Hàn Quốc chửi dân Đông Lào và đây là hậu quả",
-        "Nguyễn Thuận =)) tư tưởng rừng rú gì vậy",
-        "@công danh nguyen thể chế chính trị khác hẳn tư tưởng xã hội nhé. Con cờ hó china liên quan cmn gì?"
     ]
     predictions = inference(model, device, comments)
     print("BERT Predictions:")

     model = model.to(device)
     return model, device
+def inference(model, device, comments: str | list, threshold: float = 0.55):
     if isinstance(comments, str):
         comments = [comments]
     elif not isinstance(comments, list):
             # Keep only the probs that are above the threshold (to prevent false positives), else set them to 0 (NORMAL, in this case inconclusive)
             probs = torch.where(probs > threshold, probs, 0.0)
+            print("Probabilities: ", probs)
             # Argmax over each group of classes_per_group
             predictions = probs.argmax(dim=-1)
         else:
 if __name__ == "__main__":
     model, device = load_model_bert()
+    '''comments = [
+        "Em ăn hoành thánh sáng bị khó chịu mắc ói quá bỏ ăn trưa luôn. Các thím thường hay uống gì cho đỡ vậy? Em tính làm gói gừng pha uống",
+        "Quan trọng là năm nay có tham gia những lễ hội có tính chất, quy mô và bối cảnh y hệt vậy không? Chứ tôi nói thật, dù ở bất cứ đâu mà tập trung đông đến mức không tiến không lùi như này được thì đều nguy hiểm. Khoan nói về giẫm đạp, chỉ riêng việc có sự cố đột xuất xảy ra thì chuyện cấp cứu nó sẽ vô cùng khó khăn và mất rất nhiều thời gian. Bởi vậy, tôi từ chối tham gia tất cả lễ hội nơi mà số người vượt tải đến mức không thể nhúc nhích như thế này.",
+        "Còn phải tốn hơn nữa mới được",
+        "Mình k có ý kích dục fen nhé :v Có sao kể vậy thôi.",
+        "Này là lúc trước khi gặp P hả bác? Em thắc mắc là bác có thể thẳng thừng chặn C - người bác yêu như vậy à?",
+        "Thì mượt hơn là đúng thôi. Mới phát triển thì không có nhiều tính năng, không có nhiều app thì chả mượt",
+    ]'''
     comments = [
+        "đúng là vozer, nhiều thằng sống ngu và ích kỷ vcl, nếu như người yêu nó cần 1 trái thận, lúc đó bản thân suy nghĩ tính toán thì ok, này chạy xe có 40km mà tính toán chi ly, mua cái váy mà mặc đi",
+        "Khác mẹ gì tàu khựa, bơm tiền cho đám NGO woke đi biểu tình phá lại bọn tây lông thôi. Chó chê mèo lắm lông. À mà acc Emma Roberts bị ban rồi à mày",
+        "đùa, cái shop thế mà cũng bảo chính hãng, vả vỡ alo nó đi. ra trung tâm thương mại, hay cửa hàng chính hãng mà mua.",
+        "qua thớt này của nó thì 90% là xiaolol rùi",
+        "thằng này chuyên đăng bài để hả hê, khóa mõm nó đi mod",
+        "Đm nhẫm vào đuổi con bò đỏ này nó giãy nảy cắn người kinh thật @@ Tao có hay ko liên quan lol gì mà mày có vẻ cay cú vkl nhỉ, chắc gato với tao hả ))",
+        "Sao thế óc chó, bị chửi cho ngu người rồi à =]] thứ ngu học chả biết mẹ gì vào sủa như đúng rồi =]]",
     ]
     predictions = inference(model, device, comments)
     print("BERT Predictions:")

example_uses.md CHANGED Viewed

@@ -1,16 +1,16 @@
 ## Example uses:
-- Train with BERT model (train.csv is ViTHSD dataset with 4 classes each for 5 categories)
 ```
 python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4 --output "./vietnamese_hate_speech_detection_phobert"
 ```
-- Inference with BERT model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
 ```
 python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
-- Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
 ```
 python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --output_dir "./vietnamese_hate_speech_detection_phobert" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```

 ## Example uses:
+- Train the PhoBERT model (train.csv is the ViTHSD dataset, with 4 classes for each of the 5 categories)
 ```
 python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4 --output "./vietnamese_hate_speech_detection_phobert"
 ```
+- Run inference with the PhoBERT model (test.csv is a test dataset with 4 classes for each of the 5 categories, like ViTHSD)
 ```
 python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
+- Train an LSTM model from the PhoBERT model using distillation (the training dataset should be the same as the distillation training dataset)
 ```
 python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --output_dir "./vietnamese_hate_speech_detection_phobert" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```

inference_example.py CHANGED Viewed

@@ -19,7 +19,7 @@ if __name__ == "__main__":
     parser.add_argument("--class_names", type=str, nargs='+', required=False, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
-    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
     args = parser.parse_args()
     class_names = args.class_names

     parser.add_argument("--class_names", type=str, nargs='+', required=False, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
+    parser.add_argument("--threshold", type=float, default=0.55, help="Threshold for classification")
     args = parser.parse_args()
     class_names = args.class_names

inference_lstm.py CHANGED Viewed

@@ -30,7 +30,7 @@ if __name__ == "__main__":
     parser.add_argument("--hidden_dim", type=int, default=256, help="Hidden dimension of LSTM")
     parser.add_argument("--num_layers", type=int, default=2, help="Number of LSTM layers")
     parser.add_argument("--dropout", type=float, default=0.5, help="Dropout probability")
-    parser.add_argument("--threshold", type=float, default=0.5, help="Threshold for classification")
     args = parser.parse_args()
     class_names = args.class_names

     parser.add_argument("--hidden_dim", type=int, default=256, help="Hidden dimension of LSTM")
     parser.add_argument("--num_layers", type=int, default=2, help="Number of LSTM layers")
     parser.add_argument("--dropout", type=float, default=0.5, help="Dropout probability")
+    parser.add_argument("--threshold", type=float, default=0.55, help="Threshold for classification")
     args = parser.parse_args()
     class_names = args.class_names

knowledge_distillation.py CHANGED Viewed

@@ -231,7 +231,7 @@ class DistillationTrainer:
             logger.info(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")
             print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")
-    def evaluate(self, data_loader=None, phase="Validation"):
         """
         Evaluate the student model
         """
@@ -284,9 +284,11 @@ class DistillationTrainer:
                     classes_per_group = total_classes // self.num_categories
                     # Group every classes_per_group values along dim=1
                     reshaped = student_logits.view(student_logits.size(0), -1, classes_per_group)  # shape: (batch, self., classes_per_group)
                     # Argmax over each group of classes_per_group
-                    preds = reshaped.argmax(dim=-1)
                 else:
                     _, preds = torch.max(student_logits, 1)
                 all_preds = np.append(all_preds, preds.cpu().numpy())

             logger.info(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")
             print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}, Test F1: {test_f1:.4f}")
+    def evaluate(self, data_loader=None, phase="Validation", threshold=0.55):
         """
         Evaluate the student model
         """
                     classes_per_group = total_classes // self.num_categories
                     # Group every classes_per_group values along dim=1
                     reshaped = student_logits.view(student_logits.size(0), -1, classes_per_group)  # shape: (batch, self., classes_per_group)
+                    probs = F.softmax(reshaped, dim=1)
+                    # Keep only the probs that are above the threshold (to prevent false positives), else set them to 0 (NORMAL, in this case inconclusive)
+                    probs = torch.where(probs > threshold, probs, 0.0)
                     # Argmax over each group of classes_per_group
+                    preds = probs.argmax(dim=-1)
                 else:
                     _, preds = torch.max(student_logits, 1)
                 all_preds = np.append(all_preds, preds.cpu().numpy())

trainer.py CHANGED Viewed

@@ -219,7 +219,7 @@ class Trainer:
                        f"Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, F1: {test_f1:.4f}, ",
                        f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
-    def evaluate(self, data_loader, phase="Validation"):
         """
         Evaluation function for both validation and test sets
         """
@@ -280,7 +280,7 @@ class Trainer:
                     # Softmax and apply threshold
                     probs = torch.softmax(reshaped, dim=1)
-                    probs = torch.where(probs > 0.5, probs, 0.0)
                     # Argmax over each group of classes_per_group
                     preds = probs.argmax(dim=-1)
                 else:

                        f"Loss: {test_loss:.4f}, Acc: {test_acc:.4f}, F1: {test_f1:.4f}, ",
                        f"Precision: {test_precision:.4f}, Recall: {test_recall:.4f}")
+    def evaluate(self, data_loader, phase="Validation", threshold=0.55):
         """
         Evaluation function for both validation and test sets
         """
                     # Softmax and apply threshold
                     probs = torch.softmax(reshaped, dim=1)
+                    probs = torch.where(probs > threshold, probs, 0.0)
                     # Argmax over each group of classes_per_group
                     preds = probs.argmax(dim=-1)
                 else:

utils/convert_vihsd_gemini.py CHANGED Viewed

@@ -42,7 +42,7 @@ def classify_text(model, text):
         print(f"Error classifying text: {e}")
         return None
-def process_file(input_file, output_file, model, rate_limit_pause=4):
     """Process a single CSV file to match the test.csv format"""
     print(f"Processing {input_file}...")
@@ -53,9 +53,9 @@ def process_file(input_file, output_file, model, rate_limit_pause=4):
         print(f"Error reading {input_file}: {e}")
         return
-    # Rename column free_text to content
-    if 'free_text' in df.columns:
-        df.rename(columns={'free_text': 'content'}, inplace=True)
     elif 'content' not in df.columns:
         print(f"Error: 'content' column not found in {input_file}")
         return

         print(f"Error classifying text: {e}")
         return None
+def process_file(input_file, output_file, model, rate_limit_pause=4, text_col="free_text"):
     """Process a single CSV file to match the test.csv format"""
     print(f"Processing {input_file}...")
         print(f"Error reading {input_file}: {e}")
         return
+    # Rename column text_col to content
+    if text_col in df.columns:
+        df.rename(columns={text_col: 'content'}, inplace=True)
     elif 'content' not in df.columns:
         print(f"Error: 'content' column not found in {input_file}")
         return