vietnamese_hate_speech_detection

Sleeping

App Files Files Community

jesse-tong commited on Apr 8

Commit

37586a2

1 Parent(s): b8ddcf4

Add streamlit app

Browse files

Files changed (6) hide show

.gitignore +1 -1
api.py +108 -0
app.py +49 -0
dataset.py +5 -3
example_uses.md +13 -5
requirements.txt +2 -1

.gitignore CHANGED Viewed

@@ -8,4 +8,4 @@ __pycache__/
 metrics.txt
 predictions.txt
 *.pth
-news-category-dataset/

 metrics.txt
 predictions.txt
 *.pth
+vietnamese_hate_speech_detection_phobert/

api.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from model import DocBERT
+from models.lstm_model import DocumentBiLSTM
+from dataset import DataLoader, DocumentDataset
+from utils.word_segmentation_vi import word_segmentation_vi
+import numpy as np
+from transformers import AutoTokenizer
+import torch.nn.functional as F
+import torch
+# Inference settings shared by load_model_bert(), load_model_lstm() and inference() below.
+args = {
+    "bert_model": "vinai/phobert-base-v2", # Hugging Face model name; also used to load the tokenizer
+    "model_path": "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth", # State dict of the fine-tuned BERT model; change this if yours lives somewhere else
+    "lstm_model_path": "./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth", # Checkpoint of the distilled LSTM (its "model_state_dict" entry is loaded); change this if yours lives somewhere else
+    "max_seq_length": 250, # Passed to DocumentDataset as max_length
+    "num_classes": 4, # The fine-tuned model has 4 classes per category
+    "num_categories": 5, # The fine-tuned model has 5 categories
+}
+# Human-readable label for each predicted class id (list position == class index).
+class_names = ["NORMAL", "CLEAN", "OFFENSIVE", "HATE"]
+def load_model_bert():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model = DocBERT(bert_model_name=args["bert_model"], num_classes=args["num_classes"], num_categories=args["num_categories"])
+    model.load_state_dict(torch.load(args["model_path"], map_location=device))
+    model = model.to(device)
+    return model, device
+def load_model_lstm():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(args["bert_model"])
+    vocab_size = tokenizer.vocab_size
+    model = DocumentBiLSTM(vocab_size=vocab_size,
+                           embedding_dim=300,
+                           hidden_dim=256,
+                           n_layers=2,
+                           output_dim=args["num_classes"] * args["num_categories"])
+    model.load_state_dict(torch.load(args["lstm_model_path"], map_location=device)["model_state_dict"])
+    model = model.to(device)
+    return model, device
+def inference(model, device, comments: str | list):
+    if isinstance(comments, str):
+        comments = [comments]
+    elif not isinstance(comments, list):
+        raise ValueError("comment must be a string or a list of strings")
+    comments = np.array([word_segmentation_vi(comment) for comment in comments])
+    data = DocumentDataset(texts=comments, labels=None, tokenizer_name=args["bert_model"], max_length=args["max_seq_length"])
+    inference_loader = DataLoader(data, batch_size=comments.shape[0], shuffle=False)
+    batch = next(iter(inference_loader))
+    input_ids = batch['input_ids']
+    attention_mask = batch['attention_mask']
+    token_type_ids = batch['token_type_ids']
+    input_ids = input_ids.to(device)
+    attention_mask = attention_mask.to(device)
+    token_type_ids = token_type_ids.to(device)
+    with torch.no_grad():
+        outputs = model(input_ids, attention_mask=attention_mask)
+        if args["num_categories"] > 1:
+            batch_size, total_classes = outputs.shape
+            if total_classes % args["num_categories"] != 0:
+                raise ValueError(f"Error: Number of total classes in the batch must of divisible by {args["num_categories"]}")
+            classes_per_group = total_classes // args["num_categories"]
+            # Group every classes_per_group values along dim=1
+            reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, self., classes_per_group)
+            probs = F.softmax(reshaped, dim=1)
+            # Argmax over each group of classes_per_group
+            predictions = probs.argmax(dim=-1)
+        else:
+            predictions = torch.argmax(outputs, dim=-1)
+    preds_array = predictions.cpu().numpy()
+    result = []
+    for i in range(preds_array.shape[0]):
+        result.append(
+        {
+            "Bình luận": comments[i],
+            "Cá nhân": class_names[ preds_array[i, 0] ],
+            "Nhóm/tổ chức": class_names[ preds_array[i, 1] ],
+            "Tôn giáo/tín ngưỡng": class_names[ preds_array[i, 2] ],
+            "Chủng tộc/sắc tộc": class_names[ preds_array[i, 3] ],
+            "Chính trị": class_names[ preds_array[i, 4] ],
+        })
+    return result
+if __name__ == "__main__":
+    model, device = load_model_bert()
+    comments = [
+        "Để avata bít ngay là ngu hơn chó",
+        "Hàn Quốc chửi dân Đông Lào và đây là hậu quả",
+        "Nguyễn Thuận =)) tư tưởng rừng rú gì vậy",
+        "@công danh nguyen thể chế chính trị khác hẳn tư tưởng xã hội nhé. Con cờ hó china liên quan cmn gì?"
+    ]
+    predictions = inference(model, device, comments)
+    print("BERT Predictions:")
+    print(predictions)
+    lstm_model, device = load_model_lstm()
+    lstm_predictions = inference(lstm_model, device, comments)
+    print("LSTM Predictions:")
+    print(lstm_predictions)

app.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import streamlit as st
+from api import load_model_bert, load_model_lstm, inference
+import pandas as pd
+# Set up the Streamlit app
+def app():
+    st.title("Phân tích ngôn từ thù địch, phân biệt sử dụng PhoBERT và LSTM")
+      # Show loading progress bar
+    # Load models
+    @st.cache_resource
+    def load_models():
+        st.progress(0, "Nạp các mô hình...")
+        # Load BERT model
+        bert_model, bert_device = load_model_bert()
+        st.progress(50, "Mô hình PhoBERT đã được nạp.")
+        # Load LSTM model
+        lstm_model, lstm_device = load_model_lstm()
+        st.progress(100, "Mô hình LSTM đã được nạp.")  # Complete loading progress
+        return bert_model, bert_device, lstm_model, lstm_device
+    bert_model, bert_device, lstm_model, lstm_device = load_models()
+    # User input
+    user_input = st.text_area("Nhập các bình luận để phân tích ngôn từ thù địch, phân biệt (xuống dòng cho từng bình luận):")
+    if st.button("Phân tích"):
+        if user_input:
+            # Preprocess input
+            comments = user_input.splitlines()
+            # Inference with BERT
+            st.progress(0, "Đang phân tích với PhoBERT...")
+            bert_predictions = inference(bert_model, bert_device, comments)
+            st.write("BERT Predictions:")
+            st.dataframe(pd.DataFrame(bert_predictions))
+            st.progress(50, "Đang phân tích với LSTM...")
+            # Inference with LSTM
+            lstm_predictions = inference(lstm_model, lstm_device, comments)
+            st.write("LSTM Predictions:")
+            st.progress(100, "Phân tích hoàn tất!")
+            st.dataframe(pd.DataFrame(lstm_predictions))
+        else:
+            st.warning("Hãy nhập một vài bình luận.")
+# Build the page when this file is executed as a script (for example via `streamlit run app.py`).
+if __name__ == "__main__":
+    app()

dataset.py CHANGED Viewed

@@ -19,7 +19,9 @@ class DocumentDataset(Dataset):
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
         self.max_length = max_length
-        if type(labels) is not np.ndarray and type(labels) is not list:
             # Validate labels
             unique_labels = set(labels)
             min_label = min(unique_labels) if unique_labels else 0
@@ -67,7 +69,7 @@ class DocumentDataset(Dataset):
     def __getitem__(self, idx):
         text = str(self.texts[idx])
-        label = self.labels[idx]
         # Tokenize the text with attention mask and truncation
         encoding = self.tokenizer.encode_plus(
@@ -92,7 +94,7 @@ class DocumentDataset(Dataset):
         """Get original text for a given index"""
         return {
             'text': self.texts[idx],
-            'label': self.labels[idx]
         }
 def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):

         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
         self.max_length = max_length
+        if labels is None:
+            self.labels = np.zeros(len(texts), dtype=int)
+        elif type(labels) is not np.ndarray and type(labels) is not list:
             # Validate labels
             unique_labels = set(labels)
             min_label = min(unique_labels) if unique_labels else 0
     def __getitem__(self, idx):
         text = str(self.texts[idx])
+        label = self.labels[idx] if self.labels is not None else torch.tensor(0, dtype=torch.long)
         # Tokenize the text with attention mask and truncation
         encoding = self.tokenizer.encode_plus(
         """Get original text for a given index"""
         return {
             'text': self.texts[idx],
+            'label': self.labels[idx] if self.labels is not None else None
         }
 def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):

example_uses.md CHANGED Viewed

@@ -3,19 +3,27 @@
 - Train with BERT model (train.csv is ViTHSD dataset with 4 classes each for 5 categories)
 ```
-python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4
 ```
 - Inference with BERT model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
 ```
-python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./output/vinai_phobert-base-v2_finetuned.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
 - Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
 ```
-python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vinai_phobert-base-v2_finetuned/best_model.pth" --output_dir "./output" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```
 - Inference with distilled LSTM model (test_data.csv is test dataset with 4 classes like ag_news)
 ```
-python ./inference_lstm.py --model_path "./output/distilled_lstm_model.pth" --bert_tokenizer "vinai/phobert-base-v2" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
-```

 - Train with BERT model (train.csv is ViTHSD dataset with 4 classes each for 5 categories)
 ```
+python ./train.py --bert_model "vinai/phobert-base-v2" --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4 --output "./vietnamese_hate_speech_detection_phobert"
 ```
 - Inference with BERT model (test_data.csv is test dataset with 4 classes each for 5 categories like ViTHSD)
 ```
+python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
 - Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
 ```
+python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vietnamese_hate_speech_detection_phobert/vinai_phobert-base-v2_finetuned.pth" --output_dir "./vietnamese_hate_speech_detection_phobert" --batch_size 32 --epochs 10 --train_data_path "./datasets/train.csv" --val_data_path "./datasets/dev.csv" --test_data_path "./datasets/test.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```
 - Inference with distilled LSTM model (test_data.csv is test dataset with 4 classes like ag_news)
 ```
+python ./inference_lstm.py --model_path "./vietnamese_hate_speech_detection_phobert/distilled_lstm_model.pth" --bert_tokenizer "vinai/phobert-base-v2" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
+```
+## How to run:
+- Install the dependencies in requirements.txt: pip install -r requirements.txt
+- Either follow the "Train with BERT model" or "Train LSTM model from BERT model using distillation" steps in the Example uses section above, or git clone the model from: "https://huggingface.co/jesse-tong/vietnamese_hate_speech_detection_phobert"
+- Run the Streamlit app: streamlit run app.py, then either go to http://localhost:8501 or wait for the browser tab to open.

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ datasets
 torchtext
 maturin
 underthesea --only-binary :all:
-accelerate

 torchtext
 maturin
 underthesea --only-binary :all:
+accelerate
+streamlit