Commit 30ae36c
Parent(s): 946b455

Update datasets

Files changed:
- LICENSE.md (+15, -16)
- scripts/merge_datasets.py (+116, -0)
- utils/convert_vihsd_gemini.py (+166, -0)
- utils/word_segmentation_vi.py (+4, -0)
LICENSE.md
CHANGED
@@ -1,26 +1,25 @@
-AGPL-3.0
-
 This repository as a whole is licensed under the [GNU Affero General Public License v3.0 or any later version (AGPL v3.0 or later)](https://www.gnu.org/licenses/agpl-3.0.en.html).

 ## Third-Party Components

 This repository uses the following third-party components, each under their respective licenses:

-| Component | License | Description |
-|-----------|---------|-------------|
-| PhoBERT v2 | [AGPLv3.0](https://www.gnu.org/licenses/agpl-3.0.en.html) | Pre-trained language model for Vietnamese |
-| ViTHSD | [MIT License](https://opensource.org/licenses/MIT) | Vietnamese Targeted Hate Speech Detection |
-…
-…
-…
+| Component | License | Description | Link to repository (if possible) |
+|-----------|---------|-------------|----------------------------------|
+| PhoBERT v2 | [AGPLv3.0](https://www.gnu.org/licenses/agpl-3.0.en.html) | Pre-trained language model for Vietnamese | [vinai/phobert-base-v2](https://huggingface.co/vinai/phobert-base-v2) |
+| ViTHSD | [MIT License](https://opensource.org/licenses/MIT) | Vietnamese Targeted Hate Speech Detection Dataset | [bakansm/ViTHSD](https://github.com/bakansm/ViTHSD) |
+| ViHSD | [MIT License](https://opensource.org/licenses/MIT) | Vietnamese Hate Speech Detection Dataset | [sonlam1102/vihsd](https://huggingface.co/datasets/sonlam1102/vihsd) |
+| underthesea | [GPLv3](https://www.gnu.org/licenses/gpl-3.0.en.html) | Vietnamese NLP Toolkit | [undertheseanlp/underthesea](https://github.com/undertheseanlp/underthesea) |
+| transformers | [Apache License 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE) | State-of-the-art NLP library by Hugging Face | [huggingface/transformers](https://github.com/huggingface/transformers) |
+| torch (PyTorch) | [BSD License](https://github.com/pytorch/pytorch/blob/master/LICENSE) | Open-source machine learning library | [Repo (GitHub)](https://github.com/pytorch/pytorch/blob/master/LICENSE) |
 | datasets | [Apache License 2.0](https://github.com/huggingface/datasets/blob/main/LICENSE) | Dataset library by Hugging Face |
-| pandas | [BSD 3-Clause License](https://github.com/pandas-dev/pandas/blob/main/LICENSE) | Data analysis and manipulation library |
-| scikit-learn | [BSD 3-Clause License](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING) | Machine learning library for Python |
-| numpy | [BSD 3-Clause License](https://github.com/numpy/numpy/blob/main/LICENSE.txt) | Scientific computing library |
-| tokenizers | [Apache License 2.0](https://github.com/huggingface/tokenizers/blob/main/LICENSE) | Fast tokenizers library by Hugging Face |
-| torchtext | [BSD License](https://github.com/pytorch/text/blob/main/LICENSE) | Text processing utilities for PyTorch |
-| maturin | [MIT License or Apache License 2.0](https://github.com/PyO3/maturin/blob/main/license-mit) | Build and publish Rust extensions for Python |
-| accelerate | [Apache License 2.0](https://github.com/huggingface/accelerate/blob/main/LICENSE) | Library for easy PyTorch distributed training |
+| pandas | [BSD 3-Clause License](https://github.com/pandas-dev/pandas/blob/main/LICENSE) | Data analysis and manipulation library | |
+| scikit-learn | [BSD 3-Clause License](https://github.com/scikit-learn/scikit-learn/blob/main/COPYING) | Machine learning library for Python | |
+| numpy | [BSD 3-Clause License](https://github.com/numpy/numpy/blob/main/LICENSE.txt) | Scientific computing library | |
+| tokenizers | [Apache License 2.0](https://github.com/huggingface/tokenizers/blob/main/LICENSE) | Fast tokenizers library by Hugging Face | |
+| torchtext | [BSD License](https://github.com/pytorch/text/blob/main/LICENSE) | Text processing utilities for PyTorch | |
+| maturin | [MIT License or Apache License 2.0](https://github.com/PyO3/maturin/blob/main/license-mit) | Build and publish Rust extensions for Python | |
+| accelerate | [Apache License 2.0](https://github.com/huggingface/accelerate/blob/main/LICENSE) | Library for easy PyTorch distributed training | |

 ## AGPLv3.0 License Requirements

scripts/merge_datasets.py
ADDED
@@ -0,0 +1,116 @@
import os
import glob
import pandas as pd
import argparse
from tqdm import tqdm

def merge_datasets(input_dirs, output_dir, preserve_splits=False):
    """
    Merge CSV datasets from multiple directories into one directory.

    Args:
        input_dirs (list): List of input directory paths
        output_dir (str): Output directory path
        preserve_splits (bool): If True, keep separate train/dev/test outputs
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Define the expected columns for the format in test.csv
    expected_columns = ['content', 'individual', 'groups', 'religion/creed', 'race/ethnicity', 'politics']

    # Dictionary to hold dataframes for each split if preserving splits
    combined_data = {}
    if preserve_splits:
        combined_data = {'train': [], 'dev': [], 'test': []}
    else:
        combined_data['all'] = []

    # Process each input directory
    for input_dir in input_dirs:
        print(f"Processing directory: {input_dir}")

        # Find all CSV files in the directory
        csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

        for file_path in tqdm(csv_files, desc=f"Processing files in {os.path.basename(input_dir)}"):
            file_name = os.path.basename(file_path)

            # Read the CSV file
            try:
                df = pd.read_csv(file_path)
                print(f" Reading {file_name}: {len(df)} rows")
            except Exception as e:
                print(f" Error reading {file_name}: {e}")
                continue

            # Rename 'free_text' column to 'content' if it exists
            if 'free_text' in df.columns:
                df.rename(columns={'free_text': 'content'}, inplace=True)

            # Check if 'content' column exists
            if 'content' not in df.columns:
                print(f" Warning: 'content' column not found in {file_name}. Skipping.")
                continue

            # Ensure all required columns exist
            for col in expected_columns:
                if col != 'content' and col not in df.columns:
                    df[col] = 0  # Set default value for missing columns

            # Convert category columns to integer type
            for col in expected_columns:
                if col != 'content' and col in df.columns:
                    df[col] = df[col].fillna(0).astype(int)

            # Drop unnecessary columns
            df = df[expected_columns]

            # Determine which split this file belongs to
            if preserve_splits:
                if 'train' in file_name.lower():
                    combined_data['train'].append(df)
                elif 'dev' in file_name.lower():
                    combined_data['dev'].append(df)
                elif 'test' in file_name.lower():
                    combined_data['test'].append(df)
                else:
                    # If not explicitly marked, add to all splits
                    for split in ['train', 'dev', 'test']:
                        combined_data[split].append(df)
            else:
                combined_data['all'].append(df)

    # Combine and save the data
    for split, dfs in combined_data.items():
        if not dfs:
            print(f"No data for {split} split")
            continue

        combined_df = pd.concat(dfs, ignore_index=True)

        # Remove duplicates
        combined_df = combined_df.drop_duplicates(subset=['content'])

        # Save to output directory
        output_file = os.path.join(output_dir, f"{split}.csv" if preserve_splits else "combined.csv")
        combined_df.to_csv(output_file, index=False)
        print(f"Saved {len(combined_df)} rows to {output_file}")

def main():
    parser = argparse.ArgumentParser(description="Merge CSV datasets from multiple directories")
    parser.add_argument("--input_dirs", required=True, nargs='+',
                        help="List of input directory paths containing CSV files")
    parser.add_argument("--output_dir", required=True,
                        help="Output directory path for merged datasets")

    args = parser.parse_args()

    merge_datasets(
        args.input_dirs,
        args.output_dir,
        preserve_splits=True
    )

if __name__ == "__main__":
    main()
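Usage note (not part of the diff above): merge_datasets.py is driven through its argparse CLI, and the equivalent direct call looks like the sketch below; the dataset paths and import setup are placeholders, not paths defined by this commit.

# Sketch of how the merge script can be invoked; all paths are placeholders.
# CLI equivalent (assumed layout):
#   python scripts/merge_datasets.py --input_dirs data/vithsd data/vihsd --output_dir data/merged
from merge_datasets import merge_datasets  # assumes scripts/ is on the import path

merge_datasets(
    input_dirs=["data/vithsd", "data/vihsd"],  # placeholder directories holding the split CSVs
    output_dir="data/merged",                  # placeholder output directory
    preserve_splits=True,                      # writes train.csv, dev.csv and test.csv
)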
utils/convert_vihsd_gemini.py
ADDED
@@ -0,0 +1,166 @@
import os
import glob
import pandas as pd
import argparse
from google import genai
from tqdm import tqdm
import time
from word_segmentation_vi import word_segmentation_vi

def setup_genai(api_key):
    """Configure the Google Generative AI client with your API key"""
    return genai.Client(api_key=api_key)

def classify_text(model, text):
    """Classify Vietnamese text into hate speech categories using Google's Generative AI"""
    prompt = f"""
    Analyze the following Vietnamese text for hate speech (each sentence is separated by a newline):
    "{text}"

    Rate it on these categories (0=NORMAL, 1=CLEAN, 2=OFFENSIVE, 3=HATE):
    - individual (targeting specific individuals)
    - groups (targeting groups or organizations)
    - religion/creed (targeting religious groups or beliefs)
    - race/ethnicity (racial/ethnic hate speech)
    - politics (political hate speech)
    If the text doesn't specify a person or group in a category, return 0 for that category.
    Else, return 1 for CLEAN, 2 for OFFENSIVE, or 3 for HATE.

    For each sentence in the text, return only 5 numbers separated by commas (corresponding to the labels for individual, groups, religion/creed, race/ethnicity, politics), with the numbers for each sentence separated by newlines, like (with no other text):
    0,1,0,0,0
    1,0,0,0,2
    """

    try:
        response = model.models.generate_content(model="gemini-2.0-flash", contents=prompt)
        values = response.text.strip().split('\n')
        values = [line.split(',') for line in values]
        return values
    except Exception as e:
        print(f"Error classifying text: {e}")
        return None

def process_file(input_file, output_file, model, rate_limit_pause=4):
    """Process a single CSV file to match the test.csv format"""
    print(f"Processing {input_file}...")

    # Read the input file
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
        return

    # Rename column free_text to content
    if 'free_text' in df.columns:
        df.rename(columns={'free_text': 'content'}, inplace=True)
    elif 'content' not in df.columns:
        print(f"Error: 'content' column not found in {input_file}")
        return

    # Ensure all required columns exist
    category_columns = ['individual', 'groups', 'religion/creed', 'race/ethnicity', 'politics']
    for col in category_columns:
        if col not in df.columns:
            # Create the missing category column with a default of 0
            df[col] = 0

    # Process each batch (100 rows at a time)
    batch_size = 100
    for start in tqdm(range(0, len(df), batch_size), desc="Processing batches"):
        end = min(start + batch_size, len(df))
        batch_df = df.iloc[start:end]

        # Skip if all categories already have values
        if all(batch_df[cat].all() != 0 for cat in category_columns):
            continue

        # Join the batch rows with newlines and classify them all at once
        text_to_classify = "\n".join([str(sentence) for sentence in batch_df['content'].tolist()])
        classifications = classify_text(model, text_to_classify)

        # Try 2 more times, else skip
        if classifications is None:
            for _ in range(2):
                classifications = classify_text(model, text_to_classify)
                if classifications is not None:
                    break
                time.sleep(rate_limit_pause)
            else:
                print(f"Error classifying batch starting at index {start}. Skipping...")
                continue

        try:
            # Update the DataFrame with the classifications
            for i, row in enumerate(classifications):
                for j, col in enumerate(category_columns):
                    df.at[start + i, col] = int(row[j])
        except Exception as e:
            # Retry the batch if the response could not be parsed
            for _ in range(2):
                classifications = classify_text(model, text_to_classify)
                if classifications is not None:
                    break
                time.sleep(rate_limit_pause)
            else:
                print(f"Error classifying batch starting at index {start}. Skipping...")
                continue

            try:
                for i, row in enumerate(classifications):
                    for j, col in enumerate(category_columns):
                        df.at[start + i, col] = int(row[j])
            except Exception as e:
                print(f"Error updating DataFrame: {e}")
                continue

        time.sleep(rate_limit_pause)

    # Apply word segmentation to the content column
    df['content'] = df['content'].apply(lambda x: word_segmentation_vi(str(x)))

    # Save the processed file; export the category columns as int
    for col in category_columns:
        df[col] = df[col].astype(int)
    # Drop label_id column if it exists
    if 'label_id' in df.columns:
        df.drop(columns=['label_id'], inplace=True)
    df.to_csv(output_file, index=False)
    print(f"Saved processed file to {output_file}")

def main():
    parser = argparse.ArgumentParser(description="Process ViHSD CSV files with Google Generative AI")
    parser.add_argument("--input_dir", required=True, help="Directory containing input CSV files")
    parser.add_argument("--output_dir", required=True, help="Directory to save processed files")
    parser.add_argument("--api_key", required=True, help="Google Generative AI API key")
    parser.add_argument("--pause", type=float, default=4.0, help="Pause between API calls (seconds)")

    args = parser.parse_args()

    # Ensure output directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Setup Google Generative AI
    model = setup_genai(args.api_key)

    # Get all CSV files in the input directory
    csv_files = glob.glob(os.path.join(args.input_dir, "*.csv"))
    if not csv_files:
        print(f"No CSV files found in {args.input_dir}")
        return

    print(f"Found {len(csv_files)} CSV files to process")

    # Process each file
    for input_file in csv_files:
        output_file = os.path.join(args.output_dir, os.path.basename(input_file))
        if os.path.exists(output_file):
            print(f"Output file {output_file} already exists. Skipping...")
            continue
        process_file(input_file, output_file, model, args.pause)

if __name__ == "__main__":
    # This script is used to process ViHSD CSV files with Google Generative AI
    # First, git clone from https://huggingface.co/datasets/sonlam1102/vihsd
    main()
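One fragile spot in convert_vihsd_gemini.py is that the reply from Gemini is split on newlines and commas and written straight into the DataFrame. A minimal sketch of a stricter check, assuming the return format of classify_text above (this helper is illustrative and not part of the commit):

# Illustrative helper, not part of the commit: validate the list-of-lists
# returned by classify_text before writing it into the DataFrame.
def validate_classifications(classifications, expected_rows, num_labels=5):
    """Return True if there is one well-formed row of labels per input sentence."""
    if classifications is None or len(classifications) != expected_rows:
        return False
    for row in classifications:
        if len(row) != num_labels:
            return False
        # Each field must parse as an integer in the 0-3 range used by the prompt.
        if not all(field.strip().isdigit() and 0 <= int(field.strip()) <= 3 for field in row):
            return False
    return True

Called right after classify_text returns, a check like this turns a malformed reply into an explicit retry rather than an exception during the DataFrame update.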
utils/word_segmentation_vi.py
CHANGED
@@ -17,6 +17,10 @@ if __name__ == "__main__":
         df = pandas.read_csv(file_path)
         if 'content' in df.columns:
             df['content'] = df['content'].apply(lambda text: word_segmentation_vi(str(text)))
+
+            if 'Unnamed: 0' in df.columns:
+                df.drop(columns=['Unnamed: 0'], inplace=True)
+
             df.to_csv(file_path, index=False)
             print(f"Processed {file}")
         else:
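For context on the word_segmentation_vi.py change: the 'Unnamed: 0' column is what pandas produces when a CSV that was written with its index is read back, so dropping it keeps repeated runs from stacking up stray index columns. A minimal pandas reproduction (the file name is illustrative only):

import pandas as pd

# Write a DataFrame with its index (the default), then read it back.
df = pd.DataFrame({"content": ["xin chào", "tạm biệt"]})
df.to_csv("tmp.csv")                 # index is written as an unnamed first column
df2 = pd.read_csv("tmp.csv")
print(df2.columns.tolist())          # ['Unnamed: 0', 'content']

# The guard added in this commit removes the stray column before re-saving.
if 'Unnamed: 0' in df2.columns:
    df2.drop(columns=['Unnamed: 0'], inplace=True)
df2.to_csv("tmp.csv", index=False)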