albertmartinez committed
Commit 3ce1088 · 1 Parent(s): 222cf81

Upgrade gradio

Files changed (6)
  1. README.md +91 -1
  2. app.py +250 -102
  3. mining.py +81 -48
  4. requirements.txt +10 -8
  5. sts.py +104 -47
  6. utils.py +149 -12
README.md CHANGED
@@ -4,9 +4,99 @@ emoji: 🏢
  colorFrom: green
  colorTo: gray
  sdk: gradio
- sdk_version: 5.23.1
+ sdk_version: 5.33.1
  app_file: app.py
  pinned: false
  ---
 
+ # Sentence Transformers Demo
+
+ An interactive web application for semantic text similarity analysis using Sentence Transformers models.
+
+ ## Features
+
+ ### 1. Paraphrase Mining
+ - Find sentences with similar meaning in a text corpus
+ - Support for multiple language models
+ - Adjustable similarity threshold
+ - Export results in CSV format
+
+ ### 2. Semantic Textual Similarity (STS)
+ - Calculate semantic similarity between two sets of sentences
+ - Uses state-of-the-art sentence-embedding models
+ - Compare sentences across different languages
+ - Export results in CSV format
+
+ ## Available Models
+
+ - [`Lajavaness/bilingual-embedding-large`](https://huggingface.co/Lajavaness/bilingual-embedding-large): Multilingual model optimized for multiple languages
+ - [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2): High-quality general-purpose model
+ - [`intfloat/multilingual-e5-large-instruct`](https://huggingface.co/intfloat/multilingual-e5-large-instruct): Instruction-tuned multilingual model
+
+ ## Requirements
+
+ - Python 3.8+
+ - Dependencies listed in `requirements.txt`
+
+ ## Installation
+
+ 1. Clone the repository:
+ ```bash
+ git clone https://github.com/yourusername/sentence-transformers.git
+ cd sentence-transformers
+ ```
+
+ 2. Create and activate a virtual environment:
+ ```bash
+ python -m venv venv
+ source venv/bin/activate  # Linux/Mac
+ # or
+ .\venv\Scripts\activate  # Windows
+ ```
+
+ 3. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ ## Usage
+
+ 1. Start the application:
+ ```bash
+ python app.py
+ ```
+
+ 2. Open your browser at `http://localhost:7860`
+
+ 3. Select the desired functionality:
+    - Paraphrase Mining: Upload a CSV file with sentences to analyze
+    - STS: Upload two CSV files with sentences to compare
+
+ 4. Select the model and adjust the similarity threshold
+
+ 5. Click "Process" to start the analysis
+
+ 6. Download results in CSV format
+
+ ## CSV File Format
+
+ Input files are tab-separated and must contain an `_id` column and a `text` column with the sentences to analyze:
+
+ ```csv
+ _id	text
+ 1	First sentence to analyze
+ 2	Second sentence to analyze
+ ...
+ ```
+
+ ## Notes
+
+ - Temporary files are automatically cleaned up every 30 minutes
+ - Using complete sentences is recommended for better results
+ - Models may take time to load on first use
+
+ ## License
+
+ MIT
+
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
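As a quick orientation for readers skimming the diff, here is a minimal sketch of the STS workflow the README describes, run directly against one of the models listed above (not part of this commit; the sentences are illustrative):

```python
# Minimal sketch of the STS workflow the README describes (illustrative only).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

sentences1 = ["The weather is lovely today.", "He drove to the stadium."]
sentences2 = ["It is sunny outside.", "She walked to the park."]

emb1 = model.encode(sentences1, normalize_embeddings=True)
emb2 = model.encode(sentences2, normalize_embeddings=True)

# similarity() returns a len(sentences1) x len(sentences2) matrix of scores.
print(model.similarity(emb1, emb2))
```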
app.py CHANGED
@@ -4,112 +4,260 @@ import gradio as gr
  from mining import mining
  from sts import sts
  from utils import getDataFrame, save_to_csv, delete_folder_periodically
+ import logging
 
- CONCURRENCY_LIMIT = 5
-
- with gr.Blocks() as demo:
-     with gr.Tab("Paraphrase Mining"):
-         with gr.Row():
-             gr.Markdown(
-                 "### Paraphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a large corpus of sentences")
-         with gr.Row():
-             with gr.Column():
-                 gr.Markdown("#### sentences")
-
-                 upload_button_sentences = gr.UploadButton(label="upload sentences csv", file_types=['.csv'],
-                                                           file_count="single")
-                 output_data_sentences = gr.Dataframe(headers=["text"], col_count=1, label="sentences data")
-
-                 upload_button_sentences.upload(fn=getDataFrame, inputs=upload_button_sentences,
-                                                outputs=output_data_sentences, concurrency_limit=CONCURRENCY_LIMIT)
-
-         with gr.Row():
-             with gr.Column():
-                 model = gr.Dropdown(
-                     ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
-                      "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
-                 score_mining = gr.Number(label="score", value=0.96, interactive=True)
-                 submit_button_mining = gr.Button("Submit", variant="primary")
-
-         with gr.Row():
-             with gr.Column():
-                 output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
-                                              label="Mining")
-
-         submit_button_mining.click(
-             fn=mining,
-             inputs=[model, upload_button_sentences, score_mining],
-             outputs=output_mining
-         )
-
-         download_button = gr.Button("Download Results as CSV", variant="huggingface")
-         download_file = gr.File(label="Downloadable File")
-
-         download_button.click(
-             fn=save_to_csv,
-             inputs=output_mining,
-             outputs=download_file
-         )
 
-     with gr.Tab("Semantic Textual Similarity"):
-         with gr.Row():  # Row for the title
-             gr.Markdown(
-                 "### Semantic Textual Similarity (STS), we want to produce embeddings for all texts involved and calculate the similarities between them")
-         with gr.Row():  # First row of two columns
-             with gr.Column():
-                 gr.Markdown("#### sentences 1")
-                 upload_button_sentences1 = gr.UploadButton(label="upload sentences 1 csv", file_types=['.csv'],
-                                                            file_count="single")
-                 output_data_sentences1 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 1 data")
-
-                 upload_button_sentences1.upload(fn=getDataFrame, inputs=upload_button_sentences1,
-                                                 outputs=output_data_sentences1, concurrency_limit=CONCURRENCY_LIMIT)
-
-             with gr.Column():
-                 gr.Markdown("#### sentences 2")
-                 upload_button_sentences2 = gr.UploadButton(label="upload sentences 2 csv", file_types=['.csv'],
-                                                            file_count="single")
-                 output_data_sentences2 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 2 data")
-
-                 upload_button_sentences2.upload(fn=getDataFrame, inputs=upload_button_sentences2,
-                                                 outputs=output_data_sentences2, concurrency_limit=CONCURRENCY_LIMIT)
-
-         with gr.Row():
-             with gr.Column():
-                 model = gr.Dropdown(
-                     ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
-                      "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
-                 score_sts = gr.Number(label="score", value=0.96, interactive=True)
-                 submit_button_sts = gr.Button("Submit", variant="primary")
-
-         with gr.Row():
-             with gr.Column():
-                 gr.Markdown("#### STS Results")
-
-                 output_sts = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
-                                           label="Semantic Textual Similarit")
-
-         submit_button_sts.click(
-             fn=sts,
-             inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
-             outputs=output_sts
-         )
 
-         download_button = gr.Button("Download Results as CSV", variant="huggingface")
-         download_file = gr.File(label="Downloadable File")
-
-         download_button.click(
-             fn=save_to_csv,
-             inputs=output_sts,
-             outputs=download_file
-         )
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ CONCURRENCY_LIMIT = 5
+ AVAILABLE_MODELS = [
+     "Lajavaness/bilingual-embedding-large",
+     "sentence-transformers/all-mpnet-base-v2",
+     "intfloat/multilingual-e5-large-instruct"
+ ]
+
+ MODEL_DESCRIPTIONS = {
+     "Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
+     "sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
+     "intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)"
+ }
+
+ def create_interface():
+     with gr.Blocks(title="Sentence Transformers Demo") as demo:
+         gr.Markdown("# Sentence Transformers Demo")
+         gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")
+
+         with gr.Tab("Paraphrase Mining"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown(
+                         "### Paraphrase Mining\n"
+                         "Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
+                         "Upload a CSV file containing your sentences and select a model to begin."
+                     )
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("#### Input Sentences")
+                     upload_button_sentences = gr.UploadButton(
+                         label="Upload Sentences CSV",
+                         file_types=['.csv'],
+                         file_count="single",
+                         variant="primary"
+                     )
+                     output_data_sentences = gr.Dataframe(
+                         headers=["_id", "text"],
+                         col_count=2,
+                         label="Sentences Data",
+                         interactive=False
+                     )
+
+                     upload_button_sentences.upload(
+                         fn=getDataFrame,
+                         inputs=upload_button_sentences,
+                         outputs=output_data_sentences,
+                         concurrency_limit=CONCURRENCY_LIMIT
+                     )
+
+             with gr.Row():
+                 with gr.Column():
+                     model = gr.Dropdown(
+                         choices=AVAILABLE_MODELS,
+                         label="Select Model",
+                         value=AVAILABLE_MODELS[0],
+                         interactive=True
+                     )
+                     model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
+
+                     def update_model_description(model_name):
+                         return MODEL_DESCRIPTIONS[model_name]
+
+                     model.change(
+                         fn=update_model_description,
+                         inputs=model,
+                         outputs=model_description
+                     )
+
+                     score_mining = gr.Slider(
+                         minimum=0.0,
+                         maximum=1.0,
+                         value=0.96,
+                         step=0.01,
+                         label="Similarity Threshold",
+                         interactive=True
+                     )
+                     submit_button_mining = gr.Button("Process", variant="primary")
+
+             with gr.Row():
+                 with gr.Column():
+                     output_mining = gr.Dataframe(
+                         headers=["score", "sentence_1", "sentence_2"],
+                         type="polars",
+                         label="Mining Results"
+                     )
+
+             submit_button_mining.click(
+                 fn=mining,
+                 inputs=[model, upload_button_sentences, score_mining],
+                 outputs=output_mining
+             ).then(
+                 fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
+                 inputs=[output_mining],
+                 outputs=[]
+             )
+
+             download_button = gr.Button("Download Results as CSV", variant="secondary")
+             download_file = gr.File(label="Downloadable File")
+
+             download_button.click(
+                 fn=save_to_csv,
+                 inputs=output_mining,
+                 outputs=download_file
+             ).then(
+                 fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
+                 inputs=[download_file],
+                 outputs=[]
+             )
+
+         with gr.Tab("Semantic Textual Similarity"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown(
+                         "### Semantic Textual Similarity (STS)\n"
+                         "Calculate semantic similarity between two sets of sentences.\n"
+                         "Upload two CSV files containing your sentences and select a model to begin."
+                     )
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("#### First Set of Sentences")
+                     upload_button_sentences1 = gr.UploadButton(
+                         label="Upload First Set CSV",
+                         file_types=['.csv'],
+                         file_count="single",
+                         variant="primary"
+                     )
+                     output_data_sentences1 = gr.Dataframe(
+                         headers=["_id", "text"],
+                         col_count=2,
+                         label="First Set Data",
+                         interactive=False
+                     )
+
+                     upload_button_sentences1.upload(
+                         fn=getDataFrame,
+                         inputs=upload_button_sentences1,
+                         outputs=output_data_sentences1,
+                         concurrency_limit=CONCURRENCY_LIMIT
+                     )
+
+                 with gr.Column():
+                     gr.Markdown("#### Second Set of Sentences")
+                     upload_button_sentences2 = gr.UploadButton(
+                         label="Upload Second Set CSV",
+                         file_types=['.csv'],
+                         file_count="single",
+                         variant="primary"
+                     )
+                     output_data_sentences2 = gr.Dataframe(
+                         headers=["_id", "text"],
+                         col_count=2,
+                         label="Second Set Data",
+                         interactive=False
+                     )
+
+                     upload_button_sentences2.upload(
+                         fn=getDataFrame,
+                         inputs=upload_button_sentences2,
+                         outputs=output_data_sentences2,
+                         concurrency_limit=CONCURRENCY_LIMIT
+                     )
+
+             with gr.Row():
+                 with gr.Column():
+                     model = gr.Dropdown(
+                         choices=AVAILABLE_MODELS,
+                         label="Select Model",
+                         value=AVAILABLE_MODELS[0],
+                         interactive=True
+                     )
+                     model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
+
+                     model.change(
+                         fn=update_model_description,
+                         inputs=model,
+                         outputs=model_description
+                     )
+
+                     score_sts = gr.Slider(
+                         minimum=0.0,
+                         maximum=1.0,
+                         value=0.96,
+                         step=0.01,
+                         label="Similarity Threshold",
+                         interactive=True
+                     )
+                     submit_button_sts = gr.Button("Process", variant="primary")
+
+             with gr.Row():
+                 with gr.Column():
+                     output_sts = gr.Dataframe(
+                         headers=["score", "sentences1", "sentences2"],
+                         type="polars",
+                         label="Similarity Results"
+                     )
+
+             submit_button_sts.click(
+                 fn=sts,
+                 inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
+                 outputs=output_sts
+             ).then(
+                 fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
+                 inputs=[output_sts],
+                 outputs=[]
+             )
+
+             download_button = gr.Button("Download Results as CSV", variant="secondary")
+             download_file = gr.File(label="Downloadable File")
+
+             download_button.click(
+                 fn=save_to_csv,
+                 inputs=output_sts,
+                 outputs=download_file
+             ).then(
+                 fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
+                 inputs=[download_file],
+                 outputs=[]
+             )
+
+     return demo
 
  if __name__ == "__main__":
-     multiprocessing.set_start_method("spawn")
-
-     folder_path = "data"
-     thread = threading.Thread(target=delete_folder_periodically, args=(folder_path, 1800), daemon=True)
-     thread.start()
-
-     print(gr.__version__)
-     demo.launch()
+     try:
+         multiprocessing.set_start_method("spawn")
+
+         # Start cleanup thread
+         folder_path = "data"
+         thread = threading.Thread(
+             target=delete_folder_periodically,
+             args=(folder_path, 1800),
+             daemon=True
+         )
+         thread.start()
+
+         # Create and launch interface
+         demo = create_interface()
+         demo.launch(
+             share=False,
+             server_name="0.0.0.0",
+             server_port=7860,
+             show_error=True,
+             show_api=False
+         )
+     except Exception as e:
+         logger.error(f"Error starting application: {str(e)}")
+         raise
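One caveat on the status callbacks added above: in Gradio, `gr.Info` shows a toast when it is called, but `gr.Error` is only surfaced when it is raised, so returning it from a lambda (as the `.then()` handlers do) will not display the failure message. A minimal sketch of the conventional pattern, using a hypothetical `notify_status` helper that could be wired in via `.then(fn=notify_status, inputs=[output_mining], outputs=[])`:

```python
# Sketch (not part of the commit): raising gr.Error makes the failure toast
# actually appear; returning it from an event handler does not.
import gradio as gr

def notify_status(result):
    if result is None:
        raise gr.Error("Error processing data. Please check the logs for details.")
    gr.Info("Processing completed successfully!")
```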
mining.py CHANGED
@@ -2,56 +2,89 @@ import time
  import pandas as pd
  import polars as pl
  import torch
+ import logging
  from datasets import Dataset
  from sentence_transformers import SentenceTransformer
  from sentence_transformers.util import paraphrase_mining
+ from typing import Optional
 
+ logger = logging.getLogger(__name__)
 
- def mining(modelname, path, score):
-     st = time.time()
-     data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
-     original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
-
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     model = SentenceTransformer(
-         modelname,
-         device=device,
-         trust_remote_code=True,
-     )
-
-     paraphrases = paraphrase_mining(
-         model,
-         data["text"],
-         corpus_chunk_size=len(data),
-         show_progress_bar=True,
-         batch_size=1024,
-         max_pairs=len(data) ** 2,
-     )
-
-     df_pd = pd.DataFrame(paraphrases)
-     df = pl.from_pandas(df_pd)
-     df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
-
-     union_df = pl.DataFrame(data.to_pandas())
-
-     original_columns = original_df.columns.tolist()
-
-     additional_cols = []
-     for col in original_columns:
-         if col != "text":
-             additional_cols.extend([
-                 union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
-                 union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
-             ])
-
-     df = df.with_columns([
-         pl.col("score").round(3).cast(pl.Float32),
-         union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
-         union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
-         *additional_cols
-     ]).filter(pl.col("score") > score).sort(["score"], descending=True)
-
-     elapsed_time = time.time() - st
-     print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
-
-     return df
+ def mining(modelname: str, path: str, score: float) -> Optional[pl.DataFrame]:
+     """
+     Perform paraphrase mining on the input data.
+
+     Args:
+         modelname: Name of the model to use
+         path: Path to the input CSV file
+         score: Minimum similarity score threshold
+
+     Returns:
+         Optional[pl.DataFrame]: DataFrame with mining results or None if error occurs
+     """
+     try:
+         st = time.time()
+
+         # Read and validate input data
+         data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
+         original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
+
+         if data.num_rows == 0:
+             logger.error("No data found in input file")
+             return None
+
+         # Initialize model
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         logger.info(f"Using device: {device}")
+
+         model = SentenceTransformer(
+             modelname,
+             device=device,
+             trust_remote_code=True,
+         )
+
+         # Perform paraphrase mining
+         logger.info("Starting paraphrase mining...")
+         paraphrases = paraphrase_mining(
+             model,
+             data["text"],
+             corpus_chunk_size=len(data),
+             show_progress_bar=True,
+             batch_size=1024,
+             max_pairs=len(data) ** 2,
+         )
+
+         # Process results
+         df_pd = pd.DataFrame(paraphrases)
+         df = pl.from_pandas(df_pd)
+         df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
+
+         union_df = pl.DataFrame(data.to_pandas())
+         original_columns = original_df.columns.tolist()
+
+         # Add additional columns if present
+         additional_cols = []
+         for col in original_columns:
+             if col != "text":
+                 additional_cols.extend([
+                     union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
+                     union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
+                 ])
+
+         # Process final results
+         df = df.with_columns([
+             pl.col("score").round(3).cast(pl.Float32),
+             union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
+             union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
+             *additional_cols
+         ]).filter(pl.col("score") > score).sort(["score"], descending=True)
+
+         elapsed_time = time.time() - st
+         logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
+         logger.info(f'Found {len(df)} paraphrases above score threshold {score}')
+
+         return df
+
+     except Exception as e:
+         logger.error(f"Error in mining process: {str(e)}")
+         return None
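As context for the column renames above: `paraphrase_mining` returns `[score, i, j]` triplets of corpus indices (which become string columns `"0"`, `"1"`, `"2"` after the pandas-to-polars round trip), so `mining()` maps the indices back to the original text column. A minimal sketch with an illustrative model and corpus:

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
corpus = ["A man is eating food.", "A man is eating a meal.", "The sky is blue."]

# Each entry is [score, i, j]: a similarity score plus two corpus indices,
# sorted by decreasing score.
for score, i, j in paraphrase_mining(model, corpus):
    print(f"{score:.3f}  {corpus[i]} <-> {corpus[j]}")
```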
requirements.txt CHANGED
@@ -1,8 +1,10 @@
- transformers
- torch
- pandas
- polars
- datasets
- sentence-transformers[openvino,onnx-gpu,onnx]
- multiprocess
- gradio
+ transformers>=4.36.0
+ torch>=2.1.0
+ pandas>=2.1.0
+ polars>=0.20.0
+ datasets>=2.14.0
+ sentence-transformers[openvino,onnx-gpu,onnx]>=2.2.0
+ multiprocess>=0.70.15
+ gradio>=4.12.0
+ numpy>=1.24.0
+ tqdm>=4.66.0
sts.py CHANGED
@@ -2,54 +2,111 @@ import time
  import pandas as pd
  import polars as pl
  import torch
+ import logging
  from datasets import Dataset
  from sentence_transformers import SentenceTransformer
+ from typing import Optional
 
+ logger = logging.getLogger(__name__)
 
- def sts(modelname, data1, data2, score):
-     st = time.time()
-
-     device = "cuda" if torch.cuda.is_available() else "cpu"
-     model = SentenceTransformer(
-         modelname,
-         device=device,
-         trust_remote_code=True,
-     )
-
-     sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
-     sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
-
-     embeddings1 = model.encode(sentences1["text"], normalize_embeddings=True, batch_size=1024,
-                                show_progress_bar=True)
-     embeddings2 = model.encode(sentences2["text"], normalize_embeddings=True, batch_size=1024,
-                                show_progress_bar=True)
-
-     similarity_matrix = model.similarity(embeddings1, embeddings2)
-
-     df_pd = pd.DataFrame(similarity_matrix)
-     dfi = df_pd.__dataframe__()
-     df = pl.from_dataframe(dfi)
-
-     df_matrix_with_index = df.with_row_index(name="row_index").with_columns(pl.col("row_index").cast(pl.UInt64))
-     df_long = df_matrix_with_index.unpivot(index="row_index", variable_name="column_index",
-                                            value_name="score").with_columns(pl.col("column_index").cast(pl.UInt64))
-     df_sentences1 = pl.DataFrame(sentences1.to_pandas()).with_row_index(name="row_index").with_columns(
-         pl.col("row_index").cast(pl.UInt64))
-     df_sentences2 = pl.DataFrame(sentences2.to_pandas()).with_row_index(name="column_index").with_columns(
-         pl.col("column_index").cast(pl.UInt64))
-
-     df_long = (df_long
-                .with_columns([pl.col("score").round(4).cast(pl.Float32)])  # Ensure column_index is UInt32
-                .join(df_sentences1, on="row_index")
-                .join(df_sentences2, on="column_index"))
-
-     df_long = df_long.rename({
-         "text": "sentences1",
-         "text_right": "sentences2",
-     }).drop(["row_index", "column_index"])
-
-     elapsed_time = time.time() - st
-     print('Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
-
-     return df_long.filter(pl.col("score") > score).sort(["score"],
-                                                         descending=True)
+ def sts(modelname: str, data1: str, data2: str, score: float) -> Optional[pl.DataFrame]:
+     """
+     Calculate semantic textual similarity between two sets of sentences.
+
+     Args:
+         modelname: Name of the model to use
+         data1: Path to first input CSV file
+         data2: Path to second input CSV file
+         score: Minimum similarity score threshold
+
+     Returns:
+         Optional[pl.DataFrame]: DataFrame with similarity results or None if error occurs
+     """
+     try:
+         st = time.time()
+
+         # Initialize model
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         logger.info(f"Using device: {device}")
+
+         model = SentenceTransformer(
+             modelname,
+             device=device,
+             trust_remote_code=True,
+         )
+
+         # Read and validate input data
+         sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
+         sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
+
+         if sentences1.num_rows == 0 or sentences2.num_rows == 0:
+             logger.error("Empty input data found")
+             return None
+
+         # Generate embeddings
+         logger.info("Generating embeddings for first set...")
+         embeddings1 = model.encode(
+             sentences1["text"],
+             normalize_embeddings=True,
+             batch_size=1024,
+             show_progress_bar=True
+         )
+
+         logger.info("Generating embeddings for second set...")
+         embeddings2 = model.encode(
+             sentences2["text"],
+             normalize_embeddings=True,
+             batch_size=1024,
+             show_progress_bar=True
+         )
+
+         # Calculate similarity matrix
+         logger.info("Calculating similarity matrix...")
+         similarity_matrix = model.similarity(embeddings1, embeddings2)
+
+         # Process results
+         df_pd = pd.DataFrame(similarity_matrix)
+         dfi = df_pd.__dataframe__()
+         df = pl.from_dataframe(dfi)
+
+         # Transform matrix to long format
+         df_matrix_with_index = df.with_row_index(name="row_index").with_columns(
+             pl.col("row_index").cast(pl.UInt64)
+         )
+         df_long = df_matrix_with_index.unpivot(
+             index="row_index",
+             variable_name="column_index",
+             value_name="score"
+         ).with_columns(pl.col("column_index").cast(pl.UInt64))
+
+         # Join with original text
+         df_sentences1 = pl.DataFrame(sentences1.to_pandas()).with_row_index(name="row_index").with_columns(
+             pl.col("row_index").cast(pl.UInt64)
+         )
+         df_sentences2 = pl.DataFrame(sentences2.to_pandas()).with_row_index(name="column_index").with_columns(
+             pl.col("column_index").cast(pl.UInt64)
+         )
+
+         # Process final results
+         df_long = (df_long
+                    .with_columns([pl.col("score").round(4).cast(pl.Float32)])
+                    .join(df_sentences1, on="row_index")
+                    .join(df_sentences2, on="column_index"))
+
+         df_long = df_long.rename({
+             "text": "sentences1",
+             "text_right": "sentences2",
+         }).drop(["row_index", "column_index"])
+
+         # Filter and sort results
+         result_df = df_long.filter(pl.col("score") > score).sort(["score"], descending=True)
+
+         elapsed_time = time.time() - st
+         logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
+         logger.info(f'Found {len(result_df)} pairs above score threshold {score}')
+
+         return result_df
+
+     except Exception as e:
+         logger.error(f"Error in STS process: {str(e)}")
+         return None
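The matrix-to-long-format step is the least obvious part of `sts()`. A toy sketch of the same polars transform on a 2×2 matrix (column names are strings because the matrix arrives via pandas; the casts and joins from the real function are omitted here):

```python
import polars as pl

# Toy 2x2 similarity matrix; rows index sentences1, columns index sentences2.
df = pl.DataFrame({"0": [0.9, 0.1], "1": [0.2, 0.8]})

df_long = (
    df.with_row_index(name="row_index")
      .unpivot(index="row_index", variable_name="column_index", value_name="score")
)
print(df_long)  # one (row_index, column_index, score) row per sentence pair
```

Joining `row_index` and `column_index` back onto the two sentence frames is what recovers the text pairs in the real function.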
utils.py CHANGED
@@ -4,22 +4,159 @@ import shutil
  import pandas as pd
  import polars as pl
  import time
+ import logging
+ from typing import Optional, Tuple
 
+ # Configure logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
 
+ def validate_csv_structure(df: pd.DataFrame) -> Tuple[bool, str]:
+     """
+     Validate the structure of the DataFrame.
+
+     Args:
+         df: DataFrame to validate
+
+     Returns:
+         Tuple[bool, str]: (is_valid, error_message)
+     """
+     # Check if DataFrame is empty
+     if df.empty:
+         return False, "CSV file is empty"
+
+     # Check required columns
+     required_columns = ['_id', 'text']
+     missing_columns = [col for col in required_columns if col not in df.columns]
+     if missing_columns:
+         return False, f"Missing required columns: {', '.join(missing_columns)}"
+
+     # Validate _id column
+     if df['_id'].isna().any():
+         return False, "Found empty _id values"
+
+     # Validate text column
+     if df['text'].isna().any():
+         return False, "Found empty text values"
+
+     # Check for duplicate _id values
+     if df['_id'].duplicated().any():
+         return False, "Found duplicate _id values"
+
+     return True, ""
+
- def getDataFrame(path):
-     data = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
-     return pl.from_pandas(data)
+ def getDataFrame(path: str) -> Optional[pl.DataFrame]:
+     """
+     Read and validate CSV file into a DataFrame.
+
+     Args:
+         path: Path to the CSV file
+
+     Returns:
+         Optional[pl.DataFrame]: The validated DataFrame or None if validation fails
+     """
+     try:
+         # Read CSV with tab separator
+         data = pd.read_csv(
+             path,
+             sep="\t",
+             header=0,
+             on_bad_lines='warn',
+             encoding='utf-8'
+         )
+
+         # Validate structure
+         is_valid, error_message = validate_csv_structure(data)
+         if not is_valid:
+             logger.error(error_message)
+             return None
+
+         # Clean text column
+         data['text'] = data['text'].astype(str).str.strip()
+         data = data[data['text'].str.len() > 0]
+
+         if data.empty:
+             logger.error("No valid text data found after cleaning")
+             return None
+
+         # Convert to Polars DataFrame
+         pl_df = pl.from_pandas(data)
+         logger.info(f"Successfully loaded {len(pl_df)} rows from CSV")
+
+         return pl_df
+
+     except pd.errors.EmptyDataError:
+         logger.error("CSV file is empty")
+         return None
+     except pd.errors.ParserError as e:
+         logger.error(f"Error parsing CSV file: {str(e)}")
+         return None
+     except Exception as e:
+         logger.error(f"Unexpected error reading CSV: {str(e)}")
+         return None
 
- def save_to_csv(dataframe):
-     folder_path = "data"
-     if not dataframe.is_empty():
+ def save_to_csv(dataframe: pl.DataFrame) -> Optional[str]:
+     """
+     Save DataFrame to CSV file.
+
+     Args:
+         dataframe: Polars DataFrame to save
+
+     Returns:
+         Optional[str]: Path to saved file or None if save fails
+     """
+     try:
+         if dataframe is None or dataframe.is_empty():
+             logger.warning("No data to save")
+             return None
+
+         # Create data directory if it doesn't exist
+         folder_path = "data"
          os.makedirs(folder_path, exist_ok=True)
-         csv_path = f"{folder_path}/{uuid.uuid4()}.csv"
+
+         # Generate unique filename with timestamp
+         timestamp = int(time.time())
+         csv_path = f"{folder_path}/results_{timestamp}.csv"
+
+         # Save to CSV with tab separator
          dataframe.write_csv(csv_path, separator="\t")
+         logger.info(f"Results saved to {csv_path}")
+
          return csv_path
+
+     except Exception as e:
+         logger.error(f"Error saving results: {str(e)}")
+         return None
 
- def delete_folder_periodically(path, interval=3600):
+ def delete_folder_periodically(path: str, interval: int = 3600) -> None:
+     """
+     Periodically clean up the data folder.
+
+     Args:
+         path: Path to folder to clean
+         interval: Interval between cleanups in seconds
+     """
      while True:
-         if os.path.exists(path):
-             shutil.rmtree(path)
-         os.makedirs(path, exist_ok=True)
-         time.sleep(interval)
+         try:
+             if os.path.exists(path):
+                 # Get current time
+                 current_time = time.time()
+
+                 # Check each file in the directory
+                 for filename in os.listdir(path):
+                     file_path = os.path.join(path, filename)
+                     if os.path.isfile(file_path):
+                         # Check file age
+                         file_age = current_time - os.path.getmtime(file_path)
+                         if file_age > interval:
+                             os.remove(file_path)
+                             logger.info(f"Deleted old file: {file_path}")
+
+             time.sleep(interval)
+
+         except Exception as e:
+             logger.error(f"Error in cleanup task: {str(e)}")
+             time.sleep(interval)
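Finally, a quick sketch of what `validate_csv_structure` enforces, i.e. the tab-separated `_id` + `text` input shape (illustrative frames only, assuming the `utils.py` from this commit is importable):

```python
import pandas as pd
from utils import validate_csv_structure

good = pd.DataFrame({"_id": [1, 2], "text": ["First sentence", "Second sentence"]})
bad = pd.DataFrame({"text": ["Row without an _id column"]})

print(validate_csv_structure(good))  # (True, "")
print(validate_csv_structure(bad))   # (False, "Missing required columns: _id")
```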