Commit 3ce1088
Parent(s): 222cf81
Upgrade gradio
README.md
CHANGED
@@ -4,9 +4,99 @@ emoji: 🏢
 colorFrom: green
 colorTo: gray
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.33.1
 app_file: app.py
 pinned: false
 ---

+# Sentence Transformers Demo
+
+Interactive web application for semantic text similarity analysis using Sentence Transformers models.
+
+## Features
+
+### 1. Paraphrase Mining
+- Find sentences with similar meaning in a text corpus
+- Support for multiple language models
+- Adjustable similarity threshold
+- Export results in CSV format
+
+### 2. Semantic Textual Similarity (STS)
+- Calculate semantic similarity between two sets of sentences
+- Uses advanced sentence transformer models
+- Compare sentences in different languages
+- Export results in CSV format
+
+## Available Models
+
+- [`Lajavaness/bilingual-embedding-large`](https://huggingface.co/Lajavaness/bilingual-embedding-large): Multilingual model optimized for multiple languages
+- [`sentence-transformers/all-mpnet-base-v2`](https://huggingface.co/sentence-transformers/all-mpnet-base-v2): High-quality general-purpose model
+- [`intfloat/multilingual-e5-large-instruct`](https://huggingface.co/intfloat/multilingual-e5-large-instruct): Multilingual model with instructions
+
+## Requirements
+
+- Python 3.8+
+- Dependencies listed in `requirements.txt`
+
+## Installation
+
+1. Clone the repository:
+```bash
+git clone https://github.com/yourusername/sentence-transformers.git
+cd sentence-transformers
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv venv
+source venv/bin/activate  # Linux/Mac
+# or
+.\venv\Scripts\activate  # Windows
+```
+
+3. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+## Usage
+
+1. Start the application:
+```bash
+python app.py
+```
+
+2. Open your browser at `http://localhost:7860`
+
+3. Select the desired functionality:
+   - Paraphrase Mining: Upload a CSV file with sentences to analyze
+   - STS: Upload two CSV files with sentences to compare
+
+4. Select the model and adjust the similarity threshold
+
+5. Click "Process" to start the analysis
+
+6. Download results in CSV format
+
+## CSV File Format
+
+CSV files must contain a column named "text" with the sentences to analyze:
+
+```csv
+text
+"First sentence to analyze"
+"Second sentence to analyze"
+...
+```
+
+## Notes
+
+- Temporary files are automatically cleaned up every 30 minutes
+- Using complete sentences is recommended for better results
+- Models may take time to load on first use
+
+## License
+
+MIT
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
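Note on the input format: the README example above shows a comma-style CSV with only a `text` column, but the loader added in `utils.py` in this same commit reads uploads with a tab separator and validates that both `_id` and `text` columns are present, non-null, and (for `_id`) unique. A minimal sketch of producing a file that passes that validation, assuming pandas is installed (the file name is illustrative):

```python
import pandas as pd

# getDataFrame() in utils.py expects tab-separated data with unique,
# non-null "_id" values and a non-empty "text" column.
df = pd.DataFrame({
    "_id": [1, 2],
    "text": ["First sentence to analyze", "Second sentence to analyze"],
})
df.to_csv("sentences.csv", sep="\t", index=False)  # tab separator, matching utils.py
```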
app.py
CHANGED
@@ -4,112 +4,260 @@ import gradio as gr
 from mining import mining
 from sts import sts
 from utils import getDataFrame, save_to_csv, delete_folder_periodically
+import logging

-        "### Paraphrase mining is the task of finding paraphrases (texts with identical / similar meaning) in a large corpus of sentences")
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("#### sentences")
-
-            upload_button_sentences = gr.UploadButton(label="upload sentences csv", file_types=['.csv'],
-                                                      file_count="single")
-            output_data_sentences = gr.Dataframe(headers=["text"], col_count=1, label="sentences data")
-
-            upload_button_sentences.upload(fn=getDataFrame, inputs=upload_button_sentences,
-                                           outputs=output_data_sentences, concurrency_limit=CONCURRENCY_LIMIT)
-
-    with gr.Row():
-        with gr.Column():
-            model = gr.Dropdown(
-                ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
-                 "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
-            score_mining = gr.Number(label="score", value=0.96, interactive=True)
-            submit_button_mining = gr.Button("Submit", variant="primary")
-
-    with gr.Row():
-        with gr.Column():
-            output_mining = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
-                                         label="Mining")
-
-    submit_button_mining.click(
-        fn=mining,
-        inputs=[model, upload_button_sentences, score_mining],
-        outputs=output_mining
-    )
-
-    download_button = gr.Button("Download Results as CSV", variant="huggingface")
-    download_file = gr.File(label="Downloadable File")
-
-    download_button.click(
-        fn=save_to_csv,
-        inputs=output_mining,
-        outputs=download_file
-    )
-
-            gr.Markdown("#### sentences 1")
-            upload_button_sentences1 = gr.UploadButton(label="upload sentences 1 csv", file_types=['.csv'],
-                                                       file_count="single")
-            output_data_sentences1 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 1 data")
-
-            upload_button_sentences1.upload(fn=getDataFrame, inputs=upload_button_sentences1,
-                                            outputs=output_data_sentences1, concurrency_limit=CONCURRENCY_LIMIT)
-
-        with gr.Column():
-            gr.Markdown("#### sentences 2")
-            upload_button_sentences2 = gr.UploadButton(label="upload sentences 2 csv", file_types=['.csv'],
-                                                       file_count="single")
-            output_data_sentences2 = gr.Dataframe(headers=["text"], col_count=1, label="sentences 2 data")
-
-            upload_button_sentences2.upload(fn=getDataFrame, inputs=upload_button_sentences2,
-                                            outputs=output_data_sentences2, concurrency_limit=CONCURRENCY_LIMIT)
-
-    with gr.Row():
-        with gr.Column():
-            model = gr.Dropdown(
-                ["Lajavaness/bilingual-embedding-large", "sentence-transformers/all-mpnet-base-v2",
-                 "intfloat/multilingual-e5-large-instruct"], label="model", interactive=True)
-            score_sts = gr.Number(label="score", value=0.96, interactive=True)
-            submit_button_sts = gr.Button("Submit", variant="primary")
-
-    with gr.Row():
-        with gr.Column():
-            gr.Markdown("#### STS Results")
-
-            output_sts = gr.Dataframe(headers=["score", "sentence_1", "sentence_2"], type="polars",
-                                      label="Semantic Textual Similarit")
-
-    submit_button_sts.click(
-        fn=sts,
-        inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
-        outputs=output_sts
-    )

+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+CONCURRENCY_LIMIT = 5
+AVAILABLE_MODELS = [
+    "Lajavaness/bilingual-embedding-large",
+    "sentence-transformers/all-mpnet-base-v2",
+    "intfloat/multilingual-e5-large-instruct"
+]
+
+MODEL_DESCRIPTIONS = {
+    "Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
+    "sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
+    "intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)"
+}
+
+def create_interface():
+    with gr.Blocks(title="Sentence Transformers Demo") as demo:
+        gr.Markdown("# Sentence Transformers Demo")
+        gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")
+
+        with gr.Tab("Paraphrase Mining"):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(
+                        "### Paraphrase Mining\n"
+                        "Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
+                        "Upload a CSV file containing your sentences and select a model to begin."
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("#### Input Sentences")
+                    upload_button_sentences = gr.UploadButton(
+                        label="Upload Sentences CSV",
+                        file_types=['.csv'],
+                        file_count="single",
+                        variant="primary"
+                    )
+                    output_data_sentences = gr.Dataframe(
+                        headers=["_id", "text"],
+                        col_count=2,
+                        label="Sentences Data",
+                        interactive=False
+                    )
+
+                    upload_button_sentences.upload(
+                        fn=getDataFrame,
+                        inputs=upload_button_sentences,
+                        outputs=output_data_sentences,
+                        concurrency_limit=CONCURRENCY_LIMIT
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    model = gr.Dropdown(
+                        choices=AVAILABLE_MODELS,
+                        label="Select Model",
+                        value=AVAILABLE_MODELS[0],
+                        interactive=True
+                    )
+                    model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
+
+                    def update_model_description(model_name):
+                        return MODEL_DESCRIPTIONS[model_name]
+
+                    model.change(
+                        fn=update_model_description,
+                        inputs=model,
+                        outputs=model_description
+                    )
+
+                    score_mining = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.96,
+                        step=0.01,
+                        label="Similarity Threshold",
+                        interactive=True
+                    )
+                    submit_button_mining = gr.Button("Process", variant="primary")
+
+            with gr.Row():
+                with gr.Column():
+                    output_mining = gr.Dataframe(
+                        headers=["score", "sentence_1", "sentence_2"],
+                        type="polars",
+                        label="Mining Results"
+                    )
+
+            submit_button_mining.click(
+                fn=mining,
+                inputs=[model, upload_button_sentences, score_mining],
+                outputs=output_mining
+            ).then(
+                fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
+                inputs=[output_mining],
+                outputs=[]
+            )
+
+            download_button = gr.Button("Download Results as CSV", variant="secondary")
+            download_file = gr.File(label="Downloadable File")
+
+            download_button.click(
+                fn=save_to_csv,
+                inputs=output_mining,
+                outputs=download_file
+            ).then(
+                fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
+                inputs=[download_file],
+                outputs=[]
+            )
+
+        with gr.Tab("Semantic Textual Similarity"):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(
+                        "### Semantic Textual Similarity (STS)\n"
+                        "Calculate semantic similarity between two sets of sentences.\n"
+                        "Upload two CSV files containing your sentences and select a model to begin."
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("#### First Set of Sentences")
+                    upload_button_sentences1 = gr.UploadButton(
+                        label="Upload First Set CSV",
+                        file_types=['.csv'],
+                        file_count="single",
+                        variant="primary"
+                    )
+                    output_data_sentences1 = gr.Dataframe(
+                        headers=["_id", "text"],
+                        col_count=2,
+                        label="First Set Data",
+                        interactive=False
+                    )
+
+                    upload_button_sentences1.upload(
+                        fn=getDataFrame,
+                        inputs=upload_button_sentences1,
+                        outputs=output_data_sentences1,
+                        concurrency_limit=CONCURRENCY_LIMIT
+                    )
+
+                with gr.Column():
+                    gr.Markdown("#### Second Set of Sentences")
+                    upload_button_sentences2 = gr.UploadButton(
+                        label="Upload Second Set CSV",
+                        file_types=['.csv'],
+                        file_count="single",
+                        variant="primary"
+                    )
+                    output_data_sentences2 = gr.Dataframe(
+                        headers=["_id", "text"],
+                        col_count=2,
+                        label="Second Set Data",
+                        interactive=False
+                    )
+
+                    upload_button_sentences2.upload(
+                        fn=getDataFrame,
+                        inputs=upload_button_sentences2,
+                        outputs=output_data_sentences2,
+                        concurrency_limit=CONCURRENCY_LIMIT
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    model = gr.Dropdown(
+                        choices=AVAILABLE_MODELS,
+                        label="Select Model",
+                        value=AVAILABLE_MODELS[0],
+                        interactive=True
+                    )
+                    model_description = gr.Markdown(MODEL_DESCRIPTIONS[AVAILABLE_MODELS[0]])
+
+                    model.change(
+                        fn=update_model_description,
+                        inputs=model,
+                        outputs=model_description
+                    )
+
+                    score_sts = gr.Slider(
+                        minimum=0.0,
+                        maximum=1.0,
+                        value=0.96,
+                        step=0.01,
+                        label="Similarity Threshold",
+                        interactive=True
+                    )
+                    submit_button_sts = gr.Button("Process", variant="primary")
+
+            with gr.Row():
+                with gr.Column():
+                    output_sts = gr.Dataframe(
+                        headers=["score", "sentences1", "sentences2"],
+                        type="polars",
+                        label="Similarity Results"
+                    )
+
+            submit_button_sts.click(
+                fn=sts,
+                inputs=[model, upload_button_sentences1, upload_button_sentences2, score_sts],
+                outputs=output_sts
+            ).then(
+                fn=lambda x: gr.Info("Processing completed successfully!") if x is not None else gr.Error("Error processing data. Please check the logs for details."),
+                inputs=[output_sts],
+                outputs=[]
+            )
+
+            download_button = gr.Button("Download Results as CSV", variant="secondary")
+            download_file = gr.File(label="Downloadable File")
+
+            download_button.click(
+                fn=save_to_csv,
+                inputs=output_sts,
+                outputs=download_file
+            ).then(
+                fn=lambda x: gr.Info("Results saved successfully!") if x is not None else gr.Error("Error saving results. Please check the logs for details."),
+                inputs=[download_file],
+                outputs=[]
+            )
+
+    return demo

 if __name__ == "__main__":
+    try:
+        multiprocessing.set_start_method("spawn")
+
+        # Start cleanup thread
+        folder_path = "data"
+        thread = threading.Thread(
+            target=delete_folder_periodically,
+            args=(folder_path, 1800),
+            daemon=True
+        )
+        thread.start()
+
+        # Create and launch interface
+        demo = create_interface()
+        demo.launch(
+            share=False,
+            server_name="0.0.0.0",
+            server_port=7860,
+            show_error=True,
+            show_api=False
+        )
+    except Exception as e:
+        logger.error(f"Error starting application: {str(e)}")
+        raise
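A note on the notification pattern above: in Gradio 4+, calling `gr.Info(...)` inside an event handler shows an info toast, but `gr.Error` is an exception class that only surfaces when raised; the lambdas in this diff return `gr.Error(...)` without raising it, so the failure toast will likely never appear. A hedged sketch of a variant that raises (`notify` is an illustrative name, not part of the commit):

```python
import gradio as gr

def notify(result):
    # gr.Error must be *raised* to display; returning it does nothing visible.
    if result is None:
        raise gr.Error("Error processing data. Please check the logs for details.")
    gr.Info("Processing completed successfully!")  # shows an info toast

# Usage, mirroring the chain in app.py (inside the gr.Blocks context):
# submit_button_mining.click(fn=mining, inputs=[...], outputs=output_mining
# ).then(fn=notify, inputs=[output_mining], outputs=[])
```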
mining.py
CHANGED
@@ -2,56 +2,89 @@ import time
 import pandas as pd
 import polars as pl
 import torch
+import logging
 from datasets import Dataset
 from sentence_transformers import SentenceTransformer
 from sentence_transformers.util import paraphrase_mining
+from typing import Optional

+logger = logging.getLogger(__name__)

-def mining(modelname, path, score):
+def mining(modelname: str, path: str, score: float) -> Optional[pl.DataFrame]:
+    """
+    Perform paraphrase mining on the input data.
+
+    Args:
+        modelname: Name of the model to use
+        path: Path to the input CSV file
+        score: Minimum similarity score threshold
+
+    Returns:
+        Optional[pl.DataFrame]: DataFrame with mining results or None if error occurs
+    """
+    try:
+        st = time.time()
+
+        # Read and validate input data
+        data = Dataset.from_pandas(pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t"))
+        original_df = pd.read_csv(path, on_bad_lines='skip', header=0, sep="\t")
+
+        if data.num_rows == 0:
+            logger.error("No data found in input file")
+            return None
+
+        # Initialize model
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {device}")
+
+        model = SentenceTransformer(
+            modelname,
+            device=device,
+            trust_remote_code=True,
+        )
+
+        # Perform paraphrase mining
+        logger.info("Starting paraphrase mining...")
+        paraphrases = paraphrase_mining(
+            model,
+            data["text"],
+            corpus_chunk_size=len(data),
+            show_progress_bar=True,
+            batch_size=1024,
+            max_pairs=len(data) ** 2,
+        )
+
+        # Process results
+        df_pd = pd.DataFrame(paraphrases)
+        df = pl.from_pandas(df_pd)
+        df = df.rename({"0": "score", "1": "sentence_1", "2": "sentence_2"})
+
+        union_df = pl.DataFrame(data.to_pandas())
+        original_columns = original_df.columns.tolist()
+
+        # Add additional columns if present
+        additional_cols = []
+        for col in original_columns:
+            if col != "text":
+                additional_cols.extend([
+                    union_df.select(pl.col(col)).to_series()[df["sentence_1"].cast(pl.Int32)].alias(f"{col}_1"),
+                    union_df.select(pl.col(col)).to_series()[df["sentence_2"].cast(pl.Int32)].alias(f"{col}_2")
+                ])
+
+        # Process final results
+        df = df.with_columns([
+            pl.col("score").round(3).cast(pl.Float32),
+            union_df.select(pl.col("text")).to_series()[df["sentence_1"].cast(pl.Int32)].alias("sentence_1"),
+            union_df.select(pl.col("text")).to_series()[df["sentence_2"].cast(pl.Int32)].alias("sentence_2"),
+            *additional_cols
+        ]).filter(pl.col("score") > score).sort(["score"], descending=True)
+
+        elapsed_time = time.time() - st
+        logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
+        logger.info(f'Found {len(df)} paraphrases above score threshold {score}')
+
+        return df
+
+    except Exception as e:
+        logger.error(f"Error in mining process: {str(e)}")
+        return None
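For reference, `paraphrase_mining` returns a list of `[score, i, j]` entries, where `i` and `j` index into the input sentence list; that is why the code above maps `sentence_1`/`sentence_2` back through the original rows. A minimal standalone sketch (the model name and sentences are illustrative):

```python
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import paraphrase_mining

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
corpus = [
    "The cat sits on the mat.",
    "A cat is sitting on the mat.",
    "Stock prices fell sharply today.",
]

# Each entry is [score, i, j]: a similarity score plus two corpus indices.
for score, i, j in paraphrase_mining(model, corpus):
    print(f"{score:.3f}  {corpus[i]!r}  <->  {corpus[j]!r}")
```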
requirements.txt
CHANGED
@@ -1,8 +1,10 @@
-transformers
-torch
-pandas
-polars
-datasets
-sentence-transformers[openvino,onnx-gpu,onnx]
-multiprocess
-gradio
+transformers>=4.36.0
+torch>=2.1.0
+pandas>=2.1.0
+polars>=0.20.0
+datasets>=2.14.0
+sentence-transformers[openvino,onnx-gpu,onnx]>=2.2.0
+multiprocess>=0.70.15
+gradio>=4.12.0
+numpy>=1.24.0
+tqdm>=4.66.0
sts.py
CHANGED
@@ -2,54 +2,111 @@ import time
 import pandas as pd
 import polars as pl
 import torch
+import logging
 from datasets import Dataset
 from sentence_transformers import SentenceTransformer
+from typing import Optional

+logger = logging.getLogger(__name__)

-def sts(modelname, data1, data2, score):
+def sts(modelname: str, data1: str, data2: str, score: float) -> Optional[pl.DataFrame]:
+    """
+    Calculate semantic textual similarity between two sets of sentences.
+
+    Args:
+        modelname: Name of the model to use
+        data1: Path to first input CSV file
+        data2: Path to second input CSV file
+        score: Minimum similarity score threshold
+
+    Returns:
+        Optional[pl.DataFrame]: DataFrame with similarity results or None if error occurs
+    """
+    try:
+        st = time.time()
+
+        # Initialize model
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {device}")
+
+        model = SentenceTransformer(
+            modelname,
+            device=device,
+            trust_remote_code=True,
+        )
+
+        # Read and validate input data
+        sentences1 = Dataset.from_pandas(pd.read_csv(data1, on_bad_lines='skip', header=0, sep="\t"))
+        sentences2 = Dataset.from_pandas(pd.read_csv(data2, on_bad_lines='skip', header=0, sep="\t"))
+
+        if sentences1.num_rows == 0 or sentences2.num_rows == 0:
+            logger.error("Empty input data found")
+            return None
+
+        # Generate embeddings
+        logger.info("Generating embeddings for first set...")
+        embeddings1 = model.encode(
+            sentences1["text"],
+            normalize_embeddings=True,
+            batch_size=1024,
+            show_progress_bar=True
+        )
+
+        logger.info("Generating embeddings for second set...")
+        embeddings2 = model.encode(
+            sentences2["text"],
+            normalize_embeddings=True,
+            batch_size=1024,
+            show_progress_bar=True
+        )
+
+        # Calculate similarity matrix
+        logger.info("Calculating similarity matrix...")
+        similarity_matrix = model.similarity(embeddings1, embeddings2)
+
+        # Process results
+        df_pd = pd.DataFrame(similarity_matrix)
+        dfi = df_pd.__dataframe__()
+        df = pl.from_dataframe(dfi)
+
+        # Transform matrix to long format
+        df_matrix_with_index = df.with_row_index(name="row_index").with_columns(
+            pl.col("row_index").cast(pl.UInt64)
+        )
+        df_long = df_matrix_with_index.unpivot(
+            index="row_index",
+            variable_name="column_index",
+            value_name="score"
+        ).with_columns(pl.col("column_index").cast(pl.UInt64))
+
+        # Join with original text
+        df_sentences1 = pl.DataFrame(sentences1.to_pandas()).with_row_index(name="row_index").with_columns(
+            pl.col("row_index").cast(pl.UInt64)
+        )
+        df_sentences2 = pl.DataFrame(sentences2.to_pandas()).with_row_index(name="column_index").with_columns(
+            pl.col("column_index").cast(pl.UInt64)
+        )
+
+        # Process final results
+        df_long = (df_long
+                   .with_columns([pl.col("score").round(4).cast(pl.Float32)])
+                   .join(df_sentences1, on="row_index")
+                   .join(df_sentences2, on="column_index"))
+
+        df_long = df_long.rename({
+            "text": "sentences1",
+            "text_right": "sentences2",
+        }).drop(["row_index", "column_index"])
+
+        # Filter and sort results
+        result_df = df_long.filter(pl.col("score") > score).sort(["score"], descending=True)
+
+        elapsed_time = time.time() - st
+        logger.info(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}')
+        logger.info(f'Found {len(result_df)} pairs above score threshold {score}')
+
+        return result_df
+
+    except Exception as e:
+        logger.error(f"Error in STS process: {str(e)}")
+        return None
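`model.similarity` (available in recent sentence-transformers releases, 3.x) returns a dense `len(set1) x len(set2)` score matrix; the unpivot-and-join steps above only flatten that matrix into one row per sentence pair. A minimal sketch of the core computation, with illustrative inputs:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

set1 = ["The weather is lovely today.", "He drove to the stadium."]
set2 = ["It's sunny outside.", "She walked to the park."]

emb1 = model.encode(set1, normalize_embeddings=True)
emb2 = model.encode(set2, normalize_embeddings=True)

# Dense matrix: scores[i][j] is the similarity of set1[i] and set2[j].
scores = model.similarity(emb1, emb2)
for i, row in enumerate(scores):
    for j, s in enumerate(row):
        print(f"{float(s):.3f}  {set1[i]!r}  <->  {set2[j]!r}")
```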
utils.py
CHANGED
@@ -4,22 +4,159 @@ import shutil
 import pandas as pd
 import polars as pl
 import time
+import logging
+from typing import Optional, Tuple

+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)

+def validate_csv_structure(df: pd.DataFrame) -> Tuple[bool, str]:
+    """
+    Validate the structure of the DataFrame.
+
+    Args:
+        df: DataFrame to validate
+
+    Returns:
+        Tuple[bool, str]: (is_valid, error_message)
+    """
+    # Check if DataFrame is empty
+    if df.empty:
+        return False, "CSV file is empty"
+
+    # Check required columns
+    required_columns = ['_id', 'text']
+    missing_columns = [col for col in required_columns if col not in df.columns]
+    if missing_columns:
+        return False, f"Missing required columns: {', '.join(missing_columns)}"
+
+    # Validate _id column
+    if df['_id'].isna().any():
+        return False, "Found empty _id values"
+
+    # Validate text column
+    if df['text'].isna().any():
+        return False, "Found empty text values"
+
+    # Check for duplicate _id values
+    if df['_id'].duplicated().any():
+        return False, "Found duplicate _id values"
+
+    return True, ""
+
+def getDataFrame(path: str) -> Optional[pl.DataFrame]:
+    """
+    Read and validate CSV file into a DataFrame.
+
+    Args:
+        path: Path to the CSV file
+
+    Returns:
+        Optional[pl.DataFrame]: The validated DataFrame or None if validation fails
+    """
+    try:
+        # Read CSV with tab separator
+        data = pd.read_csv(
+            path,
+            sep="\t",
+            header=0,
+            on_bad_lines='warn',
+            encoding='utf-8'
+        )
+
+        # Validate structure
+        is_valid, error_message = validate_csv_structure(data)
+        if not is_valid:
+            logger.error(error_message)
+            return None
+
+        # Clean text column
+        data['text'] = data['text'].astype(str).str.strip()
+        data = data[data['text'].str.len() > 0]
+
+        if data.empty:
+            logger.error("No valid text data found after cleaning")
+            return None
+
+        # Convert to Polars DataFrame
+        pl_df = pl.from_pandas(data)
+        logger.info(f"Successfully loaded {len(pl_df)} rows from CSV")
+
+        return pl_df
+
+    except pd.errors.EmptyDataError:
+        logger.error("CSV file is empty")
+        return None
+    except pd.errors.ParserError as e:
+        logger.error(f"Error parsing CSV file: {str(e)}")
+        return None
+    except Exception as e:
+        logger.error(f"Unexpected error reading CSV: {str(e)}")
+        return None
+
+def save_to_csv(dataframe: pl.DataFrame) -> Optional[str]:
+    """
+    Save DataFrame to CSV file.
+
+    Args:
+        dataframe: Polars DataFrame to save
+
+    Returns:
+        Optional[str]: Path to saved file or None if save fails
+    """
+    try:
+        if dataframe is None or dataframe.is_empty():
+            logger.warning("No data to save")
+            return None
+
+        # Create data directory if it doesn't exist
+        folder_path = "data"
         os.makedirs(folder_path, exist_ok=True)
+
+        # Generate unique filename with timestamp
+        timestamp = int(time.time())
+        csv_path = f"{folder_path}/results_{timestamp}.csv"
+
+        # Save to CSV with tab separator
         dataframe.write_csv(csv_path, separator="\t")
+        logger.info(f"Results saved to {csv_path}")
+
         return csv_path
+
+    except Exception as e:
+        logger.error(f"Error saving results: {str(e)}")
+        return None

-def delete_folder_periodically(path, interval=3600):
+def delete_folder_periodically(path: str, interval: int = 3600) -> None:
+    """
+    Periodically clean up the data folder.
+
+    Args:
+        path: Path to folder to clean
+        interval: Interval between cleanups in seconds
+    """
     while True:
+        try:
+            if os.path.exists(path):
+                # Get current time
+                current_time = time.time()
+
+                # Check each file in the directory
+                for filename in os.listdir(path):
+                    file_path = os.path.join(path, filename)
+                    if os.path.isfile(file_path):
+                        # Check file age
+                        file_age = current_time - os.path.getmtime(file_path)
+                        if file_age > interval:
+                            os.remove(file_path)
+                            logger.info(f"Deleted old file: {file_path}")
+
+            time.sleep(interval)
+
+        except Exception as e:
+            logger.error(f"Error in cleanup task: {str(e)}")
+            time.sleep(interval)
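A quick sanity check for the new validator, assuming `validate_csv_structure` is importable from the `utils.py` committed above:

```python
import pandas as pd
from utils import validate_csv_structure  # assumes utils.py from this commit

ok = pd.DataFrame({"_id": [1, 2], "text": ["alpha", "beta"]})
dup = pd.DataFrame({"_id": [1, 1], "text": ["alpha", "beta"]})

# Well-formed input passes; duplicate _id values are rejected with a message.
assert validate_csv_structure(ok) == (True, "")
assert validate_csv_structure(dup) == (False, "Found duplicate _id values")
print("validate_csv_structure behaves as expected")
```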