|
import logging
import multiprocessing
import threading
from typing import Callable

import gradio as gr

from mining import mining
from sts import sts
from utils import getDataFrame, save_to_csv, delete_folder_periodically
|
|
|
|
|
# Configure logging for the whole process: INFO and above, with timestamp,
# logger name and level prepended to every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
# Concurrency limit passed to the CSV upload event handlers.
CONCURRENCY_LIMIT = 5

# Markdown blurb shown under the model dropdown, keyed by model identifier.
MODEL_DESCRIPTIONS = {
    "Lajavaness/bilingual-embedding-large": "Multilingual model optimized for multiple languages. [More info](https://huggingface.co/Lajavaness/bilingual-embedding-large)",
    "sentence-transformers/all-mpnet-base-v2": "High-quality general-purpose model. [More info](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)",
    "intfloat/multilingual-e5-large-instruct": "Multilingual model with instructions. [More info](https://huggingface.co/intfloat/multilingual-e5-large-instruct)",
}

# Selectable models, in dropdown order (dicts preserve insertion order); the
# first entry is the default selection.
AVAILABLE_MODELS = list(MODEL_DESCRIPTIONS)
|
|
|
def _make_status_notifier(success_message: str, failure_message: str) -> Callable[[object], None]:
    """Build a follow-up event handler that reports the outcome of the previous step.

    Args:
        success_message: Text shown via ``gr.Info`` when the handler receives
            a value other than ``None``.
        failure_message: Text of the ``gr.Error`` raised when the handler
            receives ``None``.

    Returns:
        A one-argument callable suitable for ``.then(fn=..., outputs=[])``.
    """

    def _notify(result: object) -> None:
        if result is None:
            # gr.Error is an exception class: it only reaches the UI when it
            # is raised. Merely constructing it (as a bare expression) is a
            # silent no-op.
            raise gr.Error(failure_message)
        gr.Info(success_message)

    return _notify


def _describe_model(model_name: str) -> str:
    """Return the Markdown description for ``model_name`` ("" if it is unknown)."""
    return MODEL_DESCRIPTIONS.get(model_name, "")


def _sentence_upload(heading: str, button_label: str, table_label: str) -> "gr.UploadButton":
    """Add a CSV upload button and a read-only preview table to the current layout.

    Must be called inside an active Gradio layout context. On upload, the
    file is passed to ``getDataFrame`` and its result fills the preview table.

    Args:
        heading: Markdown heading rendered above the button.
        button_label: Label of the upload button.
        table_label: Label of the preview table.

    Returns:
        The upload button, to be used as an input of the processing step.
    """
    gr.Markdown(heading)
    upload_button = gr.UploadButton(
        label=button_label,
        file_types=['.csv'],
        file_count="single",
        variant="primary",
    )
    preview = gr.Dataframe(
        headers=["_id", "text"],
        col_count=2,
        label=table_label,
        interactive=False,
    )
    upload_button.upload(
        fn=getDataFrame,
        inputs=upload_button,
        outputs=preview,
        concurrency_limit=CONCURRENCY_LIMIT,
    )
    return upload_button


def _model_controls() -> tuple:
    """Add the model dropdown, its description, the threshold slider and the Process button.

    Must be called inside an active Gradio layout context.

    Returns:
        ``(model_dropdown, threshold_slider, process_button)``.
    """
    model = gr.Dropdown(
        choices=AVAILABLE_MODELS,
        label="Select Model",
        value=AVAILABLE_MODELS[0],
        interactive=True,
    )
    model_description = gr.Markdown(_describe_model(AVAILABLE_MODELS[0]))
    # Keep the description in sync with the selected model.
    model.change(
        fn=_describe_model,
        inputs=model,
        outputs=model_description,
    )

    threshold = gr.Slider(
        minimum=0.0,
        maximum=1.0,
        value=0.96,
        step=0.01,
        label="Similarity Threshold",
        interactive=True,
    )
    process_button = gr.Button("Process", variant="primary")
    return model, threshold, process_button


def _results_panel(headers: list, label: str, process_button, process_fn, process_inputs: list) -> None:
    """Add the results table and CSV download controls, and wire the processing events.

    Must be called inside an active Gradio layout context.

    Args:
        headers: Column headers of the results table.
        label: Label of the results table.
        process_button: Button whose click triggers ``process_fn``.
        process_fn: Callable run on click; its return value fills the table.
        process_inputs: Components whose values are passed to ``process_fn``.
    """
    results = gr.Dataframe(
        headers=headers,
        type="polars",
        label=label,
    )

    process_button.click(
        fn=process_fn,
        inputs=process_inputs,
        outputs=results,
    ).then(
        fn=_make_status_notifier(
            "Processing completed successfully!",
            "Error processing data. Please check the logs for details.",
        ),
        inputs=[results],
        outputs=[],
    )

    download_button = gr.Button("Download Results as CSV", variant="secondary")
    download_file = gr.File(label="Downloadable File")

    download_button.click(
        fn=save_to_csv,
        inputs=results,
        outputs=download_file,
    ).then(
        fn=_make_status_notifier(
            "Results saved successfully!",
            "Error saving results. Please check the logs for details.",
        ),
        inputs=[download_file],
        outputs=[],
    )


def create_interface():
    """Build the two-tab Gradio application and return it without launching.

    The "Paraphrase Mining" tab feeds the selected model, one uploaded CSV and
    the similarity threshold to ``mining``. The "Semantic Textual Similarity"
    tab feeds the selected model, two uploaded CSVs and the threshold to
    ``sts``. Each tab shows the result in a table that can be exported through
    ``save_to_csv``.

    Returns:
        The assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks(title="Sentence Transformers Demo") as demo:
        gr.Markdown("# Sentence Transformers Demo")
        gr.Markdown("This application provides two main functionalities: Paraphrase Mining and Semantic Textual Similarity (STS).")

        with gr.Tab("Paraphrase Mining"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        "### Paraphrase Mining\n"
                        "Find paraphrases (texts with identical/similar meaning) in a large corpus of sentences.\n"
                        "Upload a CSV file containing your sentences and select a model to begin."
                    )

            with gr.Row():
                with gr.Column():
                    upload_button_sentences = _sentence_upload(
                        "#### Input Sentences",
                        "Upload Sentences CSV",
                        "Sentences Data",
                    )

            with gr.Row():
                with gr.Column():
                    model_mining, score_mining, submit_button_mining = _model_controls()

            with gr.Row():
                with gr.Column():
                    _results_panel(
                        headers=["score", "sentence_1", "sentence_2"],
                        label="Mining Results",
                        process_button=submit_button_mining,
                        process_fn=mining,
                        process_inputs=[model_mining, upload_button_sentences, score_mining],
                    )

        with gr.Tab("Semantic Textual Similarity"):
            with gr.Row():
                with gr.Column():
                    gr.Markdown(
                        "### Semantic Textual Similarity (STS)\n"
                        "Calculate semantic similarity between two sets of sentences.\n"
                        "Upload two CSV files containing your sentences and select a model to begin."
                    )

            # The two sentence sets sit side by side in one row.
            with gr.Row():
                with gr.Column():
                    upload_button_sentences1 = _sentence_upload(
                        "#### First Set of Sentences",
                        "Upload First Set CSV",
                        "First Set Data",
                    )

                with gr.Column():
                    upload_button_sentences2 = _sentence_upload(
                        "#### Second Set of Sentences",
                        "Upload Second Set CSV",
                        "Second Set Data",
                    )

            with gr.Row():
                with gr.Column():
                    model_sts, score_sts, submit_button_sts = _model_controls()

            with gr.Row():
                with gr.Column():
                    _results_panel(
                        headers=["score", "sentences1", "sentences2"],
                        label="Similarity Results",
                        process_button=submit_button_sts,
                        process_fn=sts,
                        process_inputs=[
                            model_sts,
                            upload_button_sentences1,
                            upload_button_sentences2,
                            score_sts,
                        ],
                    )

    return demo
|
|
|
if __name__ == "__main__":
    # Script entry point: start the background cleanup thread, then serve the UI.
    try:
        multiprocessing.set_start_method("spawn")

        # Run the cleanup helper from utils in a daemon thread so it never
        # blocks interpreter shutdown.
        # NOTE(review): presumably clears the "data" folder every 1800
        # seconds — confirm against utils.delete_folder_periodically.
        folder_path = "data"
        cleanup_thread = threading.Thread(
            target=delete_folder_periodically,
            args=(folder_path, 1800),
            daemon=True,
        )
        cleanup_thread.start()

        # Listen on all interfaces, port 7860; no public share link and the
        # API docs are hidden.
        launch_options = {
            "share": False,
            "server_name": "0.0.0.0",
            "server_port": 7860,
            "show_error": True,
            "show_api": False,
        }
        demo = create_interface()
        demo.launch(**launch_options)
    except Exception as e:
        # Record the startup failure, then let it propagate.
        logger.error(f"Error starting application: {str(e)}")
        raise
|
|