Files changed (4)
  1. app.py +196 -146
  2. concatenated_output.csv +1 -1
  3. utilities.py +97 -1
  4. utils_get_db_tables_info.py +12 -6
app.py CHANGED
@@ -12,6 +12,7 @@ import plotly.colors as pc
12
  from qatch.connectors.sqlite_connector import SqliteConnector
13
  from qatch.generate_dataset.orchestrator_generator import OrchestratorGenerator
14
  from qatch.evaluate_dataset.orchestrator_evaluator import OrchestratorEvaluator
 
15
  from prediction import ModelPrediction
16
  import utils_get_db_tables_info
17
  import utilities as us
@@ -31,7 +32,6 @@ import utilities as us
31
  #pnp_path = os.path.join("data", "evaluation_p_np_metrics.csv")
32
  pnp_path = "concatenated_output.csv"
33
  PATH_PKL_TABLES = 'tables_dict_beaver.pkl'
34
-
35
  js_func = """
36
  function refresh() {
37
  const url = new URL(window.location);
@@ -42,7 +42,8 @@ function refresh() {
42
  }
43
  }
44
  """
45
- reset_flag=False
 
46
 
47
  with open('style.css', 'r') as file:
48
  css = file.read()
@@ -65,6 +66,8 @@ description = """## 📊 Comparison of Proprietary and Non-Proprietary Databases
65
  ### ➀ **Non-Proprietary**
66
  ###     β‡’ Spider 1.0 πŸ•·οΈ"""
67
  prompt_default = "Translate the following question in SQL code to be executed over the database to fetch the answer.\nReturn the sql code in ```sql ```\nQuestion\n{question}\nDatabase Schema\n{db_schema}\n"
 
 
68
 
69
  input_data = {
70
  'input_method': "",
@@ -93,6 +96,7 @@ def load_data(file, path, use_default):
93
  #change path
94
  input_data["data_path"] = os.path.join(".", f"{input_data['db_name']}.sqlite")
95
  input_data["data"] = us.load_data(file, input_data["db_name"])
 
96
  df_current = input_data["data"]['data_frames'].get('MyTable', df_default) # Load the DataFrame
97
  if(input_data["data"]['data_frames'] and input_data["data"]["db"] is None): #for csv and xlsx files
98
  table2primary_key = {}
@@ -317,7 +321,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
317
 
318
  # Model selection button (initially disabled)
319
  open_model_selection = gr.Button("Choose your models", interactive=False)
320
-
321
  def update_table_list(data):
322
  """Dynamically updates the list of available tables and excluded ones."""
323
  if isinstance(data, dict) and data:
@@ -458,9 +461,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
458
  default_checkbox
459
  ]
460
  )
461
-
462
  reset_data.click(open_accordion, inputs=gr.State("reset"), outputs=[upload_acc, select_table_acc, select_model_acc, qatch_acc, metrics_acc, default_checkbox, file_input])
463
-
464
 
465
  ####################################
466
  # MODEL SELECTION PART #
@@ -506,10 +507,9 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
506
  # Function to get selected models
507
  def get_selected_models(*model_selections):
508
  selected_models = [model for model, selected in zip(model_list, model_selections) if selected]
509
-
510
  input_data['models'] = selected_models
511
  button_state = bool(selected_models and '{db_schema}' in input_data["prompt"] and '{question}' in input_data["prompt"])
512
- return selected_models, gr.update(open=True, visible=True), gr.update(interactive=button_state)
513
 
514
  # Add the Textbox to the interface
515
  prompt = gr.TextArea(
@@ -517,17 +517,19 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
517
  placeholder=prompt_default,
518
  elem_id="custom-textarea"
519
  )
 
520
  warning_prompt = gr.Markdown(value="## Error in the prompt format", visible=False)
521
 
522
  # Submit button (initially disabled)
523
-
524
- submit_models_button = gr.Button("Submit Models", interactive=False)
 
525
 
526
  def check_prompt(prompt):
527
  #TODO
528
  missing_elements = []
529
  if(prompt==""):
530
- input_data["prompt"]=prompt_default
531
  button_state = bool(len(input_data['models']) > 0 and '{db_schema}' in input_data["prompt"] and '{question}' in input_data["prompt"])
532
  else:
533
  input_data["prompt"]=prompt
@@ -544,18 +546,18 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
544
  ), gr.update(interactive=button_state)
545
  return gr.update(visible=False), gr.update(interactive=button_state)
546
 
547
- prompt.change(fn=check_prompt, inputs=[prompt], outputs=[warning_prompt, submit_models_button])
548
  # Link checkboxes to selection events
549
  for checkbox in model_checkboxes:
550
  checkbox.change(
551
  fn=get_selected_models,
552
  inputs=model_checkboxes,
553
- outputs=[selected_models_output, select_model_acc, submit_models_button]
554
  )
555
  prompt.change(
556
  fn=get_selected_models,
557
  inputs=model_checkboxes,
558
- outputs=[selected_models_output, select_model_acc, submit_models_button]
559
  )
560
 
561
  submit_models_button.click(
@@ -564,6 +566,17 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
564
  outputs=[selected_models_output, select_model_acc, qatch_acc]
565
  )
566
 
 
 
567
  def enable_disable(enable):
568
  return (
569
  *[gr.update(interactive=enable) for _ in model_checkboxes],
@@ -574,6 +587,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
574
  gr.update(interactive=enable),
575
  gr.update(interactive=enable),
576
  *[gr.update(interactive=enable) for _ in table_outputs],
 
577
  gr.update(interactive=enable)
578
  )
579
 
@@ -591,7 +605,24 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
591
  default_checkbox,
592
  table_selector,
593
  *table_outputs,
594
- open_model_selection
 
 
595
  ]
596
  )
597
 
@@ -609,7 +640,8 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
609
  default_checkbox,
610
  table_selector,
611
  *table_outputs,
612
- open_model_selection
 
613
  ]
614
  )
615
 
@@ -660,9 +692,10 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
660
  {mirrored_symbols}
661
  </div>
662
  """
663
- def qatch_flow():
664
- #caching
665
  global reset_flag
 
666
  predictions_dict = {model: pd.DataFrame(columns=['id', 'question', 'predicted_sql', 'time', 'query', 'db_path']) for model in model_list}
667
  metrics_conc = pd.DataFrame()
668
  columns_to_visulize = ["db_path", "tbl_name", "test_category", "sql_tag", "query", "question", "predicted_sql", "time", "price", "answer"]
@@ -692,7 +725,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
692
  </div>
693
  """
694
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
695
- #time.sleep(0.02)
696
  prediction = row['predicted_sql']
697
 
698
  display_prediction = f"""<div class='loading' style='font-size: 1.7rem; font-family: 'Inter', sans-serif;'>Predicted SQL:</div>
@@ -700,22 +733,25 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
700
  <div style='font-size: 3rem'>➡️</div>
701
  <div class='sqlquery' font-family: 'Inter', sans-serif;>{prediction}</div>
702
  </div>
703
- """
 
704
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
705
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
706
  metrics_conc = target_df
707
- if 'valid_efficiency_score' not in metrics_conc.columns:
708
- metrics_conc['valid_efficiency_score'] = metrics_conc['VES']
709
  eval_text = generate_eval_text("End evaluation")
710
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
 
711
  else:
712
-
713
  orchestrator_generator = OrchestratorGenerator()
714
- # TODO: add to target_df column target_df["columns_used"], tables selection
715
- # print(input_data['data']['db'])
716
- #print(input_data['data']['selected_tables'])
717
- target_df = orchestrator_generator.generate_dataset(connector=input_data['data']['db'], tables_to_include=input_data['data']['selected_tables'])
718
- #target_df = orchestrator_generator.generate_dataset(connector=input_data['data']['db'], tables_to_includes=None)
 
 
719
 
720
  predictor = ModelPrediction()
721
  reset_flag = False
@@ -736,15 +772,18 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
736
  </div>
737
  """
738
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model]for model in model_list]
739
- start_time = time.time()
740
- samples = us.generate_some_samples(input_data["data_path"], row["tbl_name"])
741
-
742
- schema_text = utils_get_db_tables_info.utils_extract_db_schema_as_string(
743
- db_id = input_data["db_name"],
744
- base_path = input_data["data_path"],
745
- normalize=False,
746
- sql=row["query"],
747
- get_insert_into=True
 
 
 
748
  )
749
 
750
  #prompt_to_send = us.prepare_prompt(input_data["prompt"], question, schema_text, samples)
@@ -752,19 +791,27 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
752
  #PREDICTION SQL
753
 
754
  # TODO add button for QA or SP and pass to .make_prediction parameter TASK
 
 
 
755
  response = predictor.make_prediction(
756
  question=question,
757
- db_schema=schema_text,
758
  model_name=model,
759
  prompt=f"{prompt_to_send}",
760
- task="SP" # TODO change accordingly
761
  )
762
  prediction = response['response_parsed']
763
  price = response['cost']
764
  answer = response['response']
765
 
766
  end_time = time.time()
767
- display_prediction = f"""<div class='loading' style='font-size: 1.7rem; font-family: 'Inter', sans-serif;'>Predicted SQL:</div>
 
 
768
  <div style='display: flex; align-items: center;'>
769
  <div style='font-size: 3rem'>➡️</div>
770
  <div class='sqlquery' font-family: 'Inter', sans-serif;>{prediction}</div>
@@ -779,40 +826,47 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
779
  'query': row["query"],
780
  'db_path': input_data["data_path"],
781
  'price':price,
782
- 'answer':answer,
783
  'number_question':count,
784
- 'prompt': prompt_to_send
 
785
  }]).dropna(how="all") # Remove only completely empty rows
786
  count=count+1
787
  # TODO: use a for loop
 
 
788
  for col in target_df.columns:
789
  if col not in new_row.columns:
790
  new_row[col] = row[col]
791
-
792
  # Update model's prediction dataframe incrementally
793
  if not new_row.empty:
794
  predictions_dict[model] = pd.concat([predictions_dict[model], new_row], ignore_index=True)
795
 
796
  # yield gr.Textbox(), gr.Textbox(prediction), *[predictions_dict[model] for model in input_data["models"]], None
797
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model]for model in model_list]
798
-
799
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
800
  # END
801
  eval_text = generate_eval_text("Evaluation")
802
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
 
803
  evaluator = OrchestratorEvaluator()
 
804
  for model in input_data["models"]:
805
- metrics_df_model = evaluator.evaluate_df(
806
- df=predictions_dict[model],
807
- target_col_name="query",
808
- prediction_col_name="predicted_sql",
809
- db_path_name="db_path"
810
- )
 
 
 
811
  metrics_df_model['model'] = model
812
  metrics_conc = pd.concat([metrics_conc, metrics_df_model], ignore_index=True)
813
-
814
- if 'valid_efficiency_score' not in metrics_conc.columns:
815
- metrics_conc['valid_efficiency_score'] = metrics_conc['VES']
 
816
  eval_text = generate_eval_text("End evaluation")
817
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
818
 
@@ -848,6 +902,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
848
  gr.Markdown(f"**Results for {model}**")
849
  tab_dict[model] = tab
850
  dataframe_per_model[model] = gr.DataFrame()
 
851
  # download_pred_model = gr.DownloadButton(label="Download Prediction per Model", visible=False)
852
 
853
  evaluation_loading = gr.Markdown()
@@ -860,13 +915,24 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
860
  inputs=[],
861
  outputs=[tab_dict[model] for model in model_list] # Update TabItem visibility
862
  )
 
 
863
 
864
  selected_models_display = gr.JSON(label="Final input data", visible=False)
865
  metrics_df = gr.DataFrame(visible=False)
866
  metrics_df_out = gr.DataFrame(visible=False)
867
 
868
  submit_models_button.click(
869
- fn=qatch_flow,
 
 
870
  inputs=[],
871
  outputs=[evaluation_loading, model_logo, variable, question_display, prediction_display, metrics_df] + list(dataframe_per_model.values())
872
  )
@@ -875,6 +941,10 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
875
  fn=lambda: gr.update(value=input_data),
876
  outputs=[selected_models_display]
877
  )
 
 
878
 
879
  # Works for METRICS
880
  metrics_df.change(fn=change_text, inputs=[metrics_df], outputs=[metrics_df_out])
@@ -897,10 +967,16 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
897
  fn=lambda: gr.update(visible=False),
898
  outputs=[download_metrics]
899
  )
 
 
900
 
901
  def refresh():
902
  global reset_flag
 
903
  reset_flag = True
 
904
 
905
  reset_data = gr.Button("Back to upload data section", interactive=True)
906
 
@@ -926,10 +1002,12 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
926
  default_checkbox,
927
  table_selector,
928
  *table_outputs,
929
- open_model_selection
 
930
  ]
931
  )
932
-
 
933
  ##########################################
934
  # METRICS VISUALIZATION SECTION #
935
  ##########################################
@@ -944,8 +1022,9 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
944
  ####################################
945
 
946
  def load_data_csv_es():
947
-
948
  if input_data["input_method"]=="default":
 
949
  df = pd.read_csv(pnp_path)
950
  df = df[df['model'].isin(input_data["models"])]
951
  df = df[df['tbl_name'].isin(input_data["data"]["selected_tables"])]
@@ -956,6 +1035,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
956
  df['model'] = df['model'].replace('llama-70', 'Llama-70B')
957
  df['model'] = df['model'].replace('llama-8', 'Llama-8B')
958
  df['test_category'] = df['test_category'].replace('many-to-many-generator', 'MANY-TO-MANY')
 
959
  return df
960
  return metrics_df_out
961
 
@@ -998,20 +1078,21 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
998
 
999
  DB_CATEGORY_COLORS = generate_db_category_colors()
1000
 
1001
- def normalize_valid_efficiency_score(df):
1002
- #TODO valid_efficiency_score
1003
- #print(df['valid_efficiency_score'])
1004
- df['valid_efficiency_score'] = df['valid_efficiency_score'].replace([np.nan, ''], 0)
1005
- df['valid_efficiency_score'] = df['valid_efficiency_score'].astype(int)
1006
- min_val = df['valid_efficiency_score'].min()
1007
- max_val = df['valid_efficiency_score'].max()
1008
 
1009
- if min_val == max_val:
1010
- # All values are equal: assign 1.0 to everything to avoid division by zero
1011
- df['valid_efficiency_score'] = 1.0
 
 
 
1012
  else:
1013
- df['valid_efficiency_score'] = (
1014
- df['valid_efficiency_score'] - min_val
1015
  ) / (max_val - min_val)
1016
 
1017
  return df
@@ -1024,7 +1105,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1024
  # BAR CHART FOR AVERAGE METRICS WITH UPDATE FUNCTION
1025
  def plot_metric(df, radio_metric, qatch_selected_metrics, external_selected_metric, group_by, selected_models):
1026
  df = df[df['model'].isin(selected_models)]
1027
- df = normalize_valid_efficiency_score(df)
1028
 
1029
  # Map human-readable names -> internal names
1030
  qatch_selected_internal = [qatch_metrics_dict[label] for label in qatch_selected_metrics]
@@ -1141,7 +1222,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1141
  selected_models = [selected_models]
1142
 
1143
  df = df[df['model'].isin(selected_models)]
1144
- df = normalize_valid_efficiency_score(df)
1145
 
1146
  # Converti nomi leggibili -> tecnici
1147
  qatch_selected_internal = [qatch_metrics_dict[label] for label in qatch_selected_metrics]
@@ -1226,54 +1307,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1226
  )
1227
 
1228
  return gr.Plot(fig, visible=True)
1229
-
1230
- """
1231
- def plot_metric_propietary(df, radio_metric, qatch_selected_metrics, external_selected_metric, selected_models):
1232
- if selected_models == "All":
1233
- selected_models = models
1234
- else:
1235
- selected_models = [selected_models]
1236
-
1237
- df = df[df['model'].isin(selected_models)]
1238
- df = normalize_valid_efficiency_score(df)
1239
-
1240
- if radio_metric == "Qatch":
1241
- selected_metrics = qatch_selected_metrics
1242
- else:
1243
- selected_metrics = external_selected_metric
1244
-
1245
- df = calculate_average_metrics(df, selected_metrics)
1246
-
1247
- # Group by model and category
1248
- avg_metrics = df.groupby(["model", "db_category"])['avg_metric'].mean().reset_index()
1249
- avg_metrics['text_label'] = avg_metrics['avg_metric'].apply(lambda x: f'{x:.2f}')
1250
-
1251
- # Horizontal bar plot with the model on the Y axis
1252
- fig = px.bar(
1253
- avg_metrics,
1254
- x='avg_metric',
1255
- y='model',
1256
- color='db_category', # category as color
1257
- text='text_label',
1258
- barmode='group',
1259
- orientation='h',
1260
- color_discrete_map=DB_CATEGORY_COLORS, # this dict must exist, analogous to MODEL_COLORS
1261
- title='Average metric per model and db_category 📊',
1262
- labels={'avg_metric': 'AVG Metric', 'model': 'Model'},
1263
- template='plotly_dark'
1264
- )
1265
-
1266
- fig.update_traces(textposition='outside', textfont_size=10)
1267
- fig.update_layout(
1268
- margin=dict(t=80),
1269
- yaxis=dict(title=''),
1270
- xaxis=dict(title='AVG Metrics'),
1271
- legend_title='DB Name',
1272
- height=600 # increase if there are many models
1273
- )
1274
-
1275
- return gr.Plot(fig, visible=True)
1276
- """
1277
 
1278
  def update_plot_propietary(radio_metric, qatch_selected_metrics, external_selected_metric, selected_models):
1279
  df = load_data_csv_es()
@@ -1289,7 +1322,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1289
  df = df[df['db_category'].isin(target_cats)]
1290
  df = df[df['model'].isin(selected_models)]
1291
 
1292
- df = normalize_valid_efficiency_score(df)
1293
  df = calculate_average_metrics(df, qatch_metrics)
1294
 
1295
  # Compute the average per db_category and model
@@ -1410,14 +1443,14 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1410
 
1411
  # RADAR OR BAR CHART BASED ON CATEGORY COUNT
1412
  def plot_radar(df, selected_models, selected_metrics, selected_categories):
1413
- if "external" in selected_metrics:
1414
- selected_metrics = ["execution_accuracy", "valid_efficiency_score"]
1415
  else:
1416
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1417
 
1418
  # Filter models and normalize
1419
  df = df[df['model'].isin(selected_models)]
1420
- df = normalize_valid_efficiency_score(df)
1421
  df = calculate_average_metrics(df, selected_metrics)
1422
 
1423
  avg_metrics = df.groupby(['model', 'test_category'])['avg_metric'].mean().reset_index()
@@ -1574,13 +1607,13 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1574
 
1575
  # RADAR OR BAR CHART FOR SUB-CATEGORIES BASED ON CATEGORY COUNT
1576
  def plot_radar_sub(df, selected_models, selected_metrics, selected_category):
1577
- if "external" in selected_metrics:
1578
- selected_metrics = ["execution_accuracy", "valid_efficiency_score"]
1579
  else:
1580
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1581
 
1582
  df = df[df['model'].isin(selected_models)]
1583
- df = normalize_valid_efficiency_score(df)
1584
  df = calculate_average_metrics(df, selected_metrics)
1585
 
1586
  if isinstance(selected_category, str):
@@ -1743,6 +1776,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1743
 
1744
  # RANKING FOR THE 3 WORST RESULTS WITH UPDATE FUNCTION
1745
  def worst_cases_text(df, selected_models, selected_metrics, selected_categories):
 
1746
  if selected_models == "All":
1747
  selected_models = models
1748
  else:
@@ -1757,15 +1791,25 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1757
  df = df[df['test_category'].isin(selected_categories)]
1758
 
1759
  if "external" in selected_metrics:
1760
- selected_metrics = ["execution_accuracy", "valid_efficiency_score"]
1761
  else:
1762
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1763
 
1764
- df = normalize_valid_efficiency_score(df)
1765
  df = calculate_average_metrics(df, selected_metrics)
1766
-
1767
- worst_cases_df = df.groupby(['model', 'tbl_name', 'test_category', 'question', 'query', 'predicted_sql', 'answer', 'sql_tag'])['avg_metric'].mean().reset_index()
1768
 
 
 
1769
  worst_cases_df = worst_cases_df.sort_values(by="avg_metric", ascending=True).reset_index(drop=True)
1770
 
1771
  worst_cases_top_3 = worst_cases_df.head(3)
@@ -1778,14 +1822,24 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1778
  medals = ["🥇", "🥈", "🥉"]
1779
 
1780
  for i, row in worst_cases_top_3.iterrows():
1781
- entry = (
1782
- f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
1783
- f"<span style='font-size:16px;'>- <b>Question:</b> {row['question']}</span> \n"
1784
- f"<span style='font-size:16px;'>- <b>Original Query:</b> `{row['query']}`</span> \n"
1785
- f"<span style='font-size:16px;'>- <b>Predicted SQL:</b> `{row['predicted_sql']}`</span> \n\n"
1786
- )
 
 
1787
 
1788
- worst_str.append(entry)
1789
 
1790
  raw_answer = (
1791
  f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
@@ -1793,7 +1847,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1793
  )
1794
 
1795
  answer_str.append(raw_answer)
1796
-
1797
  return worst_str[0], worst_str[1], worst_str[2], answer_str[0], answer_str[1], answer_str[2]
1798
 
1799
  def update_worst_cases_text(selected_models, selected_metrics, selected_categories):
@@ -1803,7 +1857,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1803
  # LINE CHART FOR CUMULATIVE TIME WITH UPDATE FUNCTION
1804
  def plot_cumulative_flow(df, selected_models, max_points):
1805
  df = df[df['model'].isin(selected_models)]
1806
- df = normalize_valid_efficiency_score(df)
1807
 
1808
  fig = go.Figure()
1809
 
@@ -1937,10 +1991,10 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1937
 
1938
  external_metrics_dict = {
1939
  "Execution Accuracy": "execution_accuracy",
1940
- "Valid Efficiency Score": "valid_efficiency_score"
1941
  }
1942
 
1943
- external_metric = ["execution_accuracy", "valid_efficiency_score"]
1944
  last_valid_external_metric_selection = external_metric.copy()
1945
  def enforce_external_metric_selection(selected):
1946
  global last_valid_external_metric_selection
@@ -1987,10 +2041,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
1987
 
1988
  all_model_as_dic = {cat: [f"{cat}"] for cat in models}
1989
  all_model_as_dic["All"] = models
1990
-
1991
- #with gr.Blocks(theme=gr.themes.Default(primary_hue='blue')) as demo:
1992
-
1993
-
1994
 
1995
  ###########################
1996
  # VISUALIZATION SECTION #
@@ -2029,7 +2079,7 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
2029
  <span
2030
  title="External metric info:
2031
  Execution Accuracy: Checks if the predicted query returns exactly the same result as the ground truth query when executed. It is a binary metric: 1 if the output matches, 0 otherwise.
2032
- Valid Efficiency Score: Evaluates the efficiency of a query by combining execution time and correctness. It rewards queries that are both accurate and fast."
2033
  style="margin-left: 6px; cursor: help; color: #00bfff; font-size: 16px; white-space: pre-line;"
2034
  >External metric info ℹ️</span>
2035
  </div>
@@ -2304,6 +2354,6 @@ with gr.Blocks(theme='shivi/calm_seafoam', css_paths='style.css', js=js_func) as
2304
  reset_data.click(open_accordion, inputs=gr.State("reset"), outputs=[upload_acc, select_table_acc, select_model_acc, qatch_acc, metrics_acc, default_checkbox, file_input])
2305
  reset_data.click(fn=lambda: gr.update(visible=False), outputs=[download_metrics])
2306
  reset_data.click(fn=enable_disable, inputs=[gr.State(True)], outputs=[*model_checkboxes, submit_models_button, preview_output, submit_button, file_input, default_checkbox, table_selector, *table_outputs, open_model_selection])
2307
-
2308
 
2309
  interface.launch(share = True)
 
12
  from qatch.connectors.sqlite_connector import SqliteConnector
13
  from qatch.generate_dataset.orchestrator_generator import OrchestratorGenerator
14
  from qatch.evaluate_dataset.orchestrator_evaluator import OrchestratorEvaluator
15
+ import qatch.evaluate_dataset.orchestrator_evaluator as eva
16
  from prediction import ModelPrediction
17
  import utils_get_db_tables_info
18
  import utilities as us
 
32
  #pnp_path = os.path.join("data", "evaluation_p_np_metrics.csv")
33
  pnp_path = "concatenated_output.csv"
34
  PATH_PKL_TABLES = 'tables_dict_beaver.pkl'
 
35
  js_func = """
36
  function refresh() {
37
  const url = new URL(window.location);
 
42
  }
43
  }
44
  """
45
+ reset_flag = False
46
+ flag_TQA = False
47
 
48
  with open('style.css', 'r') as file:
49
  css = file.read()
 
66
  ### ➀ **Non-Proprietary**
67
  ### &ensp;&ensp;&ensp; ⇒ Spider 1.0 🕷️"""
68
  prompt_default = "Translate the following question in SQL code to be executed over the database to fetch the answer.\nReturn the sql code in ```sql ```\nQuestion\n{question}\nDatabase Schema\n{db_schema}\n"
69
+ prompt_default_tqa = "Return the answer to the following question based on the provided database. Return your answer as the result of a query executed over the database, namely as a list of lists where the outer list represents the tuples and each inner list the values in that tuple.\n Return the answer in an answer tag as <answer> </answer>.\n Question \n {question}\n Database Schema\n {db_schema}\n"
70
+
71
 
72
  input_data = {
73
  'input_method': "",
 
96
  #change path
97
  input_data["data_path"] = os.path.join(".", f"{input_data['db_name']}.sqlite")
98
  input_data["data"] = us.load_data(file, input_data["db_name"])
99
+
100
  df_current = input_data["data"]['data_frames'].get('MyTable', df_default) # Load the DataFrame
101
  if(input_data["data"]['data_frames'] and input_data["data"]["db"] is None): #for csv and xlsx files
102
  table2primary_key = {}
 
321
 
322
  # Model selection button (initially disabled)
323
  open_model_selection = gr.Button("Choose your models", interactive=False)
 
324
  def update_table_list(data):
325
  """Dynamically updates the list of available tables and excluded ones."""
326
  if isinstance(data, dict) and data:
 
461
  default_checkbox
462
  ]
463
  )
 
464
  reset_data.click(open_accordion, inputs=gr.State("reset"), outputs=[upload_acc, select_table_acc, select_model_acc, qatch_acc, metrics_acc, default_checkbox, file_input])
 
465
 
466
  ####################################
467
  # MODEL SELECTION PART #
 
507
  # Function to get selected models
508
  def get_selected_models(*model_selections):
509
  selected_models = [model for model, selected in zip(model_list, model_selections) if selected]
 
510
  input_data['models'] = selected_models
511
  button_state = bool(selected_models and '{db_schema}' in input_data["prompt"] and '{question}' in input_data["prompt"])
512
+ return selected_models, gr.update(open=True, visible=True), gr.update(interactive=button_state), gr.update(interactive=button_state)
513
 
514
  # Add the Textbox to the interface
515
  prompt = gr.TextArea(
 
517
  placeholder=prompt_default,
518
  elem_id="custom-textarea"
519
  )
520
+
521
  warning_prompt = gr.Markdown(value="## Error in the prompt format", visible=False)
522
 
523
  # Submit button (initially disabled)
524
+ with gr.Row():
525
+ submit_models_button = gr.Button("Submit Models for NL2SQL task", interactive=False)
526
+ submit_models_button_tqa = gr.Button("Submit Models for TQA task", interactive=False)
527
 
528
  def check_prompt(prompt):
529
  #TODO
530
  missing_elements = []
531
  if(prompt==""):
532
+ input_data["prompt"] = prompt_default
533
  button_state = bool(len(input_data['models']) > 0 and '{db_schema}' in input_data["prompt"] and '{question}' in input_data["prompt"])
534
  else:
535
  input_data["prompt"]=prompt
 
546
  ), gr.update(interactive=button_state)
547
  return gr.update(visible=False), gr.update(interactive=button_state)
548
 
549
+ prompt.change(fn=check_prompt, inputs=[prompt], outputs=[warning_prompt, submit_models_button, submit_models_button_tqa])
550
  # Link checkboxes to selection events
551
  for checkbox in model_checkboxes:
552
  checkbox.change(
553
  fn=get_selected_models,
554
  inputs=model_checkboxes,
555
+ outputs=[selected_models_output, select_model_acc, submit_models_button, submit_models_button_tqa]
556
  )
557
  prompt.change(
558
  fn=get_selected_models,
559
  inputs=model_checkboxes,
560
+ outputs=[selected_models_output, select_model_acc, submit_models_button, submit_models_button_tqa]
561
  )
562
 
563
  submit_models_button.click(
 
566
  outputs=[selected_models_output, select_model_acc, qatch_acc]
567
  )
568
 
569
+ submit_models_button_tqa.click(
570
+ fn=lambda *args: (get_selected_models(*args), gr.update(open=False, visible=True), gr.update(open=True, visible=True)),
571
+ inputs=model_checkboxes,
572
+ outputs=[selected_models_output, select_model_acc, qatch_acc]
573
+ )
574
+ def change_flag():
575
+ global flag_TQA
576
+ flag_TQA = True
577
+
578
+ submit_models_button_tqa.click(fn = change_flag, inputs=[], outputs=[])
579
+
580
  def enable_disable(enable):
581
  return (
582
  *[gr.update(interactive=enable) for _ in model_checkboxes],
 
587
  gr.update(interactive=enable),
588
  gr.update(interactive=enable),
589
  *[gr.update(interactive=enable) for _ in table_outputs],
590
+ gr.update(interactive=enable),
591
  gr.update(interactive=enable)
592
  )
593
 
 
605
  default_checkbox,
606
  table_selector,
607
  *table_outputs,
608
+ open_model_selection,
609
+ submit_models_button_tqa
610
+ ]
611
+ )
612
+ submit_models_button_tqa.click(
613
+ fn=enable_disable,
614
+ inputs=[gr.State(False)],
615
+ outputs=[
616
+ *model_checkboxes,
617
+ submit_models_button,
618
+ preview_output,
619
+ submit_button,
620
+ file_input,
621
+ default_checkbox,
622
+ table_selector,
623
+ *table_outputs,
624
+ open_model_selection,
625
+ submit_models_button_tqa
626
  ]
627
  )
628
 
 
640
  default_checkbox,
641
  table_selector,
642
  *table_outputs,
643
+ open_model_selection,
644
+ submit_models_button_tqa
645
  ]
646
  )
647
 
 
692
  {mirrored_symbols}
693
  </div>
694
  """
695
+
696
+ def qatch_flow_nl_sql():
697
  global reset_flag
698
+ global flag_TQA
699
  predictions_dict = {model: pd.DataFrame(columns=['id', 'question', 'predicted_sql', 'time', 'query', 'db_path']) for model in model_list}
700
  metrics_conc = pd.DataFrame()
701
  columns_to_visulize = ["db_path", "tbl_name", "test_category", "sql_tag", "query", "question", "predicted_sql", "time", "price", "answer"]
 
725
  </div>
726
  """
727
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
728
+
729
  prediction = row['predicted_sql']
730
 
731
  display_prediction = f"""<div class='loading' style='font-size: 1.7rem; font-family: 'Inter', sans-serif;'>Predicted SQL:</div>
 
733
  <div style='font-size: 3rem'>➡️</div>
734
  <div class='sqlquery' font-family: 'Inter', sans-serif;>{prediction}</div>
735
  </div>
736
+ """
737
+
738
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
739
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
740
  metrics_conc = target_df
741
+ if 'valid_efficency_score' not in metrics_conc.columns:
742
+ metrics_conc['valid_efficency_score'] = metrics_conc['VES']
743
  eval_text = generate_eval_text("End evaluation")
744
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model][columns_to_visulize] for model in model_list]
745
+
746
  else:
 
747
  orchestrator_generator = OrchestratorGenerator()
748
+ target_df = orchestrator_generator.generate_dataset(connector=input_data['data']['db'], tables_to_includes=input_data['data']['selected_tables'])
749
+
750
+ #create target_df[target_answer]
751
+ if flag_TQA :
752
+ if (input_data["prompt"] == prompt_default):
753
+ input_data["prompt"] = prompt_default_tqa
754
+ target_df = us.extract_answer(target_df)
755
 
756
  predictor = ModelPrediction()
757
  reset_flag = False
 
772
  </div>
773
  """
774
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(display_question), gr.Markdown(), metrics_conc, *[predictions_dict[model]for model in model_list]
775
+ #samples = us.generate_some_samples(input_data["data_path"], row["tbl_name"])
776
+ model_to_send = None if not flag_TQA else model
777
+
778
+
779
+ db_schema_text = utils_get_db_tables_info.utils_extract_db_schema_as_string(
780
+ db_id = input_data["db_name"],
781
+ base_path = input_data["data_path"],
782
+ normalize=False,
783
+ sql=row["query"],
784
+ get_insert_into=True,
785
+ model = model_to_send,
786
+ prompt = input_data["prompt"].format(question=question, db_schema=""),
787
  )
788
 
789
  #prompt_to_send = us.prepare_prompt(input_data["prompt"], question, schema_text, samples)
 
791
  #PREDICTION SQL
792
 
793
  # TODO add button for QA or SP and pass to .make_prediction parameter TASK
794
+ if flag_TQA: task="QA"
795
+ else: task="SP"
796
+ start_time = time.time()
797
  response = predictor.make_prediction(
798
  question=question,
799
+ db_schema=db_schema_text,
800
  model_name=model,
801
  prompt=f"{prompt_to_send}",
802
+ task=task
803
  )
804
  prediction = response['response_parsed']
805
  price = response['cost']
806
  answer = response['response']
807
 
808
  end_time = time.time()
809
+ if flag_TQA:
810
+ task_string = "Answer"
811
+ else:
812
+ task_string = "SQL"
813
+
814
+ display_prediction = f"""<div class='loading' style='font-size: 1.7rem; font-family: 'Inter', sans-serif;'>Predicted {task_string}:</div>
815
  <div style='display: flex; align-items: center;'>
816
  <div style='font-size: 3rem'>➡️</div>
817
  <div class='sqlquery' font-family: 'Inter', sans-serif;>{prediction}</div>
 
826
  'query': row["query"],
827
  'db_path': input_data["data_path"],
828
  'price':price,
829
+ 'answer': answer,
830
  'number_question':count,
831
+ 'target_answer' : row["target_answer"] if flag_TQA else None,
832
+
833
  }]).dropna(how="all") # Remove only completely empty rows
834
  count=count+1
835
  # TODO: use a for loop
836
+ if (flag_TQA) :
837
+ new_row['predicted_answer'] = prediction
838
  for col in target_df.columns:
839
  if col not in new_row.columns:
840
  new_row[col] = row[col]
 
841
  # Update model's prediction dataframe incrementally
842
  if not new_row.empty:
843
  predictions_dict[model] = pd.concat([predictions_dict[model], new_row], ignore_index=True)
844
 
845
  # yield gr.Textbox(), gr.Textbox(prediction), *[predictions_dict[model] for model in input_data["models"]], None
846
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model]for model in model_list]
 
847
  yield gr.Markdown(), gr.Image(), gr.Markdown(load_text), gr.Markdown(), gr.Markdown(display_prediction), metrics_conc, *[predictions_dict[model] for model in model_list]
848
  # END
849
  eval_text = generate_eval_text("Evaluation")
850
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
851
+
852
  evaluator = OrchestratorEvaluator()
853
+
854
  for model in input_data["models"]:
855
+ if not flag_TQA:
856
+ metrics_df_model = evaluator.evaluate_df(
857
+ df=predictions_dict[model],
858
+ target_col_name="query",
859
+ prediction_col_name="predicted_sql",
860
+ db_path_name="db_path"
861
+ )
862
+ else:
863
+ metrics_df_model = us.evaluate_answer(predictions_dict[model])
864
  metrics_df_model['model'] = model
865
  metrics_conc = pd.concat([metrics_conc, metrics_df_model], ignore_index=True)
866
+
867
+ if 'valid_efficency_score' not in metrics_conc.columns and 'VES' in metrics_conc.columns:
868
+ metrics_conc['valid_efficency_score'] = metrics_conc['VES']
869
+
870
  eval_text = generate_eval_text("End evaluation")
871
  yield gr.Markdown(eval_text, visible=True), gr.Image(), gr.Markdown(), gr.Markdown(), gr.Markdown(), metrics_conc, *[predictions_dict[model] for model in model_list]
872
 
 
902
  gr.Markdown(f"**Results for {model}**")
903
  tab_dict[model] = tab
904
  dataframe_per_model[model] = gr.DataFrame()
905
+ #TODO download metrics per model
906
  # download_pred_model = gr.DownloadButton(label="Download Prediction per Model", visible=False)
907
 
908
  evaluation_loading = gr.Markdown()
 
915
  inputs=[],
916
  outputs=[tab_dict[model] for model in model_list] # Update TabItem visibility
917
  )
918
+ submit_models_button_tqa.click(
919
+ change_tab,
920
+ inputs=[],
921
+ outputs=[tab_dict[model] for model in model_list] # Update TabItem visibility
922
+ )
923
 
924
  selected_models_display = gr.JSON(label="Final input data", visible=False)
925
  metrics_df = gr.DataFrame(visible=False)
926
  metrics_df_out = gr.DataFrame(visible=False)
927
 
928
  submit_models_button.click(
929
+ fn=qatch_flow_nl_sql,
930
+ inputs=[],
931
+ outputs=[evaluation_loading, model_logo, variable, question_display, prediction_display, metrics_df] + list(dataframe_per_model.values())
932
+ )
933
+
934
+ submit_models_button_tqa.click(
935
+ fn=qatch_flow_nl_sql,
936
  inputs=[],
937
  outputs=[evaluation_loading, model_logo, variable, question_display, prediction_display, metrics_df] + list(dataframe_per_model.values())
938
  )
 
941
  fn=lambda: gr.update(value=input_data),
942
  outputs=[selected_models_display]
943
  )
944
+ submit_models_button_tqa.click(
945
+ fn=lambda: gr.update(value=input_data),
946
+ outputs=[selected_models_display]
947
+ )
948
 
949
  # Works for METRICS
950
  metrics_df.change(fn=change_text, inputs=[metrics_df], outputs=[metrics_df_out])
 
967
  fn=lambda: gr.update(visible=False),
968
  outputs=[download_metrics]
969
  )
970
+ submit_models_button_tqa.click(
971
+ fn=lambda: gr.update(visible=False),
972
+ outputs=[download_metrics]
973
+ )
974
 
975
  def refresh():
976
  global reset_flag
977
+ global flag_TQA
978
  reset_flag = True
979
+ flag_TQA = False
980
 
981
  reset_data = gr.Button("Back to upload data section", interactive=True)
982
 
 
1002
  default_checkbox,
1003
  table_selector,
1004
  *table_outputs,
1005
+ open_model_selection,
1006
+ submit_models_button_tqa
1007
  ]
1008
  )
1009
+
1010
+
1011
  ##########################################
1012
  # METRICS VISUALIZATION SECTION #
1013
  ##########################################
 
1022
  ####################################
1023
 
1024
  def load_data_csv_es():
1025
+
1026
  if input_data["input_method"]=="default":
1027
+ global flag_TQA
1028
  df = pd.read_csv(pnp_path)
1029
  df = df[df['model'].isin(input_data["models"])]
1030
  df = df[df['tbl_name'].isin(input_data["data"]["selected_tables"])]
 
1035
  df['model'] = df['model'].replace('llama-70', 'Llama-70B')
1036
  df['model'] = df['model'].replace('llama-8', 'Llama-8B')
1037
  df['test_category'] = df['test_category'].replace('many-to-many-generator', 'MANY-TO-MANY')
1038
+ if (flag_TQA) : flag_TQA = False #TODO delete after make pred
1039
  return df
1040
  return metrics_df_out
1041
 
 
1078
 
1079
  DB_CATEGORY_COLORS = generate_db_category_colors()
1080
 
1081
+ def normalize_valid_efficency_score(df):
1082
+ df['valid_efficency_score'] = df['valid_efficency_score'].replace([np.nan, ''], 0)
1083
+ df['valid_efficency_score'] = df['valid_efficency_score'].astype(int)
1084
+ min_val = df['valid_efficency_score'].min()
1085
+ max_val = df['valid_efficency_score'].max()
 
 
1086
 
1087
+ if min_val == max_val :
1088
+ # All values are equal: to avoid division by zero, set the score to 1.0 (or 0 if the minimum is missing)
1089
+ if min_val is None:
1090
+ df['valid_efficency_score'] = 0
1091
+ else:
1092
+ df['valid_efficency_score'] = 1.0
1093
  else:
1094
+ df['valid_efficency_score'] = (
1095
+ df['valid_efficency_score'] - min_val
1096
  ) / (max_val - min_val)
1097
 
1098
  return df
 
1105
  # BAR CHART FOR AVERAGE METRICS WITH UPDATE FUNCTION
1106
  def plot_metric(df, radio_metric, qatch_selected_metrics, external_selected_metric, group_by, selected_models):
1107
  df = df[df['model'].isin(selected_models)]
1108
+ df = normalize_valid_efficency_score(df)
1109
 
1110
  # Map human-readable names -> internal names
1111
  qatch_selected_internal = [qatch_metrics_dict[label] for label in qatch_selected_metrics]
 
1222
  selected_models = [selected_models]
1223
 
1224
  df = df[df['model'].isin(selected_models)]
1225
+ df = normalize_valid_efficency_score(df)
1226
 
1227
  # Converti nomi leggibili -> tecnici
1228
  qatch_selected_internal = [qatch_metrics_dict[label] for label in qatch_selected_metrics]
 
1307
  )
1308
 
1309
  return gr.Plot(fig, visible=True)
 
 
1310
 
1311
  def update_plot_propietary(radio_metric, qatch_selected_metrics, external_selected_metric, selected_models):
1312
  df = load_data_csv_es()
 
1322
  df = df[df['db_category'].isin(target_cats)]
1323
  df = df[df['model'].isin(selected_models)]
1324
 
1325
+ df = normalize_valid_efficency_score(df)
1326
  df = calculate_average_metrics(df, qatch_metrics)
1327
 
1328
  # Compute the average per db_category and model
 
1443
 
1444
  # RADAR OR BAR CHART BASED ON CATEGORY COUNT
1445
  def plot_radar(df, selected_models, selected_metrics, selected_categories):
1446
+ if "External" in selected_metrics:
1447
+ selected_metrics = ["execution_accuracy", "valid_efficency_score"]
1448
  else:
1449
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1450
 
1451
  # Filter models and normalize
1452
  df = df[df['model'].isin(selected_models)]
1453
+ df = normalize_valid_efficency_score(df)
1454
  df = calculate_average_metrics(df, selected_metrics)
1455
 
1456
  avg_metrics = df.groupby(['model', 'test_category'])['avg_metric'].mean().reset_index()
 
1607
 
1608
  # RADAR OR BAR CHART FOR SUB-CATEGORIES BASED ON CATEGORY COUNT
1609
  def plot_radar_sub(df, selected_models, selected_metrics, selected_category):
1610
+ if "External" in selected_metrics:
1611
+ selected_metrics = ["execution_accuracy", "valid_efficency_score"]
1612
  else:
1613
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1614
 
1615
  df = df[df['model'].isin(selected_models)]
1616
+ df = normalize_valid_efficency_score(df)
1617
  df = calculate_average_metrics(df, selected_metrics)
1618
 
1619
  if isinstance(selected_category, str):
 
1776
 
1777
  # RANKING FOR THE 3 WORST RESULTS WITH UPDATE FUNCTION
1778
  def worst_cases_text(df, selected_models, selected_metrics, selected_categories):
1779
+ global flag_TQA
1780
  if selected_models == "All":
1781
  selected_models = models
1782
  else:
 
1791
  df = df[df['test_category'].isin(selected_categories)]
1792
 
1793
  if "external" in selected_metrics:
1794
+ selected_metrics = ["execution_accuracy", "valid_efficency_score"]
1795
  else:
1796
  selected_metrics = ["cell_precision", "cell_recall", "tuple_order", "tuple_cardinality", "tuple_constraint"]
1797
 
1798
+ df = normalize_valid_efficency_score(df)
1799
  df = calculate_average_metrics(df, selected_metrics)
 
 
1800
 
1801
+ if flag_TQA:
1802
+ df["target_answer"] = df["target_answer"].apply(
1803
+ lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
1804
+ )
1805
+ df["predicted_answer"] = df["predicted_answer"].apply(
1806
+ lambda x: " - ".join([",".join(map(str, item)) for item in x]) if isinstance(x, list) else str(x)
1807
+ )
1808
+
1809
+ worst_cases_df = df.groupby(['model', 'tbl_name', 'test_category', 'question', 'target_answer', 'predicted_answer', 'answer', 'sql_tag'])['avg_metric'].mean().reset_index()
1810
+ else:
1811
+ worst_cases_df = df.groupby(['model', 'tbl_name', 'test_category', 'question', 'query', 'predicted_sql', 'answer', 'sql_tag'])['avg_metric'].mean().reset_index()
1812
+
1813
  worst_cases_df = worst_cases_df.sort_values(by="avg_metric", ascending=True).reset_index(drop=True)
1814
 
1815
  worst_cases_top_3 = worst_cases_df.head(3)
 
1822
  medals = ["🥇", "🥈", "🥉"]
1823
 
1824
  for i, row in worst_cases_top_3.iterrows():
1825
+ if flag_TQA:
1826
+ entry = (
1827
+ f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
1828
+ f"<span style='font-size:16px;'>- <b>Question:</b> {row['question']}</span> \n"
1829
+ f"<span style='font-size:16px;'>- <b>Original Answer:</b> `{row['target_answer']}`</span> \n"
1830
+ f"<span style='font-size:16px;'>- <b>Predicted Answer:</b> `{row['predicted_answer']}`</span> \n\n"
1831
+ )
1832
+
1833
+ worst_str.append(entry)
1834
+ else:
1835
+ entry = (
1836
+ f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
1837
+ f"<span style='font-size:16px;'>- <b>Question:</b> {row['question']}</span> \n"
1838
+ f"<span style='font-size:16px;'>- <b>Original Query:</b> `{row['query']}`</span> \n"
1839
+ f"<span style='font-size:16px;'>- <b>Predicted SQL:</b> `{row['predicted_sql']}`</span> \n\n"
1840
+ )
1841
 
1842
+ worst_str.append(entry)
1843
 
1844
  raw_answer = (
1845
  f"<span style='font-size:18px;'><b>{medals[i]} {row['model']} - {row['tbl_name']} - {row['test_category']} - {row['sql_tag']}</b> ({row['avg_metric']})</span> \n"
 
1847
  )
1848
 
1849
  answer_str.append(raw_answer)
1850
+
1851
  return worst_str[0], worst_str[1], worst_str[2], answer_str[0], answer_str[1], answer_str[2]
1852
 
1853
  def update_worst_cases_text(selected_models, selected_metrics, selected_categories):
 
1857
  # LINE CHART FOR CUMULATIVE TIME WITH UPDATE FUNCTION
1858
  def plot_cumulative_flow(df, selected_models, max_points):
1859
  df = df[df['model'].isin(selected_models)]
1860
+ df = normalize_valid_efficency_score(df)
1861
 
1862
  fig = go.Figure()
1863
 
 
1991
 
1992
  external_metrics_dict = {
1993
  "Execution Accuracy": "execution_accuracy",
1994
+ "Valid Efficency Score": "valid_efficency_score"
1995
  }
1996
 
1997
+ external_metric = ["execution_accuracy", "valid_efficency_score"]
1998
  last_valid_external_metric_selection = external_metric.copy()
1999
  def enforce_external_metric_selection(selected):
2000
  global last_valid_external_metric_selection
 
2041
 
2042
  all_model_as_dic = {cat: [f"{cat}"] for cat in models}
2043
  all_model_as_dic["All"] = models
 
 
2044
 
2045
  ###########################
2046
  # VISUALIZATION SECTION #
 
2079
  <span
2080
  title="External metric info:
2081
  Execution Accuracy: Checks if the predicted query returns exactly the same result as the ground truth query when executed. It is a binary metric: 1 if the output matches, 0 otherwise.
2082
+ Valid Efficency Score: Evaluates the efficency of a query by combining execution time and correctness. It rewards queries that are both accurate and fast."
2083
  style="margin-left: 6px; cursor: help; color: #00bfff; font-size: 16px; white-space: pre-line;"
2084
  >External metric info ℹ️</span>
2085
  </div>
 
2354
  reset_data.click(open_accordion, inputs=gr.State("reset"), outputs=[upload_acc, select_table_acc, select_model_acc, qatch_acc, metrics_acc, default_checkbox, file_input])
2355
  reset_data.click(fn=lambda: gr.update(visible=False), outputs=[download_metrics])
2356
  reset_data.click(fn=enable_disable, inputs=[gr.State(True)], outputs=[*model_checkboxes, submit_models_button, preview_output, submit_button, file_input, default_checkbox, table_selector, *table_outputs, open_model_selection])
2357
+
2358
 
2359
  interface.launch(share = True)
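The behavioural core of the app.py changes is the flag_TQA switch: the new TQA button reuses the same generation flow but swaps the prompt template and the task string passed to ModelPrediction.make_prediction ("QA" instead of "SP"). A minimal sketch of that branching, under the assumption that the two templates are passed in explicitly (resolve_task_and_prompt is an illustrative helper, not part of the repository):

```python
def resolve_task_and_prompt(flag_tqa: bool, current_prompt: str,
                            default_sql_prompt: str, default_tqa_prompt: str) -> tuple[str, str]:
    """Sketch of the branching in qatch_flow_nl_sql: pick the task and the prompt template."""
    if flag_tqa:
        # Table Question Answering: the model must return the answer tuples directly.
        task = "QA"
        # If the user kept the default NL2SQL prompt, swap in the TQA template.
        prompt = default_tqa_prompt if current_prompt == default_sql_prompt else current_prompt
    else:
        # NL2SQL (semantic parsing): the model must return a SQL query.
        task = "SP"
        prompt = current_prompt
    return task, prompt

# Example: the TQA button sets flag_TQA, so the QA task and the TQA template are selected.
# task, prompt = resolve_task_and_prompt(True, prompt_default, prompt_default, prompt_default_tqa)
```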
concatenated_output.csv CHANGED
@@ -1,4 +1,4 @@
1
- cell_precision,sql_tag,tuple_cardinality,answer,predicted_sql,db_category,tuple_constraint,VES,number_question,valid_efficiency_score,tbl_name,tuple_order,time,price,question,model,cell_recall,db_path,execution_accuracy,test_category,query
2
  1.0,DISTINCT-SINGLE,1.0,"```sql
3
  SELECT DISTINCT WAREHOUSE_LOAD_DATE
4
  FROM FAC_BUILDING_ADDRESS;
 
1
+ cell_precision,sql_tag,tuple_cardinality,answer,predicted_sql,db_category,tuple_constraint,VES,number_question,valid_efficency_score,tbl_name,tuple_order,time,price,question,model,cell_recall,db_path,execution_accuracy,test_category,query
2
  1.0,DISTINCT-SINGLE,1.0,"```sql
3
  SELECT DISTINCT WAREHOUSE_LOAD_DATE
4
  FROM FAC_BUILDING_ADDRESS;
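The only change to concatenated_output.csv is the header rename from valid_efficiency_score to valid_efficency_score, matching the identifier now used in app.py and utilities.py. A hedged compatibility sketch for loading older exports (load_pnp_metrics is an illustrative helper; the VES fallback mirrors the one applied to metrics_conc in app.py):

```python
import pandas as pd

def load_pnp_metrics(path: str = "concatenated_output.csv") -> pd.DataFrame:
    """Load the precomputed metrics CSV, tolerating both spellings of the VES column."""
    df = pd.read_csv(path)
    if "valid_efficency_score" not in df.columns:
        if "valid_efficiency_score" in df.columns:
            # Older export with the previous column name.
            df = df.rename(columns={"valid_efficiency_score": "valid_efficency_score"})
        elif "VES" in df.columns:
            # Same fallback app.py applies to metrics_conc.
            df["valid_efficency_score"] = df["VES"]
    return df
```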
utilities.py CHANGED
@@ -6,6 +6,11 @@ import sqlite3
6
  import gradio as gr
7
  import os
8
  from qatch.connectors.sqlite_connector import SqliteConnector
 
 
9
  def extract_tables(file_path):
10
  conn = sqlite3.connect(file_path)
11
  cursor = conn.cursor()
@@ -26,7 +31,7 @@ def extract_dataframes(file_path):
26
  return dfs
27
 
28
  def carica_sqlite(file_path, db_id):
29
- data_output = {'data_frames': extract_dataframes(file_path),'db':SqliteConnector(relative_db_path=file_path, db_name=db_id)}
30
  return data_output
31
 
32
  # Function to read a CSV file
@@ -113,3 +118,94 @@ def generate_some_samples(file_path, tbl_name):
113
  def load_tables_dict_from_pkl(file_path):
114
  with open(file_path, 'rb') as f:
115
  return pickle.load(f)
 
 
 
6
  import gradio as gr
7
  import os
8
  from qatch.connectors.sqlite_connector import SqliteConnector
9
+ from qatch.evaluate_dataset.metrics_evaluators import CellPrecision, CellRecall, ExecutionAccuracy, TupleCardinality, TupleConstraint, TupleOrder, ValidEfficiencyScore
10
+ import qatch.evaluate_dataset.orchestrator_evaluator as eva
11
+ #import tiktoken
12
+ from transformers import AutoTokenizer
13
+
14
  def extract_tables(file_path):
15
  conn = sqlite3.connect(file_path)
16
  cursor = conn.cursor()
 
31
  return dfs
32
 
33
  def carica_sqlite(file_path, db_id):
34
+ data_output = {'data_frames': extract_dataframes(file_path),'db': SqliteConnector(relative_db_path=file_path, db_name=db_id)}
35
  return data_output
36
 
37
  # Function to read a CSV file
 
118
  def load_tables_dict_from_pkl(file_path):
119
  with open(file_path, 'rb') as f:
120
  return pickle.load(f)
121
+
122
+ def extract_tables_dict(pnp_path):
123
+ return load_tables_dict_from_pkl('tables_dict_beaver.pkl')
124
+ tables_dict = {}
125
+ with open(pnp_path, mode='r', encoding='utf-8') as file:
126
+ reader = csv.DictReader(file)
127
+ tbl_db_pairs = set() # Use a set to avoid duplicates
128
+ for row in reader:
129
+ tbl_name = row.get("tbl_name")
130
+ db_path = row.get("db_path")
131
+ if tbl_name and db_path:
132
+ tbl_db_pairs.add((tbl_name, db_path)) # Add the pair to the set
133
+ for tbl_name, db_path in list(tbl_db_pairs):
134
+ if tbl_name and db_path:
135
+ connector = sqlite3.connect(db_path)
136
+ query = f"SELECT * FROM {tbl_name} LIMIT 5"
137
+ try:
138
+ df = pd.read_sql_query(query, connector)
139
+ tables_dict[tbl_name] = df
140
+ except Exception as e:
141
+ tables_dict[tbl_name] = pd.DataFrame({"Error": [str(e)]}) # DataFrame with the error message
142
+ #with open('tables_dict_beaver.pkl', 'wb') as f:
143
+ # pickle.dump(tables_dict, f)
144
+ return tables_dict
145
+
146
+
147
+ def extract_answer(df):
148
+ if "query" not in df.columns or "db_path" not in df.columns:
149
+ raise ValueError("The DataFrame must contain 'query' and 'db_path' columns.")
150
+
151
+ answers = []
152
+ for _, row in df.iterrows():
153
+ query = row["query"]
154
+ db_path = row["db_path"]
155
+ try:
156
+ conn = SqliteConnector(relative_db_path = db_path , db_name= "db")
157
+ answer = eva._utils_run_query_if_str(query, conn)
158
+ answers.append(answer)
159
+ except Exception as e:
160
+ answers.append(f"Error: {e}")
161
+
162
+ df["target_answer"] = answers
163
+ return df
164
+
165
+ evaluator = {
166
+ "cell_precision": CellPrecision(),
167
+ "cell_recall": CellRecall(),
168
+ "tuple_cardinality": TupleCardinality(),
169
+ "tuple_order": TupleOrder(),
170
+ "tuple_constraint": TupleConstraint(),
171
+ "execution_accuracy": ExecutionAccuracy(),
172
+ "valid_efficency_score": ValidEfficiencyScore()
173
+ }
174
+
175
+ def evaluate_answer(df):
176
+ for metric_name, metric in evaluator.items():
177
+ results = []
178
+ for _, row in df.iterrows():
179
+ target = row["target_answer"]
180
+ predicted = row["predicted_answer"]
181
+ try:
182
+ result = metric.run_metric(target = target, prediction = predicted)
183
+ except Exception as e:
184
+ result = None
185
+ results.append(result)
186
+ df[metric_name] = results
187
+ return df
188
+
189
+ models = [
190
+ "gpt-4o-mini",
191
+ "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
192
+ ]
193
+
194
+ def crop_entries_per_token(entries_list, model, prompt: str | None = None):
195
+ #open_ai_models = ["gpt-3.5", "gpt-4o-mini"]
196
+ dimension = 2048
197
+ #enties_string = [", ".join(map(str, entry)) for entry in entries_list]
198
+ if prompt:
199
+ entries_string = prompt.join(entries_list)
200
+ else:
201
+ entries_string = " ".join(entries_list)
202
+ #if model in ["deepseek-ai/DeepSeek-R1-Distill-Llama-70B" ,"gpt-4o-mini" ] :
203
+ #tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
204
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B")
205
+
206
+ tokens = tokenizer.encode(entries_string)
207
+ number_of_tokens = len(tokens)
208
+ if number_of_tokens > dimension and len(entries_list) > 4:
209
+ entries_list = entries_list[:round(len(entries_list)/2)]
210
+ entries_list = crop_entries_per_token(entries_list, model)
211
+ return entries_list
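The new helpers are intended to run in sequence on the TQA path: extract_answer executes each gold query to fill a target_answer column, and evaluate_answer scores predicted_answer against it with the QATCH metric objects. A hedged usage sketch (question, query, path and predicted values are illustrative; the columns mirror the per-model frames built in qatch_flow_nl_sql):

```python
import pandas as pd
import utilities as us

# Minimal predictions frame, shaped like the per-model frames assembled in app.py.
preds = pd.DataFrame([{
    "question": "How many distinct warehouse load dates are there?",
    "query": "SELECT COUNT(DISTINCT WAREHOUSE_LOAD_DATE) FROM FAC_BUILDING_ADDRESS",
    "db_path": "./my_db.sqlite",
    "predicted_answer": [[3]],
}])

preds = us.extract_answer(preds)    # adds 'target_answer' by running each gold query
scored = us.evaluate_answer(preds)  # adds one column per metric, up to valid_efficency_score
print(scored[["cell_precision", "cell_recall", "execution_accuracy"]])
```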
utils_get_db_tables_info.py CHANGED
@@ -1,10 +1,10 @@
1
  import os
2
  import sqlite3
3
  import re
4
-
5
 
6
  def utils_extract_db_schema_as_string(
7
- db_id, base_path, normalize=False, sql: str | None = None, get_insert_into: bool = False
8
  ):
9
  """
10
  Extracts the full schema of an SQLite database into a single string.
@@ -19,7 +19,7 @@ def utils_extract_db_schema_as_string(
19
  cursor = connection.cursor()
20
 
21
  # Get the schema entries based on the provided SQL query
22
- schema_entries = _get_schema_entries(cursor, sql, get_insert_into)
23
 
24
  # Combine all schema definitions into a single string
25
  schema_string = _combine_schema_entries(schema_entries, normalize)
@@ -28,7 +28,7 @@ def utils_extract_db_schema_as_string(
28
 
29
 
30
 
31
- def _get_schema_entries(cursor, sql=None, get_insert_into=False):
32
  """
33
  Retrieves schema entries and optionally data entries from the SQLite database.
34
 
@@ -62,11 +62,17 @@ def _get_schema_entries(cursor, sql=None, get_insert_into=False):
62
  column_names = [description[0] for description in cursor.description]
63
 
64
  # Generate INSERT INTO statements for each row
65
- # TODO now hardcoded to first 3
66
- for row in rows[:3]:
 
 
67
  values = ', '.join(f"'{str(value)}'" if isinstance(value, str) else str(value) for value in row)
68
  insert_stmt = f"INSERT INTO {table} ({', '.join(column_names)}) VALUES ({values});"
69
  entries.append(insert_stmt)
 
 
70
 
71
  return entries
72
 
 
1
  import os
2
  import sqlite3
3
  import re
4
+ import utilities as us
5
 
6
  def utils_extract_db_schema_as_string(
7
+ db_id, base_path, model : str | None = None , normalize=False, sql: str | None = None, get_insert_into: bool = False, prompt : str | None = None
8
  ):
9
  """
10
  Extracts the full schema of an SQLite database into a single string.
 
19
  cursor = connection.cursor()
20
 
21
  # Get the schema entries based on the provided SQL query
22
+ schema_entries = _get_schema_entries(cursor, sql, get_insert_into, model, prompt)
23
 
24
  # Combine all schema definitions into a single string
25
  schema_string = _combine_schema_entries(schema_entries, normalize)
 
28
 
29
 
30
 
31
+ def _get_schema_entries(cursor, sql=None, get_insert_into=False, model: str | None = None, prompt : str | None = None):
32
  """
33
  Retrieves schema entries and optionally data entries from the SQLite database.
34
 
 
62
  column_names = [description[0] for description in cursor.description]
63
 
64
  # Generate INSERT INTO statements for each row
65
+ if model is None:
66
+ max_len=3
67
+ else:
68
+ max_len = len(rows)
69
+
70
+ for row in rows[:max_len]:
71
  values = ', '.join(f"'{str(value)}'" if isinstance(value, str) else str(value) for value in row)
72
  insert_stmt = f"INSERT INTO {table} ({', '.join(column_names)}) VALUES ({values});"
73
  entries.append(insert_stmt)
74
+
75
+ if model is not None: entries = us.crop_entries_per_token(entries, model, prompt)
76
 
77
  return entries
78
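The new model and prompt parameters mirror the call site in app.py: with model=None the previous behaviour (at most 3 INSERT INTO rows per table) is kept, while passing a model name includes every row and then trims the entries with us.crop_entries_per_token to the 2048-token budget. A hedged usage sketch with illustrative paths and queries:

```python
import utils_get_db_tables_info

# Previous behaviour: schema plus at most 3 INSERT INTO rows per referenced table.
schema_small = utils_get_db_tables_info.utils_extract_db_schema_as_string(
    db_id="my_db",
    base_path="./my_db.sqlite",
    sql="SELECT * FROM FAC_BUILDING_ADDRESS",
    get_insert_into=True,
)

# New behaviour: include all rows, then let us.crop_entries_per_token trim the
# INSERT INTO entries until they fit the token budget for the given model.
schema_cropped = utils_get_db_tables_info.utils_extract_db_schema_as_string(
    db_id="my_db",
    base_path="./my_db.sqlite",
    sql="SELECT * FROM FAC_BUILDING_ADDRESS",
    get_insert_into=True,
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
    prompt="Translate the question into SQL.",
)
```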