"""Plot summary statistics for the paper/repository results table.

Reads the tab-separated file ``data/results.csv`` and shows, in order:

1. per year and venue, the share of papers that have a URL;
2. per year and venue, the share of those URLs whose ``pred_live`` is "Yes";
3. per venue, the share of live repositories passing each ``pred_*`` check;
4. a scatter of the number of passed checks against ``pred_citations``;
5. a strip plot (with a box overlay) of the number of passed checks per venue;
6. per venue, how often each ``pred_*`` column equals its reference column.

Every figure is displayed with ``fig.show()``; nothing is written to disk.
"""
import re

import pandas as pd
import plotly.express as px

# Define columns for all relevant predictions
pred_columns = [
    'pred_dependencies',
    'pred_training',
    'pred_evaluation',
    'pred_weights',
    'pred_readme',
    'pred_license',
]

# Define the real and predicted column pairs
real_pred_columns = {
    'dependencies': 'pred_dependencies',
    'training': 'pred_training',
    'evaluation': 'pred_evaluation',
    'weights': 'pred_weights',
    'readme': 'pred_readme',
    'license': 'pred_license',
}

df = pd.read_csv('data/results.csv', sep="\t")

# Cleanup: drop rows whose year cannot be parsed as a number, then store the
# year as an integer.
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df = df.dropna(subset=['year'])
df['year'] = df['year'].astype(int)
# df['venue'] = df['venue'].apply(lambda x: str(re.search(r"'(.*?)'", x).group(1)))

# Display order of the venues in every chart.
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]

# ---------------------------------------------------------------------------
# 1. Share of papers that have a URL, per year and venue
# ---------------------------------------------------------------------------
df_grouped = df.groupby(['year', 'venue']).agg(
    total_papers=('title', 'count'),
    papers_with_url=('url', lambda urls: urls.notna().sum()),
    valid_urls=('pred_live', lambda live: (live == "Yes").sum()),
).reset_index()

df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']

fig = px.bar(
    df_grouped,
    x='year',
    y='ratio',
    color='venue',
    barmode='group',
    # This chart previously reused the title of the "valid_url" chart below,
    # although it plots the share of papers that have a URL at all.
    title='Ratio of Papers with URL per Venue and Year',
    labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},
    category_orders={'venue': custom_order},
)
fig.update_yaxes(range=[0, 1])
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()

# ---------------------------------------------------------------------------
# 2. Share of URLs that are live, per year and venue
# ---------------------------------------------------------------------------
# Groups without any URL produce 0/0 = NaN here and therefore draw no bar.
df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']

fig = px.bar(
    df_grouped,
    x='year',
    y='valid_ratio',
    color='venue',
    barmode='group',
    title='Success Rate per Venue and Year for "valid_url"',
    # The label key must name the plotted column ('valid_ratio'); the former
    # 'error_rate' key matched no column, so the label was never applied.
    labels={'valid_ratio': 'Success Rate', 'year': 'Year'},
    category_orders={'venue': custom_order},
)
fig.update_yaxes(range=[0, 1])
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()

# ---------------------------------------------------------------------------
# 3. Share of live repositories passing each check, per venue
# ---------------------------------------------------------------------------
df_new = df.copy()
is_live = df_new['pred_live'] == "Yes"
for col in pred_columns:
    # A check only counts for rows whose URL is live. Masking here, before the
    # groupby, avoids indexing each group with a mask built on the full frame.
    df_new[col] = (df_new[col] == "Yes") & is_live

df_grouped = df_new.groupby('venue').agg(
    valid_urls=('pred_live', lambda live: (live == "Yes").sum()),
    **{col: (col, 'sum') for col in pred_columns},
).reset_index()

# Calculate the ratio for each prediction column
for col in pred_columns:
    df_grouped[col] = df_grouped[col] / df_grouped['valid_urls']

# Melt the dataframe for easier plotting
df_melted = df_grouped.melt(
    id_vars=['venue'],
    value_vars=pred_columns,
    var_name='Prediction Type',
    value_name='Ratio',
)

fig = px.bar(
    df_melted,
    x='venue',
    y='Ratio',
    color='Prediction Type',
    barmode='group',  # Ensures bars are side by side
    category_orders={'venue': custom_order},
    title='Ratio of Predictions by Venue',
)
fig.update_yaxes(range=[0, 1])
fig.show()

# ---------------------------------------------------------------------------
# 4. Number of passed checks vs. citations
# ---------------------------------------------------------------------------
# Keep only rows where pred_live is "Yes".
df_filtered = df[df['pred_live'] == "Yes"].copy()
for col in pred_columns:
    df_filtered[col] = df_filtered[col] == "Yes"

# Number of checks answered "Yes" per row (0-6). The column used to be called
# 'no_count', but it has always counted the passed ("Yes") checks.
df_filtered['passed_count'] = df_filtered[pred_columns].sum(axis=1)

fig = px.scatter(
    df_filtered,
    x='pred_citations',
    y='passed_count',
    color='venue',
    title='Number of passed tests, Color Coded by Venue',
    # NOTE(review): the x label was keyed on 'pred_stars', which is not the
    # plotted column; the wording for 'pred_citations' should be confirmed.
    labels={
        'pred_citations': 'Predicted Citations',
        'passed_count': 'Automated Reproducibility score (0-6)',
    },
    category_orders={'venue': custom_order},  # Ensure custom order for venue
    log_x=True,
)
fig.show()

# Exploratory correlation checks, kept for reference (need `import numpy as np`):
# [np.corrcoef(np.array(df_filtered[col][~(pd.isna(df_filtered['pred_citations']))], dtype=int), df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])[0, 1] for col in pred_columns]
# np.corrcoef(np.array(df_filtered['passed_count'][~(pd.isna(df_filtered['pred_citations']))]), (1 + np.array(df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])))

# ---------------------------------------------------------------------------
# 5. Distribution of the number of passed checks, per venue
# ---------------------------------------------------------------------------
fig = px.strip(
    df_filtered,
    x='venue',
    y='passed_count',
    color='venue',
    title='Automated Reproducibility Score per Venue',
    labels={'passed_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},  # Ensure custom order for venues
    stripmode='overlay',  # Allows all individual points to overlay each other
)

# Add jitter along the x-axis so points don't overlap. No `mode` selector is
# used: px.strip emits box-type traces, which have no `mode` attribute, so
# selector=dict(mode='markers') matched nothing. This runs before the box
# overlay is added, so only the strip traces are restyled.
fig.update_traces(jitter=0.3, marker={'size': 8})

# Overlay the first (only) trace of a box plot to show median and spread.
fig.add_trace(
    px.box(
        df_filtered,
        x='venue',
        y='passed_count',
        category_orders={'venue': custom_order},
    ).data[0]
)
fig.show()

# ---------------------------------------------------------------------------
# 6. Agreement between reference and predicted columns, per venue
# ---------------------------------------------------------------------------
df_filtered = df[df['pred_live'] == "Yes"].copy()
# Collapse the license column: "No" and missing values stay as they are, any
# other value becomes "Yes".
df_filtered['license'] = df_filtered['license'].apply(
    lambda value: value if ((value == "No") | (pd.isna(value))) else "Yes"
)

# Row-wise agreement flags. Aggregating them by venue in the same groupby as
# the paper count keeps every count attached to its venue by key instead of
# relying on positional alignment after reset_index(drop=True).
match_columns = {real: f'matching_{real}' for real in real_pred_columns}
for real, pred in real_pred_columns.items():
    df_filtered[match_columns[real]] = df_filtered[real] == df_filtered[pred]

df_grouped = df_filtered.groupby('venue').agg(
    total_papers=('title', 'count'),
    **{name: (name, 'sum') for name in match_columns.values()},
).reset_index()

# Compute the ratio for each category
for real in real_pred_columns:
    df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']

# Melt the dataframe for visualization
df_melted = df_grouped.melt(
    id_vars=['venue'],
    value_vars=[f'ratio_{real}' for real in real_pred_columns],
    var_name='Category',
    value_name='Ratio',
)

# Clean up category names
df_melted['Category'] = (
    df_melted['Category'].str.replace('ratio_', '', regex=False).str.capitalize()
)

fig = px.bar(
    df_melted,
    x='venue',
    y='Ratio',
    color='Category',
    barmode='group',
    title='Ratio of Matching Real vs Predicted Categories by Venue',
    labels={'Ratio': 'Ratio of Matches'},
    # Same venue order as every other chart in this script.
    category_orders={'venue': custom_order},
)

# Ensure y-axis range is between 0 and 1
fig.update_yaxes(range=[0, 1])
fig.show()