import plotly.express as px
import pandas as pd

df = pd.read_csv('data/results.csv', sep="\t")

custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]
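# Optional sanity check (a sketch, not part of the original analysis): confirm that the
# columns the plots below rely on are actually present in results.csv before computing
# any rates, and show how many rows there are per year and venue.
expected_columns = [
    'year', 'venue', 'pred_valid', 'pred_live', 'pred_dependencies', 'pred_training',
    'pred_evaluation', 'pred_weights', 'pred_readme', 'pred_license', 'pred_citations',
]
missing = [col for col in expected_columns if col not in df.columns]
if missing:
    raise ValueError(f"results.csv is missing expected columns: {missing}")
print(df[['year', 'venue']].value_counts().sort_index())  # rows per year/venue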
# Calculate the total number of URLs per year and venue
total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')

# Calculate the number of valid URLs (pred_valid not False) per year and venue
valid_urls_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='valid_urls')

# Merge the DataFrames to calculate the success rate
success_rate_df = pd.merge(total_urls_per_year_venue, valid_urls_per_year_venue, on=['year', 'venue'], how='left')
success_rate_df['valid_urls'] = success_rate_df['valid_urls'].fillna(0)  # Replace NaN with 0 for venues with no valid URLs
success_rate_df['success_rate'] = success_rate_df['valid_urls'] / success_rate_df['total_urls']

# Plot the success rates with Plotly: year on the x-axis, colored by venue
fig = px.bar(
    success_rate_df,
    x='year',
    y='success_rate',
    color='venue',
    barmode='group',
    title='Success Rate per Venue and Year for "valid_url"',
    labels={'success_rate': 'Success Rate', 'year': 'Year'},
    category_orders={'venue': custom_order}
)
fig.update_yaxes(range=[0, 1])
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()
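# Optional cross-check (a sketch, not in the original notebook): the same valid-URL
# rates as a venue-by-year table, which can be easier to eyeball than grouped bars.
valid_rate_table = (
    df.assign(valid=(df['pred_valid'] != False).astype(int))
      .pivot_table(index='venue', columns='year', values='valid', aggfunc='mean')
      .reindex(custom_order)
      .round(2)
)
print(valid_rate_table)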
for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Calculate the total number of valid URLs per year and venue
    total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')

    # Calculate the number of URLs that pass the check per year and venue
    passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')

    # Merge the DataFrames to calculate the success rate
    success_rate_df = pd.merge(total_valid_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
    success_rate_df['successes'] = success_rate_df['successes'].fillna(0)  # Replace NaN with 0 for venues with no passes
    success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']

    # Plot the success rates with Plotly: year on the x-axis, colored by venue
    fig = px.bar(
        success_rate_df,
        x='year',
        y='success_rate',
        color='venue',
        barmode='group',
        title=f'Success Rate per Venue and Year for "{topic}"',
        labels={'success_rate': 'Success Rate', 'year': 'Year'},
        category_orders={'venue': custom_order}
    )
    fig.update_yaxes(range=[0, 1])
    fig.update_xaxes(range=[2017.5, 2024.5])
    fig.show()
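# Alternative view (a sketch under the same column assumptions, not part of the original
# loop): combine the per-topic success rates into a single faceted figure instead of
# producing one figure per topic.
topics = ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation",
          "pred_weights", "pred_readme", "pred_license"]
valid_df = df[df["pred_valid"] == True]
long_df = (
    valid_df.melt(id_vars=['year', 'venue'], value_vars=topics,
                  var_name='topic', value_name='answer')
            .assign(passed=lambda d: d['answer'] != "No")
            .groupby(['year', 'venue', 'topic'], as_index=False)['passed'].mean()
)
fig = px.bar(
    long_df, x='year', y='passed', color='venue', barmode='group',
    facet_col='topic', facet_col_wrap=4,
    labels={'passed': 'Success Rate', 'year': 'Year'},
    category_orders={'venue': custom_order},
)
fig.update_yaxes(range=[0, 1])
fig.show()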
# Columns whose answers feed into the automated reproducibility score
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Step 1: Count the answers that are not "No" per row, i.e. the automated reproducibility score (0-6)
df['repro_score'] = (df[columns_to_check] != 'No').sum(axis=1)

# Step 2: Create a scatter plot with citations on the x-axis and the score on the y-axis, color-coded by venue
fig = px.scatter(
    df,
    x='pred_citations',
    y='repro_score',
    color='venue',
    title='Automated Reproducibility Score vs. Citations, Color-Coded by Venue',
    labels={'pred_citations': 'Citations', 'repro_score': 'Automated Reproducibility Score (0-6)'},
    category_orders={'venue': custom_order},  # Ensure custom order for venues
    log_x=True
)

# Step 3: Display the scatter plot
fig.show()
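# Optional follow-up (a sketch, not in the original): quantify the association visible in
# the scatter plot with a rank correlation, since citation counts are heavily skewed.
corr = df[['pred_citations', 'repro_score']].corr(method='spearman').iloc[0, 1]
print(f"Spearman correlation between citations and reproducibility score: {corr:.2f}")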
# Columns whose answers feed into the automated reproducibility score
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Step 1: Count the answers that are not "No" per row, i.e. the automated reproducibility score (0-6)
df['repro_score'] = (df[columns_to_check] != 'No').sum(axis=1)

# Step 2: Create a strip plot (scatter-like, with jitter) to show the individual scores per venue
fig = px.strip(
    df,
    x='venue',
    y='repro_score',
    color='venue',
    title='Individual Reproducibility Scores per Venue',
    labels={'repro_score': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},  # Ensure custom order for venues
    stripmode='overlay'  # Allows all individual points to overlay each other
)

# Step 3: Add some jitter to the x-axis and enlarge the markers so points don't overlap
fig.update_traces(jitter=0.3, marker={'size': 8})

# Step 4: Overlay a box plot to show the median and spread per venue
fig.add_trace(px.box(
    df,
    x='venue',
    y='repro_score',
    category_orders={'venue': custom_order}
).data[0])  # Add the first trace of the box plot as an overlay

# Step 5: Show the plot
fig.show()
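# Simpler alternative (a sketch, not part of the original): px.box with points='all' gives
# the box summary and the jittered individual points in a single call, without manually
# overlaying two figures.
fig = px.box(
    df,
    x='venue',
    y='repro_score',
    color='venue',
    points='all',
    title='Automated Reproducibility Score per Venue',
    labels={'repro_score': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},
)
fig.show()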
for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Calculate the total number of URLs per venue
    total_urls_per_venue = df.groupby('venue').size().reset_index(name='total_urls')

    # Calculate the number of URLs that pass the check per venue
    passes_per_venue = df[df[topic] != "No"].groupby('venue').size().reset_index(name='successes')

    # Merge the DataFrames to calculate the success rate
    success_rate_df = pd.merge(total_urls_per_venue, passes_per_venue, on='venue', how='left')
    success_rate_df['successes'] = success_rate_df['successes'].fillna(0)  # Replace NaN with 0 for venues with no passes
    success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']

    # Plot the success rates with Plotly, with venue on the x-axis
    fig = px.bar(
        success_rate_df,
        x='venue',
        y='success_rate',
        color='venue',
        title=f'Success Rate per Venue for "{topic}"',
        labels={'success_rate': 'Success Rate', 'venue': 'Venue'},
        category_orders={'venue': custom_order}
    )
    fig.update_yaxes(range=[0, 1])
    fig.show()
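# Optional summary (a sketch, not in the original): collect the same per-venue success
# rates for all topics in one table, one row per venue and one column per topic, using
# the same denominator as the loop above (all URLs per venue).
topics = ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation",
          "pred_weights", "pred_readme", "pred_license"]
summary = (
    df.groupby('venue')[topics]
      .apply(lambda g: (g != "No").mean())
      .reindex(custom_order)
      .round(2)
)
print(summary)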