Spaces:
Sleeping
Sleeping
File size: 5,895 Bytes
77f290b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import plotly.express as px
import pandas as pd
df = pd.read_csv('data/results.csv', sep="\t")
# Fixed display order for venues in every figure below.
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]

# Denominator: total number of URLs per (year, venue).
total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')

# Numerator: URLs whose "pred_valid" is not False, i.e. *valid* URLs.
# NOTE: `!= False` (rather than `== True`) also counts NaN/unknown values as valid.
valid_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='valid_urls')

# Left-merge keeps every (year, venue) pair; pairs with no valid URLs get NaN, replaced by 0.
success_rate_df = pd.merge(total_urls_per_year_venue, valid_per_year_venue, on=['year', 'venue'], how='left')
success_rate_df['valid_urls'] = success_rate_df['valid_urls'].fillna(0)
success_rate_df['success_rate'] = success_rate_df['valid_urls'] / success_rate_df['total_urls']

# Grouped bar chart: valid-URL rate per year, one bar colour per venue.
fig = px.bar(
    success_rate_df,
    x='year',
    y='success_rate',
    color='venue',
    barmode='group',
    title='Success Rate per Venue and Year for "valid_url"',  # plain string: no placeholders, so no f-prefix
    labels={'success_rate': 'Success Rate', 'year': 'Year'},
    category_orders={'venue': custom_order}
)
fig.update_yaxes(range=[0, 1])  # rates are proportions in [0, 1]
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()
for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Denominator: only repositories with a valid URL, per (year, venue).
    total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')
    # Numerator: rows whose answer for this topic is anything but "No".
    passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')
    # BUG FIX: previously merged against `total_urls_per_year_venue` (all URLs), which left
    # the valid-URL denominator computed above unused and understated the success rate.
    success_rate_df = pd.merge(total_valid_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
    success_rate_df['successes'] = success_rate_df['successes'].fillna(0)  # no passes -> 0, not NaN
    success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
    # Grouped bar chart of the success rate per year, coloured by venue.
    fig = px.bar(
        success_rate_df,
        x='year',
        y='success_rate',
        color='venue',
        barmode='group',
        title=f'Success Rate per Venue and Year for "{topic}"',
        labels={'success_rate': 'Success Rate', 'year': 'Year'},  # key fixed: 'error_rate' never matched the y column
        category_orders={'venue': custom_order}
    )
    fig.update_yaxes(range=[0, 1])
    fig.update_xaxes(range=[2017.5, 2024.5])
    fig.show()
# Columns holding the per-criterion Yes/No reproducibility answers.
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Per-row reproducibility score: number of answers that are NOT "No" (0-6).
# (The column name `no_count` is kept for compatibility with the plots below,
# but note it counts non-"No" answers.) Vectorized; replaces a row-wise apply().
df['no_count'] = df[columns_to_check].ne('No').sum(axis=1)

# Scatter plot: reproducibility score vs citation count, colour-coded by venue.
fig = px.scatter(
    df,
    x='pred_citations',
    y='no_count',
    color='venue',
    title='Reproducibility Score vs Citations, Color Coded by Venue',  # old title wrongly said "'No' Answers" and "Predicted Stars"
    labels={'pred_citations': 'Citations', 'no_count': 'Automated Reproducibility score (0-6)'},  # key fixed: 'pred_stars' was never plotted
    category_orders={'venue': custom_order},
    log_x=True  # citation counts are heavy-tailed
)
fig.show()
# Columns holding the per-criterion reproducibility answers.
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Score each paper by how many of the checked answers differ from 'No' (vectorized).
df['no_count'] = df[columns_to_check].ne('No').sum(axis=1)

# Strip plot: one jittered marker per paper, grouped and coloured by venue.
fig = px.strip(
    df,
    x='venue',
    y='no_count',
    color='venue',
    title='Individual "No" Scores with Jitter per Venue',
    labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},
    stripmode='overlay'  # draw every point on top of the same venue column
)

# Spread markers horizontally so papers with identical scores stay visible.
fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))

# Overlay the first box-plot trace to summarise median and spread per venue.
box_fig = px.box(
    df,
    x='venue',
    y='no_count',
    category_orders={'venue': custom_order}
)
fig.add_trace(box_fig.data[0])

fig.show()
for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Denominator: total number of URLs per venue.
    total_urls_per_venue = df.groupby('venue').size().reset_index(name='total_urls')
    # Numerator: rows whose answer for this topic is anything but "No" — these are
    # successes (the old `errors_*` names described the opposite of what was counted).
    successes_per_venue = df[df[topic] != "No"].groupby('venue').size().reset_index(name='successes')
    # Left-merge keeps every venue; venues with no successes get NaN, replaced by 0.
    success_rate_df = pd.merge(total_urls_per_venue, successes_per_venue, on='venue', how='left')
    success_rate_df['successes'] = success_rate_df['successes'].fillna(0)
    success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']
    # Bar chart: one bar per venue, matching the title's "Success Rate" wording.
    fig = px.bar(
        success_rate_df,
        x='venue',
        y='success_rate',
        color='venue',
        title=f'Success Rate per Venue for "{topic}"',
        labels={'success_rate': 'Success Rate', 'venue': 'Venue'},
        category_orders={'venue': custom_order}
    )
    fig.update_yaxes(range=[0, 1])
    fig.show()
|