File size: 5,895 Bytes
77f290b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import plotly.express as px
import pandas as pd

# Load the tab-separated results and fix the venue display order for all plots.
df = pd.read_csv('data/results.csv', sep="\t")
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]

# Denominator: total number of URLs per year and venue.
total_urls_per_year_venue = df.groupby(['year', 'venue']).size().reset_index(name='total_urls')

# Numerator: rows whose pred_valid is anything other than False — i.e. URLs
# judged valid. NOTE(review): `!= False` also counts NaN rows as valid;
# confirm that is intended. (Column name 'errors' is kept for compatibility
# with the labels mapping below, but it actually holds the success counts.)
valid_per_year_venue = df[df["pred_valid"] != False].groupby(['year', 'venue']).size().reset_index(name='errors')

# Merge and compute the per-group rate; (year, venue) groups with no valid
# URLs come through the left join as NaN and are zero-filled.
error_rate_df = pd.merge(total_urls_per_year_venue, valid_per_year_venue, on=['year', 'venue'], how='left')
error_rate_df['errors'] = error_rate_df['errors'].fillna(0)
error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']

# Grouped bar chart: valid-URL success rate per year, colored by venue.
fig = px.bar(
    error_rate_df,
    x='year',
    y='error_rate',
    color='venue',
    barmode='group',
    title='Success Rate per Venue and Year for "valid_url"',  # f-prefix dropped: no placeholders
    labels={'error_rate': 'Success Rate', 'year': 'Year'},
    category_orders={'venue': custom_order}
)

fig.update_yaxes(range=[0, 1])            # rates are bounded in [0, 1]
fig.update_xaxes(range=[2017.5, 2024.5])  # pad half a year on each side
fig.show()


for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Denominator: papers whose URL was judged valid, per year and venue.
    # (`== True` deliberately excludes NaN, unlike the `!= False` filter above.)
    total_valid_urls_per_year_venue = df[df["pred_valid"] == True].groupby(['year', 'venue']).size().reset_index(name='total_urls')

    # Numerator: rows where this topic's answer is anything but "No".
    # NOTE(review): NaN answers count as a pass here — confirm intended.
    passes_per_year_venue = df[df[topic] != "No"].groupby(['year', 'venue']).size().reset_index(name='successes')

    # BUG FIX: the merge previously used total_urls_per_year_venue (ALL URLs,
    # valid or not) from the section above, which left the valid-only
    # denominator computed on the first line of this loop unused.
    success_rate_df = pd.merge(total_valid_urls_per_year_venue, passes_per_year_venue, on=['year', 'venue'], how='left')
    success_rate_df['successes'] = success_rate_df['successes'].fillna(0)  # groups with no passes
    success_rate_df['success_rate'] = success_rate_df['successes'] / success_rate_df['total_urls']

    # Grouped bar chart: per-topic success rate per year, colored by venue.
    fig = px.bar(
        success_rate_df,
        x='year',
        y='success_rate',
        color='venue',
        barmode='group',
        title=f'Success Rate per Venue and Year for "{topic}"',
        # BUG FIX: the label key was 'error_rate', which matches no plotted
        # column in this figure; key it on the actual y column.
        labels={'success_rate': 'Success Rate', 'year': 'Year'},
        category_orders={'venue': custom_order}
    )

    fig.update_yaxes(range=[0, 1])
    fig.update_xaxes(range=[2017.5, 2024.5])
    fig.show()


# Columns that feed the automated reproducibility score (one point per "pass").
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Step 1: per-row count of answers that are NOT "No" — a 0-6 pass score
# (the column name 'no_count' is historical; it counts passes, not "No"s).
# Vectorized comparison replaces the slower per-row apply; result is identical.
df['no_count'] = (df[columns_to_check] != 'No').sum(axis=1)

# Step 2: scatter of reproducibility score vs. predicted citations by venue.
# BUG FIX: x is 'pred_citations', so the title and label key now say citations
# (the old 'pred_stars' label key matched no plotted column and had no effect).
fig = px.scatter(
    df,
    x='pred_citations',
    y='no_count',
    color='venue',
    title='Automated Reproducibility Score vs Predicted Citations, Color Coded by Venue',
    labels={'pred_citations': 'Predicted Citations', 'no_count': 'Automated Reproducibility score (0-6)'},
    category_orders={'venue': custom_order},  # consistent venue ordering
    log_x=True  # citation counts are heavy-tailed; log axis spreads them out
)

# Step 3: Display the scatter plot
fig.show()

# Same score columns as the scatter section (recomputed so this section
# stands on its own).
columns_to_check = ["pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]

# Step 1: per-row count of non-"No" answers — a 0-6 pass score (despite the
# 'no_count' name). Vectorized comparison replaces the per-row apply.
df['no_count'] = (df[columns_to_check] != 'No').sum(axis=1)

# Step 2: strip plot showing every individual paper's score per venue.
fig = px.strip(
    df,
    x='venue',
    y='no_count',
    color='venue',
    title='Individual "No" Scores with Jitter per Venue',
    labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},  # consistent venue ordering
    stripmode='overlay'  # draw all individual points on top of each other
)

# Step 3: jitter the points horizontally so identical scores don't overlap.
fig.update_traces(jitter=0.3, marker={'size': 8}, selector=dict(mode='markers'))

# Step 4: overlay a box plot to show median and spread per venue.
# NOTE(review): only .data[0] is added — a single combined box trace.
fig.add_trace(px.box(
    df,
    x='venue',
    y='no_count',
    category_orders={'venue': custom_order}
).data[0])

# Step 5: render.
fig.show()

for topic in ["pred_live", "pred_dependencies", "pred_training", "pred_evaluation", "pred_weights", "pred_readme", "pred_license"]:
    # Per-venue denominator: every paper in the dataset.
    venue_totals = df.groupby('venue').size().reset_index(name='total_urls')

    # Per-venue numerator: rows whose answer for this topic is not "No".
    # (Column is named 'errors' for historical reasons but holds pass counts.)
    venue_passes = df[df[topic] != "No"].groupby('venue').size().reset_index(name='errors')

    # Left-join totals with pass counts; venues with no passing rows arrive
    # as NaN and are zero-filled before computing the per-venue rate.
    error_rate_df = venue_totals.merge(venue_passes, on='venue', how='left')
    error_rate_df['errors'] = error_rate_df['errors'].fillna(0)
    error_rate_df['error_rate'] = error_rate_df['errors'] / error_rate_df['total_urls']

    # One bar per venue, colored and ordered consistently with earlier plots.
    fig = px.bar(
        error_rate_df,
        x='venue',
        y='error_rate',
        color='venue',
        title=f'Success Rate per Venue for "{topic}"',
        labels={'error_rate': 'Success Rate', 'venue': 'Venue'},
        category_orders={'venue': custom_order}
    )

    # Rates are bounded in [0, 1]; fix the axis so plots are comparable.
    fig.update_yaxes(range=[0, 1])
    fig.show()