{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Reproducibility results: plots per venue\n",
    "\n",
    "Loads `data/results.csv` and plots URL availability, the ratio of passed automated checks, and the agreement between real and predicted columns, grouped by venue."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "import plotly.express as px\n",
    "\n",
    "# Real column -> automated prediction column\n",
    "real_pred_columns = {\n",
    "    'dependencies': 'pred_dependencies',\n",
    "    'training': 'pred_training',\n",
    "    'evaluation': 'pred_evaluation',\n",
    "    'weights': 'pred_weights',\n",
    "    'readme': 'pred_readme',\n",
    "    'license': 'pred_license',\n",
    "}\n",
    "\n",
    "# Derived from the mapping so the two definitions cannot drift apart\n",
    "pred_columns = list(real_pred_columns.values())\n",
    "\n",
    "DATA_PATH = 'data/results.csv'\n",
    "YEAR_AXIS_RANGE = [2017.5, 2024.5]\n",
    "custom_order = ['MICCAI', 'MIDL', 'Nature', 'arXiv']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "def extract_venue(raw):\n",
    "    \"\"\"Return the first single-quoted token of ``raw``.\n",
    "\n",
    "    Values that are not strings (e.g. NaN) or that contain no quoted token\n",
    "    are returned unchanged instead of raising.\n",
    "    \"\"\"\n",
    "    if not isinstance(raw, str):\n",
    "        return raw\n",
    "    match = re.search(r\"'(.*?)'\", raw)\n",
    "    return match.group(1) if match else raw\n",
    "\n",
    "\n",
    "def plot_ratio_by_year(stats, y, title, y_label):\n",
    "    \"\"\"Build a grouped bar chart of column ``y`` per year, coloured by venue.\n",
    "\n",
    "    The y-axis is fixed to [0, 1] and the x-axis to ``YEAR_AXIS_RANGE``.\n",
    "    Returns the plotly figure without showing it.\n",
    "    \"\"\"\n",
    "    fig = px.bar(\n",
    "        stats,\n",
    "        x='year',\n",
    "        y=y,\n",
    "        color='venue',\n",
    "        barmode='group',\n",
    "        title=title,\n",
    "        # Key the label on the plotted column so it is actually applied\n",
    "        labels={y: y_label, 'year': 'Year', 'venue': 'Venue'},\n",
    "        category_orders={'venue': custom_order},\n",
    "    )\n",
    "    fig.update_yaxes(range=[0, 1])\n",
    "    fig.update_xaxes(range=YEAR_AXIS_RANGE)\n",
    "    return fig"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and clean the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "df = pd.read_csv(DATA_PATH, sep='\\t')\n",
    "\n",
    "# Rows whose year cannot be parsed are dropped; .copy() makes the result an\n",
    "# independent frame before the next column assignment\n",
    "df['year'] = pd.to_numeric(df['year'], errors='coerce')\n",
    "df = df.dropna(subset=['year']).copy()\n",
    "df['year'] = df['year'].astype(int)\n",
    "\n",
    "df['venue'] = df['venue'].map(extract_venue)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## URL availability per venue and year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "url_stats = (\n",
    "    df.groupby(['year', 'venue'])\n",
    "    .agg(\n",
    "        total_papers=('title', 'count'),\n",
    "        papers_with_url=('url', 'count'),  # 'count' skips missing values\n",
    "        valid_urls=('pred_live', lambda s: (s == 'Yes').sum()),\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Share of papers that provide a URL, and share of those URLs that are live\n",
    "url_stats['ratio'] = url_stats['papers_with_url'] / url_stats['total_papers']\n",
    "url_stats['valid_ratio'] = url_stats['valid_urls'] / url_stats['papers_with_url']\n",
    "\n",
    "plot_ratio_by_year(\n",
    "    url_stats,\n",
    "    y='ratio',\n",
    "    title='Ratio of Papers with URL per Venue and Year',\n",
    "    y_label='Ratio of Papers with URL',\n",
    ").show()\n",
    "\n",
    "plot_ratio_by_year(\n",
    "    url_stats,\n",
    "    y='valid_ratio',\n",
    "    title='Success Rate per Venue and Year for \"valid_url\"',\n",
    "    y_label='Success Rate',\n",
    ").show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Passed checks per venue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "is_live = df['pred_live'] == 'Yes'\n",
    "\n",
    "# One 0/1 flag per check; a check only counts when the URL is live\n",
    "live_flags = pd.DataFrame({'venue': df['venue'], 'valid_urls': is_live.astype(int)})\n",
    "for col in pred_columns:\n",
    "    live_flags[col] = (is_live & (df[col] == 'Yes')).astype(int)\n",
    "\n",
    "pred_ratios = live_flags.groupby('venue').sum().reset_index()\n",
    "pred_ratios[pred_columns] = pred_ratios[pred_columns].div(pred_ratios['valid_urls'], axis=0)\n",
    "\n",
    "# Long format: one row per (venue, prediction column)\n",
    "df_melted = pred_ratios.melt(\n",
    "    id_vars=['venue'],\n",
    "    value_vars=pred_columns,\n",
    "    var_name='Prediction Type',\n",
    "    value_name='Ratio',\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    df_melted,\n",
    "    x='venue',\n",
    "    y='Ratio',\n",
    "    color='Prediction Type',\n",
    "    barmode='group',  # bars side by side\n",
    "    category_orders={'venue': custom_order},\n",
    "    title='Ratio of Predictions by Venue',\n",
    ")\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Automated reproducibility score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "# Score = number of checks answered 'Yes' (0-6), for papers with a live URL only\n",
    "df_scored = df[df['pred_live'] == 'Yes'].copy()\n",
    "df_scored['repro_score'] = df_scored[pred_columns].eq('Yes').sum(axis=1)\n",
    "\n",
    "score_label = 'Automated Reproducibility Score (0-6)'\n",
    "\n",
    "fig = px.scatter(\n",
    "    df_scored,\n",
    "    x='pred_citations',\n",
    "    y='repro_score',\n",
    "    color='venue',\n",
    "    title='Number of passed tests, Color Coded by Venue',\n",
    "    labels={'pred_citations': 'Predicted Citations', 'repro_score': score_label, 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order},\n",
    "    log_x=True,\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "fig = px.strip(\n",
    "    df_scored,\n",
    "    x='venue',\n",
    "    y='repro_score',\n",
    "    color='venue',\n",
    "    title='Automated Reproducibility Score per Venue',\n",
    "    labels={'repro_score': score_label, 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order},\n",
    "    stripmode='overlay',\n",
    ")\n",
    "\n",
    "# px.strip draws its points with Box traces, which have no `mode` property, so\n",
    "# a selector on mode='markers' matches nothing. Style every trace here, before\n",
    "# the summary box is added, so only the strip points are affected.\n",
    "fig.update_traces(jitter=0.3, marker={'size': 8})\n",
    "\n",
    "# Overlay a single box trace to show median and spread per venue\n",
    "summary_box = px.box(\n",
    "    df_scored,\n",
    "    x='venue',\n",
    "    y='repro_score',\n",
    "    category_orders={'venue': custom_order},\n",
    ").data[0]\n",
    "fig.add_trace(summary_box)\n",
    "\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Agreement between real and predicted columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "python"
    }
   },
   "outputs": [],
   "source": [
    "df_filtered = df[df['pred_live'] == 'Yes'].copy()\n",
    "\n",
    "# Any concrete license value becomes 'Yes'; 'No' and missing values are kept\n",
    "license_raw = df_filtered['license']\n",
    "df_filtered['license'] = license_raw.where(license_raw.eq('No') | license_raw.isna(), 'Yes')\n",
    "\n",
    "agreement = (\n",
    "    df_filtered.groupby('venue')\n",
    "    .agg(total_papers=('title', 'count'))\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# 0/1 flag per row and category: does the real value equal the prediction?\n",
    "match_flags = pd.DataFrame(\n",
    "    {\n",
    "        real: (df_filtered[real] == df_filtered[pred]).astype(int)\n",
    "        for real, pred in real_pred_columns.items()\n",
    "    }\n",
    ")\n",
    "matches = match_flags.groupby(df_filtered['venue']).sum()\n",
    "\n",
    "# Attach the counts by venue label rather than by row position\n",
    "for real in real_pred_columns:\n",
    "    agreement[f'matching_{real}'] = agreement['venue'].map(matches[real])\n",
    "    agreement[f'ratio_{real}'] = agreement[f'matching_{real}'] / agreement['total_papers']\n",
    "\n",
    "ratio_columns = [f'ratio_{real}' for real in real_pred_columns]\n",
    "df_melted = agreement.melt(\n",
    "    id_vars=['venue'],\n",
    "    value_vars=ratio_columns,\n",
    "    var_name='Category',\n",
    "    value_name='Ratio',\n",
    ")\n",
    "\n",
    "# 'ratio_readme' -> 'Readme'\n",
    "df_melted['Category'] = (\n",
    "    df_melted['Category'].str.replace('ratio_', '', regex=False).str.capitalize()\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    df_melted,\n",
    "    x='venue',\n",
    "    y='Ratio',\n",
    "    color='Category',\n",
    "    barmode='group',\n",
    "    title='Ratio of Matching Real vs Predicted Categories by Venue',\n",
    "    labels={'Ratio': 'Ratio of Matches'},\n",
    "    category_orders={'venue': custom_order},\n",
    ")\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}