{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "import plotly.express as px\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "# Categories evaluated for every paper, in the order they are plotted\n",
    "categories = ['dependencies', 'training', 'evaluation',\n",
    "              'weights', 'readme', 'license']\n",
    "\n",
    "# Map each real column name to the column holding its prediction\n",
    "real_pred_columns = {real: f'pred_{real}' for real in categories}\n",
    "\n",
    "# The prediction columns on their own, in the same order as the categories\n",
    "pred_columns = list(real_pred_columns.values())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "# Load the tab-separated results table\n",
    "df = pd.read_csv('data/results.csv', sep=\"\\t\")\n",
    "\n",
    "# Keep only rows whose year parses as a number, then store it as an integer\n",
    "df = (\n",
    "    df.assign(year=pd.to_numeric(df['year'], errors='coerce'))\n",
    "    .dropna(subset=['year'])\n",
    "    .astype({'year': int})\n",
    ")\n",
    "\n",
    "# Reduce each venue cell to the first token enclosed in single quotes.\n",
    "# NOTE(review): a venue without a quoted token (or a missing venue) raises here.\n",
    "quoted_name = re.compile(r\"'(.*?)'\")\n",
    "df['venue'] = df['venue'].map(lambda raw: str(quoted_name.search(raw).group(1)))\n",
    "\n",
    "# Venue display order passed to the plots through category_orders\n",
    "custom_order = [\"MICCAI\", \"MIDL\", \"Nature\", \"arXiv\"]\n",
    "\n",
    "# Per (year, venue): number of papers, number of papers with a non-null\n",
    "# 'url', and number of papers whose pred_live value is \"Yes\"\n",
    "df_grouped = df.groupby(['year', 'venue']).agg(\n",
    "    total_papers=('title', 'count'),\n",
    "    papers_with_url=('url', 'count'),  # 'count' skips missing values\n",
    "    valid_urls=('pred_live', lambda x: (x == \"Yes\").sum())\n",
    ").reset_index()\n",
    "\n",
    "# Share of papers that provide a URL at all\n",
    "df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']\n",
    "\n",
    "# Grouped bars: year on the x-axis, one colour per venue.\n",
    "# The title names the plotted quantity so this figure is distinguishable\n",
    "# from the URL-validity chart that follows.\n",
    "fig = px.bar(\n",
    "    df_grouped,\n",
    "    x='year',\n",
    "    y='ratio',\n",
    "    color='venue',\n",
    "    barmode='group',\n",
    "    title='Ratio of Papers with URL per Venue and Year',\n",
    "    labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order}\n",
    ")\n",
    "\n",
    "# Ratios live in [0, 1]; the half-year padding keeps the outer bars fully visible\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "fig.update_xaxes(range=[2017.5, 2024.5])\n",
    "fig.show()\n",
    "\n",
    "# Among papers that provide a URL, the share whose pred_live value is \"Yes\"\n",
    "df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']\n",
    "\n",
    "# Plot the success rates with year on the x-axis and one colour per venue.\n",
    "# The labels mapping must be keyed by the plotted column ('valid_ratio'),\n",
    "# otherwise the axis falls back to the raw column name.\n",
    "fig = px.bar(\n",
    "    df_grouped,\n",
    "    x='year',\n",
    "    y='valid_ratio',\n",
    "    color='venue',\n",
    "    barmode='group',\n",
    "    title='Success Rate per Venue and Year for \"valid_url\"',\n",
    "    labels={'valid_ratio': 'Success Rate', 'year': 'Year', 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order}\n",
    ")\n",
    "\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "fig.update_xaxes(range=[2017.5, 2024.5])\n",
    "fig.show()\n",
    "\n",
    "\n",
    "# Turn the Yes/No prediction columns into real booleans\n",
    "df_new = df.copy()\n",
    "for col in pred_columns:\n",
    "    df_new[col] = df_new[col] == \"Yes\"\n",
    "\n",
    "# Only rows with pred_live == \"Yes\" may contribute to the counts. Apply that\n",
    "# mask row-wise before grouping, so the aggregation is a plain built-in sum\n",
    "# and never has to re-align a full-frame mask against each group.\n",
    "is_live = df_new['pred_live'] == \"Yes\"\n",
    "live_flags = pd.DataFrame({col: df_new[col] & is_live for col in pred_columns})\n",
    "\n",
    "# Per venue: number of live rows and, per prediction, number of live rows that passed\n",
    "df_grouped = (\n",
    "    live_flags\n",
    "    .assign(venue=df_new['venue'], valid_urls=is_live)\n",
    "    .groupby('venue')[['valid_urls'] + pred_columns]\n",
    "    .sum()\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Convert the counts into ratios relative to the number of live rows\n",
    "df_grouped[pred_columns] = df_grouped[pred_columns].div(df_grouped['valid_urls'], axis=0)\n",
    "\n",
    "# Long format: one row per (venue, prediction type)\n",
    "df_melted = df_grouped.melt(id_vars=['venue'],\n",
    "                             value_vars=pred_columns,\n",
    "                             var_name='Prediction Type',\n",
    "                             value_name='Ratio')\n",
    "\n",
    "# Grouped bar plot, bars side by side within each venue\n",
    "fig = px.bar(df_melted, x='venue', y='Ratio', color='Prediction Type',\n",
    "             barmode='group',\n",
    "             category_orders={'venue': custom_order},\n",
    "             title='Ratio of Predictions by Venue')\n",
    "\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "fig.show()\n",
    "\n",
    "# Step 1: Keep only rows where pred_live is \"Yes\" and convert the prediction\n",
    "# columns to booleans (True where the prediction is \"Yes\")\n",
    "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
    "for col in pred_columns:\n",
    "    df_filtered[col] = df_filtered[col] == \"Yes\"\n",
    "\n",
    "# Step 2: Per-row score = number of prediction columns answered \"Yes\"\n",
    "# (a count of passed checks, not of \"No\" answers)\n",
    "df_filtered['repro_score'] = df_filtered[pred_columns].sum(axis=1)\n",
    "\n",
    "# Step 3: Scatter plot with pred_citations on a log x-axis and the score on\n",
    "# the y-axis, colour-coded by venue. The labels mapping is keyed by the\n",
    "# plotted columns so both axes get readable names.\n",
    "# NOTE(review): the earlier label referred to 'pred_stars'; confirm that\n",
    "# 'pred_citations' is the intended x column.\n",
    "fig = px.scatter(\n",
    "    df_filtered,\n",
    "    x='pred_citations',\n",
    "    y='repro_score',\n",
    "    color='venue',\n",
    "    title='Number of passed tests, Color Coded by Venue',\n",
    "    labels={'pred_citations': 'Predicted Citations',\n",
    "            'repro_score': 'Automated Reproducibility score (0-6)',\n",
    "            'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order},\n",
    "    log_x=True\n",
    ")\n",
    "\n",
    "# Step 4: Display the scatter plot\n",
    "fig.show()\n",
    "\n",
    "# Step 1: Per-row score = number of prediction columns that are True\n",
    "# (a count of passed checks, not of \"No\" answers)\n",
    "df_filtered['repro_score'] = df_filtered[pred_columns].sum(axis=1)\n",
    "\n",
    "# Step 2: Strip plot showing every individual score, one strip per venue\n",
    "fig = px.strip(\n",
    "    df_filtered,\n",
    "    x='venue',\n",
    "    y='repro_score',\n",
    "    color='venue',\n",
    "    title='Automated Reproducibility Score per Venue',\n",
    "    labels={'repro_score': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order},\n",
    "    stripmode='overlay'\n",
    ")\n",
    "\n",
    "# Step 3: Spread the points sideways and enlarge them. px.strip draws its\n",
    "# points with Box traces, which have no 'mode' attribute, so a\n",
    "# selector on mode='markers' would match nothing; at this point the figure\n",
    "# only holds the strip traces, so no selector is needed.\n",
    "fig.update_traces(jitter=0.3, marker={'size': 8})\n",
    "\n",
    "# Step 4: Overlay a box plot to show median and spread. Without a colour\n",
    "# split px.box yields a single trace, so data[0] covers all venues.\n",
    "fig.add_trace(px.box(\n",
    "    df_filtered,\n",
    "    x='venue',\n",
    "    y='repro_score',\n",
    "    labels={'repro_score': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},\n",
    "    category_orders={'venue': custom_order}\n",
    ").data[0])\n",
    "\n",
    "# Step 5: Show the plot\n",
    "fig.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "# Restrict to rows whose pred_live value is \"Yes\"\n",
    "df_filtered = df[df['pred_live'] == \"Yes\"].copy()\n",
    "\n",
    "# Collapse the real license column: \"No\" and missing values stay as they\n",
    "# are, every other value becomes \"Yes\" so it can be compared with pred_license\n",
    "license_raw = df_filtered['license']\n",
    "df_filtered['license'] = license_raw.where(license_raw.eq(\"No\") | license_raw.isna(), \"Yes\")\n",
    "\n",
    "# Row-wise agreement between each real column and its prediction\n",
    "match_flags = pd.DataFrame({\n",
    "    f'matching_{real}': df_filtered[real] == df_filtered[pred]\n",
    "    for real, pred in real_pred_columns.items()\n",
    "})\n",
    "\n",
    "# Per venue: paper count plus number of matches per category. The match\n",
    "# counts are joined on the venue key, so the result does not depend on two\n",
    "# separate group-bys happening to return rows in the same order.\n",
    "df_grouped = (\n",
    "    df_filtered.groupby('venue')\n",
    "    .agg(total_papers=('title', 'count'))\n",
    "    .join(match_flags.groupby(df_filtered['venue']).sum())\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Compute the ratio of matches for each category\n",
    "for real in real_pred_columns:\n",
    "    df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']\n",
    "\n",
    "# Long format: one row per (venue, category)\n",
    "df_melted = df_grouped.melt(id_vars=['venue'],\n",
    "                             value_vars=[f'ratio_{real}' for real in real_pred_columns],\n",
    "                             var_name='Category',\n",
    "                             value_name='Ratio')\n",
    "\n",
    "# Turn 'ratio_<name>' into '<Name>' for the legend (literal, non-regex replace)\n",
    "df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '', regex=False).str.capitalize()\n",
    "\n",
    "# Grouped bar plot, venues in the same order as the other venue plots\n",
    "fig = px.bar(df_melted, x='venue', y='Ratio', color='Category',\n",
    "             barmode='group',\n",
    "             category_orders={'venue': custom_order},\n",
    "             title='Ratio of Matching Real vs Predicted Categories by Venue',\n",
    "             labels={'Ratio': 'Ratio of Matches'})\n",
    "\n",
    "# Ratios live in [0, 1]\n",
    "fig.update_yaxes(range=[0, 1])\n",
    "\n",
    "# Show the figure\n",
    "fig.show()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}