import plotly.express as px
import pandas as pd
import re

# Ground-truth column -> automated prediction column, one entry per
# reproducibility criterion.
real_pred_columns = dict(
    dependencies='pred_dependencies',
    training='pred_training',
    evaluation='pred_evaluation',
    weights='pred_weights',
    readme='pred_readme',
    license='pred_license',
)

# Prediction columns, in the same order as the mapping above.
pred_columns = list(real_pred_columns.values())

# Load the tab-separated results table.
df = pd.read_csv('data/results.csv', sep="\t")

# Year cleanup: coerce unparseable years to NaN, drop those rows, then store
# the remaining years as integers.
df = (
    df.assign(year=pd.to_numeric(df['year'], errors='coerce'))
    .dropna(subset=['year'])
    .astype({'year': int})
)

# Disabled: replace each venue with the first single-quoted substring it contains.
# df['venue'] = df['venue'].apply(lambda x: str(re.search(r"'(.*?)'", x).group(1)))

# Display order of venues, shared by the charts below.
custom_order = ["MICCAI", "MIDL", "Nature", "arXiv"]

# Per (year, venue): number of papers (non-null titles), number of papers with
# a non-null URL, and number of papers whose pred_live value is "Yes".
df_grouped = df.groupby(['year', 'venue']).agg(
    total_papers=('title', 'count'),
    # 'count' counts non-null entries, i.e. papers that have a URL.
    papers_with_url=('url', 'count'),
    valid_urls=('pred_live', lambda live: (live == "Yes").sum()),
).reset_index()

# Share of papers that provide a URL at all.
df_grouped['ratio'] = df_grouped['papers_with_url'] / df_grouped['total_papers']

# Grouped bar chart: one bar per venue within each year.
# The title describes the plotted quantity (share of papers with a URL);
# the success rate of those URLs is a separate metric (valid_urls / papers_with_url).
fig = px.bar(
    df_grouped,
    x='year',
    y='ratio',
    color='venue',
    barmode='group',
    title='Ratio of Papers with URL per Venue and Year',
    labels={'ratio': 'Ratio of Papers with URL', 'year': 'Year', 'venue': 'Venue'},
    category_orders={'venue': custom_order}
)

fig.update_yaxes(range=[0, 1])
# Fixed x-axis window; years outside 2018-2024 are not displayed.
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()

# Share of provided URLs whose pred_live value is "Yes".
# Groups without any URL yield NaN (0 / 0).
df_grouped['valid_ratio'] = df_grouped['valid_urls'] / df_grouped['papers_with_url']


# Plot the success rates using Plotly, with year on x-axis and color by venue.
# The labels mapping must be keyed by the plotted column name ('valid_ratio'),
# otherwise the axis title falls back to the raw column name.
fig = px.bar(
    df_grouped,
    x='year',
    y='valid_ratio',
    color='venue',
    barmode='group',
    title='Success Rate per Venue and Year for "valid_url"',
    labels={'valid_ratio': 'Success Rate', 'year': 'Year', 'venue': 'Venue'},
    category_orders={'venue': custom_order}
)

fig.update_yaxes(range=[0, 1])
# Fixed x-axis window; years outside 2018-2024 are not displayed.
fig.update_xaxes(range=[2017.5, 2024.5])
fig.show()


# Convert the prediction columns to booleans (True only where the value is "Yes")
df_new = df.copy()
for col in pred_columns:
    df_new[col] = df_new[col] == "Yes"

# Rows whose pred_live value is "Yes"; only these count towards the ratios.
is_live = df_new['pred_live'] == "Yes"

# Build the per-row indicators once (row-aligned with df_new) and sum them per
# venue, instead of indexing every group with a whole-frame mask inside agg.
live_counts = pd.DataFrame({
    'venue': df_new['venue'],
    'valid_urls': is_live,
    **{col: df_new[col] & is_live for col in pred_columns},
})
df_grouped = live_counts.groupby('venue').sum().reset_index()

# Calculate the ratio for each prediction column relative to the number of
# live URLs of that venue (NaN for a venue without any live URL).
df_grouped[pred_columns] = df_grouped[pred_columns].div(df_grouped['valid_urls'], axis=0)

# Melt the dataframe for easier plotting: one row per (venue, prediction column)
df_melted = df_grouped.melt(
    id_vars=['venue'],
    value_vars=pred_columns,
    var_name='Prediction Type',
    value_name='Ratio',
)

# Create a grouped bar plot (bars side by side within each venue)
fig = px.bar(
    df_melted,
    x='venue',
    y='Ratio',
    color='Prediction Type',
    barmode='group',
    category_orders={'venue': custom_order},
    title='Ratio of Predictions by Venue',
)

# Show the figure
fig.update_yaxes(range=[0, 1])
fig.show()

# Step 1: Keep only rows where pred_live is "Yes" and convert the prediction
# columns to booleans (True only where the value is "Yes")
df_filtered = df[df['pred_live'] == "Yes"].copy()
for col in pred_columns:
    df_filtered[col] = df_filtered[col] == "Yes"

# Step 2: Count the "Yes" answers per row across the prediction columns.
# NOTE: despite its name, 'no_count' holds the number of passed checks; the
# column name is kept because later plots read it.
df_filtered['no_count'] = df_filtered[pred_columns].sum(axis=1)

# Step 3: Scatter plot with pred_citations on a log-scaled x-axis and the
# score on the y-axis, color-coded by venue.
# The labels mapping is keyed by the plotted column ('pred_citations') so the
# axis title is actually applied.
fig = px.scatter(
    df_filtered,
    x='pred_citations',
    y='no_count',
    color='venue',
    title='Number of passed tests, Color Coded by Venue',
    labels={'pred_citations': 'Citations', 'no_count': 'Automated Reproducibility score (0-6)'},
    category_orders={'venue': custom_order},  # Ensure custom order for venue if necessary
    log_x=True
)

# Step 4: Display the scatter plot
fig.show()

# [np.corrcoef(np.array(df_filtered[col][~(pd.isna(df_filtered['pred_citations']))], dtype=int), df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])[0, 1] for col in pred_columns]
# np.corrcoef(np.array(df_filtered['no_count'][~(pd.isna(df_filtered['pred_citations']))]), (1 + np.array(df_filtered['pred_citations'][~(pd.isna(df_filtered['pred_citations']))])))

# Step 1: Strip plot showing the individual per-paper scores for each venue
fig = px.strip(
    df_filtered,
    x='venue',
    y='no_count',
    color='venue',
    title='Automated Reproducibility Score per Venue',
    labels={'no_count': 'Automated Reproducibility Score (0-6)', 'venue': 'Venue'},
    category_orders={'venue': custom_order},  # Ensure custom order for venues
    stripmode='overlay'  # Allows all individual points to overlay each other
)

# Step 2: Add jitter so points don't overlap and enlarge the markers.
# px.strip draws its points with box traces, which have no 'mode' attribute,
# so a selector on mode='markers' would match nothing; update all traces
# present at this point (only the strip traces) instead.
fig.update_traces(jitter=0.3, marker={'size': 8})

# Step 3: Overlay a box plot to show median and spread
fig.add_trace(px.box(
    df_filtered,
    x='venue',
    y='no_count',
    category_orders={'venue': custom_order}
).data[0])  # We add the first trace of the box plot to overlay

# Step 4: Show the plot
fig.show()

# Restrict the comparison to rows where pred_live is "Yes"
df_filtered = df[df['pred_live'] == "Yes"].copy()

# Normalize the real license column: "No" and missing values are kept as they
# are, every other value becomes "Yes".
df_filtered['license'] = df_filtered['license'].apply(
    lambda value: value if (value == "No" or pd.isna(value)) else "Yes"
)

# Row-wise agreement between each real column and its predicted column
# (a missing value never equals anything, so it counts as a mismatch).
matches = pd.DataFrame({
    f'matching_{real}': df_filtered[real] == df_filtered[pred]
    for real, pred in real_pred_columns.items()
})
match_counts = matches.groupby(df_filtered['venue']).sum()

# Per-venue paper totals joined with the match counts. The join aligns on the
# venue label, so the counts cannot end up attached to the wrong venue.
df_grouped = (
    df_filtered.groupby('venue')
    .agg(total_papers=('title', 'count'))
    .join(match_counts)
    .reset_index()
)

# Compute the ratio for each category
for real in real_pred_columns:
    df_grouped[f'ratio_{real}'] = df_grouped[f'matching_{real}'] / df_grouped['total_papers']

# Melt the dataframe for visualization: one row per (venue, category)
df_melted = df_grouped.melt(
    id_vars=['venue'],
    value_vars=[f'ratio_{real}' for real in real_pred_columns],
    var_name='Category',
    value_name='Ratio',
)

# Clean up category names
df_melted['Category'] = df_melted['Category'].str.replace('ratio_', '').str.capitalize()

# Create the bar plot, using the same venue order as the other charts
fig = px.bar(
    df_melted,
    x='venue',
    y='Ratio',
    color='Category',
    barmode='group',
    title='Ratio of Matching Real vs Predicted Categories by Venue',
    labels={'Ratio': 'Ratio of Matches'},
    category_orders={'venue': custom_order},
)

# Ensure y-axis range is between 0 and 1
fig.update_yaxes(range=[0, 1])

# Show the figure
fig.show()