{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !pip install plotly kaleido datasets nbformat -U -q"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import datasets\n",
"import pandas as pd\n",
"from dotenv import load_dotenv\n",
"from huggingface_hub import login\n",
"\n",
"\n",
"load_dotenv(override=True)\n",
"login(os.getenv(\"HF_TOKEN\"))\n",
"\n",
"pd.set_option(\"max_colwidth\", None)\n",
"\n",
"OUTPUT_DIR = \"../../output\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]\n",
"eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})\n",
"eval_df = pd.DataFrame(eval_ds)"
]
},
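{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check: preview the renamed columns and the per-level question counts before joining run results against this reference."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Quick look at the reference split: the rest of the notebook assumes the columns\n",
"# \"question\", \"true_answer\", \"task\" (GAIA level) and \"file_name\" are present.\n",
"print(eval_df.columns.tolist())\n",
"display(eval_df[\"task\"].value_counts())\n",
"eval_df[[\"task\", \"question\", \"file_name\"]].head(2)"
]
},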
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 1. Load all results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"\n",
"\n",
"results = []\n",
"for f in glob.glob(f\"{OUTPUT_DIR}/validation/*.jsonl\"):\n",
" df = pd.read_json(f, lines=True)\n",
" df[\"agent_name\"] = f.split(\"/\")[-1].split(\".\")[0]\n",
" results.append(df)\n",
"\n",
"result_df = pd.concat(results)\n",
"result_df[\"prediction\"] = result_df[\"prediction\"].fillna(\"No prediction\")"
]
},
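{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional schema check: the analysis below assumes each run file provides the columns listed here, so a differently logged run fails loudly at this point rather than later."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns that the scoring and inspection cells below rely on.\n",
"expected_columns = {\n",
"    \"question\",\n",
"    \"prediction\",\n",
"    \"true_answer\",\n",
"    \"intermediate_steps\",\n",
"    \"start_time\",\n",
"    \"end_time\",\n",
"    \"task\",\n",
"}\n",
"missing_columns = expected_columns - set(result_df.columns)\n",
"print(\"Missing columns:\", missing_columns or \"none\")\n",
"result_df.head(1)"
]
},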
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"from collections import Counter\n",
"\n",
"from scripts.gaia_scorer import check_close_call, question_scorer\n",
"\n",
"\n",
"result_df[\"is_correct\"] = result_df.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
"result_df[\"is_near_correct\"] = result_df.apply(\n",
" lambda x: check_close_call(x[\"prediction\"], x[\"true_answer\"], x[\"is_correct\"]),\n",
" axis=1,\n",
")\n",
"\n",
"result_df[\"count_steps\"] = result_df[\"intermediate_steps\"].apply(len)\n",
"\n",
"\n",
"def find_attachment(question):\n",
" matches = eval_df.loc[eval_df[\"question\"].apply(lambda x: x in question), \"file_name\"]\n",
"\n",
" if len(matches) == 0:\n",
" return \"Not found\"\n",
" file_path = matches.values[0]\n",
"\n",
" if isinstance(file_path, str) and len(file_path) > 0:\n",
" return file_path.split(\".\")[-1]\n",
" else:\n",
" return \"None\"\n",
"\n",
"\n",
"result_df[\"attachment_type\"] = result_df[\"question\"].apply(find_attachment)\n",
"\n",
"\n",
"def extract_tool_calls(code):\n",
" regex = r\"\\b(\\w+)\\(\"\n",
" function_calls = [el for el in re.findall(regex, code) if el.islower()]\n",
"\n",
" function_call_counter = Counter(function_calls)\n",
" return function_call_counter\n",
"\n",
"\n",
"def sum_tool_calls(steps):\n",
" total_count = Counter()\n",
" for step in steps:\n",
" if \"llm_output\" in step:\n",
" total_count += extract_tool_calls(step[\"llm_output\"])\n",
"\n",
" return total_count\n",
"\n",
"\n",
"def get_durations(row):\n",
" # start_datetime = datetime.strptime(row['start_time'], \"%Y-%m-%d %H:%M:%S\")\n",
" # end_datetime = datetime.strptime(row['end_time'], \"%Y-%m-%d %H:%M:%S\")\n",
"\n",
" duration_timedelta = row[\"end_time\"] - row[\"start_time\"]\n",
" return int(duration_timedelta.total_seconds())\n",
"\n",
"\n",
"result_df[\"duration\"] = result_df.apply(get_durations, axis=1)\n",
"# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)"
]
},
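{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make `extract_tool_calls` concrete: it counts lowercase identifiers followed by an opening parenthesis. The snippet below is a made-up LLM output, not a real trace, and the tool names in it are hypothetical."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: a fabricated snippet (web_search / visit_webpage / final_answer are\n",
"# hypothetical tool names) showing what the regex counts as a tool call.\n",
"sample_llm_output = \"\"\"\n",
"results = web_search(query=\"population of Lyon in 1950\")\n",
"summary = visit_webpage(results[0])\n",
"final_answer(summary)\n",
"\"\"\"\n",
"extract_tool_calls(sample_llm_output)  # each of the three calls is counted once"
]
},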
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_thoughts(x):\n",
"    try:\n",
"        output = x[0][\"task\"]\n",
"        for y in x[1:]:\n",
"            try:\n",
"                if \"observation\" in y:\n",
"                    output += y[\"llm_output\"] + \"\\nObservation:\" + y[\"observation\"]\n",
"                else:\n",
"                    output += y[\"llm_output\"] + \"\\nError:\" + str(y[\"error\"])\n",
"            except Exception:\n",
"                pass\n",
"        return output\n",
"    except Exception:\n",
"        return None\n",
"\n",
"\n",
"result_df[\"thoughts\"] = result_df[\"intermediate_steps\"].apply(get_thoughts)"
]
},
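{
"cell_type": "markdown",
"metadata": {},
"source": [
"Spot-check one reconstructed trace (truncated) to confirm the concatenation above produces readable text."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"non_empty_thoughts = result_df[\"thoughts\"].dropna()\n",
"if len(non_empty_thoughts) > 0:\n",
"    print(non_empty_thoughts.iloc[0][:1000])"
]
},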
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result_df[\"agent_name\"].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 2. Inspect specific runs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sel_df = result_df\n",
"# sel_df = sel_df.loc[\n",
"# (result_df[\"agent_name\"].isin(list_versions))\n",
"# ]\n",
"sel_df = sel_df.reset_index(drop=True)\n",
"display(sel_df[\"agent_name\"].value_counts())\n",
"sel_df = sel_df.drop_duplicates(subset=[\"agent_name\", \"question\"])\n",
"display(sel_df.groupby(\"agent_name\")[[\"task\"]].value_counts())\n",
"print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display(\"Average score:\", sel_df.groupby(\"agent_name\")[[\"is_correct\"]].mean().round(3))\n",
"display(\n",
" sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\", \"duration\"]]\n",
" .agg(\n",
" {\n",
" \"is_correct\": \"mean\",\n",
" \"is_near_correct\": \"mean\",\n",
" \"count_steps\": \"mean\",\n",
" \"question\": \"count\",\n",
" \"duration\": \"mean\",\n",
" }\n",
" )\n",
" .rename(columns={\"question\": \"count\"})\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"\n",
"cumulative_df = (\n",
" (\n",
" sel_df.groupby(\"agent_name\")[[\"is_correct\", \"is_near_correct\"]]\n",
" .expanding(min_periods=1, axis=0, method=\"single\")\n",
" .agg({\"is_correct\": \"mean\", \"is_near_correct\": \"count\"})\n",
" .reset_index()\n",
" )\n",
" .copy()\n",
" .rename(columns={\"is_near_correct\": \"index\"})\n",
")\n",
"cumulative_df[\"index\"] = cumulative_df[\"index\"].astype(int) - 1\n",
"\n",
"\n",
"def find_question(row):\n",
" try:\n",
" res = sel_df.loc[sel_df[\"agent_name\"] == row[\"agent_name\"], \"question\"].iloc[row[\"index\"]][:50]\n",
" return res\n",
" except Exception:\n",
" return \"\"\n",
"\n",
"\n",
"cumulative_df[\"question\"] = cumulative_df.apply(find_question, axis=1)\n",
"\n",
"px.line(\n",
" cumulative_df,\n",
" color=\"agent_name\",\n",
" x=\"index\",\n",
" y=\"is_correct\",\n",
" hover_data=\"question\",\n",
")"
]
},
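{
"cell_type": "markdown",
"metadata": {},
"source": [
"The lines above are running accuracies per agent. The dummy example below just illustrates the expanding-mean mechanic that produces the y-axis values."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expanding mean over a boolean column = accuracy after each successive question.\n",
"demo_correct = pd.Series([True, False, True, True])\n",
"demo_correct.expanding().mean().tolist()  # [1.0, 0.5, 0.66..., 0.75]"
]
},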
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 3. Dive deeper into one run"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sel_df = result_df.loc[result_df[\"agent_name\"] == \"o1\"]\n",
"print(len(sel_df))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Count errors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"\n",
"error_types = [\n",
" \"AgentParsingError\",\n",
" \"AgentExecutionError\",\n",
" \"AgentMaxIterationsError\",\n",
" \"AgentGenerationError\",\n",
"]\n",
"sel_df[error_types] = 0\n",
"sel_df[\"Count steps\"] = np.nan\n",
"\n",
"\n",
"def count_errors(row):\n",
" if isinstance(row[\"intermediate_steps\"], list):\n",
" row[\"Count steps\"] = len(row[\"intermediate_steps\"])\n",
" for step in row[\"intermediate_steps\"]:\n",
" if isinstance(step, dict) and \"error\" in step:\n",
" try:\n",
" row[str(step[\"error\"][\"error_type\"])] += 1\n",
" except Exception:\n",
" pass\n",
" return row\n",
"\n",
"\n",
"sel_df = sel_df.apply(count_errors, axis=1)"
]
},
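{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick totals of the per-type error counts created above, before the grouped view in the next cell."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sel_df[error_types].sum().sort_values(ascending=False)"
]
},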
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import plotly.express as px\n",
"\n",
"\n",
"aggregate_errors = (\n",
" sel_df.groupby([\"is_correct\"])[error_types + [\"Count steps\"]].mean().reset_index().melt(id_vars=[\"is_correct\"])\n",
")\n",
"\n",
"fig = px.bar(\n",
" aggregate_errors,\n",
" y=\"value\",\n",
" x=\"variable\",\n",
" color=\"is_correct\",\n",
" labels={\n",
" \"agent_name\": \"Model\",\n",
" \"task\": \"Level\",\n",
" \"aggregate_score\": \"Performance\",\n",
" \"value\": \"Average count\",\n",
" \"eval_score_GPT4\": \"Score\",\n",
" },\n",
")\n",
"fig.update_layout(\n",
" height=500,\n",
" width=800,\n",
" barmode=\"group\",\n",
" bargroupgap=0.0,\n",
")\n",
"fig.update_traces(textposition=\"outside\")\n",
"fig.write_image(\"aggregate_errors.png\", scale=3)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Inspect results by file extension"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"display(\n",
" result_df.groupby([\"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
" {\"is_correct\": \"mean\", \"count_steps\": \"mean\", \"question\": \"count\"}\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 4. Ensembling methods"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"counts = result_df[\"agent_name\"].value_counts()\n",
"long_series = result_df.loc[result_df[\"agent_name\"].isin(counts[counts > 140].index)]"
]
},
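{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check which runs survive the >140-answers filter: ensembling only makes sense across runs that cover (nearly) the same questions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"long_series[\"agent_name\"].value_counts()"
]
},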
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def majority_vote(df):\n",
" df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n",
"\n",
" answer_modes = df.groupby(\"question\")[\"prediction\"].agg(lambda x: x.mode()[0]).reset_index()\n",
" first_occurrences = (\n",
" df.groupby([\"question\", \"prediction\"]).agg({\"task\": \"first\", \"is_correct\": \"first\"}).reset_index()\n",
" )\n",
" result = answer_modes.merge(first_occurrences, on=[\"question\", \"prediction\"], how=\"left\")\n",
"\n",
" return result\n",
"\n",
"\n",
"def oracle(df):\n",
" def get_first_correct_or_first_wrong(group):\n",
" correct_answers = group[group[\"is_correct\"]]\n",
" if len(correct_answers) > 0:\n",
" return correct_answers.iloc[0]\n",
" return group.iloc[0]\n",
"\n",
" result = df.groupby(\"question\").apply(get_first_correct_or_first_wrong)\n",
"\n",
" return result.reset_index(drop=True)\n",
"\n",
"\n",
"display((long_series.groupby(\"agent_name\")[\"is_correct\"].mean() * 100).round(2))\n",
"print(f\"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}\")\n",
"print(f\"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}\")"
]
},
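{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the two baselines concrete, here is a toy run on synthetic predictions (made-up data, purely illustrative): the majority vote keeps the most frequent answer per question, while the oracle keeps a correct answer whenever any run found one."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Synthetic predictions for two questions across three hypothetical runs.\n",
"toy_runs = pd.DataFrame(\n",
"    {\n",
"        \"question\": [\"q1\", \"q1\", \"q1\", \"q2\", \"q2\", \"q2\"],\n",
"        \"prediction\": [\"42\", \"42\", \"7\", \"Paris\", \"Lyon\", \"Lyon\"],\n",
"        \"is_correct\": [True, True, False, True, False, False],\n",
"        \"task\": [1, 1, 1, 2, 2, 2],\n",
"    }\n",
")\n",
"print(f\"Majority vote accuracy: {majority_vote(toy_runs)['is_correct'].mean():.2f}\")  # picks \"42\" and \"Lyon\" -> 0.50\n",
"print(f\"Oracle accuracy: {oracle(toy_runs)['is_correct'].mean():.2f}\")  # at least one run is correct on both -> 1.00"
]
},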
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Submit"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"agent_run = \"code_o1_04_february_submission5.jsonl\"\n",
"df = pd.read_json(f\"output/validation/{agent_run}\", lines=True)\n",
"df = df[[\"task_id\", \"prediction\", \"intermediate_steps\"]]\n",
"df = df.rename(columns={\"prediction\": \"model_answer\", \"intermediate_steps\": \"reasoning_trace\"})"
]
},
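{
"cell_type": "markdown",
"metadata": {},
"source": [
"Light check of the submission format before writing the file: one row per task_id and no missing answers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Rows:\", len(df))\n",
"print(\"Duplicate task_ids:\", df[\"task_id\"].duplicated().sum())\n",
"print(\"Missing answers:\", df[\"model_answer\"].isna().sum())\n",
"df.head(2)"
]
},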
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.to_json(\"submission.jsonl\", orient=\"records\", lines=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "test",
"language": "python",
"name": "test"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}