In [None]:
# !pip install plotly kaleido datasets nbformat -U -q

In [None]:
import os

import datasets
import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import login


# Load variables from a local .env file; override=True asks dotenv to replace
# values that are already present in the environment.
load_dotenv(override=True)
# Authenticate against the Hugging Face Hub with the HF_TOKEN environment variable.
login(os.getenv("HF_TOKEN"))

# Show full cell contents when DataFrames are displayed (no column-width truncation).
pd.set_option("max_colwidth", None)

# Root folder of the run outputs (a relative path, resolved against the working directory).
OUTPUT_DIR = "../../output"

In [None]:
# Validation split of the GAIA benchmark, "2023_all" configuration.
eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")["validation"]
# Rename the dataset columns to the names used by the rest of this notebook.
eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
# DataFrame view of the reference data; find_attachment looks questions up in it.
eval_df = pd.DataFrame(eval_ds)

# 1. Load all results

In [None]:
import glob


# Collect every run log of the validation folder; each .jsonl file is one agent run.
# glob.glob returns matches in arbitrary order, so sort for a reproducible row order.
run_files = sorted(glob.glob(f"{OUTPUT_DIR}/validation/*.jsonl"))
if not run_files:
    # Fail with an actionable message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No .jsonl result files found in {OUTPUT_DIR}/validation")

results = []
for f in run_files:
    df = pd.read_json(f, lines=True)
    # os.path.basename copes with the platform's path separator (glob yields "\\" on
    # Windows); the agent name is the file name up to its first dot.
    df["agent_name"] = os.path.basename(f).split(".")[0]
    results.append(df)

result_df = pd.concat(results)
# Runs without an answer get an explicit placeholder so they can be scored as strings.
result_df["prediction"] = result_df["prediction"].fillna("No prediction")

In [None]:
import re
from collections import Counter

from scripts.gaia_scorer import check_close_call, question_scorer


# Score every prediction against the reference answer with the GAIA scorer (row-wise).
result_df["is_correct"] = result_df.apply(lambda x: question_scorer(x["prediction"], x["true_answer"]), axis=1)
# Close-call flag; check_close_call also receives the correctness flag computed just above.
result_df["is_near_correct"] = result_df.apply(
 lambda x: check_close_call(x["prediction"], x["true_answer"], x["is_correct"]),
 axis=1,
)

# Number of recorded intermediate steps per run (len of the "intermediate_steps" value).
result_df["count_steps"] = result_df["intermediate_steps"].apply(len)


def find_attachment(question, reference_df=None):
    """Return the file extension of the attachment that belongs to a question.

    A reference row matches when its ``question`` text is contained in
    ``question`` (substring match); only the first matching row is used.

    Args:
        question: Question text taken from a result row.
        reference_df: DataFrame with ``question`` and ``file_name`` columns.
            When omitted, the notebook-level ``eval_df`` is used.

    Returns:
        ``"Not found"`` when no reference question matches, ``"None"`` when the
        matched row has no non-empty string file name, otherwise the text after
        the last ``"."`` of the file name.
    """
    if reference_df is None:
        reference_df = eval_df  # notebook-level reference frame

    # astype(bool) keeps the mask boolean even when the reference frame is empty.
    is_match = reference_df["question"].apply(lambda x: x in question).astype(bool)
    matches = reference_df.loc[is_match, "file_name"]

    if len(matches) == 0:
        return "Not found"

    file_path = matches.iloc[0]
    # Missing file names may be None/NaN rather than "", hence the isinstance check.
    if isinstance(file_path, str) and len(file_path) > 0:
        return file_path.split(".")[-1]
    return "None"


# Attachment extension per result row, looked up from each row's question text.
result_df["attachment_type"] = result_df["question"].apply(find_attachment)


def extract_tool_calls(code):
    """Count call-like identifiers found in a piece of text.

    An identifier counts when it is immediately followed by ``(`` and
    ``str.islower()`` is true for it, so names containing uppercase letters
    (or no cased letters at all) are ignored.

    Args:
        code: Text to scan.

    Returns:
        A ``Counter`` mapping each retained identifier to its number of occurrences.
    """
    called_names = re.findall(r"\b(\w+)\(", code)
    return Counter(name for name in called_names if name.islower())


def sum_tool_calls(steps):
    """Add up the call counts of every step that carries an ``"llm_output"``.

    Args:
        steps: Iterable of step mappings; steps without an ``"llm_output"`` key
            are ignored.

    Returns:
        A ``Counter`` with the summed counts produced by ``extract_tool_calls``.
    """
    per_step_counts = (extract_tool_calls(step["llm_output"]) for step in steps if "llm_output" in step)
    return sum(per_step_counts, Counter())


def get_durations(row):
    """Return the run duration of a row in whole seconds.

    ``row["start_time"]`` and ``row["end_time"]`` are subtracted directly, so
    they must already be datetime-like values (no string parsing happens here).
    The fractional part is dropped by ``int()``.

    Args:
        row: Mapping or Series with ``"start_time"`` and ``"end_time"`` entries.

    Returns:
        The elapsed time as an ``int`` number of seconds.
    """
    elapsed = row["end_time"] - row["start_time"]
    whole_seconds = int(elapsed.total_seconds())
    return whole_seconds


# Run duration in whole seconds, computed row by row.
result_df["duration"] = result_df.apply(get_durations, axis=1)
# Per-run tool-call counts are currently disabled:
# result_df["tool_calls"] = result_df["intermediate_steps"].apply(sum_tool_calls)

In [None]:
def get_thoughts(x):
    """Flatten a run's intermediate steps into a single text trace.

    The trace starts with the ``"task"`` text of the first step. Every later
    step appends its ``"llm_output"`` followed by either its ``"observation"``
    or, when there is none, the string form of its ``"error"``.

    Args:
        x: Sequence of step mappings.

    Returns:
        The concatenated trace, or ``None`` when ``x`` cannot be read at all
        (for example it is empty, not indexable, or its first step has no
        ``"task"``). Individual steps that cannot be rendered are skipped.
    """
    try:
        output = x[0]["task"]
        for y in x[1:]:
            try:
                if "observation" in y:
                    output += y["llm_output"] + "\nObservation:" + y["observation"]
                else:
                    # Newline separator, consistent with "\nObservation:" above
                    # (a raw r"\Error:" would emit a literal backslash).
                    output += y["llm_output"] + "\nError:" + str(y["error"])
            except Exception:
                # Best effort: a malformed step (missing key, non-string value)
                # is skipped and the trace built so far is kept unchanged.
                continue
        return output
    except Exception:
        return None


# One flattened text trace per run (None when the steps cannot be read).
result_df["thoughts"] = result_df["intermediate_steps"].apply(get_thoughts)

In [None]:
# Number of result rows loaded per agent.
result_df["agent_name"].value_counts()

# 2. Inspect specific runs

In [None]:
# Start from all loaded runs; the commented filter below can restrict the
# selection to a list of agent names (list_versions is not defined in this notebook).
sel_df = result_df
# sel_df = sel_df.loc[
# (result_df["agent_name"].isin(list_versions))
# ]
sel_df = sel_df.reset_index(drop=True)
# Row counts per agent before de-duplication.
display(sel_df["agent_name"].value_counts())
# Keep a single row per (agent, question) pair: the first one encountered.
sel_df = sel_df.drop_duplicates(subset=["agent_name", "question"])
# Number of questions per task level for each agent.
display(sel_df.groupby("agent_name")[["task"]].value_counts())
# NOTE(review): 165 looks like the expected number of questions for a single run;
# with several agents selected the total cannot equal it - confirm the intent.
print("Total length:", len(sel_df), "- is complete:", len(sel_df) == 165)

In [None]:
# Overall accuracy per agent, rounded to three decimals.
display("Average score:", sel_df.groupby("agent_name")[["is_correct"]].mean().round(3))
# Per agent and task level: mean correctness, mean close-call rate, mean step
# count, number of questions (the "question" count, renamed to "count") and
# mean duration.
display(
 sel_df.groupby(["agent_name", "task"])[["is_correct", "is_near_correct", "count_steps", "question", "duration"]]
 .agg(
 {
 "is_correct": "mean",
 "is_near_correct": "mean",
 "count_steps": "mean",
 "question": "count",
 "duration": "mean",
 }
 )
 .rename(columns={"question": "count"})
)

In [None]:
import plotly.express as px


# Running accuracy per agent, in row order: the expanding mean of "is_correct".
# The expanding count of "is_near_correct" serves as a 1-based row counter
# within each agent and is renamed to "index".
cumulative_df = (
    sel_df.groupby("agent_name")[["is_correct", "is_near_correct"]]
    # `axis` is intentionally not passed: 0 is the default, and the keyword was
    # deprecated in pandas 2.1 and is rejected by later releases.
    .expanding(min_periods=1, method="single")
    .agg({"is_correct": "mean", "is_near_correct": "count"})
    .reset_index()
    .rename(columns={"is_near_correct": "index"})
)
# Convert the running count into a 0-based integer position.
cumulative_df["index"] = cumulative_df["index"].astype(int) - 1


def find_question(row):
    """Return the first 50 characters of the question behind a cumulative row.

    The question is looked up in the notebook-level ``sel_df``: among the rows
    of ``row["agent_name"]``, the one at position ``row["index"]`` is taken.

    Args:
        row: Row with ``"agent_name"`` and ``"index"`` entries.

    Returns:
        The truncated question text, or ``""`` when the lookup fails for any reason.
    """
    try:
        agent_questions = sel_df.loc[sel_df["agent_name"] == row["agent_name"], "question"]
        return agent_questions.iloc[row["index"]][:50]
    except Exception:
        return ""


# Attach the (truncated) question text so it can be shown when hovering a point.
cumulative_df["question"] = cumulative_df.apply(find_question, axis=1)

# One line per agent: running accuracy against the position within the run.
px.line(
 cumulative_df,
 color="agent_name",
 x="index",
 y="is_correct",
 hover_data="question",
)

# 3. Dive deeper into one run

In [None]:
# Work on an independent copy: the following cells add columns to sel_df, and
# assigning onto a filtered selection of result_df raises SettingWithCopyWarning.
sel_df = result_df.loc[result_df["agent_name"] == "o1"].copy()
print(len(sel_df))

### Count errors

In [None]:
import numpy as np


# Error type names that get their own counter column; count_errors increments
# the column named after each step's error["error_type"].
error_types = [
 "AgentParsingError",
 "AgentExecutionError",
 "AgentMaxIterationsError",
 "AgentGenerationError",
]
# Initialise one zeroed counter column per error type.
sel_df[error_types] = 0
# Step count placeholder; stays NaN for rows whose steps are not a list.
sel_df["Count steps"] = np.nan


def count_errors(row):
    """Fill the step count and the per-type error counters of one row.

    When ``row["intermediate_steps"]`` is a list, ``row["Count steps"]`` is set
    to its length and, for every dict step holding an ``"error"``, the counter
    stored under ``str(step["error"]["error_type"])`` is incremented. Steps
    whose error cannot be read, or whose type has no counter in the row, are
    ignored. Rows whose steps are not a list are returned untouched.

    Args:
        row: Mutable row (modified in place).

    Returns:
        The same ``row`` object.
    """
    steps = row["intermediate_steps"]
    if not isinstance(steps, list):
        return row

    row["Count steps"] = len(steps)
    for entry in steps:
        if not (isinstance(entry, dict) and "error" in entry):
            continue
        try:
            row[str(entry["error"]["error_type"])] += 1
        except Exception:
            # Unknown error type or unexpected error payload: leave counters as they are.
            pass
    return row


# Fill the error counters and step counts row by row.
sel_df = sel_df.apply(count_errors, axis=1)

In [None]:
import plotly.express as px


# Mean of every error counter and of the step count, split by correctness, then
# melted to long form: the resulting columns are is_correct / variable / value.
aggregate_errors = (
 sel_df.groupby(["is_correct"])[error_types + ["Count steps"]].mean().reset_index().melt(id_vars=["is_correct"])
)

# Grouped bar chart: one group per metric, one bar per correctness value.
# Of the label overrides below, only "value" names a column of aggregate_errors.
fig = px.bar(
 aggregate_errors,
 y="value",
 x="variable",
 color="is_correct",
 labels={
 "agent_name": "Model",
 "task": "Level",
 "aggregate_score": "Performance",
 "value": "Average count",
 "eval_score_GPT4": "Score",
 },
)
fig.update_layout(
 height=500,
 width=800,
 barmode="group",
 bargroupgap=0.0,
)
fig.update_traces(textposition="outside")
# Saves the figure under a relative file name, i.e. into the working directory.
fig.write_image("aggregate_errors.png", scale=3)
fig.show()

### Inspect result by file extension type

In [None]:
# Across all loaded runs: accuracy, mean step count and number of rows per
# attachment extension.
display(
 result_df.groupby(["attachment_type"])[["is_correct", "count_steps", "question"]].agg(
 {"is_correct": "mean", "count_steps": "mean", "question": "count"}
 )
)

# 4. Ensembling methods

In [None]:
# Keep only the agents that have more than 140 result rows.
# NOTE(review): 140 is presumably a "nearly complete run" threshold - confirm.
counts = result_df["agent_name"].value_counts()
long_series = result_df.loc[result_df["agent_name"].isin(counts[counts > 140].index)]

In [None]:
def majority_vote(df):
    """Select the most frequent prediction of every question.

    Rows whose prediction is missing, ``"Unable to determine"`` or ``"None"``
    do not take part in the vote. For each question the first value returned
    by ``Series.mode()`` is kept, and the ``task`` / ``is_correct`` values of
    the first row carrying that (question, prediction) pair are attached.

    Args:
        df: Results with ``question``, ``prediction``, ``task`` and
            ``is_correct`` columns.

    Returns:
        A DataFrame with one row per question and the columns
        ``question``, ``prediction``, ``task``, ``is_correct``.
    """
    predictions = df["prediction"]
    is_usable = predictions.ne("Unable to determine") & predictions.notna() & predictions.ne("None")
    voters = df[is_usable]

    top_answers = (
        voters.groupby("question")["prediction"]
        .agg(lambda answers: answers.mode().iloc[0])
        .reset_index()
    )
    first_rows = voters.groupby(["question", "prediction"])[["task", "is_correct"]].first().reset_index()

    return top_answers.merge(first_rows, how="left", on=["question", "prediction"])


def oracle(df):
    """Pick, for every question, a correct row whenever at least one exists.

    For each question the first row with a true ``is_correct`` is kept; when
    no row is correct, the question's first row is kept instead. Rows with a
    missing ``question`` are ignored.

    Args:
        df: Results with a ``question`` column and a boolean ``is_correct``
            column.

    Returns:
        A DataFrame with one row per question, ordered by question, with a
        fresh 0..n-1 index and the same columns and dtypes as ``df``.
    """
    # Implemented with sort + drop_duplicates rather than groupby.apply, so the
    # "question" column and the column dtypes are kept whatever the pandas version.
    candidates = df.dropna(subset=["question"])
    # A stable sort puts correct rows first while keeping the original order
    # inside the correct / incorrect groups.
    ranked = candidates.sort_values("is_correct", ascending=False, kind="mergesort")
    best = ranked.drop_duplicates(subset="question", keep="first")
    return best.sort_values("question", kind="mergesort").reset_index(drop=True)


# Accuracy (in percent) of each individual agent kept in long_series.
display((long_series.groupby("agent_name")["is_correct"].mean() * 100).round(2))
# Accuracy of the two ensembling strategies over the same runs.
print(f"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}")
print(f"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}")

### Submit

In [None]:
# Run log to turn into a submission file.
agent_run = "code_o1_04_february_submission5.jsonl"
# Read from OUTPUT_DIR, the same output root the results were loaded from,
# instead of a second hard-coded relative path.
df = pd.read_json(f"{OUTPUT_DIR}/validation/{agent_run}", lines=True)
# Keep only the fields needed for the submission and rename them accordingly.
df = df[["task_id", "prediction", "intermediate_steps"]]
df = df.rename(columns={"prediction": "model_answer", "intermediate_steps": "reasoning_trace"})

In [None]:
# Write the submission as JSON Lines (one record per line) into the working directory.
df.to_json("submission.jsonl", orient="records", lines=True)