{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Compare a text-based vs a vision-based browser\n", "\n", "Warning: this notebook is experimental, it probably won't work out of the box!" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install \"smolagents[litellm]\" -q" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import datasets\n", "\n", "\n", "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "to_keep = [\n", " \"What's the last line of the rhyme under the flavor\",\n", " 'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n", " \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n", " \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n", " \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n", " \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n", " \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n", " \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n", " \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n", " \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n", " \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n", " 'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n", " \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n", "]\n", "eval_ds = eval_ds.filter(lambda row: any([el in row[\"Question\"] for el in to_keep]))\n", "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "from dotenv import load_dotenv\n", "from huggingface_hub import login\n", "\n", "\n", "load_dotenv(override=True)\n", "\n", "login(os.getenv(\"HF_TOKEN\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Text browser" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from scripts.run_agents import answer_questions\n", "from scripts.text_inspector_tool import TextInspectorTool\n", "from scripts.text_web_browser import (\n", " ArchiveSearchTool,\n", " FinderTool,\n", " FindNextTool,\n", " NavigationalSearchTool,\n", " PageDownTool,\n", " PageUpTool,\n", " SearchInformationTool,\n", " VisitTool,\n", ")\n", "from scripts.visual_qa import VisualQAGPT4Tool\n", "\n", "from smolagents import CodeAgent, LiteLLMModel\n", "\n", "\n", "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "### BUILD AGENTS & TOOLS\n", "\n", "WEB_TOOLS = [\n", " SearchInformationTool(),\n", " NavigationalSearchTool(),\n", " VisitTool(),\n", " PageUpTool(),\n", " PageDownTool(),\n", " FinderTool(),\n", " FindNextTool(),\n", " ArchiveSearchTool(),\n", "]\n", "\n", "\n", "surfer_agent = CodeAgent(\n", " model=proprietary_model,\n", " tools=WEB_TOOLS,\n", " max_steps=20,\n", " verbosity_level=2,\n", ")\n", "\n", "results_text = answer_questions(\n", " eval_ds,\n", " surfer_agent,\n", " \"code_gpt4o_27-01_text\",\n", " reformulation_model=proprietary_model,\n", " output_folder=\"output_browsers\",\n", " visual_inspection_tool=VisualQAGPT4Tool(),\n", " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Vision browser" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install helium -q" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from scripts.visual_qa import VisualQAGPT4Tool\n", "\n", "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n", "from smolagents.vision_web_browser import (\n", " close_popups,\n", " go_back,\n", " helium_instructions,\n", " initialize_agent,\n", " save_screenshot,\n", " search_item_ctrl_f,\n", ")\n", "\n", "\n", "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")\n", "vision_browser_agent = initialize_agent(proprietary_model)\n", "### BUILD AGENTS & TOOLS\n", "\n", "CodeAgent(\n", " tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n", " model=proprietary_model,\n", " additional_authorized_imports=[\"helium\"],\n", " step_callbacks=[save_screenshot],\n", " max_steps=20,\n", " verbosity_level=2,\n", ")\n", "\n", "results_vision = answer_questions(\n", " eval_ds,\n", " vision_browser_agent,\n", " \"code_gpt4o_27-01_vision\",\n", " reformulation_model=proprietary_model,\n", " output_folder=\"output_browsers\",\n", " visual_inspection_tool=VisualQAGPT4Tool(),\n", " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", " postprompt=helium_instructions\n", " + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Browser-use browser" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip install browser-use lxml_html_clean -q\n", "!playwright install" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import asyncio\n", "\n", "import nest_asyncio\n", "\n", "\n", "nest_asyncio.apply()\n", "\n", "from browser_use import Agent\n", "from dotenv import load_dotenv\n", "from langchain_openai import ChatOpenAI\n", "\n", "\n", "load_dotenv()\n", "\n", "\n", "class BrowserUseAgent:\n", " logs = []\n", "\n", " def write_inner_memory_from_logs(self, summary_mode):\n", " return self.results\n", "\n", " def run(self, task, **kwargs):\n", " agent = Agent(\n", " task=task,\n", " llm=ChatOpenAI(model=\"gpt-4o\"),\n", " )\n", " self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n", " return self.results.history[-1].result[0].extracted_content\n", "\n", "\n", "browser_use_agent = BrowserUseAgent()\n", "\n", "results_browseruse = answer_questions(\n", " eval_ds,\n", " browser_use_agent,\n", " \"gpt-4o_27-01_browseruse\",\n", " reformulation_model=proprietary_model,\n", " output_folder=\"output_browsers\",\n", " visual_inspection_tool=VisualQAGPT4Tool(),\n", " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n", " postprompt=\"\",\n", " run_simple=True,\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Get results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from scripts.gaia_scorer import question_scorer\n", "\n", "\n", "results_vision, results_text, results_browseruse = (\n", " pd.DataFrame(results_vision),\n", " pd.DataFrame(results_text),\n", " pd.DataFrame(results_browseruse),\n", ")\n", "\n", "results_vision[\"is_correct\"] = results_vision.apply(\n", " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n", ")\n", "results_text[\"is_correct\"] = results_text.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n", "results_browseruse[\"is_correct\"] = results_browseruse.apply(\n", " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "results = pd.concat([results_vision, results_text, results_browseruse])\n", "results.groupby(\"agent_name\")[\"is_correct\"].mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "correct_vision_results = results_vision.loc[results_vision[\"is_correct\"]]\n", "correct_vision_results" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "false_text_results = results_text.loc[~results_text[\"is_correct\"]]\n", "false_text_results" ] } ], "metadata": { "kernelspec": { "display_name": "gaia", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 2 }