leifh committed
Commit e06b478 · 1 Parent(s): a781df1
analysis.ipynb ADDED
@@ -0,0 +1,482 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# !pip install plotly kaleido datasets nbformat -U -q"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import os\n",
19
+ "\n",
20
+ "import datasets\n",
21
+ "import pandas as pd\n",
22
+ "from dotenv import load_dotenv\n",
23
+ "from huggingface_hub import login\n",
24
+ "\n",
25
+ "\n",
26
+ "load_dotenv(override=True)\n",
27
+ "login(os.getenv(\"HF_TOKEN\"))\n",
28
+ "\n",
29
+ "pd.set_option(\"max_colwidth\", None)\n",
30
+ "\n",
31
+ "OUTPUT_DIR = \"../../output\""
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]\n",
41
+ "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})\n",
42
+ "eval_df = pd.DataFrame(eval_ds)"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "metadata": {},
48
+ "source": [
49
+ "# 1. Load all results"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "import glob\n",
59
+ "\n",
60
+ "\n",
61
+ "results = []\n",
62
+ "for f in glob.glob(f\"{OUTPUT_DIR}/validation/*.jsonl\"):\n",
63
+ " df = pd.read_json(f, lines=True)\n",
64
+ " df[\"agent_name\"] = f.split(\"/\")[-1].split(\".\")[0]\n",
65
+ " results.append(df)\n",
66
+ "\n",
67
+ "result_df = pd.concat(results)\n",
68
+ "result_df[\"prediction\"] = result_df[\"prediction\"].fillna(\"No prediction\")"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": null,
74
+ "metadata": {},
75
+ "outputs": [],
76
+ "source": [
77
+ "import re\n",
78
+ "from collections import Counter\n",
79
+ "\n",
80
+ "from scripts.gaia_scorer import check_close_call, question_scorer\n",
81
+ "\n",
82
+ "\n",
83
+ "result_df[\"is_correct\"] = result_df.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
84
+ "result_df[\"is_near_correct\"] = result_df.apply(\n",
85
+ " lambda x: check_close_call(x[\"prediction\"], x[\"true_answer\"], x[\"is_correct\"]),\n",
86
+ " axis=1,\n",
87
+ ")\n",
88
+ "\n",
89
+ "result_df[\"count_steps\"] = result_df[\"intermediate_steps\"].apply(len)\n",
90
+ "\n",
91
+ "\n",
92
+ "def find_attachment(question):\n",
93
+ " matches = eval_df.loc[eval_df[\"question\"].apply(lambda x: x in question), \"file_name\"]\n",
94
+ "\n",
95
+ " if len(matches) == 0:\n",
96
+ " return \"Not found\"\n",
97
+ " file_path = matches.values[0]\n",
98
+ "\n",
99
+ " if isinstance(file_path, str) and len(file_path) > 0:\n",
100
+ " return file_path.split(\".\")[-1]\n",
101
+ " else:\n",
102
+ " return \"None\"\n",
103
+ "\n",
104
+ "\n",
105
+ "result_df[\"attachment_type\"] = result_df[\"question\"].apply(find_attachment)\n",
106
+ "\n",
107
+ "\n",
108
+ "def extract_tool_calls(code):\n",
109
+ " regex = r\"\\b(\\w+)\\(\"\n",
110
+ " function_calls = [el for el in re.findall(regex, code) if el.islower()]\n",
111
+ "\n",
112
+ " function_call_counter = Counter(function_calls)\n",
113
+ " return function_call_counter\n",
114
+ "\n",
115
+ "\n",
116
+ "def sum_tool_calls(steps):\n",
117
+ " total_count = Counter()\n",
118
+ " for step in steps:\n",
119
+ " if \"llm_output\" in step:\n",
120
+ " total_count += extract_tool_calls(step[\"llm_output\"])\n",
121
+ "\n",
122
+ " return total_count\n",
123
+ "\n",
124
+ "\n",
125
+ "def get_durations(row):\n",
126
+ " # start_datetime = datetime.strptime(row['start_time'], \"%Y-%m-%d %H:%M:%S\")\n",
127
+ " # end_datetime = datetime.strptime(row['end_time'], \"%Y-%m-%d %H:%M:%S\")\n",
128
+ "\n",
129
+ " duration_timedelta = row[\"end_time\"] - row[\"start_time\"]\n",
130
+ " return int(duration_timedelta.total_seconds())\n",
131
+ "\n",
132
+ "\n",
133
+ "result_df[\"duration\"] = result_df.apply(get_durations, axis=1)\n",
134
+ "# result_df[\"tool_calls\"] = result_df[\"intermediate_steps\"].apply(sum_tool_calls)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "def get_thoughts(x):\n",
144
+ " try:\n",
145
+ " output = x[0][\"task\"]\n",
146
+ " for y in x[1:]:\n",
147
+ " try:\n",
148
+ " if \"observation\" in y:\n",
149
+ " output += y[\"llm_output\"] + \"\\nObservation:\" + y[\"observation\"]\n",
150
+ " else:\n",
151
+ "            output += y[\"llm_output\"] + \"\\nError:\" + str(y[\"error\"])\n",
152
+ " except Exception:\n",
153
+ " pass\n",
154
+ " return output\n",
155
+ " except Exception:\n",
156
+ " return None\n",
157
+ "\n",
158
+ "\n",
159
+ "result_df[\"thoughts\"] = result_df[\"intermediate_steps\"].apply(lambda x: get_thoughts(x))"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "result_df[\"agent_name\"].value_counts()"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "markdown",
173
+ "metadata": {},
174
+ "source": [
175
+ "# 2. Inspect specific runs"
176
+ ]
177
+ },
178
+ {
179
+ "cell_type": "code",
180
+ "execution_count": null,
181
+ "metadata": {},
182
+ "outputs": [],
183
+ "source": [
184
+ "sel_df = result_df\n",
185
+ "# sel_df = sel_df.loc[\n",
186
+ "# (result_df[\"agent_name\"].isin(list_versions))\n",
187
+ "# ]\n",
188
+ "sel_df = sel_df.reset_index(drop=True)\n",
189
+ "display(sel_df[\"agent_name\"].value_counts())\n",
190
+ "sel_df = sel_df.drop_duplicates(subset=[\"agent_name\", \"question\"])\n",
191
+ "display(sel_df.groupby(\"agent_name\")[[\"task\"]].value_counts())\n",
192
+ "print(\"Total length:\", len(sel_df), \"- is complete:\", len(sel_df) == 165)"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": null,
198
+ "metadata": {},
199
+ "outputs": [],
200
+ "source": [
201
+ "display(\"Average score:\", sel_df.groupby(\"agent_name\")[[\"is_correct\"]].mean().round(3))\n",
202
+ "display(\n",
203
+ " sel_df.groupby([\"agent_name\", \"task\"])[[\"is_correct\", \"is_near_correct\", \"count_steps\", \"question\", \"duration\"]]\n",
204
+ " .agg(\n",
205
+ " {\n",
206
+ " \"is_correct\": \"mean\",\n",
207
+ " \"is_near_correct\": \"mean\",\n",
208
+ " \"count_steps\": \"mean\",\n",
209
+ " \"question\": \"count\",\n",
210
+ " \"duration\": \"mean\",\n",
211
+ " }\n",
212
+ " )\n",
213
+ " .rename(columns={\"question\": \"count\"})\n",
214
+ ")"
215
+ ]
216
+ },
217
+ {
218
+ "cell_type": "code",
219
+ "execution_count": null,
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "import plotly.express as px\n",
224
+ "\n",
225
+ "\n",
226
+ "cumulative_df = (\n",
227
+ " (\n",
228
+ " sel_df.groupby(\"agent_name\")[[\"is_correct\", \"is_near_correct\"]]\n",
229
+ " .expanding(min_periods=1, axis=0, method=\"single\")\n",
230
+ " .agg({\"is_correct\": \"mean\", \"is_near_correct\": \"count\"})\n",
231
+ " .reset_index()\n",
232
+ " )\n",
233
+ " .copy()\n",
234
+ " .rename(columns={\"is_near_correct\": \"index\"})\n",
235
+ ")\n",
236
+ "cumulative_df[\"index\"] = cumulative_df[\"index\"].astype(int) - 1\n",
237
+ "\n",
238
+ "\n",
239
+ "def find_question(row):\n",
240
+ " try:\n",
241
+ " res = sel_df.loc[sel_df[\"agent_name\"] == row[\"agent_name\"], \"question\"].iloc[row[\"index\"]][:50]\n",
242
+ " return res\n",
243
+ " except Exception:\n",
244
+ " return \"\"\n",
245
+ "\n",
246
+ "\n",
247
+ "cumulative_df[\"question\"] = cumulative_df.apply(find_question, axis=1)\n",
248
+ "\n",
249
+ "px.line(\n",
250
+ " cumulative_df,\n",
251
+ " color=\"agent_name\",\n",
252
+ " x=\"index\",\n",
253
+ " y=\"is_correct\",\n",
254
+ " hover_data=\"question\",\n",
255
+ ")"
256
+ ]
257
+ },
258
+ {
259
+ "cell_type": "markdown",
260
+ "metadata": {},
261
+ "source": [
262
+ "# 3. Dive deeper into one run"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": null,
268
+ "metadata": {},
269
+ "outputs": [],
270
+ "source": [
271
+ "sel_df = result_df.loc[result_df[\"agent_name\"] == \"o1\"]\n",
272
+ "print(len(sel_df))"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "markdown",
277
+ "metadata": {},
278
+ "source": [
279
+ "### Count errors"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": null,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "import numpy as np\n",
289
+ "\n",
290
+ "\n",
291
+ "error_types = [\n",
292
+ " \"AgentParsingError\",\n",
293
+ " \"AgentExecutionError\",\n",
294
+ " \"AgentMaxIterationsError\",\n",
295
+ " \"AgentGenerationError\",\n",
296
+ "]\n",
297
+ "sel_df[error_types] = 0\n",
298
+ "sel_df[\"Count steps\"] = np.nan\n",
299
+ "\n",
300
+ "\n",
301
+ "def count_errors(row):\n",
302
+ " if isinstance(row[\"intermediate_steps\"], list):\n",
303
+ " row[\"Count steps\"] = len(row[\"intermediate_steps\"])\n",
304
+ " for step in row[\"intermediate_steps\"]:\n",
305
+ " if isinstance(step, dict) and \"error\" in step:\n",
306
+ " try:\n",
307
+ " row[str(step[\"error\"][\"error_type\"])] += 1\n",
308
+ " except Exception:\n",
309
+ " pass\n",
310
+ " return row\n",
311
+ "\n",
312
+ "\n",
313
+ "sel_df = sel_df.apply(count_errors, axis=1)"
314
+ ]
315
+ },
316
+ {
317
+ "cell_type": "code",
318
+ "execution_count": null,
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "import plotly.express as px\n",
323
+ "\n",
324
+ "\n",
325
+ "aggregate_errors = (\n",
326
+ " sel_df.groupby([\"is_correct\"])[error_types + [\"Count steps\"]].mean().reset_index().melt(id_vars=[\"is_correct\"])\n",
327
+ ")\n",
328
+ "\n",
329
+ "fig = px.bar(\n",
330
+ " aggregate_errors,\n",
331
+ " y=\"value\",\n",
332
+ " x=\"variable\",\n",
333
+ " color=\"is_correct\",\n",
334
+ " labels={\n",
335
+ " \"agent_name\": \"<b>Model</b>\",\n",
336
+ " \"task\": \"<b>Level</b>\",\n",
337
+ " \"aggregate_score\": \"<b>Performance</b>\",\n",
338
+ " \"value\": \"<b>Average count</b>\",\n",
339
+ " \"eval_score_GPT4\": \"<b>Score</b>\",\n",
340
+ " },\n",
341
+ ")\n",
342
+ "fig.update_layout(\n",
343
+ " height=500,\n",
344
+ " width=800,\n",
345
+ " barmode=\"group\",\n",
346
+ " bargroupgap=0.0,\n",
347
+ ")\n",
348
+ "fig.update_traces(textposition=\"outside\")\n",
349
+ "fig.write_image(\"aggregate_errors.png\", scale=3)\n",
350
+ "fig.show()"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "markdown",
355
+ "metadata": {},
356
+ "source": [
357
+ "### Inspect result by file extension type"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": null,
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": [
366
+ "display(\n",
367
+ " result_df.groupby([\"attachment_type\"])[[\"is_correct\", \"count_steps\", \"question\"]].agg(\n",
368
+ " {\"is_correct\": \"mean\", \"count_steps\": \"mean\", \"question\": \"count\"}\n",
369
+ " )\n",
370
+ ")"
371
+ ]
372
+ },
373
+ {
374
+ "cell_type": "markdown",
375
+ "metadata": {},
376
+ "source": [
377
+ "# 4. Ensembling methods"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "code",
382
+ "execution_count": null,
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "counts = result_df[\"agent_name\"].value_counts()\n",
387
+ "long_series = result_df.loc[result_df[\"agent_name\"].isin(counts[counts > 140].index)]"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "def majority_vote(df):\n",
397
+ " df = df[(df[\"prediction\"] != \"Unable to determine\") & (~df[\"prediction\"].isna()) & (df[\"prediction\"] != \"None\")]\n",
398
+ "\n",
399
+ " answer_modes = df.groupby(\"question\")[\"prediction\"].agg(lambda x: x.mode()[0]).reset_index()\n",
400
+ " first_occurrences = (\n",
401
+ " df.groupby([\"question\", \"prediction\"]).agg({\"task\": \"first\", \"is_correct\": \"first\"}).reset_index()\n",
402
+ " )\n",
403
+ " result = answer_modes.merge(first_occurrences, on=[\"question\", \"prediction\"], how=\"left\")\n",
404
+ "\n",
405
+ " return result\n",
406
+ "\n",
407
+ "\n",
408
+ "def oracle(df):\n",
409
+ " def get_first_correct_or_first_wrong(group):\n",
410
+ " correct_answers = group[group[\"is_correct\"]]\n",
411
+ " if len(correct_answers) > 0:\n",
412
+ " return correct_answers.iloc[0]\n",
413
+ " return group.iloc[0]\n",
414
+ "\n",
415
+ " result = df.groupby(\"question\").apply(get_first_correct_or_first_wrong)\n",
416
+ "\n",
417
+ " return result.reset_index(drop=True)\n",
418
+ "\n",
419
+ "\n",
420
+ "display((long_series.groupby(\"agent_name\")[\"is_correct\"].mean() * 100).round(2))\n",
421
+ "print(f\"Majority score: {majority_vote(long_series)['is_correct'].mean() * 100:.2f}\")\n",
422
+ "print(f\"Oracle score: {oracle(long_series)['is_correct'].mean() * 100:.2f}\")"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "markdown",
427
+ "metadata": {},
428
+ "source": [
429
+ "### Submit"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": null,
435
+ "metadata": {},
436
+ "outputs": [],
437
+ "source": [
438
+ "agent_run = \"code_o1_04_february_submission5.jsonl\"\n",
439
+ "df = pd.read_json(f\"output/validation/{agent_run}\", lines=True)\n",
440
+ "df = df[[\"task_id\", \"prediction\", \"intermediate_steps\"]]\n",
441
+ "df = df.rename(columns={\"prediction\": \"model_answer\", \"intermediate_steps\": \"reasoning_trace\"})"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": null,
447
+ "metadata": {},
448
+ "outputs": [],
449
+ "source": [
450
+ "df.to_json(\"submission.jsonl\", orient=\"records\", lines=True)"
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "code",
455
+ "execution_count": null,
456
+ "metadata": {},
457
+ "outputs": [],
458
+ "source": []
459
+ }
460
+ ],
461
+ "metadata": {
462
+ "kernelspec": {
463
+ "display_name": "test",
464
+ "language": "python",
465
+ "name": "test"
466
+ },
467
+ "language_info": {
468
+ "codemirror_mode": {
469
+ "name": "ipython",
470
+ "version": 3
471
+ },
472
+ "file_extension": ".py",
473
+ "mimetype": "text/x-python",
474
+ "name": "python",
475
+ "nbconvert_exporter": "python",
476
+ "pygments_lexer": "ipython3",
477
+ "version": "3.12.0"
478
+ }
479
+ },
480
+ "nbformat": 4,
481
+ "nbformat_minor": 2
482
+ }
app.py CHANGED
@@ -1,64 +1,11 @@
1
- import gradio as gr
2
- from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
- ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
- )
61
 
 
 
63
  if __name__ == "__main__":
64
  demo.launch()
 
1
+ from run import create_agent
 
2
 
3
+ from smolagents.gradio_ui import GradioUI
 
 
 
4
 
5
 
6
+ agent = create_agent()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ demo = GradioUI(agent)
9
 
10
  if __name__ == "__main__":
11
  demo.launch()
requirements.txt CHANGED
@@ -1 +1,40 @@
1
- huggingface_hub==0.25.2
1
+ anthropic>=0.37.1
2
+ audioop-lts<1.0; python_version >= "3.13" # required to use pydub in Python >=3.13; LTS port of the removed Python builtin module audioop
3
+ beautifulsoup4>=4.12.3
4
+ datasets>=2.21.0
5
+ google_search_results>=2.4.2
6
+ huggingface_hub>=0.23.4
7
+ mammoth>=1.8.0
8
+ markdownify>=0.13.1
9
+ numexpr>=2.10.1
10
+ numpy>=2.1.2
11
+ openai>=1.52.2
12
+ openpyxl
13
+ pandas>=2.2.3
14
+ pathvalidate>=3.2.1
15
+ pdfminer>=20191125
16
+ pdfminer.six>=20240706
17
+ Pillow>=11.0.0
18
+ puremagic>=1.28
19
+ pypdf>=5.1.0
20
+ python-dotenv>=1.0.1
21
+ python_pptx>=1.0.2
22
+ Requests>=2.32.3
23
+ serpapi>=0.1.5
24
+ tqdm>=4.66.4
25
+ torch>=2.2.2
26
+ torchvision>=0.17.2
27
+ transformers>=4.46.0
28
+ youtube_transcript_api>=0.6.2
29
+ chess
30
+ sympy
31
+ pubchempy
32
+ Bio
33
+ scikit-learn
34
+ scipy
35
+ pydub
36
+ PyPDF2
37
+ python-pptx
38
+ torch
39
+ xlrd
40
+ SpeechRecognition
run.py ADDED
@@ -0,0 +1,125 @@
1
+ import argparse
2
+ import os
3
+ import threading
4
+
5
+ from dotenv import load_dotenv
6
+ from huggingface_hub import login
7
+ from scripts.text_inspector_tool import TextInspectorTool
8
+ from scripts.text_web_browser import (
9
+ ArchiveSearchTool,
10
+ FinderTool,
11
+ FindNextTool,
12
+ PageDownTool,
13
+ PageUpTool,
14
+ SimpleTextBrowser,
15
+ VisitTool,
16
+ )
17
+ from scripts.visual_qa import visualizer
18
+
19
+ from smolagents import (
20
+ CodeAgent,
21
+ GoogleSearchTool,
22
+ # InferenceClientModel,
23
+ LiteLLMModel,
24
+ ToolCallingAgent,
25
+ )
26
+
27
+
28
+ load_dotenv(override=True)
29
+ login(os.getenv("HF_TOKEN"))
30
+
31
+ append_answer_lock = threading.Lock()
32
+
33
+
34
+ def parse_args():
35
+ parser = argparse.ArgumentParser()
36
+ parser.add_argument(
37
+ "question", type=str, help="for example: 'How many studio albums did Mercedes Sosa release before 2007?'"
38
+ )
39
+ parser.add_argument("--model-id", type=str, default="o1")
40
+ return parser.parse_args()
41
+
42
+
43
+ custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
44
+
45
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
46
+
47
+ BROWSER_CONFIG = {
48
+ "viewport_size": 1024 * 5,
49
+ "downloads_folder": "downloads_folder",
50
+ "request_kwargs": {
51
+ "headers": {"User-Agent": user_agent},
52
+ "timeout": 300,
53
+ },
54
+ "serpapi_key": os.getenv("SERPAPI_API_KEY"),
55
+ }
56
+
57
+ os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
58
+
59
+
60
+ def create_agent(model_id="o1"):
61
+ model_params = {
62
+ "model_id": model_id,
63
+ "custom_role_conversions": custom_role_conversions,
64
+ "max_completion_tokens": 8192,
65
+ }
66
+ if model_id == "o1":
67
+ model_params["reasoning_effort"] = "high"
68
+ model = LiteLLMModel(**model_params)
69
+
70
+ text_limit = 100000
71
+ browser = SimpleTextBrowser(**BROWSER_CONFIG)
72
+ WEB_TOOLS = [
73
+ GoogleSearchTool(provider="serper"),
74
+ VisitTool(browser),
75
+ PageUpTool(browser),
76
+ PageDownTool(browser),
77
+ FinderTool(browser),
78
+ FindNextTool(browser),
79
+ ArchiveSearchTool(browser),
80
+ TextInspectorTool(model, text_limit),
81
+ ]
82
+ text_webbrowser_agent = ToolCallingAgent(
83
+ model=model,
84
+ tools=WEB_TOOLS,
85
+ max_steps=20,
86
+ verbosity_level=2,
87
+ planning_interval=4,
88
+ name="search_agent",
89
+ description="""A team member that will search the internet to answer your question.
90
+ Ask him for all your questions that require browsing the web.
91
+ Provide him as much context as possible, in particular if you need to search on a specific timeframe!
92
+ And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
93
+ Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
94
+ """,
95
+ provide_run_summary=True,
96
+ )
97
+ text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
98
+ If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
99
+ Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
100
+
101
+ manager_agent = CodeAgent(
102
+ model=model,
103
+ tools=[visualizer, TextInspectorTool(model, text_limit)],
104
+ max_steps=12,
105
+ verbosity_level=2,
106
+ additional_authorized_imports=["*"],
107
+ planning_interval=4,
108
+ managed_agents=[text_webbrowser_agent],
109
+ )
110
+
111
+ return manager_agent
112
+
113
+
114
+ def main():
115
+ args = parse_args()
116
+
117
+ agent = create_agent(model_id=args.model_id)
118
+
119
+ answer = agent.run(args.question)
120
+
121
+ print(f"Got this answer: {answer}")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()
run_gaia.py ADDED
@@ -0,0 +1,278 @@
1
+ # EXAMPLE COMMAND: python examples/open_deep_research/run_gaia.py --concurrency 32 --run-name generate-traces-03-apr-noplanning --model-id gpt-4o
2
+ import argparse
3
+ import json
4
+ import os
5
+ import threading
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+ import datasets
12
+ import pandas as pd
13
+ from dotenv import load_dotenv
14
+ from huggingface_hub import login
15
+ from scripts.reformulator import prepare_response
16
+ from scripts.run_agents import (
17
+ get_single_file_description,
18
+ get_zip_description,
19
+ )
20
+ from scripts.text_inspector_tool import TextInspectorTool
21
+ from scripts.text_web_browser import (
22
+ ArchiveSearchTool,
23
+ FinderTool,
24
+ FindNextTool,
25
+ PageDownTool,
26
+ PageUpTool,
27
+ SimpleTextBrowser,
28
+ VisitTool,
29
+ )
30
+ from scripts.visual_qa import visualizer
31
+ from tqdm import tqdm
32
+
33
+ from smolagents import (
34
+ CodeAgent,
35
+ GoogleSearchTool,
36
+ LiteLLMModel,
37
+ Model,
38
+ ToolCallingAgent,
39
+ )
40
+
41
+
42
+ load_dotenv(override=True)
43
+ login(os.getenv("HF_TOKEN"))
44
+
45
+ append_answer_lock = threading.Lock()
46
+
47
+
48
+ def parse_args():
49
+ parser = argparse.ArgumentParser()
50
+ parser.add_argument("--concurrency", type=int, default=8)
51
+ parser.add_argument("--model-id", type=str, default="o1")
52
+ parser.add_argument("--run-name", type=str, required=True)
53
+ return parser.parse_args()
54
+
55
+
56
+ ### IMPORTANT: EVALUATION SWITCHES
57
+
58
+ print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!")
59
+
60
+ USE_OPEN_MODELS = False
61
+
62
+ SET = "validation"
63
+
64
+ custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}
65
+
66
+ ### LOAD EVALUATION DATASET
67
+
68
+ eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET]
69
+ eval_ds = eval_ds.rename_columns({"Question": "question", "Final answer": "true_answer", "Level": "task"})
70
+
71
+
72
+ def preprocess_file_paths(row):
73
+ if len(row["file_name"]) > 0:
74
+ row["file_name"] = f"data/gaia/{SET}/" + row["file_name"]
75
+ return row
76
+
77
+
78
+ eval_ds = eval_ds.map(preprocess_file_paths)
79
+ eval_df = pd.DataFrame(eval_ds)
80
+ print("Loaded evaluation dataset:")
81
+ print(eval_df["task"].value_counts())
82
+
83
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
84
+
85
+ BROWSER_CONFIG = {
86
+ "viewport_size": 1024 * 5,
87
+ "downloads_folder": "downloads_folder",
88
+ "request_kwargs": {
89
+ "headers": {"User-Agent": user_agent},
90
+ "timeout": 300,
91
+ },
92
+ "serpapi_key": os.getenv("SERPAPI_API_KEY"),
93
+ }
94
+
95
+ os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)
96
+
97
+
98
+ def create_agent_team(model: Model):
99
+ text_limit = 100000
100
+ ti_tool = TextInspectorTool(model, text_limit)
101
+
102
+ browser = SimpleTextBrowser(**BROWSER_CONFIG)
103
+
104
+ WEB_TOOLS = [
105
+ GoogleSearchTool(provider="serper"),
106
+ VisitTool(browser),
107
+ PageUpTool(browser),
108
+ PageDownTool(browser),
109
+ FinderTool(browser),
110
+ FindNextTool(browser),
111
+ ArchiveSearchTool(browser),
112
+ TextInspectorTool(model, text_limit),
113
+ ]
114
+
115
+ text_webbrowser_agent = ToolCallingAgent(
116
+ model=model,
117
+ tools=WEB_TOOLS,
118
+ max_steps=20,
119
+ verbosity_level=2,
120
+ planning_interval=4,
121
+ name="search_agent",
122
+ description="""A team member that will search the internet to answer your question.
123
+ Ask him for all your questions that require browsing the web.
124
+ Provide him as much context as possible, in particular if you need to search on a specific timeframe!
125
+ And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
126
+ Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
127
+ """,
128
+ provide_run_summary=True,
129
+ )
130
+ text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
131
+ If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
132
+ Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""
133
+
134
+ manager_agent = CodeAgent(
135
+ model=model,
136
+ tools=[visualizer, ti_tool],
137
+ max_steps=12,
138
+ verbosity_level=2,
139
+ additional_authorized_imports=["*"],
140
+ planning_interval=4,
141
+ managed_agents=[text_webbrowser_agent],
142
+ )
143
+ return manager_agent
144
+
145
+
146
+ def append_answer(entry: dict, jsonl_file: str) -> None:
147
+ jsonl_file = Path(jsonl_file)
148
+ jsonl_file.parent.mkdir(parents=True, exist_ok=True)
149
+ with append_answer_lock, open(jsonl_file, "a", encoding="utf-8") as fp:
150
+ fp.write(json.dumps(entry) + "\n")
151
+ assert os.path.exists(jsonl_file), "File not found!"
152
+ print("Answer exported to file:", jsonl_file.resolve())
153
+
154
+
155
+ def answer_single_question(example, model_id, answers_file, visual_inspection_tool):
156
+ model_params = {
157
+ "model_id": model_id,
158
+ "custom_role_conversions": custom_role_conversions,
159
+ }
160
+ if model_id == "o1":
161
+ model_params["reasoning_effort"] = "high"
162
+ model_params["max_completion_tokens"] = 8192
163
+ else:
164
+ model_params["max_tokens"] = 4096
165
+ model = LiteLLMModel(**model_params)
166
+ # model = InferenceClientModel(model_id="Qwen/Qwen2.5-Coder-32B-Instruct", provider="together", max_tokens=4096)
167
+ document_inspection_tool = TextInspectorTool(model, 100000)
168
+
169
+ agent = create_agent_team(model)
170
+
171
+ augmented_question = """You have one question to answer. It is paramount that you provide a correct answer.
172
+ Give it all you can: I know for a fact that you have access to all the relevant tools to solve it and find the correct answer (the answer does exist). Failure or 'I cannot answer' or 'None found' will not be tolerated, success will be rewarded.
173
+ Run verification steps if that's needed, you must make sure you find the correct answer!
174
+ Here is the task:
175
+ """ + example["question"]
176
+
177
+ if example["file_name"]:
178
+ if ".zip" in example["file_name"]:
179
+ prompt_use_files = "\n\nTo solve the task above, you will have to use these attached files:\n"
180
+ prompt_use_files += get_zip_description(
181
+ example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
182
+ )
183
+ else:
184
+ prompt_use_files = "\n\nTo solve the task above, you will have to use this attached file:"
185
+ prompt_use_files += get_single_file_description(
186
+ example["file_name"], example["question"], visual_inspection_tool, document_inspection_tool
187
+ )
188
+ augmented_question += prompt_use_files
189
+
190
+ start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
191
+ try:
192
+ # Run agent 🚀
193
+ final_result = agent.run(augmented_question)
194
+
195
+ agent_memory = agent.write_memory_to_messages()
196
+
197
+ final_result = prepare_response(augmented_question, agent_memory, reformulation_model=model)
198
+
199
+ output = str(final_result)
200
+ for memory_step in agent.memory.steps:
201
+ memory_step.model_input_messages = None
202
+ intermediate_steps = agent_memory
203
+
204
+ # Check for parsing errors which indicate the LLM failed to follow the required format
205
+ parsing_error = True if any(["AgentParsingError" in step for step in intermediate_steps]) else False
206
+
207
+ # check if iteration limit exceeded
208
+ iteration_limit_exceeded = True if "Agent stopped due to iteration limit or time limit." in output else False
209
+ raised_exception = False
210
+
211
+ except Exception as e:
212
+ print("Error on ", augmented_question, e)
213
+ output = None
214
+ intermediate_steps = []
215
+ parsing_error = False
216
+ iteration_limit_exceeded = False
217
+ exception = e
218
+ raised_exception = True
219
+ end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
220
+ token_counts_manager = agent.monitor.get_total_token_counts()
221
+ token_counts_web = list(agent.managed_agents.values())[0].monitor.get_total_token_counts()
222
+ total_token_counts = {
223
+ "input": token_counts_manager["input"] + token_counts_web["input"],
224
+ "output": token_counts_manager["output"] + token_counts_web["output"],
225
+ }
226
+ annotated_example = {
227
+ "agent_name": model.model_id,
228
+ "question": example["question"],
229
+ "augmented_question": augmented_question,
230
+ "prediction": output,
231
+ "intermediate_steps": intermediate_steps,
232
+ "parsing_error": parsing_error,
233
+ "iteration_limit_exceeded": iteration_limit_exceeded,
234
+ "agent_error": str(exception) if raised_exception else None,
235
+ "task": example["task"],
236
+ "task_id": example["task_id"],
237
+ "true_answer": example["true_answer"],
238
+ "start_time": start_time,
239
+ "end_time": end_time,
240
+ "token_counts": total_token_counts,
241
+ }
242
+ append_answer(annotated_example, answers_file)
243
+
244
+
245
+ def get_examples_to_answer(answers_file, eval_ds) -> List[dict]:
246
+ print(f"Loading answers from {answers_file}...")
247
+ try:
248
+ done_questions = pd.read_json(answers_file, lines=True)["question"].tolist()
249
+ print(f"Found {len(done_questions)} previous results!")
250
+ except Exception as e:
251
+ print("Error when loading records: ", e)
252
+ print("No usable records! ▶️ Starting new.")
253
+ done_questions = []
254
+ return [line for line in eval_ds.to_list() if line["question"] not in done_questions]
255
+
256
+
257
+ def main():
258
+ args = parse_args()
259
+ print(f"Starting run with arguments: {args}")
260
+
261
+ answers_file = f"output/{SET}/{args.run_name}.jsonl"
262
+ tasks_to_run = get_examples_to_answer(answers_file, eval_ds)
263
+
264
+ with ThreadPoolExecutor(max_workers=args.concurrency) as exe:
265
+ futures = [
266
+ exe.submit(answer_single_question, example, args.model_id, answers_file, visualizer)
267
+ for example in tasks_to_run
268
+ ]
269
+ for f in tqdm(as_completed(futures), total=len(tasks_to_run), desc="Processing tasks"):
270
+ f.result()
271
+
272
+ # for example in tasks_to_run:
273
+ # answer_single_question(example, args.model_id, answers_file, visualizer)
274
+ print("All tasks processed.")
275
+
276
+
277
+ if __name__ == "__main__":
278
+ main()
scripts/cookies.py ADDED
@@ -0,0 +1,715 @@
1
+ from requests.cookies import RequestsCookieJar
2
+
3
+
4
+ COOKIES_LIST = [
5
+ {
6
+ "domain": ".youtube.com",
7
+ "expirationDate": 1718884961,
8
+ "hostOnly": False,
9
+ "httpOnly": False,
10
+ "name": "ST-xuwub9",
11
+ "path": "/",
12
+ "sameSite": None,
13
+ "secure": False,
14
+ "session": False,
15
+ "storeId": None,
16
+ "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
17
+ },
18
+ {
19
+ "domain": ".youtube.com",
20
+ "expirationDate": 1753004444.745411,
21
+ "hostOnly": False,
22
+ "httpOnly": True,
23
+ "name": "__Secure-YEC",
24
+ "path": "/",
25
+ "sameSite": "lax",
26
+ "secure": True,
27
+ "session": False,
28
+ "storeId": None,
29
+ "value": "CgtRVnI5LW1zRHlQVSjbtNCzBjIhCgJGUhIbEhcSFRMLFBUWFwwYGRobHB0eHw4PIBAREiAk",
30
+ },
31
+ {
32
+ "domain": ".youtube.com",
33
+ "expirationDate": 1753434620.050824,
34
+ "hostOnly": False,
35
+ "httpOnly": True,
36
+ "name": "__Secure-3PSID",
37
+ "path": "/",
38
+ "sameSite": "no_restriction",
39
+ "secure": True,
40
+ "session": False,
41
+ "storeId": None,
42
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB4ezJ_bdWu46a7YwObVn44wACgYKAakSARQSFQHGX2MicJcTzecTKH6bHzqU6TMbTxoVAUF8yKqQYK-MoI6Ql3vI2oYTB3E-0076",
43
+ },
44
+ {
45
+ "domain": ".youtube.com",
46
+ "expirationDate": 1750420959.974642,
47
+ "hostOnly": False,
48
+ "httpOnly": False,
49
+ "name": "SIDCC",
50
+ "path": "/",
51
+ "sameSite": None,
52
+ "secure": False,
53
+ "session": False,
54
+ "storeId": None,
55
+ "value": "AKEyXzWQZauHKOo8t87zoEcjaVNIYUX54ohoWXT-tX4aAhEuZzIIptxZAcNkHuG2oDXYL6t-lw",
56
+ },
57
+ {
58
+ "domain": ".youtube.com",
59
+ "expirationDate": 1753434620.050652,
60
+ "hostOnly": False,
61
+ "httpOnly": False,
62
+ "name": "SID",
63
+ "path": "/",
64
+ "sameSite": None,
65
+ "secure": False,
66
+ "session": False,
67
+ "storeId": None,
68
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBB6VHrZcC3gBAsFPbCQ0gF5AACgYKAYkSARQSFQHGX2Mi9kt0gHg5CxCYSkLQGHWaeBoVAUF8yKre_V6r3jZVak6JV4o2Q0FL0076",
69
+ },
70
+ {
71
+ "domain": ".youtube.com",
72
+ "expirationDate": 1750420958.397534,
73
+ "hostOnly": False,
74
+ "httpOnly": True,
75
+ "name": "__Secure-1PSIDTS",
76
+ "path": "/",
77
+ "sameSite": None,
78
+ "secure": True,
79
+ "session": False,
80
+ "storeId": None,
81
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
82
+ },
83
+ {
84
+ "domain": ".youtube.com",
85
+ "expirationDate": 1753433494.44729,
86
+ "hostOnly": False,
87
+ "httpOnly": False,
88
+ "name": "_ga_M0180HEFCY",
89
+ "path": "/",
90
+ "sameSite": None,
91
+ "secure": False,
92
+ "session": False,
93
+ "storeId": None,
94
+ "value": "GS1.1.1718871908.1.0.1718873494.0.0.0",
95
+ },
96
+ {
97
+ "domain": ".youtube.com",
98
+ "expirationDate": 1753434620.050933,
99
+ "hostOnly": False,
100
+ "httpOnly": False,
101
+ "name": "SAPISID",
102
+ "path": "/",
103
+ "sameSite": None,
104
+ "secure": True,
105
+ "session": False,
106
+ "storeId": None,
107
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
108
+ },
109
+ {
110
+ "domain": ".youtube.com",
111
+ "expirationDate": 1750420959.974764,
112
+ "hostOnly": False,
113
+ "httpOnly": True,
114
+ "name": "__Secure-1PSIDCC",
115
+ "path": "/",
116
+ "sameSite": None,
117
+ "secure": True,
118
+ "session": False,
119
+ "storeId": None,
120
+ "value": "AKEyXzWHDSoXGCZpZhPxRrnC7B1s8zGIUjeMVyvgtQfsm1fs92lXPtFEI_td9LBUyqVUe0xK",
121
+ },
122
+ {
123
+ "domain": ".youtube.com",
124
+ "expirationDate": 1753434620.050881,
125
+ "hostOnly": False,
126
+ "httpOnly": True,
127
+ "name": "SSID",
128
+ "path": "/",
129
+ "sameSite": None,
130
+ "secure": True,
131
+ "session": False,
132
+ "storeId": None,
133
+ "value": "AmlwXHnQvOQ10LVd-",
134
+ },
135
+ {
136
+ "domain": ".youtube.com",
137
+ "expirationDate": 1753434620.050959,
138
+ "hostOnly": False,
139
+ "httpOnly": False,
140
+ "name": "__Secure-1PAPISID",
141
+ "path": "/",
142
+ "sameSite": None,
143
+ "secure": True,
144
+ "session": False,
145
+ "storeId": None,
146
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
147
+ },
148
+ {
149
+ "domain": ".youtube.com",
150
+ "expirationDate": 1753434620.050795,
151
+ "hostOnly": False,
152
+ "httpOnly": True,
153
+ "name": "__Secure-1PSID",
154
+ "path": "/",
155
+ "sameSite": None,
156
+ "secure": True,
157
+ "session": False,
158
+ "storeId": None,
159
+ "value": "g.a000kwibeLUu8Ea9Y-vLun7u3kU5VNJVuMAZl_jdfJaNm50JyDBBrlk7lRpKQGywAHEon7WGQAACgYKAQsSARQSFQHGX2MirAmnSRdZl6GPG6KLd4hOihoVAUF8yKoV17Tcj1a_OenIOkf2wBjO0076",
160
+ },
161
+ {
162
+ "domain": ".youtube.com",
163
+ "expirationDate": 1753434620.050993,
164
+ "hostOnly": False,
165
+ "httpOnly": False,
166
+ "name": "__Secure-3PAPISID",
167
+ "path": "/",
168
+ "sameSite": "no_restriction",
169
+ "secure": True,
170
+ "session": False,
171
+ "storeId": None,
172
+ "value": "mfeuiC-HraNJ-A03/ASXvCPNJSw7yTFgd6",
173
+ },
174
+ {
175
+ "domain": ".youtube.com",
176
+ "expirationDate": 1750420959.974815,
177
+ "hostOnly": False,
178
+ "httpOnly": True,
179
+ "name": "__Secure-3PSIDCC",
180
+ "path": "/",
181
+ "sameSite": "no_restriction",
182
+ "secure": True,
183
+ "session": False,
184
+ "storeId": None,
185
+ "value": "AKEyXzXM5UjKUEXwSHVmRAIo6hGHA4G63adj3EE1VdNriD0f38jZQbsUKiD4LQbA3BValmTFDg",
186
+ },
187
+ {
188
+ "domain": ".youtube.com",
189
+ "expirationDate": 1750420958.397647,
190
+ "hostOnly": False,
191
+ "httpOnly": True,
192
+ "name": "__Secure-3PSIDTS",
193
+ "path": "/",
194
+ "sameSite": "no_restriction",
195
+ "secure": True,
196
+ "session": False,
197
+ "storeId": None,
198
+ "value": "sidts-CjIB3EgAEkYL2L-GfrEzW5Dfy62S9oefGNLgst78S_986htCnGcfkxECch_9oz-qytSsZBAA",
199
+ },
200
+ {
201
+ "domain": ".youtube.com",
202
+ "expirationDate": 1753434620.050908,
203
+ "hostOnly": False,
204
+ "httpOnly": False,
205
+ "name": "APISID",
206
+ "path": "/",
207
+ "sameSite": None,
208
+ "secure": False,
209
+ "session": False,
210
+ "storeId": None,
211
+ "value": "IlQWLPjdNqziwCrV/ANG7Z4x5FF-IBxbZk",
212
+ },
213
+ {
214
+ "domain": ".youtube.com",
215
+ "expirationDate": 1753434620.050855,
216
+ "hostOnly": False,
217
+ "httpOnly": True,
218
+ "name": "HSID",
219
+ "path": "/",
220
+ "sameSite": None,
221
+ "secure": False,
222
+ "session": False,
223
+ "storeId": None,
224
+ "value": "AasA7hmRuTFv7vjoq",
225
+ },
226
+ {
227
+ "domain": ".youtube.com",
228
+ "expirationDate": 1753435873.577793,
229
+ "hostOnly": False,
230
+ "httpOnly": True,
231
+ "name": "LOGIN_INFO",
232
+ "path": "/",
233
+ "sameSite": "no_restriction",
234
+ "secure": True,
235
+ "session": False,
236
+ "storeId": None,
237
+ "value": "AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0:QUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
238
+ },
239
+ {
240
+ "domain": ".youtube.com",
241
+ "expirationDate": 1753444956.555608,
242
+ "hostOnly": False,
243
+ "httpOnly": False,
244
+ "name": "PREF",
245
+ "path": "/",
246
+ "sameSite": None,
247
+ "secure": True,
248
+ "session": False,
249
+ "storeId": None,
250
+ "value": "f4=4000000&f6=40000000&tz=Europe.Paris&f5=30000&f7=100",
251
+ },
252
+ ]
253
+
254
+ COOKIES_LIST += [
255
+ {
256
+ "domain": ".www.researchgate.net",
257
+ "hostOnly": False,
258
+ "httpOnly": True,
259
+ "name": "isInstIp",
260
+ "path": "/",
261
+ "sameSite": None,
262
+ "secure": True,
263
+ "session": True,
264
+ "storeId": None,
265
+ "value": "False",
266
+ },
267
+ {
268
+ "domain": ".researchgate.net",
269
+ "expirationDate": 1734423981,
270
+ "hostOnly": False,
271
+ "httpOnly": False,
272
+ "name": "__eoi",
273
+ "path": "/",
274
+ "sameSite": None,
275
+ "secure": False,
276
+ "session": False,
277
+ "storeId": None,
278
+ "value": "ID=c26f752377373146:T=1718871981:RT=1718884914:S=AA-AfjZw-T_OOX2kW2LLaFzXImgc",
279
+ },
280
+ {
281
+ "domain": ".www.researchgate.net",
282
+ "expirationDate": 1753444909.646103,
283
+ "hostOnly": False,
284
+ "httpOnly": True,
285
+ "name": "ptc",
286
+ "path": "/",
287
+ "sameSite": None,
288
+ "secure": True,
289
+ "session": False,
290
+ "storeId": None,
291
+ "value": "RG1.8947708639250500550.1718872043",
292
+ },
293
+ {
294
+ "domain": ".researchgate.net",
295
+ "expirationDate": 1750507578,
296
+ "hostOnly": False,
297
+ "httpOnly": False,
298
+ "name": "euconsent-v2-didomi",
299
+ "path": "/",
300
+ "sameSite": "lax",
301
+ "secure": True,
302
+ "session": False,
303
+ "storeId": None,
304
+ "value": "CQAgmoAQAgmoAAHABBENA5EsAP_gAEPgAAYgJ2pB5G5UTWlBIG53YMskIAUFhFBoQEAgAACAAwIBSBIAIIwEAGAAIAgAICACAAIAIBIAIABAGAAAAAAAYIAAIAAIAAAQIAAKIAAAAAAAAgBQAAgIAgggEAAAgEBEABAAgAAAEIIAQNgACgAAACCAAAAAAAABAAAAAAAAQAAAAAAAYCQAAAJIAAAAACAIABAIAAAAAAAAAAAAAAAABBAAIJ2wPIAFAAXABQAFQALgAcAA8ACAAEgALwAZAA0ACIAEcAJgAUgAqgBcADEAGgAPQAfgBEACOAE4AMMAZYA0QBsgDkAHOAO4AfsBBwEIAItARwBHQC6gHUAO2Ae0A_4CHQEXgJ2AUOAo8BT4CpQFqALYAXmAwQBkgDLAGXANjAhCBG8CbAE3gJ1gTtAA.f_wACHwAAAAA",
305
+ },
306
+ {
307
+ "domain": ".researchgate.net",
308
+ "expirationDate": 1718885236,
309
+ "hostOnly": False,
310
+ "httpOnly": False,
311
+ "name": "_gat",
312
+ "path": "/",
313
+ "sameSite": None,
314
+ "secure": False,
315
+ "session": False,
316
+ "storeId": None,
317
+ "value": "1",
318
+ },
319
+ {
320
+ "domain": "www.researchgate.net",
321
+ "expirationDate": 1721477183,
322
+ "hostOnly": True,
323
+ "httpOnly": False,
324
+ "name": "_pbjs_userid_consent_data",
325
+ "path": "/",
326
+ "sameSite": "lax",
327
+ "secure": False,
328
+ "session": False,
329
+ "storeId": None,
330
+ "value": "3524755945110770",
331
+ },
332
+ {
333
+ "domain": ".researchgate.net",
334
+ "expirationDate": 1752567981,
335
+ "hostOnly": False,
336
+ "httpOnly": False,
337
+ "name": "__gads",
338
+ "path": "/",
339
+ "sameSite": None,
340
+ "secure": False,
341
+ "session": False,
342
+ "storeId": None,
343
+ "value": "ID=eca2adb88969c830:T=1718871981:RT=1718884914:S=ALNI_MY2qZchynrhWX6hWMlaI87Pcj9riQ",
344
+ },
345
+ {
346
+ "domain": ".researchgate.net",
347
+ "expirationDate": 1718886709.646173,
348
+ "hostOnly": False,
349
+ "httpOnly": True,
350
+ "name": "__cf_bm",
351
+ "path": "/",
352
+ "sameSite": "no_restriction",
353
+ "secure": True,
354
+ "session": False,
355
+ "storeId": None,
356
+ "value": "IkQ_J4ciBzKQduRvjqsfSmQu8UygDWbHeROO5JVccfo-1718884909-1.0.1.1-qvNGEdbfI0HfhFP6kwe7R7mkTqODNhFuKhs72lLly6K2BOPMG3kbahpQFGvPK0U8FUfkznkq65gngd1sWj7sDA",
357
+ },
358
+ {
359
+ "domain": ".researchgate.net",
360
+ "expirationDate": 1752567981,
361
+ "hostOnly": False,
362
+ "httpOnly": False,
363
+ "name": "__gpi",
364
+ "path": "/",
365
+ "sameSite": None,
366
+ "secure": False,
367
+ "session": False,
368
+ "storeId": None,
369
+ "value": "UID=00000e4e9aa2e6f2:T=1718871981:RT=1718884914:S=ALNI_MYFNrgzkKn7K6Bd2y8hC6GJCvDiSg",
370
+ },
371
+ {
372
+ "domain": ".researchgate.net",
373
+ "hostOnly": False,
374
+ "httpOnly": True,
375
+ "name": "_cfuvid",
376
+ "path": "/",
377
+ "sameSite": "no_restriction",
378
+ "secure": True,
379
+ "session": True,
380
+ "storeId": None,
381
+ "value": "_GPmGZkBymiH3UiqTqzakEpi98br3nfFUWC2_u_wqkc-1718884909785-0.0.1.1-604800000",
382
+ },
383
+ {
384
+ "domain": ".researchgate.net",
385
+ "expirationDate": 1753445177.271667,
386
+ "hostOnly": False,
387
+ "httpOnly": False,
388
+ "name": "_ga",
389
+ "path": "/",
390
+ "sameSite": None,
391
+ "secure": False,
392
+ "session": False,
393
+ "storeId": None,
394
+ "value": "GA1.1.1525244793.1718885177",
395
+ },
396
+ {
397
+ "domain": ".researchgate.net",
398
+ "expirationDate": 1753445177.271482,
399
+ "hostOnly": False,
400
+ "httpOnly": False,
401
+ "name": "_ga_4P31SJ70EJ",
402
+ "path": "/",
403
+ "sameSite": None,
404
+ "secure": False,
405
+ "session": False,
406
+ "storeId": None,
407
+ "value": "GS1.1.1718885177.1.0.1718885177.0.0.0",
408
+ },
409
+ {
410
+ "domain": ".researchgate.net",
411
+ "expirationDate": 1718971576,
412
+ "hostOnly": False,
413
+ "httpOnly": False,
414
+ "name": "_gid",
415
+ "path": "/",
416
+ "sameSite": None,
417
+ "secure": False,
418
+ "session": False,
419
+ "storeId": None,
420
+ "value": "GA1.2.854907463.1718885177",
421
+ },
422
+ {
423
+ "domain": ".www.researchgate.net",
424
+ "expirationDate": 1750407982.506505,
425
+ "hostOnly": False,
426
+ "httpOnly": True,
427
+ "name": "did",
428
+ "path": "/",
429
+ "sameSite": None,
430
+ "secure": True,
431
+ "session": False,
432
+ "storeId": None,
433
+ "value": "1dWLO3C6am8l667Q4VUlBo0O1LI49Qi2Vw21SJEXHavBDYT56DI9007W5rYGVFVH",
434
+ },
435
+ {
436
+ "domain": ".researchgate.net",
437
+ "expirationDate": 1750507578,
438
+ "hostOnly": False,
439
+ "httpOnly": False,
440
+ "name": "didomi_token",
441
+ "path": "/",
442
+ "sameSite": "lax",
443
+ "secure": True,
444
+ "session": False,
445
+ "storeId": None,
446
+ "value": "eyJ1c2VyX2lkIjoiMTkwMzU4YTUtNWU2My02Y2UzLWJlNzAtZGFjNzVmYjdiY2ExIiwiY3JlYXRlZCI6IjIwMjQtMDYtMjBUMTI6MDY6MTYuODA2WiIsInVwZGF0ZWQiOiIyMDI0LTA2LTIwVDEyOjA2OjE4Ljc4MVoiLCJ2ZW5kb3JzIjp7ImVuYWJsZWQiOlsidHdpdHRlciIsImdvb2dsZSIsImM6bGlua2VkaW4tbWFya2V0aW5nLXNvbHV0aW9ucyIsImM6b3duZXJpcSIsImM6b21uaXR1cmUtYWRvYmUtYW5hbHl0aWNzIiwiYzp0ZWNobm9yYXRpLW1lZGlhIiwiYzppbnRlcmNvbSIsImM6aW50ZW50LWlxIiwiYzppcHJvbSIsImM6bGlua2VkaW4iLCJjOmFtYXpvbmFkdi16Y1hGTEI2WCIsImM6bWVkaWFuZXQtY1V3YUtFNnoiLCJjOmluZGV4ZXhjaC1OWkNRTTY4UCIsImM6emVvdGFwZ21iLWQ3YndtdGp3IiwiYzp0cmlwbGVsaWYtZGRKSDM0clkiLCJjOnJ0YmhvdXNlLWI4Y2RIOHRNIiwiYzptZHByaW1pcy1lYU4yOVdjUCIsImM6bG9vcG1lbGktVGRhWXRCUHEiLCJjOm1hZ25pdGVpbi05d1RZTHFSRCIsImM6Ymlkc3dpdGNoLWQ2N0V3N1c5IiwiYzpvcmFjbGVhZHYtcUhlREptQUwiLCJjOmdvb2dsZWFuYS00VFhuSmlnUiIsImM6bG90YW1lc29sLURIaTdMUmpNIiwiYzpuZXh0bWlsbGUtR0pyZlg4VWMiLCJjOm5yaWNodGVjLXFVVlEyUlFxIiwiYzpicml0ZXBvb2wtQldWeVdHeVUiLCJjOnRhcGFkaW5jLXFxY2tVN1BXIiwiYzppZDV0ZWNobi16Tk1KNGR3ZiIsImM6bWljcm9zb2Z0IiwiYzpwZXJtdXRpdmUtSjdpaHJlTWsiLCJjOm9wZXJhc29mdC1CY1hjRFZKTSIsImM6cG9zdGhvZy1Cakp4RmRGOSJdfSwicHVycG9zZXMiOnsiZW5hYmxlZCI6WyJnZW9sb2NhdGlvbl9kYXRhIiwiZGV2aWNlX2NoYXJhY3RlcmlzdGljcyJdfSwidmVuZG9yc19saSI6eyJlbmFibGVkIjpbImdvb2dsZSIsImM6b3BlcmFzb2Z0LUJjWGNEVkpNIl19LCJ2ZXJzaW9uIjoyLCJhYyI6IkRIU0FvQUZrQWNnQTVnSHFnUUhBeGdCNndEMTRJR0FRTkFqMEJJd0NTY0VyQUtCd1YtZ3MxQmgwREc0R09nQUEuREhTQW9BRmtBY2dBNWdIcWdRSEF4Z0I2d0QxNElHQVFOQWowQkl3Q1NjRXJBS0J3Vi1nczFCaDBERzRHT2dBQSJ9",
447
+ },
448
+ {
449
+ "domain": ".www.researchgate.net",
450
+ "hostOnly": False,
451
+ "httpOnly": True,
452
+ "name": "hasPdpNext",
453
+ "path": "/",
454
+ "sameSite": None,
455
+ "secure": True,
456
+ "session": True,
457
+ "storeId": None,
458
+ "value": "False",
459
+ },
460
+ {
461
+ "domain": ".researchgate.net",
462
+ "expirationDate": 1750421183,
463
+ "hostOnly": False,
464
+ "httpOnly": False,
465
+ "name": "ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog",
466
+ "path": "/",
467
+ "sameSite": "lax",
468
+ "secure": True,
469
+ "session": False,
470
+ "storeId": None,
471
+ "value": "%7B%22distinct_id%22%3A%220190358a-56a1-7313-83b0-d13dddeac787%22%2C%22%24sesid%22%3A%5B1718885183223%2C%220190358a-56a1-7313-83b0-d13b2b87778d%22%2C1718885176993%5D%2C%22%24session_is_sampled%22%3Atrue%7D",
472
+ },
473
+ {
474
+ "domain": ".www.researchgate.net",
475
+ "hostOnly": False,
476
+ "httpOnly": True,
477
+ "name": "sid",
478
+ "path": "/",
479
+ "sameSite": None,
480
+ "secure": True,
481
+ "session": True,
482
+ "storeId": None,
483
+ "value": "qmH5Lc4f0CUJ3zeaxORcV0S8I8V1MuCFZtcIQqPYtv1XPejrbSLAQRbT50PL40TqeKQ1XsQDWt9gtYVzuL80bRmPjw6jn3cQ0ikNqW40maHcQ3JL2Vfa8ZZf0j7p35eJ",
484
+ },
485
+ ]
486
+
487
+ COOKIES_LIST += [
488
+ {
489
+ "domain": "github.com",
490
+ "hostOnly": True,
491
+ "httpOnly": True,
492
+ "name": "_gh_sess",
493
+ "path": "/",
494
+ "sameSite": "lax",
495
+ "secure": True,
496
+ "session": True,
497
+ "storeId": None,
498
+ "value": "P%2Fmof1avuqwHaUQUIJR%2FZYn7jqbT7lgGuTGjp1BGAFIG5UpNDusEE3b8dRjz0eATE5xPdPjLYFqMs%2FI9AOalKX4YuYfSEEnxCMawU01099b4o9Xzzcv%2BmecrmO0Q8q%2Bdq1h8SIv6nvPP7HzlFesl8ysafb9b%2F0q6dTArKdSOurasza8UgLSYD08ofA50Pcm0IG7CTzF8ZCizrGgGTMi%2F%2B7L3E17jav5PM1Sf2vQKg15Gbg1QIOppJJHzlufgQoZigqFv%2BWznaws0Tt7Y2lSFCw%3D%3D--CJRhqMXJnwOaJgk4--DhUErlL4GdROikEjKD4O9g%3D%3D",
499
+ },
500
+ {
501
+ "domain": ".github.com",
502
+ "expirationDate": 1750408875.763785,
503
+ "hostOnly": False,
504
+ "httpOnly": False,
505
+ "name": "_octo",
506
+ "path": "/",
507
+ "sameSite": "lax",
508
+ "secure": True,
509
+ "session": False,
510
+ "storeId": None,
511
+ "value": "GH1.1.728652011.1718872875",
512
+ },
513
+ {
514
+ "domain": ".github.com",
515
+ "expirationDate": 1750408875.763926,
516
+ "hostOnly": False,
517
+ "httpOnly": True,
518
+ "name": "logged_in",
519
+ "path": "/",
520
+ "sameSite": "lax",
521
+ "secure": True,
522
+ "session": False,
523
+ "storeId": None,
524
+ "value": "no",
525
+ },
526
+ {
527
+ "domain": ".github.com",
528
+ "hostOnly": False,
529
+ "httpOnly": False,
530
+ "name": "preferred_color_mode",
531
+ "path": "/",
532
+ "sameSite": "lax",
533
+ "secure": True,
534
+ "session": True,
535
+ "storeId": None,
536
+ "value": "dark",
537
+ },
538
+ {
539
+ "domain": ".github.com",
540
+ "hostOnly": False,
541
+ "httpOnly": False,
542
+ "name": "tz",
543
+ "path": "/",
544
+ "sameSite": "lax",
545
+ "secure": True,
546
+ "session": True,
547
+ "storeId": None,
548
+ "value": "Europe%2FParis",
549
+ },
550
+ ]
551
+
552
+ COOKIES_LIST += [
553
+ {
554
+ "domain": ".web.archive.org",
555
+ "expirationDate": 1718886430,
556
+ "hostOnly": False,
557
+ "httpOnly": False,
558
+ "name": "_gat",
559
+ "path": "/web/20201123221659/http://orcid.org/",
560
+ "sameSite": None,
561
+ "secure": False,
562
+ "session": False,
563
+ "storeId": None,
564
+ "value": "1",
565
+ },
566
+ {
567
+ "domain": ".web.archive.org",
568
+ "expirationDate": 1718972770,
569
+ "hostOnly": False,
570
+ "httpOnly": False,
571
+ "name": "_gid",
572
+ "path": "/web/20201123221659/http://orcid.org/",
573
+ "sameSite": None,
574
+ "secure": False,
575
+ "session": False,
576
+ "storeId": None,
577
+ "value": "GA1.2.402246368.1606169825",
578
+ },
579
+ {
580
+ "domain": ".web.archive.org",
581
+ "expirationDate": 1753446370.315621,
582
+ "hostOnly": False,
583
+ "httpOnly": False,
584
+ "name": "_ga",
585
+ "path": "/web/20201123221659/http://orcid.org/",
586
+ "sameSite": None,
587
+ "secure": False,
588
+ "session": False,
589
+ "storeId": None,
590
+ "value": "GA1.2.1301409987.1606169825",
591
+ },
592
+ {
593
+ "domain": ".web.archive.org",
594
+ "expirationDate": 1750422367,
595
+ "hostOnly": False,
596
+ "httpOnly": False,
597
+ "name": "_hjid",
598
+ "path": "/web/20201123221659/http://orcid.org/",
599
+ "sameSite": "lax",
600
+ "secure": False,
601
+ "session": False,
602
+ "storeId": None,
603
+ "value": "07f80263-a631-4bf4-8ffd-8fc8912085e2",
604
+ },
605
+ {
606
+ "domain": ".web.archive.org",
607
+ "expirationDate": 1718888167,
608
+ "hostOnly": False,
609
+ "httpOnly": False,
610
+ "name": "_hjFirstSeen",
611
+ "path": "/web/20201123221659/http://orcid.org/",
612
+ "sameSite": "lax",
613
+ "secure": False,
614
+ "session": False,
615
+ "storeId": None,
616
+ "value": "1",
617
+ },
618
+ ]
619
+ COOKIES_LIST += [
620
+ {
621
+ "domain": "orcid.org",
622
+ "hostOnly": True,
623
+ "httpOnly": False,
624
+ "name": "AWSELBCORS",
625
+ "path": "/",
626
+ "sameSite": "no_restriction",
627
+ "secure": True,
628
+ "session": True,
629
+ "storeId": None,
630
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
631
+ },
632
+ {
633
+ "domain": ".orcid.org",
634
+ "expirationDate": 1753452454.637671,
635
+ "hostOnly": False,
636
+ "httpOnly": False,
637
+ "name": "_ga_9R61FWK9H5",
638
+ "path": "/",
639
+ "sameSite": None,
640
+ "secure": False,
641
+ "session": False,
642
+ "storeId": None,
643
+ "value": "GS1.1.1718892454.1.0.1718892454.0.0.0",
644
+ },
645
+ {
646
+ "domain": ".orcid.org",
647
+ "expirationDate": 1753452454.63421,
648
+ "hostOnly": False,
649
+ "httpOnly": False,
650
+ "name": "_ga",
651
+ "path": "/",
652
+ "sameSite": None,
653
+ "secure": False,
654
+ "session": False,
655
+ "storeId": None,
656
+ "value": "GA1.1.2021310691.1718892455",
657
+ },
658
+ {
659
+ "domain": "orcid.org",
660
+ "hostOnly": True,
661
+ "httpOnly": False,
662
+ "name": "AWSELB",
663
+ "path": "/",
664
+ "sameSite": None,
665
+ "secure": False,
666
+ "session": True,
667
+ "storeId": None,
668
+ "value": "CBD1D7FF1216388FA48838CBCA4774FD22800B8FB548A40EF92BB0994D5B77A8410307CDEAA69C52236663F2BF89B252C17BC0FCDF790FD59771BDDF6EA8CA4CFD29D8733F",
669
+ },
670
+ {
671
+ "domain": ".orcid.org",
672
+ "expirationDate": 1750428454,
673
+ "hostOnly": False,
674
+ "httpOnly": False,
675
+ "name": "OptanonAlertBoxClosed",
676
+ "path": "/",
677
+ "sameSite": "lax",
678
+ "secure": False,
679
+ "session": False,
680
+ "storeId": None,
681
+ "value": "2024-06-20T14:07:34.583Z",
682
+ },
683
+ {
684
+ "domain": ".orcid.org",
685
+ "expirationDate": 1750428454,
686
+ "hostOnly": False,
687
+ "httpOnly": False,
688
+ "name": "OptanonConsent",
689
+ "path": "/",
690
+ "sameSite": "lax",
691
+ "secure": False,
692
+ "session": False,
693
+ "storeId": None,
694
+ "value": "isGpcEnabled=0&datestamp=Thu+Jun+20+2024+16%3A07%3A34+GMT%2B0200+(heure+d%E2%80%99%C3%A9t%C3%A9+d%E2%80%99Europe+centrale)&version=202310.2.0&browserGpcFlag=0&isIABGlobal=False&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0003%3A1%2CC0002%3A1%2CC0004%3A1",
695
+ },
696
+ {
697
+ "domain": "orcid.org",
698
+ "hostOnly": True,
699
+ "httpOnly": False,
700
+ "name": "XSRF-TOKEN",
701
+ "path": "/",
702
+ "sameSite": None,
703
+ "secure": True,
704
+ "session": True,
705
+ "storeId": None,
706
+ "value": "6957be7a-bcb4-4d59-a522-ea9b6b210ed9",
707
+ },
708
+ ]
709
+
710
+ # Create a RequestsCookieJar instance
711
+ COOKIES = RequestsCookieJar()
712
+
713
+ # Add cookies to the jar
714
+ for cookie in COOKIES_LIST:
715
+ COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
scripts/gaia_scorer.py ADDED
@@ -0,0 +1,124 @@
1
+ import re
2
+ import string
3
+ import warnings
4
+
5
+
6
+ def normalize_number_str(number_str: str) -> float:
7
+ # we replace these common units and commas to allow
8
+ # conversion to float
9
+ for char in ["$", "%", ","]:
10
+ number_str = number_str.replace(char, "")
11
+ try:
12
+ return float(number_str)
13
+ except ValueError:
14
+ print(f"String {number_str} cannot be normalized to number str.")
15
+ return float("inf")
16
+
17
+
18
+ def split_string(
19
+ s: str,
20
+ char_list: list[str] = [",", ";"],
21
+ ) -> list[str]:
22
+ pattern = f"[{''.join(char_list)}]"
23
+ return re.split(pattern, s)
24
+
25
+
26
+ def is_float(element: any) -> bool:
27
+ try:
28
+ float(element)
29
+ return True
30
+ except ValueError:
31
+ return False
32
+
33
+
34
+ def question_scorer(
35
+ model_answer: str,
36
+ ground_truth: str,
37
+ ) -> bool:
38
+ # if gt is a number
39
+ if is_float(ground_truth):
40
+ normalized_answer = normalize_number_str(str(model_answer))
41
+ return normalized_answer == float(ground_truth)
42
+
43
+ # if gt is a list
44
+ elif any(char in ground_truth for char in [",", ";"]):
45
+ # question with the fish: normalization removes punct
46
+
47
+ gt_elems = split_string(ground_truth)
48
+ ma_elems = split_string(model_answer)
49
+
50
+ # check length is the same
51
+ if len(gt_elems) != len(ma_elems):
52
+ warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
53
+ return False
54
+
55
+ # compare each element as float or str
56
+ comparisons = []
57
+ for ma_elem, gt_elem in zip(ma_elems, gt_elems):
58
+ if is_float(gt_elem):
59
+ normalized_ma_elem = normalize_number_str(ma_elem)
60
+ comparisons.append(normalized_ma_elem == float(gt_elem))
61
+ else:
62
+ # we do not remove punct since comparisons can include punct
63
+ comparisons.append(
64
+ normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
65
+ )
66
+ return all(comparisons)
67
+
68
+ # if gt is a str
69
+ else:
70
+ return normalize_str(model_answer) == normalize_str(ground_truth)
71
+
72
+
73
+ def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
74
+ prediction = prediction.lower()
75
+ true_answer = true_answer.lower()
76
+ if len(prediction) > len(true_answer) * 3:
77
+ return False
78
+ i = 0
79
+ for letter in true_answer:
80
+ if letter in prediction[i:]:
81
+ i += prediction[i:].index(letter)
82
+ else:
83
+ return False
84
+ return True
85
+
86
+
87
+ def check_close_call(prediction, true_answer, is_correct):
88
+ if is_correct:
89
+ return True
90
+ else:
91
+ if is_float(true_answer):
92
+ return is_correct
93
+ else:
94
+ if (
95
+ check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
96
+ and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
97
+ ):
98
+ print(f"Close call: {prediction} vs {true_answer}")
99
+ return True
100
+ else:
101
+ return False
102
+
103
+
104
+ def normalize_str(input_str, remove_punct=True) -> str:
105
+ """
106
+ Normalize a string by:
107
+ - Removing all white spaces
108
+ - Optionally removing punctuation (if remove_punct is True)
109
+ - Converting to lowercase
110
+ Parameters:
111
+ - input_str: str, the string to normalize
112
+ - remove_punct: bool, whether to remove punctuation (default: True)
113
+ Returns:
114
+ - str, the normalized string
115
+ """
116
+ # Remove all white spaces. Required e.g for seagull vs. sea gull
117
+ no_spaces = re.sub(r"\s", "", input_str)
118
+
119
+ # Remove punctuation, if specified.
120
+ if remove_punct:
121
+ translator = str.maketrans("", "", string.punctuation)
122
+ return no_spaces.lower().translate(translator)
123
+ else:
124
+ return no_spaces.lower()
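
A minimal usage sketch for the scorer above (not part of the committed files; the answers are invented). question_scorer compares numbers numerically, comma- or semicolon-separated lists element by element, and everything else as normalized strings; check_close_call flags near-misses on string answers.

    from scripts.gaia_scorer import check_close_call, question_scorer  # assumes the scripts package is importable

    print(question_scorer("17.06", "17.06"))      # numeric ground truth -> True
    print(question_scorer("a, b, c", "a,b,c"))    # list ground truth, compared element-wise -> True
    print(check_close_call("Saint Petersburg", "St Petersburg", False))  # near-miss -> True, logs "Close call"
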
scripts/mdconvert.py ADDED
@@ -0,0 +1,1002 @@
1
+ # This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
2
+ # Thanks to Microsoft researchers for open-sourcing this!
3
+ # type: ignore
4
+ import base64
5
+ import copy
6
+ import html
7
+ import json
8
+ import mimetypes
9
+ import os
10
+ import re
11
+ import shutil
12
+ import subprocess
13
+ import sys
14
+ import tempfile
15
+ import traceback
16
+ import zipfile
17
+ from typing import Any, Dict, List, Optional, Union
18
+ from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
19
+
20
+ import mammoth
21
+ import markdownify
22
+ import pandas as pd
23
+ import pdfminer
24
+ import pdfminer.high_level
25
+ import pptx
26
+
27
+ # File-format detection
28
+ import puremagic
29
+ import pydub
30
+ import requests
31
+ import speech_recognition as sr
32
+ from bs4 import BeautifulSoup
33
+ from youtube_transcript_api import YouTubeTranscriptApi
34
+ from youtube_transcript_api.formatters import SRTFormatter
35
+
36
+
37
+ class _CustomMarkdownify(markdownify.MarkdownConverter):
38
+ """
39
+ A custom version of markdownify's MarkdownConverter. Changes include:
40
+
41
+ - Altering the default heading style to use '#', '##', etc.
42
+ - Removing javascript hyperlinks.
43
+ - Truncating images with large data:uri sources.
44
+ - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
45
+ """
46
+
47
+ def __init__(self, **options: Any):
48
+ options["heading_style"] = options.get("heading_style", markdownify.ATX)
49
+ # Explicitly cast options to the expected type if necessary
50
+ super().__init__(**options)
51
+
52
+ def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
53
+ """Same as usual, but be sure to start with a new line"""
54
+ if not convert_as_inline:
55
+ if not re.search(r"^\n", text):
56
+ return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
57
+
58
+ return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
59
+
60
+ def convert_a(self, el: Any, text: str, convert_as_inline: bool):
61
+ """Same as usual converter, but removes Javascript links and escapes URIs."""
62
+ prefix, suffix, text = markdownify.chomp(text) # type: ignore
63
+ if not text:
64
+ return ""
65
+ href = el.get("href")
66
+ title = el.get("title")
67
+
68
+ # Escape URIs and skip non-http or file schemes
69
+ if href:
70
+ try:
71
+ parsed_url = urlparse(href) # type: ignore
72
+ if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
73
+ return "%s%s%s" % (prefix, text, suffix)
74
+ href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
75
+ except ValueError: # It's not clear if this ever gets thrown
76
+ return "%s%s%s" % (prefix, text, suffix)
77
+
78
+ # For the replacement see #29: text nodes underscores are escaped
79
+ if (
80
+ self.options["autolinks"]
81
+ and text.replace(r"\_", "_") == href
82
+ and not title
83
+ and not self.options["default_title"]
84
+ ):
85
+ # Shortcut syntax
86
+ return "<%s>" % href
87
+ if self.options["default_title"] and not title:
88
+ title = href
89
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
90
+ return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
91
+
92
+ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
93
+ """Same as usual converter, but removes data URIs"""
94
+
95
+ alt = el.attrs.get("alt", None) or ""
96
+ src = el.attrs.get("src", None) or ""
97
+ title = el.attrs.get("title", None) or ""
98
+ title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
99
+ if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
100
+ return alt
101
+
102
+ # Remove dataURIs
103
+ if src.startswith("data:"):
104
+ src = src.split(",")[0] + "..."
105
+
106
+ return "![%s](%s%s)" % (alt, src, title_part)
107
+
108
+ def convert_soup(self, soup: Any) -> str:
109
+ return super().convert_soup(soup) # type: ignore
110
+
111
+
112
+ class DocumentConverterResult:
113
+ """The result of converting a document to text."""
114
+
115
+ def __init__(self, title: Union[str, None] = None, text_content: str = ""):
116
+ self.title: Union[str, None] = title
117
+ self.text_content: str = text_content
118
+
119
+
120
+ class DocumentConverter:
121
+ """Abstract superclass of all DocumentConverters."""
122
+
123
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
124
+ raise NotImplementedError()
125
+
126
+
127
+ class PlainTextConverter(DocumentConverter):
128
+ """Anything with content type text/plain"""
129
+
130
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
131
+ # Guess the content type from any file extension that might be around
132
+ content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
133
+
134
+ # Only accept text files
135
+ if content_type is None:
136
+ return None
137
+ # elif "text/" not in content_type.lower():
138
+ # return None
139
+
140
+ text_content = ""
141
+ with open(local_path, "rt", encoding="utf-8") as fh:
142
+ text_content = fh.read()
143
+ return DocumentConverterResult(
144
+ title=None,
145
+ text_content=text_content,
146
+ )
147
+
148
+
149
+ class HtmlConverter(DocumentConverter):
150
+ """Anything with content type text/html"""
151
+
152
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
153
+ # Bail if not html
154
+ extension = kwargs.get("file_extension", "")
155
+ if extension.lower() not in [".html", ".htm"]:
156
+ return None
157
+
158
+ result = None
159
+ with open(local_path, "rt", encoding="utf-8") as fh:
160
+ result = self._convert(fh.read())
161
+
162
+ return result
163
+
164
+ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
165
+ """Helper function that converts and HTML string."""
166
+
167
+ # Parse the string
168
+ soup = BeautifulSoup(html_content, "html.parser")
169
+
170
+ # Remove javascript and style blocks
171
+ for script in soup(["script", "style"]):
172
+ script.extract()
173
+
174
+ # Print only the main content
175
+ body_elm = soup.find("body")
176
+ webpage_text = ""
177
+ if body_elm:
178
+ webpage_text = _CustomMarkdownify().convert_soup(body_elm)
179
+ else:
180
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
181
+
182
+ assert isinstance(webpage_text, str)
183
+
184
+ return DocumentConverterResult(
185
+ title=None if soup.title is None else soup.title.string, text_content=webpage_text
186
+ )
187
+
188
+
189
+ class WikipediaConverter(DocumentConverter):
190
+ """Handle Wikipedia pages separately, focusing only on the main document content."""
191
+
192
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
193
+ # Bail if not Wikipedia
194
+ extension = kwargs.get("file_extension", "")
195
+ if extension.lower() not in [".html", ".htm"]:
196
+ return None
197
+ url = kwargs.get("url", "")
198
+ if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
199
+ return None
200
+
201
+ # Parse the file
202
+ soup = None
203
+ with open(local_path, "rt", encoding="utf-8") as fh:
204
+ soup = BeautifulSoup(fh.read(), "html.parser")
205
+
206
+ # Remove javascript and style blocks
207
+ for script in soup(["script", "style"]):
208
+ script.extract()
209
+
210
+ # Print only the main content
211
+ body_elm = soup.find("div", {"id": "mw-content-text"})
212
+ title_elm = soup.find("span", {"class": "mw-page-title-main"})
213
+
214
+ webpage_text = ""
215
+ main_title = None if soup.title is None else soup.title.string
216
+
217
+ if body_elm:
218
+ # What's the title
219
+ if title_elm and len(title_elm) > 0:
220
+ main_title = title_elm.string # type: ignore
221
+ assert isinstance(main_title, str)
222
+
223
+ # Convert the page
224
+ webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
225
+ else:
226
+ webpage_text = _CustomMarkdownify().convert_soup(soup)
227
+
228
+ return DocumentConverterResult(
229
+ title=main_title,
230
+ text_content=webpage_text,
231
+ )
232
+
233
+
234
+ class YouTubeConverter(DocumentConverter):
235
+ """Handle YouTube specially, focusing on the video title, description, and transcript."""
236
+
237
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
238
+ # Bail if not YouTube
239
+ extension = kwargs.get("file_extension", "")
240
+ if extension.lower() not in [".html", ".htm"]:
241
+ return None
242
+ url = kwargs.get("url", "")
243
+ if not url.startswith("https://www.youtube.com/watch?"):
244
+ return None
245
+
246
+ # Parse the file
247
+ soup = None
248
+ with open(local_path, "rt", encoding="utf-8") as fh:
249
+ soup = BeautifulSoup(fh.read(), "html.parser")
250
+
251
+ # Read the meta tags
252
+ assert soup.title is not None and soup.title.string is not None
253
+ metadata: Dict[str, str] = {"title": soup.title.string}
254
+ for meta in soup(["meta"]):
255
+ for a in meta.attrs:
256
+ if a in ["itemprop", "property", "name"]:
257
+ metadata[meta[a]] = meta.get("content", "")
258
+ break
259
+
260
+ # We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
261
+ try:
262
+ for script in soup(["script"]):
263
+ content = script.text
264
+ if "ytInitialData" in content:
265
+ lines = re.split(r"\r?\n", content)
266
+ obj_start = lines[0].find("{")
267
+ obj_end = lines[0].rfind("}")
268
+ if obj_start >= 0 and obj_end >= 0:
269
+ data = json.loads(lines[0][obj_start : obj_end + 1])
270
+ attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
271
+ if attrdesc:
272
+ metadata["description"] = str(attrdesc["content"])
273
+ break
274
+ except Exception:
275
+ pass
276
+
277
+ # Start preparing the page
278
+ webpage_text = "# YouTube\n"
279
+
280
+ title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
281
+ assert isinstance(title, str)
282
+
283
+ if title:
284
+ webpage_text += f"\n## {title}\n"
285
+
286
+ stats = ""
287
+ views = self._get(metadata, ["interactionCount"]) # type: ignore
288
+ if views:
289
+ stats += f"- **Views:** {views}\n"
290
+
291
+ keywords = self._get(metadata, ["keywords"]) # type: ignore
292
+ if keywords:
293
+ stats += f"- **Keywords:** {keywords}\n"
294
+
295
+ runtime = self._get(metadata, ["duration"]) # type: ignore
296
+ if runtime:
297
+ stats += f"- **Runtime:** {runtime}\n"
298
+
299
+ if len(stats) > 0:
300
+ webpage_text += f"\n### Video Metadata\n{stats}\n"
301
+
302
+ description = self._get(metadata, ["description", "og:description"]) # type: ignore
303
+ if description:
304
+ webpage_text += f"\n### Description\n{description}\n"
305
+
306
+ transcript_text = ""
307
+ parsed_url = urlparse(url) # type: ignore
308
+ params = parse_qs(parsed_url.query) # type: ignore
309
+ if "v" in params:
310
+ assert isinstance(params["v"][0], str)
311
+ video_id = str(params["v"][0])
312
+ try:
313
+ # Must be a single transcript.
314
+ transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
315
+ # transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
316
+ # Alternative formatting:
317
+ transcript_text = SRTFormatter().format_transcript(transcript)
318
+ except Exception:
319
+ pass
320
+ if transcript_text:
321
+ webpage_text += f"\n### Transcript\n{transcript_text}\n"
322
+
323
+ title = title if title else soup.title.string
324
+ assert isinstance(title, str)
325
+
326
+ return DocumentConverterResult(
327
+ title=title,
328
+ text_content=webpage_text,
329
+ )
330
+
331
+ def _get(self, metadata: Dict[str, str], keys: List[str], default: Union[str, None] = None) -> Union[str, None]:
332
+ for k in keys:
333
+ if k in metadata:
334
+ return metadata[k]
335
+ return default
336
+
337
+ def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
338
+ if isinstance(json, list):
339
+ for elm in json:
340
+ ret = self._findKey(elm, key)
341
+ if ret is not None:
342
+ return ret
343
+ elif isinstance(json, dict):
344
+ for k in json:
345
+ if k == key:
346
+ return json[k]
347
+ else:
348
+ ret = self._findKey(json[k], key)
349
+ if ret is not None:
350
+ return ret
351
+ return None
352
+
353
+
354
+ class PdfConverter(DocumentConverter):
355
+ """
356
+ Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
357
+ """
358
+
359
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
360
+ # Bail if not a PDF
361
+ extension = kwargs.get("file_extension", "")
362
+ if extension.lower() != ".pdf":
363
+ return None
364
+
365
+ return DocumentConverterResult(
366
+ title=None,
367
+ text_content=pdfminer.high_level.extract_text(local_path),
368
+ )
369
+
370
+
371
+ class DocxConverter(HtmlConverter):
372
+ """
373
+ Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
374
+ """
375
+
376
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
377
+ # Bail if not a DOCX
378
+ extension = kwargs.get("file_extension", "")
379
+ if extension.lower() != ".docx":
380
+ return None
381
+
382
+ result = None
383
+ with open(local_path, "rb") as docx_file:
384
+ result = mammoth.convert_to_html(docx_file)
385
+ html_content = result.value
386
+ result = self._convert(html_content)
387
+
388
+ return result
389
+
390
+
391
+ class XlsxConverter(HtmlConverter):
392
+ """
393
+ Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
394
+ """
395
+
396
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
397
+ # Bail if not an XLSX or XLS file
398
+ extension = kwargs.get("file_extension", "")
399
+ if extension.lower() not in [".xlsx", ".xls"]:
400
+ return None
401
+
402
+ sheets = pd.read_excel(local_path, sheet_name=None)
403
+ md_content = ""
404
+ for s in sheets:
405
+ md_content += f"## {s}\n"
406
+ html_content = sheets[s].to_html(index=False)
407
+ md_content += self._convert(html_content).text_content.strip() + "\n\n"
408
+
409
+ return DocumentConverterResult(
410
+ title=None,
411
+ text_content=md_content.strip(),
412
+ )
413
+
414
+
415
+ class PptxConverter(HtmlConverter):
416
+ """
417
+ Converts PPTX files to Markdown. Supports heading, tables and images with alt text.
418
+ """
419
+
420
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
421
+ # Bail if not a PPTX
422
+ extension = kwargs.get("file_extension", "")
423
+ if extension.lower() != ".pptx":
424
+ return None
425
+
426
+ md_content = ""
427
+
428
+ presentation = pptx.Presentation(local_path)
429
+ slide_num = 0
430
+ for slide in presentation.slides:
431
+ slide_num += 1
432
+
433
+ md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
434
+
435
+ title = slide.shapes.title
436
+ for shape in slide.shapes:
437
+ # Pictures
438
+ if self._is_picture(shape):
439
+ # https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
440
+ alt_text = ""
441
+ try:
442
+ alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
443
+ except Exception:
444
+ pass
445
+
446
+ # A placeholder name
447
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
448
+ md_content += "\n![" + (alt_text if alt_text else shape.name) + "](" + filename + ")\n"
449
+
450
+ # Tables
451
+ if self._is_table(shape):
452
+ html_table = "<html><body><table>"
453
+ first_row = True
454
+ for row in shape.table.rows:
455
+ html_table += "<tr>"
456
+ for cell in row.cells:
457
+ if first_row:
458
+ html_table += "<th>" + html.escape(cell.text) + "</th>"
459
+ else:
460
+ html_table += "<td>" + html.escape(cell.text) + "</td>"
461
+ html_table += "</tr>"
462
+ first_row = False
463
+ html_table += "</table></body></html>"
464
+ md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
465
+
466
+ # Text areas
467
+ elif shape.has_text_frame:
468
+ if shape == title:
469
+ md_content += "# " + shape.text.lstrip() + "\n"
470
+ else:
471
+ md_content += shape.text + "\n"
472
+
473
+ md_content = md_content.strip()
474
+
475
+ if slide.has_notes_slide:
476
+ md_content += "\n\n### Notes:\n"
477
+ notes_frame = slide.notes_slide.notes_text_frame
478
+ if notes_frame is not None:
479
+ md_content += notes_frame.text
480
+ md_content = md_content.strip()
481
+
482
+ return DocumentConverterResult(
483
+ title=None,
484
+ text_content=md_content.strip(),
485
+ )
486
+
487
+ def _is_picture(self, shape):
488
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
489
+ return True
490
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
491
+ if hasattr(shape, "image"):
492
+ return True
493
+ return False
494
+
495
+ def _is_table(self, shape):
496
+ if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
497
+ return True
498
+ return False
499
+
500
+
501
+ class MediaConverter(DocumentConverter):
502
+ """
503
+ Abstract class for multi-modal media (e.g., images and audio)
504
+ """
505
+
506
+ def _get_metadata(self, local_path):
507
+ exiftool = shutil.which("exiftool")
508
+ if not exiftool:
509
+ return None
510
+ else:
511
+ try:
512
+ result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
513
+ return json.loads(result)[0]
514
+ except Exception:
515
+ return None
516
+
517
+
518
+ class WavConverter(MediaConverter):
519
+ """
520
+ Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
521
+ """
522
+
523
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
524
+ # Bail if not a WAV file
525
+ extension = kwargs.get("file_extension", "")
526
+ if extension.lower() != ".wav":
527
+ return None
528
+
529
+ md_content = ""
530
+
531
+ # Add metadata
532
+ metadata = self._get_metadata(local_path)
533
+ if metadata:
534
+ for f in [
535
+ "Title",
536
+ "Artist",
537
+ "Author",
538
+ "Band",
539
+ "Album",
540
+ "Genre",
541
+ "Track",
542
+ "DateTimeOriginal",
543
+ "CreateDate",
544
+ "Duration",
545
+ ]:
546
+ if f in metadata:
547
+ md_content += f"{f}: {metadata[f]}\n"
548
+
549
+ # Transcribe
550
+ try:
551
+ transcript = self._transcribe_audio(local_path)
552
+ md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
553
+ except Exception:
554
+ md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
555
+
556
+ return DocumentConverterResult(
557
+ title=None,
558
+ text_content=md_content.strip(),
559
+ )
560
+
561
+ def _transcribe_audio(self, local_path) -> str:
562
+ recognizer = sr.Recognizer()
563
+ with sr.AudioFile(local_path) as source:
564
+ audio = recognizer.record(source)
565
+ return recognizer.recognize_google(audio).strip()
566
+
567
+
568
+ class Mp3Converter(WavConverter):
569
+ """
570
+ Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
571
+ """
572
+
573
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
574
+ # Bail if not an MP3 or M4A file
575
+ extension = kwargs.get("file_extension", "")
576
+ if extension.lower() not in [".mp3", ".m4a"]:
577
+ return None
578
+
579
+ md_content = ""
580
+
581
+ # Add metadata
582
+ metadata = self._get_metadata(local_path)
583
+ if metadata:
584
+ for f in [
585
+ "Title",
586
+ "Artist",
587
+ "Author",
588
+ "Band",
589
+ "Album",
590
+ "Genre",
591
+ "Track",
592
+ "DateTimeOriginal",
593
+ "CreateDate",
594
+ "Duration",
595
+ ]:
596
+ if f in metadata:
597
+ md_content += f"{f}: {metadata[f]}\n"
598
+
599
+ # Transcribe
600
+ handle, temp_path = tempfile.mkstemp(suffix=".wav")
601
+ os.close(handle)
602
+ try:
603
+ if extension.lower() == ".mp3":
604
+ sound = pydub.AudioSegment.from_mp3(local_path)
605
+ else:
606
+ sound = pydub.AudioSegment.from_file(local_path, format="m4a")
607
+ sound.export(temp_path, format="wav")
608
+
609
+ _args = dict()
610
+ _args.update(kwargs)
611
+ _args["file_extension"] = ".wav"
612
+
613
+ try:
614
+ transcript = super()._transcribe_audio(temp_path).strip()
615
+ md_content += "\n\n### Audio Transcript:\n" + (
616
+ "[No speech detected]" if transcript == "" else transcript
617
+ )
618
+ except Exception:
619
+ md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
620
+
621
+ finally:
622
+ os.unlink(temp_path)
623
+
624
+ # Return the result
625
+ return DocumentConverterResult(
626
+ title=None,
627
+ text_content=md_content.strip(),
628
+ )
629
+
630
+
631
+ class ZipConverter(DocumentConverter):
632
+ """
633
+ Extracts ZIP files to a permanent local directory and returns a listing of extracted files.
634
+ """
635
+
636
+ def __init__(self, extract_dir: str = "downloads"):
637
+ """
638
+ Initialize with path to extraction directory.
639
+
640
+ Args:
641
+ extract_dir: The directory where files will be extracted. Defaults to "downloads"
642
+ """
643
+ self.extract_dir = extract_dir
644
+ # Create the extraction directory if it doesn't exist
645
+ os.makedirs(self.extract_dir, exist_ok=True)
646
+
647
+ def convert(self, local_path: str, **kwargs: Any) -> Union[None, DocumentConverterResult]:
648
+ # Bail if not a ZIP file
649
+ extension = kwargs.get("file_extension", "")
650
+ if extension.lower() != ".zip":
651
+ return None
652
+
653
+ # Verify it's actually a ZIP file
654
+ if not zipfile.is_zipfile(local_path):
655
+ return None
656
+
657
+ # Extract all files and build list
658
+ extracted_files = []
659
+ with zipfile.ZipFile(local_path, "r") as zip_ref:
660
+ # Extract all files
661
+ zip_ref.extractall(self.extract_dir)
662
+ # Get list of all files
663
+ for file_path in zip_ref.namelist():
664
+ # Skip directories
665
+ if not file_path.endswith("/"):
666
+ extracted_files.append(self.extract_dir + "/" + file_path)
667
+
668
+ # Sort files for consistent output
669
+ extracted_files.sort()
670
+
671
+ # Build the markdown content
672
+ md_content = "Downloaded the following files:\n"
673
+ for file in extracted_files:
674
+ md_content += f"* {file}\n"
675
+
676
+ return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip())
677
+
678
+
679
+ class ImageConverter(MediaConverter):
680
+ """
681
+ Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
682
+ """
683
+
684
+ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
685
+ # Bail if not a supported image file
686
+ extension = kwargs.get("file_extension", "")
687
+ if extension.lower() not in [".jpg", ".jpeg", ".png"]:
688
+ return None
689
+
690
+ md_content = ""
691
+
692
+ # Add metadata
693
+ metadata = self._get_metadata(local_path)
694
+ if metadata:
695
+ for f in [
696
+ "ImageSize",
697
+ "Title",
698
+ "Caption",
699
+ "Description",
700
+ "Keywords",
701
+ "Artist",
702
+ "Author",
703
+ "DateTimeOriginal",
704
+ "CreateDate",
705
+ "GPSPosition",
706
+ ]:
707
+ if f in metadata:
708
+ md_content += f"{f}: {metadata[f]}\n"
709
+
710
+ # Try describing the image with GPTV
711
+ mlm_client = kwargs.get("mlm_client")
712
+ mlm_model = kwargs.get("mlm_model")
713
+ if mlm_client is not None and mlm_model is not None:
714
+ md_content += (
715
+ "\n# Description:\n"
716
+ + self._get_mlm_description(
717
+ local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
718
+ ).strip()
719
+ + "\n"
720
+ )
721
+
722
+ return DocumentConverterResult(
723
+ title=None,
724
+ text_content=md_content,
725
+ )
726
+
727
+ def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
728
+ if prompt is None or prompt.strip() == "":
729
+ prompt = "Write a detailed caption for this image."
730
+
731
+ sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
732
+
733
+ data_uri = ""
734
+ with open(local_path, "rb") as image_file:
735
+ content_type, encoding = mimetypes.guess_type("_dummy" + extension)
736
+ if content_type is None:
737
+ content_type = "image/jpeg"
738
+ image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
739
+ data_uri = f"data:{content_type};base64,{image_base64}"
740
+
741
+ messages = [
742
+ {
743
+ "role": "user",
744
+ "content": [
745
+ {"type": "text", "text": prompt},
746
+ {
747
+ "type": "image_url",
748
+ "image_url": {
749
+ "url": data_uri,
750
+ },
751
+ },
752
+ ],
753
+ }
754
+ ]
755
+
756
+ response = client.chat.completions.create(model=model, messages=messages)
757
+ return response.choices[0].message.content
758
+
759
+
760
+ class FileConversionException(Exception):
761
+ pass
762
+
763
+
764
+ class UnsupportedFormatException(Exception):
765
+ pass
766
+
767
+
768
+ class MarkdownConverter:
769
+ """(In preview) An extremely simple text-based document reader, suitable for LLM use.
770
+ This reader will convert common file-types or webpages to Markdown."""
771
+
772
+ def __init__(
773
+ self,
774
+ requests_session: Optional[requests.Session] = None,
775
+ mlm_client: Optional[Any] = None,
776
+ mlm_model: Optional[Any] = None,
777
+ ):
778
+ if requests_session is None:
779
+ self._requests_session = requests.Session()
780
+ else:
781
+ self._requests_session = requests_session
782
+
783
+ self._mlm_client = mlm_client
784
+ self._mlm_model = mlm_model
785
+
786
+ self._page_converters: List[DocumentConverter] = []
787
+
788
+ # Register converters for successful browsing operations
789
+ # Later registrations are tried first / take higher priority than earlier registrations
790
+ # To this end, the most specific converters should appear below the most generic converters
791
+ self.register_page_converter(PlainTextConverter())
792
+ self.register_page_converter(HtmlConverter())
793
+ self.register_page_converter(WikipediaConverter())
794
+ self.register_page_converter(YouTubeConverter())
795
+ self.register_page_converter(DocxConverter())
796
+ self.register_page_converter(XlsxConverter())
797
+ self.register_page_converter(PptxConverter())
798
+ self.register_page_converter(WavConverter())
799
+ self.register_page_converter(Mp3Converter())
800
+ self.register_page_converter(ImageConverter())
801
+ self.register_page_converter(ZipConverter())
802
+ self.register_page_converter(PdfConverter())
803
+
804
+ def convert(
805
+ self, source: Union[str, requests.Response], **kwargs: Any
806
+ ) -> DocumentConverterResult: # TODO: deal with kwargs
807
+ """
808
+ Args:
809
+ - source: can be a string representing a path or url, or a requests.response object
810
+ - extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
811
+ """
812
+
813
+ # Local path or url
814
+ if isinstance(source, str):
815
+ if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
816
+ return self.convert_url(source, **kwargs)
817
+ else:
818
+ return self.convert_local(source, **kwargs)
819
+ # Request response
820
+ elif isinstance(source, requests.Response):
821
+ return self.convert_response(source, **kwargs)
822
+
823
+ def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
824
+ # Prepare a list of extensions to try (in order of priority)
825
+ ext = kwargs.get("file_extension")
826
+ extensions = [ext] if ext is not None else []
827
+
828
+ # Get extension alternatives from the path and puremagic
829
+ base, ext = os.path.splitext(path)
830
+ self._append_ext(extensions, ext)
831
+ self._append_ext(extensions, self._guess_ext_magic(path))
832
+
833
+ # Convert
834
+ return self._convert(path, extensions, **kwargs)
835
+
836
+ # TODO what should stream's type be?
837
+ def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
838
+ # Prepare a list of extensions to try (in order of priority)
839
+ ext = kwargs.get("file_extension")
840
+ extensions = [ext] if ext is not None else []
841
+
842
+ # Save the file locally to a temporary file. It will be deleted before this method exits
843
+ handle, temp_path = tempfile.mkstemp()
844
+ fh = os.fdopen(handle, "wb")
845
+ result = None
846
+ try:
847
+ # Write to the temporary file
848
+ content = stream.read()
849
+ if isinstance(content, str):
850
+ fh.write(content.encode("utf-8"))
851
+ else:
852
+ fh.write(content)
853
+ fh.close()
854
+
855
+ # Use puremagic to check for more extension options
856
+ self._append_ext(extensions, self._guess_ext_magic(temp_path))
857
+
858
+ # Convert
859
+ result = self._convert(temp_path, extensions, **kwargs)
860
+ # Clean up
861
+ finally:
862
+ try:
863
+ fh.close()
864
+ except Exception:
865
+ pass
866
+ os.unlink(temp_path)
867
+
868
+ return result
869
+
870
+ def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type
871
+ # Send a HTTP request to the URL
872
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
873
+ response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
874
+ response.raise_for_status()
875
+ return self.convert_response(response, **kwargs)
876
+
877
+ def convert_response(
878
+ self, response: requests.Response, **kwargs: Any
879
+ ) -> DocumentConverterResult: # TODO fix kwargs type
880
+ # Prepare a list of extensions to try (in order of priority)
881
+ ext = kwargs.get("file_extension")
882
+ extensions = [ext] if ext is not None else []
883
+
884
+ # Guess from the mimetype
885
+ content_type = response.headers.get("content-type", "").split(";")[0]
886
+ self._append_ext(extensions, mimetypes.guess_extension(content_type))
887
+
888
+ # Read the content disposition if there is one
889
+ content_disposition = response.headers.get("content-disposition", "")
890
+ m = re.search(r"filename=([^;]+)", content_disposition)
891
+ if m:
892
+ base, ext = os.path.splitext(m.group(1).strip("\"'"))
893
+ self._append_ext(extensions, ext)
894
+
895
+ # Read from the extension from the path
896
+ base, ext = os.path.splitext(urlparse(response.url).path)
897
+ self._append_ext(extensions, ext)
898
+
899
+ # Save the file locally to a temporary file. It will be deleted before this method exits
900
+ handle, temp_path = tempfile.mkstemp()
901
+ fh = os.fdopen(handle, "wb")
902
+ result = None
903
+ try:
904
+ # Download the file
905
+ for chunk in response.iter_content(chunk_size=512):
906
+ fh.write(chunk)
907
+ fh.close()
908
+
909
+ # Use puremagic to check for more extension options
910
+ self._append_ext(extensions, self._guess_ext_magic(temp_path))
911
+
912
+ # Convert
913
+ result = self._convert(temp_path, extensions, url=response.url)
914
+ except Exception as e:
915
+ print(f"Error in converting: {e}")
916
+
917
+ # Clean up
918
+ finally:
919
+ try:
920
+ fh.close()
921
+ except Exception:
922
+ pass
923
+ os.unlink(temp_path)
924
+
925
+ return result
926
+
927
+ def _convert(self, local_path: str, extensions: List[Union[str, None]], **kwargs) -> DocumentConverterResult:
928
+ error_trace = ""
929
+ for ext in extensions + [None]: # Try last with no extension
930
+ for converter in self._page_converters:
931
+ _kwargs = copy.deepcopy(kwargs)
932
+
933
+ # Overwrite file_extension appropriately
934
+ if ext is None:
935
+ if "file_extension" in _kwargs:
936
+ del _kwargs["file_extension"]
937
+ else:
938
+ _kwargs.update({"file_extension": ext})
939
+
940
+ # Copy any additional global options
941
+ if "mlm_client" not in _kwargs and self._mlm_client is not None:
942
+ _kwargs["mlm_client"] = self._mlm_client
943
+
944
+ if "mlm_model" not in _kwargs and self._mlm_model is not None:
945
+ _kwargs["mlm_model"] = self._mlm_model
946
+
947
+ # If we hit an error log it and keep trying
948
+ try:
949
+ res = converter.convert(local_path, **_kwargs)
950
+ except Exception:
951
+ error_trace = ("\n\n" + traceback.format_exc()).strip()
952
+
953
+ if res is not None:
954
+ # Normalize the content
955
+ res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
956
+ res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
957
+
958
+ # Todo
959
+ return res
960
+
961
+ # If we got this far without success, report any exceptions
962
+ if len(error_trace) > 0:
963
+ raise FileConversionException(
964
+ f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
965
+ )
966
+
967
+ # Nothing can handle it!
968
+ raise UnsupportedFormatException(
969
+ f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
970
+ )
971
+
972
+ def _append_ext(self, extensions, ext):
973
+ """Append a unique non-None, non-empty extension to a list of extensions."""
974
+ if ext is None:
975
+ return
976
+ ext = ext.strip()
977
+ if ext == "":
978
+ return
979
+ # if ext not in extensions:
980
+ if True:
981
+ extensions.append(ext)
982
+
983
+ def _guess_ext_magic(self, path):
984
+ """Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
985
+ # Use puremagic to guess
986
+ try:
987
+ guesses = puremagic.magic_file(path)
988
+ if len(guesses) > 0:
989
+ ext = guesses[0].extension.strip()
990
+ if len(ext) > 0:
991
+ return ext
992
+ except FileNotFoundError:
993
+ pass
994
+ except IsADirectoryError:
995
+ pass
996
+ except PermissionError:
997
+ pass
998
+ return None
999
+
1000
+ def register_page_converter(self, converter: DocumentConverter) -> None:
1001
+ """Register a page text converter."""
1002
+ self._page_converters.insert(0, converter)
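
An illustrative sketch of the converter above (not part of the committed files; the file path and URL are placeholders). MarkdownConverter.convert dispatches on the source type: local paths go through extension and puremagic detection, URLs are fetched and routed to the most specific registered converter.

    from scripts.mdconvert import MarkdownConverter  # assumes the scripts package is importable

    converter = MarkdownConverter()
    doc = converter.convert("report.pdf")  # placeholder path; handled by PdfConverter
    print(doc.title, doc.text_content[:200])

    page = converter.convert("https://en.wikipedia.org/wiki/Markdown")  # routed to WikipediaConverter
    print(page.text_content[:200])
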
scripts/reformulator.py ADDED
@@ -0,0 +1,86 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import copy
4
+
5
+ from smolagents.models import MessageRole, Model
6
+
7
+
8
+ def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
9
+ messages = [
10
+ {
11
+ "role": MessageRole.SYSTEM,
12
+ "content": [
13
+ {
14
+ "type": "text",
15
+ "text": f"""Earlier you were asked the following:
16
+
17
+ {original_task}
18
+
19
+ Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
20
+ }
21
+ ],
22
+ }
23
+ ]
24
+
25
+ # The first message just repeats the question, so remove it
26
+ # if len(inner_messages) > 1:
27
+ # del inner_messages[0]
28
+
29
+ # copy them to this context
30
+ try:
31
+ for message in inner_messages:
32
+ if not message.get("content"):
33
+ continue
34
+ message = copy.deepcopy(message)
35
+ message["role"] = MessageRole.USER
36
+ messages.append(message)
37
+ except Exception:
38
+ messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]
39
+
40
+ # ask for the final answer
41
+ messages.append(
42
+ {
43
+ "role": MessageRole.USER,
44
+ "content": [
45
+ {
46
+ "type": "text",
47
+ "text": f"""
48
+ Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
49
+
50
+ {original_task}
51
+
52
+ To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
53
+ Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
54
+ ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
55
+ If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
56
+ If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
57
+ If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
58
+ If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
59
+ """,
60
+ }
61
+ ],
62
+ }
63
+ )
64
+
65
+ response = reformulation_model(messages).content
66
+
67
+ final_answer = response.split("FINAL ANSWER: ")[-1].strip()
68
+ print("> Reformulated answer: ", final_answer)
69
+
70
+ # if "unable to determine" in final_answer.lower():
71
+ # messages.append({"role": MessageRole.ASSISTANT, "content": response })
72
+ # messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
73
+ # I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
74
+
75
+ # To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
76
+ # Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
77
+ # ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
78
+ # If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
79
+ # If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
80
+ # If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
81
+ # """.strip()}]})
82
+
83
+ # response = model(messages).content
84
+ # print("\n>>>Making an educated guess.\n", response)
85
+ # final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
86
+ return final_answer
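
A hedged sketch of how prepare_response might be called after an agent run (not part of the committed files; the model id and the one-message transcript are invented, and any smolagents Model instance should work in place of LiteLLMModel).

    from smolagents import LiteLLMModel

    from scripts.reformulator import prepare_response

    reformulation_model = LiteLLMModel(model_id="gpt-4o")  # illustrative model choice
    inner_messages = [
        {"role": "assistant", "content": [{"type": "text", "text": "The 1928 Summer Olympics were held in Amsterdam."}]},
    ]
    final_answer = prepare_response(
        original_task="In which city were the 1928 Summer Olympics held?",
        inner_messages=inner_messages,
        reformulation_model=reformulation_model,
    )
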
scripts/run_agents.py ADDED
@@ -0,0 +1,87 @@
1
+ import json
2
+ import os
3
+ import shutil
4
+ import textwrap
5
+ from pathlib import Path
6
+
7
+ # import tqdm.asyncio
8
+ from smolagents.utils import AgentError
9
+
10
+
11
+ def serialize_agent_error(obj):
12
+ if isinstance(obj, AgentError):
13
+ return {"error_type": obj.__class__.__name__, "message": obj.message}
14
+ else:
15
+ return str(obj)
16
+
17
+
18
+ def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
19
+ prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
20
+ {question}. But do not try to answer the question directly!
21
+ Do not add any information that is not present in the image."""
22
+ return visual_inspection_tool(image_path=file_name, question=prompt)
23
+
24
+
25
+ def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
26
+ prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
27
+ {question}. But do not try to answer the question directly!
28
+ Do not add any information that is not present in the document."""
29
+ return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
30
+
31
+
32
+ def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
33
+ file_extension = file_path.split(".")[-1]
34
+ if file_extension in ["png", "jpg", "jpeg"]:
35
+ file_description = f" - Attached image: {file_path}"
36
+ file_description += (
37
+ f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
38
+ )
39
+ return file_description
40
+ elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
41
+ file_description = f" - Attached document: {file_path}"
42
+ image_path = file_path.split(".")[0] + ".png"
43
+ if os.path.exists(image_path):
44
+ description = get_image_description(image_path, question, visual_inspection_tool)
45
+ else:
46
+ description = get_document_description(file_path, question, document_inspection_tool)
47
+ file_description += f"\n -> File description: {description}"
48
+ return file_description
49
+ elif file_extension in ["mp3", "m4a", "wav"]:
50
+ return f" - Attached audio: {file_path}"
51
+ else:
52
+ return f" - Attached file: {file_path}"
53
+
54
+
55
+ def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
56
+ folder_path = file_path.replace(".zip", "")
57
+ os.makedirs(folder_path, exist_ok=True)
58
+ shutil.unpack_archive(file_path, folder_path)
59
+
60
+ prompt_use_files = ""
61
+ for root, dirs, files in os.walk(folder_path):
62
+ for file in files:
63
+ file_path = os.path.join(root, file)
64
+ prompt_use_files += "\n" + textwrap.indent(
65
+ get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
66
+ prefix=" ",
67
+ )
68
+ return prompt_use_files
69
+
70
+
71
+ def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
72
+ f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
73
+ done = set()
74
+ if f.exists():
75
+ with open(f, encoding="utf-8") as fh:
76
+ done = {json.loads(line)["task_id"] for line in fh if line.strip()}
77
+
78
+ tasks = []
79
+ for i in range(total):
80
+ task_id = int(data[i]["task_id"])
81
+ if task_id not in done:
82
+ if tasks_ids is not None:
83
+ if task_id in tasks_ids:
84
+ tasks.append(data[i])
85
+ else:
86
+ tasks.append(data[i])
87
+ return tasks
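
An illustrative sketch of get_tasks_to_run (not part of the committed files; the records and output path are invented). It skips any task whose task_id already appears in the "{stem}_answers.jsonl" file sitting next to base_filename.

    from pathlib import Path

    from scripts.run_agents import get_tasks_to_run  # assumes the scripts package is importable

    data = [{"task_id": 1, "question": "..."}, {"task_id": 2, "question": "..."}]
    todo = get_tasks_to_run(
        data,
        total=len(data),
        base_filename=Path("output/validation/my_agent.jsonl"),
        tasks_ids=None,  # None means: run everything that has not been answered yet
    )
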
scripts/text_inspector_tool.py ADDED
@@ -0,0 +1,126 @@
1
+ from typing import Optional
2
+
3
+ from smolagents import Tool
4
+ from smolagents.models import Model
5
+
6
+
7
+ class TextInspectorTool(Tool):
8
+ name = "inspect_file_as_text"
9
+ description = """
10
+ You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
11
+ This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
12
+
13
+ inputs = {
14
+ "file_path": {
15
+ "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
16
+ "type": "string",
17
+ },
18
+ "question": {
19
+ "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
20
+ "type": "string",
21
+ "nullable": True,
22
+ },
23
+ }
24
+ output_type = "string"
25
+
26
+ def __init__(self, model: Model = None, text_limit: int = 100000):
27
+ super().__init__()
28
+ self.model = model
29
+ self.text_limit = text_limit
30
+ from .mdconvert import MarkdownConverter
31
+
32
+ self.md_converter = MarkdownConverter()
33
+
34
+ def forward_initial_exam_mode(self, file_path, question):
35
+ from smolagents.models import MessageRole
36
+
37
+ result = self.md_converter.convert(file_path)
38
+
39
+ if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
40
+ raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
41
+
42
+ if ".zip" in file_path:
43
+ return result.text_content
44
+
45
+ if not question:
46
+ return result.text_content
47
+
48
+ if len(result.text_content) < 4000:
49
+ return "Document content: " + result.text_content
50
+
51
+ messages = [
52
+ {
53
+ "role": MessageRole.SYSTEM,
54
+ "content": [
55
+ {
56
+ "type": "text",
57
+ "text": "Here is a file:\n### "
58
+ + str(result.title)
59
+ + "\n\n"
60
+ + result.text_content[: self.text_limit],
61
+ }
62
+ ],
63
+ },
64
+ {
65
+ "role": MessageRole.USER,
66
+ "content": [
67
+ {
68
+ "type": "text",
69
+ "text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
70
+ + question
71
+ + "\n\nDon't answer the question yourself! Just provide useful notes on the document",
72
+ }
73
+ ],
74
+ },
75
+ ]
76
+ return self.model(messages).content
77
+
78
+ def forward(self, file_path, question: Optional[str] = None) -> str:
79
+ from smolagents.models import MessageRole
80
+
81
+ result = self.md_converter.convert(file_path)
82
+
83
+ if file_path.lower().endswith((".png", ".jpg", ".jpeg")):
84
+ raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
85
+
86
+ if ".zip" in file_path:
87
+ return result.text_content
88
+
89
+ if not question:
90
+ return result.text_content
91
+
92
+ messages = [
93
+ {
94
+ "role": MessageRole.SYSTEM,
95
+ "content": [
96
+ {
97
+ "type": "text",
98
+ "text": "You will have to write a short caption for this file, then answer this question:"
99
+ + question,
100
+ }
101
+ ],
102
+ },
103
+ {
104
+ "role": MessageRole.USER,
105
+ "content": [
106
+ {
107
+ "type": "text",
108
+ "text": "Here is the complete file:\n### "
109
+ + str(result.title)
110
+ + "\n\n"
111
+ + result.text_content[: self.text_limit],
112
+ }
113
+ ],
114
+ },
115
+ {
116
+ "role": MessageRole.USER,
117
+ "content": [
118
+ {
119
+ "type": "text",
120
+ "text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
121
+ + question,
122
+ }
123
+ ],
124
+ },
125
+ ]
126
+ return self.model(messages).content
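
A hedged usage sketch for the tool above (not part of the committed files; the model id and file path are placeholders). The tool converts the file to Markdown with MarkdownConverter, then asks the wrapped model to answer the question about it.

    from smolagents import LiteLLMModel

    from scripts.text_inspector_tool import TextInspectorTool

    model = LiteLLMModel(model_id="gpt-4o")  # illustrative model choice
    inspector = TextInspectorTool(model=model, text_limit=100000)
    notes = inspector.forward(
        file_path="data/annual_report.xlsx",  # placeholder path
        question="What was the total revenue in 2019?",
    )
    print(notes)
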
scripts/text_web_browser.py ADDED
@@ -0,0 +1,567 @@
1
+ # Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
2
+ # https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
3
+ import mimetypes
4
+ import os
5
+ import pathlib
6
+ import re
7
+ import time
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional, Tuple, Union
10
+ from urllib.parse import unquote, urljoin, urlparse
11
+
12
+ import pathvalidate
13
+ import requests
14
+ from serpapi import GoogleSearch
15
+
16
+ from smolagents import Tool
17
+
18
+ from .cookies import COOKIES
19
+ from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException
20
+
21
+
22
+ class SimpleTextBrowser:
23
+ """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""
24
+
25
+ def __init__(
26
+ self,
27
+ start_page: Optional[str] = None,
28
+ viewport_size: Optional[int] = 1024 * 8,
29
+ downloads_folder: Optional[Union[str, None]] = None,
30
+ serpapi_key: Optional[Union[str, None]] = None,
31
+ request_kwargs: Optional[Union[Dict[str, Any], None]] = None,
32
+ ):
33
+ self.start_page: str = start_page if start_page else "about:blank"
34
+ self.viewport_size = viewport_size # Applies only to the standard uri types
35
+ self.downloads_folder = downloads_folder
36
+ self.history: List[Tuple[str, float]] = list()
37
+ self.page_title: Optional[str] = None
38
+ self.viewport_current_page = 0
39
+ self.viewport_pages: List[Tuple[int, int]] = list()
40
+ self.serpapi_key = serpapi_key
41
+ self.request_kwargs = request_kwargs if request_kwargs is not None else {}
42
+ self.request_kwargs["cookies"] = COOKIES
43
+ self._mdconvert = MarkdownConverter()
44
+ self._page_content: str = ""
45
+ self._find_on_page_query: Union[str, None] = None
46
+ self._find_on_page_last_result: Union[int, None] = None  # Location of the last result
47
+
48
+ self.set_address(self.start_page)
49
+
50
+ @property
51
+ def address(self) -> str:
52
+ """Return the address of the current page."""
53
+ return self.history[-1][0]
54
+
55
+ def set_address(self, uri_or_path: str, filter_year: Optional[int] = None) -> None:
56
+ # TODO: Handle anchors
57
+ self.history.append((uri_or_path, time.time()))
58
+
59
+ # Handle special URIs
60
+ if uri_or_path == "about:blank":
61
+ self._set_page_content("")
62
+ elif uri_or_path.startswith("google:"):
63
+ self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
64
+ else:
65
+ if (
66
+ not uri_or_path.startswith("http:")
67
+ and not uri_or_path.startswith("https:")
68
+ and not uri_or_path.startswith("file:")
69
+ ):
70
+ if len(self.history) > 1:
71
+ prior_address = self.history[-2][0]
72
+ uri_or_path = urljoin(prior_address, uri_or_path)
73
+ # Update the address with the fully-qualified path
74
+ self.history[-1] = (uri_or_path, self.history[-1][1])
75
+ self._fetch_page(uri_or_path)
76
+
77
+ self.viewport_current_page = 0
78
+ self._find_on_page_query = None
79
+ self._find_on_page_last_result = None
80
+
81
+ @property
82
+ def viewport(self) -> str:
83
+ """Return the content of the current viewport."""
84
+ bounds = self.viewport_pages[self.viewport_current_page]
85
+ return self.page_content[bounds[0] : bounds[1]]
86
+
87
+ @property
88
+ def page_content(self) -> str:
89
+ """Return the full contents of the current page."""
90
+ return self._page_content
91
+
92
+ def _set_page_content(self, content: str) -> None:
93
+ """Sets the text content of the current page."""
94
+ self._page_content = content
95
+ self._split_pages()
96
+ if self.viewport_current_page >= len(self.viewport_pages):
97
+ self.viewport_current_page = len(self.viewport_pages) - 1
98
+
99
+ def page_down(self) -> None:
100
+ self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)
101
+
102
+ def page_up(self) -> None:
103
+ self.viewport_current_page = max(self.viewport_current_page - 1, 0)
104
+
105
+ def find_on_page(self, query: str) -> Union[str, None]:
106
+ """Searches for the query from the current viewport forward, looping back to the start if necessary."""
107
+
108
+ # Did we get here via a previous find_on_page search with the same query?
109
+ # If so, map to find_next
110
+ if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
111
+ return self.find_next()
112
+
113
+ # OK, it's a new search: start from the current viewport
114
+ self._find_on_page_query = query
115
+ viewport_match = self._find_next_viewport(query, self.viewport_current_page)
116
+ if viewport_match is None:
117
+ self._find_on_page_last_result = None
118
+ return None
119
+ else:
120
+ self.viewport_current_page = viewport_match
121
+ self._find_on_page_last_result = viewport_match
122
+ return self.viewport
123
+
124
+ def find_next(self) -> Union[str, None]:
125
+ """Scroll to the next viewport that matches the query"""
126
+
127
+ if self._find_on_page_query is None:
128
+ return None
129
+
130
+ starting_viewport = self._find_on_page_last_result
131
+ if starting_viewport is None:
132
+ starting_viewport = 0
133
+ else:
134
+ starting_viewport += 1
135
+ if starting_viewport >= len(self.viewport_pages):
136
+ starting_viewport = 0
137
+
138
+ viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
139
+ if viewport_match is None:
140
+ self._find_on_page_last_result = None
141
+ return None
142
+ else:
143
+ self.viewport_current_page = viewport_match
144
+ self._find_on_page_last_result = viewport_match
145
+ return self.viewport
146
+
147
+ def _find_next_viewport(self, query: str, starting_viewport: int) -> Union[int, None]:
148
+ """Search for matches between the starting viewport looping when reaching the end."""
149
+
150
+ if query is None:
151
+ return None
152
+
153
+ # Normalize the query, and convert to a regular expression
154
+ nquery = re.sub(r"\*", "__STAR__", query)
155
+ nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
156
+ nquery = nquery.replace(" __STAR__ ", "__STAR__ ") # Merge isolated stars with prior word
157
+ nquery = nquery.replace("__STAR__", ".*").lower()
158
+
159
+ if nquery.strip() == "":
160
+ return None
161
+
162
+ idxs = list()
163
+ idxs.extend(range(starting_viewport, len(self.viewport_pages)))
164
+ idxs.extend(range(0, starting_viewport))
165
+
166
+ for i in idxs:
167
+ bounds = self.viewport_pages[i]
168
+ content = self.page_content[bounds[0] : bounds[1]]
169
+
170
+ # TODO: Remove markdown links and images
171
+ ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
172
+ if re.search(nquery, ncontent):
173
+ return i
174
+
175
+ return None
176
+
177
+ def visit_page(self, path_or_uri: str, filter_year: Optional[int] = None) -> str:
178
+ """Update the address, visit the page, and return the content of the viewport."""
179
+ self.set_address(path_or_uri, filter_year=filter_year)
180
+ return self.viewport
181
+
182
+ def _split_pages(self) -> None:
183
+ # Do not split search results
184
+ if self.address.startswith("google:"):
185
+ self.viewport_pages = [(0, len(self._page_content))]
186
+ return
187
+
188
+ # Handle empty pages
189
+ if len(self._page_content) == 0:
190
+ self.viewport_pages = [(0, 0)]
191
+ return
192
+
193
+ # Break the viewport into pages
194
+ self.viewport_pages = []
195
+ start_idx = 0
196
+ while start_idx < len(self._page_content):
197
+ end_idx = min(start_idx + self.viewport_size, len(self._page_content)) # type: ignore[operator]
198
+ # Adjust to end on a space
199
+ while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
200
+ end_idx += 1
201
+ self.viewport_pages.append((start_idx, end_idx))
202
+ start_idx = end_idx
203
+
204
+ def _serpapi_search(self, query: str, filter_year: Optional[int] = None) -> None:
205
+ if self.serpapi_key is None:
206
+ raise ValueError("Missing SerpAPI key.")
207
+
208
+ params = {
209
+ "engine": "google",
210
+ "q": query,
211
+ "api_key": self.serpapi_key,
212
+ }
213
+ if filter_year is not None:
214
+ params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"
215
+
216
+ search = GoogleSearch(params)
217
+ results = search.get_dict()
218
+ self.page_title = f"{query} - Search"
219
+ if "organic_results" not in results.keys():
220
+ raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
221
+ if len(results["organic_results"]) == 0:
222
+ year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
223
+ self._set_page_content(
224
+ f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
225
+ )
226
+ return
227
+
228
+ def _prev_visit(url):
229
+ for i in range(len(self.history) - 1, -1, -1):
230
+ if self.history[i][0] == url:
231
+ return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
232
+ return ""
233
+
234
+ web_snippets: List[str] = list()
235
+ idx = 0
236
+ if "organic_results" in results:
237
+ for page in results["organic_results"]:
238
+ idx += 1
239
+ date_published = ""
240
+ if "date" in page:
241
+ date_published = "\nDate published: " + page["date"]
242
+
243
+ source = ""
244
+ if "source" in page:
245
+ source = "\nSource: " + page["source"]
246
+
247
+ snippet = ""
248
+ if "snippet" in page:
249
+ snippet = "\n" + page["snippet"]
250
+
251
+ redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"
252
+
253
+ redacted_version = redacted_version.replace("Your browser can't play this video.", "")
254
+ web_snippets.append(redacted_version)
255
+
256
+ content = (
257
+ f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
258
+ + "\n\n".join(web_snippets)
259
+ )
260
+
261
+ self._set_page_content(content)
262
+
263
+ def _fetch_page(self, url: str) -> None:
264
+ download_path = ""
265
+ try:
266
+ if url.startswith("file://"):
267
+ download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
268
+ res = self._mdconvert.convert_local(download_path)
269
+ self.page_title = res.title
270
+ self._set_page_content(res.text_content)
271
+ else:
272
+ # Prepare the request parameters
273
+ request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
274
+ request_kwargs["stream"] = True
275
+
276
+ # Send a HTTP request to the URL
277
+ response = requests.get(url, **request_kwargs)
278
+ response.raise_for_status()
279
+
280
+ # If the HTTP request was successful
281
+ content_type = response.headers.get("content-type", "")
282
+
283
+ # Text or HTML
284
+ if "text/" in content_type.lower():
285
+ res = self._mdconvert.convert_response(response)
286
+ self.page_title = res.title
287
+ self._set_page_content(res.text_content)
288
+ # A download
289
+ else:
290
+ # Try producing a safe filename
291
+ fname = None
292
+ download_path = None
293
+ try:
294
+ fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
295
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
296
+
297
+ suffix = 0
298
+ while os.path.exists(download_path) and suffix < 1000:
299
+ suffix += 1
300
+ base, ext = os.path.splitext(fname)
301
+ new_fname = f"{base}__{suffix}{ext}"
302
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))
303
+
304
+ except NameError:
305
+ pass
306
+
307
+ # No suitable name, so make one
308
+ if fname is None:
309
+ extension = mimetypes.guess_extension(content_type)
310
+ if extension is None:
311
+ extension = ".download"
312
+ fname = str(uuid.uuid4()) + extension
313
+ download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
314
+
315
+ # Open a file for writing
316
+ with open(download_path, "wb") as fh:
317
+ for chunk in response.iter_content(chunk_size=512):
318
+ fh.write(chunk)
319
+
320
+ # Render it
321
+ local_uri = pathlib.Path(download_path).as_uri()
322
+ self.set_address(local_uri)
323
+
324
+ except UnsupportedFormatException as e:
325
+ print(e)
326
+ self.page_title = ("Download complete.",)
327
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
328
+ except FileConversionException as e:
329
+ print(e)
330
+ self.page_title = ("Download complete.",)
331
+ self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
332
+ except FileNotFoundError:
333
+ self.page_title = "Error 404"
334
+ self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
335
+ except requests.exceptions.RequestException as request_exception:
336
+ try:
337
+ self.page_title = f"Error {response.status_code}"
338
+
339
+ # If the error was rendered in HTML we might as well render it
340
+ content_type = response.headers.get("content-type", "")
341
+ if content_type is not None and "text/html" in content_type.lower():
342
+ res = self._mdconvert.convert(response)
343
+ self.page_title = f"Error {response.status_code}"
344
+ self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
345
+ else:
346
+ text = ""
347
+ for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
348
+ text += chunk
349
+ self.page_title = f"Error {response.status_code}"
350
+ self._set_page_content(f"## Error {response.status_code}\n\n{text}")
351
+ except NameError:
352
+ self.page_title = "Error"
353
+ self._set_page_content(f"## Error\n\n{str(request_exception)}")
354
+
355
+ def _state(self) -> Tuple[str, str]:
356
+ header = f"Address: {self.address}\n"
357
+ if self.page_title is not None:
358
+ header += f"Title: {self.page_title}\n"
359
+
360
+ current_page = self.viewport_current_page
361
+ total_pages = len(self.viewport_pages)
362
+
363
+ address = self.address
364
+ for i in range(len(self.history) - 2, -1, -1): # Start from the second last
365
+ if self.history[i][0] == address:
366
+ header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
367
+ break
368
+
369
+ header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
370
+ return (header, self.viewport)
371
+
372
+
373
+ class SearchInformationTool(Tool):
374
+ name = "web_search"
375
+ description = "Perform a web search query (think a google search) and returns the search results."
376
+ inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
377
+ inputs["filter_year"] = {
378
+ "type": "string",
379
+ "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
380
+ "nullable": True,
381
+ }
382
+ output_type = "string"
383
+
384
+ def __init__(self, browser):
385
+ super().__init__()
386
+ self.browser = browser
387
+
388
+ def forward(self, query: str, filter_year: Optional[int] = None) -> str:
389
+ self.browser.visit_page(f"google: {query}", filter_year=filter_year)
390
+ header, content = self.browser._state()
391
+ return header.strip() + "\n=======================\n" + content
392
+
393
+
394
+ class VisitTool(Tool):
395
+ name = "visit_page"
396
+ description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
397
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
398
+ output_type = "string"
399
+
400
+ def __init__(self, browser=None):
401
+ super().__init__()
402
+ self.browser = browser
403
+
404
+ def forward(self, url: str) -> str:
405
+ self.browser.visit_page(url)
406
+ header, content = self.browser._state()
407
+ return header.strip() + "\n=======================\n" + content
408
+
409
+
410
+ class DownloadTool(Tool):
411
+ name = "download_file"
412
+ description = """
413
+ Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"]
414
+ After using this tool, for further inspection of this file you should return the download path to your manager via final_answer, and they will be able to inspect it.
415
+ DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
416
+ inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
417
+ output_type = "string"
418
+
419
+ def __init__(self, browser):
420
+ super().__init__()
421
+ self.browser = browser
422
+
423
+ def forward(self, url: str) -> str:
424
+ import requests
425
+
426
+ if "arxiv" in url:
427
+ url = url.replace("abs", "pdf")
428
+ response = requests.get(url)
429
+ content_type = response.headers.get("content-type", "")
430
+ extension = mimetypes.guess_extension(content_type)
431
+ if extension and isinstance(extension, str):
432
+ new_path = f"./downloads/file{extension}"
433
+ else:
434
+ new_path = "./downloads/file.object"
435
+
436
+ with open(new_path, "wb") as f:
437
+ f.write(response.content)
438
+
439
+ if "pdf" in extension or "txt" in extension or "htm" in extension:
440
+ raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")
441
+
442
+ return f"File was downloaded and saved under path {new_path}."
443
+
444
+
445
+ class ArchiveSearchTool(Tool):
446
+ name = "find_archived_url"
447
+ description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
448
+ inputs = {
449
+ "url": {"type": "string", "description": "The url you need the archive for."},
450
+ "date": {
451
+ "type": "string",
452
+ "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
453
+ },
454
+ }
455
+ output_type = "string"
456
+
457
+ def __init__(self, browser=None):
458
+ super().__init__()
459
+ self.browser = browser
460
+
461
+ def forward(self, url, date) -> str:
462
+ import requests
463
+
464
+ no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
465
+ archive_url = no_timestamp_url + f"&timestamp={date}"
466
+ response = requests.get(archive_url).json()
467
+ response_notimestamp = requests.get(no_timestamp_url).json()
468
+ if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
469
+ closest = response["archived_snapshots"]["closest"]
470
+ print("Archive found!", closest)
471
+
472
+ elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
473
+ closest = response_notimestamp["archived_snapshots"]["closest"]
474
+ print("Archive found!", closest)
475
+ else:
476
+ raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
477
+ target_url = closest["url"]
478
+ self.browser.visit_page(target_url)
479
+ header, content = self.browser._state()
480
+ return (
481
+ f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
482
+ + header.strip()
483
+ + "\n=======================\n"
484
+ + content
485
+ )
486
+
487
+
488
+ class PageUpTool(Tool):
489
+ name = "page_up"
490
+ description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
491
+ inputs = {}
492
+ output_type = "string"
493
+
494
+ def __init__(self, browser=None):
495
+ super().__init__()
496
+ self.browser = browser
497
+
498
+ def forward(self) -> str:
499
+ self.browser.page_up()
500
+ header, content = self.browser._state()
501
+ return header.strip() + "\n=======================\n" + content
502
+
503
+
504
+ class PageDownTool(Tool):
505
+ name = "page_down"
506
+ description = (
507
+ "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
508
+ )
509
+ inputs = {}
510
+ output_type = "string"
511
+
512
+ def __init__(self, browser=None):
513
+ super().__init__()
514
+ self.browser = browser
515
+
516
+ def forward(self) -> str:
517
+ self.browser.page_down()
518
+ header, content = self.browser._state()
519
+ return header.strip() + "\n=======================\n" + content
520
+
521
+
522
+ class FinderTool(Tool):
523
+ name = "find_on_page_ctrl_f"
524
+ description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
525
+ inputs = {
526
+ "search_string": {
527
+ "type": "string",
528
+ "description": "The string to search for on the page. This search string supports wildcards like '*'",
529
+ }
530
+ }
531
+ output_type = "string"
532
+
533
+ def __init__(self, browser=None):
534
+ super().__init__()
535
+ self.browser = browser
536
+
537
+ def forward(self, search_string: str) -> str:
538
+ find_result = self.browser.find_on_page(search_string)
539
+ header, content = self.browser._state()
540
+
541
+ if find_result is None:
542
+ return (
543
+ header.strip()
544
+ + f"\n=======================\nThe search string '{search_string}' was not found on this page."
545
+ )
546
+ else:
547
+ return header.strip() + "\n=======================\n" + content
548
+
549
+
550
+ class FindNextTool(Tool):
551
+ name = "find_next"
552
+ description = "Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
553
+ inputs = {}
554
+ output_type = "string"
555
+
556
+ def __init__(self, browser=None):
557
+ super().__init__()
558
+ self.browser = browser
559
+
560
+ def forward(self) -> str:
561
+ find_result = self.browser.find_next()
562
+ header, content = self.browser._state()
563
+
564
+ if find_result is None:
565
+ return header.strip() + "\n=======================\nThe search string was not found on this page."
566
+ else:
567
+ return header.strip() + "\n=======================\n" + content
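A minimal wiring sketch for the browser and tools defined in this file, with all tools sharing one SimpleTextBrowser so navigation state (history, viewport) stays consistent. The SERPAPI_API_KEY variable name, the User-Agent header, and the example query are illustrative assumptions; the constructor arguments follow the __init__ above:

    import os

    from scripts.text_web_browser import (
        ArchiveSearchTool,
        FinderTool,
        FindNextTool,
        PageDownTool,
        PageUpTool,
        SearchInformationTool,
        SimpleTextBrowser,
        VisitTool,
    )

    browser = SimpleTextBrowser(
        viewport_size=1024 * 8,
        downloads_folder="downloads",
        serpapi_key=os.getenv("SERPAPI_API_KEY"),                   # assumed env var name
        request_kwargs={"headers": {"User-Agent": "Mozilla/5.0"}},  # illustrative header
    )

    # One shared browser instance keeps page history and viewport position consistent across tools.
    WEB_TOOLS = [
        SearchInformationTool(browser),
        VisitTool(browser),
        PageUpTool(browser),
        PageDownTool(browser),
        FinderTool(browser),
        FindNextTool(browser),
        ArchiveSearchTool(browser),
    ]

    print(WEB_TOOLS[0].forward("GAIA benchmark", filter_year="2023"))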
scripts/visual_qa.py ADDED
@@ -0,0 +1,190 @@
1
+ import base64
2
+ import json
3
+ import mimetypes
4
+ import os
5
+ import uuid
6
+ from io import BytesIO
7
+ from typing import Optional
8
+
9
+ import PIL.Image
10
+ import requests
11
+ from dotenv import load_dotenv
12
+ from huggingface_hub import InferenceClient
13
+
14
+ from smolagents import Tool, tool
15
+
16
+
17
+ load_dotenv(override=True)
18
+
19
+
20
+ def process_images_and_text(image_path, query, client):
21
+ from transformers import AutoProcessor
22
+
23
+ messages = [
24
+ {
25
+ "role": "user",
26
+ "content": [
27
+ {"type": "image"},
28
+ {"type": "text", "text": query},
29
+ ],
30
+ },
31
+ ]
32
+ idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
33
+ prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)
34
+
35
+ # load images from local directory
36
+
37
+ # encode images to strings which can be sent to the endpoint
38
+ def encode_local_image(image_path):
39
+ # load image
40
+ image = PIL.Image.open(image_path).convert("RGB")
41
+
42
+ # Convert the image to a base64 string
43
+ buffer = BytesIO()
44
+ image.save(buffer, format="JPEG") # Use the appropriate format (e.g., JPEG, PNG)
45
+ base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
46
+
47
+ # add string formatting required by the endpoint
48
+ image_string = f"data:image/jpeg;base64,{base64_image}"
49
+
50
+ return image_string
51
+
52
+ image_string = encode_local_image(image_path)
53
+ prompt_with_images = prompt_with_template.replace("<image>", "![]({}) ").format(image_string)
54
+
55
+ payload = {
56
+ "inputs": prompt_with_images,
57
+ "parameters": {
58
+ "return_full_text": False,
59
+ "max_new_tokens": 200,
60
+ },
61
+ }
62
+
63
+ return json.loads(client.post(json=payload).decode())[0]
64
+
65
+
66
+ # Function to encode the image
67
+ def encode_image(image_path):
68
+ if image_path.startswith("http"):
69
+ user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
70
+ request_kwargs = {
71
+ "headers": {"User-Agent": user_agent},
72
+ "stream": True,
73
+ }
74
+
75
+ # Send a HTTP request to the URL
76
+ response = requests.get(image_path, **request_kwargs)
77
+ response.raise_for_status()
78
+ content_type = response.headers.get("content-type", "")
79
+
80
+ extension = mimetypes.guess_extension(content_type)
81
+ if extension is None:
82
+ extension = ".download"
83
+
84
+ fname = str(uuid.uuid4()) + extension
85
+ download_path = os.path.abspath(os.path.join("downloads", fname))
86
+
87
+ with open(download_path, "wb") as fh:
88
+ for chunk in response.iter_content(chunk_size=512):
89
+ fh.write(chunk)
90
+
91
+ image_path = download_path
92
+
93
+ with open(image_path, "rb") as image_file:
94
+ return base64.b64encode(image_file.read()).decode("utf-8")
95
+
96
+
97
+ def resize_image(image_path):
98
+ img = PIL.Image.open(image_path)
99
+ width, height = img.size
100
+ img = img.resize((int(width / 2), int(height / 2)))
101
+ new_image_path = f"resized_{image_path}"
102
+ img.save(new_image_path)
103
+ return new_image_path
104
+
105
+
106
+ class VisualQATool(Tool):
107
+ name = "visualizer"
108
+ description = "A tool that can answer questions about attached images."
109
+ inputs = {
110
+ "image_path": {
111
+ "description": "The path to the image on which to answer the question",
112
+ "type": "string",
113
+ },
114
+ "question": {"description": "the question to answer", "type": "string", "nullable": True},
115
+ }
116
+ output_type = "string"
117
+
118
+ client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")
119
+
120
+ def forward(self, image_path: str, question: Optional[str] = None) -> str:
121
+ output = ""
122
+ add_note = False
123
+ if not question:
124
+ add_note = True
125
+ question = "Please write a detailed caption for this image."
126
+ try:
127
+ output = process_images_and_text(image_path, question, self.client)
128
+ except Exception as e:
129
+ print(e)
130
+ if "Payload Too Large" in str(e):
131
+ new_image_path = resize_image(image_path)
132
+ output = process_images_and_text(new_image_path, question, self.client)
133
+
134
+ if add_note:
135
+ output = (
136
+ f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
137
+ )
138
+
139
+ return output
140
+
141
+
142
+ @tool
143
+ def visualizer(image_path: str, question: Optional[str] = None) -> str:
144
+ """A tool that can answer questions about attached images.
145
+
146
+ Args:
147
+ image_path: The path to the image on which to answer the question. This should be a local path to a downloaded image.
148
+ question: The question to answer.
149
+ """
150
+ import mimetypes
151
+ import os
152
+
153
+ import requests
154
+
155
+ from .visual_qa import encode_image
156
+
157
+ add_note = False
158
+ if not question:
159
+ add_note = True
160
+ question = "Please write a detailed caption for this image."
161
+ if not isinstance(image_path, str):
162
+ raise Exception("You should provide at least `image_path` string argument to this tool!")
163
+
164
+ mime_type, _ = mimetypes.guess_type(image_path)
165
+ base64_image = encode_image(image_path)
166
+
167
+ payload = {
168
+ "model": "gpt-4o",
169
+ "messages": [
170
+ {
171
+ "role": "user",
172
+ "content": [
173
+ {"type": "text", "text": question},
174
+ {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
175
+ ],
176
+ }
177
+ ],
178
+ "max_tokens": 1000,
179
+ }
180
+ headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
181
+ response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
182
+ try:
183
+ output = response.json()["choices"][0]["message"]["content"]
184
+ except Exception:
185
+ raise Exception(f"Response format unexpected: {response.json()}")
186
+
187
+ if add_note:
188
+ output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
189
+
190
+ return output
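A minimal usage sketch for the GPT-4o-backed visualizer tool defined above, assuming OPENAI_API_KEY is set in the environment; the image path and questions are illustrative:

    from scripts.visual_qa import visualizer

    # With no question, the tool falls back to asking for a detailed caption.
    caption = visualizer(image_path="downloads/chart.png")

    # With a question, the answer is grounded in the attached image.
    answer = visualizer(
        image_path="downloads/chart.png",
        question="What is the highest value shown on the y axis?",
    )
    print(caption)
    print(answer)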
visual_vs_text_browser.ipynb ADDED
@@ -0,0 +1,359 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Compare a text-based vs a vision-based browser\n",
8
+ "\n",
9
+ "Warning: this notebook is experimental, it probably won't work out of the box!"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": null,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "!pip install \"smolagents[litellm]\" -q"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": [
27
+ "import datasets\n",
28
+ "\n",
29
+ "\n",
30
+ "eval_ds = datasets.load_dataset(\"gaia-benchmark/GAIA\", \"2023_all\")[\"validation\"]"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 3,
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "to_keep = [\n",
40
+ " \"What's the last line of the rhyme under the flavor\",\n",
41
+ " 'Of the authors (First M. Last) that worked on the paper \"Pie Menus or Linear Menus',\n",
42
+ " \"In Series 9, Episode 11 of Doctor Who, the Doctor is trapped inside an ever-shifting maze. What is this location called in the official script for the episode? Give the setting exactly as it appears in the first scene heading.\",\n",
43
+ " \"Which contributor to the version of OpenCV where support was added for the Mask-RCNN model has the same name as a former Chinese head of government when the names are transliterated to the Latin alphabet?\",\n",
44
+ " \"The photograph in the Whitney Museum of American Art's collection with accession number 2022.128 shows a person holding a book. Which military unit did the author of this book join in 1813? Answer without using articles.\",\n",
45
+ " \"I went to Virtue restaurant & bar in Chicago for my birthday on March 22, 2021 and the main course I had was delicious! Unfortunately, when I went back about a month later on April 21, it was no longer on the dinner menu.\",\n",
46
+ " \"In Emily Midkiff's June 2014 article in a journal named for the one of Hreidmar's \",\n",
47
+ " \"Under DDC 633 on Bielefeld University Library's BASE, as of 2020\",\n",
48
+ " \"In the 2018 VSCode blog post on replit.com, what was the command they clicked on in the last video to remove extra lines?\",\n",
49
+ " \"The Metropolitan Museum of Art has a portrait in its collection with an accession number of 29.100.5. Of the consecrators and co-consecrators\",\n",
50
+ " \"In Nature journal's Scientific Reports conference proceedings from 2012, in the article that did not mention plasmons or plasmonics, what nano-compound is studied?\",\n",
51
+ " 'In the year 2022, and before December, what does \"R\" stand for in the three core policies of the type of content',\n",
52
+ " \"Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?\",\n",
53
+ "]\n",
54
+ "eval_ds = eval_ds.filter(lambda row: any([el in row[\"Question\"] for el in to_keep]))\n",
55
+ "eval_ds = eval_ds.rename_columns({\"Question\": \"question\", \"Final answer\": \"true_answer\", \"Level\": \"task\"})"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "import os\n",
65
+ "\n",
66
+ "from dotenv import load_dotenv\n",
67
+ "from huggingface_hub import login\n",
68
+ "\n",
69
+ "\n",
70
+ "load_dotenv(override=True)\n",
71
+ "\n",
72
+ "login(os.getenv(\"HF_TOKEN\"))"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "markdown",
77
+ "metadata": {},
78
+ "source": [
79
+ "### Text browser"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": null,
85
+ "metadata": {},
86
+ "outputs": [],
87
+ "source": [
88
+ "from scripts.run_agents import answer_questions\n",
89
+ "from scripts.text_inspector_tool import TextInspectorTool\n",
90
+ "from scripts.text_web_browser import (\n",
91
+ " ArchiveSearchTool,\n",
92
+ " FinderTool,\n",
93
+ " FindNextTool,\n",
94
+ " NavigationalSearchTool,\n",
95
+ " PageDownTool,\n",
96
+ " PageUpTool,\n",
97
+ " SearchInformationTool,\n",
98
+ " VisitTool,\n",
99
+ ")\n",
100
+ "from scripts.visual_qa import VisualQAGPT4Tool\n",
101
+ "\n",
102
+ "from smolagents import CodeAgent, LiteLLMModel\n",
103
+ "\n",
104
+ "\n",
105
+ "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": null,
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "### BUILD AGENTS & TOOLS\n",
115
+ "\n",
116
+ "WEB_TOOLS = [\n",
117
+ " SearchInformationTool(),\n",
118
+ " NavigationalSearchTool(),\n",
119
+ " VisitTool(),\n",
120
+ " PageUpTool(),\n",
121
+ " PageDownTool(),\n",
122
+ " FinderTool(),\n",
123
+ " FindNextTool(),\n",
124
+ " ArchiveSearchTool(),\n",
125
+ "]\n",
126
+ "\n",
127
+ "\n",
128
+ "surfer_agent = CodeAgent(\n",
129
+ " model=proprietary_model,\n",
130
+ " tools=WEB_TOOLS,\n",
131
+ " max_steps=20,\n",
132
+ " verbosity_level=2,\n",
133
+ ")\n",
134
+ "\n",
135
+ "results_text = answer_questions(\n",
136
+ " eval_ds,\n",
137
+ " surfer_agent,\n",
138
+ " \"code_gpt4o_27-01_text\",\n",
139
+ " reformulation_model=proprietary_model,\n",
140
+ " output_folder=\"output_browsers\",\n",
141
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
142
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
143
+ ")"
144
+ ]
145
+ },
146
+ {
147
+ "cell_type": "markdown",
148
+ "metadata": {},
149
+ "source": [
150
+ "### Vision browser"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "metadata": {},
157
+ "outputs": [],
158
+ "source": [
159
+ "!pip install helium -q"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "metadata": {},
166
+ "outputs": [],
167
+ "source": [
168
+ "from scripts.visual_qa import VisualQAGPT4Tool\n",
169
+ "\n",
170
+ "from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel\n",
171
+ "from smolagents.vision_web_browser import (\n",
172
+ " close_popups,\n",
173
+ " go_back,\n",
174
+ " helium_instructions,\n",
175
+ " initialize_agent,\n",
176
+ " save_screenshot,\n",
177
+ " search_item_ctrl_f,\n",
178
+ ")\n",
179
+ "\n",
180
+ "\n",
181
+ "proprietary_model = LiteLLMModel(model_id=\"gpt-4o\")\n",
182
+ "vision_browser_agent = initialize_agent(proprietary_model)\n",
183
+ "### BUILD AGENTS & TOOLS\n",
184
+ "\n",
185
+ "CodeAgent(\n",
186
+ " tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],\n",
187
+ " model=proprietary_model,\n",
188
+ " additional_authorized_imports=[\"helium\"],\n",
189
+ " step_callbacks=[save_screenshot],\n",
190
+ " max_steps=20,\n",
191
+ " verbosity_level=2,\n",
192
+ ")\n",
193
+ "\n",
194
+ "results_vision = answer_questions(\n",
195
+ " eval_ds,\n",
196
+ " vision_browser_agent,\n",
197
+ " \"code_gpt4o_27-01_vision\",\n",
198
+ " reformulation_model=proprietary_model,\n",
199
+ " output_folder=\"output_browsers\",\n",
200
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
201
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
202
+ " postprompt=helium_instructions\n",
203
+ " + \"Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them\",\n",
204
+ ")"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "markdown",
209
+ "metadata": {},
210
+ "source": [
211
+ "### Browser-use browser"
212
+ ]
213
+ },
214
+ {
215
+ "cell_type": "code",
216
+ "execution_count": null,
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "!pip install browser-use lxml_html_clean -q\n",
221
+ "!playwright install"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "metadata": {},
228
+ "outputs": [],
229
+ "source": [
230
+ "import asyncio\n",
231
+ "\n",
232
+ "import nest_asyncio\n",
233
+ "\n",
234
+ "\n",
235
+ "nest_asyncio.apply()\n",
236
+ "\n",
237
+ "from browser_use import Agent\n",
238
+ "from dotenv import load_dotenv\n",
239
+ "from langchain_openai import ChatOpenAI\n",
240
+ "\n",
241
+ "\n",
242
+ "load_dotenv()\n",
243
+ "\n",
244
+ "\n",
245
+ "class BrowserUseAgent:\n",
246
+ " logs = []\n",
247
+ "\n",
248
+ " def write_inner_memory_from_logs(self, summary_mode):\n",
249
+ " return self.results\n",
250
+ "\n",
251
+ " def run(self, task, **kwargs):\n",
252
+ " agent = Agent(\n",
253
+ " task=task,\n",
254
+ " llm=ChatOpenAI(model=\"gpt-4o\"),\n",
255
+ " )\n",
256
+ " self.results = asyncio.get_event_loop().run_until_complete(agent.run())\n",
257
+ " return self.results.history[-1].result[0].extracted_content\n",
258
+ "\n",
259
+ "\n",
260
+ "browser_use_agent = BrowserUseAgent()\n",
261
+ "\n",
262
+ "results_browseruse = answer_questions(\n",
263
+ " eval_ds,\n",
264
+ " browser_use_agent,\n",
265
+ " \"gpt-4o_27-01_browseruse\",\n",
266
+ " reformulation_model=proprietary_model,\n",
267
+ " output_folder=\"output_browsers\",\n",
268
+ " visual_inspection_tool=VisualQAGPT4Tool(),\n",
269
+ " text_inspector_tool=TextInspectorTool(proprietary_model, 40000),\n",
270
+ " postprompt=\"\",\n",
271
+ " run_simple=True,\n",
272
+ ")"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "markdown",
277
+ "metadata": {},
278
+ "source": [
279
+ "### Get results"
280
+ ]
281
+ },
282
+ {
283
+ "cell_type": "code",
284
+ "execution_count": null,
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "import pandas as pd\n",
289
+ "from scripts.gaia_scorer import question_scorer\n",
290
+ "\n",
291
+ "\n",
292
+ "results_vision, results_text, results_browseruse = (\n",
293
+ " pd.DataFrame(results_vision),\n",
294
+ " pd.DataFrame(results_text),\n",
295
+ " pd.DataFrame(results_browseruse),\n",
296
+ ")\n",
297
+ "\n",
298
+ "results_vision[\"is_correct\"] = results_vision.apply(\n",
299
+ " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
300
+ ")\n",
301
+ "results_text[\"is_correct\"] = results_text.apply(lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1)\n",
302
+ "results_browseruse[\"is_correct\"] = results_browseruse.apply(\n",
303
+ " lambda x: question_scorer(x[\"prediction\"], x[\"true_answer\"]), axis=1\n",
304
+ ")"
305
+ ]
306
+ },
307
+ {
308
+ "cell_type": "code",
309
+ "execution_count": null,
310
+ "metadata": {},
311
+ "outputs": [],
312
+ "source": [
313
+ "results = pd.concat([results_vision, results_text, results_browseruse])\n",
314
+ "results.groupby(\"agent_name\")[\"is_correct\"].mean()"
315
+ ]
316
+ },
317
+ {
318
+ "cell_type": "code",
319
+ "execution_count": null,
320
+ "metadata": {},
321
+ "outputs": [],
322
+ "source": [
323
+ "correct_vision_results = results_vision.loc[results_vision[\"is_correct\"]]\n",
324
+ "correct_vision_results"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": null,
330
+ "metadata": {},
331
+ "outputs": [],
332
+ "source": [
333
+ "false_text_results = results_text.loc[~results_text[\"is_correct\"]]\n",
334
+ "false_text_results"
335
+ ]
336
+ }
337
+ ],
338
+ "metadata": {
339
+ "kernelspec": {
340
+ "display_name": "gaia",
341
+ "language": "python",
342
+ "name": "python3"
343
+ },
344
+ "language_info": {
345
+ "codemirror_mode": {
346
+ "name": "ipython",
347
+ "version": 3
348
+ },
349
+ "file_extension": ".py",
350
+ "mimetype": "text/x-python",
351
+ "name": "python",
352
+ "nbconvert_exporter": "python",
353
+ "pygments_lexer": "ipython3",
354
+ "version": "3.12.0"
355
+ }
356
+ },
357
+ "nbformat": 4,
358
+ "nbformat_minor": 2
359
+ }
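A hypothetical follow-up to the "Get results" cells of the notebook above: lining up per-question correctness for the three browser agents side by side, assuming each results DataFrame keeps the question and agent_name columns used in the groupby cell:

    import pandas as pd

    results_all = pd.concat([results_vision, results_text, results_browseruse])
    results_all["is_correct"] = results_all["is_correct"].astype(int)  # 1 = correct, 0 = wrong

    # One row per GAIA question, one column per agent.
    per_question = results_all.pivot_table(
        index="question", columns="agent_name", values="is_correct", aggfunc="max"
    )
    print(per_question)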