vonliechti commited on
Commit
69e37b7
·
verified ·
1 Parent(s): dd5fe55

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. benchmarking.ipynb +272 -160
  2. benchmarks/baseline.pkl +3 -0
  3. data.py +4 -5
benchmarking.ipynb CHANGED
@@ -9,13 +9,19 @@
9
  },
10
  {
11
  "cell_type": "code",
12
- "execution_count": 34,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
 
16
  "import numpy as np\n",
17
- "import json\n",
18
  "import pandas as pd\n",
 
 
 
 
 
 
19
  "\n",
20
  "def display_text_df(df):\n",
21
  " display(df.style.set_properties(**{'white-space': 'pre-wrap'}).set_table_styles(\n",
@@ -27,9 +33,23 @@
27
  },
28
  {
29
  "cell_type": "code",
30
- "execution_count": 5,
31
  "metadata": {},
32
- "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  "source": [
34
  "from data import get_data\n",
35
  "data = get_data(download=False)\n"
@@ -37,97 +57,160 @@
37
  },
38
  {
39
  "cell_type": "code",
40
- "execution_count": 6,
41
  "metadata": {},
42
  "outputs": [
43
  {
44
  "data": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  "text/plain": [
46
- "('To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',\n",
47
- " 'Saint Bernadette Soubirous')"
48
  ]
49
  },
50
- "execution_count": 6,
51
  "metadata": {},
52
- "output_type": "execute_result"
53
  }
54
  ],
55
  "source": [
56
- "data.question_answer_pairs[0]"
57
  ]
58
  },
59
  {
60
  "cell_type": "code",
61
- "execution_count": 35,
62
  "metadata": {},
63
  "outputs": [
64
  {
65
  "data": {
66
  "text/html": [
67
  "<style type=\"text/css\">\n",
68
- "#T_fc111 th {\n",
69
  " text-align: left;\n",
70
  "}\n",
71
- "#T_fc111 td {\n",
72
  " text-align: left;\n",
73
  "}\n",
74
- "#T_fc111_row0_col0, #T_fc111_row0_col1, #T_fc111_row1_col0, #T_fc111_row1_col1, #T_fc111_row2_col0, #T_fc111_row2_col1, #T_fc111_row3_col0, #T_fc111_row3_col1, #T_fc111_row4_col0, #T_fc111_row4_col1, #T_fc111_row5_col0, #T_fc111_row5_col1, #T_fc111_row6_col0, #T_fc111_row6_col1, #T_fc111_row7_col0, #T_fc111_row7_col1, #T_fc111_row8_col0, #T_fc111_row8_col1, #T_fc111_row9_col0, #T_fc111_row9_col1 {\n",
75
  " white-space: pre-wrap;\n",
76
  "}\n",
77
  "</style>\n",
78
- "<table id=\"T_fc111\">\n",
79
  " <thead>\n",
80
  " <tr>\n",
81
- " <th id=\"T_fc111_level0_col0\" class=\"col_heading level0 col0\" >Question</th>\n",
82
- " <th id=\"T_fc111_level0_col1\" class=\"col_heading level0 col1\" >Answer</th>\n",
 
 
83
  " </tr>\n",
84
  " </thead>\n",
85
  " <tbody>\n",
86
  " <tr>\n",
87
- " <td id=\"T_fc111_row0_col0\" class=\"data row0 col0\" >What year was the Banská Akadémia founded?</td>\n",
88
- " <td id=\"T_fc111_row0_col1\" class=\"data row0 col1\" >1735</td>\n",
 
 
89
  " </tr>\n",
90
  " <tr>\n",
91
- " <td id=\"T_fc111_row1_col0\" class=\"data row1 col0\" >What is another speed that can also be reported by the camera?</td>\n",
92
- " <td id=\"T_fc111_row1_col1\" class=\"data row1 col1\" >SOS-based speed</td>\n",
 
 
93
  " </tr>\n",
94
  " <tr>\n",
95
- " <td id=\"T_fc111_row2_col0\" class=\"data row2 col0\" >Where were the use of advanced materials and techniques on display in Sumer?</td>\n",
96
- " <td id=\"T_fc111_row2_col1\" class=\"data row2 col1\" >Sumerian temples and palaces</td>\n",
 
 
97
  " </tr>\n",
98
  " <tr>\n",
99
- " <td id=\"T_fc111_row3_col0\" class=\"data row3 col0\" >Who is elected every even numbered year?</td>\n",
100
- " <td id=\"T_fc111_row3_col1\" class=\"data row3 col1\" >mayor</td>\n",
 
 
101
  " </tr>\n",
102
  " <tr>\n",
103
- " <td id=\"T_fc111_row4_col0\" class=\"data row4 col0\" >What was the purpose of top secret ICBM committee?</td>\n",
104
- " <td id=\"T_fc111_row4_col1\" class=\"data row4 col1\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
 
 
105
  " </tr>\n",
106
  " <tr>\n",
107
- " <td id=\"T_fc111_row5_col0\" class=\"data row5 col0\" >What conferences became a requirement after Vatican II?</td>\n",
108
- " <td id=\"T_fc111_row5_col1\" class=\"data row5 col1\" >National Bishop Conferences</td>\n",
 
 
109
  " </tr>\n",
110
  " <tr>\n",
111
- " <td id=\"T_fc111_row6_col0\" class=\"data row6 col0\" >Who does M fight with?</td>\n",
112
- " <td id=\"T_fc111_row6_col1\" class=\"data row6 col1\" >C</td>\n",
 
 
113
  " </tr>\n",
114
  " <tr>\n",
115
- " <td id=\"T_fc111_row7_col0\" class=\"data row7 col0\" >How many species of fungi have been found on Antarctica?</td>\n",
116
- " <td id=\"T_fc111_row7_col1\" class=\"data row7 col1\" >1150</td>\n",
 
 
117
  " </tr>\n",
118
  " <tr>\n",
119
- " <td id=\"T_fc111_row8_col0\" class=\"data row8 col0\" >After losing the battle of Guilford Courthouse, Cornawallis moved his troops where?</td>\n",
120
- " <td id=\"T_fc111_row8_col1\" class=\"data row8 col1\" >Virginia coastline</td>\n",
 
 
121
  " </tr>\n",
122
  " <tr>\n",
123
- " <td id=\"T_fc111_row9_col0\" class=\"data row9 col0\" >What is the Olympic Torch made from?</td>\n",
124
- " <td id=\"T_fc111_row9_col1\" class=\"data row9 col1\" >aluminum.</td>\n",
 
 
125
  " </tr>\n",
126
  " </tbody>\n",
127
  "</table>\n"
128
  ],
129
  "text/plain": [
130
- "<pandas.io.formats.style.Styler at 0x3afc43c80>"
131
  ]
132
  },
133
  "metadata": {},
@@ -136,12 +219,8 @@
136
  ],
137
  "source": [
138
  "np.random.seed(42)\n",
139
- "arr =np.array(data.question_answer_pairs)\n",
140
- "n_samples = 10\n",
141
- "indices = np.random.choice(len(arr), n_samples, replace=False)\n",
142
- "random_sample = arr[indices]\n",
143
- "# Display the questions and answers in the random sample as a dataframe\n",
144
- "dfSample = pd.DataFrame(random_sample, columns=[\"Question\", \"Answer\"])\n",
145
  "display_text_df(dfSample)"
146
  ]
147
  },
@@ -154,30 +233,35 @@
154
  },
155
  {
156
  "cell_type": "code",
157
- "execution_count": 8,
158
  "metadata": {},
159
  "outputs": [],
160
- "source": [
161
- "from agent import get_agent\n",
162
- "agent = get_agent()"
163
- ]
164
  },
165
  {
166
  "cell_type": "markdown",
167
  "metadata": {},
168
  "source": [
169
- "### Run the agent on the random sample of questions"
 
 
 
 
 
 
 
 
170
  ]
171
  },
172
  {
173
  "cell_type": "code",
174
- "execution_count": 36,
175
  "metadata": {},
176
  "outputs": [
177
  {
178
  "data": {
179
  "application/vnd.jupyter.widget-view+json": {
180
- "model_id": "4bce5a5c2449435dbd058ed938db2a91",
181
  "version_major": 2,
182
  "version_minor": 0
183
  },
@@ -190,188 +274,216 @@
190
  }
191
  ],
192
  "source": [
193
- "from gradio import ChatMessage\n",
194
- "from transformers.agents import agent_types\n",
195
- "from tqdm.notebook import tqdm\n",
196
- "import logging\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  "\n",
198
- "answers_ref, answers_pred = [], [] \n",
 
199
  "\n",
200
- "# Suppress logging from the agent, which can be quite verbose\n",
201
- "agent.logger.setLevel(logging.CRITICAL)\n",
 
 
 
202
  "\n",
203
- "for question, answer in tqdm(random_sample):\n",
204
- " class Output:\n",
205
- " output: agent_types.AgentType | str = None\n",
206
  "\n",
207
- " prompt = question\n",
208
- " answers_ref.append(answer)\n",
209
- " final_answer = agent.run(prompt, stream=False, reset=True)\n",
210
- " answers_pred.append(final_answer)"
211
  ]
212
  },
213
  {
214
  "cell_type": "markdown",
215
  "metadata": {},
216
  "source": [
217
- "### Use semantic similarity to evaluate the agent's answers against the reference answers\n",
218
- "\n",
219
- "* One flaw of this approach is that it does not take into account the existence of multiple acceptable answers.\n",
220
- "* It also does not benefit from having the context of the question. "
221
  ]
222
  },
223
  {
224
  "cell_type": "code",
225
- "execution_count": 37,
226
  "metadata": {},
227
  "outputs": [],
228
  "source": [
229
- "from semscore import EmbeddingModelWrapper\n",
230
- "from statistics import mean\n",
231
  "\n",
232
- "answers_ref = [str(answer) for answer in answers_ref]\n",
233
- "answers_pred = [str(answer) for answer in answers_pred]\n",
 
234
  "\n",
235
- "em = EmbeddingModelWrapper()\n",
236
- "similarities = em.get_similarities(\n",
237
- " em.get_embeddings( answers_pred ),\n",
238
- " em.get_embeddings( answers_ref ),\n",
239
- ")"
240
  ]
241
  },
242
  {
243
  "cell_type": "code",
244
- "execution_count": 39,
245
  "metadata": {},
246
  "outputs": [
247
  {
248
  "data": {
249
  "text/html": [
250
  "<style type=\"text/css\">\n",
251
- "#T_67704 th {\n",
252
  " text-align: left;\n",
253
  "}\n",
254
- "#T_67704 td {\n",
255
  " text-align: left;\n",
256
  "}\n",
257
- "#T_67704_row0_col0, #T_67704_row0_col1, #T_67704_row0_col2, #T_67704_row0_col3, #T_67704_row1_col0, #T_67704_row1_col1, #T_67704_row1_col2, #T_67704_row1_col3, #T_67704_row2_col0, #T_67704_row2_col1, #T_67704_row2_col2, #T_67704_row2_col3, #T_67704_row3_col0, #T_67704_row3_col1, #T_67704_row3_col2, #T_67704_row3_col3, #T_67704_row4_col0, #T_67704_row4_col1, #T_67704_row4_col2, #T_67704_row4_col3, #T_67704_row5_col0, #T_67704_row5_col1, #T_67704_row5_col2, #T_67704_row5_col3, #T_67704_row6_col0, #T_67704_row6_col1, #T_67704_row6_col2, #T_67704_row6_col3, #T_67704_row7_col0, #T_67704_row7_col1, #T_67704_row7_col2, #T_67704_row7_col3, #T_67704_row8_col0, #T_67704_row8_col1, #T_67704_row8_col2, #T_67704_row8_col3, #T_67704_row9_col0, #T_67704_row9_col1, #T_67704_row9_col2, #T_67704_row9_col3 {\n",
258
  " white-space: pre-wrap;\n",
259
  "}\n",
260
  "</style>\n",
261
- "<table id=\"T_67704\">\n",
262
  " <thead>\n",
263
  " <tr>\n",
264
- " <th id=\"T_67704_level0_col0\" class=\"col_heading level0 col0\" >Question</th>\n",
265
- " <th id=\"T_67704_level0_col1\" class=\"col_heading level0 col1\" >Reference Answer</th>\n",
266
- " <th id=\"T_67704_level0_col2\" class=\"col_heading level0 col2\" >Predicted Answer</th>\n",
267
- " <th id=\"T_67704_level0_col3\" class=\"col_heading level0 col3\" >Similarity</th>\n",
 
 
268
  " </tr>\n",
269
  " </thead>\n",
270
  " <tbody>\n",
271
  " <tr>\n",
272
- " <td id=\"T_67704_row0_col0\" class=\"data row0 col0\" >What year was the Banská Akadémia founded?</td>\n",
273
- " <td id=\"T_67704_row0_col1\" class=\"data row0 col1\" >1735</td>\n",
274
- " <td id=\"T_67704_row0_col2\" class=\"data row0 col2\" >1735</td>\n",
275
- " <td id=\"T_67704_row0_col3\" class=\"data row0 col3\" >1.000000</td>\n",
 
 
276
  " </tr>\n",
277
  " <tr>\n",
278
- " <td id=\"T_67704_row1_col0\" class=\"data row1 col0\" >What is another speed that can also be reported by the camera?</td>\n",
279
- " <td id=\"T_67704_row1_col1\" class=\"data row1 col1\" >SOS-based speed</td>\n",
280
- " <td id=\"T_67704_row1_col2\" class=\"data row1 col2\" >Average speed</td>\n",
281
- " <td id=\"T_67704_row1_col3\" class=\"data row1 col3\" >0.433297</td>\n",
 
 
282
  " </tr>\n",
283
  " <tr>\n",
284
- " <td id=\"T_67704_row2_col0\" class=\"data row2 col0\" >Where were the use of advanced materials and techniques on display in Sumer?</td>\n",
285
- " <td id=\"T_67704_row2_col1\" class=\"data row2 col1\" >Sumerian temples and palaces</td>\n",
286
- " <td id=\"T_67704_row2_col2\" class=\"data row2 col2\" >Based on the information provided, it appears that the Sumerians developed and displayed advanced materials and techniques such as metrology, writing, and astronomy throughout their city-states. The specific locations where these advanced materials and techniques were on display are not explicitly mentioned.\n",
287
- "\n",
288
- "However, considering the context of the question, I would argue that the city-states of Sumer itself is the most relevant answer. The city-states of Sumer were the hub of Sumerian civilization, culture, and innovation, and it was likely there that these advanced materials and techniques were developed, displayed, and showcased.\n",
289
- "\n",
290
- "Therefore, my final answer to the user request is:\n",
291
- "\n",
292
- "The city-states of Sumer</td>\n",
293
- " <td id=\"T_67704_row2_col3\" class=\"data row2 col3\" >0.545807</td>\n",
294
  " </tr>\n",
295
  " <tr>\n",
296
- " <td id=\"T_67704_row3_col0\" class=\"data row3 col0\" >Who is elected every even numbered year?</td>\n",
297
- " <td id=\"T_67704_row3_col1\" class=\"data row3 col1\" >mayor</td>\n",
298
- " <td id=\"T_67704_row3_col2\" class=\"data row3 col2\" >mayor</td>\n",
299
- " <td id=\"T_67704_row3_col3\" class=\"data row3 col3\" >1.000000</td>\n",
 
 
300
  " </tr>\n",
301
  " <tr>\n",
302
- " <td id=\"T_67704_row4_col0\" class=\"data row4 col0\" >What was the purpose of top secret ICBM committee?</td>\n",
303
- " <td id=\"T_67704_row4_col1\" class=\"data row4 col1\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
304
- " <td id=\"T_67704_row4_col2\" class=\"data row4 col2\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
305
- " <td id=\"T_67704_row4_col3\" class=\"data row4 col3\" >1.000000</td>\n",
 
 
306
  " </tr>\n",
307
  " <tr>\n",
308
- " <td id=\"T_67704_row5_col0\" class=\"data row5 col0\" >What conferences became a requirement after Vatican II?</td>\n",
309
- " <td id=\"T_67704_row5_col1\" class=\"data row5 col1\" >National Bishop Conferences</td>\n",
310
- " <td id=\"T_67704_row5_col2\" class=\"data row5 col2\" >['National Bishop Conferences']</td>\n",
311
- " <td id=\"T_67704_row5_col3\" class=\"data row5 col3\" >0.937632</td>\n",
 
 
312
  " </tr>\n",
313
  " <tr>\n",
314
- " <td id=\"T_67704_row6_col0\" class=\"data row6 col0\" >Who does M fight with?</td>\n",
315
- " <td id=\"T_67704_row6_col1\" class=\"data row6 col1\" >C</td>\n",
316
- " <td id=\"T_67704_row6_col2\" class=\"data row6 col2\" >C</td>\n",
317
- " <td id=\"T_67704_row6_col3\" class=\"data row6 col3\" >1.000000</td>\n",
 
 
318
  " </tr>\n",
319
  " <tr>\n",
320
- " <td id=\"T_67704_row7_col0\" class=\"data row7 col0\" >How many species of fungi have been found on Antarctica?</td>\n",
321
- " <td id=\"T_67704_row7_col1\" class=\"data row7 col1\" >1150</td>\n",
322
- " <td id=\"T_67704_row7_col2\" class=\"data row7 col2\" >Based on the output from the `squad_retriever` tool, I can see that there are two documents in the SQuAD dataset that answer the question \"How many species of fungi have been found on Antarctica?\".\n",
323
- "\n",
324
- "The first document states that about 1150 species of fungi have been recorded from Antarctica. The second document does not provide a different answer to this question.\n",
325
- "\n",
326
- "Therefore, my final answer is:\n",
327
- "\n",
328
- "There are approximately 1150 species of fungi that have been found on Antarctica.</td>\n",
329
- " <td id=\"T_67704_row7_col3\" class=\"data row7 col3\" >-0.020657</td>\n",
330
  " </tr>\n",
331
  " <tr>\n",
332
- " <td id=\"T_67704_row8_col0\" class=\"data row8 col0\" >After losing the battle of Guilford Courthouse, Cornawallis moved his troops where?</td>\n",
333
- " <td id=\"T_67704_row8_col1\" class=\"data row8 col1\" >Virginia coastline</td>\n",
334
- " <td id=\"T_67704_row8_col2\" class=\"data row8 col2\" >The Virginia coastline</td>\n",
335
- " <td id=\"T_67704_row8_col3\" class=\"data row8 col3\" >0.948570</td>\n",
 
 
336
  " </tr>\n",
337
  " <tr>\n",
338
- " <td id=\"T_67704_row9_col0\" class=\"data row9 col0\" >What is the Olympic Torch made from?</td>\n",
339
- " <td id=\"T_67704_row9_col1\" class=\"data row9 col1\" >aluminum.</td>\n",
340
- " <td id=\"T_67704_row9_col2\" class=\"data row9 col2\" >aluminum</td>\n",
341
- " <td id=\"T_67704_row9_col3\" class=\"data row9 col3\" >0.973508</td>\n",
 
 
342
  " </tr>\n",
343
  " </tbody>\n",
344
  "</table>\n"
345
  ],
346
  "text/plain": [
347
- "<pandas.io.formats.style.Styler at 0x3b0db7320>"
348
  ]
349
  },
350
  "metadata": {},
351
  "output_type": "display_data"
352
- },
353
- {
354
- "name": "stdout",
355
- "output_type": "stream",
356
- "text": [
357
- "Mean similarity: 0.78\n"
358
- ]
359
  }
360
  ],
361
  "source": [
362
- "import pandas as pd\n",
363
- "questions = [question for question, _ in random_sample]\n",
364
- "dfAnswers = pd.DataFrame(list(zip(questions, answers_ref, answers_pred)), columns=[\"Question\", \"Reference Answer\", \"Predicted Answer\"])\n",
365
- "dfAnswers[\"Similarity\"] = similarities\n",
366
- "display(dfAnswers.style.set_properties(**{'white-space': 'pre-wrap'}).set_table_styles(\n",
367
- " [{'selector': 'th', 'props': [('text-align', 'left')]},\n",
368
- " {'selector': 'td', 'props': [('text-align', 'left')]}\n",
369
- " ]\n",
370
- ").hide())\n",
371
- "print(f\"Mean similarity: {round(mean(similarities), 2)}\")\n",
372
- "\n"
 
 
 
373
  ]
374
  },
 
 
 
 
 
 
 
375
  {
376
  "cell_type": "code",
377
  "execution_count": null,
 
9
  },
10
  {
11
  "cell_type": "code",
12
+ "execution_count": 1,
13
  "metadata": {},
14
  "outputs": [],
15
  "source": [
16
+ "import os\n",
17
  "import numpy as np\n",
 
18
  "import pandas as pd\n",
19
+ "from transformers.agents import agent_types\n",
20
+ "from tqdm.notebook import tqdm\n",
21
+ "import logging\n",
22
+ "from semscore import EmbeddingModelWrapper\n",
23
+ "from statistics import mean\n",
24
+ "\n",
25
  "\n",
26
  "def display_text_df(df):\n",
27
  " display(df.style.set_properties(**{'white-space': 'pre-wrap'}).set_table_styles(\n",
 
33
  },
34
  {
35
  "cell_type": "code",
36
+ "execution_count": 2,
37
  "metadata": {},
38
+ "outputs": [
39
+ {
40
+ "name": "stdout",
41
+ "output_type": "stream",
42
+ "text": [
43
+ "Initializing Data...\n",
44
+ "Download: False\n",
45
+ "Loading data...\n",
46
+ "Raw Data loaded\n",
47
+ "Chroma DB already exists\n",
48
+ "Loading index...\n",
49
+ "Index loaded\n"
50
+ ]
51
+ }
52
+ ],
53
  "source": [
54
  "from data import get_data\n",
55
  "data = get_data(download=False)\n"
 
57
  },
58
  {
59
  "cell_type": "code",
60
+ "execution_count": 3,
61
  "metadata": {},
62
  "outputs": [
63
  {
64
  "data": {
65
+ "text/html": [
66
+ "<style type=\"text/css\">\n",
67
+ "#T_3b140 th {\n",
68
+ " text-align: left;\n",
69
+ "}\n",
70
+ "#T_3b140 td {\n",
71
+ " text-align: left;\n",
72
+ "}\n",
73
+ "#T_3b140_row0_col0, #T_3b140_row0_col1, #T_3b140_row0_col2, #T_3b140_row0_col3, #T_3b140_row1_col0, #T_3b140_row1_col1, #T_3b140_row1_col2, #T_3b140_row1_col3, #T_3b140_row2_col0, #T_3b140_row2_col1, #T_3b140_row2_col2, #T_3b140_row2_col3 {\n",
74
+ " white-space: pre-wrap;\n",
75
+ "}\n",
76
+ "</style>\n",
77
+ "<table id=\"T_3b140\">\n",
78
+ " <thead>\n",
79
+ " <tr>\n",
80
+ " <th id=\"T_3b140_level0_col0\" class=\"col_heading level0 col0\" >Title</th>\n",
81
+ " <th id=\"T_3b140_level0_col1\" class=\"col_heading level0 col1\" >Context</th>\n",
82
+ " <th id=\"T_3b140_level0_col2\" class=\"col_heading level0 col2\" >Question</th>\n",
83
+ " <th id=\"T_3b140_level0_col3\" class=\"col_heading level0 col3\" >Answer</th>\n",
84
+ " </tr>\n",
85
+ " </thead>\n",
86
+ " <tbody>\n",
87
+ " <tr>\n",
88
+ " <td id=\"T_3b140_row0_col0\" class=\"data row0 col0\" >University_of_Notre_Dame</td>\n",
89
+ " <td id=\"T_3b140_row0_col1\" class=\"data row0 col1\" >Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.</td>\n",
90
+ " <td id=\"T_3b140_row0_col2\" class=\"data row0 col2\" >To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?</td>\n",
91
+ " <td id=\"T_3b140_row0_col3\" class=\"data row0 col3\" >Saint Bernadette Soubirous</td>\n",
92
+ " </tr>\n",
93
+ " <tr>\n",
94
+ " <td id=\"T_3b140_row1_col0\" class=\"data row1 col0\" >University_of_Notre_Dame</td>\n",
95
+ " <td id=\"T_3b140_row1_col1\" class=\"data row1 col1\" >Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.</td>\n",
96
+ " <td id=\"T_3b140_row1_col2\" class=\"data row1 col2\" >What is in front of the Notre Dame Main Building?</td>\n",
97
+ " <td id=\"T_3b140_row1_col3\" class=\"data row1 col3\" >a copper statue of Christ</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <td id=\"T_3b140_row2_col0\" class=\"data row2 col0\" >University_of_Notre_Dame</td>\n",
101
+ " <td id=\"T_3b140_row2_col1\" class=\"data row2 col1\" >Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend \"Venite Ad Me Omnes\". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.</td>\n",
102
+ " <td id=\"T_3b140_row2_col2\" class=\"data row2 col2\" >The Basilica of the Sacred heart at Notre Dame is beside to which structure?</td>\n",
103
+ " <td id=\"T_3b140_row2_col3\" class=\"data row2 col3\" >the Main Building</td>\n",
104
+ " </tr>\n",
105
+ " </tbody>\n",
106
+ "</table>\n"
107
+ ],
108
  "text/plain": [
109
+ "<pandas.io.formats.style.Styler at 0x16b7441d0>"
 
110
  ]
111
  },
 
112
  "metadata": {},
113
+ "output_type": "display_data"
114
  }
115
  ],
116
  "source": [
117
+ "display_text_df(data.df.head(3))\n"
118
  ]
119
  },
120
  {
121
  "cell_type": "code",
122
+ "execution_count": 4,
123
  "metadata": {},
124
  "outputs": [
125
  {
126
  "data": {
127
  "text/html": [
128
  "<style type=\"text/css\">\n",
129
+ "#T_8f656 th {\n",
130
  " text-align: left;\n",
131
  "}\n",
132
+ "#T_8f656 td {\n",
133
  " text-align: left;\n",
134
  "}\n",
135
+ "#T_8f656_row0_col0, #T_8f656_row0_col1, #T_8f656_row0_col2, #T_8f656_row0_col3, #T_8f656_row1_col0, #T_8f656_row1_col1, #T_8f656_row1_col2, #T_8f656_row1_col3, #T_8f656_row2_col0, #T_8f656_row2_col1, #T_8f656_row2_col2, #T_8f656_row2_col3, #T_8f656_row3_col0, #T_8f656_row3_col1, #T_8f656_row3_col2, #T_8f656_row3_col3, #T_8f656_row4_col0, #T_8f656_row4_col1, #T_8f656_row4_col2, #T_8f656_row4_col3, #T_8f656_row5_col0, #T_8f656_row5_col1, #T_8f656_row5_col2, #T_8f656_row5_col3, #T_8f656_row6_col0, #T_8f656_row6_col1, #T_8f656_row6_col2, #T_8f656_row6_col3, #T_8f656_row7_col0, #T_8f656_row7_col1, #T_8f656_row7_col2, #T_8f656_row7_col3, #T_8f656_row8_col0, #T_8f656_row8_col1, #T_8f656_row8_col2, #T_8f656_row8_col3, #T_8f656_row9_col0, #T_8f656_row9_col1, #T_8f656_row9_col2, #T_8f656_row9_col3 {\n",
136
  " white-space: pre-wrap;\n",
137
  "}\n",
138
  "</style>\n",
139
+ "<table id=\"T_8f656\">\n",
140
  " <thead>\n",
141
  " <tr>\n",
142
+ " <th id=\"T_8f656_level0_col0\" class=\"col_heading level0 col0\" >Title</th>\n",
143
+ " <th id=\"T_8f656_level0_col1\" class=\"col_heading level0 col1\" >Context</th>\n",
144
+ " <th id=\"T_8f656_level0_col2\" class=\"col_heading level0 col2\" >Question</th>\n",
145
+ " <th id=\"T_8f656_level0_col3\" class=\"col_heading level0 col3\" >Answer</th>\n",
146
  " </tr>\n",
147
  " </thead>\n",
148
  " <tbody>\n",
149
  " <tr>\n",
150
+ " <td id=\"T_8f656_row0_col0\" class=\"data row0 col0\" >Institute_of_technology</td>\n",
151
+ " <td id=\"T_8f656_row0_col1\" class=\"data row0 col1\" >The world's first institution of technology or technical university with tertiary technical education is the Banská Akadémia in Banská Štiavnica, Slovakia, founded in 1735, Academy since December 13, 1762 established by queen Maria Theresa in order to train specialists of silver and gold mining and metallurgy in neighbourhood. Teaching started in 1764. Later the department of Mathematics, Mechanics and Hydraulics and department of Forestry were settled. University buildings are still at their place today and are used for teaching. University has launched the first book of electrotechnics in the world.</td>\n",
152
+ " <td id=\"T_8f656_row0_col2\" class=\"data row0 col2\" >What year was the Banská Akadémia founded?</td>\n",
153
+ " <td id=\"T_8f656_row0_col3\" class=\"data row0 col3\" >1735</td>\n",
154
  " </tr>\n",
155
  " <tr>\n",
156
+ " <td id=\"T_8f656_row1_col0\" class=\"data row1 col0\" >Film_speed</td>\n",
157
+ " <td id=\"T_8f656_row1_col1\" class=\"data row1 col1\" >The standard specifies how speed ratings should be reported by the camera. If the noise-based speed (40:1) is higher than the saturation-based speed, the noise-based speed should be reported, rounded downwards to a standard value (e.g. 200, 250, 320, or 400). The rationale is that exposure according to the lower saturation-based speed would not result in a visibly better image. In addition, an exposure latitude can be specified, ranging from the saturation-based speed to the 10:1 noise-based speed. If the noise-based speed (40:1) is lower than the saturation-based speed, or undefined because of high noise, the saturation-based speed is specified, rounded upwards to a standard value, because using the noise-based speed would lead to overexposed images. The camera may also report the SOS-based speed (explicitly as being an SOS speed), rounded to the nearest standard speed rating.</td>\n",
158
+ " <td id=\"T_8f656_row1_col2\" class=\"data row1 col2\" >What is another speed that can also be reported by the camera?</td>\n",
159
+ " <td id=\"T_8f656_row1_col3\" class=\"data row1 col3\" >SOS-based speed</td>\n",
160
  " </tr>\n",
161
  " <tr>\n",
162
+ " <td id=\"T_8f656_row2_col0\" class=\"data row2 col0\" >Sumer</td>\n",
163
+ " <td id=\"T_8f656_row2_col1\" class=\"data row2 col1\" >The most impressive and famous of Sumerian buildings are the ziggurats, large layered platforms which supported temples. Sumerian cylinder seals also depict houses built from reeds not unlike those built by the Marsh Arabs of Southern Iraq until as recently as 400 CE. The Sumerians also developed the arch, which enabled them to develop a strong type of dome. They built this by constructing and linking several arches. Sumerian temples and palaces made use of more advanced materials and techniques,[citation needed] such as buttresses, recesses, half columns, and clay nails.</td>\n",
164
+ " <td id=\"T_8f656_row2_col2\" class=\"data row2 col2\" >Where were the use of advanced materials and techniques on display in Sumer?</td>\n",
165
+ " <td id=\"T_8f656_row2_col3\" class=\"data row2 col3\" >Sumerian temples and palaces</td>\n",
166
  " </tr>\n",
167
  " <tr>\n",
168
+ " <td id=\"T_8f656_row3_col0\" class=\"data row3 col0\" >Ann_Arbor,_Michigan</td>\n",
169
+ " <td id=\"T_8f656_row3_col1\" class=\"data row3 col1\" >Ann Arbor has a council-manager form of government. The City Council has 11 voting members: the mayor and 10 city council members. The mayor and city council members serve two-year terms: the mayor is elected every even-numbered year, while half of the city council members are up for election annually (five in even-numbered and five in odd-numbered years). Two council members are elected from each of the city's five wards. The mayor is elected citywide. The mayor is the presiding officer of the City Council and has the power to appoint all Council committee members as well as board and commission members, with the approval of the City Council. The current mayor of Ann Arbor is Christopher Taylor, a Democrat who was elected as mayor in 2014. Day-to-day city operations are managed by a city administrator chosen by the city council.</td>\n",
170
+ " <td id=\"T_8f656_row3_col2\" class=\"data row3 col2\" >Who is elected every even numbered year?</td>\n",
171
+ " <td id=\"T_8f656_row3_col3\" class=\"data row3 col3\" >mayor</td>\n",
172
  " </tr>\n",
173
  " <tr>\n",
174
+ " <td id=\"T_8f656_row4_col0\" class=\"data row4 col0\" >John_von_Neumann</td>\n",
175
+ " <td id=\"T_8f656_row4_col1\" class=\"data row4 col1\" >Shortly before his death, when he was already quite ill, von Neumann headed the United States government's top secret ICBM committee, and it would sometimes meet in his home. Its purpose was to decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon. Von Neumann had long argued that while the technical obstacles were sizable, they could be overcome in time. The SM-65 Atlas passed its first fully functional test in 1959, two years after his death. The feasibility of an ICBM owed as much to improved, smaller warheads as it did to developments in rocketry, and his understanding of the former made his advice invaluable.</td>\n",
176
+ " <td id=\"T_8f656_row4_col2\" class=\"data row4 col2\" >What was the purpose of top secret ICBM committee?</td>\n",
177
+ " <td id=\"T_8f656_row4_col3\" class=\"data row4 col3\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
178
  " </tr>\n",
179
  " <tr>\n",
180
+ " <td id=\"T_8f656_row5_col0\" class=\"data row5 col0\" >Pope_Paul_VI</td>\n",
181
+ " <td id=\"T_8f656_row5_col1\" class=\"data row5 col1\" >Some critiqued Paul VI's decision; the newly created Synod of Bishops had an advisory role only and could not make decisions on their own, although the Council decided exactly that. During the pontificate of Paul VI, five such synods took place, and he is on record of implementing all their decisions. Related questions were raised about the new National Bishop Conferences, which became mandatory after Vatican II. Others questioned his Ostpolitik and contacts with Communism and the deals he engaged in for the faithful.</td>\n",
182
+ " <td id=\"T_8f656_row5_col2\" class=\"data row5 col2\" >What conferences became a requirement after Vatican II?</td>\n",
183
+ " <td id=\"T_8f656_row5_col3\" class=\"data row5 col3\" >National Bishop Conferences</td>\n",
184
  " </tr>\n",
185
  " <tr>\n",
186
+ " <td id=\"T_8f656_row6_col0\" class=\"data row6 col0\" >Spectre_(2015_film)</td>\n",
187
+ " <td id=\"T_8f656_row6_col1\" class=\"data row6 col1\" >Bond and Swann return to London where they meet M, Bill Tanner, Q, and Moneypenny; they intend to arrest C and stop Nine Eyes from going online. Swann leaves Bond, telling him she cannot be part of a life involving espionage, and is subsequently kidnapped. On the way, the group is ambushed and Bond is kidnapped, but the rest still proceed with the plan. After Q succeeds in preventing the Nine Eyes from going online, a brief struggle between M and C ends with the latter falling to his death. Meanwhile, Bond is taken to the old MI6 building, which is scheduled for demolition, and frees himself. Moving throughout the ruined labyrinth, he encounters a disfigured Blofeld, who tells him that he has three minutes to escape the building before explosives are detonated or die trying to save Swann. Bond finds Swann and the two escape by boat as the building collapses. Bond shoots down Blofeld's helicopter, which crashes onto Westminster Bridge. As Blofeld crawls away from the wreckage, Bond confronts him but ultimately leaves him to be arrested by M. Bond leaves the bridge with Swann.</td>\n",
188
+ " <td id=\"T_8f656_row6_col2\" class=\"data row6 col2\" >Who does M fight with?</td>\n",
189
+ " <td id=\"T_8f656_row6_col3\" class=\"data row6 col3\" >C</td>\n",
190
  " </tr>\n",
191
  " <tr>\n",
192
+ " <td id=\"T_8f656_row7_col0\" class=\"data row7 col0\" >Antarctica</td>\n",
193
+ " <td id=\"T_8f656_row7_col1\" class=\"data row7 col1\" >About 1150 species of fungi have been recorded from Antarctica, of which about 750 are non-lichen-forming and 400 are lichen-forming. Some of these species are cryptoendoliths as a result of evolution under extreme conditions, and have significantly contributed to shaping the impressive rock formations of the McMurdo Dry Valleys and surrounding mountain ridges. The apparently simple morphology, scarcely differentiated structures, metabolic systems and enzymes still active at very low temperatures, and reduced life cycles shown by such fungi make them particularly suited to harsh environments such as the McMurdo Dry Valleys. In particular, their thick-walled and strongly melanized cells make them resistant to UV light. Those features can also be observed in algae and cyanobacteria, suggesting that these are adaptations to the conditions prevailing in Antarctica. This has led to speculation that, if life ever occurred on Mars, it might have looked similar to Antarctic fungi such as Cryomyces minteri. Some of these fungi are also apparently endemic to Antarctica. Endemic Antarctic fungi also include certain dung-inhabiting species which have had to evolve in response to the double challenge of extreme cold while growing on dung, and the need to survive passage through the gut of warm-blooded animals.</td>\n",
194
+ " <td id=\"T_8f656_row7_col2\" class=\"data row7 col2\" >How many species of fungi have been found on Antarctica?</td>\n",
195
+ " <td id=\"T_8f656_row7_col3\" class=\"data row7 col3\" >1150</td>\n",
196
  " </tr>\n",
197
  " <tr>\n",
198
+ " <td id=\"T_8f656_row8_col0\" class=\"data row8 col0\" >North_Carolina</td>\n",
199
+ " <td id=\"T_8f656_row8_col1\" class=\"data row8 col1\" >In the Battle of Cowan's Ford, Cornwallis met resistance along the banks of the Catawba River at Cowan's Ford on February 1, 1781, in an attempt to engage General Morgan's forces during a tactical withdrawal. Morgan had moved to the northern part of the state to combine with General Greene's newly recruited forces. Generals Greene and Cornwallis finally met at the Battle of Guilford Courthouse in present-day Greensboro on March 15, 1781. Although the British troops held the field at the end of the battle, their casualties at the hands of the numerically superior Continental Army were crippling. Following this \"Pyrrhic victory\", Cornwallis chose to move to the Virginia coastline to get reinforcements, and to allow the Royal Navy to protect his battered army. This decision would result in Cornwallis' eventual defeat at Yorktown, Virginia, later in 1781. The Patriots' victory there guaranteed American independence.</td>\n",
200
+ " <td id=\"T_8f656_row8_col2\" class=\"data row8 col2\" >After losing the battle of Guilford Courthouse, Cornawallis moved his troops where?</td>\n",
201
+ " <td id=\"T_8f656_row8_col3\" class=\"data row8 col3\" >Virginia coastline</td>\n",
202
  " </tr>\n",
203
  " <tr>\n",
204
+ " <td id=\"T_8f656_row9_col0\" class=\"data row9 col0\" >2008_Summer_Olympics_torch_relay</td>\n",
205
+ " <td id=\"T_8f656_row9_col1\" class=\"data row9 col1\" >The Olympic Torch is based on traditional scrolls and uses a traditional Chinese design known as \"Lucky Cloud\". It is made from aluminum. It is 72 centimetres high and weighs 985 grams. The torch is designed to remain lit in 65 kilometre per hour (37 mile per hour) winds, and in rain of up to 50 millimetres (2 inches) per hour. An ignition key is used to ignite and extinguish the flame. The torch is fueled by cans of propane. Each can will light the torch for 15 minutes. It is designed by a team from Lenovo Group. The Torch is designed in reference to the traditional Chinese concept of the 5 elements that make up the entire universe.</td>\n",
206
+ " <td id=\"T_8f656_row9_col2\" class=\"data row9 col2\" >What is the Olympic Torch made from?</td>\n",
207
+ " <td id=\"T_8f656_row9_col3\" class=\"data row9 col3\" >aluminum.</td>\n",
208
  " </tr>\n",
209
  " </tbody>\n",
210
  "</table>\n"
211
  ],
212
  "text/plain": [
213
+ "<pandas.io.formats.style.Styler at 0x32b08f050>"
214
  ]
215
  },
216
  "metadata": {},
 
219
  ],
220
  "source": [
221
  "np.random.seed(42)\n",
222
+ "# Select 10 random rows from data.df\n",
223
+ "dfSample = data.df.sample(n=10)\n",
 
 
 
 
224
  "display_text_df(dfSample)"
225
  ]
226
  },
 
233
  },
234
  {
235
  "cell_type": "code",
236
+ "execution_count": 5,
237
  "metadata": {},
238
  "outputs": [],
239
+ "source": []
 
 
 
240
  },
241
  {
242
  "cell_type": "markdown",
243
  "metadata": {},
244
  "source": [
245
+ "### Run the agent on the random sample of questions\n",
246
+ "\n",
247
+ "* Unlike the default Retrieval QA or Open Generative QA of SQuAD, in our use case, the agent would normally be given context in the course of a natural conversation, as the user elaborates on what they want to know. \n",
248
+ "* Therefore, for benchmarking, we will provide the context to answer the question in the prompt.\n",
249
+ "\n",
250
+ "### Use semantic similarity to evaluate the agent's answers against the reference answers\n",
251
+ "\n",
252
+ "* One flaw of this approach is that it does not take into account the existence of multiple acceptable answers.\n",
253
+ "* Another flaw is that the agent may be unfairly penalized for elaborating on the answer, while this benchmark focuses on only and exactly the one canonical answer given.\n"
254
  ]
255
  },
256
  {
257
  "cell_type": "code",
258
+ "execution_count": 6,
259
  "metadata": {},
260
  "outputs": [
261
  {
262
  "data": {
263
  "application/vnd.jupyter.widget-view+json": {
264
+ "model_id": "c8e4bf7687534df284b075480f4dba88",
265
  "version_major": 2,
266
  "version_minor": 0
267
  },
 
274
  }
275
  ],
276
  "source": [
277
+ "BENCHMARKS_DIR = \"benchmarks\"\n",
278
+ "\n",
279
+ "def benchmark_agent(agent, dfSample, name):\n",
280
+ " answers_ref, answers_pred = [], [] \n",
281
+ "\n",
282
+ " # Suppress logging from the agent, which can be quite verbose\n",
283
+ " agent.logger.setLevel(logging.CRITICAL)\n",
284
+ "\n",
285
+ " for title, context, question, answer in tqdm(dfSample.values):\n",
286
+ " class Output:\n",
287
+ " output: agent_types.AgentType | str = None\n",
288
+ "\n",
289
+ " prompt = f\"\"\"\n",
290
+ " Read the following document and answer the question.\n",
291
+ "\n",
292
+ " Document Title: {title}\n",
293
+ " Document Content: {context}\n",
294
+ "\n",
295
+ " Question: {question}\n",
296
+ " \"\"\"\n",
297
+ " answers_ref.append(answer)\n",
298
+ " final_answer = agent.run(prompt, stream=False, reset=True)\n",
299
+ " answers_pred.append(final_answer)\n",
300
  "\n",
301
+ " answers_ref = [str(answer) for answer in answers_ref]\n",
302
+ " answers_pred = [str(answer) for answer in answers_pred]\n",
303
  "\n",
304
+ " em = EmbeddingModelWrapper()\n",
305
+ " similarities = em.get_similarities(\n",
306
+ " em.get_embeddings( answers_pred ),\n",
307
+ " em.get_embeddings( answers_ref ),\n",
308
+ " )\n",
309
  "\n",
310
+ " dfAnswers = dfSample.copy()\n",
311
+ " dfAnswers[\"Predicted Answer\"] = answers_pred\n",
312
+ " dfAnswers[\"Similarity\"] = similarities\n",
313
  "\n",
314
+ " os.makedirs(BENCHMARKS_DIR, exist_ok=True)\n",
315
+ " dfAnswers.to_pickle(os.path.join(BENCHMARKS_DIR, f\"{name}.pkl\"))\n"
 
 
316
  ]
317
  },
318
  {
319
  "cell_type": "markdown",
320
  "metadata": {},
321
  "source": [
322
+ "### Set up and run the benchmarks"
 
 
 
323
  ]
324
  },
325
  {
326
  "cell_type": "code",
327
+ "execution_count": 7,
328
  "metadata": {},
329
  "outputs": [],
330
  "source": [
331
+ "from agent import get_agent\n",
 
332
  "\n",
333
+ "benchmarks = [\n",
334
+ " (get_agent(), \"baseline\"),\n",
335
+ "]\n",
336
  "\n",
337
+ "for agent, name in tqdm(benchmarks):\n",
338
+ " benchmark_agent(agent, dfSample, name)\n"
 
 
 
339
  ]
340
  },
341
  {
342
  "cell_type": "code",
343
+ "execution_count": 11,
344
  "metadata": {},
345
  "outputs": [
346
  {
347
  "data": {
348
  "text/html": [
349
  "<style type=\"text/css\">\n",
350
+ "#T_7cf04 th {\n",
351
  " text-align: left;\n",
352
  "}\n",
353
+ "#T_7cf04 td {\n",
354
  " text-align: left;\n",
355
  "}\n",
356
+ "#T_7cf04_row0_col0, #T_7cf04_row0_col1, #T_7cf04_row0_col2, #T_7cf04_row0_col3, #T_7cf04_row0_col4, #T_7cf04_row0_col5, #T_7cf04_row1_col0, #T_7cf04_row1_col1, #T_7cf04_row1_col2, #T_7cf04_row1_col3, #T_7cf04_row1_col4, #T_7cf04_row1_col5, #T_7cf04_row2_col0, #T_7cf04_row2_col1, #T_7cf04_row2_col2, #T_7cf04_row2_col3, #T_7cf04_row2_col4, #T_7cf04_row2_col5, #T_7cf04_row3_col0, #T_7cf04_row3_col1, #T_7cf04_row3_col2, #T_7cf04_row3_col3, #T_7cf04_row3_col4, #T_7cf04_row3_col5, #T_7cf04_row4_col0, #T_7cf04_row4_col1, #T_7cf04_row4_col2, #T_7cf04_row4_col3, #T_7cf04_row4_col4, #T_7cf04_row4_col5, #T_7cf04_row5_col0, #T_7cf04_row5_col1, #T_7cf04_row5_col2, #T_7cf04_row5_col3, #T_7cf04_row5_col4, #T_7cf04_row5_col5, #T_7cf04_row6_col0, #T_7cf04_row6_col1, #T_7cf04_row6_col2, #T_7cf04_row6_col3, #T_7cf04_row6_col4, #T_7cf04_row6_col5, #T_7cf04_row7_col0, #T_7cf04_row7_col1, #T_7cf04_row7_col2, #T_7cf04_row7_col3, #T_7cf04_row7_col4, #T_7cf04_row7_col5, #T_7cf04_row8_col0, #T_7cf04_row8_col1, #T_7cf04_row8_col2, #T_7cf04_row8_col3, #T_7cf04_row8_col4, #T_7cf04_row8_col5, #T_7cf04_row9_col0, #T_7cf04_row9_col1, #T_7cf04_row9_col2, #T_7cf04_row9_col3, #T_7cf04_row9_col4, #T_7cf04_row9_col5 {\n",
357
  " white-space: pre-wrap;\n",
358
  "}\n",
359
  "</style>\n",
360
+ "<table id=\"T_7cf04\">\n",
361
  " <thead>\n",
362
  " <tr>\n",
363
+ " <th id=\"T_7cf04_level0_col0\" class=\"col_heading level0 col0\" >Title</th>\n",
364
+ " <th id=\"T_7cf04_level0_col1\" class=\"col_heading level0 col1\" >Context</th>\n",
365
+ " <th id=\"T_7cf04_level0_col2\" class=\"col_heading level0 col2\" >Question</th>\n",
366
+ " <th id=\"T_7cf04_level0_col3\" class=\"col_heading level0 col3\" >Answer</th>\n",
367
+ " <th id=\"T_7cf04_level0_col4\" class=\"col_heading level0 col4\" >Predicted Answer</th>\n",
368
+ " <th id=\"T_7cf04_level0_col5\" class=\"col_heading level0 col5\" >Similarity</th>\n",
369
  " </tr>\n",
370
  " </thead>\n",
371
  " <tbody>\n",
372
  " <tr>\n",
373
+ " <td id=\"T_7cf04_row0_col0\" class=\"data row0 col0\" >Institute_of_technology</td>\n",
374
+ " <td id=\"T_7cf04_row0_col1\" class=\"data row0 col1\" >The world's first institution of technology or technical university with tertiary technical education is the Banská Akadémia in Banská Štiavnica, Slovakia, founded in 1735, Academy since December 13, 1762 established by queen Maria Theresa in order to train specialists of silver and gold mining and metallurgy in neighbourhood. Teaching started in 1764. Later the department of Mathematics, Mechanics and Hydraulics and department of Forestry were settled. University buildings are still at their place today and are used for teaching. University has launched the first book of electrotechnics in the world.</td>\n",
375
+ " <td id=\"T_7cf04_row0_col2\" class=\"data row0 col2\" >What year was the Banská Akadémia founded?</td>\n",
376
+ " <td id=\"T_7cf04_row0_col3\" class=\"data row0 col3\" >1735</td>\n",
377
+ " <td id=\"T_7cf04_row0_col4\" class=\"data row0 col4\" >1735</td>\n",
378
+ " <td id=\"T_7cf04_row0_col5\" class=\"data row0 col5\" >1.000000</td>\n",
379
  " </tr>\n",
380
  " <tr>\n",
381
+ " <td id=\"T_7cf04_row1_col0\" class=\"data row1 col0\" >Film_speed</td>\n",
382
+ " <td id=\"T_7cf04_row1_col1\" class=\"data row1 col1\" >The standard specifies how speed ratings should be reported by the camera. If the noise-based speed (40:1) is higher than the saturation-based speed, the noise-based speed should be reported, rounded downwards to a standard value (e.g. 200, 250, 320, or 400). The rationale is that exposure according to the lower saturation-based speed would not result in a visibly better image. In addition, an exposure latitude can be specified, ranging from the saturation-based speed to the 10:1 noise-based speed. If the noise-based speed (40:1) is lower than the saturation-based speed, or undefined because of high noise, the saturation-based speed is specified, rounded upwards to a standard value, because using the noise-based speed would lead to overexposed images. The camera may also report the SOS-based speed (explicitly as being an SOS speed), rounded to the nearest standard speed rating.</td>\n",
383
+ " <td id=\"T_7cf04_row1_col2\" class=\"data row1 col2\" >What is another speed that can also be reported by the camera?</td>\n",
384
+ " <td id=\"T_7cf04_row1_col3\" class=\"data row1 col3\" >SOS-based speed</td>\n",
385
+ " <td id=\"T_7cf04_row1_col4\" class=\"data row1 col4\" >saturation-based speed</td>\n",
386
+ " <td id=\"T_7cf04_row1_col5\" class=\"data row1 col5\" >0.555529</td>\n",
387
  " </tr>\n",
388
  " <tr>\n",
389
+ " <td id=\"T_7cf04_row2_col0\" class=\"data row2 col0\" >Sumer</td>\n",
390
+ " <td id=\"T_7cf04_row2_col1\" class=\"data row2 col1\" >The most impressive and famous of Sumerian buildings are the ziggurats, large layered platforms which supported temples. Sumerian cylinder seals also depict houses built from reeds not unlike those built by the Marsh Arabs of Southern Iraq until as recently as 400 CE. The Sumerians also developed the arch, which enabled them to develop a strong type of dome. They built this by constructing and linking several arches. Sumerian temples and palaces made use of more advanced materials and techniques,[citation needed] such as buttresses, recesses, half columns, and clay nails.</td>\n",
391
+ " <td id=\"T_7cf04_row2_col2\" class=\"data row2 col2\" >Where were the use of advanced materials and techniques on display in Sumer?</td>\n",
392
+ " <td id=\"T_7cf04_row2_col3\" class=\"data row2 col3\" >Sumerian temples and palaces</td>\n",
393
+ " <td id=\"T_7cf04_row2_col4\" class=\"data row2 col4\" >temples and palaces</td>\n",
394
+ " <td id=\"T_7cf04_row2_col5\" class=\"data row2 col5\" >0.726322</td>\n",
 
 
 
 
395
  " </tr>\n",
396
  " <tr>\n",
397
+ " <td id=\"T_7cf04_row3_col0\" class=\"data row3 col0\" >Ann_Arbor,_Michigan</td>\n",
398
+ " <td id=\"T_7cf04_row3_col1\" class=\"data row3 col1\" >Ann Arbor has a council-manager form of government. The City Council has 11 voting members: the mayor and 10 city council members. The mayor and city council members serve two-year terms: the mayor is elected every even-numbered year, while half of the city council members are up for election annually (five in even-numbered and five in odd-numbered years). Two council members are elected from each of the city's five wards. The mayor is elected citywide. The mayor is the presiding officer of the City Council and has the power to appoint all Council committee members as well as board and commission members, with the approval of the City Council. The current mayor of Ann Arbor is Christopher Taylor, a Democrat who was elected as mayor in 2014. Day-to-day city operations are managed by a city administrator chosen by the city council.</td>\n",
399
+ " <td id=\"T_7cf04_row3_col2\" class=\"data row3 col2\" >Who is elected every even numbered year?</td>\n",
400
+ " <td id=\"T_7cf04_row3_col3\" class=\"data row3 col3\" >mayor</td>\n",
401
+ " <td id=\"T_7cf04_row3_col4\" class=\"data row3 col4\" >The mayor is elected every even-numbered year.</td>\n",
402
+ " <td id=\"T_7cf04_row3_col5\" class=\"data row3 col5\" >0.493396</td>\n",
403
  " </tr>\n",
404
  " <tr>\n",
405
+ " <td id=\"T_7cf04_row4_col0\" class=\"data row4 col0\" >John_von_Neumann</td>\n",
406
+ " <td id=\"T_7cf04_row4_col1\" class=\"data row4 col1\" >Shortly before his death, when he was already quite ill, von Neumann headed the United States government's top secret ICBM committee, and it would sometimes meet in his home. Its purpose was to decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon. Von Neumann had long argued that while the technical obstacles were sizable, they could be overcome in time. The SM-65 Atlas passed its first fully functional test in 1959, two years after his death. The feasibility of an ICBM owed as much to improved, smaller warheads as it did to developments in rocketry, and his understanding of the former made his advice invaluable.</td>\n",
407
+ " <td id=\"T_7cf04_row4_col2\" class=\"data row4 col2\" >What was the purpose of top secret ICBM committee?</td>\n",
408
+ " <td id=\"T_7cf04_row4_col3\" class=\"data row4 col3\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
409
+ " <td id=\"T_7cf04_row4_col4\" class=\"data row4 col4\" >decide on the feasibility of building an ICBM large enough to carry a thermonuclear weapon</td>\n",
410
+ " <td id=\"T_7cf04_row4_col5\" class=\"data row4 col5\" >1.000000</td>\n",
411
  " </tr>\n",
412
  " <tr>\n",
413
+ " <td id=\"T_7cf04_row5_col0\" class=\"data row5 col0\" >Pope_Paul_VI</td>\n",
414
+ " <td id=\"T_7cf04_row5_col1\" class=\"data row5 col1\" >Some critiqued Paul VI's decision; the newly created Synod of Bishops had an advisory role only and could not make decisions on their own, although the Council decided exactly that. During the pontificate of Paul VI, five such synods took place, and he is on record of implementing all their decisions. Related questions were raised about the new National Bishop Conferences, which became mandatory after Vatican II. Others questioned his Ostpolitik and contacts with Communism and the deals he engaged in for the faithful.</td>\n",
415
+ " <td id=\"T_7cf04_row5_col2\" class=\"data row5 col2\" >What conferences became a requirement after Vatican II?</td>\n",
416
+ " <td id=\"T_7cf04_row5_col3\" class=\"data row5 col3\" >National Bishop Conferences</td>\n",
417
+ " <td id=\"T_7cf04_row5_col4\" class=\"data row5 col4\" >The National Bishop Conferences became mandatory after Vatican II.</td>\n",
418
+ " <td id=\"T_7cf04_row5_col5\" class=\"data row5 col5\" >0.442729</td>\n",
419
  " </tr>\n",
420
  " <tr>\n",
421
+ " <td id=\"T_7cf04_row6_col0\" class=\"data row6 col0\" >Spectre_(2015_film)</td>\n",
422
+ " <td id=\"T_7cf04_row6_col1\" class=\"data row6 col1\" >Bond and Swann return to London where they meet M, Bill Tanner, Q, and Moneypenny; they intend to arrest C and stop Nine Eyes from going online. Swann leaves Bond, telling him she cannot be part of a life involving espionage, and is subsequently kidnapped. On the way, the group is ambushed and Bond is kidnapped, but the rest still proceed with the plan. After Q succeeds in preventing the Nine Eyes from going online, a brief struggle between M and C ends with the latter falling to his death. Meanwhile, Bond is taken to the old MI6 building, which is scheduled for demolition, and frees himself. Moving throughout the ruined labyrinth, he encounters a disfigured Blofeld, who tells him that he has three minutes to escape the building before explosives are detonated or die trying to save Swann. Bond finds Swann and the two escape by boat as the building collapses. Bond shoots down Blofeld's helicopter, which crashes onto Westminster Bridge. As Blofeld crawls away from the wreckage, Bond confronts him but ultimately leaves him to be arrested by M. Bond leaves the bridge with Swann.</td>\n",
423
+ " <td id=\"T_7cf04_row6_col2\" class=\"data row6 col2\" >Who does M fight with?</td>\n",
424
+ " <td id=\"T_7cf04_row6_col3\" class=\"data row6 col3\" >C</td>\n",
425
+ " <td id=\"T_7cf04_row6_col4\" class=\"data row6 col4\" >C</td>\n",
426
+ " <td id=\"T_7cf04_row6_col5\" class=\"data row6 col5\" >1.000000</td>\n",
427
  " </tr>\n",
428
  " <tr>\n",
429
+ " <td id=\"T_7cf04_row7_col0\" class=\"data row7 col0\" >Antarctica</td>\n",
430
+ " <td id=\"T_7cf04_row7_col1\" class=\"data row7 col1\" >About 1150 species of fungi have been recorded from Antarctica, of which about 750 are non-lichen-forming and 400 are lichen-forming. Some of these species are cryptoendoliths as a result of evolution under extreme conditions, and have significantly contributed to shaping the impressive rock formations of the McMurdo Dry Valleys and surrounding mountain ridges. The apparently simple morphology, scarcely differentiated structures, metabolic systems and enzymes still active at very low temperatures, and reduced life cycles shown by such fungi make them particularly suited to harsh environments such as the McMurdo Dry Valleys. In particular, their thick-walled and strongly melanized cells make them resistant to UV light. Those features can also be observed in algae and cyanobacteria, suggesting that these are adaptations to the conditions prevailing in Antarctica. This has led to speculation that, if life ever occurred on Mars, it might have looked similar to Antarctic fungi such as Cryomyces minteri. Some of these fungi are also apparently endemic to Antarctica. Endemic Antarctic fungi also include certain dung-inhabiting species which have had to evolve in response to the double challenge of extreme cold while growing on dung, and the need to survive passage through the gut of warm-blooded animals.</td>\n",
431
+ " <td id=\"T_7cf04_row7_col2\" class=\"data row7 col2\" >How many species of fungi have been found on Antarctica?</td>\n",
432
+ " <td id=\"T_7cf04_row7_col3\" class=\"data row7 col3\" >1150</td>\n",
433
+ " <td id=\"T_7cf04_row7_col4\" class=\"data row7 col4\" >1150</td>\n",
434
+ " <td id=\"T_7cf04_row7_col5\" class=\"data row7 col5\" >1.000000</td>\n",
 
 
 
 
435
  " </tr>\n",
436
  " <tr>\n",
437
+ " <td id=\"T_7cf04_row8_col0\" class=\"data row8 col0\" >North_Carolina</td>\n",
438
+ " <td id=\"T_7cf04_row8_col1\" class=\"data row8 col1\" >In the Battle of Cowan's Ford, Cornwallis met resistance along the banks of the Catawba River at Cowan's Ford on February 1, 1781, in an attempt to engage General Morgan's forces during a tactical withdrawal. Morgan had moved to the northern part of the state to combine with General Greene's newly recruited forces. Generals Greene and Cornwallis finally met at the Battle of Guilford Courthouse in present-day Greensboro on March 15, 1781. Although the British troops held the field at the end of the battle, their casualties at the hands of the numerically superior Continental Army were crippling. Following this \"Pyrrhic victory\", Cornwallis chose to move to the Virginia coastline to get reinforcements, and to allow the Royal Navy to protect his battered army. This decision would result in Cornwallis' eventual defeat at Yorktown, Virginia, later in 1781. The Patriots' victory there guaranteed American independence.</td>\n",
439
+ " <td id=\"T_7cf04_row8_col2\" class=\"data row8 col2\" >After losing the battle of Guilford Courthouse, Cornawallis moved his troops where?</td>\n",
440
+ " <td id=\"T_7cf04_row8_col3\" class=\"data row8 col3\" >Virginia coastline</td>\n",
441
+ " <td id=\"T_7cf04_row8_col4\" class=\"data row8 col4\" >the Virginia coastline</td>\n",
442
+ " <td id=\"T_7cf04_row8_col5\" class=\"data row8 col5\" >0.948570</td>\n",
443
  " </tr>\n",
444
  " <tr>\n",
445
+ " <td id=\"T_7cf04_row9_col0\" class=\"data row9 col0\" >2008_Summer_Olympics_torch_relay</td>\n",
446
+ " <td id=\"T_7cf04_row9_col1\" class=\"data row9 col1\" >The Olympic Torch is based on traditional scrolls and uses a traditional Chinese design known as \"Lucky Cloud\". It is made from aluminum. It is 72 centimetres high and weighs 985 grams. The torch is designed to remain lit in 65 kilometre per hour (37 mile per hour) winds, and in rain of up to 50 millimetres (2 inches) per hour. An ignition key is used to ignite and extinguish the flame. The torch is fueled by cans of propane. Each can will light the torch for 15 minutes. It is designed by a team from Lenovo Group. The Torch is designed in reference to the traditional Chinese concept of the 5 elements that make up the entire universe.</td>\n",
447
+ " <td id=\"T_7cf04_row9_col2\" class=\"data row9 col2\" >What is the Olympic Torch made from?</td>\n",
448
+ " <td id=\"T_7cf04_row9_col3\" class=\"data row9 col3\" >aluminum.</td>\n",
449
+ " <td id=\"T_7cf04_row9_col4\" class=\"data row9 col4\" >aluminum</td>\n",
450
+ " <td id=\"T_7cf04_row9_col5\" class=\"data row9 col5\" >0.973508</td>\n",
451
  " </tr>\n",
452
  " </tbody>\n",
453
  "</table>\n"
454
  ],
455
  "text/plain": [
456
+ "<pandas.io.formats.style.Styler at 0x33e917d10>"
457
  ]
458
  },
459
  "metadata": {},
460
  "output_type": "display_data"
 
 
 
 
 
 
 
461
  }
462
  ],
463
  "source": [
464
+ "# Load and display all benchmarks\n",
465
+ "def load_benchmarks():\n",
466
+ " benchmarks_dir = \"benchmarks\"\n",
467
+ " benchmarks = []\n",
468
+ " for file in os.listdir(benchmarks_dir):\n",
469
+ " if file.endswith(\".pkl\"):\n",
470
+ " df = pd.read_pickle(os.path.join(benchmarks_dir, file))\n",
471
+ " benchmarks.append(df)\n",
472
+ " return benchmarks\n",
473
+ "\n",
474
+ "benchmarks = load_benchmarks()\n",
475
+ "\n",
476
+ "for benchmark in benchmarks:\n",
477
+ " display_text_df(benchmark)\n"
478
  ]
479
  },
480
+ {
481
+ "cell_type": "code",
482
+ "execution_count": 10,
483
+ "metadata": {},
484
+ "outputs": [],
485
+ "source": []
486
+ },
487
  {
488
  "cell_type": "code",
489
  "execution_count": null,
benchmarks/baseline.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eac426004c5fb5452866d7d767c3ee286d01e3ade51497a9003a255594c70ae7
3
+ size 10430
data.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  import chromadb
 
4
  from llama_index.core import VectorStoreIndex
5
  from llama_index.vector_stores.chroma import ChromaVectorStore
6
  from llama_index.core import StorageContext
@@ -52,8 +53,7 @@ class Data:
52
  with open('data/train-v1.1.json', 'r') as f:
53
  raw_data = json.load(f)
54
 
55
- extracted_question = []
56
- extracted_answer = []
57
  documents = []
58
 
59
  for data in raw_data['data']:
@@ -67,8 +67,7 @@ class Data:
67
  if ans['text'] not in answers:
68
  answers.append(ans['text'])
69
  for answer in answers:
70
- extracted_question.append(question)
71
- extracted_answer.append(answer)
72
 
73
  doc = f"""
74
  Title: {title}
@@ -81,8 +80,8 @@ class Data:
81
  doc = "\n".join([line.strip() for line in doc.split("\n")])
82
  documents.append(doc)
83
 
 
84
  self.documents = [Document(text=t) for t in documents]
85
- self.question_answer_pairs = list(zip(extracted_question, extracted_answer))
86
 
87
  print("Raw Data loaded")
88
 
 
1
  import os
2
  import json
3
  import chromadb
4
+ import pandas as pd
5
  from llama_index.core import VectorStoreIndex
6
  from llama_index.vector_stores.chroma import ChromaVectorStore
7
  from llama_index.core import StorageContext
 
53
  with open('data/train-v1.1.json', 'r') as f:
54
  raw_data = json.load(f)
55
 
56
+ raw_documents = []
 
57
  documents = []
58
 
59
  for data in raw_data['data']:
 
67
  if ans['text'] not in answers:
68
  answers.append(ans['text'])
69
  for answer in answers:
70
+ raw_documents.append([title, context, question, answer])
 
71
 
72
  doc = f"""
73
  Title: {title}
 
80
  doc = "\n".join([line.strip() for line in doc.split("\n")])
81
  documents.append(doc)
82
 
83
+ self.df = pd.DataFrame(raw_documents, columns=["Title", "Context", "Question", "Answer"])
84
  self.documents = [Document(text=t) for t in documents]
 
85
 
86
  print("Raw Data loaded")
87