File size: 93,632 Bytes
4acec17
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
d597087
4acec17
d597087
4acec17
d597087
4acec17
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
d597087
 
 
 
 
 
 
 
 
 
 
4acec17
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
d597087
4acec17
 
 
 
 
 
 
 
 
 
 
d597087
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4acec17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install langchain langchain_community langchain_openai pypdf langsmith qdrant-client ragas pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import openai\n",
    "from getpass import getpass\n",
    "\n",
    "openai.api_key = getpass(\"Please provide your OpenAI Key: \")\n",
    "os.environ[\"OPENAI_API_KEY\"] = openai.api_key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "test_df = pd.read_csv(\"synthetic_midterm_question_dataset.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_questions = test_df[\"question\"].values.tolist()\n",
    "test_groundtruths = test_df[\"ground_truth\"].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from langchain_openai import OpenAIEmbeddings\n",
    "from langchain_community.vectorstores.chroma import Chroma\n",
    "from langchain_openai import ChatOpenAI\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain_community.vectorstores import Qdrant\n",
    "from langchain.memory import ConversationBufferMemory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_paths = [\"/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf\",\n",
    "\"/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "pdf_documents = []\n",
    "for pdf_path in pdf_paths:\n",
    "    loader = PyPDFLoader(pdf_path)\n",
    "    pdf_documents.extend(loader.load())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=2000,\n",
    "        chunk_overlap=100,\n",
    "    )\n",
    "pdf_docs = text_splitter.split_documents(pdf_documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "embedding = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "baseline_metrics = pd.read_csv(\"medium_chunk_metrics.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>MediumChunk</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.895359</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.955419</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.934028</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.937500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.629267</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  MediumChunk\n",
       "0        faithfulness     0.895359\n",
       "1    answer_relevancy     0.955419\n",
       "2      context_recall     0.934028\n",
       "3   context_precision     0.937500\n",
       "4  answer_correctness     0.629267"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "baseline_metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "baseline_metrics.rename(columns={'MediumChunk': 'Baseline'}, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorstore = Qdrant.from_documents(\n",
    "    documents=pdf_docs,\n",
    "    embedding=embedding,\n",
    "    location=\":memory:\",\n",
    "    collection_name=\"Midterm Eval\"\n",
    ")\n",
    "\n",
    "retriever = vectorstore.as_retriever(\n",
    "    search_type=\"mmr\",\n",
    "    search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
    ")\n",
    "\n",
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers.multi_query import MultiQueryRetriever\n",
    "\n",
    "retriever_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)\n",
    "multiquery_retriever = MultiQueryRetriever.from_llm(\n",
    "    retriever=retriever, llm=retriever_llm\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = ChatOpenAI(\n",
    "    model=\"gpt-4o-mini\",\n",
    "    temperature=0,\n",
    "    streaming=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "custom_template = \"\"\"\n",
    "You are an expert in artificial intelligence policy, ethics, and industry trends. Your task is to provide clear and accurate answers to questions related to AI's role in politics, government regulations, and its ethical implications for enterprises. Use reliable and up-to-date information from government documents, industry reports, and academic research to inform your responses. Make sure to consider how AI is evolving, especially in relation to the current political landscape, and provide answers in a way that is easy to understand for both AI professionals and non-experts.\n",
    "\n",
    "Remember these key points:\n",
    "1. Use \"you\" when addressing the user and \"I\" when referring to yourself.\n",
    "2. If you encounter complex or legal language in the context, simplify it for easy understanding. Imagine you're explaining it to someone who isn't familiar with legal terms.\n",
    "3. Be prepared for follow-up questions and maintain context from previous exchanges.\n",
    "4. If there's no information from a retrieved document in the context to answer a question or if there are no documents to cite, say: \"I'm sorry, I don't know the answer to that question.\"\n",
    "5. When providing information, always cite the source document and page number in parentheses at the end of the relevant sentence or paragraph, like this: (Source: [document name], p. [page number]).\n",
    "\n",
    "Here are a few example questions you might receive:\n",
    "\n",
    "How are governments regulating AI, and what new policies have been implemented?\n",
    "What are the ethical risks of using AI in political decision-making?\n",
    "How can enterprises ensure their AI applications meet government ethical standards?\n",
    "\n",
    "One final rule for you to remember. You CANNOT under any circumstance, answer any question that does not pertain to the AI. If you do answer an out-of-scope question, you could lose your job. If you are asked a question that does not have to do with AI, you must say: \"I'm sorry, I don't know the answer to that question.\"\n",
    "Context: {context}\n",
    "Chat History: {chat_history}\n",
    "Human: {question}\n",
    "AI:\"\"\"\n",
    "\n",
    "PROMPT = PromptTemplate(\n",
    "    template=custom_template, input_variables=[\"context\", \"question\", \"chat_history\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "multiquery_rag_chain = ConversationalRetrievalChain.from_llm(\n",
    "        llm,\n",
    "        retriever=multiquery_retriever,\n",
    "        memory=memory,\n",
    "        combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
    "        return_source_documents=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'What are Trustworthy AI Characteristics?',\n",
       " 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
       "  AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).')],\n",
       " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).',\n",
       " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='There may also be concerns  about  emotional entanglement  between humans and GAI systems, which \\ncould lead to negative psychological impacts . \\nTrustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with \\nHarmful Bias Managed, Privacy Enhanced, Safe , Valid and Reliable  \\n2.8. Information Integrity  \\nInformation integrity  describes the “ spectrum of information and associated patterns of its creation, \\nexchange, and consumption in society .” High-integrity information can be trusted; “distinguishes fact \\nfrom fiction, opinion, and inference; acknowledges uncertainties; and is transparent about its level of \\nvetting. This information can be linked to the original source(s) with appropriate evidence. High- integrity \\ninformation is also accurate and reliable, can be verified and  authenticated, has a clear chain of custody, \\nand creates reasonable expectations about when its validity  may expire. ”11 \\n \\n \\n11 This definition of information integrity is derived from the  2022 White House Roadmap for Researchers on \\nPriorities Related to Information Integrity Research and Development.'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 0, '_id': '8bad320d25b64bae949445cf2c427d18', '_collection_name': 'Midterm Eval'}, page_content='NIST Trustworthy and Responsible AI  \\nNIST AI 600 -1 \\nArtificial Intelligence Risk Management \\nFramework: Generative Artificial \\nIntelligence Profile \\n \\n  \\nThis publication is available free of charge from:  \\nhttps://doi.org/10.6028/NIST.AI.600 -1'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 21, '_id': '91c6953f48734236907d2797a0c07971', '_collection_name': 'Midterm Eval'}, page_content=\"SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nSome U.S government agencies have developed specific frameworks for ethical use of AI \\nsystems. The Department of Energy (DOE) has activated the AI Advancement Council that oversees coordina -\\ntion and advises on implementation of the DOE AI Strategy and addresses issues and/or escalations on the \\nethical use and development of AI systems.20 The Department of Defense has adopted Artificial Intelligence \\nEthical Principles, and tenets for Responsible Artificial Intelligence specifically tailored to its national \\nsecurity and defense activities.21 Similarl y, the U.S. Intelligence Community (IC) has developed the Principles \\nof Artificial Intelligence Ethics for the Intelligence Community to guide personnel on whether and how to \\ndevelop and use AI in furtherance of the IC's mission, as well as an AI Ethics Framework to help implement \\nthese principles.22\\nThe National Science Foundation (NSF) funds extensive research to help foster the \\ndevelopment of automated systems that adhere to and advance their safety, security and \\neffectiveness. 
Multiple NSF programs support research that directly addresses many of these principles: \\nthe National AI Research Institutes23 support research on all aspects of safe, trustworth y, fai r, and explainable \\nAI algorithms and systems; the Cyber Physical Systems24 program supports research on developing safe \\nautonomous and cyber physical systems with AI components; the Secure and Trustworthy Cyberspace25 \\nprogram supports research on cybersecurity and privacy enhancing technologies in automated systems; the \\nFormal Methods in the Field26 program supports research on rigorous formal verification and analysis of\"),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 60, '_id': '30c836d6cf9a481c9cf48c580209d301', '_collection_name': 'Midterm Eval'}, page_content='57 National Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix B: \\nHow AI Risks Differ from Traditional Software Risks . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_B  \\nNational Institute of Standards and Technology  (2023) AI RMF Playbook . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/Playbook  \\nNational Institue of Standards and Technology (2023) Framing Risk \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/1- sec-risk \\nNational Institu te of Standards and Technology (2023) The Language of Trustworthy AI: An In- Depth \\nGlossary of Terms https://airc.nist.gov/AI_RMF_Knowledge_Base/Glossary  \\nNational Institue of Standards and Technology (2022) Towards a Standard for Identifying and Managing \\nBias in Artificial Intelligence https://www.nist.gov/publications/towards -standard -identifying -and-\\nmanaging- bias-artificial -intelligence  \\nNorthcutt, C. et al. (2021) Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks.  \\narXiv . https://arxiv.org/pdf/2103.14749  \\nOECD (2023) \"Advancing accountability in AI: Governing and managing risks throughout the lifecycle for \\ntrustworthy AI\", OECD Digital Economy Papers , No. 349, OECD Publishing,  Paris . \\nhttps://doi.org/10.1787/2448f04b- en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers , No. 16, OECD \\nPublishing, Paris . https://doi.org/10.1787/d1a8d965- en \\nOpenAI  (2023) GPT-4 System Card . https://cdn.openai.com/papers/gpt -4-system -card.pdf  \\nOpenAI  (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774  \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity?  ICLR . 
\\nhttps://arxiv.org/pdf/2309.05196  \\nPark,  P.  et. al. (2024)  AI deception: A survey of  examples, risks, and potential solutions. Patterns, 5(5).  \\narXiv . https://arxiv.org/pdf/2308.14752'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': '8c7c9954577b47f390a8374bb6582294', '_collection_name': 'Midterm Eval'}, page_content='SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nExecutive Order 13960 on Promoting the Use of Trustworthy Artificial Intelligence in the \\nFederal Government requires that certain federal agencies adhere to nine principles when \\ndesigning, developing, acquiring, or using AI for purposes other than national security or \\ndefense. These principles—while taking into account the sensitive law enforcement and other contexts in which \\nthe federal government may use AI, as opposed to private sector use of AI—require that AI is: (a) lawful and \\nrespectful of our Nation’s values; (b) purposeful and performance-driven; (c) accurate, reliable, and effective; (d) \\nsafe, secure, and resilient; (e) understandable; (f ) responsible and traceable; (g) regularly monitored; (h) transpar -\\nent; and, (i) accountable. The Blueprint for an AI Bill of Rights is consistent with the Executive Order. \\nAffected agencies across the federal government have released AI use case inventories13 and are implementing \\nplans to bring those AI systems into compliance with the Executive Order or retire them. \\nThe law and policy landscape for motor vehicles shows that strong safety regulations—and \\nmeasures to address harms when they occur—can enhance innovation in the context of com-\\nplex technologies. Cars, like automated digital systems, comprise a complex collection of components. 
\\nThe National Highway Traffic Safety Administration,14 through its rigorous standards and independent \\nevaluation, helps make sure vehicles on our roads are safe without limiting manufacturers’ ability to \\ninnovate.15 At the same time, rules of the road are implemented locally to impose contextually appropriate \\nrequirements on drivers, such as slowing down near schools or playgrounds.16'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of \\nharmful uses. The NIST framework will consider and encompass principles such as \\ntransparency, accountability, and fairness during pre-design, design and development, deployment, use, \\nand testing and evaluation of AI technologies and systems. It is expected to be released in the winter of 2022-23. \\n21'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 63, '_id': '4b61d1ab875c4c3a94bdbe35b3e8b18a', '_collection_name': 'Midterm Eval'}, page_content='www.analyticsinsight.net/top-progressive-companies-building-ethical-ai-to-look-out-for-\\nin-2021/ https://www.technologyreview.com/2021/01/15/1016183/ai-ethics-startups/; Disha Sinha. Top\\nProgressive Companies Building Ethical AI to Look Out for in 2021. Analytics Insight . June 30, 2021.\\n18.Office of Management and Budget. Study to Identify Methods to Assess Equity: Report to the President .\\nAug. 2021. https://www.whitehouse.gov/wp-content/uploads/2021/08/OMB-Report-on-E013985-\\nImplementation_508-Compliant-Secure-v1.1.pdf\\n19.National Institute of Standards and Technology. AI Risk Management Framework. Accessed May 23,\\n2022. https://www.nist.gov/itl/ai-risk-management-framework\\n20. U.S. Department of Energy. U.S. Department of Energy Establishes Artificial Intelligence Advancement\\nCouncil. U.S. Department of Energy Artificial Intelligence and Technology Office. April 18, 2022. https://\\nwww.energy.gov/ai/articles/us-department-energy-establishes-artificial-intelligence-advancement-council\\n21.Department of Defense. U.S Department of Defense Responsible Artificial Intelligence Strategy and\\nImplementation Pathway. Jun. 2022. https://media.defense.gov/2022/Jun/22/2003022604/-1/-1/0/\\nDepartment-of-Defense-Responsible-Artificial-Intelligence-Strategy-and-Implementation-\\nPathway.PDF\\n22. Director of National Intelligence. Principles of Artificial Intelligence Ethics for the Intelligence\\nCommunity. https://www.dni.gov/index.php/features/2763-principles-of-artificial-intelligence-ethics-for-\\nthe-intelligence-community\\n64'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 50, '_id': '02d90efce1624306ab5bf45d1f8cb1db', '_collection_name': 'Midterm Eval'}, page_content='warrant additional human review, tracking and documentation, and greater management oversight.  \\nAI technology can produce varied outputs  in multiple modalities and present many classes of user \\ninterfaces. This leads to a broader set of AI Actors  interacting with GAI systems for widely differing \\napplications and contexts of use. These  can include data labeling and preparation, development of GAI \\nmodels, content moderation, code generation and review, text generation and editing, image and video \\ngeneration, summarization, search, and chat. These activities can take place within organizational \\nsettings or in the public domain.  \\nOrganizations can restrict AI applications that cause harm, exceed stated risk tolerances, or that conflict with their tolerances or values. Governance tools and protocols that are applied to other types of AI systems can be applied to GAI systems. These p lans and actions include: \\n• Accessibility and reasonable accommodations  \\n• AI actor credentials and qualifications  \\n• Alignment to organizational values  • Auditing and assessment  \\n• Change -management controls  \\n• Commercial use  \\n• Data provenance'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics:  Accountable and Transparent, Privacy Enhanced, Safe, Secure and \\nResilient  \\n2.5. Environmental Impacts  \\nTraining, maint aining, and operating (running inference  on) GAI systems are resource -intensive  activities , \\nwith potentially large energy and environmental footprints. Energy and carbon emissions vary  based on \\nwhat is being done with the  GAI model (i.e., pre -training, fine -tuning, inference), the modality  of the \\ncontent , hardware used, and type of task or application . \\nCurrent e stimates suggest that training a single transformer  LLM can emit as much carbon  as 300 round-\\ntrip flights between San Francisco and New York.  In a study comparing energy consumption and carbon \\nemissions for LLM inference, generative tasks ( e.g., text summarization) were found to be more energy - \\nand carbon -i ntensive th an discriminative or non- generative tasks  (e.g., text classification).  \\nMethods for creating smaller versions of train ed models, such as model distillation or compression, \\ncould reduce  environmental impacts at inference time, but training and tuning such models may still \\ncontribute to their environmental impacts . Currently there  is no agreed upon method to estimate \\nenvironmental impacts  from GAI .  \\nTrustworthy AI Characteristics:  Accountable and Transparent, Safe  \\n2.6. Harmful Bias and Homogenization  \\nBias exists in many forms  and can become ingrained in automated systems. AI systems , including GAI \\nsystems,  can increase the speed and scale at which harmful biases manifest  and are acted upon, \\npotentially  perpetuati ng and amplify ing harms to individuals, groups, communities, organizations, and \\nsociety . 
For example,  when prompted to generate images of CEOs, doctors, lawyers, and judges, current \\ntext-to-image models underrepresent  women and/or racial minorities , and people with disabilities . \\nImage generator models have also produce d biased or stereotyped output for  various demographic')]}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiquery_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invoke multiquery_rag_chain once per test question and collect, in order,\n",
    "# the chain's answer and the page_content of every source document it returned.\n",
    "answers, contexts = [], []\n",
    "\n",
    "for question in test_questions:\n",
    "    response = multiquery_rag_chain.invoke({\"question\": question})\n",
    "    answers.append(response[\"answer\"])\n",
    "    contexts.append([doc.page_content for doc in response[\"source_documents\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import Dataset\n",
    "\n",
    "# Column-oriented evaluation set: each column is a list aligned by question index,\n",
    "# pairing the collected answers and contexts with the reference ground truths.\n",
    "multiquery_dataset = Dataset.from_dict(\n",
    "    {\n",
    "        \"question\": test_questions,\n",
    "        \"answer\": answers,\n",
    "        \"contexts\": contexts,\n",
    "        \"ground_truth\": test_groundtruths,\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
       " 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect them. This transparency helps build trust between the public and the organizations deploying these systems.\\n\\n2. **Informed Consent**: By notifying individuals about the use of automated systems, organizations allow people to make informed choices about their engagement with these systems. This is particularly important in sensitive areas like healthcare, finance, and law enforcement.\\n\\n3. **Accountability**: Clear notice and explanations hold organizations accountable for their automated decisions. If individuals understand how decisions are made, they can better contest or appeal those decisions if they believe they are unjust or incorrect.\\n\\n4. **Protection of Rights**: Legal requirements for notice and explanation help protect individuals' rights by ensuring they have access to information about how their data is used and how decisions that impact them are made. This is crucial in preventing discrimination and ensuring fair treatment.\\n\\n5. **Facilitating Recourse**: When individuals receive timely and understandable explanations, they are better equipped to seek recourse if they feel wronged by an automated decision. This can include appealing decisions or requesting human intervention.\\n\\n6. **Encouraging Ethical Use**: Legal requirements can encourage organizations to develop and implement automated systems ethically, ensuring that they consider the potential impacts on individuals and society as a whole.\\n\\nOverall, these requirements aim to create a framework where automated systems are used responsibly, with respect for individuals' rights and dignity (Source: [document name], p. [page number]).\",\n",
       " 'contexts': [\"Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. Techniques used to automate the process of explaining such systems are under active research and improvement and such explanations can take many forms. Innovative companies and researchers are rising to the challenge and creating and deploying explanatory systems that can help the public better understand decisions that impact them. \\nWhile notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. \\n• A lawyer representing an older client with disabilities who had been cut off from Medicaid-funded home\\nhealth-care assistance couldn't determine why\\n, especially since the decision went against historical access\\npractices. 
In a court hearing, the lawyer learned from a witness that the state in which the older client\\nlived \\nhad recently adopted a new algorithm to determine eligibility.83 The lack of a timely explanation made it\\nharder \\nto understand and contest the decision.\\n•\\nA formal child welfare investigation is opened against a parent based on an algorithm and without the parent\\never \\nbeing notified that data was being collected and used as part of an algorithmic child maltreatment\\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\\ncontest a decision.\\n41\",\n",
       "  'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nTailored to the level of risk. An assessment should be done to determine the level of risk of the auto -\\nmated system. In settings where the consequences are high as determined by a risk assessment, or extensive \\noversight is expected (e.g., in criminal justice or some public sector settings), explanatory mechanisms should be built into the system design so that the system’s full behavior can be explained in advance (i.e., only fully transparent models should be used), rather than as an after-the-decision interpretation. In other settings, the extent of explanation provided should be tailored to the risk level. \\nValid. The explanation provided by a system should accurately reflect the factors and the influences that led \\nto a particular decision, and should be meaningful for the particular customization based on purpose, target, and level of risk. While approximation and simplification may be necessary for the system to succeed based on the explanatory purpose and target of the explanation, or to account for the risk of fraud or other concerns related to revealing decision-making information, such simplifications should be done in a scientifically supportable way. Where appropriate based on the explanatory system, error ranges for the explanation should be calculated and included in the explanation, with the choice of presentation of such information balanced with usability and overall interface complexity concerns. \\nDemonstrate protections for notice and explanation \\nReporting. Summary reporting should document the determinations made based on the above consider -',\n",
       "  'should not be used in education, work, housing, or in other contexts where the use of such surveillance \\ntechnologies is likely to limit rights, opportunities, or access. Whenever possible, you should have access to \\nreporting that confirms your data decisions have been respected and provides an assessment of the \\npotential impact of surveillance technologies on your rights, opportunities, or access. \\nNOTICE AND EXPLANATION\\nYou should know that an automated system is being used and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organiza\\n-\\ntion responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible. \\n6',\n",
       "  'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nAn automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and \\nexplanations as to how and why a decision was made or an action was taken by the system. These expectations are explained below. \\nProvide clear, timely, understandable, and accessible notice of use and explanations \\nGenerally accessible plain language documentation. The entity responsible for using the automated \\nsystem should ensure that documentation describing the overall system (including any human components) is \\npublic and easy to find. The documentation should describe, in plain language, how the system works and how \\nany automated component is used to determine an action or decision. It should also include expectations about \\nreporting described throughout this framework, such as the algorithmic impact assessments described as \\npart of Algorithmic Discrimination Protections. \\nAccount able. Notices should clearly identify the entity r esponsible for designing each component of the \\nsystem and the entity using it. \\nTimely and up-to-date. Users should receive notice of the use of automated systems in advance of using or \\nwhile being impacted by the technolog y. An explanation should be available with the decision itself, or soon \\nthereafte r. Notice should be kept up-to-date and people impacted by the system should be notified of use case \\nor key functionality changes. \\nBrief and clear. Notices and explanations should be assessed, such as by research on users’ experiences, \\nincluding user testing, to ensure that the people using or impacted by the automated system are able to easily',\n",
       "  'burdensome in both the process of requesting to opt-out and the human-driven alternative provided. \\nProvide timely human consideration and remedy by a fallback and escalation system in the event that an automated system fails, produces error, or you would like to appeal or con\\n-\\ntest its impacts on you \\nProportionate. The availability of human consideration and fallback, along with associated training and \\nsafeguards against human bias, should be proportionate to the potential of the automated system to meaning -\\nfully impact rights, opportunities, or access. Automated systems that have greater control over outcomes, provide input to high-stakes decisions, relate to sensitive domains, or otherwise have a greater potential to meaningfully impact rights, opportunities, or access should have greater availability (e.g., staffing) and over\\n-\\nsight of human consideration and fallback mechanisms. \\nAccessible. Mechanisms for human consideration and fallback, whether in-person, on paper, by phone, or \\notherwise provided, should be easy to find and use. These mechanisms should be tested to ensure that users who have trouble with the automated system are able to use human consideration and fallback, with the under\\n-\\nstanding that it may be these users who are most likely to need the human assistance. Similarly, it should be tested to ensure that users with disabilities are able to find and use human consideration and fallback and also request reasonable accommodations or modifications. \\nConvenient. Mechanisms for human consideration and fallback should not be unreasonably burdensome as \\ncompared to the automated system’s equivalent. \\n49',\n",
       "  'You should know that an automated system is being used, \\nand understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automat\\n-\\ned systems should provide generally accessible plain language docu -\\nmentation including clear descriptions of the overall system func -\\ntioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and ex\\n-\\nplanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality chang\\n-\\nes. You should know how and why an outcome impacting you was de -\\ntermined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible.   NOTICE AND  EXPLANATION\\n40',\n",
       "  'HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFALLBACK \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nThere are many reasons people may prefer not to use an automated system: the system can be flawed and can lead to \\nunintended outcomes; it may reinforce bias or be inaccessible; it may simply be inconvenient or unavailable; or it may replace a paper or manual process to which people had grown accustomed. Yet members of the public are often presented with no alternative, or are forced to endure a cumbersome process to reach a human decision-maker once they decide they no longer want to deal exclusively with the automated system or be impacted by its results. As a result of this lack of human reconsideration, many receive delayed access, or lose access, to rights, opportunities, benefits, and critical services. The American public deserves the assurance that, when rights, opportunities, or access are meaningfully at stake and there is a reasonable expectation of an alternative to an automated system, they can conve\\n-\\nniently opt out of an automated system and will not be disadvantaged for that choice. In some cases, such a human or other alternative may be required by law, for example it could be required as “reasonable accommodations” for people with disabilities.',\n",
       "  \"find notices and explanations, read them quickl y, and understand and act on them. This includes ensuring that \\nnotices and explanations are accessible to users with disabilities and are available in the language(s) and read-\\ning level appropriate for the audience. Notices and explanations may need to be available in multiple forms, \\n(e.g., on pape r, on a physical sign, or online), in order to meet these expectations and to be accessible to the \\nAmerican public. \\nProvide explanations as to how and why a decision was made or an action was taken by an \\nautomated system \\nTailored to the purpose. Explanations should be tailored to the specific purpose for which the user is \\nexpected to use the explanation, and should clearly state that purpose. An informational explanation might differ from an explanation provided to allow for the possibility of recourse, an appeal, or one provided in the context of a dispute or contestation process. For the purposes of this framework, 'explanation' should be construed broadly. An explanation need not be a plain-language statement about causality but could consist of any mechanism that allows the recipient to build the necessary understanding and intuitions to achieve the stated purpose. Tailoring should be assessed (e.g., via user experience research). \\nTailored to the target of the explanation. Explanations should be targeted to specific audiences and clearly state that audience. An explanation provided to the subject of a decision might differ from one provided to an advocate, or to a domain expert or decision maker. Tailoring should be assessed (e.g., via user experience research). \\n43\"],\n",
       " 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiquery_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ragas import evaluate\n",
    "from ragas.metrics import (\n",
    "    faithfulness,\n",
    "    answer_relevancy,\n",
    "    answer_correctness,\n",
    "    context_recall,\n",
    "    context_precision,\n",
    ")\n",
    "\n",
    "# Metrics handed to evaluate(); the list order is kept exactly as before.\n",
    "metrics = [faithfulness, answer_relevancy, context_recall, context_precision, answer_correctness]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f5257aea40624e62905d488461b543db",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "multiquery_results = evaluate(multiquery_dataset, metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'faithfulness': 0.8968, 'answer_relevancy': 0.9532, 'context_recall': 0.8906, 'context_precision': 0.9207, 'answer_correctness': 0.6901}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiquery_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>contexts</th>\n",
       "      <th>answer</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>context_precision</th>\n",
       "      <th>answer_correctness</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What is the significance of providing notice a...</td>\n",
       "      <td>[Providing notice has long been a standard pra...</td>\n",
       "      <td>Providing notice and explanation as a legal re...</td>\n",
       "      <td>Providing notice and explanation as a legal re...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.971321</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.821299</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>How can structured human feedback exercises, s...</td>\n",
       "      <td>[50 Participatory Engagement Methods  \\nOn an ...</td>\n",
       "      <td>Structured human feedback exercises, such as G...</td>\n",
       "      <td>Structured human feedback exercises, such as G...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.992832</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.541222</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How do measurement gaps between laboratory and...</td>\n",
       "      <td>[49 early lifecycle TEVV approaches are develo...</td>\n",
       "      <td>Measurement gaps between laboratory and real-w...</td>\n",
       "      <td>Measurement gaps between laboratory and real-w...</td>\n",
       "      <td>0.958333</td>\n",
       "      <td>0.988752</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.876667</td>\n",
       "      <td>0.636556</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>How should data collection and use-case scope ...</td>\n",
       "      <td>[Data collection and use-case scope limits. Da...</td>\n",
       "      <td>To prevent 'mission creep' in automated system...</td>\n",
       "      <td>Data collection and use-case scope limits in a...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.923204</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.491425</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What action did the Federal Trade Commission t...</td>\n",
       "      <td>[alerts about location tracking—are brief, dir...</td>\n",
       "      <td>The Federal Trade Commission (FTC) took action...</td>\n",
       "      <td>FTC sued Kochava for selling data that tracks ...</td>\n",
       "      <td>0.400000</td>\n",
       "      <td>0.936866</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.902212</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  What is the significance of providing notice a...   \n",
       "1  How can structured human feedback exercises, s...   \n",
       "2  How do measurement gaps between laboratory and...   \n",
       "3  How should data collection and use-case scope ...   \n",
       "4  What action did the Federal Trade Commission t...   \n",
       "\n",
       "                                            contexts  \\\n",
       "0  [Providing notice has long been a standard pra...   \n",
       "1  [50 Participatory Engagement Methods  \\nOn an ...   \n",
       "2  [49 early lifecycle TEVV approaches are develo...   \n",
       "3  [Data collection and use-case scope limits. Da...   \n",
       "4  [alerts about location tracking—are brief, dir...   \n",
       "\n",
       "                                              answer  \\\n",
       "0  Providing notice and explanation as a legal re...   \n",
       "1  Structured human feedback exercises, such as G...   \n",
       "2  Measurement gaps between laboratory and real-w...   \n",
       "3  To prevent 'mission creep' in automated system...   \n",
       "4  The Federal Trade Commission (FTC) took action...   \n",
       "\n",
       "                                        ground_truth  faithfulness  \\\n",
       "0  Providing notice and explanation as a legal re...      1.000000   \n",
       "1  Structured human feedback exercises, such as G...      1.000000   \n",
       "2  Measurement gaps between laboratory and real-w...      0.958333   \n",
       "3  Data collection and use-case scope limits in a...      1.000000   \n",
       "4  FTC sued Kochava for selling data that tracks ...      0.400000   \n",
       "\n",
       "   answer_relevancy  context_recall  context_precision  answer_correctness  \n",
       "0          0.971321             1.0           1.000000            0.821299  \n",
       "1          0.992832             1.0           1.000000            0.541222  \n",
       "2          0.988752             1.0           0.876667            0.636556  \n",
       "3          0.923204             1.0           1.000000            0.491425  \n",
       "4          0.936866             0.0           0.125000            0.902212  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiquery_results_df = multiquery_results.to_pandas()\n",
    "multiquery_results_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the per-question results table; index=False omits the DataFrame index column.\n",
    "multiquery_results_df.to_csv(\"multiquery_ragas_results.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two-column summary table: one row per (metric name, aggregate score) pair.\n",
    "multiquery_metrics_df = pd.DataFrame(\n",
    "    list(multiquery_results.items()),\n",
    "    columns=['Metric', 'MultiQuery'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>MultiQuery</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.896804</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.953211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.890625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.920732</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.690058</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  MultiQuery\n",
       "0        faithfulness    0.896804\n",
       "1    answer_relevancy    0.953211\n",
       "2      context_recall    0.890625\n",
       "3   context_precision    0.920732\n",
       "4  answer_correctness    0.690058"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "multiquery_metrics_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "multiquery_metrics_df.to_csv(\"multiquery_metrics.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>Baseline</th>\n",
       "      <th>MultiQuery</th>\n",
       "      <th>Baseline -&gt; MultiQuery</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.895359</td>\n",
       "      <td>0.896804</td>\n",
       "      <td>0.001445</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.955419</td>\n",
       "      <td>0.953211</td>\n",
       "      <td>-0.002208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.934028</td>\n",
       "      <td>0.890625</td>\n",
       "      <td>-0.043403</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.937500</td>\n",
       "      <td>0.920732</td>\n",
       "      <td>-0.016768</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.629267</td>\n",
       "      <td>0.690058</td>\n",
       "      <td>0.060791</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  Baseline  MultiQuery  Baseline -> MultiQuery\n",
       "0        faithfulness  0.895359    0.896804                0.001445\n",
       "1    answer_relevancy  0.955419    0.953211               -0.002208\n",
       "2      context_recall  0.934028    0.890625               -0.043403\n",
       "3   context_precision  0.937500    0.920732               -0.016768\n",
       "4  answer_correctness  0.629267    0.690058                0.060791"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Line up baseline and MultiQuery scores per metric and append their difference\n",
    "# (positive values mean MultiQuery scored higher than the baseline).\n",
    "df_baseline_multiquery = pd.merge(\n",
    "    baseline_metrics, multiquery_metrics_df, on='Metric'\n",
    ").assign(**{'Baseline -> MultiQuery': lambda scores: scores['MultiQuery'] - scores['Baseline']})\n",
    "\n",
    "df_baseline_multiquery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MMR search over the vector store: 10 candidates are fetched, 4 are returned.\n",
    "compression_retriever = vectorstore.as_retriever(\n",
    "    search_type=\"mmr\", search_kwargs=dict(k=4, fetch_k=10)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.retrievers import ContextualCompressionRetriever\n",
    "from langchain.retrievers.document_compressors import LLMChainExtractor\n",
    "\n",
    "# Build the MMR base retriever under its own name. Previously the wrapper was\n",
    "# assigned back onto the very variable it wraps, so re-running this cell stacked\n",
    "# a second LLM compression pass on top of the first one.\n",
    "mmr_base_retriever = vectorstore.as_retriever(\n",
    "    search_type=\"mmr\",\n",
    "    search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
    ")\n",
    "\n",
    "# The extractor asks the LLM to keep only the passages relevant to the query.\n",
    "compressor = LLMChainExtractor.from_llm(llm)\n",
    "compression_retriever = ContextualCompressionRetriever(\n",
    "    base_compressor=compressor, base_retriever=mmr_base_retriever\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Conversational RAG chain backed by the compression retriever; source documents\n",
    "# are returned alongside the answer so they can be scored later.\n",
    "contextual_compression_rag_chain = ConversationalRetrievalChain.from_llm(\n",
    "    llm=llm,\n",
    "    retriever=compression_retriever,\n",
    "    memory=memory,\n",
    "    return_source_documents=True,\n",
    "    combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'What are Trustworthy AI Characteristics?',\n",
       " 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
       "  AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. 
This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).')],\n",
       " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. 
This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).',\n",
       " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='Trustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with Harmful Bias Managed, Privacy Enhanced, Safe, Valid and Reliable'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of harmful uses. The NIST framework will consider and encompass principles such as transparency, accountability, and fairness during pre-design, design and development, deployment, use, and testing and evaluation of AI technologies and systems.'),\n",
       "  Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics:  Accountable and Transparent, Privacy Enhanced, Safe, Secure and Resilient  \\nTrustworthy AI Characteristics:  Accountable and Transparent, Safe')]}"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "contextual_compression_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect one generated answer and its retrieved contexts per test question.\n",
    "answers = []\n",
    "contexts = []\n",
    "\n",
    "for question in test_questions:\n",
    "  # The chain shares a single ConversationBufferMemory. Without a reset, every\n",
    "  # earlier Q&A (including the manual smoke-test query) is passed along as chat\n",
    "  # history and leaks into later, unrelated evaluation questions.\n",
    "  memory.clear()\n",
    "  response = contextual_compression_rag_chain.invoke({\"question\" : question})\n",
    "  answers.append(response[\"answer\"])\n",
    "  contexts.append([context.page_content for context in response[\"source_documents\"]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bundle questions, generated answers, retrieved contexts and reference answers\n",
    "# into a single dataset, one row per test question.\n",
    "contextual_compression_dataset = Dataset.from_dict(\n",
    "    dict(\n",
    "        question=test_questions,\n",
    "        answer=answers,\n",
    "        contexts=contexts,\n",
    "        ground_truth=test_groundtruths,\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
       " 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect their rights, opportunities, or access. This transparency helps build trust in the technology and the entities deploying it.\\n\\n2. **Empowerment**: When people receive clear explanations about how decisions are made by automated systems, they are better equipped to understand and contest those decisions if necessary. This is particularly important in sensitive areas like employment, credit, and legal proceedings, where outcomes can have profound impacts on individuals' lives.\\n\\n3. **Accountability**: Notice and explanation requirements hold organizations accountable for their automated systems. By clearly identifying the entities responsible for the design and use of these systems, it becomes easier to address any issues or biases that may arise.\\n\\n4. **Error Correction**: Providing notice allows individuals to identify and correct errors in automated decision-making processes. Without this knowledge, individuals may be left without recourse to challenge incorrect or unfair outcomes.\\n\\n5. **Public Confidence**: Consistent and clear communication about the use of automated systems can enhance public confidence in these technologies. When people understand how and why decisions are made, they are more likely to trust the systems and the organizations that use them.\\n\\n6. **Safety and Efficacy**: Clear explanations enable experts to verify the reasonableness of recommendations made by automated systems before they are enacted. 
This is crucial for ensuring that the systems operate safely and effectively.\\n\\nIn summary, notice and explanation requirements are essential for protecting individuals' rights, fostering accountability, and ensuring that automated systems are used responsibly and ethically (Source: [document name], p. [page number]).\",\n",
       " 'contexts': ['Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. While notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. The lack of a timely explanation made it harder to understand and contest the decision. The lack of notice or an explanation makes it harder for those performing child maltreatment assessments to validate the risk assessment and denies parents knowledge that could help them contest a decision.',\n",
       "  'You should know that an automated system is being used, and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible.',\n",
       "  'An automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and explanations as to how and why a decision was made or an action was taken by the system. Notices should clearly identify the entity responsible for designing each component of the system and the entity using it. Users should receive notice of the use of automated systems in advance of using or while being impacted by the technology. An explanation should be available with the decision itself, or soon thereafter. Notice should be kept up-to-date and people impacted by the system should be notified of use case or key functionality changes. Notices and explanations should be assessed, such as by research on users’ experiences, including user testing, to ensure that the people using or impacted by the automated system are able to easily.',\n",
       "  'NOTICE & \\nEXPLANATION \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nAutomated systems now determine opportunities, from employment to credit, and directly shape the American \\npublic’s experiences, from the courtroom to online classrooms, in ways that profoundly impact people’s lives. But this expansive impact is not always visible. An applicant might not know whether a person rejected their resume or a hiring algorithm moved them to the bottom of the list. A defendant in the courtroom might not know if a judge denying their bail is informed by an automated system that labeled them “high risk.” From correcting errors to contesting decisions, people are often denied the knowledge they need to address the impact of automated systems on their lives. Notice and explanations also serve an important safety and efficacy purpose, allowing experts to verify the reasonableness of a recommendation before enacting it. \\nIn order to guard against potential harms, the American public needs to know if an automated system is being used. Clear, brief, and understandable notice is a prerequisite for achieving the other protections in this framework. Like-\\nwise, the public is often unable to ascertain how or why an automated system has made a decision or contributed to a particular outcome. The decision-making processes of automated systems tend to be opaque, complex, and, therefore, unaccountable, whether by design or by omission. These factors can make explanations both more challenging and more important, and should not be used as a pretext to avoid explaining important decisions to the people impacted by those choices. In the context of automated systems, clear and valid explanations should be recognized as a baseline requirement.'],\n",
       " 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "contextual_compression_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b6c03ccbf50e4642b9433f2513fb83c3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "contextual_compression_results = evaluate(contextual_compression_dataset, metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'faithfulness': 0.7491, 'answer_relevancy': 0.9140, 'context_recall': 0.7257, 'context_precision': 0.9051, 'answer_correctness': 0.5707}"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "contextual_compression_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>question</th>\n",
       "      <th>contexts</th>\n",
       "      <th>answer</th>\n",
       "      <th>ground_truth</th>\n",
       "      <th>faithfulness</th>\n",
       "      <th>answer_relevancy</th>\n",
       "      <th>context_recall</th>\n",
       "      <th>context_precision</th>\n",
       "      <th>answer_correctness</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>What is the significance of providing notice a...</td>\n",
       "      <td>[Providing notice has long been a standard pra...</td>\n",
       "      <td>Providing notice and explanation as a legal re...</td>\n",
       "      <td>Providing notice and explanation as a legal re...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.971321</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.585260</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>How can structured human feedback exercises, s...</td>\n",
       "      <td>[AI Red -teaming  \\nAI red -teaming is an evol...</td>\n",
       "      <td>Structured human feedback exercises, such as G...</td>\n",
       "      <td>Structured human feedback exercises, such as G...</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.988309</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.320501</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>How do measurement gaps between laboratory and...</td>\n",
       "      <td>[Currently available pre -deployment TEVV proc...</td>\n",
       "      <td>Measurement gaps between laboratory and real-w...</td>\n",
       "      <td>Measurement gaps between laboratory and real-w...</td>\n",
       "      <td>0.958333</td>\n",
       "      <td>0.996595</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.597251</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>How should data collection and use-case scope ...</td>\n",
       "      <td>[Data collection should be limited in scope, w...</td>\n",
       "      <td>To prevent \"mission creep\" in automated system...</td>\n",
       "      <td>Data collection and use-case scope limits in a...</td>\n",
       "      <td>0.439024</td>\n",
       "      <td>0.922376</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.551606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>What action did the Federal Trade Commission t...</td>\n",
       "      <td>[]</td>\n",
       "      <td>The Federal Trade Commission (FTC) took action...</td>\n",
       "      <td>FTC sued Kochava for selling data that tracks ...</td>\n",
       "      <td>0.833333</td>\n",
       "      <td>0.925072</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.529680</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                            question  \\\n",
       "0  What is the significance of providing notice a...   \n",
       "1  How can structured human feedback exercises, s...   \n",
       "2  How do measurement gaps between laboratory and...   \n",
       "3  How should data collection and use-case scope ...   \n",
       "4  What action did the Federal Trade Commission t...   \n",
       "\n",
       "                                            contexts  \\\n",
       "0  [Providing notice has long been a standard pra...   \n",
       "1  [AI Red -teaming  \\nAI red -teaming is an evol...   \n",
       "2  [Currently available pre -deployment TEVV proc...   \n",
       "3  [Data collection should be limited in scope, w...   \n",
       "4                                                 []   \n",
       "\n",
       "                                              answer  \\\n",
       "0  Providing notice and explanation as a legal re...   \n",
       "1  Structured human feedback exercises, such as G...   \n",
       "2  Measurement gaps between laboratory and real-w...   \n",
       "3  To prevent \"mission creep\" in automated system...   \n",
       "4  The Federal Trade Commission (FTC) took action...   \n",
       "\n",
       "                                        ground_truth  faithfulness  \\\n",
       "0  Providing notice and explanation as a legal re...      1.000000   \n",
       "1  Structured human feedback exercises, such as G...      1.000000   \n",
       "2  Measurement gaps between laboratory and real-w...      0.958333   \n",
       "3  Data collection and use-case scope limits in a...      0.439024   \n",
       "4  FTC sued Kochava for selling data that tracks ...      0.833333   \n",
       "\n",
       "   answer_relevancy  context_recall  context_precision  answer_correctness  \n",
       "0          0.971321             1.0                1.0            0.585260  \n",
       "1          0.988309             1.0                1.0            0.320501  \n",
       "2          0.996595             1.0                1.0            0.597251  \n",
       "3          0.922376             1.0                1.0            0.551606  \n",
       "4          0.925072             0.0                0.0            0.529680  "
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Per-question scores as a DataFrame; preview the first five rows.\n",
    "contextual_compression_results_df = contextual_compression_results.to_pandas()\n",
    "contextual_compression_results_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "contextual_compression_results_df.to_csv(\"contextual_compression_ragas_results.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten the aggregate scores into a two-column (metric, score) table.\n",
    "contextual_compression_metrics_df = pd.DataFrame(\n",
    "    [(metric_name, score) for metric_name, score in contextual_compression_results.items()],\n",
    "    columns=['Metric', 'ContextualCompression'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>ContextualCompression</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.749092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.913993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.725694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.905093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.570685</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  ContextualCompression\n",
       "0        faithfulness               0.749092\n",
       "1    answer_relevancy               0.913993\n",
       "2      context_recall               0.725694\n",
       "3   context_precision               0.905093\n",
       "4  answer_correctness               0.570685"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "contextual_compression_metrics_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "contextual_compression_metrics_df.to_csv(\"contextual_compression_metrics.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>Baseline</th>\n",
       "      <th>MultiQuery</th>\n",
       "      <th>ContextualCompression</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.895359</td>\n",
       "      <td>0.896804</td>\n",
       "      <td>0.749092</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.955419</td>\n",
       "      <td>0.953211</td>\n",
       "      <td>0.913993</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.934028</td>\n",
       "      <td>0.890625</td>\n",
       "      <td>0.725694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.937500</td>\n",
       "      <td>0.920732</td>\n",
       "      <td>0.905093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.629267</td>\n",
       "      <td>0.690058</td>\n",
       "      <td>0.570685</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  Baseline  MultiQuery  ContextualCompression\n",
       "0        faithfulness  0.895359    0.896804               0.749092\n",
       "1    answer_relevancy  0.955419    0.953211               0.913993\n",
       "2      context_recall  0.934028    0.890625               0.725694\n",
       "3   context_precision  0.937500    0.920732               0.905093\n",
       "4  answer_correctness  0.629267    0.690058               0.570685"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Combine the three metric tables into one, joining on the shared 'Metric' column:\n",
    "# first Baseline with MultiQuery, then that result with ContextualCompression.\n",
    "df_baseline_multiquery = baseline_metrics.merge(multiquery_metrics_df, on='Metric')\n",
    "df_baseline_multiquery_contextual_compression = df_baseline_multiquery.merge(\n",
    "    contextual_compression_metrics_df,\n",
    "    on='Metric',\n",
    ")\n",
    "\n",
    "# Last expression of the cell, so the notebook renders the combined table.\n",
    "df_baseline_multiquery_contextual_compression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Metric</th>\n",
       "      <th>Baseline</th>\n",
       "      <th>MultiQuery</th>\n",
       "      <th>ContextualCompression</th>\n",
       "      <th>HigestValue</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>faithfulness</td>\n",
       "      <td>0.895359</td>\n",
       "      <td>0.896804</td>\n",
       "      <td>0.749092</td>\n",
       "      <td>0.90 (MultiQuery)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>answer_relevancy</td>\n",
       "      <td>0.955419</td>\n",
       "      <td>0.953211</td>\n",
       "      <td>0.913993</td>\n",
       "      <td>0.96 (Baseline)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>context_recall</td>\n",
       "      <td>0.934028</td>\n",
       "      <td>0.890625</td>\n",
       "      <td>0.725694</td>\n",
       "      <td>0.93 (Baseline)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>context_precision</td>\n",
       "      <td>0.937500</td>\n",
       "      <td>0.920732</td>\n",
       "      <td>0.905093</td>\n",
       "      <td>0.94 (Baseline)</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>answer_correctness</td>\n",
       "      <td>0.629267</td>\n",
       "      <td>0.690058</td>\n",
       "      <td>0.570685</td>\n",
       "      <td>0.69 (MultiQuery)</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               Metric  Baseline  MultiQuery  ContextualCompression  \\\n",
       "0        faithfulness  0.895359    0.896804               0.749092   \n",
       "1    answer_relevancy  0.955419    0.953211               0.913993   \n",
       "2      context_recall  0.934028    0.890625               0.725694   \n",
       "3   context_precision  0.937500    0.920732               0.905093   \n",
       "4  answer_correctness  0.629267    0.690058               0.570685   \n",
       "\n",
       "         HigestValue  \n",
       "0  0.90 (MultiQuery)  \n",
       "1    0.96 (Baseline)  \n",
       "2    0.93 (Baseline)  \n",
       "3    0.94 (Baseline)  \n",
       "4  0.69 (MultiQuery)  "
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Columns that each hold one score per metric row.\n",
    "score_columns = ['Baseline', 'MultiQuery', 'ContextualCompression']\n",
    "scores = df_baseline_multiquery_contextual_compression[score_columns]\n",
    "\n",
    "# Per metric row: the largest score, followed by the name of the column it came from.\n",
    "# A fixed two-decimal format is used so trailing zeros are kept (0.90, not 0.9).\n",
    "# Computed from a column selection, so no temporary columns are added to the frame.\n",
    "# NOTE(review): the column name 'HigestValue' is misspelled; it is left unchanged\n",
    "# in case other cells rely on it.\n",
    "df_baseline_multiquery_contextual_compression['HigestValue'] = (\n",
    "    scores.max(axis=1).map('{:.2f}'.format) + ' (' + scores.idxmax(axis=1) + ')'\n",
    ")\n",
    "\n",
    "df_baseline_multiquery_contextual_compression"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}