{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install langchain langchain_community langchain_openai pypdf langsmith qdrant-client ragas pandas"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import openai\n",
"from getpass import getpass\n",
"\n",
"openai.api_key = getpass(\"Please provide your OpenAI Key: \")\n",
"os.environ[\"OPENAI_API_KEY\"] = openai.api_key"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"test_df = pd.read_csv(\"synthetic_midterm_question_dataset.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"test_questions = test_df[\"question\"].values.tolist()\n",
"test_groundtruths = test_df[\"ground_truth\"].values.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_community.vectorstores.chroma import Chroma\n",
"from langchain_openai import ChatOpenAI\n",
"from langchain.prompts import PromptTemplate\n",
"from langchain.chains import ConversationalRetrievalChain\n",
"from langchain_community.vectorstores import Qdrant\n",
"from langchain.memory import ConversationBufferMemory"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"pdf_paths = [\"/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf\",\n",
"\"/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf\"]"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"pdf_documents = []\n",
"for pdf_path in pdf_paths:\n",
" loader = PyPDFLoader(pdf_path)\n",
" pdf_documents.extend(loader.load())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"text_splitter = RecursiveCharacterTextSplitter(\n",
" chunk_size=2000,\n",
" chunk_overlap=100,\n",
" )\n",
"pdf_docs = text_splitter.split_documents(pdf_documents)"
]
},
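{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick, illustrative sanity check (not part of the original pipeline): count the chunks the splitter produced and eyeball their lengths to confirm the `chunk_size=2000` setting before paying for embeddings."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: inspect the chunking before embedding (assumes `pdf_docs` above)\n",
"chunk_lengths = [len(doc.page_content) for doc in pdf_docs]\n",
"print(f\"{len(pdf_docs)} chunks\")\n",
"print(f\"max chunk length: {max(chunk_lengths)} characters\")\n",
"print(f\"mean chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters\")"
]
},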
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"embedding = OpenAIEmbeddings(model=\"text-embedding-3-small\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"baseline_metrics = pd.read_csv(\"medium_chunk_metrics.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>MediumChunk</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.895359</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.955419</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.934028</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.937500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.629267</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric MediumChunk\n",
"0 faithfulness 0.895359\n",
"1 answer_relevancy 0.955419\n",
"2 context_recall 0.934028\n",
"3 context_precision 0.937500\n",
"4 answer_correctness 0.629267"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"baseline_metrics"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"baseline_metrics.rename(columns={'MediumChunk': 'Baseline'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"vectorstore = Qdrant.from_documents(\n",
" documents=pdf_docs,\n",
" embedding=embedding,\n",
" location=\":memory:\",\n",
" collection_name=\"Midterm Eval\"\n",
")\n",
"\n",
"retriever = vectorstore.as_retriever(\n",
" search_type=\"mmr\",\n",
" search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
")\n",
"\n",
"memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
]
},
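{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (assuming the `retriever` built above): run a one-off query to confirm MMR retrieval returns the expected `k=4` documents before wiring the retriever into a chain. Older LangChain releases use `get_relevant_documents()` instead of `invoke()`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative check of the bare retriever (not part of the original pipeline)\n",
"docs = retriever.invoke(\"What are Trustworthy AI Characteristics?\")\n",
"for doc in docs:\n",
"    print(doc.metadata.get(\"source\"), \"p.\", doc.metadata.get(\"page\"))"
]
},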
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from langchain.retrievers.multi_query import MultiQueryRetriever\n",
"\n",
"retriever_llm = ChatOpenAI(model='gpt-4o-mini', temperature=0)\n",
"multiquery_retriever = MultiQueryRetriever.from_llm(\n",
" retriever=retriever, llm=retriever_llm\n",
")"
]
},
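{
"cell_type": "markdown",
"metadata": {},
"source": [
"`MultiQueryRetriever` rewrites each incoming question into several alternative phrasings, retrieves documents for each, and returns the deduplicated union. To see the rewrites it generates, the pattern documented by LangChain is to raise the log level on its logger (optional diagnostic):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional diagnostic: log the alternative queries generated for each question\n",
"import logging\n",
"\n",
"logging.basicConfig()\n",
"logging.getLogger(\"langchain.retrievers.multi_query\").setLevel(logging.INFO)"
]
},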
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(\n",
" model=\"gpt-4o-mini\",\n",
" temperature=0,\n",
" streaming=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"custom_template = \"\"\"\n",
"You are an expert in artificial intelligence policy, ethics, and industry trends. Your task is to provide clear and accurate answers to questions related to AI's role in politics, government regulations, and its ethical implications for enterprises. Use reliable and up-to-date information from government documents, industry reports, and academic research to inform your responses. Make sure to consider how AI is evolving, especially in relation to the current political landscape, and provide answers in a way that is easy to understand for both AI professionals and non-experts.\n",
"\n",
"Remember these key points:\n",
"1. Use \"you\" when addressing the user and \"I\" when referring to yourself.\n",
"2. If you encounter complex or legal language in the context, simplify it for easy understanding. Imagine you're explaining it to someone who isn't familiar with legal terms.\n",
"3. Be prepared for follow-up questions and maintain context from previous exchanges.\n",
"4. If there's no information from a retrieved document in the context to answer a question or if there are no documents to cite, say: \"I'm sorry, I don't know the answer to that question.\"\n",
"5. When providing information, always cite the source document and page number in parentheses at the end of the relevant sentence or paragraph, like this: (Source: [document name], p. [page number]).\n",
"\n",
"Here are a few example questions you might receive:\n",
"\n",
"How are governments regulating AI, and what new policies have been implemented?\n",
"What are the ethical risks of using AI in political decision-making?\n",
"How can enterprises ensure their AI applications meet government ethical standards?\n",
"\n",
"One final rule for you to remember. You CANNOT under any circumstance, answer any question that does not pertain to the AI. If you do answer an out-of-scope question, you could lose your job. If you are asked a question that does not have to do with AI, you must say: \"I'm sorry, I don't know the answer to that question.\"\n",
"Context: {context}\n",
"Chat History: {chat_history}\n",
"Human: {question}\n",
"AI:\"\"\"\n",
"\n",
"PROMPT = PromptTemplate(\n",
" template=custom_template, input_variables=[\"context\", \"question\", \"chat_history\"]\n",
")"
]
},
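{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal smoke test (illustrative, with placeholder values): render the template once to verify that the three input variables (`context`, `chat_history`, `question`) are wired up before the chain is built."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative: render the prompt with placeholder values to catch a\n",
"# missing or misspelled input variable early (raises KeyError if so)\n",
"print(PROMPT.format(\n",
"    context=\"(retrieved documents would appear here)\",\n",
"    chat_history=\"\",\n",
"    question=\"How are governments regulating AI?\",\n",
")[:300])"
]
},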
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"multiquery_rag_chain = ConversationalRetrievalChain.from_llm(\n",
" llm,\n",
" retriever=multiquery_retriever,\n",
" memory=memory,\n",
" combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
" return_source_documents=True,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'What are Trustworthy AI Characteristics?',\n",
" 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
" AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).')],\n",
" 'answer': 'Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).',\n",
" 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='There may also be concerns about emotional entanglement between humans and GAI systems, which \\ncould lead to negative psychological impacts . \\nTrustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with \\nHarmful Bias Managed, Privacy Enhanced, Safe , Valid and Reliable \\n2.8. Information Integrity \\nInformation integrity describes the “ spectrum of information and associated patterns of its creation, \\nexchange, and consumption in society .” High-integrity information can be trusted; “distinguishes fact \\nfrom fiction, opinion, and inference; acknowledges uncertainties; and is transparent about its level of \\nvetting. This information can be linked to the original source(s) with appropriate evidence. High- integrity \\ninformation is also accurate and reliable, can be verified and authenticated, has a clear chain of custody, \\nand creates reasonable expectations about when its validity may expire. ”11 \\n \\n \\n11 This definition of information integrity is derived from the 2022 White House Roadmap for Researchers on \\nPriorities Related to Information Integrity Research and Development.'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 0, '_id': '8bad320d25b64bae949445cf2c427d18', '_collection_name': 'Midterm Eval'}, page_content='NIST Trustworthy and Responsible AI \\nNIST AI 600 -1 \\nArtificial Intelligence Risk Management \\nFramework: Generative Artificial \\nIntelligence Profile \\n \\n \\nThis publication is available free of charge from: \\nhttps://doi.org/10.6028/NIST.AI.600 -1'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 21, '_id': '91c6953f48734236907d2797a0c07971', '_collection_name': 'Midterm Eval'}, page_content=\"SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nSome U.S government agencies have developed specific frameworks for ethical use of AI \\nsystems. The Department of Energy (DOE) has activated the AI Advancement Council that oversees coordina -\\ntion and advises on implementation of the DOE AI Strategy and addresses issues and/or escalations on the \\nethical use and development of AI systems.20 The Department of Defense has adopted Artificial Intelligence \\nEthical Principles, and tenets for Responsible Artificial Intelligence specifically tailored to its national \\nsecurity and defense activities.21 Similarl y, the U.S. Intelligence Community (IC) has developed the Principles \\nof Artificial Intelligence Ethics for the Intelligence Community to guide personnel on whether and how to \\ndevelop and use AI in furtherance of the IC's mission, as well as an AI Ethics Framework to help implement \\nthese principles.22\\nThe National Science Foundation (NSF) funds extensive research to help foster the \\ndevelopment of automated systems that adhere to and advance their safety, security and \\neffectiveness. Multiple NSF programs support research that directly addresses many of these principles: \\nthe National AI Research Institutes23 support research on all aspects of safe, trustworth y, fai r, and explainable \\nAI algorithms and systems; the Cyber Physical Systems24 program supports research on developing safe \\nautonomous and cyber physical systems with AI components; the Secure and Trustworthy Cyberspace25 \\nprogram supports research on cybersecurity and privacy enhancing technologies in automated systems; the \\nFormal Methods in the Field26 program supports research on rigorous formal verification and analysis of\"),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 60, '_id': '30c836d6cf9a481c9cf48c580209d301', '_collection_name': 'Midterm Eval'}, page_content='57 National Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix B: \\nHow AI Risks Differ from Traditional Software Risks . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_B \\nNational Institute of Standards and Technology (2023) AI RMF Playbook . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/Playbook \\nNational Institue of Standards and Technology (2023) Framing Risk \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/1- sec-risk \\nNational Institu te of Standards and Technology (2023) The Language of Trustworthy AI: An In- Depth \\nGlossary of Terms https://airc.nist.gov/AI_RMF_Knowledge_Base/Glossary \\nNational Institue of Standards and Technology (2022) Towards a Standard for Identifying and Managing \\nBias in Artificial Intelligence https://www.nist.gov/publications/towards -standard -identifying -and-\\nmanaging- bias-artificial -intelligence \\nNorthcutt, C. et al. (2021) Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks. \\narXiv . https://arxiv.org/pdf/2103.14749 \\nOECD (2023) \"Advancing accountability in AI: Governing and managing risks throughout the lifecycle for \\ntrustworthy AI\", OECD Digital Economy Papers , No. 349, OECD Publishing, Paris . \\nhttps://doi.org/10.1787/2448f04b- en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers , No. 16, OECD \\nPublishing, Paris . https://doi.org/10.1787/d1a8d965- en \\nOpenAI (2023) GPT-4 System Card . https://cdn.openai.com/papers/gpt -4-system -card.pdf \\nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity? ICLR . \\nhttps://arxiv.org/pdf/2309.05196 \\nPark, P. et. al. (2024) AI deception: A survey of examples, risks, and potential solutions. Patterns, 5(5). \\narXiv . https://arxiv.org/pdf/2308.14752'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': '8c7c9954577b47f390a8374bb6582294', '_collection_name': 'Midterm Eval'}, page_content='SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nExecutive Order 13960 on Promoting the Use of Trustworthy Artificial Intelligence in the \\nFederal Government requires that certain federal agencies adhere to nine principles when \\ndesigning, developing, acquiring, or using AI for purposes other than national security or \\ndefense. These principles—while taking into account the sensitive law enforcement and other contexts in which \\nthe federal government may use AI, as opposed to private sector use of AI—require that AI is: (a) lawful and \\nrespectful of our Nation’s values; (b) purposeful and performance-driven; (c) accurate, reliable, and effective; (d) \\nsafe, secure, and resilient; (e) understandable; (f ) responsible and traceable; (g) regularly monitored; (h) transpar -\\nent; and, (i) accountable. The Blueprint for an AI Bill of Rights is consistent with the Executive Order. \\nAffected agencies across the federal government have released AI use case inventories13 and are implementing \\nplans to bring those AI systems into compliance with the Executive Order or retire them. \\nThe law and policy landscape for motor vehicles shows that strong safety regulations—and \\nmeasures to address harms when they occur—can enhance innovation in the context of com-\\nplex technologies. Cars, like automated digital systems, comprise a complex collection of components. \\nThe National Highway Traffic Safety Administration,14 through its rigorous standards and independent \\nevaluation, helps make sure vehicles on our roads are safe without limiting manufacturers’ ability to \\ninnovate.15 At the same time, rules of the road are implemented locally to impose contextually appropriate \\nrequirements on drivers, such as slowing down near schools or playgrounds.16'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of \\nharmful uses. The NIST framework will consider and encompass principles such as \\ntransparency, accountability, and fairness during pre-design, design and development, deployment, use, \\nand testing and evaluation of AI technologies and systems. It is expected to be released in the winter of 2022-23. \\n21'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 63, '_id': '4b61d1ab875c4c3a94bdbe35b3e8b18a', '_collection_name': 'Midterm Eval'}, page_content='www.analyticsinsight.net/top-progressive-companies-building-ethical-ai-to-look-out-for-\\nin-2021/ https://www.technologyreview.com/2021/01/15/1016183/ai-ethics-startups/; Disha Sinha. Top\\nProgressive Companies Building Ethical AI to Look Out for in 2021. Analytics Insight . June 30, 2021.\\n18.Office of Management and Budget. Study to Identify Methods to Assess Equity: Report to the President .\\nAug. 2021. https://www.whitehouse.gov/wp-content/uploads/2021/08/OMB-Report-on-E013985-\\nImplementation_508-Compliant-Secure-v1.1.pdf\\n19.National Institute of Standards and Technology. AI Risk Management Framework. Accessed May 23,\\n2022. https://www.nist.gov/itl/ai-risk-management-framework\\n20. U.S. Department of Energy. U.S. Department of Energy Establishes Artificial Intelligence Advancement\\nCouncil. U.S. Department of Energy Artificial Intelligence and Technology Office. April 18, 2022. https://\\nwww.energy.gov/ai/articles/us-department-energy-establishes-artificial-intelligence-advancement-council\\n21.Department of Defense. U.S Department of Defense Responsible Artificial Intelligence Strategy and\\nImplementation Pathway. Jun. 2022. https://media.defense.gov/2022/Jun/22/2003022604/-1/-1/0/\\nDepartment-of-Defense-Responsible-Artificial-Intelligence-Strategy-and-Implementation-\\nPathway.PDF\\n22. Director of National Intelligence. Principles of Artificial Intelligence Ethics for the Intelligence\\nCommunity. https://www.dni.gov/index.php/features/2763-principles-of-artificial-intelligence-ethics-for-\\nthe-intelligence-community\\n64'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 50, '_id': '02d90efce1624306ab5bf45d1f8cb1db', '_collection_name': 'Midterm Eval'}, page_content='warrant additional human review, tracking and documentation, and greater management oversight. \\nAI technology can produce varied outputs in multiple modalities and present many classes of user \\ninterfaces. This leads to a broader set of AI Actors interacting with GAI systems for widely differing \\napplications and contexts of use. These can include data labeling and preparation, development of GAI \\nmodels, content moderation, code generation and review, text generation and editing, image and video \\ngeneration, summarization, search, and chat. These activities can take place within organizational \\nsettings or in the public domain. \\nOrganizations can restrict AI applications that cause harm, exceed stated risk tolerances, or that conflict with their tolerances or values. Governance tools and protocols that are applied to other types of AI systems can be applied to GAI systems. These p lans and actions include: \\n• Accessibility and reasonable accommodations \\n• AI actor credentials and qualifications \\n• Alignment to organizational values • Auditing and assessment \\n• Change -management controls \\n• Commercial use \\n• Data provenance'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics: Accountable and Transparent, Privacy Enhanced, Safe, Secure and \\nResilient \\n2.5. Environmental Impacts \\nTraining, maint aining, and operating (running inference on) GAI systems are resource -intensive activities , \\nwith potentially large energy and environmental footprints. Energy and carbon emissions vary based on \\nwhat is being done with the GAI model (i.e., pre -training, fine -tuning, inference), the modality of the \\ncontent , hardware used, and type of task or application . \\nCurrent e stimates suggest that training a single transformer LLM can emit as much carbon as 300 round-\\ntrip flights between San Francisco and New York. In a study comparing energy consumption and carbon \\nemissions for LLM inference, generative tasks ( e.g., text summarization) were found to be more energy - \\nand carbon -i ntensive th an discriminative or non- generative tasks (e.g., text classification). \\nMethods for creating smaller versions of train ed models, such as model distillation or compression, \\ncould reduce environmental impacts at inference time, but training and tuning such models may still \\ncontribute to their environmental impacts . Currently there is no agreed upon method to estimate \\nenvironmental impacts from GAI . \\nTrustworthy AI Characteristics: Accountable and Transparent, Safe \\n2.6. Harmful Bias and Homogenization \\nBias exists in many forms and can become ingrained in automated systems. AI systems , including GAI \\nsystems, can increase the speed and scale at which harmful biases manifest and are acted upon, \\npotentially perpetuati ng and amplify ing harms to individuals, groups, communities, organizations, and \\nsociety . For example, when prompted to generate images of CEOs, doctors, lawyers, and judges, current \\ntext-to-image models underrepresent women and/or racial minorities , and people with disabilities . \\nImage generator models have also produce d biased or stereotyped output for various demographic')]}"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiquery_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"answers = []\n",
"contexts = []\n",
"\n",
"for question in test_questions:\n",
" response = multiquery_rag_chain.invoke({\"question\" : question})\n",
" answers.append(response[\"answer\"])\n",
" contexts.append([context.page_content for context in response[\"source_documents\"]])"
]
},
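{
"cell_type": "markdown",
"metadata": {},
"source": [
"One caveat: the loop above shares a single `ConversationBufferMemory` across all test questions, so each answer is generated with the accumulated chat history of every earlier question. If per-question independence is preferred for evaluation, a hedged variant is to reset the memory between iterations:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hedged variant (not what was run above): clear the shared memory so each\n",
"# evaluation question is answered without the previous questions' history\n",
"answers_independent = []\n",
"contexts_independent = []\n",
"\n",
"for question in test_questions:\n",
"    memory.clear()\n",
"    response = multiquery_rag_chain.invoke({\"question\": question})\n",
"    answers_independent.append(response[\"answer\"])\n",
"    contexts_independent.append([context.page_content for context in response[\"source_documents\"]])"
]
},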
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from datasets import Dataset\n",
"\n",
"multiquery_dataset = Dataset.from_dict({\n",
" \"question\" : test_questions,\n",
" \"answer\" : answers,\n",
" \"contexts\" : contexts,\n",
" \"ground_truth\" : test_groundtruths\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
" 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect them. This transparency helps build trust between the public and the organizations deploying these systems.\\n\\n2. **Informed Consent**: By notifying individuals about the use of automated systems, organizations allow people to make informed choices about their engagement with these systems. This is particularly important in sensitive areas like healthcare, finance, and law enforcement.\\n\\n3. **Accountability**: Clear notice and explanations hold organizations accountable for their automated decisions. If individuals understand how decisions are made, they can better contest or appeal those decisions if they believe they are unjust or incorrect.\\n\\n4. **Protection of Rights**: Legal requirements for notice and explanation help protect individuals' rights by ensuring they have access to information about how their data is used and how decisions that impact them are made. This is crucial in preventing discrimination and ensuring fair treatment.\\n\\n5. **Facilitating Recourse**: When individuals receive timely and understandable explanations, they are better equipped to seek recourse if they feel wronged by an automated decision. This can include appealing decisions or requesting human intervention.\\n\\n6. **Encouraging Ethical Use**: Legal requirements can encourage organizations to develop and implement automated systems ethically, ensuring that they consider the potential impacts on individuals and society as a whole.\\n\\nOverall, these requirements aim to create a framework where automated systems are used responsibly, with respect for individuals' rights and dignity (Source: [document name], p. [page number]).\",\n",
" 'contexts': [\"Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. Techniques used to automate the process of explaining such systems are under active research and improvement and such explanations can take many forms. Innovative companies and researchers are rising to the challenge and creating and deploying explanatory systems that can help the public better understand decisions that impact them. \\nWhile notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. \\n• A lawyer representing an older client with disabilities who had been cut off from Medicaid-funded home\\nhealth-care assistance couldn't determine why\\n, especially since the decision went against historical access\\npractices. In a court hearing, the lawyer learned from a witness that the state in which the older client\\nlived \\nhad recently adopted a new algorithm to determine eligibility.83 The lack of a timely explanation made it\\nharder \\nto understand and contest the decision.\\n•\\nA formal child welfare investigation is opened against a parent based on an algorithm and without the parent\\never \\nbeing notified that data was being collected and used as part of an algorithmic child maltreatment\\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\\ncontest a decision.\\n41\",\n",
" 'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nTailored to the level of risk. An assessment should be done to determine the level of risk of the auto -\\nmated system. In settings where the consequences are high as determined by a risk assessment, or extensive \\noversight is expected (e.g., in criminal justice or some public sector settings), explanatory mechanisms should be built into the system design so that the system’s full behavior can be explained in advance (i.e., only fully transparent models should be used), rather than as an after-the-decision interpretation. In other settings, the extent of explanation provided should be tailored to the risk level. \\nValid. The explanation provided by a system should accurately reflect the factors and the influences that led \\nto a particular decision, and should be meaningful for the particular customization based on purpose, target, and level of risk. While approximation and simplification may be necessary for the system to succeed based on the explanatory purpose and target of the explanation, or to account for the risk of fraud or other concerns related to revealing decision-making information, such simplifications should be done in a scientifically supportable way. Where appropriate based on the explanatory system, error ranges for the explanation should be calculated and included in the explanation, with the choice of presentation of such information balanced with usability and overall interface complexity concerns. \\nDemonstrate protections for notice and explanation \\nReporting. Summary reporting should document the determinations made based on the above consider -',\n",
" 'should not be used in education, work, housing, or in other contexts where the use of such surveillance \\ntechnologies is likely to limit rights, opportunities, or access. Whenever possible, you should have access to \\nreporting that confirms your data decisions have been respected and provides an assessment of the \\npotential impact of surveillance technologies on your rights, opportunities, or access. \\nNOTICE AND EXPLANATION\\nYou should know that an automated system is being used and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organiza\\n-\\ntion responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible. \\n6',\n",
" 'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nAn automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and \\nexplanations as to how and why a decision was made or an action was taken by the system. These expectations are explained below. \\nProvide clear, timely, understandable, and accessible notice of use and explanations \\nGenerally accessible plain language documentation. The entity responsible for using the automated \\nsystem should ensure that documentation describing the overall system (including any human components) is \\npublic and easy to find. The documentation should describe, in plain language, how the system works and how \\nany automated component is used to determine an action or decision. It should also include expectations about \\nreporting described throughout this framework, such as the algorithmic impact assessments described as \\npart of Algorithmic Discrimination Protections. \\nAccount able. Notices should clearly identify the entity r esponsible for designing each component of the \\nsystem and the entity using it. \\nTimely and up-to-date. Users should receive notice of the use of automated systems in advance of using or \\nwhile being impacted by the technolog y. An explanation should be available with the decision itself, or soon \\nthereafte r. Notice should be kept up-to-date and people impacted by the system should be notified of use case \\nor key functionality changes. \\nBrief and clear. Notices and explanations should be assessed, such as by research on users’ experiences, \\nincluding user testing, to ensure that the people using or impacted by the automated system are able to easily',\n",
" 'burdensome in both the process of requesting to opt-out and the human-driven alternative provided. \\nProvide timely human consideration and remedy by a fallback and escalation system in the event that an automated system fails, produces error, or you would like to appeal or con\\n-\\ntest its impacts on you \\nProportionate. The availability of human consideration and fallback, along with associated training and \\nsafeguards against human bias, should be proportionate to the potential of the automated system to meaning -\\nfully impact rights, opportunities, or access. Automated systems that have greater control over outcomes, provide input to high-stakes decisions, relate to sensitive domains, or otherwise have a greater potential to meaningfully impact rights, opportunities, or access should have greater availability (e.g., staffing) and over\\n-\\nsight of human consideration and fallback mechanisms. \\nAccessible. Mechanisms for human consideration and fallback, whether in-person, on paper, by phone, or \\notherwise provided, should be easy to find and use. These mechanisms should be tested to ensure that users who have trouble with the automated system are able to use human consideration and fallback, with the under\\n-\\nstanding that it may be these users who are most likely to need the human assistance. Similarly, it should be tested to ensure that users with disabilities are able to find and use human consideration and fallback and also request reasonable accommodations or modifications. \\nConvenient. Mechanisms for human consideration and fallback should not be unreasonably burdensome as \\ncompared to the automated system’s equivalent. \\n49',\n",
" 'You should know that an automated system is being used, \\nand understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automat\\n-\\ned systems should provide generally accessible plain language docu -\\nmentation including clear descriptions of the overall system func -\\ntioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and ex\\n-\\nplanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality chang\\n-\\nes. You should know how and why an outcome impacting you was de -\\ntermined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible. NOTICE AND EXPLANATION\\n40',\n",
" 'HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFALLBACK \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nThere are many reasons people may prefer not to use an automated system: the system can be flawed and can lead to \\nunintended outcomes; it may reinforce bias or be inaccessible; it may simply be inconvenient or unavailable; or it may replace a paper or manual process to which people had grown accustomed. Yet members of the public are often presented with no alternative, or are forced to endure a cumbersome process to reach a human decision-maker once they decide they no longer want to deal exclusively with the automated system or be impacted by its results. As a result of this lack of human reconsideration, many receive delayed access, or lose access, to rights, opportunities, benefits, and critical services. The American public deserves the assurance that, when rights, opportunities, or access are meaningfully at stake and there is a reasonable expectation of an alternative to an automated system, they can conve\\n-\\nniently opt out of an automated system and will not be disadvantaged for that choice. In some cases, such a human or other alternative may be required by law, for example it could be required as “reasonable accommodations” for people with disabilities.',\n",
" \"find notices and explanations, read them quickl y, and understand and act on them. This includes ensuring that \\nnotices and explanations are accessible to users with disabilities and are available in the language(s) and read-\\ning level appropriate for the audience. Notices and explanations may need to be available in multiple forms, \\n(e.g., on pape r, on a physical sign, or online), in order to meet these expectations and to be accessible to the \\nAmerican public. \\nProvide explanations as to how and why a decision was made or an action was taken by an \\nautomated system \\nTailored to the purpose. Explanations should be tailored to the specific purpose for which the user is \\nexpected to use the explanation, and should clearly state that purpose. An informational explanation might differ from an explanation provided to allow for the possibility of recourse, an appeal, or one provided in the context of a dispute or contestation process. For the purposes of this framework, 'explanation' should be construed broadly. An explanation need not be a plain-language statement about causality but could consist of any mechanism that allows the recipient to build the necessary understanding and intuitions to achieve the stated purpose. Tailoring should be assessed (e.g., via user experience research). \\nTailored to the target of the explanation. Explanations should be targeted to specific audiences and clearly state that audience. An explanation provided to the subject of a decision might differ from one provided to an advocate, or to a domain expert or decision maker. Tailoring should be assessed (e.g., via user experience research). \\n43\"],\n",
" 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiquery_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from ragas import evaluate\n",
"from ragas.metrics import (\n",
" faithfulness,\n",
" answer_relevancy,\n",
" answer_correctness,\n",
" context_recall,\n",
" context_precision,\n",
")\n",
"\n",
"metrics = [\n",
" faithfulness,\n",
" answer_relevancy,\n",
" context_recall,\n",
" context_precision,\n",
" answer_correctness,\n",
"]"
]
},
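{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the five RAGAS metrics measure: **faithfulness** (are the answer's claims supported by the retrieved contexts), **answer_relevancy** (does the answer address the question), **context_recall** (how much of the ground truth is covered by the retrieved contexts), **context_precision** (are the relevant contexts ranked near the top), and **answer_correctness** (semantic and factual agreement between the answer and the ground truth)."
]
},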
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f5257aea40624e62905d488461b543db",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/120 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"multiquery_results = evaluate(multiquery_dataset, metrics)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'faithfulness': 0.8968, 'answer_relevancy': 0.9532, 'context_recall': 0.8906, 'context_precision': 0.9207, 'answer_correctness': 0.6901}"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiquery_results"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>contexts</th>\n",
" <th>answer</th>\n",
" <th>ground_truth</th>\n",
" <th>faithfulness</th>\n",
" <th>answer_relevancy</th>\n",
" <th>context_recall</th>\n",
" <th>context_precision</th>\n",
" <th>answer_correctness</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What is the significance of providing notice a...</td>\n",
" <td>[Providing notice has long been a standard pra...</td>\n",
" <td>Providing notice and explanation as a legal re...</td>\n",
" <td>Providing notice and explanation as a legal re...</td>\n",
" <td>1.000000</td>\n",
" <td>0.971321</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.821299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>How can structured human feedback exercises, s...</td>\n",
" <td>[50 Participatory Engagement Methods \\nOn an ...</td>\n",
" <td>Structured human feedback exercises, such as G...</td>\n",
" <td>Structured human feedback exercises, such as G...</td>\n",
" <td>1.000000</td>\n",
" <td>0.992832</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.541222</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How do measurement gaps between laboratory and...</td>\n",
" <td>[49 early lifecycle TEVV approaches are develo...</td>\n",
" <td>Measurement gaps between laboratory and real-w...</td>\n",
" <td>Measurement gaps between laboratory and real-w...</td>\n",
" <td>0.958333</td>\n",
" <td>0.988752</td>\n",
" <td>1.0</td>\n",
" <td>0.876667</td>\n",
" <td>0.636556</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>How should data collection and use-case scope ...</td>\n",
" <td>[Data collection and use-case scope limits. Da...</td>\n",
" <td>To prevent 'mission creep' in automated system...</td>\n",
" <td>Data collection and use-case scope limits in a...</td>\n",
" <td>1.000000</td>\n",
" <td>0.923204</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.491425</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What action did the Federal Trade Commission t...</td>\n",
" <td>[alerts about location tracking—are brief, dir...</td>\n",
" <td>The Federal Trade Commission (FTC) took action...</td>\n",
" <td>FTC sued Kochava for selling data that tracks ...</td>\n",
" <td>0.400000</td>\n",
" <td>0.936866</td>\n",
" <td>0.0</td>\n",
" <td>0.125000</td>\n",
" <td>0.902212</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 What is the significance of providing notice a... \n",
"1 How can structured human feedback exercises, s... \n",
"2 How do measurement gaps between laboratory and... \n",
"3 How should data collection and use-case scope ... \n",
"4 What action did the Federal Trade Commission t... \n",
"\n",
" contexts \\\n",
"0 [Providing notice has long been a standard pra... \n",
"1 [50 Participatory Engagement Methods \\nOn an ... \n",
"2 [49 early lifecycle TEVV approaches are develo... \n",
"3 [Data collection and use-case scope limits. Da... \n",
"4 [alerts about location tracking—are brief, dir... \n",
"\n",
" answer \\\n",
"0 Providing notice and explanation as a legal re... \n",
"1 Structured human feedback exercises, such as G... \n",
"2 Measurement gaps between laboratory and real-w... \n",
"3 To prevent 'mission creep' in automated system... \n",
"4 The Federal Trade Commission (FTC) took action... \n",
"\n",
" ground_truth faithfulness \\\n",
"0 Providing notice and explanation as a legal re... 1.000000 \n",
"1 Structured human feedback exercises, such as G... 1.000000 \n",
"2 Measurement gaps between laboratory and real-w... 0.958333 \n",
"3 Data collection and use-case scope limits in a... 1.000000 \n",
"4 FTC sued Kochava for selling data that tracks ... 0.400000 \n",
"\n",
" answer_relevancy context_recall context_precision answer_correctness \n",
"0 0.971321 1.0 1.000000 0.821299 \n",
"1 0.992832 1.0 1.000000 0.541222 \n",
"2 0.988752 1.0 0.876667 0.636556 \n",
"3 0.923204 1.0 1.000000 0.491425 \n",
"4 0.936866 0.0 0.125000 0.902212 "
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiquery_results_df = multiquery_results.to_pandas()\n",
"multiquery_results_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"multiquery_results_df.to_csv(\"multiquery_ragas_results.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"multiquery_metrics_df = pd.DataFrame(list(multiquery_results.items()), columns=['Metric', 'MultiQuery'])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>MultiQuery</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.896804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.953211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.890625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.920732</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.690058</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric MultiQuery\n",
"0 faithfulness 0.896804\n",
"1 answer_relevancy 0.953211\n",
"2 context_recall 0.890625\n",
"3 context_precision 0.920732\n",
"4 answer_correctness 0.690058"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"multiquery_metrics_df"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"multiquery_metrics_df.to_csv(\"multiquery_metrics.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>Baseline</th>\n",
" <th>MultiQuery</th>\n",
" <th>Baseline -> MultiQuery</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.895359</td>\n",
" <td>0.896804</td>\n",
" <td>0.001445</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.955419</td>\n",
" <td>0.953211</td>\n",
" <td>-0.002208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.934028</td>\n",
" <td>0.890625</td>\n",
" <td>-0.043403</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.937500</td>\n",
" <td>0.920732</td>\n",
" <td>-0.016768</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.629267</td>\n",
" <td>0.690058</td>\n",
" <td>0.060791</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric Baseline MultiQuery Baseline -> MultiQuery\n",
"0 faithfulness 0.895359 0.896804 0.001445\n",
"1 answer_relevancy 0.955419 0.953211 -0.002208\n",
"2 context_recall 0.934028 0.890625 -0.043403\n",
"3 context_precision 0.937500 0.920732 -0.016768\n",
"4 answer_correctness 0.629267 0.690058 0.060791"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_baseline_multiquery = pd.merge(baseline_metrics, multiquery_metrics_df, on='Metric')\n",
"\n",
"df_baseline_multiquery['Baseline -> MultiQuery'] = df_baseline_multiquery['MultiQuery'] - df_baseline_multiquery['Baseline']\n",
"\n",
"df_baseline_multiquery"
]
},
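{
"cell_type": "markdown",
"metadata": {},
"source": [
"MultiQuery is effectively tied with the baseline on faithfulness (+0.001) and answer relevancy (-0.002), trades away some retrieval quality (context recall -0.043, context precision -0.017), and gains the most on answer correctness (+0.061). Next, a contextual-compression retriever is built on top of an MMR base retriever."
]
},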
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"compression_retriever = vectorstore.as_retriever(\n",
" search_type=\"mmr\",\n",
" search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
")"
]
},
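{
"cell_type": "markdown",
"metadata": {},
"source": [
"The retriever above uses Maximal Marginal Relevance (MMR): it fetches the top `fetch_k` (10) candidates by similarity, then keeps the `k` (4) that best balance relevance against diversity. A minimal sanity check is sketched below; it assumes a recent LangChain in which retrievers expose `invoke` (older releases use `get_relevant_documents` instead)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: peek at what the MMR base retriever returns before any compression.\n",
"docs = base_retriever.invoke(\"What are Trustworthy AI Characteristics?\")\n",
"for doc in docs:\n",
"    print(doc.metadata.get(\"page\"), \"-\", doc.page_content[:100])"
]
},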
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"from langchain.retrievers import ContextualCompressionRetriever\n",
"from langchain.retrievers.document_compressors import LLMChainExtractor\n",
"\n",
"compressor = LLMChainExtractor.from_llm(llm)\n",
"compression_retriever = ContextualCompressionRetriever(\n",
" base_compressor=compressor, base_retriever=compression_retriever\n",
")"
]
},
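{
"cell_type": "markdown",
"metadata": {},
"source": [
"To see the compression at work, the wrapped retriever can be invoked directly (a sketch, with the same `invoke` caveat as above): the extracted snippets should be noticeably shorter than the raw chunks the base retriever returns."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: the LLM extractor keeps only query-relevant excerpts, so these\n",
"# passages are typically shorter than the underlying chunks.\n",
"compressed_docs = compression_retriever.invoke(\"What are Trustworthy AI Characteristics?\")\n",
"for doc in compressed_docs:\n",
"    print(len(doc.page_content), \"chars:\", doc.page_content[:100])"
]
},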
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"contextual_compression_rag_chain = ConversationalRetrievalChain.from_llm(\n",
" llm,\n",
" retriever=compression_retriever,\n",
" memory=memory,\n",
" combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
" return_source_documents=True,\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'What are Trustworthy AI Characteristics?',\n",
" 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
" AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).')],\n",
" 'answer': 'Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).',\n",
" 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='Trustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with Harmful Bias Managed, Privacy Enhanced, Safe, Valid and Reliable'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of harmful uses. The NIST framework will consider and encompass principles such as transparency, accountability, and fairness during pre-design, design and development, deployment, use, and testing and evaluation of AI technologies and systems.'),\n",
" Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics: Accountable and Transparent, Privacy Enhanced, Safe, Secure and Resilient \\nTrustworthy AI Characteristics: Accountable and Transparent, Safe')]}"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contextual_compression_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"answers = []\n",
"contexts = []\n",
"\n",
"for question in test_questions:\n",
" response = contextual_compression_rag_chain.invoke({\"question\" : question})\n",
" answers.append(response[\"answer\"])\n",
" contexts.append([context.page_content for context in response[\"source_documents\"]])"
]
},
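{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the loop above shares a single `ConversationBufferMemory` across all test questions (and the chain was already invoked once before the loop), so earlier turns become chat history that can influence how later questions are condensed. If each question should be scored independently, the buffer can be cleared between iterations, as in the sketch below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: the same evaluation loop, but with a fresh conversation history per\n",
"# question. ConversationBufferMemory.clear() resets the stored messages.\n",
"answers = []\n",
"contexts = []\n",
"\n",
"for question in test_questions:\n",
"    memory.clear()  # drop prior turns so questions do not influence each other\n",
"    response = contextual_compression_rag_chain.invoke({\"question\": question})\n",
"    answers.append(response[\"answer\"])\n",
"    contexts.append([doc.page_content for doc in response[\"source_documents\"]])"
]
},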
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"contextual_compression_dataset = Dataset.from_dict({\n",
" \"question\" : test_questions,\n",
" \"answer\" : answers,\n",
" \"contexts\" : contexts,\n",
" \"ground_truth\" : test_groundtruths\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
" 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect their rights, opportunities, or access. This transparency helps build trust in the technology and the entities deploying it.\\n\\n2. **Empowerment**: When people receive clear explanations about how decisions are made by automated systems, they are better equipped to understand and contest those decisions if necessary. This is particularly important in sensitive areas like employment, credit, and legal proceedings, where outcomes can have profound impacts on individuals' lives.\\n\\n3. **Accountability**: Notice and explanation requirements hold organizations accountable for their automated systems. By clearly identifying the entities responsible for the design and use of these systems, it becomes easier to address any issues or biases that may arise.\\n\\n4. **Error Correction**: Providing notice allows individuals to identify and correct errors in automated decision-making processes. Without this knowledge, individuals may be left without recourse to challenge incorrect or unfair outcomes.\\n\\n5. **Public Confidence**: Consistent and clear communication about the use of automated systems can enhance public confidence in these technologies. When people understand how and why decisions are made, they are more likely to trust the systems and the organizations that use them.\\n\\n6. **Safety and Efficacy**: Clear explanations enable experts to verify the reasonableness of recommendations made by automated systems before they are enacted. This is crucial for ensuring that the systems operate safely and effectively.\\n\\nIn summary, notice and explanation requirements are essential for protecting individuals' rights, fostering accountability, and ensuring that automated systems are used responsibly and ethically (Source: [document name], p. [page number]).\",\n",
" 'contexts': ['Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. While notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. The lack of a timely explanation made it harder to understand and contest the decision. The lack of notice or an explanation makes it harder for those performing child maltreatment assessments to validate the risk assessment and denies parents knowledge that could help them contest a decision.',\n",
" 'You should know that an automated system is being used, and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible.',\n",
" 'An automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and explanations as to how and why a decision was made or an action was taken by the system. Notices should clearly identify the entity responsible for designing each component of the system and the entity using it. Users should receive notice of the use of automated systems in advance of using or while being impacted by the technology. An explanation should be available with the decision itself, or soon thereafter. Notice should be kept up-to-date and people impacted by the system should be notified of use case or key functionality changes. Notices and explanations should be assessed, such as by research on users’ experiences, including user testing, to ensure that the people using or impacted by the automated system are able to easily.',\n",
" 'NOTICE & \\nEXPLANATION \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nAutomated systems now determine opportunities, from employment to credit, and directly shape the American \\npublic’s experiences, from the courtroom to online classrooms, in ways that profoundly impact people’s lives. But this expansive impact is not always visible. An applicant might not know whether a person rejected their resume or a hiring algorithm moved them to the bottom of the list. A defendant in the courtroom might not know if a judge denying their bail is informed by an automated system that labeled them “high risk.” From correcting errors to contesting decisions, people are often denied the knowledge they need to address the impact of automated systems on their lives. Notice and explanations also serve an important safety and efficacy purpose, allowing experts to verify the reasonableness of a recommendation before enacting it. \\nIn order to guard against potential harms, the American public needs to know if an automated system is being used. Clear, brief, and understandable notice is a prerequisite for achieving the other protections in this framework. Like-\\nwise, the public is often unable to ascertain how or why an automated system has made a decision or contributed to a particular outcome. The decision-making processes of automated systems tend to be opaque, complex, and, therefore, unaccountable, whether by design or by omission. These factors can make explanations both more challenging and more important, and should not be used as a pretext to avoid explaining important decisions to the people impacted by those choices. In the context of automated systems, clear and valid explanations should be recognized as a baseline requirement.'],\n",
" 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contextual_compression_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b6c03ccbf50e4642b9433f2513fb83c3",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Evaluating: 0%| | 0/120 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"contextual_compression_results = evaluate(contextual_compression_dataset, metrics)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'faithfulness': 0.7491, 'answer_relevancy': 0.9140, 'context_recall': 0.7257, 'context_precision': 0.9051, 'answer_correctness': 0.5707}"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contextual_compression_results"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>question</th>\n",
" <th>contexts</th>\n",
" <th>answer</th>\n",
" <th>ground_truth</th>\n",
" <th>faithfulness</th>\n",
" <th>answer_relevancy</th>\n",
" <th>context_recall</th>\n",
" <th>context_precision</th>\n",
" <th>answer_correctness</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>What is the significance of providing notice a...</td>\n",
" <td>[Providing notice has long been a standard pra...</td>\n",
" <td>Providing notice and explanation as a legal re...</td>\n",
" <td>Providing notice and explanation as a legal re...</td>\n",
" <td>1.000000</td>\n",
" <td>0.971321</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.585260</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>How can structured human feedback exercises, s...</td>\n",
" <td>[AI Red -teaming \\nAI red -teaming is an evol...</td>\n",
" <td>Structured human feedback exercises, such as G...</td>\n",
" <td>Structured human feedback exercises, such as G...</td>\n",
" <td>1.000000</td>\n",
" <td>0.988309</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.320501</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>How do measurement gaps between laboratory and...</td>\n",
" <td>[Currently available pre -deployment TEVV proc...</td>\n",
" <td>Measurement gaps between laboratory and real-w...</td>\n",
" <td>Measurement gaps between laboratory and real-w...</td>\n",
" <td>0.958333</td>\n",
" <td>0.996595</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.597251</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>How should data collection and use-case scope ...</td>\n",
" <td>[Data collection should be limited in scope, w...</td>\n",
" <td>To prevent \"mission creep\" in automated system...</td>\n",
" <td>Data collection and use-case scope limits in a...</td>\n",
" <td>0.439024</td>\n",
" <td>0.922376</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.551606</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>What action did the Federal Trade Commission t...</td>\n",
" <td>[]</td>\n",
" <td>The Federal Trade Commission (FTC) took action...</td>\n",
" <td>FTC sued Kochava for selling data that tracks ...</td>\n",
" <td>0.833333</td>\n",
" <td>0.925072</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.529680</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" question \\\n",
"0 What is the significance of providing notice a... \n",
"1 How can structured human feedback exercises, s... \n",
"2 How do measurement gaps between laboratory and... \n",
"3 How should data collection and use-case scope ... \n",
"4 What action did the Federal Trade Commission t... \n",
"\n",
" contexts \\\n",
"0 [Providing notice has long been a standard pra... \n",
"1 [AI Red -teaming \\nAI red -teaming is an evol... \n",
"2 [Currently available pre -deployment TEVV proc... \n",
"3 [Data collection should be limited in scope, w... \n",
"4 [] \n",
"\n",
" answer \\\n",
"0 Providing notice and explanation as a legal re... \n",
"1 Structured human feedback exercises, such as G... \n",
"2 Measurement gaps between laboratory and real-w... \n",
"3 To prevent \"mission creep\" in automated system... \n",
"4 The Federal Trade Commission (FTC) took action... \n",
"\n",
" ground_truth faithfulness \\\n",
"0 Providing notice and explanation as a legal re... 1.000000 \n",
"1 Structured human feedback exercises, such as G... 1.000000 \n",
"2 Measurement gaps between laboratory and real-w... 0.958333 \n",
"3 Data collection and use-case scope limits in a... 0.439024 \n",
"4 FTC sued Kochava for selling data that tracks ... 0.833333 \n",
"\n",
" answer_relevancy context_recall context_precision answer_correctness \n",
"0 0.971321 1.0 1.0 0.585260 \n",
"1 0.988309 1.0 1.0 0.320501 \n",
"2 0.996595 1.0 1.0 0.597251 \n",
"3 0.922376 1.0 1.0 0.551606 \n",
"4 0.925072 0.0 0.0 0.529680 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contextual_compression_results_df = contextual_compression_results.to_pandas()\n",
"contextual_compression_results_df.head()"
]
},
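{
"cell_type": "markdown",
"metadata": {},
"source": [
"Row 4 is again the FTC question, and here the compressor filtered out every retrieved passage (`contexts` is an empty list), driving both `context_recall` and `context_precision` to 0.0; the answer was generated without any supporting context."
]
},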
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"contextual_compression_results_df.to_csv(\"contextual_compression_ragas_results.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"contextual_compression_metrics_df = pd.DataFrame(list(contextual_compression_results.items()), columns=['Metric', 'ContextualCompression'])"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>ContextualCompression</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.749092</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.913993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.725694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.905093</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.570685</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric ContextualCompression\n",
"0 faithfulness 0.749092\n",
"1 answer_relevancy 0.913993\n",
"2 context_recall 0.725694\n",
"3 context_precision 0.905093\n",
"4 answer_correctness 0.570685"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"contextual_compression_metrics_df"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"contextual_compression_metrics_df.to_csv(\"contextual_compression_metrics.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>Baseline</th>\n",
" <th>MultiQuery</th>\n",
" <th>ContextualCompression</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.895359</td>\n",
" <td>0.896804</td>\n",
" <td>0.749092</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.955419</td>\n",
" <td>0.953211</td>\n",
" <td>0.913993</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.934028</td>\n",
" <td>0.890625</td>\n",
" <td>0.725694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.937500</td>\n",
" <td>0.920732</td>\n",
" <td>0.905093</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.629267</td>\n",
" <td>0.690058</td>\n",
" <td>0.570685</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric Baseline MultiQuery ContextualCompression\n",
"0 faithfulness 0.895359 0.896804 0.749092\n",
"1 answer_relevancy 0.955419 0.953211 0.913993\n",
"2 context_recall 0.934028 0.890625 0.725694\n",
"3 context_precision 0.937500 0.920732 0.905093\n",
"4 answer_correctness 0.629267 0.690058 0.570685"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_baseline_multiquery = pd.merge(baseline_metrics, multiquery_metrics_df, on='Metric')\n",
"df_baseline_multiquery_contextual_compression = pd.merge(df_baseline_multiquery, contextual_compression_metrics_df, on='Metric')\n",
"\n",
"\n",
"df_baseline_multiquery_contextual_compression"
]
},
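{
"cell_type": "markdown",
"metadata": {},
"source": [
"Side by side: the baseline leads on answer relevancy, context recall, and context precision; MultiQuery edges ahead on faithfulness and answer correctness; contextual compression trails on every metric, with the largest drops in faithfulness and context recall."
]
},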
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Metric</th>\n",
" <th>Baseline</th>\n",
" <th>MultiQuery</th>\n",
" <th>ContextualCompression</th>\n",
" <th>HigestValue</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>faithfulness</td>\n",
" <td>0.895359</td>\n",
" <td>0.896804</td>\n",
" <td>0.749092</td>\n",
" <td>0.9 (MultiQuery)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>answer_relevancy</td>\n",
" <td>0.955419</td>\n",
" <td>0.953211</td>\n",
" <td>0.913993</td>\n",
" <td>0.96 (Baseline)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>context_recall</td>\n",
" <td>0.934028</td>\n",
" <td>0.890625</td>\n",
" <td>0.725694</td>\n",
" <td>0.93 (Baseline)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>context_precision</td>\n",
" <td>0.937500</td>\n",
" <td>0.920732</td>\n",
" <td>0.905093</td>\n",
" <td>0.94 (Baseline)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>answer_correctness</td>\n",
" <td>0.629267</td>\n",
" <td>0.690058</td>\n",
" <td>0.570685</td>\n",
" <td>0.69 (MultiQuery)</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Metric Baseline MultiQuery ContextualCompression \\\n",
"0 faithfulness 0.895359 0.896804 0.749092 \n",
"1 answer_relevancy 0.955419 0.953211 0.913993 \n",
"2 context_recall 0.934028 0.890625 0.725694 \n",
"3 context_precision 0.937500 0.920732 0.905093 \n",
"4 answer_correctness 0.629267 0.690058 0.570685 \n",
"\n",
" HigestValue \n",
"0 0.9 (MultiQuery) \n",
"1 0.96 (Baseline) \n",
"2 0.93 (Baseline) \n",
"3 0.94 (Baseline) \n",
"4 0.69 (MultiQuery) "
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_baseline_multiquery_contextual_compression['MaxValue'] = df_baseline_multiquery_contextual_compression[['Baseline', 'MultiQuery', 'ContextualCompression']].max(axis=1)\n",
"\n",
"df_baseline_multiquery_contextual_compression['MaxMetric'] = df_baseline_multiquery_contextual_compression[['Baseline', 'MultiQuery', 'ContextualCompression']].idxmax(axis=1)\n",
"\n",
"df_baseline_multiquery_contextual_compression['HigestValue'] = df_baseline_multiquery_contextual_compression['MaxValue'].round(2).astype(str) + ' (' + df_baseline_multiquery_contextual_compression['MaxMetric'] + ')'\n",
"\n",
"df_baseline_multiquery_contextual_compression = df_baseline_multiquery_contextual_compression.drop(columns=['MaxValue', 'MaxMetric'])\n",
"\n",
"df_baseline_multiquery_contextual_compression"
]
}
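,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, a minimal visualization sketch (assuming `matplotlib` is available): a grouped bar chart of the three retrieval strategies per RAGAS metric."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Sketch: one group of bars per metric, one bar per retrieval strategy.\n",
"ax = df_baseline_multiquery_contextual_compression.plot(\n",
"    x=\"Metric\",\n",
"    y=[\"Baseline\", \"MultiQuery\", \"ContextualCompression\"],\n",
"    kind=\"bar\",\n",
"    figsize=(10, 5),\n",
"    rot=20,\n",
")\n",
"ax.set_ylabel(\"Score\")\n",
"ax.set_title(\"RAGAS metrics by retrieval strategy\")\n",
"plt.tight_layout()\n",
"plt.show()"
]
}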
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|