diff --git "a/Retrieval_Strat_Eval.ipynb" "b/Retrieval_Strat_Eval.ipynb"
--- "a/Retrieval_Strat_Eval.ipynb"
+++ "b/Retrieval_Strat_Eval.ipynb"
@@ -6,12 +6,12 @@
"metadata": {},
"outputs": [],
"source": [
- "!pip install langchain langchain_community langchain_openai chromadb pypdf langsmith qdrant-client ragas pandas"
+ "!pip install langchain langchain_community langchain_openai pypdf langsmith qdrant-client ragas pandas"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
@@ -25,7 +25,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
@@ -36,7 +36,7 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -46,12 +46,11 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.document_loaders import PyPDFLoader\n",
- "from langchain_community.document_loaders.sitemap import SitemapLoader\n",
"from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_community.vectorstores.chroma import Chroma\n",
@@ -64,7 +63,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -74,7 +73,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@@ -86,7 +85,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -99,7 +98,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -108,7 +107,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -117,7 +116,7 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -184,7 +183,7 @@
"4 answer_correctness 0.629267"
]
},
- "execution_count": 12,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -195,39 +194,16 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 29,
"metadata": {},
- "outputs": [
- {
- "ename": "NameError",
- "evalue": "name 'qdrant_client' is not defined",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
- "Cell \u001b[0;32mIn[21], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m qdrant_client\u001b[38;5;241m.\u001b[39mcreate_collection(\n\u001b[1;32m 2\u001b[0m collection_name\u001b[38;5;241m=\u001b[39mCOLLECTION_NAME\u001b[38;5;241m+\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMultiQuery\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 3\u001b[0m vectors_config\u001b[38;5;241m=\u001b[39mVectorParams(size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1536\u001b[39m, distance\u001b[38;5;241m=\u001b[39mDistance\u001b[38;5;241m.\u001b[39mCOSINE),\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m qdrant_vector_store \u001b[38;5;241m=\u001b[39m QdrantVectorStore(\n\u001b[1;32m 7\u001b[0m client\u001b[38;5;241m=\u001b[39mqdrant_client,\n\u001b[1;32m 8\u001b[0m collection_name\u001b[38;5;241m=\u001b[39mCOLLECTION_NAME\u001b[38;5;241m+\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMultiQuery\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 9\u001b[0m embedding\u001b[38;5;241m=\u001b[39membeddings,\n\u001b[1;32m 10\u001b[0m )\n\u001b[1;32m 12\u001b[0m qdrant_vector_store\u001b[38;5;241m.\u001b[39madd_documents(pdf_docs)\n",
- "\u001b[0;31mNameError\u001b[0m: name 'qdrant_client' is not defined"
- ]
- }
- ],
+ "outputs": [],
"source": [
- "qdrant_client.create_collection(\n",
- " collection_name=COLLECTION_NAME+\"MultiQuery\",\n",
- " vectors_config=VectorParams(size=1536, distance=Distance.COSINE),\n",
- ")\n",
- "\n",
- "qdrant_vector_store = QdrantVectorStore(\n",
- " client=qdrant_client,\n",
- " collection_name=COLLECTION_NAME+\"MultiQuery\",\n",
- " embedding=embeddings,\n",
- ")\n",
- "\n",
- "qdrant_vector_store.add_documents(pdf_docs)"
+ "baseline_metrics.rename(columns={'MediumChunk': 'Baseline'}, inplace=True)"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -248,7 +224,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -262,7 +238,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -275,7 +251,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -308,7 +284,7 @@
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -323,7 +299,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -331,20 +307,20 @@
"text/plain": [
"{'question': 'What are Trustworthy AI Characteristics?',\n",
" 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
- " AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should have clear accountability structures, and their decision-making processes should be transparent to users and stakeholders.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand how AI systems make decisions, which helps build trust and allows for better oversight.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be designed to minimize bias and ensure fairness, preventing discrimination against any group.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data, ensuring that data handling practices are secure and compliant with regulations.\\n\\n5. **Safe**: AI systems must be safe to use, meaning they should not cause harm to users or society.\\n\\n6. **Valid and Reliable**: AI systems should produce accurate and consistent results, ensuring their outputs can be trusted.\\n\\nThese characteristics are crucial for fostering trust in AI technologies and ensuring they are used responsibly across various applications (Source: NIST AI Risk Management Framework, p. 57).')],\n",
- " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should have clear accountability structures, and their decision-making processes should be transparent to users and stakeholders.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand how AI systems make decisions, which helps build trust and allows for better oversight.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be designed to minimize bias and ensure fairness, preventing discrimination against any group.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data, ensuring that data handling practices are secure and compliant with regulations.\\n\\n5. **Safe**: AI systems must be safe to use, meaning they should not cause harm to users or society.\\n\\n6. **Valid and Reliable**: AI systems should produce accurate and consistent results, ensuring their outputs can be trusted.\\n\\nThese characteristics are crucial for fostering trust in AI technologies and ensuring they are used responsibly across various applications (Source: NIST AI Risk Management Framework, p. 57).',\n",
- " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'a2242d1731b243988149f0fb5867e7a1', '_collection_name': 'Midterm Eval'}, page_content='There may also be concerns about emotional entanglement between humans and GAI systems, which \\ncould lead to negative psychological impacts . \\nTrustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with \\nHarmful Bias Managed, Privacy Enhanced, Safe , Valid and Reliable \\n2.8. Information Integrity \\nInformation integrity describes the “ spectrum of information and associated patterns of its creation, \\nexchange, and consumption in society .” High-integrity information can be trusted; “distinguishes fact \\nfrom fiction, opinion, and inference; acknowledges uncertainties; and is transparent about its level of \\nvetting. This information can be linked to the original source(s) with appropriate evidence. High- integrity \\ninformation is also accurate and reliable, can be verified and authenticated, has a clear chain of custody, \\nand creates reasonable expectations about when its validity may expire. ”11 \\n \\n \\n11 This definition of information integrity is derived from the 2022 White House Roadmap for Researchers on \\nPriorities Related to Information Integrity Research and Development.'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 0, '_id': '7a1a2660ff514f7b96626a5a0faffcf4', '_collection_name': 'Midterm Eval'}, page_content='NIST Trustworthy and Responsible AI \\nNIST AI 600 -1 \\nArtificial Intelligence Risk Management \\nFramework: Generative Artificial \\nIntelligence Profile \\n \\n \\nThis publication is available free of charge from: \\nhttps://doi.org/10.6028/NIST.AI.600 -1'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 21, '_id': '63aa95de28fa47af8dd59a1d88d431e0', '_collection_name': 'Midterm Eval'}, page_content=\"SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nSome U.S government agencies have developed specific frameworks for ethical use of AI \\nsystems. The Department of Energy (DOE) has activated the AI Advancement Council that oversees coordina -\\ntion and advises on implementation of the DOE AI Strategy and addresses issues and/or escalations on the \\nethical use and development of AI systems.20 The Department of Defense has adopted Artificial Intelligence \\nEthical Principles, and tenets for Responsible Artificial Intelligence specifically tailored to its national \\nsecurity and defense activities.21 Similarl y, the U.S. Intelligence Community (IC) has developed the Principles \\nof Artificial Intelligence Ethics for the Intelligence Community to guide personnel on whether and how to \\ndevelop and use AI in furtherance of the IC's mission, as well as an AI Ethics Framework to help implement \\nthese principles.22\\nThe National Science Foundation (NSF) funds extensive research to help foster the \\ndevelopment of automated systems that adhere to and advance their safety, security and \\neffectiveness. Multiple NSF programs support research that directly addresses many of these principles: \\nthe National AI Research Institutes23 support research on all aspects of safe, trustworth y, fai r, and explainable \\nAI algorithms and systems; the Cyber Physical Systems24 program supports research on developing safe \\nautonomous and cyber physical systems with AI components; the Secure and Trustworthy Cyberspace25 \\nprogram supports research on cybersecurity and privacy enhancing technologies in automated systems; the \\nFormal Methods in the Field26 program supports research on rigorous formal verification and analysis of\"),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 60, '_id': '4bad29f0eeb54ac3a470c70cb1c14066', '_collection_name': 'Midterm Eval'}, page_content='57 National Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix B: \\nHow AI Risks Differ from Traditional Software Risks . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_B \\nNational Institute of Standards and Technology (2023) AI RMF Playbook . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/Playbook \\nNational Institue of Standards and Technology (2023) Framing Risk \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/1- sec-risk \\nNational Institu te of Standards and Technology (2023) The Language of Trustworthy AI: An In- Depth \\nGlossary of Terms https://airc.nist.gov/AI_RMF_Knowledge_Base/Glossary \\nNational Institue of Standards and Technology (2022) Towards a Standard for Identifying and Managing \\nBias in Artificial Intelligence https://www.nist.gov/publications/towards -standard -identifying -and-\\nmanaging- bias-artificial -intelligence \\nNorthcutt, C. et al. (2021) Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks. \\narXiv . https://arxiv.org/pdf/2103.14749 \\nOECD (2023) \"Advancing accountability in AI: Governing and managing risks throughout the lifecycle for \\ntrustworthy AI\", OECD Digital Economy Papers , No. 349, OECD Publishing, Paris . \\nhttps://doi.org/10.1787/2448f04b- en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers , No. 16, OECD \\nPublishing, Paris . https://doi.org/10.1787/d1a8d965- en \\nOpenAI (2023) GPT-4 System Card . https://cdn.openai.com/papers/gpt -4-system -card.pdf \\nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity? ICLR . \\nhttps://arxiv.org/pdf/2309.05196 \\nPark, P. et. al. (2024) AI deception: A survey of examples, risks, and potential solutions. Patterns, 5(5). \\narXiv . https://arxiv.org/pdf/2308.14752'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'b6f8f70a683546d8a21da2f3104c694e', '_collection_name': 'Midterm Eval'}, page_content='SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nExecutive Order 13960 on Promoting the Use of Trustworthy Artificial Intelligence in the \\nFederal Government requires that certain federal agencies adhere to nine principles when \\ndesigning, developing, acquiring, or using AI for purposes other than national security or \\ndefense. These principles—while taking into account the sensitive law enforcement and other contexts in which \\nthe federal government may use AI, as opposed to private sector use of AI—require that AI is: (a) lawful and \\nrespectful of our Nation’s values; (b) purposeful and performance-driven; (c) accurate, reliable, and effective; (d) \\nsafe, secure, and resilient; (e) understandable; (f ) responsible and traceable; (g) regularly monitored; (h) transpar -\\nent; and, (i) accountable. The Blueprint for an AI Bill of Rights is consistent with the Executive Order. \\nAffected agencies across the federal government have released AI use case inventories13 and are implementing \\nplans to bring those AI systems into compliance with the Executive Order or retire them. \\nThe law and policy landscape for motor vehicles shows that strong safety regulations—and \\nmeasures to address harms when they occur—can enhance innovation in the context of com-\\nplex technologies. Cars, like automated digital systems, comprise a complex collection of components. \\nThe National Highway Traffic Safety Administration,14 through its rigorous standards and independent \\nevaluation, helps make sure vehicles on our roads are safe without limiting manufacturers’ ability to \\ninnovate.15 At the same time, rules of the road are implemented locally to impose contextually appropriate \\nrequirements on drivers, such as slowing down near schools or playgrounds.16'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': '8558a75fa5164bca9f5c89597a3a6085', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of \\nharmful uses. The NIST framework will consider and encompass principles such as \\ntransparency, accountability, and fairness during pre-design, design and development, deployment, use, \\nand testing and evaluation of AI technologies and systems. It is expected to be released in the winter of 2022-23. \\n21'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': '28bb3a622da24104a1a99ef79ae7ddc1', '_collection_name': 'Midterm Eval'}, page_content='requirements on drivers, such as slowing down near schools or playgrounds.16\\nFrom large companies to start-ups, industry is providing innovative solutions that allow \\norganizations to mitigate risks to the safety and efficacy of AI systems, both before \\ndeployment and through monitoring over time.17 These innovative solutions include risk \\nassessments, auditing mechanisms, assessment of organizational procedures, dashboards to allow for ongoing \\nmonitoring, documentation procedures specific to model assessments, and many other strategies that aim to \\nmitigate risks posed by the use of AI to companies’ reputation, legal responsibilities, and other product safety \\nand effectiveness concerns. \\nThe Office of Management and Budget (OMB) has called for an expansion of opportunities \\nfor meaningful stakeholder engagement in the design of programs and services. OMB also \\npoints to numerous examples of effective and proactive stakeholder engagement, including the Community-\\nBased Participatory Research Program developed by the National Institutes of Health and the participatory \\ntechnology assessments developed by the National Oceanic and Atmospheric Administration.18\\nThe National Institute of Standards and Technology (NIST) is developing a risk \\nmanagement framework to better manage risks posed to individuals, organizations, and \\nsociety by AI.19 The NIST AI Risk Management Framework, as mandated by Congress, is intended for \\nvoluntary use to help incorporate trustworthiness considerations into the design, development, use, and \\nevaluation of AI products, services, and systems. The NIST framework is being developed through a consensus-\\ndriven, open, transparent, and collaborative process that includes workshops and other opportunities to provide \\ninput. The NIST framework aims to foster the development of innovative approaches to address \\ncharacteristics of trustworthiness including accuracy, explainability and interpretability, reliability, privacy,'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 41, '_id': '382ba83020394d4aa556d2ea6ac5fd0d', '_collection_name': 'Midterm Eval'}, page_content='Information Integrity \\nMS-3.3-003 Evaluate potential biases and stereotypes that could emerge from the AI -\\ngenerated content using appropriate methodologies including computational testing methods as well as evaluating structured feedback input. Harmful Bias and Homogenization'),\n",
- " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 50, '_id': 'a5227da1038247eaac3316935b972942', '_collection_name': 'Midterm Eval'}, page_content='warrant additional human review, tracking and documentation, and greater management oversight. \\nAI technology can produce varied outputs in multiple modalities and present many classes of user \\ninterfaces. This leads to a broader set of AI Actors interacting with GAI systems for widely differing \\napplications and contexts of use. These can include data labeling and preparation, development of GAI \\nmodels, content moderation, code generation and review, text generation and editing, image and video \\ngeneration, summarization, search, and chat. These activities can take place within organizational \\nsettings or in the public domain. \\nOrganizations can restrict AI applications that cause harm, exceed stated risk tolerances, or that conflict with their tolerances or values. Governance tools and protocols that are applied to other types of AI systems can be applied to GAI systems. These p lans and actions include: \\n• Accessibility and reasonable accommodations \\n• AI actor credentials and qualifications \\n• Alignment to organizational values • Auditing and assessment \\n• Change -management controls \\n• Commercial use \\n• Data provenance')]}"
+ " AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).')],\n",
+ " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that artificial intelligence systems should possess to ensure they are reliable, ethical, and beneficial to society. These characteristics include:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their operations can be understood and scrutinized. This means providing clear documentation and explanations of how decisions are made.\\n\\n2. **Explainable and Interpretable**: Users should be able to understand the reasoning behind AI decisions. This is crucial for trust and for users to make informed choices based on AI outputs.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems should be developed and tested to minimize biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data or algorithms.\\n\\n4. **Privacy Enhanced**: AI systems should prioritize user privacy and data protection, ensuring that personal information is handled securely and ethically.\\n\\n5. **Safe**: AI systems must be designed to operate safely and reliably, minimizing risks of harm to users and society.\\n\\n6. **Valid and Reliable**: AI systems should produce consistent and accurate results, ensuring that they can be trusted to perform their intended functions effectively.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they are used responsibly in various applications (Source: NIST AI Risk Management Framework, p. 57).',\n",
+ " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='There may also be concerns about emotional entanglement between humans and GAI systems, which \\ncould lead to negative psychological impacts . \\nTrustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with \\nHarmful Bias Managed, Privacy Enhanced, Safe , Valid and Reliable \\n2.8. Information Integrity \\nInformation integrity describes the “ spectrum of information and associated patterns of its creation, \\nexchange, and consumption in society .” High-integrity information can be trusted; “distinguishes fact \\nfrom fiction, opinion, and inference; acknowledges uncertainties; and is transparent about its level of \\nvetting. This information can be linked to the original source(s) with appropriate evidence. High- integrity \\ninformation is also accurate and reliable, can be verified and authenticated, has a clear chain of custody, \\nand creates reasonable expectations about when its validity may expire. ”11 \\n \\n \\n11 This definition of information integrity is derived from the 2022 White House Roadmap for Researchers on \\nPriorities Related to Information Integrity Research and Development.'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 0, '_id': '8bad320d25b64bae949445cf2c427d18', '_collection_name': 'Midterm Eval'}, page_content='NIST Trustworthy and Responsible AI \\nNIST AI 600 -1 \\nArtificial Intelligence Risk Management \\nFramework: Generative Artificial \\nIntelligence Profile \\n \\n \\nThis publication is available free of charge from: \\nhttps://doi.org/10.6028/NIST.AI.600 -1'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 21, '_id': '91c6953f48734236907d2797a0c07971', '_collection_name': 'Midterm Eval'}, page_content=\"SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nSome U.S government agencies have developed specific frameworks for ethical use of AI \\nsystems. The Department of Energy (DOE) has activated the AI Advancement Council that oversees coordina -\\ntion and advises on implementation of the DOE AI Strategy and addresses issues and/or escalations on the \\nethical use and development of AI systems.20 The Department of Defense has adopted Artificial Intelligence \\nEthical Principles, and tenets for Responsible Artificial Intelligence specifically tailored to its national \\nsecurity and defense activities.21 Similarl y, the U.S. Intelligence Community (IC) has developed the Principles \\nof Artificial Intelligence Ethics for the Intelligence Community to guide personnel on whether and how to \\ndevelop and use AI in furtherance of the IC's mission, as well as an AI Ethics Framework to help implement \\nthese principles.22\\nThe National Science Foundation (NSF) funds extensive research to help foster the \\ndevelopment of automated systems that adhere to and advance their safety, security and \\neffectiveness. Multiple NSF programs support research that directly addresses many of these principles: \\nthe National AI Research Institutes23 support research on all aspects of safe, trustworth y, fai r, and explainable \\nAI algorithms and systems; the Cyber Physical Systems24 program supports research on developing safe \\nautonomous and cyber physical systems with AI components; the Secure and Trustworthy Cyberspace25 \\nprogram supports research on cybersecurity and privacy enhancing technologies in automated systems; the \\nFormal Methods in the Field26 program supports research on rigorous formal verification and analysis of\"),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 60, '_id': '30c836d6cf9a481c9cf48c580209d301', '_collection_name': 'Midterm Eval'}, page_content='57 National Institute of Standards and Technology (2023) AI Risk Management Framework, Appendix B: \\nHow AI Risks Differ from Traditional Software Risks . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Appendices/Appendix_B \\nNational Institute of Standards and Technology (2023) AI RMF Playbook . \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/Playbook \\nNational Institue of Standards and Technology (2023) Framing Risk \\nhttps://airc.nist.gov/AI_RMF_Knowledge_Base/AI_RMF/Foundational_Information/1- sec-risk \\nNational Institu te of Standards and Technology (2023) The Language of Trustworthy AI: An In- Depth \\nGlossary of Terms https://airc.nist.gov/AI_RMF_Knowledge_Base/Glossary \\nNational Institue of Standards and Technology (2022) Towards a Standard for Identifying and Managing \\nBias in Artificial Intelligence https://www.nist.gov/publications/towards -standard -identifying -and-\\nmanaging- bias-artificial -intelligence \\nNorthcutt, C. et al. (2021) Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks. \\narXiv . https://arxiv.org/pdf/2103.14749 \\nOECD (2023) \"Advancing accountability in AI: Governing and managing risks throughout the lifecycle for \\ntrustworthy AI\", OECD Digital Economy Papers , No. 349, OECD Publishing, Paris . \\nhttps://doi.org/10.1787/2448f04b- en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers , No. 16, OECD \\nPublishing, Paris . https://doi.org/10.1787/d1a8d965- en \\nOpenAI (2023) GPT-4 System Card . https://cdn.openai.com/papers/gpt -4-system -card.pdf \\nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity? ICLR . \\nhttps://arxiv.org/pdf/2309.05196 \\nPark, P. et. al. (2024) AI deception: A survey of examples, risks, and potential solutions. Patterns, 5(5). \\narXiv . https://arxiv.org/pdf/2308.14752'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': '8c7c9954577b47f390a8374bb6582294', '_collection_name': 'Midterm Eval'}, page_content='SAFE AND EFFECTIVE \\nSYSTEMS \\nHOW THESE PRINCIPLES CAN MOVE INTO PRACTICE\\nReal-life examples of how these principles can become reality, through laws, policies, and practical \\ntechnical and sociotechnical approaches to protecting rights, opportunities, and access. \\nExecutive Order 13960 on Promoting the Use of Trustworthy Artificial Intelligence in the \\nFederal Government requires that certain federal agencies adhere to nine principles when \\ndesigning, developing, acquiring, or using AI for purposes other than national security or \\ndefense. These principles—while taking into account the sensitive law enforcement and other contexts in which \\nthe federal government may use AI, as opposed to private sector use of AI—require that AI is: (a) lawful and \\nrespectful of our Nation’s values; (b) purposeful and performance-driven; (c) accurate, reliable, and effective; (d) \\nsafe, secure, and resilient; (e) understandable; (f ) responsible and traceable; (g) regularly monitored; (h) transpar -\\nent; and, (i) accountable. The Blueprint for an AI Bill of Rights is consistent with the Executive Order. \\nAffected agencies across the federal government have released AI use case inventories13 and are implementing \\nplans to bring those AI systems into compliance with the Executive Order or retire them. \\nThe law and policy landscape for motor vehicles shows that strong safety regulations—and \\nmeasures to address harms when they occur—can enhance innovation in the context of com-\\nplex technologies. Cars, like automated digital systems, comprise a complex collection of components. \\nThe National Highway Traffic Safety Administration,14 through its rigorous standards and independent \\nevaluation, helps make sure vehicles on our roads are safe without limiting manufacturers’ ability to \\ninnovate.15 At the same time, rules of the road are implemented locally to impose contextually appropriate \\nrequirements on drivers, such as slowing down near schools or playgrounds.16'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of \\nharmful uses. The NIST framework will consider and encompass principles such as \\ntransparency, accountability, and fairness during pre-design, design and development, deployment, use, \\nand testing and evaluation of AI technologies and systems. It is expected to be released in the winter of 2022-23. \\n21'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 63, '_id': '4b61d1ab875c4c3a94bdbe35b3e8b18a', '_collection_name': 'Midterm Eval'}, page_content='www.analyticsinsight.net/top-progressive-companies-building-ethical-ai-to-look-out-for-\\nin-2021/ https://www.technologyreview.com/2021/01/15/1016183/ai-ethics-startups/; Disha Sinha. Top\\nProgressive Companies Building Ethical AI to Look Out for in 2021. Analytics Insight . June 30, 2021.\\n18.Office of Management and Budget. Study to Identify Methods to Assess Equity: Report to the President .\\nAug. 2021. https://www.whitehouse.gov/wp-content/uploads/2021/08/OMB-Report-on-E013985-\\nImplementation_508-Compliant-Secure-v1.1.pdf\\n19.National Institute of Standards and Technology. AI Risk Management Framework. Accessed May 23,\\n2022. https://www.nist.gov/itl/ai-risk-management-framework\\n20. U.S. Department of Energy. U.S. Department of Energy Establishes Artificial Intelligence Advancement\\nCouncil. U.S. Department of Energy Artificial Intelligence and Technology Office. April 18, 2022. https://\\nwww.energy.gov/ai/articles/us-department-energy-establishes-artificial-intelligence-advancement-council\\n21.Department of Defense. U.S Department of Defense Responsible Artificial Intelligence Strategy and\\nImplementation Pathway. Jun. 2022. https://media.defense.gov/2022/Jun/22/2003022604/-1/-1/0/\\nDepartment-of-Defense-Responsible-Artificial-Intelligence-Strategy-and-Implementation-\\nPathway.PDF\\n22. Director of National Intelligence. Principles of Artificial Intelligence Ethics for the Intelligence\\nCommunity. https://www.dni.gov/index.php/features/2763-principles-of-artificial-intelligence-ethics-for-\\nthe-intelligence-community\\n64'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 50, '_id': '02d90efce1624306ab5bf45d1f8cb1db', '_collection_name': 'Midterm Eval'}, page_content='warrant additional human review, tracking and documentation, and greater management oversight. \\nAI technology can produce varied outputs in multiple modalities and present many classes of user \\ninterfaces. This leads to a broader set of AI Actors interacting with GAI systems for widely differing \\napplications and contexts of use. These can include data labeling and preparation, development of GAI \\nmodels, content moderation, code generation and review, text generation and editing, image and video \\ngeneration, summarization, search, and chat. These activities can take place within organizational \\nsettings or in the public domain. \\nOrganizations can restrict AI applications that cause harm, exceed stated risk tolerances, or that conflict with their tolerances or values. Governance tools and protocols that are applied to other types of AI systems can be applied to GAI systems. These p lans and actions include: \\n• Accessibility and reasonable accommodations \\n• AI actor credentials and qualifications \\n• Alignment to organizational values • Auditing and assessment \\n• Change -management controls \\n• Commercial use \\n• Data provenance'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics: Accountable and Transparent, Privacy Enhanced, Safe, Secure and \\nResilient \\n2.5. Environmental Impacts \\nTraining, maint aining, and operating (running inference on) GAI systems are resource -intensive activities , \\nwith potentially large energy and environmental footprints. Energy and carbon emissions vary based on \\nwhat is being done with the GAI model (i.e., pre -training, fine -tuning, inference), the modality of the \\ncontent , hardware used, and type of task or application . \\nCurrent e stimates suggest that training a single transformer LLM can emit as much carbon as 300 round-\\ntrip flights between San Francisco and New York. In a study comparing energy consumption and carbon \\nemissions for LLM inference, generative tasks ( e.g., text summarization) were found to be more energy - \\nand carbon -i ntensive th an discriminative or non- generative tasks (e.g., text classification). \\nMethods for creating smaller versions of train ed models, such as model distillation or compression, \\ncould reduce environmental impacts at inference time, but training and tuning such models may still \\ncontribute to their environmental impacts . Currently there is no agreed upon method to estimate \\nenvironmental impacts from GAI . \\nTrustworthy AI Characteristics: Accountable and Transparent, Safe \\n2.6. Harmful Bias and Homogenization \\nBias exists in many forms and can become ingrained in automated systems. AI systems , including GAI \\nsystems, can increase the speed and scale at which harmful biases manifest and are acted upon, \\npotentially perpetuati ng and amplify ing harms to individuals, groups, communities, organizations, and \\nsociety . For example, when prompted to generate images of CEOs, doctors, lawyers, and judges, current \\ntext-to-image models underrepresent women and/or racial minorities , and people with disabilities . \\nImage generator models have also produce d biased or stereotyped output for various demographic')]}"
]
},
- "execution_count": 19,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -355,7 +331,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -367,6 +343,1103 @@
" answers.append(response[\"answer\"])\n",
" contexts.append([context.page_content for context in response[\"source_documents\"]])"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from datasets import Dataset\n",
+ "\n",
+ "multiquery_dataset = Dataset.from_dict({\n",
+ " \"question\" : test_questions,\n",
+ " \"answer\" : answers,\n",
+ " \"contexts\" : contexts,\n",
+ " \"ground_truth\" : test_groundtruths\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
+ " 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect them. This transparency helps build trust between the public and the organizations deploying these systems.\\n\\n2. **Informed Consent**: By notifying individuals about the use of automated systems, organizations allow people to make informed choices about their engagement with these systems. This is particularly important in sensitive areas like healthcare, finance, and law enforcement.\\n\\n3. **Accountability**: Clear notice and explanations hold organizations accountable for their automated decisions. If individuals understand how decisions are made, they can better contest or appeal those decisions if they believe they are unjust or incorrect.\\n\\n4. **Protection of Rights**: Legal requirements for notice and explanation help protect individuals' rights by ensuring they have access to information about how their data is used and how decisions that impact them are made. This is crucial in preventing discrimination and ensuring fair treatment.\\n\\n5. **Facilitating Recourse**: When individuals receive timely and understandable explanations, they are better equipped to seek recourse if they feel wronged by an automated decision. This can include appealing decisions or requesting human intervention.\\n\\n6. **Encouraging Ethical Use**: Legal requirements can encourage organizations to develop and implement automated systems ethically, ensuring that they consider the potential impacts on individuals and society as a whole.\\n\\nOverall, these requirements aim to create a framework where automated systems are used responsibly, with respect for individuals' rights and dignity (Source: [document name], p. [page number]).\",\n",
+ " 'contexts': [\"Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. Techniques used to automate the process of explaining such systems are under active research and improvement and such explanations can take many forms. Innovative companies and researchers are rising to the challenge and creating and deploying explanatory systems that can help the public better understand decisions that impact them. \\nWhile notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. \\n• A lawyer representing an older client with disabilities who had been cut off from Medicaid-funded home\\nhealth-care assistance couldn't determine why\\n, especially since the decision went against historical access\\npractices. In a court hearing, the lawyer learned from a witness that the state in which the older client\\nlived \\nhad recently adopted a new algorithm to determine eligibility.83 The lack of a timely explanation made it\\nharder \\nto understand and contest the decision.\\n•\\nA formal child welfare investigation is opened against a parent based on an algorithm and without the parent\\never \\nbeing notified that data was being collected and used as part of an algorithmic child maltreatment\\nrisk assessment.84 The lack of notice or an explanation makes it harder for those performing child\\nmaltreatment assessments to validate the risk assessment and denies parents knowledge that could help them\\ncontest a decision.\\n41\",\n",
+ " 'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nTailored to the level of risk. An assessment should be done to determine the level of risk of the auto -\\nmated system. In settings where the consequences are high as determined by a risk assessment, or extensive \\noversight is expected (e.g., in criminal justice or some public sector settings), explanatory mechanisms should be built into the system design so that the system’s full behavior can be explained in advance (i.e., only fully transparent models should be used), rather than as an after-the-decision interpretation. In other settings, the extent of explanation provided should be tailored to the risk level. \\nValid. The explanation provided by a system should accurately reflect the factors and the influences that led \\nto a particular decision, and should be meaningful for the particular customization based on purpose, target, and level of risk. While approximation and simplification may be necessary for the system to succeed based on the explanatory purpose and target of the explanation, or to account for the risk of fraud or other concerns related to revealing decision-making information, such simplifications should be done in a scientifically supportable way. Where appropriate based on the explanatory system, error ranges for the explanation should be calculated and included in the explanation, with the choice of presentation of such information balanced with usability and overall interface complexity concerns. \\nDemonstrate protections for notice and explanation \\nReporting. Summary reporting should document the determinations made based on the above consider -',\n",
+ " 'should not be used in education, work, housing, or in other contexts where the use of such surveillance \\ntechnologies is likely to limit rights, opportunities, or access. Whenever possible, you should have access to \\nreporting that confirms your data decisions have been respected and provides an assessment of the \\npotential impact of surveillance technologies on your rights, opportunities, or access. \\nNOTICE AND EXPLANATION\\nYou should know that an automated system is being used and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organiza\\n-\\ntion responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible. \\n6',\n",
+ " 'NOTICE & \\nEXPLANATION \\nWHAT SHOULD BE EXPECTED OF AUTOMATED SYSTEMS\\nThe expectations for automated systems are meant to serve as a blueprint for the development of additional \\ntechnical standards and practices that are tailored for particular sectors and contexts. \\nAn automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and \\nexplanations as to how and why a decision was made or an action was taken by the system. These expectations are explained below. \\nProvide clear, timely, understandable, and accessible notice of use and explanations \\nGenerally accessible plain language documentation. The entity responsible for using the automated \\nsystem should ensure that documentation describing the overall system (including any human components) is \\npublic and easy to find. The documentation should describe, in plain language, how the system works and how \\nany automated component is used to determine an action or decision. It should also include expectations about \\nreporting described throughout this framework, such as the algorithmic impact assessments described as \\npart of Algorithmic Discrimination Protections. \\nAccount able. Notices should clearly identify the entity r esponsible for designing each component of the \\nsystem and the entity using it. \\nTimely and up-to-date. Users should receive notice of the use of automated systems in advance of using or \\nwhile being impacted by the technolog y. An explanation should be available with the decision itself, or soon \\nthereafte r. Notice should be kept up-to-date and people impacted by the system should be notified of use case \\nor key functionality changes. \\nBrief and clear. Notices and explanations should be assessed, such as by research on users’ experiences, \\nincluding user testing, to ensure that the people using or impacted by the automated system are able to easily',\n",
+ " 'burdensome in both the process of requesting to opt-out and the human-driven alternative provided. \\nProvide timely human consideration and remedy by a fallback and escalation system in the event that an automated system fails, produces error, or you would like to appeal or con\\n-\\ntest its impacts on you \\nProportionate. The availability of human consideration and fallback, along with associated training and \\nsafeguards against human bias, should be proportionate to the potential of the automated system to meaning -\\nfully impact rights, opportunities, or access. Automated systems that have greater control over outcomes, provide input to high-stakes decisions, relate to sensitive domains, or otherwise have a greater potential to meaningfully impact rights, opportunities, or access should have greater availability (e.g., staffing) and over\\n-\\nsight of human consideration and fallback mechanisms. \\nAccessible. Mechanisms for human consideration and fallback, whether in-person, on paper, by phone, or \\notherwise provided, should be easy to find and use. These mechanisms should be tested to ensure that users who have trouble with the automated system are able to use human consideration and fallback, with the under\\n-\\nstanding that it may be these users who are most likely to need the human assistance. Similarly, it should be tested to ensure that users with disabilities are able to find and use human consideration and fallback and also request reasonable accommodations or modifications. \\nConvenient. Mechanisms for human consideration and fallback should not be unreasonably burdensome as \\ncompared to the automated system’s equivalent. \\n49',\n",
+ " 'You should know that an automated system is being used, \\nand understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automat\\n-\\ned systems should provide generally accessible plain language docu -\\nmentation including clear descriptions of the overall system func -\\ntioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and ex\\n-\\nplanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality chang\\n-\\nes. You should know how and why an outcome impacting you was de -\\ntermined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible. NOTICE AND EXPLANATION\\n40',\n",
+ " 'HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFALLBACK \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nThere are many reasons people may prefer not to use an automated system: the system can be flawed and can lead to \\nunintended outcomes; it may reinforce bias or be inaccessible; it may simply be inconvenient or unavailable; or it may replace a paper or manual process to which people had grown accustomed. Yet members of the public are often presented with no alternative, or are forced to endure a cumbersome process to reach a human decision-maker once they decide they no longer want to deal exclusively with the automated system or be impacted by its results. As a result of this lack of human reconsideration, many receive delayed access, or lose access, to rights, opportunities, benefits, and critical services. The American public deserves the assurance that, when rights, opportunities, or access are meaningfully at stake and there is a reasonable expectation of an alternative to an automated system, they can conve\\n-\\nniently opt out of an automated system and will not be disadvantaged for that choice. In some cases, such a human or other alternative may be required by law, for example it could be required as “reasonable accommodations” for people with disabilities.',\n",
+ " \"find notices and explanations, read them quickl y, and understand and act on them. This includes ensuring that \\nnotices and explanations are accessible to users with disabilities and are available in the language(s) and read-\\ning level appropriate for the audience. Notices and explanations may need to be available in multiple forms, \\n(e.g., on pape r, on a physical sign, or online), in order to meet these expectations and to be accessible to the \\nAmerican public. \\nProvide explanations as to how and why a decision was made or an action was taken by an \\nautomated system \\nTailored to the purpose. Explanations should be tailored to the specific purpose for which the user is \\nexpected to use the explanation, and should clearly state that purpose. An informational explanation might differ from an explanation provided to allow for the possibility of recourse, an appeal, or one provided in the context of a dispute or contestation process. For the purposes of this framework, 'explanation' should be construed broadly. An explanation need not be a plain-language statement about causality but could consist of any mechanism that allows the recipient to build the necessary understanding and intuitions to achieve the stated purpose. Tailoring should be assessed (e.g., via user experience research). \\nTailored to the target of the explanation. Explanations should be targeted to specific audiences and clearly state that audience. An explanation provided to the subject of a decision might differ from one provided to an advocate, or to a domain expert or decision maker. Tailoring should be assessed (e.g., via user experience research). \\n43\"],\n",
+ " 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "multiquery_dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from ragas import evaluate\n",
+ "from ragas.metrics import (\n",
+ " faithfulness,\n",
+ " answer_relevancy,\n",
+ " answer_correctness,\n",
+ " context_recall,\n",
+ " context_precision,\n",
+ ")\n",
+ "\n",
+ "metrics = [\n",
+ " faithfulness,\n",
+ " answer_relevancy,\n",
+ " context_recall,\n",
+ " context_precision,\n",
+ " answer_correctness,\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f5257aea40624e62905d488461b543db",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Evaluating: 0%| | 0/120 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "multiquery_results = evaluate(multiquery_dataset, metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'faithfulness': 0.8968, 'answer_relevancy': 0.9532, 'context_recall': 0.8906, 'context_precision': 0.9207, 'answer_correctness': 0.6901}"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "multiquery_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " question | \n",
+ " contexts | \n",
+ " answer | \n",
+ " ground_truth | \n",
+ " faithfulness | \n",
+ " answer_relevancy | \n",
+ " context_recall | \n",
+ " context_precision | \n",
+ " answer_correctness | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " What is the significance of providing notice a... | \n",
+ " [Providing notice has long been a standard pra... | \n",
+ " Providing notice and explanation as a legal re... | \n",
+ " Providing notice and explanation as a legal re... | \n",
+ " 1.000000 | \n",
+ " 0.971321 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 0.821299 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " How can structured human feedback exercises, s... | \n",
+ " [50 Participatory Engagement Methods \\nOn an ... | \n",
+ " Structured human feedback exercises, such as G... | \n",
+ " Structured human feedback exercises, such as G... | \n",
+ " 1.000000 | \n",
+ " 0.992832 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 0.541222 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " How do measurement gaps between laboratory and... | \n",
+ " [49 early lifecycle TEVV approaches are develo... | \n",
+ " Measurement gaps between laboratory and real-w... | \n",
+ " Measurement gaps between laboratory and real-w... | \n",
+ " 0.958333 | \n",
+ " 0.988752 | \n",
+ " 1.0 | \n",
+ " 0.876667 | \n",
+ " 0.636556 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " How should data collection and use-case scope ... | \n",
+ " [Data collection and use-case scope limits. Da... | \n",
+ " To prevent 'mission creep' in automated system... | \n",
+ " Data collection and use-case scope limits in a... | \n",
+ " 1.000000 | \n",
+ " 0.923204 | \n",
+ " 1.0 | \n",
+ " 1.000000 | \n",
+ " 0.491425 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " What action did the Federal Trade Commission t... | \n",
+ " [alerts about location tracking—are brief, dir... | \n",
+ " The Federal Trade Commission (FTC) took action... | \n",
+ " FTC sued Kochava for selling data that tracks ... | \n",
+ " 0.400000 | \n",
+ " 0.936866 | \n",
+ " 0.0 | \n",
+ " 0.125000 | \n",
+ " 0.902212 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " question \\\n",
+ "0 What is the significance of providing notice a... \n",
+ "1 How can structured human feedback exercises, s... \n",
+ "2 How do measurement gaps between laboratory and... \n",
+ "3 How should data collection and use-case scope ... \n",
+ "4 What action did the Federal Trade Commission t... \n",
+ "\n",
+ " contexts \\\n",
+ "0 [Providing notice has long been a standard pra... \n",
+ "1 [50 Participatory Engagement Methods \\nOn an ... \n",
+ "2 [49 early lifecycle TEVV approaches are develo... \n",
+ "3 [Data collection and use-case scope limits. Da... \n",
+ "4 [alerts about location tracking—are brief, dir... \n",
+ "\n",
+ " answer \\\n",
+ "0 Providing notice and explanation as a legal re... \n",
+ "1 Structured human feedback exercises, such as G... \n",
+ "2 Measurement gaps between laboratory and real-w... \n",
+ "3 To prevent 'mission creep' in automated system... \n",
+ "4 The Federal Trade Commission (FTC) took action... \n",
+ "\n",
+ " ground_truth faithfulness \\\n",
+ "0 Providing notice and explanation as a legal re... 1.000000 \n",
+ "1 Structured human feedback exercises, such as G... 1.000000 \n",
+ "2 Measurement gaps between laboratory and real-w... 0.958333 \n",
+ "3 Data collection and use-case scope limits in a... 1.000000 \n",
+ "4 FTC sued Kochava for selling data that tracks ... 0.400000 \n",
+ "\n",
+ " answer_relevancy context_recall context_precision answer_correctness \n",
+ "0 0.971321 1.0 1.000000 0.821299 \n",
+ "1 0.992832 1.0 1.000000 0.541222 \n",
+ "2 0.988752 1.0 0.876667 0.636556 \n",
+ "3 0.923204 1.0 1.000000 0.491425 \n",
+ "4 0.936866 0.0 0.125000 0.902212 "
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "multiquery_results_df = multiquery_results.to_pandas()\n",
+ "multiquery_results_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multiquery_results_df.to_csv(\"multiquery_ragas_results.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multiquery_metrics_df = pd.DataFrame(list(multiquery_results.items()), columns=['Metric', 'MultiQuery'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metric | \n",
+ " MultiQuery | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " faithfulness | \n",
+ " 0.896804 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " answer_relevancy | \n",
+ " 0.953211 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " context_recall | \n",
+ " 0.890625 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " context_precision | \n",
+ " 0.920732 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " answer_correctness | \n",
+ " 0.690058 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metric MultiQuery\n",
+ "0 faithfulness 0.896804\n",
+ "1 answer_relevancy 0.953211\n",
+ "2 context_recall 0.890625\n",
+ "3 context_precision 0.920732\n",
+ "4 answer_correctness 0.690058"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "multiquery_metrics_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "multiquery_metrics_df.to_csv(\"multiquery_metrics.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metric | \n",
+ " Baseline | \n",
+ " MultiQuery | \n",
+ " Baseline -> MultiQuery | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " faithfulness | \n",
+ " 0.895359 | \n",
+ " 0.896804 | \n",
+ " 0.001445 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " answer_relevancy | \n",
+ " 0.955419 | \n",
+ " 0.953211 | \n",
+ " -0.002208 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " context_recall | \n",
+ " 0.934028 | \n",
+ " 0.890625 | \n",
+ " -0.043403 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " context_precision | \n",
+ " 0.937500 | \n",
+ " 0.920732 | \n",
+ " -0.016768 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " answer_correctness | \n",
+ " 0.629267 | \n",
+ " 0.690058 | \n",
+ " 0.060791 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metric Baseline MultiQuery Baseline -> MultiQuery\n",
+ "0 faithfulness 0.895359 0.896804 0.001445\n",
+ "1 answer_relevancy 0.955419 0.953211 -0.002208\n",
+ "2 context_recall 0.934028 0.890625 -0.043403\n",
+ "3 context_precision 0.937500 0.920732 -0.016768\n",
+ "4 answer_correctness 0.629267 0.690058 0.060791"
+ ]
+ },
+ "execution_count": 31,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_baseline_multiquery = pd.merge(baseline_metrics, multiquery_metrics_df, on='Metric')\n",
+ "\n",
+ "df_baseline_multiquery['Baseline -> MultiQuery'] = df_baseline_multiquery['MultiQuery'] - df_baseline_multiquery['Baseline']\n",
+ "\n",
+ "df_baseline_multiquery"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "compression_retriever = vectorstore.as_retriever(\n",
+ " search_type=\"mmr\",\n",
+ " search_kwargs={\"k\": 4, \"fetch_k\": 10},\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.retrievers import ContextualCompressionRetriever\n",
+ "from langchain.retrievers.document_compressors import LLMChainExtractor\n",
+ "\n",
+ "compressor = LLMChainExtractor.from_llm(llm)\n",
+ "compression_retriever = ContextualCompressionRetriever(\n",
+ " base_compressor=compressor, base_retriever=compression_retriever\n",
+ ")"
+ ]
+ },
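+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional sanity check (a minimal sketch, not executed in this run): inspect\n",
+ "# what the compressor keeps for a sample query before wiring up the full chain.\n",
+ "sample_docs = compression_retriever.get_relevant_documents(\n",
+ "    \"What are Trustworthy AI Characteristics?\"\n",
+ ")\n",
+ "for doc in sample_docs:\n",
+ "    print(doc.page_content[:200])\n",
+ "    print(\"---\")"
+ ]
+ },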
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "memory = ConversationBufferMemory(memory_key=\"chat_history\", return_messages=True, output_key=\"answer\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contextual_compression_rag_chain = ConversationalRetrievalChain.from_llm(\n",
+ " llm,\n",
+ " retriever=compression_retriever,\n",
+ " memory=memory,\n",
+ " combine_docs_chain_kwargs={\"prompt\": PROMPT},\n",
+ " return_source_documents=True,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'question': 'What are Trustworthy AI Characteristics?',\n",
+ " 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?'),\n",
+ " AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).')],\n",
+ " 'answer': 'Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are reliable, ethical, and beneficial for society. Here are the key characteristics:\\n\\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users and stakeholders should be able to trace how decisions are made and hold the systems accountable for their outcomes.\\n\\n2. **Explainable and Interpretable**: AI should provide clear explanations for its decisions and actions. This is crucial for users to understand the rationale behind AI outputs, especially in critical areas like healthcare or criminal justice.\\n\\n3. **Fair with Harmful Bias Managed**: AI systems must be designed to minimize and manage biases that could lead to unfair treatment of individuals or groups. This involves actively identifying and mitigating any harmful biases in the data and algorithms used.\\n\\n4. **Privacy Enhanced**: AI should respect user privacy and protect personal data. This includes implementing measures to ensure that data is collected, stored, and processed in a way that safeguards individual privacy rights.\\n\\n5. **Safe**: AI systems should operate safely and reliably, minimizing risks to users and society. This includes ensuring that systems are robust against failures and can handle unexpected situations without causing harm.\\n\\n6. **Secure and Resilient**: AI should be protected against malicious attacks and vulnerabilities. This means implementing strong security measures to safeguard the integrity of AI systems and the data they use.\\n\\n7. **Valid and Reliable**: AI systems should produce consistent and accurate results. This involves rigorous testing and validation to ensure that the systems perform as intended across various scenarios.\\n\\nThese characteristics are essential for fostering trust in AI technologies and ensuring that they contribute positively to society (Source: NIST framework, p. [specific page number]).',\n",
+ " 'source_documents': [Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 12, '_id': 'fd17ecae8e274319a78ca70b545e9c1a', '_collection_name': 'Midterm Eval'}, page_content='Trustworthy AI Characteristics: Accountable and Transparent, Explainable and Interpretable, Fair with Harmful Bias Managed, Privacy Enhanced, Safe, Valid and Reliable'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 20, '_id': 'fdbb4ca124b94cadb07b27ae08657b4c', '_collection_name': 'Midterm Eval'}, page_content='robustness, safety, security (resilience), and mitigation of unintended and/or harmful bias, as well as of harmful uses. The NIST framework will consider and encompass principles such as transparency, accountability, and fairness during pre-design, design and development, deployment, use, and testing and evaluation of AI technologies and systems.'),\n",
+ " Document(metadata={'source': '/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf', 'page': 11, '_id': 'fedbf16ddbec4ac89db1620b14630d1e', '_collection_name': 'Midterm Eval'}, page_content='8 Trustworthy AI Characteristics: Accountable and Transparent, Privacy Enhanced, Safe, Secure and Resilient \\nTrustworthy AI Characteristics: Accountable and Transparent, Safe')]}"
+ ]
+ },
+ "execution_count": 36,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contextual_compression_rag_chain.invoke({\"question\": \"What are Trustworthy AI Characteristics?\"})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "answers = []\n",
+ "contexts = []\n",
+ "\n",
+ "for question in test_questions:\n",
+ " response = contextual_compression_rag_chain.invoke({\"question\" : question})\n",
+ " answers.append(response[\"answer\"])\n",
+ " contexts.append([context.page_content for context in response[\"source_documents\"]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contextual_compression_dataset = Dataset.from_dict({\n",
+ " \"question\" : test_questions,\n",
+ " \"answer\" : answers,\n",
+ " \"contexts\" : contexts,\n",
+ " \"ground_truth\" : test_groundtruths\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',\n",
+ " 'answer': \"Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\\n\\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect their rights, opportunities, or access. This transparency helps build trust in the technology and the entities deploying it.\\n\\n2. **Empowerment**: When people receive clear explanations about how decisions are made by automated systems, they are better equipped to understand and contest those decisions if necessary. This is particularly important in sensitive areas like employment, credit, and legal proceedings, where outcomes can have profound impacts on individuals' lives.\\n\\n3. **Accountability**: Notice and explanation requirements hold organizations accountable for their automated systems. By clearly identifying the entities responsible for the design and use of these systems, it becomes easier to address any issues or biases that may arise.\\n\\n4. **Error Correction**: Providing notice allows individuals to identify and correct errors in automated decision-making processes. Without this knowledge, individuals may be left without recourse to challenge incorrect or unfair outcomes.\\n\\n5. **Public Confidence**: Consistent and clear communication about the use of automated systems can enhance public confidence in these technologies. When people understand how and why decisions are made, they are more likely to trust the systems and the organizations that use them.\\n\\n6. **Safety and Efficacy**: Clear explanations enable experts to verify the reasonableness of recommendations made by automated systems before they are enacted. This is crucial for ensuring that the systems operate safely and effectively.\\n\\nIn summary, notice and explanation requirements are essential for protecting individuals' rights, fostering accountability, and ensuring that automated systems are used responsibly and ethically (Source: [document name], p. [page number]).\",\n",
+ " 'contexts': ['Providing notice has long been a standard practice, and in many cases is a legal requirement, when, for example, making a video recording of someone (outside of a law enforcement or national security context). In some cases, such as credit, lenders are required to provide notice and explanation to consumers. While notice and explanation requirements are already in place in some sectors or situations, the American public deserve to know consistently and across sectors if an automated system is being used in a way that impacts their rights, opportunities, or access. This knowledge should provide confidence in how the public is being treated, and trust in the validity and reasonable use of automated systems. The lack of a timely explanation made it harder to understand and contest the decision. The lack of notice or an explanation makes it harder for those performing child maltreatment assessments to validate the risk assessment and denies parents knowledge that could help them contest a decision.',\n",
+ " 'You should know that an automated system is being used, and understand how and why it contributes to outcomes that impact you. Designers, developers, and deployers of automated systems should provide generally accessible plain language documentation including clear descriptions of the overall system functioning and the role automation plays, notice that such systems are in use, the individual or organization responsible for the system, and explanations of outcomes that are clear, timely, and accessible. Such notice should be kept up-to-date and people impacted by the system should be notified of significant use case or key functionality changes. You should know how and why an outcome impacting you was determined by an automated system, including when the automated system is not the sole input determining the outcome. Automated systems should provide explanations that are technically valid, meaningful and useful to you and to any operators or others who need to understand the system, and calibrated to the level of risk based on the context. Reporting that includes summary information about these automated systems in plain language and assessments of the clarity and quality of the notice and explanations should be made public whenever possible.',\n",
+ " 'An automated system should provide demonstrably clear, timely, understandable, and accessible notice of use, and explanations as to how and why a decision was made or an action was taken by the system. Notices should clearly identify the entity responsible for designing each component of the system and the entity using it. Users should receive notice of the use of automated systems in advance of using or while being impacted by the technology. An explanation should be available with the decision itself, or soon thereafter. Notice should be kept up-to-date and people impacted by the system should be notified of use case or key functionality changes. Notices and explanations should be assessed, such as by research on users’ experiences, including user testing, to ensure that the people using or impacted by the automated system are able to easily.',\n",
+ " 'NOTICE & \\nEXPLANATION \\nWHY THIS PRINCIPLE IS IMPORTANT\\nThis section provides a brief summary of the problems which the principle seeks to address and protect \\nagainst, including illustrative examples. \\nAutomated systems now determine opportunities, from employment to credit, and directly shape the American \\npublic’s experiences, from the courtroom to online classrooms, in ways that profoundly impact people’s lives. But this expansive impact is not always visible. An applicant might not know whether a person rejected their resume or a hiring algorithm moved them to the bottom of the list. A defendant in the courtroom might not know if a judge denying their bail is informed by an automated system that labeled them “high risk.” From correcting errors to contesting decisions, people are often denied the knowledge they need to address the impact of automated systems on their lives. Notice and explanations also serve an important safety and efficacy purpose, allowing experts to verify the reasonableness of a recommendation before enacting it. \\nIn order to guard against potential harms, the American public needs to know if an automated system is being used. Clear, brief, and understandable notice is a prerequisite for achieving the other protections in this framework. Like-\\nwise, the public is often unable to ascertain how or why an automated system has made a decision or contributed to a particular outcome. The decision-making processes of automated systems tend to be opaque, complex, and, therefore, unaccountable, whether by design or by omission. These factors can make explanations both more challenging and more important, and should not be used as a pretext to avoid explaining important decisions to the people impacted by those choices. In the context of automated systems, clear and valid explanations should be recognized as a baseline requirement.'],\n",
+ " 'ground_truth': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant because it allows individuals to understand how automated systems are impacting their lives. It helps in correcting errors, contesting decisions, and verifying the reasonableness of recommendations before enacting them. Clear and valid explanations are essential to ensure transparency, accountability, and trust in the use of automated systems across various sectors.'}"
+ ]
+ },
+ "execution_count": 39,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contextual_compression_dataset[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b6c03ccbf50e4642b9433f2513fb83c3",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Evaluating: 0%| | 0/120 [00:00, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "contextual_compression_results = evaluate(contextual_compression_dataset, metrics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'faithfulness': 0.7491, 'answer_relevancy': 0.9140, 'context_recall': 0.7257, 'context_precision': 0.9051, 'answer_correctness': 0.5707}"
+ ]
+ },
+ "execution_count": 41,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contextual_compression_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " question | \n",
+ " contexts | \n",
+ " answer | \n",
+ " ground_truth | \n",
+ " faithfulness | \n",
+ " answer_relevancy | \n",
+ " context_recall | \n",
+ " context_precision | \n",
+ " answer_correctness | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " What is the significance of providing notice a... | \n",
+ " [Providing notice has long been a standard pra... | \n",
+ " Providing notice and explanation as a legal re... | \n",
+ " Providing notice and explanation as a legal re... | \n",
+ " 1.000000 | \n",
+ " 0.971321 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.585260 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " How can structured human feedback exercises, s... | \n",
+ " [AI Red -teaming \\nAI red -teaming is an evol... | \n",
+ " Structured human feedback exercises, such as G... | \n",
+ " Structured human feedback exercises, such as G... | \n",
+ " 1.000000 | \n",
+ " 0.988309 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.320501 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " How do measurement gaps between laboratory and... | \n",
+ " [Currently available pre -deployment TEVV proc... | \n",
+ " Measurement gaps between laboratory and real-w... | \n",
+ " Measurement gaps between laboratory and real-w... | \n",
+ " 0.958333 | \n",
+ " 0.996595 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.597251 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " How should data collection and use-case scope ... | \n",
+ " [Data collection should be limited in scope, w... | \n",
+ " To prevent \"mission creep\" in automated system... | \n",
+ " Data collection and use-case scope limits in a... | \n",
+ " 0.439024 | \n",
+ " 0.922376 | \n",
+ " 1.0 | \n",
+ " 1.0 | \n",
+ " 0.551606 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " What action did the Federal Trade Commission t... | \n",
+ " [] | \n",
+ " The Federal Trade Commission (FTC) took action... | \n",
+ " FTC sued Kochava for selling data that tracks ... | \n",
+ " 0.833333 | \n",
+ " 0.925072 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.529680 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " question \\\n",
+ "0 What is the significance of providing notice a... \n",
+ "1 How can structured human feedback exercises, s... \n",
+ "2 How do measurement gaps between laboratory and... \n",
+ "3 How should data collection and use-case scope ... \n",
+ "4 What action did the Federal Trade Commission t... \n",
+ "\n",
+ " contexts \\\n",
+ "0 [Providing notice has long been a standard pra... \n",
+ "1 [AI Red -teaming \\nAI red -teaming is an evol... \n",
+ "2 [Currently available pre -deployment TEVV proc... \n",
+ "3 [Data collection should be limited in scope, w... \n",
+ "4 [] \n",
+ "\n",
+ " answer \\\n",
+ "0 Providing notice and explanation as a legal re... \n",
+ "1 Structured human feedback exercises, such as G... \n",
+ "2 Measurement gaps between laboratory and real-w... \n",
+ "3 To prevent \"mission creep\" in automated system... \n",
+ "4 The Federal Trade Commission (FTC) took action... \n",
+ "\n",
+ " ground_truth faithfulness \\\n",
+ "0 Providing notice and explanation as a legal re... 1.000000 \n",
+ "1 Structured human feedback exercises, such as G... 1.000000 \n",
+ "2 Measurement gaps between laboratory and real-w... 0.958333 \n",
+ "3 Data collection and use-case scope limits in a... 0.439024 \n",
+ "4 FTC sued Kochava for selling data that tracks ... 0.833333 \n",
+ "\n",
+ " answer_relevancy context_recall context_precision answer_correctness \n",
+ "0 0.971321 1.0 1.0 0.585260 \n",
+ "1 0.988309 1.0 1.0 0.320501 \n",
+ "2 0.996595 1.0 1.0 0.597251 \n",
+ "3 0.922376 1.0 1.0 0.551606 \n",
+ "4 0.925072 0.0 0.0 0.529680 "
+ ]
+ },
+ "execution_count": 42,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contextual_compression_results_df = contextual_compression_results.to_pandas()\n",
+ "contextual_compression_results_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contextual_compression_results_df.to_csv(\"contextual_compression_ragas_results.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contextual_compression_metrics_df = pd.DataFrame(list(contextual_compression_results.items()), columns=['Metric', 'ContextualCompression'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metric | \n",
+ " ContextualCompression | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " faithfulness | \n",
+ " 0.749092 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " answer_relevancy | \n",
+ " 0.913993 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " context_recall | \n",
+ " 0.725694 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " context_precision | \n",
+ " 0.905093 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " answer_correctness | \n",
+ " 0.570685 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metric ContextualCompression\n",
+ "0 faithfulness 0.749092\n",
+ "1 answer_relevancy 0.913993\n",
+ "2 context_recall 0.725694\n",
+ "3 context_precision 0.905093\n",
+ "4 answer_correctness 0.570685"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "contextual_compression_metrics_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "contextual_compression_metrics_df.to_csv(\"contextual_compression_metrics.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metric | \n",
+ " Baseline | \n",
+ " MultiQuery | \n",
+ " ContextualCompression | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " faithfulness | \n",
+ " 0.895359 | \n",
+ " 0.896804 | \n",
+ " 0.749092 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " answer_relevancy | \n",
+ " 0.955419 | \n",
+ " 0.953211 | \n",
+ " 0.913993 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " context_recall | \n",
+ " 0.934028 | \n",
+ " 0.890625 | \n",
+ " 0.725694 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " context_precision | \n",
+ " 0.937500 | \n",
+ " 0.920732 | \n",
+ " 0.905093 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " answer_correctness | \n",
+ " 0.629267 | \n",
+ " 0.690058 | \n",
+ " 0.570685 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metric Baseline MultiQuery ContextualCompression\n",
+ "0 faithfulness 0.895359 0.896804 0.749092\n",
+ "1 answer_relevancy 0.955419 0.953211 0.913993\n",
+ "2 context_recall 0.934028 0.890625 0.725694\n",
+ "3 context_precision 0.937500 0.920732 0.905093\n",
+ "4 answer_correctness 0.629267 0.690058 0.570685"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_baseline_multiquery = pd.merge(baseline_metrics, multiquery_metrics_df, on='Metric')\n",
+ "df_baseline_multiquery_contextual_compression = pd.merge(df_baseline_multiquery, contextual_compression_metrics_df, on='Metric')\n",
+ "\n",
+ "\n",
+ "df_baseline_multiquery_contextual_compression"
+ ]
+ },
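+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional visual comparison (a minimal sketch, not executed in this run;\n",
+ "# assumes matplotlib is available, which the pip install cell does not list).\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "ax = df_baseline_multiquery_contextual_compression.set_index(\"Metric\")[\n",
+ "    [\"Baseline\", \"MultiQuery\", \"ContextualCompression\"]\n",
+ "].plot(kind=\"bar\", figsize=(10, 5), ylim=(0, 1))\n",
+ "ax.set_ylabel(\"Score\")\n",
+ "ax.set_title(\"RAGAS metrics by retrieval strategy\")\n",
+ "plt.tight_layout()\n",
+ "plt.show()"
+ ]
+ },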
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Metric | \n",
+ " Baseline | \n",
+ " MultiQuery | \n",
+ " ContextualCompression | \n",
+ " HigestValue | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " faithfulness | \n",
+ " 0.895359 | \n",
+ " 0.896804 | \n",
+ " 0.749092 | \n",
+ " 0.9 (MultiQuery) | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " answer_relevancy | \n",
+ " 0.955419 | \n",
+ " 0.953211 | \n",
+ " 0.913993 | \n",
+ " 0.96 (Baseline) | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " context_recall | \n",
+ " 0.934028 | \n",
+ " 0.890625 | \n",
+ " 0.725694 | \n",
+ " 0.93 (Baseline) | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " context_precision | \n",
+ " 0.937500 | \n",
+ " 0.920732 | \n",
+ " 0.905093 | \n",
+ " 0.94 (Baseline) | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " answer_correctness | \n",
+ " 0.629267 | \n",
+ " 0.690058 | \n",
+ " 0.570685 | \n",
+ " 0.69 (MultiQuery) | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Metric Baseline MultiQuery ContextualCompression \\\n",
+ "0 faithfulness 0.895359 0.896804 0.749092 \n",
+ "1 answer_relevancy 0.955419 0.953211 0.913993 \n",
+ "2 context_recall 0.934028 0.890625 0.725694 \n",
+ "3 context_precision 0.937500 0.920732 0.905093 \n",
+ "4 answer_correctness 0.629267 0.690058 0.570685 \n",
+ "\n",
+ " HigestValue \n",
+ "0 0.9 (MultiQuery) \n",
+ "1 0.96 (Baseline) \n",
+ "2 0.93 (Baseline) \n",
+ "3 0.94 (Baseline) \n",
+ "4 0.69 (MultiQuery) "
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df_baseline_multiquery_contextual_compression['MaxValue'] = df_baseline_multiquery_contextual_compression[['Baseline', 'MultiQuery', 'ContextualCompression']].max(axis=1)\n",
+ "\n",
+ "df_baseline_multiquery_contextual_compression['MaxMetric'] = df_baseline_multiquery_contextual_compression[['Baseline', 'MultiQuery', 'ContextualCompression']].idxmax(axis=1)\n",
+ "\n",
+ "df_baseline_multiquery_contextual_compression['HigestValue'] = df_baseline_multiquery_contextual_compression['MaxValue'].round(2).astype(str) + ' (' + df_baseline_multiquery_contextual_compression['MaxMetric'] + ')'\n",
+ "\n",
+ "df_baseline_multiquery_contextual_compression = df_baseline_multiquery_contextual_compression.drop(columns=['MaxValue', 'MaxMetric'])\n",
+ "\n",
+ "df_baseline_multiquery_contextual_compression"
+ ]
}
],
"metadata": {