In [None]:
!pip install langchain langchain_community langchain_openai chromadb pypdf langsmith qdrant-client ragas pandas

## Task 2: Set Environment Variables

Let's set up our OpenAI API key so we can leverage their API later on.

In [4]:
import os
import openai
from getpass import getpass

openai.api_key = getpass("Please provide your OpenAI Key: ")
os.environ["OPENAI_API_KEY"] = openai.api_key

In [5]:
from uuid import uuid4

unique_id = uuid4().hex[0:8]

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = f"LangSmith - {unique_id}"

In [7]:
import pandas as pd

test_df = pd.read_csv("synthetic_midterm_question_dataset.csv")

In [8]:
test_questions = test_df["question"].values.tolist()
test_groundtruths = test_df["ground_truth"].values.tolist()

In [9]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.sitemap import SitemapLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain_community.vectorstores import Qdrant
from langchain.memory import ConversationBufferMemory
from langchain_community.document_loaders import YoutubeLoader

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [10]:
pdf_paths = ["/Users/xico/AIMakerSpace-Midterm/AI_Risk_Management_Framework.pdf",
"/Users/xico/AIMakerSpace-Midterm/Blueprint-for-an-AI-Bill-of-Rights.pdf"]

In [11]:
pdf_documents = []
for pdf_path in pdf_paths:
    loader = PyPDFLoader(pdf_path)
    pdf_documents.extend(loader.load())

In [12]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
baseline_docs = text_splitter.split_documents(pdf_documents)

In [13]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Qdrant.from_documents(
    documents=baseline_docs,
    embedding=embedding,
    location=":memory:",
    collection_name="Midterm Evaluation"
)

In [14]:
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 10},
)
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    streaming=True,
)

In [15]:
custom_template = """
You are an expert in artificial intelligence policy, ethics, and industry trends. Your task is to provide clear and accurate answers to questions related to AI's role in politics, government regulations, and its ethical implications for enterprises. Use reliable and up-to-date information from government documents, industry reports, and academic research to inform your responses. Make sure to consider how AI is evolving, especially in relation to the current political landscape, and provide answers in a way that is easy to understand for both AI professionals and non-experts.

Remember these key points:
1. Use "you" when addressing the user and "I" when referring to yourself.
2. If you encounter complex or legal language in the context, simplify it for easy understanding. Imagine you're explaining it to someone who isn't familiar with legal terms.
3. Be prepared for follow-up questions and maintain context from previous exchanges.
4. If there's no information from a retrieved document in the context to answer a question or if there are no documents to cite, say: "I'm sorry, I don't know the answer to that question."
5. When providing information, always cite the source document and page number in parentheses at the end of the relevant sentence or paragraph, like this: (Source: [document name], p. [page number]).

Here are a few example questions you might receive:

How are governments regulating AI, and what new policies have been implemented?
What are the ethical risks of using AI in political decision-making?
How can enterprises ensure their AI applications meet government ethical standards?

One final rule for you to remember. You CANNOT under any circumstance, answer any question that does not pertain to the AI. If you do answer an out-of-scope question, you could lose your job. If you are asked a question that does not have to do with AI, you must say: "I'm sorry, I don't know the answer to that question."
Context: {context}
Chat History: {chat_history}
Human: {question}
AI:"""

PROMPT = PromptTemplate(
    template=custom_template, input_variables=["context", "question", "chat_history"]
)

In [16]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

baseline_rag_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=retriever,
        memory=memory,
        combine_docs_chain_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

In [17]:
baseline_rag_chain.invoke({"question": "What are Trustworthy AI Characteristics?"})

{'question': 'What are Trustworthy AI Characteristics?',
 'chat_history': [HumanMessage(content='What are Trustworthy AI Characteristics?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Trustworthy AI characteristics refer to the essential qualities that AI systems should possess to ensure they are used responsibly and ethically. These characteristics include:\n\n1. **Accountable and Transparent**: AI systems should be designed in a way that their decision-making processes can be understood and scrutinized. This means that users should be able to trace how decisions are made and hold systems accountable for their outcomes.\n\n2. **Safe**: AI systems must operate without causing harm to individuals or society. This includes ensuring that they are secure from cyber threats and that they do not produce harmful outputs.\n\n3. **Valid and Reliable**: AI systems should consistently produce accurate and dependable results. This means they should be tested and validated to 

In [18]:
answers = []
contexts = []

for question in test_questions:
  response = baseline_rag_chain.invoke({"question" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["source_documents"]])

Now we can wrap our information in a Hugging Face dataset for use in the Ragas library.

In [19]:
from datasets import Dataset

response_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

  from .autonotebook import tqdm as notebook_tqdm


Let's take a peek and see what that looks like!

In [20]:
response_dataset[0]

{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',
 'answer': "Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\n\n1. **Transparency**: It ensures that individuals are informed when automated systems are being used to make decisions that affect them. This transparency helps build trust between the public and the entities using these systems.\n\n2. **Accountability**: By requiring organizations to explain how decisions are made, it holds them accountable for the outcomes of their automated systems. This means that if a decision negatively impacts someone, they have the right to understand the reasoning behind it and contest it if necessary.\n\n3. **Empowerment**: Notice and explanation empower individuals by giving them the information they need to understand their rights and the processes that affect their lives. This is pa

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)

metrics = [
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
    answer_correctness,
]

All that's left to do is call "evaluate" and away we go!

In [22]:
results = evaluate(response_dataset, metrics)

Evaluating: 100%|██████████| 120/120 [02:11<00:00,  1.10s/it]


In [23]:
results

{'faithfulness': 0.6984, 'answer_relevancy': 0.9468, 'context_recall': 0.8559, 'context_precision': 0.9039, 'answer_correctness': 0.6487}

In [24]:
results_df = results.to_pandas()
results_df.head()

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the significance of providing notice a...,[Providing notice has long been a standard pra...,Providing notice and explanation as a legal re...,Providing notice and explanation as a legal re...,0.615385,0.971321,1.0,1.0,0.81559
1,"How can structured human feedback exercises, s...",[Gaps between benchmarks and real-world use o...,"Structured human feedback exercises, such as G...","Structured human feedback exercises, such as G...",0.789474,0.989817,1.0,1.0,0.534412
2,How do measurement gaps between laboratory and...,"[reliability in those domains. Similarly, jail...",Measurement gaps between laboratory and real-w...,Measurement gaps between laboratory and real-w...,1.0,0.927953,1.0,1.0,0.906851
3,How should data collection and use-case scope ...,[Data collection and use-case scope limits. Da...,To determine and implement data collection and...,Data collection and use-case scope limits in a...,1.0,0.94134,1.0,0.805556,0.442307
4,What action did the Federal Trade Commission t...,"[65. See, e.g., Scott Ikeda. Major Data Broker...",The Federal Trade Commission (FTC) took action...,FTC sued Kochava for selling data that tracks ...,0.0,0.92509,0.0,0.0,0.853336


In [47]:
results_df.to_csv("baseline_ragas_results.csv", index=False)

In [25]:
df_baseline = pd.DataFrame(list(results.items()), columns=['Metric', 'Baseline'])

In [26]:
df_baseline

Unnamed: 0,Metric,Baseline
0,faithfulness,0.698407
1,answer_relevancy,0.946766
2,context_recall,0.855903
3,context_precision,0.903935
4,answer_correctness,0.648744


In [46]:
df_baseline.to_csv("df_baseline_metrics.csv", index=False)

In [26]:
from datetime import datetime, timedelta

In [45]:
api_key = os.environ["LANGCHAIN_API_KEY"]

In [39]:
import uuid
import requests

In [46]:
unique_dataset_id = str(uuid.uuid4())
dataset_name = f"RAGAS Midterm Eval Dataset - {unique_dataset_id[:8]}"

In [47]:
experiment_start_time = datetime.now()
experiment_end_time = experiment_start_time + timedelta(minutes=30)  # Adjust as needed

results = []
for index, row in results_df.iterrows():
    start_time = experiment_start_time + timedelta(seconds=index)
    end_time = start_time + timedelta(seconds=1)
    results.append({
        "row_id": str(uuid.uuid4()),
        "inputs": {"question": row["question"]},
        "expected_outputs": {"ground_truth": row["ground_truth"]},
        "actual_outputs": {"answer": row["answer"]},
        "evaluation_scores": [
            {"key": "faithfulness", "score": row["faithfulness"]},
            {"key": "answer_relevancy", "score": row["answer_relevancy"]},
            {"key": "context_recall", "score": row["context_recall"]},
            {"key": "context_precision", "score": row["context_precision"]},
            {"key": "answer_correctness", "score": row["answer_correctness"]}
        ],
        "start_time": start_time.isoformat(),
        "end_time": end_time.isoformat(),
        "run_name": f"Baseline Run {index}"
    })

summary_scores = [
    {"key": "faithfulness", "score": results_df["faithfulness"].mean(), "comment": "Average faithfulness score"},
    {"key": "answer_relevancy", "score": results_df["answer_relevancy"].mean(), "comment": "Average answer relevancy score"},
    {"key": "context_recall", "score": results_df["context_recall"].mean(), "comment": "Average context recall score"},
    {"key": "context_precision", "score": results_df["context_precision"].mean(), "comment": "Average context precision score"},
    {"key": "answer_correctness", "score": results_df["answer_correctness"].mean(), "comment": "Average answer correctness score"}
]

body = {
    "experiment_name": "Baseline Midterm Evaluation",
    "experiment_description": "Baseline evaluation of Midterm Evaluation using Ragas metrics",
    "dataset_name": dataset_name,
    "dataset_description": "Dataset for RAGBot evaluation using Ragas metrics",
    "experiment_start_time": experiment_start_time.isoformat(),
    "experiment_end_time": experiment_end_time.isoformat(),
    "experiment_metadata": {
        "model": "gpt-4o-mini",
        "retriever": "Qdrant with MMR",
        "chunk_size": "1000 w/ 200 overlap"
    },
    "summary_experiment_scores": summary_scores,
    "results": results
}

response = requests.post(
    "https://api.smith.langchain.com/api/v1/datasets/upload-experiment",
    json=body,
    headers={"x-api-key": api_key}
)

print(response.status_code)
print(response.json())

403
{'detail': 'Forbidden'}


In [27]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=100,
    )
medium_chunk_docs = text_splitter.split_documents(pdf_documents)

In [28]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [29]:
vectorstore = Qdrant.from_documents(
    documents=medium_chunk_docs,
    embedding=embedding,
    location=":memory:",
    collection_name="Midterm Eval"
)

medium_chunk_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 10},
)

medium_chunk_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

In [30]:
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0,
    streaming=True,
)

In [31]:
medium_chunk_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=medium_chunk_retriever,
        memory=medium_chunk_memory,
        combine_docs_chain_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

In [32]:
answers = []
contexts = []

for question in test_questions:
  response = medium_chunk_chain.invoke({"question" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["source_documents"]])

In [33]:
from datasets import Dataset

medium_chunk_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [34]:
medium_chunk_dataset[0]

{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',
 'answer': 'Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\n\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect their lives. This transparency helps build trust between the public and the entities deploying these systems.\n\n2. **Accountability**: By requiring entities to identify themselves and explain their systems, it holds them accountable for the decisions made by these automated systems. This accountability is crucial for addressing any potential biases or errors in the algorithms.\n\n3. **Empowerment**: When individuals receive clear explanations about how decisions are made, they are better equipped to contest or appeal those decisions if they believe they have been treated unfairly. This

In [35]:
medium_chunk_results = evaluate(medium_chunk_dataset, metrics)

Evaluating: 100%|██████████| 120/120 [02:31<00:00,  1.26s/it]


In [36]:
medium_chunk_results

{'faithfulness': 0.8954, 'answer_relevancy': 0.9554, 'context_recall': 0.9340, 'context_precision': 0.9375, 'answer_correctness': 0.6293}

In [37]:
medium_chunk_results_df = medium_chunk_results.to_pandas()
medium_chunk_results_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the significance of providing notice a...,[Providing notice has long been a standard pra...,Providing notice and explanation as a legal re...,Providing notice and explanation as a legal re...,1.0,0.971321,1.0,1.0,0.549646
1,"How can structured human feedback exercises, s...",[50 Participatory Engagement Methods \nOn an ...,"Structured human feedback exercises, such as G...","Structured human feedback exercises, such as G...",0.625,0.992832,1.0,1.0,0.596025
2,How do measurement gaps between laboratory and...,[49 early lifecycle TEVV approaches are develo...,Measurement gaps between laboratory and real-w...,Measurement gaps between laboratory and real-w...,0.818182,0.988752,1.0,1.0,0.821382
3,How should data collection and use-case scope ...,[Data collection and use-case scope limits. Da...,"To prevent ""mission creep"" in automated system...",Data collection and use-case scope limits in a...,1.0,0.919733,1.0,1.0,0.712613
4,What action did the Federal Trade Commission t...,[DATA PRIVACY \nEXTRA PROTECTIONS FOR DATA RE...,The Federal Trade Commission (FTC) took action...,FTC sued Kochava for selling data that tracks ...,0.363636,0.939768,1.0,0.0,0.887542
5,How should explanatory mechanisms be built int...,[NOTICE & \nEXPLANATION \nWHAT SHOULD BE EXPEC...,To guarantee complete behavior transparency in...,In settings where the consequences are high as...,0.96,0.939667,1.0,1.0,0.713803
6,What are some examples of GAI risks that organ...,"[risks, and creates unique risks. GAI risks c...",Organizations need to consider several risks a...,Organizations need to consider various GAI ris...,1.0,0.949398,0.25,1.0,0.918143
7,How should the validity of explanations provid...,[NOTICE & \nEXPLANATION \nWHAT SHOULD BE EXPEC...,To ensure the validity of explanations provide...,The explanation provided by a system should ac...,1.0,0.958921,1.0,1.0,0.770297
8,How do generative models like LLMs generate ou...,"[answer itself is incorrect. Similarly, an LLM...","Generative models, such as large language mode...",Generative models like LLMs generate outputs t...,1.0,0.909575,1.0,0.833333,0.404061
9,How can appropriate diligence on training data...,[27 MP-4.1-0 10 Conduct appropriate diligence ...,Appropriate diligence on training data use is ...,Appropriate diligence on training data use can...,1.0,0.981217,1.0,1.0,0.338359


In [44]:
medium_chunk_results_df.to_csv("medium_chunk_ragas_results.csv", index=False)

In [40]:
medium_chunk_pdf = pd.DataFrame(list(medium_chunk_results.items()), columns=['Metric', 'MediumChunk'])

In [41]:
medium_chunk_pdf

Unnamed: 0,Metric,MediumChunk
0,faithfulness,0.895359
1,answer_relevancy,0.955419
2,context_recall,0.934028
3,context_precision,0.9375
4,answer_correctness,0.629267


In [45]:
medium_chunk_pdf.to_csv("medium_chunk_metrics.csv", index=False)

In [42]:
df_baseline_medium_chunk = pd.merge(df_baseline, medium_chunk_pdf, on='Metric')

df_baseline_medium_chunk['Baseline -> MediumChunk'] = df_baseline_medium_chunk['MediumChunk'] - df_baseline_medium_chunk['Baseline']

df_baseline_medium_chunk

Unnamed: 0,Metric,Baseline,MediumChunk,Baseline -> MediumChunk
0,faithfulness,0.698407,0.895359,0.196952
1,answer_relevancy,0.946766,0.955419,0.008653
2,context_recall,0.855903,0.934028,0.078125
3,context_precision,0.903935,0.9375,0.033565
4,answer_correctness,0.648744,0.629267,-0.019477


In [48]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=3000,
        chunk_overlap=0,
    )
large_chunk_docs = text_splitter.split_documents(pdf_documents)

In [49]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [50]:
vectorstore = Qdrant.from_documents(
    documents=large_chunk_docs,
    embedding=embedding,
    location=":memory:",
    collection_name="Full Content w/ Clean PDF"
)

large_chunk_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 10},
)

large_chunk_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

In [51]:
large_chunk_rag_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=large_chunk_retriever,
        memory=large_chunk_memory,
        combine_docs_chain_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

In [52]:
answers = []
contexts = []

for question in test_questions:
  response = large_chunk_rag_chain.invoke({"question" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["source_documents"]])

In [53]:
from datasets import Dataset

large_chunk_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [54]:
large_chunk_dataset[0]

{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',
 'answer': "Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\n\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect their lives, such as in hiring, credit scoring, or legal judgments. This transparency helps build trust in these systems (Source: [document name], p. 6).\n\n2. **Accountability**: By requiring organizations to explain how decisions are made, it holds them accountable for the outcomes of their automated systems. This means that if a decision negatively impacts someone, they have the right to understand the reasoning behind it and contest it if necessary (Source: [document name], p. 6).\n\n3. **Empowerment**: Notice and explanation empower individuals by giving them the information they n

In [65]:
large_chunk_dataframe = pd.DataFrame({
    'question': response_dataset['question'],
    'answer': response_dataset['answer'],
    'contexts': response_dataset['contexts'],
    'ground_truth': response_dataset['ground_truth']
})

In [55]:
large_chunk_results = evaluate(large_chunk_dataset, metrics)

Evaluating: 100%|██████████| 120/120 [01:56<00:00,  1.03it/s]


In [56]:
large_chunk_results

{'faithfulness': 0.7961, 'answer_relevancy': 0.9593, 'context_recall': 0.8438, 'context_precision': 0.9294, 'answer_correctness': 0.6326}

In [None]:
large_chunk_size = large_chunk_results.to_pandas()
large_chunk_size

In [65]:
large_chunk_size.to_csv("large_chunk_ragas_results.csv", index=False)

In [60]:
large_chunk_pdf = pd.DataFrame(list(large_chunk_results.items()), columns=['Metric', 'LargeChunk'])

In [61]:
large_chunk_pdf

Unnamed: 0,Metric,LargeChunk
0,faithfulness,0.796131
1,answer_relevancy,0.959296
2,context_recall,0.84375
3,context_precision,0.929398
4,answer_correctness,0.63258


In [64]:
large_chunk_pdf.to_csv("large_chunk_metrics.csv", index=False)

In [62]:
df_baseline_medium_chunk = pd.merge(df_baseline, medium_chunk_pdf, on='Metric')
df_baseline_medium_and_large_chunk = pd.merge(df_baseline_medium_chunk, large_chunk_pdf, on='Metric')


df_baseline_medium_and_large_chunk

Unnamed: 0,Metric,Baseline,MediumChunk,LargeChunk
0,faithfulness,0.698407,0.895359,0.796131
1,answer_relevancy,0.946766,0.955419,0.959296
2,context_recall,0.855903,0.934028,0.84375
3,context_precision,0.903935,0.9375,0.929398
4,answer_correctness,0.648744,0.629267,0.63258


In [63]:
df_baseline_medium_and_large_chunk['MaxValue'] = df_baseline_medium_and_large_chunk[['Baseline', 'MediumChunk', 'LargeChunk']].max(axis=1)

df_baseline_medium_and_large_chunk['MaxMetric'] = df_baseline_medium_and_large_chunk[['Baseline', 'MediumChunk', 'LargeChunk']].idxmax(axis=1)

df_baseline_medium_and_large_chunk['HigestValue'] = df_baseline_medium_and_large_chunk['MaxValue'].round(2).astype(str) + ' (' + df_baseline_medium_and_large_chunk['MaxMetric'] + ')'

df_baseline_medium_and_large_chunk = df_baseline_medium_and_large_chunk.drop(columns=['MaxValue', 'MaxMetric'])

df_baseline_medium_and_large_chunk

Unnamed: 0,Metric,Baseline,MediumChunk,LargeChunk,HigestValue
0,faithfulness,0.698407,0.895359,0.796131,0.9 (MediumChunk)
1,answer_relevancy,0.946766,0.955419,0.959296,0.96 (LargeChunk)
2,context_recall,0.855903,0.934028,0.84375,0.93 (MediumChunk)
3,context_precision,0.903935,0.9375,0.929398,0.94 (MediumChunk)
4,answer_correctness,0.648744,0.629267,0.63258,0.65 (Baseline)


In [66]:
df_baseline_medium_and_large_chunk.to_csv("chunksize_eval.csv", index=False)

In [67]:
text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=250,
    )
medium_chunk_medium_overlap_docs = text_splitter.split_documents(pdf_documents)

In [68]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

In [69]:
vectorstore = Qdrant.from_documents(
    documents=medium_chunk_medium_overlap_docs,
    embedding=embedding,
    location=":memory:",
    collection_name="Full Content w/ Clean PDF"
)

medium_chunk_medium_overlap_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4, "fetch_k": 10},
)

medium_chunk_medium_overlap_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, output_key="answer")

In [86]:
medium_chunk_medium_overlap_rag_chain = ConversationalRetrievalChain.from_llm(
        llm,
        retriever=medium_chunk_medium_overlap_retriever,
        memory=medium_chunk_medium_overlap_memory,
        combine_docs_chain_kwargs={"prompt": PROMPT},
        return_source_documents=True,
    )

In [87]:
answers = []
contexts = []

for question in test_questions:
  response = medium_chunk_medium_overlap_rag_chain.invoke({"question" : question})
  answers.append(response["answer"])
  contexts.append([context.page_content for context in response["source_documents"]])

In [88]:
from datasets import Dataset

medium_chunk_medium_overlap_dataset = Dataset.from_dict({
    "question" : test_questions,
    "answer" : answers,
    "contexts" : contexts,
    "ground_truth" : test_groundtruths
})

In [89]:
medium_chunk_medium_overlap_dataset[0]

{'question': 'What is the significance of providing notice and explanation as a legal requirement in the context of automated systems?',
 'answer': "Providing notice and explanation as a legal requirement in the context of automated systems is significant for several reasons:\n\n1. **Transparency**: It ensures that individuals are aware when automated systems are being used to make decisions that affect them. This transparency helps build trust between the public and the entities deploying these systems.\n\n2. **Accountability**: By requiring entities to identify themselves and explain their systems, it holds them accountable for the decisions made by these automated systems. This means that if a decision negatively impacts someone, they can seek clarification and potentially contest that decision.\n\n3. **Empowerment**: When individuals receive clear explanations about how decisions are made, they are better equipped to understand and challenge those decisions if necessary. This is pa

In [90]:
medium_chunk_medium_overlap_results = evaluate(medium_chunk_medium_overlap_dataset, metrics)

Evaluating: 100%|██████████| 120/120 [02:09<00:00,  1.08s/it]


In [91]:
medium_chunk_medium_overlap_results

{'faithfulness': 0.8409, 'answer_relevancy': 0.9521, 'context_recall': 0.8472, 'context_precision': 0.9572, 'answer_correctness': 0.6181}

In [92]:
medium_chunk_medium_overlap_df = large_chunk_results.to_pandas()
medium_chunk_medium_overlap_df

Unnamed: 0,question,contexts,answer,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,What is the significance of providing notice a...,[NOTICE & \nEXPLANATION \nWHY THIS PRINCIPLE I...,Providing notice and explanation as a legal re...,Providing notice and explanation as a legal re...,1.0,0.971321,1.0,1.0,0.604417
1,"How can structured human feedback exercises, s...",[50 Participatory Engagement Methods \nOn an ...,"Structured human feedback exercises, such as G...","Structured human feedback exercises, such as G...",1.0,0.988309,1.0,1.0,0.817911
2,How do measurement gaps between laboratory and...,[49 early lifecycle TEVV approaches are develo...,Measurement gaps between laboratory and real-w...,Measurement gaps between laboratory and real-w...,1.0,0.988752,1.0,1.0,0.823813
3,How should data collection and use-case scope ...,[DATA PRIVACY \nWHAT SHOULD BE EXPECTED OF AUT...,To determine and implement data collection and...,Data collection and use-case scope limits in a...,1.0,0.934138,1.0,1.0,0.583791
4,What action did the Federal Trade Commission t...,[• A device originally developed to help peopl...,The Federal Trade Commission (FTC) took action...,FTC sued Kochava for selling data that tracks ...,0.0,0.939559,0.0,0.0,0.378037
5,How should explanatory mechanisms be built int...,[NOTICE & \nEXPLANATION \nWHAT SHOULD BE EXPEC...,Integrating explanatory mechanisms into system...,In settings where the consequences are high as...,1.0,0.930169,1.0,1.0,0.618594
6,What are some examples of GAI risks that organ...,"[3 the abuse, misuse, and unsafe repurposing b...",Organizations need to consider several GAI (Ge...,Organizations need to consider various GAI ris...,1.0,0.945032,1.0,1.0,0.692431
7,How should the validity of explanations provid...,[NOTICE & \nEXPLANATION \nWHAT SHOULD BE EXPEC...,To ensure the validity of explanations provide...,The explanation provided by a system should ac...,1.0,0.956092,1.0,1.0,0.811835
8,How do generative models like LLMs generate ou...,[6 2.2. Confabulation \n“Confabulation” refer...,"Generative models, such as large language mode...",Generative models like LLMs generate outputs t...,1.0,0.950645,1.0,1.0,0.38785
9,How can appropriate diligence on training data...,[27 MP-4.1-0 10 Conduct appropriate diligence ...,Appropriate diligence in the use of training d...,Appropriate diligence on training data use can...,0.866667,0.980143,0.666667,0.805556,0.36201


In [93]:
medium_chunk_medium_overlap_metrics = pd.DataFrame(list(medium_chunk_medium_overlap_results.items()), columns=['Metric', 'MedChunkMedOverlap'])

In [94]:
medium_chunk_medium_overlap_metrics

Unnamed: 0,Metric,MedChunkMedOverlap
0,faithfulness,0.840881
1,answer_relevancy,0.952068
2,context_recall,0.847222
3,context_precision,0.957176
4,answer_correctness,0.618149


In [95]:
medium_chunk_pdf

Unnamed: 0,Metric,MediumChunk
0,faithfulness,0.895359
1,answer_relevancy,0.955419
2,context_recall,0.934028
3,context_precision,0.9375
4,answer_correctness,0.629267


In [97]:
df_medium_chunk_vs_med_chunk_med_overlap = pd.merge(medium_chunk_pdf, medium_chunk_medium_overlap_metrics, on='Metric')

df_medium_chunk_vs_med_chunk_med_overlap['MediumChunk -> MedChunkMedOverlap'] = df_medium_chunk_vs_med_chunk_med_overlap['MedChunkMedOverlap'] - df_medium_chunk_vs_med_chunk_med_overlap['MediumChunk']

df_medium_chunk_vs_med_chunk_med_overlap

Unnamed: 0,Metric,MediumChunk,MedChunkMedOverlap,MediumChunk -> MedChunkMedOverlap
0,faithfulness,0.895359,0.840881,-0.054478
1,answer_relevancy,0.955419,0.952068,-0.003351
2,context_recall,0.934028,0.847222,-0.086806
3,context_precision,0.9375,0.957176,0.019676
4,answer_correctness,0.629267,0.618149,-0.011118


In [98]:
df_medium_chunk_vs_med_chunk_med_overlap.to_csv("medium_chunk_vs_med_chunk_med_overlap_metrics.csv", index=False)