Final_Assignment_Template

Sleeping

App Files Files Community

Diego Staphorst committed on Apr 30

Commit

9e4d4b4

1 Parent(s): 81917a3

feat(langchain) knowledge base

Browse files

Files changed (6) hide show

.python-version +1 -0
agent.py +15 -0
app.py +1 -11
dataset.py +66 -0
main.py +6 -0
pyproject.toml +14 -0

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.12

agent.py ADDED Viewed

	@@ -0,0 +1,15 @@

+class BasicAgent:
+    """
+    A simple agent that returns a fixed answer for any question.
+    """
+    def __init__(self):
+        print("BasicAgent initialized.")
+    def __call__(self, question: str) -> str:
+        """
+        Processes the question and returns a fixed answer.
+        """
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = "This is a default answer."
+        print(f"Agent returning fixed answer: {fixed_answer}")
+        return fixed_answer

app.py CHANGED Viewed

@@ -3,21 +3,11 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
-# (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
-class BasicAgent:
-    def __init__(self):
-        print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """

 import requests
 import inspect
 import pandas as pd
+from agent import BasicAgent  # Import the agent
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """

dataset.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from datasets import load_dataset
+from datasets import Dataset
+import datasets
+from tqdm import tqdm
+from transformers import AutoTokenizer
+from langchain.docstore.document import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores.utils import DistanceStrategy
+knowledge_base = datasets.load_dataset("gaia-benchmark/GAIA", '2023_level1', split='test')
+print(knowledge_base.column_names)
+# ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']
+source_docs = [
+    Document(
+        page_content=doc["Question"],
+        metadata={
+            "task_id": doc["task_id"],
+            "level": doc["Level"],
+            "final_answer": doc["Final answer"],
+            "file_name": doc["file_name"],
+            "file_path": doc["file_path"],
+            "annotator_metadata": doc["Annotator Metadata"],
+        },
+    )
+    for doc in knowledge_base
+]
+text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+    AutoTokenizer.from_pretrained("thenlper/gte-small"),
+    chunk_size=200,
+    chunk_overlap=20,
+    add_start_index=True,
+    strip_whitespace=True,
+    separators=["\n\n", "\n", ".", " ", ""],
+)
+# Split docs and keep only unique ones
+print("Splitting documents...")
+docs_processed = []
+unique_texts = {}
+for doc in tqdm(source_docs):
+    new_docs = text_splitter.split_documents([doc])
+    for new_doc in new_docs:
+        if new_doc.page_content not in unique_texts:
+            unique_texts[new_doc.page_content] = True
+            docs_processed.append(new_doc)
+print("Embedding documents... This should take a few minutes (5 minutes on MacBook with M1 Pro)")
+embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
+vectordb = FAISS.from_documents(
+    documents=docs_processed,
+    embedding=embedding_model,
+    distance_strategy=DistanceStrategy.COSINE,
+)
+if __name__ == "__main__":
+    # print(dataset)
+    # ds = Dataset.from_dict(dataset)
+    # dataset = ds.with_format("pandas")
+    print(vectordb)

main.py ADDED Viewed

	@@ -0,0 +1,6 @@

+def main():
+    print("Hello from final-assignment-template!")
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,14 @@

+[project]
+name = "final-assignment-template"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "datasets>=3.5.1",
+    "gradio[oauth]>=5.28.0",
+    "langchain>=0.3.24",
+    "langchain-community>=0.3.23",
+    "requests>=2.32.3",
+    "transformers>=4.51.3",
+]