Diego Staphorst committed
Commit 9e4d4b4 · 1 Parent(s): 81917a3

feat(langchain): knowledge base

Files changed (6)
  1. .python-version +1 -0
  2. agent.py +15 -0
  3. app.py +1 -11
  4. dataset.py +66 -0
  5. main.py +6 -0
  6. pyproject.toml +14 -0
.python-version ADDED
@@ -0,0 +1 @@
+ 3.12
agent.py ADDED
@@ -0,0 +1,15 @@
+ class BasicAgent:
+     """
+     A simple agent that returns a fixed answer for any question.
+     """
+     def __init__(self):
+         print("BasicAgent initialized.")
+
+     def __call__(self, question: str) -> str:
+         """
+         Processes the question and returns a fixed answer.
+         """
+         print(f"Agent received question (first 50 chars): {question[:50]}...")
+         fixed_answer = "This is a default answer."
+         print(f"Agent returning fixed answer: {fixed_answer}")
+         return fixed_answer
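
For reference, a minimal usage sketch of the class added above (illustrative, not part of the diff; the question string is an arbitrary placeholder):

from agent import BasicAgent

# The agent instance is callable like a function; as written,
# every question yields the same fixed answer.
agent = BasicAgent()
print(agent("What is the capital of France?"))  # -> "This is a default answer."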
app.py CHANGED
@@ -3,21 +3,11 @@ import gradio as gr
  import requests
  import inspect
  import pandas as pd
+ from agent import BasicAgent  # Import the agent

- # (Keep Constants as is)
  # --- Constants ---
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

- # --- Basic Agent Definition ---
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
- class BasicAgent:
-     def __init__(self):
-         print("BasicAgent initialized.")
-     def __call__(self, question: str) -> str:
-         print(f"Agent received question (first 50 chars): {question[:50]}...")
-         fixed_answer = "This is a default answer."
-         print(f"Agent returning fixed answer: {fixed_answer}")
-         return fixed_answer

  def run_and_submit_all( profile: gr.OAuthProfile | None):
      """
dataset.py ADDED
@@ -0,0 +1,66 @@
+ from datasets import load_dataset
+ from datasets import Dataset
+ import datasets
+
+
+ from tqdm import tqdm
+ from transformers import AutoTokenizer
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import FAISS
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores.utils import DistanceStrategy
+
+ knowledge_base = datasets.load_dataset("gaia-benchmark/GAIA", '2023_level1', split='test')
+ print(knowledge_base.column_names)
+ # ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']
+ source_docs = [
+     Document(
+         page_content=doc["Question"],
+         metadata={
+             "task_id": doc["task_id"],
+             "level": doc["Level"],
+             "final_answer": doc["Final answer"],
+             "file_name": doc["file_name"],
+             "file_path": doc["file_path"],
+             "annotator_metadata": doc["Annotator Metadata"],
+         },
+     )
+     for doc in knowledge_base
+ ]
+
+ text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
+     AutoTokenizer.from_pretrained("thenlper/gte-small"),
+     chunk_size=200,
+     chunk_overlap=20,
+     add_start_index=True,
+     strip_whitespace=True,
+     separators=["\n\n", "\n", ".", " ", ""],
+ )
+
+ # Split docs and keep only unique ones
+ print("Splitting documents...")
+ docs_processed = []
+ unique_texts = {}
+ for doc in tqdm(source_docs):
+     new_docs = text_splitter.split_documents([doc])
+     for new_doc in new_docs:
+         if new_doc.page_content not in unique_texts:
+             unique_texts[new_doc.page_content] = True
+             docs_processed.append(new_doc)
+
+ print("Embedding documents... This should take a few minutes (5 minutes on MacBook with M1 Pro)")
+ embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small")
+ vectordb = FAISS.from_documents(
+     documents=docs_processed,
+     embedding=embedding_model,
+     distance_strategy=DistanceStrategy.COSINE,
+ )
+
+
+
+ if __name__ == "__main__":
+     # print(dataset)
+     # ds = Dataset.from_dict(dataset)
+     # dataset = ds.with_format("pandas")
+     print(vectordb)
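
A sketch of how the index built above might be queried (illustrative, not part of the diff: the query string, k value, and index path are assumptions; similarity_search and save_local are standard LangChain FAISS methods):

from dataset import vectordb  # importing dataset runs the full build above

# Retrieve the 3 chunks nearest to an example query; each hit carries the
# GAIA metadata (task_id, level, final_answer, ...) attached in dataset.py.
results = vectordb.similarity_search("Which city hosted the event?", k=3)
for doc in results:
    print(doc.metadata["task_id"], "->", doc.page_content[:80])

# Optionally persist the index so later runs can skip re-embedding.
vectordb.save_local("gaia_faiss_index")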
main.py ADDED
@@ -0,0 +1,6 @@
+ def main():
+     print("Hello from final-assignment-template!")
+
+
+ if __name__ == "__main__":
+     main()
pyproject.toml ADDED
@@ -0,0 +1,14 @@
+ [project]
+ name = "final-assignment-template"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.12"
+ dependencies = [
+     "datasets>=3.5.1",
+     "gradio[oauth]>=5.28.0",
+     "langchain>=0.3.24",
+     "langchain-community>=0.3.23",
+     "requests>=2.32.3",
+     "transformers>=4.51.3",
+ ]