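# Build a FAISS index over GAIA 2023 Level 1 questions: load the dataset,
# wrap each row as a LangChain Document, chunk with a tokenizer-aware splitter,
# then embed with gte-small for cosine-similarity retrieval.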
import datasets
from tqdm import tqdm
from transformers import AutoTokenizer
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
knowledge_base = datasets.load_dataset("gaia-benchmark/GAIA", "2023_level1", split="test")
print(knowledge_base.column_names)
# ['task_id', 'Question', 'Level', 'Final answer', 'file_name', 'file_path', 'Annotator Metadata']
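# Wrap each GAIA row in a LangChain Document: the question text becomes the
# searchable content, everything else rides along as metadata. (Note: GAIA
# withholds ground-truth answers on the test split, so "Final answer" may be
# a placeholder there; the validation split carries real answers.)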
source_docs = [
    Document(
        page_content=doc["Question"],
        metadata={
            "task_id": doc["task_id"],
            "level": doc["Level"],
            "final_answer": doc["Final answer"],
            "file_name": doc["file_name"],
            "file_path": doc["file_path"],
            "annotator_metadata": doc["Annotator Metadata"],
        },
    )
    for doc in knowledge_base
]
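# Chunk with a splitter that measures length in gte-small tokens (not characters),
# so every 200-token chunk fits comfortably inside the embedding model's context window.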
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
    separators=["\n\n", "\n", ".", " ", ""],
)
# Split docs and keep only unique chunks
print("Splitting documents...")
docs_processed = []
unique_texts = set()
for doc in tqdm(source_docs):
    for new_doc in text_splitter.split_documents([doc]):
        if new_doc.page_content not in unique_texts:
            unique_texts.add(new_doc.page_content)
            docs_processed.append(new_doc)
print("Embedding documents... This should take a few minutes (5 minutes on MacBook with M1 Pro)") | |
embedding_model = HuggingFaceEmbeddings(model_name="thenlper/gte-small") | |
vectordb = FAISS.from_documents( | |
documents=docs_processed, | |
embedding=embedding_model, | |
distance_strategy=DistanceStrategy.COSINE, | |
) | |
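
# A minimal retrieval sketch. The query string is illustrative, not taken from
# the dataset; similarity_search embeds it with the same gte-small model and
# returns the k nearest chunks under the cosine strategy configured above.
example_query = "Which of the text files mentions a deadline?"  # hypothetical query
for hit in vectordb.similarity_search(example_query, k=3):
    print(hit.metadata["task_id"], "->", hit.page_content[:80])

# Optionally persist the index so later runs can skip re-embedding;
# FAISS.save_local / FAISS.load_local are the standard LangChain round-trip.
vectordb.save_local("gaia_faiss_index")  # assumed output directory name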
if __name__ == "__main__":
    print(f"Indexed {vectordb.index.ntotal} chunks into FAISS")