import json
import os

import chromadb
import gdown
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import Document, StorageContext, VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore

load_dotenv()  # Load OPENAI_API_KEY from .env (not included in repo)

data = None


def get_data(download=False):
    """Return the module-level Data singleton, creating it on first use."""
    global data
    if data is None:
        data = Data(download)
    return data

class Data:
    def __init__(self, download=False):
        print("Initializing Data...")
        print(f"Download: {download}")
        self.client = None
        self.collection = None
        self.index = None
        if download:
            self.download_data()
        self.load_data()

    def download_data(self):
        # Download the already-indexed data if it is not present locally
        if not os.path.exists("./chroma_db"):
            try:
                print("Downloading data...")
                file_id = "1JvYQ9E5zDBKRCUKkxejDvp7UGwzxDAUW"
                url = f"https://drive.google.com/uc?export=download&id={file_id}"
                output = "chroma_db.zip"
                gdown.download(url, output, quiet=False)
                print("Unzipping data...")
                os.system("unzip chroma_db.zip")
            except Exception as e:
                print(f"Error downloading data: {e}")
        # Report whether the database directory now exists
        return os.path.exists("./chroma_db")

    def load_data(self):
        print("Loading data...")
        # Parse the SQuAD-style training file
        with open("data/train-v1.1.json", "r") as f:
            raw_data = json.load(f)
        raw_documents = []
        documents = []
        # `article` (not `data`) avoids shadowing the module-level singleton
        for article in raw_data["data"]:
            title = article["title"]
            for par in article["paragraphs"]:
                context = par["context"]
                for qa in par["qas"]:
                    question = qa["question"]
                    # Deduplicate answer texts while preserving order
                    answers = []
                    for ans in qa["answers"]:
                        if ans["text"] not in answers:
                            answers.append(ans["text"])
                    for answer in answers:
                        raw_documents.append([title, context, question, answer])
                    # Join the answers into numbered lines; interpolating the
                    # list comprehension directly would embed its repr
                    # (e.g. "['1. foo', '2. bar']") in the document text
                    numbered_answers = "\n".join(
                        f"{i + 1}. {ans}" for i, ans in enumerate(answers)
                    )
                    doc = f"""
                    Title: {title}
                    Context: {context}
                    Question: {question}
                    Acceptable Answers:
                    {numbered_answers}
                    """
                    # Remove indentation padding on each line
                    doc = "\n".join(line.strip() for line in doc.split("\n"))
                    documents.append(doc)
        self.df = pd.DataFrame(
            raw_documents, columns=["Title", "Context", "Question", "Answer"]
        )
        self.documents = [Document(text=t) for t in documents]
        print("Raw Data loaded")
        if not os.path.exists("./chroma_db"):
            # No stored index: generate one from the raw data
            print("Creating Chroma DB...")
            # Initialize client, setting the path where data is persisted
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # Create the collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # Assign Chroma as the vector_store in the storage context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # Create the index (embeds every document, so this can take a while)
            self.index = VectorStoreIndex.from_documents(
                self.documents, storage_context=storage_context
            )
            print("Chroma DB created")
        else:
            print("Chroma DB already exists")
            print("Loading index...")
            # Initialize client against the existing database
            self.client = chromadb.PersistentClient(path="./chroma_db")
            # Get the existing collection
            self.collection = self.client.get_or_create_collection("simple_index")
            # Assign Chroma as the vector_store in the storage context
            vector_store = ChromaVectorStore(chroma_collection=self.collection)
            storage_context = StorageContext.from_defaults(vector_store=vector_store)
            # Load the index from the stored vectors
            self.index = VectorStoreIndex.from_vector_store(
                vector_store, storage_context=storage_context
            )
            print("Index loaded")