File size: 4,453 Bytes
60d9d3a
 
 
69e37b7
60d9d3a
 
 
 
 
 
 
 
 
fa1cf80
 
cd659ed
 
 
 
 
 
 
60d9d3a
cd659ed
 
 
60d9d3a
 
 
cd659ed
 
60d9d3a
 
fa1cf80
 
 
66415cb
fa1cf80
66415cb
fa1cf80
 
 
 
 
 
 
 
 
 
60d9d3a
 
fa1cf80
cd659ed
 
 
69e37b7
cd659ed
60d9d3a
cd659ed
 
 
 
 
 
 
 
 
 
 
69e37b7
cd659ed
 
 
 
 
 
 
 
 
 
 
60d9d3a
69e37b7
cd659ed
60d9d3a
cd659ed
60d9d3a
cd659ed
 
60d9d3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import json
import chromadb
import pandas as pd
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Document

from dotenv import load_dotenv

load_dotenv()  # Load OPENAI_API_KEY from .env (not included in repo)

import gdown

data = None


def get_data(download=False):
    """Return the process-wide ``Data`` singleton, constructing it lazily.

    Args:
        download: Forwarded to ``Data`` only on the first call, when the
            singleton is actually created; ignored afterwards.

    Returns:
        The shared ``Data`` instance.
    """
    global data
    if data is not None:
        return data
    data = Data(download)
    return data

class Data:
    """Loads the SQuAD-style QA dataset and a Chroma-backed vector index.

    Attributes populated by ``load_data()``:
        df: pandas DataFrame with columns [Title, Context, Question, Answer],
            one row per (question, accepted answer) pair.
        documents: list of llama_index ``Document`` objects, one per question.
        client: ``chromadb.PersistentClient`` bound to ``./chroma_db``.
        collection: the ``"simple_index"`` Chroma collection.
        index: llama_index ``VectorStoreIndex`` over that collection.
    """

    def __init__(self, download=False):
        print("Initializing Data...")
        print(f"Download: {download}")
        self.client = None
        self.collection = None
        self.index = None
        if download:
            self.download_data()
        self.load_data()

    def download_data(self):
        """Fetch a pre-built ``./chroma_db`` index from Google Drive (best effort).

        Returns:
            True if ``./chroma_db`` exists afterwards, False otherwise.
            Failures are logged, not raised: ``load_data()`` can still rebuild
            the index locally from the raw dataset.
        """
        import zipfile  # local import: only needed on the download path

        if not os.path.exists("./chroma_db"):
            try:
                print("Downloading data...")
                file_id = "1JvYQ9E5zDBKRCUKkxejDvp7UGwzxDAUW"
                url = f"https://drive.google.com/uc?export=download&id={file_id}"
                output = "chroma_db.zip"
                gdown.download(url, output, quiet=False)
                print("Unzipping data...")
                # Use the stdlib instead of os.system("unzip ...") so the
                # extraction also works where no `unzip` binary exists
                # (e.g. Windows) and failures raise a catchable exception.
                with zipfile.ZipFile(output) as zf:
                    zf.extractall(".")
            except Exception as e:
                # Deliberate best-effort: report and fall through.
                print(f"Error downloading data: {e}")

        return os.path.exists("./chroma_db")

    def _parse_raw_data(self):
        """Read data/train-v1.1.json and populate ``self.df`` / ``self.documents``."""
        with open('data/train-v1.1.json', 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        raw_documents = []
        documents = []

        # Loop variable renamed from `data` so it no longer shadows the
        # module-level `data` singleton used by get_data().
        for article in raw_data['data']:
            title = article['title']
            for par in article['paragraphs']:
                context = par['context']
                for qa in par['qas']:
                    question = qa['question']
                    answers = []
                    for ans in qa['answers']:
                        # Deduplicate answer texts, preserving first-seen order.
                        if ans['text'] not in answers:
                            answers.append(ans['text'])
                    for answer in answers:
                        raw_documents.append([title, context, question, answer])

                    # BUG FIX: the original interpolated the list comprehension
                    # directly into the f-string, producing a Python list repr
                    # like ['1. a', '2. b'] in the indexed text. Join the
                    # numbered answers onto separate lines instead.
                    numbered_answers = "\n".join(
                        f"{i + 1}. {ans}" for i, ans in enumerate(answers)
                    )
                    doc = f"""
                        Title: {title}
                        Context: {context}
                        Question: {question}
                        Acceptable Answers:
                        {numbered_answers}
                    """
                    # Remove padding on each line
                    doc = "\n".join([line.strip() for line in doc.split("\n")])
                    documents.append(doc)

        self.df = pd.DataFrame(raw_documents, columns=["Title", "Context", "Question", "Answer"])
        self.documents = [Document(text=t) for t in documents]

    def _connect_chroma(self):
        """Connect to ``./chroma_db`` and return (vector_store, storage_context).

        Also (re)binds ``self.client`` and ``self.collection`` as a side effect.
        """
        self.client = chromadb.PersistentClient(path="./chroma_db")
        self.collection = self.client.get_or_create_collection("simple_index")
        vector_store = ChromaVectorStore(chroma_collection=self.collection)
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        return vector_store, storage_context

    def load_data(self):
        """Parse the raw dataset, then build the index if missing and load it."""
        print("Loading data...")
        self._parse_raw_data()
        print("Raw Data loaded")

        if not os.path.exists("./chroma_db"):
            # No persisted index yet: embed the documents into a fresh DB.
            print("Creating Chroma DB...")
            _, storage_context = self._connect_chroma()
            self.index = VectorStoreIndex.from_documents(
                self.documents, storage_context=storage_context
            )
            print("Chroma DB created")
        else:
            print("Chroma DB already exists")

        print("Loading index...")
        # Reconnect and load the index from the stored vectors (mirrors the
        # original behavior of always re-opening the client after creation).
        vector_store, storage_context = self._connect_chroma()
        self.index = VectorStoreIndex.from_vector_store(
            vector_store, storage_context=storage_context
        )
        print("Index loaded")