vonliechti committed on
Commit
fa1cf80
·
verified ·
1 Parent(s): 359b180

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. data.py +43 -18
  3. requirements.txt +2 -1
.gitignore CHANGED
@@ -3,6 +3,7 @@
3
 
4
  # Data
5
  chroma_db/
 
6
 
7
  # Byte-compiled / optimized / DLL files
8
  __pycache__/
 
3
 
4
  # Data
5
  chroma_db/
6
+ data/
7
 
8
  # Byte-compiled / optimized / DLL files
9
  __pycache__/
data.py CHANGED
@@ -10,39 +10,64 @@ from dotenv import load_dotenv
10
 
11
  load_dotenv() # Load OPENAI_API_KEY from .env (not included in repo)
12
 
 
 
13
  class Data:
14
  def __init__(self):
15
  self.client = None
16
  self.collection = None
17
  self.index = None
 
18
  self.load_data()
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def load_data(self):
21
  print("Loading data...")
22
- with open('data/train-v1.1.json', 'r') as f:
23
- raw_data = json.load(f)
 
 
 
24
 
25
- extracted_question = []
26
- extracted_answer = []
27
 
28
- for data in raw_data['data']:
29
- for par in data['paragraphs']:
30
- for qa in par['qas']:
31
- for ans in qa['answers']:
32
- extracted_question.append(qa['question'])
33
- extracted_answer.append(ans['text'])
34
 
35
- documents = []
36
- for i in range(len(extracted_question)):
37
- documents.append(f"Question: {extracted_question[i]} \nAnswer: {extracted_answer[i]}")
38
 
39
- self.documents = [Document(text=t) for t in documents]
40
- self.extracted_question = extracted_question
41
- self.extracted_answer = extracted_answer
42
 
43
- print("Raw Data loaded")
44
 
45
- if not os.path.exists("./chroma_db"):
46
  print("Creating Chroma DB...")
47
  # initialize client, setting path to save data
48
  self.client = chromadb.PersistentClient(path="./chroma_db")
 
10
 
11
  load_dotenv() # Load OPENAI_API_KEY from .env (not included in repo)
12
 
13
+ import gdown
14
+
15
  class Data:
16
  def __init__(self):
17
  self.client = None
18
  self.collection = None
19
  self.index = None
20
+ self.download_data()
21
  self.load_data()
22
 
23
+ def download_data(self):
24
+ # Download the already indexed data
25
+ if not os.path.exists("./chroma_db"):
26
+ try:
27
+ print("Downloading data...")
28
+ file_id = "12xLx8J0dhtZuc8G-7xVyldLVnB3eTmxe"
29
+ url = f"https://drive.google.com/uc?export=download&id={file_id}"
30
+ output = "chroma_db.zip"
31
+ gdown.download(url, output, quiet=False)
32
+ # download_file_from_google_drive(file_id, "chroma_db.zip")
33
+ # url = "https://drive.google.com/file/d/12xLx8J0dhtZuc8G-7xVyldLVnB3eTmxe/view?usp=sharing"
34
+ # url = "https://drive.google.com/uc?export=download&id=12xLx8J0dhtZuc8G-7xVyldLVnB3eTmxe"
35
+ # os.system(f"wget {url} -O chroma_db.zip")
36
+ print("Unzipping data...")
37
+ os.system("unzip chroma_db.zip")
38
+ except Exception as e:
39
+ print(f"Error downloading data: {e}")
40
+
41
+ return os.path.exists("./chroma_db")
42
+
43
  def load_data(self):
44
  print("Loading data...")
45
+
46
+ if not os.path.exists("./chroma_db"):
47
+ # Attempt to generate an index from the raw data
48
+ with open('data/train-v1.1.json', 'r') as f:
49
+ raw_data = json.load(f)
50
 
51
+ extracted_question = []
52
+ extracted_answer = []
53
 
54
+ for data in raw_data['data']:
55
+ for par in data['paragraphs']:
56
+ for qa in par['qas']:
57
+ for ans in qa['answers']:
58
+ extracted_question.append(qa['question'])
59
+ extracted_answer.append(ans['text'])
60
 
61
+ documents = []
62
+ for i in range(len(extracted_question)):
63
+ documents.append(f"Question: {extracted_question[i]} \nAnswer: {extracted_answer[i]}")
64
 
65
+ self.documents = [Document(text=t) for t in documents]
66
+ self.extracted_question = extracted_question
67
+ self.extracted_answer = extracted_answer
68
 
69
+ print("Raw Data loaded")
70
 
 
71
  print("Creating Chroma DB...")
72
  # initialize client, setting path to save data
73
  self.client = chromadb.PersistentClient(path="./chroma_db")
requirements.txt CHANGED
@@ -10,4 +10,5 @@ python-dotenv==1.0.1
10
  Requests==2.32.3
11
  transformers==4.45.2
12
  llama-index-vector-stores-chroma==0.2.0
13
-
 
 
10
  Requests==2.32.3
11
  transformers==4.45.2
12
  llama-index-vector-stores-chroma==0.2.0
13
+ torch==2.4.1
14
+ gdown==5.2.0