My Duong committed on
Commit 25b7393 · 1 Parent(s): d594c97

update app

Files changed (2)
  1. .gitignore +2 -1
  2. app.py +18 -9
.gitignore CHANGED
@@ -1,3 +1,4 @@
 \demovv
 BoPhapDienDienTu
-vbpl_links.txt
+vbpl_links.txt
+.env
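The new .env entry pairs with the app.py change below: the script now reads its Hugging Face token from a local .env file, which must stay out of version control. A minimal sketch of what that file is presumably expected to contain (the value here is a placeholder, not a real token):

# .env — kept local; ignored by git via the new .gitignore entry
HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx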
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+from dotenv import load_dotenv
 import shutil
 import numpy as np
 from pathlib import Path
@@ -12,6 +13,11 @@ from torch.utils.data import DataLoader
 from accelerate import Accelerator
 from datasets import Dataset
 
+load_dotenv()
+
+hf_token = os.getenv("HF_TOKEN")
+if hf_token is None:
+    raise ValueError("HF_TOKEN not in the .env file")
 
 # Wrapper for embedding
 class SentenceTransformerWrapper:
@@ -156,9 +162,6 @@ def save_to_chromadb(
     vector_db.persist()
     print(f"Database saved successfully to {persist_directory}")
 
-    shutil.make_archive("chroma_db", "zip", "./chroma_db")
-    print("Vector database archived as chroma_db.zip")
-
     return vector_db
 
 
@@ -187,9 +190,15 @@ if __name__ == "__main__":
     # Step 6: Generate embeddings and save to ChromaDB
     save_to_chromadb(processed_docs, processed_metadata, embedding_model)
 
-    # os.system("git lfs install")
-    os.system("git add chroma_db/")
-    os.system("git commit -m 'Persist vector database after processing'")
-    os.system("git push")
-    os.system("git config --global user.email '[email protected]'")
-    os.system("git config --global user.name 'My Duong'")
+    shutil.make_archive("chroma_db", "zip", "./chroma_db")
+    print("Vector database archived as chroma_db.zip")
+
+    with open("chroma_db.zip", "rb") as f:
+        zip_bytes = f.read()
+
+    # Create a dataset from the zip file bytes
+    dataset = Dataset.from_dict({"vector_db_zip": [zip_bytes]})
+
+    # Push to the HF Datasets Hub (replace with your username, repo name, and use a valid token)
+    dataset.push_to_hub("camiellia/phapdien_demo", token=hf_token)
+    print("Vector database uploaded to the HF Datasets Hub.")