Commit 25b7393 by My Duong
Parent(s): d594c97

update app

Files changed:
- .gitignore +2 -1
- app.py +18 -9
.gitignore CHANGED
@@ -1,3 +1,4 @@
 \demovv
 BoPhapDienDienTu
-vbpl_links.txt
+vbpl_links.txt
+.env
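For reference, the newly ignored .env file is expected to hold the Hugging Face token that app.py now reads via python-dotenv; a minimal sketch of its contents (the token value is a placeholder, not a real secret):

HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx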
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+from dotenv import load_dotenv
 import shutil
 import numpy as np
 from pathlib import Path
@@ -12,6 +13,11 @@ from torch.utils.data import DataLoader
 from accelerate import Accelerator
 from datasets import Dataset
 
+load_dotenv()
+
+hf_token = os.getenv("HF_TOKEN")
+if hf_token is None:
+    raise ValueError("HF_TOKEN not in the .env file")
 
 # Wrapper for embedding
 class SentenceTransformerWrapper:
@@ -156,9 +162,6 @@ def save_to_chromadb(
     vector_db.persist()
     print(f"Database saved successfully to {persist_directory}")
 
-    shutil.make_archive("chroma_db", "zip", "./chroma_db")
-    print("Vector database archived as chroma_db.zip")
-
     return vector_db
 
 
@@ -187,9 +190,15 @@ if __name__ == "__main__":
     # Step 6: Generate embeddings and save to ChromaDB
     save_to_chromadb(processed_docs, processed_metadata, embedding_model)
 
-
-
-
-
-
-
+    shutil.make_archive("chroma_db", "zip", "./chroma_db")
+    print("Vector database archived as chroma_db.zip")
+
+    with open("chroma_db.zip", "rb") as f:
+        zip_bytes = f.read()
+
+    # Create a dataset from the zip file bytes
+    dataset = Dataset.from_dict({"vector_db_zip": [zip_bytes]})
+
+    # Push to the HF Datasets Hub (replace with your username, repo name, and use a valid token)
+    dataset.push_to_hub("camiellia/phapdien_demo", token=hf_token)
+    print("Vector database uploaded to the HF Datasets Hub.")
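Not part of this commit, but for completeness: once the archive has been pushed, the Space (or any other consumer) could restore the ChromaDB by pulling the same dataset back down. A minimal sketch, assuming the dataset keeps the default "train" split created by push_to_hub and that the single-row vector_db_zip column round-trips as raw bytes (a token argument would be needed if the dataset repo is private):

import shutil
from datasets import load_dataset

# Download the dataset that holds the zipped vector database.
ds = load_dataset("camiellia/phapdien_demo", split="train")

# The commit stores one row whose "vector_db_zip" column contains the zip archive's bytes.
zip_bytes = ds["vector_db_zip"][0]

# Write the bytes back to disk and unpack them into ./chroma_db.
with open("chroma_db.zip", "wb") as f:
    f.write(zip_bytes)
shutil.unpack_archive("chroma_db.zip", "./chroma_db", "zip")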