Spaces:
Running
Running
HaRin2806
committed on
Commit
·
89397a4
1
Parent(s):
0d96daf
fix embedding data
Browse files- app.py +69 -64
- scripts/embed_data.py +96 -70
- startup.py +46 -13
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from flask import Flask, jsonify
|
2 |
from flask_cors import CORS
|
3 |
import logging
|
4 |
import os
|
@@ -6,13 +6,20 @@ import threading
|
|
6 |
from dotenv import load_dotenv
|
7 |
from flask_jwt_extended import JWTManager
|
8 |
import datetime
|
9 |
-
from api.admin import admin_routes
|
10 |
|
11 |
-
# Cấu hình logging
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
logger = logging.getLogger(__name__)
|
17 |
|
18 |
# Tải biến môi trường
|
@@ -33,31 +40,14 @@ app.config['MAX_CONTENT_LENGTH'] = 50 * 1024 * 1024 # 50MB max file size
|
|
33 |
|
34 |
jwt = JWTManager(app)
|
35 |
|
36 |
-
# Cho phép CORS
|
37 |
CORS(app, resources={
|
38 |
r"/api/*": {
|
39 |
-
"origins":
|
40 |
"supports_credentials": True
|
41 |
}
|
42 |
})
|
43 |
|
44 |
-
# Đăng ký các API endpoints cho user
|
45 |
-
from api.auth import auth_routes
|
46 |
-
from api.chat import chat_routes
|
47 |
-
from api.data import data_routes
|
48 |
-
from api.history import history_routes
|
49 |
-
from api.feedback import feedback_routes
|
50 |
-
|
51 |
-
# Đăng ký các blueprint cho user
|
52 |
-
app.register_blueprint(auth_routes, url_prefix='/api/auth')
|
53 |
-
app.register_blueprint(chat_routes, url_prefix='/api')
|
54 |
-
app.register_blueprint(data_routes, url_prefix='/api')
|
55 |
-
app.register_blueprint(history_routes, url_prefix='/api')
|
56 |
-
app.register_blueprint(feedback_routes, url_prefix='/api')
|
57 |
-
|
58 |
-
# Đăng ký các blueprint cho admin
|
59 |
-
app.register_blueprint(admin_routes, url_prefix='/api/admin')
|
60 |
-
|
61 |
def setup_data_background():
|
62 |
"""Setup data in background thread"""
|
63 |
try:
|
@@ -67,6 +57,22 @@ def setup_data_background():
|
|
67 |
logger.info("Background data setup completed")
|
68 |
except Exception as e:
|
69 |
logger.error(f"Background setup failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
|
71 |
@app.route('/api/health', methods=['GET'])
|
72 |
def health_check():
|
@@ -84,7 +90,8 @@ def health_check():
|
|
84 |
"status": "healthy",
|
85 |
"message": "Server đang hoạt động",
|
86 |
"time": time.strftime('%Y-%m-%d %H:%M:%S'),
|
87 |
-
"data_items": collection_count
|
|
|
88 |
})
|
89 |
|
90 |
@app.route('/api/data-status', methods=['GET'])
|
@@ -120,40 +127,14 @@ def data_status():
|
|
120 |
"error": str(e)
|
121 |
}), 500
|
122 |
|
123 |
-
@app.route('/api/admin/init', methods=['POST'])
|
124 |
-
def init_admin():
|
125 |
-
"""API endpoint để khởi tạo admin đầu tiên"""
|
126 |
-
try:
|
127 |
-
from models.admin_model import AdminUser
|
128 |
-
|
129 |
-
success, result = AdminUser.create_default_super_admin()
|
130 |
-
|
131 |
-
if success:
|
132 |
-
return jsonify({
|
133 |
-
"success": True,
|
134 |
-
"message": "Khởi tạo admin thành công",
|
135 |
-
"admin_info": result
|
136 |
-
})
|
137 |
-
else:
|
138 |
-
return jsonify({
|
139 |
-
"success": False,
|
140 |
-
"error": result
|
141 |
-
}), 400
|
142 |
-
|
143 |
-
except Exception as e:
|
144 |
-
logger.error(f"Lỗi khởi tạo admin: {str(e)}")
|
145 |
-
return jsonify({
|
146 |
-
"success": False,
|
147 |
-
"error": str(e)
|
148 |
-
}), 500
|
149 |
-
|
150 |
@app.route('/api/embed-data', methods=['POST'])
|
151 |
def manual_embed_data():
|
152 |
"""API endpoint để chạy embedding data thủ công"""
|
153 |
try:
|
154 |
-
|
|
|
|
|
155 |
|
156 |
-
force = request.json.get('force', False) if request.is_json else False
|
157 |
data_dir = "data"
|
158 |
|
159 |
if not os.path.exists(data_dir):
|
@@ -165,8 +146,12 @@ def manual_embed_data():
|
|
165 |
# Chạy embedding trong thread riêng để không block request
|
166 |
def run_embedding():
|
167 |
try:
|
168 |
-
embed_all_data
|
169 |
-
|
|
|
|
|
|
|
|
|
170 |
except Exception as e:
|
171 |
logger.error(f"Manual embedding failed: {e}")
|
172 |
|
@@ -184,15 +169,37 @@ def manual_embed_data():
|
|
184 |
"error": str(e)
|
185 |
}), 500
|
186 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
# Auto setup data khi chạy trên HuggingFace
|
188 |
-
if os.getenv("SPACE_ID"):
|
189 |
logger.info("Detected HuggingFace environment, starting background data setup...")
|
190 |
threading.Thread(target=setup_data_background, daemon=True).start()
|
191 |
else:
|
192 |
logger.info("Running in local environment")
|
193 |
|
194 |
if __name__ == '__main__':
|
195 |
-
# Tạo
|
196 |
try:
|
197 |
from models.admin_model import AdminUser
|
198 |
success, result = AdminUser.create_default_super_admin()
|
@@ -204,7 +211,6 @@ if __name__ == '__main__':
|
|
204 |
except Exception as e:
|
205 |
logger.error(f"Lỗi tạo super admin: {e}")
|
206 |
|
207 |
-
# Tạo indexes cho feedback
|
208 |
try:
|
209 |
from models.feedback_model import ensure_indexes
|
210 |
ensure_indexes()
|
@@ -212,6 +218,5 @@ if __name__ == '__main__':
|
|
212 |
logger.error(f"Lỗi tạo feedback indexes: {e}")
|
213 |
|
214 |
# Chạy Flask app
|
215 |
-
port = int(os.getenv("PORT", 7860))
|
216 |
-
app.run(host='0.0.0.0', port=port, debug=False)
|
217 |
-
|
|
|
1 |
+
from flask import Flask, jsonify, request
|
2 |
from flask_cors import CORS
|
3 |
import logging
|
4 |
import os
|
|
|
6 |
from dotenv import load_dotenv
|
7 |
from flask_jwt_extended import JWTManager
|
8 |
import datetime
|
|
|
9 |
|
10 |
+
# Logging configuration. On HuggingFace Spaces (SPACE_ID set) a console
# handler is passed explicitly; otherwise basicConfig's defaults are used.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    **({"handlers": [logging.StreamHandler()]} if os.getenv("SPACE_ID") else {}),
)

logger = logging.getLogger(__name__)
|
24 |
|
25 |
# Tải biến môi trường
|
|
|
40 |
|
41 |
jwt = JWTManager(app)
|
42 |
|
43 |
+
# Enable CORS for the API routes.
# The allowed origins can be restricted with the CORS_ORIGINS environment
# variable (comma-separated list). When it is unset or empty, the previous
# behaviour is kept: every origin ("*") is allowed.
# NOTE(review): a wildcard origin together with supports_credentials=True
# appears to let any site send credentialed requests - confirm against the
# Flask-CORS documentation and set CORS_ORIGINS in production.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ORIGINS", "").split(",")
    if origin.strip()
] or "*"

CORS(app, resources={
    r"/api/*": {
        "origins": _cors_origins,
        "supports_credentials": True
    }
})
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
def setup_data_background():
|
52 |
"""Setup data in background thread"""
|
53 |
try:
|
|
|
57 |
logger.info("Background data setup completed")
|
58 |
except Exception as e:
|
59 |
logger.error(f"Background setup failed: {e}")
|
60 |
+
import traceback
|
61 |
+
logger.error(traceback.format_exc())
|
62 |
+
|
63 |
+
# Basic routes first
|
64 |
+
@app.route('/', methods=['GET'])
def root():
    """Return a JSON banner with the service status and its basic endpoints."""
    endpoint_index = {
        "health": "/api/health",
        "data_status": "/api/data-status",
        "embed_data": "/api/embed-data"
    }
    payload = {
        "message": "Nutribot API is running",
        "status": "healthy",
        "endpoints": endpoint_index
    }
    return jsonify(payload)
|
76 |
|
77 |
@app.route('/api/health', methods=['GET'])
|
78 |
def health_check():
|
|
|
90 |
"status": "healthy",
|
91 |
"message": "Server đang hoạt động",
|
92 |
"time": time.strftime('%Y-%m-%d %H:%M:%S'),
|
93 |
+
"data_items": collection_count,
|
94 |
+
"environment": "huggingface" if os.getenv("SPACE_ID") else "local"
|
95 |
})
|
96 |
|
97 |
@app.route('/api/data-status', methods=['GET'])
|
|
|
127 |
"error": str(e)
|
128 |
}), 500
|
129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
@app.route('/api/embed-data', methods=['POST'])
|
131 |
def manual_embed_data():
|
132 |
"""API endpoint để chạy embedding data thủ công"""
|
133 |
try:
|
134 |
+
force = False
|
135 |
+
if request.is_json and request.json:
|
136 |
+
force = request.json.get('force', False)
|
137 |
|
|
|
138 |
data_dir = "data"
|
139 |
|
140 |
if not os.path.exists(data_dir):
|
|
|
146 |
# Chạy embedding trong thread riêng để không block request
|
147 |
def run_embedding():
|
148 |
try:
|
149 |
+
from scripts.embed_data import embed_all_data
|
150 |
+
success = embed_all_data(data_dir, force=force)
|
151 |
+
if success:
|
152 |
+
logger.info("Manual embedding completed successfully")
|
153 |
+
else:
|
154 |
+
logger.error("Manual embedding failed")
|
155 |
except Exception as e:
|
156 |
logger.error(f"Manual embedding failed: {e}")
|
157 |
|
|
|
169 |
"error": str(e)
|
170 |
}), 500
|
171 |
|
172 |
+
# Import and register the blueprints after the basic routes are defined.
# A failure here is logged and the app keeps running with only the basic
# routes, so the full traceback is recorded to make the cause diagnosable.
try:
    from api.auth import auth_routes
    from api.chat import chat_routes
    from api.data import data_routes
    from api.history import history_routes
    from api.feedback import feedback_routes
    from api.admin import admin_routes

    # Register the blueprints
    app.register_blueprint(auth_routes, url_prefix='/api/auth')
    app.register_blueprint(chat_routes, url_prefix='/api')
    app.register_blueprint(data_routes, url_prefix='/api')
    app.register_blueprint(history_routes, url_prefix='/api')
    app.register_blueprint(feedback_routes, url_prefix='/api')
    app.register_blueprint(admin_routes, url_prefix='/api/admin')

    logger.info("All blueprints registered successfully")

except Exception as e:
    # logger.exception logs at ERROR level and appends the traceback.
    logger.exception(f"Error importing/registering blueprints: {e}")
|
193 |
+
|
194 |
# Kick off the data setup automatically when running on HuggingFace Spaces
# (detected through the SPACE_ID environment variable).
if not os.getenv("SPACE_ID"):
    logger.info("Running in local environment")
else:
    logger.info("Detected HuggingFace environment, starting background data setup...")
    background_setup = threading.Thread(target=setup_data_background, daemon=True)
    background_setup.start()
|
200 |
|
201 |
if __name__ == '__main__':
|
202 |
+
# Tạo admin và feedback indexes
|
203 |
try:
|
204 |
from models.admin_model import AdminUser
|
205 |
success, result = AdminUser.create_default_super_admin()
|
|
|
211 |
except Exception as e:
|
212 |
logger.error(f"Lỗi tạo super admin: {e}")
|
213 |
|
|
|
214 |
try:
|
215 |
from models.feedback_model import ensure_indexes
|
216 |
ensure_indexes()
|
|
|
218 |
logger.error(f"Lỗi tạo feedback indexes: {e}")
|
219 |
|
220 |
# Chạy Flask app
|
221 |
+
port = int(os.getenv("PORT", 7860))
|
222 |
+
app.run(host='0.0.0.0', port=port, debug=False)
|
|
scripts/embed_data.py
CHANGED
@@ -2,7 +2,6 @@ import os
|
|
2 |
import sys
|
3 |
import time
|
4 |
import argparse
|
5 |
-
import logging
|
6 |
|
7 |
# Set UTF-8 encoding cho console
|
8 |
os.environ['PYTHONIOENCODING'] = 'utf-8'
|
@@ -13,83 +12,104 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
13 |
from core.data_processor import DataProcessor
|
14 |
from core.embedding_model import get_embedding_model
|
15 |
|
16 |
-
# Cấu hình logging với
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
logging.
|
23 |
-
|
24 |
-
)
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
def embed_all_data(data_dir, force=False):
|
28 |
"""
|
29 |
Embedding tất cả dữ liệu từ thư mục data
|
30 |
-
|
31 |
-
Args:
|
32 |
-
data_dir: Đường dẫn đến thư mục chứa dữ liệu
|
33 |
-
force: Nếu True, sẽ xóa và tạo lại chỉ mục hiện có
|
34 |
"""
|
35 |
logger.info(f"Bat dau qua trinh embedding du lieu tu {data_dir}")
|
36 |
start_time = time.time()
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
logger.info("
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
logger.info("
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
|
|
83 |
if not success:
|
84 |
-
logger.error(
|
85 |
-
return
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
if __name__ == "__main__":
|
95 |
parser = argparse.ArgumentParser(description="Embedding du lieu cho he thong Nutribot")
|
@@ -109,4 +129,10 @@ if __name__ == "__main__":
|
|
109 |
sys.exit(1)
|
110 |
|
111 |
# Thực hiện embedding
|
112 |
-
embed_all_data(data_dir, args.force)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import sys
|
3 |
import time
|
4 |
import argparse
|
|
|
5 |
|
6 |
# Set UTF-8 encoding cho console
|
7 |
os.environ['PYTHONIOENCODING'] = 'utf-8'
|
|
|
12 |
from core.data_processor import DataProcessor
|
13 |
from core.embedding_model import get_embedding_model
|
14 |
|
15 |
+
# Cấu hình logging phù hợp với environment
|
16 |
+
def setup_logging():
    """Configure root logging and return the "embed_data" logger.

    A console handler is always used. When SPACE_ID is not set (local run),
    a UTF-8 file handler writing to "embed_data.log" is added as well.
    """
    import logging

    targets = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        # Local environment: also keep a log file
        targets.append(logging.FileHandler("embed_data.log", encoding='utf-8'))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=targets,
    )
    return logging.getLogger("embed_data")
|
38 |
+
|
39 |
+
logger = setup_logging()
|
40 |
|
41 |
def embed_all_data(data_dir, force=False):
    """
    Embed all data found under ``data_dir`` into the embedding collection.

    When the collection already holds items and ``force`` is falsy, nothing
    is indexed and True is returned. With ``force`` set, the existing
    collection is deleted first; a failed delete is logged and the run
    continues.

    Returns True on success (including the skip case) and False when there
    is nothing to embed, indexing reports failure, or an exception occurs.
    """
    logger.info(f"Bat dau qua trinh embedding du lieu tu {data_dir}")
    started_at = time.time()

    try:
        # Initialise the components
        logger.info("Khoi tao data processor...")
        processor = DataProcessor(data_dir=data_dir)

        logger.info("Khoi tao embedding model...")
        model = get_embedding_model()

        # Check whether an index already exists
        existing = model.count()
        logger.info(f"Kich thuoc collection hien tai: {existing}")

        if existing > 0:
            if not force:
                logger.info(f"Da ton tai chi muc voi {existing} items")
                checked_at = time.time()
                logger.info(f"Hoan thanh kiem tra chi muc trong {checked_at - started_at:.2f} giay")
                return True

            # Forced rebuild: drop the old index before re-embedding
            logger.info("Xoa chi muc cu va tao lai...")
            try:
                model.delete_collection()
                logger.info("Da xoa va tao lai collection")
            except Exception as e:
                logger.error(f"Loi khi xoa collection: {e}")

        # Prepare the data for embedding
        logger.info("Dang chuan bi du lieu cho qua trinh embedding...")
        all_items = processor.prepare_for_embedding()
        logger.info(f"Da chuan bi {len(all_items)} items de embedding")

        if not all_items:
            logger.warning("Khong co items nao de embedding")
            return False

        # Per-content-type statistics
        def count_kind(kind):
            return sum(
                1
                for entry in all_items
                if entry.get("metadata", {}).get("content_type") == kind
            )

        text_chunks = count_kind("text")
        tables = count_kind("table")
        figures = count_kind("figure")

        logger.info(f"Bao gom: {text_chunks} van ban, {tables} bang bieu, {figures} hinh anh")

        # Run the embedding
        logger.info("Bat dau qua trinh embedding...")
        if not model.index_chunks(all_items):
            logger.error("Loi xu ly embedding")
            return False

        elapsed = time.time() - started_at

        # Check the final result
        indexed_total = model.count()
        logger.info(f"Hoan thanh qua trinh embedding {indexed_total} items trong {elapsed:.2f} giay")

        return True

    except Exception as e:
        logger.error(f"Loi embedding: {str(e)}")
        import traceback
        logger.error(traceback.format_exc())
        return False
|
113 |
|
114 |
if __name__ == "__main__":
|
115 |
parser = argparse.ArgumentParser(description="Embedding du lieu cho he thong Nutribot")
|
|
|
129 |
sys.exit(1)
|
130 |
|
131 |
# Thực hiện embedding
|
132 |
+
success = embed_all_data(data_dir, args.force)
|
133 |
+
if success:
|
134 |
+
logger.info("Embedding hoan thanh thanh cong!")
|
135 |
+
sys.exit(0)
|
136 |
+
else:
|
137 |
+
logger.error("Embedding that bai!")
|
138 |
+
sys.exit(1)
|
startup.py
CHANGED
@@ -3,39 +3,72 @@ import sys
|
|
3 |
import logging
|
4 |
from pathlib import Path
|
5 |
|
6 |
-
# Setup logging
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
logger = logging.getLogger(__name__)
|
9 |
|
10 |
def setup_data():
|
11 |
"""Setup and embed data on startup"""
|
12 |
try:
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
from core.embedding_model import get_embedding_model
|
16 |
|
17 |
# Kiểm tra xem đã có data chưa
|
|
|
18 |
embedding_model = get_embedding_model()
|
19 |
current_count = embedding_model.count()
|
20 |
|
21 |
logger.info(f"Current embeddings count: {current_count}")
|
22 |
|
23 |
# Nếu chưa có data hoặc ít hơn expected, thì embed
|
24 |
-
if current_count <
|
25 |
logger.info("Starting data embedding process...")
|
26 |
-
data_dir = "data"
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
|
|
|
|
|
|
|
|
33 |
else:
|
34 |
logger.info("Data already embedded, skipping...")
|
35 |
|
36 |
except Exception as e:
|
37 |
logger.error(f"Error in setup_data: {e}")
|
|
|
|
|
38 |
|
39 |
if __name__ == "__main__":
|
40 |
-
setup_data()
|
41 |
-
|
|
|
3 |
import logging
|
4 |
from pathlib import Path
|
5 |
|
6 |
+
# Setup logging cho HuggingFace environment
|
7 |
+
def setup_logging():
    """Configure root logging for the current environment.

    A console handler is always used. When SPACE_ID is not set (local run),
    a UTF-8 file handler writing to "embed_data.log" is added as well.
    """
    targets = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        # Local run: a log file can be written too
        targets.append(logging.FileHandler("embed_data.log", encoding='utf-8'))

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=targets,
    )
|
26 |
+
|
27 |
+
setup_logging()
|
28 |
logger = logging.getLogger(__name__)
|
29 |
|
30 |
def setup_data(data_dir="data", min_count=50):
    """Set up and embed data on startup.

    Embeds the contents of ``data_dir`` when the embedding collection holds
    fewer than ``min_count`` items. Errors are logged, never raised, so the
    function is safe to run in a background thread.

    Args:
        data_dir: Directory that contains the data to embed.
        min_count: Minimum number of embeddings below which embedding is
            attempted.

    Returns:
        None.
    """
    try:
        logger.info("Starting data setup process...")

        # Check the data directory
        if not os.path.exists(data_dir):
            logger.error(f"Data directory {data_dir} not found!")
            return

        # Import only after logging has been set up
        logger.info("Importing embedding modules...")
        from core.embedding_model import get_embedding_model

        # Check whether data has already been embedded
        logger.info("Checking existing embeddings...")
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()

        logger.info(f"Current embeddings count: {current_count}")

        if current_count >= min_count:
            logger.info("Data already embedded, skipping...")
            return

        # No data yet, or fewer items than expected: embed
        logger.info("Starting data embedding process...")

        from scripts.embed_data import embed_all_data

        succeeded = embed_all_data(data_dir, force=False)

        # Re-check instead of assuming success: embed_all_data returns False
        # on failure, and with force=False it leaves a non-empty collection
        # untouched, so the count may still be below the threshold.
        final_count = embedding_model.count()
        if not succeeded:
            logger.error(f"Embedding failed! Final count: {final_count}")
        elif final_count < min_count:
            logger.warning(
                f"Embedding finished but the count is still below {min_count} "
                f"(final count: {final_count}); a forced re-embed may be required"
            )
        else:
            logger.info(f"Embedding completed! Final count: {final_count}")

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback.
        logger.exception(f"Error in setup_data: {e}")
|
72 |
|
73 |
if __name__ == "__main__":
|
74 |
+
setup_data()
|
|