# BackEnd / startup.py
# HaRin2806
# fix embedding data
# 89397a4
import os
import sys
import logging
from pathlib import Path
# Set up logging for the Hugging Face environment
def setup_logging():
    """Configure root logging for a Hugging Face or a local run.

    When the ``SPACE_ID`` environment variable is set to a non-empty value
    (treated here as running on Hugging Face), records go to the console
    only. Otherwise they are also written to ``embed_data.log`` (UTF-8) in
    the current working directory.
    """
    # The console handler is always present; the file handler is local-only.
    log_handlers = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        log_handlers.append(
            logging.FileHandler("embed_data.log", encoding='utf-8')
        )

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=log_handlers,
    )
# Configure logging as soon as the module loads, before the module-level
# logger below is created and used by setup_data().
setup_logging()
logger = logging.getLogger(__name__)
def setup_data(data_dir: str = "data", min_embeddings: int = 50) -> None:
    """Embed the project data on startup when too few embeddings exist.

    Args:
        data_dir: Directory passed to ``embed_all_data``. Defaults to
            ``"data"``, relative to the current working directory.
        min_embeddings: Embedding runs only when the count reported by the
            embedding model is below this value. The default of 50 was
            noted in the original code as a deliberately low threshold
            for testing.

    Returns:
        None. Any exception is logged with its traceback and swallowed, so
        a failed embedding run never propagates to the caller.
    """
    try:
        logger.info("Starting data setup process...")

        # Require a real directory: a regular file with the same name would
        # pass a plain existence check but cannot hold the data files.
        if not os.path.isdir(data_dir):
            logger.error(f"Data directory {data_dir} not found!")
            return

        # Imported here rather than at module top so it happens after
        # logging is configured and so import failures are caught below.
        logger.info("Importing embedding modules...")
        from core.embedding_model import get_embedding_model

        # Check whether data has already been embedded.
        logger.info("Checking existing embeddings...")
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()
        logger.info(f"Current embeddings count: {current_count}")

        if current_count >= min_embeddings:
            logger.info("Data already embedded, skipping...")
            return

        # Missing or fewer embeddings than expected: run the embedding.
        logger.info("Starting data embedding process...")
        from scripts.embed_data import embed_all_data

        embed_all_data(data_dir, force=False)

        # Re-read the count to report the result of the run.
        final_count = embedding_model.count()
        logger.info(f"Embedding completed! Final count: {final_count}")
    except Exception as e:
        # Best-effort startup step: log the message and traceback in one
        # record instead of letting the error escape.
        logger.exception(f"Error in setup_data: {e}")
# Run the data setup only when this file is executed as a script.
if __name__ == "__main__":
    setup_data()