import os
import sys
import logging
from pathlib import Path

# Shared record layout for every handler configured by setup_logging().
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'


def setup_logging():
    """Configure root logging for the current runtime environment.

    When the ``SPACE_ID`` environment variable is set (the script treats this
    as running on HuggingFace), records go to the console only. Otherwise
    they go to the console and are also written to ``embed_data.log``
    (UTF-8) in the current working directory.
    """
    handlers = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        # Local run: additionally keep a log file.
        handlers.append(logging.FileHandler("embed_data.log", encoding='utf-8'))

    logging.basicConfig(
        level=logging.INFO,
        format=_LOG_FORMAT,
        handlers=handlers,
    )


setup_logging()
logger = logging.getLogger(__name__)


def setup_data(data_dir="data", min_count=50):
    """Embed the data directory on startup if the store looks under-populated.

    The embedding store is queried for its current entry count; when that
    count is below ``min_count`` the embedding job is run over ``data_dir``
    (with ``force=False``) and the resulting count is logged.

    This is a best-effort startup step: every failure (missing directory,
    import error, embedding error) is logged and swallowed so the caller is
    never interrupted.

    Args:
        data_dir: Directory holding the source data. Defaults to ``"data"``,
            resolved against the current working directory.
        min_count: Embedding count below which the embedding job is run.
            Defaults to ``50``.

    Returns:
        None.
    """
    try:
        logger.info("Starting data setup process...")

        # The path must be an actual directory; a plain file with the same
        # name is not usable as a data directory.
        if not Path(data_dir).is_dir():
            logger.error("Data directory %s not found!", data_dir)
            return

        # Imported lazily, after logging has been configured.
        logger.info("Importing embedding modules...")
        from core.embedding_model import get_embedding_model

        logger.info("Checking existing embeddings...")
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()
        logger.info("Current embeddings count: %s", current_count)

        if current_count >= min_count:
            logger.info("Data already embedded, skipping...")
            return

        logger.info("Starting data embedding process...")
        from scripts.embed_data import embed_all_data

        embed_all_data(data_dir, force=False)

        # Re-read the count to report what the embedding run produced.
        final_count = embedding_model.count()
        logger.info("Embedding completed! Final count: %s", final_count)
    except Exception as e:
        # Deliberate catch-all at the startup boundary; logger.exception
        # records the message together with the full traceback.
        logger.exception("Error in setup_data: %s", e)


if __name__ == "__main__":
    setup_data()