# Hugging Face Spaces — status: Running
import os | |
import sys | |
import logging | |
from pathlib import Path | |
# Configure logging for the Hugging Face environment.
def setup_logging():
    """Configure root logging for the current runtime environment.

    When the ``SPACE_ID`` environment variable is set to a non-empty value
    (treated here as "running on Hugging Face"), records go to the console
    only.  Otherwise they go to the console and to ``embed_data.log``
    (UTF-8) in the current working directory.

    The level is ``INFO`` in both cases and the same record format is used.
    """
    # The console handler is always installed.
    handlers = [logging.StreamHandler()]
    if not os.getenv("SPACE_ID"):
        # Local run: a log file can be written as well.
        handlers.append(
            logging.FileHandler("embed_data.log", encoding="utf-8")
        )

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        handlers=handlers,
    )
# Configure logging once at import time, before the module-level logger is
# created and before any project module is imported.
setup_logging()
logger = logging.getLogger(__name__)
def setup_data(data_dir: str = "data", min_count: int = 50) -> None:
    """Embed the contents of ``data_dir`` on startup if too few embeddings exist.

    Args:
        data_dir: Directory holding the source data. Defaults to ``"data"``.
        min_count: Embedding runs only when the embedding model currently
            reports fewer than this many entries. Defaults to ``50``, which
            the original code marked as a lowered threshold for testing.

    Returns:
        None. Any exception raised during setup is logged with its traceback
        instead of being propagated, so a failed setup does not abort the
        caller.
    """
    try:
        logger.info("Starting data setup process...")

        # Check the data directory; a regular file with that name is rejected.
        if not os.path.isdir(data_dir):
            logger.error("Data directory %s not found!", data_dir)
            return

        # Project modules are imported only after logging has been set up.
        logger.info("Importing embedding modules...")
        from core.embedding_model import get_embedding_model

        # Check whether data has already been embedded.
        logger.info("Checking existing embeddings...")
        embedding_model = get_embedding_model()
        current_count = embedding_model.count()
        logger.info("Current embeddings count: %s", current_count)

        if current_count >= min_count:
            logger.info("Data already embedded, skipping...")
            return

        # No data yet, or fewer entries than expected: run the embedding.
        logger.info("Starting data embedding process...")
        from scripts.embed_data import embed_all_data

        embed_all_data(data_dir, force=False)

        # Re-check the count after embedding.
        final_count = embedding_model.count()
        logger.info("Embedding completed! Final count: %s", final_count)
    except Exception as e:
        # Top-level startup boundary: log message and traceback, do not raise.
        logger.exception("Error in setup_data: %s", e)
# Run the data setup only when the file is executed as a script.
if __name__ == "__main__":
    setup_data()