"""Keyword collector for historical backfill using LLM-based keyword extraction."""
from datetime import datetime, timedelta
import logging

from models.database import article_collection, keywords_collection  # pylint: disable=import-error
from controllers.keyword import (  # pylint: disable=import-error
    fetch_articles_for_period,
    fetch_historical_keywords,
    run_llm_extraction,
    calculate_metrics_and_save_for_date
)

# Configure logger
logger = logging.getLogger(__name__)

# Number of consecutive calendar days covered by the backfill; the window
# ends yesterday and both endpoints are processed.
_BACKFILL_DAYS = 60


def collect():
    """Run the 60-day historical keyword backfill.

    The window covers the 60 calendar days ending yesterday (local time of
    the host, as reported by ``datetime.now()``). The function:

    - counts the articles whose ``publishDate`` falls in the window and
      returns early, with a warning, when there are none;
    - walks the window one day at a time, oldest first, so that each day's
      extraction can see keywords saved for earlier days;
    - for every day that has articles, fetches the stored keywords, runs
      ``run_llm_extraction`` on that day's articles, and hands any result
      to ``calculate_metrics_and_save_for_date`` for persistence (what is
      computed and stored is defined in ``controllers.keyword``);
    - logs a summary of processed days and extracted keyword counts.

    A failure while processing one day is logged with its traceback and does
    not stop the remaining days from being processed.

    Returns:
        None. Results are written through the imported controller helpers.
    """
    logger.info("Starting 60-day historical keyword backfill")

    # Calculate date range: 60 days ago to yesterday. Both endpoints are
    # processed, so the first day is (_BACKFILL_DAYS - 1) days before the last.
    end_date = datetime.now() - timedelta(days=1)  # Yesterday
    start_date = end_date - timedelta(days=_BACKFILL_DAYS - 1)  # 60 days ago

    logger.info("Processing articles from %s to %s", start_date.date(), end_date.date())

    # Verify we have articles in the date range.
    # NOTE(review): the bounds are "YYYY-MM-DD" strings, so this presumably
    # expects publishDate to be stored as an ISO date string -- confirm
    # against the article schema.
    total_articles = article_collection.count_documents({
        "publishDate": {
            "$gte": start_date.strftime("%Y-%m-%d"),
            "$lte": end_date.strftime("%Y-%m-%d")
        }
    })

    if total_articles == 0:
        logger.warning("No articles found in the specified date range")
        return

    logger.info("Found %d articles to process", total_articles)

    processed_days = 0
    total_keywords = 0

    # Process day by day, oldest first, to build historical context gradually.
    for offset in range(_BACKFILL_DAYS):
        current_date = start_date + timedelta(days=offset)
        logger.info("Processing %s", current_date.date())

        # Bounds of this calendar day: first to last representable instant.
        day_start = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
        day_end = current_date.replace(hour=23, minute=59, second=59, microsecond=999999)

        day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
        if not day_articles:
            logger.info("No articles found for %s", current_date.date())
            continue

        try:
            # Re-read stored keywords every day so results saved for earlier
            # days in this run are visible to the extraction.
            historical_keywords = fetch_historical_keywords(keywords_collection)

            # Extract keywords for this day
            extracted_keywords = run_llm_extraction(day_articles, historical_keywords)

            if extracted_keywords:
                # Calculate metrics and save
                calculate_metrics_and_save_for_date(
                    keywords_collection,
                    extracted_keywords,
                    current_date
                )
                processed_days += 1
                total_keywords += len(extracted_keywords)
                logger.info("Extracted %d keywords for %s",
                            len(extracted_keywords), current_date.date())
            else:
                logger.warning("No keywords extracted for %s", current_date.date())
        except Exception as e:  # pylint: disable=broad-exception-caught
            # Best effort: one bad day must not abort the whole backfill.
            # logger.exception keeps the traceback so the failure can be diagnosed.
            logger.exception("Error processing %s: %s", current_date.date(), e)

    logger.info("Historical backfill completed")
    logger.info("Summary: Processed %d days, extracted %d total keywords",
                processed_days, total_keywords)
    if processed_days > 0:
        logger.info("Average keywords per day: %.1f", total_keywords / processed_days)


if __name__ == "__main__":
    collect()