"""Keyword collector for historical backfill using LLM-based keyword extraction."""
from datetime import datetime, timedelta
import logging

from models.database import article_collection, keywords_collection # pylint: disable=import-error
from controllers.keyword import ( # pylint: disable=import-error
    fetch_articles_for_period,
    fetch_historical_keywords,
    run_llm_extraction,
    calculate_metrics_and_save_for_date
)

# Module-level logger (no handlers or levels are configured here)
logger = logging.getLogger(__name__)

def _process_day(current_date):
    """Extract and store keywords for a single day.

    Args:
        current_date: ``datetime`` identifying the day to process. Its
            time-of-day component is ignored when selecting articles but is
            passed unchanged to ``calculate_metrics_and_save_for_date``.

    Returns:
        The number of keywords extracted and handed off for saving, or ``0``
        when the day had no articles, produced no keywords, or failed.
    """
    day_start = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = current_date.replace(hour=23, minute=59, second=59, microsecond=999999)

    # Kept outside the try block on purpose: an error while reading articles
    # propagates to the caller instead of being treated as a per-day failure.
    day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
    if not day_articles:
        logger.info("No articles found for %s", current_date.date())
        return 0

    try:
        # Re-read on every day so keywords saved for earlier days in this run
        # are available as context for the current extraction.
        historical_keywords = fetch_historical_keywords(keywords_collection)
        extracted_keywords = run_llm_extraction(day_articles, historical_keywords)

        if not extracted_keywords:
            logger.warning("No keywords extracted for %s", current_date.date())
            return 0

        calculate_metrics_and_save_for_date(
            keywords_collection,
            extracted_keywords,
            current_date
        )
        keyword_count = len(extracted_keywords)
        logger.info("Extracted %d keywords for %s",
            keyword_count, current_date.date())
        return keyword_count
    except Exception as e: # pylint: disable=broad-exception-caught
        # Best-effort backfill: one bad day must not stop the remaining days.
        # logger.exception keeps the ERROR-level message and adds the
        # traceback so the failure can actually be diagnosed.
        logger.exception("Error processing %s: %s", current_date.date(), e)
        return 0


def collect(days: int = 60) -> None:
    """
    Run the historical keyword backfill, one day at a time.

    The window ends yesterday and starts ``days`` days before that; both
    endpoints are processed, so ``days + 1`` calendar dates are visited
    (61 with the default). For each date the articles of that day are
    fetched, passed to ``run_llm_extraction`` together with the keywords
    returned by ``fetch_historical_keywords``, and the result is handed to
    ``calculate_metrics_and_save_for_date``. A failure while extracting or
    saving one day is logged and the loop continues with the next day.

    Args:
        days: Number of days before yesterday at which the window starts.
            Defaults to 60, matching the previous hard-coded behaviour.

    Raises:
        ValueError: If ``days`` is negative.
    """
    if days < 0:
        raise ValueError("days must be non-negative")

    logger.info("Starting %d-day historical keyword backfill", days)

    # Calculate date range: `days` days before yesterday, up to yesterday
    end_date = datetime.now() - timedelta(days=1)
    start_date = end_date - timedelta(days=days)

    logger.info("Processing articles from %s to %s", start_date.date(), end_date.date())

    # Cheap pre-check so an empty window exits before any per-day work.
    # NOTE(review): this compares publishDate against "YYYY-MM-DD" strings,
    # which presumes the field is stored as a date string - confirm.
    total_articles = article_collection.count_documents({
        "publishDate": {
            "$gte": start_date.strftime("%Y-%m-%d"),
            "$lte": end_date.strftime("%Y-%m-%d")
        }
    })

    if total_articles == 0:
        logger.warning("No articles found in the specified date range")
        return

    logger.info("Found %d articles to process", total_articles)

    processed_days = 0
    total_keywords = 0

    # Oldest day first, so later days can build on keywords saved earlier.
    for offset in range(days + 1):
        current_date = start_date + timedelta(days=offset)
        logger.info("Processing %s", current_date.date())

        keyword_count = _process_day(current_date)
        if keyword_count:
            processed_days += 1
            total_keywords += keyword_count

    logger.info("Historical backfill completed")
    logger.info("Summary: Processed %d days, extracted %d total keywords",
        processed_days, total_keywords)
    if processed_days > 0:
        logger.info("Average keywords per day: %.1f",
            total_keywords / processed_days)

# Run the backfill when this module is executed directly as a script.
if __name__ == "__main__":
    collect()