# Commit 72f4cb5 (Muhammad Abdur Rahman Saad): fix pylint errors
"""Keyword collector for historical backfill using LLM-based keyword extraction."""
from datetime import datetime, timedelta
import logging
from models.database import article_collection, keywords_collection # pylint: disable=import-error
from controllers.keyword import ( # pylint: disable=import-error
fetch_articles_for_period,
fetch_historical_keywords,
run_llm_extraction,
calculate_metrics_and_save_for_date
)
# Configure logger
logger = logging.getLogger(__name__)
def collect():
    """
    Run a 60-day historical keyword backfill.

    Processes articles one day at a time, oldest first, so each day's
    LLM extraction can use keywords saved on previous days as historical
    context. Per-day work is delegated to ``_process_single_day``.

    Side effects:
        Reads from ``article_collection``; reads and writes
        ``keywords_collection``; logs progress throughout.
    """
    logger.info("Starting 60-day historical keyword backfill")

    # Date range: 60 days ago through yesterday (inclusive).
    # NOTE(review): datetime.now() is timezone-naive; assumes the server's
    # local dates line up with the "%Y-%m-%d" publishDate strings — confirm.
    end_date = datetime.now() - timedelta(days=1)  # Yesterday
    start_date = end_date - timedelta(days=60)  # 60 days ago
    logger.info("Processing articles from %s to %s", start_date.date(), end_date.date())

    # Cheap sanity check before entering the per-day loop.
    total_articles = article_collection.count_documents({
        "publishDate": {
            "$gte": start_date.strftime("%Y-%m-%d"),
            "$lte": end_date.strftime("%Y-%m-%d")
        }
    })
    if total_articles == 0:
        logger.warning("No articles found in the specified date range")
        return
    logger.info("Found %d articles to process", total_articles)

    # Walk forward one day at a time so historical context builds gradually.
    current_date = start_date
    processed_days = 0
    total_keywords = 0
    while current_date <= end_date:
        day_keyword_count = _process_single_day(current_date)
        if day_keyword_count:
            processed_days += 1
            total_keywords += day_keyword_count
        current_date += timedelta(days=1)

    logger.info("Historical backfill completed")
    logger.info("Summary: Processed %d days, extracted %d total keywords",
                processed_days, total_keywords)
    if processed_days > 0:
        logger.info("Average keywords per day: %.1f",
                    total_keywords / processed_days)


def _process_single_day(current_date):
    """
    Extract and persist keywords for one calendar day.

    Args:
        current_date: Naive ``datetime`` whose date identifies the day.

    Returns:
        int: Number of keywords extracted and saved for the day; 0 when
        there were no articles, no keywords, or extraction failed.
    """
    logger.info("Processing %s", current_date.date())

    # Full-day window: midnight through the last microsecond of the day.
    day_start = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = current_date.replace(hour=23, minute=59, second=59, microsecond=999999)

    day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
    if not day_articles:
        logger.info("No articles found for %s", current_date.date())
        return 0

    try:
        # Previously saved keywords give the LLM its historical context.
        historical_keywords = fetch_historical_keywords(keywords_collection)
        extracted_keywords = run_llm_extraction(day_articles, historical_keywords)
        if not extracted_keywords:
            logger.warning("No keywords extracted for %s", current_date.date())
            return 0
        calculate_metrics_and_save_for_date(
            keywords_collection,
            extracted_keywords,
            current_date
        )
        logger.info("Extracted %d keywords for %s",
                    len(extracted_keywords), current_date.date())
        return len(extracted_keywords)
    except Exception as e:  # pylint: disable=broad-exception-caught
        # One bad day must not abort the whole backfill; logger.exception
        # records the traceback that logger.error would drop.
        logger.exception("Error processing %s: %s", current_date.date(), e)
        return 0
# Allow running the backfill directly as a script.
if __name__ == "__main__":
    collect()