|
"""Keyword collector for historical backfill using LLM-based keyword extraction.""" |
|
from datetime import datetime, timedelta |
|
import logging |
|
|
|
from models.database import article_collection, keywords_collection |
|
from controllers.keyword import ( |
|
fetch_articles_for_period, |
|
fetch_historical_keywords, |
|
run_llm_extraction, |
|
calculate_metrics_and_save_for_date |
|
) |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def collect():
    """
    Run the 60-day historical keyword backfill.

    Processes articles one day at a time (oldest first) so the historical
    keyword context builds up gradually:
      - fetches each day's articles from ``article_collection``
      - runs LLM-based keyword extraction, feeding in previously stored
        keywords as context for consistent categorization
      - stores results in ``keywords_collection`` with frequency and
        popularity metrics

    The range covers the 60 calendar days ending yesterday (today is
    excluded since its articles may still be arriving).
    """
    backfill_days = 60

    logger.info("Starting %d-day historical keyword backfill", backfill_days)

    # End at yesterday; subtract (backfill_days - 1) so the inclusive
    # day-by-day loop below visits exactly `backfill_days` days.
    # (The previous `days=60` with a `<=` loop processed 61 days.)
    end_date = datetime.now() - timedelta(days=1)
    start_date = end_date - timedelta(days=backfill_days - 1)

    logger.info("Processing articles from %s to %s", start_date.date(), end_date.date())

    # publishDate is stored as a "YYYY-MM-DD" string, so lexicographic
    # comparison matches chronological order.
    total_articles = article_collection.count_documents({
        "publishDate": {
            "$gte": start_date.strftime("%Y-%m-%d"),
            "$lte": end_date.strftime("%Y-%m-%d"),
        }
    })

    if total_articles == 0:
        logger.warning("No articles found in the specified date range")
        return

    logger.info("Found %d articles to process", total_articles)

    current_date = start_date
    processed_days = 0
    total_keywords = 0

    while current_date <= end_date:
        logger.info("Processing %s", current_date.date())
        extracted_count = _process_day(current_date)
        if extracted_count:
            processed_days += 1
            total_keywords += extracted_count
        current_date += timedelta(days=1)

    logger.info("Historical backfill completed")
    logger.info("Summary: Processed %d days, extracted %d total keywords",
                processed_days, total_keywords)
    if processed_days > 0:
        logger.info("Average keywords per day: %.1f",
                    total_keywords / processed_days)


def _process_day(current_date):
    """
    Extract and persist keywords for a single day.

    Args:
        current_date: datetime whose calendar day should be processed.

    Returns:
        Number of keywords extracted for the day. 0 when the day had no
        articles, extraction produced nothing, or an error occurred.
    """
    day_start = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = current_date.replace(hour=23, minute=59, second=59, microsecond=999999)

    day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
    if not day_articles:
        logger.info("No articles found for %s", current_date.date())
        return 0

    try:
        # Previously stored keywords give the LLM context so extraction
        # stays consistent with existing keyword patterns.
        historical_keywords = fetch_historical_keywords(keywords_collection)
        extracted_keywords = run_llm_extraction(day_articles, historical_keywords)

        if not extracted_keywords:
            logger.warning("No keywords extracted for %s", current_date.date())
            return 0

        calculate_metrics_and_save_for_date(
            keywords_collection,
            extracted_keywords,
            current_date,
        )
        logger.info("Extracted %d keywords for %s",
                    len(extracted_keywords), current_date.date())
        return len(extracted_keywords)
    except Exception as e:
        # Per-day isolation: one bad day must not abort the whole backfill.
        # logger.exception also records the traceback for diagnosis.
        logger.exception("Error processing %s: %s", current_date.date(), e)
        return 0
|
|
|
# Allow running the backfill directly as a standalone script.
if __name__ == "__main__":

    collect()
|
|