|
""" |
|
Daily keyword pipeline collector for combined keyword detection, analysis, and cleanup. |
|
""" |
|
import logging
from datetime import datetime
from typing import Optional

from models.database import article_collection, keywords_collection, summary_collection
from controllers.keyword import (
    fetch_articles_for_period,
    fetch_historical_keywords,
    run_llm_extraction,
    calculate_metrics_and_save_for_date
)
from controllers.keyword_analysis import (
    analyze_keywords_from_database,
    save_summary_to_database,
    cleanup_old_keywords
)
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def collect():
    """
    Daily pipeline: keyword extraction + analysis + cleanup.

    This function orchestrates the complete daily pipeline:
    1. Fetch latest articles (past day), run keyword detection, and store keywords
    2. Perform analysis for "today", "recent week", and "recent month" periods
    3. Delete keywords older than 60 days from the keywords collection

    Raises:
        Exception: Any failure is logged with its traceback and re-raised so
            the calling scheduler registers the run as failed.
    """
    logger.info("Starting daily keyword analysis pipeline")

    try:
        logger.info("Step 1: Daily keyword detection")
        latest_date = _get_latest_article_date()
        if not latest_date:
            logger.warning("No articles found in database. Exiting pipeline.")
            return

        # Idempotency guard: keyword documents are keyed by date string, so
        # skip the expensive LLM extraction when today's document exists.
        day_key = latest_date.strftime("%Y-%m-%d")
        existing_doc = keywords_collection.find_one({"_id": day_key})
        if existing_doc:
            logger.info("Keywords already exist for %s, skipping extraction",
                        day_key)
        else:
            _extract_keywords_for_date(latest_date)

        logger.info("Step 2: Performing analysis for all periods")
        _perform_analysis_for_all_periods()

        logger.info("Step 3: Cleaning up old keywords")
        deleted_count = cleanup_old_keywords(keywords_collection, latest_date)
        logger.info("Cleanup completed: deleted %d old records", deleted_count)

        logger.info("Daily keyword pipeline completed successfully")

    except Exception as e:
        # logger.exception (vs logger.error) records the full traceback;
        # re-raise so the caller/scheduler can see the failure.
        logger.exception("Error in daily keyword pipeline: %s", str(e))
        raise
|
|
|
def _get_latest_article_date() -> Optional[datetime]:
    """
    Get the latest article date from the database.

    Returns:
        Optional[datetime]: Midnight datetime parsed from the newest article's
        ``publishDate`` string, or None if no articles are found.
        (The annotation was previously ``datetime`` even though the function
        returns None on an empty collection.)

    Raises:
        ValueError: If the stored ``publishDate`` is not ``YYYY-MM-DD``.
    """
    logger.info("Finding the most recent date with articles")

    # Sort descending on publishDate and take one document; materialize the
    # cursor so emptiness can be tested.
    cursor = article_collection.find().sort("publishDate", -1).limit(1)
    newest = list(cursor)

    if not newest:
        logger.warning("No articles found in database")
        return None

    # publishDate is stored as a "YYYY-MM-DD" string; parsing yields midnight.
    latest_date_str = newest[0]['publishDate']
    latest_date = datetime.strptime(latest_date_str, "%Y-%m-%d")

    logger.info("Processing articles for %s", latest_date.date())
    return latest_date
|
|
|
def _extract_keywords_for_date(target_date: datetime) -> None:
    """
    Run keyword extraction for one calendar day and persist the results.

    Args:
        target_date: Date for which to extract keywords
    """
    # Expand the date into a full-day window [00:00:00, 23:59:59.999999].
    day_start = target_date.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = target_date.replace(hour=23, minute=59, second=59, microsecond=999999)

    day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
    if not day_articles:
        logger.warning("No articles found for %s", target_date.date())
        return

    # Previously-seen keywords give the extraction step historical context.
    prior_keywords = fetch_historical_keywords(keywords_collection)

    logger.info("Extracting keywords from articles for %s", target_date.date())
    day_keywords = run_llm_extraction(day_articles, prior_keywords)

    if not day_keywords:
        logger.warning("No keywords extracted from articles for %s", target_date.date())
        return

    calculate_metrics_and_save_for_date(keywords_collection, day_keywords, target_date)
    logger.info("Saved %d keywords for %s",
                len(day_keywords), target_date.strftime('%Y-%m-%d'))
|
|
|
def _perform_analysis_for_all_periods() -> None:
    """
    Perform analysis for today, week, and month periods.

    Each period is analyzed independently: a failure in one period is logged
    (with traceback) and the remaining periods still run.
    """
    periods = ["today", "week", "month"]

    for period in periods:
        try:
            logger.info("Analyzing keywords for %s period", period)

            results = analyze_keywords_from_database(period, keywords_collection)

            if results:
                save_summary_to_database(period, results, summary_collection)
                logger.info("Analysis completed for %s: %d terms, %d themes",
                            period, len(results.get('terms', [])), len(results.get('themes', [])))
            else:
                logger.warning("No analysis results for %s period", period)

        except Exception as e:
            # logger.exception (vs logger.error) keeps the traceback; the loop
            # deliberately continues so one bad period doesn't block the rest.
            logger.exception("Error analyzing %s period: %s", period, str(e))
|
|
|
|