|
"""Keyword collector for historical backfill using LLM-based keyword extraction.""" |
|
from datetime import datetime, timedelta |
|
import logging |
|
|
|
from models.database import article_collection, keywords_collection |
|
from controllers.keyword import ( |
|
fetch_articles_for_period, |
|
fetch_historical_keywords, |
|
run_llm_extraction, |
|
calculate_metrics_and_save_for_date |
|
) |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
def collect():
    """
    Run the 60-day historical keyword backfill.

    Processes articles one day at a time (oldest first) so the historical
    keyword context builds up gradually:
      - fetches each day's articles from ``article_collection``
      - runs LLM-based keyword extraction, feeding in previously stored
        keywords as context for consistent categorization
      - stores results in ``keywords_collection`` with frequency and
        popularity metrics

    The range covers the 60 calendar days ending yesterday (today is
    excluded since its articles may still be arriving).
    """
    backfill_days = 60

    logger.info("Starting %d-day historical keyword backfill", backfill_days)

    # End at yesterday; subtract (backfill_days - 1) so the inclusive
    # day-by-day loop below visits exactly `backfill_days` days.
    # (The previous `days=60` with a `<=` loop processed 61 days.)
    end_date = datetime.now() - timedelta(days=1)
    start_date = end_date - timedelta(days=backfill_days - 1)

    logger.info("Processing articles from %s to %s", start_date.date(), end_date.date())

    # publishDate is stored as a "YYYY-MM-DD" string, so lexicographic
    # comparison matches chronological order.
    total_articles = article_collection.count_documents({
        "publishDate": {
            "$gte": start_date.strftime("%Y-%m-%d"),
            "$lte": end_date.strftime("%Y-%m-%d"),
        }
    })

    if total_articles == 0:
        logger.warning("No articles found in the specified date range")
        return

    logger.info("Found %d articles to process", total_articles)

    current_date = start_date
    processed_days = 0
    total_keywords = 0

    while current_date <= end_date:
        logger.info("Processing %s", current_date.date())
        extracted_count = _process_day(current_date)
        if extracted_count:
            processed_days += 1
            total_keywords += extracted_count
        current_date += timedelta(days=1)

    logger.info("Historical backfill completed")
    logger.info("Summary: Processed %d days, extracted %d total keywords",
                processed_days, total_keywords)
    if processed_days > 0:
        logger.info("Average keywords per day: %.1f",
                    total_keywords / processed_days)


def _process_day(current_date):
    """
    Extract and persist keywords for a single day.

    Args:
        current_date: datetime whose calendar day should be processed.

    Returns:
        Number of keywords extracted for the day. 0 when the day had no
        articles, extraction produced nothing, or an error occurred.
    """
    day_start = current_date.replace(hour=0, minute=0, second=0, microsecond=0)
    day_end = current_date.replace(hour=23, minute=59, second=59, microsecond=999999)

    day_articles = fetch_articles_for_period(article_collection, day_start, day_end)
    if not day_articles:
        logger.info("No articles found for %s", current_date.date())
        return 0

    try:
        # Previously stored keywords give the LLM context so extraction
        # stays consistent with existing keyword patterns.
        historical_keywords = fetch_historical_keywords(keywords_collection)
        extracted_keywords = run_llm_extraction(day_articles, historical_keywords)

        if not extracted_keywords:
            logger.warning("No keywords extracted for %s", current_date.date())
            return 0

        calculate_metrics_and_save_for_date(
            keywords_collection,
            extracted_keywords,
            current_date,
        )
        logger.info("Extracted %d keywords for %s",
                    len(extracted_keywords), current_date.date())
        return len(extracted_keywords)
    except Exception as e:
        # Per-day isolation: one bad day must not abort the whole backfill.
        # logger.exception also records the traceback for diagnosis.
        logger.exception("Error processing %s: %s", current_date.date(), e)
        return 0
|
|
|
# Allow running the backfill directly as a standalone script.
if __name__ == "__main__":

    collect()
|
|