Spaces:

Oxbridge-Economics
/

finfast-summary

Running

finfast-summary / app /collectors /finfast /article.py

Muhammad Abdur Rahman Saad

fix pylint errors

72f4cb5 about 2 months ago

4.07 kB

	"""Module for collecting and managing article data from DynamoDB to MongoDB."""
	from venv import logger
	from datetime import datetime, timedelta
	from pymongo.errors import PyMongoError
	from models.database import article_collection # pylint: disable=import-error

	from .utils import scan_dynamodb_table, delete_old_documents, upsert_item

	def _to_float_or_zero(value):
	    """Return ``value`` converted to float, or 0.0 when it cannot be converted."""
	    try:
	        return float(value)
	    except (ValueError, TypeError):
	        return 0.0


	def _process_article_item(item):
	    """
	    Normalise a single article item in place and upsert it into MongoDB.

	    Steps performed on ``item``:
	      * ``sentimentScore`` is converted to ``float`` (0.0 when the key is absent).
	      * ``id`` is moved to ``_id`` (``_id`` becomes ``None`` when ``id`` is absent).
	      * When ``entityList`` is a list, the ``sentimentScore`` and ``occurrence``
	        values of every dict entry are converted to ``float``; values that cannot
	        be converted are replaced with 0.0.
	      * The item is passed to ``upsert_item`` together with ``article_collection``.

	    Items without a usable ``entityList`` are still upserted; only the per-entity
	    conversion is skipped for them.

	    Args:
	        item (dict): The article item to process. It is mutated in place.

	    Returns:
	        None

	    Note:
	        ValueError, KeyError, TypeError and PyMongoError raised while processing
	        are caught and logged; they are not propagated to the caller, so one bad
	        item does not abort the surrounding scan.
	    """
	    try:
	        # Convert the top-level sentimentScore to float
	        item["sentimentScore"] = float(item.get("sentimentScore", 0.0))
	        # MongoDB uses _id as the primary key, so move id across
	        item["_id"] = item.pop("id", None)

	        # Convert entityList inner values to float (for MongoDB compatibility).
	        # A missing or malformed entityList must not prevent the upsert below.
	        entity_list = item.get("entityList")
	        if isinstance(entity_list, list):
	            for entity in entity_list:
	                if not isinstance(entity, dict):
	                    continue
	                for field in ("sentimentScore", "occurrence"):
	                    if field in entity:
	                        entity[field] = _to_float_or_zero(entity[field])

	        # Upsert into MongoDB
	        upsert_item(article_collection, item)

	    except (ValueError, KeyError, TypeError, PyMongoError) as e:
	        # If the failure happened before the id -> _id move, fall back to the
	        # original id so the log line still identifies the item.
	        logger.error("Error processing item with _id %s: %s",
	                     item.get('_id', item.get('id', 'unknown')), e)


	def upsert_documents(filter_date):
	    """
	    Copy articles from the DynamoDB ``Article_China`` table into MongoDB.

	    Delegates to ``scan_dynamodb_table``, handing it the table name, the filter
	    date and ``_process_article_item`` as the per-item callback. The callback
	    converts ``sentimentScore`` to float, renames ``id`` to ``_id`` and upserts
	    the item into the MongoDB article collection.

	    Args:
	        filter_date (str): Date string forwarded to the scan; callers in this
	            module pass it in 'YYYY-MM-DD' format.

	    Returns:
	        None

	    Note:
	        Pagination and the exceptions that may surface (presumably boto3
	        ClientError / BotoCoreError) are determined by ``scan_dynamodb_table``,
	        which is defined in ``.utils`` -- confirm there.
	    """
	    source_table = "Article_China"
	    scan_dynamodb_table(source_table, filter_date, _process_article_item)


	def collect(retention_days=60):
	    """
	    Run one full article collection cycle.

	    The cycle:
	    1. Computes a cutoff date ``retention_days`` days before the current local
	       date and formats it as 'YYYY-MM-DD'.
	    2. Calls ``upsert_documents`` with that cutoff to scan and upsert articles.
	    3. Calls ``delete_old_documents`` with the article collection, the same
	       cutoff and the module logger (presumably removing documents older than
	       the cutoff -- see ``.utils`` for the exact rule).

	    With the default arguments this keeps a rolling 60-day window.

	    Args:
	        retention_days (int): Size of the rolling window in days. Defaults to 60,
	            which matches the previously hard-coded value.

	    Returns:
	        None

	    Raises:
	        Exception: Propagates any exceptions from upsert_documents or
	            delete_old_documents.

	    Note:
	        This function is typically called as part of a scheduled data collection process.
	    """
	    # Calculate the cutoff date (naive local time, as before)
	    cutoff = datetime.now() - timedelta(days=retention_days)
	    filter_date = cutoff.strftime('%Y-%m-%d')

	    # Scan and process items
	    upsert_documents(filter_date)

	    # Delete documents that fall outside the retention window
	    delete_old_documents(article_collection, filter_date, logger)