Muhammad Abdur Rahman Saad
fix pylint errors
72f4cb5
"""Module for collecting and managing article data from DynamoDB to MongoDB."""
from venv import logger
from datetime import datetime, timedelta
from pymongo.errors import PyMongoError
from models.database import article_collection # pylint: disable=import-error
from .utils import scan_dynamodb_table, delete_old_documents, upsert_item
def _coerce_entity_floats(entities):
    """Convert ``sentimentScore``/``occurrence`` of each entity dict to float in place."""
    for entity in entities:
        if not isinstance(entity, dict):
            continue
        for field in ("sentimentScore", "occurrence"):
            if field not in entity:
                continue
            try:
                entity[field] = float(entity[field])
            except (ValueError, TypeError):
                # Unparseable values are replaced by a neutral 0.0 rather than
                # rejecting the whole article.
                entity[field] = 0.0


def _process_article_item(item):
    """
    Normalize a single article item in place and upsert it into MongoDB.

    Steps performed on ``item``:
        1. ``sentimentScore`` is converted to ``float`` (0.0 when the key is absent).
        2. ``id`` is moved to ``_id`` (``_id`` becomes ``None`` when ``id`` is absent).
        3. When ``entityList`` is a list, ``sentimentScore`` and ``occurrence`` of
           every dict entry are converted to ``float``, falling back to 0.0 when
           the conversion fails.
        4. The item is passed to ``upsert_item`` together with ``article_collection``.

    Items whose ``entityList`` is missing or not a list are still upserted; only
    the per-entity conversion in step 3 is skipped for them.

    Args:
        item (dict): The article item to process. It is modified in place.

    Returns:
        None

    Note:
        ValueError, KeyError, TypeError and PyMongoError raised during processing
        are caught and logged here; they are not propagated to the caller.
    """
    try:
        # Convert sentimentScore to float
        item["sentimentScore"] = float(item.get("sentimentScore", 0.0))
        # Set _id and remove id
        item["_id"] = item.pop("id", None)
        # Convert entityList inner values to float (for MongoDB compatibility).
        # A missing or malformed entityList must not prevent the upsert below.
        entities = item.get("entityList")
        if isinstance(entities, list):
            _coerce_entity_floats(entities)
        # Upsert into MongoDB
        upsert_item(article_collection, item)
    except (ValueError, KeyError, TypeError, PyMongoError) as e:
        # If the failure happened before "id" was moved to "_id", fall back to
        # the original key so the log line still identifies the item.
        logger.error("Error processing item with _id %s: %s",
                     item.get('_id', item.get('id', 'unknown')), e)
def upsert_documents(filter_date):
    """
    Feed articles from the DynamoDB ``Article_China`` table into the article processor.

    This is a thin wrapper: it calls ``scan_dynamodb_table`` with the table name,
    the given ``filter_date`` and ``_process_article_item``.

    NOTE(review): the scan itself, the date filtering and any pagination live in
    ``scan_dynamodb_table``, which is imported from ``.utils`` and not visible
    here. Presumably it invokes ``_process_article_item`` once per scanned item;
    confirm against that helper.

    Args:
        filter_date (str): Date string forwarded to the scan helper. ``collect``
            passes it in 'YYYY-MM-DD' format; presumably it is the minimum
            publish date of the articles to include.

    Returns:
        None
    """
    source_table = "Article_China"
    scan_dynamodb_table(source_table, filter_date, _process_article_item)
def collect(retention_days=60):
    """
    Run one article collection cycle over a rolling retention window.

    The function:
        1. Computes a cutoff date ``retention_days`` days before the current
           local time and formats it as 'YYYY-MM-DD'.
        2. Calls ``upsert_documents`` with that cutoff date.
        3. Calls ``delete_old_documents`` with ``article_collection``, the same
           cutoff date and the module logger (presumably removing documents
           older than the cutoff; confirm against the helper in ``.utils``).

    Args:
        retention_days (int): Size of the rolling window in days. Defaults to 60,
            the window this function has always used.

    Returns:
        None

    Raises:
        ValueError: If ``retention_days`` is negative. A negative window would
            put the cutoff in the future and could make the cleanup step treat
            every stored document as old.
        Exception: Anything raised by ``upsert_documents`` or
            ``delete_old_documents`` is propagated unchanged.
    """
    if retention_days < 0:
        raise ValueError("retention_days must be non-negative")
    # Calculate the cutoff date at the start of the retention window
    cutoff = datetime.now() - timedelta(days=retention_days)
    filter_date = cutoff.strftime('%Y-%m-%d')
    # Scan and process items
    upsert_documents(filter_date)
    # Delete documents that fall outside the retention window
    delete_old_documents(article_collection, filter_date, logger)