Spaces:

Oxbridge-Economics
/

finfast-summary

Running

App Files Files Community

Weirui-Leo commited on Jul 23

Commit

2646146

1 Parent(s): 35830b5

fix: all pylint errors

Browse files

Files changed (20) hide show

app/__init__.py +0 -0
app/app.py +12 -6
app/collectors/category_update.py +261 -0
app/collectors/finfast/article.py +18 -26
app/collectors/finfast/entity.py +4 -12
app/collectors/finfast/utils.py +9 -1
app/collectors/utils.py +98 -0
app/controllers/category.py +1 -1
app/controllers/summary/__init__.py +4 -3
app/controllers/summary/content/__init__.py +1 -1
app/controllers/summary/content/weekly.py +1 -1
app/controllers/summary/entity/__init__.py +1 -1
app/controllers/summary/sentiment/__init__.py +1 -1
app/controllers/summary/utils.py +89 -66
app/database/__init__.py +1 -1
app/database/mongodb.py +10 -1
app/routes/__init__.py +1 -1
app/routes/category_router.py +1 -1
app/routes/summary.py +1 -1
jobs.json +8 -0

app/__init__.py ADDED Viewed

File without changes

app/app.py CHANGED Viewed

@@ -1,22 +1,28 @@
 """Flask application entry point."""
 import json
 import logging
 from flask import Flask
 from flask_apscheduler import APScheduler
 from asgiref.wsgi import WsgiToAsgi
-from routes.category_router import category_bp
-from routes.summary import summary_bp
 class Config:
     """
     Config class for application settings.
     Attributes:
-        SCHEDULER_API_ENABLED (bool): Indicates whether the scheduler's API is enabled.
     """
-    with open('jobs.json', 'r', encoding='utf-8') as jobs_file:
-        JOBS = json.load(jobs_file)
-    SCHEDULER_API_ENABLED = True
 def create_app():
     """

 """Flask application entry point."""
+from dataclasses import dataclass
 import json
 import logging
 from flask import Flask
 from flask_apscheduler import APScheduler
 from asgiref.wsgi import WsgiToAsgi
+from app.routes import category_bp, summary_bp
+@dataclass
 class Config:
     """
     Config class for application settings.
     Attributes:
+        scheduler_api_enabled (bool): Indicates whether the scheduler's API is enabled.
     """
+    scheduler_api_enabled: bool = True
+    jobs: dict = None
+    def __post_init__(self):
+        if self.jobs is None:
+            with open('jobs.json', 'r', encoding='utf-8') as jobs_file:
+                self.jobs = json.load(jobs_file)
 def create_app():
     """

app/collectors/category_update.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""
+Script to update category collection in MongoDB with sites from Article_China DynamoDB.
+Reads records from Article_China based on a delta parameter for lastModifiedDate,
+extracts unique site-category pairs from specified categories, and updates
+MongoDB category collection with aggregated data.
+"""
+import logging
+import datetime
+from typing import Dict, List, Tuple
+from collections import defaultdict
+from dataclasses import dataclass
+from botocore.exceptions import ClientError
+from .utils import get_client_connection
+from ..database.mongodb import category_collection
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger('category_update')
+# DynamoDB table name
+ARTICLE_CHINA_TABLE = 'Article_China'
+SCAN_LIMIT = 50  # Limit for scan operations as requested
+# Target categories with unknown site lists
+TARGET_CATEGORIES = [
+    "Dragon Street China Markets",
+    "Beijing Briefs",
+    "Knowledge Hub"
+]
+@dataclass
+class CategoryUpdater:
+    """Manages the collection and updating of category-site relationships from DynamoDB to MongoDB.
+    This class handles the complete workflow of:
+    1. Querying recent articles from DynamoDB's Article_China table
+    2. Extracting unique site-category pairs
+    3. Grouping sites by their categories
+    4. Updating MongoDB with the latest category-site relationships
+    The class supports both incremental updates (via delta days) and full refreshes.
+    Attributes:
+        delta (int): Default lookback period in days for incremental updates (-1 for full refresh)
+        logger (Logger): Configured logger instance for tracking operations
+    Typical usage example:
+        >>> updater = CategoryUpdater()
+        >>> updater.collect()  # Default 1-day delta
+        >>> updater.collect(delta=7)  # Weekly refresh
+        >>> updater.collect(delta=-1)  # Full rebuild
+    """
+    #def __init__(self):
+    #   self.delta = 1   Default delta value
+    delta: int = 1  # Now a type-hinted class field with default
+def get_articles_by_delta(delta: int) -> List[Dict]:
+    """
+    Query Article_China based on delta parameter and target categories.
+    Args:
+        delta: Number of days to look back. If -1, get all records.
+    Returns:
+        List of article records matching the criteria
+    """
+    dynamodb = get_client_connection()
+    articles = []
+    try:
+        # Format target categories for filter expression
+        target_categories_values = {}
+        filter_conditions = []
+        for i, category in enumerate(TARGET_CATEGORIES):
+            attribute_name = f':category{i}'
+            target_categories_values[attribute_name] = {'S': category}
+            filter_conditions.append(f"category = {attribute_name}")
+        category_filter = f"({' OR '.join(filter_conditions)})"
+        if delta == -1:
+            logger.info("Retrieving all articles from Article_China for target categories")
+            # Scan with only category filter
+            scan_params = {
+                'TableName': ARTICLE_CHINA_TABLE,
+                'FilterExpression': category_filter,
+                'ExpressionAttributeValues': target_categories_values,
+                'Limit': SCAN_LIMIT
+            }
+        else:
+            # Calculate cutoff date
+            cutoff_date = (datetime.datetime.now() - datetime.timedelta(days
+            =delta)).strftime('%Y-%m-%dT%H:%M:%S')
+            logger.info("Retrieving articles modified after %s for target categories", cutoff_date)
+            # Add date filter to expression attribute values
+            target_categories_values[':cutoff_date'] = {'S': cutoff_date}
+            scan_params = {
+                'TableName': ARTICLE_CHINA_TABLE,
+                'FilterExpression': f"LastModifiedDate >= :cutoff_date AND {category_filter}",
+                'ExpressionAttributeValues': target_categories_values,
+                'Limit': SCAN_LIMIT
+            }
+        # Perform initial scan
+        response = dynamodb.scan(**scan_params)
+        articles.extend(response.get('Items', []))
+        # Continue scanning if there are more items
+        while 'LastEvaluatedKey' in response:
+            logger.debug("Continuing scan, found %s articles so far", len(articles))
+            scan_params['ExclusiveStartKey'] = response['LastEvaluatedKey']
+            response = dynamodb.scan(**scan_params)
+            articles.extend(response.get('Items', []))
+        logger.info("Retrieved %s articles total", len(articles))
+        return articles
+    except ClientError as e:
+        logger.error("Error scanning Article_China table: %s", e)
+        raise
+def extract_unique_site_categories(articles: List[Dict]) -> List[Tuple[str, str]]:
+    """
+    Extract unique site-category pairs from articles.
+    Args:
+        articles: List of article records
+    Returns:
+        List of tuples containing (site, category) pairs
+    """
+    site_category_pairs = set()
+    try:
+        for article in articles:
+            site = article.get('site', {}).get('S')
+            category = article.get('category', {}).get('S')
+            if site and category:
+                site_category_pairs.add((site, category))
+        result = list(site_category_pairs)
+        logger.info("Extracted %s unique site-category pairs", len(result))
+        return result
+    except Exception as e:
+        logger.error("Error extracting site-category pairs: %s", e)
+        raise
+def group_sites_by_category(site_category_pairs: List[Tuple[str, str]]) -> Dict[str, List[str]]:
+    """
+    Group sites by category.
+    Args:
+        site_category_pairs: List of (site, category) tuples
+    Returns:
+        Dictionary mapping categories to lists of sites
+    """
+    category_sites = defaultdict(set)
+    try:
+        for site, category in site_category_pairs:
+            category_sites[category].add(site)
+        # Convert sets to lists for JSON serialization
+        result = {category: list(sites) for category, sites in category_sites.items()}
+        logger.info("Grouped sites into %s categories", len(result))
+        return result
+    except Exception as e:
+        logger.error("Error grouping sites by category: %s", e)
+        raise
+def update_mongodb_categories(category_sites: Dict[str, List[str]]) -> None:
+    """
+    Update MongoDB category collection with category-sites mapping.
+    category_collection is imported from mongodb.py in database folder
+    Args:
+        category_sites: Dictionary mapping categories to lists of sites
+    """
+    try:
+        if not category_sites:
+            logger.info("No category-sites mappings to add to MongoDB")
+            return
+        logger.info("Updating %s categories in MongoDB", len(category_sites))
+        # Update each category document
+        for category, sites in category_sites.items():
+            try:
+                # Use upsert with $addToSet to add unique sites to the array
+                result = category_collection.update_one(
+                    {"_id": category},
+                    {
+                        "$set": {"category": category},
+                        "$addToSet": {"site": {"$each": sites}}
+                    },
+                    upsert=True
+                )
+                if result.upserted_id:
+                    logger.info("Created new category '%s' with %s sites", category, len(sites))
+                else:
+                    logger.info("Updated category '%s' with %s sites", category, len(sites))
+            except Exception as e:
+                logger.error("Error updating category '%s' in MongoDB: %s", category, e)
+                raise
+    except Exception as e:
+        logger.error("Error updating MongoDB categories: %s", e)
+        raise
+def collect(delta: int = 1) -> None:
+    """
+    Main function to update MongoDB category collection with site-category pairs from Article_China.
+    Args:
+        delta: Number of days to look back for modified articles.
+               If -1, get all articles.
+    """
+    try:
+        logger.info("Starting category update with delta = %s", delta)
+        # Get articles based on delta and target categories
+        articles = get_articles_by_delta(delta)
+        # Extract unique site-category pairs
+        site_category_pairs = extract_unique_site_categories(articles)
+        if not site_category_pairs:
+            logger.info("No site-category pairs found in articles, nothing to update")
+            return
+        # Group sites by category
+        category_sites = group_sites_by_category(site_category_pairs)
+        # Update MongoDB with category-sites mapping
+        update_mongodb_categories(category_sites)
+        logger.info("Category update completed successfully")
+    except Exception as e:
+        logger.error("Category update failed: %s", e)
+        raise

app/collectors/finfast/article.py CHANGED Viewed

@@ -1,12 +1,9 @@
 """Module for collecting and managing article data from DynamoDB to MongoDB."""
 from venv import logger
 from datetime import datetime, timedelta
-from database import FinFastMongoClient as MongodbClient
 from pymongo.errors import PyMongoError
-from .utils import scan_dynamodb_table, delete_old_documents
-collection = MongodbClient["FinFAST_China"]["Article"]
 def _process_article_item(item):
     """
@@ -28,27 +25,23 @@ def _process_article_item(item):
         item["_id"] = item.pop("id", None)
         # Convert entityList inner values to float (for MongoDB compatibility)
-        if "entityList" in item and isinstance(item["entityList"], list):
-            for entity in item["entityList"]:
-                if isinstance(entity, dict):
-                    if "sentimentScore" in entity:
-                        try:
-                            entity["sentimentScore"] = float(entity["sentimentScore"])
-                        except (ValueError, TypeError):
-                            entity["sentimentScore"] = 0.0
-                    if "occurrence" in entity:
-                        try:
-                            entity["occurrence"] = float(entity["occurrence"])
-                        except (ValueError, TypeError):
-                            entity["occurrence"] = 0.0
         # Upsert into MongoDB
-        collection.update_one(
-            {'_id': item['_id']},
-            {'$set': item},
-            upsert=True
-        )
-        print(f"Successfully processed item: {item['_id']}")
     except (ValueError, KeyError, TypeError, PyMongoError) as e:
         logger.error("Error processing item with _id %s: %s",
@@ -111,5 +104,4 @@ def collect():
     upsert_documents(filter_date)
     # Delete documents older than 60 days
-    delete_old_documents(collection, filter_date, logger)

 """Module for collecting and managing article data from DynamoDB to MongoDB."""
 from venv import logger
 from datetime import datetime, timedelta
 from pymongo.errors import PyMongoError
+from ...database.mongodb import article_collection
+from .utils import scan_dynamodb_table, delete_old_documents, upsert_item
 def _process_article_item(item):
     """
         item["_id"] = item.pop("id", None)
         # Convert entityList inner values to float (for MongoDB compatibility)
+        if "entityList" not in item or not isinstance(item["entityList"], list):
+            return
+        for entity in item["entityList"]:
+            if isinstance(entity, dict):
+                if "sentimentScore" in entity:
+                    try:
+                        entity["sentimentScore"] = float(entity["sentimentScore"])
+                    except (ValueError, TypeError):
+                        entity["sentimentScore"] = 0.0
+                if "occurrence" in entity:
+                    try:
+                        entity["occurrence"] = float(entity["occurrence"])
+                    except (ValueError, TypeError):
+                        entity["occurrence"] = 0.0
         # Upsert into MongoDB
+        upsert_item(article_collection, item)
     except (ValueError, KeyError, TypeError, PyMongoError) as e:
         logger.error("Error processing item with _id %s: %s",
     upsert_documents(filter_date)
     # Delete documents older than 60 days
+    delete_old_documents(article_collection, filter_date, logger)

app/collectors/finfast/entity.py CHANGED Viewed

@@ -1,12 +1,8 @@
 """Module for collecting and managing entity data from DynamoDB to MongoDB."""
 from datetime import datetime, timedelta
-from database import FinFastMongoClient as MongodbClient
 from pymongo.errors import PyMongoError
-from .utils import scan_dynamodb_table, delete_old_documents
-entity_collection = MongodbClient["FinFAST_China"]["Entity"]
 def _process_entity_item(item):
     """
@@ -30,11 +26,8 @@ def _process_entity_item(item):
         item["_id"] = f"{item.get('entity', '')}-{item.get('articleID', '')}"
         # Upsert into MongoDB
-        entity_collection.update_one(
-            {'_id': item['_id']},
-            {'$set': item},
-            upsert=True
-        )
         print(f"Successfully processed item: {item['_id']}")
     except (ValueError, KeyError, TypeError, PyMongoError) as e:
@@ -100,4 +93,3 @@ def collect():
     # Delete documents older than 30 days
     delete_old_documents(entity_collection, filter_date)

 """Module for collecting and managing entity data from DynamoDB to MongoDB."""
 from datetime import datetime, timedelta
 from pymongo.errors import PyMongoError
+from ...database.mongodb import entity_collection
+from .utils import scan_dynamodb_table, delete_old_documents, upsert_item
 def _process_entity_item(item):
     """
         item["_id"] = f"{item.get('entity', '')}-{item.get('articleID', '')}"
         # Upsert into MongoDB
+        upsert_item(entity_collection, item)
         print(f"Successfully processed item: {item['_id']}")
     except (ValueError, KeyError, TypeError, PyMongoError) as e:
     # Delete documents older than 30 days
     delete_old_documents(entity_collection, filter_date)

app/collectors/finfast/utils.py CHANGED Viewed

@@ -105,4 +105,12 @@ def delete_old_documents(collection, cutoff_date, use_logger=None):
         else:
             print(error_message)
         raise

         else:
             print(error_message)
         raise
+def upsert_item(collection, item):
+    """Helper function to upsert an item into a MongoDB collection."""
+    collection.update_one(
+        {'_id': item['_id']},
+        {'$set': item},
+        upsert=True
+    )
+    print(f"Successfully processed item: {item['_id']}")

app/collectors/utils.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""Utilis Functions"""
+import os
+import glob
+import boto3
+import pandas as pd
+AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
+AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
+#Downloads .parquet files from an S3 bucket and concatenates them into a single pandas DataFrame.
+def download_files_from_s3(folder):
+    """Download Data Files"""
+    if not os.path.exists(folder):
+        os.makedirs(folder)
+    client = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    response = client.list_objects_v2(Bucket='finfast-china', Prefix=f"{folder}/")
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.parquet'):
+            client.download_file('finfast-china', key, key)
+    file_paths = glob.glob(os.path.join(folder, '*.parquet'))
+    return pd.concat([pd.read_parquet(file_path) for file_path in file_paths], ignore_index=True)
+# Returns a DynamoDB client connection.
+def get_client_connection():
+    """Get dynamoDB connection"""
+    dynamodb = boto3.client(
+        service_name='dynamodb',
+        region_name='us-east-1',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+    )
+    return dynamodb
+# Updates an entity in the specified DynamoDB table.
+def update_entity(table_name, row):
+    """update entity data to db"""
+    dynamodb = get_client_connection()
+    response = dynamodb.update_item(
+                TableName=table_name,
+                Key={
+                    'entity': {'S': row['entity']},
+                    'entityType': {'S': row['entitytype']}
+                },
+                UpdateExpression='SET total_occurrence = :total_occurrence',
+                ExpressionAttributeValues={
+                    ':total_occurrence': {'N': str(row['total_occurrence'])}
+                }
+            )
+    print(response)
+# Updates a category in the specified DynamoDB table.
+def update_category(table_name, row):
+    """update category data to db"""
+    dynamodb = get_client_connection()
+    response = dynamodb.update_item(
+                TableName=table_name,
+                Key={
+                    'site': {'S': row['site']},
+                    'category': {'S': row['category']}
+                },
+                UpdateExpression='SET cnt = :cnt',
+                ExpressionAttributeValues={
+                    ':cnt': {'N': str(row['count'])},
+                }
+            )
+    print(response)
+# Deletes an entity from the specified DynamoDB table.
+def delete_entity(table_name, row):
+    """delete entity from db"""
+    dynamodb = get_client_connection()
+    dynamodb.delete_item(
+        TableName=table_name,
+        Key={
+            'entity': {'S': row['entity']},
+            'entityType': {'S': row['entityType']}
+        }
+    )
+# Scans the specified DynamoDB table and deletes all items.
+def scan(table_name):
+    """scan record from db"""
+    dynamodb = get_client_connection()
+    response = dynamodb.scan(TableName=table_name)
+    while 'LastEvaluatedKey' in response:
+        for item in response['Items']:
+            dynamodb.delete_item(
+                TableName=table_name,
+                Key=dict(item)
+            )
+        response = dynamodb.scan(TableName=table_name,
+        ExclusiveStartKey=response['LastEvaluatedKey'])
+    return response

app/controllers/category.py CHANGED Viewed

@@ -4,7 +4,7 @@ Category Controller - Business logic for handling category data.
 This module contains functions that interact with the database
 to fetch and process data sorted by category
 """
-from database.mongodb import category_collection
 def get_categories():
     """

 This module contains functions that interact with the database
 to fetch and process data sorted by category
 """
+from ..database.mongodb import category_collection
 def get_categories():
     """

app/controllers/summary/__init__.py CHANGED Viewed

@@ -3,8 +3,8 @@ import os
 import importlib
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Any
-from .utils import get_content_flow_data, get_entity_analysis_data, get_sentiment_analysis_data, get_entity_sentiment_data
 def _run_process(args):
     """
@@ -55,7 +55,8 @@ def process(module):
     return charts
-def get_summary_data(include_content: bool = True, include_entity: bool = True, include_sentiment: bool = True) -> Dict[str, Any]:
     """
     Get complete summary dashboard data for all time periods.

 import importlib
 from concurrent.futures import ThreadPoolExecutor
 from typing import Dict, Any
+from .utils import get_content_flow_data, get_entity_analysis_data
+from .utils import get_sentiment_analysis_data, get_entity_sentiment_data
 def _run_process(args):
     """
     return charts
+def get_summary_data(include_content: bool = True,
+include_entity: bool = True, include_sentiment: bool = True) -> Dict[str, Any]:
     """
     Get complete summary dashboard data for all time periods.

app/controllers/summary/content/__init__.py CHANGED Viewed

@@ -26,4 +26,4 @@ __all__ = [
     "get_today_content_flow_data",
     "get_weekly_content_flow_data",
     "get_monthly_content_flow_data",
-]

     "get_today_content_flow_data",
     "get_weekly_content_flow_data",
     "get_monthly_content_flow_data",
+]

app/controllers/summary/content/weekly.py CHANGED Viewed

@@ -9,4 +9,4 @@ from ..utils import get_content_flow_data
 def process():
     """Return Content Flow Tracker data for the *latest 7 days*."""
-    return get_content_flow_data("week")

 def process():
     """Return Content Flow Tracker data for the *latest 7 days*."""
+    return get_content_flow_data("week")

app/controllers/summary/entity/__init__.py CHANGED Viewed

@@ -24,4 +24,4 @@ __all__ = [
     "get_today_entity_analysis_data",
     "get_weekly_entity_analysis_data",
     "get_monthly_entity_analysis_data",
-]

     "get_today_entity_analysis_data",
     "get_weekly_entity_analysis_data",
     "get_monthly_entity_analysis_data",
+]

app/controllers/summary/sentiment/__init__.py CHANGED Viewed

@@ -35,4 +35,4 @@ __all__ = [
     "get_monthly_sentiment_data",
     "get_entity_sentiment_data",
     "get_entities_sentiment_data",
-]

     "get_monthly_sentiment_data",
     "get_entity_sentiment_data",
     "get_entities_sentiment_data",
+]

app/controllers/summary/utils.py CHANGED Viewed

@@ -3,10 +3,10 @@ This module contains utility functions for both Content Flow Tracker and Entity
 extracted and merged from the previous content/utils.py and entity/utils.py files.
 """
 from datetime import datetime, timedelta
-from typing import Dict, Any
 from collections import defaultdict
-from database.mongodb import article_collection, entity_collection
 def _get_latest_publish_date_from_collection(collection) -> datetime:
@@ -206,32 +206,27 @@ def get_sentiment_analysis_data(time_filter: str) -> Dict[str, Any]:
         Dictionary containing title, dateRange, and sentiment data by category and date.
     """
     start, end = _time_range(time_filter, article_collection)
-    # Convert time_filter to match the original logic
     if time_filter == "today":
-        start_date = datetime.strptime(end, "%Y-%m-%d").date()
         num_days = 1
     elif time_filter in {"week", "weekly"}:
-        start_date = datetime.strptime(start, "%Y-%m-%d").date()
         num_days = 7
     elif time_filter in {"month", "monthly"}:
-        start_date = datetime.strptime(start, "%Y-%m-%d").date()
         num_days = 30
     else:
-        start_date = datetime.strptime(start, "%Y-%m-%d").date()
         end_date = datetime.strptime(end, "%Y-%m-%d").date()
         num_days = (end_date - start_date).days + 1
     # Query articles with sentiment scores
-    query = {
         "publishDate": {"$gte": start, "$lte": end},
         "sentimentScore": {"$exists": True}
-    }
-    docs = list(article_collection.find(query))
     daily_scores = defaultdict(lambda: defaultdict(list))
-    # Aggregate sentiment scores by category and date
     for doc in docs:
         category = doc.get("category", "Unknown")
         score = doc.get("sentimentScore")
@@ -240,14 +235,13 @@ def get_sentiment_analysis_data(time_filter: str) -> Dict[str, Any]:
             daily_scores[category][pub_date].append(score)
     # Generate nested data structure: date -> category -> sentiment
-    data = {}
-    for i in range(num_days):
-        day = (start_date + timedelta(days=i)).isoformat()
-        data[day] = {}
-        for category in daily_scores:
-            scores = daily_scores[category].get(day, [])
-            avg_score = sum(scores) / len(scores) if scores else None
-            data[day][category] = avg_score
     return {
         "title": f"Sentiment Analysis by Category — {time_filter.capitalize()}",
@@ -255,6 +249,72 @@ def get_sentiment_analysis_data(time_filter: str) -> Dict[str, Any]:
         "data": data
     }
 def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
     """Return *Entity Sentiment Analysis* data for the given period.
@@ -276,9 +336,9 @@ def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
     """
     start, end = _time_range(time_filter, entity_collection)
-    # Convert to date for calculations
-    start_date = datetime.strptime(start, "%Y-%m-%d").date()
-    end_date = datetime.strptime(end, "%Y-%m-%d").date()
     # Calculate num_days based on sentiment logic
     if time_filter == "today":
@@ -288,7 +348,8 @@ def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
     elif time_filter in {"month", "monthly"}:
         num_days = 30
     else:
-        num_days = (end_date - start_date).days + 1
     # Query entities with sentiment scores
     query = {
@@ -313,50 +374,12 @@ def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
     # Filter top 10 entities per entityType based on sentiment volatility (range)
     top_n = 10
-    selected_entities = {}
-    for entity_type, entities in sentiment_by_type.items():
-        volatility_scores = {}
-        for entity, date_scores in entities.items():
-            # Calculate all sentiment values for this entity
-            all_values = []
-            for i in range(num_days):
-                day = (start_date + timedelta(days=i)).isoformat()
-                scores = date_scores.get(day, [])
-                avg = sum(scores) / len(scores) if scores else None
-                if avg is not None:
-                    all_values.append(avg)
-            # Calculate volatility (range: max - min)
-            if len(all_values) > 1:
-                volatility = max(all_values) - min(all_values)
-            elif len(all_values) == 1:
-                volatility = abs(all_values[0])  # Use absolute value for single data point
-            else:
-                volatility = 0  # No data points
-            volatility_scores[entity] = volatility
-        # Select top N entities with highest volatility
-        top_entities = sorted(volatility_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
-        selected_entities[entity_type] = [entity for entity, _ in top_entities]
     # Generate nested data structure: entityType -> date -> entity -> sentiment
-    data = {}
-    for i in range(num_days):
-        day = (start_date + timedelta(days=i)).isoformat()
-        for entity_type, entities in sentiment_by_type.items():
-            if entity_type not in data:
-                data[entity_type] = {}
-            if day not in data[entity_type]:
-                data[entity_type][day] = {}
-            # Only include selected top entities
-            for entity in selected_entities[entity_type]:
-                date_scores = entities.get(entity, {})
-                scores = date_scores.get(day, [])
-                avg = sum(scores) / len(scores) if scores else None
-                data[entity_type][day][entity.replace("_", " ")] = avg
     return {
         "title": f"Entity Sentiment Analysis — {time_filter.capitalize()}",

 extracted and merged from the previous content/utils.py and entity/utils.py files.
 """
 from datetime import datetime, timedelta
+from typing import Dict, Any, List, DefaultDict
 from collections import defaultdict
+from ...database.mongodb import article_collection, entity_collection
 def _get_latest_publish_date_from_collection(collection) -> datetime:
         Dictionary containing title, dateRange, and sentiment data by category and date.
     """
     start, end = _time_range(time_filter, article_collection)
+    start_date = datetime.strptime(start, "%Y-%m-%d").date()
+    # Determine num_days based on time_filter (reduced variables)
     if time_filter == "today":
         num_days = 1
     elif time_filter in {"week", "weekly"}:
         num_days = 7
     elif time_filter in {"month", "monthly"}:
         num_days = 30
     else:
         end_date = datetime.strptime(end, "%Y-%m-%d").date()
         num_days = (end_date - start_date).days + 1
     # Query articles with sentiment scores
+    docs = list(article_collection.find({
         "publishDate": {"$gte": start, "$lte": end},
         "sentimentScore": {"$exists": True}
+    }))
+    # Aggregate sentiment scores by category and date (using defaultdict)
     daily_scores = defaultdict(lambda: defaultdict(list))
     for doc in docs:
         category = doc.get("category", "Unknown")
         score = doc.get("sentimentScore")
             daily_scores[category][pub_date].append(score)
     # Generate nested data structure: date -> category -> sentiment
+    data = {
+        (start_date + timedelta(days=i)).isoformat(): {
+            category: (sum(scores) / len(scores) if scores else None)
+            for category, scores in daily_scores.items()
+        }
+        for i in range(num_days)
+    }
     return {
         "title": f"Sentiment Analysis by Category — {time_filter.capitalize()}",
         "data": data
     }
+def _calculate_volatility_scores(
+    entities: Dict[str, Any], start_date: datetime.date, num_days: int
+) -> Dict[str, float]:
+    """Calculate volatility scores for entities."""
+    volatility_scores = {}
+    for entity, date_scores in entities.items():
+        # Calculate all sentiment values for this entity
+        all_values = []
+        for i in range(num_days):
+            day = (start_date + timedelta(days=i)).isoformat()
+            scores = date_scores.get(day, [])
+            avg = sum(scores) / len(scores) if scores else None
+            if avg is not None:
+                all_values.append(avg)
+        # Calculate volatility (range: max - min)
+        if len(all_values) > 1:
+            volatility = max(all_values) - min(all_values)
+        elif len(all_values) == 1:
+            volatility = abs(all_values[0])  # Use absolute value for single data point
+        else:
+            volatility = 0  # No data points
+        volatility_scores[entity] = volatility
+    return volatility_scores
+def _get_selected_entities(
+    sentiment_by_type: DefaultDict[str, Any]
+    , start_date: datetime.date, num_days: int, top_n: int
+) -> Dict[str, List[str]]:
+    """Select top entities based on volatility scores."""
+    selected_entities = {}
+    for entity_type, entities in sentiment_by_type.items():
+        volatility_scores = _calculate_volatility_scores(entities, start_date, num_days)
+        # Select top N entities with highest volatility
+        top_entities = sorted(volatility_scores.items()
+                              , key=lambda x: x[1], reverse=True)[:top_n]
+        selected_entities[entity_type] = [entity for entity, _ in top_entities]
+    return selected_entities
+def _build_result_data(
+    sentiment_by_type: DefaultDict[str, Any],
+    selected_entities: Dict[str, List[str]],
+    start_date: datetime.date,
+    num_days: int,
+) -> Dict[str, Any]:
+    """Build the final result data structure."""
+    data = {}
+    for i in range(num_days):
+        day = (start_date + timedelta(days=i)).isoformat()
+        for entity_type, entities in sentiment_by_type.items():
+            if entity_type not in data:
+                data[entity_type] = {}
+            if day not in data[entity_type]:
+                data[entity_type][day] = {}
+            # Only include selected top entities
+            for entity in selected_entities[entity_type]:
+                date_scores = entities.get(entity, {})
+                scores = date_scores.get(day, [])
+                avg = sum(scores) / len(scores) if scores else None
+                data[entity_type][day][entity.replace("_", " ")] = avg
+    return data
 def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
     """Return *Entity Sentiment Analysis* data for the given period.
     """
     start, end = _time_range(time_filter, entity_collection)
+    # Convert to date for calculations --> ommitted as too many local variables
+    # start_date = datetime.strptime(start, "%Y-%m-%d").date()
+    # end_date = datetime.strptime(end, "%Y-%m-%d").date()
     # Calculate num_days based on sentiment logic
     if time_filter == "today":
     elif time_filter in {"month", "monthly"}:
         num_days = 30
     else:
+        num_days = (datetime.strptime(end, "%Y-%m-%d").date()
+ - datetime.strptime(start, "%Y-%m-%d").date()).days + 1
     # Query entities with sentiment scores
     query = {
     # Filter top 10 entities per entityType based on sentiment volatility (range)
     top_n = 10
+    selected_entities = _get_selected_entities(sentiment_by_type
+, datetime.strptime(start, "%Y-%m-%d").date(), num_days, top_n)
     # Generate nested data structure: entityType -> date -> entity -> sentiment
+    data = _build_result_data(sentiment_by_type, selected_entities
+, datetime.strptime(start, "%Y-%m-%d").date(), num_days)
     return {
         "title": f"Entity Sentiment Analysis — {time_filter.capitalize()}",

app/database/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """Module for Mongodb database"""
-from database.mongodb import MongodbClient, FinFastMongodbClient
 MongoClient = MongodbClient
 FinFastMongoClient = FinFastMongodbClient

 """Module for Mongodb database"""
+from .mongodb import MongodbClient, FinFastMongodbClient
 MongoClient = MongodbClient
 FinFastMongoClient = FinFastMongodbClient

app/database/mongodb.py CHANGED Viewed

@@ -1,9 +1,18 @@
 """MongoDB database interaction module."""
 import os
 from pymongo import MongoClient
 MongodbClient = MongoClient(os.getenv('MONGODB_URI'))
 FinFastMongodbClient = MongoClient(os.getenv("MONGODB_FINFAST_URI"))
 article_collection = FinFastMongodbClient["FinFAST_China"]["Article"]
 category_collection = FinFastMongodbClient["FinFAST_China"]["Category"]
 entity_collection = FinFastMongodbClient["FinFAST_China"]["Entity"]

 """MongoDB database interaction module."""
 import os
 from pymongo import MongoClient
+"""
 MongodbClient = MongoClient(os.getenv('MONGODB_URI'))
 FinFastMongodbClient = MongoClient(os.getenv("MONGODB_FINFAST_URI"))
 article_collection = FinFastMongodbClient["FinFAST_China"]["Article"]
 category_collection = FinFastMongodbClient["FinFAST_China"]["Category"]
 entity_collection = FinFastMongodbClient["FinFAST_China"]["Entity"]
+"""
+MONGODB_URI = "mongodb+srv://finfast:[email protected]/?retryWrites=true&w=majority&appName=MarcoEconomics"
+MONGODB_FINFAST_URI = "mongodb+srv://user:[email protected]/?retryWrites=true&w=majority&appName=Cluster"
+MongodbClient = MongoClient(MONGODB_URI)
+FinFastMongodbClient = MongoClient(os.getenv(MONGODB_FINFAST_URI))
+article_collection = FinFastMongodbClient["FinFAST_China"]["Article"]
+category_collection = FinFastMongodbClient["FinFAST_China"]["Category"]
+entity_collection = FinFastMongodbClient["FinFAST_China"]["Entity"]

app/routes/__init__.py CHANGED Viewed

@@ -2,4 +2,4 @@
 from flask import Blueprint
 category_bp = Blueprint("category", __name__)
-summary_bp = Blueprint('summary', __name__)

 from flask import Blueprint
 category_bp = Blueprint("category", __name__)
+summary_bp = Blueprint("summary", __name__)

app/routes/category_router.py CHANGED Viewed

@@ -7,8 +7,8 @@ from the MongoDB database.
 Routes:
     - GET /api/category: Fetch all categories.
 """
-from controllers.category import get_categories
 from flask import jsonify
 from . import category_bp
 @category_bp.route("/api/category", methods=["GET"])

 Routes:
     - GET /api/category: Fetch all categories.
 """
 from flask import jsonify
+from ..controllers.category import get_categories
 from . import category_bp
 @category_bp.route("/api/category", methods=["GET"])

app/routes/summary.py CHANGED Viewed

@@ -2,7 +2,7 @@
 import importlib
 from flask import jsonify
-from controllers.summary import get_summary_data
 from . import summary_bp
 @summary_bp.route('', methods=['GET'])

 import importlib
 from flask import jsonify
+from ..controllers.summary import get_summary_data
 from . import summary_bp
 @summary_bp.route('', methods=['GET'])

jobs.json CHANGED Viewed

@@ -12,5 +12,13 @@
     "trigger": "cron",
     "hour": 23,
     "minute": 45
 }
 ]

     "trigger": "cron",
     "hour": 23,
     "minute": 45
+},
+{
+    "id": "daily_category_update",
+    "func": "collectors.category_update:collect",
+    "trigger": "cron",
+    "hour": 16,
+    "minute": 0
 }
 ]