Weirui-Leo committed
Commit c2e02ba · 1 Parent(s): 2d59648

migrate finfast summary data draft

app/app.py CHANGED
@@ -1,23 +1,23 @@
 """Flask application entry point."""
-# import json
+import json
 import logging
 from flask import Flask
 from flask_apscheduler import APScheduler
 from asgiref.wsgi import WsgiToAsgi
-from routes.category_router import category_bp

-# from routes.summary import summary_bp
+from routes.summary import summary_bp
+from routes.category_router import category_bp

-# class Config:
-#     """
-#     Config class for application settings.
+class Config:
+    """
+    Config class for application settings.

-#     Attributes:
-#         SCHEDULER_API_ENABLED (bool): Indicates whether the scheduler's API is enabled.
-#     """
-#     with open('jobs.json', 'r', encoding='utf-8') as jobs_file:
-#         JOBS = json.load(jobs_file)
-#     SCHEDULER_API_ENABLED = True
+    Attributes:
+        SCHEDULER_API_ENABLED (bool): Indicates whether the scheduler's API is enabled.
+    """
+    with open('jobs.json', 'r', encoding='utf-8') as jobs_file:
+        JOBS = json.load(jobs_file)
+    SCHEDULER_API_ENABLED = True

 def create_app():
     """
app/controllers/summary/__init__.py ADDED
@@ -0,0 +1,99 @@
+"""Summary controller package for handling summary page related operations."""
+import os
+import importlib
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Any
+from .utils import get_content_flow_data, get_entity_analysis_data, get_sentiment_analysis_data, get_entity_sentiment_data
+
+
+def _run_process(args):
+    """
+    Dynamically imports and runs the 'process' function from a specified summary module.
+
+    Args:
+        args (tuple):
+            - module (str): The name of the summary module folder to import from.
+            - chart_id (str): The name of the chart module to import and run.
+
+    Returns:
+        Any: The result returned by the 'process' function of the specified module.
+
+    Raises:
+        ModuleNotFoundError: If the specified summary module does not exist.
+        AttributeError: If the 'process' function is not found in the module.
+    """
+    module, chart_id = args
+    return importlib.import_module(f"controllers.summary.{module}.{chart_id}").process()
+
+
+def process(module):
+    """
+    Processes all Python chart modules within a specified subdirectory.
+
+    Args:
+        module (str): The name of the subdirectory (module) containing chart Python files.
+
+    Returns:
+        list: A list of results returned by processing each chart Python file.
+
+    Notes:
+        - Only files ending with ".py" and not named "__init__.py" are considered.
+        - Shared helpers such as utils.py live in the parent package, so they are not scanned here.
+        - Chart files are processed concurrently using a thread pool.
+        - The helper function `_run_process` is used to process each chart file.
+    """
+    current_dir = os.path.join(os.path.dirname(__file__), module)
+    chart_ids = [
+        f[:-3] for f in os.listdir(current_dir)
+        if f.endswith(".py") and f not in ("__init__.py",)
+    ]
+    with ThreadPoolExecutor() as executor:
+        charts = list(executor.map(
+            _run_process,
+            [(module, chart_id) for chart_id in sorted(chart_ids)]
+        ))
+    return charts
+
+
+def get_summary_data(include_content: bool = True, include_entity: bool = True, include_sentiment: bool = True) -> Dict[str, Any]:
+    """
+    Get complete summary dashboard data for all time periods.
+
+    This function aggregates content flow, entity analysis, and sentiment analysis data across
+    three time periods: today, week, and month.
+
+    Args:
+        include_content (bool, optional): Whether to include content flow data.
+            Defaults to True.
+        include_entity (bool, optional): Whether to include entity analysis data.
+            Defaults to True.
+        include_sentiment (bool, optional): Whether to include sentiment analysis data.
+            Defaults to True.
+
+    Returns:
+        Dict[str, Any]: Summary data containing content, entity, and/or sentiment information:
+            - "content": Content flow data for today/week/month (if include_content=True)
+            - "entity": Entity analysis data for today/week/month (if include_entity=True)
+            - "sentiment": Sentiment analysis data for today/week/month (if include_sentiment=True)
+    """
+    summary = {}
+    if include_content:
+        summary["content"] = {
+            "today": get_content_flow_data("today"),
+            "week": get_content_flow_data("week"),
+            "month": get_content_flow_data("month")
+        }
+    if include_entity:
+        summary["entity"] = {
+            "today": get_entity_analysis_data("today"),
+            "week": get_entity_analysis_data("week"),
+            "month": get_entity_analysis_data("month")
+        }
+    if include_sentiment:
+        summary["sentiment"] = {
+            "today": get_sentiment_analysis_data("today"),
+            "week": get_sentiment_analysis_data("week"),
+            "month": get_sentiment_analysis_data("month"),
+            "entities": get_entity_sentiment_data("week")
+        }
+    return summary
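Editor's note: for orientation, here is a minimal usage sketch of the package above. It assumes it runs inside the app package where `controllers` is importable; the comments describe expected shapes, not real output, and the snippet is not part of this commit.

# Hypothetical usage sketch (not part of this commit).
from controllers.summary import get_summary_data, process

# Runs content/monthly.py, content/today.py and content/weekly.py concurrently
# via the ThreadPoolExecutor in process(); results come back in sorted chart_id order.
content_charts = process("content")

# Aggregates the requested blocks for today/week/month.
dashboard = get_summary_data(include_sentiment=False)
print(sorted(dashboard.keys()))                 # ['content', 'entity']
print(dashboard["content"]["week"]["dateRange"])  # {'start': 'YYYY-MM-DD', 'end': 'YYYY-MM-DD'}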
app/controllers/summary/content/__init__.py ADDED
@@ -0,0 +1,29 @@
+"""Public interface for content-related summary calculations."""
+
+from ..utils import get_content_flow_data as _base
+
+
+def get_today_content_flow_data():
+    """Return Content Flow data for *today*."""
+    return _base("today")
+
+
+def get_weekly_content_flow_data():
+    """Return Content Flow data for the *latest 7 days*."""
+    # the internal util accepts either ``week`` or ``weekly``
+    return _base("week")
+
+
+def get_monthly_content_flow_data():
+    """Return Content Flow data for the *latest 30 days*."""
+    return _base("month")
+
+# Re-export the generic helper so legacy imports keep working
+get_content_flow_data = _base  # type: ignore
+
+__all__ = [
+    "get_content_flow_data",
+    "get_today_content_flow_data",
+    "get_weekly_content_flow_data",
+    "get_monthly_content_flow_data",
+]
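Editor's note: a short, hypothetical illustration of the re-export above. Existing callers that imported get_content_flow_data keep working, while new code can use the period-specific wrappers; both paths hit the same function in utils.py.

# Hypothetical import sketch (not part of this commit).
from controllers.summary.content import (
    get_content_flow_data,           # legacy name, re-exported from ..utils
    get_weekly_content_flow_data,    # new period-specific wrapper
)

legacy = get_content_flow_data("week")       # old call style
wrapped = get_weekly_content_flow_data()     # equivalent wrapper; both call utils.get_content_flow_data("week")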
app/controllers/summary/content/monthly.py ADDED
@@ -0,0 +1,12 @@
+"""Content Flow Tracker - Monthly Module.
+
+This module provides content flow analysis for the latest 30 days of data.
+It processes and returns aggregated article statistics by source and category
+for the past 30 days from the most recent article date.
+"""
+from ..utils import get_content_flow_data
+
+
+def process():
+    """Return Content Flow Tracker data for the *latest 30 days*."""
+    return get_content_flow_data("month")
app/controllers/summary/content/today.py ADDED
@@ -0,0 +1,12 @@
+"""Content Flow Tracker - Today Module.
+
+This module provides content flow analysis for today's data only.
+It processes and returns aggregated article statistics by source and category
+for the current day.
+"""
+from ..utils import get_content_flow_data
+
+
+def process():
+    """Return Content Flow Tracker data for *today* only."""
+    return get_content_flow_data("today")
app/controllers/summary/content/weekly.py ADDED
@@ -0,0 +1,12 @@
+"""Content Flow Tracker - Weekly Module.
+
+This module provides content flow analysis for the latest 7 days of data.
+It processes and returns aggregated article statistics by source and category
+for the past 7 days from the most recent article date.
+"""
+from ..utils import get_content_flow_data
+
+
+def process():
+    """Return Content Flow Tracker data for the *latest 7 days*."""
+    return get_content_flow_data("week")
app/controllers/summary/entity/__init__.py ADDED
@@ -0,0 +1,27 @@
+"""Public interface for entity analysis calculations."""
+from ..utils import get_entity_analysis_data as _base
+
+
+def get_today_entity_analysis_data():
+    """Return Entity Analysis data for *today*."""
+    return _base("today")
+
+
+def get_weekly_entity_analysis_data():
+    """Return Entity Analysis data for the *latest 7 days*."""
+    return _base("week")
+
+
+def get_monthly_entity_analysis_data():
+    """Return Entity Analysis data for the *latest 30 days*."""
+    return _base("month")
+
+# Re-export generic helper for backward compatibility
+get_entity_analysis_data = _base  # type: ignore
+
+__all__ = [
+    "get_entity_analysis_data",
+    "get_today_entity_analysis_data",
+    "get_weekly_entity_analysis_data",
+    "get_monthly_entity_analysis_data",
+]
app/controllers/summary/entity/monthly.py ADDED
@@ -0,0 +1,12 @@
+"""Entity Analysis - Monthly Module.
+
+This module provides entity analysis for the latest 30 days of data.
+It processes and returns top entities with their mention counts and types
+for the past 30 days from the most recent article date.
+"""
+from ..utils import get_entity_analysis_data
+
+
+def process():
+    """Return Entity Analysis data for the *latest 30 days*."""
+    return get_entity_analysis_data("month")
app/controllers/summary/entity/today.py ADDED
@@ -0,0 +1,12 @@
+"""Entity Analysis - Today Module.
+
+This module provides entity analysis for today's data only.
+It processes and returns top entities with their mention counts and types
+for the current day.
+"""
+from ..utils import get_entity_analysis_data
+
+
+def process():
+    """Return Entity Analysis data for *today*."""
+    return get_entity_analysis_data("today")
app/controllers/summary/entity/weekly.py ADDED
@@ -0,0 +1,12 @@
+"""Entity Analysis - Weekly Module.
+
+This module provides entity analysis for the latest 7 days of data.
+It processes and returns top entities with their mention counts and types
+for the past 7 days from the most recent article date.
+"""
+from ..utils import get_entity_analysis_data
+
+
+def process():
+    """Return Entity Analysis data for the *latest 7 days*."""
+    return get_entity_analysis_data("week")
app/controllers/summary/sentiment/__init__.py ADDED
@@ -0,0 +1,38 @@
+"""Public interface for sentiment analysis calculations."""
+
+from ..utils import get_sentiment_analysis_data as _base_sentiment
+from ..utils import get_entity_sentiment_data as _base_entity_sentiment
+
+
+def get_today_sentiment_data():
+    """Return Sentiment Analysis data for *today*."""
+    return _base_sentiment("today")
+
+
+def get_weekly_sentiment_data():
+    """Return Sentiment Analysis data for the *latest 7 days*."""
+    return _base_sentiment("week")
+
+
+def get_monthly_sentiment_data():
+    """Return Sentiment Analysis data for the *latest 30 days*."""
+    return _base_sentiment("month")
+
+
+def get_entity_sentiment_data():
+    """Return Entity Sentiment Analysis data for the *latest 7 days*."""
+    return _base_entity_sentiment("week")
+
+
+# Re-export the generic helpers for backward compatibility
+get_sentiment_analysis_data = _base_sentiment
+get_entities_sentiment_data = _base_entity_sentiment
+
+__all__ = [
+    "get_sentiment_analysis_data",
+    "get_today_sentiment_data",
+    "get_weekly_sentiment_data",
+    "get_monthly_sentiment_data",
+    "get_entity_sentiment_data",
+    "get_entities_sentiment_data",
+]
app/controllers/summary/sentiment/entitites.py ADDED
@@ -0,0 +1,12 @@
+"""Entity Sentiment Analysis Module.
+
+This module provides sentiment analysis for entities mentioned in articles.
+It processes and returns aggregated sentiment scores by entity type and entity name
+for the latest 7 days of data.
+"""
+from ..utils import get_entity_sentiment_data
+
+
+def process():
+    """Return Entity Sentiment Analysis data for the *latest 7 days*."""
+    return get_entity_sentiment_data("week")
app/controllers/summary/sentiment/monthly.py ADDED
@@ -0,0 +1,11 @@
+"""Sentiment Analysis - Monthly Module.
+
+This module provides sentiment analysis for the latest 30 days of articles by category.
+It processes and returns aggregated sentiment scores by category for the past 30 days from the most recent article date.
+"""
+from ..utils import get_sentiment_analysis_data
+
+
+def process():
+    """Return Sentiment Analysis data for the *latest 30 days*."""
+    return get_sentiment_analysis_data("month")
app/controllers/summary/sentiment/today.py ADDED
@@ -0,0 +1,11 @@
+"""Sentiment Analysis - Today Module.
+
+This module provides sentiment analysis for today's articles by category.
+It processes and returns aggregated sentiment scores by category for the current day.
+"""
+from ..utils import get_sentiment_analysis_data
+
+
+def process():
+    """Return Sentiment Analysis data for *today* only."""
+    return get_sentiment_analysis_data("today")
app/controllers/summary/sentiment/weekly.py ADDED
@@ -0,0 +1,11 @@
+"""Sentiment Analysis - Weekly Module.
+
+This module provides sentiment analysis for the latest 7 days of articles by category.
+It processes and returns aggregated sentiment scores by category for the past 7 days from the most recent article date.
+"""
+from ..utils import get_sentiment_analysis_data
+
+
+def process():
+    """Return Sentiment Analysis data for the *latest 7 days*."""
+    return get_sentiment_analysis_data("week")
app/controllers/summary/utils.py ADDED
@@ -0,0 +1,365 @@
+"""Utility functions for Summary calculations.
+This module contains utility functions for both Content Flow Tracker and Entity Analysis,
+extracted and merged from the previous content/utils.py and entity/utils.py files.
+"""
+from datetime import datetime, timedelta
+from typing import Dict, Any
+from collections import defaultdict
+
+from database.mongodb import article_collection, entity_collection
+
+
+def _get_latest_publish_date_from_collection(collection) -> datetime:
+    """Return the latest publish date found in the specified collection.
+
+    Parameters
+    ----------
+    collection:
+        MongoDB collection to query for the latest publishDate.
+
+    Returns
+    -------
+    datetime
+        Latest publish date found, or current date if collection is empty.
+    """
+    latest_doc = collection.find_one(
+        sort=[("publishDate", -1)], projection={"publishDate": 1}
+    )
+    if latest_doc and "publishDate" in latest_doc:
+        return datetime.strptime(latest_doc["publishDate"], "%Y-%m-%d")
+    return datetime.today()
+
+
+def _time_range(filter_type: str, collection) -> tuple[str, str]:
+    """Calculate *inclusive* start / end date strings using rolling window approach.
+
+    Uses rolling window logic:
+    - today: only the latest date
+    - weekly: latest date - 6 days (total 7 days)
+    - monthly: latest date - 29 days (total 30 days)
+
+    Parameters
+    ----------
+    filter_type:
+        One of ``today``, ``week``/``weekly`` or ``month``/``monthly``. Any
+        unrecognised value will fall back to *all time* where the start date is
+        ``datetime.min``.
+    collection:
+        MongoDB collection to get the latest date from.
+
+    Returns
+    -------
+    tuple[str, str]
+        Start and end dates as strings in YYYY-MM-DD format.
+    """
+    latest_date = _get_latest_publish_date_from_collection(collection)
+
+    if filter_type in {"today"}:
+        start = latest_date.date()
+    elif filter_type in {"week", "weekly"}:
+        # Latest date minus 6 days (total 7 days)
+        start = (latest_date - timedelta(days=6)).date()
+    elif filter_type in {"month", "monthly"}:
+        # Latest date minus 29 days (total 30 days)
+        start = (latest_date - timedelta(days=29)).date()
+    else:
+        start = datetime.min.date()
+
+    return str(start), str(latest_date.date())
+
+
+
+def get_content_flow_data(time_filter: str) -> Dict[str, Any]:
+    """Return aggregated *Content Flow Tracker* data for the given period.
+
+    Uses rolling window approach:
+    - today: only the latest date
+    - weekly: latest date - 6 days (total 7 days)
+    - monthly: latest date - 29 days (total 30 days)
+
+    Parameters
+    ----------
+    time_filter:
+        Time period filter ('today', 'week'/'weekly', 'month'/'monthly', or any other for all time).
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary containing title, dateRange, and aggregated content flow data.
+    """
+    start, end = _time_range(time_filter, article_collection)
+
+    pipeline = [
+        {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
+        {"$group": {"_id": {"source": "$site", "category": "$category"}, "count": {"$sum": 1}}},
+        {"$sort": {"count": -1}},
+    ]
+
+    results = list(article_collection.aggregate(pipeline))
+
+    data = [
+        {
+            "category": r["_id"].get("category", "Uncategorized"),
+            "source": r["_id"]["source"],
+            "count": r["count"],
+        }
+        for r in results
+    ]
+
+    return {
+        "title": f"Content Flow Tracker — {time_filter.capitalize()}",
+        "dateRange": {"start": start, "end": end},
+        "data": data,
+    }
+
+
+def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
+    """Return *Entity Analysis* data for the given period.
+
+    Uses rolling window approach:
+    - today: only the latest date
+    - weekly: latest date - 6 days (total 7 days)
+    - monthly: latest date - 29 days (total 30 days)
+
+    Parameters
+    ----------
+    time_filter:
+        Time period filter ('today', 'week'/'weekly', 'month'/'monthly', or any other for all time).
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary containing title, dateRange, and aggregated entity analysis data.
+    """
+    start, end = _time_range(time_filter, entity_collection)
+
+    pipeline = [
+        {"$match": {"publishDate": {"$gte": start, "$lte": end}}},
+        {"$group": {"_id": {"entity": "$entity", "type": "$entityType"},
+                    "mentions": {"$sum": "$occurrence"}}},
+        {"$sort": {"mentions": -1}},
+    ]
+
+    results = list(entity_collection.aggregate(pipeline))
+
+    type_full_names = {
+        "GPE": "Geopolitical Entities (Countries/Cities)",
+        "LOC": "Locations (Non-political)",
+        "ORG": "Organizations",
+        "PER": "People",
+        "PERSON": "People",
+        "PROD": "Products",
+        "PRODUCT": "Products",
+        "PRODCAT": "Product Categories",
+        "PRODUCT_CATEGORY": "Product Categories",
+        "COM": "Companies",
+        "EVENT": "Events",
+        "LANGUAGE": "Languages",
+        "NORP": "Nationalities/Religious/Political Groups",
+        "LAW": "Laws/Legal Documents",
+        "FAC": "Facilities/Landmarks",
+        "INS": "Industry Institutions",
+    }
+
+    entity_types: Dict[str, Any] = {}
+    for r in results:
+        e_type = r["_id"]["type"]
+        entity_name = r["_id"]["entity"].replace("_", " ")
+        if e_type not in entity_types:
+            entity_types[e_type] = {
+                "fullName": type_full_names.get(e_type, e_type),
+                "entities": [],
+            }
+        entity_types[e_type]["entities"].append(
+            {"entityName": entity_name, "mentions": r["mentions"]}
+        )
+
+    # keep only the top 10 per type
+    for type_data in entity_types.values():
+        type_data["entities"] = sorted(
+            type_data["entities"], key=lambda x: -x["mentions"]
+        )[:10]
+
+    return {
+        "title": f"Top Entities - {time_filter.capitalize()}",
+        "dateRange": {"start": start, "end": end},
+        "data": entity_types,
+    }
+
+
+def get_sentiment_analysis_data(time_filter: str) -> Dict[str, Any]:
+    """Return aggregated *Sentiment Analysis* data for articles by category for the given period.
+
+    Uses rolling window approach:
+    - today: only the latest date
+    - weekly: latest date - 6 days (total 7 days)
+    - monthly: latest date - 29 days (total 30 days)
+
+    Parameters
+    ----------
+    time_filter:
+        Time period filter ('today', 'week'/'weekly', 'month'/'monthly', or any other for all time).
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary containing title, dateRange, and sentiment data by category and date.
+    """
+    start, end = _time_range(time_filter, article_collection)
+
+    # Convert time_filter to match the original logic
+    if time_filter == "today":
+        start_date = datetime.strptime(end, "%Y-%m-%d").date()
+        num_days = 1
+    elif time_filter in {"week", "weekly"}:
+        start_date = datetime.strptime(start, "%Y-%m-%d").date()
+        num_days = 7
+    elif time_filter in {"month", "monthly"}:
+        start_date = datetime.strptime(start, "%Y-%m-%d").date()
+        num_days = 30
+    else:
+        start_date = datetime.strptime(start, "%Y-%m-%d").date()
+        end_date = datetime.strptime(end, "%Y-%m-%d").date()
+        num_days = (end_date - start_date).days + 1
+
+    # Query articles with sentiment scores
+    query = {
+        "publishDate": {"$gte": start, "$lte": end},
+        "sentimentScore": {"$exists": True}
+    }
+
+    docs = list(article_collection.find(query))
+    daily_scores = defaultdict(lambda: defaultdict(list))
+
+    # Aggregate sentiment scores by category and date
+    for doc in docs:
+        category = doc.get("category", "Unknown")
+        score = doc.get("sentimentScore")
+        pub_date = doc.get("publishDate")
+        if category and score is not None and pub_date:
+            daily_scores[category][pub_date].append(score)
+
+    # Generate nested data structure: date -> category -> sentiment
+    data = {}
+    for i in range(num_days):
+        day = (start_date + timedelta(days=i)).isoformat()
+        data[day] = {}
+        for category in daily_scores:
+            scores = daily_scores[category].get(day, [])
+            avg_score = sum(scores) / len(scores) if scores else None
+            data[day][category] = avg_score
+
+    return {
+        "title": f"Sentiment Analysis by Category — {time_filter.capitalize()}",
+        "dateRange": {"start": start, "end": end},
+        "data": data
+    }
+
+
+def get_entity_sentiment_data(time_filter: str = "weekly") -> Dict[str, Any]:
+    """Return *Entity Sentiment Analysis* data for the given period.
+
+    Uses rolling window approach:
+    - today: only the latest date
+    - weekly: latest date - 6 days (total 7 days)
+    - monthly: latest date - 29 days (total 30 days)
+
+    Parameters
+    ----------
+    time_filter:
+        Time period filter. Defaults to 'weekly' to match original behavior.
+
+    Returns
+    -------
+    Dict[str, Any]
+        Dictionary containing title, dateRange, and entity sentiment data.
+    """
+    start, end = _time_range(time_filter, entity_collection)
+
+    # Convert to date for calculations
+    start_date = datetime.strptime(start, "%Y-%m-%d").date()
+    end_date = datetime.strptime(end, "%Y-%m-%d").date()
+
+    # Calculate num_days based on sentiment logic
+    if time_filter == "today":
+        num_days = 1
+    elif time_filter in {"week", "weekly"}:
+        num_days = 7
+    elif time_filter in {"month", "monthly"}:
+        num_days = 30
+    else:
+        num_days = (end_date - start_date).days + 1
+
+    # Query entities with sentiment scores
+    query = {
+        "publishDate": {"$gte": start, "$lte": end},
+        "sentimentScore": {"$exists": True},
+        "entity": {"$exists": True},
+        "entityType": {"$exists": True}
+    }
+
+    docs = list(entity_collection.find(query))
+    sentiment_by_type = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
+
+    # Aggregate sentiment scores by entity type, entity name, and date
+    for doc in docs:
+        entity = doc.get("entity", "Unknown")
+        entity_type = doc.get("entityType", "Unknown")
+        score = doc.get("sentimentScore")
+        pub_date = doc.get("publishDate")
+
+        if entity and entity_type and score is not None and pub_date:
+            sentiment_by_type[entity_type][entity][pub_date].append(score)
+
+    # Filter top 10 entities per entityType based on sentiment volatility (range)
+    top_n = 10
+    selected_entities = {}
+
+    for entity_type, entities in sentiment_by_type.items():
+        volatility_scores = {}
+        for entity, date_scores in entities.items():
+            # Calculate all sentiment values for this entity
+            all_values = []
+            for i in range(num_days):
+                day = (start_date + timedelta(days=i)).isoformat()
+                scores = date_scores.get(day, [])
+                avg = sum(scores) / len(scores) if scores else None
+                if avg is not None:
+                    all_values.append(avg)
+
+            # Calculate volatility (range: max - min)
+            if len(all_values) > 1:
+                volatility = max(all_values) - min(all_values)
+            elif len(all_values) == 1:
+                volatility = abs(all_values[0])  # Use absolute value for single data point
+            else:
+                volatility = 0  # No data points
+
+            volatility_scores[entity] = volatility
+
+        # Select top N entities with highest volatility
+        top_entities = sorted(volatility_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
+        selected_entities[entity_type] = [entity for entity, _ in top_entities]
+
+    # Generate nested data structure: entityType -> date -> entity -> sentiment
+    data = {}
+    for i in range(num_days):
+        day = (start_date + timedelta(days=i)).isoformat()
+        for entity_type, entities in sentiment_by_type.items():
+            if entity_type not in data:
+                data[entity_type] = {}
+            if day not in data[entity_type]:
+                data[entity_type][day] = {}
+
+            # Only include selected top entities
+            for entity in selected_entities[entity_type]:
+                date_scores = entities.get(entity, {})
+                scores = date_scores.get(day, [])
+                avg = sum(scores) / len(scores) if scores else None
+                data[entity_type][day][entity.replace("_", " ")] = avg
+
+    return {
+        "title": f"Entity Sentiment Analysis — {time_filter.capitalize()}",
+        "dateRange": {"start": start, "end": end},
+        "data": data
+    }
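Editor's note: to make the rolling-window behaviour of _time_range concrete, here are the values it would return assuming the newest document's publishDate is "2025-07-15" (a made-up date; actual values depend on the collection contents).

# Illustrative expectations only, under the assumption above.
_time_range("today", article_collection)   # -> ("2025-07-15", "2025-07-15")
_time_range("week", article_collection)    # -> ("2025-07-09", "2025-07-15")   7-day window
_time_range("month", article_collection)   # -> ("2025-06-16", "2025-07-15")   30-day window
_time_range("all", article_collection)     # -> ("0001-01-01", "2025-07-15")   unrecognised filter = all time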
app/database/mongodb.py CHANGED
@@ -4,4 +4,7 @@ from pymongo import MongoClient
 
 MongodbClient = MongoClient(os.getenv('MONGODB_URI'))
 FinFastMongodbClient = MongoClient(os.getenv("MONGODB_FINFAST_URI"))
+article_collection = FinFastMongodbClient["FinFAST_China"]["Article"]
 category_collection = FinFastMongodbClient["FinFAST_China"]["Category"]
+entity_collection = FinFastMongodbClient["FinFAST_China"]["Entity"]
+
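Editor's note: the summary utilities query these new collections with string date ranges, which works only because publishDate is stored as a "YYYY-MM-DD" string (see _time_range and _get_latest_publish_date_from_collection in utils.py). A hypothetical direct query, with illustrative dates, looks like this:

# Illustrative query sketch (not part of this commit): lexicographic comparison of
# "YYYY-MM-DD" strings behaves like a date comparison.
recent_scored = article_collection.find({
    "publishDate": {"$gte": "2025-07-09", "$lte": "2025-07-15"},
    "sentimentScore": {"$exists": True},
})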
app/requirements.txt CHANGED
@@ -1,19 +1,26 @@
 APScheduler==3.11.0
 asgiref==3.9.1
 blinker==1.9.0
+certifi==2025.1.31
+charset-normalizer==3.4.1
 click==8.2.1
 colorama==0.4.6
 dnspython==2.7.0
 Flask==3.1.1
 Flask-APScheduler==1.13.1
 h11==0.16.0
+idna==3.10
 itsdangerous==2.2.0
 Jinja2==3.1.6
 MarkupSafe==3.0.2
 pymongo==3.12.0
 python-dateutil==2.9.0.post0
+pytz==2025.2
+requests==2.32.3
 six==1.17.0
 tzdata==2025.2
 tzlocal==5.3.1
 uvicorn==0.35.0
 Werkzeug==3.1.3
+bidict==0.23.1
+boto3==1.38.43
app/routes/__init__.py CHANGED
@@ -2,3 +2,4 @@
 from flask import Blueprint
 
 category_bp = Blueprint("category", __name__)
+summary_bp = Blueprint('summary', __name__)
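Editor's note: the new summary_bp still has to be registered inside create_app(), whose body is cut off in the app/app.py hunk above. The snippet below is only a sketch that reuses the imports shown at the top of app/app.py; the url_prefix values and the scheduler start-up are assumptions, not part of this commit.

# Hypothetical wiring sketch for create_app() in app/app.py (not shown in full by this diff).
def create_app():
    app = Flask(__name__)
    app.config.from_object(Config)        # picks up JOBS and SCHEDULER_API_ENABLED

    scheduler = APScheduler()
    scheduler.init_app(app)
    scheduler.start()

    app.register_blueprint(category_bp, url_prefix="/category")  # assumed prefix
    app.register_blueprint(summary_bp, url_prefix="/summary")    # assumed prefix
    return app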
app/routes/summary.py ADDED
@@ -0,0 +1,69 @@
+"""This module defines the /summary route for the Flask application."""
+
+import importlib
+from flask import jsonify
+from controllers.summary import get_summary_data
+from . import summary_bp
+
+@summary_bp.route('', methods=['GET'])
+def get_summary():
+    """
+    Generate a summary dashboard with content flow, entity analysis, and sentiment analysis data.
+
+    This endpoint provides a complete summary overview, including:
+    - Content Flow Tracker: Article counts by source and category
+    - Entity and Sentiment Analysis: Top entities by type with mentions, plus sentiment scores by category and entity
+
+    All data is returned together, divided into three time periods: today, week, and month.
+
+    Returns:
+        JSONResponse: A JSON response containing the complete summary dashboard data:
+        {
+            "content": {"today": {...}, "week": {...}, "month": {...}},
+            "entity": {"today": {...}, "week": {...}, "month": {...}},
+            "sentiment": {"today": {...}, "week": {...}, "month": {...}, "entities": {...}}}
+    """
+    summary_data = get_summary_data()
+    return jsonify(summary_data)
+
+
+@summary_bp.route('/<string:module>/<string:chart_id>', methods=['GET'])
+def get_summary_chart(module, chart_id):
+    """
+    Handles GET requests to the summary route with a specific module and chart ID.
+
+    Args:
+        module (str): The module identifier (content, entity, or sentiment).
+        chart_id (str): The chart identifier (today, weekly, monthly).
+
+    Returns:
+        tuple: The result of the chart's process function and HTTP status code 200.
+
+    Raises:
+        ImportError: If the specified chart module cannot be imported.
+        AttributeError: If the imported module does not have a 'process' function.
+
+    Endpoint:
+        GET /<module>/<chart_id>
+    """
+    result = importlib.import_module(f"controllers.summary.{module}.{chart_id}").process()
+    return jsonify(result), 200
+
+
+@summary_bp.route('/<string:module>', methods=['GET'])
+def get_summary_module(module):
+    """
+    Handles GET requests to the summary route for a specific module.
+    Triggers the process for each chart under this module concurrently.
+
+    Args:
+        module (str): The module identifier (content, entity, or sentiment).
+
+    Returns:
+        tuple: A JSON list of the module's chart results ([chart1, chart2, ...]) and HTTP status code 200.
+
+    Raises:
+        ImportError: If the specified module cannot be imported.
+    """
+    result = importlib.import_module("controllers.summary").process(module)
+    return jsonify(result), 200
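Editor's note: assuming summary_bp is mounted under /summary, as its module docstring suggests, the three endpoints above would be exercised roughly as follows; the paths are based on that assumed prefix and the snippet is not part of this commit.

# Hypothetical request sketch using Flask's test client.
app = create_app()
client = app.test_client()

full_dashboard = client.get("/summary").get_json()                 # content + entity + sentiment blocks
weekly_entities = client.get("/summary/entity/weekly").get_json()  # single chart: entity/weekly.py
content_charts = client.get("/summary/content").get_json()         # all content charts, run concurrently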