Merge branch 'main' of https://github.com/oxbridge-econ/finfast-summary-backend into dev

Files changed:
- .github/workflows/check-size.yml +1 -1
- app/controllers/lda.py +5 -11
- app/controllers/summary/utils.py +133 -9
- app/mapping.json +70 -0
- app/routes/lda.py +3 -0
.github/workflows/check-size.yml
CHANGED

@@ -8,7 +8,7 @@ on: # or directly `on: [push]` to run the action on every push

 jobs:
   sync-to-hub:
-    runs-on:
+    runs-on: self-hosted
     steps:
       - name: Check large files
        uses: ActionsDesk/[email protected]
app/controllers/lda.py
CHANGED

@@ -645,18 +645,12 @@ class HeatedKeywordsAnalyzer: # pylint: disable=too-many-instance-attributes
     """
     Dynamically determines the optimal number of topics for a gensim model.
     """
-    if documents_count <
-
-
-
-        # With a moderate number of documents, we can explore a slightly larger range.
-        # The lower limit is now increased to 4.
-        topic_range = range(4, min(8, documents_count // 3))
+    if documents_count < 100:
+        topic_range = range(6, min(12, documents_count // 8))
+    elif documents_count < 300:
+        topic_range = range(12, min(20, documents_count // 10))
     else:
-
-        # The upper limit is now the smaller of 25 or a fifth of the document count,
-        # allowing it to scale for hundreds of documents.
-        topic_range = range(8, min(25, documents_count // 5))
+        topic_range = range(20, min(35, documents_count // 25))

     if len(topic_range) < 2:
         return max(2, min(3, documents_count))
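The new tiering is worth sanity-checking at its boundaries, since each `range` can come out empty and trip the existing `len(topic_range) < 2` fallback. A minimal, self-contained sketch (the function name and driver loop are illustrative; the coherence search that consumes `topic_range` is omitted):

```python
# Tiering mirrored from the diff above; not an import of the real module.
def pick_topic_range(documents_count: int) -> range:
    if documents_count < 100:
        return range(6, min(12, documents_count // 8))
    if documents_count < 300:
        return range(12, min(20, documents_count // 10))
    return range(20, min(35, documents_count // 25))

for n in (50, 150, 400, 1000):
    r = pick_topic_range(n)
    if len(r) < 2:  # mirrors the guard in the real function
        print(n, "->", max(2, min(3, n)), "(fallback)")
    else:
        print(n, "->", list(r))
```

By the arithmetic, counts below 64, between 100 and 139, or between 300 and 549 all yield fewer than two candidate topic counts, so they drop to the 2–3 topic fallback; the third tier only starts searching at 550 documents.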
app/controllers/summary/utils.py
CHANGED

@@ -2,8 +2,9 @@
 This module contains utility functions for both Content Flow Tracker and Entity Analysis,
 extracted and merged from the previous content/utils.py and entity/utils.py files.
 """
+import json
 from datetime import datetime, timedelta
-from typing import Dict, Any
+from typing import Dict, Any, List
 from collections import defaultdict

 from models.database import article_collection, entity_collection # pylint: disable=import-error

@@ -13,21 +14,112 @@ ENTITY_TYPE_FULL_NAMES = {
     "GPE": "Geopolitical Entities (Countries/Cities)",
     "LOC": "Locations (Non-political)",
     "ORG": "Organizations",
-    "PER": "People",
     "PERSON": "People",
     "PROD": "Products",
     "PRODUCT": "Products",
     "PRODCAT": "Product Categories",
     "PRODUCT_CATEGORY": "Product Categories",
-    "
+    "COMPANY": "Companies",
+    "FINANCIAL_ASSET": "Financial Assets",
+    "ECONOMIC_INDICATOR": "Economic Indicators",
     "EVENT": "Events",
     "LANGUAGE": "Languages",
     "NORP": "Nationalities/Religious/Political Groups",
     "LAW": "Laws/Legal Documents",
     "FAC": "Facilities/Landmarks",
-    "
+    "INDUSTRY": "Industries",
 }

+# Allowed entity types for analysis
+ALLOWED_ENTITY_TYPES = {
+    "GPE", "ORG", "PERSON", "COMPANY",
+    "FINANCIAL_ASSET", "ECONOMIC_INDICATOR", "INDUSTRY"
+}
+
+# Load entity normalization mapping
+def _load_entity_mapping() -> Dict[str, str]:
+    """Load entity normalization mapping from JSON file."""
+    try:
+        with open("mapping.json", 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        return {}
+
+
+def normalize_entity_name(entity_name: str) -> str:
+    """
+    Normalize entity names using the mapping file.
+
+    Parameters
+    ----------
+    entity_name : str
+        The original entity name to normalize.
+
+    Returns
+    -------
+    str
+        The normalized entity name, or the original if no mapping is found.
+    """
+    if not entity_name:
+        return entity_name
+
+    # Convert to string and clean
+    normalized = str(entity_name).strip()
+
+    # Apply basic replacements
+    normalized = normalized.replace("U.S.", "US")
+    normalized = normalized.replace("consumer price index", "CPI")
+    normalized = normalized.replace("Gross Domestic Product", "GDP")
+
+    # Load and apply mapping
+    mapping = _load_entity_mapping()
+    return mapping.get(normalized, normalized)
+
+
+def aggregate_entities(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Aggregate duplicate entities by summing their occurrence counts.
+
+    Parameters
+    ----------
+    entities : List[Dict[str, Any]]
+        A list of entity dictionaries where each dictionary must contain:
+        - 'entity' (str): The name of the entity
+        - 'type' (str): The type/category of the entity
+        - 'occurrence' (int): The count of occurrences for this entity
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        A list of unique entity dictionaries with aggregated occurrence counts,
+        where each dictionary contains:
+        - 'entity' (str): The normalized entity name
+        - 'type' (str): The entity type (unchanged)
+        - 'occurrence' (int): The summed occurrence count across all duplicates
+    """
+    aggregated = {}
+
+    for entity in entities:
+        # Normalize entity name
+        normalized_name = normalize_entity_name(entity['entity'])
+        key = (normalized_name, entity['type'])
+
+        if key in aggregated:
+            aggregated[key] += entity['occurrence']
+        else:
+            aggregated[key] = entity['occurrence']
+
+    # Convert back to list of dictionaries
+    result = []
+    for (entity_name, entity_type), count in aggregated.items():
+        result.append({
+            'entity': entity_name,
+            'type': entity_type,
+            'occurrence': count
+        })
+
+    return result
+

 def _build_sentiment_lookup(sentiment_results: list) -> Dict:
     """Build sentiment lookup dictionary from sentiment aggregation results."""

@@ -198,12 +290,44 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
     mentions_results = list(entity_collection.aggregate(mentions_pipeline))
     sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))

-
-
-
+    # Filter to only include allowed entity types
+    mentions_results = [r for r in mentions_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+    sentiment_results = [r for r in sentiment_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+
+    # Convert mentions results to the format expected by aggregate_entities
+    entities_for_aggregation = []
+    for result in mentions_results:
+        entities_for_aggregation.append({
+            'entity': result['_id']['entity'],
+            'type': result['_id']['type'],
+            'occurrence': result['mentions']
+        })
+
+    # Normalize and aggregate entities
+    aggregated_entities = aggregate_entities(entities_for_aggregation)
+
+    # Rebuild mentions results with normalized names
+    normalized_mentions_results = []
+    for agg_entity in aggregated_entities:
+        normalized_mentions_results.append({
+            '_id': {'entity': agg_entity['entity'], 'type': agg_entity['type']},
+            'mentions': agg_entity['occurrence']
+        })
+
+    # Rebuild sentiment lookup with normalized names
+    normalized_sentiment_lookup = {}
+    for result in sentiment_results:
+        normalized_name = normalize_entity_name(result["_id"]["entity"])
+        key = (normalized_name, result["_id"]["type"])
+        if key in normalized_sentiment_lookup:
+            # Average multiple sentiment scores for the same normalized entity
+            normalized_sentiment_lookup[key] = (
+                normalized_sentiment_lookup[key] + result["avgSentiment"]) / 2
+        else:
+            normalized_sentiment_lookup[key] = result["avgSentiment"]

     entity_types: Dict[str, Any] = {}
-    for mentions_result in mentions_results:
+    for mentions_result in normalized_mentions_results:
         entity_type = mentions_result["_id"]["type"]

         if entity_type not in entity_types:

@@ -213,7 +337,7 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
            }

         entity_types[entity_type]["entities"].append(
-            _process_entity_with_sentiment(mentions_result, sentiment_lookup)
+            _process_entity_with_sentiment(mentions_result, normalized_sentiment_lookup)
         )

         # Keep only the top 10 per type
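A quick self-contained check of what normalization plus aggregation does to duplicate mentions. The mapping here is a small stub standing in for app/mapping.json, and the function bodies mirror the diff rather than import the module:

```python
from typing import Any, Dict, List

# Stub of app/mapping.json; the committed file maps many more aliases.
MAPPING = {"Trump": "Donald Trump", "President Trump": "Donald Trump"}

def normalize_entity_name(entity_name: str) -> str:
    normalized = str(entity_name).strip().replace("U.S.", "US")
    return MAPPING.get(normalized, normalized)

def aggregate_entities(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    aggregated: Dict[tuple, int] = {}
    for entity in entities:
        key = (normalize_entity_name(entity["entity"]), entity["type"])
        aggregated[key] = aggregated.get(key, 0) + entity["occurrence"]
    return [{"entity": n, "type": t, "occurrence": c}
            for (n, t), c in aggregated.items()]

rows = [
    {"entity": "Trump", "type": "PERSON", "occurrence": 4},
    {"entity": "President Trump", "type": "PERSON", "occurrence": 2},
    {"entity": "Donald Trump", "type": "PERSON", "occurrence": 1},
]
print(aggregate_entities(rows))
# [{'entity': 'Donald Trump', 'type': 'PERSON', 'occurrence': 7}]
```

One behavior to be aware of in the sentiment merge: when several raw names collapse to one normalized entity, the diff averages pairwise with `(prev + new) / 2`, which weights later results more heavily than an equal-weight mean (with three scores, the first contributes only a quarter). If equal weighting is intended, summing the scores and dividing by their count would be the usual fix.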
app/mapping.json
ADDED

@@ -0,0 +1,70 @@
+{
+    "President Biden": "Joe Biden",
+    "Biden": "Joe Biden",
+    "President Joe Biden": "Joe Biden",
+    "Joe R Biden": "Joe Biden",
+    "President Trump": "Donald Trump",
+    "Trump": "Donald Trump",
+    "President Donald Trump": "Donald Trump",
+    "Donald J Trump": "Donald Trump",
+    "President Obama": "Barack Obama",
+    "Obama": "Barack Obama",
+    "President Barack Obama": "Barack Obama",
+    "President Macron": "Emmanuel Macron",
+    "Macron": "Emmanuel Macron",
+    "President Emmanuel Macron": "Emmanuel Macron",
+    "French President Emmanuel Macron": "Emmanuel Macron",
+    "Chancellor Merkel": "Angela Merkel",
+    "Merkel": "Angela Merkel",
+    "German Chancellor Angela Merkel": "Angela Merkel",
+    "Prime Minister Sunak": "Rishi Sunak",
+    "Sunak": "Rishi Sunak",
+    "Rishi Sunak MP": "Rishi Sunak",
+    "PM Sunak": "Rishi Sunak",
+    "Prime Minister Trudeau": "Justin Trudeau",
+    "Trudeau": "Justin Trudeau",
+    "Justin Trudeau PM": "Justin Trudeau",
+    "Chair Powell": "Jerome Powell",
+    "Powell": "Jerome Powell",
+    "Jerome H Powell": "Jerome Powell",
+    "Fed Chair Powell": "Jerome Powell",
+    "Federal Reserve Chair Powell": "Jerome Powell",
+    "Chair Yellen": "Janet Yellen",
+    "Yellen": "Janet Yellen",
+    "Secretary Yellen": "Janet Yellen",
+    "Treasury Secretary Yellen": "Janet Yellen",
+    "Janet L Yellen": "Janet Yellen",
+    "Lagarde": "Christine Lagarde",
+    "President Lagarde": "Christine Lagarde",
+    "Christine Lagarde ECB": "Christine Lagarde",
+    "ECB President Lagarde": "Christine Lagarde",
+    "Elon": "Elon Musk",
+    "Musk": "Elon Musk",
+    "Elon R Musk": "Elon Musk",
+    "Buffett": "Warren Buffett",
+    "yen": "Japanese yen",
+    "JPY": "Japanese yen",
+    "dollar": "US dollar",
+    "USD": "US dollar",
+    "United States dollar": "US dollar",
+    "EUR": "Euro",
+    "Pound": "British Pound",
+    "Sterling": "British Pound",
+    "GBP": "British Pound",
+    "yuan": "Chinese Yuan",
+    "Renminbi": "Chinese Yuan",
+    "CNY": "Chinese Yuan",
+    "RMB": "Chinese Yuan",
+    "BTC": "Bitcoin",
+    "International Monetary Fund": "IMF",
+    "IMFC": "IMF",
+    "IMF Executive Board": "IMF",
+    "Executive Board of the International Monetary Fund": "IMF",
+    "US central bank": "Federal Reserve",
+    "State Council": "State Council of China",
+    "FED": "Federal Reserve",
+    "European Union": "EU",
+    "banks": "Banks",
+    "CPI of China": "Chinese CPI",
+    "PPI of China": "Chinese PPI"
+}
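Note that `_load_entity_mapping` opens `"mapping.json"` relative to the process's working directory, while the file is committed at app/mapping.json; if the server isn't launched from app/, the `FileNotFoundError` branch silently disables mapping-based normalization (only the hard-coded U.S./CPI/GDP replacements still run). A path-independent variant, assuming the layout shown in this diff (utils.py under app/controllers/summary/, the mapping under app/):

```python
import json
from pathlib import Path
from typing import Dict

def _load_entity_mapping() -> Dict[str, str]:
    """Resolve app/mapping.json relative to this file instead of the CWD."""
    # parents[2] walks summary/ -> controllers/ -> app/ (assumed layout).
    mapping_path = Path(__file__).resolve().parents[2] / "mapping.json"
    try:
        return json.loads(mapping_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
```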
app/routes/lda.py
CHANGED

@@ -30,6 +30,9 @@ async def get_lda_results(filter_type: str):
             'error': 'Invalid filter_type',
             'message': f'filter_type must be one of: {valid_filters}'
         }, status_code=400)
+
+    if filter_type == 'today':
+        filter_type = 'daily'

     # Retrieve from MongoDB
     document = lda_collection.find_one({'_id': filter_type})