Muhammad Abdur Rahman Saad committed on
Commit 9b568c5 · 2 Parent(s): dbe1923 9141a95

Merge branch 'main' of https://github.com/oxbridge-econ/finfast-summary-backend into dev

.github/workflows/check-size.yml CHANGED
@@ -8,7 +8,7 @@ on: # or directly `on: [push]` to run the action on every push on

 jobs:
   sync-to-hub:
-    runs-on: ubuntu-latest
+    runs-on: self-hosted
     steps:
       - name: Check large files
         uses: ActionsDesk/[email protected]
app/controllers/lda.py CHANGED
@@ -645,18 +645,12 @@ class HeatedKeywordsAnalyzer: # pylint: disable=too-many-instance-attributes
         """
         Dynamically determines the optimal number of topics for a gensim model.
         """
-        if documents_count < 20:
-            # For very few documents, keep the range small and conservative.
-            topic_range = range(2, min(5, documents_count))
-        elif documents_count < 50:
-            # With a moderate number of documents, we can explore a slightly larger range.
-            # The lower limit is now increased to 4.
-            topic_range = range(4, min(8, documents_count // 3))
+        if documents_count < 100:
+            topic_range = range(6, min(12, documents_count // 8))
+        elif documents_count < 300:
+            topic_range = range(12, min(20, documents_count // 10))
         else:
-            # For 50 or more documents, start with at least 8 topics and have a higher upper bound.
-            # The upper limit is now the smaller of 25 or a fifth of the document count,
-            # allowing it to scale for hundreds of documents.
-            topic_range = range(8, min(25, documents_count // 5))
+            topic_range = range(20, min(35, documents_count // 25))

         if len(topic_range) < 2:
             return max(2, min(3, documents_count))
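
Editor's note: the new thresholds widen the candidate search as the corpus grows. A minimal sketch (not part of the commit; the helper name is illustrative) of the new branch logic in isolation and the candidate ranges it yields for a few document counts:

# Illustrative helper mirroring only the new branch logic from the diff above.
def topic_range_for(documents_count: int) -> range:
    if documents_count < 100:
        return range(6, min(12, documents_count // 8))
    if documents_count < 300:
        return range(12, min(20, documents_count // 10))
    return range(20, min(35, documents_count // 25))

for n in (80, 150, 1000):
    print(n, list(topic_range_for(n)))
# 80   -> candidates 6..9   (80 // 8 = 10 caps the upper bound)
# 150  -> candidates 12..14 (150 // 10 = 15 caps the upper bound)
# 1000 -> candidates 20..34 (the hard cap of 35 applies, since 1000 // 25 = 40)

As the unchanged context lines show, a computed range with fewer than two candidates still falls back to max(2, min(3, documents_count)).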
app/controllers/summary/utils.py CHANGED
@@ -2,8 +2,9 @@
 This module contains utility functions for both Content Flow Tracker and Entity Analysis,
 extracted and merged from the previous content/utils.py and entity/utils.py files.
 """
+import json
 from datetime import datetime, timedelta
-from typing import Dict, Any
+from typing import Dict, Any, List
 from collections import defaultdict

 from models.database import article_collection, entity_collection  # pylint: disable=import-error
@@ -13,21 +14,112 @@ ENTITY_TYPE_FULL_NAMES = {
     "GPE": "Geopolitical Entities (Countries/Cities)",
     "LOC": "Locations (Non-political)",
     "ORG": "Organizations",
-    "PER": "People",
     "PERSON": "People",
     "PROD": "Products",
     "PRODUCT": "Products",
     "PRODCAT": "Product Categories",
     "PRODUCT_CATEGORY": "Product Categories",
-    "COM": "Companies",
+    "COMPANY": "Companies",
+    "FINANCIAL_ASSET": "Financial Assets",
+    "ECONOMIC_INDICATOR": "Economic Indicators",
     "EVENT": "Events",
     "LANGUAGE": "Languages",
     "NORP": "Nationalities/Religious/Political Groups",
     "LAW": "Laws/Legal Documents",
     "FAC": "Facilities/Landmarks",
-    "INS": "Industry Institutions",
+    "INDUSTRY": "Industries",
 }

+# Allowed entity types for analysis
+ALLOWED_ENTITY_TYPES = {
+    "GPE", "ORG", "PERSON", "COMPANY",
+    "FINANCIAL_ASSET", "ECONOMIC_INDICATOR", "INDUSTRY"
+}
+
+
+# Load entity normalization mapping
+def _load_entity_mapping() -> Dict[str, str]:
+    """Load entity normalization mapping from JSON file."""
+    try:
+        with open("mapping.json", 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        return {}
+
+
+def normalize_entity_name(entity_name: str) -> str:
+    """
+    Normalize entity names using the mapping file.
+
+    Parameters
+    ----------
+    entity_name : str
+        The original entity name to normalize.
+
+    Returns
+    -------
+    str
+        The normalized entity name, or original if no mapping found.
+    """
+    if not entity_name:
+        return entity_name
+
+    # Convert to string and clean
+    normalized = str(entity_name).strip()
+
+    # Apply basic replacements
+    normalized = normalized.replace("U.S.", "US")
+    normalized = normalized.replace("consumer price index", "CPI")
+    normalized = normalized.replace("Gross Domestic Product", "GDP")
+
+    # Load and apply mapping
+    mapping = _load_entity_mapping()
+    return mapping.get(normalized, normalized)
+
+
+def aggregate_entities(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Aggregate duplicate entities by summing their occurrence counts.
+
+    Parameters
+    ----------
+    entities : List[Dict[str, Any]]
+        A list of entity dictionaries where each dictionary must contain:
+        - 'entity' (str): The name of the entity
+        - 'type' (str): The type/category of the entity
+        - 'occurrence' (int): The count of occurrences for this entity
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        A list of unique entity dictionaries with aggregated occurrence counts,
+        where each dictionary contains:
+        - 'entity' (str): The normalized entity name
+        - 'type' (str): The entity type (unchanged)
+        - 'occurrence' (int): The summed occurrence count across all duplicates
+    """
+    aggregated = {}
+
+    for entity in entities:
+        # Normalize entity name
+        normalized_name = normalize_entity_name(entity['entity'])
+        key = (normalized_name, entity['type'])
+
+        if key in aggregated:
+            aggregated[key] += entity['occurrence']
+        else:
+            aggregated[key] = entity['occurrence']
+
+    # Convert back to list of dictionaries
+    result = []
+    for (entity_name, entity_type), count in aggregated.items():
+        result.append({
+            'entity': entity_name,
+            'type': entity_type,
+            'occurrence': count
+        })
+
+    return result
+

 def _build_sentiment_lookup(sentiment_results: list) -> Dict:
     """Build sentiment lookup dictionary from sentiment aggregation results."""
@@ -198,12 +290,44 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
     mentions_results = list(entity_collection.aggregate(mentions_pipeline))
     sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))

-    sentiment_lookup = _build_sentiment_lookup(sentiment_results)
-
-
+    # Filter to only include allowed entity types
+    mentions_results = [r for r in mentions_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+    sentiment_results = [r for r in sentiment_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+
+    # Convert mentions results to format expected by aggregate_entities
+    entities_for_aggregation = []
+    for result in mentions_results:
+        entities_for_aggregation.append({
+            'entity': result['_id']['entity'],
+            'type': result['_id']['type'],
+            'occurrence': result['mentions']
+        })
+
+    # Normalize and aggregate entities
+    aggregated_entities = aggregate_entities(entities_for_aggregation)
+
+    # Rebuild mentions results with normalized names
+    normalized_mentions_results = []
+    for agg_entity in aggregated_entities:
+        normalized_mentions_results.append({
+            '_id': {'entity': agg_entity['entity'], 'type': agg_entity['type']},
+            'mentions': agg_entity['occurrence']
+        })
+
+    # Rebuild sentiment lookup with normalized names
+    normalized_sentiment_lookup = {}
+    for result in sentiment_results:
+        normalized_name = normalize_entity_name(result["_id"]["entity"])
+        key = (normalized_name, result["_id"]["type"])
+        if key in normalized_sentiment_lookup:
+            # Average multiple sentiment scores for the same normalized entity
+            normalized_sentiment_lookup[key] = (
+                normalized_sentiment_lookup[key] + result["avgSentiment"]) / 2
+        else:
+            normalized_sentiment_lookup[key] = result["avgSentiment"]

     entity_types: Dict[str, Any] = {}
-    for mentions_result in mentions_results:
+    for mentions_result in normalized_mentions_results:
         entity_type = mentions_result["_id"]["type"]

         if entity_type not in entity_types:
@@ -213,7 +337,7 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
             }

         entity_types[entity_type]["entities"].append(
-            _process_entity_with_sentiment(mentions_result, sentiment_lookup)
+            _process_entity_with_sentiment(mentions_result, normalized_sentiment_lookup)
         )

         # Keep only the top 10 per type
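
Editor's note: a rough usage sketch of the new normalization and aggregation helpers. The input records and the import path are assumptions for illustration; the function behaviour follows the diff above.

# Hedged sketch, not committed code. Import path assumed; adjust to the project's packaging.
from controllers.summary.utils import aggregate_entities

# Invented input resembling the reshaped mentions results.
raw_entities = [
    {'entity': 'Fed Chair Powell', 'type': 'PERSON', 'occurrence': 4},
    {'entity': 'Jerome Powell', 'type': 'PERSON', 'occurrence': 7},
    {'entity': 'USD', 'type': 'FINANCIAL_ASSET', 'occurrence': 3},
    {'entity': 'US dollar', 'type': 'FINANCIAL_ASSET', 'occurrence': 5},
]

# Aliases collapse to one (normalized name, type) key and their counts are summed.
print(aggregate_entities(raw_entities))
# Expected, assuming app/mapping.json is found at runtime:
# [{'entity': 'Jerome Powell', 'type': 'PERSON', 'occurrence': 11},
#  {'entity': 'US dollar', 'type': 'FINANCIAL_ASSET', 'occurrence': 8}]

Note that _load_entity_mapping opens "mapping.json" relative to the working directory, so the sketch assumes the process is started from the app/ directory.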
app/mapping.json ADDED
@@ -0,0 +1,70 @@
+{
+    "President Biden": "Joe Biden",
+    "Biden": "Joe Biden",
+    "President Joe Biden": "Joe Biden",
+    "Joe R Biden": "Joe Biden",
+    "President Trump": "Donald Trump",
+    "Trump": "Donald Trump",
+    "President Donald Trump": "Donald Trump",
+    "Donald J Trump": "Donald Trump",
+    "President Obama": "Barack Obama",
+    "Obama": "Barack Obama",
+    "President Barack Obama": "Barack Obama",
+    "President Macron": "Emmanuel Macron",
+    "Macron": "Emmanuel Macron",
+    "President Emmanuel Macron": "Emmanuel Macron",
+    "French President Emmanuel Macron": "Emmanuel Macron",
+    "Chancellor Merkel": "Angela Merkel",
+    "Merkel": "Angela Merkel",
+    "German Chancellor Angela Merkel": "Angela Merkel",
+    "Prime Minister Sunak": "Rishi Sunak",
+    "Sunak": "Rishi Sunak",
+    "Rishi Sunak MP": "Rishi Sunak",
+    "PM Sunak": "Rishi Sunak",
+    "Prime Minister Trudeau": "Justin Trudeau",
+    "Trudeau": "Justin Trudeau",
+    "Justin Trudeau PM": "Justin Trudeau",
+    "Chair Powell": "Jerome Powell",
+    "Powell": "Jerome Powell",
+    "Jerome H Powell": "Jerome Powell",
+    "Fed Chair Powell": "Jerome Powell",
+    "Federal Reserve Chair Powell": "Jerome Powell",
+    "Chair Yellen": "Janet Yellen",
+    "Yellen": "Janet Yellen",
+    "Secretary Yellen": "Janet Yellen",
+    "Treasury Secretary Yellen": "Janet Yellen",
+    "Janet L Yellen": "Janet Yellen",
+    "Lagarde": "Christine Lagarde",
+    "President Lagarde": "Christine Lagarde",
+    "Christine Lagarde ECB": "Christine Lagarde",
+    "ECB President Lagarde": "Christine Lagarde",
+    "Elon": "Elon Musk",
+    "Musk": "Elon Musk",
+    "Elon R Musk": "Elon Musk",
+    "Buffett": "Warren Buffett",
+    "yen": "Japanese yen",
+    "JPY": "Japanese yen",
+    "dollar": "US dollar",
+    "USD": "US dollar",
+    "United States dollar": "US dollar",
+    "EUR": "Euro",
+    "Pound": "British Pound",
+    "Sterling": "British Pound",
+    "GBP": "British Pound",
+    "yuan": "Chinese Yuan",
+    "Renminbi": "Chinese Yuan",
+    "CNY": "Chinese Yuan",
+    "RMB": "Chinese Yuan",
+    "BTC": "Bitcoin",
+    "International Monetary Fund": "IMF",
+    "IMFC": "IMF",
+    "IMF Executive Board": "IMF",
+    "Executive Board of the International Monetary Fund": "IMF",
+    "US central bank": "Federal Reserve",
+    "State Council": "State Council of China",
+    "FED": "Federal Reserve",
+    "European Union": "EU",
+    "banks": "Banks",
+    "CPI of China": "Chinese CPI",
+    "PPI of China": "Chinese PPI"
+}
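
Editor's note: normalize_entity_name applies its hard-coded substring replacements first and only then does an exact, case-sensitive lookup against these keys, so a mention collapses only if the cleaned string matches a key verbatim. A minimal sketch (not part of the commit) of that order:

# Sketch of the lookup order; the two-entry mapping here is an excerpt of app/mapping.json.
mapping = {"United States dollar": "US dollar", "FED": "Federal Reserve"}

def normalize(name: str) -> str:
    cleaned = name.strip().replace("U.S.", "US")  # substring fixes run before the lookup
    return mapping.get(cleaned, cleaned)

print(normalize("United States dollar"))  # "US dollar" (exact key match)
print(normalize("U.S. dollar"))           # "US dollar" (via the substring replacement only)
print(normalize("Fed"))                   # "Fed" (no key matches; "FED" differs in case)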
app/routes/lda.py CHANGED
@@ -30,6 +30,9 @@ async def get_lda_results(filter_type: str):
             'error': 'Invalid filter_type',
             'message': f'filter_type must be one of: {valid_filters}'
         }, status_code=400)
+
+    if filter_type == 'today':
+        filter_type = 'daily'

     # Retrieve from MongoDB
     document = lda_collection.find_one({'_id': filter_type})
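
Editor's note: a minimal sketch of the new alias (illustrative only; the helper below is not in the commit). Validation runs first, then 'today' is rewritten to 'daily', so the MongoDB lookup reads the document stored under _id='daily'.

# Hedged sketch of the alias step in isolation; not the committed handler.
def resolve_lda_document_id(filter_type: str) -> str:
    if filter_type == 'today':
        return 'daily'
    return filter_type

assert resolve_lda_document_id('today') == 'daily'
assert resolve_lda_document_id('daily') == 'daily'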