Merge branch 'main' of https://github.com/oxbridge-econ/finfast-summary-backend into dev

Files changed:
- .github/workflows/check-size.yml +1 -1
- app/controllers/lda.py +5 -11
- app/controllers/summary/utils.py +133 -9
- app/mapping.json +70 -0
- app/routes/lda.py +3 -0
.github/workflows/check-size.yml
CHANGED

@@ -8,7 +8,7 @@ on: # or directly `on: [push]` to run the action on every push

 jobs:
   sync-to-hub:
-    runs-on:
+    runs-on: self-hosted
     steps:
       - name: Check large files
        uses: ActionsDesk/[email protected]
app/controllers/lda.py
CHANGED

@@ -645,18 +645,12 @@ class HeatedKeywordsAnalyzer: # pylint: disable=too-many-instance-attributes
     """
     Dynamically determines the optimal number of topics for a gensim model.
     """
-    if documents_count <
-
-
-
-        # With a moderate number of documents, we can explore a slightly larger range.
-        # The lower limit is now increased to 4.
-        topic_range = range(4, min(8, documents_count // 3))
+    if documents_count < 100:
+        topic_range = range(6, min(12, documents_count // 8))
+    elif documents_count < 300:
+        topic_range = range(12, min(20, documents_count // 10))
     else:
-
-        # The upper limit is now the smaller of 25 or a fifth of the document count,
-        # allowing it to scale for hundreds of documents.
-        topic_range = range(8, min(25, documents_count // 5))
+        topic_range = range(20, min(35, documents_count // 25))

     if len(topic_range) < 2:
         return max(2, min(3, documents_count))
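The new tiering is worth sanity-checking at its boundaries, since each `range` can come out empty and trip the existing `len(topic_range) < 2` fallback. A minimal, self-contained sketch (the function name and driver loop are illustrative; the coherence search that consumes `topic_range` is omitted):

```python
# Tiering mirrored from the diff above; not an import of the real module.
def pick_topic_range(documents_count: int) -> range:
    if documents_count < 100:
        return range(6, min(12, documents_count // 8))
    if documents_count < 300:
        return range(12, min(20, documents_count // 10))
    return range(20, min(35, documents_count // 25))

for n in (50, 150, 400, 1000):
    r = pick_topic_range(n)
    if len(r) < 2:  # mirrors the guard in the real function
        print(n, "->", max(2, min(3, n)), "(fallback)")
    else:
        print(n, "->", list(r))
```

By the arithmetic, counts below 64, between 100 and 139, or between 300 and 549 all yield fewer than two candidate topic counts, so they drop to the 2–3 topic fallback; the third tier only starts searching at 550 documents.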
app/controllers/summary/utils.py
CHANGED

@@ -2,8 +2,9 @@
 This module contains utility functions for both Content Flow Tracker and Entity Analysis,
 extracted and merged from the previous content/utils.py and entity/utils.py files.
 """
+import json
 from datetime import datetime, timedelta
-from typing import Dict, Any
+from typing import Dict, Any, List
 from collections import defaultdict

 from models.database import article_collection, entity_collection # pylint: disable=import-error

@@ -13,21 +14,112 @@ ENTITY_TYPE_FULL_NAMES = {
     "GPE": "Geopolitical Entities (Countries/Cities)",
     "LOC": "Locations (Non-political)",
     "ORG": "Organizations",
-    "PER": "People",
     "PERSON": "People",
     "PROD": "Products",
     "PRODUCT": "Products",
     "PRODCAT": "Product Categories",
     "PRODUCT_CATEGORY": "Product Categories",
-    "
+    "COMPANY": "Companies",
+    "FINANCIAL_ASSET": "Financial Assets",
+    "ECONOMIC_INDICATOR": "Economic Indicators",
     "EVENT": "Events",
     "LANGUAGE": "Languages",
     "NORP": "Nationalities/Religious/Political Groups",
     "LAW": "Laws/Legal Documents",
     "FAC": "Facilities/Landmarks",
-    "
+    "INDUSTRY": "Industries",
 }

+# Allowed entity types for analysis
+ALLOWED_ENTITY_TYPES = {
+    "GPE", "ORG", "PERSON", "COMPANY",
+    "FINANCIAL_ASSET", "ECONOMIC_INDICATOR", "INDUSTRY"
+}
+
+# Load entity normalization mapping
+def _load_entity_mapping() -> Dict[str, str]:
+    """Load entity normalization mapping from JSON file."""
+    try:
+        with open("mapping.json", 'r', encoding='utf-8') as f:
+            return json.load(f)
+    except FileNotFoundError:
+        return {}
+
+
+def normalize_entity_name(entity_name: str) -> str:
+    """
+    Normalize entity names using the mapping file.
+
+    Parameters
+    ----------
+    entity_name : str
+        The original entity name to normalize.
+
+    Returns
+    -------
+    str
+        The normalized entity name, or the original if no mapping is found.
+    """
+    if not entity_name:
+        return entity_name
+
+    # Convert to string and clean
+    normalized = str(entity_name).strip()
+
+    # Apply basic replacements
+    normalized = normalized.replace("U.S.", "US")
+    normalized = normalized.replace("consumer price index", "CPI")
+    normalized = normalized.replace("Gross Domestic Product", "GDP")
+
+    # Load and apply mapping
+    mapping = _load_entity_mapping()
+    return mapping.get(normalized, normalized)
+
+
+def aggregate_entities(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Aggregate duplicate entities by summing their occurrence counts.
+
+    Parameters
+    ----------
+    entities : List[Dict[str, Any]]
+        A list of entity dictionaries where each dictionary must contain:
+        - 'entity' (str): The name of the entity
+        - 'type' (str): The type/category of the entity
+        - 'occurrence' (int): The count of occurrences for this entity
+
+    Returns
+    -------
+    List[Dict[str, Any]]
+        A list of unique entity dictionaries with aggregated occurrence counts,
+        where each dictionary contains:
+        - 'entity' (str): The normalized entity name
+        - 'type' (str): The entity type (unchanged)
+        - 'occurrence' (int): The summed occurrence count across all duplicates
+    """
+    aggregated = {}
+
+    for entity in entities:
+        # Normalize entity name
+        normalized_name = normalize_entity_name(entity['entity'])
+        key = (normalized_name, entity['type'])
+
+        if key in aggregated:
+            aggregated[key] += entity['occurrence']
+        else:
+            aggregated[key] = entity['occurrence']
+
+    # Convert back to list of dictionaries
+    result = []
+    for (entity_name, entity_type), count in aggregated.items():
+        result.append({
+            'entity': entity_name,
+            'type': entity_type,
+            'occurrence': count
+        })
+
+    return result
+

 def _build_sentiment_lookup(sentiment_results: list) -> Dict:
     """Build sentiment lookup dictionary from sentiment aggregation results."""

@@ -198,12 +290,44 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
     mentions_results = list(entity_collection.aggregate(mentions_pipeline))
     sentiment_results = list(entity_collection.aggregate(sentiment_pipeline))

-
-
-
+    # Filter to only include allowed entity types
+    mentions_results = [r for r in mentions_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+    sentiment_results = [r for r in sentiment_results if r["_id"]["type"] in ALLOWED_ENTITY_TYPES]
+
+    # Convert mentions results to the format expected by aggregate_entities
+    entities_for_aggregation = []
+    for result in mentions_results:
+        entities_for_aggregation.append({
+            'entity': result['_id']['entity'],
+            'type': result['_id']['type'],
+            'occurrence': result['mentions']
+        })
+
+    # Normalize and aggregate entities
+    aggregated_entities = aggregate_entities(entities_for_aggregation)
+
+    # Rebuild mentions results with normalized names
+    normalized_mentions_results = []
+    for agg_entity in aggregated_entities:
+        normalized_mentions_results.append({
+            '_id': {'entity': agg_entity['entity'], 'type': agg_entity['type']},
+            'mentions': agg_entity['occurrence']
+        })
+
+    # Rebuild sentiment lookup with normalized names
+    normalized_sentiment_lookup = {}
+    for result in sentiment_results:
+        normalized_name = normalize_entity_name(result["_id"]["entity"])
+        key = (normalized_name, result["_id"]["type"])
+        if key in normalized_sentiment_lookup:
+            # Average multiple sentiment scores for the same normalized entity
+            normalized_sentiment_lookup[key] = (
+                normalized_sentiment_lookup[key] + result["avgSentiment"]) / 2
+        else:
+            normalized_sentiment_lookup[key] = result["avgSentiment"]

     entity_types: Dict[str, Any] = {}
-    for mentions_result in mentions_results:
+    for mentions_result in normalized_mentions_results:
         entity_type = mentions_result["_id"]["type"]

         if entity_type not in entity_types:

@@ -213,7 +337,7 @@ def get_entity_analysis_data(time_filter: str) -> Dict[str, Any]:
            }

         entity_types[entity_type]["entities"].append(
-            _process_entity_with_sentiment(mentions_result, sentiment_lookup)
+            _process_entity_with_sentiment(mentions_result, normalized_sentiment_lookup)
         )

         # Keep only the top 10 per type
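A quick self-contained check of what normalization plus aggregation does to duplicate mentions. The mapping here is a small stub standing in for app/mapping.json, and the function bodies mirror the diff rather than import the module:

```python
from typing import Any, Dict, List

# Stub of app/mapping.json; the committed file maps many more aliases.
MAPPING = {"Trump": "Donald Trump", "President Trump": "Donald Trump"}

def normalize_entity_name(entity_name: str) -> str:
    normalized = str(entity_name).strip().replace("U.S.", "US")
    return MAPPING.get(normalized, normalized)

def aggregate_entities(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    aggregated: Dict[tuple, int] = {}
    for entity in entities:
        key = (normalize_entity_name(entity["entity"]), entity["type"])
        aggregated[key] = aggregated.get(key, 0) + entity["occurrence"]
    return [{"entity": n, "type": t, "occurrence": c}
            for (n, t), c in aggregated.items()]

rows = [
    {"entity": "Trump", "type": "PERSON", "occurrence": 4},
    {"entity": "President Trump", "type": "PERSON", "occurrence": 2},
    {"entity": "Donald Trump", "type": "PERSON", "occurrence": 1},
]
print(aggregate_entities(rows))
# [{'entity': 'Donald Trump', 'type': 'PERSON', 'occurrence': 7}]
```

One behavior to be aware of in the sentiment merge: when several raw names collapse to one normalized entity, the diff averages pairwise with `(prev + new) / 2`, which weights later results more heavily than an equal-weight mean (with three scores, the first contributes only a quarter). If equal weighting is intended, summing the scores and dividing by their count would be the usual fix.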
app/mapping.json
ADDED

@@ -0,0 +1,70 @@
+{
+    "President Biden": "Joe Biden",
+    "Biden": "Joe Biden",
+    "President Joe Biden": "Joe Biden",
+    "Joe R Biden": "Joe Biden",
+    "President Trump": "Donald Trump",
+    "Trump": "Donald Trump",
+    "President Donald Trump": "Donald Trump",
+    "Donald J Trump": "Donald Trump",
+    "President Obama": "Barack Obama",
+    "Obama": "Barack Obama",
+    "President Barack Obama": "Barack Obama",
+    "President Macron": "Emmanuel Macron",
+    "Macron": "Emmanuel Macron",
+    "President Emmanuel Macron": "Emmanuel Macron",
+    "French President Emmanuel Macron": "Emmanuel Macron",
+    "Chancellor Merkel": "Angela Merkel",
+    "Merkel": "Angela Merkel",
+    "German Chancellor Angela Merkel": "Angela Merkel",
+    "Prime Minister Sunak": "Rishi Sunak",
+    "Sunak": "Rishi Sunak",
+    "Rishi Sunak MP": "Rishi Sunak",
+    "PM Sunak": "Rishi Sunak",
+    "Prime Minister Trudeau": "Justin Trudeau",
+    "Trudeau": "Justin Trudeau",
+    "Justin Trudeau PM": "Justin Trudeau",
+    "Chair Powell": "Jerome Powell",
+    "Powell": "Jerome Powell",
+    "Jerome H Powell": "Jerome Powell",
+    "Fed Chair Powell": "Jerome Powell",
+    "Federal Reserve Chair Powell": "Jerome Powell",
+    "Chair Yellen": "Janet Yellen",
+    "Yellen": "Janet Yellen",
+    "Secretary Yellen": "Janet Yellen",
+    "Treasury Secretary Yellen": "Janet Yellen",
+    "Janet L Yellen": "Janet Yellen",
+    "Lagarde": "Christine Lagarde",
+    "President Lagarde": "Christine Lagarde",
+    "Christine Lagarde ECB": "Christine Lagarde",
+    "ECB President Lagarde": "Christine Lagarde",
+    "Elon": "Elon Musk",
+    "Musk": "Elon Musk",
+    "Elon R Musk": "Elon Musk",
+    "Buffett": "Warren Buffett",
+    "yen": "Japanese yen",
+    "JPY": "Japanese yen",
+    "dollar": "US dollar",
+    "USD": "US dollar",
+    "United States dollar": "US dollar",
+    "EUR": "Euro",
+    "Pound": "British Pound",
+    "Sterling": "British Pound",
+    "GBP": "British Pound",
+    "yuan": "Chinese Yuan",
+    "Renminbi": "Chinese Yuan",
+    "CNY": "Chinese Yuan",
+    "RMB": "Chinese Yuan",
+    "BTC": "Bitcoin",
+    "International Monetary Fund": "IMF",
+    "IMFC": "IMF",
+    "IMF Executive Board": "IMF",
+    "Executive Board of the International Monetary Fund": "IMF",
+    "US central bank": "Federal Reserve",
+    "State Council": "State Council of China",
+    "FED": "Federal Reserve",
+    "European Union": "EU",
+    "banks": "Banks",
+    "CPI of China": "Chinese CPI",
+    "PPI of China": "Chinese PPI"
+}
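Note that `_load_entity_mapping` opens `"mapping.json"` relative to the process's working directory, while the file is committed at app/mapping.json; if the server isn't launched from app/, the `FileNotFoundError` branch silently disables mapping-based normalization (only the hard-coded U.S./CPI/GDP replacements still run). A path-independent variant, assuming the layout shown in this diff (utils.py under app/controllers/summary/, the mapping under app/):

```python
import json
from pathlib import Path
from typing import Dict

def _load_entity_mapping() -> Dict[str, str]:
    """Resolve app/mapping.json relative to this file instead of the CWD."""
    # parents[2] walks summary/ -> controllers/ -> app/ (assumed layout).
    mapping_path = Path(__file__).resolve().parents[2] / "mapping.json"
    try:
        return json.loads(mapping_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return {}
```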
app/routes/lda.py
CHANGED

@@ -30,6 +30,9 @@ async def get_lda_results(filter_type: str):
             'error': 'Invalid filter_type',
             'message': f'filter_type must be one of: {valid_filters}'
         }, status_code=400)
+
+    if filter_type == 'today':
+        filter_type = 'daily'

     # Retrieve from MongoDB
     document = lda_collection.find_one({'_id': filter_type})