""" Keyword controller for LLM-based keyword extraction and analysis. """ from typing import List from datetime import datetime, timedelta from collections import Counter import logging from pydantic import BaseModel, Field from langchain.output_parsers import PydanticOutputParser from langchain.prompts import ChatPromptTemplate from models.llm import gpt # pylint: disable=import-error # Configure logger logger = logging.getLogger(__name__) # Financial keywords dictionary for categorization FIN_KEYWORDS = { # Keywords related to the actions and policies of central banks. 'Monetary Policy': [ 'fed', 'interest rate','federal reserve', 'central bank', 'fiscal policy', 'monetary policy', 'monetary easing', 'quantitative easing', 'qe', 'quantitative tightening', 'qt', 'rate hike', 'rate cut', 'interest rate cut', 'forward guidance', 'policy rate', 'reserve requirement', 'open market operation', 'omo', 'liquidity crunch', 'yield curve inversion', 'stagflation', 'reflation', 'disinflation', 'hawkish', 'dovish', 'tapering', 'ecb', 'european central bank', 'boe', 'bank of england','boj', 'bank of japan', 'pboc', 'people\'s bank of china', 'rba', 'reserve bank of australia', 'rbc', 'reserve bank of canada', 'rbnz', 'reserve bank of new zealand', 'inflation targeting', 'dual mandate', 'neutral interest rate', 'real interest rate', 'nominal interest rate', 'monetary tightening', 'monetary accommodation', 'quantitative easing taper', 'balance sheet normalization', 'negative interest rate policy', 'nirp', 'macroprudential policy','credit easing', 'yield curve control', 'interest on excess reserves', 'ioer', 'discount rate', 'standing repo facility', 'monetary aggregates', 'base rate', 'transmission mechanism', 'inflation expectation', 'monetary policy framework', 'central bank independence', 'monetary policy committee', 'unconventional monetary policy', 'liquidity trap', 'money supply', 'velocity of money', 'open market desk', 'monetary policy tightening cycle', 'monetary policy easing cycle', 'price stability', 'financial repression', 'central bank swap lines', 'forward guidance credibility', 'monetary policy signaling' ], # Keywords related to government revenue, spending, and debt. 'Fiscal Policy': [ 'fiscal stimulus', 'debt ceiling', 'government spending', 'tax cut', 'budget deficit', 'public debt', 'budget surplus', 'tax revenue', 'public spending', 'infrastructure spending', 'debt servicing', 'fiscal responsibility', 'fiscal responsibilities', 'austerity', 'stimulus', 'bailout', 'tax-hike', 'tax-cut', 'pro-growth', 'automatic stabilizers', 'balanced budget', 'contractionary fiscal policy', 'corporate income tax', 'crowding out', 'discretionary fiscal policy', 'estate and gift tax', 'excise tax', 'fiscal consolidation', 'fiscal framework', 'fiscal impulse', 'fiscal policy stance', 'fiscal rule', 'fiscal space', 'fiscal surveillance', 'fiscal buffer', 'general government balance', 'output gap', 'potential output', 'primary balance', 'procyclical fiscal policy', 'progressive tax', 'regressive policy', 'program budgeting', 'public financial management', 'public sector', 'stock-flow adjustment', 'tax revenue elasticity', 'tax base', 'tax incidence', 'tax burden', 'tax compliance', 'tax evasion', 'tax avoidance', 'tax reform', 'tax policy', 'tax progressivity', 'tax regressivity', 'tax neutrality' ], # Keywords related to inflation and price levels. 'Inflation': [ 'inflation', 'deflation', 'cpi', 'pce', 'inflation rate', 'consumer price', 'core inflation', 'producer price index', 'ppi', 'inflation targeting', 'inflation target', 'inflationary pressures', 'inflationary pressure', 'wage price spiral', 'hyperinflation', 'shrinkflation', 'headline inflation', 'underlying inflation', 'demand-pull inflation', 'cost-push inflation', 'stagflation', 'disinflation', 'deflationary spiral', 'price stability', 'inflation expectation', 'wage inflation', 'monetary inflation', 'asset price inflation', 'import price inflation', 'supply shock', 'inflation volatility', 'inflation premium', 'inflation risk', 'inflation-indexed bond', 'real interest rate', 'nominal interest rate', 'purchasing power', 'inflation adjustment', 'price control', 'inflation forecast', 'inflation persistence', 'inflation targeting regime', 'base effect', 'core cpi', 'trimmed mean inflation', 'sticky price', 'menu cost', 'inflation surprise', 'inflation gap' ], # Keywords related to broad stock market indices and sentiment. 'Market Indices and Sentiment': [ 'nasdaq', 's&p 500', 's&p', 'russell', 'dow', 'djia', 'hang seng index', 'hsi', 'ftse 100', 'cac 40', 'dax', 'nikkei 225', 'shanghai composite', 'bovespa', 'sensex', 'tsx composite', 'aex', 'ibex 35', 'msci world', 'msci emerging markets', 'bitcoin', 'stock market', 'market volatility', 'vix', 'market capitalization', 'equity market', 'equity market', 'commodity price', 'investor sentiment', 'market correction', 'bear market', 'bull market', 'recession fear', 'market sentiment', 'irrational exuberance', 'animal spirit', 'risk-on', 'risk-off', 'rally', 'panic', 'capitulation', 'dow jones global titans 50', 'ftse all-world', 'msci eafe', 's&p asia 50', 'euro stoxx 50', 'stoxx europe 600', 's&p europe 350', 'ubs 100 index', 's&p latin america 40', 'szse component index', 'csi 300 index', 'sse 50 index', 'szse 100 index', 'szse 200 index', 'szse 300 index', 'csi 100 index', 'hang seng composite index', 'msci hong kong index', 'bse sensex', 'nifty 50', 'nifty next 50', 'nifty 500', 'idx composite', 'jakarta islamic index', 'ta-125 index', 'ta-35 index', 'ta-90', 'topix', 'ase market capitalization weighted index', 'ftse bursa malaysia index', 'ftse bursa malaysia klci', 'kse 100 index', 'kse 30 index', 'kmi 30 index', 'pse composite index', 'pse all shares index', 'pse financials index', 'pse mining and oil index', 'tadawul all-share index', 'straits times index', 'kospi', 'all share price index', 'milanka price index', 'omx stockholm 30', 'omx stockholm pi', 'smi mid', 'smi expanded', 'swiss leader index', 'swiss market index', 'swiss performance index', 'pfts index', 'ft 30', 'ftse mid 250 index', 'ftse 350 index', 'ftse aim all-share index', 'ftse aim uk 50 index', 'ftse all-share index', 'ftse fledgling index', 'ftse smallcap index', 'ftse techmark 100', 'all ordinaries', 's&p/asx 20', 's&p/asx 50', 's&p/asx 200', 's&p/asx 300', 's&p/nzx 50', 'market breadth', 'put-call ratio', 'short interest', 'volatility index', 'fear index', 'momentum', 'contrarian investing', 'market divergence', 'overbought', 'oversold', 'volume', 'liquidity', 'dividend yield', 'beta', 'alpha', 'sentiment analysis', 'technical analysis', 'fundamental analysis', 'market rally', 'market sell-off', 'correction rally', 'dead cat bounce', 'flash crash' ], # Keywords related to overall economic health and growth. 'Economic Indicators': [ 'gdp', 'employment', 'unemployment', 'pmi', 'manufacturing', 'retail', 'consumer', 'gdp growth', 'unemployment rate', 'economic growth', 'economic indicator', 'business cycle', 'consumer confidence', 'economic recession', 'economic outlook', 'global economy', 'job growth', 'wage growth', 'manufacturing output', 'services sector', 'leading indicator', 'lagging indicator', 'coincident indicator', 'business confidence', 'consumer sentiment index', 'retail sale', 'industrial production', 'capacity utilization', 'economic recovery', 'productivity growth', 'non-farm payroll', 'nfp', 'recession', 'recovery', 'boom', 'bust', 'double-dip' ], # Keywords related to international trade. 'Trade': [ 'tariff', 'export', 'import', 'deficit', 'surplus', 'customs', 'trade deficit', 'trade surplus', 'supply chain', 'balance of trade', 'current account deficit', 'trade wars', 'free trade agreement', 'fta', 'global trade agreement', 'world trade organization', 'wto', 'international trade law', 'customs union', 'common markets', 'common market', 'embargo', 'sanctions', 'protectionism', 'anti-dumping', 'trade facilitation', 'non-tariff barrier', 'trade liberalization', 'trade policy', 'multilateral trade agreement', 'trade negotiation', 'trade compliance', 'export control', 'import quota', 'trade tariff', 'trade embargo', 'trade embargoes', 'supply chain resilience', 'digital trade', 'e-commerce trade', 'trade finance', 'customs clearance', 'trade dispute', 'dispute settlement mechanism', 'trade sanction', 'intellectual property right', 'ipr in trade', 'regional trade agreement', 'rta', 'trade diversification', 'trade logistic', 'trade imbalance', 'trade integration', 'global value chain', 'gvc', 'cross-border trade', 'trade barrier', 'trade facilitation agreement', 'tfa', 'trade remedy', 'trade remedies', 'trade sanctions enforcement', 'preferential trade agreement', 'pta', 'export promotion', 'import substitution', 'trade tariffs escalation', 'trade war escalation', 'trade sanctions impact' ], # Keywords related to the energy sector. 'Energy': [ 'oil', 'gas', 'energy', 'crude oil', 'petroleum', 'renewable energy', 'non-renewable energy', 'coal', 'energy supply', 'energy stock', 'oil price', 'organization of the petroleum exporting countries', 'opec', 'shale', 'fracking', 'gas price', 'energy market', 'commodity price','energy demand', 'energy cost','economic growth', 'international energy agency', 'iea', 'energy information administration', 'eia', 'energy investment', 'renewable energy fund', 'energy sector performance', 'energy policy', 'carbon pricing', 'emissions trading', 'carbon emission', 'energy service performance contract', 'espc', 'energy procurement', 'energy service company', 'esco', 'energy supplier', 'fleet electrification', 'forward capacity market', 'front-of-meter', 'fuel switching', 'settle price', 'solar power purchase agreement', 'solar ppa', 'storage', 'strip price', 'sustainable energy', 'load shifting', 'local distribution company', 'ldc', 'local utility', 'market clearing price', 'carbon intensity', 'energy transmission', 'maximum demand', 'mcf', 'megawatt', 'mw', 'megawatt hour', 'mwh', 'capacity market', 'carbon credit', 'carbon footprint', 'maximum demand charge', 'network connection agreement', 'peak load', 'statutory basis', 'active power', 'agreed capacity' ], # Keywords related to lending, debt, and credit institutions. 'Financial Institutions and Credit': [ 'credit rating', 'public debt', 'credit default swap', 'cds', 'bond yield', 'credit crunch', 'sovereign debt', 'corporate debt', 'household debt', 'consumer credit', 'loan default', 'bank lending', 'credit availability', 'debt restructuring', 'private credit', 'distressed debt', 'interbank lending', 'insolvency', 'bankruptcy', 'foreclosure', 'leverage', 'investment-grade', 'high-yield', 'sub-prime', 'junk-bond', 'junk bond', 'subprime', 'high yield', 'credit risk', 'debt servicing', 'loan-to-value ratio', 'non-performing loan', 'secured loan', 'unsecured loan', 'mortgage-backed security', 'mortgage-backed securities', 'collateral', 'debt-to-gdp ratio', 'credit facility', 'guarantor', 'debt consolidation', 'debt-to-income ratio', 'revolving credit', 'term loan', 'overdraft', 'factoring', 'hire purchase', 'loan origination', 'loan portfolio', 'loan loss provision', 'loan covenant', 'credit limit', 'credit utilization', 'credit bureau', 'credit score', 'creditworthiness', 'default risk', 'debt recovery', 'debt collection', 'letter of credit', 'standby letter of credit', 'subordinated debt', 'mezzanine financing', 'bridge loan', 'syndicated loan', 'loan agreement', 'interest rate spread', 'amortization', 'principal repayment', 'credit enhancement', 'financial leverage', 'capital adequacy', 'risk-weighted assets', 'loan underwriting', 'loan servicing', 'credit exposure', 'loan refinancing', 'debt issuance', 'bond issuance', 'credit derivative', 'loan syndication', 'loan commitment', 'loan default rate', 'credit monitoring', 'loan restructuring', 'credit portfolio management', 'loan documentation', 'loan approval', 'asset-backed securities', 'asset-backed security', 'securitization', 'credit approval', 'loan disbursement', 'debt instrument', 'credit instrument' ], # Keywords related to currencies and foreign exchange markets. 'Currency and Forex': [ 'dollar', 'euro', 'yuan', 'yen', 'pound', 'franc', 'rupee', 'ruble', 'peso', 'krone', 'rand', 'lira', 'hong kong dollar', 'forex', 'fx', 'ringgit', 'baht', 'dinar', 'shekel', 'peso', 'krona', 'ngultrum', 'taka', 'devaluation', 'foreign exchange', 'currency depreciation', 'currency appreciation', 'exchange rate volatility', 'currency volatility', 'real effective exchange rate', 'digital currency', 'single currency area', 'arbitrage', 'peg', 'float', 'hard-currency', 'soft-currency','eurusd','usdjpy','gbpusd','usdchf','audusd', 'usdcad','nzdusd', 'eur/usd','usd/jpy','gbp/usd','usd/chf','aud/usd', 'usd/cad','nzd/usd', 'cross currency pair', 'carry trade', 'currency swap', 'currency future', 'currency option', 'fx reserve', 'fx intervention', 'currency basket', 'exchange rate regime', 'capital control', 'currency peg', 'managed float', 'currency risk', 'fx liquidity', 'fx market depth', 'fx settlement', 'fx hedging', 'currency crisis', 'currency crises','black market rate', 'exchange rate pass-through', 'nominal effective exchange rate', 'currency depreciation spiral', 'digital forex' ], # Keywords related to corporate finance and performance. 'Corporate Finance': [ 'earning', 'revenue', 'profit', 'margin', 'dividend', 'buyback', 'merger', 'capital gain', 'dividend yield', 'market capitalization', 'earnings report', 'quarterly earnings', 'corporate profit', 'financial statement', 'mergers and acquisitions', 'merger and acquisition', 'm&a', 'shareholder value', 'initial public offering', 'ipo', 'operating margin', 'ebitda', 'ebitda margin', 'earnings per share', 'eps', 'price to earnings ratio', 'p/e', 'return on investment', 'roi', 'return on equity', 'roe', 'corporate governance', 'buyout', 'spinoff', 'blue-chip', 'free cash flow', 'gross margin', 'net margin', 'capital expenditure', 'capex', 'goodwill', 'intangible asset', 'tangible asset', 'leveraged buyout', 'lbo', 'management buyout', 'mbo', 'mezzanine financing', 'debt financing', 'equity financing', 'preferred stock', 'common stock', 'stock option', 'warrant', 'convertible bond', 'dividend payout ratio', 'retained earnings', 'share repurchase', 'capital structure', 'cost of capital', 'weighted average cost of capital', 'wacc', 'beta', 'discounted cash flow', 'dcf', 'net present value', 'npv', 'cash flow statement', 'balance sheet', 'income statement', 'financial ratio', 'liquidity ratio', 'solvency ratio', 'profitability ratio', 'efficiency ratio', 'earnings guidance', 'shareholder equity', 'book value', 'market value', 'price to book ratio', 'p/b ratio', 'price to sales ratio', 'p/s ratio', 'enterprise value', 'ev', 'ev/ebitda', 'stock split', 'reverse stock split', 'ipo lockup period', 'quiet period', 'insider trading', 'corporate restructuring', 'divestiture', 'asset sale', 'spin-off', 'joint venture', 'strategic alliance', 'proxy fight', 'poison pill', 'golden parachute', 'hostile takeover', 'friendly takeover', 'due diligence', 'earnings surprise', 'guidance revision', 'shareholder activism', 'board of directors', 'executive compensation', 'stock-based compensation', 'performance metric' ], # Keywords related to the real estate market. 'Real Estate': [ 'housing', 'property', 'rent', 'construction', 'housing market', 'real estate market', 'property market', 'housing bubble', 'subprime lending', 'housing start','property valuation', 'real estate appraisal', 'building permit', 'existing home sale', 'mortgage rate', 'foreclosure rate', 'foreclosure rate', 'real estate investment trust', 'reit', 'mixed-use', 'affordable housing', 'housing affordability', 'urban sprawl', 'gentrification', 'short-term rental', 'co-living', 'smart building', 'green building', 'sustainable development', 'commercial real estate', 'industrial real estate', 'real estate crowdfunding', 'mortgage-backed security', 'mortgage-backed securities', 'mbs', 'real estate bubble', 'housing supply', 'housing demand', 'real estate financing', 'property taxes', 'zoning laws', 'land use regulation', 'rent control', 'eviction rates', 'real estate development', 'home equity', 'real estate brokerage', 'property management', 'real estate technology', 'proptech', 'vacancy rate', 'real estate cycle', 'lease agreement', 'tenant right', 'mortgage default', 'real estate market trend' ], # Keywords related to financial risk and stability. 'Risk and Stability': [ 'risk assessment', 'financial stability', 'geopolitical risk', 'asset bubble', 'dot com bubble', 'systemic crisis', 'systematic crises','contagion risk', 'financial contagion', 'market liquidity', 'black swan event', 'counterparty risk', 'operational risk', 'cybersecurity risk', 'meltdown', 'too-big-to-fail', 'regulatory risk', 'policy uncertainty', 'economic uncertainty', 'trade tension', 'debt sustainability risk', 'credit risk', 'liquidity risk', 'market volatility', 'valuation risk', 'leverage risk', 'fiscal risk', 'sovereign risk', 'geopolitical tension', 'financial fragmentation', 'financial market stress', 'nonbank financial intermediation risk', 'emerging market vulnerabilities', 'cyberattack risk', 'ai-related risk', 'systemic crisis risk', 'operational resilience', 'climate-related financial risk', 'inflation risk', 'interest rate risk', 'counterparty default risk', 'market dislocation', 'capital flow reversal', 'financial sector consolidation risk', 'emerging market vulnerability' ], # Keywords related to investment strategies and vehicles. 'Investment and Portfolio Management': [ 'asset allocation', 'investment portfolio', 'venture capital', 'vc', 'private equity', 'pe', 'hedge fund activity', 'mutual fund', 'exchange traded fund', 'etf', 'pension fund', 'sovereign wealth fund','esg investing', 'green finance', 'options trading', 'derivatives market', 'derivative market', 'fixed income security', 'speculative trading', 'hedging strategies', 'hedging strategy', 'fixed income securities', 'alternative investment', 'real asset', 'value investing', 'growth investing', 'momentum investing', 'contrarian investing', 'passive investing', 'active management', 'portfolio', 'diversification', 'hedging', 'arbitrage', 'long-term investment', 'short-term investment', 'high-risk', 'long-term investor', 'short-term investor', 'asset management', 'fund of funds', 'liquid alternative investment', 'private debt', 'real estate investment trust', 'reit', 'commodities trading', 'bond portfolio', 'credit risk management', 'risk-adjusted return', 'sharpe ratio', 'alpha generation', 'beta', 'tracking error', 'benchmarking', 'portfolio rebalancing', 'tax-efficient investing', 'dollar-cost averaging', 'investment horizon', 'capital preservation', 'investment policy statement', 'liquidity management', 'market timing', 'sector rotation', 'factor investing', 'smart beta', 'quantitative investing', 'algorithmic trading', 'robo-advisor', 'socially responsible investing', 'impact investing', 'thematic investing', 'dividend investing', 'income investing', 'capital gains tax', 'investment mandate', 'portfolio optimization', 'risk tolerance', 'volatility', 'correlation', 'systematic risk', 'unsystematic risk', 'market risk', 'credit risk', 'currency risk', 'reinvestment risk', 'inflation risk', 'liquidity risk', 'investment strategy', 'buy and hold', 'value at risk', 'var', 'stop loss', 'take profit', 'order execution', 'limit order', 'market order', 'short selling', 'margin trading', 'leverage', 'derivative instrument', 'futures contract', 'swaps', 'options', 'call option', 'put option', 'strike price', 'expiration date', 'implied volatility', 'delta', 'gamma', 'theta', 'vega', 'portfolio diversification', 'risk management', 'capital allocation', 'investment committee', 'due diligence', 'investment research', 'fund performance', 'net asset value', 'nav', 'expense ratio', 'management fee', 'performance fee' ], # Keywords related to financial regulation and compliance.. 'Regulation and Compliance': [ 'regulatory framework', 'compliance cost', 'regulatory oversight', 'financial conduct authority', 'fca', 'securities and exchange commission', 'sec', 'commodity futures trading commission', 'cftc', 'prudential regulation', 'macroprudential policy', 'microprudential regulation', 'capital requirements', 'liquidity requirement', 'leverage ratio', 'capital requirement', 'resolution authority', 'deposit insurance', 'investor protection', 'oversight', 'enforcement', 'compliance', 'know-your-customer', 'kyc', 'anti-money-laundering', 'aml', 'securities and futures commission', 'hong kong monetary authority', 'hkma', 'sfc', 'hong kong exchanges and clearing limited', 'hkex', 'basel accords', 'basel iii', 'dodd-frank act', 'volcker rule', 'glass-steagall act', 'financial stability board', 'fsb', 'international organization of securities commissions', 'iosco', 'esma', 'regulatory technology', 'regtech', 'cybersecurity regulation', 'bank for international settlements', 'bis', 'european securities and markets authority', 'european banking authority', 'eba', 'general data protection regulation', 'gdpr', 'payment services directive', 'psd2', 'anti-bribery and corruption', 'abc', 'financial action task force', 'fatf', 'risk-based approach', 'rba', 'customer due diligence', 'cdd', 'enhanced due diligence', 'edd', 'transaction monitoring', 'suspicious activity report', 'sar', 'beneficial ownership', 'regulatory reporting', 'regulatory compliance', 'financial crime compliance', 'fcc', 'insider trading', 'market abuse', 'conflict of interest', 'whistleblower protection', 'data privacy', 'operational risk', 'conduct risk', 'model risk', 'stress testing', 'capital adequacy ratio', 'resolution planning', 'regulatory penalty', 'liquidity coverage ratio', 'net stable funding ratio', 'nsfr', 'systemically important financial institution', 'sifi', 'living wills', 'capital buffers', 'countercyclical capital buffer', 'capital conservation buffer', 'financial consumer protection', 'market conduct', 'prudential supervision', 'supervisory review and evaluation process', 'srep', 'fit and proper requirement', 'regulatory penalties', 'regulatory sanction', 'license revocation', 'cross-border regulation', 'international regulatory cooperation', 'regulatory harmonization', 'regulatory impact assessment', 'regulatory transparency', 'compliance audit', 'internal control', 'compliance training', 'audit trail', 'board governance', 'corporate governance code', 'ethical standard', 'anti-fraud measure', 'financial reporting standard', 'ifrs', 'us gaap', 'reporting requirement', 'disclosure obligation', 'transparency requirement', 'risk governance', 'third-party risk management', 'outsourcing regulation', 'vendor risk management', 'business continuity planning', 'disaster recovery', 'incident response', 'regulatory notification', 'regulatory filing', 'regulatory update', 'regulatory enforcement action', 'enforcement proceeding', 'settlement agreement' ], # Keywords related to financial markets and trading instruments. 'Financial Markets and Instruments': [ 'stock exchange', 'bond market rally', 'futures contract', 'futures contracts', 'over the counter market', 'complex financial instruments', 'complex financial instrument', 'otc', 'repo market', 'commercial paper', 'treasury bond', 'municipal bond', 'corporate bond', 'junk bond', 'investment grade bond', 'investment grade bonds', 'credit spread', 'yield to maturity', 'structured products', 'structured product', 'financial instrument', 'derivative', 'commodities', 'commodity', 'equities', 'equity', 'stock', 'future', 'future contract', 'option', 'option contract', 'over-the-counter', 'exchange-traded fund', 'etf', 'bond yield', 'call option', 'put option', 'swap', 'interest rate swap', 'credit default swap', 'cds', 'currency swap', 'forward contract', 'forward rate agreement', 'fra', 'margin trading', 'margin call', 'clearinghouse', 'central counterparty', 'ccp', 'market maker', 'bid price', 'ask price', 'bid-ask spread', 'liquidity', 'market depth', 'order book', 'limit order', 'market order', 'stop order', 'stop-loss order', 'block trade', 'dark pool', 'high-frequency trading', 'hft', 'algorithmic trading', 'price discovery', 'volatility', 'implied volatility', 'historical volatility', 'yield curve', 'zero-coupon bond', 'perpetual bond', 'convertible bond', 'preferred stock', 'common stock', 'dividend yield', 'total return', 'capital gain', 'capital loss', 'securitization', 'collateralized debt obligation', 'cdo', 'exchange-traded note', 'etn', 'money market instrument', 'treasury bill', 'certificate of deposit', 'cd', 'repurchase agreement', 'repo', 'securities lending', 'short selling', 'leverage', 'financial leverage' ], # Keywords related to broader economic and social concepts. 'Socioeconomic Factors': [ 'wealth inequality', 'income inequality', 'labor market', 'demographic shift', 'income disparity', 'poverty rate', 'social safety net', 'human development index', 'HDI', 'gini coefficient', 'poverty', 'disparity', 'labor force participation', 'social mobility', 'education access', 'healthcare access', 'cost of living', 'inflation impact', 'human capital', 'workforce skills gap', 'fiscal policy', 'social unrest', 'urbanization', 'rural development', 'demographic aging', 'youth unemployment', 'informal economy', 'economic resilience', 'social capital', 'gender inequality', 'digital divide', 'poverty alleviation', 'economic diversification', 'migration pattern', 'economic vulnerability', 'financial inclusion', 'living standard' ], # Keywords related to international economics and development. 'International and Development Economics': [ 'emerging market', 'developed economies', 'developed countries', 'developed economy', 'developed country', 'foreign direct investment', 'fdi', 'capital flows', 'capital flow', 'world bank', 'international monetary fund', 'imf', 'emerging market debt', 'frontier market', 'developing countries', 'developing country', 'least developed countries', 'least developed country', 'ldc', 'developing economies', 'developing economy', 'sustainable development goal', 'sdg', 'climate finance', 'globalization', 'group of seven', 'g7', 'group of twenty', 'g20', 'brics' ], # Keywords related to financial technology. 'Fintech': [ 'fintech', 'financial technology', 'financial technologies', 'cryptocurrency regulation', 'blockchain technology', 'blockchain technologies', 'distributed ledger technologies', 'robo advisor', 'algorithmic trading', 'high frequency trading', 'dark pool', 'decentralized finance', 'defi', 'distributed ledger technology', 'dlt', 'cryptocurrency', 'blockchain', 'insurtech', 'peer-to-peer', 'p2p', 'central bank digital currency', 'cbdc', 'digital wallet', 'payment gateway', 'open banking', 'regtech', 'smart contract', 'non-fungible token', 'nft', 'nfts', 'tokenization', 'cryptocurrency exchange', 'stablecoin', 'digital identity', 'biometric authentication', 'mobile payment', 'buy now, pay later', 'bnpl', 'machine learning', 'cryptolending', 'crypto custody', 'cross-border payment', 'financial inclusion', 'embedded finance', 'ai-driven credit scoring', 'fraud detection', 'digital banking', 'neobank', 'yield farming', 'liquidity mining', 'smart contract auditing' ], # Keywords related to economic theories and analysis. 'Economic Theory and Analysis': [ 'economic modeling', 'economic forecasting', 'scenario planning', 'risk management', 'valuation models', 'valuation model', 'analyst ratings', 'analyst rating', 'net present value', 'npv', 'monetarism', 'laissez-faire' 'internal rate of return', 'irr', 'discounted cash flow', 'dcf', 'piketty curve', 'kuznets curve', 'solow growth model', 'new classical economics', 'new Keynesian economics', 'austrian economics', 'behavioral finance', 'efficient market hypothesis', 'emh', 'random walk theory', 'Keynesianism' ], # Keywords related to the intersection of geography, politics, and economics. 'Geopolitics': [ 'geopolitics', 'geopolitical risk', 'trade war', 'sanction', 'embargo', 'political instability', 'election', 'summit', 'international relations', 'foreign policy', 'national security', 'sovereignty', 'global governance', 'brexit impact', 'brexit', 'eurozone crisis', 'eurozone crises','conflict', 'unrest', 'regime-change', 'north atlantic treaty organization', 'nato', 'united states', 'us', 'usa', 'russia', 'india', 'germany', 'france', 'united kingdom', 'japan', 'brazil', 'canada', 'australia', 'south korea', 'mexico', 'italy', 'turkey', 'south africa', 'saudi arabia', 'argentina', 'spain', 'netherlands', 'sweden', 'norway', 'switzerland', 'poland', 'ukraine', 'palestine', 'iraq', 'israel', 'pakistan','eurozone', 'de-risking', 'ais gap', 'maritime terrorism', 'subsea infrastructure', 'friend-shoring', 'integrated deterrence', 'pacing threat', 'sportswashing', 'territorial dispute', 'ideological conflict', 'terrorism', 'regime change', 'climate geopolitics', 'power competition', 'shigeru ishiba', 'keir starmer', 'hybrid threat', 'cybersecurity', 'propaganda', 'diplomatic power struggle', 'regionalism', 'global order', 'mark carney', 'emmanuel macron', 'friedrich merz', 'giorgia meloni', 'merz', 'meloni', 'starmer', 'trump', 'xi', 'donald trump', 'antónio costa', 'ursula von der leyen', 'xi jinping', 'macron' ], 'Other Categories': [] } # Data models for keyword extraction class Keyword(BaseModel): """ Keyword model for keyword extraction. """ keyword: str = Field(description="Singular, canonical form (e.g., 'interest rate')") category: str variations_found: List[str] = Field( description="All forms found: plurals, abbreviations, verb forms") is_new_discovery: bool = Field(default=False, description="True if this is a newly discovered term not in historical keywords") class KeywordAnalysis(BaseModel): """ Keyword analysis model for keyword extraction. """ keywords: List[Keyword] = Field( description=( "A comprehensive list of all unique financial keywords extracted from the articles." ) ) # LangChain Setup llm_parser = PydanticOutputParser(pydantic_object=KeywordAnalysis) prompt_template = ChatPromptTemplate.from_template( """ You are a financial keyword extraction specialist. Your task is to identify and categorize financial keywords from news articles. **AVAILABLE CATEGORIES:** {categories} **HISTORICAL KEYWORDS FOR CONTEXT:** {historical_keywords} **NORMALIZATION RULES (CRITICAL):** 1. **Multi-word terms**: Prefer full forms over abbreviations (e.g., "federal reserve" not "fed") 2. **Singular/Plural**: Always use singular form (e.g., "interest rate" not "interest rates") 3. **Verb forms**: Use noun form (e.g., "rate hike" not "hiking rates") 4. **Consistency**: If a term exists in historical keywords, use that exact form 5. **Variations tracking**: List UNIQUE variations found (max 10 per term, no duplicates) 6. **New discoveries**: Mark as new_discovery=True if the canonical form is not in historical keywords **EXTRACTION INSTRUCTIONS:** 1. Read through all articles carefully 2. Extract ALL financial terms, concepts, and entities 3. Normalize each term according to the rules above 4. Categorize each term using the provided categories 5. For each term, list UNIQUE variations found (maximum 10 variations per term) 6. Mark terms as new discoveries if they don't exist in historical keywords 7. IMPORTANT: Avoid repetitive variations - each variation should be unique **ARTICLES TO ANALYZE:** {articles} **OUTPUT FORMAT:** {format_instructions} Focus on financial relevance and ensure comprehensive extraction while maintaining consistency with historical terms. """ ) def fetch_articles_for_period( articles_collection, start_date: datetime, end_date: datetime, sample_size: int = None ) -> list: """ Fetch articles from the database for a specific time period. Args: articles_collection: MongoDB collection containing articles start_date: Start date for article filtering end_date: End date for article filtering sample_size: Optional sample size for faster processing Returns: list: List of article texts formatted for LLM processing """ logger.info("Fetching articles from %s to %s", start_date.date(), end_date.date()) if sample_size: # Sample articles for faster processing pipeline = [ {"$match": { "publishDate": { "$gte": start_date.strftime("%Y-%m-%d"), "$lte": end_date.strftime("%Y-%m-%d") } }}, {"$sample": {"size": sample_size}} ] cursor = articles_collection.aggregate(pipeline) else: cursor = articles_collection.find({ "publishDate": { "$gte": start_date.strftime("%Y-%m-%d"), "$lte": end_date.strftime("%Y-%m-%d") } }) documents = [ f"Title: {doc.get('title', '')}\nContent: {doc.get('content', '')}" for doc in cursor] logger.info("Found %d articles", len(documents)) return documents def fetch_historical_keywords(collection) -> set: """ Fetch historical keywords from the database for context. Args: collection: MongoDB collection containing keywords Returns: set: Set of historical keyword strings """ logger.info("Fetching historical keywords from the last 2 months for context") two_months_ago = datetime.now() - timedelta(days=60) cursor = collection.find( {"_id": {"$gte": two_months_ago.strftime("%Y-%m-%d")}}, {"keywords.keyword": 1} ) historical_keywords = { kw_data['keyword'] for doc in cursor if 'keywords' in doc for kw_data in doc['keywords']} logger.info("Found %d unique historical keywords", len(historical_keywords)) return historical_keywords def run_llm_extraction(articles: list, historical_keywords: set) -> List[Keyword]: """ Run LLM extraction for keyword analysis using batch processing. Args: articles: List of article texts to analyze historical_keywords: Set of historical keywords for context Returns: List[Keyword]: List of extracted keywords """ if not articles: logger.info("No articles to analyze") return [] BATCH_CHAR_LIMIT = 45000 # pylint: disable=invalid-name all_extracted_keywords = {} current_batch_articles = [] current_batch_chars = 0 logger.info("Processing %d articles in batches to ensure full coverage", len(articles)) for article in articles: # Check if adding the next article would exceed the batch limit if current_batch_chars + len(article) > BATCH_CHAR_LIMIT and current_batch_articles: # Process the current batch first logger.info("Processing a batch of %d articles (%d chars)", len(current_batch_articles), current_batch_chars) batch_keywords = invoke_llm_for_batch(current_batch_articles, historical_keywords) # Add the results to our master list, merging duplicates for kw in batch_keywords: if kw.keyword in all_extracted_keywords: existing_variations = set(all_extracted_keywords[kw.keyword].variations_found) new_variations = set(kw.variations_found) all_extracted_keywords[kw.keyword].variations_found = list( existing_variations.union(new_variations)) else: all_extracted_keywords[kw.keyword] = kw # Reset for the next batch current_batch_articles = [] current_batch_chars = 0 # Add the current article to the new batch current_batch_articles.append(article) current_batch_chars += len(article) # Process the final batch if any articles are remaining if current_batch_articles: logger.info("Processing the final batch of %d articles (%d chars)", len(current_batch_articles), current_batch_chars) batch_keywords = invoke_llm_for_batch(current_batch_articles, historical_keywords) for kw in batch_keywords: if kw.keyword in all_extracted_keywords: existing_variations = set(all_extracted_keywords[kw.keyword].variations_found) new_variations = set(kw.variations_found) all_extracted_keywords[kw.keyword].variations_found = list( existing_variations.union(new_variations)) else: all_extracted_keywords[kw.keyword] = kw final_keyword_list = list(all_extracted_keywords.values()) logger.info("Extracted a total of %d unique keywords from all batches", len(final_keyword_list)) return final_keyword_list def invoke_llm_for_batch(articles_in_batch: list, historical_keywords: set) -> List[Keyword]: """ Invoke the LLM for a single batch of articles. Args: articles_in_batch: List of article texts in the current batch historical_keywords: Set of historical keywords for context Returns: List[Keyword]: List of extracted keywords from this batch """ articles_text = "\n\n---\n\n".join(articles_in_batch) chain = prompt_template | gpt | llm_parser try: categories_text = "\n".join([ f"- {cat}: {', '.join(keywords[:10])}" + ("..." if len(keywords) > 10 else "") for cat, keywords in FIN_KEYWORDS.items() ]) response = chain.invoke({ "categories": categories_text, "historical_keywords": list(historical_keywords) if historical_keywords else [], "articles": articles_text, "format_instructions": llm_parser.get_format_instructions() }) processed_keywords = [] for kw in response.keywords: if kw.category not in FIN_KEYWORDS: kw.category = "other_categories" kw.is_new_discovery = kw.keyword not in historical_keywords if len(kw.variations_found) > 20: kw.variations_found = kw.variations_found[:20] processed_keywords.append(kw) return processed_keywords except Exception as e: # pylint: disable=broad-exception-caught logger.error("LLM batch invocation failed: %s", e) return [] def calculate_metrics_and_save_for_date( # pylint: disable=too-many-locals collection, today_keywords: List[Keyword], target_date: datetime ) -> None: """ Calculate frequency and popularity for keywords and save to database. Args: collection: MongoDB collection to save keywords today_keywords: List of extracted keywords target_date: Date for which keywords are being processed """ if not today_keywords: return # Calculate frequency based on the number of variations found for each keyword today_freq = Counter() for kw_obj in today_keywords: today_freq[kw_obj.keyword] = len(kw_obj.variations_found) total_today = sum(today_freq.values()) # Get previous day's data for popularity score calculation previous_date = target_date - timedelta(days=1) prev_day_doc = collection.find_one({"_id": previous_date.strftime("%Y-%m-%d")}) previous_freq = ( Counter({kw['keyword']: kw['frequency'] for kw in prev_day_doc['keywords']}) if prev_day_doc and 'keywords' in prev_day_doc else Counter() ) total_previous = sum(previous_freq.values()) final_keywords_list = [] for kw_obj in today_keywords: keyword = kw_obj.keyword frequency = today_freq[keyword] current_pct = (frequency / total_today) * 100 if total_today > 0 else 0 previous_pct = ( (previous_freq.get(keyword, 0) / total_previous) * 100 if total_previous > 0 else 0 ) # Calculate popularity score (percentage change in relative frequency) if previous_pct > 0: popularity_score = ((current_pct - previous_pct) / previous_pct) * 100 else: popularity_score = 100.0 # New keywords get max popularity score final_keywords_list.append({ "keyword": keyword, "label": keyword.title(), "category": kw_obj.category, "frequency": frequency, "popularity": round(popularity_score, 2), "new": kw_obj.is_new_discovery, "variations": kw_obj.variations_found }) document_to_save = { "_id": target_date.strftime("%Y-%m-%d"), "keywords": final_keywords_list, "frequency": total_today } collection.update_one( {"_id": document_to_save["_id"]}, {"$set": document_to_save}, upsert=True ) logger.info("Saved %d keywords for %s", len(final_keywords_list), target_date.strftime('%Y-%m-%d'))