hollywoodfrancis commited on
Commit
91614ba
·
verified ·
1 Parent(s): b3e9d26

Upload 6 files

Browse files
finance_expert/__pycache__/config.cpython-312.pyc ADDED
Binary file (4.06 kB). View file
 
finance_expert/__pycache__/expert.cpython-312.pyc ADDED
Binary file (1.26 kB). View file
 
finance_expert/config.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration for the Finance Expert model
3
+ """
4
+
5
+ # Core financial domains
6
# Mapping of finance domain name -> {"level": ..., "focus": [...]}.
# Every domain is declared at the "expert" level; "focus" lists the topic
# labels associated with that domain, in declaration order.
FINANCE_DOMAINS = {
    domain: {"level": "expert", "focus": list(topics)}
    for domain, topics in (
        (
            "corporate_finance",
            (
                "DCF valuation",
                "LBO modeling",
                "M&A analysis",
                "financial_statement_analysis",
                "ratio_analysis",
            ),
        ),
        (
            "investment_analysis",
            ("portfolio_theory", "CAPM", "risk_return_models", "asset_allocation"),
        ),
        (
            "financial_instruments",
            ("bonds", "derivatives", "structured_products", "interest_rate_products"),
        ),
        (
            "quantitative_finance",
            (
                "time_series_analysis",
                "option_pricing",
                "risk_modeling",
                "statistical_arbitrage",
            ),
        ),
        (
            "macroeconomics",
            ("interest_rate_models", "inflation_models", "FX_markets", "policy_analysis"),
        ),
        (
            "alternative_investments",
            (
                "real_estate_finance",
                "private_equity",
                "hedge_fund_strategies",
                "structured_credit",
            ),
        ),
    )
}
63
+
64
+ # Core financial tasks
65
# Mapping of task name -> {"level": ..., "subtasks": [...]}.
# Every task is declared at the "expert" level; "subtasks" lists the
# finer-grained task labels, in declaration order.
FINANCE_TASKS = {
    task: {"level": "expert", "subtasks": list(parts)}
    for task, parts in (
        (
            "model_building",
            ("DCF_modeling", "LBO_modeling", "M&A_modeling", "valuation_modeling"),
        ),
        (
            "forecasting",
            (
                "financial_statement_forecasting",
                "economic_indicator_forecasting",
                "market_forecasting",
                "risk_forecasting",
            ),
        ),
        (
            "statement_analysis",
            (
                "balance_sheet_analysis",
                "income_statement_analysis",
                "cash_flow_statement_analysis",
                "ratio_analysis",
            ),
        ),
        (
            "risk_assessment",
            (
                "market_risk_analysis",
                "credit_risk_analysis",
                "operational_risk_analysis",
                "liquidity_risk_analysis",
            ),
        ),
        (
            "reporting",
            (
                "financial_reporting",
                "investment_reporting",
                "risk_reporting",
                "performance_reporting",
            ),
        ),
        (
            "portfolio_optimization",
            (
                "asset_allocation",
                "risk_management",
                "performance_optimization",
                "tax_efficiency",
            ),
        ),
    )
}
121
+
122
+ # Core datasets
123
# Mapping of dataset name -> descriptor with the keys
# "source", "split", "fields", "description" and "tasks" (in that order).
# Every dataset uses the "train" split.
# NOTE(review): "source" values look like dataset identifiers for a loader;
# this file does not show how they are resolved -- confirm against the caller.
FINANCE_DATASETS = {
    name: {
        "source": source,
        "split": "train",
        "fields": list(fields),
        "description": description,
        "tasks": list(tasks),
    }
    for name, source, fields, description, tasks in (
        (
            "FinQA",
            "finqa/finqa",
            ("question", "table", "answer", "program"),
            "Financial question answering dataset",
            ("financial_qa", "table_analysis", "calculation"),
        ),
        (
            "TAT-QA",
            "tatqa/tatqa",
            ("passage", "question", "answer", "scale", "type"),
            "Financial table question answering",
            ("table_qa", "calculation", "financial_analysis"),
        ),
        (
            "DocVQA",
            "docvqa/docvqa",
            ("question", "image", "answer", "type"),
            "Document understanding and VQA",
            ("document_analysis", "financial_reading", "information_extraction"),
        ),
        (
            "FinancialPhraseBank",
            "financial_phrasebank/financial_phrasebank",
            ("sentence", "label"),
            "Financial sentiment analysis",
            ("sentiment_analysis", "financial_language_processing"),
        ),
        (
            "SECFilings",
            "sec_filings/sec_filings",
            ("company", "filing_type", "content", "date"),
            "SEC filings data",
            ("document_analysis", "financial_reporting", "company_analysis"),
        ),
        (
            "FRED",
            "fred/fred",
            ("series_id", "date", "value"),
            "Federal Reserve Economic Data",
            ("economic_analysis", "time_series_analysis", "forecasting"),
        ),
    )
}
167
+
168
+ # Print configuration summary
169
def print_config_summary():
    """Print a human-readable overview of the module's configuration tables.

    Writes the number of entries in FINANCE_DOMAINS, FINANCE_TASKS and
    FINANCE_DATASETS to stdout, followed by the description, tasks and
    fields of each dataset in FINANCE_DATASETS.
    """

    def summary_lines():
        # Lines are produced lazily so each one is printed as soon as it is
        # built, exactly as a sequence of individual print() calls would.
        yield "\nFinance Expert Configuration Summary:"
        yield f"Number of domains: {len(FINANCE_DOMAINS)}"
        yield f"Number of tasks: {len(FINANCE_TASKS)}"
        yield f"Number of datasets: {len(FINANCE_DATASETS)}"
        yield "\nDataset Details:"
        for dataset_name, spec in FINANCE_DATASETS.items():
            yield f"\n{dataset_name}:"
            yield f"Description: {spec['description']}"
            yield f"Tasks: {', '.join(spec['tasks'])}"
            yield f"Fields: {', '.join(spec['fields'])}"

    for line in summary_lines():
        print(line)


if __name__ == "__main__":
    print_config_summary()
finance_expert/data_processor.py ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data processing utilities for the Finance Expert model
3
+ """
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import jsonlines
8
+ from typing import Dict, List, Any, Optional, Tuple
9
+ import hashlib
10
+ import datetime
11
+ import logging
12
+ import numpy as np
13
+ import pandas as pd
14
+ from datasets import Dataset
15
+ from tqdm import tqdm
16
+ import re
17
+ from dateutil.parser import parse as date_parse
18
+ from decimal import Decimal, ROUND_HALF_UP
19
+
20
class FinanceDataProcessor:
    """Normalize, validate and summarize financial records and dataset examples.

    The processor turns loosely formatted records (currency strings, numeric
    strings, nested dicts) into plain Python values, derives a handful of
    headline metrics and ratios from them, and converts examples of the
    supported datasets into JSON-friendly dicts.
    """

    # Characters whose presence marks a string as a possible currency amount.
    _CURRENCY_SYMBOLS = ("$", "€", "£", "¥")

    # Tiny keyword lists for the fallback sentiment heuristic below.
    _POSITIVE_TERMS = frozenset({
        "gain", "gains", "growth", "profit", "profits", "increase",
        "increased", "rose", "improved", "strong",
    })
    _NEGATIVE_TERMS = frozenset({
        "loss", "losses", "decline", "declined", "decrease", "decreased",
        "fell", "dropped", "weak", "worse",
    })

    def __init__(self, output_dir: str = "processed_data"):
        """Create the output directory (if needed) and set up logging.

        Args:
            output_dir: Directory that `save_to_jsonl` writes into.
        """
        self.output_dir = Path(output_dir)
        # parents=True so nested paths such as "out/finance" also work.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.logger = self._setup_logger()

    def _setup_logger(self) -> logging.Logger:
        """Return the module logger, configured with one stream handler."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # getLogger returns the same object on every call; attaching a new
        # handler per instance would emit every message multiple times.
        if not logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            handler.setFormatter(formatter)
            logger.addHandler(handler)
        return logger

    def process_financial_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize a record and derive metrics, validation flags and ratios.

        Args:
            data: Mapping of field name to raw value.

        Returns:
            A dict with the keys "processed_data", "metrics", "validation"
            and "ratios", or ``{"error": message}`` when the input is not a
            dict or processing fails.
        """
        if not isinstance(data, dict):
            message = f"Expected a dict of financial data, got {type(data).__name__}"
            self.logger.warning(f"Error processing financial data: {message}")
            return {"error": message}

        try:
            processed = self._normalize_data(data)
            metrics = self._extract_financial_metrics(processed)
            validation = self._validate_financial_data(processed)
            ratios = self._calculate_financial_ratios(processed)
            return {
                "processed_data": processed,
                "metrics": metrics,
                "validation": validation,
                "ratios": ratios
            }
        except Exception as e:
            # Best-effort API: report the failure instead of aborting a batch.
            self.logger.warning(f"Error processing financial data: {str(e)}")
            return {"error": str(e)}

    def _normalize_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Return a copy of *data* with string values converted where possible.

        Strings containing a currency symbol become floats when they parse as
        an amount, decimal strings become ints, float-like strings become
        floats, and all other strings are stripped. Nested dicts (also inside
        lists) are normalized recursively; other values pass through.
        """
        normalized = {}

        for key, value in data.items():
            if isinstance(value, str):
                normalized[key] = self._normalize_string(value)
            elif isinstance(value, dict):
                normalized[key] = self._normalize_data(value)
            elif isinstance(value, list):
                normalized[key] = [
                    self._normalize_data(item) if isinstance(item, dict) else item
                    for item in value
                ]
            else:
                # Numbers and any other objects are kept unchanged.
                normalized[key] = value

        return normalized

    def _normalize_string(self, value: str) -> Any:
        """Convert one string value to a number when it clearly is one."""
        if any(symbol in value for symbol in self._CURRENCY_SYMBOLS):
            amount = self._parse_currency(value)
            # Free text that merely mentions a currency (e.g. a passage with
            # "$5 million") must stay text rather than collapse to 0.0.
            return amount if amount is not None else value.strip()
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if value.isdecimal():
            return int(value)
        if self._is_float(value):
            return float(value)
        return value.strip()

    def _parse_currency(self, value: str) -> Optional[float]:
        """Parse a currency-formatted string; return None if it is not numeric.

        Currency symbols and thousands separators are removed. A value fully
        wrapped in parentheses, e.g. "($1,234.50)", is treated as negative
        (accounting notation).
        """
        try:
            cleaned = re.sub(r'[\$€£¥,]', '', value).strip()
            negative = cleaned.startswith('(') and cleaned.endswith(')')
            cleaned = cleaned.replace('(', '').replace(')', '')
            amount = float(cleaned)
        except (TypeError, ValueError):
            return None
        return -abs(amount) if negative else amount

    def _normalize_currency(self, value: str) -> float:
        """Convert a currency string to a float, or 0.0 when it cannot be parsed."""
        amount = self._parse_currency(value)
        return 0.0 if amount is None else amount

    def _is_float(self, value: str) -> bool:
        """Check if string can be converted to float"""
        try:
            float(value)
            return True
        except ValueError:
            return False

    def _extract_financial_metrics(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Collect headline metrics, trying several alternative key names each.

        Missing or non-numeric metrics are reported as 0.0.
        """
        return {
            "revenue": self._get_metric(data, "revenue", "income", "sales"),
            "expenses": self._get_metric(data, "expenses", "costs"),
            "profit": self._get_metric(data, "profit", "net_income"),
            "assets": self._get_metric(data, "assets", "total_assets"),
            "liabilities": self._get_metric(data, "liabilities", "total_liabilities"),
            "equity": self._get_metric(data, "equity", "shareholders_equity")
        }

    def _get_metric(self, data: Dict[str, Any], *keys: str) -> float:
        """Return the value of the first of *keys* present in *data* as a float.

        Returns 0.0 when none of the keys exists or the value is not numeric.
        """
        for key in keys:
            if key in data:
                return self._normalize_currency(str(data[key]))
        return 0.0

    @staticmethod
    def _approx_equal(left: float, right: float,
                      abs_tol: float = 1e-6, rel_tol: float = 1e-9) -> bool:
        """Compare two amounts with an absolute and a relative tolerance.

        A purely absolute 1e-6 tolerance is smaller than floating-point
        rounding error for large amounts, so a relative term is included.
        """
        return abs(left - right) <= max(abs_tol, rel_tol * max(abs(left), abs(right)))

    def _validate_financial_data(self, data: Dict[str, Any]) -> Dict[str, bool]:
        """Run the three statement consistency checks and return their results."""
        return {
            "balance_sheet_consistency": self._check_balance_sheet(data),
            "income_statement_consistency": self._check_income_statement(data),
            "cash_flow_consistency": self._check_cash_flow(data)
        }

    def _check_balance_sheet(self, data: Dict[str, Any]) -> bool:
        """Check that assets equal liabilities plus equity.

        Missing values count as 0.0, so a record without any of these fields
        is reported as consistent.
        """
        assets = self._get_metric(data, "assets", "total_assets")
        liabilities = self._get_metric(data, "liabilities", "total_liabilities")
        equity = self._get_metric(data, "equity", "shareholders_equity")

        return self._approx_equal(assets, liabilities + equity)

    def _check_income_statement(self, data: Dict[str, Any]) -> bool:
        """Check that profit equals revenue minus expenses (missing values are 0.0)."""
        revenue = self._get_metric(data, "revenue", "income", "sales")
        expenses = self._get_metric(data, "expenses", "costs")
        profit = self._get_metric(data, "profit", "net_income")

        return self._approx_equal(profit, revenue - expenses)

    def _check_cash_flow(self, data: Dict[str, Any]) -> bool:
        """Check that net change in cash equals operating + investing + financing."""
        operating = self._get_metric(data, "operating_cash_flow")
        investing = self._get_metric(data, "investing_cash_flow")
        financing = self._get_metric(data, "financing_cash_flow")
        net_change = self._get_metric(data, "net_change_in_cash")

        return self._approx_equal(net_change, operating + investing + financing)

    def _calculate_financial_ratios(self, data: Dict[str, Any]) -> Dict[str, float]:
        """Calculate ratios from the extracted metrics.

        Ratios with a zero denominator are reported as ``inf`` (leverage-style
        ratios) or 0.0 (return/margin ratios).

        NOTE(review): "current_ratio" is computed from total assets and total
        liabilities, which differs from the textbook current ratio (current
        assets / current liabilities); the key is kept for compatibility.
        """
        metrics = self._extract_financial_metrics(data)
        assets = metrics["assets"]
        liabilities = metrics["liabilities"]
        equity = metrics["equity"]
        profit = metrics["profit"]
        revenue = metrics["revenue"]

        # Every division is guarded, so no ZeroDivisionError can occur here.
        return {
            "current_ratio": assets / liabilities if liabilities != 0 else float('inf'),
            "debt_to_equity": liabilities / equity if equity != 0 else float('inf'),
            "profit_margin": profit / revenue if revenue != 0 else 0.0,
            "return_on_equity": profit / equity if equity != 0 else 0.0,
            "return_on_assets": profit / assets if assets != 0 else 0.0
        }

    # The annotation is quoted so the class can be defined without evaluating
    # the `datasets` dependency at definition time.
    def process_dataset(self, dataset: "Dataset", dataset_name: str) -> List[Dict[str, Any]]:
        """Process every example of *dataset* and return the successful results.

        Examples that raise are logged and skipped; the number of failures is
        logged at the end.

        Args:
            dataset: Sized iterable of example dicts.
            dataset_name: One of the names accepted by `_process_example`.
        """
        processed = []
        error_count = 0

        self.logger.info(f"Processing {dataset_name} dataset with {len(dataset)} samples")

        for idx, example in enumerate(tqdm(dataset, desc=f"Processing {dataset_name}")):
            try:
                processed.append(self._process_example(example, dataset_name))
            except Exception as e:
                error_count += 1
                self.logger.error(f"Error processing example {idx} in {dataset_name}: {str(e)}")

        self.logger.info(f"Processed {len(processed)} examples")
        self.logger.info(f"Encountered {error_count} errors")

        return processed

    def _process_example(self, example: Dict[str, Any], dataset_name: str) -> Dict[str, Any]:
        """Dispatch one example to the handler for its dataset.

        Raises:
            ValueError: If *dataset_name* is not a supported dataset.
        """
        handlers = {
            "FinQA": self._process_finqa,
            "TAT-QA": self._process_tat_qa,
            "DocVQA": self._process_docvqa,
            "FinancialPhraseBank": self._process_phrasebank,
            "SECFilings": self._process_sec_filings,
            "FRED": self._process_fred,
        }
        handler = handlers.get(dataset_name)
        if handler is None:
            raise ValueError(f"Unknown dataset: {dataset_name}")
        return handler(example)

    def _process_finqa(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process FinQA example"""
        return {
            "question": example["question"].strip(),
            "table": example["table"],
            "answer": example["answer"],
            "program": example["program"],
            # A non-dict table yields {"error": ...} here rather than raising.
            "data_analysis": self.process_financial_data(example["table"])
        }

    def _process_tat_qa(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process TAT-QA example"""
        return {
            "passage": example["passage"].strip(),
            "question": example["question"].strip(),
            "answer": example["answer"],
            "scale": example["scale"],
            "type": example["type"],
            "data_analysis": self.process_financial_data({"passage": example["passage"]})
        }

    def _process_docvqa(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process DocVQA example"""
        return {
            "question": example["question"].strip(),
            "image": example["image"],
            "answer": example["answer"],
            "type": example["type"],
            "data_analysis": self.process_financial_data({"answer": example["answer"]})
        }

    def _process_phrasebank(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process FinancialPhraseBank example"""
        return {
            "sentence": example["sentence"].strip(),
            "label": example["label"],
            "sentiment_analysis": self._analyze_sentiment(example["sentence"])
        }

    def _process_sec_filings(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process SEC filings example"""
        return {
            "company": example["company"].strip(),
            "filing_type": example["filing_type"],
            "content": example["content"],
            "date": example["date"],
            "financial_analysis": self.process_financial_data({"content": example["content"]})
        }

    def _process_fred(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Process FRED example"""
        return {
            "series_id": example["series_id"],
            "date": example["date"],
            "value": example["value"],
            "economic_analysis": self._analyze_economic_data(example)
        }

    def _analyze_sentiment(self, sentence: str) -> Dict[str, Any]:
        """Return a coarse keyword-based polarity estimate for *sentence*.

        NOTE(review): this helper was referenced but not defined in the
        class; this is a minimal lexicon heuristic so FinancialPhraseBank
        examples can be processed. Replace with the intended analyzer.
        """
        tokens = re.findall(r"[a-z]+", sentence.lower())
        positive = sum(1 for token in tokens if token in self._POSITIVE_TERMS)
        negative = sum(1 for token in tokens if token in self._NEGATIVE_TERMS)
        if positive > negative:
            polarity = "positive"
        elif negative > positive:
            polarity = "negative"
        else:
            polarity = "neutral"
        return {
            "polarity": polarity,
            "positive_terms": positive,
            "negative_terms": negative
        }

    def _analyze_economic_data(self, example: Dict[str, Any]) -> Dict[str, Any]:
        """Return the observation value as a float plus a missing-value flag.

        NOTE(review): this helper was referenced but not defined in the
        class; this minimal version only parses the "value" field. A value
        that is not numeric (for example a placeholder such as ".") is
        reported as missing.
        """
        try:
            numeric_value = float(example["value"])
        except (TypeError, ValueError):
            numeric_value = None
        return {
            "series_id": example["series_id"],
            "numeric_value": numeric_value,
            "is_missing": numeric_value is None
        }

    def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str) -> Path:
        """Write *data* as JSON Lines to ``output_dir / filename``.

        Returns:
            The path of the written file.
        """
        filepath = self.output_dir / filename
        with jsonlines.open(filepath, mode='w') as writer:
            writer.write_all(data)
        self.logger.info(f"Saved data to {filepath}")
        return filepath

    def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
        """Log the first *count* processed examples as indented JSON."""
        self.logger.info("\nSample data:")
        for i, example in enumerate(data[:count]):
            self.logger.info(f"\nSample {i+1}:")
            # default=str keeps this debugging aid from crashing on values
            # json cannot encode (e.g. image objects or dates).
            self.logger.info(json.dumps(example, indent=2, default=str))

    def print_memory_usage(self):
        """Log the resident memory of the current process in MB.

        Requires the optional `psutil` package; when it is not installed a
        warning is logged instead.
        """
        try:
            # Imported here because the module does not import psutil at the top.
            import psutil
        except ImportError:
            self.logger.warning("psutil is not installed; memory usage unavailable")
            return
        process = psutil.Process()
        memory_info = process.memory_info()
        self.logger.info(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")
finance_expert/expert.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Finance Expert Module
3
+ """
4
+ from typing import Dict, Any, List
5
+
6
class FinanceExpert:
    """Expert stub for finance-related queries.

    Holds a fixed expert name and list of domains and answers every query
    with a canned response at a constant confidence.
    """

    def __init__(self):
        """Set the expert's name and the domains it covers."""
        self.name = "finance"
        self.domains = ["finance", "economics", "investment"]

    def handle_query(self, query: str, context: Dict[str, Any]) -> Dict[str, Any]:
        """Build the response payload for *query*.

        Args:
            query: The query text, echoed back in the response.
            context: Accepted for interface compatibility; not used.

        Returns:
            A dict with the keys 'response', 'confidence' (always 0.85) and
            'metadata' (containing this expert's domain list).
        """
        response_text = f"Finance expert response to: {query}"
        metadata = dict(domains=self.domains)
        return dict(response=response_text, confidence=0.85, metadata=metadata)

    def get_domains(self) -> List[str]:
        """Return the expert's domain list (the same list object it holds)."""
        domains = self.domains
        return domains
finance_expert/requirements.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ numpy>=1.21.0
2
+ pandas>=1.3.0
3
+ scikit-learn>=0.24.2
4
+ scipy>=1.7.0
5
+ statsmodels>=0.12.2
6
+ yfinance>=0.1.63
7
+ pandas-datareader>=0.10.0
8
+ quandl>=3.6.0
9
+ pandas-ta>=0.3.14b0
10
+ pandas-profiling>=3.0.0
11
+ plotly>=5.3.0
12
+ seaborn>=0.11.2
13
+ matplotlib>=3.4.3
14
+ scikit-learn>=0.24.2
15
+ xgboost>=1.5.0
16
+ lightgbm>=3.3.0
17
+ tensorflow>=2.6.0
18
+ keras>=2.6.0
19
+ prophet>=1.0.1
20
+ fastapi>=0.68.0
21
+ uvicorn>=0.15.0
22
+ python-dotenv>=0.19.0
23
+ requests>=2.26.0
24
+ beautifulsoup4>=4.9.3
25
+ lxml>=4.6.3
26
+ pytest>=6.2.5
27
+ black>=21.7b0
28
+ isort>=5.9.3
29
+ flake8>=4.0.1
30
+ mypy>=0.910
31
+ jupyter>=1.0.0
32
+ ipykernel>=6.4.0
33
+ notebook>=6.4.5
34
+ jupyterlab>=3.1.12
35
+ ipywidgets>=7.6.5
36
+ plotly>=5.3.0
37
+ seaborn>=0.11.2
38
+ matplotlib>=3.4.3
39
+ scikit-learn>=0.24.2
40
+ xgboost>=1.5.0
41
+ lightgbm>=3.3.0
42
+ tensorflow>=2.6.0
43
+ keras>=2.6.0
44
+ prophet>=1.0.1
45
+ fastapi>=0.68.0
46
+ uvicorn>=0.15.0
47
+ python-dotenv>=0.19.0
48
+ requests>=2.26.0
49
+ beautifulsoup4>=4.9.3
50
+ lxml>=4.6.3
51
+ pytest>=6.2.5
52
+ black>=21.7b0
53
+ isort>=5.9.3
54
+ flake8>=4.0.1
55
+ mypy>=0.910