Upload 6 files

Browse files

Files changed (6) hide show

finance_expert/__pycache__/config.cpython-312.pyc +0 -0
finance_expert/__pycache__/expert.cpython-312.pyc +0 -0
finance_expert/config.py +182 -0
finance_expert/data_processor.py +291 -0
finance_expert/expert.py +19 -0
finance_expert/requirements.txt +55 -0

finance_expert/__pycache__/config.cpython-312.pyc ADDED Viewed

Binary file (4.06 kB). View file

finance_expert/__pycache__/expert.cpython-312.pyc ADDED Viewed

Binary file (1.26 kB). View file

finance_expert/config.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""
+Configuration for the Finance Expert model
+"""
+# Core financial domains
+FINANCE_DOMAINS = {
+    "corporate_finance": {
+        "level": "expert",
+        "focus": [
+            "DCF valuation",
+            "LBO modeling",
+            "M&A analysis",
+            "financial_statement_analysis",
+            "ratio_analysis"
+        ]
+    },
+    "investment_analysis": {
+        "level": "expert",
+        "focus": [
+            "portfolio_theory",
+            "CAPM",
+            "risk_return_models",
+            "asset_allocation"
+        ]
+    },
+    "financial_instruments": {
+        "level": "expert",
+        "focus": [
+            "bonds",
+            "derivatives",
+            "structured_products",
+            "interest_rate_products"
+        ]
+    },
+    "quantitative_finance": {
+        "level": "expert",
+        "focus": [
+            "time_series_analysis",
+            "option_pricing",
+            "risk_modeling",
+            "statistical_arbitrage"
+        ]
+    },
+    "macroeconomics": {
+        "level": "expert",
+        "focus": [
+            "interest_rate_models",
+            "inflation_models",
+            "FX_markets",
+            "policy_analysis"
+        ]
+    },
+    "alternative_investments": {
+        "level": "expert",
+        "focus": [
+            "real_estate_finance",
+            "private_equity",
+            "hedge_fund_strategies",
+            "structured_credit"
+        ]
+    }
+}
+# Core financial tasks
+FINANCE_TASKS = {
+    "model_building": {
+        "level": "expert",
+        "subtasks": [
+            "DCF_modeling",
+            "LBO_modeling",
+            "M&A_modeling",
+            "valuation_modeling"
+        ]
+    },
+    "forecasting": {
+        "level": "expert",
+        "subtasks": [
+            "financial_statement_forecasting",
+            "economic_indicator_forecasting",
+            "market_forecasting",
+            "risk_forecasting"
+        ]
+    },
+    "statement_analysis": {
+        "level": "expert",
+        "subtasks": [
+            "balance_sheet_analysis",
+            "income_statement_analysis",
+            "cash_flow_statement_analysis",
+            "ratio_analysis"
+        ]
+    },
+    "risk_assessment": {
+        "level": "expert",
+        "subtasks": [
+            "market_risk_analysis",
+            "credit_risk_analysis",
+            "operational_risk_analysis",
+            "liquidity_risk_analysis"
+        ]
+    },
+    "reporting": {
+        "level": "expert",
+        "subtasks": [
+            "financial_reporting",
+            "investment_reporting",
+            "risk_reporting",
+            "performance_reporting"
+        ]
+    },
+    "portfolio_optimization": {
+        "level": "expert",
+        "subtasks": [
+            "asset_allocation",
+            "risk_management",
+            "performance_optimization",
+            "tax_efficiency"
+        ]
+    }
+}
+# Core datasets
+FINANCE_DATASETS = {
+    "FinQA": {
+        "source": "finqa/finqa",
+        "split": "train",
+        "fields": ["question", "table", "answer", "program"],
+        "description": "Financial question answering dataset",
+        "tasks": ["financial_qa", "table_analysis", "calculation"]
+    },
+    "TAT-QA": {
+        "source": "tatqa/tatqa",
+        "split": "train",
+        "fields": ["passage", "question", "answer", "scale", "type"],
+        "description": "Financial table question answering",
+        "tasks": ["table_qa", "calculation", "financial_analysis"]
+    },
+    "DocVQA": {
+        "source": "docvqa/docvqa",
+        "split": "train",
+        "fields": ["question", "image", "answer", "type"],
+        "description": "Document understanding and VQA",
+        "tasks": ["document_analysis", "financial_reading", "information_extraction"]
+    },
+    "FinancialPhraseBank": {
+        "source": "financial_phrasebank/financial_phrasebank",
+        "split": "train",
+        "fields": ["sentence", "label"],
+        "description": "Financial sentiment analysis",
+        "tasks": ["sentiment_analysis", "financial_language_processing"]
+    },
+    "SECFilings": {
+        "source": "sec_filings/sec_filings",
+        "split": "train",
+        "fields": ["company", "filing_type", "content", "date"],
+        "description": "SEC filings data",
+        "tasks": ["document_analysis", "financial_reporting", "company_analysis"]
+    },
+    "FRED": {
+        "source": "fred/fred",
+        "split": "train",
+        "fields": ["series_id", "date", "value"],
+        "description": "Federal Reserve Economic Data",
+        "tasks": ["economic_analysis", "time_series_analysis", "forecasting"]
+    }
+}
+# Print configuration summary
+def print_config_summary():
+    print("\nFinance Expert Configuration Summary:")
+    print(f"Number of domains: {len(FINANCE_DOMAINS)}")
+    print(f"Number of tasks: {len(FINANCE_TASKS)}")
+    print(f"Number of datasets: {len(FINANCE_DATASETS)}")
+    print("\nDataset Details:")
+    for name, config in FINANCE_DATASETS.items():
+        print(f"\n{name}:")
+        print(f"Description: {config['description']}")
+        print(f"Tasks: {', '.join(config['tasks'])}")
+        print(f"Fields: {', '.join(config['fields'])}")
+# When this module is executed as a script, print the configuration summary.
+if __name__ == "__main__":
+    print_config_summary()

finance_expert/data_processor.py ADDED Viewed

	@@ -0,0 +1,291 @@

+"""
+Data processing utilities for the Finance Expert model
+"""
+import json
+import os
+from pathlib import Path
+import jsonlines
+from typing import Dict, List, Any, Optional, Tuple
+import hashlib
+import datetime
+import logging
+import numpy as np
+import pandas as pd
+from datasets import Dataset
+from tqdm import tqdm
+import re
+from dateutil.parser import parse as date_parse
+from decimal import Decimal, ROUND_HALF_UP
+class FinanceDataProcessor:
+    def __init__(self, output_dir: str = "processed_data"):
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(exist_ok=True)
+        self.logger = self._setup_logger()
+    def _setup_logger(self) -> logging.Logger:
+        """Setup logging specific to finance data processing"""
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+        handler = logging.StreamHandler()
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+        return logger
+    def process_financial_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Process and normalize financial data"""
+        try:
+            # Handle different data types
+            processed = self._normalize_data(data)
+            # Extract financial metrics
+            metrics = self._extract_financial_metrics(processed)
+            # Validate financial data
+            validation = self._validate_financial_data(processed)
+            # Generate financial ratios
+            ratios = self._calculate_financial_ratios(processed)
+            return {
+                "processed_data": processed,
+                "metrics": metrics,
+                "validation": validation,
+                "ratios": ratios
+            }
+        except Exception as e:
+            self.logger.warning(f"Error processing financial data: {str(e)}")
+            return {"error": str(e)}
+    def _normalize_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Normalize financial data types and formats"""
+        normalized = {}
+        for key, value in data.items():
+            if isinstance(value, str):
+                # Handle currency and number formatting
+                if any(c in value for c in ["$", "€", "£", "¥"]):
+                    normalized[key] = self._normalize_currency(value)
+                elif value.isdigit():
+                    normalized[key] = int(value)
+                elif self._is_float(value):
+                    normalized[key] = float(value)
+                else:
+                    normalized[key] = value.strip()
+            elif isinstance(value, (int, float)):
+                normalized[key] = value
+            elif isinstance(value, dict):
+                normalized[key] = self._normalize_data(value)
+            elif isinstance(value, list):
+                normalized[key] = [self._normalize_data(item) if isinstance(item, dict) else item for item in value]
+            else:
+                normalized[key] = value
+        return normalized
+    def _normalize_currency(self, value: str) -> float:
+        """Convert currency strings to standardized format"""
+        try:
+            # Remove currency symbols and commas
+            value = re.sub(r'[\$€£¥,]', '', value)
+            # Handle negative numbers
+            value = value.replace('(', '').replace(')', '')
+            # Convert to float with proper decimal places
+            return float(value)
+        except:
+            return 0.0
+    def _is_float(self, value: str) -> bool:
+        """Check if string can be converted to float"""
+        try:
+            float(value)
+            return True
+        except ValueError:
+            return False
+    def _extract_financial_metrics(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract key financial metrics"""
+        metrics = {
+            "revenue": self._get_metric(data, "revenue", "income", "sales"),
+            "expenses": self._get_metric(data, "expenses", "costs"),
+            "profit": self._get_metric(data, "profit", "net_income"),
+            "assets": self._get_metric(data, "assets", "total_assets"),
+            "liabilities": self._get_metric(data, "liabilities", "total_liabilities"),
+            "equity": self._get_metric(data, "equity", "shareholders_equity")
+        }
+        return metrics
+    def _get_metric(self, data: Dict[str, Any], *keys: str) -> float:
+        """Get metric value from various possible keys"""
+        for key in keys:
+            if key in data:
+                return self._normalize_currency(str(data[key]))
+        return 0.0
+    def _validate_financial_data(self, data: Dict[str, Any]) -> Dict[str, bool]:
+        """Validate financial data consistency"""
+        validation = {
+            "balance_sheet_consistency": self._check_balance_sheet(data),
+            "income_statement_consistency": self._check_income_statement(data),
+            "cash_flow_consistency": self._check_cash_flow(data)
+        }
+        return validation
+    def _check_balance_sheet(self, data: Dict[str, Any]) -> bool:
+        """Check balance sheet consistency"""
+        assets = self._get_metric(data, "assets", "total_assets")
+        liabilities = self._get_metric(data, "liabilities", "total_liabilities")
+        equity = self._get_metric(data, "equity", "shareholders_equity")
+        return abs(assets - (liabilities + equity)) < 1e-6
+    def _check_income_statement(self, data: Dict[str, Any]) -> bool:
+        """Check income statement consistency"""
+        revenue = self._get_metric(data, "revenue", "income", "sales")
+        expenses = self._get_metric(data, "expenses", "costs")
+        profit = self._get_metric(data, "profit", "net_income")
+        return abs(profit - (revenue - expenses)) < 1e-6
+    def _check_cash_flow(self, data: Dict[str, Any]) -> bool:
+        """Check cash flow statement consistency"""
+        operating = self._get_metric(data, "operating_cash_flow")
+        investing = self._get_metric(data, "investing_cash_flow")
+        financing = self._get_metric(data, "financing_cash_flow")
+        net_change = self._get_metric(data, "net_change_in_cash")
+        return abs(net_change - (operating + investing + financing)) < 1e-6
+    def _calculate_financial_ratios(self, data: Dict[str, Any]) -> Dict[str, float]:
+        """Calculate key financial ratios"""
+        try:
+            metrics = self._extract_financial_metrics(data)
+            ratios = {
+                "current_ratio": metrics["assets"] / metrics["liabilities"] if metrics["liabilities"] != 0 else float('inf'),
+                "debt_to_equity": metrics["liabilities"] / metrics["equity"] if metrics["equity"] != 0 else float('inf'),
+                "profit_margin": metrics["profit"] / metrics["revenue"] if metrics["revenue"] != 0 else 0.0,
+                "return_on_equity": metrics["profit"] / metrics["equity"] if metrics["equity"] != 0 else 0.0,
+                "return_on_assets": metrics["profit"] / metrics["assets"] if metrics["assets"] != 0 else 0.0
+            }
+            return ratios
+        except ZeroDivisionError:
+            return {"error": "Division by zero in ratio calculation"}
+    def process_dataset(self, dataset: Dataset, dataset_name: str) -> List[Dict[str, Any]]:
+        """Process a complete financial dataset"""
+        processed = []
+        error_count = 0
+        self.logger.info(f"Processing {dataset_name} dataset with {len(dataset)} samples")
+        for idx, example in enumerate(tqdm(dataset, desc=f"Processing {dataset_name}")):
+            try:
+                processed_example = self._process_example(example, dataset_name)
+                processed.append(processed_example)
+            except Exception as e:
+                error_count += 1
+                self.logger.error(f"Error processing example {idx} in {dataset_name}: {str(e)}")
+        self.logger.info(f"Processed {len(processed)} examples")
+        self.logger.info(f"Encountered {error_count} errors")
+        return processed
+    def _process_example(self, example: Dict[str, Any], dataset_name: str) -> Dict[str, Any]:
+        """Process a single example based on dataset type"""
+        if dataset_name == "FinQA":
+            return self._process_finqa(example)
+        elif dataset_name == "TAT-QA":
+            return self._process_tat_qa(example)
+        elif dataset_name == "DocVQA":
+            return self._process_docvqa(example)
+        elif dataset_name == "FinancialPhraseBank":
+            return self._process_phrasebank(example)
+        elif dataset_name == "SECFilings":
+            return self._process_sec_filings(example)
+        elif dataset_name == "FRED":
+            return self._process_fred(example)
+        else:
+            raise ValueError(f"Unknown dataset: {dataset_name}")
+    def _process_finqa(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process FinQA example"""
+        return {
+            "question": example["question"].strip(),
+            "table": example["table"],
+            "answer": example["answer"],
+            "program": example["program"],
+            "data_analysis": self.process_financial_data(example["table"])
+        }
+    def _process_tat_qa(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process TAT-QA example"""
+        return {
+            "passage": example["passage"].strip(),
+            "question": example["question"].strip(),
+            "answer": example["answer"],
+            "scale": example["scale"],
+            "type": example["type"],
+            "data_analysis": self.process_financial_data({"passage": example["passage"]})
+        }
+    def _process_docvqa(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process DocVQA example"""
+        return {
+            "question": example["question"].strip(),
+            "image": example["image"],
+            "answer": example["answer"],
+            "type": example["type"],
+            "data_analysis": self.process_financial_data({"answer": example["answer"]})
+        }
+    def _process_phrasebank(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process FinancialPhraseBank example"""
+        return {
+            "sentence": example["sentence"].strip(),
+            "label": example["label"],
+            "sentiment_analysis": self._analyze_sentiment(example["sentence"])  # Reuse sentiment analysis
+        }
+    def _process_sec_filings(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process SEC filings example"""
+        return {
+            "company": example["company"].strip(),
+            "filing_type": example["filing_type"],
+            "content": example["content"],
+            "date": example["date"],
+            "financial_analysis": self.process_financial_data({"content": example["content"]})
+        }
+    def _process_fred(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Process FRED example"""
+        return {
+            "series_id": example["series_id"],
+            "date": example["date"],
+            "value": example["value"],
+            "economic_analysis": self._analyze_economic_data(example)
+        }
+    def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str) -> Path:
+        """Save processed data to JSONL file"""
+        filepath = self.output_dir / filename
+        with jsonlines.open(filepath, mode='w') as writer:
+            writer.write_all(data)
+        self.logger.info(f"Saved data to {filepath}")
+        return filepath
+    def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
+        """Print sample of processed data"""
+        self.logger.info("\nSample data:")
+        for i, example in enumerate(data[:count]):
+            self.logger.info(f"\nSample {i+1}:")
+            self.logger.info(json.dumps(example, indent=2))
+    def print_memory_usage(self):
+        """Print current memory usage"""
+        process = psutil.Process()
+        memory_info = process.memory_info()
+        self.logger.info(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")

finance_expert/expert.py ADDED Viewed

	@@ -0,0 +1,19 @@

+"""
+Finance Expert Module
+"""
+from typing import Dict, Any, List
+class FinanceExpert:
+    def __init__(self):
+        self.name = "finance"
+        self.domains = ["finance", "economics", "investment"]
+    def handle_query(self, query: str, context: Dict[str, Any]) -> Dict[str, Any]:
+        return {
+            'response': f"Finance expert response to: {query}",
+            'confidence': 0.85,
+            'metadata': {'domains': self.domains}
+        }
+    def get_domains(self) -> List[str]:
+        return self.domains

finance_expert/requirements.txt ADDED Viewed

	@@ -0,0 +1,55 @@

+numpy>=1.21.0
+pandas>=1.3.0
+scikit-learn>=0.24.2
+scipy>=1.7.0
+statsmodels>=0.12.2
+yfinance>=0.1.63
+pandas-datareader>=0.10.0
+quandl>=3.6.0
+pandas-ta>=0.3.14b0
+pandas-profiling>=3.0.0
+plotly>=5.3.0
+seaborn>=0.11.2
+matplotlib>=3.4.3
+xgboost>=1.5.0
+lightgbm>=3.3.0
+tensorflow>=2.6.0
+keras>=2.6.0
+prophet>=1.0.1
+fastapi>=0.68.0
+uvicorn>=0.15.0
+python-dotenv>=0.19.0
+requests>=2.26.0
+beautifulsoup4>=4.9.3
+lxml>=4.6.3
+pytest>=6.2.5
+black>=21.7b0
+isort>=5.9.3
+flake8>=4.0.1
+mypy>=0.910
+jupyter>=1.0.0
+ipykernel>=6.4.0
+notebook>=6.4.5
+jupyterlab>=3.1.12
+ipywidgets>=7.6.5
+# Used by data_processor.py but previously not listed (versions unpinned)
+jsonlines
+datasets
+tqdm
+python-dateutil
+psutil