""" Comprehensive Santiment Data Fetcher ==================================== This module provides a complete data fetcher for the Santiment API using the sanpy library. It maximizes data retrieval by organizing metrics into categories and providing batch operations. Features: - Fetches all available metrics organized by category - Supports batch operations for efficient API usage - Handles rate limiting and error management - Provides data export capabilities - Supports both single asset and multi-asset queries - Includes SQL query execution for custom data needs Author: AI Assistant Version: 1.0.0 """ import san import pandas as pd import numpy as np import time import logging from datetime import datetime, timedelta from typing import List, Dict, Optional, Union, Any import json import os from dataclasses import dataclass, field from concurrent.futures import ThreadPoolExecutor, as_completed # Load environment variables try: from dotenv import load_dotenv load_dotenv() except ImportError: pass # dotenv not available, continue without it import warnings # Configure logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) # Resolve data directory base try: from src.config import DATA_DIR as CFG_DATA_DIR except Exception: try: from config import DATA_DIR as CFG_DATA_DIR except Exception: CFG_DATA_DIR = "/data" from pathlib import Path def _resolve_under_data(path_like: str | os.PathLike) -> str: p = Path(path_like) if p.is_absolute(): return str(p) parts = p.parts if parts and parts[0].lower() == "data": rel = Path(*parts[1:]) if len(parts) > 1 else Path() else: rel = p return str(Path(CFG_DATA_DIR) / rel) @dataclass class FetchConfig: """Configuration class for data fetching parameters - OPTIMIZED FOR API CONSERVATION""" from_date: str = "2024-01-01" # Reduced from 2020 to save API calls to_date: str = "utc_now" interval: str = "1d" include_incomplete_data: bool = False batch_size: int = 25 # Reduced from 50 to save API calls max_workers: int = 5 # Reduced from 10 to save API calls rate_limit_delay: int = 60 export_format: str = "parquet" # csv, json, parquet export_directory: str = "data/santiment" class SantimentDataFetcher: """ Comprehensive Santiment Data Fetcher This class provides methods to fetch maximum possible data from Santiment API using the sanpy library with efficient batch operations and error handling. 
""" def __init__(self, api_key: Optional[str] = None, config: Optional[FetchConfig] = None): """ Initialize the Santiment Data Fetcher Args: api_key: Santiment API key(s) for accessing restricted data (comma-separated for multiple keys) config: FetchConfig object with fetching parameters """ self.config = config or FetchConfig() self._normalize_dates() # Set up multiple API keys self._setup_api_keys(api_key) # Resolve export directory under DATA_DIR, create and clean up existing files self.config.export_directory = _resolve_under_data(self.config.export_directory) os.makedirs(self.config.export_directory, exist_ok=True) self._cleanup_existing_files() # Initialize data storage self.fetched_data: Dict[str, pd.DataFrame] = {} self.failed_queries: List[Dict] = [] # Define comprehensive metric categories self.metric_categories = self._define_metric_categories() # Get available metrics and projects self._initialize_metadata() # Initialize symbol normalization self.symbol_normalizer = self._setup_symbol_normalizer() def _setup_symbol_normalizer(self): """ Set up symbol normalization mapping for consistent asset identification Returns: Dictionary mapping various symbol formats to canonical slugs """ # Canonical mapping for major crypto assets # Maps various symbols/names to the official Santiment slug symbol_mapping = { # Bitcoin variants 'bitcoin': 'bitcoin', 'btc': 'bitcoin', 'Bitcoin': 'bitcoin', 'BTC': 'bitcoin', # Ethereum variants 'ethereum': 'ethereum', 'eth': 'ethereum', 'Ethereum': 'ethereum', 'ETH': 'ethereum', # Ripple/XRP variants 'ripple': 'ripple', 'xrp': 'ripple', 'Ripple': 'ripple', 'XRP': 'ripple', # Solana variants 'solana': 'solana', 'sol': 'solana', 'Solana': 'solana', 'SOL': 'solana', # Cardano variants 'cardano': 'cardano', 'ada': 'cardano', 'Cardano': 'cardano', 'ADA': 'cardano', # Polkadot variants 'polkadot': 'polkadot', 'dot': 'polkadot', 'Polkadot': 'polkadot', 'DOT': 'polkadot', # Chainlink variants 'chainlink': 'chainlink', 'link': 'chainlink', 'Chainlink': 'chainlink', 'LINK': 'chainlink', # Litecoin variants 'litecoin': 'litecoin', 'ltc': 'litecoin', 'Litecoin': 'litecoin', 'LTC': 'litecoin', # Bitcoin Cash variants 'bitcoin-cash': 'bitcoin-cash', 'bch': 'bitcoin-cash', 'Bitcoin Cash': 'bitcoin-cash', 'BCH': 'bitcoin-cash', # Stellar variants 'stellar': 'stellar', 'xlm': 'stellar', 'Stellar': 'stellar', 'XLM': 'stellar', # Ethereum Classic variants 'ethereum-classic': 'ethereum-classic', 'etc': 'ethereum-classic', 'Ethereum Classic': 'ethereum-classic', 'ETC': 'ethereum-classic', # EOS variants 'eos': 'eos', 'EOS': 'eos', } logger.info(f"Initialized symbol normalizer with {len(symbol_mapping)} mappings") return symbol_mapping def normalize_symbol(self, symbol: str) -> str: """ Normalize a symbol to its canonical Santiment slug Args: symbol: Symbol to normalize Returns: Canonical slug """ if symbol in self.symbol_normalizer: canonical = self.symbol_normalizer[symbol] if symbol != canonical: logger.debug(f"Normalized '{symbol}' -> '{canonical}'") return canonical # If not found in mapping, return as-is but log warning logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping") return symbol.lower() def get_symbol_alternatives(self, symbol: str) -> List[str]: """ Get all alternative symbols for a given symbol (both directions) Args: symbol: Symbol to find alternatives for Returns: List of alternative symbols including the original """ alternatives = [symbol] # Create reverse mapping to find alternatives reverse_mapping = {} for variant, canonical in 
self.symbol_normalizer.items(): if canonical not in reverse_mapping: reverse_mapping[canonical] = [] reverse_mapping[canonical].append(variant) # If symbol is a canonical, get all its variants if symbol in reverse_mapping: alternatives.extend(reverse_mapping[symbol]) # If symbol is a variant, get the canonical and other variants canonical = self.normalize_symbol(symbol) if canonical in reverse_mapping: alternatives.extend(reverse_mapping[canonical]) # Remove duplicates and return return list(set(alternatives)) def fetch_single_metric_with_alternatives(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: """ Fetch a single metric for a single asset, trying alternative symbols if the primary fails Args: metric: The metric name slug: The asset slug (will try alternatives if this fails) **kwargs: Additional parameters for the API call Returns: DataFrame with the metric data or None if failed """ # Get all alternative symbols to try alternatives = self.get_symbol_alternatives(slug) logger.debug(f"Trying alternatives for {slug}: {alternatives}") # Try each alternative in order (start with the normalized canonical form) canonical = self.normalize_symbol(slug) if canonical != slug: alternatives = [canonical] + [alt for alt in alternatives if alt != canonical] for i, alt_slug in enumerate(alternatives): try: data = self.fetch_single_metric(metric, alt_slug, **kwargs) if data is not None and not data.empty: if i > 0 or alt_slug != slug: # Successfully fetched with alternative logger.info(f"[ALT_SUCCESS] {metric} for {slug} succeeded using alternative '{alt_slug}'") # Update slug column to reflect the original requested slug for consistency data['slug'] = slug data['alternative_slug_used'] = alt_slug return data except Exception as e: error_msg = str(e) # Check if this is a metric-level error that won't be fixed by trying other slugs if any(skip_phrase in error_msg.lower() for skip_phrase in [ 'not supported for', 'not implemented for', 'outside the allowed interval', 'upgrade to a higher tier' ]): logger.warning(f"[METRIC_SKIP] {metric} has fundamental issues, skipping all alternatives: {error_msg}") break # Don't try other alternatives for this metric # If it's just a slug issue, continue trying alternatives if 'is not an existing slug' in error_msg.lower(): logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") continue else: logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") continue logger.warning(f"[ALT_FAILED] All alternatives failed for {metric} with slug {slug}") return None def normalize_slug_list(self, slugs: List[str]) -> List[str]: """ Normalize a list of slugs and remove duplicates Args: slugs: List of slugs to normalize Returns: List of normalized, deduplicated slugs """ normalized = [] seen = set() for slug in slugs: canonical = self.normalize_symbol(slug) if canonical not in seen: normalized.append(canonical) seen.add(canonical) else: logger.debug(f"Removed duplicate slug: {slug} (canonical: {canonical})") logger.info(f"Normalized {len(slugs)} slugs to {len(normalized)} unique canonical slugs") return normalized def _normalize_dates(self): """ Convert relative date strings in self.config.from_date / to_date into absolute YYYY-MM-DD dates that Sanpy can parse. Supports: - "ND" (e.g. "30d") → today minus N days - "utc_now" → today """ now = datetime.utcnow() # from_date: e.g. 
"30d" fd = self.config.from_date.strip().lower() if fd.endswith('d') and fd[:-1].isdigit(): days = int(fd[:-1]) from_dt = now - timedelta(days=days) # Sanpy expects "YYYY-MM-DD" self.config.from_date = from_dt.strftime('%Y-%m-%d') # to_date: sometimes set to "utc_now" td = self.config.to_date.strip().lower() if td == 'utc_now': self.config.to_date = now.strftime('%Y-%m-%d') def _setup_api_keys(self, api_key: Optional[str] = None): """ Set up multiple API keys for rate limit handling Args: api_key: API key(s) - can be comma-separated for multiple keys """ # Parse API keys from parameter or environment api_key_string = api_key or os.getenv('SANTIMENT_API_KEY') if api_key_string: # Support comma-separated API keys self.api_keys = [key.strip() for key in api_key_string.split(',') if key.strip()] logger.info(f"Santiment fetcher initialized with {len(self.api_keys)} API key(s)") # Check if all keys are from the same account if len(self.api_keys) > 1: logger.info("Multiple API keys detected. Testing key diversity...") self._validate_api_key_diversity() else: self.api_keys = [] logger.warning("No API key provided - limited to free tier data") # Initialize API key management self.current_key_index = 0 self.rate_limit_switches = 0 # Set initial API key if self.api_keys: self._set_current_api_key() def _validate_api_key_diversity(self): """ Validate that API keys are from different accounts for effective rate limit handling """ try: user_ids = set() functional_keys = 0 rate_limited_keys = 0 for i, key in enumerate(self.api_keys[:3]): # Test only first 3 to avoid exhausting quota # Temporarily set this key san.ApiConfig.api_key = key try: # Make a simple query to get user info result = san.execute_sql(query="SELECT 1", set_index=None) # If successful, key is functional but we can't determine user ID without error functional_keys += 1 logger.info(f"API Key #{i+1}: {key[:8]}... appears functional") except Exception as e: error_str = str(e) if 'user with id' in error_str: # Extract user ID from error message import re match = re.search(r'user with id (\d+)', error_str) if match: user_id = match.group(1) user_ids.add(user_id) rate_limited_keys += 1 logger.info(f"API Key #{i+1}: {key[:8]}... belongs to user ID {user_id} (rate limited)") else: logger.debug(f"API Key #{i+1}: {key[:8]}... - {error_str}") # Reset to first key self.current_key_index = 0 self._set_current_api_key() # Analyze results if rate_limited_keys > 0 and len(user_ids) == 1: if functional_keys > 0: logger.warning("⚠️ WARNING: Cannot determine if all API keys are from different accounts!") logger.warning(f"⚠️ {rate_limited_keys} key(s) belong to user ID {list(user_ids)[0]}, {functional_keys} key(s) appear functional") logger.warning("⚠️ If functional keys are from the same account, rate limit switching won't work.") logger.warning("⚠️ For guaranteed effective rate limiting, use API keys from different Santiment accounts.") logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") else: logger.warning("⚠️ WARNING: All tested API keys belong to the same Santiment account!") logger.warning("⚠️ Rate limits are applied per account, not per key.") logger.warning("⚠️ API key switching will not be effective with same-account keys.") logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") elif len(user_ids) > 1: logger.info(f"✅ Good! 
API keys are from {len(user_ids)} different accounts.") logger.info("✅ This will provide effective rate limit distribution.") elif functional_keys == len(self.api_keys): logger.info("✅ All API keys appear functional.") logger.info("ℹ️ Cannot determine account diversity without rate limit errors.") logger.info("ℹ️ Monitor rate limit switches during operation to verify effectiveness.") except Exception as e: logger.debug(f"Could not validate API key diversity: {e}") logger.info("API key diversity validation skipped - continuing with provided keys") def _set_current_api_key(self): """Set the current API key in san.ApiConfig""" if self.api_keys: current_key = self.api_keys[self.current_key_index] san.ApiConfig.api_key = current_key logger.info(f"Using API key #{self.current_key_index + 1}: {current_key[:8]}...") else: san.ApiConfig.api_key = None def _switch_api_key(self): """Switch to the next available API key""" if len(self.api_keys) <= 1: logger.warning("Only one or no API keys available, cannot switch") return False old_index = self.current_key_index self.current_key_index = (self.current_key_index + 1) % len(self.api_keys) self.rate_limit_switches += 1 logger.info(f"[SWITCH] Switching from API key #{old_index + 1} to #{self.current_key_index + 1} (switch #{self.rate_limit_switches})") # Warn if switching too frequently (indicates same account issue) if self.rate_limit_switches > len(self.api_keys) * 2: logger.warning("⚠️ High number of API key switches detected!") logger.warning("⚠️ This suggests all keys may be from the same account.") logger.warning("⚠️ Consider using API keys from different Santiment accounts.") # Set new API key self._set_current_api_key() # Add a delay after switching keys time.sleep(2.0) return True def _is_rate_limit_error(self, error_message): """Check if the error indicates a rate limit issue""" rate_limit_indicators = [ "429", "rate limit", "too many requests", "api limit", "quota exceeded", "limit exceeded", "rate_limit_exception", "API Rate Limit Reached", "rate limit reached" ] error_str = str(error_message).lower() return any(indicator in error_str for indicator in rate_limit_indicators) def _cleanup_existing_files(self): """ Clean up all existing files in the export directory before starting a new fetch. This prevents accumulation of old data files from previous runs. 
""" import glob import shutil if not os.path.exists(self.config.export_directory): return try: # Get all files in the export directory all_files = glob.glob(os.path.join(self.config.export_directory, "*")) if all_files: logger.info(f"Cleaning up {len(all_files)} existing files in {self.config.export_directory}") for file_path in all_files: try: if os.path.isfile(file_path): os.remove(file_path) logger.debug(f"Removed file: {os.path.basename(file_path)}") elif os.path.isdir(file_path): shutil.rmtree(file_path) logger.debug(f"Removed directory: {os.path.basename(file_path)}") except Exception as e: logger.warning(f"Failed to remove {file_path}: {e}") logger.info(f"Successfully cleaned up export directory: {self.config.export_directory}") else: logger.info(f"Export directory is already clean: {self.config.export_directory}") except Exception as e: logger.error(f"Failed to cleanup export directory {self.config.export_directory}: {e}") # Don't raise the exception - just log it and continue def _define_metric_categories(self) -> Dict[str, List[str]]: """Define REDUCED categories of Santiment metrics for API conservation.""" return { # Essential Financial Metrics Only 'financial': [ 'price_usd', 'marketcap_usd', 'volume_usd' # Reduced from 12 to 3 most important metrics ], # Core Network Activity 'network_activity': [ 'daily_active_addresses', 'new_addresses' # Reduced from 9 to 2 most important metrics ], # Basic Transaction Metrics 'transactions': [ 'transaction_count', 'transaction_volume_usd' # Reduced from 8 to 2 most important metrics ], # Essential Exchange Metrics 'exchange': [ 'exchange_inflow', 'exchange_outflow' # Reduced from 8 to 2 most important metrics ] # Removed: supply, development, social, derivatives, whales # This reduces API calls by ~70% while keeping core metrics } def _initialize_metadata(self): """Initialize metadata about available metrics and projects""" try: logger.info("Fetching available metrics...") self.available_metrics = san.available_metrics() logger.info(f"Found {len(self.available_metrics)} available metrics") logger.info("Fetching available projects...") self.projects_df = san.get("projects/all") self.available_slugs = self.projects_df['slug'].tolist() logger.info(f"Found {len(self.available_slugs)} available projects") except Exception as e: logger.error(f"Failed to initialize metadata: {e}") self.available_metrics = [] self.available_slugs = [] def get_metric_metadata(self, metric: str) -> Dict[str, Any]: """ Get metadata for a specific metric Args: metric: The metric name Returns: Dictionary containing metric metadata """ try: metadata = san.metadata( metric, arr=["availableSlugs", "defaultAggregation", "humanReadableName", "isAccessible", "isRestricted", "restrictedFrom", "restrictedTo"] ) return metadata except Exception as e: logger.warning(f"Failed to get metadata for {metric}: {e}") return {} def fetch_single_metric(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: """ Fetch a single metric for a single asset Args: metric: The metric name slug: The asset slug **kwargs: Additional parameters for the API call Returns: DataFrame with the metric data or None if failed """ max_retries = len(self.api_keys) if self.api_keys else 1 keys_tried = set() for attempt in range(max_retries): try: # If we've tried all keys, reset and wait if len(keys_tried) >= len(self.api_keys) and self.api_keys: logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") time.sleep(30) keys_tried.clear() 
self.current_key_index = 0 self._set_current_api_key() params = { 'slug': slug, 'from_date': kwargs.get('from_date', self.config.from_date), 'to_date': kwargs.get('to_date', self.config.to_date), 'interval': kwargs.get('interval', self.config.interval), 'include_incomplete_data': kwargs.get('include_incomplete_data', self.config.include_incomplete_data) } # Add any additional selector parameters if 'selector' in kwargs: params['selector'] = kwargs['selector'] data = san.get(metric, **params) if data is not None and not data.empty: # Add metadata columns data['metric'] = metric data['slug'] = slug if attempt > 0: logger.info(f"[SUCCESS] {metric} for {slug} succeeded on attempt {attempt + 1}") return data except Exception as e: error_msg = str(e) keys_tried.add(self.current_key_index) # Check if it's a rate limit error if self._is_rate_limit_error(error_msg) and self.api_keys: logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") # Check if we've tried all keys if len(keys_tried) >= len(self.api_keys): logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") break # Exit retry loop since all keys are exhausted # Try to switch to next API key if self._switch_api_key(): continue # Retry with new API key else: logger.error("No more API keys available for switching") # Handle rate limit with san library specific check if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): if hasattr(san, 'rate_limit_time_left'): rate_limit_seconds = san.rate_limit_time_left(e) logger.warning(f"Santiment rate limit hit. Sleeping for {rate_limit_seconds} seconds") time.sleep(rate_limit_seconds) else: # Try switching API key if available if self.api_keys and self._switch_api_key(): continue else: time.sleep(60) # Default wait else: # Check for specific error types that mean we should skip this metric entirely if any(skip_phrase in error_msg.lower() for skip_phrase in [ 'not supported for', 'is not an existing slug', 'not implemented for', 'missing_contract', 'outside the allowed interval', 'upgrade to a higher tier' ]): logger.warning(f"[SKIP] {metric} for {slug} - {error_msg}") return None # Skip this metric/slug combination entirely logger.error(f"Failed to fetch {metric} for {slug}: {error_msg}") error_info = { 'metric': metric, 'slug': slug, 'error': error_msg, 'timestamp': datetime.now().isoformat(), 'api_key_index': self.current_key_index } self.failed_queries.append(error_info) return None def fetch_multi_asset_metric(self, metric: str, slugs: List[str], **kwargs) -> Optional[pd.DataFrame]: """ Fetch a single metric for multiple assets using get_many Args: metric: The metric name slugs: List of asset slugs **kwargs: Additional parameters for the API call Returns: DataFrame with the metric data or None if failed """ max_retries = len(self.api_keys) if self.api_keys else 1 keys_tried = set() for attempt in range(max_retries): try: # If we've tried all keys, reset and wait if len(keys_tried) >= len(self.api_keys) and self.api_keys: logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") time.sleep(30) keys_tried.clear() self.current_key_index = 0 self._set_current_api_key() params = { 'slugs': slugs, 'from_date': kwargs.get('from_date', self.config.from_date), 'to_date': kwargs.get('to_date', self.config.to_date), 'interval': kwargs.get('interval', self.config.interval), 'include_incomplete_data': kwargs.get('include_incomplete_data', 
self.config.include_incomplete_data) } data = san.get_many(metric, **params) if data is not None and not data.empty: # Reshape data for consistent format data_melted = data.reset_index().melt( id_vars=['datetime'], var_name='slug', value_name='value' ) data_melted['metric'] = metric data_melted.set_index('datetime', inplace=True) if attempt > 0: logger.info(f"[SUCCESS] {metric} for multiple assets succeeded on attempt {attempt + 1}") return data_melted except Exception as e: error_msg = str(e) keys_tried.add(self.current_key_index) # Check if it's a rate limit error if self._is_rate_limit_error(error_msg) and self.api_keys: logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") # Check if we've tried all keys if len(keys_tried) >= len(self.api_keys): logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") break # Exit retry loop since all keys are exhausted # Try to switch to next API key if self._switch_api_key(): continue # Retry with new API key else: logger.error("No more API keys available for switching") # Handle rate limit with san library specific check if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): if hasattr(san, 'rate_limit_time_left'): rate_limit_seconds = san.rate_limit_time_left(e) logger.warning(f"Santiment rate limit hit. Sleeping for {rate_limit_seconds} seconds") time.sleep(rate_limit_seconds) else: # Try switching API key if available if self.api_keys and self._switch_api_key(): continue else: time.sleep(60) # Default wait else: logger.error(f"Failed to fetch {metric} for multiple assets: {error_msg}") error_info = { 'metric': metric, 'slugs': slugs, 'error': error_msg, 'timestamp': datetime.now().isoformat(), 'api_key_index': self.current_key_index } self.failed_queries.append(error_info) return None def fetch_category_batch(self, category: str, slugs: List[str], use_async_batch: bool = True) -> Dict[str, pd.DataFrame]: """ Fetch all metrics in a category using batch operations with symbol alternatives fallback Args: category: The metric category name slugs: List of asset slugs to fetch for use_async_batch: Whether to use AsyncBatch (recommended) or Batch Returns: Dictionary mapping metric names to DataFrames """ if category not in self.metric_categories: logger.error(f"Unknown category: {category}") return {} metrics = self.metric_categories[category] category_data = {} # Filter metrics that are actually available available_metrics_in_category = [m for m in metrics if m in self.available_metrics] if not available_metrics_in_category: logger.warning(f"No available metrics found for category: {category}") return {} logger.info(f"Fetching {len(available_metrics_in_category)} metrics for category: {category}") # First try batch operation with normalized slugs normalized_slugs = self.normalize_slug_list(slugs) batch_success = self._try_batch_fetch(category, available_metrics_in_category, normalized_slugs, use_async_batch) category_data.update(batch_success) # For failed metrics, try individual fetches with alternatives failed_metrics = [m for m in available_metrics_in_category if m not in batch_success] if failed_metrics: logger.info(f"Retrying {len(failed_metrics)} failed metrics with alternatives") individual_results = self._fetch_failed_metrics_with_alternatives(failed_metrics, slugs) category_data.update(individual_results) return category_data def _try_batch_fetch(self, category: str, metrics: List[str], slugs: List[str], use_async_batch: bool) -> 
Dict[str, pd.DataFrame]: """Try batch fetch operation""" category_data = {} try: if use_async_batch: batch = san.AsyncBatch() else: batch = san.Batch() # Add queries to batch for metric in metrics: try: if len(slugs) == 1: batch.get( metric, slug=slugs[0], from_date=self.config.from_date, to_date=self.config.to_date, interval=self.config.interval, include_incomplete_data=self.config.include_incomplete_data ) else: batch.get_many( metric, slugs=slugs, from_date=self.config.from_date, to_date=self.config.to_date, interval=self.config.interval, include_incomplete_data=self.config.include_incomplete_data ) except Exception as e: logger.warning(f"Failed to add {metric} to batch: {e}") # Execute batch if use_async_batch: results = batch.execute(max_workers=self.config.max_workers) else: results = batch.execute() # Process results for i, (metric, result) in enumerate(zip(metrics, results)): if result is not None and not result.empty: if len(slugs) > 1: # Reshape multi-asset data result_melted = result.reset_index().melt( id_vars=['datetime'], var_name='slug', value_name='value' ) result_melted['metric'] = metric result_melted.set_index('datetime', inplace=True) category_data[metric] = result_melted else: result['metric'] = metric result['slug'] = slugs[0] category_data[metric] = result else: logger.debug(f"No data received for metric: {metric} in batch") except Exception as e: logger.error(f"Batch execution failed for category {category}: {e}") return category_data def _fetch_failed_metrics_with_alternatives(self, metrics: List[str], original_slugs: List[str]) -> Dict[str, pd.DataFrame]: """Fetch failed metrics individually using symbol alternatives""" individual_data = {} for metric in metrics: logger.info(f"Retrying {metric} with symbol alternatives...") if len(original_slugs) == 1: # Single asset - use alternatives result = self.fetch_single_metric_with_alternatives(metric, original_slugs[0]) if result is not None: individual_data[metric] = result else: # Multiple assets - try each with alternatives and combine all_results = [] for slug in original_slugs: result = self.fetch_single_metric_with_alternatives(metric, slug) if result is not None: all_results.append(result) if all_results: # Concatenate results - they already have datetime as index combined_result = pd.concat(all_results, ignore_index=False, sort=False) # Ensure datetime index is properly set if not isinstance(combined_result.index, pd.DatetimeIndex): if 'datetime' in combined_result.columns: combined_result.set_index('datetime', inplace=True) individual_data[metric] = combined_result return individual_data def fetch_special_metrics(self, slugs: List[str]) -> Dict[str, pd.DataFrame]: """ Fetch special metrics that have different API signatures Args: slugs: List of asset slugs Returns: Dictionary mapping metric names to DataFrames """ special_data = {} for slug in slugs: max_retries = len(self.api_keys) if self.api_keys else 1 keys_tried = set() for attempt in range(max_retries): try: # If we've tried all keys, reset and wait if len(keys_tried) >= len(self.api_keys) and self.api_keys: logger.warning(f"All {len(self.api_keys)} API keys exhausted for special metrics on {slug}, waiting 30 seconds...") time.sleep(30) keys_tried.clear() self.current_key_index = 0 self._set_current_api_key() # OHLCV data logger.info(f"Fetching OHLCV data for {slug}") ohlcv = san.get( f"ohlcv/{slug}", from_date=self.config.from_date, to_date=self.config.to_date, interval=self.config.interval ) if ohlcv is not None and not ohlcv.empty: ohlcv['metric'] = 
'ohlcv' ohlcv['slug'] = slug special_data[f'ohlcv_{slug}'] = ohlcv # Prices with OHLC format logger.info(f"Fetching detailed prices for {slug}") prices = san.get( "prices", slug=slug, from_date=self.config.from_date, to_date=self.config.to_date, interval=self.config.interval ) if prices is not None and not prices.empty: prices['metric'] = 'prices_detailed' prices['slug'] = slug special_data[f'prices_{slug}'] = prices # If we get here, the attempt was successful break except Exception as e: error_msg = str(e) keys_tried.add(self.current_key_index) # Check if it's a rate limit error if self._is_rate_limit_error(error_msg) and self.api_keys: logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for special metrics on {slug}: {error_msg}") # Check if we've tried all keys if len(keys_tried) >= len(self.api_keys): logger.error(f"All {len(self.api_keys)} API keys exhausted for special metrics on {slug}. Skipping.") break # Exit retry loop since all keys are exhausted # Try to switch to next API key if self._switch_api_key(): continue # Retry with new API key else: logger.error("No more API keys available for switching") logger.error(f"Failed to fetch special metrics for {slug}: {e}") break # Exit retry loop for this slug return special_data def fetch_blockchain_address_data(self, addresses: List[str], slugs: List[str]) -> Dict[str, pd.DataFrame]: """ Fetch blockchain address-related data Args: addresses: List of blockchain addresses slugs: List of asset slugs for context Returns: Dictionary mapping data types to DataFrames """ address_data = {} for slug in slugs: for address in addresses: try: # Historical balance balance = san.get( "historical_balance", slug=slug, address=address, from_date=self.config.from_date, to_date=self.config.to_date, interval=self.config.interval ) if balance is not None and not balance.empty: balance['address'] = address balance['slug'] = slug address_data[f'historical_balance_{slug}_{address[:8]}'] = balance # Top transactions top_txs = san.get( "eth_top_transactions", slug=slug, from_date=self.config.from_date, to_date=self.config.to_date, limit=100, transaction_type="ALL" ) if top_txs is not None and not top_txs.empty: top_txs['slug'] = slug address_data[f'eth_top_transactions_{slug}'] = top_txs except Exception as e: logger.error(f"Failed to fetch address data for {address} on {slug}: {e}") return address_data def execute_custom_sql_queries(self) -> Dict[str, pd.DataFrame]: """ Execute custom SQL queries for additional data insights, using dictGetString for asset metadata. 
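        Each query is sent through san.execute_sql, following the call pattern
        already used in this module (sketch; an API key with SQL access is assumed):

            df = san.execute_sql(query="SELECT ...", set_index="dt")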
Returns: Dictionary mapping query names to DataFrames """ sql_data = {} custom_queries = { 'top_assets_by_volume': """ SELECT dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, dictGetString('default.asset_metadata_dict', 'slug', asset_id) as slug, SUM(value) as total_volume FROM daily_metrics_v2 WHERE metric_id = get_metric_id('volume_usd') AND dt >= now() - INTERVAL 30 DAY GROUP BY asset_id ORDER BY total_volume DESC LIMIT 50 """, 'recent_high_activity_addresses': """ SELECT dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, get_metric_name(metric_id) as metric_name, dt, value FROM daily_metrics_v2 WHERE metric_id = get_metric_id('daily_active_addresses') AND dt >= now() - INTERVAL 7 DAY AND value > 1000 ORDER BY dt DESC, value DESC LIMIT 100 """, 'exchange_flow_summary': """ SELECT dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, dt, SUM(CASE WHEN metric_id = get_metric_id('exchange_inflow') THEN value ELSE 0 END) as inflow, SUM(CASE WHEN metric_id = get_metric_id('exchange_outflow') THEN value ELSE 0 END) as outflow FROM daily_metrics_v2 WHERE metric_id IN (get_metric_id('exchange_inflow'), get_metric_id('exchange_outflow')) AND dt >= now() - INTERVAL 30 DAY GROUP BY asset_id, dt ORDER BY dt DESC LIMIT 1000 """ } for query_name, query in custom_queries.items(): try: logger.info(f"Executing SQL query: {query_name}") result = san.execute_sql(query=query, set_index="dt" if "dt" in query else None) if result is not None and not result.empty: sql_data[query_name] = result logger.info(f"SQL query {query_name} returned {len(result)} rows") except Exception as e: logger.error(f"Failed to execute SQL query {query_name}: {e}") return sql_data def fetch_comprehensive_data(self, slugs: List[str] = None, categories: List[str] = None, include_special_metrics: bool = True, include_sql_queries: bool = True, addresses: List[str] = None) -> Dict[str, Any]: """ Fetch comprehensive data across all categories and metrics Args: slugs: List of asset slugs (if None, uses top assets) categories: List of categories to fetch (if None, fetches all) include_special_metrics: Whether to include special format metrics include_sql_queries: Whether to execute custom SQL queries addresses: List of blockchain addresses for address-specific data Returns: Dictionary containing all fetched data organized by category """ # Set defaults if slugs is None: slugs = ['bitcoin', 'ethereum', 'cardano', 'polkadot', 'chainlink', 'litecoin', 'bitcoin-cash', 'stellar', 'ethereum-classic', 'eos'] # Normalize and deduplicate slugs slugs = self.normalize_slug_list(slugs) if categories is None: categories = list(self.metric_categories.keys()) # Limit slugs for free tier if not san.ApiConfig.api_key: slugs = slugs[:3] # Limit to 3 assets for free tier logger.warning("No API key detected. Limiting to 3 assets to avoid rate limits.") all_data = {} start_time = datetime.now() logger.info(f"Starting comprehensive data fetch for {len(slugs)} assets across {len(categories)} categories") # Check if all API keys are exhausted early all_keys_exhausted = False if self.api_keys and self.rate_limit_switches > len(self.api_keys) * 3: logger.warning("⚠️ All API keys appear to be rate-limited. 
Attempting reduced fetch...") all_keys_exhausted = True # Fetch data by category for category in categories: if all_keys_exhausted: logger.info(f"Skipping category {category} due to API exhaustion") continue logger.info(f"Fetching category: {category}") category_data = self.fetch_category_batch(category, slugs, use_async_batch=True) if category_data: all_data[category] = category_data # Store individual DataFrames for later use for metric_name, df in category_data.items(): self.fetched_data[f"{category}_{metric_name}"] = df # Check if we should stop due to rate limits if self.rate_limit_switches > len(self.api_keys) * 5: logger.warning("⚠️ Excessive rate limit switches detected. Stopping data fetch to avoid further exhaustion.") all_keys_exhausted = True break # Fetch special metrics (only if not exhausted) if include_special_metrics and not all_keys_exhausted: logger.info("Fetching special metrics...") special_data = self.fetch_special_metrics(slugs) if special_data: all_data['special_metrics'] = special_data self.fetched_data.update(special_data) elif all_keys_exhausted: logger.info("Skipping special metrics due to API exhaustion") # Fetch blockchain address data if addresses and not all_keys_exhausted: logger.info("Fetching blockchain address data...") address_data = self.fetch_blockchain_address_data(addresses, slugs) if address_data: all_data['address_data'] = address_data self.fetched_data.update(address_data) elif addresses and all_keys_exhausted: logger.info("Skipping blockchain address data due to API exhaustion") # Execute SQL queries (only if not exhausted) if include_sql_queries and san.ApiConfig.api_key and not all_keys_exhausted: logger.info("Executing custom SQL queries...") sql_data = self.execute_custom_sql_queries() if sql_data: all_data['sql_queries'] = sql_data self.fetched_data.update(sql_data) elif all_keys_exhausted: logger.info("Skipping SQL queries due to API exhaustion") end_time = datetime.now() duration = end_time - start_time logger.info(f"Comprehensive data fetch completed in {duration}") logger.info(f"Successfully fetched {len(self.fetched_data)} datasets") logger.info(f"Failed queries: {len(self.failed_queries)}") # Add exhaustion notice to summary if all_keys_exhausted: logger.warning("⚠️ Data fetch completed with API rate limit exhaustion - some data may be missing") # Generate summary summary = self._generate_fetch_summary(all_data, duration) summary['all_keys_exhausted'] = all_keys_exhausted summary['rate_limit_switches'] = self.rate_limit_switches all_data['fetch_summary'] = summary return all_data def _generate_fetch_summary(self, data: Dict[str, Any], duration: timedelta) -> Dict[str, Any]: """Generate a summary of the data fetching operation""" summary = { 'fetch_duration': str(duration), 'total_datasets': len(self.fetched_data), 'failed_queries': len(self.failed_queries), 'categories_fetched': list(data.keys()), 'data_points_by_category': {}, 'date_range': f"{self.config.from_date} to {self.config.to_date}", 'interval': self.config.interval, 'timestamp': datetime.now().isoformat() } # Count data points by category for category, category_data in data.items(): if isinstance(category_data, dict): total_points = sum(len(df) for df in category_data.values() if isinstance(df, pd.DataFrame)) summary['data_points_by_category'][category] = total_points return summary def export_data(self, export_format: str = None, combine_categories: bool = False, include_metadata: bool = True) -> Dict[str, str]: """ Export fetched data to files Args: export_format: Export 
format ('csv', 'json', 'parquet') combine_categories: Whether to combine all data into single files include_metadata: Whether to include metadata files Returns: Dictionary mapping data names to file paths """ export_format = export_format or self.config.export_format exported_files = {} if not self.fetched_data: logger.warning("No data to export") return exported_files timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if combine_categories: # Combine all DataFrames all_dfs = [] for name, df in self.fetched_data.items(): if isinstance(df, pd.DataFrame) and not df.empty: df_copy = df.copy() df_copy['dataset_name'] = name all_dfs.append(df_copy) if all_dfs: combined_df = pd.concat(all_dfs, ignore_index=True, sort=False) filename = f"santiment_comprehensive_data_{timestamp}.{export_format}" filepath = os.path.join(self.config.export_directory, filename) self._export_dataframe(combined_df, filepath, export_format) exported_files['combined_data'] = filepath else: # Export individual datasets for name, df in self.fetched_data.items(): if isinstance(df, pd.DataFrame) and not df.empty: filename = f"santiment_{name}_{timestamp}.{export_format}" filepath = os.path.join(self.config.export_directory, filename) self._export_dataframe(df, filepath, export_format) exported_files[name] = filepath # Export metadata and summary if include_metadata: metadata = { 'failed_queries': self.failed_queries, 'available_metrics': self.available_metrics, 'config': { 'from_date': self.config.from_date, 'to_date': self.config.to_date, 'interval': self.config.interval, 'batch_size': self.config.batch_size }, 'export_timestamp': datetime.now().isoformat() } metadata_file = os.path.join(self.config.export_directory, f"santiment_metadata_{timestamp}.json") with open(metadata_file, 'w') as f: json.dump(metadata, f, indent=2) exported_files['metadata'] = metadata_file logger.info(f"Exported {len(exported_files)} files to {self.config.export_directory}") return exported_files def _export_dataframe(self, df: pd.DataFrame, filepath: str, format_type: str): """Export a DataFrame to the specified format""" try: if format_type == 'csv': df.to_csv(filepath) elif format_type == 'json': df.to_json(filepath, date_format='iso', orient='records') elif format_type == 'parquet': df.to_parquet(filepath) else: logger.error(f"Unsupported export format: {format_type}") return logger.info(f"Exported DataFrame to {filepath}") except Exception as e: logger.error(f"Failed to export DataFrame to {filepath}: {e}") def get_api_usage_stats(self) -> Dict[str, Any]: """Get API usage statistics""" try: stats = { 'calls_made': san.api_calls_made(), 'calls_remaining': san.api_calls_remaining(), 'failed_queries': len(self.failed_queries), 'successful_datasets': len(self.fetched_data) } return stats except Exception as e: logger.error(f"Failed to get API usage stats: {e}") return {} def print_summary(self): """Print a comprehensive summary of the fetching operation""" print("\n" + "="*60) print("SANTIMENT DATA FETCHER SUMMARY") print("="*60) # Basic stats print(f"Total datasets fetched: {len(self.fetched_data)}") print(f"Failed queries: {len(self.failed_queries)}") # Configuration info print(f"\nConfiguration:") print(f" Date range: {self.config.from_date} to {self.config.to_date}") print(f" Interval: {self.config.interval}") print(f" Export directory: {self.config.export_directory}") # Categories summary if self.fetched_data: print(f"\nData by category:") category_counts = {} for key in self.fetched_data.keys(): if '_' in key: category = key.split('_')[0] 
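                    # Dataset keys are saved as "<category>_<metric>" in
                    # fetch_comprehensive_data, so the prefix identifies the category.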
category_counts[category] = category_counts.get(category, 0) + 1 for category, count in sorted(category_counts.items()): print(f" {category}: {count} datasets") # Sample data info if self.fetched_data: print(f"\nSample datasets:") for i, (name, df) in enumerate(list(self.fetched_data.items())[:5]): if isinstance(df, pd.DataFrame): print(f" {name}: {len(df)} rows, {len(df.columns)} columns") if not df.empty: date_range = f"{df.index.min()} to {df.index.max()}" if hasattr(df.index, 'min') else "N/A" print(f" Date range: {date_range}") # Failed queries summary if self.failed_queries: print(f"\nFailed queries summary:") error_types = {} for failed in self.failed_queries: error_msg = str(failed.get('error', 'Unknown error')) error_type = error_msg.split(':')[0] if ':' in error_msg else error_msg error_types[error_type] = error_types.get(error_type, 0) + 1 for error_type, count in sorted(error_types.items()): print(f" {error_type}: {count} occurrences") # API usage stats try: api_stats = self.get_api_usage_stats() if api_stats: print(f"\nAPI Usage:") print(f" Calls made: {api_stats.get('calls_made', 'N/A')}") print(f" Calls remaining: {api_stats.get('calls_remaining', 'N/A')}") except: pass print("="*60) def analyze_data_quality(self) -> Dict[str, Any]: """Analyze the quality of fetched data""" quality_report = { 'total_datasets': len(self.fetched_data), 'empty_datasets': 0, 'datasets_with_nulls': 0, 'date_coverage': {}, 'data_completeness': {}, 'outliers_detected': {} } for name, df in self.fetched_data.items(): if isinstance(df, pd.DataFrame): # Check if dataset is empty if df.empty: quality_report['empty_datasets'] += 1 continue # Check for null values if df.isnull().any().any(): quality_report['datasets_with_nulls'] += 1 null_percentage = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 quality_report['data_completeness'][name] = f"{100 - null_percentage:.2f}%" # Analyze date coverage if hasattr(df.index, 'min') and hasattr(df.index, 'max'): try: date_range = { 'start': str(df.index.min()), 'end': str(df.index.max()), 'days': (df.index.max() - df.index.min()).days if hasattr(df.index.max() - df.index.min(), 'days') else 'N/A' } quality_report['date_coverage'][name] = date_range except: quality_report['date_coverage'][name] = 'Unable to determine' # Simple outlier detection for numeric columns numeric_cols = df.select_dtypes(include=[np.number]).columns outlier_info = {} for col in numeric_cols: if col not in ['metric', 'slug']: # Skip metadata columns try: q1 = df[col].quantile(0.25) q3 = df[col].quantile(0.75) iqr = q3 - q1 lower_bound = q1 - 1.5 * iqr upper_bound = q3 + 1.5 * iqr outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)] if len(outliers) > 0: outlier_info[col] = len(outliers) except: continue if outlier_info: quality_report['outliers_detected'][name] = outlier_info return quality_report def create_data_dashboard(self) -> str: """Create a simple HTML dashboard summarizing the fetched data""" timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") total_datasets = len(self.fetched_data) date_range = f"{self.config.from_date} to {self.config.to_date}" html_content = f""" Santiment Data Dashboard

<h1>Santiment Data Dashboard</h1>
<p>Generated on: {timestamp}</p>
<p>Total Datasets: {total_datasets}</p>
<p>Date Range: {date_range}</p>
"""

        # Add category summary
        if self.fetched_data:
            category_counts = {}
            for key in self.fetched_data.keys():
                if '_' in key:
                    category = key.split('_')[0]
                    category_counts[category] = category_counts.get(category, 0) + 1
            html_content += "<h2>Categories Overview</h2><ul>"
            for category, count in sorted(category_counts.items()):
                html_content += f'<li>{category}: {count} datasets</li>'
            html_content += "</ul>"

        # Add failed queries section
        if self.failed_queries:
            html_content += "<h2>Failed Queries</h2><table><tr><th>Metric</th><th>Slug</th><th>Error</th></tr>"
            for failed in self.failed_queries[:10]:  # Show first 10
                metric = failed.get('metric', 'N/A')
                slug = failed.get('slug', failed.get('slugs', 'N/A'))
                error = str(failed.get('error', 'Unknown'))[:100] + '...' if len(str(failed.get('error', ''))) > 100 else failed.get('error', 'Unknown')
                html_content += f"<tr><td>{metric}</td><td>{slug}</td><td>{error}</td></tr>"
            html_content += "</table>
" html_content += "" # Save dashboard dashboard_path = os.path.join( self.config.export_directory, f"santiment_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html" ) with open(dashboard_path, 'w') as f: f.write(html_content) logger.info(f"Dashboard created at {dashboard_path}") return dashboard_path def get_top_performing_assets(self, metric: str = 'price_usd', days: int = 30) -> pd.DataFrame: """ Analyze top performing assets based on a specific metric Args: metric: The metric to analyze performance on days: Number of days to look back for performance calculation Returns: DataFrame with performance analysis """ performance_data = [] for name, df in self.fetched_data.items(): if isinstance(df, pd.DataFrame) and metric in str(name) and not df.empty: try: if 'slug' in df.columns: # Group by slug and calculate performance for slug in df['slug'].unique(): slug_data = df[df['slug'] == slug].copy() if len(slug_data) >= 2: slug_data = slug_data.sort_index() # Calculate performance over the specified period if len(slug_data) > days: recent_data = slug_data.tail(days) else: recent_data = slug_data if 'value' in recent_data.columns and not recent_data['value'].empty: start_value = recent_data['value'].iloc[0] end_value = recent_data['value'].iloc[-1] if start_value and start_value != 0: performance = ((end_value - start_value) / start_value) * 100 performance_data.append({ 'slug': slug, 'metric': metric, 'start_value': start_value, 'end_value': end_value, 'performance_pct': performance, 'data_points': len(recent_data), 'period_days': days }) except Exception as e: logger.warning(f"Failed to analyze performance for {name}: {e}") if performance_data: performance_df = pd.DataFrame(performance_data) return performance_df.sort_values('performance_pct', ascending=False) else: return pd.DataFrame() def cleanup_export_directory(self) -> bool: """ Manually clean up the export directory. Returns: bool: True if cleanup was successful, False otherwise """ try: self._cleanup_existing_files() return True except Exception as e: logger.error(f"Manual cleanup failed: {e}") return False def get_api_key_status(self): """Get status information about API key usage""" if not self.api_keys: return { "total_keys": 0, "current_key": "None", "rate_limit_switches": self.rate_limit_switches, "current_key_preview": "No API key" } return { "total_keys": len(self.api_keys), "current_key": self.current_key_index + 1, "rate_limit_switches": self.rate_limit_switches, "current_key_preview": self.api_keys[self.current_key_index][:8] + "..." 
} def print_api_key_status(self): """Print API key usage status""" status = self.get_api_key_status() print(f"\n[API_STATUS] Using {status['total_keys']} API key(s)") if status['total_keys'] > 0: print(f"[API_STATUS] Current: Key #{status['current_key']} ({status['current_key_preview']})") print(f"[API_STATUS] Rate limit switches: {status['rate_limit_switches']}") if status['rate_limit_switches'] > 0: print(f"[API_STATUS] Effective rate limit handling active") else: print(f"[API_STATUS] No API keys configured - using free tier") print() def save_configuration(self, config_path: str = None) -> str: """Save current configuration to a JSON file""" if config_path is None: config_path = os.path.join(self.config.export_directory, "santiment_config.json") config_dict = { 'from_date': self.config.from_date, 'to_date': self.config.to_date, 'interval': self.config.interval, 'include_incomplete_data': self.config.include_incomplete_data, 'batch_size': self.config.batch_size, 'max_workers': self.config.max_workers, 'rate_limit_delay': self.config.rate_limit_delay, 'export_format': self.config.export_format, 'export_directory': self.config.export_directory, 'saved_at': datetime.now().isoformat() } with open(config_path, 'w') as f: json.dump(config_dict, f, indent=2) logger.info(f"Configuration saved to {config_path}") return config_path @classmethod def load_configuration(cls, config_path: str) -> 'SantimentDataFetcher': """Load configuration from a JSON file and create a fetcher instance""" with open(config_path, 'r') as f: config_dict = json.load(f) # Remove metadata fields config_dict.pop('saved_at', None) config = FetchConfig(**config_dict) return cls(config=config) # Utility functions for easy usage def cleanup_santiment_directory(directory_path: str = "data/santiment") -> bool: """ Utility function to clean up a Santiment data directory without creating a fetcher instance. 
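    A minimal usage sketch (the path shown is this function's default):

        cleanup_santiment_directory("data/santiment")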
Args: directory_path: Path to the directory to clean up Returns: bool: True if cleanup was successful, False otherwise """ import glob import shutil try: if not os.path.exists(directory_path): logger.info(f"Directory does not exist: {directory_path}") return True # Get all files in the directory all_files = glob.glob(os.path.join(directory_path, "*")) if all_files: logger.info(f"Cleaning up {len(all_files)} existing files in {directory_path}") for file_path in all_files: try: if os.path.isfile(file_path): os.remove(file_path) logger.debug(f"Removed file: {os.path.basename(file_path)}") elif os.path.isdir(file_path): shutil.rmtree(file_path) logger.debug(f"Removed directory: {os.path.basename(file_path)}") except Exception as e: logger.warning(f"Failed to remove {file_path}: {e}") logger.info(f"Successfully cleaned up directory: {directory_path}") else: logger.info(f"Directory is already clean: {directory_path}") return True except Exception as e: logger.error(f"Failed to cleanup directory {directory_path}: {e}") return False def fetch_quick_crypto_overview(assets: List[str] = None, api_key: str = None) -> Dict[str, pd.DataFrame]: """ Quick function to fetch essential crypto data for analysis Args: assets: List of asset slugs (defaults to top 10 cryptos) api_key: Santiment API key Returns: Dictionary with essential data """ if assets is None: assets = ['bitcoin', 'ethereum', 'solana', 'ripple', 'cardano'] config = FetchConfig( from_date="2025-07-01", # Changed to be within free tier allowed range to_date="2025-07-06", # Use last valid date for free tier interval="30m", export_format="parquet" ) fetcher = SantimentDataFetcher(api_key=api_key, config=config) # Fetch essential categories essential_categories = ['financial', 'network_activity', 'exchange'] data = fetcher.fetch_comprehensive_data( slugs=assets, categories=essential_categories, include_special_metrics=True, include_sql_queries=False ) return data def create_crypto_report(assets: List[str], output_dir: str = "./crypto_report", api_key: str = None): """ Create a comprehensive crypto analysis report Args: assets: List of asset slugs to analyze output_dir: Directory to save the report api_key: Santiment API key(s) - can be comma-separated for multiple keys """ config = FetchConfig( from_date="2025-07-01", # Changed to be within free tier allowed range to_date="2025-07-06", # Use last valid date for free tier interval="30m", export_directory=output_dir, export_format="parquet" # Use Parquet for output ) fetcher = SantimentDataFetcher(api_key=api_key, config=config) # Print API key status fetcher.print_api_key_status() # Fetch comprehensive data logger.info("Fetching comprehensive cryptocurrency data...") data = fetcher.fetch_comprehensive_data( slugs=assets, include_special_metrics=True, include_sql_queries=True ) # Export data logger.info("Exporting data to files...") exported_files = fetcher.export_data(combine_categories=False, include_metadata=True) # Create dashboard logger.info("Creating data dashboard...") dashboard_path = fetcher.create_data_dashboard() # Analyze data quality logger.info("Analyzing data quality...") quality_report = fetcher.analyze_data_quality() # Save quality report quality_path = os.path.join(output_dir, "data_quality_report.json") with open(quality_path, 'w') as f: json.dump(quality_report, f, indent=2, default=str) # Print summary fetcher.print_summary() print(f"\nReport generated successfully!") print(f"Dashboard: {dashboard_path}") print(f"Data files: {len(exported_files)} files in {output_dir}") 
print(f"Quality report: {quality_path}") # Print final API key status print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") fetcher.print_api_key_status() # Example usage def main(): # Get API key from environment (already loaded at module top) santiment_api_key = os.getenv("SANTIMENT_API_KEY") # Create fetcher instance fetcher = SantimentDataFetcher(api_key=santiment_api_key) # Print API key status fetcher.print_api_key_status() # DISABLED: Do not cleanup Santiment directory to preserve data # cleanup_santiment_directory("./data/santiment") print("[SANTIMENT] Data preservation mode - keeping existing data") # Reduced scope for API conservation - only top 2 crypto assets print("Fetching reduced crypto overview (API conservation mode)...") # Note: Reduced from 5 to 2 assets to conserve API calls overview_data = fetch_quick_crypto_overview(['bitcoin', 'ethereum'], api_key=santiment_api_key) # Comprehensive analysis - reduced scope print("\nCreating conservative crypto report...") # Note: Reduced scope - only Bitcoin and Ethereum to preserve API limits create_crypto_report( assets=['bitcoin', 'ethereum'], # Reduced from 5 to 2 assets output_dir="./data/santiment", api_key=santiment_api_key ) # Print final API key status print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") fetcher.print_api_key_status() if __name__ == "__main__": main()