|
|
|
""" |
|
Extract symbols from symbols.* columns and populate the symbol field for crypto data. |
|
|
|
This script runs after merge steps but before data_filler phases to ensure |
|
the symbol column is properly populated from existing exchange symbol data. |
|
|
|
Example: symbols.gateio:"BTC_USDT" -> symbol:"BTC" |
|
""" |
|
|
|
import pandas as pd |
|
import sys |
|
from pathlib import Path |
|
import re |
|
|
|
def extract_symbol_from_exchange_symbols(df): |
|
"""Extract base symbol from exchange symbol columns""" |
|
|
|
if 'symbol' not in df.columns: |
|
df['symbol'] = None |
|
|
|
|
|
symbol_columns = [col for col in df.columns if col.startswith('symbols.')] |
|
|
|
if not symbol_columns: |
|
return df |
|
|
|
|
|
symbols_extracted = 0 |
|
symbols_normalized = 0 |
|
|
|
|
|
for idx, row in df.iterrows(): |
|
|
|
if pd.notna(row.get('symbol')): |
|
continue |
|
|
|
|
|
extracted_symbol = None |
|
|
|
for col in symbol_columns: |
|
exchange_symbol = row.get(col) |
|
if pd.notna(exchange_symbol) and isinstance(exchange_symbol, str): |
|
|
|
symbol = extract_base_symbol(exchange_symbol) |
|
if symbol: |
|
extracted_symbol = symbol |
|
break |
|
|
|
if extracted_symbol: |
|
df.at[idx, 'symbol'] = extracted_symbol |
|
symbols_extracted += 1 |
|
|
|
|
|
cg_id_to_symbol_mapping = { |
|
'bitcoin': 'BTC', |
|
'ethereum': 'ETH', |
|
'solana': 'SOL', |
|
'cardano': 'ADA', |
|
'ripple': 'XRP', |
|
'binancecoin': 'BNB', |
|
'dogecoin': 'DOGE', |
|
'polkadot': 'DOT', |
|
'chainlink': 'LINK', |
|
'litecoin': 'LTC', |
|
'uniswap': 'UNI', |
|
'avalanche-2': 'AVAX', |
|
'polygon': 'MATIC', |
|
'stellar': 'XLM', |
|
'bitcoin-cash': 'BCH', |
|
'filecoin': 'FIL', |
|
'tron': 'TRX', |
|
'ethereum-classic': 'ETC', |
|
'monero': 'XMR', |
|
'cosmos': 'ATOM', |
|
'algorand': 'ALGO', |
|
'vechain': 'VET', |
|
'hedera-hashgraph': 'HBAR', |
|
'internet-computer': 'ICP', |
|
'theta-token': 'THETA', |
|
'eos': 'EOS', |
|
'aave': 'AAVE', |
|
'maker': 'MKR', |
|
'curve-dao-token': 'CRV', |
|
'pancakeswap-token': 'CAKE', |
|
'the-sandbox': 'SAND', |
|
'decentraland': 'MANA', |
|
'axie-infinity': 'AXS', |
|
'shiba-inu': 'SHIB', |
|
'terra-luna': 'LUNA', |
|
'near': 'NEAR', |
|
'flow': 'FLOW', |
|
'fantom': 'FTM', |
|
'harmony': 'ONE', |
|
'basic-attention-token': 'BAT', |
|
'enjincoin': 'ENJ', |
|
'sushi': 'SUSHI', |
|
'compound': 'COMP', |
|
'yearn-finance': 'YFI', |
|
'synthetix': 'SNX', |
|
'uma': 'UMA', |
|
'0x': 'ZRX', |
|
'loopring': 'LRC', |
|
'balancer': 'BAL' |
|
} |
|
|
|
for idx, row in df.iterrows(): |
|
current_symbol = row.get('symbol') |
|
|
|
|
|
if pd.notna(current_symbol) and isinstance(current_symbol, str): |
|
normalized_symbol = cg_id_to_symbol_mapping.get(current_symbol.lower()) |
|
if normalized_symbol and normalized_symbol != current_symbol: |
|
df.at[idx, 'symbol'] = normalized_symbol |
|
symbols_normalized += 1 |
|
|
|
|
|
|
|
|
|
|
|
|
|
null_symbols_remaining = df['symbol'].isnull().sum() |
|
|
|
|
|
if null_symbols_remaining > 0: |
|
|
|
sample_nulls = df[df['symbol'].isnull()][['symbol', 'cg_id'] + symbol_columns[:3]].head(5) |
|
|
|
|
|
return df |
|
|
|
def extract_base_symbol(exchange_symbol): |
|
"""Extract base symbol from exchange symbol formats""" |
|
|
|
if not isinstance(exchange_symbol, str): |
|
return None |
|
|
|
exchange_symbol = exchange_symbol.strip().upper() |
|
|
|
|
|
patterns = [ |
|
r'^([A-Z]{2,10})USDT?$', |
|
r'^([A-Z]{2,10})_USDT?$', |
|
r'^([A-Z]{2,10})/USDT?$', |
|
r'^([A-Z]{2,10})-USDT?$', |
|
r'^([A-Z]{2,10})USD$', |
|
r'^([A-Z]{2,10})_USD$', |
|
r'^([A-Z]{2,10})/USD$', |
|
r'^([A-Z]{2,10})-USD$', |
|
r'^([A-Z]{2,10})BUSD$', |
|
r'^([A-Z]{2,10})_BUSD$', |
|
r'^([A-Z]{2,10})EUR$', |
|
r'^([A-Z]{2,10})_EUR$', |
|
r'^([A-Z]{2,10})BTC$', |
|
r'^([A-Z]{2,10})_BTC$', |
|
] |
|
|
|
for pattern in patterns: |
|
match = re.match(pattern, exchange_symbol) |
|
if match: |
|
base_symbol = match.group(1) |
|
|
|
if len(base_symbol) >= 2 and len(base_symbol) <= 10: |
|
|
|
if base_symbol not in ['USDT', 'USDC', 'USD', 'EUR', 'BTC', 'ETH', 'BNB', 'BUSD']: |
|
return base_symbol |
|
elif base_symbol in ['BTC', 'ETH', 'BNB']: |
|
return base_symbol |
|
|
|
|
|
|
|
for suffix in ['USDT', 'USDC', 'USD', 'EUR', 'BUSD']: |
|
if exchange_symbol.endswith(suffix): |
|
base = exchange_symbol[:-len(suffix)] |
|
if len(base) >= 2 and len(base) <= 10: |
|
return base |
|
|
|
|
|
for delimiter in ['_', '/', '-']: |
|
if delimiter in exchange_symbol: |
|
parts = exchange_symbol.split(delimiter) |
|
if len(parts) >= 2: |
|
base = parts[0] |
|
if len(base) >= 2 and len(base) <= 10: |
|
return base |
|
|
|
return None |
|
|
|
def process_crypto_features(): |
|
"""Process crypto features to extract symbols""" |
|
|
|
|
|
possible_paths = [ |
|
Path('data/merged/features/crypto_features.parquet'), |
|
Path('../../data/merged/features/crypto_features.parquet'), |
|
Path('../../../data/merged/features/crypto_features.parquet') |
|
] |
|
|
|
crypto_file = None |
|
for path in possible_paths: |
|
if path.exists(): |
|
crypto_file = path |
|
break |
|
|
|
if crypto_file is None: |
|
print(f"Crypto features file not found in any of these locations:") |
|
for path in possible_paths: |
|
print(f" {path.absolute()}") |
|
return False |
|
|
|
print(f"Loading crypto features from: {crypto_file}") |
|
df = pd.read_parquet(crypto_file) |
|
|
|
print(f"Loaded {len(df)} rows with {len(df.columns)} columns") |
|
|
|
|
|
null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) |
|
print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") |
|
|
|
|
|
df_fixed = extract_symbol_from_exchange_symbols(df) |
|
|
|
|
|
null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) |
|
|
|
|
|
total_improvement = null_symbols_before - null_symbols_after |
|
|
|
print("Successfully extracted crypto symbols!") |
|
|
|
|
|
if total_improvement > 0 or null_symbols_after <= 2: |
|
|
|
df_fixed.to_parquet(crypto_file) |
|
return True |
|
else: |
|
return True |
|
|
|
def process_stocks_features(): |
|
"""Process stocks features to extract symbols (if needed)""" |
|
|
|
|
|
possible_paths = [ |
|
Path('data/merged/features/stocks_features.parquet'), |
|
Path('../../data/merged/features/stocks_features.parquet'), |
|
Path('../../../data/merged/features/stocks_features.parquet') |
|
] |
|
|
|
stocks_file = None |
|
for path in possible_paths: |
|
if path.exists(): |
|
stocks_file = path |
|
break |
|
|
|
if stocks_file is None: |
|
return False |
|
|
|
df = pd.read_parquet(stocks_file) |
|
|
|
|
|
null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) |
|
print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") |
|
|
|
if null_symbols_before == 0: |
|
print("Stocks symbols are already populated, skipping") |
|
return True |
|
|
|
|
|
|
|
df_fixed = extract_symbol_from_exchange_symbols(df) |
|
|
|
|
|
null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) |
|
symbols_fixed = null_symbols_before - null_symbols_after |
|
|
|
print(f"\nResults:") |
|
print(f"- Symbols fixed: {symbols_fixed}") |
|
print(f"- Null symbols after: {null_symbols_after} ({null_symbols_after/len(df_fixed)*100:.1f}%)") |
|
|
|
if symbols_fixed > 0: |
|
|
|
print(f"\nSaving fixed stocks features to: {stocks_file}") |
|
df_fixed.to_parquet(stocks_file) |
|
print("File saved successfully!") |
|
|
|
return True |
|
else: |
|
print("No symbols were extracted/fixed for stocks") |
|
return True |
|
|
|
def main(): |
|
"""Main function to extract symbols from exchange symbol data""" |
|
|
|
print("=== EXTRACTING SYMBOLS FROM EXCHANGE DATA ===") |
|
print("This script extracts base symbols from symbols.* columns") |
|
print("Example: symbols.gateio:'BTC_USDT' -> symbol:'BTC'") |
|
print() |
|
|
|
|
|
print("Processing crypto features...") |
|
crypto_success = process_crypto_features() |
|
|
|
print("\n" + "="*50 + "\n") |
|
|
|
|
|
print("Processing stocks features...") |
|
stocks_success = process_stocks_features() |
|
|
|
print("\n" + "="*50) |
|
|
|
if crypto_success: |
|
print("Successfully extracted crypto symbols!") |
|
else: |
|
print("Failed to extract crypto symbols!") |
|
|
|
if stocks_success: |
|
print("Stocks symbols processing completed!") |
|
else: |
|
print("Failed to process stocks symbols!") |
|
|
|
if crypto_success and stocks_success: |
|
print("\nSymbol extraction completed successfully!") |
|
return True |
|
else: |
|
print("\nSome issues occurred during symbol extraction") |
|
return False |
|
|
|
if __name__ == "__main__": |
|
success = main() |
|
sys.exit(0 if success else 1) |
|
|