File size: 11,488 Bytes
c49b21b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
#!/usr/bin/env python3
"""
Extract symbols from symbols.* columns and populate the symbol field for crypto data.

This script runs after merge steps but before data_filler phases to ensure
the symbol column is properly populated from existing exchange symbol data.

Example: symbols.gateio:"BTC_USDT" -> symbol:"BTC"
"""

import pandas as pd
import sys
from pathlib import Path
import re

# Lowercase cg_id-style identifiers mapped to the ticker symbol that should
# replace them when they show up in the 'symbol' column.
_CG_ID_TO_SYMBOL = {
    'bitcoin': 'BTC',
    'ethereum': 'ETH',
    'solana': 'SOL',
    'cardano': 'ADA',
    'ripple': 'XRP',
    'binancecoin': 'BNB',
    'dogecoin': 'DOGE',
    'polkadot': 'DOT',
    'chainlink': 'LINK',
    'litecoin': 'LTC',
    'uniswap': 'UNI',
    'avalanche-2': 'AVAX',
    'polygon': 'MATIC',
    'stellar': 'XLM',
    'bitcoin-cash': 'BCH',
    'filecoin': 'FIL',
    'tron': 'TRX',
    'ethereum-classic': 'ETC',
    'monero': 'XMR',
    'cosmos': 'ATOM',
    'algorand': 'ALGO',
    'vechain': 'VET',
    'hedera-hashgraph': 'HBAR',
    'internet-computer': 'ICP',
    'theta-token': 'THETA',
    'eos': 'EOS',
    'aave': 'AAVE',
    'maker': 'MKR',
    'curve-dao-token': 'CRV',
    'pancakeswap-token': 'CAKE',
    'the-sandbox': 'SAND',
    'decentraland': 'MANA',
    'axie-infinity': 'AXS',
    'shiba-inu': 'SHIB',
    'terra-luna': 'LUNA',
    'near': 'NEAR',
    'flow': 'FLOW',
    'fantom': 'FTM',
    'harmony': 'ONE',
    'basic-attention-token': 'BAT',
    'enjincoin': 'ENJ',
    'sushi': 'SUSHI',
    'compound': 'COMP',
    'yearn-finance': 'YFI',
    'synthetix': 'SNX',
    'uma': 'UMA',
    '0x': 'ZRX',
    'loopring': 'LRC',
    'balancer': 'BAL',
}


def extract_symbol_from_exchange_symbols(df):
    """Fill missing ``symbol`` values from ``symbols.*`` columns and normalize ids.

    The frame is modified in place and also returned. Two passes are made:

    1. For every row whose ``symbol`` is null, the ``symbols.*`` columns are
       scanned in column order; the first string value that
       ``extract_base_symbol`` can parse supplies the symbol.
    2. Every string ``symbol`` whose lowercase form is a key of the cg_id
       mapping is replaced by the mapped ticker (e.g. ``bitcoin`` -> ``BTC``).

    When the frame has no ``symbols.*`` columns it is returned right after the
    ``symbol`` column has been created (if it was missing); pass 2 is not run
    in that case. No ``cg_id`` column is required.

    Args:
        df: pandas DataFrame, optionally containing ``symbol`` and
            ``symbols.<exchange>`` columns.

    Returns:
        The same DataFrame object, with ``symbol`` populated where possible.
    """
    if 'symbol' not in df.columns:
        df['symbol'] = None

    # Column labels are not guaranteed to be strings; skip the ones that are not.
    symbol_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.startswith('symbols.')
    ]
    if not symbol_columns:
        return df

    # First pass: fill null symbols from the first parseable exchange symbol.
    for idx, row in df.iterrows():
        if pd.notna(row.get('symbol')):
            continue

        for col in symbol_columns:
            exchange_symbol = row.get(col)
            # A str is never NA, so the isinstance check alone filters nulls.
            if not isinstance(exchange_symbol, str):
                continue
            base_symbol = extract_base_symbol(exchange_symbol)
            if base_symbol:
                df.at[idx, 'symbol'] = base_symbol
                break

    # Second pass: collect the replacements first, then apply them, so the
    # column is not written to while it is being iterated.
    replacements = {}
    for idx, current_symbol in df['symbol'].items():
        if not isinstance(current_symbol, str):
            continue
        ticker = _CG_ID_TO_SYMBOL.get(current_symbol.lower())
        if ticker and ticker != current_symbol:
            replacements[idx] = ticker

    for idx, ticker in replacements.items():
        df.at[idx, 'symbol'] = ticker

    return df

def extract_base_symbol(exchange_symbol):
    """Extract the base asset from an exchange trading-pair symbol.

    The input is stripped and upper-cased, then handled in one of two ways:

    * Delimited pairs (``_``, ``/`` or ``-``): everything before the first
      delimiter is the base, whatever the quote is, e.g. ``BTC_USDT``,
      ``BTC/EUR``, ``1INCH_USDT`` and ``BTC-USDT-SWAP``.
    * Concatenated pairs: a trailing ``USDT``, ``USDC``, ``USD``, ``EUR`` or
      ``BUSD`` is removed (``BTCUSDT`` -> ``BTC``). A trailing ``BTC`` is
      removed only when the remainder is 2-10 letters and is not itself a
      fiat/stablecoin code (``ETHBTC`` -> ``ETH``).

    A base is accepted only when it is 2 to 10 characters long.

    Args:
        exchange_symbol: Pair symbol as reported by an exchange.

    Returns:
        The base symbol in upper case, or ``None`` when the input is not a
        string or no base can be derived (a bare ticker such as ``BTC`` has no
        quote to strip and yields ``None``).
    """
    if not isinstance(exchange_symbol, str):
        return None

    pair = exchange_symbol.strip().upper()

    # Delimited pairs are unambiguous: split before looking at any suffix so
    # the delimiter can never leak into the result ('BTC/EUR' -> 'BTC').
    parts = re.split(r'[_/-]', pair, maxsplit=1)
    if len(parts) == 2:
        base = parts[0].strip()
        return base if 2 <= len(base) <= 10 else None

    # Concatenated pairs. 'USD' is tested before 'BUSD', so 'BTCBUSD' yields
    # 'BTCB': without a token list that reading cannot be told apart from
    # BTC+BUSD, and preferring 'BUSD' would break pairs like 'SHIBUSD'.
    for quote in ('USDT', 'USDC', 'USD', 'EUR', 'BUSD'):
        if pair.endswith(quote):
            base = pair[:-len(quote)]
            if 2 <= len(base) <= 10:
                return base

    # BTC-quoted pairs: 'BTC' is also a common ticker ending, so require a
    # purely alphabetic base and reject fiat/stablecoin codes as the base.
    if pair.endswith('BTC'):
        base = pair[:-len('BTC')]
        is_quote_code = base in ('USDT', 'USDC', 'USD', 'EUR', 'BUSD')
        if re.fullmatch(r'[A-Z]{2,10}', base) and not is_quote_code:
            return base

    return None

def process_crypto_features():
    """Load the crypto features parquet file, fill its symbols and save it.

    The file is looked up at a few paths relative to the current working
    directory. After ``extract_symbol_from_exchange_symbols`` has run, the
    file is written back when the ``symbol`` column actually changed (nulls
    filled or values normalized) or when at most two null symbols remain.

    Returns:
        ``False`` when the features file cannot be found, ``True`` otherwise
        (including when nothing needed to change).
    """
    candidate_paths = [
        Path('data/merged/features/crypto_features.parquet'),
        Path('../../data/merged/features/crypto_features.parquet'),
        Path('../../../data/merged/features/crypto_features.parquet'),
    ]

    crypto_file = next((path for path in candidate_paths if path.exists()), None)
    if crypto_file is None:
        print("Crypto features file not found in any of these locations:")
        for path in candidate_paths:
            print(f"  {path.absolute()}")
        return False

    print(f"Loading crypto features from: {crypto_file}")
    df = pd.read_parquet(crypto_file)
    total_rows = len(df)

    print(f"Loaded {total_rows} rows with {len(df.columns)} columns")

    # Snapshot the column first: the extraction step modifies df in place,
    # so the before/after comparison needs its own copy.
    symbols_before = df['symbol'].copy() if 'symbol' in df.columns else None
    null_symbols_before = (
        symbols_before.isnull().sum() if symbols_before is not None else total_rows
    )
    # Guard the percentage against an empty frame (0 / 0).
    pct_before = null_symbols_before / total_rows * 100 if total_rows else 0.0
    print(f"Null symbols before: {null_symbols_before} ({pct_before:.1f}%)")

    df_fixed = extract_symbol_from_exchange_symbols(df)

    if 'symbol' in df_fixed.columns:
        symbols_after = df_fixed['symbol']
        null_symbols_after = symbols_after.isnull().sum()
        if symbols_before is None:
            # The column was created by the extraction step; it only matters
            # if at least one value was filled in.
            symbols_changed = bool(symbols_after.notna().any())
        else:
            # Detects normalization-only changes too, which leave the null
            # count untouched and would otherwise never be persisted.
            symbols_changed = not symbols_before.equals(symbols_after)
    else:
        null_symbols_after = len(df_fixed)
        symbols_changed = False

    print("Successfully extracted crypto symbols!")

    # The "<= 2" clause is kept so every case that used to be saved still is.
    if symbols_changed or null_symbols_after <= 2:
        df_fixed.to_parquet(crypto_file)

    return True  # Success even if no changes were needed

def process_stocks_features():
    """Load the stocks features parquet file and fill missing symbols if needed.

    The file is looked up at a few paths relative to the current working
    directory. When every symbol is already populated nothing else is done.
    Otherwise ``extract_symbol_from_exchange_symbols`` is applied and the file
    is written back only if the number of null symbols went down.

    Returns:
        ``False`` when the features file cannot be found, ``True`` otherwise
        (not extracting anything is not treated as an error for stocks).
    """
    candidate_paths = [
        Path('data/merged/features/stocks_features.parquet'),
        Path('../../data/merged/features/stocks_features.parquet'),
        Path('../../../data/merged/features/stocks_features.parquet'),
    ]

    stocks_file = next((path for path in candidate_paths if path.exists()), None)
    if stocks_file is None:
        # Say where we looked, so the failure reported by the caller is
        # explainable from the log (mirrors the crypto processing step).
        print("Stocks features file not found in any of these locations:")
        for path in candidate_paths:
            print(f"  {path.absolute()}")
        return False

    df = pd.read_parquet(stocks_file)
    total_rows = len(df)

    # A missing 'symbol' column counts as "every row is null".
    null_symbols_before = (
        df['symbol'].isnull().sum() if 'symbol' in df.columns else total_rows
    )
    # Guard the percentage against an empty frame (0 / 0).
    pct_before = null_symbols_before / total_rows * 100 if total_rows else 0.0
    print(f"Null symbols before: {null_symbols_before} ({pct_before:.1f}%)")

    if null_symbols_before == 0:
        print("Stocks symbols are already populated, skipping")
        return True

    # Stocks may use different symbol patterns; the shared extractor is
    # applied as-is.
    df_fixed = extract_symbol_from_exchange_symbols(df)

    rows_after = len(df_fixed)
    null_symbols_after = (
        df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else rows_after
    )
    symbols_fixed = null_symbols_before - null_symbols_after
    pct_after = null_symbols_after / rows_after * 100 if rows_after else 0.0

    print("\nResults:")
    print(f"- Symbols fixed: {symbols_fixed}")
    print(f"- Null symbols after: {null_symbols_after} ({pct_after:.1f}%)")

    if symbols_fixed > 0:
        print(f"\nSaving fixed stocks features to: {stocks_file}")
        df_fixed.to_parquet(stocks_file)
        print("File saved successfully!")
    else:
        print("No symbols were extracted/fixed for stocks")

    return True

def main():
    """Run symbol extraction for crypto and stocks features and report results.

    Returns:
        ``True`` when both processing steps report success, ``False`` otherwise.
    """
    banner_lines = (
        "=== EXTRACTING SYMBOLS FROM EXCHANGE DATA ===",
        "This script extracts base symbols from symbols.* columns",
        "Example: symbols.gateio:'BTC_USDT' -> symbol:'BTC'",
    )
    for line in banner_lines:
        print(line)
    print()

    separator = "=" * 50

    # Crypto features come first, then stocks, with a separator in between.
    print("Processing crypto features...")
    crypto_ok = process_crypto_features()

    print("\n" + separator + "\n")

    print("Processing stocks features...")
    stocks_ok = process_stocks_features()

    print("\n" + separator)

    # Per-dataset summary.
    print(
        "Successfully extracted crypto symbols!"
        if crypto_ok
        else "Failed to extract crypto symbols!"
    )
    print(
        "Stocks symbols processing completed!"
        if stocks_ok
        else "Failed to process stocks symbols!"
    )

    # Overall verdict: any failed step makes the whole run a failure.
    if not (crypto_ok and stocks_ok):
        print("\nSome issues occurred during symbol extraction")
        return False

    print("\nSymbol extraction completed successfully!")
    return True

if __name__ == "__main__":
    # The exit status mirrors main(): 0 when it reports success, 1 otherwise.
    success = main()
    exit_status = 1
    if success:
        exit_status = 0
    sys.exit(exit_status)