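"""Orchestrates the data-merge pipeline: runs the merge scripts in sequence,
copies raw feature files aside, archives feature records older than DAYS_OLD
days, generates a full report, and uploads the merged data to the configured
Filebase bucket."""
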
import subprocess
from pathlib import Path
import sys
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv

DAYS_OLD = 7
MERGED_PATH = Path("data/merged/features/merged_features.parquet")
ARCHIVE_DIR = Path("data/merged/archive")
ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

def run_script(script, args=None):
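    """Run a sibling script with the current interpreter; raises CalledProcessError on failure."""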
    cmd = [sys.executable, str(Path(__file__).parent / script)]
    if args:
        cmd += args
    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, check=True)
    return result

def archive_old_records():
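    """Move records older than DAYS_OLD days out of the crypto/stocks feature files
    into per-day parquet files under ARCHIVE_DIR/{YYYYMMDD}/, then write the
    remaining records back in place."""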
    feature_files = [
        Path("data/merged/features/crypto_features.parquet"),
        Path("data/merged/features/stocks_features.parquet")
    ]
    now = datetime.utcnow()
    cutoff = int((now - timedelta(days=DAYS_OLD)).timestamp() * 1000)
    for feature_path in feature_files:
        if not feature_path.exists():
            print(f"[WARN] {feature_path} does not exist.")
            continue
        df = pd.read_parquet(feature_path)
        old = df.loc[df['interval_timestamp'] < cutoff].copy()
        keep = df.loc[df['interval_timestamp'] >= cutoff].copy()
        if old.empty:
            print(f"[INFO] No records to archive in {feature_path}.")
            continue
        # Group by day (UTC) and write each group to a separate parquet file under archive/{day}/
        old['archive_date'] = pd.to_datetime(old['interval_timestamp'], unit='ms').dt.strftime('%Y%m%d')
        for day, group in old.groupby('archive_date'):
            day_dir = ARCHIVE_DIR / day
            day_dir.mkdir(parents=True, exist_ok=True)
            out_path = day_dir / f"{feature_path.stem}_archived_{day}.parquet"
            if out_path.exists():
                existing = pd.read_parquet(out_path)
                group = pd.concat([existing, group.drop(columns=['archive_date'])], ignore_index=True)
            else:
                group = group.drop(columns=['archive_date'])
            group.to_parquet(out_path, index=False)
            print(f"[ARCHIVE] {len(group)} records -> {out_path}")
        # Save the remaining (unarchived) records back to the feature file
        keep.to_parquet(feature_path, index=False)
        print(f"[INFO] Archived {len(old)} records from {feature_path}. {len(keep)} remain.")

def store_in_cloud():
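    """Upload everything under data/merged (except the archive tree) to the configured
    Filebase bucket, plus archive files modified within the last DAYS_OLD days."""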
    # Import StorageHandler from cloud_utils, ensuring src is in sys.path
    import os
    import sys
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'src')))
    from data_cloud.cloud_utils import StorageHandler
    # Filebase credentials from env
    load_dotenv()
    endpoint_url = os.getenv("FILEBASE_ENDPOINT")
    access_key = os.getenv("FILEBASE_ACCESS_KEY")
    secret_key = os.getenv("FILEBASE_SECRET_KEY")
    bucket_name = os.getenv("FILEBASE_BUCKET")
    if not all([endpoint_url, access_key, secret_key, bucket_name]):
        print("[ERROR] Filebase credentials not set in environment.")
        return
    storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name)
    merged_dir = os.path.join("data", "merged")
    archive_dir = os.path.join(merged_dir, "archive")
    # Upload all files in merged, skipping the archive tree (it is handled below with an age filter)
    for root, dirs, files in os.walk(merged_dir):
        dirs[:] = [d for d in dirs if os.path.join(root, d) != archive_dir]
        for fname in files:
            local_path = os.path.join(root, fname)
            rel_path = os.path.relpath(local_path, "data")
            key = rel_path.replace(os.sep, "/")
            with open(local_path, "rb") as f:
                data = f.read()
            storage.upload(key, data)
    # Only upload archive files modified within the last DAYS_OLD days.
    # Archived records live in per-day subdirectories, so walk the tree recursively.
    import time
    cutoff = time.time() - DAYS_OLD * 86400
    if os.path.exists(archive_dir):
        for root, _, files in os.walk(archive_dir):
            for fname in files:
                local_path = os.path.join(root, fname)
                if os.path.getmtime(local_path) < cutoff:
                    continue
                rel_path = os.path.relpath(local_path, "data")
                key = rel_path.replace(os.sep, "/")
                with open(local_path, "rb") as f:
                    data = f.read()
                storage.upload(key, data)

# Save stocks and crypto features to data/merged/raw
def save_raw_features():
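    """Copy the stocks and crypto feature parquet files into data/merged/raw."""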
    import shutil
    raw_dir = Path('data/merged/raw')
    raw_dir.mkdir(parents=True, exist_ok=True)
    src_stocks = Path('data/merged/features/stocks_features.parquet')
    src_crypto = Path('data/merged/features/crypto_features.parquet')
    dst_stocks = raw_dir / 'stocks_features.parquet'
    dst_crypto = raw_dir / 'crypto_features.parquet'
    if src_stocks.exists():
        shutil.copy2(src_stocks, dst_stocks)
        print(f"[RAW] Saved stocks features to {dst_stocks}")
    else:
        print(f"[RAW] Source stocks features not found: {src_stocks}")
    if src_crypto.exists():
        shutil.copy2(src_crypto, dst_crypto)
        print(f"[RAW] Saved crypto features to {dst_crypto}")
    else:
        print(f"[RAW] Source crypto features not found: {src_crypto}")

def main():
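    """Run all merge steps, then archive old records, generate the report, and upload to cloud storage."""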
    # Run all merge steps
    run_script('merge_0.py')
    run_script('merge_1.py', [
        '--latest', 'data/advisorai-data/features/latest_features.parquet',
        '--finnhub', 'data/advisorai-data/features/latest_features.parquet',
        '--out', 'data/merged/features/merged_features.parquet'
    ])
    run_script('merge_2.py')
    run_script('merge_3.py')
    run_script('merge_4.py')
    run_script('separator.py')
    run_script('merge_5.py')
    run_script('merge_6.py')
    run_script('merge_7.py')
    save_raw_features()
    # Extract symbols from exchange symbol data before data fillers
    try:
        run_script('extract_symbols.py')
    except subprocess.CalledProcessError as e:
        print(f"[WARNING] Symbol extraction failed: {e}")
    # Remove rows with null symbols after symbol extraction
    try:
        run_script('remove_null_symbols.py')
    except subprocess.CalledProcessError as e:
        print(f"[WARNING] Null symbol removal failed: {e}")
    # # Run normalization scripts with error handling
    # run_script('stocks_data_filler.py')
    # try:
    #     run_script('crypto_data_filler.py')
    # except subprocess.CalledProcessError as e:
    #     print(f"[WARNING] Crypto data filler failed: {e}")
    # Merge temp files into merged - with error handling
    try:
        run_script('merge_temp.py')
    except subprocess.CalledProcessError as e:
        print(f"[WARNING] Merge temp failed: {e}")
    try:
        run_script('merge_sant.py')
    except subprocess.CalledProcessError as e:
        print(f"[WARNING] Santiment merge failed: {e}")
    try:
        run_script('merge_santiment_with_crypto.py')
    except subprocess.CalledProcessError as e:
        print(f"[WARNING] Santiment-crypto merge failed: {e}")
    # # Final comprehensive null value handling - clean up any remaining nulls
    # try:
    #     run_script('run_final_null_handling.py')
    # except subprocess.CalledProcessError as e:
    #     print(f"[WARNING] Final null handling failed: {e}")
    # # Normalize features
    # run_script('normalize.py')
    # # Normalize train files for both crypto and stocks
    # run_script('norm/crypto.py', ['--train'])
    # run_script('norm/stocks.py', ['--train'])
    # Archive old records
    archive_old_records()
    # Generate and store full report
    run_script('full_report.py')
    # Store all merged data in cloud
    store_in_cloud()
    print("[OK] All merge steps, archiving, reporting, and cloud upload completed successfully.")

if __name__ == "__main__":
    main()