|
import os |
|
from pathlib import Path |
|
import pandas as pd |
|
import glob |
|
|
|
|
|
# Locate the data root. Try the package-style import first, then a flat
# top-level `config` module, so the script works from either layout.
try:
    from src.config import DATA_DIR as CFG_DATA_DIR
except Exception:
    try:
        from config import DATA_DIR as CFG_DATA_DIR
    except Exception:
        # Neither config module could be imported (any exception counts):
        # fall back to a fixed default location.
        CFG_DATA_DIR = "/data"
|
|
|
|
|
def _resolve_under_data(path_like: str | os.PathLike) -> Path: |
|
"""Map a repo-style path like 'data/...' to <DATA_DIR>/...; keep absolute paths as-is.""" |
|
p = Path(path_like) |
|
if p.is_absolute(): |
|
return p |
|
parts = p.parts |
|
if parts and parts[0].lower() == "data": |
|
rel = Path(*parts[1:]) if len(parts) > 1 else Path() |
|
else: |
|
rel = p |
|
return Path(CFG_DATA_DIR) / rel |
|
|
|
def load_company_profiles(profiles_dir):
    """Load every ``*_company_profile.parquet`` file in *profiles_dir*.

    Each file is read with ``pd.read_parquet`` and given a ``symbol`` column
    taken from its file name: everything before the
    ``_company_profile.parquet`` suffix. Symbols that themselves contain an
    underscore (e.g. ``BRK_B``) are therefore kept intact.

    Args:
        profiles_dir: Directory searched (non-recursively) for profile files.

    Returns:
        A DataFrame indexed by ``symbol`` containing the concatenated rows of
        all profile files, or an empty ``pd.DataFrame()`` if none were found.
    """
    suffix = "_company_profile.parquet"
    # Sort so the resulting row order does not depend on the order in which
    # the filesystem happens to list the directory.
    profile_files = sorted(glob.glob(os.path.join(profiles_dir, "*" + suffix)))

    profiles = []
    for file in profile_files:
        df = pd.read_parquet(file)
        # Strip the known suffix instead of splitting on '_', which would
        # truncate any symbol that contains an underscore.
        df["symbol"] = os.path.basename(file)[: -len(suffix)]
        profiles.append(df)

    if not profiles:
        return pd.DataFrame()

    return pd.concat(profiles, ignore_index=True).set_index("symbol")
|
|
|
def merge_company_info_to_features(features_path, profiles_dir, output_path):
    """Join company profile columns onto the stocks features table and save it.

    All three paths are first passed through ``_resolve_under_data``.

    Args:
        features_path: Parquet file holding the features table; it must have
            a ``symbol`` column to join on.
        profiles_dir: Directory containing ``*_company_profile.parquet`` files.
        output_path: Destination parquet file for the merged table.

    Returns:
        The merged DataFrame that was written to *output_path*.
    """
    features_path = _resolve_under_data(features_path)
    profiles_dir = _resolve_under_data(profiles_dir)
    output_path = _resolve_under_data(output_path)

    features_df = pd.read_parquet(features_path)
    profiles_df = load_company_profiles(profiles_dir)

    # Match each feature row's 'symbol' against the profiles index. Profile
    # columns whose names clash with feature columns get a '_company' suffix.
    merged_df = features_df.join(profiles_df, on="symbol", rsuffix="_company")

    # Create the destination directory if needed so the write does not fail
    # when the output goes somewhere that does not exist yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_path, compression="snappy")
    return merged_df
|
|
|
|
|
def main():
    """Merge company profiles into the stocks features file and report where it was saved."""
    features_path = "data/merged/features/stocks_features.parquet"
    profiles_dir = "data/finnhub/company_info"
    # The merged table is written back over the features file itself.
    output_path = features_path
    merge_company_info_to_features(features_path, profiles_dir, output_path)
    # Report the resolved location actually written to, not the repo-style
    # input string, which is remapped under the configured data directory.
    saved_to = _resolve_under_data(output_path)
    print(f"[INFO] Merged company info into features and saved to: {saved_to}")


# Run the merge with the default paths when executed as a script.
if __name__ == "__main__":
    main()