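# NOTE(review): stray header text (looks like repository metadata); kept
# verbatim as comments so the module can be parsed.
# Maaroufabousaleh
# f
# c49b21b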
import os
from pathlib import Path
import pandas as pd
import glob
# Resolve the data root the same way the other modules do: start from the
# hard-coded fallback, then let the project config override it if importable.
CFG_DATA_DIR = "/data"
try:
    # Package-style import, used when this file is run as a module.
    from src.config import DATA_DIR as CFG_DATA_DIR
except Exception:
    try:
        # Flat import, used when this file is run directly as a script.
        from config import DATA_DIR as CFG_DATA_DIR
    except Exception:
        # Neither config could be loaded; keep the "/data" fallback set above.
        pass
def _resolve_under_data(path_like: str | os.PathLike) -> Path:
    """Translate a repo-style path into one rooted at the configured data directory.

    Absolute paths are returned unchanged. For a relative path, a leading
    'data' component (matched case-insensitively) is dropped and the rest is
    appended to CFG_DATA_DIR; any other relative path is appended as-is.
    """
    candidate = Path(path_like)
    if candidate.is_absolute():
        return candidate
    segments = candidate.parts
    starts_with_data = bool(segments) and segments[0].lower() == "data"
    # Path(*()) is the empty path, so a bare 'data' resolves to the data root.
    remainder = Path(*segments[1:]) if starts_with_data else candidate
    return Path(CFG_DATA_DIR) / remainder
def load_company_profiles(profiles_dir):
    """
    Load all company profile parquet files from the directory into a DataFrame.

    Args:
        profiles_dir: Directory (str or path-like) searched, non-recursively,
            for files named ``<symbol>_company_profile.parquet``.

    Returns:
        A DataFrame holding the rows of every matching file, indexed by
        ``symbol``. The symbol is the part of the filename before the
        ``_company_profile.parquet`` suffix and replaces any ``symbol`` column
        stored in the file. When no file matches, an empty DataFrame with an
        empty ``symbol`` index is returned.
    """
    suffix = '_company_profile.parquet'
    # Escape the directory so glob metacharacters in it are matched literally.
    pattern = os.path.join(glob.escape(os.fspath(profiles_dir)), '*' + suffix)
    profiles = []
    # Sort so the row order does not depend on directory listing order.
    for file in sorted(glob.glob(pattern)):
        df = pd.read_parquet(file)
        # Cut off the known suffix instead of splitting on the first '_', so a
        # symbol that itself contains an underscore is kept whole.
        df['symbol'] = os.path.basename(file)[:-len(suffix)]
        profiles.append(df)
    if not profiles:
        # Keep the "indexed by symbol" contract for the empty result too.
        return pd.DataFrame(index=pd.Index([], name='symbol'))
    return pd.concat(profiles, ignore_index=True).set_index('symbol')
def merge_company_info_to_features(features_path, profiles_dir, output_path):
    """
    Merge company profile info into stocks features DataFrame by symbol.

    Args:
        features_path: Parquet file with the features; must contain a
            ``symbol`` column when there are profile columns to attach.
        profiles_dir: Directory holding ``*_company_profile.parquet`` files.
        output_path: Where the merged frame is written (snappy parquet).

    All three paths are passed through ``_resolve_under_data`` first.

    Returns:
        The merged DataFrame that was written to ``output_path``. Profile
        columns whose names clash with feature columns get a ``_company``
        suffix.
    """
    # Resolve all paths under DATA_DIR
    features_path = _resolve_under_data(features_path)
    profiles_dir = _resolve_under_data(profiles_dir)
    output_path = _resolve_under_data(output_path)

    features_df = pd.read_parquet(features_path)
    profiles_df = load_company_profiles(profiles_dir)

    if profiles_df.columns.empty:
        # No profile columns to attach (e.g. no profile files were found), so
        # the features pass through unchanged instead of being joined.
        merged_df = features_df
    else:
        # Left join on the 'symbol' column against the profile index.
        # NOTE(review): a symbol occurring more than once in the profile index
        # repeats the matching feature rows; and if the features file already
        # holds profile columns (output_path == features_path on a re-run),
        # the incoming ones are added again with the '_company' suffix.
        merged_df = features_df.join(profiles_df, on='symbol', rsuffix='_company')

    # to_parquet does not create missing directories, so make sure it exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_parquet(output_path, compression='snappy')
    return merged_df
# Example usage
def main():
    """Merge company profiles into the default stocks features file, in place."""
    features_path = "data/merged/features/stocks_features.parquet"
    # The merged frame overwrites the features file it was read from.
    output_path = features_path
    merge_company_info_to_features(
        features_path=features_path,
        profiles_dir="data/finnhub/company_info",
        output_path=output_path,
    )
    print(f"[INFO] Merged company info into features and saved to: {output_path}")


if __name__ == "__main__":
    main()