#!/bin/bash
# Extract embeddings for Site 1 (Split 1)
# Usage: bash extract_site1.sh
#
# Runs extract-embeddings.py (resolved relative to the current directory) on
# the split-1 train and test PID CSVs, then gathers the resulting parquet
# files and a trimmed copy of each label CSV under ${OUTPUT_BASE}/fl_ready.

# Exit on error, on use of an unset variable, and when any stage of a
# pipeline fails (plain `set -e` only sees the last stage's status).
set -euo pipefail

# Configuration
readonly ROOT_DIR="/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/"  # Update this to your NLST data path
readonly OUTPUT_BASE="site1_data"
readonly SITE_SPLITS="../subsets/site_splits"

# Derived paths (kept in one place so train/test stay in sync)
readonly TRAIN_CSV="${SITE_SPLITS}/train_pid_labelsT0T7_split_1.csv"
readonly TEST_CSV="${SITE_SPLITS}/test_pid_labelsT0T7_split_1.csv"
readonly FL_DIR="${OUTPUT_BASE}/fl_ready"

# Print an error message to stderr and abort the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Run the embedding extractor for one split.
# Globals:   ROOT_DIR (read)
# Arguments: $1 - PID/label CSV for the split
#            $2 - directory that receives the extractor output
#######################################
run_extraction() {
  local pid_csv=$1
  local out_dir=$2

  python extract-embeddings.py \
    --root-dir "${ROOT_DIR}" \
    --pid-csv "${pid_csv}" \
    --output-dir "${out_dir}" \
    --num-workers 8 \
    --checkpoint-interval 500
}

#######################################
# Write fields 1 and 14-20 of a comma-separated file to a new file.
# The header row is an ordinary line to cut, so a single pass yields the
# same output as cutting the header and the body separately.
# NOTE(review): plain `cut` does not understand quoted CSV fields; this
# assumes no field contains an embedded comma - confirm against the data.
# Arguments: $1 - source CSV
#            $2 - destination CSV (overwritten)
#######################################
write_labels() {
  local src_csv=$1
  local dst_csv=$2

  cut -d, -f1,14-20 -- "${src_csv}" > "${dst_csv}"
}

main() {
  # Fail fast: without this, a missing test CSV would only surface after
  # the whole training extraction had already run.
  local csv
  for csv in "${TRAIN_CSV}" "${TEST_CSV}"; do
    [[ -r "${csv}" ]] || die "input CSV not found or not readable: ${csv}"
  done

  echo "============================================"
  echo "SITE 1 - Embedding Extraction"
  echo "============================================"
  echo ""

  # Create output directories
  mkdir -p -- "${OUTPUT_BASE}/train"
  mkdir -p -- "${OUTPUT_BASE}/test"

  # Extract training embeddings
  echo "📦 Extracting TRAINING embeddings..."
  run_extraction "${TRAIN_CSV}" "${OUTPUT_BASE}/train"

  echo ""
  echo "✓ Training embeddings complete!"
  echo ""

  # Extract test embeddings
  echo "📦 Extracting TEST embeddings..."
  run_extraction "${TEST_CSV}" "${OUTPUT_BASE}/test"

  echo ""
  echo "✓ Test embeddings complete!"
  echo ""

  # Prepare files for federated learning
  echo "📋 Preparing files for federated learning..."
  mkdir -p -- "${FL_DIR}"

  # Copy and rename embeddings
  cp -- "${OUTPUT_BASE}/train/all_embeddings.parquet" "${FL_DIR}/site1_embeddings_train.parquet"
  cp -- "${OUTPUT_BASE}/test/all_embeddings.parquet" "${FL_DIR}/site1_embeddings_test.parquet"

  # Extract just pid and label columns from the CSV files
  echo "Creating site1_labels-train.csv..."
  write_labels "${TRAIN_CSV}" "${FL_DIR}/site1_labels-train.csv"

  echo "Creating site1_labels-test.csv..."
  write_labels "${TEST_CSV}" "${FL_DIR}/site1_labels-test.csv"

  echo ""
  echo "============================================"
  echo "SITE 1 - COMPLETE! ✅"
  echo "============================================"
  echo ""
  echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
  ls -lh -- "${FL_DIR}/"
  echo ""
  echo "Files ready for federated learning:"
  echo " ✓ site1_embeddings_train.parquet"
  echo " ✓ site1_embeddings_test.parquet"
  echo " ✓ site1_labels-train.csv"
  echo " ✓ site1_labels-test.csv"
  echo ""
}

main "$@"