qhuang20 and ryankeivanfar committed
Commit 7f61943 · verified · 1 Parent(s): be788c6

create scripts/datasets.py (#2)


- create scripts/datasets.py (06ac44905efe4681adf6d4de95ce3ef99b9134d1)


Co-authored-by: Ryan Keivanfar <[email protected]>

Files changed (1)
  1. scripts/datasets.py +557 -0
scripts/datasets.py ADDED
# datasets.py

import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import pyfaidx
import kipoiseq.transforms.functional
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator

# --- Global Config ---
# Enformer typically uses a 196,608 bp input sequence.
# We use a shorter input (1/4 of the usual length) to speed up training.
ENFORMER_INPUT_SEQ_LENGTH = 49_152

# Relative paths from the project root directory
GENOME_FASTA_PATH = "data/hg38.fa"
TSS_REGIONS_CSV_PATH = "data/Enformer_genomic_regions_TSSCenteredGenes_FixedOverlapRemoval.csv"

# Path to pseudobulk target data, matching the provided dummy file
PSEUDOBULK_TARGET_DATA_PATH = "data/pseudobulk_dummy.csv"

# ----------------------


class GenomeOneHotEncoder:
    """
    Encodes DNA sequences into one-hot format using kipoiseq.
    """
    def __init__(self, sequence_length: int = ENFORMER_INPUT_SEQ_LENGTH):
        self.sequence_length = sequence_length

    @staticmethod
    def _one_hot_encode(sequence: str) -> np.ndarray:
        # One-hot encodes DNA using the same code as the original Enformer paper,
        # ensuring the encoding is consistent with the representations Enformer
        # has already learned.
        return kipoiseq.transforms.functional.one_hot_dna(sequence).astype(np.float32)

    def encode(self, seq: str) -> np.ndarray:
        """
        One-hot encodes a DNA sequence using kipoiseq.

        Args:
            seq (str): The DNA sequence string. The FastaReader should ensure this
                sequence is already uppercase and of length ENFORMER_INPUT_SEQ_LENGTH.

        Returns:
            np.ndarray: A numpy array, typically (L, 4) for DNA, with the one-hot
                encoded sequence.
        """
        return GenomeOneHotEncoder._one_hot_encode(seq)
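
# Quick illustrative check of the encoder (kipoiseq orders the channels A, C, G, T):
#
#   encoder = GenomeOneHotEncoder()
#   one_hot = encoder.encode("ACGT")
#   # one_hot.shape == (4, 4), one_hot.dtype == np.float32,
#   # and one_hot[0] is the one-hot row for 'A'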


class FastaReader:
    """
    Reads sequences from a FASTA file using pyfaidx.
    Handles chromosome boundary conditions by padding with 'N'.
    """
    def __init__(self, fasta_path: str):
        self.fasta_path = fasta_path
        self.genome = None
        try:
            self.genome = pyfaidx.Fasta(self.fasta_path, sequence_always_upper=True)
            print(f"Successfully loaded and indexed genome using pyfaidx from: {self.fasta_path}")
        except pyfaidx.FastaIndexingError as e:
            print(f"Error: Could not index FASTA file at {self.fasta_path}.")
            print("Ensure it's a valid FASTA file and the .fai index can be created/read in its directory.")
            print(f"pyfaidx error: {e}")
            raise
        except FileNotFoundError:
            print(f"Error: FASTA file not found at {self.fasta_path}.")
            raise

    def get_sequence(self, chrom: str, start_0based: int, end_0based_exclusive: int) -> str:
        """
        Fetches a DNA sequence for the given 0-based genomic interval.
        Pads with 'N' if the interval extends beyond chromosome boundaries.

        Args:
            chrom (str): Chromosome name (e.g., 'chr1').
            start_0based (int): 0-based start coordinate (inclusive).
            end_0based_exclusive (int): 0-based end coordinate (exclusive).

        Returns:
            str: The DNA sequence, padded with 'N's to match the requested length
                (end_0based_exclusive - start_0based).
        """
        if self.genome is None:
            raise RuntimeError("FastaReader not properly initialized (pyfaidx missing or genome loading failed).")

        # Sanitize chromosome name (e.g., '1' vs 'chr1')
        true_chrom_name = chrom
        if chrom not in self.genome:
            alternative_chrom_name = 'chr' + chrom if not chrom.startswith('chr') else chrom.replace('chr', '', 1)
            if alternative_chrom_name in self.genome:
                true_chrom_name = alternative_chrom_name
            else:
                available_chroms_sample = list(self.genome.keys())[:5]
                raise ValueError(
                    f"Chromosome '{chrom}' (and alternative '{alternative_chrom_name}') not found in FASTA file. "
                    f"Available chromosomes sample: {available_chroms_sample}..."
                )

        chrom_len = len(self.genome[true_chrom_name])
        seq_len_requested = end_0based_exclusive - start_0based

        # Collect sequence parts; out-of-bounds stretches are padded with 'N'
        sequence_parts = []

        # Handle padding at the beginning
        padding_start_len = 0
        if start_0based < 0:
            padding_start_len = abs(start_0based)
            sequence_parts.append('N' * padding_start_len)
            effective_start = 0
        else:
            effective_start = start_0based

        # Determine the part of the sequence to fetch from the FASTA
        fetch_len = min(end_0based_exclusive, chrom_len) - effective_start

        if fetch_len > 0:
            sequence_parts.append(self.genome[true_chrom_name][effective_start : effective_start + fetch_len].seq)
        elif effective_start >= chrom_len:  # Requested start is beyond the chromosome end
            pass  # No sequence to fetch, only padding needed

        # Handle padding at the end
        current_len = sum(len(p) for p in sequence_parts)
        padding_end_len = seq_len_requested - current_len
        if padding_end_len > 0:
            sequence_parts.append('N' * padding_end_len)

        final_sequence = "".join(sequence_parts)

        # Final length check; this should be guaranteed by the logic above
        if len(final_sequence) != seq_len_requested:
            # This indicates a logic error in padding/fetching
            raise RuntimeError(
                f"Internal error: Final sequence length {len(final_sequence)} for {true_chrom_name}:{start_0based}-{end_0based_exclusive} "
                f"does not match requested {seq_len_requested}."
            )
        return final_sequence
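
# Illustrative sketch of the boundary handling (assumes the hg38 FASTA exists at
# the default path): a window overhanging the chromosome start comes back
# 'N'-padded to the exact requested length.
#
#   reader = FastaReader(GENOME_FASTA_PATH)
#   seq = reader.get_sequence("chr1", -5, 10)  # 15 bp requested
#   # len(seq) == 15 and seq[:5] == "NNNNN"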

# --- Main Dataset Classes ---

class TahoeDataset(Dataset):
    """
    PyTorch Dataset for loading Tahoe data for Enformer fine-tuning.
    - Reads genomic regions from a regions CSV.
    - Reads pseudobulk conditions and expression values from a pseudobulk CSV.
    - Merges these two data sources based on gene identifiers.
    - Each sample represents a unique gene-condition pair.
    - Fetches the DNA sequence for the gene, resized to `enformer_input_seq_length`.
    - One-hot encodes the sequence and returns it with the specific expression value.
    """
    ORIGINAL_ENFORMER_WINDOW_SIZE = 196_608

    def __init__(self,
                 tss_regions_csv_path: str,
                 genome_fasta_path: str,
                 pseudobulk_data_path: str,
                 enformer_input_seq_length: int = ENFORMER_INPUT_SEQ_LENGTH,
                 regions_csv_gene_col: str = 'gene_name',   # Gene ID column in tss_regions_csv
                 pseudobulk_csv_gene_col: str = 'gene_id',  # Gene ID column in pseudobulk_data_csv
                 regions_csv_chr_col: str = 'seqnames',     # Chromosome column in tss_regions_csv
                 regions_csv_start_col: str = 'starts',     # 0-based start col in tss_regions_csv
                 regions_csv_end_col: str = 'ends'):        # 0-based exclusive end col in tss_regions_csv
        super().__init__()

        self.enformer_input_seq_length = enformer_input_seq_length
        # Store column names for clarity
        self.regions_gene_col = regions_csv_gene_col
        self.pseudobulk_gene_col = pseudobulk_csv_gene_col
        self.regions_chr_col = regions_csv_chr_col
        self.regions_start_col = regions_csv_start_col
        self.regions_end_col = regions_csv_end_col

        print("Initializing TahoeDataset...")
        print(f"  Target model input sequence length: {self.enformer_input_seq_length} bp")
        print(f"  Genomic regions are assumed to define a {self.ORIGINAL_ENFORMER_WINDOW_SIZE} bp window for centering.")

        # Load genomic regions data
        print(f"  Loading TSS regions from: {tss_regions_csv_path}")
        try:
            regions_df = pd.read_csv(tss_regions_csv_path)
            print(f"  Successfully loaded regions CSV with {len(regions_df)} gene region entries.")
            expected_region_cols = [self.regions_chr_col, self.regions_gene_col,
                                    self.regions_start_col, self.regions_end_col]
            missing_region_cols = [col for col in expected_region_cols if col not in regions_df.columns]
            if missing_region_cols:
                raise ValueError(f"Missing columns in regions CSV ('{tss_regions_csv_path}'): {missing_region_cols}. Expected: {expected_region_cols}")
        except FileNotFoundError:
            print(f"FATAL ERROR: Regions CSV file not found at {tss_regions_csv_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading or validating regions CSV: {e}")
            raise

        # Load pseudobulk target data
        print(f"  Loading pseudobulk targets from: {pseudobulk_data_path}")
        try:
            pseudobulk_df = pd.read_csv(pseudobulk_data_path)
            print(f"  Successfully loaded pseudobulk CSV with {len(pseudobulk_df)} condition entries.")
            expected_pb_cols = [self.pseudobulk_gene_col, 'cell_line', 'drug_id', 'drug_dose', 'expression']
            missing_pb_cols = [col for col in expected_pb_cols if col not in pseudobulk_df.columns]
            if missing_pb_cols:
                raise ValueError(f"Missing columns in pseudobulk CSV ('{pseudobulk_data_path}'): {missing_pb_cols}. Expected: {expected_pb_cols}")
        except FileNotFoundError:
            print(f"FATAL ERROR: Pseudobulk CSV file not found at {pseudobulk_data_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading or validating pseudobulk CSV: {e}")
            raise

        # Merge regions with pseudobulk data
        print("  Merging genomic regions with pseudobulk target data...")
        print(f"  Regions gene column: '{self.regions_gene_col}', Pseudobulk gene column: '{self.pseudobulk_gene_col}'")

        regions_df[self.regions_gene_col] = regions_df[self.regions_gene_col].astype(str)
        pseudobulk_df[self.pseudobulk_gene_col] = pseudobulk_df[self.pseudobulk_gene_col].astype(str)

        self.samples_df = pd.merge(
            regions_df,
            pseudobulk_df,
            left_on=self.regions_gene_col,
            right_on=self.pseudobulk_gene_col,
            how='inner'  # Keeps only genes present in both DataFrames
        )
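
        # For intuition: if the regions CSV contains genes {A, B} and the
        # pseudobulk CSV contains rows (A, drug1), (A, drug2), (C, drug1),
        # the inner merge yields two samples, (A, drug1) and (A, drug2);
        # B and C are dropped (and reported below).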

        if len(self.samples_df) == 0:
            print("WARNING: The merge operation resulted in an empty DataFrame.")
            print(f"  No common genes found between column '{self.regions_gene_col}' in the regions CSV")
            print(f"  and column '{self.pseudobulk_gene_col}' in the pseudobulk CSV.")
            print("  Please check that gene identifiers match and are of the same type in both files.")
            # Example gene IDs for debugging:
            if not regions_df.empty:
                print(f"  Sample gene IDs from regions CSV: {regions_df[self.regions_gene_col].unique()[:5].tolist()}")
            if not pseudobulk_df.empty:
                print(f"  Sample gene IDs from pseudobulk CSV: {pseudobulk_df[self.pseudobulk_gene_col].unique()[:5].tolist()}")
        else:
            print(f"  Successfully merged data: {len(self.samples_df)} total samples (gene-condition pairs).")

        # Check for genes in regions_df not found in pseudobulk_df (and thus dropped)
        original_region_genes = set(regions_df[self.regions_gene_col].unique())
        merged_region_genes = set(self.samples_df[self.regions_gene_col].unique())
        dropped_region_genes = original_region_genes - merged_region_genes
        if dropped_region_genes:
            print(f"  WARNING: {len(dropped_region_genes)} unique gene IDs from the regions CSV ('{self.regions_gene_col}') were not found in the pseudobulk CSV ('{self.pseudobulk_gene_col}') and were dropped.")
            print(f"    Examples of dropped region gene IDs: {list(dropped_region_genes)[:min(5, len(dropped_region_genes))]}")

        # Check for genes in pseudobulk_df not found in regions_df (and thus dropped)
        original_pseudobulk_genes = set(pseudobulk_df[self.pseudobulk_gene_col].unique())
        final_merged_keys_from_pseudobulk_perspective = set(self.samples_df[self.pseudobulk_gene_col].unique())
        dropped_pseudobulk_genes = original_pseudobulk_genes - final_merged_keys_from_pseudobulk_perspective

        if dropped_pseudobulk_genes:
            print(f"  WARNING: {len(dropped_pseudobulk_genes)} unique gene IDs from the pseudobulk CSV ('{self.pseudobulk_gene_col}') were not found in the regions CSV ('{self.regions_gene_col}') and were dropped.")
            print(f"    Examples of dropped pseudobulk gene IDs: {list(dropped_pseudobulk_genes)[:min(5, len(dropped_pseudobulk_genes))]}")

        if 'expression' in self.samples_df and self.samples_df['expression'].isnull().any():
            print("WARNING: NA values found in 'expression' column after merge. These samples might cause errors or yield NaN targets.")
            print("  Consider handling these (e.g., fill with a default or drop rows via dropna(subset=['expression'])).")
            # self.samples_df.dropna(subset=['expression'], inplace=True)  # Example: drop rows with NA expression

        print(f"  Initializing FASTA reader for genome: {genome_fasta_path}")
        self.fasta_reader = FastaReader(genome_fasta_path)
        self.encoder = GenomeOneHotEncoder(sequence_length=self.enformer_input_seq_length)
        print("TahoeDataset initialized successfully.")

    def __len__(self):
        return len(self.samples_df)

    def __getitem__(self, idx: int):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if not (0 <= idx < len(self.samples_df)):
            raise IndexError(f"Index {idx} out of bounds for dataset of length {len(self.samples_df)}")

        sample_info = self.samples_df.iloc[idx]
        gene_name_for_logging = "<unknown>"  # Set before the try so the except blocks can always reference it

        try:
            chrom = str(sample_info[self.regions_chr_col])
            # Gene name from the regions CSV (used for the merge, should be consistent)
            gene_name_for_logging = str(sample_info[self.regions_gene_col])

            csv_region_start = int(sample_info[self.regions_start_col])
            csv_region_end = int(sample_info[self.regions_end_col])

            expression_value = float(sample_info['expression'])  # 'expression' is the target column
        except KeyError as e:
            print(f"FATAL ERROR in __getitem__ (idx {idx}): Missing expected column {e} in merged samples_df.")
            print(f"  Available columns: {self.samples_df.columns.tolist()}")
            print(f"  Sample info for this index: {sample_info.to_dict() if isinstance(sample_info, pd.Series) else sample_info}")
            raise
        except ValueError as e:
            print(f"FATAL ERROR in __getitem__ (idx {idx}): Could not convert data for gene {gene_name_for_logging}. Error: {e}")
            print(f"  Expression value was: '{sample_info.get('expression', 'N/A')}'")
            raise
        except Exception as e:  # Catch any other unexpected error for this item
            print(f"FATAL ERROR in __getitem__ (idx {idx}) for gene {gene_name_for_logging}: An unexpected error occurred: {type(e).__name__} - {e}")
            raise

        # --- Sequence window calculation ---
        actual_csv_window_len = csv_region_end - csv_region_start
        if actual_csv_window_len != self.ORIGINAL_ENFORMER_WINDOW_SIZE:
            # Warn if the input CSV regions are not consistently 196 kb.
            # The centering logic below will still work based on csv_region_end.
            print(f"WARNING for gene {gene_name_for_logging} (idx {idx}): Region {chrom}:{csv_region_start}-{csv_region_end} from CSV "
                  f"has length {actual_csv_window_len} bp, but expected {self.ORIGINAL_ENFORMER_WINDOW_SIZE} bp "
                  f"for the original window definition used for centering. Sequence extraction might be affected if assumptions are wrong.")

        # Initialize final sequence coordinates with those from the CSV.
        # These are used as-is if no resizing is needed.
        final_seq_start_0based = csv_region_start
        final_seq_end_0based_exclusive = csv_region_end

        # If the target model input sequence length differs from the original Enformer
        # window size, recalculate the start and end positions by centering the target
        # length within the original window.
        if self.enformer_input_seq_length != self.ORIGINAL_ENFORMER_WINDOW_SIZE:
            # Calculate the center of the ORIGINAL_ENFORMER_WINDOW_SIZE window.
            # Assumes 'csv_region_end' is the exclusive end of this original window.
            original_window_center = csv_region_end - (self.ORIGINAL_ENFORMER_WINDOW_SIZE // 2)

            half_target_seq_len = self.enformer_input_seq_length // 2
            final_seq_start_0based = original_window_center - half_target_seq_len
            # Ensure the end is exclusive and maintains the correct target sequence length
            final_seq_end_0based_exclusive = final_seq_start_0based + self.enformer_input_seq_length
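
        # Worked example of the centering above: for a CSV window of
        # chr1:0-196,608 and a 49,152 bp model input, the window center is
        # 196,608 - 98,304 = 98,304, so the fetched interval becomes
        # 98,304 - 24,576 = 73,728 to 73,728 + 49,152 = 122,880.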

        # Fetch and encode DNA sequence
        dna_sequence = self.fasta_reader.get_sequence(chrom, final_seq_start_0based, final_seq_end_0based_exclusive)
        one_hot_sequence = self.encoder.encode(dna_sequence)
        one_hot_sequence_tensor = torch.tensor(one_hot_sequence, dtype=torch.float32)

        # Target is the specific expression value for this gene-condition pair
        target_tensor = torch.tensor([expression_value], dtype=torch.float32)

        return one_hot_sequence_tensor, target_tensor
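
# Minimal usage sketch for TahoeDataset with a PyTorch DataLoader (paths are the
# module-level defaults above; the batch size is arbitrary):
#
#   from torch.utils.data import DataLoader
#   dataset = TahoeDataset(
#       tss_regions_csv_path=TSS_REGIONS_CSV_PATH,
#       genome_fasta_path=GENOME_FASTA_PATH,
#       pseudobulk_data_path=PSEUDOBULK_TARGET_DATA_PATH,
#   )
#   loader = DataLoader(dataset, batch_size=8, shuffle=True)
#   seqs, targets = next(iter(loader))
#   # seqs: (8, ENFORMER_INPUT_SEQ_LENGTH, 4); targets: (8, 1)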


# --- Extended Dataset for SMILES ---
class TahoeSMILESDataset(Dataset):
    """
    Extends TahoeDataset to also return:
    - Morgan fingerprints for the drug
    - drug dose
    - target expression
    """
    def __init__(self,
                 regions_csv_path: str,      # Renamed from tss_regions_csv_path for clarity with config
                 pbulk_parquet_path: str,    # Renamed from pseudobulk_data_path for clarity with config
                 drug_meta_csv_path: str,    # Renamed from drug_metadata_path for clarity with config
                 fasta_file_path: str,       # Renamed from genome_fasta_path for clarity with config
                 enformer_input_seq_length: int = ENFORMER_INPUT_SEQ_LENGTH,
                 # Morgan fingerprint parameters (from data_config)
                 morgan_fp_radius: int = 2,
                 morgan_fp_nbits: int = 2048,
                 # Column names from regions_csv (from data_config)
                 regions_gene_col: str = 'gene_name',
                 regions_chr_col: str = 'seqnames',
                 regions_start_col: str = 'starts',
                 regions_end_col: str = 'ends',
                 # Column names from pbulk_parquet (from data_config)
                 pbulk_gene_col: str = 'gene_id',
                 pbulk_drug_col: str = 'drug_id',
                 pbulk_dose_col: str = 'drug_dose',
                 pbulk_expr_col: str = 'expression',
                 pbulk_cell_line_col: str = 'cell_line',
                 # Column names from drug_meta_csv (from data_config)
                 drug_meta_id_col: str = 'drug',
                 drug_meta_smiles_col: str = 'canonical_smiles',
                 filter_drugs_by_ids: list = None,   # Added from dataset_args
                 regions_strand_col: str = None,     # Added from dataset_args, though not used in the current __getitem__
                 regions_set_col: str = 'set',       # Name of the column in regions_csv used for data splitting
                 target_set: str = None):            # Specific set to load (e.g., "train", "valid", "test")
        super().__init__()

        # Store config
        self.seq_len = enformer_input_seq_length
        self.morgan_fp_radius = morgan_fp_radius
        self.morgan_fp_nbits = morgan_fp_nbits

        self.regions_gene_col = regions_gene_col
        self.regions_chr_col = regions_chr_col
        self.regions_start_col = regions_start_col
        self.regions_end_col = regions_end_col
        self.regions_set_col = regions_set_col  # Name of the set column

        self.pbulk_gene_col = pbulk_gene_col
        self.pbulk_drug_col = pbulk_drug_col
        self.pbulk_dose_col = pbulk_dose_col
        self.pbulk_expr_col = pbulk_expr_col
        self.pbulk_cell_line_col = pbulk_cell_line_col

        self.drug_meta_id_col = drug_meta_id_col
        self.drug_meta_smiles_col = drug_meta_smiles_col

        self.target_set = target_set  # Specific set value for this instance

        # --- Morgan fingerprint generator ---
        self._morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
            radius=self.morgan_fp_radius,
            fpSize=self.morgan_fp_nbits
        )

        # Load & merge regions + pseudobulk
        print(f"  Loading TSS regions from: {regions_csv_path}")
        try:
            regs = pd.read_csv(regions_csv_path)
            print(f"  Successfully loaded regions CSV with {len(regs)} gene region entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Regions CSV file not found at {regions_csv_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading regions CSV: {e}")
            raise

        print(f"  Loading pseudobulk targets from: {pbulk_parquet_path} (expected Parquet format)")
        try:
            pb = pd.read_parquet(pbulk_parquet_path)
            print(f"  Successfully loaded pseudobulk Parquet file with {len(pb)} entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Pseudobulk Parquet file not found at {pbulk_parquet_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading or parsing pseudobulk Parquet file: {e}")
            print("  Ensure the file is a valid Parquet file and a Parquet engine like 'pyarrow' or 'fastparquet' is installed.")
            raise

        # Ensure gene ID columns are strings for merging
        regs[self.regions_gene_col] = regs[self.regions_gene_col].astype(str)
        pb[self.pbulk_gene_col] = pb[self.pbulk_gene_col].astype(str)

        print("  Merging genomic regions with pseudobulk target data...")
        print(f"  Regions gene column: '{self.regions_gene_col}', Pseudobulk gene column: '{self.pbulk_gene_col}'")
        self.samples_df = regs.merge(
            pb,
            left_on=self.regions_gene_col,
            right_on=self.pbulk_gene_col,
            how='inner'
        )

        if filter_drugs_by_ids and self.pbulk_drug_col in self.samples_df.columns:
            print(f"  Filtering samples to include only drugs: {filter_drugs_by_ids}")
            initial_count = len(self.samples_df)
            self.samples_df = self.samples_df[self.samples_df[self.pbulk_drug_col].isin(filter_drugs_by_ids)]
            print(f"  Retained {len(self.samples_df)} samples after drug filtering (from {initial_count}).")
            if len(self.samples_df) == 0:
                print("WARNING: No samples remaining after filtering by drug IDs. Check your filter_drugs_by_ids list and the drug IDs in the pbulk data.")

        # Filter by target_set if specified
        if self.target_set:
            if self.regions_set_col in self.samples_df.columns:
                print(f"  Filtering samples for set: '{self.target_set}' using column '{self.regions_set_col}'.")
                initial_count_set_filter = len(self.samples_df)
                self.samples_df = self.samples_df[self.samples_df[self.regions_set_col] == self.target_set].copy()
                print(f"  Retained {len(self.samples_df)} samples after filtering for set '{self.target_set}' (from {initial_count_set_filter}).")
                if len(self.samples_df) == 0:
                    print(f"WARNING: No samples remaining for this dataset instance (target_set='{self.target_set}') after filtering. Check the '{self.regions_set_col}' column in '{regions_csv_path}' for entries matching '{self.target_set}' and their overlap with the pseudobulk data.")
            else:
                print(f"WARNING: target_set '{self.target_set}' was specified, but the column '{self.regions_set_col}' was not found in the merged DataFrame. No set-specific filtering was applied; this instance will contain all data that matched the other criteria.")

        # Load drug metadata
        print(f"  Loading drug metadata from: {drug_meta_csv_path}")
        try:
            dm = pd.read_csv(drug_meta_csv_path)
            print(f"  Successfully loaded drug metadata with {len(dm)} entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Drug metadata CSV not found at {drug_meta_csv_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading drug metadata CSV: {e}")
            raise

        # Ensure the SMILES and ID columns are present, and fill NA SMILES with the empty string
        if self.drug_meta_smiles_col not in dm.columns:
            raise ValueError(f"SMILES column '{self.drug_meta_smiles_col}' not found in drug metadata.")
        if self.drug_meta_id_col not in dm.columns:
            raise ValueError(f"Drug ID column '{self.drug_meta_id_col}' not found in drug metadata.")
        dm[self.drug_meta_smiles_col] = dm[self.drug_meta_smiles_col].fillna('').astype(str)
        self.drug_meta = dm.set_index(self.drug_meta_id_col)

        # FASTA reader & one-hot encoder
        self.fasta_reader = FastaReader(fasta_file_path)
        self.encoder = GenomeOneHotEncoder(sequence_length=self.seq_len)
        print("TahoeSMILESDataset initialized.")

    def _generate_morgan_fingerprint(self, smiles_string: str) -> np.ndarray:
        """Generates a Morgan fingerprint from a SMILES string using RDKit's fingerprint generator API."""
        if not smiles_string:
            return np.zeros(self.morgan_fp_nbits, dtype=np.float32)
        try:
            mol = Chem.MolFromSmiles(smiles_string)
            if mol:
                # Use the generator's NumPy helper:
                fp_array = self._morgan_gen.GetFingerprintAsNumPy(mol)
                return fp_array.astype(np.float32)
            else:
                return np.zeros(self.morgan_fp_nbits, dtype=np.float32)
        except Exception:
            # Fall back to a zero vector if RDKit fails on this SMILES
            return np.zeros(self.morgan_fp_nbits, dtype=np.float32)
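
    # Illustrative sketch of the fingerprint helper (aspirin SMILES used as an
    # arbitrary example): a parseable SMILES yields a 0/1 vector of length
    # morgan_fp_nbits; an empty or unparsable SMILES yields all zeros.
    #
    #   fp = dataset._generate_morgan_fingerprint("CC(=O)OC1=CC=CC=C1C(=O)O")
    #   # fp.shape == (2048,) with the default nbits; fp.dtype == np.float32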

    def __len__(self):
        return len(self.samples_df)

    def __getitem__(self, idx):
        row = self.samples_df.iloc[idx]

        # --- DNA sequence ---
        chrom = str(row[self.regions_chr_col])
        start = int(row[self.regions_start_col])
        end = int(row[self.regions_end_col])
        orig = end - start
        if self.seq_len != orig:
            # Center the target window inside the original region; anchoring the
            # end at start + seq_len keeps the length exact even for odd seq_len.
            center = end - orig // 2
            half = self.seq_len // 2
            start = center - half
            end = start + self.seq_len

        seq = self.fasta_reader.get_sequence(chrom, start, end)
        oh = self.encoder.encode(seq)
        seq_tensor = torch.tensor(oh, dtype=torch.float32)

        # --- Morgan fingerprint ---
        drug_id_for_fp = row[self.pbulk_drug_col]
        smiles_string = ''
        if drug_id_for_fp in self.drug_meta.index:
            smiles_string = self.drug_meta.loc[drug_id_for_fp, self.drug_meta_smiles_col]
            # If there are multiple entries for a drug_id, .loc returns a Series; take the first one.
            if isinstance(smiles_string, pd.Series):
                smiles_string = smiles_string.iloc[0]
        else:
            # print(f"Warning: Drug ID {drug_id_for_fp} not found in drug_meta. Using empty SMILES for fingerprint.")
            pass  # The SMILES string remains empty, which yields a zero vector

        morgan_fp = self._generate_morgan_fingerprint(str(smiles_string))  # Ensure it's a string
        morgan_fp_tensor = torch.tensor(morgan_fp, dtype=torch.float32)

        # --- Dose & target ---
        dose_val = float(row[self.pbulk_dose_col])
        expression_val = float(row[self.pbulk_expr_col])

        dose_tensor = torch.tensor([dose_val], dtype=torch.float32)
        tgt_tensor = torch.tensor([expression_val], dtype=torch.float32)

        # --- Metadata for logging ---
        gene_id_meta = str(row[self.pbulk_gene_col])
        drug_id_meta = str(row[self.pbulk_drug_col])
        cell_line_meta = str(row[self.pbulk_cell_line_col])

        return seq_tensor, morgan_fp_tensor, dose_tensor, tgt_tensor, gene_id_meta, drug_id_meta, cell_line_meta, chrom, start, end
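
# Minimal usage sketch for TahoeSMILESDataset (the Parquet and drug-metadata
# paths below are placeholders, not files shipped with this script):
#
#   train_ds = TahoeSMILESDataset(
#       regions_csv_path=TSS_REGIONS_CSV_PATH,
#       pbulk_parquet_path="data/pseudobulk.parquet",   # placeholder path
#       drug_meta_csv_path="data/drug_metadata.csv",    # placeholder path
#       fasta_file_path=GENOME_FASTA_PATH,
#       target_set="train",
#   )
#   (seq, fp, dose, tgt,
#    gene_id, drug_id, cell_line, chrom, start, end) = train_ds[0]
#   # seq: (seq_len, 4) one-hot DNA; fp: (2048,) Morgan bits; dose/tgt: (1,) floats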