create config file (#1)
Browse files- create config file (4f247a7e70cccf28d32bd861ae2956b3c6d87209)
Co-authored-by: Ryan Keivanfar <[email protected]>
- configs/config.yaml +57 -0
configs/config.yaml
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# ----------------------------------------------------
|
2 |
+
# Full Dataset Run Config
|
3 |
+
# ----------------------------------------------------
|
4 |
+
|
5 |
+
experiment_name: 'MorganFP_full_dataset_run'
|
6 |
+
|
7 |
+
data:
|
8 |
+
regions_csv_path: "data/Enformer_genomic_regions_TSSCenteredGenes_FixedOverlapRemoval_subset500_priorityCustom_parquetFiltered.csv" # Relative path; confirm the file exists before launching the run
|
9 |
+
regions_gene_col: 'gene_name'
|
10 |
+
regions_chr_col: 'seqnames'
|
11 |
+
regions_start_col: 'starts'
|
12 |
+
regions_end_col: 'ends'
|
13 |
+
|
14 |
+
pbulk_parquet_path: "/home/ubuntu/pseudoBulk_celllineXdrug_8Cellline_27Drugs_1Dosage_includeZero.parquet" # Absolute, machine-specific path; confirm the file exists on the host running the job
|
15 |
+
pbulk_gene_col: 'gene_id'
|
16 |
+
pbulk_drug_col: 'drug_id'
|
17 |
+
pbulk_dose_col: 'drug_dose'
|
18 |
+
pbulk_expr_col: 'expression'
|
19 |
+
pbulk_cell_line_col: 'cell_line'
|
20 |
+
drug_meta_csv_path: "data/drug_metadata.csv"
|
21 |
+
drug_meta_id_col: 'drug'
|
22 |
+
fasta_file_path: "/home/ubuntu/data/hg38.fa" # Absolute, machine-specific path; confirm the file exists on the host used for the full run
|
23 |
+
enformer_input_seq_length: 49152
|
24 |
+
morgan_fp_radius: 2
|
25 |
+
morgan_fp_nbits: 2048
|
26 |
+
|
27 |
+
model:
|
28 |
+
enformer_model_name: 'EleutherAI/enformer-official-rough'
|
29 |
+
morgan_fingerprint_dim: 2048
|
30 |
+
learning_rate: 5.0e-6
|
31 |
+
weight_decay: 0.01
|
32 |
+
|
33 |
+
training:
|
34 |
+
batch_size: 16 # Increased for full run
|
35 |
+
max_epochs: 100 # Increased for full run
|
36 |
+
precision: 'bf16-mixed'
|
37 |
+
deterministic: True
|
38 |
+
seed: 42
|
39 |
+
gradient_clip_val: 0.05
|
40 |
+
accumulate_grad_batches: 1
|
41 |
+
gpus: -1 # Use all available GPUs
|
42 |
+
strategy: 'ddp_find_unused_parameters_true' # Suitable for multi-GPU
|
43 |
+
|
44 |
+
logging:
|
45 |
+
wandb_project: 'tahoeformer' # Specific project for these tests
|
46 |
+
# wandb_entity: 'your_wandb_username_or_team'
|
47 |
+
save_dir: 'outputs/morgan_full_dataset_runs' # Updated save directory
|
48 |
+
# checkpoint_monitor_metric: 'validation_pearson_epoch' # default
|
49 |
+
# checkpoint_monitor_mode: 'max' # default
|
50 |
+
# early_stopping_metric: 'validation_pearson_epoch' # default
|
51 |
+
# early_stopping_mode: 'max' # default
|
52 |
+
early_stopping_patience: 30 # Increased patience
|
53 |
+
|
54 |
+
use_wandb: True
|
55 |
+
validate_before_train: True
|
56 |
+
delete_checkpoint_after_run: False
|
57 |
+
# check_val_every_n_epoch: 1 # default
|