torinriley committed
Commit 3c8aa4a · 1 Parent(s): 8468281
Files changed (3)
  1. data/streaming_dataset.py +63 -0
  2. model.py +22 -2
  3. train.py +43 -17
data/streaming_dataset.py ADDED
@@ -0,0 +1,63 @@
+ import os
+ import numpy as np
+ import tiktoken
+ from datasets import load_dataset, concatenate_datasets, interleave_datasets
+ from torch.utils.data import IterableDataset
+ import torch
+
+ class StreamingDataset(IterableDataset):
+     """Streaming dataset that loads and processes data on the fly"""
+
+     def __init__(self, dataset_configs, block_size=2048, batch_size=12):
+         self.dataset_configs = dataset_configs
+         self.block_size = block_size
+         self.batch_size = batch_size
+         self.enc = tiktoken.get_encoding("gpt2")
+
+     def load_and_process_chunk(self, dataset_name, split="train"):
+         # Load datasets with appropriate configs
+         if dataset_name == "openwebtext":
+             dataset = load_dataset(dataset_name, split=split, streaming=True, trust_remote_code=True)
+         elif dataset_name == "the_pile":
+             dataset = load_dataset("the_pile", split=split, streaming=True)
+         elif dataset_name == "red_pajama":
+             dataset = load_dataset("togethercomputer/RedPajama-Data-1T", split=split, streaming=True)
+
+         for example in dataset:
+             ids = self.enc.encode_ordinary(example['text'])
+             ids.append(self.enc.eot_token)
+             if len(ids) >= self.block_size:
+                 # Return chunks of block_size
+                 for i in range(0, len(ids) - self.block_size + 1, self.block_size):
+                     yield torch.tensor(ids[i:i + self.block_size])
+
+     def __iter__(self):
+         # Interleave datasets with specified weights
+         iterators = []
+         weights = []
+         for config in self.dataset_configs:
+             iterators.append(self.load_and_process_chunk(config['name']))
+             weights.append(config['weight'])
+
+         # Normalize weights
+         weights = np.array(weights) / sum(weights)
+
+         while True:
+             # Randomly select a dataset based on weights
+             dataset_idx = np.random.choice(len(iterators), p=weights)
+             try:
+                 batch = []
+                 for _ in range(self.batch_size):
+                     batch.append(next(iterators[dataset_idx]))
+                 yield torch.stack(batch)
+             except StopIteration:
+                 # Restart iterator if it's exhausted
+                 iterators[dataset_idx] = self.load_and_process_chunk(self.dataset_configs[dataset_idx]['name'])
+                 continue
+
+ # Example usage:
+ dataset_configs = [
+     {'name': 'openwebtext', 'weight': 0.4},
+     {'name': 'the_pile', 'weight': 0.3},
+     {'name': 'red_pajama', 'weight': 0.3}
+ ]
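
A quick, hypothetical smoke test for the new loader (not part of the commit; it assumes the Hugging Face datasets above are reachable for streaming, and uses small, arbitrary block_size/batch_size values to keep the check cheap):

    # hypothetical check, not in this commit
    from data.streaming_dataset import StreamingDataset

    configs = [{'name': 'openwebtext', 'weight': 1.0}]  # single source keeps the check fast
    ds = StreamingDataset(configs, block_size=128, batch_size=2)
    batch = next(iter(ds))                              # one interleaved batch of GPT-2 token ids
    print(batch.shape, batch.dtype)                     # expected: torch.Size([2, 128]) torch.int64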
model.py CHANGED
@@ -114,6 +114,7 @@ class GPTConfig:
    n_embd: int = 768
    dropout: float = 0.0
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
+     gradient_checkpointing: bool = False # Enable gradient checkpointing for memory efficiency

class GPT(nn.Module):

@@ -144,6 +145,9 @@ class GPT(nn.Module):
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

+         # Enable gradient checkpointing if configured
+         self.gradient_checkpointing = getattr(config, 'gradient_checkpointing', False)
+
        # report number of parameters
        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))

@@ -177,8 +181,24 @@ class GPT(nn.Module):
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
-         for block in self.transformer.h:
-             x = block(x)
+
+         if self.gradient_checkpointing and self.training:
+             # Use gradient checkpointing for the transformer layers: activations are
+             # recomputed during the backward pass instead of stored, trading compute for memory
+             x = torch.utils.checkpoint.checkpoint_sequential(
+                 self.transformer.h,      # the stack of transformer blocks
+                 len(self.transformer.h), # one checkpoint segment per block
+                 x                        # input activations
+             )
+         else:
+             for block in self.transformer.h:
+                 x = block(x)
+
        x = self.transformer.ln_f(x)

        if targets is not None:
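
Since the new flag lives on GPTConfig, enabling checkpointing is a one-line config change. A minimal, hypothetical sketch (the field values mirror the train.py settings below; checkpointing only applies while the model is in training mode):

    # hypothetical usage of the new flag, not in this commit
    from model import GPT, GPTConfig

    config = GPTConfig(n_layer=24, n_head=16, n_embd=1024, block_size=2048,
                       bias=False, gradient_checkpointing=True)
    model = GPT(config)
    model.train()  # forward() only checkpoints when self.training is True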
train.py CHANGED
@@ -47,13 +47,13 @@ wandb_run_name = 'gpt2' # 'run' + str(time.time())
dataset = 'openwebtext'
gradient_accumulation_steps = 5 * 8 # used to simulate larger batch sizes
batch_size = 12 # if gradient_accumulation_steps > 1, this is the micro-batch size
- block_size = 1024
- # model
- n_layer = 12
- n_head = 12
- n_embd = 768
- dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
- bias = False # do we use bias inside LayerNorm and Linear layers?
+ block_size = 2048 # increased context length
+ # model (scaled up; roughly 0.35B parameters at these settings)
+ n_layer = 24 # scaled up from 12
+ n_head = 16 # scaled up from 12
+ n_embd = 1024 # scaled up from 768
+ dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+
+ bias = False # do we use bias inside LayerNorm and Linear layers?
# adamw optimizer
learning_rate = 6e-4 # max learning rate
max_iters = 600000 # total number of training iterations
@@ -70,8 +70,11 @@ min_lr = 6e-5 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
backend = 'nccl' # 'nccl', 'gloo', etc.
# system
device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
- dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32', 'bfloat16', or 'float16', the latter will auto implement a GradScaler
+ dtype = 'float16' # use fp16 training with gradient scaling
compile = True # use PyTorch 2.0 to compile the model to be faster
+ # mixed precision and memory optimization
+ use_amp = True # use automatic mixed precision (fp16)
+ gradient_checkpointing = True # trade compute for memory
# -----------------------------------------------------------------------------
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open('configurator.py').read()) # overrides from command line or config file
@@ -111,20 +114,43 @@ device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype)

- # poor man's data loader
data_dir = os.path.join('data', dataset)
+ # streaming data loader for training; validation below still reads the memmap'd val.bin from data_dir
+ from data.streaming_dataset import StreamingDataset
+
+ dataset_configs = [
+     {'name': 'openwebtext', 'weight': 0.4},
+     {'name': 'the_pile', 'weight': 0.3},
+     {'name': 'red_pajama', 'weight': 0.3}
+ ]
+
+ train_dataset = StreamingDataset(dataset_configs, block_size=block_size, batch_size=batch_size)
+ train_loader = torch.utils.data.DataLoader(
+     train_dataset,
+     batch_size=None, # batch size is handled by the dataset
+     num_workers=4,
+     pin_memory=True
+ )
+ train_iter = iter(train_loader)
+
def get_batch(split):
-     # We recreate np.memmap every batch to avoid a memory leak, as per
-     # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
+     global train_iter # reassigned below when the streaming iterator is exhausted
    if split == 'train':
-         data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
+         try:
+             batch = next(train_iter)
+         except StopIteration:
+             # Reset iterator when exhausted
+             train_iter = iter(train_loader)
+             batch = next(train_iter)
+
+         x = batch[:, :-1] # all but last token
+         y = batch[:, 1:] # all but first token
    else:
+         # For validation, keep the original approach with memmap files
        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
-     ix = torch.randint(len(data) - block_size, (batch_size,))
-     x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
-     y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+         ix = torch.randint(len(data) - block_size, (batch_size,))
+         x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
+         y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
+
    if device_type == 'cuda':
-         # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
    else:
        x, y = x.to(device), y.to(device)
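
With dtype = 'float16', the fp16 path relies on loss scaling, as in upstream nanoGPT's training loop. For reference, a simplified sketch of a single fp16 step using the streaming get_batch (illustrative only, not part of the commit; the real loop also handles gradient accumulation, clipping, the LR schedule, and DDP):

    # simplified single-step sketch, not in this commit
    scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))

    X, Y = get_batch('train')          # streaming batch; shapes (batch_size, block_size - 1)
    with ctx:                          # the autocast context defined earlier in train.py
        logits, loss = model(X, Y)
    scaler.scale(loss).backward()      # scale the loss so fp16 gradients don't underflow
    scaler.step(optimizer)             # unscales gradients, then applies the update
    scaler.update()                    # adjusts the loss scale for the next iteration
    optimizer.zero_grad(set_to_none=True)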