nvedant07 committed (verified)
Commit 0ba5e52 · 1 Parent(s): 857cd41

minor bugfix

Files changed (1):
  1. model.py +207 -51
model.py CHANGED
@@ -37,9 +37,10 @@ def sample_argmax(logits: torch.Tensor) -> torch.Tensor:
     return torch.argmax(logits, dim=-1)[:, -1]
 
 
-LLAMA_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
-You are a helpful assistant. You give engaging, well-structured answers to user inquiries.<|eot_id|><|start_header_id|>user<|end_header_id|>
-{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
+LLAMA_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n
+You are a helpful assistant. You give engaging, well-structured answers to user inquiries.<|eot_id|><|start_header_id|>user<|end_header_id|>\n
+{input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""
+
 
 
 class HATCache(Cache):
@@ -488,6 +489,7 @@ class HATEncoderConnector(nn.Module):
             device=self.latent_query.device,
             dtype=torch.int32,
         )
+
         word_embeddings = self.cross_attention_encoder_connector.forward(
             q_activations=latent_query_repeated,
             kv_activations=hidden_states,
@@ -607,7 +609,7 @@ class HATForCausalLM(PreTrainedModel):
         backbone_past_key_values = past_key_values.get_backbone_cache() if past_key_values is not None else None
         decoder_past_key_values = past_key_values.get_decoder_cache() if past_key_values is not None else None
 
-        encoder_output: BaseModelOutputWithPast = self.encoder(
+        encoder_output: BaseModelOutputWithPast = self.encoder.forward(
             input_ids=input_ids,
             cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
             byte_position_ids=byte_position_ids,
@@ -617,13 +619,13 @@ class HATForCausalLM(PreTrainedModel):
         )
         byte_level_activations = encoder_output.hidden_states
 
-        encoder_connector_output = self.encoder_connector(
+        encoder_connector_output = self.encoder_connector.forward(
             byte_level_activations,
             cumulative_seq_lengths_per_word,
             word_position_ids,
             byte_position_ids,
         )
-        backbone_output: CausalLMOutputWithPast = self.backbone(
+        backbone_output: CausalLMOutputWithPast = self.backbone.forward(
             hidden_states=encoder_connector_output,
             position_ids=word_position_ids,
             past_key_values=backbone_past_key_values,
@@ -658,7 +660,7 @@ class HATForCausalLM(PreTrainedModel):
     def _append_byte(self, words: list[list[int]], token: int) -> list[list[int]]:
         extended_last_word = words.pop() + [token]
         try:
-            text = self.splitter.decode(extended_last_word, errors='strict', skip_special_tokens=False)
+            text = self.splitter.decode(extended_last_word, errors="strict", skip_special_tokens=False)
             list_of_bytes = self.splitter.encode(text)
             words.extend([list(word_in_bytes) for word_in_bytes in list_of_bytes])
         except UnicodeDecodeError:
@@ -667,20 +669,70 @@ class HATForCausalLM(PreTrainedModel):
             words.append(extended_last_word)
         return words
 
+    def _split_encoder_activations(
+        self,
+        byte_encoder_activations: torch.Tensor,
+        words: list[list[int]],
+        previous_encoder_activations: torch.Tensor | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        """Split encoder activations between the first word and the next word.
+
+        Args:
+            byte_encoder_activations: Tensor of shape [batch_size, seq_len, hidden_size] with all encoder activations computed in the current iteration.
+            words: List of word byte sequences completed in the previous and current iterations.
+            previous_encoder_activations: Optional tensor of shape [batch_size, prev_seq_len, hidden_size] with precomputed activations from the previous iteration.
+
+        Returns:
+            tuple containing:
+                - first_word_encoder_activations: Tensor of shape [batch_size, first_word_len, hidden_size]
+                - next_word_encoder_activations: Tensor of shape [batch_size, remaining_len, hidden_size], or None
+        """
+
+        assert sum(len(word) for word in words) - 1 == byte_encoder_activations.shape[1] + (previous_encoder_activations.shape[1] if previous_encoder_activations is not None else 0), "Total number of bytes in words minus 1 must match the combined length of byte_encoder_activations and previous_encoder_activations"
+
+        next_word_encoder_activations = None
+        if previous_encoder_activations is not None:
+            # The first word's encoder activations were already partially precomputed in the previous iteration
+            new_bytes_of_first_word = len(words[0]) - previous_encoder_activations.shape[1]
+            # Concatenate the precomputed activations with the new activations that still belong to the first word
+            first_word_encoder_activations = torch.cat([previous_encoder_activations, byte_encoder_activations[:, :new_bytes_of_first_word]], dim=1)
+            if len(words[1]) > 1:
+                # The remaining activations belong to the next word
+                next_word_encoder_activations = byte_encoder_activations[:, new_bytes_of_first_word:]
+            else:
+                next_word_encoder_activations = None
+        else:
+            # No activations for the first word were precomputed previously
+            first_word_encoder_activations = byte_encoder_activations[:, : len(words[0])]
+
+            if len(words[1]) > 1:
+                next_word_encoder_activations = byte_encoder_activations[:, len(words[0]) :]
+            else:
+                next_word_encoder_activations = None
+
+        return first_word_encoder_activations, next_word_encoder_activations
+
     def _complete_word(
         self,
         input_ids: torch.Tensor,
         byte_position_ids: torch.Tensor,
-        backbone_word_prediction: torch.Tensor,
+        predictive_word_embeddings: torch.Tensor,
         word_position_id: torch.Tensor,
         encoder_cache: DynamicCache,
         decoder_cache: DynamicCache,
         sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
+        previous_encoder_activations: torch.Tensor | None = None,
     ):
         """Generate byte tokens until we hit the first byte of a new word."""
-        words = [input_ids.squeeze(0).tolist()]
-        byte_encoder_activations = []
-        completion_logits = []
+        words: list[list[int]] = [input_ids.squeeze(0).tolist()]
+        byte_encoder_activations: list[torch.Tensor] = []
+        completion_logits: list[torch.Tensor] = []
+
+        if previous_encoder_activations is not None:
+            # We need to pass all inputs to get the correct encoding/decoding from the splitter,
+            # but only the last byte is used for generation,
+            # since the cache is already populated with the first word's activations.
+            input_ids = input_ids[:, -1:]
 
         while True:
             encoder_output = self.encoder.forward(
@@ -692,7 +744,7 @@ class HATForCausalLM(PreTrainedModel):
             )
             byte_encoder_activations.append(encoder_output.hidden_states)
             decoder_output = self.decoder.forward(
-                backbone_word_prediction,
+                predictive_word_embeddings,
                 encoder_output.hidden_states,
                 byte_position_ids=None,
                 word_position_ids=word_position_id,
@@ -705,22 +757,112 @@ class HATForCausalLM(PreTrainedModel):
             next_byte = int(sample_fn(logits).item())
             words = self._append_byte(words, next_byte)
             if len(words) > 1 or next_byte == self.eos_token_id:
+                byte_encoder_activations = torch.cat(byte_encoder_activations, dim=1)
+                first_word_encoder_activations, next_word_encoder_activations = self._split_encoder_activations(
+                    byte_encoder_activations,
+                    words,
+                    previous_encoder_activations,
+                )
                 break
             input_ids = torch.tensor([[next_byte]], dtype=input_ids.dtype, device=input_ids.device)
 
-        byte_encoder_activations = torch.cat(byte_encoder_activations, dim=1)
         num_kv = encoder_cache.get_seq_length()
-        byte_position_ids = torch.arange(num_kv + 1 - byte_encoder_activations.shape[1], num_kv + 1, device=input_ids.device, dtype=torch.long).unsqueeze(0)
+
+        completion = sum(words, [])[-len(completion_logits) :]
+        if next_word_encoder_activations is not None:
+            start_idx = num_kv - first_word_encoder_activations.shape[1] - next_word_encoder_activations.shape[1]
+            end_idx = num_kv - next_word_encoder_activations.shape[1]
+            # We do not want to return the logits for the second word when we hit the multi-byte starting-character case.
+            # When that happens we remove the logits, fix the decoder cache post hoc, and compute new logits.
+            # This breaks causality, but we want to imitate uncached generation/training behavior.
+            completion_logits = completion_logits[:-next_word_encoder_activations.shape[1]]
+        else:
+            start_idx = num_kv - first_word_encoder_activations.shape[1]
+            end_idx = num_kv
+
+        byte_position_ids = torch.arange(start_idx, end_idx, device=input_ids.device, dtype=torch.long).unsqueeze(0)
         completed_word_embedding = self.encoder_connector.forward(
-            byte_encoder_activations,
-            cumulative_seq_lengths_per_word=torch.tensor([0, byte_encoder_activations.size(1)], dtype=torch.int32, device=input_ids.device),
+            first_word_encoder_activations,
+            cumulative_seq_lengths_per_word=torch.tensor([0, first_word_encoder_activations.size(1)], dtype=torch.int32, device=input_ids.device),
             word_position_ids=word_position_id,
             byte_position_ids=byte_position_ids,
         )
 
-        completion = sum(words, [])[-len(completion_logits) :]
-        first_byte_of_next_word = words[1]
-        return completion, completed_word_embedding, first_byte_of_next_word, byte_position_ids[:, -1].item() + 1, completion_logits
+        bytes_of_next_word = words[1]
+
+        return (
+            completion,
+            completed_word_embedding,
+            bytes_of_next_word,
+            byte_position_ids[:, -1].item() + 1,
+            completion_logits,
+            next_word_encoder_activations,
+        )
+
+    def _populate_cache(
+        self,
+        input_ids: torch.Tensor,
+        cumulative_seq_lengths_per_word: torch.Tensor,
+        byte_position_ids: torch.Tensor,
+        word_position_ids: torch.Tensor,
+    ):
+        last_word_start = cumulative_seq_lengths_per_word[-2]
+        last_word_end = cumulative_seq_lengths_per_word[-1]
+
+        # Populate the cache with everything except the last word
+        initial_forward_output = self.forward(
+            input_ids=input_ids[:, :last_word_start],
+            cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word[:-1],
+            byte_position_ids=byte_position_ids[:, :last_word_start],
+            word_position_ids=word_position_ids[:, :-1],
+            past_key_values=None,
+            use_cache=True,
+        )
+        return initial_forward_output, last_word_start, last_word_end
+
+    def _initialize_generation_state(
+        self,
+        input_ids: torch.Tensor,
+        max_new_tokens: int,
+        cumulative_seq_lengths_per_word: torch.Tensor,
+        byte_position_ids: torch.Tensor | None = None,
+        word_position_ids: torch.Tensor | None = None,
+    ):
+        max_total_bytes = max_new_tokens + input_ids.shape[1]
+        if byte_position_ids is None:
+            byte_position_ids = torch.arange(0, cumulative_seq_lengths_per_word[-1].item(), device=input_ids.device, dtype=torch.int32).unsqueeze(0)
+
+        if word_position_ids is None:
+            word_position_ids = torch.arange(0, cumulative_seq_lengths_per_word.shape[0] - 1, device=input_ids.device, dtype=torch.int32).unsqueeze(0)
+
+        initial_forward_output, last_word_start, last_word_end = self._populate_cache(
+            input_ids=input_ids,
+            cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
+            byte_position_ids=byte_position_ids,
+            word_position_ids=word_position_ids,
+        )
+
+        completion_bytes: list[int] = []
+        completion_logits: list[torch.Tensor] = []
+        # Slice input_ids and byte_position_ids so they only contain the last word for the generation loop
+        current_input_ids = input_ids[:, last_word_start:last_word_end]
+        next_byte_id = last_word_end.item()  # Ensure this is an int
+        current_byte_position_ids = byte_position_ids[:, last_word_start:last_word_end]
+        current_word_position_id = word_position_ids[:, -1].unsqueeze(-1)
+        backbone_last_hidden_state = initial_forward_output.hidden_states[:, -1:, :]
+        next_word_encoder_activations = None
+        return (
+            initial_forward_output,
+            completion_bytes,
+            completion_logits,
+            current_input_ids,
+            next_byte_id,
+            current_byte_position_ids,
+            current_word_position_id,
+            backbone_last_hidden_state,
+            next_word_encoder_activations,
+            max_total_bytes,
+        )
 
     def generate(
         self,
@@ -756,6 +898,20 @@ class HATForCausalLM(PreTrainedModel):
             completion_logits=completion_logits,
         )
 
+    def _fix_decoder_cache(self, predictive_word_embeddings: torch.Tensor, encoder_activations: torch.Tensor, decoder_cache: DynamicCache, word_position_id: torch.Tensor):
+        decoder_cache.crop(decoder_cache.get_seq_length() - encoder_activations.shape[1])
+        real_decoder_logits = self.decoder.forward(
+            predictive_word_embeddings,
+            encoder_activations,
+            byte_position_ids=None,
+            word_position_ids=word_position_id,
+            past_key_values=decoder_cache,
+        ).last_hidden_state
+
+        decoder_output = self.layer_norm(real_decoder_logits)
+        logits = self.lm_head(decoder_output)
+        return logits
+
     @torch.no_grad()
     def _generate_cached(
         self,
@@ -767,43 +923,35 @@ class HATForCausalLM(PreTrainedModel):
         sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
         stop_sequences: Sequence[str] | None = None,
     ):
-        max_total_bytes = max_new_tokens + input_ids.shape[1]
-        if byte_position_ids is None:
-            byte_position_ids = torch.arange(0, cumulative_seq_lengths_per_word[-1].item(), device=input_ids.device, dtype=torch.int32).unsqueeze(0)
-
-        if word_position_ids is None:
-            word_position_ids = torch.arange(0, cumulative_seq_lengths_per_word.shape[0] - 1, device=input_ids.device, dtype=torch.int32).unsqueeze(0)
-
-        last_word_start, last_word_end = (
-            cumulative_seq_lengths_per_word[-2],
-            cumulative_seq_lengths_per_word[-1],
-        )
-        # Populate cache with everything except last word
-        initial_forward_output = self.forward(
-            input_ids=input_ids[:, :last_word_start],
-            cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word[:-1],
-            byte_position_ids=byte_position_ids[:, :last_word_start],
-            word_position_ids=word_position_ids[:, :-1],
-            past_key_values=None,
-            use_cache=True,
+        (
+            initial_forward_output,
+            completion_bytes,  # empty list
+            completion_logits,  # empty list
+            input_ids,  # now the sliced input_ids for the last word
+            next_byte_id,
+            byte_position_ids,  # now the sliced byte_position_ids for the last word
+            word_position_id,
+            backbone_last_hidden_state,
+            next_word_encoder_activations,  # None for the first iteration
+            max_total_bytes,
+        ) = self._initialize_generation_state(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            cumulative_seq_lengths_per_word=cumulative_seq_lengths_per_word,
+            byte_position_ids=byte_position_ids,
+            word_position_ids=word_position_ids,
         )
 
-        completion_bytes = []
-        completion_logits = []
-        input_ids = input_ids[:, last_word_start:last_word_end]
-        next_byte_id = last_word_end
-        byte_position_ids = byte_position_ids[:, last_word_start:last_word_end]
-        word_position_id = word_position_ids[:, -1].unsqueeze(-1)
-        backbone_last_hidden_state = initial_forward_output.hidden_states[:, -1:, :]
         while next_byte_id < max_total_bytes:
-            completion, completed_word_embedding, first_byte_of_next_word, next_byte_id, next_completion_logits = self._complete_word(
+            completion, completed_word_embedding, bytes_of_next_word, next_byte_id, next_completion_logits, next_word_encoder_activations = self._complete_word(
                 input_ids=input_ids,
                 byte_position_ids=byte_position_ids,
-                backbone_word_prediction=backbone_last_hidden_state,
+                predictive_word_embeddings=backbone_last_hidden_state,
                 word_position_id=word_position_id,
                 encoder_cache=initial_forward_output.past_key_values.get_encoder_cache(),
                 decoder_cache=initial_forward_output.past_key_values.get_decoder_cache(),
                 sample_fn=sample_fn,
+                previous_encoder_activations=next_word_encoder_activations,
            )
             completion_logits.extend(next_completion_logits)
             completion_bytes.extend(completion)
@@ -828,11 +976,19 @@ class HATForCausalLM(PreTrainedModel):
             )
             backbone_last_hidden_state = backbone_output.hidden_states[:, -1, :].unsqueeze(1)
 
-            input_ids = torch.tensor([first_byte_of_next_word], dtype=input_ids.dtype, device=input_ids.device)
-            byte_position_ids = torch.tensor([[next_byte_id]], dtype=input_ids.dtype, device=input_ids.device)
             word_position_id = word_position_id + 1
+            if len(bytes_of_next_word) > 1:
+                real_decoder_logits = self._fix_decoder_cache(
+                    predictive_word_embeddings=backbone_last_hidden_state,
+                    encoder_activations=next_word_encoder_activations,
+                    decoder_cache=initial_forward_output.past_key_values.get_decoder_cache(),
+                    word_position_id=word_position_id,
+                )
+                completion_logits.extend(real_decoder_logits)
+
+            input_ids = torch.tensor([bytes_of_next_word], dtype=input_ids.dtype, device=input_ids.device)
+            byte_position_ids = torch.tensor([[next_byte_id]], dtype=input_ids.dtype, device=input_ids.device)
 
-        completion_bytes.extend(first_byte_of_next_word)
         completion_bytes = completion_bytes[:max_new_tokens]
         completion_logits = torch.cat(completion_logits[:max_new_tokens], dim=0)
         completion_text = self.splitter.decode(completion_bytes)
@@ -847,7 +1003,7 @@ class HATForCausalLM(PreTrainedModel):
         cumulative_seq_lengths_per_word: torch.Tensor,
         byte_position_ids: torch.Tensor | None = None,
         word_position_ids: torch.Tensor | None = None,
-        sample_fn=sample_argmax,
+        sample_fn: Callable[[torch.Tensor], torch.Tensor] = sample_argmax,
         stop_sequences: Sequence[str] | None = None,
     ):
         if byte_position_ids is None:
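Note on the case this bugfix targets: the first byte of a multi-byte UTF-8 character cannot be decoded on its own, which is why _append_byte catches UnicodeDecodeError and why the new _split_encoder_activations / _fix_decoder_cache path carries the spill-over bytes into the next word. A minimal standalone sketch of that boundary condition (not part of model.py; the example word and variable names are illustrative only):

    # Illustration: a word ending in a 2-byte UTF-8 character (0xC3 0xA0)
    word_bytes = list("voilà".encode("utf-8"))
    partial = word_bytes[:-1]  # generation has emitted only the first byte of the final character

    try:
        bytes(partial).decode("utf-8", errors="strict")
    except UnicodeDecodeError:
        # Mirrors _append_byte: the bytes do not decode yet, so the splitter cannot
        # re-segment them into words. Generation keeps sampling bytes, and any bytes
        # that turn out to belong to the *next* word are handed back via
        # next_word_encoder_activations and reconciled by _fix_decoder_cache.
        print("incomplete multi-byte character - keep sampling")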