import torch
from torch.nn.functional import cross_entropy, softmax

from .configuration_bacformer import SPECIAL_TOKENS_DICT


def compute_contrastive_loss(
    protein_embeddings: torch.Tensor,
    last_hidden_state: torch.Tensor,
    special_tokens_mask: torch.Tensor,
) -> torch.Tensor:
    """Compute contrastive loss between input protein embeddings and the model outputs at protein and masked positions."""
    # Currently only supports a batch size of 1.
    assert protein_embeddings.shape[0] == last_hidden_state.shape[0] == 1

    # Keep only the positions holding a protein embedding or a masked protein.
    special_tokens_mask = special_tokens_mask.squeeze(0)
    mask = (special_tokens_mask == SPECIAL_TOKENS_DICT["PROT_EMB"]) | (
        special_tokens_mask == SPECIAL_TOKENS_DICT["MASK"]
    )
    protein_embeddings = protein_embeddings.squeeze(0)[mask]
    last_hidden_state = last_hidden_state.squeeze(0)[mask]

    # L2-normalise both sets of embeddings so the dot products below are cosine similarities.
    last_hidden_state = last_hidden_state / last_hidden_state.norm(dim=1, keepdim=True)
    protein_embeddings = protein_embeddings / protein_embeddings.norm(dim=1, keepdim=True)

    # Pairwise similarity matrix; matching pairs lie on the diagonal.
    similarity_matrix = torch.matmul(last_hidden_state, protein_embeddings.T)

    # Each output representation should be most similar to its own input protein embedding.
    n_prots = protein_embeddings.shape[0]
    labels = torch.arange(n_prots).to(protein_embeddings.device)

    loss = cross_entropy(similarity_matrix, labels)
    return loss


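# Hypothetical usage sketch: builds a toy single-genome batch and computes the contrastive loss.
# The tensor shapes are illustrative, and it assumes the SPECIAL_TOKENS_DICT values are integer
# token-type ids (as used by the equality checks above).
def _example_compute_contrastive_loss() -> torch.Tensor:
    hidden_dim = 8
    # One sequence of 5 positions: 4 protein embeddings followed by 1 masked protein.
    special_tokens_mask = torch.tensor(
        [[SPECIAL_TOKENS_DICT["PROT_EMB"]] * 4 + [SPECIAL_TOKENS_DICT["MASK"]]]
    )
    protein_embeddings = torch.randn(1, 5, hidden_dim)
    last_hidden_state = torch.randn(1, 5, hidden_dim)
    return compute_contrastive_loss(protein_embeddings, last_hidden_state, special_tokens_mask)

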
def top_k_filtering(logits: torch.Tensor, top_k: int = 50):
    """
    Keep only top_k logits and set the rest to -inf.

    Args:
        logits (torch.Tensor): Logits of shape (batch_size, vocab_size).
        top_k (int): The number of highest probability logits to keep.

    Returns
    -------
    torch.Tensor: Filtered logits where only the top k values remain, and all others are -inf.
    """
    if top_k <= 0:
        return logits

    top_k = min(top_k, logits.size(-1))
    vals, _ = torch.topk(logits, top_k, dim=-1)

    # Smallest logit that survives the filter, per batch element.
    min_vals = vals[:, -1].unsqueeze(-1)

    # Mask out (in place) everything strictly below the k-th largest logit.
    mask = logits < min_vals
    logits[mask] = float("-inf")
    return logits


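# Hypothetical usage sketch: keep the 2 largest logits of a toy row and renormalise with softmax.
# The logit values are arbitrary and chosen only to make the effect easy to see.
def _example_top_k_filtering() -> torch.Tensor:
    logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])
    filtered = top_k_filtering(logits.clone(), top_k=2)  # clone: filtering modifies logits in place
    return softmax(filtered, dim=-1)  # only the two kept tokens receive non-zero probability

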
def top_p_filtering(logits: torch.Tensor, top_p: float = 0.9):
    """
    Keep the smallest set of logits whose cumulative probability >= top_p.

    Args:
        logits (torch.Tensor): Logits of shape (batch_size, vocab_size).
        top_p (float): Cumulative probability threshold.

    Returns
    -------
    torch.Tensor: Filtered logits where only tokens within the top_p cumulative
        probability mass are kept; the rest are set to -inf.
    """
    if top_p >= 1.0:
        return logits

    # Sort logits so probability mass can be accumulated from most to least likely.
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
    cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)

    # Mark tokens whose cumulative probability exceeds the threshold.
    sorted_indices_to_remove = cumulative_probs > top_p

    # Shift the mask right by one so the first token crossing the threshold is kept,
    # and always keep the single most likely token.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False

    # Scatter the removal mask back to the original (unsorted) vocabulary order.
    for i in range(logits.size(0)):
        remove_indices = sorted_indices[i, sorted_indices_to_remove[i]]
        logits[i, remove_indices] = float("-inf")

    return logits


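# Hypothetical usage sketch: nucleus filtering followed by sampling, mirroring how
# top_p_filtering would typically be used inside a generation loop. Values are arbitrary.
def _example_top_p_filtering() -> torch.Tensor:
    logits = torch.tensor([[3.0, 2.0, 1.0, -2.0]])
    filtered = top_p_filtering(logits.clone(), top_p=0.9)  # clone: filtering modifies logits in place
    probs = softmax(filtered, dim=-1)
    return torch.multinomial(probs, num_samples=1)  # sample a token id from the kept probability mass

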
def create_4d_from_2d_attn_mask(attn_mask: torch.Tensor, num_attn_heads: int):
    """Helper function to reshape attn_mask to 4D from 2D."""
    assert (
        len(attn_mask.shape) == 2
    ), f"Please provide attn_mask of shape (batch_size, seq_len), current shape {attn_mask.shape}"

    # Broadcast the (batch_size, seq_len) mask across attention heads -> (batch_size, num_attn_heads, 1, seq_len).
    bs, seq_len = attn_mask.shape
    attn_mask = attn_mask.view(bs, 1, 1, seq_len)
    attn_mask = attn_mask.expand(-1, num_attn_heads, -1, -1)
    attn_mask = attn_mask.view(bs, num_attn_heads, -1, seq_len)
    return attn_mask


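# Hypothetical usage sketch: expand a padding mask for a model with 4 attention heads.
# The batch size and sequence length are arbitrary.
def _example_create_4d_attn_mask() -> torch.Tensor:
    attn_mask = torch.ones(2, 7, dtype=torch.bool)  # (batch_size, seq_len), all positions attended
    attn_mask[1, 5:] = False  # pad out the tail of the second sequence
    return create_4d_from_2d_attn_mask(attn_mask, num_attn_heads=4)  # -> shape (2, 4, 1, 7)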