""" |
|
We just merge all the required modules and functions into one python file. |
|
It is for easily use the pre-trained model to extract features. |
|
""" |
|
import math |
|
import numpy as np |
|
import logging |
|
import torch |
|
import torch.nn as nn |
|
import torch.nn.functional as F |
|
from torch.nn import Parameter |
|
from torch import Tensor |
|
from typing import Any, Dict, List, Tuple, Callable, Optional |
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
def module_name_fordropout(module_name: str) -> str: |
|
if module_name == "TransformerEncoderBase": |
|
return "TransformerEncoder" |
|
else: |
|
return module_name |
|
|
|
def utils_make_positions(tensor, padding_idx: int, onnx_trace: bool = False): |
|
"""Replace non-padding symbols with their position numbers. |
|
|
|
Position numbers begin at padding_idx+1. Padding symbols are ignored. |
|
""" |
|

    # Non-padding positions are numbered cumulatively starting at padding_idx + 1,
    # while padding positions all map back to padding_idx.
|
mask = tensor.ne(padding_idx).int() |
|
return (torch.cumsum(mask, dim=1).type_as(mask) * mask).long() + padding_idx |
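
# Worked example (illustrative, not from the original code): with padding_idx = 1 and
#   tensor = torch.tensor([[5, 6, 1], [7, 1, 1]])
# the non-padding mask is [[1, 1, 0], [1, 0, 0]], so the returned positions are
#   [[2, 3, 1], [2, 1, 1]]
# i.e. real tokens are numbered from padding_idx + 1 and padding stays at padding_idx.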
|
|
|
def utils_item(tensor): |
|
|
|
if torch.is_tensor(tensor) and tensor.device.type == "xla": |
|
return tensor.detach() |
|
if hasattr(tensor, "item"): |
|
return tensor.item() |
|
if hasattr(tensor, "__getitem__"): |
|
return tensor[0] |
|
return tensor |
|
|
|
def fsdp_wrap(module, min_num_params: Optional[int] = None, **kwargs): |
|
""" |
|
Helper to wrap layers/modules in FSDP. This falls back to a no-op if |
|
fairscale is not available. |
|
|
|
Args: |
|
module (nn.Module): module to (maybe) wrap |
|
min_num_params (int, Optional): minimum number of layer params to wrap |
|
""" |
|
try: |
|
from fairscale.nn import wrap |
|
|
|
if min_num_params is not None: |
|
num_params = sum(p.numel() for p in module.parameters()) |
|
if num_params >= min_num_params: |
|
return wrap(module, **kwargs) |
|
else: |
|
return module |
|
else: |
|
return wrap(module, **kwargs) |
|
except ImportError: |
|
return module |
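
# Usage sketch (hedged): fairscale's wrap() only takes effect inside an
# enable_wrap(...) context that the surrounding training code is assumed to set up;
# outside of it, or without fairscale installed, the call returns the module unchanged.
#
#   layer = fsdp_wrap(some_layer, min_num_params=int(1e8))   # some_layer is hypothetical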
|
|
|
def quant_noise(module, p, block_size): |
|
""" |
|
Wraps modules and applies quantization noise to the weights for |
|
subsequent quantization with Iterative Product Quantization as |
|
described in "Training with Quantization Noise for Extreme Model Compression" |
|
|
|
Args: |
|
- module: nn.Module |
|
- p: amount of Quantization Noise |
|
- block_size: size of the blocks for subsequent quantization with iPQ |
|
|
|
Remarks: |
|
- Module weights must have the right sizes wrt the block size |
|
- Only Linear, Embedding and Conv2d modules are supported for the moment |
|
- For more detail on how to quantize by blocks with convolutional weights, |
|
see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks" |
|
- We implement the simplest form of noise here as stated in the paper |
|
which consists in randomly dropping blocks |
|
""" |
|
|
|
|
|
if p <= 0: |
|
return module |
|
|
|
|
|
assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) |
|
|
|
|
|
is_conv = module.weight.ndim == 4 |
|
|
|
|
|
if not is_conv: |
|
assert ( |
|
module.weight.size(1) % block_size == 0 |
|
), "Input features must be a multiple of block sizes" |
|
|
|
|
|
else: |
|
|
|
if module.kernel_size == (1, 1): |
|
assert ( |
|
module.in_channels % block_size == 0 |
|
), "Input channels must be a multiple of block sizes" |
|
|
|
else: |
|
k = module.kernel_size[0] * module.kernel_size[1] |
|
assert k % block_size == 0, "Kernel size must be a multiple of block size" |
|
|
|
def _forward_pre_hook(mod, input): |
|
|
|
if mod.training: |
|
if not is_conv: |
|
|
|
weight = mod.weight |
|
in_features = weight.size(1) |
|
out_features = weight.size(0) |
|
|
|
|
|
mask = torch.zeros( |
|
in_features // block_size * out_features, device=weight.device |
|
) |
|
mask.bernoulli_(p) |
|
mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) |
|
|
|
else: |
|
|
|
weight = mod.weight |
|
in_channels = mod.in_channels |
|
out_channels = mod.out_channels |
|
|
|
|
|
if mod.kernel_size == (1, 1): |
|
mask = torch.zeros( |
|
int(in_channels // block_size * out_channels), |
|
device=weight.device, |
|
) |
|
mask.bernoulli_(p) |
|
mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) |
|
else: |
|
mask = torch.zeros( |
|
weight.size(0), weight.size(1), device=weight.device |
|
) |
|
mask.bernoulli_(p) |
|
mask = ( |
|
mask.unsqueeze(2) |
|
.unsqueeze(3) |
|
.repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) |
|
) |
|
|
|
|
|
            mask = mask.to(torch.bool)
|
s = 1 / (1 - p) |
|
mod.weight.data = s * weight.masked_fill(mask, 0) |
|
|
|
module.register_forward_pre_hook(_forward_pre_hook) |
|
return module |
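
# Usage sketch (hedged; hyperparameters are illustrative): apply 10% quantization
# noise in 8-wide blocks to a linear layer. The registered forward pre-hook only
# perturbs the weights while the module is in training mode.
#
#   proj = quant_noise(nn.Linear(512, 512, bias=False), p=0.1, block_size=8)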
|
|
|
def relu_squared(x: torch.Tensor): |
|
return F.relu(x).pow(2) |
|
|
|
def gelu(x: torch.Tensor) -> torch.Tensor: |
|
return torch.nn.functional.gelu(x.float()).type_as(x) |
|
|
|
def gelu_accurate(x): |
|
if not hasattr(gelu_accurate, "_a"): |
|
gelu_accurate._a = math.sqrt(2 / math.pi) |
|
return ( |
|
0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) |
|
) |
|
|
|
def get_activation_fn(activation: str) -> Callable: |
|
"""Returns the activation function corresponding to `activation`""" |
|
if activation == "relu": |
|
return F.relu |
|
elif activation == "relu_squared": |
|
return relu_squared |
|
elif activation == "gelu": |
|
return gelu |
|
elif activation == "gelu_fast": |
|
        logger.warning(
|
"--activation-fn=gelu_fast has been renamed to gelu_accurate" |
|
) |
|
return gelu_accurate |
|
elif activation == "gelu_accurate": |
|
return gelu_accurate |
|
elif activation == "tanh": |
|
return torch.tanh |
|
elif activation == "linear": |
|
return lambda x: x |
|
elif activation == "swish": |
|
        return torch.nn.SiLU()  # instantiate so the returned activation is directly callable on tensors
|
else: |
|
raise RuntimeError("--activation-fn {} not supported".format(activation)) |
|
|
|
def softmax(x, dim: int, onnx_trace: bool = False): |
|
if onnx_trace: |
|
return F.softmax(x.float(), dim=dim) |
|
else: |
|
return F.softmax(x, dim=dim, dtype=torch.float32) |
|
|
|
def compute_mask_indices( |
|
shape: Tuple[int, int], |
|
padding_mask: Optional[torch.Tensor], |
|
mask_prob: float, |
|
mask_length: int, |
|
mask_type: str = "static", |
|
mask_other: float = 0.0, |
|
min_masks: int = 0, |
|
no_overlap: bool = False, |
|
min_space: int = 0, |
|
require_same_masks: bool = True, |
|
mask_dropout: float = 0.0, |
|
) -> np.ndarray: |
|
""" |
|
Computes random mask spans for a given shape |
|
|
|
Args: |
|
        shape: the shape for which to compute masks.
|
should be of size 2 where first element is batch size and 2nd is timesteps |
|
padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements |
|
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by |
|
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. |
|
however due to overlaps, the actual number will be smaller (unless no_overlap is True) |
|
mask_type: how to compute mask lengths |
|
static = fixed size |
|
uniform = sample from uniform distribution [mask_other, mask_length*2] |
|
normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element |
|
            poisson = sample from a Poisson distribution with lambda = mask_length
|
min_masks: minimum number of masked spans |
|
        no_overlap: if true, switches to an alternative recursive algorithm that prevents spans from overlapping
|
min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans |
|
        require_same_masks: if true, will randomly drop out masks until the same number of masks remains in each sample
|
mask_dropout: randomly dropout this percentage of masks in each example |
|
""" |
|
|
|
bsz, all_sz = shape |
|
mask = np.full((bsz, all_sz), False) |
|
|
|
all_num_mask = int( |
|
|
|
mask_prob * all_sz / float(mask_length) |
|
+ np.random.rand() |
|
) |
|
|
|
all_num_mask = max(min_masks, all_num_mask) |
|
|
|
mask_idcs = [] |
|
for i in range(bsz): |
|
if padding_mask is not None: |
|
sz = all_sz - padding_mask[i].long().sum().item() |
|
num_mask = int( |
|
|
|
mask_prob * sz / float(mask_length) |
|
+ np.random.rand() |
|
) |
|
num_mask = max(min_masks, num_mask) |
|
else: |
|
sz = all_sz |
|
num_mask = all_num_mask |
|
|
|
if mask_type == "static": |
|
lengths = np.full(num_mask, mask_length) |
|
elif mask_type == "uniform": |
|
lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask) |
|
elif mask_type == "normal": |
|
lengths = np.random.normal(mask_length, mask_other, size=num_mask) |
|
lengths = [max(1, int(round(x))) for x in lengths] |
|
elif mask_type == "poisson": |
|
lengths = np.random.poisson(mask_length, size=num_mask) |
|
lengths = [int(round(x)) for x in lengths] |
|
else: |
|
raise Exception("unknown mask selection " + mask_type) |
|
|
|
if sum(lengths) == 0: |
|
lengths[0] = min(mask_length, sz - 1) |
|
|
|
if no_overlap: |
|
mask_idc = [] |
|
|
|
def arrange(s, e, length, keep_length): |
|
span_start = np.random.randint(s, e - length) |
|
mask_idc.extend(span_start + i for i in range(length)) |
|
|
|
new_parts = [] |
|
if span_start - s - min_space >= keep_length: |
|
new_parts.append((s, span_start - min_space + 1)) |
|
if e - span_start - keep_length - min_space > keep_length: |
|
new_parts.append((span_start + length + min_space, e)) |
|
return new_parts |
|
|
|
parts = [(0, sz)] |
|
min_length = min(lengths) |
|
for length in sorted(lengths, reverse=True): |
|
lens = np.fromiter( |
|
(e - s if e - s >= length + min_space else 0 for s, e in parts), |
|
                    int,
|
) |
|
l_sum = np.sum(lens) |
|
if l_sum == 0: |
|
break |
|
probs = lens / np.sum(lens) |
|
c = np.random.choice(len(parts), p=probs) |
|
s, e = parts.pop(c) |
|
parts.extend(arrange(s, e, length, min_length)) |
|
mask_idc = np.asarray(mask_idc) |
|
else: |
|
min_len = min(lengths) |
|
if sz - min_len <= num_mask: |
|
min_len = sz - num_mask - 1 |
|
|
|
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) |
|
|
|
mask_idc = np.asarray( |
|
[ |
|
mask_idc[j] + offset |
|
for j in range(len(mask_idc)) |
|
for offset in range(lengths[j]) |
|
] |
|
) |
|
|
|
mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) |
|
|
|
min_len = min([len(m) for m in mask_idcs]) |
|
for i, mask_idc in enumerate(mask_idcs): |
|
if len(mask_idc) > min_len and require_same_masks: |
|
mask_idc = np.random.choice(mask_idc, min_len, replace=False) |
|
if mask_dropout > 0: |
|
num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int) |
|
mask_idc = np.random.choice( |
|
mask_idc, len(mask_idc) - num_holes, replace=False |
|
) |
|
|
|
mask[i, mask_idc] = True |
|
|
|
return mask |
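
# Usage sketch (hedged; numbers follow common wav2vec 2.0-style settings): mask roughly
# 65% of the timesteps of a (batch=2, frames=100) input using spans of length 10.
# The result is a boolean numpy array of shape (2, 100).
#
#   mask = compute_mask_indices((2, 100), padding_mask=None, mask_prob=0.65, mask_length=10)
#   mask_t = torch.from_numpy(mask)   # converted to a tensor before indexing model features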
|
|
|
def init_bert_params(module): |
|
""" |
|
Initialize the weights specific to the BERT Model. |
|
This overrides the default initializations depending on the specified arguments. |
|
1. If normal_init_linear_weights is set then weights of linear |
|
layer will be initialized using the normal distribution and |
|
       bias will be set to the specified value.
|
2. If normal_init_embed_weights is set then weights of embedding |
|
layer will be initialized using the normal distribution. |
|
3. If normal_init_proj_weights is set then weights of |
|
in_project_weight for MultiHeadAttention initialized using |
|
the normal distribution (to be validated). |
|
""" |
|
|
|
def normal_(data): |
|
|
|
|
|
data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) |
|
|
|
if isinstance(module, nn.Linear): |
|
normal_(module.weight.data) |
|
if module.bias is not None: |
|
module.bias.data.zero_() |
|
if isinstance(module, nn.Embedding): |
|
normal_(module.weight.data) |
|
if module.padding_idx is not None: |
|
module.weight.data[module.padding_idx].zero_() |
|
if isinstance(module, MultiheadAttention): |
|
normal_(module.q_proj.weight.data) |
|
normal_(module.k_proj.weight.data) |
|
normal_(module.v_proj.weight.data) |
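
# Usage sketch: intended to be applied recursively over a model, e.g.
#   model.apply(init_bert_params)
# as done in TransformerEncoder.__init__ further below.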
|
|
|
def pad_to_multiple(x, multiple, dim=-1, value=0): |
|
|
|
if x is None: |
|
return None, 0 |
|
tsz = x.size(dim) |
|
m = tsz / multiple |
|
remainder = math.ceil(m) * multiple - tsz |
|
if m.is_integer(): |
|
return x, 0 |
|
pad_offset = (0,) * (-1 - dim) * 2 |
|
|
|
return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder |
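
# Worked example (illustrative): padding a (B, T, C) tensor with T = 10 to a multiple
# of 8 along dim=-2 returns a tensor with T = 16 and remainder = 6; an input whose
# length is already a multiple of 8 is returned unchanged with remainder = 0.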
|
|
|
def is_xla_tensor(tensor): |
|
return torch.is_tensor(tensor) and tensor.device.type == "xla" |
|
|
|
def index_put(tensor, indices, value): |
|
if is_xla_tensor(tensor): |
|
for _ in range(indices.dim(), tensor.dim()): |
|
indices = indices.unsqueeze(-1) |
|
if indices.size(-1) < tensor.size(-1): |
|
indices = indices.expand_as(tensor) |
|
tensor = torch.mul(tensor, ~indices) + torch.mul(value, indices) |
|
else: |
|
tensor[indices] = value |
|
return tensor |
|
|
|
def PositionalEmbedding( |
|
num_embeddings: int, |
|
embedding_dim: int, |
|
padding_idx: int, |
|
learned: bool = False, |
|
): |
|
if learned: |
|
|
|
|
|
|
|
|
|
if padding_idx is not None: |
|
num_embeddings = num_embeddings + padding_idx + 1 |
|
m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) |
|
nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5) |
|
if padding_idx is not None: |
|
nn.init.constant_(m.weight[padding_idx], 0) |
|
else: |
|
m = SinusoidalPositionalEmbedding( |
|
embedding_dim, |
|
padding_idx, |
|
init_size=num_embeddings + padding_idx + 1, |
|
) |
|
return m |
|
|
|
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): |
|
if torch.jit.is_scripting() or torch.jit.is_tracing(): |
|
export = True |
|
if not export and torch.cuda.is_available() and has_fused_layernorm: |
|
return FusedLayerNorm(normalized_shape, eps, elementwise_affine) |
|
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) |
|
|
|
|
|
class TransformerEncoderBase(nn.Module): |
|
""" |
|
Transformer encoder consisting of *cfg.encoder.layers* layers. Each layer |
|
is a :class:`TransformerEncoderLayer`. |
|
|
|
Args: |
|
        cfg: encoder configuration (provides ``cfg.encoder.*``, dropout and quant-noise settings)

        dictionary: unused; kept for interface compatibility (pass ``None``)
|
embed_tokens (torch.nn.Embedding): input embedding |
|
""" |
|
|
|
def __init__(self, cfg, dictionary, embed_tokens, use_rel_pos_enc=False, scaling_for_att=1.0): |
|
        super().__init__()

        self.cfg = cfg
|
self.register_buffer("version", torch.Tensor([3])) |
|
|
|
self.dropout_module = FairseqDropout( |
|
cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__) |
|
) |
|
self.encoder_layerdrop = cfg.encoder.layerdrop |
|
|
|
embed_dim = embed_tokens.embedding_dim if embed_tokens is not None else cfg.encoder.embed_dim |
|
self.padding_idx = embed_tokens.padding_idx if embed_tokens is not None else 1 |
|
self.max_source_positions = cfg.max_source_positions |
|
|
|
self.embed_tokens = embed_tokens |
|
|
|
self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim) |
|
|
|
self.embed_positions = ( |
|
PositionalEmbedding( |
|
cfg.max_source_positions, |
|
embed_dim, |
|
self.padding_idx, |
|
learned=cfg.encoder.learned_pos, |
|
) |
|
if not cfg.no_token_positional_embeddings |
|
else None |
|
) |
|
if cfg.layernorm_embedding: |
|
self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export) |
|
else: |
|
self.layernorm_embedding = None |
|
|
|
if not cfg.adaptive_input and cfg.quant_noise.pq > 0: |
|
self.quant_noise = quant_noise( |
|
nn.Linear(embed_dim, embed_dim, bias=False), |
|
cfg.quant_noise.pq, |
|
cfg.quant_noise.pq_block_size, |
|
) |
|
else: |
|
self.quant_noise = None |
|
|
|
if self.encoder_layerdrop > 0.0: |
|
self.layers = LayerDropModuleList(p=self.encoder_layerdrop) |
|
else: |
|
self.layers = nn.ModuleList([]) |
|
self.use_rel_pos_enc = use_rel_pos_enc |
|
self.scaling_for_att = scaling_for_att |
|
self.layers.extend( |
|
[self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)] |
|
) |
|
self.num_layers = len(self.layers) |
|
|
|
if cfg.encoder.normalize_before: |
|
self.layer_norm = LayerNorm(embed_dim, export=cfg.export) |
|
else: |
|
self.layer_norm = None |
|
if self.use_rel_pos_enc: |
|
self.pos_emb = RelativePositionalEncoding(embed_dim // cfg.encoder.attention_heads, 160) |
|
|
|
def build_encoder_layer(self, cfg): |
|
layer = TransformerEncoderLayerBase(cfg, has_relative_attention_bias=self.use_rel_pos_enc, scaling_for_att=self.scaling_for_att) |
|
checkpoint = cfg.checkpoint_activations |
|
if checkpoint: |
|
raise ValueError("We don't support checkpoint_activations for now! Please set cfg.checkpoint_activations=False.") |
|
min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0 |
|
layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap) |
|
return layer |
|
|
|
def forward_embedding( |
|
self, src_tokens, token_embedding: Optional[torch.Tensor] = None |
|
): |
|
|
|
if token_embedding is None: |
|
token_embedding = self.embed_tokens(src_tokens) |
|
x = embed = self.embed_scale * token_embedding |
|
if self.embed_positions is not None: |
|
x = embed + self.embed_positions(src_tokens) |
|
if self.layernorm_embedding is not None: |
|
x = self.layernorm_embedding(x) |
|
x = self.dropout_module(x) |
|
if self.quant_noise is not None: |
|
x = self.quant_noise(x) |
|
return x, embed |
|
|
|
def forward( |
|
self, |
|
src_tokens, |
|
src_lengths: Optional[torch.Tensor] = None, |
|
return_all_hiddens: bool = False, |
|
token_embeddings: Optional[torch.Tensor] = None, |
|
uniformity_layers: Optional[List[int]] = None, |
|
): |
|
""" |
|
Args: |
|
src_tokens (LongTensor): tokens in the source language of shape |
|
`(batch, src_len)` |
|
src_lengths (torch.LongTensor): lengths of each source sentence of |
|
shape `(batch)` |
|
return_all_hiddens (bool, optional): also return all of the |
|
intermediate hidden states (default: False). |
|
token_embeddings (torch.Tensor, optional): precomputed embeddings |
|
default `None` will recompute embeddings |
|
|
|
Returns: |
|
dict: |
|
- **encoder_out** (Tensor): the last encoder layer's output of |
|
shape `(src_len, batch, embed_dim)` |
|
- **encoder_padding_mask** (ByteTensor): the positions of |
|
padding elements of shape `(batch, src_len)` |
|
- **encoder_embedding** (Tensor): the (scaled) embedding lookup |
|
of shape `(batch, src_len, embed_dim)` |
|
- **encoder_states** (List[Tensor]): all intermediate |
|
hidden states of shape `(src_len, batch, embed_dim)`. |
|
Only populated if *return_all_hiddens* is True. |
|
""" |
|
return self.forward_scriptable( |
|
src_tokens, src_lengths, return_all_hiddens, token_embeddings, uniformity_layers |
|
) |
|
|
|
|
|
|
|
|
|
|
|
def forward_scriptable( |
|
self, |
|
src_tokens, |
|
src_lengths: Optional[torch.Tensor] = None, |
|
return_all_hiddens: bool = False, |
|
token_embeddings: Optional[torch.Tensor] = None, |
|
uniformity_layers: Optional[List[int]] = None, |
|
): |
|
""" |
|
Args: |
|
src_tokens (LongTensor): tokens in the source language of shape |
|
`(batch, src_len)` |
|
src_lengths (torch.LongTensor): lengths of each source sentence of |
|
shape `(batch)` |
|
return_all_hiddens (bool, optional): also return all of the |
|
intermediate hidden states (default: False). |
|
token_embeddings (torch.Tensor, optional): precomputed embeddings |
|
default `None` will recompute embeddings |
|
|
|
Returns: |
|
dict: |
|
- **encoder_out** (Tensor): the last encoder layer's output of |
|
shape `(src_len, batch, embed_dim)` |
|
- **encoder_padding_mask** (ByteTensor): the positions of |
|
padding elements of shape `(batch, src_len)` |
|
- **encoder_embedding** (Tensor): the (scaled) embedding lookup |
|
of shape `(batch, src_len, embed_dim)` |
|
- **encoder_states** (List[Tensor]): all intermediate |
|
hidden states of shape `(src_len, batch, embed_dim)`. |
|
Only populated if *return_all_hiddens* is True. |
|
""" |
|
|
|
encoder_padding_mask = src_tokens.eq(self.padding_idx) |
|
has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any() |
|
|
|
x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) |
|
|
|
|
|
if has_pads: |
|
x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x)) |
|
|
|
|
|
        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
|
if self.use_rel_pos_enc: |
|
x_len = x.shape[0] |
|
pos_seq = torch.arange(0, x_len).long().to(x.device) |
|
pos_seq = pos_seq[:, None] - pos_seq[None, :] |
|
pos_k, pos_v = self.pos_emb(pos_seq) |
|
else: |
|
pos_k = None |
|
|
|
encoder_states = [] |
|
uniformity_hiddens = [] |
|
|
|
if return_all_hiddens: |
|
encoder_states.append(x) |
|
|
|
if uniformity_layers is not None and 0 in uniformity_layers: |
|
x = F.normalize(x.float(), dim=-1).type_as(x) |
|
uniformity_hiddens.append(x) |
|
|
|
|
|
for i, layer in enumerate(self.layers): |
|
x = layer( |
|
x, encoder_padding_mask=encoder_padding_mask if has_pads else None, |
|
pos_bias=pos_k, |
|
) |
|
if uniformity_layers is not None and i+1 in uniformity_layers: |
|
x = F.normalize(x.float(), dim=-1).type_as(x) |
|
uniformity_hiddens.append(x) |
|
if return_all_hiddens: |
|
assert encoder_states is not None |
|
encoder_states.append(x) |
|
|
|
if self.layer_norm is not None: |
|
x = self.layer_norm(x) |
|
|
|
|
|
|
|
|
|
|
|
src_lengths = ( |
|
src_tokens.ne(self.padding_idx) |
|
.sum(dim=1, dtype=torch.int32) |
|
.reshape(-1, 1) |
|
.contiguous() |
|
) |
|
return { |
|
"encoder_out": [x], |
|
"encoder_padding_mask": [encoder_padding_mask], |
|
"encoder_embedding": [encoder_embedding], |
|
"encoder_states": encoder_states, |
|
"uniformity_hiddens": uniformity_hiddens, |
|
"src_tokens": [], |
|
"src_lengths": [src_lengths], |
|
} |
|
|
|
def forward_torchscript(self, net_input: Dict[str, Tensor]): |
|
"""A TorchScript-compatible version of forward. |
|
|
|
Encoders which use additional arguments may want to override |
|
this method for TorchScript compatibility. |
|
""" |
|
if torch.jit.is_scripting(): |
|
return self.forward( |
|
src_tokens=net_input["src_tokens"], |
|
src_lengths=net_input["src_lengths"], |
|
) |
|
else: |
|
return self.forward_non_torchscript(net_input) |
|
|
|
@torch.jit.unused |
|
def forward_non_torchscript(self, net_input: Dict[str, Tensor]): |
|
encoder_input = { |
|
k: v for k, v in net_input.items() if k != "prev_output_tokens" |
|
} |
|
return self.forward(**encoder_input) |
|
|
|
@torch.jit.export |
|
def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order): |
|
""" |
|
Reorder encoder output according to *new_order*. |
|
|
|
Args: |
|
encoder_out: output from the ``forward()`` method |
|
new_order (LongTensor): desired order |
|
|
|
Returns: |
|
*encoder_out* rearranged according to *new_order* |
|
""" |
|
if len(encoder_out["encoder_out"]) == 0: |
|
new_encoder_out = [] |
|
else: |
|
new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)] |
|
if len(encoder_out["encoder_padding_mask"]) == 0: |
|
new_encoder_padding_mask = [] |
|
else: |
|
new_encoder_padding_mask = [ |
|
encoder_out["encoder_padding_mask"][0].index_select(0, new_order) |
|
] |
|
if len(encoder_out["encoder_embedding"]) == 0: |
|
new_encoder_embedding = [] |
|
else: |
|
new_encoder_embedding = [ |
|
encoder_out["encoder_embedding"][0].index_select(0, new_order) |
|
] |
|
|
|
if len(encoder_out["src_tokens"]) == 0: |
|
src_tokens = [] |
|
else: |
|
src_tokens = [(encoder_out["src_tokens"][0]).index_select(0, new_order)] |
|
|
|
if len(encoder_out["src_lengths"]) == 0: |
|
src_lengths = [] |
|
else: |
|
src_lengths = [(encoder_out["src_lengths"][0]).index_select(0, new_order)] |
|
|
|
encoder_states = encoder_out["encoder_states"] |
|
if len(encoder_states) > 0: |
|
for idx, state in enumerate(encoder_states): |
|
encoder_states[idx] = state.index_select(1, new_order) |
|
|
|
return { |
|
"encoder_out": new_encoder_out, |
|
"encoder_padding_mask": new_encoder_padding_mask, |
|
"encoder_embedding": new_encoder_embedding, |
|
"encoder_states": encoder_states, |
|
"src_tokens": src_tokens, |
|
"src_lengths": src_lengths, |
|
} |
|
|
|
def max_positions(self): |
|
"""Maximum input length supported by the encoder.""" |
|
if self.embed_positions is None: |
|
return self.max_source_positions |
|
return min(self.max_source_positions, self.embed_positions.max_positions) |
|
|
|
def upgrade_state_dict_named(self, state_dict, name): |
|
"""Upgrade a (possibly old) state dict for new versions.""" |
|
if isinstance(self.embed_positions, SinusoidalPositionalEmbedding): |
|
weights_key = "{}.embed_positions.weights".format(name) |
|
if weights_key in state_dict: |
|
print("deleting {0}".format(weights_key)) |
|
del state_dict[weights_key] |
|
state_dict[ |
|
"{}.embed_positions._float_tensor".format(name) |
|
] = torch.FloatTensor(1) |
|
for i in range(self.num_layers): |
|
|
|
self.layers[i].upgrade_state_dict_named( |
|
state_dict, "{}.layers.{}".format(name, i) |
|
) |
|
|
|
version_key = "{}.version".format(name) |
|
if utils_item(state_dict.get(version_key, torch.Tensor([1]))[0]) < 2: |
|
|
|
self.layer_norm = None |
|
self.normalize = False |
|
state_dict[version_key] = torch.Tensor([1]) |
|
return state_dict |
|
|
|
def set_num_updates(self, num_updates): |
|
"""State from trainer to pass along to model at every update.""" |
|
|
|
def _apply(m): |
|
if hasattr(m, "set_num_updates") and m != self: |
|
m.set_num_updates(num_updates) |
|
|
|
self.apply(_apply) |
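
# Usage sketch (hedged; cfg and embed_tokens come from the surrounding model code):
#   encoder = TransformerEncoderBase(cfg, dictionary=None, embed_tokens=embed_tokens)
#   out = encoder(src_tokens, return_all_hiddens=True)
#   features = out["encoder_out"][0]        # (src_len, batch, embed_dim)
#   states = out["encoder_states"]          # per-layer hidden states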
|
|
|
|
|
class TransformerEncoderLayerBase(nn.Module): |
|
"""Encoder layer block. |
|
|
|
In the original paper each operation (multi-head attention or FFN) is |
|
postprocessed with: `dropout -> add residual -> layernorm`. In the |
|
tensor2tensor code they suggest that learning is more robust when |
|
preprocessing each layer with layernorm and postprocessing with: |
|
`dropout -> add residual`. We default to the approach in the paper, but the |
|
tensor2tensor approach can be enabled by setting |
|
*cfg.encoder.normalize_before* to ``True``. |
|
|
|
Args: |
|
        cfg: encoder configuration (provides ``cfg.encoder.*``, dropout and quant-noise settings)
|
""" |
|
|
|
def __init__(self, cfg, has_relative_attention_bias=False, scaling_for_att=1.0): |
|
super().__init__() |
|
self.cfg = cfg |
|
self.embed_dim = cfg.encoder.embed_dim |
|
self.quant_noise = cfg.quant_noise.pq |
|
self.quant_noise_block_size = cfg.quant_noise.pq_block_size |
|
self.self_attn = self.build_self_attention(self.embed_dim, cfg, has_relative_attention_bias=has_relative_attention_bias, scaling_for_att=scaling_for_att) |
|
self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) |
|
self.dropout_module = FairseqDropout( |
|
cfg.dropout, module_name=self.__class__.__name__ |
|
) |
|
self.activation_fn = get_activation_fn(activation=cfg.activation_fn) |
|
activation_dropout_p = cfg.activation_dropout |
|
        if activation_dropout_p == 0:
            # for backwards compatibility with models that set cfg.relu_dropout instead
            activation_dropout_p = cfg.relu_dropout or 0
|
self.activation_dropout_module = FairseqDropout( |
|
float(activation_dropout_p), module_name=self.__class__.__name__ |
|
) |
|
self.normalize_before = cfg.encoder.normalize_before |
|
self.fc1 = self.build_fc1( |
|
self.embed_dim, |
|
cfg.encoder.ffn_embed_dim, |
|
self.quant_noise, |
|
self.quant_noise_block_size, |
|
) |
|
self.fc2 = self.build_fc2( |
|
cfg.encoder.ffn_embed_dim, |
|
self.embed_dim, |
|
self.quant_noise, |
|
self.quant_noise_block_size, |
|
) |
|
|
|
self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) |
|
if has_relative_attention_bias: |
|
self.norm_k = LayerNorm(self.embed_dim // cfg.encoder.attention_heads) |
|
|
|
def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): |
|
return quant_noise( |
|
nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size |
|
) |
|
|
|
def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): |
|
return quant_noise( |
|
nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size |
|
) |
|
|
|
def build_self_attention(self, embed_dim, cfg, has_relative_attention_bias=False, scaling_for_att=1.0): |
|
return MultiheadAttention( |
|
embed_dim, |
|
cfg.encoder.attention_heads, |
|
dropout=cfg.attention_dropout, |
|
self_attention=True, |
|
q_noise=self.quant_noise, |
|
qn_block_size=self.quant_noise_block_size, |
|
has_relative_attention_bias=has_relative_attention_bias, |
|
scaling_for_att=scaling_for_att, |
|
) |
|
|
|
def residual_connection(self, x, residual): |
|
return residual + x |
|
|
|
def upgrade_state_dict_named(self, state_dict, name): |
|
""" |
|
Rename layer norm states from `...layer_norms.0.weight` to |
|
`...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to |
|
`...final_layer_norm.weight` |
|
""" |
|
layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} |
|
for old, new in layer_norm_map.items(): |
|
for m in ("weight", "bias"): |
|
k = "{}.layer_norms.{}.{}".format(name, old, m) |
|
if k in state_dict: |
|
state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] |
|
del state_dict[k] |
|
|
|
def forward( |
|
self, |
|
x, |
|
encoder_padding_mask: Optional[Tensor], |
|
attn_mask: Optional[Tensor] = None, |
|
pos_bias=None, |
|
): |
|
""" |
|
Args: |
|
x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` |
|
encoder_padding_mask (ByteTensor): binary ByteTensor of shape |
|
`(batch, seq_len)` where padding elements are indicated by ``1``. |
|
attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, |
|
where `tgt_len` is the length of output and `src_len` is the |
|
length of input, though here both are equal to `seq_len`. |
|
`attn_mask[tgt_i, src_j] = 1` means that when calculating the |
|
embedding for `tgt_i`, we exclude (mask out) `src_j`. This is |
|
useful for strided self-attention. |
|
|
|
Returns: |
|
encoded output of shape `(seq_len, batch, embed_dim)` |
|
""" |
|
|
|
|
|
|
|
|
|
|
|
if attn_mask is not None: |
|
attn_mask = attn_mask.masked_fill( |
|
attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4 |
|
) |
|
|
|
residual = x |
|
if self.normalize_before: |
|
x = self.self_attn_layer_norm(x) |
|
if pos_bias is not None: |
|
pos_bias = self.norm_k(pos_bias) |
|
x, _ = self.self_attn( |
|
query=x, |
|
key=x, |
|
value=x, |
|
key_padding_mask=encoder_padding_mask, |
|
need_weights=False, |
|
attn_mask=attn_mask, |
|
position_bias=pos_bias, |
|
) |
|
x = self.dropout_module(x) |
|
x = self.residual_connection(x, residual) |
|
if not self.normalize_before: |
|
x = self.self_attn_layer_norm(x) |
|
|
|
residual = x |
|
if self.normalize_before: |
|
x = self.final_layer_norm(x) |
|
x = self.activation_fn(self.fc1(x)) |
|
x = self.activation_dropout_module(x) |
|
x = self.fc2(x) |
|
x = self.dropout_module(x) |
|
x = self.residual_connection(x, residual) |
|
if not self.normalize_before: |
|
x = self.final_layer_norm(x) |
|
return x |
|
|
|
|
|
class TransformerEncoder(nn.Module): |
|
""" |
|
wav2vec-style transformer encoder. |
|
""" |
|
def __init__(self, args): |
|
super().__init__() |
|
|
|
        self.args = args  # retained so that max_positions() below can read args.max_positions
        self.dropout = args.dropout
|
self.embedding_dim = args.encoder_embed_dim |
|
self.required_seq_len_multiple = args.required_seq_len_multiple |
|
|
|
self.pos_conv = nn.Conv1d( |
|
self.embedding_dim, |
|
self.embedding_dim, |
|
kernel_size=args.conv_pos, |
|
padding=args.conv_pos // 2, |
|
groups=args.conv_pos_groups, |
|
) |
|
dropout = 0 |
|
std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) |
|
nn.init.normal_(self.pos_conv.weight, mean=0, std=std) |
|
nn.init.constant_(self.pos_conv.bias, 0) |
|
|
|
self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) |
|
self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) |
|
|
|
layers = [] |
|
self.use_rel_pos_enc = getattr(args, "use_rel_pos_enc", False) |
|
for _ in range(args.encoder_layers): |
|
layer = TransformerSentenceEncoderLayer( |
|
embedding_dim=self.embedding_dim, |
|
ffn_embedding_dim=args.encoder_ffn_embed_dim, |
|
num_attention_heads=args.encoder_attention_heads, |
|
dropout=self.dropout, |
|
attention_dropout=args.attention_dropout, |
|
activation_dropout=args.activation_dropout, |
|
activation_fn=args.activation_fn, |
|
layer_norm_first=args.layer_norm_first, |
|
has_relative_attention_bias=self.use_rel_pos_enc, |
|
scaling_for_att=getattr(args, "scaling_for_att", 1.0) |
|
) |
|
if args.checkpoint_activations: |
|
raise ValueError("We don't support checkpoint_activations for now! Please set checkpoint_activations=False.") |
|
layers.append(layer) |
|
self.layers = nn.ModuleList(layers) |
|
|
|
self.layer_norm_first = args.layer_norm_first |
|
self.layer_norm = LayerNorm(self.embedding_dim) |
|
self.layerdrop = args.encoder_layerdrop |
|
|
|
if self.use_rel_pos_enc: |
|
self.pos_emb = RelativePositionalEncoding(args.encoder_embed_dim // args.encoder_attention_heads, 160) |
|
|
|
self.apply(init_bert_params) |
|
|
|
def forward(self, x, padding_mask=None, layer=None, conv_pos=True): |
|
x, layer_results = self.extract_features(x, padding_mask, layer, conv_pos) |
|
|
|
if self.layer_norm_first and (layer is None or layer >= len(self.layers) - 1): |
|
x = self.layer_norm(x) |
|
|
|
return x, layer_results |
|
|
|
def extract_features(self, x, padding_mask=None, tgt_layer=None, conv_pos=True): |
|
|
|
if padding_mask is not None: |
|
x = index_put(x, padding_mask, 0) |
|
|
|
if conv_pos: |
|
x_conv = self.pos_conv(x.transpose(1, 2)) |
|
x_conv = x_conv.transpose(1, 2) |
|
x = x + x_conv |
|
|
|
if not self.layer_norm_first: |
|
x = self.layer_norm(x) |
|
|
|
|
|
x, pad_length = pad_to_multiple( |
|
x, self.required_seq_len_multiple, dim=-2, value=0 |
|
) |
|
if pad_length > 0 and padding_mask is None: |
|
padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool) |
|
padding_mask[:, -pad_length:] = True |
|
else: |
|
padding_mask, _ = pad_to_multiple( |
|
padding_mask, self.required_seq_len_multiple, dim=-1, value=True |
|
) |
|
x = F.dropout(x, p=self.dropout, training=self.training) |
|
|
|
|
|
x = x.transpose(0, 1) |
|
|
|
if self.use_rel_pos_enc: |
|
x_len = x.shape[0] |
|
pos_seq = torch.arange(0, x_len).long().to(x.device) |
|
pos_seq = pos_seq[:, None] - pos_seq[None, :] |
|
pos_k, pos_v = self.pos_emb(pos_seq) |
|
else: |
|
pos_k = None |
|
|
|
layer_results = [] |
|
r = None |
|
for i, layer in enumerate(self.layers): |
|
dropout_probability = np.random.random() |
|
if not self.training or (dropout_probability > self.layerdrop): |
|
x, z = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_k) |
|
if tgt_layer is not None: |
|
|
|
if pad_length > 0: |
|
                        layer_results.append(x[:-pad_length])
|
else: |
|
|
|
layer_results.append(x) |
|
if i == tgt_layer: |
|
r = x |
|
break |
|
|
|
if r is not None: |
|
x = r |
|
|
|
|
|
x = x.transpose(0, 1) |
|
|
|
if pad_length > 0: |
|
x = x[:, :-pad_length] |
|
|
|
return x, layer_results |
|
|
|
def max_positions(self): |
|
"""Maximum output length supported by the encoder.""" |
|
return self.args.max_positions |
|
|
|
def upgrade_state_dict_named(self, state_dict, name): |
|
"""Upgrade a (possibly old) state dict for new versions of fairseq.""" |
|
return state_dict |
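
# Usage sketch (hedged): here x is a (batch, time, encoder_embed_dim) feature sequence,
# e.g. the output of a wav2vec-style convolutional feature extractor defined elsewhere.
#   encoder = TransformerEncoder(args)
#   features, layer_results = encoder(x, padding_mask=padding_mask)
#   hidden_l, _ = encoder(x, padding_mask=padding_mask, layer=11)   # stop after the layer at index 11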
|
|
|
|
|
class TransformerSentenceEncoderLayer(nn.Module): |
|
""" |
|
wav2vec-style transformer layer |
|
""" |
|
|
|
def __init__( |
|
self, |
|
        embedding_dim: int = 768,

        ffn_embedding_dim: int = 3072,

        num_attention_heads: int = 8,
|
dropout: float = 0.1, |
|
attention_dropout: float = 0.1, |
|
activation_dropout: float = 0.1, |
|
activation_fn: str = "relu", |
|
layer_norm_first: bool = False, |
|
has_relative_attention_bias: bool = False, |
|
scaling_for_att: float = 1.0, |
|
) -> None: |
|
|
|
super().__init__() |
|
|
|
self.embedding_dim = embedding_dim |
|
self.dropout = dropout |
|
self.activation_dropout = activation_dropout |
|
|
|
|
|
self.activation_fn = get_activation_fn(activation_fn) |
|
self.self_attn = MultiheadAttention( |
|
self.embedding_dim, |
|
num_attention_heads, |
|
dropout=attention_dropout, |
|
self_attention=True, |
|
has_relative_attention_bias=has_relative_attention_bias, |
|
scaling_for_att=scaling_for_att |
|
) |
|
|
|
self.dropout1 = nn.Dropout(dropout) |
|
self.dropout2 = nn.Dropout(self.activation_dropout) |
|
self.dropout3 = nn.Dropout(dropout) |
|
|
|
self.layer_norm_first = layer_norm_first |
|
|
|
|
|
self.self_attn_layer_norm = LayerNorm(self.embedding_dim) |
|
self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) |
|
self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) |
|
|
|
|
|
self.final_layer_norm = LayerNorm(self.embedding_dim) |
|
|
|
if has_relative_attention_bias: |
|
self.norm_k = LayerNorm(self.embedding_dim//num_attention_heads) |
|
|
|
def forward( |
|
self, |
|
x: torch.Tensor, |
|
        self_attn_mask: Optional[torch.Tensor] = None,

        self_attn_padding_mask: Optional[torch.Tensor] = None,
|
need_weights: bool = False, |
|
att_args=None, |
|
pos_bias=None, |
|
): |
|
""" |
|
LayerNorm is applied either before or after the self-attention/ffn |
|
        modules, similar to the original Transformer implementation.
|
""" |
|
residual = x |
|
|
|
if self.layer_norm_first: |
|
x = self.self_attn_layer_norm(x) |
|
if pos_bias is not None: |
|
pos_bias = self.norm_k(pos_bias) |
|
x, attn = self.self_attn( |
|
query=x, |
|
key=x, |
|
value=x, |
|
key_padding_mask=self_attn_padding_mask, |
|
attn_mask=self_attn_mask, |
|
position_bias=pos_bias, |
|
) |
|
x = self.dropout1(x) |
|
x = residual + x |
|
|
|
residual = x |
|
x = self.final_layer_norm(x) |
|
x = self.activation_fn(self.fc1(x)) |
|
x = self.dropout2(x) |
|
x = self.fc2(x) |
|
x = self.dropout3(x) |
|
x = residual + x |
|
else: |
|
x, attn = self.self_attn( |
|
query=x, |
|
key=x, |
|
value=x, |
|
key_padding_mask=self_attn_padding_mask, |
|
position_bias=pos_bias, |
|
) |
|
|
|
x = self.dropout1(x) |
|
x = residual + x |
|
|
|
x = self.self_attn_layer_norm(x) |
|
|
|
residual = x |
|
x = self.activation_fn(self.fc1(x)) |
|
x = self.dropout2(x) |
|
x = self.fc2(x) |
|
x = self.dropout3(x) |
|
x = residual + x |
|
x = self.final_layer_norm(x) |
|
|
|
return x, attn |
|
|
|
|
|
class FairseqDropout(nn.Module): |
|
def __init__(self, p, module_name=None): |
|
super().__init__() |
|
self.p = p |
|
self.module_name = module_name |
|
self.apply_during_inference = False |
|
|
|
def forward(self, x, inplace: bool = False): |
|
if self.p > 0 and (self.training or self.apply_during_inference): |
|
return F.dropout(x, p=self.p, training=True, inplace=inplace) |
|
else: |
|
return x |
|
|
|
def make_generation_fast_( |
|
self, |
|
name: str, |
|
retain_dropout: bool = False, |
|
retain_dropout_modules: Optional[List[str]] = None, |
|
**kwargs |
|
): |
|
if retain_dropout: |
|
if retain_dropout_modules is not None and self.module_name is None: |
|
logger.warning( |
|
"Cannot enable dropout during inference for module {} " |
|
"because module_name was not set".format(name) |
|
) |
|
elif ( |
|
retain_dropout_modules is None |
|
or self.module_name in retain_dropout_modules |
|
): |
|
logger.info( |
|
"Enabling dropout during inference for module: {}".format(name) |
|
) |
|
self.apply_during_inference = True |
|
else: |
|
logger.info("Disabling dropout for module: {}".format(name)) |
|
|
|
|
|
class LearnedPositionalEmbedding(nn.Embedding): |
|
""" |
|
This module learns positional embeddings up to a fixed maximum size. |
|
Padding ids are ignored by either offsetting based on padding_idx |
|
or by setting padding_idx to None and ensuring that the appropriate |
|
position ids are passed to the forward function. |
|
""" |
|
|
|
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): |
|
super().__init__(num_embeddings, embedding_dim, padding_idx) |
|
self.onnx_trace = False |
|
if self.padding_idx is not None: |
|
self.max_positions = self.num_embeddings - self.padding_idx - 1 |
|
else: |
|
self.max_positions = self.num_embeddings |
|
|
|
def forward( |
|
self, |
|
input: Tensor, |
|
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, |
|
positions: Optional[Tensor] = None, |
|
): |
|
"""Input is expected to be of size [bsz x seqlen].""" |
|
assert (positions is None) or ( |
|
self.padding_idx is None |
|
), "If positions is pre-computed then padding_idx should not be set." |
|
|
|
if positions is None: |
|
if incremental_state is not None: |
|
|
|
|
|
positions = torch.zeros( |
|
(1, 1), device=input.device, dtype=input.dtype |
|
).fill_(int(self.padding_idx + input.size(1))) |
|
else: |
|
positions = utils_make_positions( |
|
input, self.padding_idx, onnx_trace=self.onnx_trace |
|
) |
|
positions = torch.clamp(positions, max=self.padding_idx + self.max_positions) |
|
return F.embedding( |
|
positions, |
|
self.weight, |
|
self.padding_idx, |
|
self.max_norm, |
|
self.norm_type, |
|
self.scale_grad_by_freq, |
|
self.sparse, |
|
) |
|
|
|
|
|
class SinusoidalPositionalEmbedding(nn.Module): |
|
"""This module produces sinusoidal positional embeddings of any length. |
|
|
|
Padding symbols are ignored. |
|
""" |
|
|
|
def __init__(self, embedding_dim, padding_idx, init_size=1024): |
|
super().__init__() |
|
self.embedding_dim = embedding_dim |
|
self.padding_idx = padding_idx if padding_idx is not None else 0 |
|
self.weights = SinusoidalPositionalEmbedding.get_embedding( |
|
init_size, embedding_dim, padding_idx |
|
) |
|
self.onnx_trace = False |
|
self.register_buffer("_float_tensor", torch.FloatTensor(1)) |
|
self.max_positions = int(1e5) |
|
|
|
def prepare_for_onnx_export_(self): |
|
self.onnx_trace = True |
|
|
|
@staticmethod |
|
def get_embedding( |
|
num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None |
|
): |
|
"""Build sinusoidal embeddings. |
|
|
|
This matches the implementation in tensor2tensor, but differs slightly |
|
from the description in Section 3.5 of "Attention Is All You Need". |
|
""" |
|
half_dim = embedding_dim // 2 |
|
emb = math.log(10000) / (half_dim - 1) |
|
emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) |
|
emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze( |
|
1 |
|
) * emb.unsqueeze(0) |
|
emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view( |
|
num_embeddings, -1 |
|
) |
|
if embedding_dim % 2 == 1: |
|
|
|
emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) |
|
if padding_idx is not None: |
|
emb[padding_idx, :] = 0 |
|
return emb |
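
    # The get_embedding construction above follows the tensor2tensor layout: with half_dim = d // 2,
    #   emb[pos, j]            = sin(pos * exp(-j * ln(10000) / (half_dim - 1)))
    #   emb[pos, half_dim + j] = cos(pos * exp(-j * ln(10000) / (half_dim - 1)))
    # i.e. all sine components come first, then all cosines, rather than interleaving them.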
|
|
|
def forward( |
|
self, |
|
input, |
|
incremental_state: Optional[Any] = None, |
|
timestep: Optional[Tensor] = None, |
|
positions: Optional[Any] = None, |
|
): |
|
"""Input is expected to be of size [bsz x seqlen].""" |
|
bspair = torch.onnx.operators.shape_as_tensor(input) |
|
bsz, seq_len = bspair[0], bspair[1] |
|
max_pos = self.padding_idx + 1 + seq_len |
|
if self.weights is None or max_pos > self.weights.size(0): |
|
|
|
self.weights = SinusoidalPositionalEmbedding.get_embedding( |
|
max_pos, self.embedding_dim, self.padding_idx |
|
) |
|
self.weights = self.weights.to(self._float_tensor) |
|
|
|
if incremental_state is not None: |
|
|
|
pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len |
|
if self.onnx_trace: |
|
return ( |
|
self.weights.index_select(index=self.padding_idx + pos, dim=0) |
|
.unsqueeze(1) |
|
.repeat(bsz, 1, 1) |
|
) |
|
return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) |
|
|
|
positions = utils_make_positions( |
|
input, self.padding_idx, onnx_trace=self.onnx_trace |
|
) |
|
if self.onnx_trace: |
|
flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) |
|
embedding_shape = torch.cat( |
|
(bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long)) |
|
) |
|
embeddings = torch.onnx.operators.reshape_from_tensor_shape( |
|
flat_embeddings, embedding_shape |
|
) |
|
return embeddings |
|
return ( |
|
self.weights.index_select(0, positions.view(-1)) |
|
.view(bsz, seq_len, -1) |
|
.detach() |
|
) |
|
|
|
|
|
try: |
|
from apex.normalization import FusedLayerNorm as _FusedLayerNorm |
|
|
|
has_fused_layernorm = True |
|
|
|
class FusedLayerNorm(_FusedLayerNorm): |
|
@torch.jit.unused |
|
def forward(self, x): |
|
if not x.is_cuda: |
|
return super().forward(x) |
|
else: |
|
with torch.cuda.device(x.device): |
|
return super().forward(x) |
|
|
|
except ImportError: |
|
has_fused_layernorm = False |
|
|
|
|
|
class Fp32LayerNorm(nn.LayerNorm): |
|
def __init__(self, *args, **kwargs): |
|
super().__init__(*args, **kwargs) |
|
|
|
def forward(self, input): |
|
output = F.layer_norm( |
|
input.float(), |
|
self.normalized_shape, |
|
self.weight.float() if self.weight is not None else None, |
|
self.bias.float() if self.bias is not None else None, |
|
self.eps, |
|
) |
|
return output.type_as(input) |
|
|
|
|
|
class LayerDropModuleList(nn.ModuleList): |
|
""" |
|
A LayerDrop implementation based on :class:`torch.nn.ModuleList`. |
|
|
|
We refresh the choice of which layers to drop every time we iterate |
|
over the LayerDropModuleList instance. During evaluation we always |
|
iterate over all layers. |
|
|
|
Usage:: |
|
|
|
        layers = LayerDropModuleList(p=0.5, modules=[layer1, layer2, layer3])
|
for layer in layers: # this might iterate over layers 1 and 3 |
|
x = layer(x) |
|
for layer in layers: # this might iterate over all layers |
|
x = layer(x) |
|
for layer in layers: # this might not iterate over any layers |
|
x = layer(x) |
|
|
|
Args: |
|
p (float): probability of dropping out each layer |
|
modules (iterable, optional): an iterable of modules to add |
|
""" |
|
|
|
def __init__(self, p, modules=None): |
|
super().__init__(modules) |
|
self.p = p |
|
|
|
def __iter__(self): |
|
dropout_probs = torch.empty(len(self)).uniform_() |
|
for i, m in enumerate(super().__iter__()): |
|
if not self.training or (dropout_probs[i] > self.p): |
|
yield m |
|
|
|
|
|
class RelativePositionalEncoding(torch.nn.Module): |
|
def __init__(self, d_model, maxlen=1000, embed_v=False): |
|
super(RelativePositionalEncoding, self).__init__() |
|
|
|
self.d_model = d_model |
|
self.maxlen = maxlen |
|
self.pe_k = torch.nn.Embedding(2*maxlen, d_model) |
|
if embed_v: |
|
self.pe_v = torch.nn.Embedding(2*maxlen, d_model) |
|
self.embed_v = embed_v |
|
|
|
|
|
def forward(self, pos_seq, incremental_state=None): |
|
pos_seq[pos_seq < -self.maxlen] = -self.maxlen |
|
pos_seq[pos_seq >= self.maxlen] = self.maxlen - 1 |
|
pos_seq = pos_seq + self.maxlen |
|
|
|
if incremental_state is not None: |
|
pos_seq = pos_seq[-1:] |
|
|
|
if self.embed_v: |
|
return self.pe_k(pos_seq), self.pe_v(pos_seq) |
|
else: |
|
return self.pe_k(pos_seq), None |
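
# Usage sketch (mirrors how the encoders above build their relative positions; head_dim
# and T are illustrative names):
#   pos_emb = RelativePositionalEncoding(head_dim, maxlen=160)
#   seq = torch.arange(T)
#   pos_seq = seq[:, None] - seq[None, :]     # (T, T) relative offsets
#   pos_k, pos_v = pos_emb(pos_seq)           # pos_k: (T, T, head_dim); pos_v is None unless embed_v=True
#   # pos_k is then passed to MultiheadAttention as position_bias.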
|
|
|
|
|
class MultiheadAttention(nn.Module): |
|
"""Multi-headed attention. |
|
|
|
See "Attention Is All You Need" for more details. |
|
""" |
|
|
|
def __init__( |
|
self, |
|
embed_dim, |
|
num_heads, |
|
kdim=None, |
|
vdim=None, |
|
dropout=0.0, |
|
bias=True, |
|
add_bias_kv=False, |
|
add_zero_attn=False, |
|
self_attention=False, |
|
encoder_decoder_attention=False, |
|
q_noise=0.0, |
|
qn_block_size=8, |
|
has_relative_attention_bias=False, |
|
scaling_for_att=1.0 |
|
): |
|
super().__init__() |
|
self.embed_dim = embed_dim |
|
self.kdim = kdim if kdim is not None else embed_dim |
|
self.vdim = vdim if vdim is not None else embed_dim |
|
self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim |
|
|
|
self.num_heads = num_heads |
|
self.dropout_module = FairseqDropout( |
|
dropout, module_name=self.__class__.__name__ |
|
) |
|
|
|
self.has_relative_attention_bias = has_relative_attention_bias |
|
|
|
self.head_dim = embed_dim // num_heads |
|
assert ( |
|
self.head_dim * num_heads == self.embed_dim |
|
), "embed_dim must be divisible by num_heads" |
|
self.scaling = self.head_dim ** -0.5 |
|
self.scaling_for_att = scaling_for_att |
|
|
|
self.self_attention = self_attention |
|
self.encoder_decoder_attention = encoder_decoder_attention |
|
|
|
assert not self.self_attention or self.qkv_same_dim, ( |
|
"Self-attention requires query, key and " "value to be of the same size" |
|
) |
|
|
|
self.k_proj = quant_noise( |
|
nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size |
|
) |
|
self.v_proj = quant_noise( |
|
nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size |
|
) |
|
self.q_proj = quant_noise( |
|
nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size |
|
) |
|
|
|
self.out_proj = quant_noise( |
|
nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size |
|
) |
|
|
|
if add_bias_kv: |
|
self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) |
|
self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) |
|
else: |
|
self.bias_k = self.bias_v = None |
|
|
|
self.add_zero_attn = add_zero_attn |
|
|
|
self.reset_parameters() |
|
|
|
self.onnx_trace = False |
|
|
|
def prepare_for_onnx_export_(self): |
|
self.onnx_trace = True |
|
|
|
def reset_parameters(self): |
|
if self.qkv_same_dim: |
|
|
|
|
|
nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) |
|
nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) |
|
nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) |
|
else: |
|
nn.init.xavier_uniform_(self.k_proj.weight) |
|
nn.init.xavier_uniform_(self.v_proj.weight) |
|
nn.init.xavier_uniform_(self.q_proj.weight) |
|
|
|
nn.init.xavier_uniform_(self.out_proj.weight) |
|
if self.out_proj.bias is not None: |
|
nn.init.constant_(self.out_proj.bias, 0.0) |
|
if self.bias_k is not None: |
|
nn.init.xavier_normal_(self.bias_k) |
|
if self.bias_v is not None: |
|
nn.init.xavier_normal_(self.bias_v) |
|
|
|
def forward( |
|
self, |
|
query, |
|
key: Optional[Tensor], |
|
value: Optional[Tensor], |
|
key_padding_mask: Optional[Tensor] = None, |
|
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, |
|
need_weights: bool = True, |
|
static_kv: bool = False, |
|
attn_mask: Optional[Tensor] = None, |
|
before_softmax: bool = False, |
|
need_head_weights: bool = False, |
|
position_bias: Optional[Tensor] = None |
|
) -> Tuple[Tensor, Optional[Tensor]]: |
|
"""Input shape: Time x Batch x Channel |
|
|
|
Args: |
|
key_padding_mask (ByteTensor, optional): mask to exclude |
|
keys that are pads, of shape `(batch, src_len)`, where |
|
padding elements are indicated by 1s. |
|
need_weights (bool, optional): return the attention weights, |
|
                averaged over heads (default: True).
|
attn_mask (ByteTensor, optional): typically used to |
|
implement causal attention, where the mask prevents the |
|
attention from looking forward in time (default: None). |
|
before_softmax (bool, optional): return the raw attention |
|
weights and values before the attention softmax. |
|
need_head_weights (bool, optional): return the attention |
|
weights for each head. Implies *need_weights*. Default: |
|
return the average attention weights over all heads. |
|
""" |
|
if need_head_weights: |
|
need_weights = True |
|
|
|
is_tpu = query.device.type == "xla" |
|
|
|
tgt_len, bsz, embed_dim = query.size() |
|
src_len = tgt_len |
|
assert embed_dim == self.embed_dim, f"query dim {embed_dim} != {self.embed_dim}" |
|
assert list(query.size()) == [tgt_len, bsz, embed_dim] |
|
if key is not None: |
|
src_len, key_bsz, _ = key.size() |
|
if not torch.jit.is_scripting(): |
|
assert key_bsz == bsz |
|
assert value is not None |
|
                assert value.shape[:2] == (src_len, bsz)
|
|
|
if ( |
|
not self.onnx_trace |
|
and not is_tpu |
|
and incremental_state is None |
|
and not static_kv |
|
|
|
|
|
and not torch.jit.is_scripting() |
|
and not self.has_relative_attention_bias |
|
): |
|
assert key is not None and value is not None |
|
return F.multi_head_attention_forward( |
|
query, |
|
key, |
|
value, |
|
self.embed_dim, |
|
self.num_heads, |
|
torch.empty([0]), |
|
torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), |
|
self.bias_k, |
|
self.bias_v, |
|
self.add_zero_attn, |
|
self.dropout_module.p, |
|
self.out_proj.weight, |
|
self.out_proj.bias, |
|
self.training or self.dropout_module.apply_during_inference, |
|
key_padding_mask, |
|
need_weights, |
|
attn_mask, |
|
use_separate_proj_weight=True, |
|
q_proj_weight=self.q_proj.weight, |
|
k_proj_weight=self.k_proj.weight, |
|
v_proj_weight=self.v_proj.weight, |
|
) |
|
|
|
if incremental_state is not None: |
|
saved_state = self._get_input_buffer(incremental_state) |
|
if saved_state is not None and "prev_key" in saved_state: |
|
|
|
|
|
if static_kv: |
|
assert self.encoder_decoder_attention and not self.self_attention |
|
key = value = None |
|
else: |
|
saved_state = None |
|
|
|
if self.self_attention: |
|
q = self.q_proj(query) |
|
k = self.k_proj(query) |
|
v = self.v_proj(query) |
|
elif self.encoder_decoder_attention: |
|
|
|
q = self.q_proj(query) |
|
if key is None: |
|
assert value is None |
|
k = v = None |
|
else: |
|
k = self.k_proj(key) |
|
v = self.v_proj(key) |
|
|
|
else: |
|
assert key is not None and value is not None |
|
q = self.q_proj(query) |
|
k = self.k_proj(key) |
|
v = self.v_proj(value) |
|
q *= self.scaling |
|
q *= (1 / self.scaling_for_att) |
|
|
|
if self.bias_k is not None: |
|
assert self.bias_v is not None |
|
k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) |
|
v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) |
|
if attn_mask is not None: |
|
attn_mask = torch.cat( |
|
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 |
|
) |
|
if key_padding_mask is not None: |
|
key_padding_mask = torch.cat( |
|
[ |
|
key_padding_mask, |
|
key_padding_mask.new_zeros(key_padding_mask.size(0), 1), |
|
], |
|
dim=1, |
|
) |
|
|
|
q = ( |
|
q.contiguous() |
|
.view(tgt_len, bsz * self.num_heads, self.head_dim) |
|
.transpose(0, 1) |
|
) |
|
if k is not None: |
|
k = ( |
|
k.contiguous() |
|
.view(-1, bsz * self.num_heads, self.head_dim) |
|
.transpose(0, 1) |
|
) |
|
if v is not None: |
|
v = ( |
|
v.contiguous() |
|
.view(-1, bsz * self.num_heads, self.head_dim) |
|
.transpose(0, 1) |
|
) |
|
|
|
if saved_state is not None: |
|
|
|
if "prev_key" in saved_state: |
|
_prev_key = saved_state["prev_key"] |
|
assert _prev_key is not None |
|
prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) |
|
if static_kv: |
|
k = prev_key |
|
else: |
|
assert k is not None |
|
k = torch.cat([prev_key, k], dim=1) |
|
src_len = k.size(1) |
|
if "prev_value" in saved_state: |
|
_prev_value = saved_state["prev_value"] |
|
assert _prev_value is not None |
|
prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) |
|
if static_kv: |
|
v = prev_value |
|
else: |
|
assert v is not None |
|
v = torch.cat([prev_value, v], dim=1) |
|
prev_key_padding_mask: Optional[Tensor] = None |
|
if "prev_key_padding_mask" in saved_state: |
|
prev_key_padding_mask = saved_state["prev_key_padding_mask"] |
|
assert k is not None and v is not None |
|
key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( |
|
key_padding_mask=key_padding_mask, |
|
prev_key_padding_mask=prev_key_padding_mask, |
|
batch_size=bsz, |
|
src_len=k.size(1), |
|
static_kv=static_kv, |
|
) |
|
|
|
saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) |
|
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) |
|
saved_state["prev_key_padding_mask"] = key_padding_mask |
|
|
|
assert incremental_state is not None |
|
incremental_state = self._set_input_buffer(incremental_state, saved_state) |
|
assert k is not None |
|
assert k.size(1) == src_len |
|
|
|
|
|
|
|
if key_padding_mask is not None and key_padding_mask.dim() == 0: |
|
key_padding_mask = None |
|
|
|
if key_padding_mask is not None: |
|
assert key_padding_mask.size(0) == bsz |
|
assert key_padding_mask.size(1) == src_len |
|
|
|
if self.add_zero_attn: |
|
assert v is not None |
|
src_len += 1 |
|
k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) |
|
v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) |
|
if attn_mask is not None: |
|
attn_mask = torch.cat( |
|
[attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 |
|
) |
|
if key_padding_mask is not None: |
|
key_padding_mask = torch.cat( |
|
[ |
|
key_padding_mask, |
|
torch.zeros(key_padding_mask.size(0), 1).type_as( |
|
key_padding_mask |
|
), |
|
], |
|
dim=1, |
|
) |
|
|
|
attn_weights = torch.bmm(q, k.transpose(1, 2)) |
|
attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) |
|
|
|
if position_bias is not None: |
|
|
|
|
|
reshape_q = q.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0,1) |
|
|
|
B = torch.matmul(reshape_q, position_bias.transpose(-2, -1)) |
|
|
|
|
|
B = B.transpose(0, 1).view(bsz*self.num_heads, position_bias.size(0), position_bias.size(1)) |
|
|
|
attn_weights += B |
|
|
|
attn_weights *= self.scaling_for_att |
|
assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] |
|
|
|
if attn_mask is not None: |
|
attn_mask = attn_mask.unsqueeze(0) |
|
if self.onnx_trace: |
|
attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) |
|
attn_weights += attn_mask |
|
|
|
if key_padding_mask is not None: |
|
|
|
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) |
|
if not is_tpu: |
|
attn_weights = attn_weights.masked_fill( |
|
key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), |
|
float("-inf"), |
|
) |
|
else: |
|
attn_weights = attn_weights.transpose(0, 2) |
|
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) |
|
attn_weights = attn_weights.transpose(0, 2) |
|
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) |
|
|
|
        if self.scaling_for_att > 1.0:
            # softmax is shift-invariant, so subtracting the (detached) row-wise
            # maximum keeps the extra attention scaling numerically stable
            attn_weights = attn_weights - attn_weights.detach().max(
                dim=-1, keepdim=True
            )[0]
|
|
|
if before_softmax: |
|
return attn_weights, v |
|
|
|
attn_weights_float = softmax( |
|
attn_weights, dim=-1, onnx_trace=self.onnx_trace |
|
) |
|
attn_weights = attn_weights_float.type_as(attn_weights) |
|
attn_probs = self.dropout_module(attn_weights) |
|
|
|
assert v is not None |
|
attn = torch.bmm(attn_probs, v) |
|
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] |
|
        if self.onnx_trace and attn.size(1) == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
|
else: |
|
attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) |
|
attn = self.out_proj(attn) |
|
attn_weights: Optional[Tensor] = None |
|
if need_weights: |
|
attn_weights = attn_weights_float.view( |
|
bsz, self.num_heads, tgt_len, src_len |
|
).transpose(1, 0) |
|
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)
|
|
|
return attn, attn_weights |
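    # Note on returned shapes (derived from the code above, added for clarity):
    # `attn` is (tgt_len, bsz, embed_dim); when `need_weights` is set,
    # `attn_weights` is (num_heads, bsz, tgt_len, src_len) and is averaged over
    # the head dimension to (bsz, tgt_len, src_len) unless `need_head_weights`
    # is also set.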
|
|
|
@staticmethod |
|
def _append_prev_key_padding_mask( |
|
key_padding_mask: Optional[Tensor], |
|
prev_key_padding_mask: Optional[Tensor], |
|
batch_size: int, |
|
src_len: int, |
|
static_kv: bool, |
|
    ) -> Optional[Tensor]:
        # saved key padding masks have shape (batch_size, seq_len)
        if prev_key_padding_mask is not None and static_kv:
|
new_key_padding_mask = prev_key_padding_mask |
|
elif prev_key_padding_mask is not None and key_padding_mask is not None: |
|
new_key_padding_mask = torch.cat( |
|
[prev_key_padding_mask.float(), key_padding_mask.float()], dim=1 |
|
) |
|
|
|
|
|
|
|
        # during incremental decoding, as the padding token enters and leaves
        # the frame, there will be a time when prev or current is None
        elif prev_key_padding_mask is not None:
|
if src_len > prev_key_padding_mask.size(1): |
|
filler = torch.zeros( |
|
(batch_size, src_len - prev_key_padding_mask.size(1)), |
|
device=prev_key_padding_mask.device, |
|
) |
|
new_key_padding_mask = torch.cat( |
|
[prev_key_padding_mask.float(), filler.float()], dim=1 |
|
) |
|
else: |
|
new_key_padding_mask = prev_key_padding_mask.float() |
|
elif key_padding_mask is not None: |
|
if src_len > key_padding_mask.size(1): |
|
filler = torch.zeros( |
|
(batch_size, src_len - key_padding_mask.size(1)), |
|
device=key_padding_mask.device, |
|
) |
|
new_key_padding_mask = torch.cat( |
|
[filler.float(), key_padding_mask.float()], dim=1 |
|
) |
|
else: |
|
new_key_padding_mask = key_padding_mask.float() |
|
else: |
|
new_key_padding_mask = prev_key_padding_mask |
|
return new_key_padding_mask |
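    # Worked example (illustrative shapes): with a cached mask `prev` of shape
    # (bsz, 2) and the current step's mask `cur` of shape (bsz, 1), the
    # non-static-kv branch returns torch.cat([prev.float(), cur.float()], dim=1)
    # with shape (bsz, 3); when one side is missing, it is filled with zeros
    # (i.e. "not padding") so the result always covers src_len positions.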
|
|
|
@torch.jit.export |
|
def reorder_incremental_state( |
|
self, |
|
incremental_state: Dict[str, Dict[str, Optional[Tensor]]], |
|
new_order: Tensor, |
|
): |
|
"""Reorder buffered internal state (for incremental generation).""" |
|
input_buffer = self._get_input_buffer(incremental_state) |
|
if input_buffer is not None: |
|
for k in input_buffer.keys(): |
|
input_buffer_k = input_buffer[k] |
|
if input_buffer_k is not None: |
|
if self.encoder_decoder_attention and input_buffer_k.size( |
|
0 |
|
) == new_order.size(0): |
|
break |
|
input_buffer[k] = input_buffer_k.index_select(0, new_order) |
|
incremental_state = self._set_input_buffer(incremental_state, input_buffer) |
|
return incremental_state |
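    # Sketch: during beam search the cached "prev_key"/"prev_value" buffers of
    # shape (bsz, num_heads, seq_len, head_dim) are index_select-ed along dim 0
    # with `new_order`, so surviving hypotheses keep their attention state;
    # encoder-decoder buffers whose batch dimension already matches `new_order`
    # are assumed to be in the right order and are left untouched.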
|
|
|
def _get_input_buffer( |
|
self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] |
|
) -> Dict[str, Optional[Tensor]]: |
|
result = self.get_incremental_state(incremental_state, "attn_state") |
|
if result is not None: |
|
return result |
|
else: |
|
empty_result: Dict[str, Optional[Tensor]] = {} |
|
return empty_result |
|
|
|
def _set_input_buffer( |
|
self, |
|
incremental_state: Dict[str, Dict[str, Optional[Tensor]]], |
|
buffer: Dict[str, Optional[Tensor]], |
|
): |
|
return self.set_incremental_state(incremental_state, "attn_state", buffer) |
|
|
|
def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): |
|
return attn_weights |
|
|
|
def upgrade_state_dict_named(self, state_dict, name): |
|
prefix = name + "." if name != "" else "" |
|
items_to_add = {} |
|
keys_to_remove = [] |
|
for k in state_dict.keys(): |
|
            if k.endswith(prefix + "in_proj_weight"):
                # in_proj_weight used to be q + k + v with the same dimensions
                dim = int(state_dict[k].shape[0] / 3)
|
items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] |
|
items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] |
|
items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] |
|
|
|
keys_to_remove.append(k) |
|
|
|
k_bias = prefix + "in_proj_bias" |
|
if k_bias in state_dict.keys(): |
|
dim = int(state_dict[k].shape[0] / 3) |
|
items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] |
|
items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][ |
|
dim : 2 * dim |
|
] |
|
items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] |
|
|
|
keys_to_remove.append(prefix + "in_proj_bias") |
|
|
|
for k in keys_to_remove: |
|
del state_dict[k] |
|
|
|
for key, value in items_to_add.items(): |
|
state_dict[key] = value |
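    # Migration example (hedged, shapes illustrative): an old checkpoint key
    # "<name>.in_proj_weight" of shape (3 * embed_dim, embed_dim) is replaced by
    # "<name>.q_proj.weight", "<name>.k_proj.weight" and "<name>.v_proj.weight",
    # each of shape (embed_dim, embed_dim); the fused "in_proj_bias" is split
    # the same way.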
|
|
|
|
|
class ConvFeatureExtractionModel(nn.Module): |
|
def __init__( |
|
self, |
|
conv_layers: List[Tuple[int, int, int]], |
|
dropout: float = 0.0, |
|
mode: str = "default", |
|
conv_bias: bool = False, |
|
): |
|
super().__init__() |
|
|
|
assert mode in {"default", "layer_norm"} |
|
|
|
def block( |
|
n_in, |
|
n_out, |
|
k, |
|
stride, |
|
is_layer_norm=False, |
|
is_group_norm=False, |
|
conv_bias=False, |
|
): |
|
def make_conv(): |
|
conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias) |
|
nn.init.kaiming_normal_(conv.weight) |
|
return conv |
|
|
|
            assert not (
                is_layer_norm and is_group_norm
            ), "layer norm and group norm are exclusive"
|
|
|
if is_layer_norm: |
|
return nn.Sequential( |
|
make_conv(), |
|
nn.Dropout(p=dropout), |
|
nn.Sequential( |
|
TransposeLast(), |
|
                        # `dim` is the enclosing loop variable, equal to n_out
                        # when block() is called
                        Fp32LayerNorm(dim, elementwise_affine=True),
|
TransposeLast(), |
|
), |
|
nn.GELU(), |
|
) |
|
elif is_group_norm: |
|
return nn.Sequential( |
|
make_conv(), |
|
nn.Dropout(p=dropout), |
|
Fp32GroupNorm(dim, dim, affine=True), |
|
nn.GELU(), |
|
) |
|
else: |
|
return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU()) |
|
|
|
in_d = 1 |
|
self.conv_layers = nn.ModuleList() |
|
for i, cl in enumerate(conv_layers): |
|
assert len(cl) == 3, "invalid conv definition: " + str(cl) |
|
(dim, k, stride) = cl |
|
|
|
self.conv_layers.append( |
|
block( |
|
in_d, |
|
dim, |
|
k, |
|
stride, |
|
is_layer_norm=mode == "layer_norm", |
|
is_group_norm=mode == "default" and i == 0, |
|
conv_bias=conv_bias, |
|
) |
|
) |
|
in_d = dim |
|
|
|
def forward(self, x): |
|
|
|
|
|
x = x.unsqueeze(1) |
|
|
|
for conv in self.conv_layers: |
|
x = conv(x) |
|
|
|
return x |
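
# A minimal usage sketch for ConvFeatureExtractionModel, added as documentation
# only and never called in this file. The two-layer `conv_layers` configuration
# below is an illustrative assumption; real checkpoints define their own stack
# (e.g. a wav2vec 2.0-style seven-layer front-end).
def _demo_conv_feature_extraction():
    extractor = ConvFeatureExtractionModel(
        conv_layers=[(512, 10, 5), (512, 3, 2)], dropout=0.0, mode="default"
    )
    wav = torch.randn(2, 16000)  # (batch, raw audio samples)
    feats = extractor(wav)       # -> (batch, 512, num_frames)
    return feats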
|
|
|
|
|
class TransposeLast(nn.Module): |
|
def __init__(self, deconstruct_idx=None): |
|
super().__init__() |
|
self.deconstruct_idx = deconstruct_idx |
|
|
|
def forward(self, x): |
|
if self.deconstruct_idx is not None: |
|
x = x[self.deconstruct_idx] |
|
return x.transpose(-2, -1) |
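    # Example: a (B, T, C) activation becomes (B, C, T) (and vice versa); if
    # `deconstruct_idx` is set, that element of a tuple/list input is selected
    # before transposing.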
|
|
|
|
|
class Fp32GroupNorm(nn.GroupNorm): |
|
def __init__(self, *args, **kwargs): |
|
super().__init__(*args, **kwargs) |
|
|
|
def forward(self, input): |
|
output = F.group_norm( |
|
input.float(), |
|
self.num_groups, |
|
self.weight.float() if self.weight is not None else None, |
|
self.bias.float() if self.bias is not None else None, |
|
self.eps, |
|
) |
|
return output.type_as(input) |
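    # The statistics are always computed in float32 (input, weight and bias are
    # upcast and the result is cast back to the input dtype), which keeps group
    # normalization numerically stable when the rest of the model runs in fp16.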
|
|
|
|
|
class GradMultiply(torch.autograd.Function): |
|
@staticmethod |
|
def forward(ctx, x, scale): |
|
ctx.scale = scale |
|
res = x.new(x) |
|
return res |
|
|
|
@staticmethod |
|
def backward(ctx, grad): |
|
return grad * ctx.scale, None |
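    # Usage sketch: `y = GradMultiply.apply(x, 0.5)` behaves like the identity in
    # the forward pass (it returns a copy of `x`) and multiplies the incoming
    # gradient by 0.5 in the backward pass, e.g. to down-weight the gradients
    # flowing into the convolutional feature extractor.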
|
|
|
|
|
class Rotate3D(nn.Module): |
|
""" |
|
    Each forward call rotates the leading three dimensions once, so repeated
    application cycles through the layouts:
    (T, B, D) --> (B, D, T) --> (D, T, B) --> (T, B, D)
|
""" |
|
def __init__(self): |
|
super().__init__() |
|
|
|
def forward(self, x): |
|
return x.permute(1, 2, 0) |
|
|
|
|
|
class SamePad(nn.Module): |
|
def __init__(self, kernel_size, causal=False): |
|
super().__init__() |
|
if causal: |
|
self.remove = kernel_size - 1 |
|
else: |
|
self.remove = 1 if kernel_size % 2 == 0 else 0 |
|
|
|
def forward(self, x): |
|
if self.remove > 0: |
|
x = x[:, :, : -self.remove] |
|
return x |
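
# Hedged usage sketch for SamePad, added as documentation only and never called
# here: it is typically chained after an nn.Conv1d whose padding is
# kernel_size // 2 (or kernel_size - 1 in the causal case), trimming the extra
# frame(s) so the output length matches the input length. The layer sizes below
# are illustrative assumptions.
def _demo_same_pad():
    conv = nn.Conv1d(8, 8, kernel_size=4, padding=4 // 2)
    trim = SamePad(kernel_size=4)
    x = torch.randn(2, 8, 100)  # (batch, channels, time)
    y = trim(conv(x))           # conv yields 101 frames; SamePad trims back to 100
    return y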
|
|