from __future__ import print_function |
|
|
|
import os |
|
import sys |
|
import copy |
|
import math |
|
import yaml |
|
import logging |
|
from typing import Tuple |
|
|
|
import torch |
|
import numpy as np |
|
|
|
from wenet.transformer.embedding import NoPositionalEncoding |
|
from wenet.utils.checkpoint import load_checkpoint |
|
from wenet.utils.init_model import init_model |
|
from wenet.bin.export_onnx_cpu import get_args, to_numpy, print_input_output_info |
|
|
|
|
|
try: |
|
import onnx |
|
import onnxruntime |
|
except ImportError: |
|
print("Please install onnx and onnxruntime!") |
|
sys.exit(1) |
|
|
|
|
|
logger = logging.getLogger(__file__) |
|
logger.setLevel(logging.INFO) |
|
|
|
|
|
class BPULayerNorm(torch.nn.Module): |
|
"""Refactor torch.nn.LayerNorm to meet 4-D dataflow.""" |
|
|
|
def __init__(self, module, chunk_size=8, run_on_bpu=False): |
|
super().__init__() |
|
original = copy.deepcopy(module) |
|
self.hidden = module.weight.size(0) |
|
self.chunk_size = chunk_size |
|
self.run_on_bpu = run_on_bpu |
|
|
|
if self.run_on_bpu: |
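            # LayerNorm over the channel dim rebuilt from BPU-friendly ops:
            # mean_conv_1 (a 1x1 conv whose weights are all 1/hidden) gives
            # the per-position mean, mean_conv_2 averages the squared
            # deviations (the variance), and weight/bias apply the affine
            # transform, so the whole op stays in 4-D NCHW dataflow.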
|
self.weight = torch.nn.Parameter( |
|
module.weight.reshape(1, self.hidden, 1, 1).repeat(1, 1, 1, chunk_size) |
|
) |
|
self.bias = torch.nn.Parameter( |
|
module.bias.reshape(1, self.hidden, 1, 1).repeat(1, 1, 1, chunk_size) |
|
) |
|
            self.negative = torch.nn.Parameter(
|
torch.ones((1, self.hidden, 1, chunk_size)) * -1.0 |
|
) |
|
self.eps = torch.nn.Parameter( |
|
torch.zeros((1, self.hidden, 1, chunk_size)) + module.eps |
|
) |
|
self.mean_conv_1 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) |
|
self.mean_conv_1.weight = torch.nn.Parameter( |
|
torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden) |
|
) |
|
self.mean_conv_2 = torch.nn.Conv2d(self.hidden, 1, 1, bias=False) |
|
self.mean_conv_2.weight = torch.nn.Parameter( |
|
torch.ones(self.hidden, self.hidden, 1, 1) / (1.0 * self.hidden) |
|
) |
|
else: |
|
self.norm = module |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, self.chunk_size, self.hidden) |
|
orig_out = module(random_data) |
|
new_out = self.forward(random_data.transpose(1, 2).unsqueeze(2)) |
|
np.testing.assert_allclose( |
|
to_numpy(orig_out), |
|
to_numpy(new_out.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
if self.run_on_bpu: |
|
u = self.mean_conv_1(x) |
|
            numerator = x + u * self.negative
|
s = torch.pow(numerator, 2) |
|
s = self.mean_conv_2(s) |
|
denominator = torch.sqrt(s + self.eps) |
|
x = torch.div(numerator, denominator) |
|
x = x * self.weight + self.bias |
|
else: |
|
x = x.squeeze(2).transpose(1, 2).contiguous() |
|
x = self.norm(x) |
|
x = x.transpose(1, 2).contiguous().unsqueeze(2) |
|
return x |
|
|
|
|
|
class BPUIdentity(torch.nn.Module): |
|
"""Refactor torch.nn.Identity(). |
|
    Used to insert a BPU node whose input == output.
|
""" |
|
|
|
def __init__(self, channels): |
|
super().__init__() |
|
self.channels = channels |
|
self.identity_conv = torch.nn.Conv2d( |
|
channels, channels, 1, groups=channels, bias=False |
|
) |
|
torch.nn.init.dirac_(self.identity_conv.weight.data, groups=channels) |
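        # A depthwise 1x1 conv initialized with dirac_ is an exact
        # pass-through; it only exists to give the graph a BPU op whose
        # output equals its input.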
|
|
|
self.check_equal() |
|
|
|
def check_equal(self): |
|
random_data = torch.randn(1, self.channels, 1, 10) |
|
result = self.forward(random_data) |
|
np.testing.assert_allclose( |
|
to_numpy(random_data), to_numpy(result), rtol=1e-02, atol=1e-03 |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""Identity with 4-D dataflow, input == output. |
|
Args: |
|
x (torch.Tensor): (batch, in_channel, 1, time) |
|
|
|
Returns: |
|
(torch.Tensor): (batch, in_channel, 1, time). |
|
""" |
|
return self.identity_conv(x) |
|
|
|
|
|
class BPULinear(torch.nn.Module): |
|
"""Refactor torch.nn.Linear or pointwise_conv""" |
|
|
|
def __init__(self, module, is_pointwise_conv=False): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.idim = module.weight.size(1) |
|
self.odim = module.weight.size(0) |
|
self.is_pointwise_conv = is_pointwise_conv |
|
|
|
|
|
self.linear = torch.nn.Conv2d(self.idim, self.odim, 1, 1) |
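        # A Linear (or pointwise Conv1d) maps directly onto a 1x1 Conv2d;
        # only the weight needs reshaping to (odim, idim, 1, 1).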
|
if is_pointwise_conv: |
|
|
|
self.linear.weight = torch.nn.Parameter(module.weight.unsqueeze(-1)) |
|
else: |
|
|
|
self.linear.weight = torch.nn.Parameter( |
|
module.weight.unsqueeze(2).unsqueeze(3) |
|
) |
|
self.linear.bias = module.bias |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, 8, self.idim) |
|
if self.is_pointwise_conv: |
|
random_data = random_data.transpose(1, 2) |
|
original_result = module(random_data) |
|
if self.is_pointwise_conv: |
|
random_data = random_data.transpose(1, 2) |
|
original_result = original_result.transpose(1, 2) |
|
random_data = random_data.transpose(1, 2).unsqueeze(2) |
|
new_result = self.forward(random_data) |
|
np.testing.assert_allclose( |
|
to_numpy(original_result), |
|
to_numpy(new_result.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""Linear with 4-D dataflow. |
|
Args: |
|
x (torch.Tensor): (batch, in_channel, 1, time) |
|
Returns: |
|
(torch.Tensor): (batch, out_channel, 1, time). |
|
""" |
|
return self.linear(x) |
|
|
|
|
|
class BPUGlobalCMVN(torch.nn.Module): |
|
"""Refactor wenet/transformer/cmvn.py::GlobalCMVN""" |
|
|
|
def __init__(self, module): |
|
super().__init__() |
|
|
|
self.norm_var = module.norm_var |
|
|
|
|
|
self.mean = module.mean.unsqueeze(-1).unsqueeze(0).unsqueeze(0) |
|
self.istd = module.istd.unsqueeze(-1).unsqueeze(0).unsqueeze(0) |
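        # (mel_dim,) stats reshaped to (1, 1, mel_dim, 1) so they broadcast
        # over inputs of shape (batch, 1, mel_dim, time).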
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""CMVN with 4-D dataflow. |
|
Args: |
|
x (torch.Tensor): (batch, 1, mel_dim, time) |
|
Returns: |
|
(torch.Tensor): normalized feature with same shape. |
|
""" |
|
x = x - self.mean |
|
if self.norm_var: |
|
x = x * self.istd |
|
return x |
|
|
|
|
|
class BPUConv2dSubsampling8(torch.nn.Module): |
|
"""Refactor wenet/transformer/subsampling.py::Conv2dSubsampling8 |
|
|
|
NOTE(xcsong): Only support pos_enc_class == NoPositionalEncoding |
|
""" |
|
|
|
def __init__(self, module): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.right_context = module.right_context |
|
self.subsampling_rate = module.subsampling_rate |
|
assert isinstance(module.pos_enc, NoPositionalEncoding) |
|
|
|
|
|
|
|
|
|
self.conv = module.conv |
|
for idx in [0, 2, 4]: |
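            # The 4-D input here is (batch, 1, mel_dim, time) instead of the
            # original (batch, 1, time, mel_dim), so swap each kernel's H/W.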
|
self.conv[idx].weight = torch.nn.Parameter( |
|
module.conv[idx].weight.transpose(2, 3) |
|
) |
|
|
|
|
|
|
|
|
|
self.linear = torch.nn.ModuleList() |
|
odim = module.linear.weight.size(0) |
|
freq = module.linear.weight.size(1) // odim |
|
self.odim, self.freq = odim, freq |
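        # The post-subsampling Linear(odim * freq -> odim) is re-expressed as
        # a sum of Conv2d ops over slices of the frequency axis, each at most
        # 7 rows tall (assumed BPU kernel-height limit). forward() adds the
        # partial outputs, so only the first slice keeps the original bias.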
|
weight = module.linear.weight.reshape( |
|
odim, odim, freq, 1 |
|
) |
|
self.split_size = [] |
|
num_split = (freq - 1) // 7 + 1 |
|
slice_begin = 0 |
|
for idx in range(num_split): |
|
kernel_size = min(freq, (idx + 1) * 7) - idx * 7 |
|
conv_ele = torch.nn.Conv2d(odim, odim, (kernel_size, 1), (kernel_size, 1)) |
|
conv_ele.weight = torch.nn.Parameter( |
|
weight[:, :, slice_begin : slice_begin + kernel_size, :] |
|
) |
|
conv_ele.bias = torch.nn.Parameter(torch.zeros_like(conv_ele.bias)) |
|
self.linear.append(conv_ele) |
|
self.split_size.append(kernel_size) |
|
slice_begin += kernel_size |
|
self.linear[0].bias = torch.nn.Parameter(module.linear.bias) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, 67, 80) |
|
mask = torch.zeros(1, 1, 67) |
|
original_result, _, _ = module(random_data, mask) |
|
random_data = random_data.transpose(1, 2).unsqueeze(0) |
|
new_result = self.forward(random_data) |
|
np.testing.assert_allclose( |
|
to_numpy(original_result), |
|
to_numpy(new_result.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""Subsample x with 4-D dataflow. |
|
Args: |
|
x (torch.Tensor): Input tensor (#batch, 1, mel_dim, time). |
|
|
|
Returns: |
|
torch.Tensor: Subsampled tensor (#batch, odim, 1, time'), |
|
where time' = time // 8. |
|
""" |
|
x = self.conv(x) |
|
x_out = torch.zeros(x.size(0), self.odim, 1, x.size(3)) |
|
x = torch.split(x, self.split_size, dim=2) |
|
        for x_part, layer in zip(x, self.linear):
|
x_out += layer(x_part) |
|
return x_out |
|
|
|
|
|
class BPUMultiHeadedAttention(torch.nn.Module): |
|
"""Refactor wenet/transformer/attention.py::MultiHeadedAttention |
|
|
|
NOTE(xcsong): Only support attention_class == MultiHeadedAttention, |
|
we do not consider RelPositionMultiHeadedAttention currently. |
|
""" |
|
|
|
def __init__(self, module, chunk_size, left_chunks): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.d_k = module.d_k |
|
self.h = module.h |
|
n_feat = self.d_k * self.h |
|
self.chunk_size = chunk_size |
|
self.left_chunks = left_chunks |
|
self.time = chunk_size * (left_chunks + 1) |
|
self.activation = torch.nn.Softmax(dim=-1) |
|
|
|
|
|
self.linear_q = BPULinear(module.linear_q) |
|
self.linear_k = BPULinear(module.linear_k) |
|
self.linear_v = BPULinear(module.linear_v) |
|
self.linear_out = BPULinear(module.linear_out) |
|
|
|
self.register_buffer( |
|
"denom", torch.full((1, self.h, 1, 1), 1.0 / math.sqrt(self.d_k)) |
|
) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, self.chunk_size, self.d_k * self.h) |
|
mask = torch.ones((1, self.h, self.chunk_size, self.time), dtype=torch.bool) |
|
cache = torch.zeros(1, self.h, self.chunk_size * self.left_chunks, self.d_k * 2) |
|
original_out, original_cache = module( |
|
random_data, |
|
random_data, |
|
random_data, |
|
mask[:, 0, :, :], |
|
torch.empty(0), |
|
cache, |
|
) |
|
random_data = random_data.transpose(1, 2).unsqueeze(2) |
|
cache = cache.reshape( |
|
1, self.h, self.d_k * 2, self.chunk_size * self.left_chunks |
|
) |
|
new_out, new_cache = self.forward( |
|
random_data, random_data, random_data, mask, cache |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_out), |
|
to_numpy(new_out.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_cache), |
|
to_numpy(new_cache.transpose(2, 3)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward( |
|
self, |
|
q: torch.Tensor, |
|
k: torch.Tensor, |
|
v: torch.Tensor, |
|
mask: torch.Tensor, |
|
cache: torch.Tensor, |
|
) -> Tuple[torch.Tensor, torch.Tensor]: |
|
"""Compute scaled dot product attention. |
|
|
|
Args: |
|
q (torch.Tensor): Query tensor (#batch, size, 1, chunk_size). |
|
k (torch.Tensor): Key tensor (#batch, size, 1, chunk_size). |
|
v (torch.Tensor): Value tensor (#batch, size, 1, chunk_size). |
|
mask (torch.Tensor): Mask tensor, |
|
(#batch, head, chunk_size, cache_t + chunk_size). |
|
cache (torch.Tensor): Cache tensor |
|
(1, head, d_k * 2, cache_t), |
|
where `cache_t == chunk_size * left_chunks`. |
|
|
|
|
|
Returns: |
|
torch.Tensor: Output tensor (#batch, size, 1, chunk_size). |
|
torch.Tensor: Cache tensor |
|
(1, head, d_k * 2, cache_t + chunk_size) |
|
where `cache_t == chunk_size * left_chunks` |
|
""" |
|
|
|
q = self.linear_q(q) |
|
k = self.linear_k(k) |
|
v = self.linear_v(v) |
|
q = q.view(1, self.h, self.d_k, self.chunk_size) |
|
k = k.view(1, self.h, self.d_k, self.chunk_size) |
|
v = v.view(1, self.h, self.d_k, self.chunk_size) |
|
q = q.transpose(2, 3) |
|
k_cache, v_cache = torch.split(cache, cache.size(2) // 2, dim=2) |
|
k = torch.cat((k_cache, k), dim=3) |
|
v = torch.cat((v_cache, v), dim=3) |
|
new_cache = torch.cat((k, v), dim=2) |
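        # The cache stores K and V stacked along dim 2:
        # (1, head, d_k * 2, cache_t + chunk_size).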
|
|
|
scores = torch.matmul(q, k) * self.denom |
|
|
|
mask = mask.eq(0) |
|
scores = scores.masked_fill(mask, -float("inf")) |
|
attn = self.activation(scores).masked_fill(mask, 0.0) |
|
attn = attn.transpose(2, 3) |
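        # Compute V @ P^T instead of P @ V so the result keeps the
        # channels-first layout (1, head, d_k, chunk_size).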
|
x = torch.matmul(v, attn) |
|
x = x.view(1, self.d_k * self.h, 1, self.chunk_size) |
|
x_out = self.linear_out(x) |
|
return x_out, new_cache |
|
|
|
|
|
class BPUConvolution(torch.nn.Module): |
|
"""Refactor wenet/transformer/convolution.py::ConvolutionModule |
|
|
|
    NOTE(xcsong): Only support use_layer_norm == False
|
""" |
|
|
|
def __init__(self, module): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.lorder = module.lorder |
|
self.use_layer_norm = False |
|
self.activation = module.activation |
|
channels = module.pointwise_conv1.weight.size(1) |
|
self.channels = channels |
|
kernel_size = module.depthwise_conv.weight.size(2) |
|
assert module.use_layer_norm is False |
|
|
|
|
|
self.pointwise_conv1 = BPULinear(module.pointwise_conv1, True) |
|
|
|
|
|
self.depthwise_conv = torch.nn.Conv2d( |
|
channels, channels, (1, kernel_size), stride=1, groups=channels |
|
) |
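        # The original depthwise Conv1d kernel (channels, 1, kernel_size)
        # becomes a (1, kernel_size) Conv2d kernel acting on the time axis.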
|
self.depthwise_conv.weight = torch.nn.Parameter( |
|
module.depthwise_conv.weight.unsqueeze(-2) |
|
) |
|
self.depthwise_conv.bias = torch.nn.Parameter(module.depthwise_conv.bias) |
|
|
|
|
|
self.norm = torch.nn.BatchNorm2d(channels) |
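        # Rebuild the trained BatchNorm1d as an eval-mode BatchNorm2d,
        # copying the affine parameters and running statistics so the
        # normalization itself is unchanged.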
|
self.norm.training = False |
|
self.norm.num_features = module.norm.num_features |
|
self.norm.eps = module.norm.eps |
|
self.norm.momentum = module.norm.momentum |
|
self.norm.weight = torch.nn.Parameter(module.norm.weight) |
|
self.norm.bias = torch.nn.Parameter(module.norm.bias) |
|
self.norm.running_mean = module.norm.running_mean |
|
self.norm.running_var = module.norm.running_var |
|
|
|
|
|
self.pointwise_conv2 = BPULinear(module.pointwise_conv2, True) |
|
|
|
|
|
self.identity = BPUIdentity(channels) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, 8, self.channels) |
|
cache = torch.zeros((1, self.channels, self.lorder)) |
|
original_out, original_cache = module(random_data, cache=cache) |
|
random_data = random_data.transpose(1, 2).unsqueeze(2) |
|
cache = cache.unsqueeze(2) |
|
new_out, new_cache = self.forward(random_data, cache) |
|
np.testing.assert_allclose( |
|
to_numpy(original_out), |
|
to_numpy(new_out.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_cache), |
|
to_numpy(new_cache.squeeze(2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward( |
|
self, x: torch.Tensor, cache: torch.Tensor |
|
) -> Tuple[torch.Tensor, torch.Tensor]: |
|
"""Compute convolution module. |
|
Args: |
|
x (torch.Tensor): Input tensor (#batch, channels, 1, chunk_size). |
|
cache (torch.Tensor): left context cache, it is only |
|
used in causal convolution (#batch, channels, 1, cache_t). |
|
Returns: |
|
torch.Tensor: Output tensor (#batch, channels, 1, chunk_size). |
|
torch.Tensor: Cache tensor (#batch, channels, 1, cache_t). |
|
""" |
|
|
|
x = torch.cat((self.identity(cache), self.identity(x)), dim=3) |
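        # Keep the last `lorder` frames of (cache + chunk) as the causal
        # left context for the next chunk.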
|
new_cache = x[:, :, :, -self.lorder :] |
|
|
|
|
|
x = self.pointwise_conv1(x) |
|
x = torch.nn.functional.glu(x, dim=1) |
|
|
|
|
|
x = self.depthwise_conv(x) |
|
x = self.activation(self.norm(x)) |
|
x = self.pointwise_conv2(x) |
|
return x, new_cache |
|
|
|
|
|
class BPUFFN(torch.nn.Module): |
|
"""Refactor wenet/transformer/positionwise_feed_forward.py::PositionwiseFeedForward""" |
|
|
|
def __init__(self, module): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.activation = module.activation |
|
|
|
|
|
self.w_1 = BPULinear(module.w_1) |
|
self.w_2 = BPULinear(module.w_2) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, 8, self.w_1.idim) |
|
original_out = module(random_data) |
|
random_data = random_data.transpose(1, 2).unsqueeze(2) |
|
new_out = self.forward(random_data) |
|
np.testing.assert_allclose( |
|
to_numpy(original_out), |
|
to_numpy(new_out.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""Forward function. |
|
|
|
Args: |
|
xs: input tensor (B, D, 1, L) |
|
Returns: |
|
output tensor, (B, D, 1, L) |
|
""" |
|
return self.w_2(self.activation(self.w_1(x))) |
|
|
|
|
|
class BPUConformerEncoderLayer(torch.nn.Module): |
|
"""Refactor wenet/transformer/encoder_layer.py::ConformerEncoderLayer""" |
|
|
|
def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.size = module.size |
|
assert module.normalize_before is True |
|
assert module.concat_after is False |
|
|
|
|
|
self.feed_forward_macaron = BPUFFN(module.feed_forward_macaron) |
|
self.self_attn = BPUMultiHeadedAttention( |
|
module.self_attn, chunk_size, left_chunks |
|
) |
|
self.conv_module = BPUConvolution(module.conv_module) |
|
self.feed_forward = BPUFFN(module.feed_forward) |
|
|
|
|
|
self.norm_ff = BPULayerNorm(module.norm_ff, chunk_size, ln_run_on_bpu) |
|
self.norm_mha = BPULayerNorm(module.norm_mha, chunk_size, ln_run_on_bpu) |
|
        self.norm_ff_macaron = BPULayerNorm(
|
module.norm_ff_macaron, chunk_size, ln_run_on_bpu |
|
) |
|
self.norm_conv = BPULayerNorm(module.norm_conv, chunk_size, ln_run_on_bpu) |
|
self.norm_final = BPULayerNorm(module.norm_final, chunk_size, ln_run_on_bpu) |
|
|
|
|
|
self.register_buffer( |
|
"ff_scale", torch.full((1, self.size, 1, 1), module.ff_scale) |
|
) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
time1 = self.self_attn.chunk_size |
|
time2 = self.self_attn.time |
|
h, d_k = self.self_attn.h, self.self_attn.d_k |
|
random_x = torch.randn(1, time1, self.size) |
|
att_mask = torch.ones(1, h, time1, time2) |
|
att_cache = torch.zeros(1, h, time2 - time1, d_k * 2) |
|
cnn_cache = torch.zeros(1, self.size, self.conv_module.lorder) |
|
original_x, _, original_att_cache, original_cnn_cache = module( |
|
random_x, |
|
att_mask[:, 0, :, :], |
|
torch.empty(0), |
|
att_cache=att_cache, |
|
cnn_cache=cnn_cache, |
|
) |
|
random_x = random_x.transpose(1, 2).unsqueeze(2) |
|
att_cache = att_cache.reshape(1, h, d_k * 2, time2 - time1) |
|
cnn_cache = cnn_cache.unsqueeze(2) |
|
new_x, new_att_cache, new_cnn_cache = self.forward( |
|
random_x, att_mask, att_cache, cnn_cache |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_att_cache), |
|
to_numpy(new_att_cache.transpose(2, 3)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_x), |
|
to_numpy(new_x.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(original_cnn_cache), |
|
to_numpy(new_cnn_cache.squeeze(2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward( |
|
self, |
|
x: torch.Tensor, |
|
att_mask: torch.Tensor, |
|
att_cache: torch.Tensor, |
|
cnn_cache: torch.Tensor, |
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: |
|
"""Compute encoded features. |
|
|
|
Args: |
|
x (torch.Tensor): (#batch, size, 1, chunk_size) |
|
att_mask (torch.Tensor): Mask tensor for the input |
|
(#batch, head, chunk_size, cache_t1 + chunk_size), |
|
att_cache (torch.Tensor): Cache tensor of the KEY & VALUE |
|
(#batch=1, head, d_k * 2, cache_t1), head * d_k == size. |
|
cnn_cache (torch.Tensor): Convolution cache in conformer layer |
|
(#batch=1, size, 1, cache_t2) |
|
Returns: |
|
torch.Tensor: Output tensor (#batch, size, 1, chunk_size). |
|
torch.Tensor: att_cache tensor, |
|
(1, head, d_k * 2, cache_t1 + chunk_size). |
|
            torch.Tensor: cnn_cache tensor (#batch, size, 1, cache_t2).
|
""" |
|
|
|
residual = x |
|
        x = self.norm_ff_macaron(x)
|
x = residual + self.ff_scale * self.feed_forward_macaron(x) |
|
|
|
|
|
residual = x |
|
x = self.norm_mha(x) |
|
x_att, new_att_cache = self.self_attn(x, x, x, att_mask, att_cache) |
|
x = residual + x_att |
|
|
|
|
|
residual = x |
|
x = self.norm_conv(x) |
|
x, new_cnn_cache = self.conv_module(x, cnn_cache) |
|
x = residual + x |
|
|
|
|
|
residual = x |
|
x = self.norm_ff(x) |
|
x = residual + self.ff_scale * self.feed_forward(x) |
|
|
|
|
|
x = self.norm_final(x) |
|
|
|
return x, new_att_cache, new_cnn_cache |
|
|
|
|
|
class BPUConformerEncoder(torch.nn.Module): |
|
"""Refactor wenet/transformer/encoder.py::ConformerEncoder""" |
|
|
|
def __init__(self, module, chunk_size, left_chunks, ln_run_on_bpu=False): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
output_size = module.output_size() |
|
self._output_size = module.output_size() |
|
self.after_norm = module.after_norm |
|
self.chunk_size = chunk_size |
|
self.left_chunks = left_chunks |
|
self.head = module.encoders[0].self_attn.h |
|
self.layers = len(module.encoders) |
|
|
|
|
|
self.global_cmvn = BPUGlobalCMVN(module.global_cmvn) |
|
self.embed = BPUConv2dSubsampling8(module.embed) |
|
self.encoders = torch.nn.ModuleList() |
|
for layer in module.encoders: |
|
self.encoders.append( |
|
BPUConformerEncoderLayer(layer, chunk_size, left_chunks, ln_run_on_bpu) |
|
) |
|
|
|
|
|
self.identity_cnncache = BPUIdentity(output_size) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
time1 = self.encoders[0].self_attn.chunk_size |
|
time2 = self.encoders[0].self_attn.time |
|
layers = self.layers |
|
h, d_k = self.head, self.encoders[0].self_attn.d_k |
|
decoding_window = ( |
|
(self.chunk_size - 1) * module.embed.subsampling_rate |
|
+ module.embed.right_context |
|
+ 1 |
|
) |
|
lorder = self.encoders[0].conv_module.lorder |
|
random_x = torch.randn(1, decoding_window, 80) |
|
att_mask = torch.ones(1, h, time1, time2) |
|
att_cache = torch.zeros(layers, h, time2 - time1, d_k * 2) |
|
cnn_cache = torch.zeros(layers, 1, self._output_size, lorder) |
|
orig_x, orig_att_cache, orig_cnn_cache = module.forward_chunk( |
|
random_x, |
|
0, |
|
time2 - time1, |
|
att_mask=att_mask[:, 0, :, :], |
|
att_cache=att_cache, |
|
cnn_cache=cnn_cache, |
|
) |
|
random_x = random_x.unsqueeze(0) |
|
att_cache = att_cache.reshape(1, h * layers, d_k * 2, time2 - time1) |
|
cnn_cache = cnn_cache.reshape(1, self._output_size, layers, lorder) |
|
new_x, new_att_cache, new_cnn_cache = self.forward( |
|
random_x, att_cache, cnn_cache, att_mask |
|
) |
|
caches = torch.split(new_att_cache, h, dim=1) |
|
caches = [c.transpose(2, 3) for c in caches] |
|
np.testing.assert_allclose( |
|
to_numpy(orig_att_cache), |
|
to_numpy(torch.cat(caches, dim=0)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(orig_x), |
|
to_numpy(new_x.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
np.testing.assert_allclose( |
|
to_numpy(orig_cnn_cache), |
|
to_numpy(new_cnn_cache.transpose(0, 2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward( |
|
self, |
|
xs: torch.Tensor, |
|
att_cache: torch.Tensor, |
|
cnn_cache: torch.Tensor, |
|
att_mask: torch.Tensor, |
|
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: |
|
""" Forward just one chunk |
|
|
|
Args: |
|
xs (torch.Tensor): chunk input, with shape (b=1, 1, time, mel-dim), |
|
where `time == (chunk_size - 1) * subsample_rate + \ |
|
subsample.right_context + 1` |
|
att_cache (torch.Tensor): cache tensor for KEY & VALUE in |
|
transformer/conformer attention, with shape |
|
(1, head * elayers, d_k * 2, cache_t1), where |
|
`head * d_k == hidden-dim` and |
|
`cache_t1 == chunk_size * left_chunks`. |
|
cnn_cache (torch.Tensor): cache tensor for cnn_module in conformer, |
|
(1, hidden-dim, elayers, cache_t2), where |
|
                `cache_t2 == cnn.lorder`
|
att_mask (torch.Tensor): Mask tensor for the input |
|
(#batch, head, chunk_size, cache_t1 + chunk_size), |
|
|
|
Returns: |
|
torch.Tensor: output of current input xs, |
|
with shape (b=1, hidden-dim, 1, chunk_size). |
|
torch.Tensor: new attention cache required for next chunk, with |
|
same shape as the original att_cache. |
|
torch.Tensor: new conformer cnn cache required for next chunk, with |
|
same shape as the original cnn_cache. |
|
""" |
|
|
|
xs = xs.transpose(2, 3) |
|
xs = self.global_cmvn(xs) |
|
|
|
xs = self.embed(xs) |
|
|
|
att_cache = torch.split(att_cache, self.head, dim=1) |
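        # att_cache stacks all layers along dim 1 (head * elayers); split it
        # back into one (1, head, d_k * 2, cache_t1) tensor per layer.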
|
cnn_cache = self.identity_cnncache(cnn_cache) |
|
cnn_cache = torch.split(cnn_cache, 1, dim=2) |
|
r_att_cache = [] |
|
r_cnn_cache = [] |
|
for i, layer in enumerate(self.encoders): |
|
xs, new_att_cache, new_cnn_cache = layer( |
|
xs, att_mask, att_cache=att_cache[i], cnn_cache=cnn_cache[i] |
|
) |
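            # Drop the oldest chunk_size frames so the attention cache keeps
            # exactly cache_t1 frames for the next chunk.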
|
r_att_cache.append(new_att_cache[:, :, :, self.chunk_size :]) |
|
r_cnn_cache.append(new_cnn_cache) |
|
r_att_cache = torch.cat(r_att_cache, dim=1) |
|
r_cnn_cache = self.identity_cnncache(torch.cat(r_cnn_cache, dim=2)) |
|
|
|
xs = xs.squeeze(2).transpose(1, 2).contiguous() |
|
xs = self.after_norm(xs) |
|
|
|
xs = xs.transpose(1, 2).contiguous().unsqueeze(2) |
|
|
|
return (xs, r_att_cache, r_cnn_cache) |
|
|
|
|
|
class BPUCTC(torch.nn.Module): |
|
"""Refactor wenet/transformer/ctc.py::CTC""" |
|
|
|
def __init__(self, module): |
|
super().__init__() |
|
|
|
original = copy.deepcopy(module) |
|
self.idim = module.ctc_lo.weight.size(1) |
|
num_class = module.ctc_lo.weight.size(0) |
|
|
|
|
|
|
|
self.ctc_lo = torch.nn.ModuleList() |
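        # Split the CTC output projection into 1x1 convs of at most 2048
        # output channels each (an assumed per-op channel limit on the BPU);
        # forward() concatenates the pieces along the class dimension.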
|
self.split_size = [] |
|
num_split = (num_class - 1) // 2048 + 1 |
|
for idx in range(num_split): |
|
out_channel = min(num_class, (idx + 1) * 2048) - idx * 2048 |
|
conv_ele = torch.nn.Conv2d(self.idim, out_channel, 1, 1) |
|
self.ctc_lo.append(conv_ele) |
|
self.split_size.append(out_channel) |
|
orig_weight = torch.split(module.ctc_lo.weight, self.split_size, dim=0) |
|
orig_bias = torch.split(module.ctc_lo.bias, self.split_size, dim=0) |
|
for i, (w, b) in enumerate(zip(orig_weight, orig_bias)): |
|
w = w.unsqueeze(2).unsqueeze(3) |
|
self.ctc_lo[i].weight = torch.nn.Parameter(w) |
|
self.ctc_lo[i].bias = torch.nn.Parameter(b) |
|
|
|
self.check_equal(original) |
|
|
|
def check_equal(self, module): |
|
random_data = torch.randn(1, 100, self.idim) |
|
original_result = module.ctc_lo(random_data) |
|
random_data = random_data.transpose(1, 2).unsqueeze(2) |
|
new_result = self.forward(random_data) |
|
np.testing.assert_allclose( |
|
to_numpy(original_result), |
|
to_numpy(new_result.squeeze(2).transpose(1, 2)), |
|
rtol=1e-02, |
|
atol=1e-03, |
|
) |
|
|
|
def forward(self, x: torch.Tensor) -> torch.Tensor: |
|
"""frame activations, without softmax. |
|
|
|
Args: |
|
Tensor x: 4d tensor (B, hidden_dim, 1, chunk_size) |
|
Returns: |
|
torch.Tensor: (B, num_class, 1, chunk_size) |
|
""" |
|
out = [] |
|
        for layer in self.ctc_lo:
|
out.append(layer(x)) |
|
out = torch.cat(out, dim=1) |
|
return out |
|
|
|
|
|
def export_encoder(asr_model, args): |
|
logger.info("Stage-1: export encoder") |
|
decode_window, mel_dim = args.decoding_window, args.feature_size |
|
encoder = BPUConformerEncoder( |
|
asr_model.encoder, |
|
args.chunk_size, |
|
args.num_decoding_left_chunks, |
|
args.ln_run_on_bpu, |
|
) |
|
encoder.eval() |
|
encoder_outpath = os.path.join(args.output_dir, "encoder.onnx") |
|
|
|
logger.info("Stage-1.1: prepare inputs for encoder") |
|
chunk = torch.randn((1, 1, decode_window, mel_dim)) |
|
required_cache_size = encoder.chunk_size * encoder.left_chunks |
|
kv_time = required_cache_size + encoder.chunk_size |
|
hidden, layers = encoder._output_size, len(encoder.encoders) |
|
head = encoder.encoders[0].self_attn.h |
|
d_k = hidden // head |
|
lorder = encoder.encoders[0].conv_module.lorder |
|
att_cache = torch.zeros(1, layers * head, d_k * 2, required_cache_size) |
|
att_mask = torch.ones((1, head, encoder.chunk_size, kv_time)) |
|
att_mask[:, :, :, :required_cache_size] = 0 |
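    # The attention cache starts empty, so mask out all cached positions;
    # the consistency check below re-enables them chunk by chunk.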
|
cnn_cache = torch.zeros((1, hidden, layers, lorder)) |
|
inputs = (chunk, att_cache, cnn_cache, att_mask) |
|
logger.info( |
|
"chunk.size(): {} att_cache.size(): {} " |
|
"cnn_cache.size(): {} att_mask.size(): {}".format( |
|
list(chunk.size()), |
|
list(att_cache.size()), |
|
list(cnn_cache.size()), |
|
list(att_mask.size()), |
|
) |
|
) |
|
|
|
logger.info("Stage-1.2: torch.onnx.export") |
|
|
|
|
|
attributes = {} |
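    # These attributes are written into the ONNX metadata below; presumably
    # they describe the fixed input layout/shapes for the Horizon BPU
    # conversion toolchain rather than anything used by onnxruntime itself.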
|
attributes["input_name"] = "chunk;att_cache;cnn_cache;att_mask" |
|
attributes["output_name"] = "output;r_att_cache;r_cnn_cache" |
|
attributes["input_type"] = "featuremap;featuremap;featuremap;featuremap" |
|
attributes["norm_type"] = "no_preprocess;no_preprocess;no_preprocess;no_preprocess" |
|
attributes["input_layout_train"] = "NCHW;NCHW;NCHW;NCHW" |
|
attributes["input_layout_rt"] = "NCHW;NCHW;NCHW;NCHW" |
|
attributes[ |
|
"input_shape" |
|
] = "{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{};{}x{}x{}x{}".format( |
|
chunk.size(0), |
|
chunk.size(1), |
|
chunk.size(2), |
|
chunk.size(3), |
|
att_cache.size(0), |
|
att_cache.size(1), |
|
att_cache.size(2), |
|
att_cache.size(3), |
|
cnn_cache.size(0), |
|
cnn_cache.size(1), |
|
cnn_cache.size(2), |
|
cnn_cache.size(3), |
|
att_mask.size(0), |
|
att_mask.size(1), |
|
att_mask.size(2), |
|
att_mask.size(3), |
|
) |
|
torch.onnx.export( |
|
encoder, |
|
inputs, |
|
encoder_outpath, |
|
opset_version=11, |
|
export_params=True, |
|
do_constant_folding=True, |
|
input_names=attributes["input_name"].split(";"), |
|
output_names=attributes["output_name"].split(";"), |
|
dynamic_axes=None, |
|
verbose=False, |
|
) |
|
onnx_encoder = onnx.load(encoder_outpath) |
|
for k in vars(args): |
|
meta = onnx_encoder.metadata_props.add() |
|
meta.key, meta.value = str(k), str(getattr(args, k)) |
|
for k in attributes: |
|
meta = onnx_encoder.metadata_props.add() |
|
meta.key, meta.value = str(k), str(attributes[k]) |
|
onnx.checker.check_model(onnx_encoder) |
|
onnx.helper.printable_graph(onnx_encoder.graph) |
|
onnx.save(onnx_encoder, encoder_outpath) |
|
print_input_output_info(onnx_encoder, "onnx_encoder") |
|
logger.info("Export onnx_encoder, done! see {}".format(encoder_outpath)) |
|
|
|
logger.info("Stage-1.3: check onnx_encoder and torch_encoder") |
|
torch_output = [] |
|
torch_chunk, torch_att_mask = copy.deepcopy(chunk), copy.deepcopy(att_mask) |
|
torch_att_cache = copy.deepcopy(att_cache) |
|
torch_cnn_cache = copy.deepcopy(cnn_cache) |
|
for i in range(10): |
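        # Simulate streaming: feed the same chunk ten times, letting the
        # attention mask expose one more cached chunk on every iteration.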
|
logger.info( |
|
"torch chunk-{}: {}, att_cache: {}, cnn_cache: {}" |
|
", att_mask: {}".format( |
|
i, |
|
list(torch_chunk.size()), |
|
list(torch_att_cache.size()), |
|
list(torch_cnn_cache.size()), |
|
list(torch_att_mask.size()), |
|
) |
|
) |
|
torch_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)) :] = 1 |
|
out, torch_att_cache, torch_cnn_cache = encoder( |
|
torch_chunk, torch_att_cache, torch_cnn_cache, torch_att_mask |
|
) |
|
torch_output.append(out) |
|
torch_output = torch.cat(torch_output, dim=-1) |
|
|
|
onnx_output = [] |
|
onnx_chunk, onnx_att_mask = to_numpy(chunk), to_numpy(att_mask) |
|
onnx_att_cache = to_numpy(att_cache) |
|
onnx_cnn_cache = to_numpy(cnn_cache) |
|
ort_session = onnxruntime.InferenceSession(encoder_outpath) |
|
input_names = [node.name for node in onnx_encoder.graph.input] |
|
for i in range(10): |
|
logger.info( |
|
"onnx chunk-{}: {}, att_cache: {}, cnn_cache: {}," |
|
" att_mask: {}".format( |
|
i, |
|
onnx_chunk.shape, |
|
onnx_att_cache.shape, |
|
onnx_cnn_cache.shape, |
|
onnx_att_mask.shape, |
|
) |
|
) |
|
onnx_att_mask[:, :, :, -(encoder.chunk_size * (i + 1)) :] = 1 |
|
ort_inputs = { |
|
"chunk": onnx_chunk, |
|
"att_cache": onnx_att_cache, |
|
"cnn_cache": onnx_cnn_cache, |
|
"att_mask": onnx_att_mask, |
|
} |
|
ort_outs = ort_session.run(None, ort_inputs) |
|
onnx_att_cache, onnx_cnn_cache = ort_outs[1], ort_outs[2] |
|
onnx_output.append(ort_outs[0]) |
|
onnx_output = np.concatenate(onnx_output, axis=-1) |
|
|
|
np.testing.assert_allclose( |
|
to_numpy(torch_output), onnx_output, rtol=1e-03, atol=1e-04 |
|
) |
|
meta = ort_session.get_modelmeta() |
|
logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) |
|
logger.info("Check onnx_encoder, pass!") |
|
return encoder, ort_session |
|
|
|
|
|
def export_ctc(asr_model, args): |
|
logger.info("Stage-2: export ctc") |
|
ctc = BPUCTC(asr_model.ctc).eval() |
|
ctc_outpath = os.path.join(args.output_dir, "ctc.onnx") |
|
|
|
logger.info("Stage-2.1: prepare inputs for ctc") |
|
hidden = torch.randn((1, args.output_size, 1, args.chunk_size)) |
|
|
|
logger.info("Stage-2.2: torch.onnx.export") |
|
|
|
|
|
attributes = {} |
|
attributes["input_name"], attributes["input_type"] = "hidden", "featuremap" |
|
attributes["norm_type"] = "no_preprocess" |
|
attributes["input_layout_train"] = "NCHW" |
|
attributes["input_layout_rt"] = "NCHW" |
|
attributes["input_shape"] = "{}x{}x{}x{}".format( |
|
hidden.size(0), |
|
hidden.size(1), |
|
hidden.size(2), |
|
hidden.size(3), |
|
) |
|
torch.onnx.export( |
|
ctc, |
|
hidden, |
|
ctc_outpath, |
|
opset_version=11, |
|
export_params=True, |
|
do_constant_folding=True, |
|
input_names=["hidden"], |
|
output_names=["probs"], |
|
dynamic_axes=None, |
|
verbose=False, |
|
) |
|
onnx_ctc = onnx.load(ctc_outpath) |
|
for k in vars(args): |
|
meta = onnx_ctc.metadata_props.add() |
|
meta.key, meta.value = str(k), str(getattr(args, k)) |
|
for k in attributes: |
|
meta = onnx_ctc.metadata_props.add() |
|
meta.key, meta.value = str(k), str(attributes[k]) |
|
onnx.checker.check_model(onnx_ctc) |
|
onnx.helper.printable_graph(onnx_ctc.graph) |
|
onnx.save(onnx_ctc, ctc_outpath) |
|
print_input_output_info(onnx_ctc, "onnx_ctc") |
|
logger.info("Export onnx_ctc, done! see {}".format(ctc_outpath)) |
|
|
|
logger.info("Stage-2.3: check onnx_ctc and torch_ctc") |
|
torch_output = ctc(hidden) |
|
ort_session = onnxruntime.InferenceSession(ctc_outpath) |
|
onnx_output = ort_session.run(None, {"hidden": to_numpy(hidden)}) |
|
|
|
np.testing.assert_allclose( |
|
to_numpy(torch_output), onnx_output[0], rtol=1e-03, atol=1e-04 |
|
) |
|
meta = ort_session.get_modelmeta() |
|
logger.info("custom_metadata_map={}".format(meta.custom_metadata_map)) |
|
logger.info("Check onnx_ctc, pass!") |
|
return ctc, ort_session |
|
|
|
|
|
def export_decoder(asr_model, args): |
|
logger.info("Currently, Decoder is not supported.") |
|
|
|
|
|
if __name__ == "__main__": |
|
torch.manual_seed(777) |
|
args = get_args() |
|
args.ln_run_on_bpu = False |
|
|
|
assert args.chunk_size > 0 |
|
assert args.num_decoding_left_chunks > 0 |
|
    os.makedirs(args.output_dir, exist_ok=True)
|
os.environ["CUDA_VISIBLE_DEVICES"] = "-1" |
|
|
|
with open(args.config, "r") as fin: |
|
configs = yaml.load(fin, Loader=yaml.FullLoader) |
|
|
|
model = init_model(configs) |
|
load_checkpoint(model, args.checkpoint) |
|
model.eval() |
|
print(model) |
|
|
|
args.feature_size = configs["input_dim"] |
|
args.output_size = model.encoder.output_size() |
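    # Number of raw feature frames needed to produce chunk_size frames
    # after subsampling.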
|
args.decoding_window = ( |
|
(args.chunk_size - 1) * model.encoder.embed.subsampling_rate |
|
+ model.encoder.embed.right_context |
|
+ 1 |
|
) |
|
|
|
export_encoder(model, args) |
|
export_ctc(model, args) |
|
export_decoder(model, args) |
|
|