Source code for hyperion.torch.narchs.conformer_encoder_v1

"""
 Copyright 2019 Johns Hopkins University  (Author: Jesus Villalba, Nanxin Chen)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

from jsonargparse import ArgumentParser, ActionParser

import torch
import torch.nn as nn

from ..layers import ActivationFactory as AF
from ..layers import NormLayer1dFactory as NLF
from ..layers import PosEncoder, RelPosEncoder, NoPosEncoder
from ..layer_blocks import ConformerEncoderBlockV1 as EBlock
from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler
from .net_arch import NetArch


class ConformerEncoderV1(NetArch):
    """Conformer encoder introduced in https://arxiv.org/pdf/2005.08100.pdf

    This includes some optional extra features
    not included in the original paper:
       - Choose local-attention (attending only to close frames
         instead of all the frames in the sequence)
       - Choose number of conv blocks in each conformer layer
       - Squeeze-Excitation after depthwise-conv
       - Allows downsampling in time dimension
       - Allows choosing activation and layer normalization type
    We call this Conformer+

    This becomes a standard Transformer by setting
    conv_repeats=0, pos_enc_type='abs', ff_macaron=False.

    Attributes:
      in_feats: input features dimension
      d_model: encoder blocks feature dimension
      num_heads: number of heads
      num_blocks: number of self attn blocks
      att_type: string in ['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1']
      att_context: maximum context range for local attention
      conv_repeats: number of conv blocks in each conformer block
                    (int, or list with one value per block)
      conv_kernel_sizes: kernel size for conv blocks
                         (int, or list with one value per block)
      conv_strides: stride for depth-wise conv in the first conv block of
                    each conformer block (int, or list with one value per block)
      ff_type: string in ['linear', 'conv1dx2', 'conv1d-linear']
               NOTE(review): the command line option added by add_class_args
               spells the last choice 'conv1dlinear'; confirm which spelling
               the encoder block expects.
      d_ff: dimension of middle layer in feed_forward block
      ff_kernel_size: kernel size for convolutional versions of ff block
      dropout_rate: dropout rate for ff and conv blocks
      pos_dropout_rate: dropout rate for positional encoder
      att_dropout_rate: dropout rate for attention block
      in_layer_type: input layer block type in
                     ['linear','conv2d-sub', 'embed', None], or an
                     nn.Module instance to be used as input layer
      pos_enc_type: type of positional encoder ['no', 'abs', 'rel']
      causal_pos_enc: if True, use causal positional encodings
                      (when pos_enc_type='rel'), it assumes that query q_i
                      only attends to key k_j when j<=i
      hid_act: hidden activations in ff and input blocks
      conv_norm_layer: norm layer constructor or str for conv block,
                       if None it uses BatchNorm1d
      se_r: Squeeze-Excitation compression ratio,
            if None it doesn't use Squeeze-Excitation
      ff_macaron: if True, it uses macaron-net style ff layers,
                  otherwise transformer style.
      red_lnorms: if True, use redundant LNorm layers at the output of the
                  conformer blocks as in the paper
      concat_after: if True, it concats attention input and output and
                    applies a linear transform,
                    i.e., y = x + linear(concat(x, att(x)))
                    if False, y = x + att(x)
      padding_idx: padding idx for embed layer
      in_time_dim: time dimension in the input Tensor
      out_time_dim: dimension that we want to be time in the output tensor
      rel_pos_enc: (deprecated) accepted for backward compatibility, ignored;
                   use pos_enc_type instead.
      red_lnorm: (deprecated) accepted for backward compatibility, ignored;
                 use red_lnorms instead.
    """

    def __init__(
        self,
        in_feats,
        d_model=256,
        num_heads=4,
        num_blocks=6,
        att_type="scaled-dot-prod-v1",
        att_context=25,
        conv_repeats=1,
        conv_kernel_sizes=31,
        conv_strides=1,
        ff_type="linear",
        d_ff=2048,
        ff_kernel_size=1,
        dropout_rate=0.1,
        pos_dropout_rate=0.1,
        att_dropout_rate=0.0,
        in_layer_type="conv2d-sub",
        pos_enc_type="rel",
        causal_pos_enc=False,
        hid_act="swish",
        conv_norm_layer=None,
        se_r=None,
        ff_macaron=True,
        red_lnorms=False,
        concat_after=False,
        padding_idx=-1,
        in_time_dim=-1,
        out_time_dim=1,
        rel_pos_enc=True,
        red_lnorm=False,
    ):
        """Builds the input layer, the conformer blocks and the output norm.

        See the class docstring for the meaning of each argument.
        ``rel_pos_enc`` and ``red_lnorm`` are deprecated and not used.
        """
        super().__init__()
        self.in_feats = in_feats
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_blocks = num_blocks
        self.att_type = att_type
        self.att_context = att_context

        # Per-block conv settings are always stored as lists of length
        # num_blocks, whatever form the caller used.
        self.conv_repeats = self._standarize_cblocks_param(
            conv_repeats, num_blocks, "conv_repeats"
        )
        self.conv_kernel_sizes = self._standarize_cblocks_param(
            conv_kernel_sizes, num_blocks, "conv_kernel_sizes"
        )
        self.conv_strides = self._standarize_cblocks_param(
            conv_strides, num_blocks, "conv_strides"
        )

        self.ff_type = ff_type
        self.d_ff = d_ff
        self.ff_kernel_size = ff_kernel_size
        self.dropout_rate = dropout_rate
        self.pos_enc_type = pos_enc_type
        self.causal_pos_enc = causal_pos_enc
        self.att_dropout_rate = att_dropout_rate
        self.pos_dropout_rate = pos_dropout_rate
        self.in_layer_type = in_layer_type
        self.se_r = se_r
        self.ff_macaron = ff_macaron
        self.red_lnorms = red_lnorms
        self.concat_after = concat_after
        self.padding_idx = padding_idx
        self.in_time_dim = in_time_dim
        self.out_time_dim = out_time_dim
        self.hid_act = hid_act

        # Keep the user-facing value (str/None/constructor) for get_config();
        # the object returned by the factory is stored separately.
        self.conv_norm_layer = conv_norm_layer
        norm_groups = None
        if conv_norm_layer == "group-norm":
            norm_groups = min(d_model // 2, 32)
        self._conv_norm_layer = NLF.create(conv_norm_layer, norm_groups)

        # Sub-modules are created in the same order as before
        # (input layer, blocks, output norm) so that parameter
        # initialization under a fixed seed is unchanged.
        self._make_in_layer()

        self.blocks = nn.ModuleList(
            [
                EBlock(
                    d_model,
                    att_type,
                    num_heads,
                    self.conv_repeats[i],
                    self.conv_kernel_sizes[i],
                    self.conv_strides[i],
                    ff_type,
                    d_ff,
                    ff_kernel_size,
                    hid_act=hid_act,
                    dropout_rate=dropout_rate,
                    att_context=att_context,
                    att_dropout_rate=att_dropout_rate,
                    pos_enc_type=pos_enc_type,
                    causal_pos_enc=causal_pos_enc,
                    conv_norm_layer=self._conv_norm_layer,
                    se_r=se_r,
                    ff_macaron=ff_macaron,
                    out_lnorm=self.red_lnorms,
                    concat_after=concat_after,
                )
                for i in range(num_blocks)
            ]
        )

        # When the blocks do not normalize their own output, a single
        # LayerNorm is applied after the last block instead.
        if not self.red_lnorms:
            self.norm_out = nn.LayerNorm(d_model)

    @staticmethod
    def _standarize_cblocks_param(p, num_blocks, p_name):
        """Expands a per-block conv parameter to a list of length num_blocks.

        Args:
          p: int, or list/tuple of ints with length 1 or num_blocks.
          num_blocks: number of conformer blocks.
          p_name: parameter name, only used in error messages.

        Returns:
          New list with num_blocks elements.

        Raises:
          AssertionError: if p is a sequence whose length is neither 1
            nor num_blocks.
          TypeError: if p is not an int, list or tuple.
        """
        if isinstance(p, int):
            return [p] * num_blocks

        if isinstance(p, (list, tuple)):
            # Copy so the caller's sequence is never aliased by the model.
            p = list(p)
            if len(p) == 1:
                p = p * num_blocks

            if len(p) != num_blocks:
                # Raised explicitly (not with `assert`) so the check is still
                # active under `python -O`; the exception type is kept for
                # backward compatibility.
                raise AssertionError(
                    "len(%s)(%d)!=%d" % (p_name, len(p), num_blocks)
                )
            return p

        raise TypeError("wrong type for param {}={}".format(p_name, p))

    def _make_in_layer(self):
        """Creates ``self.in_layer``: the input block followed by the
        positional encoder.

        Raises:
          ValueError: if pos_enc_type or in_layer_type are not valid.
        """
        in_feats = self.in_feats
        d_model = self.d_model
        dropout_rate = self.dropout_rate

        if self.pos_enc_type == "no":
            pos_enc = NoPosEncoder()
        elif self.pos_enc_type == "rel":
            pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate)
        elif self.pos_enc_type == "abs":
            pos_enc = PosEncoder(d_model, self.pos_dropout_rate)
        else:
            raise ValueError("wrong pos-enc-type={}".format(self.pos_enc_type))

        hid_act = AF.create(self.hid_act)

        if self.in_layer_type == "linear":
            self.in_layer = nn.Sequential(
                nn.Linear(in_feats, d_model),
                nn.LayerNorm(d_model),
                nn.Dropout(dropout_rate),
                hid_act,
                pos_enc,
            )
        elif self.in_layer_type == "conv2d-sub":
            self.in_layer = Conv2dSubsampler(
                in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim
            )
        elif self.in_layer_type == "embed":
            self.in_layer = nn.Sequential(
                nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx),
                pos_enc,
            )
        elif isinstance(self.in_layer_type, nn.Module):
            # User-provided input module. The attribute must be used here:
            # there is no local variable named `in_layer_type`.
            self.in_layer = nn.Sequential(self.in_layer_type, pos_enc)
        elif self.in_layer_type is None:
            self.in_layer = pos_enc
        else:
            # format() instead of "+" so a non-string value reports the
            # intended ValueError rather than a TypeError.
            raise ValueError(
                "unknown in_layer_type: {}".format(self.in_layer_type)
            )

    def forward(self, x, mask=None, target_shape=None):
        """Forward pass function

        Args:
          x: input tensor with size=(batch, time, num_feats)
          mask: mask to indicate valid time steps for x (batch, time)
          target_shape: not used by this implementation.

        Returns:
          Tensor with output features
          Tensor with mask (only returned when the mask is not None)
        """
        if isinstance(self.in_layer, Conv2dSubsampler):
            # The subsampler receives the mask and returns an updated one.
            x, mask = self.in_layer(x, mask)
        else:
            if self.in_time_dim != 1:
                x = x.transpose(1, self.in_time_dim).contiguous()
            x = self.in_layer(x)

        # Some positional encoders return (features, pos_emb); in that case
        # the embeddings are forwarded to every block.
        if isinstance(x, tuple):
            x, pos_emb = x
            b_args = {"pos_emb": pos_emb}
        else:
            b_args = {}

        for block in self.blocks:
            x, mask = block(x, mask=mask, **b_args)

        if not self.red_lnorms:
            x = self.norm_out(x)

        if self.out_time_dim != 1:
            x = x.transpose(1, self.out_time_dim)

        if mask is None:
            return x

        return x, mask

    def get_config(self):
        """Gets network config

        Returns:
          dictionary with config params
        """
        config = {
            "in_feats": self.in_feats,
            "d_model": self.d_model,
            "num_heads": self.num_heads,
            "num_blocks": self.num_blocks,
            "att_type": self.att_type,
            "att_context": self.att_context,
            "conv_repeats": self.conv_repeats,
            "conv_kernel_sizes": self.conv_kernel_sizes,
            "conv_strides": self.conv_strides,
            "ff_type": self.ff_type,
            "d_ff": self.d_ff,
            "ff_kernel_size": self.ff_kernel_size,
            "dropout_rate": self.dropout_rate,
            "att_dropout_rate": self.att_dropout_rate,
            "pos_dropout_rate": self.pos_dropout_rate,
            "in_layer_type": self.in_layer_type,
            "pos_enc_type": self.pos_enc_type,
            "causal_pos_enc": self.causal_pos_enc,
            "hid_act": self.hid_act,
            "se_r": self.se_r,
            "ff_macaron": self.ff_macaron,
            "red_lnorms": self.red_lnorms,
            "conv_norm_layer": self.conv_norm_layer,
            "concat_after": self.concat_after,
            "padding_idx": self.padding_idx,
            "in_time_dim": self.in_time_dim,
            "out_time_dim": self.out_time_dim,
        }

        base_config = super().get_config()
        # Entries of this class take precedence over the base class ones.
        return {**base_config, **config}

    def in_context(self):
        """Returns the tuple (att_context, att_context)."""
        return (self.att_context, self.att_context)

    def in_shape(self):
        """Input shape for network

        Returns:
          Tuple describing input shape, None marks the dimensions that
          are not fixed.
        """
        if self.in_time_dim == 1:
            return (None, None, self.in_feats)

        return (None, self.in_feats, None)

    def out_shape(self, in_shape=None):
        """Infers the network output shape given the input shape

        Args:
          in_shape: input shape tuple with 3 elements, or None

        Returns:
          Tuple with the output shape
        """
        # NOTE(review): only the conv2d-sub input layer is taken into account
        # here; conv_strides of the blocks are not used -- confirm intended.
        batch_size = None
        out_t = None
        if in_shape is not None:
            assert len(in_shape) == 3
            batch_size = in_shape[0]
            in_t = in_shape[self.in_time_dim]
            if in_t is not None:
                if isinstance(self.in_layer, Conv2dSubsampler):
                    # roughly in_t // 4
                    out_t = ((in_t - 1) // 2 - 1) // 2
                else:
                    out_t = in_t

        if self.out_time_dim == 1:
            return (batch_size, out_t, self.d_model)

        return (batch_size, self.d_model, out_t)

    @staticmethod
    def filter_args(**kwargs):
        """Filters the arguments corresponding to ConformerEncoderV1
            from args dictionary

        The command line flag ``no_ff_macaron``, when present, is converted
        into the constructor argument ``ff_macaron``.

        Args:
          kwargs: args dictionary

        Returns:
          args dictionary
        """
        if "no_ff_macaron" in kwargs:
            kwargs["ff_macaron"] = not kwargs["no_ff_macaron"]

        valid_args = (
            "num_blocks",
            "in_feats",
            "d_model",
            "num_heads",
            "att_type",
            "att_context",
            "conv_repeats",
            "conv_kernel_sizes",
            "conv_strides",
            "ff_type",
            "d_ff",
            "ff_kernel_size",
            "dropout_rate",
            "pos_dropout_rate",
            "att_dropout_rate",
            "in_layer_type",
            "hid_act",
            "pos_enc_type",
            "causal_pos_enc",
            "conv_norm_layer",
            "se_r",
            "ff_macaron",
            "red_lnorms",
            "concat_after",
        )

        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None, in_feats=False):
        """Adds Conformer config parameters to argparser

        Args:
          parser: argparse object
          prefix: prefix string to add to the argument names; when given,
                  the options are added to a nested parser that is attached
                  to ``parser`` as ``--<prefix>``
          in_feats: if True, it also adds the ``--in-feats`` option
        """
        if prefix is not None:
            outer_parser = parser
            parser = ArgumentParser(prog="")

        if in_feats:
            parser.add_argument(
                "--in-feats", type=int, default=80, help=("input feature dimension")
            )

        parser.add_argument(
            "--num-blocks", default=6, type=int, help=("number of tranformer blocks")
        )

        parser.add_argument(
            "--d-model", default=512, type=int, help=("encoder layer sizes")
        )

        parser.add_argument(
            "--num-heads",
            default=4,
            type=int,
            help=("number of heads in self-attention layers"),
        )

        parser.add_argument(
            "--att-type",
            default="scaled-dot-prod-v1",
            choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"],
            help=("type of self-attention"),
        )

        parser.add_argument(
            "--att-context",
            default=25,
            type=int,
            help=("context size when using local attention"),
        )

        parser.add_argument(
            "--conv-repeats",
            default=[1],
            type=int,
            nargs="+",
            help=("number of conv blocks in each conformer block"),
        )

        parser.add_argument(
            "--conv-kernel-sizes",
            default=[31],
            nargs="+",
            type=int,
            help=("kernels sizes for the depth-wise convs of each conformer block"),
        )

        parser.add_argument(
            "--conv-strides",
            default=[1],
            nargs="+",
            type=int,
            help=("resb-blocks strides for each encoder stage"),
        )

        # NOTE(review): the class docstring spells the last choice
        # 'conv1d-linear'; confirm which spelling the encoder block accepts.
        parser.add_argument(
            "--ff-type",
            default="linear",
            choices=["linear", "conv1dx2", "conv1dlinear"],
            help=("type of feed forward layers in transformer block"),
        )

        parser.add_argument(
            "--d-ff",
            default=2048,
            type=int,
            help=("size middle layer in feed forward block"),
        )

        parser.add_argument(
            "--ff-kernel-size",
            default=3,
            type=int,
            help=("kernel size in convolutional feed forward block"),
        )

        # Best effort: the option may already exist in the parser (e.g. added
        # by another module); in that case the existing definition is kept.
        try:
            parser.add_argument("--hid-act", default="swish", help="hidden activation")
        except Exception:
            pass

        parser.add_argument(
            "--pos-dropout-rate",
            default=0.1,
            type=float,
            help="positional encoder dropout",
        )
        parser.add_argument(
            "--att-dropout-rate", default=0, type=float, help="self-att dropout"
        )
        parser.add_argument(
            "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout"
        )

        parser.add_argument(
            "--in-layer-type",
            default="linear",
            choices=["linear", "conv2d-sub"],
            help=("type of input layer"),
        )

        parser.add_argument(
            "--pos-enc-type",
            default="rel",
            choices=["no", "rel", "abs"],
            help=("type of positional encoder"),
        )

        parser.add_argument(
            "--causal-pos-enc",
            default=False,
            action="store_true",
            help="relative positional encodings are zero when attending to the future",
        )

        # Best effort, same reason as for --hid-act above.
        try:
            parser.add_argument(
                "--conv-norm-layer",
                default=None,
                choices=[
                    "batch-norm",
                    "group-norm",
                    "instance-norm",
                    "instance-norm-affine",
                    "layer-norm",
                ],
                help="type of normalization layer for conv block in conformer",
            )
        except Exception:
            pass

        parser.add_argument(
            "--se-r",
            default=None,
            type=int,
            help=("squeeze-excitation compression ratio"),
        )

        parser.add_argument(
            "--no-ff-macaron",
            default=False,
            action="store_true",
            help="do not use macaron style ff layers ",
        )

        parser.add_argument(
            "--red-lnorms",
            default=False,
            action="store_true",
            help="use redundant Lnorm at conformer blocks' outputs",
        )

        parser.add_argument(
            "--concat-after",
            default=False,
            action="store_true",
            help="concatenate attention input and output instead of adding",
        )

        if prefix is not None:
            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))

    # Alias of add_class_args under a second name.
    add_argparse_args = add_class_args