# Source code for hyperion.torch.layer_blocks.conformer_conv

"""
 Copyright 2020 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import torch
import torch.nn as nn

from ..layers import ActivationFactory as AF
from .se_blocks import SEBlock1d


def _conv1(in_channels, out_channels, bias=False):
    """Build a point-wise (kernel size 1) 1d convolution.

    Args:
      in_channels: number of input channels.
      out_channels: number of output channels.
      bias: if True, the convolution gets a learnable bias term.

    Returns:
      nn.Conv1d module with kernel_size=1.
    """
    pointwise = nn.Conv1d(
        in_channels=in_channels,
        out_channels=out_channels,
        kernel_size=1,
        bias=bias,
    )
    return pointwise
def _dwconvk(channels, kernel_size, stride=1, bias=False):
    """Build a depth-wise 1d convolution with zero padding.

    Each channel is filtered independently (groups == channels) and the
    input is padded with (kernel_size - 1) // 2 zeros on each side.

    Args:
      channels: number of input/output channels.
      kernel_size: width of the convolution kernel.
      stride: stride of the convolution in the time dimension.
      bias: if True, the convolution gets a learnable bias term.

    Returns:
      nn.Conv1d module configured as a depth-wise convolution.
    """
    half_width = (kernel_size - 1) // 2
    conv_options = {
        "kernel_size": kernel_size,
        "stride": stride,
        "padding": half_width,
        "groups": channels,
        "bias": bias,
        "padding_mode": "zeros",
    }
    return nn.Conv1d(channels, channels, **conv_options)
def _make_downsample(in_channels, out_channels, stride):
    """Build the strided 1x1 convolution used on the residual branch.

    The convolution subsamples the time dimension by `stride` so that the
    residual matches the length of the strided depth-wise conv output.

    Args:
      in_channels: number of input channels.
      out_channels: number of output channels.
      stride: subsampling factor in the time dimension.

    Returns:
      nn.Conv1d module with kernel_size=1, the given stride and a bias term.
    """
    # The stride has to be passed by keyword to the convolution itself;
    # handing it positionally to a helper whose third parameter is `bias`
    # collides with bias=True and raises TypeError.
    return nn.Conv1d(
        in_channels, out_channels, kernel_size=1, stride=stride, bias=True
    )
class ConformerConvBlock(nn.Module):
    """Convolutional block for conformer introduced at
    https://arxiv.org/pdf/2005.08100.pdf

    This includes some optional extra features
    not included in the original paper:
       - Squeeze-Excitation after depthwise-conv
       - Allows downsampling in time dimension
       - Allows choosing activation and layer normalization type

    Attributes:
      num_channels: number of input/output channels
      kernel_size: kernel_size for depth-wise conv
      stride: stride for depth-wise conv
      activation: activation function str or object
      norm_layer: norm layer constructor, if None it uses BatchNorm
      dropout_rate: dropout rate
      se_r: Squeeze-Excitation compression ratio,
            if None it doesn't use Squeeze-Excitation
    """

    def __init__(
        self,
        num_channels,
        kernel_size,
        stride=1,
        activation="swish",
        norm_layer=None,
        dropout_rate=0,
        se_r=None,
    ):
        super().__init__()
        # Stored as a plain int (not a 1-tuple) so it can be used directly
        # as a dimension by code reading this attribute.
        self.num_channels = num_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.dropout_rate = dropout_rate
        self.act = AF.create(activation)
        self.se_r = se_r
        # Squeeze-Excitation is only enabled for compression ratios above 1.
        self.has_se = se_r is not None and se_r > 1

        if norm_layer is None:
            norm_layer = nn.BatchNorm1d

        # Normalizes over the channel dimension; forward() transposes the
        # input so that channels are the last dimension when it is applied.
        self.layer_norm = nn.LayerNorm(num_channels)

        # expansion phase: doubles the channels, GLU halves them again
        self.conv_exp = _conv1(num_channels, 2 * num_channels, bias=True)

        # depthwise conv phase
        self.conv_dw = _dwconvk(num_channels, kernel_size, stride=stride, bias=False)
        self.norm_dw = norm_layer(num_channels, momentum=0.01, eps=1e-3)
        if self.has_se:
            self.se_layer = SEBlock1d(num_channels, se_r, activation)

        # final projection
        self.conv_proj = _conv1(num_channels, num_channels, bias=True)
        self.dropout = None
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)

        # when the depth-wise conv subsamples in time, the residual is
        # adapted with a strided conv1x1 so both branches have the same length
        self.downsample = None
        if stride != 1:
            self.downsample = _make_downsample(num_channels, num_channels, stride)

        self.context = stride * (kernel_size - 1) // 2

    def forward(self, x):
        """Forward function

        Args:
          x: input size = (batch, num_channels, time)

        Returns
          torch.Tensor size = (batch, num_channels, (time-1)//stride+1)
        """
        residual = x

        # layer norm acts on the last dimension, so move channels there and back
        x = self.layer_norm(x.transpose(1, 2)).transpose(1, 2)

        # expansion + glu
        x = self.conv_exp(x)
        x = nn.functional.glu(x, dim=1)

        # depthwise conv phase
        x = self.act(self.norm_dw(self.conv_dw(x)))

        if self.has_se:
            x = self.se_layer(x)

        # final projection
        x = self.conv_proj(x)

        # Gate on the module itself rather than on dropout_rate, so a
        # dropout_rate changed after construction cannot call a None module.
        if self.dropout is not None:
            x = self.dropout(x)

        if self.downsample is not None:
            residual = self.downsample(residual)

        x += residual
        return x