# Source code for hyperion.torch.layer_blocks.conformer_conv

"""
 Copyright 2020 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import torch
import torch.nn as nn

from ..layers import ActivationFactory as AF
from .se_blocks import SEBlock1d


[docs]def _conv1(in_channels, out_channels, bias=False):
    """1x1 convolution"""
    return nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias)


[docs]def _dwconvk(channels, kernel_size, stride=1, bias=False):
    """kxk depth-wise convolution with padding"""
    return nn.Conv1d(
        channels,
        channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=(kernel_size - 1) // 2,
        groups=channels,
        bias=bias,
        padding_mode="zeros",
    )


def _make_downsample(in_channels, out_channels, stride):
    return _conv1(in_channels, out_channels, stride, bias=True)


class ConformerConvBlock(nn.Module):
    """Convolutional block for conformer introduced at
        https://arxiv.org/pdf/2005.08100.pdf

        This includes some optional extra features
        not included in the original paper:
           - Squeeze-Excitation after depthwise-conv
           - Allows downsampling in time dimension
           - Allows choosing activation and the norm layer used
             after the depthwise-conv

    The forward pass applies, in order: layer norm over the channel axis,
    1x1 expansion conv to 2 * num_channels, GLU over channels, depth-wise
    conv, norm layer, activation, optional Squeeze-Excitation, 1x1 projection
    conv, optional dropout and the residual connection.

    Note: the depth-wise conv pads with (kernel_size - 1) // 2, so an even
    kernel_size shortens its output and it may not match the residual length;
    odd kernel sizes are the safe choice.

    Attributes:
       num_channels : number of input/output channels
       kernel_size: kernel_size for depth-wise conv
       stride: stride for depth-wise conv
       activation: activation function str or object
       norm_layer: norm layer constructor, it is called as
                   norm_layer(num_channels, momentum=0.01, eps=1e-3),
                   if None it uses BatchNorm
       dropout_rate: dropout rate, dropout is only created when it is > 0
       se_r:         Squeeze-Excitation compression ratio,
                     if None (or not > 1) it doesn't use Squeeze-Excitation
    """

    def __init__(
        self,
        num_channels,
        kernel_size,
        stride=1,
        activation="swish",
        norm_layer=None,
        dropout_rate=0,
        se_r=None,
    ):
        super().__init__()
        # Stored as a plain int; a stray trailing comma used to make it a 1-tuple.
        self.num_channels = num_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.dropout_rate = dropout_rate
        self.act = AF.create(activation)
        self.se_r = se_r
        self.has_se = se_r is not None and se_r > 1

        if norm_layer is None:
            norm_layer = nn.BatchNorm1d

        # normalizes over the channel axis, see the transposes in forward()
        self.layer_norm = nn.LayerNorm(num_channels)
        # expansion phase, the GLU in forward() halves the channels back
        self.conv_exp = _conv1(num_channels, 2 * num_channels, bias=True)

        # depthwise conv phase
        self.conv_dw = _dwconvk(num_channels, kernel_size, stride=stride, bias=False)
        self.norm_dw = norm_layer(num_channels, momentum=0.01, eps=1e-3)
        if self.has_se:
            self.se_layer = SEBlock1d(num_channels, se_r, activation)

        # final projection
        self.conv_proj = _conv1(num_channels, num_channels, bias=True)
        self.dropout = None
        if dropout_rate > 0:
            self.dropout = nn.Dropout(dropout_rate)

        # when the block downsamples in time, the residual is adapted with a
        # strided conv1x1 so that it matches the length of the main branch
        self.downsample = None
        if stride != 1:
            self.downsample = nn.Conv1d(
                num_channels, num_channels, kernel_size=1, stride=stride, bias=True
            )

        # NOTE(review): evaluates as (stride * (kernel_size - 1)) // 2;
        # presumably the temporal context of the block -- confirm with callers.
        self.context = stride * (kernel_size - 1) // 2

    def forward(self, x):
        """Forward function

        Args:
          x: input size = (batch, num_channels, time)

        Returns
          torch.Tensor size = (batch, num_channels, (time-1)//stride+1)
        """
        residual = x

        # layer norm, LayerNorm works on the last axis so channels are moved there
        x = self.layer_norm(x.transpose(1, 2)).transpose(1, 2)

        # expansion + glu
        x = self.conv_exp(x)
        x = nn.functional.glu(x, dim=1)

        # depthwise conv phase
        x = self.act(self.norm_dw(self.conv_dw(x)))
        if self.has_se:
            x = self.se_layer(x)

        # final projection
        x = self.conv_proj(x)
        if self.dropout_rate > 0:
            x = self.dropout(x)

        # subsample the shortcut when the main branch was strided
        if self.downsample is not None:
            residual = self.downsample(residual)

        x += residual
        return x