# Source code for hyperion.feats.energy_vad

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""
import logging

import numpy as np
from scipy.signal import lfilter

from ..hyp_defs import float_cpu
from ..utils.misc import str2bool
from .stft import st_logE


class EnergyVAD(object):
    """Compute VAD based on Kaldi Energy VAD method.

    Attributes:
      sample_frequency: Waveform data sample frequency (must match the
        waveform file, if specified there) (default = 16000)
      frame_length: Frame length in milliseconds (default = 25)
      frame_shift: Frame shift in milliseconds (default = 10)
      dither: Dithering constant (0.0 means no dither) (default = 1)
      snip_edges: If true, end effects will be handled by outputting only
        frames that completely fit in the file, and the number of frames
        depends on the frame-length. If false, the number of frames depends
        only on the frame-shift, and we reflect the data at the ends.
        (default = True)
      vad_energy_mean_scale: If this is set to s, to get the actual threshold
        we let m be the mean log-energy of the file, and use
        s*m + vad-energy-threshold (float, default = 0.5)
      vad_energy_threshold: Constant term in energy threshold for MFCC0 for
        VAD (also see --vad-energy-mean-scale) (float, default = 5)
      vad_frames_context: Number of frames of context on each side of central
        frame, in window for which energy is monitored (int, default = 0)
      vad_proportion_threshold: Parameter controlling the proportion of frames
        within the window that need to have more energy than the threshold
        (float, default = 0.6)
    """

    def __init__(
        self,
        sample_frequency=16000,
        frame_length=25,
        frame_shift=10,
        dither=1,
        snip_edges=True,
        vad_energy_mean_scale=0.5,
        vad_energy_threshold=5,
        vad_frames_context=0,
        vad_proportion_threshold=0.6,
    ):
        self.sample_frequency = sample_frequency
        self.fs = sample_frequency
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.dither = dither
        self.snip_edges = snip_edges

        # Frame length / shift converted from milliseconds to samples.
        self._length = int(np.floor(frame_length * sample_frequency / 1000))
        self._shift = int(np.floor(frame_shift * sample_frequency / 1000))

        # First-order high-pass filter (zero at DC) applied to the waveform
        # before the energy computation.
        self._dc_a = np.array([1, -0.999], dtype=float_cpu())
        self._dc_b = np.array([1, -1], dtype=float_cpu())

        # Kept as assertions so that the exception type seen by existing
        # callers does not change.
        assert vad_energy_mean_scale >= 0, "vad_energy_mean_scale must be >= 0"
        assert vad_frames_context >= 0, "vad_frames_context must be >= 0"
        assert (
            vad_proportion_threshold > 0 and vad_proportion_threshold < 1
        ), "vad_proportion_threshold must be in (0, 1)"

        self.vad_energy_mean_scale = vad_energy_mean_scale
        self.vad_energy_threshold = vad_energy_threshold
        self.vad_frames_context = vad_frames_context
        self.vad_proportion_threshold = vad_proportion_threshold

        self.reset()

    def reset(self):
        """Resets the internal states of the filters"""
        self._dc_zi = np.array([0], dtype=float_cpu())

    def compute(self, x, return_loge=False):
        """Evaluates the VAD.

        Args:
          x: Wave (1-D array) or feature matrix (2-D array) whose first
            column holds the log-energy of each frame.
          return_loge: If true, it also returns the log-energy.

        Returns:
          Binary VAD, one boolean per frame. When ``return_loge`` is true,
          a tuple ``(vad, logE)`` is returned instead.

        Raises:
          ValueError: If ``x`` is neither 1-D nor 2-D.
        """
        if x.ndim == 1:
            # Input is wave
            if self.snip_edges:
                num_frames = int(
                    np.floor((len(x) - self._length + self._shift) / self._shift)
                )
            else:
                # Reflect-pad the signal so that the number of frames only
                # depends on the frame shift.
                num_frames = int(np.round(len(x) / self._shift))
                len_x = (num_frames - 1) * self._shift + self._length
                dlen_x = len_x - len(x)
                dlen1_x = int(np.floor((self._length - self._shift) / 2))
                dlen2_x = int(dlen_x - dlen1_x)
                x = np.pad(x, (dlen1_x, dlen2_x), mode="reflect")

            # add dither; the generator is seeded with the signal length, so
            # the noise is reproducible for a given input size.
            if self.dither > 0:
                n = self.dither * np.random.RandomState(seed=len(x)).randn(
                    len(x)
                ).astype(float_cpu(), copy=False)
                x = x + n

            # The filter state is carried over between calls; use reset()
            # to clear it.
            x, self._dc_zi = lfilter(self._dc_b, self._dc_a, x, zi=self._dc_zi)

            # Compute raw energy
            logE = st_logE(x, self._length, self._shift)
        elif x.ndim == 2:
            # Assume that input are features with log-e in the first coeff
            # of the vector
            logE = x[:, 0]
        else:
            raise ValueError("Wrong input dimension ndim=%d" % x.ndim)

        # compute VAD from logE
        e_thr = self.vad_energy_threshold + self.vad_energy_mean_scale * np.mean(logE)
        vad = self._apply_context(logE > e_thr)

        if return_loge:
            return vad, logE
        return vad

    def _apply_context(self, vad):
        """Smooths frame decisions by the proportion of active frames in a window.

        Args:
          vad: 1-D boolean array with the per-frame energy decisions.

        Returns:
          1-D boolean array with the same length as ``vad``.
        """
        context = self.vad_frames_context
        if context == 0:
            return vad

        window = 2 * context + 1
        if len(vad) < window:
            # Shrink the window so it fits in the utterance; otherwise
            # np.convolve(..., "same") would return more samples than frames.
            context = max((len(vad) - 1) // 2, 0)
            window = 2 * context + 1

        if window == 1:
            return vad

        kernel = np.ones((window,), dtype="float32")
        num_count = np.convolve(vad.astype("float32"), kernel, "same")

        # Near the edges the window is truncated, so divide by the number of
        # frames that actually fall inside it.
        den_count = np.full(num_count.shape, window, dtype="float32")
        den_count_boundary = np.arange(context + 1, window, dtype="float32")
        den_count[:context] = den_count_boundary
        den_count[-context:] = den_count_boundary[::-1]

        return num_count / den_count > self.vad_proportion_threshold

    @staticmethod
    def filter_args(**kwargs):
        """Filters VAD args from arguments dictionary.

        Args:
          kwargs: Arguments dictionary.

        Returns:
          Dictionary with VAD options.
        """
        valid_args = (
            "sample_frequency",
            "frame_length",
            "frame_shift",
            "dither",
            "snip_edges",
            "vad_energy_mean_scale",
            "vad_energy_threshold",
            "vad_frames_context",
            "vad_proportion_threshold",
        )
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds VAD options to parser.

        Args:
          parser: Arguments parser
          prefix: Options prefix.
        """
        p1 = "--" if prefix is None else "--" + prefix + "."

        parser.add_argument(
            p1 + "sample-frequency",
            default=16000,
            type=int,
            help=(
                "Waveform data sample frequency "
                "(must match the waveform file, if specified there)"
            ),
        )

        parser.add_argument(
            p1 + "frame-length",
            type=int,
            default=25,
            help="Frame length in milliseconds",
        )
        parser.add_argument(
            p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds"
        )

        parser.add_argument(
            p1 + "dither",
            type=float,
            default=1,
            help="Dithering constant (0.0 means no dither)",
        )

        parser.add_argument(
            p1 + "snip-edges",
            default=True,
            type=str2bool,
            help=(
                "If true, end effects will be handled by outputting only "
                "frames that completely fit in the file, and the number of "
                "frames depends on the frame-length. "
                "If false, the number of frames depends only on the "
                "frame-shift, and we reflect the data at the ends."
            ),
        )

        parser.add_argument(
            p1 + "vad-energy-mean-scale",
            type=float,
            default=0.5,
            help=(
                "If this is set to s, to get the actual threshold we let m "
                "be the mean log-energy of the file, and use "
                "s*m + vad-energy-threshold"
            ),
        )
        parser.add_argument(
            p1 + "vad-energy-threshold",
            type=float,
            default=5,
            help="Constant term in energy threshold for MFCC0 for VAD",
        )
        parser.add_argument(
            p1 + "vad-frames-context",
            type=int,
            default=0,
            help=(
                "Number of frames of context on each side of central frame, "
                "in window for which energy is monitored"
            ),
        )
        parser.add_argument(
            p1 + "vad-proportion-threshold",
            type=float,
            default=0.6,
            help=(
                "Parameter controlling the proportion of frames within "
                "the window that need to have more energy than the threshold"
            ),
        )

    # Alias for add_class_args.
    add_argparse_args = add_class_args