Source code for hyperion.augment.noise_augment

"""
 Copyright 2020 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import logging
import math
import multiprocessing
import yaml
from copy import deepcopy

import numpy as np

from ..hyp_defs import float_cpu
from ..io import RandomAccessAudioReader as AR


class SingleNoiseAugment(object):
    """Class to augment speech with additive noise of a single type,
       e.g., music, babble, ...

    Attributes:
      noise_type: string label indicating the noise type.
      noise_path: path to Kaldi style wav.scp file indicating the path
                  to the noise wav files.
      min_snr: minimum SNR(dB) to sample from.
      max_snr: maximum SNR(dB) to sample from.
      rng: Random number generator returned by
           np.random.RandomState (optional)
    """

    def __init__(
        self, noise_type, noise_path, min_snr, max_snr, random_seed=112358, rng=None
    ):
        """Opens the noise reader and sets up the sampling state.

        Args:
          noise_type: string label stored and reported in the info dict.
          noise_path: path handed to the audio reader (``AR``).
          min_snr: lower bound (dB) of the uniform SNR sampling range.
          max_snr: upper bound (dB) of the uniform SNR sampling range.
          random_seed: seed used to build a RandomState when rng is None.
          rng: optional random number generator; it is deep-copied, so the
               caller's generator is not advanced by this object.
        """
        logging.info(
            "init noise_augment with noise={} noise_path={} snr={}-{}".format(
                noise_type, noise_path, min_snr, max_snr
            )
        )

        self.noise_type = noise_type
        self.r = AR(noise_path)
        self.noise_keys = self.r.keys
        self.min_snr = min_snr
        self.max_snr = max_snr
        # Left-over samples of the last noise recording read, reused by the
        # next call to forward.
        self.cache = None
        # Guards the shared state (rng, reader, cache).
        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

        logging.info("init noise_augment with noise={} done".format(noise_type))

    @staticmethod
    def _power(x):
        """Returns the energy of x (sum of squares) in dB."""
        return 10 * np.log10((x ** 2).sum())

    @staticmethod
    def snr(x, n):
        """Returns the signal-to-noise ratio in dB between signal x and noise n."""
        return SingleNoiseAugment._power(x) - SingleNoiseAugment._power(n)

    @staticmethod
    def _compute_noise_scale(x, n, target_snr):
        """Computes the gain to apply to n so that snr(x, gain * n) = target_snr.

        Args:
          x: signal samples.
          n: noise samples.
          target_snr: desired SNR in dB.

        Returns:
          Multiplicative gain for the noise. It is 0 when the signal or the
          noise has no energy, since no finite gain can reach the target SNR
          in that case.
        """
        signal_energy = (x ** 2).sum()
        noise_energy = (n ** 2).sum()
        if signal_energy <= 0 or noise_energy <= 0:
            # Silent noise would otherwise give gain=inf and inf * 0 = NaN
            # in the mixture; silent signal already yielded gain 0.
            return 0.0

        # Same arithmetic as snr(x, n), reusing the energies computed above.
        snr = 10 * np.log10(signal_energy) - 10 * np.log10(noise_energy)
        return 10 ** ((snr - target_snr) / 20)

    def forward(self, x):
        """Adds noise to the signal at an SNR sampled from [min_snr, max_snr].

        Args:
          x: signal array; its first dimension is taken as the number of
             samples (presumably a 1-D waveform, verify against callers).

        Returns:
          Noisy signal ``x + scale * noise``.
          Dictionary with entries "noise_type" and "snr" (the sampled
          target SNR in dB).
        """
        num_samples = x.shape[0]

        # Start with whatever was left over from a previous call.
        with self.lock:
            if self.cache is not None:
                if self.cache.shape[0] > num_samples:
                    noise = self.cache[:num_samples]
                    self.cache = self.cache[num_samples:]
                else:
                    noise = self.cache
                    self.cache = None
            else:
                noise = None

        # Keep reading random noise recordings until we have enough samples.
        while noise is None or noise.shape[0] < num_samples:
            with self.lock:
                noise_idx = self.rng.randint(len(self.noise_keys))
                key = self.noise_keys[noise_idx]
                noise_k, _ = self.r.read([key])
                noise_k = noise_k[0]

            if noise is None:
                need_samples = min(num_samples, noise_k.shape[0])
                noise = noise_k[:need_samples]
            else:
                need_samples = min(num_samples - noise.shape[0], noise_k.shape[0])
                noise = np.concatenate((noise, noise_k[:need_samples]))

            if need_samples < noise_k.shape[0]:
                # Save the unused tail of this recording for the next call.
                with self.lock:
                    self.cache = noise_k[need_samples:]

        with self.lock:
            target_snr = self.rng.uniform(self.min_snr, self.max_snr)

        scale = self._compute_noise_scale(x, noise, target_snr)
        info = {"noise_type": self.noise_type, "snr": target_snr}
        return x + scale * noise, info

    def __call__(self, x):
        """Shortcut for forward(x)."""
        return self.forward(x)
class NoiseAugment(object):
    """Class to augment speech with additive noise from multiple types,
       e.g., music, babble, ...
       It will randomly choose which noise type to add.

    Attributes:
      noise_prob: probability of adding noise
      noise_types: dictionary of options with one entry per noise-type,
                   Each entry is also a dictionary with the following entries:
                   weight, max_snr, min_snr, noise_path. The weight parameter
                   is proportional to how often we want to sample a given
                   noise type.
      rng: Random number generator returned by
           np.random.RandomState (optional)
    """

    def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None):
        """Builds one SingleNoiseAugment per noise type.

        Args:
          noise_prob: probability of adding noise to a signal.
          noise_types: dictionary mapping noise-type name to its options
                       (weight, noise_path, min_snr, max_snr).
          random_seed: seed used when rng is None; the same value is also
                       forwarded to every per-type augmenter.
          rng: optional random number generator; it is deep-copied here and
               also forwarded to every per-type augmenter.
        """
        logging.info("init noise augment")
        self.noise_prob = noise_prob
        assert isinstance(noise_types, dict)

        augmenters = []
        self.weights = np.zeros((len(noise_types),))
        for idx, (key, opts) in enumerate(noise_types.items()):
            self.weights[idx] = opts["weight"]
            aug = SingleNoiseAugment(
                key,
                opts["noise_path"],
                opts["min_snr"],
                opts["max_snr"],
                random_seed=random_seed,
                rng=rng,
            )
            augmenters.append(aug)

        # Normalize so the weights can be used as sampling probabilities.
        self.weights /= np.sum(self.weights)
        self.augmenters = augmenters

        # Guards access to the random number generator.
        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

    @classmethod
    def create(cls, cfg, random_seed=112358, rng=None):
        """Creates a NoiseAugment object from options dictionary or YAML file.

        Args:
          cfg: YAML file path or dictionary with noise options; it needs
               the entries "noise_prob" and "noise_types".
          random_seed: seed used to build the random number generator
                       when rng is None.
          rng: Random number generator returned by
               np.random.RandomState (optional)

        Returns:
          NoiseAugment object
        """
        if isinstance(cfg, str):
            with open(cfg, "r") as f:
                # NOTE(review): FullLoader can build arbitrary Python objects
                # in some PyYAML releases; prefer yaml.safe_load if the config
                # file may come from an untrusted source.
                cfg = yaml.load(f, Loader=yaml.FullLoader)

        assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg)

        return cls(
            noise_prob=cfg["noise_prob"],
            noise_types=cfg["noise_types"],
            random_seed=random_seed,
            rng=rng,
        )

    def forward(self, x):
        """Adds noise of a randomly chosen type with probability noise_prob.

        Args:
          x: signal to augment; it is passed unchanged to the chosen
             per-type augmenter.

        Returns:
          Augmented signal, or x itself when no noise is added.
          Info dictionary with entries "noise_type" and "snr"; when no noise
          is added they are None and 100 respectively.
        """
        # decide whether to add noise or not
        with self.lock:
            p = self.rng.random_sample()

        # random_sample() draws from [0, 1), so ">=" makes the probability of
        # adding noise exactly noise_prob (never for 0, always for 1).
        if p >= self.noise_prob:
            # we don't add noise
            info = {"noise_type": None, "snr": 100}
            return x, info

        # decide the noise type
        with self.lock:
            noise_idx = self.rng.choice(len(self.weights), p=self.weights)

        # add noise
        x, info = self.augmenters[noise_idx](x)
        return x, info

    def __call__(self, x):
        """Shortcut for forward(x)."""
        return self.forward(x)