"""
Copyright 2020 Johns Hopkins University (Author: Jesus Villalba)
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
import logging
import math
import multiprocessing
import yaml
from copy import deepcopy
import numpy as np
from ..hyp_defs import float_cpu
from ..io import RandomAccessAudioReader as AR
class SingleNoiseAugment(object):
    """Class to augment speech with additive noise of a single type,
    e.g., music, babble, ...

    Attributes:
      noise_type: string label indicating the noise type.
      r: random access audio reader opened on noise_path, a Kaldi style
         wav.scp file indicating the path to the noise wav files.
      noise_keys: keys of the noise recordings available in the reader.
      min_snr: minimum SNR(dB) to sample from.
      max_snr: maximum SNR(dB) to sample from.
      cache: samples left over from the last noise recording that was read,
             or None; they are consumed first by the next call.
      lock: lock guarding the random generator, the reader and the cache.
      rng: Random number generator (np.random.RandomState).
    """

    def __init__(
        self, noise_type, noise_path, min_snr, max_snr, random_seed=112358, rng=None
    ):
        """Opens the noise reader and initializes the random generator.

        Args:
          noise_type: string label indicating the noise type.
          noise_path: path to Kaldi style wav.scp file indicating the path
                      to the noise wav files.
          min_snr: minimum SNR(dB) to sample from.
          max_snr: maximum SNR(dB) to sample from.
          random_seed: seed used to create the random generator when rng
                       is None.
          rng: Random number generator returned by
               np.random.RandomState (optional), a deep copy is stored so
               the caller's generator state is never modified.
        """
        logging.info(
            "init noise_augment with noise={} noise_path={} snr={}-{}".format(
                noise_type, noise_path, min_snr, max_snr
            )
        )

        self.noise_type = noise_type
        self.r = AR(noise_path)
        self.noise_keys = self.r.keys
        self.min_snr = min_snr
        self.max_snr = max_snr
        self.cache = None
        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

        logging.info("init noise_augment with noise={} done".format(noise_type))

    @staticmethod
    def _power(x):
        """Returns the energy of x (sum of squared samples) in dB."""
        return 10 * np.log10((x ** 2).sum())

    @staticmethod
    def snr(x, n):
        """Returns the signal-to-noise ratio in dB between signal x and noise n."""
        return SingleNoiseAugment._power(x) - SingleNoiseAugment._power(n)

    @staticmethod
    def _compute_noise_scale(x, n, target_snr):
        """Computes the gain to apply to n so that x + gain * n has target_snr.

        Args:
          x: clean signal.
          n: noise signal.
          target_snr: desired SNR(dB).

        Returns:
          Multiplicative gain for the noise. It is 0 when the gain is not
          finite, that is, when the noise has zero energy (with or without
          signal energy); in that case no noise can be added.
        """
        # log10(0) = -inf is expected for silent segments, it is handled below.
        with np.errstate(divide="ignore", invalid="ignore", over="ignore"):
            snr = SingleNoiseAugment.snr(x, n)
            scale = 10 ** ((snr - target_snr) / 20)

        if not np.isfinite(scale):
            # Silent noise gives scale=inf (or nan if x is silent too) and
            # inf * 0 would turn the whole output into NaN.
            return 0.0

        return scale

    def forward(self, x):
        """Adds noise to the signal at a SNR sampled uniformly in
        [min_snr, max_snr].

        Noise recordings are drawn at random and concatenated until they
        cover the length of x. The unused tail of the last recording read is
        kept in the cache for the next call.

        Args:
          x: clean signal, its first dimension is the number of samples.

        Returns:
          Noisy signal x + scale * noise.
          Dictionary with entries "noise_type" and "snr" (the sampled
          target SNR).
        """
        num_samples = x.shape[0]

        # Start from whatever the previous call left in the cache.
        with self.lock:
            noise = self.cache
            if noise is not None and noise.shape[0] > num_samples:
                self.cache = noise[num_samples:]
                noise = noise[:num_samples]
            else:
                self.cache = None

        # Read random noise recordings until we have enough samples.
        while noise is None or noise.shape[0] < num_samples:
            with self.lock:
                noise_idx = self.rng.randint(len(self.noise_keys))
                key = self.noise_keys[noise_idx]
                noise_k, _ = self.r.read([key])
                noise_k = noise_k[0]

            if noise is None:
                need_samples = min(num_samples, noise_k.shape[0])
                noise = noise_k[:need_samples]
            else:
                need_samples = min(num_samples - noise.shape[0], noise_k.shape[0])
                noise = np.concatenate((noise, noise_k[:need_samples]))

            if need_samples < noise_k.shape[0]:
                # Keep the samples we did not use for the next call.
                with self.lock:
                    self.cache = noise_k[need_samples:]

        with self.lock:
            target_snr = self.rng.uniform(self.min_snr, self.max_snr)

        scale = self._compute_noise_scale(x, noise, target_snr)
        info = {"noise_type": self.noise_type, "snr": target_snr}
        return x + scale * noise, info

    def __call__(self, x):
        """Same as forward(x)."""
        return self.forward(x)
class NoiseAugment(object):
    """Class to augment speech with additive noise from multiple types,
    e.g., music, babble, ...
    It will randomly choose which noise type to add.

    Attributes:
      noise_prob: probability of adding noise.
      weights: sampling probability of each noise type, obtained by
               normalizing the weight options so that they sum to one.
      augmenters: list with one SingleNoiseAugment per noise type, in the
                  same order as weights.
      lock: lock guarding the random generator.
      rng: Random number generator (np.random.RandomState).
    """

    def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None):
        """Creates one single-type augmenter per entry of noise_types.

        Args:
          noise_prob: probability of adding noise.
          noise_types: dictionary of options with one entry per noise-type,
                       Each entry is also a dictionary with the following
                       entries: weight, max_snr, min_snr, noise_path. The
                       weight parameter is proportional to how often we want
                       to sample a given noise type.
          random_seed: seed used to create the random generators when rng
                       is None.
          rng: Random number generator returned by
               np.random.RandomState (optional), a deep copy is stored.
        """
        logging.info("init noise augment")
        self.noise_prob = noise_prob
        assert isinstance(noise_types, dict)

        augmenters = []
        self.weights = np.zeros((len(noise_types),))
        for i, (key, opts) in enumerate(noise_types.items()):
            self.weights[i] = opts["weight"]
            aug = SingleNoiseAugment(
                key,
                opts["noise_path"],
                opts["min_snr"],
                opts["max_snr"],
                random_seed=random_seed,
                rng=rng,
            )
            augmenters.append(aug)

        # Turn the relative weights into a probability distribution.
        self.weights /= np.sum(self.weights)
        self.augmenters = augmenters

        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

    @classmethod
    def create(cls, cfg, random_seed=112358, rng=None):
        """Creates a NoiseAugment object from options dictionary or YAML file.

        Args:
          cfg: YAML file path or dictionary with noise options, it needs the
               entries noise_prob and noise_types.
          random_seed: seed used to create the random generators when rng
                       is None.
          rng: Random number generator returned by
               np.random.RandomState (optional)

        Returns:
          NoiseAugment object
        """
        if isinstance(cfg, str):
            with open(cfg, "r") as f:
                # NOTE(review): FullLoader is not meant for untrusted files;
                # if configs can come from outside, consider yaml.safe_load.
                cfg = yaml.load(f, Loader=yaml.FullLoader)

        assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg)

        return cls(
            noise_prob=cfg["noise_prob"],
            noise_types=cfg["noise_types"],
            random_seed=random_seed,
            rng=rng,
        )

    def forward(self, x):
        """Adds noise of a randomly chosen type with probability noise_prob.

        Args:
          x: clean signal.

        Returns:
          Signal, noisy or unchanged.
          Dictionary with entries "noise_type" and "snr". When no noise is
          added, noise_type is None and snr is 100.
        """
        # decide whether to add noise or not
        with self.lock:
            p = self.rng.random_sample()

        # p is drawn from [0, 1), so noise is added iff p < noise_prob;
        # this way noise_prob=0 never adds noise, even if p is exactly 0.
        if p >= self.noise_prob:
            # we don't add noise
            info = {"noise_type": None, "snr": 100}
            return x, info

        # decide the noise type
        with self.lock:
            noise_idx = self.rng.choice(len(self.weights), p=self.weights)

        # add noise
        x, info = self.augmenters[noise_idx](x)
        return x, info

    def __call__(self, x):
        """Same as forward(x)."""
        return self.forward(x)