Source code for hyperion.augment.reverb_augment

"""
 Copyright 2020 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import time
import logging
import math
import multiprocessing
import yaml
from copy import deepcopy
from enum import Enum

import numpy as np
from scipy import signal

from ..hyp_defs import float_cpu
from ..io import RandomAccessDataReaderFactory as DRF


class RIRNormType(Enum):
    """Normalization type to apply to a RIR before it is used.

    The members are selected from the ``rir_norm`` option of
    ``SingleReverbAugment`` (``None``, ``"max"`` or ``"energy"``).
    """

    NONE = 0  # no normalization, the RIR is used as read
    MAX = 1  # max ray (largest absolute value sample) normalized to 1
    ENERGY = 2  # energy of the RIR normalized to 1


class SingleReverbAugment(object):
    """Class to augment speech with reverberation using RIR from a
    single type, e.g., small room, medium room, large room.

    Attributes:
      rir_type: string label indicating the RIR type.
      rir_path: Kaldi style rspecifier to Ark or H5 file containing RIRs.
      rir_norm: RIR normalization method between None, 'max' or 'energy'
                ('none' and RIRNormType members are accepted as well).
      comp_delay: compensate the delay introduced by the RIR if any,
                  this delay will happen if the maximum of the RIR is
                  not in its first sample.
      preload_rirs: if True all RIRS are loaded into RAM.
      random_seed: seed used to build the random number generator when
                   rng is None.
      rng: Random number generator returned by
           np.random.RandomState (optional), it is deep-copied.
    """

    def __init__(
        self,
        rir_type,
        rir_path,
        rir_norm=None,
        comp_delay=True,
        preload_rirs=True,
        random_seed=112358,
        rng=None,
    ):
        self.rir_type = rir_type
        logging.info(
            (
                "init reverb_augment with RIR={} rir_path={} "
                "rir_norm={} comp_delay={}"
            ).format(rir_type, rir_path, rir_norm, comp_delay)
        )

        # Validate the option before touching the RIR file, so that a bad
        # value fails here instead of leaving self.rir_norm undefined.
        self.rir_norm = self._parse_rir_norm(rir_norm)
        self.comp_delay = comp_delay

        self.r = DRF.create(rir_path)
        self.rir_keys = self.r.keys
        self.preload_rirs = preload_rirs
        if preload_rirs:
            # everything is in RAM, the reader is not needed anymore
            self.rirs = self.r.read(self.rir_keys)
            self.r.close()
        else:
            # RIRs are read on demand in forward(), the reader stays open
            self.rirs = None

        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

        logging.info("init reverb_augment with RIR={} done".format(rir_type))

    @staticmethod
    def _parse_rir_norm(rir_norm):
        """Converts the rir_norm option into a RIRNormType member.

        Args:
          rir_norm: None, 'none', 'max', 'energy' or a RIRNormType member.

        Returns:
          RIRNormType member.

        Raises:
          ValueError: if the option is not one of the accepted values.
        """
        if isinstance(rir_norm, RIRNormType):
            return rir_norm
        if rir_norm is None or rir_norm == "none":
            return RIRNormType.NONE
        if rir_norm == "max":
            return RIRNormType.MAX
        if rir_norm == "energy":
            return RIRNormType.ENERGY
        raise ValueError(
            "invalid rir_norm={!r}, it must be None, 'max' or 'energy'".format(
                rir_norm
            )
        )

    @staticmethod
    def _power(x):
        """Computes the energy of x in dB, floored by adding 1e-5
        to avoid the log of zero."""
        return 10 * np.log10((x ** 2).sum() + 1e-5)

    @staticmethod
    def sdr(x, y, scale, delay):
        """Computes the ratio in dB between the scaled input and the
        residual of the output.

        Args:
          x: input signal.
          y: output signal.
          scale: gain applied to x before comparing it with y.
          delay: number of samples dropped from the start of y to
                 align it with x.

        Returns:
          power(scale * x) - power(y[delay:] - scale * x) in dB.
        """
        x = scale * x
        n = y[delay:] - x
        return SingleReverbAugment._power(x) - SingleReverbAugment._power(n)

    def _norm_rir(self, h):
        """Normalizes the RIR according to self.rir_norm.

        Args:
          h: impulse response.

        Returns:
          Normalized impulse response.
        """
        if self.rir_norm == RIRNormType.NONE:
            return h
        if self.rir_norm == RIRNormType.MAX:
            # divide by the signed peak so that the largest ray becomes +1
            idx = np.argmax(np.abs(h))
            return h / h[idx]
        # Unit energy needs the L2 norm: dividing by the energy itself
        # would leave an energy of 1 / sum(h ** 2) instead of 1.
        return h / np.sqrt(np.sum(h ** 2))

    def forward(self, x):
        """Convolves the signal with a randomly chosen RIR.

        Args:
          x: input signal, x.shape[0] is taken as its number of samples.

        Returns:
          Reverberated signal.
          Dictionary with keys "rir_type", "srr" (value returned by
          sdr() for x and the output), "h_max" (value of the RIR at its
          largest absolute sample) and "h_delay" (position of that
          sample, or 0 when the delay has been compensated).
        """
        num_samples = x.shape[0]
        with self.lock:
            # the generator and the reader are shared state
            rir_idx = self.rng.randint(len(self.rir_keys))
            if self.preload_rirs:
                h = self.rirs[rir_idx]
            else:
                key = self.rir_keys[rir_idx]
                h = self.r.read([key])[0]

        h = self._norm_rir(h)
        h_delay = np.argmax(np.abs(h))
        h_max = h[h_delay]
        y = signal.fftconvolve(x, h)
        if self.comp_delay:
            # drop the leading samples so the main ray is aligned with x
            y = y[h_delay : num_samples + h_delay]
            h_delay = 0
        else:
            # keep the delay, sdr() skips it when comparing with x
            y = y[: num_samples + h_delay]

        srr = self.sdr(x, y, h_max, h_delay)
        info = {
            "rir_type": self.rir_type,
            "srr": srr,
            "h_max": h_max,
            "h_delay": h_delay,
        }
        return y, info

    def __call__(self, x):
        """Same as forward(x)."""
        return self.forward(x)
class ReverbAugment(object):
    """Class to augment speech with reverberation with RIRS from multiple
    types, e.g., small room, medium room, large room.
    It will randomly choose which RIR type to add.

    Attributes:
      reverb_prob: probability of adding reverberation.
      rir_types: dictionary of options with one entry per RIR-type,
                 Each entry is also a dictionary with the following
                 entries: weight, rir_norm, comp_delay, rir_path,
                 preload_rirs. The weight parameter is proportional to
                 how often we want to sample a given RIR type.
      max_reverb_context: number of samples required as left context
                          for the convolution operation.
      random_seed: seed used to build the random number generator when
                   rng is None.
      rng: Random number generator returned by
           np.random.RandomState (optional), it is deep-copied.
    """

    def __init__(
        self, reverb_prob, rir_types, max_reverb_context=0, random_seed=112358, rng=None
    ):
        logging.info("init reverb_augment")
        self.reverb_prob = reverb_prob
        assert isinstance(rir_types, dict)

        # options forwarded to each SingleReverbAugment, "weight" is
        # consumed here
        val_opts = ("rir_path", "rir_norm", "comp_delay", "preload_rirs")
        augmenters = []
        self.weights = np.zeros((len(rir_types),))
        for count, (key, opts) in enumerate(rir_types.items()):
            self.weights[count] = opts["weight"]
            opts_i = {opt_key: opts[opt_key] for opt_key in val_opts if opt_key in opts}
            aug = SingleReverbAugment(key, **opts_i, random_seed=random_seed, rng=rng)
            augmenters.append(aug)

        self.max_reverb_context = max_reverb_context
        # turn the weights into the probabilities used by rng.choice
        self.weights /= np.sum(self.weights)
        self.augmenters = augmenters

        self.lock = multiprocessing.Lock()
        if rng is None:
            self.rng = np.random.RandomState(seed=random_seed)
        else:
            self.rng = deepcopy(rng)

    @classmethod
    def create(cls, cfg, random_seed=112358, rng=None):
        """Creates a ReverbAugment object from options dictionary or YAML file.

        Args:
          cfg: YAML file path or dictionary with reverb options, with keys
               reverb_prob, rir_types and, optionally, max_reverb_context.
          random_seed: seed used to build the random number generator when
                       rng is None.
          rng: Random number generator returned by
               np.random.RandomState (optional)

        Returns:
          ReverbAugment object
        """
        if isinstance(cfg, str):
            with open(cfg, "r") as f:
                # NOTE(review): FullLoader can build arbitrary Python objects
                # from tagged YAML; use yaml.safe_load if the configuration
                # file may come from an untrusted source.
                cfg = yaml.load(f, Loader=yaml.FullLoader)

        assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg)

        return cls(
            reverb_prob=cfg["reverb_prob"],
            rir_types=cfg["rir_types"],
            # optional in the constructor, so optional in the config too
            max_reverb_context=cfg.get("max_reverb_context", 0),
            random_seed=random_seed,
            rng=rng,
        )

    @staticmethod
    def sdr(x, y, scale, delay):
        """Same as SingleReverbAugment.sdr(x, y, scale, delay)."""
        return SingleReverbAugment.sdr(x, y, scale, delay)

    def forward(self, x):
        """Adds reverberation to the signal with probability reverb_prob,
        using a RIR type sampled according to the weights.

        Args:
          x: input signal.

        Returns:
          Output signal, it is x itself when no reverberation is added.
          Dictionary with keys "rir_type", "srr", "h_max" and "h_delay";
          when no reverberation is added they are None, 100, 1 and 0.
        """
        # decide whether to add reverb or not
        with self.lock:
            p = self.rng.random_sample()

        # random_sample() draws from [0, 1): with >= reverb_prob=0 never
        # adds reverb and reverb_prob=1 always does.
        if p >= self.reverb_prob:
            # we don't add reverb
            info = {"rir_type": None, "srr": 100, "h_max": 1, "h_delay": 0}
            return x, info

        # decide the RIR type
        with self.lock:
            rir_idx = self.rng.choice(len(self.weights), p=self.weights)

        # add reverb
        x, info = self.augmenters[rir_idx](x)
        return x, info

    def __call__(self, x):
        """Same as forward(x)."""
        return self.forward(x)