"""
Copyright 2019 Johns Hopkins University (Author: Jesus Villalba)
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
from jsonargparse import ArgumentParser, ActionParser
import re
from ...utils.misc import str2bool
from ...feats.filter_banks import FilterBankFactory as FBF
from .audio_feats import *
# Identifiers accepted as ``audio_feat`` by AudioFeatsFactory.create; each one
# selects the extractor class named in its comment.
FFT = "fft"  # Wav2FFT
SPEC = "spec"  # Wav2Spec
LOG_SPEC = "log_spec"  # Wav2LogSpec
LOG_FB = "logfb"  # Wav2LogFilterBank
MFCC = "mfcc"  # Wav2MFCC
KAN_BAYASHI = "kanbayashi_logfb"  # Wav2KanBayashiLogFilterBank

# All supported identifiers; used as the ``choices`` of the --audio-feat option.
FEAT_TYPES = [
    FFT,
    SPEC,
    LOG_SPEC,
    LOG_FB,
    MFCC,
    KAN_BAYASHI,
]
class AudioFeatsFactory(object):
    """Factory for the waveform feature extractors of this package.

    It maps an ``audio_feat`` identifier (one of ``FEAT_TYPES``) to the
    corresponding ``Wav2*`` extractor class and provides helpers to declare
    the extractor options on an argument parser and to pull them back out
    of a parsed-arguments dictionary.
    """

    @staticmethod
    def create(
        audio_feat,
        sample_frequency=16000,
        frame_length=25,
        frame_shift=10,
        fft_length=512,
        remove_dc_offset=True,
        preemphasis_coeff=0.97,
        window_type="povey",
        use_fft_mag=False,
        dither=1,
        fb_type="mel_kaldi",
        low_freq=20,
        high_freq=0,
        num_filters=23,
        norm_filters=False,
        num_ceps=13,
        snip_edges=True,
        center=False,
        cepstral_lifter=22,
        energy_floor=0,
        raw_energy=True,
        use_energy=True,
    ):
        """Builds the feature extractor selected by ``audio_feat``.

        Args:
          audio_feat: Feature type identifier, one of ``FEAT_TYPES``.
          sample_frequency: Waveform sample frequency.
          frame_length: Frame length in milliseconds.
          frame_shift: Frame shift in milliseconds.
          fft_length: Length of the FFT.
          remove_dc_offset: Subtract the mean from the waveform on each frame.
          preemphasis_coeff: Coefficient for signal preemphasis (forwarded
            to the extractors as ``preemph_coeff``).
          window_type: Type of window.
          use_fft_mag: If true, use |X(f)|, if false, use |X(f)|^2. Not
            forwarded for ``fft``.
          dither: Dithering constant (0.0 means no dither).
          fb_type: Filter-bank type, forwarded for ``logfb`` and ``mfcc``.
          low_freq: Filter-bank option, forwarded for ``logfb``, ``mfcc``
            and ``kanbayashi_logfb``.
          high_freq: Filter-bank option, forwarded like ``low_freq``.
          num_filters: Filter-bank option, forwarded like ``low_freq``.
          norm_filters: Filter-bank option, forwarded for ``logfb`` and
            ``mfcc``.
          num_ceps: Number of cepstra (including C0), ``mfcc`` only.
          snip_edges: If true, only frames that completely fit in the file
            are output.
          center: If true, puts the center of the frame at t*frame_shift.
          cepstral_lifter: Constant that controls scaling of MFCCs,
            ``mfcc`` only.
          energy_floor: Floor on energy (absolute, not relative).
          raw_energy: If true, compute energy before preemphasis and
            windowing.
          use_energy: Use energy (not C0) in MFCC computation.

        Returns:
          A ``Wav2FFT``, ``Wav2Spec``, ``Wav2LogSpec``, ``Wav2LogFilterBank``,
          ``Wav2MFCC`` or ``Wav2KanBayashiLogFilterBank`` object.

        Raises:
          ValueError: If ``audio_feat`` is not one of ``FEAT_TYPES``.
        """
        # Positional arguments shared by every extractor.
        stft_args = (sample_frequency, frame_length, frame_shift, fft_length)

        # The Kan-Bayashi extractor takes a reduced option set: no
        # preemphasis, dither, centering or energy options are forwarded.
        if audio_feat == KAN_BAYASHI:
            return Wav2KanBayashiLogFilterBank(
                *stft_args,
                remove_dc_offset=remove_dc_offset,
                window_type=window_type,
                low_freq=low_freq,
                high_freq=high_freq,
                num_filters=num_filters,
                snip_edges=snip_edges,
            )

        # Keyword options common to all the remaining extractors; each stage
        # below adds the options that the later pipeline stages also need.
        opts = dict(
            remove_dc_offset=remove_dc_offset,
            preemph_coeff=preemphasis_coeff,
            window_type=window_type,
            dither=dither,
            snip_edges=snip_edges,
            center=center,
            energy_floor=energy_floor,
            raw_energy=raw_energy,
            use_energy=use_energy,
        )
        if audio_feat == FFT:
            return Wav2FFT(*stft_args, **opts)

        opts["use_fft_mag"] = use_fft_mag
        if audio_feat == SPEC:
            return Wav2Spec(*stft_args, **opts)
        if audio_feat == LOG_SPEC:
            return Wav2LogSpec(*stft_args, **opts)

        opts.update(
            fb_type=fb_type,
            low_freq=low_freq,
            high_freq=high_freq,
            num_filters=num_filters,
            norm_filters=norm_filters,
        )
        if audio_feat == LOG_FB:
            return Wav2LogFilterBank(*stft_args, **opts)
        if audio_feat == MFCC:
            return Wav2MFCC(
                *stft_args,
                num_ceps=num_ceps,
                cepstral_lifter=cepstral_lifter,
                **opts
            )

        # Fail loudly instead of silently returning None for a bad identifier.
        raise ValueError(
            "unknown audio_feat={!r}, it must be one of {}".format(
                audio_feat, FEAT_TYPES
            )
        )

    @staticmethod
    def filter_args(**kwargs):
        """Filters the feature extractor args from an arguments dictionary.

        Args:
          kwargs: Arguments dictionary.

        Returns:
          Dictionary containing only the entries of ``kwargs`` whose key is
          a known feature extractor option.
        """
        valid_args = (
            "sample_frequency",
            "frame_length",
            "frame_shift",
            "fft_length",
            "remove_dc_offset",
            "preemphasis_coeff",
            "window_type",
            # NOTE(review): create() has no blackman_coeff parameter; kept
            # here for backward compatibility -- confirm whether it is used.
            "blackman_coeff",
            "use_fft_mag",
            "dither",
            "fb_type",
            "low_freq",
            "high_freq",
            "num_filters",
            "norm_filters",
            "num_ceps",
            "snip_edges",
            # --center is declared in add_class_args and accepted by create,
            # so it has to survive the filtering as well.
            "center",
            "energy_floor",
            "raw_energy",
            "use_energy",
            "cepstral_lifter",
            "audio_feat",
        )
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds the feature extractor options to a parser.

        Args:
          parser: Arguments parser.
          prefix: Options prefix. If given, the options are defined in a
            new parser that is attached to ``parser`` as ``--<prefix>``.
        """
        if prefix is not None:
            outer_parser = parser
            parser = ArgumentParser(prog="")

        parser.add_argument(
            "--sample-frequency",
            default=16000,
            type=int,
            help=(
                "Waveform data sample frequency (must match the waveform file, "
                "if specified there)"
            ),
        )
        parser.add_argument(
            "--frame-length", type=int, default=25, help="Frame length in milliseconds"
        )
        parser.add_argument(
            "--frame-shift", type=int, default=10, help="Frame shift in milliseconds"
        )
        parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT")
        parser.add_argument(
            "--remove-dc-offset",
            default=True,
            type=str2bool,
            help="Subtract mean from waveform on each frame",
        )
        parser.add_argument(
            "--preemphasis-coeff",
            type=float,
            default=0.97,
            help="Coefficient for use in signal preemphasis",
        )
        parser.add_argument(
            "--window-type",
            default="povey",
            choices=["hamming", "hanning", "povey", "rectangular", "blackman"],
            help=(
                'Type of window ("hamming"|"hanning"|"povey"|'
                '"rectangular"|"blackman")'
            ),
        )
        parser.add_argument(
            "--use-fft-mag",
            default=False,
            action="store_true",
            help="If true, it uses |X(f)|, if false, it uses |X(f)|^2",
        )
        parser.add_argument(
            "--dither",
            type=float,
            default=1,
            help="Dithering constant (0.0 means no dither)",
        )

        # Filter-bank options are declared by the filter-bank factory.
        FBF.add_class_args(parser)

        parser.add_argument(
            "--num-ceps",
            type=int,
            default=13,
            help="Number of cepstra in MFCC computation (including C0)",
        )
        parser.add_argument(
            "--snip-edges",
            default=True,
            type=str2bool,
            help=(
                "If true, end effects will be handled by outputting only "
                "frames that completely fit in the file, and the number of "
                "frames depends on the frame-length. If false, the number "
                "of frames depends only on the frame-shift, "
                "and we reflect the data at the ends."
            ),
        )
        parser.add_argument(
            "--center",
            default=False,
            type=str2bool,
            help=(
                "If true, puts the center of the frame at t*frame_shift, "
                "it over-wrides snip-edges and set it to false"
            ),
        )
        parser.add_argument(
            "--energy-floor",
            type=float,
            default=0,
            help="Floor on energy (absolute, not relative) in MFCC computation",
        )
        parser.add_argument(
            "--raw-energy",
            default=True,
            type=str2bool,
            help="If true, compute energy before preemphasis and windowing",
        )
        parser.add_argument(
            "--use-energy",
            default=True,
            type=str2bool,
            help="Use energy (not C0) in MFCC computation",
        )
        parser.add_argument(
            "--cepstral-lifter",
            type=float,
            default=22,
            help="Constant that controls scaling of MFCCs",
        )
        parser.add_argument(
            "--audio-feat",
            # The default must be one of the choices; the previous value
            # "cepstrum" matched no feature type handled by create().
            default=MFCC,
            choices=FEAT_TYPES,
            help=(
                "It can return intermediate result: fft, spec, log_spec, " "logfb, mfcc"
            ),
        )

        if prefix is not None:
            outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser))

    # Alias kept for callers that use the argparse-style name.
    add_argparse_args = add_class_args