"""
Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
import os
import logging
import io
import math
import subprocess
import soundfile as sf
import numpy as np
from ..hyp_defs import float_cpu
from ..utils import SCPList, SegmentList
# File extensions (as returned by ``os.path.splitext``, i.e. including the
# leading dot) that are accepted as audio files readable through soundfile.
# NOTE: every entry must start with "."; the original list contained "w64"
# without the dot, so ".w64" files could never match.
valid_ext = [
    ".wav",
    ".flac",
    ".ogg",
    ".au",
    ".avr",
    ".caf",
    ".htk",
    ".iff",
    ".mat",
    ".mpc",
    ".oga",
    ".pvf",
    ".rf64",
    ".sd2",
    ".sds",
    ".sf",
    ".voc",
    ".w64",
    ".wve",
    ".xi",
]
class AudioReader(object):
    """Class to read audio files from wav, flac or pipe.

    Attributes:
      file_path: scp file with format file_key wavspecifier (audio_file/pipe)
        or SCPList object.
      segments_path: segments file with format: segment_id file_id tbeg tend,
        or SegmentList object.
      wav_scale: multiplies signal by scale factor.
    """

    def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
        self.file_path = file_path
        if isinstance(file_path, SCPList):
            self.scp = file_path
        else:
            self.scp = SCPList.load(file_path, sep=" ", is_wav=True)

        self.segments_path = segments_path
        if segments_path is None:
            self.segments = None
            self.with_segments = False
        else:
            self.with_segments = True
            # The type test must look at segments_path (the original checked
            # file_path, so an already-loaded SegmentList was passed to
            # SegmentList.load as if it were a file name).
            if isinstance(segments_path, SegmentList):
                self.segments = segments_path
            else:
                self.segments = SegmentList.load(
                    segments_path, sep=" ", index_by_file=False
                )

        self.wav_scale = wav_scale

    @property
    def keys(self):
        """Segment ids when a segments list is used, otherwise the scp keys."""
        if self.with_segments:
            return np.asarray(self.segments["segment_id"])
        return self.scp.key

    def __enter__(self):
        """Function required when entering constructions of type

        with AudioReader('file.h5') as f:
           keys, data = f.read()
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Function required when exiting from constructions of type

        with AudioReader('file.h5') as f:
           keys, data = f.read()

        Nothing is released here; exceptions are not suppressed.
        """
        pass

    @staticmethod
    def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0):
        """Reads an audiospecifier (audio_file/pipe).

        It reads from pipe or from all the files that can be read
        by `libsndfile <http://www.mega-nerd.com/libsndfile/#Features>`

        Args:
          wavspecifier: A pipe (shell command ending in "|"), or a wav, flac,
            ogg file etc.
          scale: Multiplies signal by scale factor.
          time_offset: float indicating the start time (seconds) to read in
            the utterance.
          time_dur: float indicating the number of seconds to read from the
            utterance, if 0 it reads until the end.

        Returns:
          Tuple (waveform, sampling frequency).

        Raises:
          Exception: if the specifier is neither a pipe nor a file with an
            extension listed in ``valid_ext``.
        """
        wavspecifier = wavspecifier.strip()

        # endswith() instead of [-1] so that an empty specifier falls through
        # to the "Unknown format" error rather than raising IndexError.
        if wavspecifier.endswith("|"):
            wavspecifier = wavspecifier[:-1]
            x, fs = AudioReader.read_pipe(wavspecifier, scale)
            if time_offset == 0 and time_dur == 0:
                return x, fs

            # A pipe cannot be seeked: the whole output is decoded and the
            # requested window is cut afterwards.
            start_sample = int(math.floor(time_offset * fs))
            num_samples = int(math.floor(time_dur * fs))
            if num_samples == 0:
                return x[start_sample:], fs

            end_sample = start_sample + num_samples
            assert end_sample <= len(x)
            return x[start_sample:end_sample], fs

        ext = os.path.splitext(wavspecifier)[1]
        if ext in valid_ext:
            if time_offset == 0 and time_dur == 0:
                x, fs = sf.read(wavspecifier, dtype=float_cpu())
                x *= scale
                return x, fs

            # Partial read: seek to the first requested sample and read only
            # the requested number of frames (or until the end of file).
            with sf.SoundFile(wavspecifier, "r") as f:
                fs = f.samplerate
                start_sample = int(math.floor(time_offset * fs))
                num_samples = int(math.floor(time_dur * fs))
                f.seek(start_sample)
                if num_samples > 0:
                    x = scale * f.read(num_samples, dtype=float_cpu())
                else:
                    x = scale * f.read(dtype=float_cpu())
                return x, fs

        raise Exception("Unknown format for %s" % (wavspecifier))

    @staticmethod
    def read_pipe(wavspecifier, scale=2 ** 15):
        """Reads wave file from a pipe.

        Args:
          wavspecifier: Shell command with pipe output (without the
            trailing "|").
          scale: Multiplies signal by scale factor.

        Returns:
          Tuple (waveform, sampling frequency).

        Raises:
          Exception: if the command exits with a non-zero return code.
        """
        # NOTE(review): the command is executed with shell=True; it comes
        # straight from the scp file, so that file must be trusted.
        proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE)
        pipe = proc.communicate()[0]
        if proc.returncode != 0:
            raise Exception(
                "Wave read pipe command %s returned code %d"
                % (wavspecifier, proc.returncode)
            )
        x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu())
        x *= scale
        return x, fs

    def _read_segment(self, segment, time_offset=0, time_dur=0):
        """Reads a wave segment.

        Args:
          segment: pandas DataFrame (segment_id , file_id, tbeg, tend)
          time_offset: seconds added to the segment start time.
          time_dur: if > 0, number of seconds to keep starting at the
            (shifted) segment start; it must not go past the segment end.

        Returns:
          Wave, sampling frequency

        Raises:
          Exception: if the segment starts beyond the end of the wav file.
        """
        file_id = segment["file_id"]
        t_beg = segment["tbeg"] + time_offset
        t_end = segment["tend"]
        if time_dur > 0:
            t_end_new = t_beg + time_dur
            assert t_end_new <= t_end
            t_end = t_end_new

        file_path, _, _ = self.scp[file_id]
        # The full recording is read and the segment is cut out in memory.
        x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale)
        num_samples_i = len(x_i)
        s_beg = int(t_beg * fs_i)
        if s_beg >= num_samples_i:
            # The original message referenced undefined names (key, tbeg,
            # sbeg) and therefore raised NameError instead of this error.
            raise Exception(
                "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)"
                % (segment["segment_id"], t_beg, s_beg, file_id, num_samples_i)
            )

        s_end = int(t_end * fs_i)
        # A segment end past the file, or a negative tend, means "until the
        # end of the file".
        if s_end > num_samples_i or t_end < 0:
            s_end = num_samples_i

        x_i = x_i[s_beg:s_end]
        return x_i, fs_i
class SequentialAudioReader(AudioReader):
    """Audio reader that iterates over the recordings/segments in order.

    Attributes:
      file_path: scp file with format file_key wavspecifier (audio_file/pipe)
        or SCPList object.
      segments_path: segments file with format: segment_id file_id tbeg tend.
      wav_scale: multiplies signal by scale factor.
      part_idx: index of the part of the list to process (used when
        num_parts > 1).
      num_parts: number of parts in which the list is split.
    """

    def __init__(
        self,
        file_path,
        segments_path=None,
        wav_scale=2 ** 15 - 1,
        part_idx=1,
        num_parts=1,
    ):
        super().__init__(file_path, segments_path, wav_scale=wav_scale)
        self.cur_item = 0
        self.part_idx = part_idx
        self.num_parts = num_parts
        if self.num_parts > 1:
            # Keep only the part of the list assigned to this reader.
            if self.with_segments:
                self.segments = self.segments.split(self.part_idx, self.num_parts)
            else:
                self.scp = self.scp.split(
                    self.part_idx, self.num_parts, group_by_key=False
                )

    def __iter__(self):
        """Needed to build an iterator, e.g.:

        r = SequentialAudioReader(...)
        for key, s, fs in r:
           print(key)
           process(s)
        """
        return self

    def __next__(self):
        """Needed to build an iterator, e.g.:

        r = SequentialAudioReader(...)
        for key , s, fs in r:
           process(s)

        Returns:
          Tuple (key, waveform, sampling frequency) of the next item.

        Raises:
          StopIteration: when there are no more items to read.
        """
        key, x, fs = self.read(1)
        if len(key) == 0:
            raise StopIteration
        return key[0], x[0], fs[0]

    def next(self):
        """__next__ for Python 2"""
        return self.__next__()

    def reset(self):
        """Returns the file pointer to the beginning of the dataset,
        then we can start reading the features again.
        """
        self.cur_item = 0

    def eof(self):
        """End of file.

        Returns:
          True, when we have read all the recordings in the dataset.
        """
        if self.with_segments:
            return self.cur_item == len(self.segments)
        return self.cur_item == len(self.scp)

    def read(self, num_records=0, time_offset=0, time_durs=0):
        """Reads next num_records audio files.

        Args:
          num_records: Number of audio files to read; 0 reads all the
            remaining ones.
          time_offset: Float or list of floats indicating the start time to
            read in each utterance.
          time_durs: Float or list of floats indicating the number of seconds
            to read from each utterance.

        Returns:
          key: List of recording names.
          data: List of waveforms
          fs: list of sample freqs
        """
        if num_records == 0:
            if self.with_segments:
                num_records = len(self.segments) - self.cur_item
            else:
                num_records = len(self.scp) - self.cur_item

        # Scalars are applied to every record; lists are indexed by the
        # position of the record inside this call.
        offset_is_list = isinstance(time_offset, (list, np.ndarray))
        dur_is_list = isinstance(time_durs, (list, np.ndarray))

        keys = []
        data = []
        fs = []
        for i in range(num_records):
            if self.eof():
                break

            offset_i = time_offset[i] if offset_is_list else time_offset
            dur_i = time_durs[i] if dur_is_list else time_durs

            if self.with_segments:
                segment = self.segments[self.cur_item]
                key = segment["segment_id"]
                x_i, fs_i = self._read_segment(segment, offset_i, dur_i)
            else:
                key, file_path, _, _ = self.scp[self.cur_item]
                x_i, fs_i = self.read_wavspecifier(
                    file_path, self.wav_scale, offset_i, dur_i
                )

            keys.append(key)
            data.append(x_i)
            fs.append(fs_i)
            self.cur_item += 1

        return keys, data, fs

    @staticmethod
    def filter_args(**kwargs):
        """Keeps only the keyword arguments accepted by this class."""
        valid_args = ("part_idx", "num_parts", "wav_scale")
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds the reader options to an argument parser.

        Args:
          parser: parser object exposing ``add_argument``.
          prefix: optional prefix; options are named ``--<prefix>.<option>``.
        """
        if prefix is None:
            p1 = "--"
        else:
            p1 = "--" + prefix + "."

        parser.add_argument(
            p1 + "wav-scale",
            default=2 ** 15 - 1,
            type=float,
            help=("multiplicative factor for waveform"),
        )
        try:
            parser.add_argument(
                p1 + "part-idx",
                type=int,
                default=1,
                help=(
                    "splits the list of files into num-parts and " "processes part-idx"
                ),
            )
            parser.add_argument(
                p1 + "num-parts",
                type=int,
                default=1,
                help=(
                    "splits the list of files into num-parts and " "processes part-idx"
                ),
            )
        except Exception:
            # Best effort: presumably these options may already have been
            # registered on the same parser by another component, in which
            # case add_argument fails and the existing definition is kept.
            # Narrowed from a bare ``except`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            pass

    add_argparse_args = add_class_args
class RandomAccessAudioReader(AudioReader):
    """Audio reader that reads recordings/segments given their keys.

    Attributes:
      file_path: scp file with format file_key wavspecifier (audio_file/pipe)
        or SCPList object.
      segments_path: segments file with format: segment_id file_id tbeg tend.
      wav_scale: multiplies signal by scale factor.
    """

    def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
        super().__init__(file_path, segments_path, wav_scale)

    def _read(self, keys, time_offset=0, time_durs=0):
        """Reads the waveforms for the recordings in keys.

        Args:
          keys: Recording/segment id or list of them.
          time_offset: Float or list of floats with the start time to read
            in each utterance.
          time_durs: Float or list of floats with the number of seconds to
            read from each utterance, 0 reads until the end.

        Returns:
          data: List of waveforms
          fs: List of sampling freq.

        Raises:
          Exception: if a key is not found.
        """
        if isinstance(keys, str):
            keys = [keys]

        offset_is_list = isinstance(time_offset, (list, np.ndarray))
        dur_is_list = isinstance(time_durs, (list, np.ndarray))

        data = []
        fs = []
        for i, key in enumerate(keys):
            offset_i = time_offset[i] if offset_is_list else time_offset
            dur_i = time_durs[i] if dur_is_list else time_durs

            if self.with_segments:
                if key not in self.segments:
                    raise Exception("Key %s not found" % key)

                segment = self.segments[key]
                x_i, fs_i = self._read_segment(segment, offset_i, dur_i)
            else:
                if key not in self.scp:
                    raise Exception("Key %s not found" % key)

                file_path, _, _ = self.scp[key]
                x_i, fs_i = self.read_wavspecifier(
                    file_path, self.wav_scale, offset_i, dur_i
                )

            data.append(x_i)
            fs.append(fs_i)

        return data, fs

    def read(self, keys, time_offset=0, time_durs=0):
        """Reads the waveforms for the recordings in keys.

        If the direct read fails, it retries reading from the offset until
        the end of file, and finally reading the full file, cutting the
        requested window in memory.

        Args:
          keys: Recording/segment id or list of them.
          time_offset: Float or list of floats with the start time to read
            in each utterance.
          time_durs: Float or list of floats with the number of seconds to
            read from each utterance, 0 reads until the end.

        Returns:
          data: List of waveforms
          fs: List of sampling freq.
        """
        try:
            x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs)
        except Exception:
            # ``except Exception`` (not a bare except) so that
            # KeyboardInterrupt/SystemExit are not turned into retries.
            if isinstance(keys, str):
                keys = [keys]

            if not isinstance(time_offset, (list, np.ndarray)):
                time_offset = [time_offset] * len(keys)
            if not isinstance(time_durs, (list, np.ndarray)):
                time_durs = [time_durs] * len(keys)

            try:
                # some files produce error in the fseek after reading the data,
                # this seems an issue from pysoundfile or soundfile lib itself
                # we try to read from
                # time-offset to the end of the file, and remove the extra frames later,
                # this solves the problem in most cases
                logging.info(
                    (
                        "error-1 reading at keys={} offset={} "
                        "retrying reading until end-of-file ..."
                    ).format(keys, time_offset)
                )
                x, fs = self._read(keys, time_offset=time_offset)
                for i, fs_i in enumerate(fs):
                    # A duration of 0 means "until the end of the file", so
                    # only truncate when a positive duration was requested
                    # (the original cut the waveform to zero samples).
                    if time_durs[i] > 0:
                        end_sample = int(time_durs[i] * fs_i)
                        x[i] = x[i][:end_sample]
            except Exception:
                # try to read the full file
                logging.info(
                    (
                        "error-2 reading at key={}, " "retrying reading full file ..."
                    ).format(keys)
                )
                x, fs = self._read(keys)
                for i, fs_i in enumerate(fs):
                    start_sample = int(time_offset[i] * fs_i)
                    if time_durs[i] > 0:
                        end_sample = start_sample + int(time_durs[i] * fs_i)
                        x[i] = x[i][start_sample:end_sample]
                    else:
                        # Duration 0: keep everything after the offset.
                        x[i] = x[i][start_sample:]

        return x, fs

    @staticmethod
    def filter_args(**kwargs):
        """Keeps only the keyword arguments accepted by this class."""
        valid_args = ("wav_scale",)
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds the reader options to an argument parser.

        Args:
          parser: parser object exposing ``add_argument``.
          prefix: optional prefix; options are named ``--<prefix>.<option>``.
        """
        if prefix is None:
            p1 = "--"
        else:
            p1 = "--" + prefix + "."

        parser.add_argument(
            p1 + "wav-scale",
            default=2 ** 15 - 1,
            type=float,
            help=("multiplicative factor for waveform"),
        )

    add_argparse_args = add_class_args