# Source code for hyperion.io.audio_reader

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import os
import logging
import io
import math
import subprocess
import soundfile as sf

import numpy as np

from ..hyp_defs import float_cpu
from ..utils import SCPList, SegmentList

# File extensions that are read directly from disk with soundfile (libsndfile).
# Every entry carries its leading dot because entries are compared against
# the extension returned by ``os.path.splitext(path)[1]``.
valid_ext = [
    ".wav",
    ".flac",
    ".ogg",
    ".au",
    ".avr",
    ".caf",
    ".htk",
    ".iff",
    ".mat",
    ".mpc",
    ".oga",
    ".pvf",
    ".rf64",
    ".sd2",
    ".sds",
    ".sf",
    ".voc",
    ".w64",
    ".wve",
    ".xi",
]


class AudioReader(object):
    """Class to read audio files from wav, flac or pipe

    Attributes:
         file_path:     scp file with format file_key wavspecifier
                        (audio_file/pipe) or SCPList object.
         segments_path: segments file with format:
                        segment_id file_id tbeg tend
         wav_scale:     multiplies signal by scale factor
    """

    def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
        """Loads the recording list and, optionally, the segment list.

        Args:
          file_path: path to the scp file or an already built SCPList.
          segments_path: path to the segments file, an already built
                         SegmentList, or None to work with full recordings.
          wav_scale: multiplicative factor applied to the waveforms.
        """
        self.file_path = file_path
        if isinstance(file_path, SCPList):
            self.scp = file_path
        else:
            self.scp = SCPList.load(file_path, sep=" ", is_wav=True)

        self.segments_path = segments_path
        if segments_path is None:
            self.segments = None
            self.with_segments = False
        else:
            self.with_segments = True
            # The type check has to look at segments_path (not file_path),
            # otherwise a ready-made SegmentList would be handed to load().
            if isinstance(segments_path, SegmentList):
                self.segments = segments_path
            else:
                self.segments = SegmentList.load(
                    segments_path, sep=" ", index_by_file=False
                )

        self.wav_scale = wav_scale

    @property
    def keys(self):
        """Segment ids when reading with segments, recording keys otherwise."""
        if self.with_segments:
            return np.asarray(self.segments["segment_id"])
        return self.scp.key

    def __enter__(self):
        """Function required when entering constructions of type

        with AudioReader('file.h5') as f:
           keys, data = f.read()
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Function required when exiting from constructions of type

        with AudioReader('file.h5') as f:
           keys, data = f.read()
        """
        pass

    @staticmethod
    def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0):
        """Reads an audiospecifier (audio_file/pipe)
           It reads from pipe or from all the files that can be read
           by `libsndfile <http://www.mega-nerd.com/libsndfile/#Features>`

        Args:
          wavspecifier: A pipe, wav, flac, ogg file etc.
          scale:        Multiplies signal by scale factor
          time_offset: float indicating the start time to read in the utterance.
          time_dur: float indicating the number of seconds to read from
                    the utterance, if 0 it reads until the end

        Returns:
          Waveform multiplied by scale and its sampling frequency.

        Raises:
          Exception: if the specifier is neither a pipe nor a file with one
                     of the extensions in valid_ext.
        """
        wavspecifier = wavspecifier.strip()
        # endswith() instead of [-1] so an empty specifier reaches the
        # "Unknown format" error below rather than raising IndexError.
        if wavspecifier.endswith("|"):
            command = wavspecifier[:-1]
            x, fs = AudioReader.read_pipe(command, scale)
            if time_offset == 0 and time_dur == 0:
                return x, fs

            # The pipe output is always read in full; the requested window
            # is cut out afterwards.
            start_sample = int(math.floor(time_offset * fs))
            num_samples = int(math.floor(time_dur * fs))
            if num_samples == 0:
                return x[start_sample:], fs

            end_sample = start_sample + num_samples
            assert end_sample <= len(x), (
                "requested end sample %d exceeds the %d samples read from pipe"
                % (end_sample, len(x))
            )
            return x[start_sample:end_sample], fs

        ext = os.path.splitext(wavspecifier)[1]
        # Lower-cased so that e.g. ".WAV" is recognized as well.
        if ext.lower() in valid_ext:
            if time_offset == 0 and time_dur == 0:
                x, fs = sf.read(wavspecifier, dtype=float_cpu())
                x *= scale
                return x, fs

            with sf.SoundFile(wavspecifier, "r") as f:
                fs = f.samplerate
                start_sample = int(math.floor(time_offset * fs))
                num_samples = int(math.floor(time_dur * fs))
                f.seek(start_sample)
                if num_samples > 0:
                    x = scale * f.read(num_samples, dtype=float_cpu())
                else:
                    x = scale * f.read(dtype=float_cpu())
                return x, fs

        raise Exception("Unknown format for %s" % (wavspecifier))

    @staticmethod
    def read_pipe(wavspecifier, scale=2 ** 15):
        """Reads wave file from a pipe

        Args:
          wavspecifier: Shell command with pipe output
          scale:        Multiplies signal by scale factor

        Returns:
          Waveform multiplied by scale and its sampling frequency.

        Raises:
          Exception: if the command exits with a non-zero return code.
        """
        # NOTE(review): the command is run through the shell (shell=True),
        # so wavspecifiers must come from a trusted scp file.
        proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE)
        pipe = proc.communicate()[0]
        if proc.returncode != 0:
            raise Exception(
                "Wave read pipe command %s returned code %d"
                % (wavspecifier, proc.returncode)
            )
        x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu())
        x *= scale
        return x, fs

    def _read_segment(self, segment, time_offset=0, time_dur=0):
        """Reads a wave segment

        Args:
          segment: pandas DataFrame (segment_id , file_id, tbeg, tend)
          time_offset: seconds added to the segment start time.
          time_dur: seconds to read from the (shifted) start; if 0 it reads
                    until the segment end.

        Returns:
          Wave, sampling frequency

        Raises:
          Exception: if the segment starts at or beyond the end of the
                     recording.
        """
        file_id = segment["file_id"]
        t_beg = segment["tbeg"] + time_offset
        t_end = segment["tend"]
        if time_dur > 0:
            t_end_new = t_beg + time_dur
            assert t_end_new <= t_end, (
                "requested end time %.2f is beyond the segment end %.2f"
                % (t_end_new, t_end)
            )
            t_end = t_end_new

        file_path, _, _ = self.scp[file_id]
        # The whole recording is read and the segment is sliced out of it.
        x_i, fs_i = self.read_wavspecifier(file_path, self.wav_scale)
        num_samples_i = len(x_i)
        s_beg = int(t_beg * fs_i)
        if s_beg >= num_samples_i:
            # Best-effort lookup so that a segment record without the id
            # field cannot hide the actual error.
            try:
                segment_id = segment["segment_id"]
            except (KeyError, IndexError):
                segment_id = "<unknown>"
            raise Exception(
                "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)"
                % (segment_id, t_beg, s_beg, file_id, num_samples_i)
            )

        s_end = int(t_end * fs_i)
        # A negative tend, or one past the end of the file, reads to the end.
        if s_end > num_samples_i or t_end < 0:
            s_end = num_samples_i

        x_i = x_i[s_beg:s_end]
        return x_i, fs_i

    def read(self):
        """Placeholder; the reader subclasses provide the implementation."""
        pass
class SequentialAudioReader(AudioReader):
    """Audio reader that walks through the recordings (or segments) in order.

    Attributes:
      cur_item: index of the next recording/segment to read.
      part_idx: index of the part of the list processed by this reader.
      num_parts: number of parts the list is split into.
    """

    def __init__(
        self,
        file_path,
        segments_path=None,
        wav_scale=2 ** 15 - 1,
        part_idx=1,
        num_parts=1,
    ):
        """Builds the reader and, if num_parts > 1, keeps only part part_idx.

        Args:
          file_path: scp file or SCPList object.
          segments_path: segments file or SegmentList object (optional).
          wav_scale: multiplicative factor applied to the waveforms.
          part_idx: part of the list to process.
          num_parts: number of parts to split the list into.
        """
        super().__init__(file_path, segments_path, wav_scale=wav_scale)
        self.cur_item = 0
        self.part_idx = part_idx
        self.num_parts = num_parts
        if self.num_parts > 1:
            # When segments are given they are what gets split, otherwise
            # the recording list is.
            if self.with_segments:
                self.segments = self.segments.split(self.part_idx, self.num_parts)
            else:
                self.scp = self.scp.split(
                    self.part_idx, self.num_parts, group_by_key=False
                )

    def __iter__(self):
        """Needed to build an iterator, e.g.:
        r = SequentialAudioReader(...)
        for key, s, fs in r:
           print(key)
           process(s)
        """
        return self

    def __next__(self):
        """Needed to build an iterator, e.g.:
        r = SequentialAudioReader(...)
        for key , s, fs in r:
           process(s)
        """
        key, x, fs = self.read(1)
        if len(key) == 0:
            raise StopIteration
        return key[0], x[0], fs[0]

    def next(self):
        """__next__ for Python 2"""
        return self.__next__()

    def reset(self):
        """Returns the file pointer to the beginning of the dataset,
        then we can start reading the features again.
        """
        self.cur_item = 0

    def eof(self):
        """End of file.

        Returns:
          True, when we have read all the recordings in the dataset.
        """
        if self.with_segments:
            return self.cur_item == len(self.segments)
        return self.cur_item == len(self.scp)

    def read(self, num_records=0, time_offset=0, time_durs=0):
        """Reads next num_records audio files

        Args:
          num_records: Number of audio files to read; 0 reads all the
                       remaining ones.
          time_offset: List of floats indicating the start time to read in
                       the utterance, or a single value used for all.
          time_durs: List of floats indicating the number of seconds to read
                     from each utterance, or a single value used for all.

        Returns:
          key: List of recording names.
          data: List of waveforms
          fs: list of sample freqs
        """
        if num_records == 0:
            if self.with_segments:
                num_records = len(self.segments) - self.cur_item
            else:
                num_records = len(self.scp) - self.cur_item

        offset_is_list = isinstance(time_offset, (list, np.ndarray))
        dur_is_list = isinstance(time_durs, (list, np.ndarray))

        keys = []
        data = []
        fs = []
        for i in range(num_records):
            # Stop early when fewer than num_records items are left.
            if self.eof():
                break

            offset_i = time_offset[i] if offset_is_list else time_offset
            dur_i = time_durs[i] if dur_is_list else time_durs

            if self.with_segments:
                segment = self.segments[self.cur_item]
                key = segment["segment_id"]
                x_i, fs_i = self._read_segment(segment, offset_i, dur_i)
            else:
                key, file_path, _, _ = self.scp[self.cur_item]
                x_i, fs_i = self.read_wavspecifier(
                    file_path, self.wav_scale, offset_i, dur_i
                )

            keys.append(key)
            data.append(x_i)
            fs.append(fs_i)
            self.cur_item += 1

        return keys, data, fs

    @staticmethod
    def filter_args(**kwargs):
        """Keeps only the keyword arguments accepted by this reader."""
        valid_args = ("part_idx", "num_parts", "wav_scale")
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds the reader options to an argument parser.

        Args:
          parser: parser object exposing add_argument.
          prefix: optional prefix; options become --<prefix>.<option>.
        """
        if prefix is None:
            p1 = "--"
        else:
            p1 = "--" + prefix + "."

        parser.add_argument(
            p1 + "wav-scale",
            default=2 ** 15 - 1,
            type=float,
            help=("multiplicative factor for waveform"),
        )
        try:
            parser.add_argument(
                p1 + "part-idx",
                type=int,
                default=1,
                help=(
                    "splits the list of files into num-parts and " "processes part-idx"
                ),
            )
            parser.add_argument(
                p1 + "num-parts",
                type=int,
                default=1,
                help=(
                    "splits the list of files into num-parts and " "processes part-idx"
                ),
            )
        except Exception as err:
            # Registration of the split options stays best-effort, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            # NOTE(review): presumably this guards against the options
            # having been registered already on a shared parser - confirm.
            logging.debug("part-idx/num-parts options not added: %s", err)

    add_argparse_args = add_class_args
class RandomAccessAudioReader(AudioReader):
    """Audio reader that retrieves recordings (or segments) by key."""

    def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1):
        """Builds the reader.

        Args:
          file_path: scp file or SCPList object.
          segments_path: segments file or SegmentList object (optional).
          wav_scale: multiplicative factor applied to the waveforms.
        """
        super().__init__(file_path, segments_path, wav_scale)

    def _read(self, keys, time_offset=0, time_durs=0):
        """Reads the waveforms for the recordings in keys.

        Args:
          keys: List of recording/segment_ids names.
          time_offset: start time(s) in seconds, a list with one value per
                       key or a single value used for all keys.
          time_durs: duration(s) in seconds, a list with one value per key
                     or a single value used for all keys; 0 reads to the end.

        Returns:
          data: List of waveforms
          fs: List of sampling freq.

        Raises:
          Exception: if a key is not found.
        """
        if isinstance(keys, str):
            keys = [keys]

        offset_is_list = isinstance(time_offset, (list, np.ndarray))
        dur_is_list = isinstance(time_durs, (list, np.ndarray))

        data = []
        fs = []
        for i, key in enumerate(keys):
            offset_i = time_offset[i] if offset_is_list else time_offset
            dur_i = time_durs[i] if dur_is_list else time_durs

            if self.with_segments:
                if key not in self.segments:
                    raise Exception("Key %s not found" % key)

                segment = self.segments[key]
                x_i, fs_i = self._read_segment(segment, offset_i, dur_i)
            else:
                if key not in self.scp:
                    raise Exception("Key %s not found" % key)

                file_path, _, _ = self.scp[key]
                x_i, fs_i = self.read_wavspecifier(
                    file_path, self.wav_scale, offset_i, dur_i
                )

            data.append(x_i)
            fs.append(fs_i)

        return data, fs

    def read(self, keys, time_offset=0, time_durs=0):
        """Reads the waveforms for the recordings in keys.

        If the direct read fails, it retries reading from time_offset to
        the end of the file and, if that fails too, reading the full file;
        the requested window is then cut out of what was read.

        Args:
          keys: List of recording/segment_ids names.
          time_offset: start time(s) in seconds, a list with one value per
                       key or a single value used for all keys.
          time_durs: duration(s) in seconds, a list with one value per key
                     or a single value used for all keys; 0 reads to the end.

        Returns:
          data: List of waveforms
          fs: List of sampling freq.
        """
        try:
            x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs)
        except Exception:
            if isinstance(keys, str):
                keys = [keys]

            if not isinstance(time_offset, (list, np.ndarray)):
                time_offset = [time_offset] * len(keys)
            if not isinstance(time_durs, (list, np.ndarray)):
                time_durs = [time_durs] * len(keys)

            try:
                # some files produce error in the fseek after reading the data,
                # this seems an issue from pysoundfile or soundfile lib itself
                # we try to read from
                # time-offset to the end of the file, and remove the extra frames later,
                # this solves the problem in most cases
                logging.info(
                    (
                        "error-1 reading at keys={} offset={} "
                        "retrying reading until end-of-file ..."
                    ).format(keys, time_offset)
                )
                x, fs = self._read(keys, time_offset=time_offset)
                for i in range(len(x)):
                    # A duration of 0 means "until the end": truncating in
                    # that case would leave an empty waveform.
                    if time_durs[i] > 0:
                        end_sample = int(time_durs[i] * fs[i])
                        x[i] = x[i][:end_sample]
            except Exception:
                # try to read the full file
                logging.info(
                    (
                        "error-2 reading at key={}, " "retrying reading full file ..."
                    ).format(keys)
                )
                x, fs = self._read(keys)
                for i in range(len(x)):
                    start_sample = int(time_offset[i] * fs[i])
                    if time_durs[i] > 0:
                        end_sample = start_sample + int(time_durs[i] * fs[i])
                        x[i] = x[i][start_sample:end_sample]
                    else:
                        # Duration 0: keep everything from the offset on.
                        x[i] = x[i][start_sample:]

        return x, fs

    @staticmethod
    def filter_args(**kwargs):
        """Keeps only the keyword arguments accepted by this reader."""
        valid_args = ("wav_scale",)
        return {k: kwargs[k] for k in valid_args if k in kwargs}

    @staticmethod
    def add_class_args(parser, prefix=None):
        """Adds the reader options to an argument parser.

        Args:
          parser: parser object exposing add_argument.
          prefix: optional prefix; options become --<prefix>.<option>.
        """
        if prefix is None:
            p1 = "--"
        else:
            p1 = "--" + prefix + "."

        parser.add_argument(
            p1 + "wav-scale",
            default=2 ** 15 - 1,
            type=float,
            help=("multiplicative factor for waveform"),
        )

    add_argparse_args = add_class_args