Source code for hyperion.io.ark_data_writer

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import sys
import numpy as np

from ..hyp_defs import float_save
from ..utils.scp_list import SCPList
from ..utils.kaldi_io_funcs import is_token, write_token, init_kaldi_output_stream
from ..utils.kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix
from .data_writer import DataWriter


class ArkDataWriter(DataWriter):
    """Class to write Ark feature files.

    Attributes:
      archive_path: output data file path.
      script_path: optional output scp file.
      binary: True if the Ark file is binary, False if it is a text file.
      flush: If True, it flushes the output after writing each feature file.
      compress: If True, it uses Kaldi compression.
      compression_method: Kaldi compression method:
          {auto (default), speech_feat,
           2byte-auto, 2byte-signed-integer,
           1byte-auto, 1byte-unsigned-integer, 1byte-0-1}.
      scp_sep: Separator for scp files (default ' ').
    """

    def __init__(self, archive_path, script_path=None, binary=True, **kwargs):
        """Opens the output archive and, optionally, the scp file.

        Args:
          archive_path: output Ark file path.
          script_path: optional output scp file path; no scp file is
              written when it is None.
          binary: if True the archive is opened in binary mode ("wb"),
              otherwise in text mode ("w").
          **kwargs: forwarded unchanged to DataWriter.__init__.
        """
        super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs)
        self.binary = binary
        self.f = open(archive_path, "wb" if binary else "w")
        self.f_script = None
        if script_path is not None:
            try:
                self.f_script = open(script_path, "w")
            except Exception:
                # Do not leak the archive handle when the scp file
                # cannot be opened.
                self.f.close()
                raise

    def __exit__(self, exc_type, exc_value, traceback):
        """Function required when exiting from constructions of type

           with ArkDataWriter('file.ark') as f:
              f.write(key, data)

        It closes the output file.
        """
        self.close()

    def close(self):
        """Closes the output file and the scp file, if there is one."""
        try:
            self.f.close()
        finally:
            # The scp file is closed even if closing the archive fails.
            if self.f_script is not None:
                self.f_script.close()

    def flush(self):
        """Flushes the output file and the scp file, if there is one."""
        self.f.flush()
        if self.f_script is not None:
            self.f_script.flush()

    def _convert_data(self, data):
        """Converts the feature matrix from numpy array to KaldiMatrix
        or KaldiCompressedMatrix.

        Args:
          data: numpy array, KaldiMatrix or KaldiCompressedMatrix.

        Returns:
          KaldiCompressedMatrix when self.compress is true, otherwise an
          uncompressed matrix object (KaldiMatrix for ndarray/KaldiMatrix
          inputs, the result of to_matrix() for compressed inputs).

        Raises:
          ValueError: if data is none of the supported types.
        """
        if isinstance(data, np.ndarray):
            # Cast to the storage float type; copy=False avoids a copy
            # when the dtype already matches.
            data = data.astype(float_save(), copy=False)
            if self.compress:
                return KaldiCompressedMatrix.compress(data, self.compression_method)
            return KaldiMatrix(data)

        if isinstance(data, KaldiMatrix):
            if self.compress:
                return KaldiCompressedMatrix.compress(data, self.compression_method)
            return data

        if isinstance(data, KaldiCompressedMatrix):
            # Already-compressed data is kept as is when compression is
            # requested, and expanded otherwise.
            if not self.compress:
                return data.to_matrix()
            return data

        raise ValueError("Data is not ndarray or KaldiMatrix")

    def write(self, keys, data):
        """Writes data to file.

        Args:
          keys: List of recording names, or a single name as a str.
          data: List of feature matrices or vectors.
                If all the matrices have the same dimension
                it can be a 3D numpy array.
                If they are vectors, it can be a 2D numpy array.
                When keys is a single str, data is the single item
                for that key.

        Raises:
          AssertionError: if a key is not a valid token.
          ValueError: if an item of data has an unsupported type.
        """
        if isinstance(keys, str):
            keys = [keys]
            data = [data]

        for i, key_i in enumerate(keys):
            # Explicit raise instead of `assert` so the check is not
            # stripped under `python -O`; the exception type is unchanged.
            if not is_token(key_i):
                raise AssertionError("Token %s not valid" % key_i)
            write_token(self.f, self.binary, key_i)
            # Offset right after the key token; this is the position
            # recorded in the scp file.
            pos = self.f.tell()
            data_i = self._convert_data(data[i])

            init_kaldi_output_stream(self.f, self.binary)
            data_i.write(self.f, self.binary)

            if self.f_script is not None:
                self.f_script.write(
                    "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos)
                )

            # NOTE(review): _flush is not set in this class; presumably
            # DataWriter.__init__ sets it from the `flush` option.
            if self._flush:
                self.flush()