Source code for hyperion.io.rw_specifiers

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

 Functions to write and read kaldi files
"""

import re
from enum import Enum


class ArchiveType(Enum):
    """Types of archive: hdf5, Kaldi Ark or packed-audio files.

    The specifier parsers in this module select a member from the option
    keyword found in the specifier string ("h5", "ark", "audio",
    "segments" or "rttm").
    """

    H5 = 0  # hdf5 file ("h5" option).
    ARK = 1  # Kaldi Ark file ("ark" option).
    AUDIO = 2  # Packed-audio file ("audio" option).
    SEGMENT_LIST = 3  # Segment list file ("segments" option).
    RTTM = 4  # RTTM file ("rttm" option).


"""Documentation for "wspecifier" (taken from Kaldi).
"wspecifier" describes how we write a set of objects indexed by keys.
The basic, unadorned wspecifiers are as follows:

 h5:wxfilename
 ark:wxfilename
 scp:rxfilename
 h5,scp:filename,wxfilename
 ark,scp:filename,wxfilename


 We also allow the following modifiers:
 t means text mode.
 b means binary mode.
 f means flush the stream after writing each entry.
  (nf means don't flush, and isn't very useful as the default is to flush).
 p means permissive mode, when writing to an "scp" file only: will ignore
    missing scp entries (i.e. won't write anything for those files but will
    return success status).

 So the following are valid wspecifiers:
 ark,b,f:foo
 "ark,b,b:| gzip -c > foo"
 "ark,scp,t,nf:foo.ark,|gzip -c > foo.scp.gz"
 ark,b:-

 The meanings of rxfilename and wxfilename are as described in
 kaldi-stream.h (they are filenames but include pipes, stdin/stdout
 and so on); filename is a regular filename.


 The ark:wxfilename type of wspecifier instructs the class to
 write directly to an archive.  For small objects (e.g. lists of ints),
 the text archive format will generally be human readable with one line
 per entry in the archive.

 The type "scp:xfilename" refers to an scp file which should
 already exist on disk, and tells us where to write the data for
 each key (usually an actual file); each line of the scp file
 would be:
  key xfilename

 The type ark,scp:filename,wxfilename means
 we write both an archive and an scp file that specifies offsets into the
 archive, with lines like:
   key filename:12407
 where the number is the byte offset into the file.
 In this case we restrict the archive-filename to be an actual filename,
 as we can't see a situation where an extended filename would make sense
 for this (we can't fseek() in pipes).
"""


class WSpecType(Enum):
    """Type of Kaldi style write specifiers."""

    NO = 0  # No specifier
    ARCHIVE = 1  # Specifier contains Ark, hdf5, segment_list or rttm file.
    SCRIPT = 2  # Specifier contains scp file.
    BOTH = 3  # Specifier contains Ark/hdf5 file and scp file.


class WSpecifier(object):
    """Class to parse Kaldi style write specifier.

    Attributes:
      spec_type: WSpecType object describing the type of specifier:
                 ARCHIVE: Specifier contains an archive file only.
                 SCRIPT:  Specifier contains scp file only.
                 BOTH:    Specifier contains archive file and scp file.

      archive: output data file path (None for SCRIPT specifiers).
      script: optional output scp file.
      archive_type: ArchiveType of the data file (H5, ARK, AUDIO,
                    SEGMENT_LIST or RTTM); None when the specifier string
                    only names an scp file.
      binary: True if the Ark file is binary, False if it is a text file.
      flush: If True, it flushes the output after writing each feature matrix.
      permissive: when writing to an scp file only: will ignore
                  missing scp entries
    """

    # Option keywords that consume one file name and select the archive type.
    _ARCHIVE_OPTIONS = {
        "h5": ArchiveType.H5,
        "ark": ArchiveType.ARK,
        "audio": ArchiveType.AUDIO,
        "segments": ArchiveType.SEGMENT_LIST,
        "rttm": ArchiveType.RTTM,
    }

    def __init__(
        self,
        spec_type,
        archive,
        script,
        archive_type=ArchiveType.H5,
        binary=True,
        flush=False,
        permissive=False,
    ):
        """Stores the already-parsed fields of a write specifier.

        Args:
          spec_type: WSpecType value.
          archive: output data file path, or None.
          script: output scp file path, or None.
          archive_type: ArchiveType value of the data file.
          binary: whether the archive is written in binary mode.
          flush: whether to flush the stream after each entry.
          permissive: whether missing scp entries are ignored.
        """
        self.archive = archive
        self.script = script
        self.spec_type = spec_type
        self.archive_type = archive_type
        self.binary = binary
        self.flush = flush
        self.permissive = permissive

    @classmethod
    def create(cls, wspecifier):
        """Creates WSpecifier object from string.

        Args:
          wspecifier: Write specifier string, e.g.:
                      file.h5
                      h5:file.h5
                      ark:file.ark
                      h5,scp:file.h5,file.scp
                      ark,scp:file.ark,file.scp

        Returns:
          WSpecifier object.

        Raises:
          ValueError: if the string has more than one ":" separator, contains
            an unknown option, or names neither an archive nor an scp file.
          AssertionError: if the string is empty, an archive or scp option is
            repeated, or there are fewer file names than file options.
        """
        fields = wspecifier.strip().split(":")
        if len(fields) == 1:
            # Bare file name: archive-only specifier with default settings.
            assert len(fields[0]) > 0, "Empty wspecifier"
            return cls(WSpecType.ARCHIVE, fields[0], None)
        if len(fields) > 2:
            raise ValueError(
                "Too many fields (%d>2) in wspecifier %s" % (len(fields), wspecifier)
            )

        options = fields[0].strip().split(",")
        archives = fields[1].strip().split(",")

        archive = None
        script = None
        archive_type = None
        flush = False

        # File names are consumed left to right, one per archive/scp option.
        cur_archive = 0
        for option in options:
            if option in cls._ARCHIVE_OPTIONS:
                assert archive_type is None and archive is None, (
                    "Repeated archive option (h5, ark, audio, segments, rttm) "
                    "in wspecifier %s" % wspecifier
                )
                assert len(archives) > cur_archive, (
                    "Missing file name for option %s in wspecifier %s"
                    % (option, wspecifier)
                )
                archive_type = cls._ARCHIVE_OPTIONS[option]
                archive = archives[cur_archive]
                cur_archive += 1
            elif option == "scp":
                assert script is None, "Repeated scp in wspecifier %s" % wspecifier
                assert len(archives) > cur_archive, (
                    "Missing file name for option %s in wspecifier %s"
                    % (option, wspecifier)
                )
                script = archives[cur_archive]
                cur_archive += 1
            elif option == "f":
                flush = True
            elif option in ("b", "t", "nf", "p"):
                # "t" and "p" are applied below, once the archive type is known;
                # "b" and "nf" match the defaults.
                pass
            else:
                raise ValueError("Wrong wspecifier options %s" % fields[0])

        if archive is None:
            if script is None:
                raise ValueError("Wrong wspecifier %s " % wspecifier)
            spec_type = WSpecType.SCRIPT
        elif script is None:
            spec_type = WSpecType.ARCHIVE
        else:
            spec_type = WSpecType.BOTH

        # Text mode and permissive mode are only honoured for Ark archives.
        is_ark = archive_type == ArchiveType.ARK
        binary = not (is_ark and "t" in options)
        permissive = is_ark and "p" in options

        return cls(spec_type, archive, script, archive_type, binary, flush, permissive)

    def __eq__(self, other):
        """Equal operator: True when every attribute matches."""
        if not isinstance(other, WSpecifier):
            # Let Python try the reflected comparison / identity fallback
            # instead of failing with AttributeError on foreign types.
            return NotImplemented
        return (
            self.archive == other.archive
            and self.script == other.script
            and self.spec_type == other.spec_type
            and self.archive_type == other.archive_type
            and self.binary == other.binary
            and self.flush == other.flush
            and self.permissive == other.permissive
        )

    def __ne__(self, other):
        """Non-equal operator."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __cmp__(self, other):
        """Comparison operator: 0 if equal, 1 otherwise."""
        return 0 if self == other else 1


"""Documentation for "rspecifier" (Taken from Kaldi)
"rspecifier" describes how we read a set of objects indexed by keys.
The possibilities are:

h5:rxfilename
ark:rxfilename
scp:rxfilename

We also allow various modifiers:
  o   means the program will only ask for each key once, which enables
      the reader to discard already-asked-for values.
  s   means the keys are sorted on input (means we don't have to read till
      eof if someone asked for a key that wasn't there).
  cs  means that it is called in sorted order (we are generally asserting
      this based on knowledge of how the program works).
  p   means "permissive", and causes it to skip over keys whose corresponding
      scp-file entries cannot be read. [and to ignore errors in archives and
      script files, and just consider the "good" entries].
      We allow the negation of the options above, as in no, ns, np,
      but these aren't currently very useful (just equivalent to omitting the
      corresponding option).
      [any of the above options can be prefixed by n to negate them, e.g. no,
      ns, ncs, np; but these aren't currently useful as you could just omit
      the option].
  bg means "background".  It currently has no effect for random-access readers,
      but for sequential readers it will cause it to "read ahead" to the next
      value, in a background thread.  Recommended when reading larger objects
      such as neural-net training examples, especially when you want to
      maximize GPU usage.

  b   is ignored [for scripting convenience]
  t   is ignored [for scripting convenience]


 So for instance the following would be a valid rspecifier:

  "o, s, p, ark:gunzip -c foo.gz|"
"""


class RSpecType(Enum):
    """Type of Kaldi style read specifiers."""

    NO = 0  # No specifier
    ARCHIVE = 1  # Specifier points to an archive (data) file.
    SCRIPT = 2  # Specifier points to an scp file.


class RSpecifier(object):
    """Class to parse Kaldi style read specifier.

    Attributes:
      spec_type: RSpecType value (ARCHIVE or SCRIPT).
      archive: path of the file named in the specifier; for SCRIPT
               specifiers this is the scp file itself.
      archive_type: ArchiveType of the data (H5, ARK, AUDIO, SEGMENT_LIST
                    or RTTM).
      once: "o" option, only honoured for Ark data.
      is_sorted: "s" option, only honoured for Ark data.
      called_sorted: "cs" option, only honoured for Ark data.
      permissive: "p" option.
      background: "bg" option, only honoured for Ark data.
    """

    # Option keywords that select an archive specifier and its archive type.
    _ARCHIVE_OPTIONS = {
        "h5": ArchiveType.H5,
        "ark": ArchiveType.ARK,
        "audio": ArchiveType.AUDIO,
        "segments": ArchiveType.SEGMENT_LIST,
        "rttm": ArchiveType.RTTM,
    }

    def __init__(
        self,
        spec_type,
        archive,
        archive_type=ArchiveType.H5,
        once=False,
        is_sorted=False,
        called_sorted=False,
        permissive=False,
        background=False,
    ):
        """Stores the already-parsed fields of a read specifier."""
        self.spec_type = spec_type
        self.archive = archive
        self.archive_type = archive_type
        self.once = once
        self.is_sorted = is_sorted
        self.called_sorted = called_sorted
        self.permissive = permissive
        self.background = background

    @property
    def script(self):
        """Alias of `archive` (the scp path for SCRIPT specifiers)."""
        return self.archive

    @staticmethod
    def _scp_archive_type(scp_file):
        """Guesses the archive type from the first line of an scp file.

        The second whitespace-separated column of the first line is matched
        against file-name patterns; when the line is missing, has a single
        column, or matches no pattern, ARK is returned.
        """
        with open(scp_file, "r") as f:
            columns = f.readline().split()
        scp_f2 = columns[1] if len(columns) > 1 else ""

        if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None:
            return ArchiveType.H5
        if re.match(r".*\.ark:.*$", scp_f2) is not None:
            return ArchiveType.ARK
        if re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None:
            return ArchiveType.AUDIO
        return ArchiveType.ARK

    @classmethod
    def create(cls, rspecifier):
        """Creates RSpecifier object from string.

        Args:
          rspecifier: Read specifier string, e.g.:
                      file.h5
                      h5:file.h5
                      ark:file.ark
                      scp:file.scp
                      ark,p:file.ark

        Returns:
          RSpecifier object.

        Raises:
          ValueError: if the string has more than one ":" separator or
            contains an unknown option.
          AssertionError: if the string is empty, names more than one file,
            repeats an archive/scp option, or has none of them.
        """
        fields = rspecifier.strip().split(":")
        if len(fields) == 1:
            # Bare file name: archive specifier with default settings.
            assert len(fields[0]) > 0, "Empty rspecifier"
            return cls(RSpecType.ARCHIVE, fields[0])
        if len(fields) > 2:
            raise ValueError(
                "Too many fields (%d>2) in rspecifier %s" % (len(fields), rspecifier)
            )

        options = fields[0].strip().split(",")
        archives = fields[1].strip().split(",")
        assert len(archives) == 1, (
            "Expected a single file name in rspecifier %s" % rspecifier
        )
        archive = archives[0]

        spec_type = None
        archive_type = None
        permissive = False

        for option in options:
            if option in cls._ARCHIVE_OPTIONS:
                assert spec_type is None, (
                    "Repeated archive/scp option in rspecifier %s" % rspecifier
                )
                spec_type = RSpecType.ARCHIVE
                archive_type = cls._ARCHIVE_OPTIONS[option]
            elif option == "scp":
                assert spec_type is None, (
                    "Repeated archive/scp option in rspecifier %s" % rspecifier
                )
                spec_type = RSpecType.SCRIPT
            elif option == "p":
                permissive = True
            elif option in ("o", "s", "cs", "bg", "b", "t"):
                # o/s/cs/bg are applied below, once the archive type is known;
                # b and t are accepted and ignored for scripting convenience.
                pass
            else:
                raise ValueError("Wrong rspecifier options %s" % fields[0])

        assert spec_type is not None, "Wrong rspecifier options %s" % fields[0]

        if spec_type == RSpecType.SCRIPT:
            # The scp option does not name the data format: sniff it.
            archive_type = cls._scp_archive_type(archive)

        # Reading-order hints are only honoured for Ark data.
        is_ark = archive_type == ArchiveType.ARK
        once = is_ark and "o" in options
        is_sorted = is_ark and "s" in options
        called_sorted = is_ark and "cs" in options
        background = is_ark and "bg" in options

        return cls(
            spec_type,
            archive,
            archive_type,
            once,
            is_sorted,
            called_sorted,
            permissive,
            background,
        )

    def __eq__(self, other):
        """Equal operator: True when every attribute matches."""
        if not isinstance(other, RSpecifier):
            # Let Python try the reflected comparison / identity fallback
            # instead of failing with AttributeError on foreign types.
            return NotImplemented
        return (
            self.spec_type == other.spec_type
            and self.archive == other.archive
            and self.archive_type == other.archive_type
            and self.once == other.once
            and self.is_sorted == other.is_sorted
            and self.called_sorted == other.called_sorted
            and self.permissive == other.permissive
            and self.background == other.background
        )

    def __ne__(self, other):
        """Non-equal operator."""
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result