# Copyright 2021 Sean Robertson
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Interfaces for Kaldi's readers and writers
This subpackage contains a factory function, :func:`open`, which is intended to behave
similarly to python's built-in :func:`open` factory. :func:`open` gives the specifics
behind Kaldi's different read/write styles. Here, they are described in a general way.
Kaldi's streams can be very exotic, including regular files, file offsets, stdin/out,
and pipes.
Data can be read/written from a binary or text stream in the usual way: specific data
types have specific encodings, and data are packed/unpacked in that fashion. While an
appropriate style for a fixed sequence of data, variable-length sequences of data are
encoded using the table analogy.
Kaldi uses the table analogy to store and retrieve indexed data. In a nutshell, Kaldi
uses archive ("ark") files to store binary or text data, and script files ("scp") to
point *into* archives. Both use whitespace-free strings as keys. Scripts and archives
do not have any built-in type checking, so it is necessary to specify the input/output
type when the files are opened.
A full account of Kaldi IO can be found on Kaldi's website under `Kaldi I/O Mechanisms
<http://kaldi-asr.org/doc/io.html>`_.
See Also
--------
pydrobert.kaldi.io.enums.KaldiDataType
For more information on the types of streams that can be read or
written
"""
import abc

from typing import TYPE_CHECKING
from typing import Optional

if TYPE_CHECKING:
    from pydrobert.kaldi.io.enums import KaldiDataType
# Names exported by a star-import of this package: the two objects defined in this
# file (``KaldiIOBase`` and ``open``) followed by what are presumably the subpackage's
# submodules. ``util``, ``enums``, ``duck_streams`` and ``table_streams`` are imported
# from ``pydrobert.kaldi.io`` elsewhere in this file; ``argparse`` and ``corpus`` are
# not referenced here -- NOTE(review): confirm they exist as submodules.
__all__ = [
    "KaldiIOBase",
    "open",
    "argparse",
    "corpus",
    "duck_streams",
    "enums",
    "table_streams",
    "util",
]
class KaldiIOBase(metaclass=abc.ABCMeta):
    """IOBase for kaldi readers and writers

    Similar to :class:`io.IOBase`, but without a lot of the assumed functionality.

    Arguments
    ---------
    path
        The path passed to :func:`pydrobert.kaldi.io.open`. One of an rspecifier,
        wspecifier, rxfilename, or wxfilename

    Attributes
    ----------
    path
        The opened path
    table_type
        The type of table that's being read/written (or ``NotATable``)
    xfilenames
        The extended file names being read/written. For tables, this
        excludes the ``'ark:'`` and ``'scp:'`` prefixes from path.
        Usually there will be only one extended file name, unless the
        path uses the special ``'ark,scp:'`` format to write both an
        archive and script at the same time
    xtypes
        The type of extended file name opened. Usually there will be
        only one extended file name, unless the path uses the special
        ``'ark,scp:'`` format to write both an archive and script at
        the same time
    binary
        Whether this stream encodes binary data (or text)
    closed
        Whether this stream is closed
    permissive
        Whether invalid values will be treated as non-existent (tables only)
    once
        Whether each entry will only be read once (readable tables only)
    sorted
        Whether keys are sorted (readable tables only)
    called_sorted
        Whether entries will be read in sorted order (readable tables only)
    background
        Whether reading is not being performed on the main thread (readable tables only)
    flush
        Whether the stream is flushed after each write operation (writable tables only)
    """

    def __init__(self, path: str):
        # Imported lazily rather than at module level; presumably this avoids a
        # circular import between the package and its ``util`` submodule.
        from pydrobert.kaldi.io.util import parse_kaldi_input_path
        from pydrobert.kaldi.io.util import parse_kaldi_output_path

        self.path = path
        self.closed = False

        # ``readable()`` is implemented by the concrete subclass and decides whether
        # the path is parsed as an input or an output specifier.
        parse_path = (
            parse_kaldi_input_path if self.readable() else parse_kaldi_output_path
        )

        # NOTE(review): the class docstring advertises ``table_type``, ``xfilenames``
        # and ``xtypes``, but the values are stored under underscore-prefixed names
        # here -- confirm how the public names are exposed.
        (
            self._table_type,
            self._xfilenames,
            self._xtypes,
            options,
        ) = parse_path(path)

        # Default value; an entry named ``binary`` in ``options`` overrides it below.
        self.binary = True

        # Every parsed option becomes an instance attribute of the same name.
        for key, value in options.items():
            setattr(self, key, value)

        super().__init__()

    @abc.abstractmethod
    def close(self):
        """Close and flush the underlying IO object

        This method has no effect if the file is already closed
        """

    @abc.abstractmethod
    def readable(self) -> bool:
        """Return whether this object was opened for reading"""

    @abc.abstractmethod
    def writable(self) -> bool:
        """Return whether this object was opened for writing"""

    def __enter__(self) -> "KaldiIOBase":
        # Context-manager entry hands back the stream itself.
        return self

    def __exit__(self, exception_type, exception_val, trace) -> None:
        # Always close on exit. Returning ``None`` lets any in-flight exception
        # propagate to the caller.
        self.close()
def open(
    path: str,
    kaldi_dtype: Optional["KaldiDataType"] = None,
    mode: str = "r",
    error_on_str: bool = True,
    utt2spk: str = "",
    value_style: str = "b",
    header: bool = True,
    cache: bool = False,
) -> "KaldiIOBase":
    """Factory function for initializing and opening kaldi streams

    This function provides a general interface for opening kaldi streams. Kaldi streams
    are either simple input/output of kaldi objects (the basic/duck stream) or key-value
    readers and writers (tables).

    When `path` starts with ``'ark:'`` or ``'scp:'`` (possibly with modifiers before the
    colon), a table is opened. Otherwise, a basic stream is opened.

    Arguments
    ---------
    path
        The specifier or extended file name to open
    kaldi_dtype
        Forwarded to
        :func:`pydrobert.kaldi.io.table_streams.open_table_stream`; not used when a
        basic stream is opened
    mode
        If it contains ``'r'``, `path` is parsed as an input path; otherwise it is
        parsed as an output path. Forwarded unchanged to whichever opener is chosen
    error_on_str
        Forwarded to the table opener only
    utt2spk
        Forwarded to the table opener only
    value_style
        Forwarded to the table opener only
    header
        Forwarded to
        :func:`pydrobert.kaldi.io.duck_streams.open_duck_stream` only
    cache
        Forwarded to the table opener only

    Returns
    -------
    KaldiIOBase
        The stream returned by the chosen opener

    See also
    --------
    pydrobert.kaldi.io.table_streams.open_table_stream
        For information on opening tables
    pydrobert.kaldi.io.duck_streams.open_duck_stream
        For information on opening basic streams
    """
    # Imported lazily rather than at module level; presumably this avoids circular
    # imports between the package and its submodules.
    from pydrobert.kaldi.io.enums import TableType
    from pydrobert.kaldi.io.util import parse_kaldi_input_path
    from pydrobert.kaldi.io.util import parse_kaldi_output_path
    from pydrobert.kaldi.io.duck_streams import open_duck_stream
    from pydrobert.kaldi.io.table_streams import open_table_stream

    # Only the first element of the parse result (the table type) is needed to
    # decide which kind of stream to open.
    parse_path = parse_kaldi_input_path if "r" in mode else parse_kaldi_output_path
    table_type = parse_path(path)[0]

    if table_type == TableType.NotATable:
        return open_duck_stream(path, mode=mode, header=header)

    return open_table_stream(
        path,
        kaldi_dtype,
        mode=mode,
        error_on_str=error_on_str,
        utt2spk=utt2spk,
        value_style=value_style,
        cache=cache,
    )