# Source code for pydrobert.kaldi.io.util

# Copyright 2021 Sean Robertson

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Kaldi I/O utilities"""

from typing import Any, Tuple
import numpy as np

from pydrobert.kaldi.io.enums import KaldiDataType
from pydrobert.kaldi.io.enums import RxfilenameType
from pydrobert.kaldi.io.enums import TableType
from pydrobert.kaldi.io.enums import WxfilenameType

import pydrobert.kaldi._internal as _i  # type: ignore

__all__ = [
    "parse_kaldi_input_path",
    "parse_kaldi_output_path",
    "infer_kaldi_data_type",
]


def parse_kaldi_input_path(path: str) -> Tuple[TableType, str, RxfilenameType, dict]:
    """Determine the characteristics of an input stream by its path

    Returns a 4-tuple of the following information:

    1. If path is not an rspecifier (``TableType.NotATable``):

       a. Classify path as an rxfilename
       b. return a tuple of ``(TableType, path, RxfilenameType,
          dict())``

    2. else:

       a. Put all rspecifier options (once, sorted, called_sorted,
          permissive, background) into a dictionary
       b. Extract the embedded rxfilename and classify it
       c. return a tuple of ``(TableType, rxfilename,
          RxfilenameType, options)``

    Parameters
    ----------
    path
        A string that would be passed to ``pydrobert.kaldi.io.open``

    Returns
    -------
    tuple
        ``(table_type, rxfilename, rx_type, options)``. `options` is empty
        when `table_type` is ``TableType.NotATable``; otherwise it has the
        keys ``"once"``, ``"sorted"``, ``"called_sorted"``,
        ``"permissive"``, and ``"background"``.
    """
    # The native parser returns a flat sequence: the first three entries are
    # the table type, the rxfilename, and the rxfilename type; entries 3-7
    # carry the rspecifier options read below.
    cpp_ret = _i.ParseInputPath(path)
    table_type = TableType(cpp_ret[0])
    rxfilename = cpp_ret[1]
    rx_type = RxfilenameType(cpp_ret[2])
    if table_type == TableType.NotATable:
        # a plain rxfilename carries no rspecifier options
        options = {}
    else:
        options = {
            "once": cpp_ret[3],
            "sorted": cpp_ret[4],
            "called_sorted": cpp_ret[5],
            "permissive": cpp_ret[6],
            "background": cpp_ret[7],
        }
    return (table_type, rxfilename, rx_type, options)


def parse_kaldi_output_path(path: str) -> Tuple[TableType, str, WxfilenameType, dict]:
    """Determine the characteristics of an output stream by its path

    Returns a 4-tuple of the following information

    1. If path is not a wspecifier (``TableType.NotATable``)

       a. Classify path as a wxfilename
       b. return a tuple of ``(TableType, path, WxfilenameType,
          dict())``

    2. If path is an archive or script

       a. Put all wspecifier options (binary, flush, permissive)
          into a dictionary
       b. Extract the embedded wxfilename and classify it
       c. return a tuple of ``(TableType, wxfilename,
          WxfilenameType, options)``

    3. If path contains both an archive and a script
       (``TableType.BothTables``)

       a. Put all wspecifier options (binary, flush, permissive)
          into a dictionary
       b. Extract both embedded wxfilenames and classify them
       c. return a tuple of
          ``(TableType, (arch_wxfilename, script_wxfilename),
          (arch_WxfilenameType, script_WxfilenameType), options)``


    Parameters
    ----------
    path
        A string that would be passed to :func:`pydrobert.kaldi.io.open`

    Returns
    -------
    tuple
        ``(table_type, wxfilenames, wx_types, options)``. Note that in the
        ``TableType.BothTables`` case the second and third elements are
        themselves 2-tuples, which the return annotation does not express.
    """
    cpp_ret = _i.ParseOutputPath(path)
    table_type = TableType(cpp_ret[0])
    if table_type == TableType.BothTables:
        # two wxfilenames followed by their two types; both are returned as
        # tuples so the pair shapes match each other and the docstring
        wxfilenames = tuple(cpp_ret[1:3])
        wx_types = tuple(WxfilenameType(wx) for wx in cpp_ret[3:5])
    else:
        wxfilenames = cpp_ret[1]
        wx_types = WxfilenameType(cpp_ret[2])
    if table_type == TableType.NotATable:
        # a plain wxfilename carries no wspecifier options
        options = {}
    else:
        # the options are read from the tail because the number of leading
        # entries differs between the single-table and both-tables layouts
        options = {
            "binary": cpp_ret[-3],
            "flush": cpp_ret[-2],
            "permissive": cpp_ret[-1],
        }
    return (table_type, wxfilenames, wx_types, options)


def infer_kaldi_data_type(obj: Any) -> KaldiDataType:
    """Infer the appropriate kaldi data type for this object

    The following map is used (in order):

    +------------------------------+---------------------+
    | Object                       | KaldiDataType       |
    +==============================+=====================+
    | a boolean                    | Bool                |
    +------------------------------+---------------------+
    | an int                       | Int32               |
    +------------------------------+---------------------+
    | a float\\*                    | Base                |
    +------------------------------+---------------------+
    | str                          | Token               |
    +------------------------------+---------------------+
    | 2-dim numpy array float32    | FloatMatrix         |
    +------------------------------+---------------------+
    | 1-dim numpy array float32    | FloatVector         |
    +------------------------------+---------------------+
    | 2-dim numpy array float64    | DoubleMatrix        |
    +------------------------------+---------------------+
    | 1-dim numpy array float64    | DoubleVector        |
    +------------------------------+---------------------+
    | 1-dim numpy array of int32   | Int32Vector         |
    +------------------------------+---------------------+
    | 2-dim numpy array of int32\\* | Int32VectorVector   |
    +------------------------------+---------------------+
    | (matrix-like, float or int)  | WaveMatrix**        |
    +------------------------------+---------------------+
    | an empty container           | BaseMatrix          |
    +------------------------------+---------------------+
    | container of str             | TokenVector         |
    +------------------------------+---------------------+
    | 1-dim py container of ints   | Int32Vector         |
    +------------------------------+---------------------+
    | 2-dim py container of ints\\* | Int32VectorVector   |
    +------------------------------+---------------------+
    | 2-dim py container of pairs  | BasePairVector      |
    | of floats                    |                     |
    +------------------------------+---------------------+
    | matrix-like python container | DoubleMatrix        |
    +------------------------------+---------------------+
    | vector-like python container | DoubleVector        |
    +------------------------------+---------------------+

    \\*The same data types could represent a ``Double`` or an
    ``Int32PairVector``, respectively. Care should be taken in these
    cases.

    \\*\\*The first element is the wave data, the second its sample
    frequency. This function recognizes the wave data when it is a 2d
    numpy float array of the same precision as
    ``KaldiDataType.BaseMatrix``.

    Parameters
    ----------
    obj
        The object whose kaldi data type should be inferred

    Returns
    -------
    pydrobert.kaldi.io.enums.KaldiDataType or None
        ``None`` is returned when no entry of the table matches
    """
    # bool is a subclass of int, so it has to be tested before int or the
    # Bool branch could never be taken
    if isinstance(obj, bool):
        return KaldiDataType.Bool
    elif isinstance(obj, int):
        return KaldiDataType.Int32
    elif isinstance(obj, float):
        return KaldiDataType.Base
    elif isinstance(obj, str):
        return KaldiDataType.Token
    # the remainder are expected to be containers
    if not hasattr(obj, "__len__"):
        return None
    # numpy array?
    try:
        ndim, dtype = len(obj.shape), obj.dtype
    except AttributeError:
        ndim = dtype = None
    if ndim is not None:
        if ndim == 1:
            if dtype == np.float32:
                return KaldiDataType.FloatVector
            elif dtype == np.float64:
                return KaldiDataType.DoubleVector
            elif dtype == np.int32:
                return KaldiDataType.Int32Vector
        elif ndim == 2:
            if dtype == np.float32:
                return KaldiDataType.FloatMatrix
            elif dtype == np.float64:
                return KaldiDataType.DoubleMatrix
            elif dtype == np.int32:
                return KaldiDataType.Int32VectorVector
        # any other array falls through to the generic container checks
    else:
        # wav tuple? This is checked separately from the array branch: a
        # plain tuple has no ``shape``, so nesting the test under the array
        # checks would make it unreachable.
        try:
            if KaldiDataType.BaseMatrix.is_double:
                wave_dtype = np.float64
            else:
                wave_dtype = np.float32
            is_wave = (
                len(obj) == 2
                and len(obj[0].shape) == 2
                and obj[0].dtype == wave_dtype
                and isinstance(obj[1], (int, float))
            )
        except (AttributeError, IndexError, KeyError, TypeError):
            # not indexable by position, or first element is not an array
            is_wave = False
        if is_wave:
            return KaldiDataType.WaveMatrix
    if not len(obj):
        return KaldiDataType.BaseMatrix
    elif all(isinstance(x, str) for x in obj):
        return KaldiDataType.TokenVector
    elif all(isinstance(x, int) for x in obj):
        return KaldiDataType.Int32Vector
    elif all(hasattr(x, "__len__") and hasattr(x, "__getitem__") for x in obj):
        if all(all(isinstance(y, int) for y in x) for x in obj):
            return KaldiDataType.Int32VectorVector
        try:
            if all(len(x) == 2 and all(np.isreal(y) for y in x) for x in obj):
                return KaldiDataType.BasePairVector
            elif len(np.array(obj).astype(np.float64).shape) == 2:
                return KaldiDataType.DoubleMatrix
        except (ValueError, TypeError):
            # ragged or non-numeric contents: not a matrix
            pass
    else:
        try:
            if len(np.array(obj).astype(np.float64).shape) == 1:
                return KaldiDataType.DoubleVector
        except (ValueError, TypeError):
            # ragged or non-numeric contents: not a vector
            pass
    return None