# Source code for pydrobert.kaldi.io.argparse

# Copyright 2021 Sean Robertson

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Contains a custom ArgumentParser, KaldiParser, and a number of arg types"""

import argparse
import logging
import sys
import shlex
from typing import Any, List, Optional, Sequence, TextIO

import numpy as np

from pydrobert import kaldi
from pydrobert.kaldi.io import enums as kaldi_io_enums
from pydrobert.kaldi.io import util as kaldi_io_util
from pydrobert.kaldi.logging import kaldi_lvl_to_logging_lvl
from pydrobert.kaldi.logging import kaldi_vlog_level_cmd_decorator

# Module metadata
__author__ = "Sean Robertson"
__email__ = "sdrobert@cs.toronto.edu"
__license__ = "Apache 2.0"
__copyright__ = "Copyright 2017 Sean Robertson"

# Names exported by ``from pydrobert.kaldi.io.argparse import *``: the
# argument-type callables, the config-file reader, the verbosity action, and
# the parser class defined in this module.
__all__ = [
    "kaldi_rspecifier_arg_type",
    "kaldi_wspecifier_arg_type",
    "kaldi_rxfilename_arg_type",
    "kaldi_wxfilename_arg_type",
    "kaldi_dtype_arg_type",
    "kaldi_config_arg_type",
    "numpy_dtype_arg_type",
    "kaldi_bool_arg_type",
    "parse_kaldi_config_file",
    "KaldiVerbosityAction",
    "KaldiParser",
]


def kaldi_rspecifier_arg_type(string: str) -> str:
    """Argument type that accepts only rspecifiers

    The string is handed to ``parse_kaldi_input_path``; if the reported table
    type is ``TableType.NotATable``, the argument is rejected.

    Parameters
    ----------
    string
        The raw command-line value

    Returns
    -------
    str
        `string`, unchanged

    Raises
    ------
    argparse.ArgumentTypeError
        If `string` does not parse as a table
    """
    kind, _, _, _ = kaldi_io_util.parse_kaldi_input_path(string)
    if kind != kaldi_io_enums.TableType.NotATable:
        return string
    raise argparse.ArgumentTypeError("Not a valid rspecifier")


def kaldi_wspecifier_arg_type(string: str) -> str:
    """Argument type that accepts only wspecifiers

    The string is handed to ``parse_kaldi_output_path``; if the reported table
    type is ``TableType.NotATable``, the argument is rejected.

    Parameters
    ----------
    string
        The raw command-line value

    Returns
    -------
    str
        `string`, unchanged

    Raises
    ------
    argparse.ArgumentTypeError
        If `string` does not parse as a table
    """
    kind, _, _, _ = kaldi_io_util.parse_kaldi_output_path(string)
    if kind != kaldi_io_enums.TableType.NotATable:
        return string
    raise argparse.ArgumentTypeError("Not a valid wspecifier")


def kaldi_rxfilename_arg_type(string: str) -> str:
    """Argument type that accepts only extended readable file names

    Parameters
    ----------
    string
        The raw command-line value

    Returns
    -------
    str
        `string`, unchanged

    Raises
    ------
    argparse.ArgumentTypeError
        If ``parse_kaldi_input_path`` reports that `string` is a table, or
        that its rxfilename type is ``RxfilenameType.InvalidInput``
    """
    parsed_table, _, parsed_rx, _ = kaldi_io_util.parse_kaldi_input_path(string)
    # tables are rejected first: an rspecifier is not a plain file name
    if parsed_table != kaldi_io_enums.TableType.NotATable:
        message = (
            "Expected an extended file name, got an rspecifier (starts with "
            "'ark:' or 'scp:')"
        )
        raise argparse.ArgumentTypeError(message)
    if parsed_rx == kaldi_io_enums.RxfilenameType.InvalidInput:
        raise argparse.ArgumentTypeError("Not a valid rxfilename")
    return string


def kaldi_wxfilename_arg_type(string: str) -> str:
    """argument type to make sure string is a valid extended writable file

    Parameters
    ----------
    string
        The raw command-line value

    Returns
    -------
    str
        `string`, unchanged

    Raises
    ------
    argparse.ArgumentTypeError
        If ``parse_kaldi_output_path`` reports that `string` is a table, or
        that its wxfilename type is ``WxfilenameType.InvalidOutput``
    """
    table_type, _, wxfilename_type, _ = kaldi_io_util.parse_kaldi_output_path(string)
    if table_type != kaldi_io_enums.TableType.NotATable:
        # this is the *output* path check, so the offending value is a
        # wspecifier, not an rspecifier
        raise argparse.ArgumentTypeError(
            "Expected an extended file name, got a wspecifier (starts with "
            "'ark:' or 'scp:')"
        )
    elif wxfilename_type == kaldi_io_enums.WxfilenameType.InvalidOutput:
        raise argparse.ArgumentTypeError("Not a valid wxfilename")
    return string


def kaldi_dtype_arg_type(string: str) -> kaldi_io_enums.KaldiDataType:
    """Argument type converting a string into a ``KaldiDataType`` member

    Parameters
    ----------
    string
        The raw command-line value

    Returns
    -------
    KaldiDataType
        The enum member constructed from `string`

    Raises
    ------
    argparse.ArgumentTypeError
        If the enum constructor rejects `string` with a ``ValueError``. The
        message lists the value of every member of the enum.
    """
    try:
        return kaldi_io_enums.KaldiDataType(string)
    except ValueError:
        choices = ",".join(
            "'{}'".format(member.value) for member in kaldi_io_enums.KaldiDataType
        )
        raise argparse.ArgumentTypeError(
            "Invalid kaldi data type (must be one of {})".format(choices)
        )


def kaldi_bool_arg_type(string: str) -> bool:
    """Argument type mapping ``"true"``/``"t"`` and ``"false"``/``"f"`` to bools

    Parameters
    ----------
    string
        The raw command-line value. Matching is exact (case-sensitive).

    Returns
    -------
    bool
        ``True`` for ``"true"`` or ``"t"``, ``False`` for ``"false"`` or
        ``"f"``

    Raises
    ------
    argparse.ArgumentTypeError
        If `string` is none of the four accepted spellings
    """
    truthy = ("true", "t")
    falsy = ("false", "f")
    if string in truthy:
        return True
    if string in falsy:
        return False
    raise argparse.ArgumentTypeError("Must be 'true'/'t' or 'false'/'f'")


def numpy_dtype_arg_type(string: str) -> np.dtype:
    """argument type for string reps of numpy dtypes

    Parameters
    ----------
    string
        The raw command-line value, passed to :class:`numpy.dtype`

    Returns
    -------
    numpy.dtype
        The dtype constructed from `string`

    Raises
    ------
    argparse.ArgumentTypeError
        If :class:`numpy.dtype` raises a ``TypeError`` for `string`. The
        message is the text of that error.
    """
    try:
        ret = np.dtype(string)
    except TypeError as error:
        # Python 3 exceptions have no ``message`` attribute; ``str(error)``
        # is the portable way to recover the text
        raise argparse.ArgumentTypeError(str(error)) from error
    return ret


def kaldi_config_arg_type(string: str) -> List[str]:
    """Encapsulate parse_kaldi_config_file as an argument type

    Parameters
    ----------
    string
        Path to the config file

    Returns
    -------
    list of str
        The arguments read from the config file by
        :func:`parse_kaldi_config_file` (called with its defaults)

    Raises
    ------
    argparse.ArgumentTypeError
        If reading or parsing the file raises ``IOError`` or ``ValueError``.
        The original error text is included in the message.
    """
    try:
        return parse_kaldi_config_file(string)
    except (IOError, ValueError) as e:
        # argparse only reports str() of this exception, so the cause has to
        # be embedded in the message or the user never sees it
        raise argparse.ArgumentTypeError("config file error: {}".format(e)) from e


def parse_kaldi_config_file(file_path: str, allow_space: bool = True) -> List[str]:
    """Return a list of arguments from a kaldi config file

    Everything after the first ``#`` on a line is discarded as a comment, and
    lines that are empty after stripping are skipped.

    Parameters
    ----------
    file_path : str
        Points to the config file in question
    allow_space : bool, optional
        If ``True``, treat the first space on a line as splitting key
        and value if no equals sign exists on the line. If ``False``, no
        equals sign will chunk the whole line (as if a boolean flag).
        Kaldi does not split on spaces, but python does. Note that
        `allow_space` does not split the entire line on spaces, unlike
        shell arguments.

    Returns
    -------
    list of str
        The arguments, in file order. A ``--key=value`` line yields one
        element; a ``--key value`` line yields two when `allow_space` is
        ``True``.

    Raises
    ------
    ValueError
        If a non-empty line does not start with ``--``, or if the option
        has no key (``--=value``, or ``-- value`` when `allow_space` is
        ``True``)
    """
    args = []
    with open(file_path) as config_file:
        for line_no, line in enumerate(config_file, start=1):
            line = line.split("#", 1)[0].strip()
            if not line:
                continue
            if not line.startswith("--"):
                raise ValueError(
                    "Reading config file {} : line {} does not look "
                    "like a line from a Kaldi command-line program's "
                    "config file: should be of the form --x=y. Note: "
                    "config files intended to be sourced by shell "
                    "scripts lack the '--'.".format(file_path, line_no)
                )
            equals_index = line.find("=")
            if equals_index == 2:
                # "--=value": nothing between the dashes and the equals sign
                raise ValueError("Invalid option (no key): {}".format(line))
            if equals_index != -1 or not allow_space:
                args.append(line)
                continue
            key, space, value = line.partition(" ")
            if space and key == "--":
                # "-- value": raise rather than assert so that the error
                # survives ``python -O`` and is a ValueError like the others
                raise ValueError("Invalid option (no key): {}".format(line))
            args.append(key)
            if space:
                args.append(value)
    return args


class KaldiVerbosityAction(argparse.Action):
    """Read kaldi-style verbosity levels, setting logger to python level

    Kaldi verbosities tend to range from [-3, 9]. This action takes in a
    kaldi verbosity level and converts it to python logging levels with
    :func:`pydrobert.kaldi.logging.kaldi_lvl_to_logging_lvl`

    If the parser has a `logger` attribute, the `logger` will be set to
    the new level.

    Parameters
    ----------
    option_strings
        Option strings associated with this action
    dest
        Name of the namespace attribute that receives the python level
    default
        Value stored when the option is absent. Defaults to
        ``logging.INFO``
    required
        Whether the option must be given
    help
        Help text for the option
    metavar
        Display name of the option's argument

    Notes
    -----
    The action always consumes exactly one argument and converts it with
    ``int``.
    """

    def __init__(
        self,
        option_strings,
        dest,
        default=logging.INFO,
        required=False,
        help="Verbose level (higher->more logging)",
        metavar=None,
    ):
        super().__init__(
            option_strings,
            dest,
            nargs=None,
            default=default,
            type=int,
            required=required,
            help=help,
            metavar=metavar,
        )

    def __call__(self, parser, namespace, values, option_string=None):
        """Validate the kaldi level, store the python level, update logger

        Raises
        ------
        argparse.ArgumentError
            If `values` lies outside ``[-3, 9]``
        """
        if values < -3 or values > 9:
            # ArgumentError (not ArgumentTypeError) is what the parser
            # catches around actions and turns into a usage error;
            # ArgumentTypeError is only handled for ``type`` callables
            raise argparse.ArgumentError(
                self, "Verbosity must be between -3 and 9 inclusive"
            )
        logging_lvl = kaldi_lvl_to_logging_lvl(values)
        setattr(namespace, self.dest, logging_lvl)
        if hasattr(parser, "logger"):
            parser.logger.setLevel(logging_lvl)


class KaldiParser(argparse.ArgumentParser):
    """Kaldi-compatible wrapper for argument parsing

    KaldiParser intends to make command-line entry points in python more compatible with
    kaldi command-line scripts. It makes the following changes to
    :class:`argparse.ArgumentParser`:

    1. Creates a :class:`logging.Formatter` instance that formats messages similarly to
       kaldi using the `prog` keyword as the program name.
    2. Sets the default help and usage locations to :obj:`sys.stderr` (instead of
       :obj:`sys.stdout`)
    3. Registers ``'kaldi_bool'``, ``'kaldi_rspecifier'``, ``'kaldi_wspecifier'``,
       ``'kaldi_wxfilename'``, ``'kaldi_rxfilename'``, ``'kaldi_config'``,
       ``'kaldi_dtype'``, and ``'numpy_dtype'`` as argument types
    4. Registers ``'kaldi_verbose'`` as an action
    5. Adds `logger`, `update_formatters`, `add_config`, `add_print_args`, and
       `add_verbose` parameters to initialization (see below)
    6. Wraps `parse_known_args` (which `parse_args` calls) with
       ``kaldi_vlog_level_cmd_decorator`` (so loggers use the right level names on
       error)

    KaldiParser differs from kaldi's command line parsing in a few key ways. First,
    though '=' syntax is supported, the parser will also group using the command-line
    splitting (on unquoted whitespace). For the :class:`KaldiParser`, ``--foo bar`` and
    ``--foo=bar`` are equivalent (assuming foo takes one optional argument), whereas, in
    Kaldi, ``--foo bar`` would be parsed as the boolean flag ``--foo`` followed by a
    positional with value ``bar``. This ambiguity is the source of the next difference:
    boolean flags. Because kaldi command-line parsing splits around ``=``, it can use
    ``--foo=true`` and ``--foo`` interchangeably. To avoid gobbling up a positional
    argument, :class:`KaldiParser` allows for only one type of boolean flag syntax. For
    the former (``--foo=true``), use ``type='kaldi_bool'`` in `add_argument`. For the
    latter (``--foo``), use ``action='store_true'``.

    Parameters
    ----------
    prog
        Name of the program. Defaults to ``sys.argv[0]``
    usage
        A usage message. Default: auto-generated from arguments
    description
        A description of what the program does
    epilog
        Text following the argument descriptions
    parents
        Parsers whose arguments should be copied into this one
    formatter_class
        Class for printing help messages
    prefix_chars
        Characters that prefix optional arguments
    fromfile_prefix_chars
        Characters that prefix files containing additional arguments
    argument_default
        The default value for all arguments
    conflict_handler
        String indicating how to handle conflicts
    add_help
        Add a ``-h/--help`` option
    add_verbose
        Add a ``-v/--verbose`` option. The option requires an integer
        argument specifying a verbosity level at the same degrees as
        Kaldi. The level will be converted to the appropriate python
        level when parsed
    add_config
        Whether to add the standard ``--config`` option to the
        parser. If ``True``, a first-pass will extract all config file
        options and put them at the beginning of the argument string
        to be re-parsed.
    add_print_args
        Whether to add the standard ``--print-args`` option to the parser.
        If ``True``, a first pass over the arguments will search for the
        value of ``--print-args`` and, unless it is false, will print the
        program name and arguments to stderr. The flag defaults to true
        when absent. Printing happens in `parse_known_args`, before the
        arguments are validated.
    update_formatters
        If `logger` is set, the logger's handlers' formatters will be
        set to a kaldi-style formatter and the logger's level will be
        set to ``INFO``
    logger
        Errors will be written to this logger when parse_args fails. If
        `add_verbose` has been set to ``True``, the logger will be set
        to the appropriate python level if verbose is set (note: the
        logger is only reset to the default level - ``INFO`` - on
        initialization when `update_formatters` is also ``True``)
    version
        A version string to use for logs. If not set,
        ``pydrobert.kaldi.__version__`` will be used by default

    Attributes
    ----------
    logger
        The logger this parse was printing out to
    formatter
        A log formatter that formats with kaldi-style headers
    add_config
        Whether this parser has a ``--config`` flag
    add_print_args
        Whether this parser has a ``--print-args`` flag
    version
        Version string used by this parser and `logger`
    """

    def __init__(
        self,
        prog: Optional[str] = None,
        usage: Optional[str] = None,
        description: Optional[str] = None,
        epilog: Optional[str] = None,
        parents: Sequence[argparse.ArgumentParser] = tuple(),
        formatter_class: type = argparse.HelpFormatter,
        prefix_chars: str = "-",
        fromfile_prefix_chars: Optional[str] = None,
        argument_default: Any = None,
        conflict_handler: str = "error",
        add_help: bool = True,
        add_verbose: bool = True,
        add_config: bool = True,
        update_formatters: bool = True,
        add_print_args: bool = True,
        logger: Optional[logging.Logger] = None,
        version: Optional[str] = None,
    ):
        super().__init__(
            prog=prog,
            usage=usage,
            description=description,
            epilog=epilog,
            parents=parents,
            formatter_class=formatter_class,
            prefix_chars=prefix_chars,
            fromfile_prefix_chars=fromfile_prefix_chars,
            argument_default=argument_default,
            conflict_handler=conflict_handler,
            add_help=add_help,
        )
        if version is None:
            self.version = kaldi.__version__
        else:
            self.version = version
        self.add_config = bool(add_config)
        self.add_print_args = bool(add_print_args)
        self.register("type", "kaldi_bool", kaldi_bool_arg_type)
        self.register("type", "kaldi_rspecifier", kaldi_rspecifier_arg_type)
        self.register("type", "kaldi_wspecifier", kaldi_wspecifier_arg_type)
        self.register("type", "kaldi_rxfilename", kaldi_rxfilename_arg_type)
        self.register("type", "kaldi_wxfilename", kaldi_wxfilename_arg_type)
        self.register("type", "kaldi_dtype", kaldi_dtype_arg_type)
        self.register("type", "numpy_dtype", numpy_dtype_arg_type)
        self.register("type", "kaldi_config", kaldi_config_arg_type)
        self.register("action", "kaldi_verbose", KaldiVerbosityAction)
        self.logger = logger
        # self.prog is resolved by the base class, so this must come after
        # super().__init__()
        self.formatter = logging.Formatter(
            "%(levelname)s ("
            + self.prog
            + "["
            + self.version
            + "]:%(funcName)s():%(filename)s:%(lineno)d) %(message)s"
        )
        if logger and update_formatters:
            logger.setLevel(logging.INFO)
            for handler in logger.handlers:
                handler.setFormatter(self.formatter)
        # the built-in options use "-" when it is a legal prefix, otherwise
        # the first prefix character the caller supplied
        default_prefix = "-" if "-" in prefix_chars else prefix_chars[0]
        if add_verbose:
            self.add_argument(
                default_prefix + "v",
                default_prefix * 2 + "verbose",
                action="kaldi_verbose",
            )
        if add_config:
            self.add_argument(default_prefix * 2 + "config", type="kaldi_config")
        if add_print_args:
            self.add_argument(default_prefix * 2 + "print-args", type="kaldi_bool")

    def print_help(self, file: Optional[TextIO] = None):
        if file is None:
            file = sys.stderr
        super().print_help(file=file)

    print_help.__doc__ = argparse.ArgumentParser.print_help.__doc__

    def print_usage(self, file: Optional[TextIO] = None):
        if file is None:
            file = sys.stderr
        super().print_usage(file=file)

    print_usage.__doc__ = argparse.ArgumentParser.print_usage.__doc__

    def error(self, message: str):
        if self.logger:
            # route the message through the logger instead of the default
            # "prog: error: ..." line, then exit with the same status
            self.logger.error(message)
            self.print_usage(file=sys.stderr)
            self.exit(2)
        else:
            super().error(message)

    error.__doc__ = argparse.ArgumentParser.error.__doc__

    def _prescan_print_args(self, args: Sequence[str]) -> bool:
        """Return whether the command line should be echoed

        Scans `args` for a double-prefixed ``print-args`` option, given either
        as ``--print-args VALUE`` or ``--print-args=VALUE``. The last
        occurrence wins; ``True`` is returned when the option is absent. A
        missing or malformed value is reported through :func:`error`.
        """
        print_args = True
        idx = 0
        while idx < len(args):
            arg = args[idx]
            idx += 1
            if (arg[:1] not in self.prefix_chars) or (
                arg[1:2] not in self.prefix_chars
            ):
                continue
            name, equals, value = arg[2:].partition("=")
            if name != "print-args":
                continue
            if not equals:
                # value is the next token
                if idx == len(args):
                    self.error("argument {}: expected one argument".format(arg))
                    break
                value = args[idx]
                idx += 1
            try:
                print_args = kaldi_bool_arg_type(value)
            except argparse.ArgumentTypeError as err:
                self.error("argument {}: {}".format(arg, err))
        return print_args

    @kaldi_vlog_level_cmd_decorator
    def parse_known_args(
        self,
        args: Optional[Sequence[str]] = None,
        namespace: Optional[argparse.Namespace] = None,
    ):
        if args is None:
            args = sys.argv[1:]
        else:
            args = list(args)
        # we do a cursory pass for --print-args, since we want to
        # print even if there's an error
        if self.add_print_args and self._prescan_print_args(args):
            print(
                " ".join(shlex.quote(arg) for arg in [self.prog] + args),
                file=sys.stderr,
            )
        ns, remainder = super().parse_known_args(args=args, namespace=namespace)
        # getattr: the attribute is absent when the option was not given and
        # the parser was built with argument_default=argparse.SUPPRESS
        config_args = getattr(ns, "config", None) if self.add_config else None
        if config_args:
            # config options go first so that explicit command-line options,
            # parsed later, take precedence
            args = config_args + args
            # ignoring the possibility that they nested print-args in
            # the config
            ns, remainder = super().parse_known_args(args=args, namespace=namespace)
        return ns, remainder

    parse_known_args.__doc__ = argparse.ArgumentParser.parse_known_args.__doc__