# Source code for almanac.parsing.parsing

# The parsing logic is heavily borrowed from the python-nubia project, available at:
# https://github.com/facebookincubator/python-nubia
#
# In compliance with python-nubia's BSD-style license, its copyright and license terms
# are included below:
#
# BSD License
#
# For python-nubia software
#
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
#  * Redistributions of source code must retain the above copyright notice, this
#    list of conditions and the following disclaimer.
#
#  * Redistributions in binary form must reproduce the above copyright notice,
#    this list of conditions and the following disclaimer in the documentation
#    and/or other materials provided with the distribution.
#
#  * Neither the name Facebook nor the names of its contributors may be used to
#    endorse or promote products derived from this software without specific
#    prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import re

import pyparsing as pp

from enum import auto, Enum
from functools import lru_cache
from typing import NamedTuple

from prompt_toolkit.document import Document

from ..errors import PartialParseError, TotalParseError


class Patterns:
    """Regular-expression source strings for the pieces of a command line."""

    # Non-alphanumeric characters permitted inside an unquoted string.
    ALLOWED_SYMBOLS_IN_STRING = r'-_/#@£$€%*+~|<>?.'
    # A letter or underscore, followed by letters, digits, underscores or dashes.
    IDENTIFIER = r'([a-zA-Z_][a-zA-Z0-9_\-]*)'
    WHITESPACE = r'\s+'

    UNQUOTED_STRING = r'([a-zA-Z0-9' + ALLOWED_SYMBOLS_IN_STRING + r']+)'
    # Quoted strings may contain backslash-escaped characters.
    STRING_SINGLE_QUOTE = r"\'([^\\\']|\\.)*\'"
    STRING_DOUBLE_QUOTE = r'\"([^\\\"]|\\.)*\"'

    BOOLEAN = r'(True|true|False|false)'
    # The decimal point is mandatory; the exponent is optional and may carry a
    # sign, so that values such as 1.5e-3 are matched in full.
    FLOAT = r'\-?\d+\.\d*([eE][+\-]?\d+)?'
    INTEGER = r'\-?\d+'

    # An identifier followed by '=' (optionally surrounded by whitespace).
    KWARG = IDENTIFIER + r'(\s*=\s*)'
    # An identifier at the start of the text, followed by whitespace or the end.
    COMMAND = r'^' + IDENTIFIER + r'(\s+|$)'

    @staticmethod
    def is_valid_identifier(
        s: str
    ) -> bool:
        """Whether the specified string is a valid command name or kwarg identifier.

        The whole string must match :attr:`IDENTIFIER`; the empty string is not valid.
        """
        return bool(re.fullmatch(Patterns.IDENTIFIER, s))


def _no_transform(x):
    """Identity transform: hand the token back unchanged."""
    return x


def _bool_transform(x):
    """Return ``True`` only for the token spellings ``'True'`` and ``'true'``."""
    truthy_spellings = ('True', 'true')
    return x in truthy_spellings


def _str_transform(x):
    """Remove one pair of matching outer quotes from a string token.

    Only a single enclosing pair of identical quote characters is removed, so
    quote characters that belong to the value itself (for example the inner
    quotes of ``"'a'"``) are preserved. Tokens that are not wrapped in a
    matching pair of quotes are returned unchanged.
    """
    if len(x) >= 2 and x[0] == x[-1] and x[0] in ('"', "'"):
        return x[1:-1]
    return x


# Converters keyed by data type name; names that are absent here fall back to
# the identity transform when looked up by _parse_type.
_TRANSFORMS = dict((
    ('bool', _bool_transform),
    ('str', _str_transform),
    ('int', int),
    ('float', float),
    ('dict', dict),
))


def _parse_type(data_type):
    """Build a parse action that converts every matched token for ``data_type``.

    The converter is looked up in ``_TRANSFORMS``; an unknown ``data_type`` gets
    the identity transform, so its tokens pass through untouched.
    """
    try:
        convert = _TRANSFORMS[data_type]
    except KeyError:
        convert = _no_transform

    def _parse(s, loc, toks):
        # Only the tokens are needed; the source string and location are ignored.
        return list(map(convert, toks))

    return _parse


# Valid identifiers cannot start with a number, but may contain them in their body.
# NOTE(review): unlike Patterns.IDENTIFIER, this also accepts a leading '-' --
# confirm whether that difference is intended.
identifier = pp.Word(pp.alphas + '_-', pp.alphanums + '_-')

# XXX: allow for hex?
int_value = pp.Regex(Patterns.INTEGER).setParseAction(_parse_type('int'))

float_value = pp.Regex(Patterns.FLOAT).setParseAction(_parse_type('float'))

bool_value = (
    pp.Literal('True') ^ pp.Literal('true') ^
    pp.Literal('False') ^ pp.Literal('false')
).setParseAction(_parse_type('bool'))

# pp.quotedString is a module-level element shared by every user of pyparsing,
# and setParseAction modifies the element it is called on. Attach the action to
# a copy so that the shared element is left untouched.
quoted_string = pp.quotedString.copy().setParseAction(_parse_type('str'))

unquoted_string = pp.Word(
    pp.alphanums + Patterns.ALLOWED_SYMBOLS_IN_STRING
).setParseAction(_parse_type('str'))

string_value = quoted_string | unquoted_string

# Alternatives are tried in order: the typed literals come before the catch-all
# string, and float comes before int so that "1.5" is not read as the integer 1.
single_value = bool_value | float_value | int_value | string_value

# 'list' has no entry in _TRANSFORMS, so list tokens are passed through as-is.
list_value = pp.Group(
    pp.Suppress('[') +
    pp.Optional(pp.delimitedList(single_value)) +
    pp.Suppress(']')
).setParseAction(_parse_type('list'))

# Forward declaration: dictionary values may themselves contain dictionaries.
dict_value = pp.Forward()

value = list_value ^ single_value ^ dict_value

dict_key_value = pp.dictOf(string_value + pp.Suppress(':'), value)

dict_value << pp.Group(
    pp.Suppress('{') + pp.delimitedList(dict_key_value) + pp.Suppress('}')
).setParseAction(_parse_type('dict'))

# Each positional must be followed by the end of the input or by at least one
# whitespace character. This ensures that the parser treats text like
# "something=" as invalid instead of parsing it as the positional "something"
# and leaving the "=" as invalid on its own.
positionals = pp.ZeroOrMore(
    value + (pp.StringEnd() ^ pp.Suppress(pp.OneOrMore(pp.White())))
).setResultsName('positionals')

# Zero or more identifier=value pairs, exposed as a mapping named 'kv'.
key_value = pp.Dict(pp.ZeroOrMore(pp.Group(
    identifier + pp.Suppress('=') + value
))).setResultsName('kv')

command = identifier.setResultsName('command')

# A full command line: the command name, then positionals, then keyword arguments.
command_line = command + positionals + key_value


class ParseState(Enum):
    """How much of a command line could be parsed."""

    # The entire text was parsed.
    FULL = auto()
    # Only a leading portion of the text was parsed.
    PARTIAL = auto()
    # Nothing could be parsed.
    NONE = auto()


class ParseStatus(NamedTuple):
    """The outcome of attempting to parse one command line."""

    # The (possibly partial) parse results. parse_cmd_line stores None here when
    # nothing could be parsed.
    results: pp.ParseResults
    # The trailing text that was not parsed; empty after a full parse.
    unparsed_text: str
    # Where the unparsed text starts: len(text) after a full parse, 0 when
    # nothing could be parsed.
    unparsed_start_pos: int
    # Whether the parse was full, partial, or failed entirely.
    state: ParseState


@lru_cache()
def parse_cmd_line(
    text: str
) -> ParseStatus:
    """Attempt to parse a command line, returning a :class:`ParseStatus` object.

    This never raises for unparseable input; the failure mode is reported through
    the ``state`` field instead:

        * ``FULL``: everything was parsed; there is no unparsed text.
        * ``PARTIAL``: a prefix was parsed; the partial results, the remaining
          text, and the error position are taken from the raised
          :class:`PartialParseError`.
        * ``NONE``: nothing was parsed; ``results`` is ``None`` and the whole
          input is reported as unparsed.

    Results are memoized per input string.
    """
    try:
        parse_results = _raw_parse_cmd_line(text)
    except PartialParseError as e:
        return ParseStatus(
            e.partial_result, e.remaining, e.error_pos, ParseState.PARTIAL
        )
    except TotalParseError:
        return ParseStatus(None, text, 0, ParseState.NONE)

    return ParseStatus(parse_results, '', len(text), ParseState.FULL)


def _raw_parse_cmd_line(
    text: str
) -> pp.ParseResults:
    """Attempt to parse the command line as per the grammar defined in this module.

    If the specified text can be fully parsed, then a `pyparsing.ParseResults` will be
    returned with the following attributes:

        * command: The name or alias of the command.
        * kv: A dictionary of key-value pairs representing the keyword arguments.
        * positionals: Any positional argument values.

    Otherwise, one of the exceptions below is raised.

    Raises:
        :class:`PartialParseError`: If the specified text can be partially
            parsed, but errors still exist.
        :class:`TotalParseError`: If the text cannot even be partially parsed.

    """
    try:
        return command_line.parseString(text, parseAll=True)
    except pp.ParseException as e:
        # The unparsed remainder is the rest of the offending line, starting at
        # the (1-based) error column, without trailing whitespace. Slicing the
        # line directly, rather than searching for the '>!<' marker inserted by
        # markInputline(), stays correct when the input itself contains '>!<'.
        remaining = e.line[e.col - 1:].rstrip()

        try:
            partial_result = command_line.parseString(text, parseAll=False)
        except pp.ParseException as ee:
            raise TotalParseError(str(ee)) from None

        raise PartialParseError(str(e), remaining, partial_result, e.col) from None


class IncompleteToken:
    """Encapsulation of a token that could only be partially parsed.

    On construction the token is classified as exactly one of:

        * a keyword argument (it contains ``=`` and no list, dict or quote
          character appears before the first ``=``);
        * a positional argument (a list, dict or quote character appears before
          the first ``=``, or anywhere if there is no ``=``);
        * ambiguous (neither of the above; it could still become either).
    """

    # Characters that can only open or close a literal value, never a key.
    _VALUE_ONLY_CHARS = '[]{}"\''

    def __init__(
        self,
        token: str
    ) -> None:
        self._token = token

        self._key = ''
        self._value = ''
        self._is_kw_arg = False
        self._is_pos_arg = False

        self._parse()

    def _parse(
        self
    ) -> None:
        """Classify the token and populate the key and value."""
        key, delim, value = self._token.partition('=')

        if any(char in key for char in self._VALUE_ONLY_CHARS):
            # Treat the whole token as a positional value.
            self._is_pos_arg = True
            self._value = self._token
            return

        if delim != '=':
            # This could either be the beginning of something like key=value or
            # the positional literal keywest.
            self._key = self._value = key
            return

        # This is a key=value.
        self._is_kw_arg = True
        self._key = key
        self._value = value

    @property
    def is_kw_arg(
        self
    ) -> bool:
        """Whether the token is a keyword argument (``key=value``)."""
        return self._is_kw_arg

    @property
    def is_pos_arg(
        self
    ) -> bool:
        """Whether the token can only be a positional argument."""
        return self._is_pos_arg

    @property
    def is_ambiguous_arg(
        self
    ) -> bool:
        """Whether the token is neither a definite kwarg nor a definite positional."""
        return not (self._is_kw_arg or self._is_pos_arg)

    @property
    def key(
        self
    ) -> str:
        """The key part; empty for positional tokens."""
        return self._key

    @property
    def value(
        self
    ) -> str:
        """The value part; for ambiguous tokens this equals the key."""
        return self._value

    def __str__(
        self
    ) -> str:
        if self.is_kw_arg:
            return f'kwarg {self._key}={self._value}'
        if self.is_pos_arg:
            return f'positional {self._value}'
        if self.is_ambiguous_arg:
            return f'ambiguous {self._key}'
        return 'Parse error'

    def __repr__(
        self
    ) -> str:
        return f'<{self.__class__.__qualname__} [{str(self)}]>'


def last_incomplete_token(
    document: Document,
    unparsed_text: str
) -> IncompleteToken:
    """Determine the token currently being typed and wrap it in an IncompleteToken.

    Args:
        document: The document holding the command line text and cursor.
        unparsed_text: The portion of the text that the parser could not consume.

    Returns:
        An :class:`IncompleteToken` built from the longer of the trailing
        space-delimited token and ``unparsed_text``.
    """
    if document.char_before_cursor in ' ]}':
        # A space or a closing bracket/brace ends the previous token, so no
        # token is in progress.
        candidate = ''
    else:
        # NOTE(review): the result is used directly as an index into
        # document.text, which presumably relies on the cursor being at the end
        # of the text -- confirm against Document.find_backwards.
        space_index = document.find_backwards(' ', in_current_line=True)
        start = 0 if space_index is None else space_index + 1
        candidate = document.text[start:]

    # The longer of the candidate and unparsed_text is taken in the event that the
    # unparsed_text is an open literal, which could itself contain spaces.
    if len(unparsed_text) > len(candidate):
        candidate = unparsed_text

    return IncompleteToken(candidate)


def last_incomplete_token_from_document(
    document: Document
) -> IncompleteToken:
    """Shortcut for getting the last incomplete token only from a ``Document``.

    The document's text is parsed with :func:`parse_cmd_line` and the resulting
    unparsed text is passed on to :func:`last_incomplete_token`.
    """
    unparsed_text = parse_cmd_line(document.text).unparsed_text
    return last_incomplete_token(document, unparsed_text)