Source code for xopgi.ql.lang.expressions.parser

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ---------------------------------------------------------------------
# Copyright (c) Merchise Autrement [~º/~] and Contributors
# All rights reserved.
#
# This is free software; you can do what the LICENCE file allows you to.
#
from typing import List, Union, Deque
from collections import deque

from xoutil.objects import setdefaultattr
from ply import lex, yacc

from .base import (
    AST,
    Identifier,
    Literal,
    Lambda,
    Application,
    Let,
    Letrec
)

from ..builtins import StringType, CharType, NumberType


class ParserError(SyntaxError):
    pass


tokens = [
    'IDENTIFIER',
    'BASE16_INTEGER',
    'BASE10_INTEGER',
    'BASE8_INTEGER',
    'BASE2_INTEGER',
    'COLON',
    'DOT_OPERATOR',
    'SPACE',
    'PADDING',
    'LPAREN',
    'RPAREN',
    'CHAR',
    'STRING',
    'PLUS',
    'MINUS',
    'STAR',
    'SLASH',
    'BACKSLASH',
    'ARROW',
    'DOUBLESLASH',
    'PERCENT',
    'OPERATOR',
    'TICK_OPERATOR',
    'ANNOTATION',
    'FLOAT',
    'EQ',
    'EOF',
]

# Reserved keywords: pairs of (keyword, regexp).  If the regexp is None, it
# defaults to '\b{keyword}\b' (case-sensitive).
reserved = [
    ('let', None),

    # We need the KEYWORD_IN to match the preceding spaces to avoid the lexer
    # to issue a 'SPACE'.  Otherwise, in an expression like 'let id x = x in
    # ...'; the body of the last equation ('x') is a full expression; and the
    # parser would be in a state `expr . SPACE KEYWORD_IN ...`; at which it
    # will try to match the application.
    ('in', r'\s+in\b'),
]


for keyword, regexp in reserved:
    tk = f'KEYWORD_{keyword.upper()}'

    def tkdef(t):
        setdefaultattr(t.lexer, 'col', 0)
        t.value = t.value.strip()
        t.lexer.col += len(keyword)
        return t

    tkdef.__doc__ = regexp or rf'\b{keyword}\b'
    tkdef.__name__ = tkname = f't_{tk}'

    globals()[tkname] = tkdef
    tokens.append(tk)


t_IDENTIFIER = r'[A-Za-z]\w*'
t_BASE10_INTEGER = '[0-9][0-9_]*'
t_BASE16_INTEGER = '0[xX][0-9a-fA-F][0-9a-fA-F_]*'
t_BASE8_INTEGER = '0[oO][0-7][0-7_]*'
t_BASE2_INTEGER = '0[bB][01][01_]*'

t_COLON = r':'


def t_STRING(t):
    r'\"([^\n]|\\["])*\"'
    # eval is safe because t.value must match the regular expression.  Notice
    # that you must use `string_repr`:func: to get a string representation
    # that work in our language.
    t.value = eval(t.value)
    return t


def string_repr(s):
    result = ['"']
    for ch in s:
        if ch == "'":
            result.append(ch)
        elif ch == '"':
            result.append('\\')
            result.append(ch)
        else:
            result.append(repr(ch)[1:-1].replace(r'\\', '\\'))
    result.append('"')
    return ''.join(result)


def t_CHAR(t):
    r"'([^\n]|(\\[ntr])|(\\[xuU][a-f\d]+))'"
    value = t.value[1:-1]
    if len(value) > 1:
        assert value.startswith('\\')
        # \x..., \u...
        value = eval(f"'{value}'")
    t.value = value
    return t


# We need to treat space specially in this grammar because it can have a
# semantical value: the application of expressions 'e1 e2'.
#
# We want to emit a SPACE token only when application is **not impossible**.
#
# We disallow to break applications with a new line; so 'f \n a' won't a emit
# SPACE token but a PADDING (because \n).
#
# In the malformed expression '+ a b' the first space is ignore and the second
# is emitted; but in the well-formed expression '(+) a b' both spaces are
# emitted.
#
# In expressions like 'let x ...' the space after the keyword is not ignored
# but required.
def t_SPACE(t):
    r'[ \t\n]+'
    if '\n' in t.value:
        if t.lexpos + len(t.value) == len(t.lexer.lexdata):
            t.type = 'EOF'
        else:
            t.type = 'PADDING'
        t.lexer.lineno += t.value.count('\n')
        return t
    elif not t.lexpos or t.lexpos + len(t.value) == len(t.lexer.lexdata):
        return  # This removes the token entirely.
    else:
        # In the expression:
        #
        #    e0   (   e1   e2   )   e3
        #       ^   !    ^    !   ^
        #
        # The run of spaces marked with '^' are important and must be kept,
        # but the spaces above the '!' are just padding and can be ignored
        # (don't create a token for them).
        #
        before = t.lexer.lexdata[t.lexpos - 1]
        after = t.lexer.lexdata[t.lexpos + len(t.value)]
        common = '=<>`.,:+-%@!$*^/'
        if before in common + '(' or after in common + ')':
            return  # This removes the token entirely.
        else:
            return t


def t_LPAREN(t):
    r'\('
    return t


def t_RPAREN(t):
    r'\)'
    return t


def t_FLOAT(t):
    r'([0-9_]*\.[0-9]+([eE][-+]\d+)?|[0-9][0-9_]*[eE][-+]\d+)'
    if t.value.startswith('_'):
        raise lex.LexError(f'Illegal float representation {t.value!r}', t.lexpos)
    t.value = t.value.replace('_', '')
    return t


# PLUS, MINUS, EQ and DOT are treated specially to disambiguate (binary + from
# unary +, etc); DOT is right associative.

def t_PLUS(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\+(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_MINUS(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\-(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_STAR(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\*(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_DOUBLESLASH(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\/\/(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_SLASH(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\/(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_ARROW(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\-\>(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_BACKSLASH(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])\\(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_PERCENT(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])%(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_EQ(t):
    r'(?<![/\.\-\+\*<>\$%\^&!@\#=\|])=(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


# Don't merge this with OPERATOR, we need it to make sure is
# right-associative.
def t_DOT_OPERATOR(t):
    r'\.(?![/\.\-\+\*<>\$%\^&!@\#=\|])'
    return t


def t_TICK_OPERATOR(t):
    r'`[A-Za-z]\w*`'
    t.value = t.value[1:-1]
    return t


# Emit the ANNOTATION only if the '@' surrounded by a possible number and a
# possible string/char/identifier.
def t_ANNOTATION(t):
    r'(?<=[0-9a-fA-F_])@(?=[\'"A-Za-z])'
    return t


def t_OPERATOR(t):
    r'[/\.\-\+\*<>\$%\^&!@\#=\|]+'
    return t


lexer = lex.lex(debug=False)


precedence = (
    ('right', 'ARROW', ),

    ('left', 'KEYWORD_LET', ),
    ('left', 'KEYWORD_IN', ),

    ('left', 'TICK_OPERATOR', ),
    ('left', 'OPERATOR', ),

    ('left', 'PLUS', 'MINUS'),
    ('left', 'STAR', 'SLASH', 'DOUBLESLASH', 'PERCENT'),

    ('right', 'DOT_OPERATOR', ),

    # Application has the highest priority, and is left associative:
    # 'a b c' means '(a b) c'.
    ('left', 'SPACE', ),
)


def p_standalone_expr(prod):
    '''st_expr : expr
               | PADDING expr
               | expr EOF
               | PADDING expr EOF
    '''
    count = len(prod)
    if count == 2:
        prod[0] = prod[1]
    elif count == 4:
        prod[0] = prod[2]
    else:
        assert count == 3
        prod[0] = next(tk for tk in prod[1:] if isinstance(tk, AST))


def p_literals_and_basic(prod):
    '''expr :  number
             | concrete_number
             | string
             | char
             | identifier
             | enclosed_expr
             | letexpr
    '''
    prod[0] = prod[1]


def p_char(prod):
    'char : CHAR'
    prod[0] = Literal(prod[1], CharType)


def p_string(prod):
    'string : STRING'
    prod[0] = Literal(prod[1], StringType)


def p_variable(prod):
    'identifier : IDENTIFIER'
    prod[0] = Identifier(prod[1])


def p_paren_expr(prod):
    'enclosed_expr : LPAREN expr RPAREN'
    prod[0] = prod[2]


def p_infix_application(prod):
    'expr : expr TICK_OPERATOR expr'
    prod[0] = Application(Application(Identifier(prod[2]), prod[1]), prod[3])


def p_application(prod):
    'expr : expr SPACE expr'
    prod[0] = Application(prod[1], prod[3])


def p_application_after_paren(prod):
    '''expr : enclosed_expr expr
            | expr enclosed_expr
    '''
    prod[0] = Application(prod[1], prod[2])


def p_compose(prod):
    r'''
    expr : expr DOT_OPERATOR expr
    '''
    e1, e2 = prod[1], prod[3]
    prod[0] = Application(Application(Identifier('.'), e1), e2)


def p_operators_as_expressios(prod):
    '''enclosed_expr : LPAREN DOT_OPERATOR RPAREN
                     | LPAREN operator RPAREN
    '''
    operator = prod[2]
    prod[0] = Identifier(operator)


def p_user_operator_expr(prod):
    r'''expr : expr operator expr

    '''
    e1, operator, e2 = prod[1], prod[2], prod[3]
    prod[0] = Application(Application(Identifier(operator), e1), e2)


def p_operator(prod):
    r'''
    operator :  PLUS
              | MINUS
              | STAR
              | SLASH
              | DOUBLESLASH
              | PERCENT
              | ARROW
              | OPERATOR

    '''
    prod[0] = prod[1]


def p_integer(prod):
    '''number : BASE10_INTEGER
              | BASE16_INTEGER
              | BASE8_INTEGER
              | BASE2_INTEGER
    '''
    value = prod[1]
    value = value.replace('_', '')  # We separators anywhere: 1000 = 10_00
    base = 10
    if value.startswith('0'):
        mark = value[1:2]
        if mark in ('x', 'X'):
            base = 16
        elif mark in ('b', 'B'):
            base = 2
        elif mark in ('o', 'O'):
            base = 8
        elif not mark or mark in '0123456789':
            pass
        else:
            assert False, 'Invalid integer mark'
    val = int(value, base)
    prod[0] = Literal(val, NumberType)


def p_pos_number(prod):
    '''number : PLUS number'''
    prod[0] = prod[2]


def p_neg_number(prod):
    '''number : MINUS number'''
    number = prod[2]
    assert isinstance(number, Literal)
    number.value = -number.value
    prod[0] = number


def p_float(prod):
    'number : FLOAT'
    prod[0] = Literal(float(prod[1]), NumberType)


def p_concrete_number(prod):
    '''concrete_number :  number ANNOTATION string
                        | number ANNOTATION char
                        | number ANNOTATION identifier
    '''
    number = prod[1]
    assert isinstance(number, Literal)
    number.annotation = prod[3]
    prod[0] = number


def p_empty(prod):
    '''empty : '''
    pass


def p_lambda_definition(prod):
    '''expr : BACKSLASH parameters ARROW expr
    '''
    params = prod[2]
    assert params
    prod[0] = build_lambda(params, prod[4])


def p_parameters(prod):
    '''parameters : IDENTIFIER _parameters
    '''
    names = prod[2]
    names.insert(0, prod[1])
    prod[0] = names


def p__params(prod):
    '_parameters : SPACE IDENTIFIER _parameters'
    names = prod[3]
    names.insert(0, prod[2])
    prod[0] = names


def p_empty__parameters(prod):
    '''_parameters : empty
    '''
    prod[0] = []


# Patterns and Equations.  In the final AST, an expression like:
#
#     let id x = x in ...
#
# would actually be like
#
#     let id = \x -> x
#
# with some complications if pattern-matching is allowed:
#
#     let length [] = 0
#         lenght (x:xs) = 1 + length xs
#     in  ...
#
# For now, we allow only the SIMPLEST of all definitions (we don't have a
# 'case' keyword to implement pattern matching.)  But, in any case, having the
# names of productions be 'pattern' and 'equations' is fit.
#
#
# The Pattern and Equation definitions are purposely not part of the AST, but
# more concrete syntactical object in the source code.  In the final AST, the
# let expressions shown above are indistinguishable.


class Pattern:
    def __init__(self, cons, params=None):
        self.cons = cons
        self.params = params or []

    def __repr__(self):
        return f'<pattern {self.cons!r} {self.params!r}>'

    def __str__(self):
        if self.params:
            return f'{self.cons} {self.parameters}'
        else:
            return self.cons

    @property
    def parameters(self):
        return ' '.join(self.params)


def p_pattern(prod):
    '''pattern : parameters'''
    cons, *params = prod[1]
    prod[0] = Pattern(cons, params)


class Equation:
    def __init__(self, pattern: Pattern, body: AST) -> None:
        self.pattern = pattern
        self.body = body

    def __repr__(self):
        return f'<equation {self.pattern!s} = {self.body!r}>'


def p_equation(prod):
    '''equation : pattern EQ expr
    '''
    prod[0] = Equation(prod[1], prod[3])


class Equations(list):
    pass


def p_equation_set(prod):
    '''equations : equation _equation_set
    '''
    result = prod[2]
    result.append(prod[1])
    prod[0] = Equations(result)


def p_equation_set2(prod):
    '''
    _equation_set : PADDING equation _equation_set
    '''
    result = prod[3]
    result.append(prod[2])
    prod[0] = result


def p_equation_set3(prod):
    '''
    _equation_set : empty
    '''
    prod[0] = []


def p_let_expr(prod):
    '''letexpr : KEYWORD_LET SPACE equations KEYWORD_IN SPACE st_expr

    '''
    #
    # We need to decide if we issue a Let or a Letrec: if any of declared
    # names appear in the any of the bodies we must issue a Letrec, otherwise
    # issue a Let.
    #
    # Also we need to convert function-patterns into Lambda abstractions:
    #
    #    let id x = ...
    #
    # becomes:
    #
    #    led id = \x -> ...
    #
    # For the time being (we don't have pattern matching yet), each symbol can
    # be defined just once.
    #
    def to_lambda(equation: Equation):
        'Convert (if needed) an equation to the equivalent one using lambdas.'
        if equation.pattern.params:
            return Equation(
                Pattern(equation.pattern.cons),
                build_lambda(equation.pattern.params, equation.body)
            )
        else:
            return equation

    equations = [to_lambda(eq) for eq in prod[3]]
    conses = [eq.pattern.cons for eq in equations]
    names = set(conses)
    if len(names) != len(conses):
        raise SyntaxError('Several definitions for the same name')
    if any(set(find_free_names(eq.body)) & names for eq in equations):
        klass = Letrec
    else:
        klass = Let
    body = prod[6]
    prod[0] = klass({eq.pattern.cons: eq.body for eq in equations}, body)


def p_error(prod):
    raise ParserError('Invalid expression')


parser = yacc.yacc(debug=True, start='st_expr')


def build_lambda(params: List[str], body: AST) -> Lambda:
    '''Create a Lambda from several parameters.

    Example:

       >>> build_lambda(['a', 'b'], Identifier('a'))
       Lambda('a', Lambda('b', Identifier('a')))

    '''
    assert params
    result = body
    for param in reversed(params):
        result = Lambda(param, result)
    return result  # type: ignore


[docs]def find_free_names(expr: AST) -> List[str]:
    '''Find all names that appear free in `expr`.

    Example:

      >>> from xopgi.ql.lang.expressions import parse, find_free_names
      >>> set(find_free_names(parse('let id x = x in map id xs')))
      {'map', 'xs'}

    Names can be repeated:

      >>> find_free_names(parse('twice x x')).count('x')
      2

    '''
    POPFRAME = None  # remove a binding from the 'stack'
    result: List[str] = []
    bindings: Deque[str] = deque([])
    nodes: Deque[Union[None, AST]] = deque([expr])
    while nodes:
        node = nodes.pop()
        if node is POPFRAME:
            bindings.pop()
        elif isinstance(node, Identifier):
            if node.name not in bindings:
                result.append(node.name)
        elif isinstance(node, Literal):
            if isinstance(node.annotation, AST):
                nodes.append(node)
        elif isinstance(node, Application):
            nodes.extend([
                node.e1,
                node.e2,
            ])
        elif isinstance(node, Lambda):
            bindings.append(node.varname)
            nodes.append(POPFRAME)
            nodes.append(node.body)
        elif isinstance(node, (Let, Letrec)):
            # This is tricky; the bindings can be used recursively in the
            # bodies of a letrec:
            #
            #    letrec f1 = ....f1 ... f2 ....
            #           f2 = ... f1 ... f2 ....
            #           ....
            #    in ... f1 ... f2 ...
            #
            # So we must make all the names in the bindings bound and then
            # look at all the definitions.
            #
            # We push several POPFRAME at the to account for that.
            bindings.extend(node.bindings.keys())
            nodes.extend(POPFRAME for _ in node.bindings.keys())
            nodes.extend(node.bindings.values())
            nodes.append(node.body)
        else:
            assert False, f'Unknown AST node: {node!r}'
    return result