# JSON-Lang/parse.py

from emis_funky_funktions import *

from dataclasses import dataclass
from functools import wraps
from operator import contains
from typing import Callable, Collection, Mapping, TypeGuard

@dataclass(frozen=True)
class Action(Generic[A]):
    f: Callable[[Sequence[A]], Sequence[A]]
    def __call__(self, i: Sequence[A]) -> Sequence[A]:
        return self.f(i)

def _expected(row: Mapping[B, Collection[Sequence[Any]]]) -> Collection[B]:
    """
    List the terminals which a single oracle-table row is able to accept.

    A terminal counts as expected when the row maps it to at least one expansion;
    terminals mapped to an empty collection are left out.  The result is a list, in the
    iteration order of `row`.
    """
    return [symbol for symbol, productions in row.items() if len(productions) > 0]

def parser(
    oracle: Mapping[A, Mapping[B, Collection[Sequence[A | B | Action[C | D]]]]],
    identify_lexeme: Callable[[D], B],
    start_symbol: A,
) -> Callable[[Sequence[D]], Result[Sequence[C | D], Tuple[D, Collection[B]]]]:
    """
    Produces a parser based on a grammar, an oracle, and a start symbol.

    The `identify_lexeme` argument should be a function which converts a lexeme into the
    token that it represents.  This allows for the actual lexemes that are being fed in to
    be more complex, and store additional data.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     S = auto()
    ...     Sum = auto()
    ...     Sum_ = auto()
    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Plus = auto()
    ...     Eof = auto()
    ...     def __repr__(self):
    ...         return self.name
    >>> build_S = Action(lambda x: x[:-1])
    >>> build_Sum = Action(lambda x: (*x[:-2], x[-1](int(x[-2][1]))))
    >>> build_Sum_1 = Action(lambda x: (*x[:-2], lambda y: x[-1] + y))
    >>> build_Sum_2 = Action(lambda x: (*x, lambda y: y))
    >>> grammar = [
    ...     (SimpleVariable.S,    [SimpleVariable.Sum, SimpleTerminal.Eof, build_S]),
    ...     (SimpleVariable.Sum,  [SimpleTerminal.Number, SimpleVariable.Sum_, build_Sum]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum, build_Sum_1]),
    ...     (SimpleVariable.Sum_, [build_Sum_2]),
    ... ]
    >>> is_term = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_term, is_var, grammar)
    >>> my_parser = parser(my_oracle_table, lambda x: x[0], SimpleVariable.S)

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 10),
    ...     (SimpleTerminal.Eof,),
    ... ])
    Ok((14,))

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Number, 10), # <--- this is invalid!
    ...     (SimpleTerminal.Eof,),
    ... ])
    Err(((Number, 10), [Plus, Eof]))
    """
    is_var: Callable[[Any], TypeGuard[A]] = p_instance(start_symbol.__class__)
    is_tok: Callable[[Any], TypeGuard[B]] = p_instance(next(iter(oracle[start_symbol].keys())).__class__)
    def inner(
        stack: Sequence[A | B | Action[C | D]],
        ast_stack: Sequence[C | D],
        lexemes: Sequence[D],
    ) -> Result[Sequence[C | D], Tuple[D, Collection[B]]]:
        match stack:
            # Action
            case [Action(f), *popped_stack]:
                return inner(popped_stack, f(ast_stack), lexemes)
            # A [Variable]
            case [top_of_stack, *popped_stack] if is_var(top_of_stack):
                expansions = oracle[top_of_stack][identify_lexeme(lexemes[0])]
                match expansions:
                    case []:
                        return Err((lexemes[0], _expected(oracle[top_of_stack])))
                    case [expansion]:
                        return inner((*expansion, *popped_stack), ast_stack, lexemes)
                    case _:
                        raise Exception('Not an LL(1) grammar!!!')
            # B [Token] (match)
            case [top_of_stack, *popped_stack] if top_of_stack == identify_lexeme(lexemes[0]):
                return inner(popped_stack, (*ast_stack, lexemes[0]), lexemes[1:])
            # B [Token] (no match)
            case [top_of_stack, *popped_stack]:
                assert is_tok(top_of_stack)
                return Err((lexemes[0], (top_of_stack,)))
            # Empty stack (finished parsing)
            case []:
                if len(lexemes):
                    return Err((lexemes[0], []))
                else:
                    return Ok(ast_stack)
        raise Exception('Unreachable!')
    return wraps(parser)(p(inner, [start_symbol], []))

if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's docstrings.
    import doctest
    # `auto`, `IntEnum`, and `oracle_table` are referenced by the doctest in `parser`'s
    # docstring.  They are imported here, before `testmod()` runs, so that those names are
    # present in the module namespace the doctests execute against.
    from enum import auto, IntEnum
    from build_oracle import oracle_table
    doctest.testmod()