# JSON-Lang/parse.py

from emis_funky_funktions import *
from dataclasses import dataclass
from functools import wraps
from operator import contains
from typing import Any, Callable, Collection, Mapping, Sequence, Tuple, TypeAlias, TypeGuard


def _expected(row: Mapping[B, Collection[Sequence[Any]]]) -> Collection[B]:
    """
    Given a single row from an oracle table, identify the expected terminals
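
    For illustration, with plain string terminals (hypothetical; real tables use the
    grammar's token type), only terminals that still have at least one expansion are
    reported:

    >>> _expected({'number': [['Sum']], 'plus': []})
    ['number']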
"""
return [
terminal
for terminal, expansions in row.items()
if len(expansions)
]
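

# An Action rewrites the AST stack during a parse: it takes the current (immutable)
# stack and returns a new one, typically popping a few items and pushing a freshly
# constructed node in their place.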
Action: TypeAlias = Callable[[Sequence[C | D]], Sequence[C | D]]


def parser(
    oracle: Mapping[A, Mapping[B, Collection[Sequence[A | B | Action]]]],
    identify_lexeme: Callable[[D], B],
    start_symbol: A,
) -> Callable[[Sequence[D]], Result[Sequence[C | D], Tuple[D, Collection[B]]]]:
"""
Produces a parser based on a grammar, an oracle, and a start symbol.
The `identify_lexeme` argument should be a function which converts a lexeme into the
token that it represents. This allows for the actual lexemes that are being fed in to
be more complex, and store additional data.
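
    For instance, if each lexeme is a tuple whose first element is its token (as in
    the example below), a suitable `identify_lexeme` is just a projection:

    >>> identify = lambda lexeme: lexeme[0]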

    The oracle table may include "action" annotations in its sequences. Actions should
    be instances of `Action` and operate on the AST stack. Every matched lexeme is
    pushed to the AST stack. An action may transform this stack by popping some number
    of items off of it, constructing an AST node, pushing that node back onto the
    stack, and then returning the modified stack (see the short example below).

    A couple of things to note about this process:

    - The stack that is passed to each action is immutable. "Modifications" should be
      made by simply constructing and returning a new stack.
    - The most recently pushed item sits at index zero.
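
    For example, an action that pops the two most recently pushed items and pushes
    their sum (a purely illustrative sketch, unrelated to the grammar below) could be
    written as:

    >>> add_top_two = lambda stack: (stack[0] + stack[1], *stack[2:])
    >>> add_top_two((3, 4, 'rest'))
    (7, 'rest')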

    If a parse is successful, the return value will be the AST stack at the end of the
    parse. It is up to the caller to verify that this is an expected result.

    If a parse fails, the return value will be a tuple containing the erroneous lexeme
    and a collection of expected tokens which failed to match it.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     S = auto()
    ...     Sum = auto()
    ...     Sum_ = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Plus = auto()
    ...     Eof = auto()
    ...     def __repr__(self):
    ...         return self.name

    >>> build_S = lambda x: x[1:]
    >>> build_Sum = lambda x: (x[0](x[1][1]), *x[2:])
    >>> build_Sum_1 = lambda x: (lambda y: x[0] + y, *x[2:])
    >>> build_Sum_2 = lambda x: (lambda y: y, *x)

    >>> grammar = [
    ...     (SimpleVariable.S, [SimpleVariable.Sum, SimpleTerminal.Eof, build_S]),
    ...     (SimpleVariable.Sum, [SimpleTerminal.Number, SimpleVariable.Sum_, build_Sum]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum, build_Sum_1]),
    ...     (SimpleVariable.Sum_, [build_Sum_2]),
    ... ]

    >>> is_term = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_term, is_var, grammar)
    >>> my_parser = parser(my_oracle_table, lambda x: x[0], SimpleVariable.S)

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 10),
    ...     (SimpleTerminal.Eof,),
    ... ])
    Ok((14,))

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Number, 10), # <--- this is invalid!
    ...     (SimpleTerminal.Eof,),
    ... ])
    Err(((Number, 10), [Plus, Eof]))
    """
    is_var: Callable[[Any], TypeGuard[A]] = p_instance(start_symbol.__class__)
    is_tok: Callable[[Any], TypeGuard[B]] = p_instance(next(iter(oracle[start_symbol].keys())).__class__)

    def inner(
        stack: Sequence[A | B | Action],
        ast_stack: Sequence[C | D],
        lexemes: Sequence[D],
    ) -> Result[Sequence[C | D], Tuple[D, Collection[B]]]:
        match stack:
            # A [Variable]
            case [top_of_stack, *popped_stack] if is_var(top_of_stack):
                expansions = oracle[top_of_stack][identify_lexeme(lexemes[0])]
                match expansions:
                    case []:
                        return Err((lexemes[0], _expected(oracle[top_of_stack])))
                    case [expansion]:
                        return inner((*expansion, *popped_stack), ast_stack, lexemes)
                    case _:
                        raise Exception('Not an LL(1) grammar!!!')
            # B [Token] (match)
            case [top_of_stack, *popped_stack] if is_tok(top_of_stack) and top_of_stack == identify_lexeme(lexemes[0]):
                return inner(popped_stack, (lexemes[0], *ast_stack), lexemes[1:])
            # B [Token] (no match)
            case [top_of_stack, *popped_stack] if is_tok(top_of_stack):
                assert is_tok(top_of_stack)
                return Err((lexemes[0], (top_of_stack,)))
            # Action
            case [f, *popped_stack]:
                assert hasattr(f, '__call__')
                return inner(popped_stack, f(ast_stack), lexemes)
            # Empty stack (finished parsing)
            case []:
                if len(lexemes):
                    return Err((lexemes[0], []))
                else:
                    return Ok(ast_stack)
        raise Exception('Unreachable!')
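
    # functools.wraps copies parser's metadata onto the returned function; p appears to
    # partially apply inner to the initial parse stack ([start_symbol]) and an empty
    # AST stack, so the result takes just the lexeme sequence.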
    return wraps(parser)(p(inner, [start_symbol], []))


if __name__ == '__main__':
    import doctest
    from enum import auto, IntEnum
    from build_oracle import oracle_table
    doctest.testmod()