125 lines
4.7 KiB
Python
125 lines
4.7 KiB
Python
from emis_funky_funktions import *
|
|
|
|
from dataclasses import dataclass
|
|
from functools import wraps
|
|
from operator import contains
|
|
from typing import Callable, Collection, Mapping, TypeGuard
|
|
|
|
@dataclass(frozen=True)
class Action(Generic[A]):
    """
    A callable wrapper around a sequence-to-sequence function.

    Instances can be placed inside grammar expansions; `parser` recognizes them by
    type and applies the wrapped function to the current AST stack when one reaches
    the top of the parse stack.
    """
    # The wrapped transformation: receives a sequence and returns the replacement.
    f: Callable[[Sequence[A]], Sequence[A]]

    def __call__(self, i: Sequence[A]) -> Sequence[A]:
        """
        Apply the wrapped function to `i` and return whatever it returns.
        """
        transform = self.f
        return transform(i)
|
|
|
|
def _expected(row: Mapping[B, Collection[Sequence[Any]]]) -> Collection[B]:
    """
    List the terminals of one oracle-table row that have at least one expansion.

    The terminals are returned in the iteration order of `row`; terminals whose
    collection of expansions is empty are left out.
    """
    viable = []
    for terminal, expansions in row.items():
        # An empty cell means this terminal cannot legally appear here.
        if len(expansions) == 0:
            continue
        viable.append(terminal)
    return viable
|
|
|
|
def parser(
    oracle: Mapping[A, Mapping[B, Collection[Sequence[A | B | Action[C | D]]]]],
    identify_lexeme: Callable[[D], B],
    start_symbol: A,
) -> Callable[[Sequence[D]], Result[Sequence[C | D], Tuple[D, Collection[B]]]]:
    """
    Produces a parser based on an oracle table, a lexeme identifier, and a start symbol.

    The `identify_lexeme` argument should be a function which converts a lexeme into the
    token that it represents.  This allows for the actual lexemes that are being fed in to
    be more complex, and store additional data.

    The returned callable takes a sequence of lexemes and returns:

    * `Ok(ast_stack)` when the whole input was consumed and the parse stack emptied.
    * `Err((lexeme, expected))` at the first lexeme which cannot be accepted, where
      `expected` holds the terminals that would have been accepted at that point (it is
      empty when input remains after the parse stack has emptied).

    Raises `Exception('Not an LL(1) grammar!!!')` if an oracle cell holds more than one
    expansion.  If the input runs out while symbols other than actions are still on the
    parse stack, the `IndexError` from reading past the end of the input propagates, so
    grammars should consume an explicit end-of-input terminal as in the example below.

    Variables are recognized as instances of `start_symbol`'s class.  The parse runs as
    a loop rather than by recursion, so the length of the input is not limited by the
    interpreter's recursion limit.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     S = auto()
    ...     Sum = auto()
    ...     Sum_ = auto()
    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Plus = auto()
    ...     Eof = auto()
    ...     def __repr__(self):
    ...         return self.name
    >>> build_S = Action(lambda x: x[:-1])
    >>> build_Sum = Action(lambda x: (*x[:-2], x[-1](int(x[-2][1]))))
    >>> build_Sum_1 = Action(lambda x: (*x[:-2], lambda y: x[-1] + y))
    >>> build_Sum_2 = Action(lambda x: (*x, lambda y: y))
    >>> grammar = [
    ...     (SimpleVariable.S, [SimpleVariable.Sum, SimpleTerminal.Eof, build_S]),
    ...     (SimpleVariable.Sum, [SimpleTerminal.Number, SimpleVariable.Sum_, build_Sum]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum, build_Sum_1]),
    ...     (SimpleVariable.Sum_, [build_Sum_2]),
    ... ]
    >>> is_term = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_term, is_var, grammar)
    >>> my_parser = parser(my_oracle_table, lambda x: x[0], SimpleVariable.S)

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 10),
    ...     (SimpleTerminal.Eof,),
    ... ])
    Ok((14,))

    >>> my_parser([
    ...     (SimpleTerminal.Number, 1),
    ...     (SimpleTerminal.Plus,),
    ...     (SimpleTerminal.Number, 3),
    ...     (SimpleTerminal.Number, 10), # <--- this is invalid!
    ...     (SimpleTerminal.Eof,),
    ... ])
    Err(((Number, 10), [Plus, Eof]))
    """
    is_var: Callable[[Any], TypeGuard[A]] = p_instance(start_symbol.__class__)
    # The token class is inferred from the first terminal key in the start symbol's row.
    is_tok: Callable[[Any], TypeGuard[B]] = p_instance(next(iter(oracle[start_symbol].keys())).__class__)

    def inner(
        stack: Sequence[A | B | Action[C | D]],
        ast_stack: Sequence[C | D],
        lexemes: Sequence[D],
    ) -> Result[Sequence[C | D], Tuple[D, Collection[B]]]:
        # `stack` arrives with its top at index 0.  Work on a reversed private copy so
        # that pushes and pops are O(1) at the end of the list, and so that the initial
        # stack shared between invocations of the returned parser is never mutated.
        pending = list(stack)
        pending.reverse()
        # Index of the next unconsumed lexeme; avoids re-slicing the input per token.
        position = 0

        while pending:
            top_of_stack = pending.pop()

            # Action: rewrite the AST stack; consumes no input
            if isinstance(top_of_stack, Action):
                ast_stack = top_of_stack.f(ast_stack)

            # A [Variable]: replace it with the expansion the oracle selects
            elif is_var(top_of_stack):
                row = oracle[top_of_stack]
                expansions = row[identify_lexeme(lexemes[position])]
                # Dispatch on size rather than on a sequence pattern so that any
                # Collection (e.g. a set) of expansions is handled, as annotated.
                if len(expansions) == 0:
                    return Err((lexemes[position], _expected(row)))
                if len(expansions) > 1:
                    raise Exception('Not an LL(1) grammar!!!')
                (expansion,) = expansions
                # Push right-to-left so the expansion's first symbol ends up on top.
                pending.extend(reversed(tuple(expansion)))

            # B [Token] (match): shift the lexeme onto the AST stack
            elif top_of_stack == identify_lexeme(lexemes[position]):
                ast_stack = (*ast_stack, lexemes[position])
                position += 1

            # B [Token] (no match)
            else:
                assert is_tok(top_of_stack)
                return Err((lexemes[position], (top_of_stack,)))

        # Empty stack (finished parsing): any leftover input is an error
        if position < len(lexemes):
            return Err((lexemes[position], []))
        return Ok(ast_stack)

    return wraps(parser)(p(inner, [start_symbol], []))
|
|
|
|
if __name__ == '__main__':
    # Running this module as a script executes the doctests in its docstrings.
    import doctest

    # The examples in `parser`'s docstring refer to these names; importing them here
    # places them in this module's globals, which is where doctest looks names up.
    from enum import IntEnum, auto
    from build_oracle import oracle_table

    doctest.testmod()