2023-03-04 18:31:44 +00:00
|
|
|
from emis_funky_funktions import *
|
|
|
|
|
2023-03-04 21:37:55 +00:00
|
|
|
from dataclasses import dataclass
|
2023-03-04 18:31:44 +00:00
|
|
|
from functools import wraps
|
|
|
|
from operator import contains
|
2023-03-05 01:00:00 +00:00
|
|
|
from typing import Callable, Collection, Mapping, TypeGuard, TypeAlias
|
2023-03-04 21:37:55 +00:00
|
|
|
|
|
|
|
def _expected(row: Mapping[B, Collection[Sequence[Any]]]) -> Collection[B]:
|
|
|
|
"""
|
|
|
|
Given a single row from an oracle table, identify the expected terminals
|
|
|
|
"""
|
|
|
|
return [
|
|
|
|
terminal
|
|
|
|
for terminal, expansions in row.items()
|
|
|
|
if len(expansions)
|
|
|
|
]
|
|
|
|
|
2023-03-05 01:00:00 +00:00
|
|
|
# A semantic action that can be embedded in an oracle-table sequence: it is called with
# the current AST stack and returns the stack that should replace it.
Action: TypeAlias = Callable[[Sequence[C | D]], Sequence[C | D]]
|
2023-03-04 18:31:44 +00:00
|
|
|
def parser(
|
2023-03-05 01:00:00 +00:00
|
|
|
oracle: Mapping[A, Mapping[B, Collection[Sequence[A | B | Action]]]],
|
2023-03-04 21:37:55 +00:00
|
|
|
identify_lexeme: Callable[[D], B],
|
2023-03-04 18:31:44 +00:00
|
|
|
start_symbol: A,
|
2023-03-04 21:37:55 +00:00
|
|
|
) -> Callable[[Sequence[D]], Result[Sequence[C | D], Tuple[D, Collection[B]]]]:
|
2023-03-04 18:31:44 +00:00
|
|
|
"""
|
|
|
|
Produces a parser based on a grammar, an oracle, and a start symbol.
|
|
|
|
|
|
|
|
The `identify_lexeme` argument should be a function which converts a lexeme into the
|
|
|
|
token that it represents. This allows for the actual lexemes that are being fed in to
|
|
|
|
be more complex, and store additional data.
|
|
|
|
|
2023-03-05 00:31:30 +00:00
|
|
|
The oracle table my include "action" annotations in its sequences. Actions should be
|
|
|
|
an instance of `Action`, and should work on the AST stack. Every matched lexeme is
|
|
|
|
pushed to the AST stack. An action may transform this stack by popping some number of
|
|
|
|
items off of it, constructing some AST, pushing that AST back to the stack, and then
|
|
|
|
returning the modified stack.
|
|
|
|
|
|
|
|
A couple things to note about this process:
|
|
|
|
- The stack that is passed to each action is immutable. "Modifications" should be
|
|
|
|
made by simply constructing and returning a new stack.
|
|
|
|
- The bottom of the stack is the zero index.
|
|
|
|
|
|
|
|
If a parse is successful, the return value will be the AST stack at the end of the
|
|
|
|
parse. It is up the the caller to verify that this is an expected result.
|
|
|
|
|
|
|
|
If a parse fails, the return value will be a tuple containing the erronious lexeme and
|
|
|
|
a collection of expected tokens which failed to match it.
|
|
|
|
|
2023-03-04 18:31:44 +00:00
|
|
|
### Example:
|
|
|
|
|
|
|
|
We generate a simple grammar:
|
|
|
|
|
|
|
|
>>> class SimpleVariable(IntEnum):
|
|
|
|
... S = auto()
|
|
|
|
... Sum = auto()
|
|
|
|
... Sum_ = auto()
|
|
|
|
>>> class SimpleTerminal(IntEnum):
|
|
|
|
... Number = auto()
|
|
|
|
... Plus = auto()
|
|
|
|
... Eof = auto()
|
2023-03-04 21:37:55 +00:00
|
|
|
... def __repr__(self):
|
|
|
|
... return self.name
|
2023-03-05 01:00:00 +00:00
|
|
|
>>> build_S = lambda x: x[1:]
|
|
|
|
>>> build_Sum = lambda x: (x[0](x[1][1]), *x[2:])
|
|
|
|
>>> build_Sum_1 = lambda x: (lambda y: x[0] + y, *x[2:])
|
|
|
|
>>> build_Sum_2 = lambda x: (lambda y: y, *x)
|
2023-03-04 18:31:44 +00:00
|
|
|
>>> grammar = [
|
2023-03-04 21:37:55 +00:00
|
|
|
... (SimpleVariable.S, [SimpleVariable.Sum, SimpleTerminal.Eof, build_S]),
|
|
|
|
... (SimpleVariable.Sum, [SimpleTerminal.Number, SimpleVariable.Sum_, build_Sum]),
|
|
|
|
... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum, build_Sum_1]),
|
|
|
|
... (SimpleVariable.Sum_, [build_Sum_2]),
|
2023-03-04 18:31:44 +00:00
|
|
|
... ]
|
2023-03-04 21:37:55 +00:00
|
|
|
>>> is_term = p_instance(SimpleTerminal)
|
|
|
|
>>> is_var = p_instance(SimpleVariable)
|
|
|
|
>>> my_oracle_table = oracle_table(is_term, is_var, grammar)
|
2023-03-04 18:31:44 +00:00
|
|
|
>>> my_parser = parser(my_oracle_table, lambda x: x[0], SimpleVariable.S)
|
|
|
|
|
|
|
|
>>> my_parser([
|
|
|
|
... (SimpleTerminal.Number, 1),
|
|
|
|
... (SimpleTerminal.Plus,),
|
2023-03-04 21:37:55 +00:00
|
|
|
... (SimpleTerminal.Number, 3),
|
2023-03-04 18:31:44 +00:00
|
|
|
... (SimpleTerminal.Plus,),
|
|
|
|
... (SimpleTerminal.Number, 10),
|
|
|
|
... (SimpleTerminal.Eof,),
|
|
|
|
... ])
|
2023-03-04 21:37:55 +00:00
|
|
|
Ok((14,))
|
2023-03-04 18:31:44 +00:00
|
|
|
|
|
|
|
>>> my_parser([
|
|
|
|
... (SimpleTerminal.Number, 1),
|
|
|
|
... (SimpleTerminal.Plus,),
|
2023-03-04 21:37:55 +00:00
|
|
|
... (SimpleTerminal.Number, 3),
|
2023-03-04 18:31:44 +00:00
|
|
|
... (SimpleTerminal.Number, 10), # <--- this is invalid!
|
|
|
|
... (SimpleTerminal.Eof,),
|
|
|
|
... ])
|
2023-03-04 21:37:55 +00:00
|
|
|
Err(((Number, 10), [Plus, Eof]))
|
2023-03-04 18:31:44 +00:00
|
|
|
"""
|
2023-03-04 21:37:55 +00:00
|
|
|
is_var: Callable[[Any], TypeGuard[A]] = p_instance(start_symbol.__class__)
|
|
|
|
is_tok: Callable[[Any], TypeGuard[B]] = p_instance(next(iter(oracle[start_symbol].keys())).__class__)
|
|
|
|
def inner(
|
2023-03-05 01:00:00 +00:00
|
|
|
stack: Sequence[A | B | Action],
|
2023-03-04 21:37:55 +00:00
|
|
|
ast_stack: Sequence[C | D],
|
|
|
|
lexemes: Sequence[D],
|
|
|
|
) -> Result[Sequence[C | D], Tuple[D, Collection[B]]]:
|
2023-03-04 18:31:44 +00:00
|
|
|
match stack:
|
2023-03-04 21:37:55 +00:00
|
|
|
# A [Variable]
|
|
|
|
case [top_of_stack, *popped_stack] if is_var(top_of_stack):
|
2023-03-05 03:02:16 +00:00
|
|
|
try:
|
|
|
|
expansions = oracle[top_of_stack][identify_lexeme(lexemes[0])]
|
|
|
|
except IndexError:
|
|
|
|
raise Exception('Unexpected end of input. Expected:', _expected(oracle[top_of_stack]))
|
2023-03-04 18:31:44 +00:00
|
|
|
match expansions:
|
|
|
|
case []:
|
2023-03-04 21:37:55 +00:00
|
|
|
return Err((lexemes[0], _expected(oracle[top_of_stack])))
|
2023-03-04 18:31:44 +00:00
|
|
|
case [expansion]:
|
2023-03-04 21:37:55 +00:00
|
|
|
return inner((*expansion, *popped_stack), ast_stack, lexemes)
|
2023-03-04 18:31:44 +00:00
|
|
|
case _:
|
|
|
|
raise Exception('Not an LL(1) grammar!!!')
|
2023-03-04 21:37:55 +00:00
|
|
|
# B [Token] (match)
|
2023-03-05 01:00:00 +00:00
|
|
|
case [top_of_stack, *popped_stack] if is_tok(top_of_stack) and top_of_stack == identify_lexeme(lexemes[0]):
|
2023-03-05 00:31:30 +00:00
|
|
|
return inner(popped_stack, (lexemes[0], *ast_stack), lexemes[1:])
|
2023-03-04 21:37:55 +00:00
|
|
|
# B [Token] (no match)
|
2023-03-05 01:00:00 +00:00
|
|
|
case [top_of_stack, *popped_stack] if is_tok(top_of_stack):
|
2023-03-04 21:37:55 +00:00
|
|
|
assert is_tok(top_of_stack)
|
|
|
|
return Err((lexemes[0], (top_of_stack,)))
|
2023-03-05 01:00:00 +00:00
|
|
|
# Action
|
|
|
|
case [f, *popped_stack]:
|
|
|
|
assert hasattr(f, '__call__')
|
|
|
|
return inner(popped_stack, f(ast_stack), lexemes)
|
2023-03-04 21:37:55 +00:00
|
|
|
# Empty stack (finished parsing)
|
2023-03-04 18:31:44 +00:00
|
|
|
case []:
|
2023-03-04 21:37:55 +00:00
|
|
|
if len(lexemes):
|
|
|
|
return Err((lexemes[0], []))
|
|
|
|
else:
|
|
|
|
return Ok(ast_stack)
|
2023-03-04 18:31:44 +00:00
|
|
|
raise Exception('Unreachable!')
|
2023-03-04 21:37:55 +00:00
|
|
|
return wraps(parser)(p(inner, [start_symbol], []))
|
2023-03-04 18:31:44 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Running this file as a script executes the doctests embedded in its docstrings.
    import doctest
    # The `parser` docstring example refers to `IntEnum`, `auto` and `oracle_table`;
    # they are imported here, before the doctests run, so the example can resolve them.
    from enum import auto, IntEnum
    from build_oracle import oracle_table
    doctest.testmod()
|