360 lines
12 KiB
Python
360 lines
12 KiB
Python
"""
|
|
Tools for building an oracle table
|
|
|
|
See `grammar` and `build_oracle.sh` for scripts which actually produce python code. This
|
|
module only produces an oracle table in python, without outputting it.
|
|
"""
|
|
from emis_funky_funktions import *
|
|
|
|
from enum import auto, Enum, IntEnum
|
|
from functools import cache, reduce
|
|
from operator import getitem
|
|
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
|
|
|
|
def _erase_actions_h(
|
|
handle: Sequence[A | B | C],
|
|
is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
|
|
) -> Sequence[A | B]:
|
|
"""
|
|
Produce an identical handle, but with all the actions removed
|
|
"""
|
|
return [i for i in handle if is_not_c(i)]
|
|
|
|
def _erase_actions(
    grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
    is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[Tuple[A, Sequence[A | B]]]:
    """
    Strip the actions out of every production of a grammar

    Each `(variable, handle)` production is kept, in order, with its handle replaced
    by the action-free copy that `_erase_actions_h` builds for it.
    """
    stripped = []
    for (lhs, rhs) in grammar:
        stripped.append((lhs, _erase_actions_h(rhs, is_not_c)))
    return stripped
|
|
|
|
def _first(
|
|
is_term: Callable[[A | B], TypeGuard[B]],
|
|
grammar: Sequence[Tuple[A, Sequence[A | B]]],
|
|
sequence: Sequence[A | B]
|
|
) -> Tuple[Collection[B], bool]:
|
|
"""
|
|
Computes all of the possible starting terminals for a handle in a given grammar
|
|
|
|
Due to pathetic python weaknesses, the first argument you must provide is a type guard
|
|
to determine whether a certain thing is a terminal as opposed to a variable.
|
|
|
|
Then, pass in the grammar and the sequence of terminals and variables in question.
|
|
|
|
The output contains two values. The first is a set of possible terminals, and the
|
|
second is a boolean indicating whether this term can derive epsilon.
|
|
|
|
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause])
|
|
({Negate, Identifier}, False)
|
|
|
|
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms])
|
|
({Comma}, True)
|
|
|
|
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP])
|
|
({CloseP, Comma}, False)
|
|
"""
|
|
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
|
|
match vs:
|
|
case []:
|
|
return (set(), True)
|
|
case [v, *rest] if is_term(v):
|
|
return ({v}, False)
|
|
case [v, *rest]:
|
|
this_variable_first, derives_epsilon = reduce(
|
|
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
|
|
[
|
|
inner(handle)
|
|
for (other_variable, handle) in grammar
|
|
if other_variable == v
|
|
]
|
|
)
|
|
if derives_epsilon:
|
|
rest_first, rest_derives_epsilon = inner(rest)
|
|
return (rest_first | this_variable_first, rest_derives_epsilon)
|
|
else:
|
|
return (this_variable_first, False)
|
|
raise Exception("UNREACHABLE")
|
|
return inner(sequence)
|
|
|
|
def _follow(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
    """
    Build the table of terminals which may come directly after each variable

    Every variable with a production starts with an empty set.  The table is then
    recomputed from the previous one until a pass leaves it unchanged.

    >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE
    {<Start>: set(),
     <Idents>: {Newline},
     <Clauses>: {Eof},
     <Clauses_>: {Eof},
     <Clause>: {Newline, Eof},
     <Clause_>: {Newline, Eof},
     <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <CSTerms>: {CloseP}}
    """
    def tokens_after(suffix: Sequence[A | B], lhs_follow: Set[B]) -> Set[B]:
        # Whatever can start the suffix can follow the symbol before it; if the
        # suffix can vanish, so can whatever follows the production's variable.
        suffix_first, suffix_nullable = _first(is_term, grammar, suffix)
        starters = set(suffix_first)
        if suffix_nullable:
            return starters | lhs_follow
        return starters

    table: Mapping[A, Set[B]] = {lhs: set() for (lhs, _) in grammar}
    while True:
        # All contributions of one pass are computed from the previous table
        # before any of them is folded in.
        contributions = [
            (cast(A, symbol), tokens_after(rhs[position + 1:], table[lhs]))
            for (lhs, rhs) in grammar
            for (position, symbol) in enumerate(rhs)
            if not is_term(symbol)
        ]
        updated = table
        for (variable, tokens) in contributions:
            updated = updated | {variable: updated[variable] | tokens}
        if updated == table:
            # Fixed point reached: another pass would add nothing.
            return updated
        table = updated
|
|
|
|
def _predict(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
    follow: Mapping[A, Collection[B]],
    lhs: A,
    rhs: Sequence[A | B]
) -> Collection[B]:
    """
    Find the lookahead terminals under which the production `lhs -> rhs` applies

    These are the terminals that can start `rhs`, joined with the follow set of
    `lhs` whenever `rhs` is able to derive epsilon.

    >>> is_tok = flip(cur2(isinstance))(Tok)
    >>> follow = _follow(is_tok, GRAMMAR)
    >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
    {Negate, Identifier}
    """
    starters, nullable = _first(is_term, grammar, rhs)
    if not nullable:
        return starters
    # The handle may vanish, so anything following the variable also selects it.
    return set(follow[lhs]) | set(starters)
|
|
|
|
def oracle(
    is_term: Callable[[A | B | C], TypeGuard[B]],
    is_var: Callable[[A | B | C], TypeGuard[A]],
    grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
) -> Callable[[A, B], Collection[Sequence[A | B | C]]]:
    """
    Build a lookup from (variable, next terminal) to the applicable expansions

    A grammar which is properly LL(1) never yields more than one expansion for any
    pair.

    The action-free grammar and its follow table are computed once, when `oracle` is
    called.  The function handed back is memoized with `cache`, so its arguments have
    to be hashable.  The expansions it returns are the original handles, actions
    included.

    >>> is_tok = p_instance(Tok)
    >>> is_var = p_instance(Variable)
    >>> my_oracle = oracle(is_tok, is_var, GRAMMAR)

    One valid expansion:
    >>> my_oracle(Variable.Clauses_, Tok.Negate)
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle(Variable.Clauses_, Tok.Eof)
    [[]]

    Zero valid expansions:
    >>> my_oracle(Variable.Term, Tok.Newline)
    []
    """
    def is_symbol(x: A | B | C) -> TypeGuard[A | B]:
        # Anything that is neither a terminal nor a variable is an action.
        return is_term(x) or is_var(x)

    bare_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_symbol)
    follow_sets = _follow(is_term, bare_grammar)

    @wraps(oracle)
    @cache
    def inner(v: A, c: B) -> Collection[Sequence[A | B | C]]:
        expansions = []
        for (lhs, handle) in grammar:
            # Prediction runs on the action-free handle, but the handle reported
            # back is the original one.
            if lhs == v and c in _predict(
                is_term, bare_grammar, follow_sets, lhs, _erase_actions_h(handle, is_symbol)
            ):
                expansions.append(handle)
        return expansions

    return inner
|
|
|
|
def oracle_table(
    is_term: Callable[[A | B], TypeGuard[B]],
    is_var: Callable[[A | B], TypeGuard[A]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
    """
    An eager counterpart of `oracle` which fills in the whole table up front

    The rows are the variables appearing on the left of some production, and the
    columns are the terminals appearing in some handle.

    No significant performance benefit

    >>> is_tok = p_instance(Tok)
    >>> is_var = p_instance(Variable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, GRAMMAR)

    One valid expansion:
    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
    [[]]

    Zero valid expansions:
    >>> my_oracle_table[Variable.Term][Tok.Newline]
    []
    """
    variables = {lhs for (lhs, _) in grammar}
    terminals = {
        symbol
        for (_, rhs) in grammar
        for symbol in rhs
        if is_term(symbol)
    }
    lookup = oracle(is_term, is_var, grammar)

    table = {}
    for variable in variables:
        row = {}
        for terminal in terminals:
            row[terminal] = lookup(variable, terminal)
        table[variable] = row
    return table
|
|
|
|
def print_oracle_table(
    oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]],
    render: Callable[[A | B], str],
) -> str:
    """
    Format an oracle table as text

    Despite the name, nothing is printed: the formatted table is returned as a string.

    `render` is called on every variable and terminal in the table.  When the strings
    it produces are valid python, the text returned here is valid python too.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()

    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]

    >>> is_tok = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, grammar)
    >>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
    SimpleVariable.Sum: {
        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Plus: []
    },
    SimpleVariable.Sum_: {
        SimpleTerminal.Number: [],
        SimpleTerminal.Letter: [],
        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
    },
    SimpleVariable.Term: {
        SimpleTerminal.Number: [[SimpleTerminal.Number]],
        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
        SimpleTerminal.Plus: []
    }
    }
    """
    def show_expansion(expansion: Sequence[A | B]) -> str:
        # One expansion becomes a bracketed, comma separated list of symbols.
        return '[' + ', '.join(render(symbol) for symbol in expansion) + ']'

    def show_entry(terminal: B, expansions: Collection[Sequence[A | B]]) -> str:
        # One tab-indented `terminal: [expansions...]` line of a variable's row.
        opening = f'\t{render(terminal)}: ['
        return opening + ', '.join(map(show_expansion, expansions)) + ']'

    def show_row(variable: A, term_table: Mapping[B, Collection[Sequence[A | B]]]) -> str:
        # One `variable: { ... }` section with an entry per terminal.
        opening = f'{render(variable)}: {{\n'
        entries = [
            show_entry(terminal, expansions)
            for (terminal, expansions) in term_table.items()
        ]
        return opening + ',\n'.join(entries) + '\n}'

    rows = [
        show_row(variable, term_table)
        for (variable, term_table) in oracle_table.items()
    ]
    return '{\n' + ',\n'.join(rows) + '\n}'
|
|
|
|
# Type variables for tables whose variables (EA) and terminals (EB) are enum members.
EA = TypeVar('EA', bound=Enum)
EB = TypeVar('EB', bound=Enum)


def print_oracle_table_enum(
    oracle_table: Mapping[EA, Mapping[EB, Collection[Sequence[EA | EB]]]]
) -> str:
    """
    A special case of `print_oracle_table` where tokens and variables are enums

    Every symbol is rendered as `ClassName.member_name`, using the symbol's class name
    and its enum `name`.

    The signature is written in terms of the enum-bounded type variables `EA` and
    `EB`, so the `.name` access in the renderer is backed by the declared types.

    Always produces valid python.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()

    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]

    >>> is_tok = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, grammar)
    >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table)
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
    SimpleVariable.Sum: {
        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Plus: []
    },
    SimpleVariable.Sum_: {
        SimpleTerminal.Number: [],
        SimpleTerminal.Letter: [],
        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
    },
    SimpleVariable.Term: {
        SimpleTerminal.Number: [[SimpleTerminal.Number]],
        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
        SimpleTerminal.Plus: []
    }
    }
    """
    return print_oracle_table(oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
|
|
|
|
if __name__ == "__main__":
    # The doctests in this module refer to GRAMMAR, Tok and Variable, so they are
    # imported into the module namespace before the tests are run.
    from grammar import GRAMMAR, Tok, Variable
    import doctest

    doctest.testmod()