JSON-Lang/build_oracle.py

360 lines
12 KiB
Python

"""
Tools for building an oracle table

See `grammar` and `build_oracle.sh` for the scripts which actually emit Python code. This
module only builds an oracle table as a Python object; it does not write it out anywhere.
"""
from emis_funky_funktions import *
from enum import auto, Enum, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
def _erase_actions_h(
handle: Sequence[A | B | C],
is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[A | B]:
"""
Produce an identical handle, but with all the actions removed
"""
return [i for i in handle if is_not_c(i)]
def _erase_actions(
    grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
    is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[Tuple[A, Sequence[A | B]]]:
    """
    Strip every action out of every production in a grammar

    Each `(variable, handle)` production keeps its variable and its position in the
    grammar; only the handle is replaced by a copy filtered through `is_not_c`.
    """
    def strip_production(
        production: Tuple[A, Sequence[A | B | C]]
    ) -> Tuple[A, Sequence[A | B]]:
        var, handle = production
        return (var, _erase_actions_h(handle, is_not_c))

    return list(map(strip_production, grammar))
def _first(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
sequence: Sequence[A | B]
) -> Tuple[Collection[B], bool]:
"""
Computes all of the possible starting terminals for a handle in a given grammar
Due to pathetic python weaknesses, the first argument you must provide is a type guard
to determine whether a certain thing is a terminal as opposed to a variable.
Then, pass in the grammar and the sequence of terminals and variables in question.
The output contains two values. The first is a set of possible terminals, and the
second is a boolean indicating whether this term can derive epsilon.
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause])
({Negate, Identifier}, False)
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms])
({Comma}, True)
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP])
({CloseP, Comma}, False)
"""
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
match vs:
case []:
return (set(), True)
case [v, *rest] if is_term(v):
return ({v}, False)
case [v, *rest]:
this_variable_first, derives_epsilon = reduce(
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
[
inner(handle)
for (other_variable, handle) in grammar
if other_variable == v
]
)
if derives_epsilon:
rest_first, rest_derives_epsilon = inner(rest)
return (rest_first | this_variable_first, rest_derives_epsilon)
else:
return (this_variable_first, False)
raise Exception("UNREACHABLE")
return inner(sequence)
def _follow(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
    """
    Build a table of the terminals which may appear directly after each variable

    The table has one entry per variable found on the left-hand side of a production.
    It starts out with every entry empty and is repeatedly extended until a full pass
    over the grammar adds nothing new.

    >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE
    {<Start>: set(),
      <Idents>: {Newline},
      <Clauses>: {Eof},
      <Clauses_>: {Eof},
      <Clause>: {Newline, Eof},
      <Clause_>: {Newline, Eof},
      <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
      <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
      <CSTerms>: {CloseP}}
    """
    def tokens_after(suffix: Sequence[A | B], parent_follow: Set[B]) -> Set[B]:
        # Whatever can start the remainder of the handle can follow the variable; if
        # that remainder can vanish, so can whatever follows the production's own variable
        suffix_first, suffix_derives_epsilon = _first(is_term, grammar, suffix)
        if suffix_derives_epsilon:
            return set(suffix_first) | parent_follow
        return set(suffix_first)

    current: Mapping[A, Set[B]] = {
        variable: set()
        for (variable, _) in grammar
    }
    while True:
        # Every contribution of this pass is computed against the table from the
        # previous pass, before any of them is merged in
        contributions = [
            (
                cast(A, symbol),
                tokens_after(handle[position + 1:], current[lhs])
            )
            for (lhs, handle) in grammar
            for (position, symbol) in enumerate(handle)
            if not is_term(symbol)
        ]
        updated = dict(current)
        for (variable, tokens) in contributions:
            updated[variable] = updated[variable] | tokens
        if updated == current:
            return updated
        current = updated
def _predict(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
    follow: Mapping[A, Collection[B]],
    lhs: A,
    rhs: Sequence[A | B]
) -> Collection[B]:
    """
    Identify the terminals under which the production `lhs -> rhs` may be chosen

    These are the terminals which can start `rhs`, together with the terminals in
    `follow[lhs]` whenever `rhs` is able to derive epsilon.

    >>> is_tok = flip(cur2(isinstance))(Tok)
    >>> follow = _follow(is_tok, GRAMMAR)
    >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
    {Negate, Identifier}
    """
    starting_terminals, rhs_derives_epsilon = _first(is_term, grammar, rhs)
    if not rhs_derives_epsilon:
        return starting_terminals
    # The right-hand side may vanish entirely, so anything following `lhs` is valid too
    return set(follow[lhs]) | set(starting_terminals)
def oracle(
    is_term: Callable[[A | B | C], TypeGuard[B]],
    is_var: Callable[[A | B | C], TypeGuard[A]],
    grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
) -> Callable[[A, B], Collection[Sequence[A | B | C]]]:
    """
    Build a function listing the valid expansions of a variable given the next terminal

    For valid LL(1) grammars, there should never be more than one valid expansion.

    Anything in a handle which is neither a terminal nor a variable is treated as an
    action: actions are ignored while deciding which productions apply, but the handles
    which are returned still contain them.

    The function which is returned is memoized for your convenience.

    >>> is_tok = p_instance(Tok)
    >>> is_var = p_instance(Variable)
    >>> my_oracle = oracle(is_tok, is_var, GRAMMAR)

    One valid expansion:
    >>> my_oracle(Variable.Clauses_, Tok.Negate)
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle(Variable.Clauses_, Tok.Eof)
    [[]]

    Zero valid expansions:
    >>> my_oracle(Variable.Term, Tok.Newline)
    []
    """
    def is_not_c(x: A | B | C) -> TypeGuard[A | B]:
        return is_term(x) or is_var(x)

    # The follow table depends only on the grammar, so it is computed once up front
    e_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_not_c)
    follow = _follow(is_term, e_grammar)

    def inner(v: A, c: B) -> Collection[Sequence[A | B | C]]:
        expansions = []
        for (lhs, handle) in grammar:
            if lhs == v:
                bare_handle = _erase_actions_h(handle, is_not_c)
                if c in _predict(is_term, e_grammar, follow, lhs, bare_handle):
                    # Report the original handle, actions included
                    expansions.append(handle)
        return expansions

    # Memoize first, then copy this function's metadata onto the cached wrapper
    return wraps(oracle)(cache(inner))
def oracle_table(
    is_term: Callable[[A | B], TypeGuard[B]],
    is_var: Callable[[A | B], TypeGuard[A]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
    """
    A variant of `oracle` that generates a table immediately rather than lazily

    The table has a row for every variable on the left-hand side of some production and
    a column for every terminal appearing in some right-hand side.

    No significant performance benefit

    >>> is_tok = p_instance(Tok)
    >>> is_var = p_instance(Variable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, GRAMMAR)

    One valid expansion:
    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
    [[]]

    Zero valid expansions:
    >>> my_oracle_table[Variable.Term][Tok.Newline]
    []
    """
    row_keys = {variable for (variable, _) in grammar}
    column_keys = {
        symbol
        for (_, handle) in grammar
        for symbol in handle
        if is_term(symbol)
    }
    lookup = oracle(is_term, is_var, grammar)

    def row_for(variable: A) -> Mapping[B, Collection[Sequence[A | B]]]:
        return {terminal: lookup(variable, terminal) for terminal in column_keys}

    return {variable: row_for(variable) for variable in row_keys}
def print_oracle_table(
    oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]],
    render: Callable[[A | B], str],
) -> str:
    """
    Render an oracle table as nicely laid out text

    Despite the name, nothing is printed: the rendered text is returned as a string.

    The render function is expected to render terminals and variables.  If the render
    function produces valid python, then `print_oracle_table` will also produce valid
    python.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()

    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]

    >>> is_tok = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, grammar)

    >>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
      SimpleVariable.Sum: {
        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Plus: []
      },
      SimpleVariable.Sum_: {
        SimpleTerminal.Number: [],
        SimpleTerminal.Letter: [],
        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
      },
      SimpleVariable.Term: {
        SimpleTerminal.Number: [[SimpleTerminal.Number]],
        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
        SimpleTerminal.Plus: []
      }
    }
    """
    def render_expansion(expansion: Sequence[A | B]) -> str:
        # One expansion becomes a bracketed, comma separated list of symbols
        return '[' + ', '.join(render(symbol) for symbol in expansion) + ']'

    def render_entry(t: B, expansions: Collection[Sequence[A | B]]) -> str:
        # One tab-indented `terminal: [expansion, ...]` line
        return f'\t{render(t)}: [' + ', '.join(map(render_expansion, expansions)) + ']'

    def render_row(v: A, term_table: Mapping[B, Collection[Sequence[A | B]]]) -> str:
        # One `variable: { ... }` section holding a line per terminal
        opening = f'{render(v)}: {"{"}\n'
        entries = ',\n'.join(
            render_entry(t, expansions)
            for (t, expansions) in term_table.items()
        )
        return opening + entries + '\n}'

    rows = ",\n".join(
        render_row(v, term_table)
        for (v, term_table) in oracle_table.items()
    )
    return '{\n' + rows + '\n}'
# Type variables for tables whose variables (EA) and terminals (EB) are enum members
EA = TypeVar('EA', bound=Enum)
EB = TypeVar('EB', bound=Enum)


def print_oracle_table_enum(
    oracle_table: Mapping[EA, Mapping[EB, Collection[Sequence[EA | EB]]]]
) -> str:
    """
    A special case of `print_oracle_table` where tokens and variables are enums

    Every symbol is rendered as `ClassName.MemberName`, so the output is a python
    expression as long as those enum classes are reachable under those names wherever
    it is evaluated.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()

    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]

    >>> is_tok = p_instance(SimpleTerminal)
    >>> is_var = p_instance(SimpleVariable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, grammar)

    >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table)
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
      SimpleVariable.Sum: {
        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Plus: []
      },
      SimpleVariable.Sum_: {
        SimpleTerminal.Number: [],
        SimpleTerminal.Letter: [],
        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
      },
      SimpleVariable.Term: {
        SimpleTerminal.Number: [[SimpleTerminal.Number]],
        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
        SimpleTerminal.Plus: []
      }
    }
    """
    def render_member(e: Enum) -> str:
        # Relies on `.name`, which is why the table must be keyed by enum members
        return f'{e.__class__.__name__}.{e.name}'

    return print_oracle_table(oracle_table, render_member)
if __name__ == '__main__':
    # Running this module as a script executes the doctests found in its docstrings.
    # Those examples use GRAMMAR, Tok and Variable without importing them, so the names
    # are bound in this module's namespace before the tests are run.
    from grammar import GRAMMAR, Tok, Variable
    import doctest
    doctest.testmod()