JSON-Lang/build_oracle.py

203 lines
7 KiB
Python
Raw Normal View History

2023-03-04 17:13:20 +00:00
"""
Tools for building an oracle table

See `grammar` and `build_oracle.sh` for scripts which actually produce python code.  This
module only produces an oracle table in python, without outputting it.

NOTE!  Doctests in this module use `GRAMMAR` from `grammar.py` and `EGRAMMAR` as a version
of that grammar with the actions erased, with `_erase_actions()`.
"""
from emis_funky_funktions import *
from enum import auto, Enum, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
2023-03-04 19:55:34 +00:00
def _erase_actions_h(
handle: Sequence[A | B | C],
is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[A | B]:
"""
Produce an identical handle, but with all the actions removed
"""
return [i for i in handle if is_not_c(i)]
def _erase_actions(
    grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
    is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[Tuple[A, Sequence[A | B]]]:
    """
    Strip the actions out of every production of a grammar

    Each `(variable, handle)` production is kept, in order, with its handle filtered
    through `_erase_actions_h()` so that only symbols accepted by `is_not_c` remain.
    """
    return [
        (production[0], _erase_actions_h(production[1], is_not_c))
        for production in grammar
        if len(production) == 2 or _reject_production(production)
    ] if False else [
        (lhs, _erase_actions_h(rhs, is_not_c))
        for lhs, rhs in grammar
    ]
2023-03-04 17:13:20 +00:00
def _first(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
sequence: Sequence[A | B]
) -> Tuple[Collection[B], bool]:
"""
Computes all of the possible starting terminals for a handle in a given grammar
Due to pathetic python weaknesses, the first argument you must provide is a type guard
to determine whether a certain thing is a terminal as opposed to a variable.
Then, pass in the grammar and the sequence of terminals and variables in question.
The output contains two values. The first is a set of possible terminals, and the
second is a boolean indicating whether this term can derive epsilon.
2023-03-06 18:03:59 +00:00
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.Clause])
2023-03-04 17:13:20 +00:00
({Negate, Identifier}, False)
2023-03-06 18:03:59 +00:00
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms])
2023-03-04 17:13:20 +00:00
({Comma}, True)
2023-03-06 18:03:59 +00:00
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms, Tok.CloseP])
2023-03-04 17:13:20 +00:00
({CloseP, Comma}, False)
"""
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
match vs:
case []:
return (set(), True)
case [v, *rest] if is_term(v):
return ({v}, False)
case [v, *rest]:
this_variable_first, derives_epsilon = reduce(
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
[
inner(handle)
for (other_variable, handle) in grammar
if other_variable == v
]
)
if derives_epsilon:
rest_first, rest_derives_epsilon = inner(rest)
return (rest_first | this_variable_first, rest_derives_epsilon)
else:
return (this_variable_first, False)
raise Exception("UNREACHABLE")
return inner(sequence)
def _follow(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
    """
    Produce a table indicating exactly which terminals can follow each variable

    The table has one entry per variable appearing on the left-hand side of a production
    and starts out with every entry empty.  It is then grown by sweeping over every
    production until a whole sweep adds nothing new.  Within a sweep, follow sets are read
    from the table as it stood at the start of that sweep.

    A variable which appears in a handle but has no production of its own raises a
    `KeyError`.

    >>> _follow(flip(cur2(isinstance))(Tok), EGRAMMAR) #doctest: +NORMALIZE_WHITESPACE
    {<Start>: set(),
     <Idents>: {Newline},
     <Clauses>: {Eof},
     <Clauses_>: {Eof},
     <Clause>: {Newline, Eof},
     <Clause_>: {Newline, Eof},
     <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <CSTerms>: {CloseP}}
    """
    def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]:
        # Whatever can start the remainder of the handle can follow the variable; if the
        # remainder can vanish, so can whatever follows the production as a whole
        handle_first, handle_derives_epsilon = _first(is_term, grammar, handle)
        return set(handle_first) | (follows_handle if handle_derives_epsilon else set())

    table: Mapping[A, Set[B]] = {
        variable: set()
        for (variable, _) in grammar
    }
    while True:
        # Copy the previous table (preserving key order) so that this sweep only ever
        # reads follow sets computed by earlier sweeps
        new_table = {variable: set(tokens) for (variable, tokens) in table.items()}
        for (variable, handle) in grammar:
            for (i, symbol) in enumerate(handle):
                if not is_term(symbol):
                    new_table[cast(A, symbol)] |= following_tokens(
                        handle[i+1:],
                        table[variable]
                    )
        if new_table == table:
            # Fixed point reached: another sweep could not add anything
            return new_table
        table = new_table
def _predict(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
    follow: Mapping[A, Collection[B]],
    lhs: A,
    rhs: Sequence[A | B]
) -> Collection[B]:
    """
    Given a production, identify the terminals which this production would be valid under

    These are the terminals which can start `rhs`, plus, when `rhs` can derive epsilon,
    the terminals which `follow` lists for `lhs`.

    >>> is_tok = flip(cur2(isinstance))(Tok)
    >>> follow = _follow(is_tok, EGRAMMAR)
    >>> _predict(is_tok, EGRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
    {Negate, Identifier}
    """
    first_rhs, epsilon_rhs = _first(is_term, grammar, rhs)
    if epsilon_rhs:
        # The production can vanish entirely, so it is also valid on anything that may
        # come after its left-hand side
        return set(follow[lhs]) | set(first_rhs)
    else:
        return first_rhs
def oracle_table(
    is_term: Callable[[A | B], TypeGuard[B]],
    is_var: Callable[[A | B], TypeGuard[A]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
    """
    A variant of `_oracle` that generates a table immediately rather than lazily

    No significant performance benefit

    The table maps every variable on the left-hand side of a production, and every
    terminal appearing in a handle, to the list of handles (in grammar order) which are
    valid expansions of that variable when that terminal comes next.  Symbols which are
    neither terminals nor variables are ignored when deciding validity, but are left in
    place in the handles stored in the table.

    >>> is_tok = p_instance(Tok)
    >>> is_var = p_instance(Variable)
    >>> my_oracle_table = oracle_table(is_tok, is_var, EGRAMMAR)

    One valid expansion:

    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:

    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
    [[]]

    Zero valid expansions:

    >>> my_oracle_table[Variable.Term][Tok.Newline]
    []
    """
    all_variables = { lhs for (lhs, rhs) in grammar }
    all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) }

    # Anything that is neither a terminal nor a variable is erased before analysis
    is_not_c: Callable[[A | B | C], TypeGuard[A | B]] = lambda x: is_term(x) or is_var(x) #type:ignore
    e_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_not_c) #type:ignore
    follow = _follow(is_term, e_grammar)

    # The predict set of a production does not depend on which table cell is being
    # filled, so work it out once per production instead of once per terminal
    predictions = [
        (
            lhs,
            handle,
            _predict(is_term, e_grammar, follow, lhs, _erase_actions_h(handle, is_not_c)) #type:ignore
        )
        for (lhs, handle) in grammar
    ]

    return {
        v: {
            t: [
                handle
                for (lhs, handle, predicted) in predictions
                if lhs == v
                and t in predicted
            ]
            for t in all_terminals
        }
        for v in all_variables
    }
if __name__ == '__main__':
    import doctest
    from grammar import GRAMMAR, Tok, Variable
    # The doctests in this module refer to `EGRAMMAR`, `Tok` and `Variable` by name, so
    # they must exist as module globals before the tests run.  Anything in a handle which
    # has a `__call__` attribute is treated as an action and erased.
    EGRAMMAR = _erase_actions(GRAMMAR, lambda x: not hasattr(x, '__call__')) #type: ignore
    doctest.testmod()