Added an oracle table builder

This commit is contained in:
Emi Simpson 2023-03-04 12:13:20 -05:00
parent c452cf3158
commit 03ff0d800e
Signed by: Emi
GPG key ID: A12F2C2FFDC3D847
3 changed files with 500 additions and 0 deletions

340
build_oracle.py Normal file
View file

@ -0,0 +1,340 @@
"""
Tools for building an oracle table
If this module is run directly in python, it will spit out valid python which produces an
oracle table for the grammar defined in `parse.py`. It's recommended that this be done
using `build_oracle.sh` instead, however, which will build a whole python module
containing the oracle table, complete with imports.
This module can also be used on its own to generate an oracle table on the fly.
Note that, when run directly, this module will refuse to build an oracle table if ANY of
the tests defined within the module fail.
"""
from emis_funky_funktions import *
from enum import auto, Enum, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
def _first(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
sequence: Sequence[A | B]
) -> Tuple[Collection[B], bool]:
"""
Computes all of the possible starting terminals for a handle in a given grammar
Due to pathetic python weaknesses, the first argument you must provide is a type guard
to determine whether a certain thing is a terminal as opposed to a variable.
Then, pass in the grammar and the sequence of terminals and variables in question.
The output contains two values. The first is a set of possible terminals, and the
second is a boolean indicating whether this term can derive epsilon.
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause])
({Negate, Identifier}, False)
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms])
({Comma}, True)
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP])
({CloseP, Comma}, False)
"""
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
match vs:
case []:
return (set(), True)
case [v, *rest] if is_term(v):
return ({v}, False)
case [v, *rest]:
this_variable_first, derives_epsilon = reduce(
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
[
inner(handle)
for (other_variable, handle) in grammar
if other_variable == v
]
)
if derives_epsilon:
rest_first, rest_derives_epsilon = inner(rest)
return (rest_first | this_variable_first, rest_derives_epsilon)
else:
return (this_variable_first, False)
raise Exception("UNREACHABLE")
return inner(sequence)
def _follow(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
    """
    Build the table of terminals which may appear directly after each variable

    The table has one entry per variable appearing on the left-hand side of a production,
    in the order those variables first appear in `grammar`.  It is grown in passes until a
    full pass adds nothing new.

    >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE
    {<Start>: set(),
     <Idents>: {Newline},
     <Clauses>: {Eof},
     <Clauses_>: {Eof},
     <Clause>: {Newline, Eof},
     <Clause_>: {Newline, Eof},
     <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <CSTerms>: {CloseP}}
    """
    def tokens_after(suffix: Sequence[A | B], lhs_follow: Set[B]) -> Set[B]:
        # Whatever can start the suffix, plus whatever follows the whole production
        # when the suffix is able to vanish entirely.
        suffix_first, suffix_derives_epsilon = _first(is_term, grammar, suffix)
        trailing = lhs_follow if suffix_derives_epsilon else set()
        return set(suffix_first) | trailing

    table: Mapping[A, Set[B]] = {lhs: set() for (lhs, _) in grammar}
    while True:
        # Phase one: gather every contribution using only the previous pass's table
        contributions = []
        for (lhs, handle) in grammar:
            for (position, symbol) in enumerate(handle):
                if not is_term(symbol):
                    contributions.append((
                        cast(A, symbol),
                        tokens_after(handle[position + 1:], table[lhs]),
                    ))

        # Phase two: fold the contributions into a fresh table
        updated = table
        for (target, tokens) in contributions:
            updated = {**updated, target: updated[target] | tokens}

        if updated == table:
            return updated
        table = updated
def _predict(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
    follow: Mapping[A, Collection[B]],
    lhs: A,
    rhs: Sequence[A | B]
) -> Collection[B]:
    """
    Find the lookahead terminals for which the production `lhs := rhs` applies

    This is the set of terminals which can start `rhs`.  When `rhs` can derive epsilon, the
    terminals recorded for `lhs` in the `follow` table are included as well.

    >>> is_tok = flip(cur2(isinstance))(Tok)

    >>> follow = _follow(is_tok, GRAMMAR)

    >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
    {Negate, Identifier}
    """
    rhs_first, rhs_derives_epsilon = _first(is_term, grammar, rhs)
    if not rhs_derives_epsilon:
        return rhs_first
    # The production may vanish, so anything following `lhs` also selects it
    return set(follow[lhs]) | set(rhs_first)
def oracle(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Callable[[A, B], Collection[Sequence[A | B]]]:
    """
    Build a function listing the valid expansions of a variable for a given lookahead

    The returned function takes a variable and the next terminal to be read, and returns
    every handle of that variable whose predict set contains the terminal.  For valid LL(1)
    grammars, there should never be more than one valid expansion.

    The follow table is computed once, up front.  The returned function is memoized, so
    both of its arguments must be hashable, and repeated queries return the same list
    object.

    >>> my_oracle = oracle(flip(cur2(isinstance))(Tok), GRAMMAR)

    One valid expansion:

    >>> my_oracle(Variable.Clauses_, Tok.Negate)
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:

    >>> my_oracle(Variable.Clauses_, Tok.Eof)
    [[]]

    Zero valid expansions:

    >>> my_oracle(Variable.Term, Tok.Newline)
    []
    """
    follow = _follow(is_term, grammar)

    @wraps(oracle)
    @cache
    def expansions_for(v: A, c: B) -> Collection[Sequence[A | B]]:
        valid_handles = []
        for (lhs, handle) in grammar:
            # Only productions of the requested variable are candidates
            if lhs == v and c in _predict(is_term, grammar, follow, lhs, handle):
                valid_handles.append(handle)
        return valid_handles

    return expansions_for
def oracle_table(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
    """
    A variant of `oracle` that generates a table immediately rather than lazily

    The table has a row for every variable on the left-hand side of a production, and a
    column for every terminal which appears in some handle.  Each cell holds the list of
    valid expansions that `oracle` reports for that pair.

    No significant performance benefit

    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR)

    One valid expansion:

    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:

    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
    [[]]

    Zero valid expansions:

    >>> my_oracle_table[Variable.Term][Tok.Newline]
    []
    """
    variables = {lhs for (lhs, _) in grammar}
    terminals = {
        symbol
        for (_, handle) in grammar
        for symbol in handle
        if is_term(symbol)
    }
    lookup = oracle(is_term, grammar)

    table = {}
    for v in variables:
        table[v] = {t: lookup(v, t) for t in terminals}
    return table
def print_oracle_table(
oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]],
render: Callable[[A | B], str],
) -> str:
"""
Pretty prints an oracle table
The render function is expected to render terminals and variables. If the render
function produces valid python, then `print_oracle_table` will also produce valid
python.
### Example:
We generate a simple grammar:
>>> class SimpleVariable(IntEnum):
... Sum = auto()
... Sum_ = auto()
... Term = auto()
>>> class SimpleTerminal(IntEnum):
... Number = auto()
... Letter = auto()
... Plus = auto()
>>> grammar = [
... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
... (SimpleVariable.Sum_, []),
... (SimpleVariable.Term, [SimpleTerminal.Number]),
... (SimpleVariable.Term, [SimpleTerminal.Letter]),
... ]
>>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
>>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
>>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
{
SimpleVariable.Sum: {
SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
SimpleTerminal.Plus: []
},
SimpleVariable.Sum_: {
SimpleTerminal.Number: [],
SimpleTerminal.Letter: [],
SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
},
SimpleVariable.Term: {
SimpleTerminal.Number: [[SimpleTerminal.Number]],
SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
SimpleTerminal.Plus: []
}
}
"""
return '{\n' + ",\n".join([
f'{render(v)}: {"{"}\n' + ',\n'.join([
f'\t{render(t)}: [' + ', '.join([
'[' + ', '.join([
render(symbol)
for symbol in expansion
]) + ']'
for expansion in expansions
]) + ']'
for (t, expansions) in term_table.items()
]) + '\n}'
for (v, term_table) in oracle_table.items()
]) + '\n}'
# Type variables bounded by `Enum`.
# NOTE(review): neither is referenced anywhere else in the visible source; presumably they
# were meant for the signature of `print_oracle_table_enum` -- confirm before relying on
# or removing them.
EA = TypeVar('EA', bound=Enum)
EB = TypeVar('EB', bound=Enum)
def print_oracle_table_enum(
    oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]
) -> str:
    """
    A special case of `print_oracle_table` where tokens and variables are enums

    Every terminal and variable is rendered as `ClassName.MemberName`.

    Always produces valid python.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()
    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()
    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]
    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
    >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table)
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
        SimpleVariable.Sum: {
            SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
            SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
            SimpleTerminal.Plus: []
        },
        SimpleVariable.Sum_: {
            SimpleTerminal.Number: [],
            SimpleTerminal.Letter: [],
            SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
        },
        SimpleVariable.Term: {
            SimpleTerminal.Number: [[SimpleTerminal.Number]],
            SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
            SimpleTerminal.Plus: []
        }
    }
    """
    def render_member(e):  #type: ignore
        # Spell the member the way it would be written in source code
        return f'{e.__class__.__name__}.{e.name}'

    return print_oracle_table(oracle_table, render_member) #type: ignore
if __name__ == '__main__':
    # Entry point: run this module's doctests and, only if they all pass, write the
    # rendered oracle table for the project grammar to stdout.
    import doctest

    # The doctests in this module refer to `Tok`, `GRAMMAR` and `Variable`; importing
    # them here places them in the module namespace, which is where `doctest.testmod()`
    # looks names up.
    from lex import Tok
    from parse import GRAMMAR, Variable

    failure_count, test_count = doctest.testmod()
    if failure_count:
        print('\n\nRefusing to build oracle table due to test failures')
        # `exit` is injected by the `site` module for interactive use and is not
        # guaranteed to exist; raising SystemExit gives the same exit status.
        raise SystemExit(1)
    else:
        print(print_oracle_table_enum(oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR))) #type: ignore

16
build_oracle.sh Normal file
View file

@ -0,0 +1,16 @@
#!/bin/bash
# Generates oracle_table.py: a python module which binds `oracle_table` to the table
# written to stdout by build_oracle.py.

# Start the module with its imports and the opening of the table expression.
printf '%s\n' \
	'from lex import Tok' \
	'from parse import Variable' \
	'oracle_table = (' > oracle_table.py

if ! python build_oracle.py >> oracle_table.py; then
	# The build failed, so its output was captured in the now useless module.
	# Discard the module and run the build again so the output reaches the terminal.
	rm oracle_table.py
	python build_oracle.py
else
	# Close the expression opened in the header.
	echo ")" >> oracle_table.py
	echo "Built oracle_table.py"
fi

144
parse.py Normal file
View file

@ -0,0 +1,144 @@
from emis_funky_funktions import *
from enum import auto, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard
from lex import Tok
"""
Implements a parser for the following grammar:
Start := PredicateSection <Idents> Newline
VariablesSection <Idents> Newline
ConstantsSection <Idents> Newline
FunctionsSection <Idents> Newline
ClausesSection <Clauses> Eof
Idents := Identifier <Idents>
:= ε
Clauses := Newline <Clauses'>
:= ε
Clauses' := <Clause> <Clauses>
:= ε
Clause := <Term> <Clause'>
Clause' := <Clause>
:= ε
Term := Negate <Term>
:= Identifier <Func?>
Func? := OpenP <Term> <CSTerms> CloseP
:= ε
CSTerms := Comma <Term> <CSTerms>
:= ε
"""
class Variable(IntEnum):
    """
    The variables (non-terminals) of the grammar

    Members are numbered automatically in declaration order, and are displayed as their
    name wrapped in angle brackets, e.g. `<Start>`.
    """
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return '<' + self.name + '>'
# The grammar, as a list of (variable, handle) productions.
#
# A variable with several productions appears once per production, and an empty handle
# denotes a production which derives epsilon.  The productions are meant to mirror, one for
# one and in order, the grammar described in the string near the top of this module.
GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    # Func? := OpenP <Term> <CSTerms> CloseP
    # The first argument is a plain <Term>; every later argument is introduced by the
    # Comma in <CSTerms>.  Without the leading <Term>, `f(a)` would be rejected while
    # `f(,a)` would be accepted.
    (Variable.Func,
        [ Tok.OpenP, Variable.Term, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
# ### FIRST Table ###
#
# Start : PredicateSection
# Idents : Identifier, ε
# Clauses : Newline, ε
# Clauses' : Negate, Identifier, ε
# Clause : Negate, Identifier
# Clause' : Negate, Identifier, ε
# Term : Negate, Identifier
# Func? : OpenP, ε
# CSTerms : Comma, ε
#
#
#
# ### FOLLOW Table ###
#
# Idents : Newline
# Clauses : Eof
# Clauses' : Eof
# Clause : Newline, Eof
# Clause' : Newline, Eof
# Term : Negate, Identifier, Newline, Eof, Comma, CloseP
# Func? : Negate, Identifier, Newline, Eof, Comma, CloseP
# CSTerms : CloseP
#
#
#
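# ### PREDICT Table ###
#
# One row per production, in the order the productions are listed in the grammar.
#
# Start    : PredicateSection
# Idents   : Identifier
#          : Newline
# Clauses  : Newline
#          : Eof
# Clauses' : Negate, Identifier
#          : Eof
# Clause   : Negate, Identifier
# Clause'  : Negate, Identifier
#          : Newline, Eof
# Term     : Negate
#          : Identifier
# Func?    : OpenP
#          : Negate, Identifier, Newline, Eof, Comma, CloseP
# CSTerms  : Comma
#          : CloseP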