Added an oracle table builder
This commit is contained in:
parent
c452cf3158
commit
03ff0d800e
340
build_oracle.py
Normal file
340
build_oracle.py
Normal file
|
@ -0,0 +1,340 @@
|
|||
"""
|
||||
Tools for building an oracle table
|
||||
|
||||
If this module is run directly in python, it will spit out valid python which produces an
|
||||
oracle table for the grammar defined in `grammar.py`. It's recommended that this be done
|
||||
using `build_oracle.sh` instead, however, which will build a whole python module
|
||||
containing the oracle table, complete with imports.
|
||||
|
||||
This module can also be used on its own to generate an oracle table on the fly.
|
||||
|
||||
Note that, when run directly, this module will refuse to build an oracle table if ANY of
|
||||
the tests defined within the module fail.
|
||||
"""
|
||||
from emis_funky_funktions import *
|
||||
|
||||
from enum import auto, Enum, IntEnum
|
||||
from functools import cache, reduce
|
||||
from operator import getitem
|
||||
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
|
||||
|
||||
def _first(
|
||||
is_term: Callable[[A | B], TypeGuard[B]],
|
||||
grammar: Sequence[Tuple[A, Sequence[A | B]]],
|
||||
sequence: Sequence[A | B]
|
||||
) -> Tuple[Collection[B], bool]:
|
||||
"""
|
||||
Computes all of the possible starting terminals for a handle in a given grammar
|
||||
|
||||
Due to pathetic python weaknesses, the first argument you must provide is a type guard
|
||||
to determine whether a certain thing is a terminal as opposed to a variable.
|
||||
|
||||
Then, pass in the grammar and the sequence of terminals and variables in question.
|
||||
|
||||
The output contains two values. The first is a set of possible terminals, and the
|
||||
second is a boolean indicating whether this term can derive epsilon.
|
||||
|
||||
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause])
|
||||
({Negate, Identifier}, False)
|
||||
|
||||
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms])
|
||||
({Comma}, True)
|
||||
|
||||
>>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP])
|
||||
({CloseP, Comma}, False)
|
||||
"""
|
||||
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
|
||||
match vs:
|
||||
case []:
|
||||
return (set(), True)
|
||||
case [v, *rest] if is_term(v):
|
||||
return ({v}, False)
|
||||
case [v, *rest]:
|
||||
this_variable_first, derives_epsilon = reduce(
|
||||
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
|
||||
[
|
||||
inner(handle)
|
||||
for (other_variable, handle) in grammar
|
||||
if other_variable == v
|
||||
]
|
||||
)
|
||||
if derives_epsilon:
|
||||
rest_first, rest_derives_epsilon = inner(rest)
|
||||
return (rest_first | this_variable_first, rest_derives_epsilon)
|
||||
else:
|
||||
return (this_variable_first, False)
|
||||
raise Exception("UNREACHABLE")
|
||||
return inner(sequence)
|
||||
|
||||
def _follow(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
    """
    Produce a table indicating exactly which terminals can follow each variable

    The table has one entry per variable, in the order in which the variables first
    appear on the left-hand side of a production.  Variables which only ever appear
    inside a handle (i.e. have no productions of their own) are appended afterwards.

    >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE
    {<Start>: set(),
     <Idents>: {Newline},
     <Clauses>: {Eof},
     <Clauses_>: {Eof},
     <Clause>: {Newline, Eof},
     <Clause_>: {Newline, Eof},
     <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
     <CSTerms>: {CloseP}}
    """
    follow_table: dict[A, Set[B]] = {
        variable: set()
        for (variable, _) in grammar
    }
    # Make sure every variable mentioned in a handle has an entry too, so that a
    # variable without productions does not cause a lookup failure below
    for (_, handle) in grammar:
        for symbol in handle:
            if not is_term(symbol):
                follow_table.setdefault(cast(A, symbol), set())

    # For every occurrence of a variable inside a handle, work out once what can
    # start the remainder of that handle and whether that remainder can vanish.
    # Neither depends on the follow table, so it need not be redone on each pass.
    occurrences = [
        (lhs, cast(A, symbol), *_first(is_term, grammar, handle[i+1:]))
        for (lhs, handle) in grammar
        for (i, symbol) in enumerate(handle)
        if not is_term(symbol)
    ]

    # Grow the sets until a full pass adds nothing new.  The sets only ever grow and
    # are bounded by the set of terminals, so this terminates.
    changed = True
    while changed:
        changed = False
        for (lhs, variable, suffix_first, suffix_derives_epsilon) in occurrences:
            # If everything after the variable can vanish, then whatever follows the
            # production's left-hand side can also follow the variable
            following = set(suffix_first) | (
                follow_table[lhs] if suffix_derives_epsilon else set()
            )
            if not following <= follow_table[variable]:
                follow_table[variable] |= following
                changed = True
    return follow_table
|
||||
|
||||
def _predict(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
    follow: Mapping[A, Collection[B]],
    lhs: A,
    rhs: Sequence[A | B]
) -> Collection[B]:
    """
    Identify the lookahead terminals under which the production `lhs := rhs` applies

    These are the terminals which can start `rhs`, together with the terminals which can
    follow `lhs` (as given by the `follow` table) whenever `rhs` can derive epsilon.

    >>> is_tok = flip(cur2(isinstance))(Tok)
    >>> follow = _follow(is_tok, GRAMMAR)
    >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
    {Negate, Identifier}
    """
    starting_terminals, rhs_derives_epsilon = _first(is_term, grammar, rhs)
    if not rhs_derives_epsilon:
        # The right-hand side always consumes a token, so only its own FIRST set counts
        return starting_terminals
    return set(follow[lhs]) | set(starting_terminals)
|
||||
|
||||
def oracle(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Callable[[A, B], Collection[Sequence[A | B]]]:
    """
    Show valid expansions of a variable based on the next terminal to be read

    For valid LL(1) grammars, there should never be more than one valid expansion.

    The inner method constructed is memoized for your convenience.

    >>> my_oracle = oracle(flip(cur2(isinstance))(Tok), GRAMMAR)

    One valid expansion:
    >>> my_oracle(Variable.Clauses_, Tok.Negate)
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle(Variable.Clauses_, Tok.Eof)
    [[]]

    Zero valid expansions:
    >>> my_oracle(Variable.Term, Tok.Newline)
    []
    """
    # The follow table is built once, up front, and shared by every lookup
    follow = _follow(is_term, grammar)

    @wraps(oracle)
    @cache
    def inner(v: A, c: B) -> Collection[Sequence[A | B]]:
        expansions = []
        for (lhs, handle) in grammar:
            # Keep each production of `v` whose predict set admits the lookahead `c`
            if lhs == v and c in _predict(is_term, grammar, follow, lhs, handle):
                expansions.append(handle)
        return expansions
    return inner
|
||||
|
||||
def oracle_table(
    is_term: Callable[[A | B], TypeGuard[B]],
    grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
    """
    A variant of `oracle` that generates a table immediately rather than lazily

    The table has a row for every variable on the left-hand side of a production, and
    a column for every terminal which appears in the right-hand side of a production.

    No significant performance benefit

    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR)

    One valid expansion:
    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
    [[<Clause>, <Clauses>]]

    One valid expansion, but it expands to epsilon:
    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
    [[]]

    Zero valid expansions:
    >>> my_oracle_table[Variable.Term][Tok.Newline]
    []
    """
    all_variables = { lhs for (lhs, _) in grammar }
    all_terminals = {
        symbol
        for (_, rhs) in grammar
        for symbol in rhs
        if is_term(symbol)
    }
    the_oracle = oracle(is_term, grammar)

    table = {}
    for v in all_variables:
        row = {}
        for t in all_terminals:
            row[t] = the_oracle(v, t)
        table[v] = row
    return table
|
||||
|
||||
def print_oracle_table(
|
||||
oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]],
|
||||
render: Callable[[A | B], str],
|
||||
) -> str:
|
||||
"""
|
||||
Pretty prints an oracle table
|
||||
|
||||
The render function is expected to render terminals and variables. If the render
|
||||
function produces valid python, then `print_oracle_table` will also produce valid
|
||||
python.
|
||||
|
||||
### Example:
|
||||
|
||||
We generate a simple grammar:
|
||||
|
||||
>>> class SimpleVariable(IntEnum):
|
||||
... Sum = auto()
|
||||
... Sum_ = auto()
|
||||
... Term = auto()
|
||||
|
||||
>>> class SimpleTerminal(IntEnum):
|
||||
... Number = auto()
|
||||
... Letter = auto()
|
||||
... Plus = auto()
|
||||
|
||||
>>> grammar = [
|
||||
... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
|
||||
... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
|
||||
... (SimpleVariable.Sum_, []),
|
||||
... (SimpleVariable.Term, [SimpleTerminal.Number]),
|
||||
... (SimpleVariable.Term, [SimpleTerminal.Letter]),
|
||||
... ]
|
||||
|
||||
>>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
|
||||
>>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
|
||||
>>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
|
||||
{
|
||||
SimpleVariable.Sum: {
|
||||
SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
|
||||
SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
|
||||
SimpleTerminal.Plus: []
|
||||
},
|
||||
SimpleVariable.Sum_: {
|
||||
SimpleTerminal.Number: [],
|
||||
SimpleTerminal.Letter: [],
|
||||
SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
|
||||
},
|
||||
SimpleVariable.Term: {
|
||||
SimpleTerminal.Number: [[SimpleTerminal.Number]],
|
||||
SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
|
||||
SimpleTerminal.Plus: []
|
||||
}
|
||||
}
|
||||
"""
|
||||
return '{\n' + ",\n".join([
|
||||
f'{render(v)}: {"{"}\n' + ',\n'.join([
|
||||
f'\t{render(t)}: [' + ', '.join([
|
||||
'[' + ', '.join([
|
||||
render(symbol)
|
||||
for symbol in expansion
|
||||
]) + ']'
|
||||
for expansion in expansions
|
||||
]) + ']'
|
||||
for (t, expansions) in term_table.items()
|
||||
]) + '\n}'
|
||||
for (v, term_table) in oracle_table.items()
|
||||
]) + '\n}'
|
||||
|
||||
# Type variables for grammars whose variables (EA) and terminals (EB) are enums
EA = TypeVar('EA', bound=Enum)
EB = TypeVar('EB', bound=Enum)
def print_oracle_table_enum(
    oracle_table: Mapping[EA, Mapping[EB, Collection[Sequence[EA | EB]]]]
) -> str:
    """
    A special case of `print_oracle_table` where tokens and variables are enums

    Each symbol is rendered as `ClassName.MemberName`, so the output is valid python
    wherever the enum classes are in scope under their own names.

    ### Example:

    We generate a simple grammar:

    >>> class SimpleVariable(IntEnum):
    ...     Sum = auto()
    ...     Sum_ = auto()
    ...     Term = auto()

    >>> class SimpleTerminal(IntEnum):
    ...     Number = auto()
    ...     Letter = auto()
    ...     Plus = auto()

    >>> grammar = [
    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
    ...     (SimpleVariable.Sum_, []),
    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
    ... ]

    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
    >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table)
    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
    {
    SimpleVariable.Sum: {
        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
        SimpleTerminal.Plus: []
    },
    SimpleVariable.Sum_: {
        SimpleTerminal.Number: [],
        SimpleTerminal.Letter: [],
        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
    },
    SimpleVariable.Term: {
        SimpleTerminal.Number: [[SimpleTerminal.Number]],
        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
        SimpleTerminal.Plus: []
    }
    }
    """
    # With the enum-bound type variables in the signature, `e.name` is known to exist
    return print_oracle_table(oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
|
||||
|
||||
if __name__ == '__main__':
    import doctest
    import sys
    # These names are referenced by the doctests in this module, so they must be
    # present in the module namespace before the tests are run
    from lex import Tok
    from parse import GRAMMAR, Variable
    failure_count, test_count = doctest.testmod()
    if failure_count:
        print('\n\nRefusing to build oracle table due to test failures')
        # `sys.exit` rather than the `exit` helper, which only exists when the `site`
        # module has been loaded
        sys.exit(1)
    else:
        print(print_oracle_table_enum(oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR))) #type: ignore
|
16
build_oracle.sh
Normal file
16
build_oracle.sh
Normal file
|
@ -0,0 +1,16 @@
|
|||
#!/bin/bash
# Builds oracle_table.py: a python module which binds `oracle_table` to the table
# printed by build_oracle.py.  Paths are relative, so this is expected to be run from
# the directory containing build_oracle.py.

# Start the generated module with the imports the table refers to, and open the
# parenthesised expression which the table will be appended to
cat << EOF > oracle_table.py
from lex import Tok
from parse import Variable

oracle_table = (
EOF

# Append the table itself.  build_oracle.py exits non-zero when its tests fail.
if python build_oracle.py >> oracle_table.py; then
	# Close the parenthesised expression opened above
	echo ")" >> oracle_table.py
	echo "Built oracle_table.py"
else
	# The output of the failed run went into the (now useless) generated file, so
	# throw that away and run the builder again to show the output on the terminal
	rm oracle_table.py
	python build_oracle.py
fi
|
144
parse.py
Normal file
144
parse.py
Normal file
|
@ -0,0 +1,144 @@
|
|||
from emis_funky_funktions import *
|
||||
|
||||
from enum import auto, IntEnum
|
||||
from functools import cache, reduce
|
||||
from operator import getitem
|
||||
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard
|
||||
|
||||
from lex import Tok
|
||||
|
||||
"""
|
||||
Implements a parser for the following grammar:
|
||||
|
||||
Start := PredicateSection <Idents> Newline
|
||||
VariablesSection <Idents> Newline
|
||||
ConstantsSection <Idents> Newline
|
||||
FunctionsSection <Idents> Newline
|
||||
ClausesSection <Clauses> Eof
|
||||
|
||||
Idents := Identifier <Idents>
|
||||
:= ε
|
||||
|
||||
Clauses := Newline <Clauses'>
|
||||
:= ε
|
||||
|
||||
Clauses' := <Clause> <Clauses>
|
||||
:= ε
|
||||
|
||||
Clause := <Term> <Clause'>
|
||||
|
||||
Clause' := <Clause>
|
||||
:= ε
|
||||
|
||||
Term := Negate <Term>
|
||||
:= Identifier <Func?>
|
||||
|
||||
Func? := OpenP <Term> <CSTerms> CloseP
|
||||
:= ε
|
||||
|
||||
CSTerms := Comma <Term> <CSTerms>
|
||||
:= ε
|
||||
"""
|
||||
|
||||
class Variable(IntEnum):
    """The variables (non-terminals) of the grammar, one member per variable"""
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        # Rendered in angle brackets, e.g. `<Clause>`
        return '<' + self.name + '>'
|
||||
|
||||
# The productions of the grammar, as (variable, handle) pairs.  An empty handle is an
# epsilon production.  `Func` corresponds to `Func?` in the grammar description, and
# the `_` suffix corresponds to a prime (e.g. `Clauses_` is `Clauses'`).
GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    # An argument list is a first term followed by zero or more comma-separated
    # terms: Func? := OpenP <Term> <CSTerms> CloseP
    (Variable.Func,
        [ Tok.OpenP, Variable.Term, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
|
||||
|
||||
|
||||
# ### FIRST Table ###
|
||||
#
|
||||
# Start : PredicateSection
|
||||
# Idents : Identifier, ε
|
||||
# Clauses : Newline, ε
|
||||
# Clauses' : Negate, Identifier, ε
|
||||
# Clause : Negate, Identifier
|
||||
# Clause' : Negate, Identifier, ε
|
||||
# Term : Negate, Identifier
|
||||
# Func? : OpenP, ε
|
||||
# CSTerms : Comma, ε
|
||||
#
|
||||
#
|
||||
#
|
||||
# ### FOLLOW Table ###
|
||||
#
|
||||
# Idents : Newline
|
||||
# Clauses : Eof
|
||||
# Clauses' : Eof
|
||||
# Clause : Newline, Eof
|
||||
# Clause' : Newline, Eof
|
||||
# Term : Negate, Identifier, Newline, Eof, Comma, CloseP
|
||||
# Func? : Negate, Identifier, Newline, Eof, Comma, CloseP
|
||||
# CSTerms : CloseP
|
||||
#
|
||||
#
|
||||
#
|
||||
# ### PREDICT Table ###
|
||||
#
|
||||
# Idents : Identifier
|
||||
# : Newline
|
||||
# Clauses : Newline
|
||||
# : Eof
|
||||
# Clauses' : Negate, Identifier
|
||||
# : Eof
|
||||
# Clause : Negate, Identifier
# Clause' : Negate, Identifier
# : Newline, Eof
# Term : Negate
# : Identifier
# Func? : OpenP
# : Negate, Identifier, Newline, Eof, Comma, CloseP
# CSTerms : Comma
# : CloseP
|
Loading…
Reference in a new issue