From 03ff0d800e85eb8f054306da2e6c30f95fc0e4d8 Mon Sep 17 00:00:00 2001
From: Emi Simpson
Date: Sat, 4 Mar 2023 12:13:20 -0500
Subject: [PATCH] Added an oracle table builder

---
 build_oracle.py | 343 ++++++++++++++++++++++++++++++++++++++++++++++++
 build_oracle.sh |  16 +++
 parse.py        | 148 ++++++++++++++++++++
 3 files changed, 507 insertions(+)
 create mode 100644 build_oracle.py
 create mode 100644 build_oracle.sh
 create mode 100644 parse.py

diff --git a/build_oracle.py b/build_oracle.py
new file mode 100644
index 0000000..ff61bc7
--- /dev/null
+++ b/build_oracle.py
@@ -0,0 +1,343 @@
+"""
+Tools for building an oracle table
+
+If this module is run directly in python, it will spit out valid python which produces an
+oracle table for the grammar defined in `parse.py`.  It's recommended that this be done
+using `build_oracle.sh` instead, however, which will build a whole python module
+containing the oracle table, complete with imports.
+
+This module can also be used on its own to generate an oracle table on the fly.
+
+Note that, when run directly, this module will refuse to build an oracle table if ANY of
+the tests defined within the module fail.
+"""
+from emis_funky_funktions import *
+
+from enum import auto, Enum, IntEnum
+from functools import cache, reduce
+from operator import getitem
+from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
+
+def _first(
+    is_term: Callable[[A | B], TypeGuard[B]],
+    grammar: Sequence[Tuple[A, Sequence[A | B]]],
+    sequence: Sequence[A | B]
+) -> Tuple[Collection[B], bool]:
+    """
+    Computes all of the possible starting terminals for a handle in a given grammar
+
+    Python cannot tell the two halves of `A | B` apart on its own, so the first argument
+    must be a type guard that decides whether a symbol is a terminal rather than a variable.
+
+    Then, pass in the grammar and the sequence of terminals and variables in question.
+
+    The output contains two values.  The first is a set of possible terminals, and the
+    second is a boolean indicating whether this term can derive epsilon.
+
+    The grammar must not be left-recursive (the recursion below would never terminate),
+    and every variable in `sequence` needs at least one production.
+
+    >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause])
+    ({Negate, Identifier}, False)
+
+    >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms])
+    ({Comma}, True)
+
+    >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP])
+    ({CloseP, Comma}, False)
+    """
+    def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
+        match vs:
+            case []:
+                return (set(), True)
+            case [v, *rest] if is_term(v):
+                return ({v}, False)
+            case [v, *rest]:
+                this_variable_first, derives_epsilon = reduce(
+                    lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
+                    [
+                        inner(handle)
+                        for (other_variable, handle) in grammar
+                        if other_variable == v
+                    ]
+                )
+                if derives_epsilon:
+                    rest_first, rest_derives_epsilon = inner(rest)
+                    return (rest_first | this_variable_first, rest_derives_epsilon)
+                else:
+                    return (this_variable_first, False)
+        raise Exception("UNREACHABLE")
+    return inner(sequence)
+
+def _follow(
+    is_term: Callable[[A | B], TypeGuard[B]],
+    grammar: Sequence[Tuple[A, Sequence[A | B]]],
+) -> Mapping[A, Collection[B]]:
+    """
+    Produce a table indicating exactly which terminals can follow each variable
+
+    >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE
+    {<Start>: set(),
+     <Idents>: {Newline},
+     <Clauses>: {Eof},
+     <Clauses_>: {Eof},
+     <Clause>: {Newline, Eof},
+     <Clause_>: {Newline, Eof},
+     <Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
+     <Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
+     <CSTerms>: {CloseP}}
+    """
+    follow_table: Mapping[A, Set[B]] = {
+        variable: set()
+        for (variable, _) in grammar
+    }
+    def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]:
+        handle_first, handle_derives_epsilon = _first(is_term, grammar, handle)
+        return set(handle_first) | (follows_handle if handle_derives_epsilon else set())
+
+    def inner(prev_table: Mapping[A, Set[B]]) -> Mapping[A, Set[B]]:
+        new_table = reduce(
+            lambda acc, entry: acc | {entry[0]: acc[entry[0]] | entry[1]},
+            [
+                (
+                    cast(A, handle[i]),
+                    following_tokens(handle[i+1:], prev_table[variable])
+                )
+                for (variable, handle) in grammar
+                for i in range(len(handle))
+                if not is_term(handle[i])
+            ],
+            prev_table
+        )
+        if new_table == prev_table:
+            return new_table
+        else:
+            return inner(new_table)
+    return inner(follow_table)
+
+def _predict(
+    is_term: Callable[[A | B], TypeGuard[B]],
+    grammar: Sequence[Tuple[A, Sequence[A | B]]],
+    follow: Mapping[A, Collection[B]],
+    lhs: A,
+    rhs: Sequence[A | B]
+) -> Collection[B]:
+    """
+    Given a production, identify the terminals which this production would be valid under
+
+    >>> is_tok = flip(cur2(isinstance))(Tok)
+    >>> follow = _follow(is_tok, GRAMMAR)
+    >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
+    {Negate, Identifier}
+    """
+    first_rhs, epsilon_rhs = _first(is_term, grammar, rhs)
+    if epsilon_rhs:
+        return set(follow[lhs]) | set(first_rhs)
+    else:
+        return first_rhs
+
+def oracle(
+    is_term: Callable[[A | B], TypeGuard[B]],
+    grammar: Sequence[Tuple[A, Sequence[A | B]]],
+) -> Callable[[A, B], Collection[Sequence[A | B]]]:
+    """
+    Show valid expansions of a variable based on the next terminal to be read
+
+    For valid LL(1) grammars, there should never be more than one valid expansion.
+
+    The inner method constructed is memoized for your convenience.
+
+    >>> my_oracle = oracle(flip(cur2(isinstance))(Tok), GRAMMAR)
+
+    One valid expansion:
+    >>> my_oracle(Variable.Clauses_, Tok.Negate)
+    [[<Clause>, <Clauses>]]
+
+    One valid expansion, but it expands to epsilon:
+    >>> my_oracle(Variable.Clauses_, Tok.Eof)
+    [[]]
+
+    Zero valid expansions:
+    >>> my_oracle(Variable.Term, Tok.Newline)
+    []
+    """
+    follow = _follow(is_term, grammar)
+
+    @wraps(oracle)
+    @cache
+    def inner(v: A, c: B) -> Collection[Sequence[A | B]]:
+        return [
+            handle
+            for (lhs, handle) in grammar
+            if lhs == v
+                and c in _predict(is_term, grammar, follow, lhs, handle)
+        ]
+    return inner
+
+def oracle_table(
+    is_term: Callable[[A | B], TypeGuard[B]],
+    grammar: Sequence[Tuple[A, Sequence[A | B]]],
+) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
+    """
+    A variant of `oracle` that generates a table immediately rather than lazily
+
+    No significant performance benefit
+
+    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR)
+
+    One valid expansion:
+    >>> my_oracle_table[Variable.Clauses_][Tok.Negate]
+    [[<Clause>, <Clauses>]]
+
+    One valid expansion, but it expands to epsilon:
+    >>> my_oracle_table[Variable.Clauses_][Tok.Eof]
+    [[]]
+
+    Zero valid expansions:
+    >>> my_oracle_table[Variable.Term][Tok.Newline]
+    []
+    """
+    all_variables = { lhs for (lhs, rhs) in grammar }
+    all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) }
+    the_oracle = oracle(is_term, grammar)
+    return {
+        v: {
+            t: the_oracle(v, t)
+            for t in all_terminals
+        }
+        for v in all_variables
+    }
+
+def print_oracle_table(
+    oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]],
+    render: Callable[[A | B], str],
+) -> str:
+    """
+    Pretty prints an oracle table
+
+    The render function is expected to render terminals and variables.  If the render
+    function produces valid python, then `print_oracle_table` will also produce valid
+    python.
+
+    ### Example:
+
+    We generate a simple grammar:
+
+    >>> class SimpleVariable(IntEnum):
+    ...     Sum = auto()
+    ...     Sum_ = auto()
+    ...     Term = auto()
+
+    >>> class SimpleTerminal(IntEnum):
+    ...     Number = auto()
+    ...     Letter = auto()
+    ...     Plus = auto()
+
+    >>> grammar = [
+    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
+    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
+    ...     (SimpleVariable.Sum_, []),
+    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
+    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
+    ... ]
+
+    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
+    >>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}')
+    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
+    {
+    SimpleVariable.Sum: {
+        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
+        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
+        SimpleTerminal.Plus: []
+    },
+    SimpleVariable.Sum_: {
+        SimpleTerminal.Number: [],
+        SimpleTerminal.Letter: [],
+        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
+    },
+    SimpleVariable.Term: {
+        SimpleTerminal.Number: [[SimpleTerminal.Number]],
+        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
+        SimpleTerminal.Plus: []
+    }
+    }
+    """
+    return '{\n' + ",\n".join([
+        f'{render(v)}: {"{"}\n' + ',\n'.join([
+            f'\t{render(t)}: [' + ', '.join([
+                '[' + ', '.join([
+                    render(symbol)
+                    for symbol in expansion
+                ]) + ']'
+                for expansion in expansions
+            ]) + ']'
+            for (t, expansions) in term_table.items()
+        ]) + '\n}'
+        for (v, term_table) in oracle_table.items()
+    ]) + '\n}'
+
+EA = TypeVar('EA', bound=Enum)
+EB = TypeVar('EB', bound=Enum)
+def print_oracle_table_enum(
+    oracle_table: Mapping[EA, Mapping[EB, Collection[Sequence[EA | EB]]]]
+) -> str:
+    """
+    A special case of `print_oracle_table` where tokens and variables are enums
+
+    Always produces valid python.
+
+    ### Example:
+
+    We generate a simple grammar:
+
+    >>> class SimpleVariable(IntEnum):
+    ...     Sum = auto()
+    ...     Sum_ = auto()
+    ...     Term = auto()
+
+    >>> class SimpleTerminal(IntEnum):
+    ...     Number = auto()
+    ...     Letter = auto()
+    ...     Plus = auto()
+
+    >>> grammar = [
+    ...     (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]),
+    ...     (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]),
+    ...     (SimpleVariable.Sum_, []),
+    ...     (SimpleVariable.Term, [SimpleTerminal.Number]),
+    ...     (SimpleVariable.Term, [SimpleTerminal.Letter]),
+    ... ]
+
+    >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar)
+    >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table)
+    >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE
+    {
+    SimpleVariable.Sum: {
+        SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]],
+        SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]],
+        SimpleTerminal.Plus: []
+    },
+    SimpleVariable.Sum_: {
+        SimpleTerminal.Number: [],
+        SimpleTerminal.Letter: [],
+        SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]]
+    },
+    SimpleVariable.Term: {
+        SimpleTerminal.Number: [[SimpleTerminal.Number]],
+        SimpleTerminal.Letter: [[SimpleTerminal.Letter]],
+        SimpleTerminal.Plus: []
+    }
+    }
+    """
+    return print_oracle_table(oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}') #type: ignore
+
+if __name__ == '__main__':
+    import doctest
+    from lex import Tok
+    from parse import GRAMMAR, Variable
+    failure_count, test_count = doctest.testmod()
+    if failure_count:
+        print('\n\nRefusing to build oracle table due to test failures')
+        raise SystemExit(1)
+    else:
+        print(print_oracle_table_enum(oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR))) #type: ignore
\ No newline at end of file
diff --git a/build_oracle.sh b/build_oracle.sh
new file mode 100644
index 0000000..54309c8
--- /dev/null
+++ b/build_oracle.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+cat << EOF > oracle_table.py
+from lex import Tok
+from parse import Variable
+
+oracle_table = (
+EOF
+
+if python build_oracle.py >> oracle_table.py; then
+    echo ")" >> oracle_table.py
+    echo "Built oracle_table.py"
+else
+    rm oracle_table.py
+    python build_oracle.py
+fi
\ No newline at end of file
diff --git a/parse.py b/parse.py
new file mode 100644
index 0000000..4ddbc02
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,148 @@
+"""
+Implements a parser for the following grammar:
+
+Start    := PredicateSection <Idents> Newline
+            VariablesSection <Idents> Newline
+            ConstantsSection <Idents> Newline
+            FunctionsSection <Idents> Newline
+            ClausesSection <Clauses> Eof
+
+Idents   := Identifier <Idents>
+         := ε
+
+Clauses  := Newline <Clauses'>
+         := ε
+
+Clauses' := <Clause> <Clauses>
+         := ε
+
+Clause   := <Term> <Clause'>
+
+Clause'  := <Clause>
+         := ε
+
+Term     := Negate <Term>
+         := Identifier <Func?>
+
+Func?    := OpenP <CSTerms> CloseP
+         := ε
+
+CSTerms  := Comma <Term> <CSTerms>
+         := ε
+"""
+
+from emis_funky_funktions import *
+
+from enum import auto, IntEnum
+from functools import cache, reduce
+from operator import getitem
+from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard
+
+from lex import Tok
+
+class Variable(IntEnum):
+    Start = auto()
+    Idents = auto()
+    Clauses = auto()
+    Clauses_ = auto()
+    Clause = auto()
+    Clause_ = auto()
+    Term = auto()
+    Func = auto()
+    CSTerms = auto()
+
+    def __repr__(self) -> str:
+        return f'<{self._name_}>'
+
+GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
+    (Variable.Start,
+        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
+        , Tok.VariablesSection, Variable.Idents, Tok.Newline
+        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
+        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
+        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),
+
+    (Variable.Idents,
+        [ Tok.Identifier, Variable.Idents ]),
+    (Variable.Idents,
+        [ ]),
+
+    (Variable.Clauses,
+        [ Tok.Newline, Variable.Clauses_ ]),
+    (Variable.Clauses,
+        [ ]),
+
+    (Variable.Clauses_,
+        [ Variable.Clause, Variable.Clauses ]),
+    (Variable.Clauses_,
+        [ ]),
+
+    (Variable.Clause,
+        [ Variable.Term, Variable.Clause_ ]),
+
+    (Variable.Clause_,
+        [ Variable.Clause ]),
+    (Variable.Clause_,
+        [ ]),
+
+    (Variable.Term,
+        [ Tok.Negate, Variable.Term ]),
+    (Variable.Term,
+        [ Tok.Identifier, Variable.Func ]),
+
+    (Variable.Func,
+        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
+    (Variable.Func,
+        [ ]),
+
+    (Variable.CSTerms,
+        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
+    (Variable.CSTerms,
+        [ ]),
+]
+
+
+# ### FIRST Table ###
+#
+# Start    : PredicateSection
+# Idents   : Identifier, ε
+# Clauses  : Newline, ε
+# Clauses' : Negate, Identifier, ε
+# Clause   : Negate, Identifier
+# Clause'  : Negate, Identifier, ε
+# Term     : Negate, Identifier
+# Func?    : OpenP, ε
+# CSTerms  : Comma, ε
+#
+#
+#
+# ### FOLLOW Table ###
+#
+# Idents   : Newline
+# Clauses  : Eof
+# Clauses' : Eof
+# Clause   : Newline, Eof
+# Clause'  : Newline, Eof
+# Term     : Negate, Identifier, Newline, Eof, Comma, CloseP
+# Func?    : Negate, Identifier, Newline, Eof, Comma, CloseP
+# CSTerms  : CloseP
+#
+#
+#
+# ### PREDICT Table (one row per production, in grammar order) ###
+#
+# Idents   : Identifier
+#          : Newline
+# Clauses  : Newline
+#          : Eof
+# Clauses' : Negate, Identifier
+#          : Eof
+# Clause   : Negate, Identifier
+# Clause'  : Negate, Identifier
+#          : Newline, Eof
+# Term     : Negate
+#          : Identifier
+# Func?    : OpenP
+#          : Negate, Identifier, Newline, Eof, Comma, CloseP
+# CSTerms  : Comma
+#          : CloseP
\ No newline at end of file