""" Tools for building an oracle table If this module is run directly in python, it will spit out valid python which produces an oracle table for the grammar defined in `grammar.py`. It's recommended that this be done using `build_oracle.sh` instead, however, which will build a whole python module containing the oracle table, complete with imports. This module can also be used on its own to generate an oracle table on the fly. Note that, when run directly, this module will refuse to build an oracle table if ANY of the tests defined within the module fail. """ from emis_funky_funktions import * from enum import auto, Enum, IntEnum from functools import cache, reduce from operator import getitem from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar def _first( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], sequence: Sequence[A | B] ) -> Tuple[Collection[B], bool]: """ Computes all of the possible starting terminals for a handle in a given grammar Due to pathetic python weaknesses, the first argument you must provide is a type guard to determine whether a certain thing is a terminal as opposed to a variable. Then, pass in the grammar and the sequence of terminals and variables in question. The output contains two values. The first is a set of possible terminals, and the second is a boolean indicating whether this term can derive epsilon. >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause]) ({Negate, Identifier}, False) >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms]) ({Comma}, True) >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP]) ({CloseP, Comma}, False) """ def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]: match vs: case []: return (set(), True) case [v, *rest] if is_term(v): return ({v}, False) case [v, *rest]: this_variable_first, derives_epsilon = reduce( lambda acc, result: (acc[0] | result[0], acc[1] or result[1]), [ inner(handle) for (other_variable, handle) in grammar if other_variable == v ] ) if derives_epsilon: rest_first, rest_derives_epsilon = inner(rest) return (rest_first | this_variable_first, rest_derives_epsilon) else: return (this_variable_first, False) raise Exception("UNREACHABLE") return inner(sequence) def _follow( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Collection[B]]: """ Produce a table indicating exactly which terminals can follow each variable >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE {: set(), : {Newline}, : {Eof}, : {Eof}, : {Newline, Eof}, : {Newline, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {CloseP}} """ follow_table: Mapping[A, Set[B]] = { variable: set() for (variable, _) in grammar } def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]: handle_first, handle_derives_epsilon = _first(is_term, grammar, handle) return set(handle_first) | (follows_handle if handle_derives_epsilon else set()) def inner(prev_table: Mapping[A, Set[B]]) -> Mapping[A, Set[B]]: new_table = reduce( lambda acc, entry: acc | {entry[0]: acc[entry[0]] | entry[1]}, [ ( cast(A, handle[i]), following_tokens(handle[i+1:], prev_table[variable]) ) for (variable, handle) in grammar for i in range(len(handle)) if not is_term(handle[i]) ], prev_table ) if new_table == prev_table: return new_table else: return inner(new_table) return inner(follow_table) def _predict( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], follow: Mapping[A, Collection[B]], lhs: A, rhs: Sequence[A | B] ) -> Collection[B]: """ Given a production, identify the terminals which this production would be valid under >>> is_tok = flip(cur2(isinstance))(Tok) >>> follow = _follow(is_tok, GRAMMAR) >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_]) {Negate, Identifier} """ first_rhs, epsilon_rhs = _first(is_term, grammar, rhs) if epsilon_rhs: return set(follow[lhs]) | set(first_rhs) else: return first_rhs def oracle( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Callable[[A, B], Collection[Sequence[A | B]]]: """ Show valid expansions of a variable based on the next terminal to be read For valid LL(1) grammars, there should never be more than one valid expansion. The inner method constructed is memoized for your convenience. >>> my_oracle = oracle(flip(cur2(isinstance))(Tok), GRAMMAR) One valid expansion: >>> my_oracle(Variable.Clauses_, Tok.Negate) [[, ]] One valid expansion, but it expands to epsilon: >>> my_oracle(Variable.Clauses_, Tok.Eof) [[]] Zero valid expansions: >>> my_oracle(Variable.Term, Tok.Newline) [] """ follow = _follow(is_term, grammar) @wraps(oracle) @cache def inner(v: A, c: B) -> Collection[Sequence[A | B]]: return [ handle for (lhs, handle) in grammar if lhs == v and c in _predict(is_term, grammar, follow, lhs, handle) ] return inner def oracle_table( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]: """ A variant of `_oracle` that generates a table immediately rather than lazily No significant performance benefit >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR) One valid expansion: >>> my_oracle_table[Variable.Clauses_][Tok.Negate] [[, ]] One valid expansion, but it expands to epsilon: >>> my_oracle_table[Variable.Clauses_][Tok.Eof] [[]] Zero valid expansions: >>> my_oracle_table[Variable.Term][Tok.Newline] [] """ all_variables = { lhs for (lhs, rhs) in grammar } all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) } the_oracle = oracle(is_term, grammar) return { v: { t: the_oracle(v, t) for t in all_terminals } for v in all_variables } def print_oracle_table( oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]], render: Callable[[A | B], str], ) -> str: """ Pretty prints an oracle table The render function is expected to render terminals and variables. If the render function produces valid python, then `print_oracle_table` will also produce valid python. ### Example: We generate a simple grammar: >>> class SimpleVariable(IntEnum): ... Sum = auto() ... Sum_ = auto() ... Term = auto() >>> class SimpleTerminal(IntEnum): ... Number = auto() ... Letter = auto() ... Plus = auto() >>> grammar = [ ... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]), ... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]), ... (SimpleVariable.Sum_, []), ... (SimpleVariable.Term, [SimpleTerminal.Number]), ... (SimpleVariable.Term, [SimpleTerminal.Letter]), ... ] >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar) >>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}') >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE { SimpleVariable.Sum: { SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Plus: [] }, SimpleVariable.Sum_: { SimpleTerminal.Number: [], SimpleTerminal.Letter: [], SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]] }, SimpleVariable.Term: { SimpleTerminal.Number: [[SimpleTerminal.Number]], SimpleTerminal.Letter: [[SimpleTerminal.Letter]], SimpleTerminal.Plus: [] } } """ return '{\n' + ",\n".join([ f'{render(v)}: {"{"}\n' + ',\n'.join([ f'\t{render(t)}: [' + ', '.join([ '[' + ', '.join([ render(symbol) for symbol in expansion ]) + ']' for expansion in expansions ]) + ']' for (t, expansions) in term_table.items() ]) + '\n}' for (v, term_table) in oracle_table.items() ]) + '\n}' EA = TypeVar('EA', bound=Enum) EB = TypeVar('EB', bound=Enum) def print_oracle_table_enum( oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]] ) -> str: """ A special case of `print_oracle_table` where tokens and variables are enums Always produces valid python. ### Example: We generate a simple grammar: >>> class SimpleVariable(IntEnum): ... Sum = auto() ... Sum_ = auto() ... Term = auto() >>> class SimpleTerminal(IntEnum): ... Number = auto() ... Letter = auto() ... Plus = auto() >>> grammar = [ ... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]), ... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]), ... (SimpleVariable.Sum_, []), ... (SimpleVariable.Term, [SimpleTerminal.Number]), ... (SimpleVariable.Term, [SimpleTerminal.Letter]), ... ] >>> my_oracle_table = oracle_table(flip(cur2(isinstance))(SimpleTerminal), grammar) >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table) >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE { SimpleVariable.Sum: { SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Plus: [] }, SimpleVariable.Sum_: { SimpleTerminal.Number: [], SimpleTerminal.Letter: [], SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]] }, SimpleVariable.Term: { SimpleTerminal.Number: [[SimpleTerminal.Number]], SimpleTerminal.Letter: [[SimpleTerminal.Letter]], SimpleTerminal.Plus: [] } } """ return print_oracle_table(oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}') #type: ignore if __name__ == '__main__': import doctest from lex import Tok from parse import GRAMMAR, Variable failure_count, test_count = doctest.testmod() if failure_count: print('\n\nRefusing to build oracle table due to test failures') exit(1) else: print(print_oracle_table_enum(oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR))) #type: ignore