""" Tools for building an oracle table See `grammar` and `build_oracle.sh` for scripts which actually produce python code. This module only produces an oracle table in python, without outputting it. """ from emis_funky_funktions import * from enum import auto, Enum, IntEnum from functools import cache, reduce from operator import getitem from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar def _erase_actions_h( handle: Sequence[A | B | C], is_not_c: Callable[[A | B | C], TypeGuard[A | B]] ) -> Sequence[A | B]: """ Produce an identical handle, but with all the actions removed """ return [i for i in handle if is_not_c(i)] def _erase_actions( grammar: Sequence[Tuple[A, Sequence[A | B | C]]], is_not_c: Callable[[A | B | C], TypeGuard[A | B]] ) -> Sequence[Tuple[A, Sequence[A | B]]]: """ Produce an identical grammar, but with all the actions removed """ return [ (var, _erase_actions_h(handle, is_not_c)) for (var, handle) in grammar ] def _first( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], sequence: Sequence[A | B] ) -> Tuple[Collection[B], bool]: """ Computes all of the possible starting terminals for a handle in a given grammar Due to pathetic python weaknesses, the first argument you must provide is a type guard to determine whether a certain thing is a terminal as opposed to a variable. Then, pass in the grammar and the sequence of terminals and variables in question. The output contains two values. The first is a set of possible terminals, and the second is a boolean indicating whether this term can derive epsilon. >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.Clause]) ({Negate, Identifier}, False) >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms]) ({Comma}, True) >>> _first(flip(cur2(isinstance))(Tok), GRAMMAR, [Variable.CSTerms, Tok.CloseP]) ({CloseP, Comma}, False) """ def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]: match vs: case []: return (set(), True) case [v, *rest] if is_term(v): return ({v}, False) case [v, *rest]: this_variable_first, derives_epsilon = reduce( lambda acc, result: (acc[0] | result[0], acc[1] or result[1]), [ inner(handle) for (other_variable, handle) in grammar if other_variable == v ] ) if derives_epsilon: rest_first, rest_derives_epsilon = inner(rest) return (rest_first | this_variable_first, rest_derives_epsilon) else: return (this_variable_first, False) raise Exception("UNREACHABLE") return inner(sequence) def _follow( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Collection[B]]: """ Produce a table indicating exactly which terminals can follow each variable >>> _follow(flip(cur2(isinstance))(Tok), GRAMMAR) #doctest: +NORMALIZE_WHITESPACE {: set(), : {Newline}, : {Eof}, : {Eof}, : {Newline, Eof}, : {Newline, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {CloseP}} """ follow_table: Mapping[A, Set[B]] = { variable: set() for (variable, _) in grammar } def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]: handle_first, handle_derives_epsilon = _first(is_term, grammar, handle) return set(handle_first) | (follows_handle if handle_derives_epsilon else set()) def inner(prev_table: Mapping[A, Set[B]]) -> Mapping[A, Set[B]]: new_table = reduce( lambda acc, entry: acc | {entry[0]: acc[entry[0]] | entry[1]}, [ ( cast(A, handle[i]), following_tokens(handle[i+1:], prev_table[variable]) ) for (variable, handle) in grammar for i in range(len(handle)) if not is_term(handle[i]) ], prev_table ) if new_table == prev_table: return new_table else: return inner(new_table) return inner(follow_table) def _predict( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], follow: Mapping[A, Collection[B]], lhs: A, rhs: Sequence[A | B] ) -> Collection[B]: """ Given a production, identify the terminals which this production would be valid under >>> is_tok = flip(cur2(isinstance))(Tok) >>> follow = _follow(is_tok, GRAMMAR) >>> _predict(is_tok, GRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_]) {Negate, Identifier} """ first_rhs, epsilon_rhs = _first(is_term, grammar, rhs) if epsilon_rhs: return set(follow[lhs]) | set(first_rhs) else: return first_rhs def oracle( is_term: Callable[[A | B | C], TypeGuard[B]], is_var: Callable[[A | B | C], TypeGuard[A]], grammar: Sequence[Tuple[A, Sequence[A | B | C]]], ) -> Callable[[A, B], Collection[Sequence[A | B | C]]]: """ Show valid expansions of a variable based on the next terminal to be read For valid LL(1) grammars, there should never be more than one valid expansion. The inner method constructed is memoized for your convenience. >>> is_tok = p_instance(Tok) >>> is_var = p_instance(Variable) >>> my_oracle = oracle(is_tok, is_var, GRAMMAR) One valid expansion: >>> my_oracle(Variable.Clauses_, Tok.Negate) [[, ]] One valid expansion, but it expands to epsilon: >>> my_oracle(Variable.Clauses_, Tok.Eof) [[]] Zero valid expansions: >>> my_oracle(Variable.Term, Tok.Newline) [] """ is_not_c: Callable[[A | B | C], TypeGuard[A | B]] = lambda x: is_term(x) or is_var(x) #type:ignore e_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_not_c) follow = _follow(is_term, e_grammar) @wraps(oracle) @cache def inner(v: A, c: B) -> Collection[Sequence[A | B | C]]: return [ handle for (lhs, handle) in grammar if lhs == v and c in _predict(is_term, e_grammar, follow, lhs, _erase_actions_h(handle, is_not_c)) ] return inner def oracle_table( is_term: Callable[[A | B], TypeGuard[B]], is_var: Callable[[A | B], TypeGuard[A]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]: """ A variant of `_oracle` that generates a table immediately rather than lazily No significant performance benefit >>> is_tok = p_instance(Tok) >>> is_var = p_instance(Variable) >>> my_oracle_table = oracle_table(is_tok, is_var, GRAMMAR) One valid expansion: >>> my_oracle_table[Variable.Clauses_][Tok.Negate] [[, ]] One valid expansion, but it expands to epsilon: >>> my_oracle_table[Variable.Clauses_][Tok.Eof] [[]] Zero valid expansions: >>> my_oracle_table[Variable.Term][Tok.Newline] [] """ all_variables = { lhs for (lhs, rhs) in grammar } all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) } the_oracle = oracle(is_term, is_var, grammar) return { v: { t: the_oracle(v, t) for t in all_terminals } for v in all_variables } def print_oracle_table( oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]], render: Callable[[A | B], str], ) -> str: """ Pretty prints an oracle table The render function is expected to render terminals and variables. If the render function produces valid python, then `print_oracle_table` will also produce valid python. ### Example: We generate a simple grammar: >>> class SimpleVariable(IntEnum): ... Sum = auto() ... Sum_ = auto() ... Term = auto() >>> class SimpleTerminal(IntEnum): ... Number = auto() ... Letter = auto() ... Plus = auto() >>> grammar = [ ... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]), ... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]), ... (SimpleVariable.Sum_, []), ... (SimpleVariable.Term, [SimpleTerminal.Number]), ... (SimpleVariable.Term, [SimpleTerminal.Letter]), ... ] >>> is_tok = p_instance(SimpleTerminal) >>> is_var = p_instance(SimpleVariable) >>> my_oracle_table = oracle_table(is_tok, is_var, grammar) >>> rendered_oracle_table = print_oracle_table(my_oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}') >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE { SimpleVariable.Sum: { SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Plus: [] }, SimpleVariable.Sum_: { SimpleTerminal.Number: [], SimpleTerminal.Letter: [], SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]] }, SimpleVariable.Term: { SimpleTerminal.Number: [[SimpleTerminal.Number]], SimpleTerminal.Letter: [[SimpleTerminal.Letter]], SimpleTerminal.Plus: [] } } """ return '{\n' + ",\n".join([ f'{render(v)}: {"{"}\n' + ',\n'.join([ f'\t{render(t)}: [' + ', '.join([ '[' + ', '.join([ render(symbol) for symbol in expansion ]) + ']' for expansion in expansions ]) + ']' for (t, expansions) in term_table.items() ]) + '\n}' for (v, term_table) in oracle_table.items() ]) + '\n}' EA = TypeVar('EA', bound=Enum) EB = TypeVar('EB', bound=Enum) def print_oracle_table_enum( oracle_table: Mapping[A, Mapping[B, Collection[Sequence[A | B]]]] ) -> str: """ A special case of `print_oracle_table` where tokens and variables are enums Always produces valid python. ### Example: We generate a simple grammar: >>> class SimpleVariable(IntEnum): ... Sum = auto() ... Sum_ = auto() ... Term = auto() >>> class SimpleTerminal(IntEnum): ... Number = auto() ... Letter = auto() ... Plus = auto() >>> grammar = [ ... (SimpleVariable.Sum, [SimpleVariable.Term, SimpleVariable.Sum_]), ... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum]), ... (SimpleVariable.Sum_, []), ... (SimpleVariable.Term, [SimpleTerminal.Number]), ... (SimpleVariable.Term, [SimpleTerminal.Letter]), ... ] >>> is_tok = p_instance(SimpleTerminal) >>> is_var = p_instance(SimpleVariable) >>> my_oracle_table = oracle_table(is_tok, is_var, grammar) >>> rendered_oracle_table = print_oracle_table_enum(my_oracle_table) >>> print(rendered_oracle_table) #doctest: +NORMALIZE_WHITESPACE { SimpleVariable.Sum: { SimpleTerminal.Number: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Letter: [[SimpleVariable.Term, SimpleVariable.Sum_]], SimpleTerminal.Plus: [] }, SimpleVariable.Sum_: { SimpleTerminal.Number: [], SimpleTerminal.Letter: [], SimpleTerminal.Plus: [[SimpleTerminal.Plus, SimpleVariable.Sum]] }, SimpleVariable.Term: { SimpleTerminal.Number: [[SimpleTerminal.Number]], SimpleTerminal.Letter: [[SimpleTerminal.Letter]], SimpleTerminal.Plus: [] } } """ return print_oracle_table(oracle_table, lambda e: f'{e.__class__.__name__}.{e.name}') #type: ignore if __name__ == '__main__': import doctest from grammar import GRAMMAR, Tok, Variable doctest.testmod()