""" Tools for building an oracle table See `grammar` and `build_oracle.sh` for scripts which actually produce python code. This module only produces an oracle table in python, without outputting it. NOTE! Doctests in this module use `GRAMMAR` from `grammar.py` and `EGRAMMAR` as a version of that grammar with the actions erased, with `_erase_actions()`. """ from emis_funky_funktions import * from enum import auto, Enum, IntEnum from functools import cache, reduce from operator import getitem from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar def _erase_actions_h( handle: Sequence[A | B | C], is_not_c: Callable[[A | B | C], TypeGuard[A | B]] ) -> Sequence[A | B]: """ Produce an identical handle, but with all the actions removed """ return [i for i in handle if is_not_c(i)] def _erase_actions( grammar: Sequence[Tuple[A, Sequence[A | B | C]]], is_not_c: Callable[[A | B | C], TypeGuard[A | B]] ) -> Sequence[Tuple[A, Sequence[A | B]]]: """ Produce an identical grammar, but with all the actions removed """ return [ (var, _erase_actions_h(handle, is_not_c)) for (var, handle) in grammar ] def _first( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], sequence: Sequence[A | B] ) -> Tuple[Collection[B], bool]: """ Computes all of the possible starting terminals for a handle in a given grammar Due to pathetic python weaknesses, the first argument you must provide is a type guard to determine whether a certain thing is a terminal as opposed to a variable. Then, pass in the grammar and the sequence of terminals and variables in question. The output contains two values. The first is a set of possible terminals, and the second is a boolean indicating whether this term can derive epsilon. >>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.Clause]) ({Negate, Identifier}, False) >>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms]) ({Comma}, True) >>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms, Tok.CloseP]) ({CloseP, Comma}, False) """ def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]: match vs: case []: return (set(), True) case [v, *rest] if is_term(v): return ({v}, False) case [v, *rest]: this_variable_first, derives_epsilon = reduce( lambda acc, result: (acc[0] | result[0], acc[1] or result[1]), [ inner(handle) for (other_variable, handle) in grammar if other_variable == v ] ) if derives_epsilon: rest_first, rest_derives_epsilon = inner(rest) return (rest_first | this_variable_first, rest_derives_epsilon) else: return (this_variable_first, False) raise Exception("UNREACHABLE") return inner(sequence) def _follow( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Collection[B]]: """ Produce a table indicating exactly which terminals can follow each variable >>> _follow(flip(cur2(isinstance))(Tok), EGRAMMAR) #doctest: +NORMALIZE_WHITESPACE {: set(), : {Newline}, : {Eof}, : {Eof}, : {Newline, Eof}, : {Newline, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {Newline, Negate, CloseP, Comma, Identifier, Eof}, : {CloseP}} """ follow_table: Mapping[A, Set[B]] = { variable: set() for (variable, _) in grammar } def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]: handle_first, handle_derives_epsilon = _first(is_term, grammar, handle) return set(handle_first) | (follows_handle if handle_derives_epsilon else set()) def inner(prev_table: Mapping[A, Set[B]]) -> Mapping[A, Set[B]]: new_table = reduce( lambda acc, entry: acc | {entry[0]: acc[entry[0]] | entry[1]}, [ ( cast(A, handle[i]), following_tokens(handle[i+1:], prev_table[variable]) ) for (variable, handle) in grammar for i in range(len(handle)) if not is_term(handle[i]) ], prev_table ) if new_table == prev_table: return new_table else: return inner(new_table) return inner(follow_table) def _predict( is_term: Callable[[A | B], TypeGuard[B]], grammar: Sequence[Tuple[A, Sequence[A | B]]], follow: Mapping[A, Collection[B]], lhs: A, rhs: Sequence[A | B] ) -> Collection[B]: """ Given a production, identify the terminals which this production would be valid under >>> is_tok = flip(cur2(isinstance))(Tok) >>> follow = _follow(is_tok, EGRAMMAR) >>> _predict(is_tok, EGRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_]) {Negate, Identifier} """ first_rhs, epsilon_rhs = _first(is_term, grammar, rhs) if epsilon_rhs: return set(follow[lhs]) | set(first_rhs) else: return first_rhs def oracle_table( is_term: Callable[[A | B], TypeGuard[B]], is_var: Callable[[A | B], TypeGuard[A]], grammar: Sequence[Tuple[A, Sequence[A | B]]], ) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]: """ A variant of `_oracle` that generates a table immediately rather than lazily No significant performance benefit >>> is_tok = p_instance(Tok) >>> is_var = p_instance(Variable) >>> my_oracle_table = oracle_table(is_tok, is_var, EGRAMMAR) One valid expansion: >>> my_oracle_table[Variable.Clauses_][Tok.Negate] [[, ]] One valid expansion, but it expands to epsilon: >>> my_oracle_table[Variable.Clauses_][Tok.Eof] [[]] Zero valid expansions: >>> my_oracle_table[Variable.Term][Tok.Newline] [] """ all_variables = { lhs for (lhs, rhs) in grammar } all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) } is_not_c: Callable[[A | B | C], TypeGuard[A | B]] = lambda x: is_term(x) or is_var(x) #type:ignore e_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_not_c) #type:ignore follow = _follow(is_term, e_grammar) return { v: { t: [ handle for (lhs, handle) in grammar if lhs == v and t in _predict(is_term, e_grammar, follow, lhs, _erase_actions_h(handle, is_not_c)) #type:ignore ] for t in all_terminals } for v in all_variables } if __name__ == '__main__': import doctest from grammar import GRAMMAR, Tok, Variable EGRAMMAR = _erase_actions(GRAMMAR, lambda x: not hasattr(x, '__call__')) #type: ignore doctest.testmod()