from emis_funky_funktions import * from dataclasses import dataclass from enum import auto, IntEnum from functools import reduce from re import compile, Pattern from lex import Lexeme, tokenize from typing import Any, Callable, Collection, Mapping, Sequence, Tuple, TypeAlias # Grammar # S :=

Eof # P := +

# P := # M := * # M := # T := Letter # T := Number # T := (

) class TrivialTok(IntEnum): "A set of tokens for a trivial grammar used for testing" EOF = auto() "Special token: End of file" ADD = auto() "Addition (+)" MUL = auto() "Multiplication (*)" LTR = auto() "A single letter (x)" NUM = auto() "A number (3)" OPN = auto() "An open paren, i.e. '('" CLS = auto() "An close paren, i.e. ')'" NUL = auto() "Whitespace ( )" def __repr__(self): return self._name_ TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [ (compile(r"\+" ), TrivialTok.ADD), (compile(r"\*" ), TrivialTok.MUL), (compile(r"[a-zA-Z]"), TrivialTok.LTR), (compile("\d+" ), TrivialTok.NUM), (compile("\(" ), TrivialTok.OPN), (compile("\)" ), TrivialTok.CLS), (compile("\s+" ), TrivialTok.NUL), ] """ A mapping of regexs to the tokens the identify in a trivial grammar for testing Tokens earlier on in the list should be regarded as higher priority, even if a match lower on the list also matches. All unicode strings should be matched by at least one token. """ lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\ c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore """ A lexer for the trivial grammar defined above Throws an error if the lex fails. >>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE [[NUM: '1']@(1, 1-1), [ADD: '+']@(1, 3-3), [NUM: '3']@(1, 5-5), [EOF: '']@(1, 6-6)] """ ############# Combinator Parsing ################ Out = TypeVar('Out') Out1 = TypeVar('Out1') Out2 = TypeVar('Out2') TokN = TypeVar('TokN') LexStream: TypeAlias = Sequence[Lexeme[TokN]] ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]] @dataclass(frozen=True) class Parser(Generic[Out, TokN]): """ A parser which consumes a token stream and produces a series of possible parses Each possible parse consists of the subsection of the input stream left over after the parse, along with the output of that parse. If the parse fails, the error returned will be a series of two-tuples, each containing the lexeme at which the error occurred and the token which was expected in its stead. By the nature of combinator parsing, parsers can be built and tested in small pieces. To this end, each combinator method on this class comes with a small example demonstrating how to build a trivial parser using that method. However, in the following section, we also provide an example for a grammar that is still trivial, but slightly less so. ### Example: Arithmetic Grammar Let us define the following grammar for basic additive and multiplicative arithmetic ``` S :=

Eof P := +

P := M := * M := T := Number T := (

) ``` As with the rest of the examples in this class, we will use the `TrivialTok` token class to build this parser. Working our way from the bottom up, we start with defining a parser for T. For the parenthetical production, we use a lazy parser to refer to the parser for P, which hasn't been constructed yet. >>> parse_parens = Parser.token(TrivialTok.OPN)\\ ... .bind(k(Parser.lazy(lambda: parse_p)))\\ ... .seq_ignore_tok(TrivialTok.CLS) >>> parse_num = Parser.token(TrivialTok.NUM)\\ ... .map(c(int, Lexeme.get_match)) >>> parse_t = parse_parens.or_(parse_num) For multiplication, we use `Parser.many()` to represent any number of "* " matches and combine them together. This is a slight departure from how our grammar is written above. The set of accepted inputs will be the same, but our implementation will be left-associative. It is possible to implement this grammar as it is written, but this will result in a right-associative structure. Of course, multiplication and addition are associative, so in this case it doesn't matter. >>> parse_times = Parser.token(TrivialTok.MUL)\\ ... .map(k(lambda x: lambda y: x * y))\\ ... .fapply(parse_t)\\ ... .many(lambda l: lambda r: c(l, r), lambda x: x) >>> parse_m = parse_t.fapply_r(parse_times) Addition is largely the same as multiplication: >>> parse_plus = Parser.token(TrivialTok.ADD)\\ ... .map(k(lambda x: lambda y: x + y))\\ ... .fapply(parse_m)\\ ... .many(lambda l: lambda r: c(l, r), lambda x: x) >>> parse_p = parse_m.fapply_r(parse_plus) And finally, we expect and EOF after the end of the term, to ensure that we've reached the end of the input. >>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF) And now, for a few sample runs: >>> parse_s.parse_(lex_trivial('1')) Ok(1) >>> parse_s.parse_(lex_trivial('1 + 2')) Ok(3) >>> parse_s.parse_(lex_trivial('1 + 2 * 3 + 4')) Ok(11) >>> parse_s.parse_(lex_trivial('(1 + 2) * (3 + 4)')) Ok(21) >>> parse_s.parse_(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6')) Ok(69) And an example of a bad parse: # TODO fix this >>> parse_s.parse_(lex_trivial('1 * * 2')) #doctest: +ELLIPSIS Err(...) """ parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]] """ Run this parser Accepts a stream of tokens to parse, and returns either a list of possible successful parses, each with a value and a remainder of the token stream, or a list of all failures which lead to the failure of this parser, each with a lexeme they failed at and the token the expected. This is meant for use in constructing parsers, and it's probably more useful to call `parse_()`. """ def parse_(self, input: LexStream[TokN]) -> Result[Out, Mapping[Lexeme[TokN], Collection[TokN]]]: """ Run this parser, expecting at most one result which consumes the full input stream This is a wrapper around `parse()` with a few features for ease of use, including: - expects that at most one single successful parse will be produced - asserts that the parse consumed the whole input stream - aggregates failures into a multi-set of `Lexeme` -> expected tokens Normal failures are passed through as an `Err`, but if the results returned by a successful parse violate the above conditions, an assertion error will be raised. However, by carefully constructing your grammar, you can ensure that this will not happen for any input. Because of these assertions, the successful return value of this function is *just* the output type of this parse, which is much easier to use. ### Example A parser which parses a single number. Notice that we also require that it parses an EOF. Without this, it would be unable to consume the entire input stream, and thus would fail. >>> parse_num = Parser.token(TrivialTok.NUM).seq_ignore_tok(TrivialTok.EOF) >>> parse_num.parse_(lex_trivial('1312')) Ok([NUM: '1312']@(1, 1-4)) """ match self.parse(input): case Err(errors): # The input is bad failure_locations = FSet(lex for (lex, expected) in errors) return Err({ location: FSet(expected for (lex, expected) in errors if lex == location) for location in failure_locations }) case Ok([result1, result2, *rest] as possible_parses): # The grammar is bad raise AssertionError("Parse returned multiple possible parses", possible_parses) case Ok([(value, [non_empty, *rest] as remainder)]): # The grammar is bad raise AssertionError("Parse failed to consume the whole input, and left remainder", remainder) case Ok([]): # The parser code is bad raise AssertionError('"Successful" parse returned no possible parses') case Ok([(value, [])]): return Ok(value) # The code in this function is bad raise AssertionError('Unreachable') @staticmethod def epsilon(ret: Out) -> 'Parser[Out, TokN]': """ Parse an empty string, then return a constant Always succeeds, and always produces exactly one possibility >>> Parser.epsilon(100).parse(lex_trivial("+")) Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),)) """ return Parser(lambda s: Ok(((ret, s),))) @staticmethod def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]': """ Parser that only accepts a single token, and returns the parsed lexeme The first argument is a function which, given a lexeme, returns the token that that lexeme instantiates. The second argument is a token which this parser should accept >>> parse_num = Parser.token(TrivialTok.NUM) >>> parse_num.parse(lex_trivial('3')) Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),)) >>> parse_num.parse(lex_trivial('x')) Err((([LTR: 'x']@(1, 1-1), NUM),)) """ def parse_single(input: LexStream) -> ParserResult[Lexeme[TokN], TokN]: match input: case [lexeme, *rest] if lexeme.token == t: return Ok(((lexeme, rest),)) case [bad_lexeme, *rest]: return Err(((bad_lexeme, t),)) case []: raise Exception('Bad grammar! Reached an empty input') raise Exception('Unreachable') return Parser(parse_single) @staticmethod def lazy( gen: 'Callable[[], Parser[Out, TokN]]', ) -> 'Parser[Out, TokN]': """ A stand-in parser which will only be actually computed when called """ return Parser(lambda s: gen().parse(s)) def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]': """ A monadic bind operator - allows a parser to be generated from its precenent ### Example We generate a parser which reads a number then accepts exactly that many plus tokens, returning the last one. >>> some_n = Parser.token(TrivialTok.NUM) >>> n_plus = some_n.bind(lambda prev_result: ... reduce( ... lambda a, p: a.bind(k(p)), ... [ ... Parser.token(TrivialTok.ADD) ... for i in range(int(prev_result.matched_string)) ... ] ... ) ... ) **Sample Run 1**: We parse the string `3+++`. Since this is a three followed by exactly three plus signs, this should parse successfully. Sure enough, the result contains exactly one possibility, where the `3+++` has been consumed, leaving only the EOF, and returning the value of the last plus sign. >>> n_plus.parse(lex_trivial('3+++')) Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])]) **Simple Run 2**: We parse the string `3++`. This only has two of the three plus signs, so we should expect it to fail. As expected, at does, correctly identifying that it saw an EOF while expecting an ADD. >>> n_plus.parse(lex_trivial('3++')) Err([([EOF: '']@(1, 4-4), ADD)]) """ def handle_results(results: Collection[Tuple[Out, LexStream]]) -> ParserResult[Out2, TokN]: successes, errors = partition([ f(out1).parse(stream) for (out1, stream) in results ]) if len(successes): return Ok([p for s in successes for p in s]) else: return Err([e for errs in errors for e in errs]) def inner(input: LexStream) -> ParserResult[Out2, TokN]: return self.parse(input) << handle_results return Parser(inner) def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]': """ Transform the output of some parser with a function This is a particularly useful method, because it allows converting parsers which return lexemes (e.g. `Parser.token()`) into parsers that return other thing. As an example, here's a parser which parses a number, and returns it as a number. >>> parse_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string)) >>> parse_num.parse(lex_trivial('3')) Ok([(3, [[EOF: '']@(1, 2-2)])]) """ return self.bind(c(Parser.epsilon, f)) #type: ignore def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]': """ Apply the function which this returns to the value produced by another parser Equivalent to the fapply method of an applicative functor. """ return self.bind(p(Parser.map, arg)) #type: ignore def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]': """ A reversed version of `fapply()` Applies the function returned by the argument to the value returned by this. """ return self.bind(lambda v: arg.map(lambda f: f(v))) def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]': """ Parses two things in series, ignoring the output of the second parser Example: Parse a number followed by any letter (ignored) >>> parse_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string)) >>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR)) >>> parse_numlet.parse(lex_trivial('4a')) Ok([(4, [[EOF: '']@(1, 3-3)])]) """ return self.map(k).fapply(subsequent) #type:ignore def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]': """ A shorthand for calling `seq_ignore()` with `Parser.token` Example: Parse a number followed by any letter (ignored) >>> parse_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string)) >>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR) >>> parse_numlet.parse(lex_trivial('4a')) Ok([(4, [[EOF: '']@(1, 3-3)])]) """ return self.seq_ignore(Parser.token(subsequent)) def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]': """ Returns a parser which succeeds if this or any arguments succeed **Example:** A parser which parses a letter or a number >>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR)) >>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS Ok(...) >>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS Ok(...) Notice that this can produce multiple successes. A simple example would be a parser which parses either a single number or an empty string (epsilon). When faced with an input stream starting with a number, it could either parse that number, returning the rest of the input stream, or parse the empty string, returning the input unchanged. >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string))\\ ... .or_(Parser.epsilon(-1)) >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE Ok([(3, [[EOF: '']@(1, 2-2)]), (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])]) Of course, this can also produce multiple failures as well: >>> parse_or.parse(lex_trivial('+')) Err([([ADD: '+']@(1, 1-1), NUM), ([ADD: '+']@(1, 1-1), LTR)]) """ all_parsers = (self, *parsers) def inner(input: LexStream) -> ParserResult[Out, TokN]: successes, failures = partition([p.parse(input) for p in all_parsers]) if len(successes): return Ok([successful_path for success in successes for successful_path in success]) else: return Err([expectation for failure in failures for expectation in failure]) return Parser(inner) def opt(self, fallback: Out) -> 'Parser[Out, TokN]': """ Parse one or zero of this thing, returning fallback if not parsing it We can use this to write a simpler example of parse_maybe_num from the `or_()` example: >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string))\\ ... .opt(-1) >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE Ok([(3, [[EOF: '']@(1, 2-2)]), (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])]) """ return self.or_(Parser.epsilon(fallback)) def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]': """ Create a new parser which accepts any number of instances of this parser The combine argument is a function which joins the result of this parser with the result of parsing the 0 or more identical parsers on the right. Think of this like a reduce. As an example, here's a parser which parses any number of numbers, and sums them all together. We start with the single number parser from the `map()` example. >>> parse_num = Parser.token(TrivialTok.NUM)\\ ... .map(lambda l: int(l.matched_string)) Then we call `many()` on it. We also add the EOF parser to the end to force it to parse the whole input. >>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\ ... .seq_ignore_tok(TrivialTok.EOF) **Sample Run 1:** Sum of the numbers 1, 2, and 3. This produces the operation (1 + (2 + (3 + 0)) >>> parse_nums.parse(lex_trivial('1 2 3')) Ok([(6, [])]) **Sample Run 2:** If attempting to sum no numbers, we get back the default argument, zero in this case. >>> parse_nums.parse(lex_trivial('')) Ok([(0, [])]) """ return (self.map(combine) .fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore .opt(default)) if __name__ == '__main__': import doctest doctest.testmod()