From 08547aea2f4dcd059341b1e92f6acde3c90c11a6 Mon Sep 17 00:00:00 2001
From: Emi Simpson
Date: Tue, 7 Mar 2023 11:05:10 -0500
Subject: [PATCH] Add a combinator parser

---
 comb_parse.py | 430 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 430 insertions(+)
 create mode 100644 comb_parse.py

diff --git a/comb_parse.py b/comb_parse.py
new file mode 100644
index 0000000..6d678bc
--- /dev/null
+++ b/comb_parse.py
@@ -0,0 +1,430 @@
+from emis_funky_funktions import *
+
+from dataclasses import dataclass
+from enum import auto, IntEnum
+from functools import reduce
+from re import compile, Pattern
+
+from lex import Lexeme, tokenize
+from parse import Action
+
+from typing import Any, Callable, Collection, Generic, List, Mapping, Sequence, Tuple, TypeAlias, TypeVar
+
+# Grammar
+# S := <P> Eof
+# P := <M> + <P>
+# P := <M>
+# M := <T> * <M>
+# M := <T>
+# T := Letter
+# T := Number
+# T := ( <P> )
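+#
+# For illustration, one possible derivation of the string "1 + 2" under these
+# productions (a sketch, applying one rule at each step):
+#
+#   S -> <P> Eof -> <M> + <P> Eof -> <T> + <P> Eof -> 1 + <P> Eof
+#     -> 1 + <M> Eof -> 1 + <T> Eof -> 1 + 2 Eof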
+
+class TrivialTok(IntEnum):
+    "A set of tokens for a trivial grammar used for testing"
+
+    EOF = auto()
+    "Special token: End of file"
+
+    ADD = auto()
+    "Addition (+)"
+
+    MUL = auto()
+    "Multiplication (*)"
+
+    LTR = auto()
+    "A single letter (x)"
+
+    NUM = auto()
+    "A number (3)"
+
+    OPN = auto()
+    "An open paren, i.e. '('"
+
+    CLS = auto()
+    "A close paren, i.e. ')'"
+
+    NUL = auto()
+    "Whitespace ( )"
+
+    def __repr__(self):
+        return self._name_
+
+TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
+    (compile(r"\+"      ), TrivialTok.ADD),
+    (compile(r"\*"      ), TrivialTok.MUL),
+    (compile(r"[a-zA-Z]"), TrivialTok.LTR),
+    (compile(r"\d+"     ), TrivialTok.NUM),
+    (compile(r"\("      ), TrivialTok.OPN),
+    (compile(r"\)"      ), TrivialTok.CLS),
+    (compile(r"\s+"     ), TrivialTok.NUL),
+]
+"""
+A mapping of regexes to the tokens they identify in a trivial grammar for testing
+
+Tokens earlier on in the list should be regarded as higher priority, even if a match
+lower on the list also matches. All unicode strings should be matched by at least one
+token.
+"""
+
+lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\
+    c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
+"""
+A lexer for the trivial grammar defined above
+
+Throws an error if the lex fails.
+
+>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
+[[NUM: '1']@(1, 1-1),
+ [ADD: '+']@(1, 3-3),
+ [NUM: '3']@(1, 5-5),
+ [EOF: '']@(1, 6-6)]
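+
+As a further sketch (assuming `tokenize` behaves as in the example above), adjacent
+tokens need no whitespace between them:
+
+>>> lex_trivial("x3") #doctest: +NORMALIZE_WHITESPACE
+[[LTR: 'x']@(1, 1-1),
+ [NUM: '3']@(1, 2-2),
+ [EOF: '']@(1, 3-3)]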
+"""
+
+############# Combinator Parsing ################
+
+Out = TypeVar('Out')
+Out1 = TypeVar('Out1')
+Out2 = TypeVar('Out2')
+TokN = TypeVar('TokN')
+
+# A stream of lexemes, as consumed by a parser
+LexStream: TypeAlias = Sequence[Lexeme[TokN]]
+
+# Ok: every possible (parse output, remaining input) pair
+# Err: every (offending lexeme, expected token) pair encountered
+ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]
+
+@dataclass(frozen=True)
+class Parser(Generic[Out, TokN]):
+    """
+    A parser which consumes a token stream and produces a series of possible parses
+
+    Each possible parse consists of the subsection of the input stream left over after
+    the parse, along with the output of that parse.
+
+    If the parse fails, the error returned will be a series of two-tuples, each
+    containing the lexeme at which the error occurred and the token which was expected
+    in its stead.
+
+    By the nature of combinator parsing, parsers can be built and tested in small
+    pieces. To this end, each combinator method on this class comes with a small
+    example demonstrating how to build a trivial parser using that method. However, in
+    the following section, we also provide an example for a grammar that is still
+    trivial, but slightly less so.
+
+    ### Example: Arithmetic Grammar
+
+    Let us define the following grammar for basic additive and multiplicative
+    arithmetic:
+
+    ```
+    S := <P> Eof
+    P := <M> + <P>
+    P := <M>
+    M := <T> * <M>
+    M := <T>
+    T := Number
+    T := ( <P> )
+    ```
+
+    As with the rest of the examples in this class, we will use the `TrivialTok` token
+    class to build this parser.
+
+    Working our way from the bottom up, we start with defining a parser for T. For the
+    parenthetical production, we use a lazy parser to refer to the parser for P, which
+    hasn't been constructed yet.
+
+    >>> parse_parens = Parser.token(TrivialTok.OPN)\\
+    ...     .bind(k(Parser.lazy(lambda: parse_p)))\\
+    ...     .seq_ignore_tok(TrivialTok.CLS)
+    >>> parse_num = Parser.token(TrivialTok.NUM)\\
+    ...     .map(c(int, Lexeme.get_match))
+    >>> parse_t = parse_parens.or_(parse_num)
+
+    For multiplication, we use `Parser.many()` to represent any number of "* <T>"
+    matches and combine them together. This is a slight departure from how our grammar
+    is written above. The set of accepted inputs will be the same, but our
+    implementation will be left-associative. It is possible to implement this grammar
+    as it is written, but this will result in a right-associative structure. Of course,
+    multiplication and addition are associative, so in this case it doesn't matter.
+
+    >>> parse_times = Parser.token(TrivialTok.MUL)\\
+    ...     .map(k(lambda x: lambda y: x * y))\\
+    ...     .fapply(parse_t)\\
+    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
+    >>> parse_m = parse_t.fapply_r(parse_times)
+
+    Addition is largely the same as multiplication:
+
+    >>> parse_plus = Parser.token(TrivialTok.ADD)\\
+    ...     .map(k(lambda x: lambda y: x + y))\\
+    ...     .fapply(parse_m)\\
+    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
+    >>> parse_p = parse_m.fapply_r(parse_plus)
+
+    And finally, we expect an EOF after the end of the term, to ensure that we've
+    reached the end of the input.
+
+    >>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)
+
+    And now, for a few sample runs:
+
+    >>> parse_s.parse(lex_trivial('1'))
+    Ok([(1, [])])
+
+    >>> parse_s.parse(lex_trivial('1 + 2'))
+    Ok([(3, [])])
+
+    >>> parse_s.parse(lex_trivial('1 + 2 * 3 + 4'))
+    Ok([(11, [])])
+
+    >>> parse_s.parse(lex_trivial('(1 + 2) * (3 + 4)'))
+    Ok([(21, [])])
+
+    >>> parse_s.parse(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
+    Ok([(69, [])])
+    """
+
+    parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
+
+    @staticmethod
+    def epsilon(ret: Out) -> 'Parser[Out, TokN]':
+        """
+        Parse an empty string, then return a constant
+
+        Always succeeds, and always produces exactly one possibility
+
+        >>> Parser.epsilon(100).parse(lex_trivial("+"))
+        Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
+        """
+        return Parser(lambda s: Ok(((ret, s),)))
+
+    @staticmethod
+    def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]':
+        """
+        Parser that only accepts a single token, and returns the parsed lexeme
+
+        The argument is the token which this parser should accept.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)
+
+        >>> parse_num.parse(lex_trivial('3'))
+        Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))
+
+        >>> parse_num.parse(lex_trivial('x'))
+        Err((([LTR: 'x']@(1, 1-1), NUM),))
+        """
+        def parse_single(input: LexStream[TokN]) -> ParserResult[Lexeme[TokN], TokN]:
+            match input:
+                case [lexeme, *rest] if lexeme.token == t:
+                    return Ok(((lexeme, rest),))
+                case [bad_lexeme, *rest]:
+                    return Err(((bad_lexeme, t),))
+                case []:
+                    # Only reachable if a grammar tries to parse past its EOF token
+                    raise Exception('Bad grammar! Reached an empty input')
+            raise Exception('Unreachable')
+        return Parser(parse_single)
+
+    @staticmethod
+    def lazy(
+            gen: 'Callable[[], Parser[Out, TokN]]',
+            ) -> 'Parser[Out, TokN]':
+        """
+        A stand-in parser which will only be actually computed when called
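+
+        This is chiefly useful for forward references when defining recursive parsers,
+        as in the arithmetic example in the class docstring. As a minimal sketch (the
+        names here are hypothetical, not part of the API), note that the wrapped parser
+        may be defined *after* the lazy parser which refers to it:
+
+        >>> parse_later = Parser.lazy(lambda: parse_plus)
+        >>> parse_plus = Parser.token(TrivialTok.ADD)
+        >>> parse_later.parse(lex_trivial('+'))
+        Ok((([ADD: '+']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))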
+        """
+        return Parser(lambda s: gen().parse(s))
+
+    def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
+        """
+        A monadic bind operator - allows a parser to be generated from the output of
+        its precedent
+
+        ### Example
+
+        We generate a parser which reads a number then accepts exactly that many plus
+        tokens, returning the last one.
+
+        >>> some_n = Parser.token(TrivialTok.NUM)
+        >>> n_plus = some_n.bind(lambda prev_result:
+        ...     reduce(
+        ...         lambda a, p: a.bind(k(p)),
+        ...         [
+        ...             Parser.token(TrivialTok.ADD)
+        ...             for i in range(int(prev_result.matched_string))
+        ...         ]
+        ...     )
+        ... )
+
+        **Sample Run 1**: We parse the string `3+++`. Since this is a three followed by
+        exactly three plus signs, this should parse successfully. Sure enough, the
+        result contains exactly one possibility, where the `3+++` has been consumed,
+        leaving only the EOF, and returning the value of the last plus sign.
+
+        >>> n_plus.parse(lex_trivial('3+++'))
+        Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])
+
+        **Sample Run 2**: We parse the string `3++`. This only has two of the three
+        plus signs, so we should expect it to fail. As expected, it does, correctly
+        identifying that it saw an EOF while expecting an ADD.
+
+        >>> n_plus.parse(lex_trivial('3++'))
+        Err([([EOF: '']@(1, 4-4), ADD)])
+        """
+        def handle_results(results: Collection[Tuple[Out, LexStream[TokN]]]) -> ParserResult[Out2, TokN]:
+            successes, errors = partition([
+                f(out1).parse(stream)
+                for (out1, stream) in results
+            ])
+            if len(successes):
+                return Ok([p for s in successes for p in s])
+            else:
+                return Err([e for errs in errors for e in errs])
+        def inner(input: LexStream[TokN]) -> ParserResult[Out2, TokN]:
+            return self.parse(input) << handle_results
+        return Parser(inner)
+
+    def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]':
+        """
+        Transform the output of some parser with a function
+
+        This is a particularly useful method, because it allows converting parsers
+        which return lexemes (e.g. `Parser.token()`) into parsers that return other
+        things.
+
+        As an example, here's a parser which parses a number, and returns it as an
+        `int`.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_num.parse(lex_trivial('3'))
+        Ok([(3, [[EOF: '']@(1, 2-2)])])
+        """
+        return self.bind(c(Parser.epsilon, f)) #type: ignore
+
+    def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
+        """
+        Apply the function returned by this parser to the value produced by another
+        parser
+
+        Equivalent to the fapply method of an applicative functor.
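+
+        As a sketch of how this might be used (the names here are hypothetical, not
+        part of the API), we parse two numbers and add them by mapping the first parser
+        to a curried function and applying it to the second:
+
+        >>> parse_sum = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: lambda r: int(l.matched_string) + int(r.matched_string))\\
+        ...     .fapply(Parser.token(TrivialTok.NUM))
+        >>> parse_sum.parse(lex_trivial('1 2'))
+        Ok([(3, [[EOF: '']@(1, 4-4)])])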
+        """
+        return self.bind(p(Parser.map, arg)) #type: ignore
+
+    def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
+        """
+        A reversed version of `fapply()`
+
+        Applies the function returned by the argument to the value returned by this.
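+
+        A sketch mirroring the `fapply()` example, with the roles reversed (the names
+        are again hypothetical): the first parser produces a value, and the second
+        produces the function applied to it.
+
+        >>> parse_inc = Parser.token(TrivialTok.ADD).map(k(lambda n: n + 1))
+        >>> parse_num = Parser.token(TrivialTok.NUM).map(lambda l: int(l.matched_string))
+        >>> parse_num.fapply_r(parse_inc).parse(lex_trivial('3+'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])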
+        """
+        return self.bind(lambda v: arg.map(lambda f: f(v)))
+
+    def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
+        """
+        Parses two things in series, ignoring the output of the second parser
+
+        Example: Parse a number followed by any letter (ignored)
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
+        >>> parse_numlet.parse(lex_trivial('4a'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])
+        """
+        return self.map(k).fapply(subsequent) #type:ignore
+
+    def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
+        """
+        A shorthand for calling `seq_ignore()` with `Parser.token()`
+
+        Example: Parse a number followed by any letter (ignored)
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
+        >>> parse_numlet.parse(lex_trivial('4a'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])
+        """
+        return self.seq_ignore(Parser.token(subsequent))
+
+    def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
+        """
+        Returns a parser which succeeds if this parser or any of the argument parsers
+        succeed
+
+        **Example:** A parser which parses a letter or a number
+
+        >>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
+        >>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
+        Ok(...)
+        >>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
+        Ok(...)
+
+        Notice that this can produce multiple successes. A simple example would be a
+        parser which parses either a single number or an empty string (epsilon). When
+        faced with an input stream starting with a number, it could either parse that
+        number, returning the rest of the input stream, or parse the empty string,
+        returning the input unchanged.
+
+        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))\\
+        ...     .or_(Parser.epsilon(-1))
+        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
+        Ok([(3, [[EOF: '']@(1, 2-2)]),
+            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
+        """
+        all_parsers = (self, *parsers)
+        def inner(input: LexStream[TokN]) -> ParserResult[Out, TokN]:
+            successes, failures = partition([p.parse(input) for p in all_parsers])
+            if len(successes):
+                return Ok([successful_path for success in successes for successful_path in success])
+            else:
+                return Err([expectation for failure in failures for expectation in failure])
+        return Parser(inner)
+
+    def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
+        """
+        Parse zero or one instances of this parser, returning `fallback` in the zero
+        case
+
+        We can use this to write a simpler version of parse_maybe_num from the `or_()`
+        example:
+
+        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))\\
+        ...     .opt(-1)
+        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
+        Ok([(3, [[EOF: '']@(1, 2-2)]),
+            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
+        """
+        return self.or_(Parser.epsilon(fallback))
+
+    def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
+        """
+        Create a new parser which accepts any number of instances of this parser
+
+        The combine argument is a function which joins the result of this parser with
+        the result of parsing the zero or more identical parsers on the right. Think of
+        this like a reduce.
+
+        As an example, here's a parser which parses any number of numbers, and sums
+        them all together.
+
+        We start with the single number parser from the `map()` example.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+
+        Then we call `many()` on it. We also add the EOF parser to the end to force it
+        to parse the whole input.
+
+        >>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
+        ...     .seq_ignore_tok(TrivialTok.EOF)
+
+        **Sample Run 1:** Sum of the numbers 1, 2, and 3. This produces the operation
+        (1 + (2 + (3 + 0))).
+
+        >>> parse_nums.parse(lex_trivial('1 2 3'))
+        Ok([(6, [])])
+
+        **Sample Run 2:** If attempting to sum no numbers, we get back the default
+        argument, zero in this case.
+
+        >>> parse_nums.parse(lex_trivial(''))
+        Ok([(0, [])])
+        """
+        return (self.map(combine)
+            .fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
+            .opt(default))
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
\ No newline at end of file