From 08547aea2f4dcd059341b1e92f6acde3c90c11a6 Mon Sep 17 00:00:00 2001
From: Emi Simpson
Date: Tue, 7 Mar 2023 11:05:10 -0500
Subject: [PATCH] Add a combinator parser

---
 comb_parse.py | 430 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 430 insertions(+)
 create mode 100644 comb_parse.py

diff --git a/comb_parse.py b/comb_parse.py
new file mode 100644
index 0000000..6d678bc
--- /dev/null
+++ b/comb_parse.py
@@ -0,0 +1,430 @@
+from emis_funky_funktions import *
+
+from dataclasses import dataclass
+from enum import auto, IntEnum
+from functools import reduce
+from re import compile, Pattern
+
+from lex import Lexeme, tokenize
+from parse import Action
+
+from typing import Any, Callable, Collection, Generic, List, Mapping, Sequence, Tuple, TypeAlias, TypeVar
+
+# Grammar
+# S := <P> Eof
+# P := <M> + <P>
+# P := <M>
+# M := <T> * <M>
+# M := <T>
+# T := Letter
+# T := Number
+# T := ( <P> )
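+#
+# For illustration, one possible derivation of the string "1 + 2" under these
+# productions (a sketch, applying one rule at each step):
+#
+#   S -> <P> Eof -> <M> + <P> Eof -> <T> + <P> Eof -> 1 + <P> Eof
+#     -> 1 + <M> Eof -> 1 + <T> Eof -> 1 + 2 Eof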
+
+class TrivialTok(IntEnum):
+    "A set of tokens for a trivial grammar used for testing"
+
+    EOF = auto()
+    "Special token: End of file"
+
+    ADD = auto()
+    "Addition (+)"
+
+    MUL = auto()
+    "Multiplication (*)"
+
+    LTR = auto()
+    "A single letter (x)"
+
+    NUM = auto()
+    "A number (3)"
+
+    OPN = auto()
+    "An open paren, i.e. '('"
+
+    CLS = auto()
+    "A close paren, i.e. ')'"
+
+    NUL = auto()
+    "Whitespace ( )"
+
+    def __repr__(self):
+        return self._name_
+
+TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
+    (compile(r"\+"      ), TrivialTok.ADD),
+    (compile(r"\*"      ), TrivialTok.MUL),
+    (compile(r"[a-zA-Z]"), TrivialTok.LTR),
+    (compile(r"\d+"     ), TrivialTok.NUM),
+    (compile(r"\("      ), TrivialTok.OPN),
+    (compile(r"\)"      ), TrivialTok.CLS),
+    (compile(r"\s+"     ), TrivialTok.NUL),
+]
+"""
+A mapping of regexes to the tokens they identify in a trivial grammar for testing
+
+Tokens earlier on in the list should be regarded as higher priority, even if a match
+lower on the list also matches. All unicode strings should be matched by at least one
+token.
+"""
+
+lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\
+    c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
+"""
+A lexer for the trivial grammar defined above
+
+Throws an error if the lex fails.
+
+>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
+[[NUM: '1']@(1, 1-1),
+ [ADD: '+']@(1, 3-3),
+ [NUM: '3']@(1, 5-5),
+ [EOF: '']@(1, 6-6)]
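+
+As a further sketch (assuming `tokenize` behaves as in the example above), adjacent
+tokens need no whitespace between them:
+
+>>> lex_trivial("x3") #doctest: +NORMALIZE_WHITESPACE
+[[LTR: 'x']@(1, 1-1),
+ [NUM: '3']@(1, 2-2),
+ [EOF: '']@(1, 3-3)]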
+"""
+
+############# Combinator Parsing ################
+
+Out = TypeVar('Out')
+Out1 = TypeVar('Out1')
+Out2 = TypeVar('Out2')
+TokN = TypeVar('TokN')
+
+# A stream of lexemes, as consumed by a parser
+LexStream: TypeAlias = Sequence[Lexeme[TokN]]
+
+# Ok: every possible (parse output, remaining input) pair
+# Err: every (offending lexeme, expected token) pair encountered
+ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]
+
+@dataclass(frozen=True)
+class Parser(Generic[Out, TokN]):
+    """
+    A parser which consumes a token stream and produces a series of possible parses
+
+    Each possible parse consists of the subsection of the input stream left over after
+    the parse, along with the output of that parse.
+
+    If the parse fails, the error returned will be a series of two-tuples, each
+    containing the lexeme at which the error occurred and the token which was expected
+    in its stead.
+
+    By the nature of combinator parsing, parsers can be built and tested in small
+    pieces. To this end, each combinator method on this class comes with a small
+    example demonstrating how to build a trivial parser using that method. However, in
+    the following section, we also provide an example for a grammar that is still
+    trivial, but slightly less so.
+
+    ### Example: Arithmetic Grammar
+
+    Let us define the following grammar for basic additive and multiplicative
+    arithmetic:
+
+    ```
+    S := <P> Eof
+    P := <M> + <P>
+    P := <M>
+    M := <T> * <M>
+    M := <T>
+    T := Number
+    T := ( <P> )
+    ```
+
+    As with the rest of the examples in this class, we will use the `TrivialTok` token
+    class to build this parser.
+
+    Working our way from the bottom up, we start with defining a parser for T. For the
+    parenthetical production, we use a lazy parser to refer to the parser for P, which
+    hasn't been constructed yet.
+
+    >>> parse_parens = Parser.token(TrivialTok.OPN)\\
+    ...     .bind(k(Parser.lazy(lambda: parse_p)))\\
+    ...     .seq_ignore_tok(TrivialTok.CLS)
+    >>> parse_num = Parser.token(TrivialTok.NUM)\\
+    ...     .map(c(int, Lexeme.get_match))
+    >>> parse_t = parse_parens.or_(parse_num)
+
+    For multiplication, we use `Parser.many()` to represent any number of "* <T>"
+    matches and combine them together. This is a slight departure from how our grammar
+    is written above. The set of accepted inputs will be the same, but our
+    implementation will be left-associative. It is possible to implement this grammar
+    as it is written, but this will result in a right-associative structure. Of course,
+    multiplication and addition are associative, so in this case it doesn't matter.
+
+    >>> parse_times = Parser.token(TrivialTok.MUL)\\
+    ...     .map(k(lambda x: lambda y: x * y))\\
+    ...     .fapply(parse_t)\\
+    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
+    >>> parse_m = parse_t.fapply_r(parse_times)
+
+    Addition is largely the same as multiplication:
+
+    >>> parse_plus = Parser.token(TrivialTok.ADD)\\
+    ...     .map(k(lambda x: lambda y: x + y))\\
+    ...     .fapply(parse_m)\\
+    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
+    >>> parse_p = parse_m.fapply_r(parse_plus)
+
+    And finally, we expect an EOF after the end of the term, to ensure that we've
+    reached the end of the input.
+
+    >>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)
+
+    And now, for a few sample runs:
+
+    >>> parse_s.parse(lex_trivial('1'))
+    Ok([(1, [])])
+
+    >>> parse_s.parse(lex_trivial('1 + 2'))
+    Ok([(3, [])])
+
+    >>> parse_s.parse(lex_trivial('1 + 2 * 3 + 4'))
+    Ok([(11, [])])
+
+    >>> parse_s.parse(lex_trivial('(1 + 2) * (3 + 4)'))
+    Ok([(21, [])])
+
+    >>> parse_s.parse(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
+    Ok([(69, [])])
+    """
+
+    parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
+
+    @staticmethod
+    def epsilon(ret: Out) -> 'Parser[Out, TokN]':
+        """
+        Parse an empty string, then return a constant
+
+        Always succeeds, and always produces exactly one possibility
+
+        >>> Parser.epsilon(100).parse(lex_trivial("+"))
+        Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
+        """
+        return Parser(lambda s: Ok(((ret, s),)))
+
+    @staticmethod
+    def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]':
+        """
+        Parser that only accepts a single token, and returns the parsed lexeme
+
+        The argument is the token which this parser should accept.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)
+
+        >>> parse_num.parse(lex_trivial('3'))
+        Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))
+
+        >>> parse_num.parse(lex_trivial('x'))
+        Err((([LTR: 'x']@(1, 1-1), NUM),))
+        """
+        def parse_single(input: LexStream[TokN]) -> ParserResult[Lexeme[TokN], TokN]:
+            match input:
+                case [lexeme, *rest] if lexeme.token == t:
+                    return Ok(((lexeme, rest),))
+                case [bad_lexeme, *rest]:
+                    return Err(((bad_lexeme, t),))
+                case []:
+                    # Only reachable if a grammar tries to parse past its EOF token
+                    raise Exception('Bad grammar! Reached an empty input')
+            raise Exception('Unreachable')
+        return Parser(parse_single)
+
+    @staticmethod
+    def lazy(
+            gen: 'Callable[[], Parser[Out, TokN]]',
+            ) -> 'Parser[Out, TokN]':
+        """
+        A stand-in parser which will only be actually computed when called
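+
+        This is chiefly useful for forward references when defining recursive parsers,
+        as in the arithmetic example in the class docstring. As a minimal sketch (the
+        names here are hypothetical, not part of the API), note that the wrapped parser
+        may be defined *after* the lazy parser which refers to it:
+
+        >>> parse_later = Parser.lazy(lambda: parse_plus)
+        >>> parse_plus = Parser.token(TrivialTok.ADD)
+        >>> parse_later.parse(lex_trivial('+'))
+        Ok((([ADD: '+']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))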
+        """
+        return Parser(lambda s: gen().parse(s))
+
+    def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
+        """
+        A monadic bind operator - allows a parser to be generated from the output of
+        its precedent
+
+        ### Example
+
+        We generate a parser which reads a number then accepts exactly that many plus
+        tokens, returning the last one.
+
+        >>> some_n = Parser.token(TrivialTok.NUM)
+        >>> n_plus = some_n.bind(lambda prev_result:
+        ...     reduce(
+        ...         lambda a, p: a.bind(k(p)),
+        ...         [
+        ...             Parser.token(TrivialTok.ADD)
+        ...             for i in range(int(prev_result.matched_string))
+        ...         ]
+        ...     )
+        ... )
+
+        **Sample Run 1**: We parse the string `3+++`. Since this is a three followed by
+        exactly three plus signs, this should parse successfully. Sure enough, the
+        result contains exactly one possibility, where the `3+++` has been consumed,
+        leaving only the EOF, and returning the value of the last plus sign.
+
+        >>> n_plus.parse(lex_trivial('3+++'))
+        Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])
+
+        **Sample Run 2**: We parse the string `3++`. This only has two of the three
+        plus signs, so we should expect it to fail. As expected, it does, correctly
+        identifying that it saw an EOF while expecting an ADD.
+
+        >>> n_plus.parse(lex_trivial('3++'))
+        Err([([EOF: '']@(1, 4-4), ADD)])
+        """
+        def handle_results(results: Collection[Tuple[Out, LexStream[TokN]]]) -> ParserResult[Out2, TokN]:
+            successes, errors = partition([
+                f(out1).parse(stream)
+                for (out1, stream) in results
+            ])
+            if len(successes):
+                return Ok([p for s in successes for p in s])
+            else:
+                return Err([e for errs in errors for e in errs])
+        def inner(input: LexStream[TokN]) -> ParserResult[Out2, TokN]:
+            return self.parse(input) << handle_results
+        return Parser(inner)
+
+    def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]':
+        """
+        Transform the output of some parser with a function
+
+        This is a particularly useful method, because it allows converting parsers
+        which return lexemes (e.g. `Parser.token()`) into parsers that return other
+        things.
+
+        As an example, here's a parser which parses a number, and returns it as an
+        `int`.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_num.parse(lex_trivial('3'))
+        Ok([(3, [[EOF: '']@(1, 2-2)])])
+        """
+        return self.bind(c(Parser.epsilon, f)) #type: ignore
+
+    def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
+        """
+        Apply the function returned by this parser to the value produced by another
+        parser
+
+        Equivalent to the fapply method of an applicative functor.
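+
+        As a sketch of how this might be used (the names here are hypothetical, not
+        part of the API), we parse two numbers and add them by mapping the first parser
+        to a curried function and applying it to the second:
+
+        >>> parse_sum = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: lambda r: int(l.matched_string) + int(r.matched_string))\\
+        ...     .fapply(Parser.token(TrivialTok.NUM))
+        >>> parse_sum.parse(lex_trivial('1 2'))
+        Ok([(3, [[EOF: '']@(1, 4-4)])])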
+        """
+        return self.bind(p(Parser.map, arg)) #type: ignore
+
+    def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
+        """
+        A reversed version of `fapply()`
+
+        Applies the function returned by the argument to the value returned by this.
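+
+        A sketch mirroring the `fapply()` example, with the roles reversed (the names
+        are again hypothetical): the first parser produces a value, and the second
+        produces the function applied to it.
+
+        >>> parse_inc = Parser.token(TrivialTok.ADD).map(k(lambda n: n + 1))
+        >>> parse_num = Parser.token(TrivialTok.NUM).map(lambda l: int(l.matched_string))
+        >>> parse_num.fapply_r(parse_inc).parse(lex_trivial('3+'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])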
+        """
+        return self.bind(lambda v: arg.map(lambda f: f(v)))
+
+    def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
+        """
+        Parses two things in series, ignoring the output of the second parser
+
+        Example: Parse a number followed by any letter (ignored)
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
+        >>> parse_numlet.parse(lex_trivial('4a'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])
+        """
+        return self.map(k).fapply(subsequent) #type:ignore
+
+    def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
+        """
+        A shorthand for calling `seq_ignore()` with `Parser.token()`
+
+        Example: Parse a number followed by any letter (ignored)
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+        >>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
+        >>> parse_numlet.parse(lex_trivial('4a'))
+        Ok([(4, [[EOF: '']@(1, 3-3)])])
+        """
+        return self.seq_ignore(Parser.token(subsequent))
+
+    def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
+        """
+        Returns a parser which succeeds if this parser or any of the argument parsers
+        succeed
+
+        **Example:** A parser which parses a letter or a number
+
+        >>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
+        >>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
+        Ok(...)
+        >>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
+        Ok(...)
+
+        Notice that this can produce multiple successes. A simple example would be a
+        parser which parses either a single number or an empty string (epsilon). When
+        faced with an input stream starting with a number, it could either parse that
+        number, returning the rest of the input stream, or parse the empty string,
+        returning the input unchanged.
+
+        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))\\
+        ...     .or_(Parser.epsilon(-1))
+        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
+        Ok([(3, [[EOF: '']@(1, 2-2)]),
+            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
+        """
+        all_parsers = (self, *parsers)
+        def inner(input: LexStream[TokN]) -> ParserResult[Out, TokN]:
+            successes, failures = partition([p.parse(input) for p in all_parsers])
+            if len(successes):
+                return Ok([successful_path for success in successes for successful_path in success])
+            else:
+                return Err([expectation for failure in failures for expectation in failure])
+        return Parser(inner)
+
+    def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
+        """
+        Parse zero or one instances of this parser, returning `fallback` in the zero
+        case
+
+        We can use this to write a simpler version of parse_maybe_num from the `or_()`
+        example:
+
+        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))\\
+        ...     .opt(-1)
+        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
+        Ok([(3, [[EOF: '']@(1, 2-2)]),
+            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
+        """
+        return self.or_(Parser.epsilon(fallback))
+
+    def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
+        """
+        Create a new parser which accepts any number of instances of this parser
+
+        The combine argument is a function which joins the result of this parser with
+        the result of parsing the zero or more identical parsers on the right. Think of
+        this like a reduce.
+
+        As an example, here's a parser which parses any number of numbers, and sums
+        them all together.
+
+        We start with the single number parser from the `map()` example.
+
+        >>> parse_num = Parser.token(TrivialTok.NUM)\\
+        ...     .map(lambda l: int(l.matched_string))
+
+        Then we call `many()` on it. We also add the EOF parser to the end to force it
+        to parse the whole input.
+
+        >>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
+        ...     .seq_ignore_tok(TrivialTok.EOF)
+
+        **Sample Run 1:** Sum of the numbers 1, 2, and 3. This produces the operation
+        (1 + (2 + (3 + 0))).
+
+        >>> parse_nums.parse(lex_trivial('1 2 3'))
+        Ok([(6, [])])
+
+        **Sample Run 2:** If attempting to sum no numbers, we get back the default
+        argument, zero in this case.
+
+        >>> parse_nums.parse(lex_trivial(''))
+        Ok([(0, [])])
+        """
+        return (self.map(combine)
+            .fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
+            .opt(default))
+
+if __name__ == '__main__':
+    import doctest
+    doctest.testmod()
\ No newline at end of file