JSON-Lang/parse/comb_parse.py

from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from functools import reduce
from re import compile, Pattern

from parse.lex import Lexeme, tokenize

from typing import Any, Callable, Collection, Mapping, Sequence, Tuple, TypeAlias

# Grammar
# S := <P> Eof
# P := <M> + <P>
# P := <M>
# M := <T> * <M>
# M := <T>
# T := Letter
# T := Number
# T := ( <P> )

class TrivialTok(IntEnum):
	"A set of tokens for a trivial grammar used for testing"

	EOF = auto()
	"Special token: End of file"

	ADD = auto()
	"Addition (+)"

	MUL = auto()
	"Multiplication (*)"

	LTR = auto()
	"A single letter (x)"

	NUM = auto()
	"A number (3)"

	OPN = auto()
	"An open paren, i.e. '('"

	CLS = auto()
	"An close paren, i.e. ')'"

	NUL = auto()
	"Whitespace ( )"

	def __repr__(self):
		return self._name_

TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
	(compile(r"\+"      ), TrivialTok.ADD),
	(compile(r"\*"      ), TrivialTok.MUL),
	(compile(r"[a-zA-Z]"), TrivialTok.LTR),
	(compile("\d+"      ), TrivialTok.NUM),
	(compile("\("       ), TrivialTok.OPN),
	(compile("\)"       ), TrivialTok.CLS),
	(compile("\s+"      ), TrivialTok.NUL),
]
"""
A mapping of regexs to the tokens the identify in a trivial grammar for testing

Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches.  All unicode strings should be matched by at least one token.
"""

lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\
	c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
"""
A lexer for the trivial grammar defined above

Throws an error if the lex fails.

>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
[[NUM: '1']@(1, 1-1),
 [ADD: '+']@(1, 3-3),
 [NUM: '3']@(1, 5-5),
 [EOF: '']@(1, 6-6)]
"""

############# Combinator Parsing ################

Out = TypeVar('Out')
Out1 = TypeVar('Out1')
Out2 = TypeVar('Out2')
TokN = TypeVar('TokN')
LexStream: TypeAlias = Sequence[Lexeme[TokN]]
ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]

@dataclass(frozen=True)
class Parser(Generic[Out, TokN]):
	"""
	A parser which consumes a token stream and produces a series of possible parses

	Each possible parse consists of the subsection of the input stream left over after the
	parse, along with the output of that parse.

	If the parse fails, the error returned will be a series of two-tuples, each containing
	the lexeme at which the error occurred and the token which was expected in its stead.

	By the nature of combinator parsing, parsers can be built and tested in small pieces.
	To this end, each combinator method on this class comes with a small example
	demonstrating how to build a trivial parser using that method.  However, in the
	following section, we also provide an example for a grammar that is still trivial, but
	slightly less so.

	### Example:  Arithmetic Grammar

	Let us define the following grammar for basic additive and multiplicative arithmetic

	```
	S := <P> Eof
	P := <M> + <P>
	P := <M>
	M := <T> * <M>
	M := <T>
	T := Number
	T := ( <P> )
	```

	As with the rest of the examples in this class, we will use the `TrivialTok` token
	class to build this parser.

	Working our way from the bottom up, we start with defining a parser for T.  For the
	parenthetical production, we use a lazy parser to refer to the parser for P, which
	hasn't been constructed yet.

	>>> parse_parens = Parser.token(TrivialTok.OPN)\\
	...                      .bind(k(Parser.lazy(lambda: parse_p)))\\
	...                      .seq_ignore_tok(TrivialTok.CLS)
	>>> parse_num = Parser.token(TrivialTok.NUM)\\
	...                   .map(c(int, Lexeme.get_match))
	>>> parse_t = parse_parens.or_(parse_num)

	For multiplication, we use `Parser.many()` to represent any number of "* <T>" matches
	and combine them together.  This is a slight departure from how our grammar is
	written above.  The set of accepted inputs will be the same, but our implementation
	will be left-associative.  It is possible to implement this grammar as it is written,
	but this will result in a right-associative structure.  Of course, multiplication and
	addition are associative, so in this case it doesn't matter.

	>>> parse_times = Parser.token(TrivialTok.MUL)\\
	...                     .map(k(lambda x: lambda y: x * y))\\
	...                     .fapply(parse_t)\\
	...                     .many(lambda l: lambda r: c(l, r), lambda x: x)
	>>> parse_m = parse_t.fapply_r(parse_times)

	Addition is largely the same as multiplication:

	>>> parse_plus = Parser.token(TrivialTok.ADD)\\
	...                    .map(k(lambda x: lambda y: x + y))\\
	...                    .fapply(parse_m)\\
	...                    .many(lambda l: lambda r: c(l, r), lambda x: x)
	>>> parse_p = parse_m.fapply_r(parse_plus)

	And finally, we expect and EOF after the end of the term, to ensure that we've reached
	the end of the input.

	>>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)

	And now, for a few sample runs:

	>>> parse_s.parse_(lex_trivial('1'))
	Ok(1)

	>>> parse_s.parse_(lex_trivial('1 + 2'))
	Ok(3)

	>>> parse_s.parse_(lex_trivial('1 + 2 * 3 + 4'))
	Ok(11)

	>>> parse_s.parse_(lex_trivial('(1 + 2) * (3 + 4)'))
	Ok(21)

	>>> parse_s.parse_(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
	Ok(69)

	And an example of a bad parse:

	# TODO fix this
	>>> parse_s.parse_(lex_trivial('1 * * 2')) #doctest: +ELLIPSIS
	Err(...)
	"""

	parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
	"""
	Run this parser

	Accepts a stream of tokens to parse, and returns either a list of possible successful
	parses, each with a value and a remainder of the token stream, or a list of all
	failures which lead to the failure of this parser, each with a lexeme they failed at
	and the token the expected.

	This is meant for use in constructing parsers, and it's probably more useful to call
	`parse_()`.
	"""

	def parse_(self, input: LexStream[TokN]) -> Result[Out, Mapping[Lexeme[TokN], Collection[TokN]]]:
		"""
		Run this parser, expecting at most one result which consumes the full input stream

		This is a wrapper around `parse()` with a few features for ease of use, including:
			- expects that at most one single successful parse will be produced
			- asserts that the parse consumed the whole input stream
			- aggregates failures into a multi-set of `Lexeme` -> expected tokens

		Normal failures are passed through as an `Err`, but if the results returned by a
		successful parse violate the above conditions, an assertion error will be raised.
		However, by carefully constructing your grammar, you can ensure that this will not
		happen for any input.

		Because of these assertions, the successful return value of this function is
		*just* the output type of this parse, which is much easier to use.

		### Example

		A parser which parses a single number.  Notice that we also require that it parses
		an EOF.  Without this, it would be unable to consume the entire input stream, and
		thus would fail.

		>>> parse_num = Parser.token(TrivialTok.NUM).seq_ignore_tok(TrivialTok.EOF)
		>>> parse_num.parse_(lex_trivial('1312'))
		Ok([NUM: '1312']@(1, 1-4))
		"""
		match self.parse(input):
			case Err(errors):
				# The input is bad
				failure_locations = FSet(lex for (lex, expected) in errors)
				return Err({
					location: FSet(expected for (lex, expected) in errors if lex == location)
					for location in failure_locations
				})
			case Ok([result1, result2, *rest] as possible_parses):
				# The grammar is bad
				raise AssertionError("Parse returned multiple possible parses", possible_parses)
			case Ok([(value, [non_empty, *rest] as remainder)]):
				# The grammar is bad
				raise AssertionError("Parse failed to consume the whole input, and left remainder", remainder)
			case Ok([]):
				# The parser code is bad
				raise AssertionError('"Successful" parse returned no possible parses')
			case Ok([(value, [])]):
				return Ok(value)
		# The code in this function is bad
		raise AssertionError('Unreachable')

	@staticmethod
	def epsilon(ret: Out) -> 'Parser[Out, TokN]':
		"""
		Parse an empty string, then return a constant

		Always succeeds, and always produces exactly one possibility

		>>> Parser.epsilon(100).parse(lex_trivial("+"))
		Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
		"""
		return Parser(lambda s: Ok(((ret, s),)))

	@staticmethod
	def token(t: TokN)  -> 'Parser[Lexeme[TokN], TokN]':
		"""
		Parser that only accepts a single token, and returns the parsed lexeme

		The first argument is a function which, given a lexeme, returns the token that
		that lexeme instantiates.  The second argument is a token which this parser should
		accept

		>>> parse_num = Parser.token(TrivialTok.NUM)

		>>> parse_num.parse(lex_trivial('3'))
		Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))

		>>> parse_num.parse(lex_trivial('x'))
		Err((([LTR: 'x']@(1, 1-1), NUM),))
		"""
		def parse_single(input: LexStream) -> ParserResult[Lexeme[TokN], TokN]:
			match input:
				case [lexeme, *rest] if lexeme.token == t:
					return Ok(((lexeme, rest),))
				case [bad_lexeme, *rest]:
					return Err(((bad_lexeme, t),))
				case []:
					raise Exception('Bad grammar!  Reached an empty input')
			raise Exception('Unreachable')
		return Parser(parse_single)

	@staticmethod
	def lazy(
		gen: 'Callable[[], Parser[Out, TokN]]',
	) -> 'Parser[Out, TokN]':
		"""
		A stand-in parser which will only be actually computed when called
		"""
		return Parser(lambda s: gen().parse(s))

	def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
		"""
		A monadic bind operator - allows a parser to be generated from its precenent

		### Example

		We generate a parser which reads a number then accepts exactly that many plus
		tokens, returning the last one.

		>>> some_n = Parser.token(TrivialTok.NUM)
		>>> n_plus = some_n.bind(lambda prev_result:
		...     reduce(
		...         lambda a, p: a.bind(k(p)),
		...         [
		...             Parser.token(TrivialTok.ADD)
		...             for i in range(int(prev_result.matched_string))
		...         ]
		...     )
		... )

		**Sample Run 1**: We parse the string `3+++`.  Since this is a three followed by
		exactly three plus signs, this should parse successfully.  Sure enough, the result
		contains exactly one possibility, where the `3+++` has been consumed, leaving only
		the EOF, and returning the value of the last plus sign.

		>>> n_plus.parse(lex_trivial('3+++'))
		Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])

		**Simple Run 2**: We parse the string `3++`.  This only has two of the three plus
		signs, so we should expect it to fail.  As expected, at does, correctly
		identifying that it saw an EOF while expecting an ADD.

		>>> n_plus.parse(lex_trivial('3++'))
		Err([([EOF: '']@(1, 4-4), ADD)])
		"""
		def handle_results(results: Collection[Tuple[Out, LexStream]]) -> ParserResult[Out2, TokN]:
			successes, errors = partition([
				f(out1).parse(stream)
				for (out1, stream) in results
			])
			if len(successes):
				return Ok([p for s in successes for p in s])
			else:
				return Err([e for errs in errors for e in errs])
		def inner(input: LexStream) -> ParserResult[Out2, TokN]:
			return self.parse(input) << handle_results
		return Parser(inner)

	def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]':
		"""
		Transform the output of some parser with a function

		This is a particularly useful method, because it allows converting parsers which
		return lexemes (e.g. `Parser.token()`) into parsers that return other thing.

		As an example, here's a parser which parses a number, and returns it as a number.

		>>> parse_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))
		>>> parse_num.parse(lex_trivial('3'))
		Ok([(3, [[EOF: '']@(1, 2-2)])])
		"""
		return self.bind(c(Parser.epsilon, f)) #type: ignore

	def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
		"""
		Apply the function which this returns to the value produced by another parser

		Equivalent to the fapply method of an applicative functor.
		"""
		return self.bind(p(Parser.map, arg)) #type: ignore

	def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
		"""
		A reversed version of `fapply()`

		Applies the function returned by the argument to the value returned by this.
		"""
		return self.bind(lambda v: arg.map(lambda f: f(v)))

	def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
		"""
		Parses two things in series, ignoring the output of the second parser

		Example:  Parse a number followed by any letter (ignored)

		>>> parse_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))
		>>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
		>>> parse_numlet.parse(lex_trivial('4a'))
		Ok([(4, [[EOF: '']@(1, 3-3)])])
		"""
		return self.map(k).fapply(subsequent) #type:ignore

	def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
		"""
		A shorthand for calling `seq_ignore()` with `Parser.token`

		Example:  Parse a number followed by any letter (ignored)

		>>> parse_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))
		>>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
		>>> parse_numlet.parse(lex_trivial('4a'))
		Ok([(4, [[EOF: '']@(1, 3-3)])])
		"""
		return self.seq_ignore(Parser.token(subsequent))

	def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
		"""
		Returns a parser which succeeds if this or any arguments succeed

		**Example:**  A parser which parses a letter or a number

		>>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
		>>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
		Ok(...)
		>>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
		Ok(...)

		Notice that this can produce multiple successes.  A simple example would be a
		parser which parses either a single number or an empty string (epsilon).  When
		faced with an input stream starting with a number, it could either parse that
		number, returning the rest of the input stream, or parse the empty string,
		returning the input unchanged.

		>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))\\
		...                   .or_(Parser.epsilon(-1))
		>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
		Ok([(3,  [[EOF: '']@(1, 2-2)]),
		    (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])

		Of course, this can also produce multiple failures as well:

		>>> parse_or.parse(lex_trivial('+'))
		Err([([ADD: '+']@(1, 1-1), NUM), ([ADD: '+']@(1, 1-1), LTR)])
		"""
		all_parsers = (self, *parsers)
		def inner(input: LexStream) -> ParserResult[Out, TokN]:
			successes, failures = partition([p.parse(input) for p in all_parsers])
			if len(successes):
				return Ok([successful_path for success in successes for successful_path in success])
			else:
				return Err([expectation for failure in failures for expectation in failure])
		return Parser(inner)

	def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
		"""
		Parse one or zero of this thing, returning fallback if not parsing it

		We can use this to write a simpler example of parse_maybe_num from the `or_()`
		example:

		>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))\\
		...                   .opt(-1)
		>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
		Ok([(3,  [[EOF: '']@(1, 2-2)]),
		    (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
		"""
		return self.or_(Parser.epsilon(fallback))

	def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
		"""
		Create a new parser which accepts any number of instances of this parser

		The combine argument is a function which joins the result of this parser with the
		result of parsing the 0 or more identical parsers on the right.  Think of this
		like a reduce.

		As an example, here's a parser which parses any number of numbers, and sums them
		all together.

		We start with the single number parser from the `map()` example.

		>>> parse_num = Parser.token(TrivialTok.NUM)\\
		...                   .map(lambda l: int(l.matched_string))

		Then we call `many()` on it.  We also add the EOF parser to the end to force it to
		parse the whole input.

		>>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
		...                       .seq_ignore_tok(TrivialTok.EOF)

		**Sample Run 1:**  Sum of the numbers 1, 2, and 3.  This produces the operation (1
		+ (2 + (3 + 0))

		>>> parse_nums.parse(lex_trivial('1 2 3'))
		Ok([(6, [])])

		**Sample Run 2:**  If attempting to sum no numbers, we get back the default
		argument, zero in this case.

		>>> parse_nums.parse(lex_trivial(''))
		Ok([(0, [])])
		"""
		return (self.map(combine)
		            .fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
		            .opt(default))

if __name__ == '__main__':
	import doctest
	doctest.testmod()