JSON-Lang/comb_parse.py
2024-03-15 20:21:42 -04:00

500 lines
17 KiB
Python

from emis_funky_funktions import *
from dataclasses import dataclass
from enum import auto, IntEnum
from functools import reduce
from re import compile, Pattern
from lex import Lexeme, tokenize
from typing import Any, Callable, Collection, Mapping, Sequence, Tuple, TypeAlias
# Grammar
# S := <P> Eof
# P := <M> + <P>
# P := <M>
# M := <T> * <M>
# M := <T>
# T := Letter
# T := Number
# T := ( <P> )
class TrivialTok(IntEnum):
    """A set of tokens for a trivial grammar used for testing"""

    EOF = auto()
    "Special token: End of file"

    ADD = auto()
    "Addition (+)"

    MUL = auto()
    "Multiplication (*)"

    LTR = auto()
    "A single letter (x)"

    NUM = auto()
    "A number (3)"

    OPN = auto()
    "An open paren, i.e. '('"

    CLS = auto()
    "A close paren, i.e. ')'"

    NUL = auto()
    "Whitespace ( )"

    def __repr__(self) -> str:
        # Show only the bare member name (e.g. `ADD`) rather than the default
        # `<TrivialTok.ADD: 2>` form, which keeps printed token streams compact.
        return self.name
TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
    # Every pattern is written as a raw string so that regex escapes reach
    # `re.compile` untouched instead of being read as (invalid) string escapes.
    (compile(r"\+"), TrivialTok.ADD),
    (compile(r"\*"), TrivialTok.MUL),
    (compile(r"[a-zA-Z]"), TrivialTok.LTR),
    (compile(r"\d+"), TrivialTok.NUM),
    (compile(r"\("), TrivialTok.OPN),
    (compile(r"\)"), TrivialTok.CLS),
    (compile(r"\s+"), TrivialTok.NUL),
]
"""
A mapping of regexes to the tokens they identify in a trivial grammar for testing

Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches.  All unicode strings should be matched by at least one token.
"""
lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] = (
    c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
)
"""
A lexer for the trivial grammar defined above

Built by feeding `tokenize` the trivial lex table, `NUL` (whitespace) and `EOF`, then
passing its result through `unwrap_r`.

Throws an error if the lex fails.

>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
[[NUM: '1']@(1, 1-1),
 [ADD: '+']@(1, 3-3),
 [NUM: '3']@(1, 5-5),
 [EOF: '']@(1, 6-6)]
"""
############# Combinator Parsing ################
# Type variables shared by the parser combinators below.
# `Out`, `Out1` and `Out2` stand for the values produced by parsers.
Out = TypeVar('Out')
Out1 = TypeVar('Out1')
Out2 = TypeVar('Out2')
# The token type used to classify lexemes (e.g. `TrivialTok`).
TokN = TypeVar('TokN')
# A token stream: a sequence of lexemes over some token type.
LexStream: TypeAlias = Sequence[Lexeme[TokN]]
# The outcome of running a parser.
#   success: every possible parse, as (output, remaining stream) pairs
#   failure: (lexeme where the failure occurred, token expected there) pairs
ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]
@dataclass(frozen=True)
class Parser(Generic[Out, TokN]):
"""
A parser which consumes a token stream and produces a series of possible parses
Each possible parse consists of the subsection of the input stream left over after the
parse, along with the output of that parse.
If the parse fails, the error returned will be a series of two-tuples, each containing
the lexeme at which the error occurred and the token which was expected in its stead.
By the nature of combinator parsing, parsers can be built and tested in small pieces.
To this end, each combinator method on this class comes with a small example
demonstrating how to build a trivial parser using that method. However, in the
following section, we also provide an example for a grammar that is still trivial, but
slightly less so.
### Example: Arithmetic Grammar
Let us define the following grammar for basic additive and multiplicative arithmetic
```
S := <P> Eof
P := <M> + <P>
P := <M>
M := <T> * <M>
M := <T>
T := Number
T := ( <P> )
```
As with the rest of the examples in this class, we will use the `TrivialTok` token
class to build this parser.
Working our way from the bottom up, we start with defining a parser for T. For the
parenthetical production, we use a lazy parser to refer to the parser for P, which
hasn't been constructed yet.
>>> parse_parens = Parser.token(TrivialTok.OPN)\\
... .bind(k(Parser.lazy(lambda: parse_p)))\\
... .seq_ignore_tok(TrivialTok.CLS)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(c(int, Lexeme.get_match))
>>> parse_t = parse_parens.or_(parse_num)
For multiplication, we use `Parser.many()` to represent any number of "* <T>" matches
and combine them together. This is a slight departure from how our grammar is
written above. The set of accepted inputs will be the same, but our implementation
will be left-associative. It is possible to implement this grammar as it is written,
but this will result in a right-associative structure. Of course, multiplication and
addition are associative, so in this case it doesn't matter.
>>> parse_times = Parser.token(TrivialTok.MUL)\\
... .map(k(lambda x: lambda y: x * y))\\
... .fapply(parse_t)\\
... .many(lambda l: lambda r: c(l, r), lambda x: x)
>>> parse_m = parse_t.fapply_r(parse_times)
Addition is largely the same as multiplication:
>>> parse_plus = Parser.token(TrivialTok.ADD)\\
... .map(k(lambda x: lambda y: x + y))\\
... .fapply(parse_m)\\
... .many(lambda l: lambda r: c(l, r), lambda x: x)
>>> parse_p = parse_m.fapply_r(parse_plus)
And finally, we expect and EOF after the end of the term, to ensure that we've reached
the end of the input.
>>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)
And now, for a few sample runs:
>>> parse_s.parse_(lex_trivial('1'))
Ok(1)
>>> parse_s.parse_(lex_trivial('1 + 2'))
Ok(3)
>>> parse_s.parse_(lex_trivial('1 + 2 * 3 + 4'))
Ok(11)
>>> parse_s.parse_(lex_trivial('(1 + 2) * (3 + 4)'))
Ok(21)
>>> parse_s.parse_(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
Ok(69)
And an example of a bad parse:
# TODO fix this
>>> parse_s.parse_(lex_trivial('1 * * 2')) #doctest: +ELLIPSIS
Err(...)
"""
parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
"""
Run this parser
Accepts a stream of tokens to parse, and returns either a list of possible successful
parses, each with a value and a remainder of the token stream, or a list of all
failures which lead to the failure of this parser, each with a lexeme they failed at
and the token the expected.
This is meant for use in constructing parsers, and it's probably more useful to call
`parse_()`.
"""
def parse_(self, input: LexStream[TokN]) -> Result[Out, Mapping[Lexeme[TokN], Collection[TokN]]]:
"""
Run this parser, expecting at most one result which consumes the full input stream
This is a wrapper around `parse()` with a few features for ease of use, including:
- expects that at most one single successful parse will be produced
- asserts that the parse consumed the whole input stream
- aggregates failures into a multi-set of `Lexeme` -> expected tokens
Normal failures are passed through as an `Err`, but if the results returned by a
successful parse violate the above conditions, an assertion error will be raised.
However, by carefully constructing your grammar, you can ensure that this will not
happen for any input.
Because of these assertions, the successful return value of this function is
*just* the output type of this parse, which is much easier to use.
### Example
A parser which parses a single number. Notice that we also require that it parses
an EOF. Without this, it would be unable to consume the entire input stream, and
thus would fail.
>>> parse_num = Parser.token(TrivialTok.NUM).seq_ignore_tok(TrivialTok.EOF)
>>> parse_num.parse_(lex_trivial('1312'))
Ok([NUM: '1312']@(1, 1-4))
"""
match self.parse(input):
case Err(errors):
# The input is bad
failure_locations = FSet(lex for (lex, expected) in errors)
return Err({
location: FSet(expected for (lex, expected) in errors if lex == location)
for location in failure_locations
})
case Ok([result1, result2, *rest] as possible_parses):
# The grammar is bad
raise AssertionError("Parse returned multiple possible parses", possible_parses)
case Ok([(value, [non_empty, *rest] as remainder)]):
# The grammar is bad
raise AssertionError("Parse failed to consume the whole input, and left remainder", remainder)
case Ok([]):
# The parser code is bad
raise AssertionError('"Successful" parse returned no possible parses')
case Ok([(value, [])]):
return Ok(value)
# The code in this function is bad
raise AssertionError('Unreachable')
@staticmethod
def epsilon(ret: Out) -> 'Parser[Out, TokN]':
"""
Parse an empty string, then return a constant
Always succeeds, and always produces exactly one possibility
>>> Parser.epsilon(100).parse(lex_trivial("+"))
Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
"""
return Parser(lambda s: Ok(((ret, s),)))
@staticmethod
def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]':
"""
Parser that only accepts a single token, and returns the parsed lexeme
The first argument is a function which, given a lexeme, returns the token that
that lexeme instantiates. The second argument is a token which this parser should
accept
>>> parse_num = Parser.token(TrivialTok.NUM)
>>> parse_num.parse(lex_trivial('3'))
Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))
>>> parse_num.parse(lex_trivial('x'))
Err((([LTR: 'x']@(1, 1-1), NUM),))
"""
def parse_single(input: LexStream) -> ParserResult[Lexeme[TokN], TokN]:
match input:
case [lexeme, *rest] if lexeme.token == t:
return Ok(((lexeme, rest),))
case [bad_lexeme, *rest]:
return Err(((bad_lexeme, t),))
case []:
raise Exception('Bad grammar! Reached an empty input')
raise Exception('Unreachable')
return Parser(parse_single)
@staticmethod
def lazy(
gen: 'Callable[[], Parser[Out, TokN]]',
) -> 'Parser[Out, TokN]':
"""
A stand-in parser which will only be actually computed when called
"""
return Parser(lambda s: gen().parse(s))
def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
"""
A monadic bind operator - allows a parser to be generated from its precenent
### Example
We generate a parser which reads a number then accepts exactly that many plus
tokens, returning the last one.
>>> some_n = Parser.token(TrivialTok.NUM)
>>> n_plus = some_n.bind(lambda prev_result:
... reduce(
... lambda a, p: a.bind(k(p)),
... [
... Parser.token(TrivialTok.ADD)
... for i in range(int(prev_result.matched_string))
... ]
... )
... )
**Sample Run 1**: We parse the string `3+++`. Since this is a three followed by
exactly three plus signs, this should parse successfully. Sure enough, the result
contains exactly one possibility, where the `3+++` has been consumed, leaving only
the EOF, and returning the value of the last plus sign.
>>> n_plus.parse(lex_trivial('3+++'))
Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])
**Simple Run 2**: We parse the string `3++`. This only has two of the three plus
signs, so we should expect it to fail. As expected, at does, correctly
identifying that it saw an EOF while expecting an ADD.
>>> n_plus.parse(lex_trivial('3++'))
Err([([EOF: '']@(1, 4-4), ADD)])
"""
def handle_results(results: Collection[Tuple[Out, LexStream]]) -> ParserResult[Out2, TokN]:
successes, errors = partition([
f(out1).parse(stream)
for (out1, stream) in results
])
if len(successes):
return Ok([p for s in successes for p in s])
else:
return Err([e for errs in errors for e in errs])
def inner(input: LexStream) -> ParserResult[Out2, TokN]:
return self.parse(input) << handle_results
return Parser(inner)
def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]':
"""
Transform the output of some parser with a function
This is a particularly useful method, because it allows converting parsers which
return lexemes (e.g. `Parser.token()`) into parsers that return other thing.
As an example, here's a parser which parses a number, and returns it as a number.
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_num.parse(lex_trivial('3'))
Ok([(3, [[EOF: '']@(1, 2-2)])])
"""
return self.bind(c(Parser.epsilon, f)) #type: ignore
def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
"""
Apply the function which this returns to the value produced by another parser
Equivalent to the fapply method of an applicative functor.
"""
return self.bind(p(Parser.map, arg)) #type: ignore
def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
"""
A reversed version of `fapply()`
Applies the function returned by the argument to the value returned by this.
"""
return self.bind(lambda v: arg.map(lambda f: f(v)))
def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
"""
Parses two things in series, ignoring the output of the second parser
Example: Parse a number followed by any letter (ignored)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
>>> parse_numlet.parse(lex_trivial('4a'))
Ok([(4, [[EOF: '']@(1, 3-3)])])
"""
return self.map(k).fapply(subsequent) #type:ignore
def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
"""
A shorthand for calling `seq_ignore()` with `Parser.token`
Example: Parse a number followed by any letter (ignored)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
>>> parse_numlet.parse(lex_trivial('4a'))
Ok([(4, [[EOF: '']@(1, 3-3)])])
"""
return self.seq_ignore(Parser.token(subsequent))
def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
"""
Returns a parser which succeeds if this or any arguments succeed
**Example:** A parser which parses a letter or a number
>>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
>>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
Ok(...)
>>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
Ok(...)
Notice that this can produce multiple successes. A simple example would be a
parser which parses either a single number or an empty string (epsilon). When
faced with an input stream starting with a number, it could either parse that
number, returning the rest of the input stream, or parse the empty string,
returning the input unchanged.
>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))\\
... .or_(Parser.epsilon(-1))
>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
Ok([(3, [[EOF: '']@(1, 2-2)]),
(-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
Of course, this can also produce multiple failures as well:
>>> parse_or.parse(lex_trivial('+'))
Err([([ADD: '+']@(1, 1-1), NUM), ([ADD: '+']@(1, 1-1), LTR)])
"""
all_parsers = (self, *parsers)
def inner(input: LexStream) -> ParserResult[Out, TokN]:
successes, failures = partition([p.parse(input) for p in all_parsers])
if len(successes):
return Ok([successful_path for success in successes for successful_path in success])
else:
return Err([expectation for failure in failures for expectation in failure])
return Parser(inner)
def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
"""
Parse one or zero of this thing, returning fallback if not parsing it
We can use this to write a simpler example of parse_maybe_num from the `or_()`
example:
>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))\\
... .opt(-1)
>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
Ok([(3, [[EOF: '']@(1, 2-2)]),
(-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
"""
return self.or_(Parser.epsilon(fallback))
def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
"""
Create a new parser which accepts any number of instances of this parser
The combine argument is a function which joins the result of this parser with the
result of parsing the 0 or more identical parsers on the right. Think of this
like a reduce.
As an example, here's a parser which parses any number of numbers, and sums them
all together.
We start with the single number parser from the `map()` example.
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
Then we call `many()` on it. We also add the EOF parser to the end to force it to
parse the whole input.
>>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
... .seq_ignore_tok(TrivialTok.EOF)
**Sample Run 1:** Sum of the numbers 1, 2, and 3. This produces the operation (1
+ (2 + (3 + 0))
>>> parse_nums.parse(lex_trivial('1 2 3'))
Ok([(6, [])])
**Sample Run 2:** If attempting to sum no numbers, we get back the default
argument, zero in this case.
>>> parse_nums.parse(lex_trivial(''))
Ok([(0, [])])
"""
return (self.map(combine)
.fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
.opt(default))
if __name__ == '__main__':
    # Running this module directly executes every doctest embedded in it.
    from doctest import testmod
    testmod()