# JSON-Lang/comb_parse.py

from emis_funky_funktions import *
from dataclasses import dataclass
from enum import auto, IntEnum
from functools import reduce
from re import compile, Pattern
from lex import Lexeme, tokenize
from parse import Action
from typing import Any, Callable, Collection, Generic, List, Mapping, Sequence, Tuple, TypeAlias, TypeVar
# Grammar
# S := <P> Eof
# P := <M> + <P>
# P := <M>
# M := <T> * <M>
# M := <T>
# T := Letter
# T := Number
# T := ( <P> )
class TrivialTok(IntEnum):
"A set of tokens for a trivial grammar used for testing"
EOF = auto()
"Special token: End of file"
ADD = auto()
"Addition (+)"
MUL = auto()
"Multiplication (*)"
LTR = auto()
"A single letter (x)"
NUM = auto()
"A number (3)"
OPN = auto()
"An open paren, i.e. '('"
CLS = auto()
"An close paren, i.e. ')'"
NUL = auto()
"Whitespace ( )"
def __repr__(self):
return self._name_
TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
(compile(r"\+" ), TrivialTok.ADD),
(compile(r"\*" ), TrivialTok.MUL),
(compile(r"[a-zA-Z]"), TrivialTok.LTR),
(compile("\d+" ), TrivialTok.NUM),
(compile("\(" ), TrivialTok.OPN),
(compile("\)" ), TrivialTok.CLS),
(compile("\s+" ), TrivialTok.NUL),
]
"""
A mapping of regexes to the tokens they identify in a trivial grammar for testing
Patterns earlier in the list take priority: when two patterns both match, the
earlier entry wins. Every unicode string should be matched by at least one token.
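
As a hypothetical illustration of priority (the `shadowing_table` below is not
part of the actual table), prepending a broader pattern shadows later entries,
because the earlier entry wins:

>>> shadowing_table = [(compile(r"[a-zA-Z0-9]"), TrivialTok.LTR), *TRIVIAL_LEX_TABLE]
>>> unwrap_r(tokenize(shadowing_table, [TrivialTok.NUL], TrivialTok.EOF, '3'))
[[LTR: '3']@(1, 1-1), [EOF: '']@(1, 2-2)]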
"""
lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\
c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
"""
A lexer for the trivial grammar defined above
Raises an error if lexing fails.
>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
[[NUM: '1']@(1, 1-1),
[ADD: '+']@(1, 3-3),
[NUM: '3']@(1, 5-5),
[EOF: '']@(1, 6-6)]
"""
############# Combinator Parsing ################
Out = TypeVar('Out')
Out1 = TypeVar('Out1')
Out2 = TypeVar('Out2')
TokN = TypeVar('TokN')
LexStream: TypeAlias = Sequence[Lexeme[TokN]]
ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]
@dataclass(frozen=True)
class Parser(Generic[Out, TokN]):
"""
A parser which consumes a token stream and produces a series of possible parses
Each possible parse consists of the subsection of the input stream left over after the
parse, along with the output of that parse.
If the parse fails, the error returned will be a series of two-tuples, each containing
the lexeme at which the error occurred and the token which was expected in its stead.
By the nature of combinator parsing, parsers can be built and tested in small pieces.
To this end, each combinator method on this class comes with a small example
demonstrating how to build a trivial parser using that method. However, in the
following section, we also provide an example for a grammar that is still trivial, but
slightly less so.
### Example: Arithmetic Grammar
Let us define the following grammar for basic additive and multiplicative arithmetic
```
S := <P> Eof
P := <M> + <P>
P := <M>
M := <T> * <M>
M := <T>
T := Number
T := ( <P> )
```
As with the rest of the examples in this class, we will use the `TrivialTok` token
class to build this parser.
Working our way from the bottom up, we start with defining a parser for T. For the
parenthetical production, we use a lazy parser to refer to the parser for P, which
hasn't been constructed yet.
>>> parse_parens = Parser.token(TrivialTok.OPN)\\
... .bind(k(Parser.lazy(lambda: parse_p)))\\
... .seq_ignore_tok(TrivialTok.CLS)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(c(int, Lexeme.get_match))
>>> parse_t = parse_parens.or_(parse_num)
    For multiplication, we use `Parser.many()` to match any number of "* <T>"
    sequences and combine them together. This departs slightly from how our grammar
    is written above: the set of accepted inputs is the same, but our implementation
    will be left-associative, whereas implementing the grammar literally would
    produce a right-associative structure (see the sketch following `parse_m`
    below). Of course, multiplication and addition are associative, so in this case
    it doesn't matter.
>>> parse_times = Parser.token(TrivialTok.MUL)\\
... .map(k(lambda x: lambda y: x * y))\\
... .fapply(parse_t)\\
... .many(lambda l: lambda r: c(l, r), lambda x: x)
>>> parse_m = parse_t.fapply_r(parse_times)
Addition is largely the same as multiplication:
>>> parse_plus = Parser.token(TrivialTok.ADD)\\
... .map(k(lambda x: lambda y: x + y))\\
... .fapply(parse_m)\\
... .many(lambda l: lambda r: c(l, r), lambda x: x)
>>> parse_p = parse_m.fapply_r(parse_plus)
    And finally, we expect an EOF after the end of the term, to ensure that we've reached
the end of the input.
>>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)
And now, for a few sample runs:
>>> parse_s.parse(lex_trivial('1'))
Ok([(1, [])])
>>> parse_s.parse(lex_trivial('1 + 2'))
Ok([(3, [])])
>>> parse_s.parse(lex_trivial('1 + 2 * 3 + 4'))
Ok([(11, [])])
>>> parse_s.parse(lex_trivial('(1 + 2) * (3 + 4)'))
Ok([(21, [])])
>>> parse_s.parse(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
Ok([(69, [])])
"""
parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
@staticmethod
def epsilon(ret: Out) -> 'Parser[Out, TokN]':
"""
Parse an empty string, then return a constant
Always succeeds, and always produces exactly one possibility
>>> Parser.epsilon(100).parse(lex_trivial("+"))
Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
"""
return Parser(lambda s: Ok(((ret, s),)))
@staticmethod
def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]':
"""
Parser that only accepts a single token, and returns the parsed lexeme
        The sole argument is the token which this parser should accept; the parser
        succeeds only when the next lexeme in the stream instantiates that token.
>>> parse_num = Parser.token(TrivialTok.NUM)
>>> parse_num.parse(lex_trivial('3'))
Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))
>>> parse_num.parse(lex_trivial('x'))
Err((([LTR: 'x']@(1, 1-1), NUM),))
"""
        def parse_single(input: LexStream[TokN]) -> ParserResult[Lexeme[TokN], TokN]:
            match input:
                # The next lexeme is the expected token: consume it and succeed
                case [lexeme, *rest] if lexeme.token == t:
                    return Ok(((lexeme, rest),))
                # Otherwise, report the offending lexeme and the expected token
                case [bad_lexeme, *rest]:
                    return Err(((bad_lexeme, t),))
                # Well-formed input always ends with an EOF lexeme, so the stream
                # should never be empty
                case []:
                    raise Exception('Bad grammar! Reached an empty input')
            raise Exception('Unreachable')
return Parser(parse_single)
@staticmethod
def lazy(
gen: 'Callable[[], Parser[Out, TokN]]',
) -> 'Parser[Out, TokN]':
"""
        A stand-in parser which defers construction of the real parser until it is used
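
        Because the parser is produced on demand, it may refer to names that are
        only defined later, which is how recursive grammars are expressed. A small
        example, mirroring the `token()` doctest above:

        >>> deferred = Parser.lazy(lambda: parse_late)
        >>> parse_late = Parser.token(TrivialTok.NUM)
        >>> deferred.parse(lex_trivial('3'))
        Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))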
"""
return Parser(lambda s: gen().parse(s))
def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
"""
        A monadic bind operator - allows a parser to be generated from the result of the parser preceding it
### Example
We generate a parser which reads a number then accepts exactly that many plus
tokens, returning the last one.
>>> some_n = Parser.token(TrivialTok.NUM)
>>> n_plus = some_n.bind(lambda prev_result:
... reduce(
... lambda a, p: a.bind(k(p)),
... [
... Parser.token(TrivialTok.ADD)
... for i in range(int(prev_result.matched_string))
... ]
... )
... )
**Sample Run 1**: We parse the string `3+++`. Since this is a three followed by
exactly three plus signs, this should parse successfully. Sure enough, the result
contains exactly one possibility, where the `3+++` has been consumed, leaving only
the EOF, and returning the value of the last plus sign.
>>> n_plus.parse(lex_trivial('3+++'))
Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])
        **Sample Run 2**: We parse the string `3++`. This only has two of the three plus
        signs, so we should expect it to fail. As expected, it does, correctly
        identifying that it saw an EOF while expecting an ADD.
>>> n_plus.parse(lex_trivial('3++'))
Err([([EOF: '']@(1, 4-4), ADD)])
"""
        def handle_results(results: Collection[Tuple[Out, LexStream[TokN]]]) -> ParserResult[Out2, TokN]:
successes, errors = partition([
f(out1).parse(stream)
for (out1, stream) in results
])
if len(successes):
return Ok([p for s in successes for p in s])
else:
return Err([e for errs in errors for e in errs])
        def inner(input: LexStream[TokN]) -> ParserResult[Out2, TokN]:
return self.parse(input) << handle_results
return Parser(inner)
    def map(self, f: Callable[[Out], Out2]) -> 'Parser[Out2, TokN]':
"""
Transform the output of some parser with a function
This is a particularly useful method, because it allows converting parsers which
        return lexemes (e.g. `Parser.token()`) into parsers that return other things.
As an example, here's a parser which parses a number, and returns it as a number.
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_num.parse(lex_trivial('3'))
Ok([(3, [[EOF: '']@(1, 2-2)])])
"""
return self.bind(c(Parser.epsilon, f)) #type: ignore
def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
"""
Apply the function which this returns to the value produced by another parser
Equivalent to the fapply method of an applicative functor.
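
        ### Example
        For illustration, a parser which reads two numbers and adds them, by
        mapping the first into a curried add and applying it to the second:

        >>> parse_sum = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: lambda r: int(l.matched_string) + int(r.matched_string))\\
        ...     .fapply(Parser.token(TrivialTok.NUM))
        >>> parse_sum.parse(lex_trivial('1 2'))
        Ok([(3, [[EOF: '']@(1, 4-4)])])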
"""
return self.bind(p(Parser.map, arg)) #type: ignore
def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
"""
A reversed version of `fapply()`
Applies the function returned by the argument to the value returned by this.
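
        ### Example
        For illustration, parsing a number and then post-processing it with a
        pure (epsilon) function parser:

        >>> parse_inc = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))\\
        ...     .fapply_r(Parser.epsilon(lambda n: n + 1))
        >>> parse_inc.parse(lex_trivial('3'))
        Ok([(4, [[EOF: '']@(1, 2-2)])])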
"""
return self.bind(lambda v: arg.map(lambda f: f(v)))
def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
"""
Parses two things in series, ignoring the output of the second parser
Example: Parse a number followed by any letter (ignored)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
>>> parse_numlet.parse(lex_trivial('4a'))
Ok([(4, [[EOF: '']@(1, 3-3)])])
"""
return self.map(k).fapply(subsequent) #type:ignore
def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
"""
A shorthand for calling `seq_ignore()` with `Parser.token`
Example: Parse a number followed by any letter (ignored)
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
>>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
>>> parse_numlet.parse(lex_trivial('4a'))
Ok([(4, [[EOF: '']@(1, 3-3)])])
"""
return self.seq_ignore(Parser.token(subsequent))
def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
"""
Returns a parser which succeeds if this or any arguments succeed
**Example:** A parser which parses a letter or a number
>>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
>>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
Ok(...)
>>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
Ok(...)
Notice that this can produce multiple successes. A simple example would be a
parser which parses either a single number or an empty string (epsilon). When
faced with an input stream starting with a number, it could either parse that
number, returning the rest of the input stream, or parse the empty string,
returning the input unchanged.
>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))\\
... .or_(Parser.epsilon(-1))
>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
Ok([(3, [[EOF: '']@(1, 2-2)]),
(-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
"""
all_parsers = (self, *parsers)
        def inner(input: LexStream[TokN]) -> ParserResult[Out, TokN]:
successes, failures = partition([p.parse(input) for p in all_parsers])
if len(successes):
return Ok([successful_path for success in successes for successful_path in success])
else:
return Err([expectation for failure in failures for expectation in failure])
return Parser(inner)
def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
"""
Parse one or zero of this thing, returning fallback if not parsing it
We can use this to write a simpler example of parse_maybe_num from the `or_()`
example:
>>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))\\
... .opt(-1)
>>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
Ok([(3, [[EOF: '']@(1, 2-2)]),
(-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
"""
return self.or_(Parser.epsilon(fallback))
def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
"""
Create a new parser which accepts any number of instances of this parser
The combine argument is a function which joins the result of this parser with the
result of parsing the 0 or more identical parsers on the right. Think of this
like a reduce.
As an example, here's a parser which parses any number of numbers, and sums them
all together.
We start with the single number parser from the `map()` example.
>>> parse_num = Parser.token(TrivialTok.NUM)\\
... .map(lambda l: int(l.matched_string))
Then we call `many()` on it. We also add the EOF parser to the end to force it to
parse the whole input.
>>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
... .seq_ignore_tok(TrivialTok.EOF)
        **Sample Run 1:** Sum of the numbers 1, 2, and 3. This produces the operation
        (1 + (2 + (3 + 0)))
>>> parse_nums.parse(lex_trivial('1 2 3'))
Ok([(6, [])])
**Sample Run 2:** If attempting to sum no numbers, we get back the default
argument, zero in this case.
>>> parse_nums.parse(lex_trivial(''))
Ok([(0, [])])
"""
return (self.map(combine)
.fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
.opt(default))
if __name__ == '__main__':
import doctest
doctest.testmod()