from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from functools import reduce
from re import compile, Pattern

from lex import Lexeme, tokenize

from typing import Any, Callable, Collection, Generic, List, Mapping, Sequence, Tuple, TypeAlias, TypeVar

# Grammar
# S := <P> Eof
# P := <M> + <P>
# P := <M>
# M := <T> * <M>
# M := <T>
# T := Letter
# T := Number
# T := ( <P> )

class TrivialTok(IntEnum):
    "A set of tokens for a trivial grammar used for testing"

    EOF = auto()
    "Special token: End of file"

    ADD = auto()
    "Addition (+)"

    MUL = auto()
    "Multiplication (*)"

    LTR = auto()
    "A single letter (x)"

    NUM = auto()
    "A number (3)"

    OPN = auto()
    "An open paren, i.e. '('"

    CLS = auto()
    "A close paren, i.e. ')'"

    NUL = auto()
    "Whitespace ( )"

    def __repr__(self):
        return self._name_

TRIVIAL_LEX_TABLE: Collection[Tuple[Pattern[str], TrivialTok]] = [
    (compile(r"\+"      ), TrivialTok.ADD),
    (compile(r"\*"      ), TrivialTok.MUL),
    (compile(r"[a-zA-Z]"), TrivialTok.LTR),
    (compile(r"\d+"     ), TrivialTok.NUM),
    (compile(r"\("      ), TrivialTok.OPN),
    (compile(r"\)"      ), TrivialTok.CLS),
    (compile(r"\s+"     ), TrivialTok.NUL),
]
"""
A mapping of regexes to the tokens they identify in a trivial grammar for testing

Tokens earlier in the list should be regarded as higher priority, even if a pattern
lower on the list also matches.  Every unicode string should be matched by at least
one token.
"""
lex_trivial: Callable[[str], List[Lexeme[TrivialTok]]] =\
    c(unwrap_r, p(tokenize, TRIVIAL_LEX_TABLE, [TrivialTok.NUL], TrivialTok.EOF)) #type: ignore
"""
|
|
A lexer for the trivial grammar defined above
|
|
|
|
Throws an error if the lex fails.
|
|
|
|
>>> lex_trivial("1 + 3") #doctest: +NORMALIZE_WHITESPACE
|
|
[[NUM: '1']@(1, 1-1),
|
|
[ADD: '+']@(1, 3-3),
|
|
[NUM: '3']@(1, 5-5),
|
|
[EOF: '']@(1, 6-6)]
|
|
"""
|
|
|
|

############# Combinator Parsing ################

Out = TypeVar('Out')
Out1 = TypeVar('Out1')
Out2 = TypeVar('Out2')
TokN = TypeVar('TokN')
LexStream: TypeAlias = Sequence[Lexeme[TokN]]
ParserResult: TypeAlias = Result[Collection[Tuple[Out, LexStream[TokN]]], Collection[Tuple[Lexeme[TokN], TokN]]]

@dataclass(frozen=True)
class Parser(Generic[Out, TokN]):
    """
    A parser which consumes a token stream and produces a series of possible parses

    Each possible parse consists of the subsection of the input stream left over after the
    parse, along with the output of that parse.

    If the parse fails, the error returned will be a series of two-tuples, each containing
    the lexeme at which the error occurred and the token which was expected in its stead.

    By the nature of combinator parsing, parsers can be built and tested in small pieces.
    To this end, each combinator method on this class comes with a small example
    demonstrating how to build a trivial parser using that method.  However, in the
    following section, we also provide an example for a grammar that is still trivial, but
    slightly less so.

    ### Example: Arithmetic Grammar

    Let us define the following grammar for basic additive and multiplicative arithmetic

    ```
    S := <P> Eof
    P := <M> + <P>
    P := <M>
    M := <T> * <M>
    M := <T>
    T := Number
    T := ( <P> )
    ```

    As with the rest of the examples in this class, we will use the `TrivialTok` token
    class to build this parser.

    Working our way from the bottom up, we start with defining a parser for T.  For the
    parenthetical production, we use a lazy parser to refer to the parser for P, which
    hasn't been constructed yet.

    >>> parse_parens = Parser.token(TrivialTok.OPN)\\
    ...     .bind(k(Parser.lazy(lambda: parse_p)))\\
    ...     .seq_ignore_tok(TrivialTok.CLS)
    >>> parse_num = Parser.token(TrivialTok.NUM)\\
    ...     .map(c(int, Lexeme.get_match))
    >>> parse_t = parse_parens.or_(parse_num)

    For multiplication, we use `Parser.many()` to represent any number of "* <T>" matches
    and combine them together.  This is a slight departure from how our grammar is
    written above.  The set of accepted inputs will be the same, but our implementation
    will be left-associative.  It is possible to implement this grammar as it is written,
    but this will result in a right-associative structure.  Of course, multiplication and
    addition are associative, so in this case it doesn't matter.

    >>> parse_times = Parser.token(TrivialTok.MUL)\\
    ...     .map(k(lambda x: lambda y: x * y))\\
    ...     .fapply(parse_t)\\
    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
    >>> parse_m = parse_t.fapply_r(parse_times)

    Addition is largely the same as multiplication:

    >>> parse_plus = Parser.token(TrivialTok.ADD)\\
    ...     .map(k(lambda x: lambda y: x + y))\\
    ...     .fapply(parse_m)\\
    ...     .many(lambda l: lambda r: c(l, r), lambda x: x)
    >>> parse_p = parse_m.fapply_r(parse_plus)

    And finally, we expect an EOF after the end of the term, to ensure that we've reached
    the end of the input.

    >>> parse_s = parse_p.seq_ignore_tok(TrivialTok.EOF)

    And now, for a few sample runs:

    >>> parse_s.parse_(lex_trivial('1'))
    Ok(1)

    >>> parse_s.parse_(lex_trivial('1 + 2'))
    Ok(3)

    >>> parse_s.parse_(lex_trivial('1 + 2 * 3 + 4'))
    Ok(11)

    >>> parse_s.parse_(lex_trivial('(1 + 2) * (3 + 4)'))
    Ok(21)

    >>> parse_s.parse_(lex_trivial('(1 + 2 * 3) * (4 + 5) + 6'))
    Ok(69)

    And an example of a bad parse:

    # TODO fix this
    >>> parse_s.parse_(lex_trivial('1 * * 2')) #doctest: +ELLIPSIS
    Err(...)
    """

    parse: Callable[[LexStream[TokN]], ParserResult[Out, TokN]]
    """
    Run this parser

    Accepts a stream of tokens to parse, and returns either a list of possible successful
    parses, each with a value and a remainder of the token stream, or a list of all of the
    failures which led to the failure of this parser, each with the lexeme it failed at
    and the token it expected.

    This is meant for use when constructing parsers; to actually run one, it's usually
    more convenient to call `parse_()`.
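
    For example, applying a single-token parser by hand (an illustrative sketch;
    the shape shown matches the doctests elsewhere in this file):

    >>> Parser.token(TrivialTok.NUM).parse(lex_trivial('7')) #doctest: +SKIP
    Ok((([NUM: '7']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))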
    """

    def parse_(self, input: LexStream[TokN]) -> Result[Out, Mapping[Lexeme[TokN], Collection[TokN]]]:
        """
        Run this parser, expecting at most one result which consumes the full input stream

        This is a wrapper around `parse()` with a few features for ease of use, including:
        - expects that at most one successful parse will be produced
        - asserts that the parse consumed the whole input stream
        - aggregates failures into a multi-set of `Lexeme` -> expected tokens

        Normal failures are passed through as an `Err`, but if the results returned by a
        successful parse violate the above conditions, an assertion error will be raised.
        However, by carefully constructing your grammar, you can ensure that this will not
        happen for any input.

        Because of these assertions, the successful return value of this function is
        *just* the output type of this parse, which is much easier to use.

        ### Example

        A parser which parses a single number.  Notice that we also require that it parses
        an EOF.  Without this, it would be unable to consume the entire input stream, and
        thus would fail.

        >>> parse_num = Parser.token(TrivialTok.NUM).seq_ignore_tok(TrivialTok.EOF)
        >>> parse_num.parse_(lex_trivial('1312'))
        Ok([NUM: '1312']@(1, 1-4))
        """
        match self.parse(input):
            case Err(errors):
                # The input is bad
                failure_locations = FSet(lex for (lex, expected) in errors)
                return Err({
                    location: FSet(expected for (lex, expected) in errors if lex == location)
                    for location in failure_locations
                })
            case Ok([result1, result2, *rest] as possible_parses):
                # The grammar is bad
                raise AssertionError("Parse returned multiple possible parses", possible_parses)
            case Ok([(value, [non_empty, *rest] as remainder)]):
                # The grammar is bad
                raise AssertionError("Parse failed to consume the whole input, and left remainder", remainder)
            case Ok([]):
                # The parser code is bad
                raise AssertionError('"Successful" parse returned no possible parses')
            case Ok([(value, [])]):
                return Ok(value)
        # The code in this function is bad
        raise AssertionError('Unreachable')

    @staticmethod
    def epsilon(ret: Out) -> 'Parser[Out, TokN]':
        """
        Parse an empty string, then return a constant

        Always succeeds, and always produces exactly one possibility

        >>> Parser.epsilon(100).parse(lex_trivial("+"))
        Ok(((100, [[ADD: '+']@(1, 1-1), [EOF: '']@(1, 2-2)]),))
        """
        return Parser(lambda s: Ok(((ret, s),)))

    @staticmethod
    def token(t: TokN) -> 'Parser[Lexeme[TokN], TokN]':
        """
        Parser that only accepts a single token, and returns the parsed lexeme

        The argument is the token which this parser should accept.  The returned
        parser succeeds exactly when the next lexeme in the stream is an instance
        of that token.

        >>> parse_num = Parser.token(TrivialTok.NUM)

        >>> parse_num.parse(lex_trivial('3'))
        Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))

        >>> parse_num.parse(lex_trivial('x'))
        Err((([LTR: 'x']@(1, 1-1), NUM),))
        """
        def parse_single(input: LexStream[TokN]) -> ParserResult[Lexeme[TokN], TokN]:
            match input:
                case [lexeme, *rest] if lexeme.token == t:
                    return Ok(((lexeme, rest),))
                case [bad_lexeme, *rest]:
                    return Err(((bad_lexeme, t),))
                case []:
                    raise Exception('Bad grammar! Reached an empty input')
            raise Exception('Unreachable')
        return Parser(parse_single)

    @staticmethod
    def lazy(
        gen: 'Callable[[], Parser[Out, TokN]]',
    ) -> 'Parser[Out, TokN]':
        """
        A stand-in parser which will only be actually computed when called
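
        A sketch of the intended use (the names here are illustrative; the final
        call is skipped rather than asserted):

        >>> parse_later = Parser.lazy(lambda: parse_num)  # parse_num is not yet bound
        >>> parse_num = Parser.token(TrivialTok.NUM)
        >>> parse_later.parse(lex_trivial('3')) #doctest: +SKIP
        Ok((([NUM: '3']@(1, 1-1), [[EOF: '']@(1, 2-2)]),))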
        """
        return Parser(lambda s: gen().parse(s))

    def bind(self, f: 'Callable[[Out], Parser[Out2, TokN]]') -> 'Parser[Out2, TokN]':
        """
        A monadic bind operator: allows the next parser to be generated from the
        output of the one preceding it

        ### Example

        We generate a parser which reads a number then accepts exactly that many plus
        tokens, returning the last one.

        >>> some_n = Parser.token(TrivialTok.NUM)
        >>> n_plus = some_n.bind(lambda prev_result:
        ...     reduce(
        ...         lambda a, p: a.bind(k(p)),
        ...         [
        ...             Parser.token(TrivialTok.ADD)
        ...             for i in range(int(prev_result.matched_string))
        ...         ]
        ...     )
        ... )

        **Sample Run 1**: We parse the string `3+++`.  Since this is a three followed by
        exactly three plus signs, this should parse successfully.  Sure enough, the result
        contains exactly one possibility, where the `3+++` has been consumed, leaving only
        the EOF, and returning the value of the last plus sign.

        >>> n_plus.parse(lex_trivial('3+++'))
        Ok([([ADD: '+']@(1, 4-4), [[EOF: '']@(1, 5-5)])])

        **Sample Run 2**: We parse the string `3++`.  This only has two of the three plus
        signs, so we should expect it to fail.  As expected, it does, correctly
        identifying that it saw an EOF while expecting an ADD.

        >>> n_plus.parse(lex_trivial('3++'))
        Err([([EOF: '']@(1, 4-4), ADD)])
        """
        def handle_results(results: Collection[Tuple[Out, LexStream]]) -> ParserResult[Out2, TokN]:
            successes, errors = partition([
                f(out1).parse(stream)
                for (out1, stream) in results
            ])
            if len(successes):
                return Ok([p for s in successes for p in s])
            else:
                return Err([e for errs in errors for e in errs])
        def inner(input: LexStream) -> ParserResult[Out2, TokN]:
            return self.parse(input) << handle_results
        return Parser(inner)

    def map(self, f: Callable[[Out], B]) -> 'Parser[B, TokN]':
        """
        Transform the output of some parser with a function

        This is a particularly useful method, because it allows converting parsers which
        return lexemes (e.g. `Parser.token()`) into parsers that return other things.

        As an example, here's a parser which parses a number, and returns it as an `int`.

        >>> parse_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))
        >>> parse_num.parse(lex_trivial('3'))
        Ok([(3, [[EOF: '']@(1, 2-2)])])
        """
        return self.bind(c(Parser.epsilon, f)) #type: ignore

    def fapply(self: 'Parser[Callable[[Out1], Out2], TokN]', arg: 'Parser[Out1, TokN]') -> 'Parser[Out2, TokN]':
        """
        Apply the function which this parser returns to the value produced by another parser

        Equivalent to the fapply method of an applicative functor.
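
        An illustrative sketch (skipped rather than asserted): pair two
        consecutive numbers by currying the first into a function.

        >>> pair = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: lambda r: (int(l.matched_string), int(r.matched_string)))\\
        ...     .fapply(Parser.token(TrivialTok.NUM))
        >>> pair.parse(lex_trivial('1 2')) #doctest: +SKIP
        Ok([((1, 2), [[EOF: '']@(1, 4-4)])])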
        """
        return self.bind(p(Parser.map, arg)) #type: ignore

    def fapply_r(self: 'Parser[Out, TokN]', arg: 'Parser[Callable[[Out], Out2], TokN]') -> 'Parser[Out2, TokN]':
        """
        A reversed version of `fapply()`

        Applies the function returned by the argument to the value returned by this.
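
        An illustrative sketch (skipped rather than asserted): parse a number,
        then an ADD which increments it.

        >>> incr = Parser.token(TrivialTok.ADD).map(k(lambda n: n + 1))
        >>> parse_incr = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))\\
        ...     .fapply_r(incr)
        >>> parse_incr.parse(lex_trivial('3+')) #doctest: +SKIP
        Ok([(4, [[EOF: '']@(1, 3-3)])])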
        """
        return self.bind(lambda v: arg.map(lambda f: f(v)))

    def seq_ignore(self, subsequent: 'Parser[Any, TokN]') -> 'Parser[Out, TokN]':
        """
        Parses two things in series, ignoring the output of the second parser

        Example: Parse a number followed by any letter (ignored)

        >>> parse_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))
        >>> parse_numlet = parse_num.seq_ignore(Parser.token(TrivialTok.LTR))
        >>> parse_numlet.parse(lex_trivial('4a'))
        Ok([(4, [[EOF: '']@(1, 3-3)])])
        """
        return self.map(k).fapply(subsequent) #type:ignore

    def seq_ignore_tok(self, subsequent: TokN) -> 'Parser[Out, TokN]':
        """
        A shorthand for calling `seq_ignore()` with `Parser.token()`

        Example: Parse a number followed by any letter (ignored)

        >>> parse_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))
        >>> parse_numlet = parse_num.seq_ignore_tok(TrivialTok.LTR)
        >>> parse_numlet.parse(lex_trivial('4a'))
        Ok([(4, [[EOF: '']@(1, 3-3)])])
        """
        return self.seq_ignore(Parser.token(subsequent))

    def or_(self: 'Parser[Out, TokN]', *parsers: 'Parser[Out, TokN]') -> 'Parser[Out, TokN]':
        """
        Returns a parser which succeeds if this parser or any of the arguments succeed

        **Example:** A parser which parses a letter or a number

        >>> parse_or = Parser.token(TrivialTok.NUM).or_(Parser.token(TrivialTok.LTR))
        >>> parse_or.parse(lex_trivial('a')) #doctest: +ELLIPSIS
        Ok(...)
        >>> parse_or.parse(lex_trivial('1')) #doctest: +ELLIPSIS
        Ok(...)

        Notice that this can produce multiple successes.  A simple example would be a
        parser which parses either a single number or an empty string (epsilon).  When
        faced with an input stream starting with a number, it could either parse that
        number, returning the rest of the input stream, or parse the empty string,
        returning the input unchanged.

        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))\\
        ...     .or_(Parser.epsilon(-1))
        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
        Ok([(3, [[EOF: '']@(1, 2-2)]),
            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])

        Of course, this can also produce multiple failures:

        >>> parse_or.parse(lex_trivial('+'))
        Err([([ADD: '+']@(1, 1-1), NUM), ([ADD: '+']@(1, 1-1), LTR)])
        """
        all_parsers = (self, *parsers)
        def inner(input: LexStream) -> ParserResult[Out, TokN]:
            successes, failures = partition([p.parse(input) for p in all_parsers])
            if len(successes):
                return Ok([successful_path for success in successes for successful_path in success])
            else:
                return Err([expectation for failure in failures for expectation in failure])
        return Parser(inner)

    def opt(self, fallback: Out) -> 'Parser[Out, TokN]':
        """
        Parse one or zero instances of this parser, returning the fallback value in the
        zero case

        We can use this to write a simpler version of parse_maybe_num from the `or_()`
        example:

        >>> parse_maybe_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))\\
        ...     .opt(-1)
        >>> parse_maybe_num.parse(lex_trivial('3')) #doctest: +NORMALIZE_WHITESPACE
        Ok([(3, [[EOF: '']@(1, 2-2)]),
            (-1, [[NUM: '3']@(1, 1-1), [EOF: '']@(1, 2-2)])])
        """
        return self.or_(Parser.epsilon(fallback))

    def many(self, combine: Callable[[Out], Callable[[Out2], Out2]], default: Out2) -> 'Parser[Out2, TokN]':
        """
        Create a new parser which accepts any number of instances of this parser

        The combine argument is a function which joins the result of this parser with the
        result of parsing the zero or more identical parsers to its right.  Think of this
        like a reduce.

        As an example, here's a parser which parses any number of numbers, and sums them
        all together.

        We start with the single number parser from the `map()` example.

        >>> parse_num = Parser.token(TrivialTok.NUM)\\
        ...     .map(lambda l: int(l.matched_string))

        Then we call `many()` on it.  We also add the EOF parser to the end to force it to
        parse the whole input.

        >>> parse_nums = parse_num.many(lambda num: lambda sum: num + sum, 0)\\
        ...     .seq_ignore_tok(TrivialTok.EOF)

        **Sample Run 1:** Sum of the numbers 1, 2, and 3.  This produces the operation
        (1 + (2 + (3 + 0)))

        >>> parse_nums.parse(lex_trivial('1 2 3'))
        Ok([(6, [])])

        **Sample Run 2:** If attempting to sum no numbers, we get back the default
        argument, zero in this case.

        >>> parse_nums.parse(lex_trivial(''))
        Ok([(0, [])])
        """
        return (self.map(combine)
                .fapply(Parser.lazy(p(self.many, combine, default))) #type: ignore
                .opt(default))

if __name__ == '__main__':
    import doctest
    doctest.testmod()