Genericize the lexer
parent 03ff0d800e
commit 00043d27dd
lex.py (new file, 115 lines added)
@@ -0,0 +1,115 @@
from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from operator import is_not
from re import compile, Pattern

from typing import Collection, Tuple, List, NewType

class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_

LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches. All unicode strings should be matched by at least one token.
"""

def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
    r"""
    Attempt to recognize a single token against a full input string

    If successful, returns the token provided as an argument, the part of the input which
    matched, and the rest of the input. Otherwise, returns `None`

    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
    Some((('NUMBER', '123'), 'abc'))

    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
    True
    """
    match regex.match(input):
        case None:
            return None
        case match:
            assert match is not None
            return Some(((tok, match.group()), input[match.end():]))


def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] | None = None) -> Result[List[Tuple[A, str]], str]:
    """
    Attempt to lex an entire input string.

    Will be lexed into tokens according to `lex_table`. Tokens earlier on in the list
    should be regarded as higher priority, even if a match lower on the list also matches.

    Any tokens in the `drop_tokens` list will be dropped from the output.

    If the lexer is unable to match the input string with any of the tokens, then an `Err`
    is returned containing the section of the input that failed to match.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
        (CloseP, ')'), (Identifier, 'person')])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
    Err('🍆 !man(x5)')
    """
    if prefix is None:
        # Use a fresh accumulator rather than a mutable default argument, which
        # would leak lexemes between separate calls to `tokenize`
        prefix = []
    if len(input):
        try:
            # Try every pattern in priority order and take the first that matches
            lexeme, rest_input = next(
                unwrap_opt(maybe_lexeme)
                for maybe_lexeme in (
                    try_lex1(regex, tok, input)
                    for (regex, tok) in lex_table
                )
                if isinstance(maybe_lexeme, Some)
            )
        except StopIteration:
            # No pattern matched the remaining input
            return Err(input)
        if lexeme[0] not in drop_tokens:
            prefix.append(lexeme)
        # Recurse on the rest of the input, one level of recursion per lexeme
        return tokenize(lex_table, drop_tokens, rest_input, prefix)
    else:
        return Ok(prefix)


if __name__ == '__main__':
    # print(tokenize(LEX_TABLE, [Tok.Whitespace], open('sample.cnf').read()))
    import doctest
    doctest.testmod()
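
A minimal sketch of what the genericized lexer enables (hypothetical, not part of this commit): `tokenize` no longer assumes `Tok`, so the same machinery can drive an unrelated grammar with its own token type and lex table. The `MathTok` enum and `MATH_LEX_TABLE` names below are made up for illustration.

from enum import IntEnum, auto
from re import compile

from lex import tokenize

class MathTok(IntEnum):
    Number = auto()
    Plus = auto()
    Space = auto()

    def __repr__(self):
        return self._name_

MATH_LEX_TABLE = [
    (compile(r"\d+"), MathTok.Number),
    (compile(r"\+"), MathTok.Plus),
    (compile(r"[ \t]+"), MathTok.Space),
]

# Dropping MathTok.Space mirrors how Tok.Whitespace is dropped above; the call
# should yield Ok([(Number, '1'), (Plus, '+'), (Number, '23')])
tokenize(MATH_LEX_TABLE, [MathTok.Space], "1 + 23")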