From 00043d27dde848058f1d0fab5da0aa0d513a843a Mon Sep 17 00:00:00 2001
From: Emi Simpson
Date: Sat, 4 Mar 2023 12:16:00 -0500
Subject: [PATCH] Genericize the lexer

---
 lex.py | 115 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 lex.py

diff --git a/lex.py b/lex.py
new file mode 100644
index 0000000..255d68b
--- /dev/null
+++ b/lex.py
@@ -0,0 +1,115 @@
+from emis_funky_funktions import *
+
+from dataclasses import dataclass
+from enum import auto, IntEnum
+from operator import is_not
+from re import compile, Pattern
+
+from typing import Collection, Tuple, List, NewType
+
+class Tok(IntEnum):
+    """
+    All possible tokens used in the grammar
+    """
+    Newline = auto()
+    Whitespace = auto()
+    PredicateSection = auto()
+    VariablesSection = auto()
+    ConstantsSection = auto()
+    FunctionsSection = auto()
+    ClausesSection = auto()
+    Negate = auto()
+    OpenP = auto()
+    CloseP = auto()
+    Comma = auto()
+    Identifier = auto()
+    Eof = auto()
+
+    def __repr__(self):
+        return self._name_
+
+LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
+    (compile(r"\n"), Tok.Newline),
+    (compile(r"[ \t]+"), Tok.Whitespace),
+    (compile("Predicates:"), Tok.PredicateSection),
+    (compile("Variables:"), Tok.VariablesSection),
+    (compile("Constants:"), Tok.ConstantsSection),
+    (compile("Functions:"), Tok.FunctionsSection),
+    (compile("Clauses:"), Tok.ClausesSection),
+    (compile("!"), Tok.Negate),
+    (compile(r"\("), Tok.OpenP),
+    (compile(r"\)"), Tok.CloseP),
+    (compile(","), Tok.Comma),
+    (compile(r"\w+"), Tok.Identifier),
+]
+"""
+A mapping of regexes to the tokens they identify
+
+Tokens earlier on in the list should be regarded as higher priority, even if a match lower
+on the list also matches. Ideally, every input should begin with a match for at least one token.
+"""
+
+def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
+    r"""
+    Attempt to recognize a single token against a full input string
+
+    If successful, returns the token provided as an argument, the part of the input which
+    matched, and the rest of the input. Otherwise, returns `None`
+
+    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
+    Some((('NUMBER', '123'), 'abc'))
+
+    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
+    True
+    """
+    match regex.match(input):
+        case None:
+            return None
+        case match:
+            assert match is not None
+            return Some(((tok, match.group()), input[match.end():]))
+
+
+def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] = []) -> Result[List[Tuple[A, str]], str]:
+    """
+    Attempt to lex an entire input string.
+
+    The input will be lexed into tokens according to `lex_table`. Tokens earlier on in the
+    list should be regarded as higher priority, even if a match lower on the list also matches.
+
+    Any tokens in the `drop_tokens` list will be dropped from the output.
+
+    If the lexer is unable to match the input string with any of the tokens, then an `Err`
+    is returned containing the section of the input that failed to match.
+
+    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
+    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
+        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
+        (CloseP, ')'), (Identifier, 'person')])
+
+    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\nšŸ† !man(x5)')
+    Err('šŸ† !man(x5)')
+    """
+    if len(input):
+        try:
+            lexeme, rest_input = next(
+                unwrap_opt(maybe_lexeme)
+                for maybe_lexeme in (
+                    try_lex1(regex, tok, input)
+                    for (regex, tok) in lex_table
+                )
+                if isinstance(maybe_lexeme, Some)
+            )
+        except StopIteration:
+            return Err(input)
+        if lexeme[0] not in drop_tokens:
+            prefix.append(lexeme)
+        return tokenize(lex_table, drop_tokens, rest_input, prefix)
+    else:
+        return Ok(prefix)
+
+
+if __name__ == '__main__':
+    # print(tokenize(LEX_TABLE, [Tok.Whitespace], open('sample.cnf').read()))
+    import doctest
+    doctest.testmod()
\ No newline at end of file
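
As a rough usage sketch (not part of the patch), the genericized `tokenize` can be pointed at a completely different token set. The `CalcTok` enum and `CALC_LEX_TABLE` below are hypothetical names invented for illustration; the example assumes `lex.py` is importable as a module and that `emis_funky_funktions` exposes the same `Ok`/`Err`/`Some` result types seen in the doctests above.

    # calc_lex_example.py -- hypothetical usage sketch, not part of this patch
    from enum import auto, IntEnum
    from re import compile

    from lex import tokenize  # the module added by this patch

    class CalcTok(IntEnum):
        """Tokens for a tiny calculator grammar (illustrative only)"""
        Whitespace = auto()
        Number = auto()
        Plus = auto()

        def __repr__(self):
            return self._name_

    CALC_LEX_TABLE = [
        (compile(r"[ \t]+"), CalcTok.Whitespace),
        (compile(r"\d+"), CalcTok.Number),
        (compile(r"\+"), CalcTok.Plus),
    ]

    # Whitespace is passed as a drop token, mirroring the Tok.Whitespace example
    # in the doctests above, so it never appears in the output list.
    print(tokenize(CALC_LEX_TABLE, [CalcTok.Whitespace], "1 + 23"))
    # Expected: Ok([(Number, '1'), (Plus, '+'), (Number, '23')])

Note that repeated top-level calls should pass an explicit `prefix=[]`, since the default value of `prefix` as written is a single shared list that accumulates tokens across calls.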