from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from operator import is_not
from re import compile, Pattern
from typing import Collection, List, NewType, Optional, Tuple


class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_


LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Tokens earlier in the list take priority, even if a pattern lower in the list
also matches.  Nearly every input string should be matched by at least one
pattern; anything that is not (e.g. emoji) causes `tokenize` to return an
`Err`.
"""


def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
    r"""
    Attempt to recognize a single token at the start of a full input string

    If successful, returns the token provided as an argument, the part of the
    input which matched, and the rest of the input.  Otherwise, returns `None`.

    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
    Some((('NUMBER', '123'), 'abc'))

    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
    True
    """
    match regex.match(input):
        case None:
            return None
        case match:
            assert match is not None
            return Some(((tok, match.group()), input[match.end():]))


def tokenize(
        lex_table: Collection[Tuple[Pattern[str], A]],
        drop_tokens: Collection[A],
        input: str,
        prefix: Optional[List[Tuple[A, str]]] = None,
) -> Result[List[Tuple[A, str]], str]:
    """
    Attempt to lex an entire input string

    The input is lexed into tokens according to `lex_table`.  Tokens earlier
    in the list take priority, even if a pattern lower in the list also
    matches.  Any tokens in the `drop_tokens` list are dropped from the
    output.

    If the lexer is unable to match some part of the input against any of the
    patterns, an `Err` is returned containing the remainder of the input that
    failed to match.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'), (CloseP, ')'),
        (Identifier, 'person')])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\nšŸ† !man(x5)')
    Err('šŸ† !man(x5)')
    """
    # Allocate a fresh accumulator per top-level call rather than using a
    # mutable default argument, which would be shared across calls.
    if prefix is None:
        prefix = []
    if len(input):
        try:
            # Take the first (highest-priority) pattern that matches the
            # start of the remaining input.
            lexeme, rest_input = next(
                unwrap_opt(maybe_lexeme)
                for maybe_lexeme in (
                    try_lex1(regex, tok, input)
                    for (regex, tok) in lex_table
                )
                if isinstance(maybe_lexeme, Some)
            )
        except StopIteration:
            return Err(input)
        if lexeme[0] not in drop_tokens:
            prefix.append(lexeme)
        return tokenize(lex_table, drop_tokens, rest_input, prefix)
    else:
        return Ok(prefix)


if __name__ == '__main__':
    # print(tokenize(LEX_TABLE, [Tok.Whitespace], open('sample.cnf').read()))
    import doctest
    doctest.testmod()
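

# --- Usage sketch ---------------------------------------------------------
# A minimal, hedged example of driving the lexer end to end.  It uses only
# names defined above (`LEX_TABLE`, `Tok`, `tokenize`); the sample text and
# the `_demo` helper are invented for illustration, and `sample.cnf`
# (referenced in the commented-out line above) is assumed to follow the same
# section layout.

def _demo() -> None:
    sample_input = (
        "Predicates: man mortal\n"
        "Variables: x\n"
        "Clauses: !man(x), mortal(x)\n"
    )
    # Dropping whitespace but keeping newlines preserves the section
    # boundaries for a later parsing pass; the call prints either an
    # Ok([...]) token list or an Err(...) holding the unmatched suffix.
    print(tokenize(LEX_TABLE, [Tok.Whitespace], sample_input))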