from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from operator import eq, is_not
from re import Pattern
from typing import Collection, Tuple, List, NewType


@dataclass(frozen=True)
class Lexeme(Generic[B]):
    """
    A single token recognized in the input, together with where it was found.

    Columns are 1-based and inclusive on both ends, so an eight character match
    starting in column 1 spans columns 1-8.
    """
    # The token kind assigned to this piece of the input
    token: B
    # The exact text that was matched
    matched_string: str
    # The line on which the match begins
    line: int
    # The column of the first matched character
    col_start: int
    # Computed as `col_start + len(matched_string) - 1` by `try_lex1`
    col_end: int

    def __repr__(self):
        return f'[{repr(self.token)}: {repr(self.matched_string)}]@({self.line}, {self.col_start}-{self.col_end})'


def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
    r"""
    Attempt to recognize a single token against a full input string

    The pattern is anchored at the very start of `input`.  If successful, returns
    a `Lexeme` carrying the token provided as an argument, the part of the input
    which matched, and its position, paired with the rest of the input.
    Otherwise, returns `None`

    `line_no` and `col_no` describe where `input` begins; they are only used to
    fill in the position of the resulting `Lexeme`.

    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc", 1, 1)
    Some((['NUMBER': '123']@(1, 1-3), 'abc'))
    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123", 1, 1) is None
    True
    """
    found = regex.match(input)
    if found is None:
        return None
    consumed = found.end()
    # `col_end` is inclusive, hence the `- 1`
    lexeme = Lexeme(tok, found.group(), line_no, col_no, col_no + consumed - 1)
    return Some((lexeme, input[consumed:]))


def tokenize(
    lex_table: Collection[Tuple[Pattern[str], A]],
    drop_tokens: Collection[A],
    eof_token: A,
    input: str
) -> Result[List[Lexeme[A]], str]:
    """
    Attempt to lex an entire input string.

    Will be lexed into `Tok`s according to `lex_table`.  Tokens earlier on in the list
    should be regarded as higher priority, even if a match lower on the list also matches.
    Any tokens in the `drop_tokens` list will be dropped from the output.

    A lexeme carrying `eof_token` and an empty matched string is always appended after
    the last real lexeme.

    If the lexer is unable to match the input string with any of the tokens, then an `Err`
    is returned containing the section of the input that failed to match.  A pattern
    which matches the empty string makes no progress, so such a match is treated the same
    as no match at all.

    The input is processed iteratively, so the number of tokens is not limited by the
    interpreter's recursion depth.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], Tok.Eof, 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([[ClausesSection: 'Clauses:']@(1, 1-8), [Newline: '\\n']@(1, 10-10),
        [Negate: '!']@(2, 1-1), [Identifier: 'man']@(2, 2-4), [OpenP: '(']@(2, 5-5),
        [Identifier: 'x5']@(2, 6-7), [CloseP: ')']@(2, 8-8), [Identifier: 'person']@(2, 10-15),
        [Eof: '']@(2, 16-16)])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], Tok.Eof,'Clauses: \\nšŸ† !man(x5)')
    Err('šŸ† !man(x5)')
    """
    lexemes: List[Lexeme[A]] = []
    remaining = input
    line_no = 1
    col_no = 1

    while remaining:
        # Take the first (highest priority) rule that consumes at least one character
        for regex, tok in lex_table:
            attempt = try_lex1(regex, tok, remaining, line_no, col_no)
            if not isinstance(attempt, Some):
                continue
            lexeme, rest_input = unwrap_opt(attempt)
            if lexeme.matched_string:
                break
            # A zero-length match would leave `remaining` untouched and loop
            # forever, so keep looking for a rule that makes progress
        else:
            return Err(remaining)

        if lexeme.token not in drop_tokens:
            lexemes.append(lexeme)

        # Advance the position bookkeeping past the text just consumed
        text = lexeme.matched_string
        newline_count = text.count('\n')
        if newline_count:
            # Columns restart at 1 right after the last newline in the match
            col_no = len(text) - text.rfind('\n')
        else:
            col_no += len(text)
        line_no += newline_count
        remaining = rest_input

    lexemes.append(Lexeme(eof_token, '', line_no, col_no, col_no))
    return Ok(lexemes)


if __name__ == '__main__':
    # print(tokenize(open('sample.cnf').read()))
    # These imports become module globals, which is where the doctests above
    # look up `compile`, `Tok` and `LEX_TABLE`
    import doctest
    from re import compile
    from grammar import Tok, LEX_TABLE
    doctest.testmod()