From 3afed0c2e08b466f139adefd4337a4a89a41a362 Mon Sep 17 00:00:00 2001
From: Emi Simpson
Date: Sat, 4 Mar 2023 17:04:23 -0500
Subject: [PATCH] Souped up the lexer to track line & col numbers

---
 lex.py | 73 ++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 48 insertions(+), 25 deletions(-)

diff --git a/lex.py b/lex.py
index 7808f7f..4051c5a 100644
--- a/lex.py
+++ b/lex.py
@@ -2,22 +2,32 @@ from emis_funky_funktions import *
 
 from dataclasses import dataclass
 from enum import auto, IntEnum
-from operator import is_not
+from operator import eq, is_not
 from re import Pattern
 
 from typing import Collection, Tuple, List, NewType
 
-def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
+@dataclass(frozen=True)
+class Lexeme(Generic[B]):
+	token: B
+	matched_string: str
+	line: int
+	col_start: int
+	col_end: int
+	def __repr__(self):
+		return f'[{repr(self.token)}: {repr(self.matched_string)}]@({self.line}, {self.col_start}-{self.col_end})'
+
+def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
 	"""
 	Attempt to recognize a single token against a full input string
 
 	If successful, returns the token provided as an argument, the part of the input which
 	matched, and the rest of the input.  Otherwise, returns `None`
 
-	>>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
-	Some((('NUMBER', '123'), 'abc'))
+	>>> try_lex1(compile(r'\d+'), "NUMBER", "123abc", 1, 1)
+	Some((['NUMBER': '123']@(1, 1-4), 'abc'))
 
-	>>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
+	>>> try_lex1(compile(r'\d+'), "NUMBER", "abc123", 1, 1) is None
 	True
 	"""
 	match regex.match(input):
@@ -25,10 +35,14 @@ def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, s
 			return None
 		case match:
 			assert match is not None
-			return Some(((tok, match.group()), input[match.end():]))
+			return Some((Lexeme(tok, match.group(), line_no, col_no, col_no + match.end()), input[match.end():]))
 
 
-def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] = []) -> Result[List[Tuple[A, str]], str]:
+def tokenize(
+	lex_table: Collection[Tuple[Pattern[str], A]],
+	drop_tokens: Collection[A],
+	input: str
+) -> Result[List[Lexeme[A]], str]:
 	"""
 	Attempt to lex an entire input string.
 
@@ -41,30 +55,39 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
 	is returned containing the section of the input that failed to match.
 
 	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
-	Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
-	(Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
-	(CloseP, ')'), (Identifier, 'person')])
+	Ok([[ClausesSection: 'Clauses:']@(1, 1-9), [Newline: '\\n']@(1, 10-11),
+	[Negate: '!']@(2, 1-2), [Identifier: 'man']@(2, 2-5), [OpenP: '(']@(2, 5-6),
+	[Identifier: 'x5']@(2, 6-8), [CloseP: ')']@(2, 8-9),
+	[Identifier: 'person']@(2, 10-16)])
 
 	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\nšŸ† !man(x5)')
 	Err('šŸ† !man(x5)')
 	"""
-	if len(input):
-		try:
-			lexeme, rest_input = next(
-				unwrap_opt(maybe_lexeme)
-				for maybe_lexeme in (
-					try_lex1(regex, tok, input)
-					for (regex, tok) in lex_table
+	def inner(input: str, line_no: int, col_no: int, prefix: List[Lexeme[A]]) -> Result[List[Lexeme[A]], str]:
+		if len(input):
+			try:
+				lexeme, rest_input = next(
+					unwrap_opt(maybe_lexeme)
+					for maybe_lexeme in (
+						try_lex1(regex, tok, input, line_no, col_no)
+						for (regex, tok) in lex_table
+					)
+					if isinstance(maybe_lexeme, Some)
 				)
-				if isinstance(maybe_lexeme, Some)
+			except StopIteration:
+				return Err(input)
+			if lexeme.token not in drop_tokens:
+				prefix.append(lexeme)
+			newline_count = len(list(filter(p(eq, '\n'), lexeme.matched_string)))
+			new_col_no = (
+				len(lexeme.matched_string) - lexeme.matched_string.rfind('\n')
+				if newline_count else
+				col_no + len(lexeme.matched_string)
 			)
-		except StopIteration:
-			return Err(input)
-		if lexeme[0] not in drop_tokens:
-			prefix.append(lexeme)
-		return tokenize(lex_table, drop_tokens, rest_input, prefix)
-	else:
-		return Ok(prefix)
+			return inner(rest_input, line_no+newline_count, new_col_no, prefix)
+		else:
+			return Ok(prefix)
+	return inner(input, 1, 1, [])
 
 
 if __name__ == '__main__':
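
The position bookkeeping the patch adds is simple: each matched string bumps the line counter by the number of newlines it contains, and the column either advances by the length of the match or, when a newline was consumed, restarts just past the text following the last '\n'. The sketch below is a self-contained illustration of that same technique, not the patched lex.py itself: it uses a hypothetical MINI_LEX_TABLE with plain-string token kinds in place of the repo's Tok enum and LEX_TABLE, and plain tuples in place of the Option/Result types from emis_funky_funktions.

import re

# Hypothetical mini lex table standing in for lex.py's LEX_TABLE / Tok enum;
# the token kinds are plain strings purely for illustration.
MINI_LEX_TABLE = [
    (re.compile(r'[ \t\n]+'), 'Whitespace'),
    (re.compile(r'[a-z][a-z0-9]*'), 'Identifier'),
    (re.compile(r'\('), 'OpenP'),
    (re.compile(r'\)'), 'CloseP'),
    (re.compile(r'!'), 'Negate'),
]

def tokenize_with_positions(source, drop=('Whitespace',)):
    """Yield (token, text, line, col_start, col_end) with 1-based positions."""
    line, col = 1, 1
    while source:
        # Try each rule against the head of the remaining input.
        for regex, tok in MINI_LEX_TABLE:
            match = regex.match(source)
            if match:
                break
        else:
            raise SyntaxError(f'no rule matches at line {line}, column {col}: {source!r}')
        text = match.group()
        if tok not in drop:
            # col_end is exclusive, mirroring the patch's col_no + match.end().
            yield tok, text, line, col, col + match.end()
        # Advance the counters the same way the patch does: count newlines in
        # the matched text, and restart the column after the last '\n'.
        newlines = text.count('\n')
        line += newlines
        col = len(text) - text.rfind('\n') if newlines else col + len(text)
        source = source[match.end():]

if __name__ == '__main__':
    for lexeme in tokenize_with_positions('!man(x5)\n  person'):
        print(lexeme)
    # ('Negate', '!', 1, 1, 2)
    # ('Identifier', 'man', 1, 2, 5)
    # ('OpenP', '(', 1, 5, 6)
    # ('Identifier', 'x5', 1, 6, 8)
    # ('CloseP', ')', 1, 8, 9)
    # ('Identifier', 'person', 2, 3, 9)

As in the patch, dropped tokens such as whitespace still advance the line and column counters even though they never appear in the output.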