Souped up the lexer to track line & col numbers

Emi Simpson 2023-03-04 17:04:23 -05:00
parent 61ee996c7a
commit 3afed0c2e0
Signed by: Emi
GPG key ID: A12F2C2FFDC3D847

lex.py (49 changes)

@@ -2,22 +2,32 @@ from emis_funky_funktions import *
 
 from dataclasses import dataclass
 from enum import auto, IntEnum
-from operator import is_not
+from operator import eq, is_not
 from re import Pattern
 
 from typing import Collection, Tuple, List, NewType
 
-def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
+@dataclass(frozen=True)
+class Lexeme(Generic[B]):
+    token: B
+    matched_string: str
+    line: int
+    col_start: int
+    col_end: int
+
+    def __repr__(self):
+        return f'[{repr(self.token)}: {repr(self.matched_string)}]@({self.line}, {self.col_start}-{self.col_end})'
+
+def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
     """
     Attempt to recognize a single token against a full input string
 
     If successful, returns the token provided as an argument, the part of the input which
     matched, and the rest of the input. Otherwise, returns `None`
 
-    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
-    Some((('NUMBER', '123'), 'abc'))
+    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc", 1, 1)
+    Some((['NUMBER': '123']@(1, 1-4), 'abc'))
 
-    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
+    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123", 1, 1) is None
     True
     """
     match regex.match(input):
@@ -25,10 +35,14 @@ def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, s
             return None
         case match:
             assert match is not None
-            return Some(((tok, match.group()), input[match.end():]))
+            return Some((Lexeme(tok, match.group(), line_no, col_no, col_no + match.end()), input[match.end():]))
 
-def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] = []) -> Result[List[Tuple[A, str]], str]:
+def tokenize(
+    lex_table: Collection[Tuple[Pattern[str], A]],
+    drop_tokens: Collection[A],
+    input: str
+) -> Result[List[Lexeme[A]], str]:
     """
     Attempt to lex an entire input string.
@@ -41,30 +55,39 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
     is returned containing the section of the input that failed to match.
 
     >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
-    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
-        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
-        (CloseP, ')'), (Identifier, 'person')])
+    Ok([[ClausesSection: 'Clauses:']@(1, 1-9), [Newline: '\\n']@(1, 10-11),
+        [Negate: '!']@(2, 1-2), [Identifier: 'man']@(2, 2-5), [OpenP: '(']@(2, 5-6),
+        [Identifier: 'x5']@(2, 6-8), [CloseP: ')']@(2, 8-9),
+        [Identifier: 'person']@(2, 10-16)])
 
     >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
    Err('🍆 !man(x5)')
     """
-    if len(input):
-        try:
-            lexeme, rest_input = next(
-                unwrap_opt(maybe_lexeme)
-                for maybe_lexeme in (
-                    try_lex1(regex, tok, input)
-                    for (regex, tok) in lex_table
-                )
-                if isinstance(maybe_lexeme, Some)
-            )
-        except StopIteration:
-            return Err(input)
-        if lexeme[0] not in drop_tokens:
-            prefix.append(lexeme)
-        return tokenize(lex_table, drop_tokens, rest_input, prefix)
-    else:
-        return Ok(prefix)
+    def inner(input: str, line_no: int, col_no: int, prefix: List[Lexeme[A]]) -> Result[List[Lexeme[A]], str]:
+        if len(input):
+            try:
+                lexeme, rest_input = next(
+                    unwrap_opt(maybe_lexeme)
+                    for maybe_lexeme in (
+                        try_lex1(regex, tok, input, line_no, col_no)
+                        for (regex, tok) in lex_table
+                    )
+                    if isinstance(maybe_lexeme, Some)
+                )
+            except StopIteration:
+                return Err(input)
+            if lexeme.token not in drop_tokens:
+                prefix.append(lexeme)
+            newline_count = len(list(filter(p(eq, '\n'), lexeme.matched_string)))
+            new_col_no = (
+                len(lexeme.matched_string) - lexeme.matched_string.rfind('\n')
+                if newline_count else
+                col_no + len(lexeme.matched_string)
+            )
+            return inner(rest_input, line_no + newline_count, new_col_no, prefix)
+        else:
+            return Ok(prefix)
+    return inner(input, 1, 1, [])
 
 if __name__ == '__main__':
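
For reference, the line/column bookkeeping that the new `inner` helper performs after consuming each lexeme can be sketched in isolation roughly as follows. This is a minimal standalone sketch, not part of the commit; `advance_position` is a hypothetical name, and the logic mirrors the `newline_count` / `new_col_no` computation above.

# Standalone sketch of the position bookkeeping added in this commit.
# `advance_position` is a hypothetical helper name, not part of lex.py.
def advance_position(line_no: int, col_no: int, matched: str) -> tuple[int, int]:
    newline_count = matched.count('\n')
    if newline_count:
        # The match spans one or more newlines: the column count restarts
        # after the last newline contained in the matched text.
        return line_no + newline_count, len(matched) - matched.rfind('\n')
    # No newline: stay on the same line and advance the column by the
    # length of the matched text.
    return line_no, col_no + len(matched)

# E.g. after matching the Newline token at (1, 10) in 'Clauses: \n!man(x5) person',
# the next token ('!') starts at line 2, column 1, as in the doctest above:
assert advance_position(1, 10, '\n') == (2, 1)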