Souped up the lexer to track line & col numbers
This commit is contained in:
parent
61ee996c7a
commit
3afed0c2e0
49
lex.py
49
lex.py
|
@ -2,22 +2,32 @@ from emis_funky_funktions import *
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import auto, IntEnum
|
from enum import auto, IntEnum
|
||||||
from operator import is_not
|
from operator import eq, is_not
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
|
|
||||||
from typing import Collection, Tuple, List, NewType
|
from typing import Collection, Tuple, List, NewType
|
||||||
|
|
||||||
def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
|
@dataclass(frozen=True)
class Lexeme(Generic[B]):
    """
    One token recognized in the input, together with the text it matched and its position

    Instances are immutable.  The representation has the shape
    `[<token>: <matched text>]@(<line>, <col_start>-<col_end>)`.
    """
    # The token this lexeme was recognized as
    token: B
    # The exact text from the input which matched
    matched_string: str
    # Line on which the lexeme was found
    line: int
    # Column at which the matched text begins
    col_start: int
    # Column at which the matched text ends; `try_lex1` fills this in as
    # `col_start` plus the length of the match, i.e. one past the last character
    col_end: int

    def __repr__(self):
        position = f'({self.line}, {self.col_start}-{self.col_end})'
        return f'[{self.token!r}: {self.matched_string!r}]@{position}'
|
||||||
|
|
||||||
|
def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
    """
    Attempt to recognize a single token at the start of an input string

    If `regex` matches at the beginning of `input`, returns `Some` of a pair holding a
    `Lexeme` and the rest of the input.  The lexeme carries `tok`, the text which matched,
    `line_no`, and a column range running from `col_no` to `col_no` plus the length of the
    match.  Otherwise, returns `None`

    >>> try_lex1(compile(r'\\d+'), "NUMBER", "123abc", 1, 1)
    Some((['NUMBER': '123']@(1, 1-4), 'abc'))

    >>> try_lex1(compile(r'\\d+'), "NUMBER", "abc123", 1, 1) is None
    True
    """
    hit = regex.match(input)
    if hit is None:
        return None
    # `Pattern.match` anchors at index 0, so `end()` is also the length of the match
    consumed = hit.end()
    found = Lexeme(tok, hit.group(), line_no, col_no, col_no + consumed)
    return Some((found, input[consumed:]))
|
||||||
|
|
||||||
|
|
||||||
def tokenize(
    lex_table: Collection[Tuple[Pattern[str], A]],
    drop_tokens: Collection[A],
    input: str
) -> Result[List[Lexeme[A]], str]:
    """
    Attempt to lex an entire input string.

    At each position the rules of `lex_table` are tried in iteration order, and the first
    rule which matches at least one character produces the next lexeme.  Lexemes whose
    token appears in `drop_tokens` still consume input and still advance the line and
    column counters, but are left out of the result.

    Line and column numbers both start at 1.  After a lexeme containing newlines, the
    line number advances by the number of newlines and the column restarts just past the
    last one.

    If the whole input is consumed, an `Ok` holding the list of lexemes is returned.  If
    at some point no rule makes progress, an `Err` is returned containing the section of
    the input that failed to match.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([[ClausesSection: 'Clauses:']@(1, 1-9), [Newline: '\\n']@(1, 10-11),
        [Negate: '!']@(2, 1-2), [Identifier: 'man']@(2, 2-5), [OpenP: '(']@(2, 5-6),
        [Identifier: 'x5']@(2, 6-8), [CloseP: ')']@(2, 8-9),
        [Identifier: 'person']@(2, 10-16)])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
    Err('🍆 !man(x5)')
    """
    lexemes: List[Lexeme[A]] = []
    remaining = input
    line_no = 1
    col_no = 1
    # Iterate rather than recurse once per token, so that long inputs cannot
    # exhaust the interpreter's recursion limit.
    while remaining:
        for regex, tok in lex_table:
            candidate = try_lex1(regex, tok, remaining, line_no, col_no)
            if not isinstance(candidate, Some):
                continue
            lexeme, rest_input = unwrap_opt(candidate)
            # A zero-length match makes no progress; accepting it would loop forever,
            # so keep looking for a rule that actually consumes something.
            if lexeme.matched_string:
                break
        else:
            # No rule consumed anything at this position
            return Err(remaining)
        if lexeme.token not in drop_tokens:
            lexemes.append(lexeme)
        text = lexeme.matched_string
        newline_count = text.count('\n')
        if newline_count:
            # Column restarts at 1 immediately after the last newline in the match
            col_no = len(text) - text.rfind('\n')
        else:
            col_no += len(text)
        line_no += newline_count
        remaining = rest_input
    return Ok(lexemes)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in a new issue