Souped up the lexer to track line & col numbers

2023-03-04 17:04:23 -05:00 · 2023-03-04 17:04:23 -05:00 · 3afed0c2e0
parent 61ee996c7a
commit 3afed0c2e0
1 changed files with 48 additions and 25 deletions
--- a/lex.py
+++ b/lex.py
@@ -2,22 +2,32 @@ from emis_funky_funktions import *

 from dataclasses import dataclass
 from enum import auto, IntEnum
-from operator import is_not
+from operator import eq, is_not
 from re import Pattern

 from typing import Collection, Tuple, List, NewType

-def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
+@dataclass(frozen=True)
+class Lexeme(Generic[B]):
+	token: B
+	matched_string: str
+	line: int
+	col_start: int
+	col_end: int
+	def __repr__(self):
+		return f'[{repr(self.token)}: {repr(self.matched_string)}]@({self.line}, {self.col_start}-{self.col_end})'
+
+def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
 	"""
 	Attempt to recognize a single token against a full input string

 	If successful, returns the token provided as an argument, the part of the input which
 	matched, and the rest of the input.  Otherwise, returns `None`

-	>>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
-	Some((('NUMBER', '123'), 'abc'))
+	>>> try_lex1(compile(r'\d+'), "NUMBER", "123abc", 1, 1)
+	Some((['NUMBER': '123']@(1, 1-4), 'abc'))

-	>>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
+	>>> try_lex1(compile(r'\d+'), "NUMBER", "abc123", 1, 1) is None
 	True
 	"""
 	match regex.match(input):
@@ -25,10 +35,14 @@ def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, s
 			return None
 		case match:
 			assert match is not None
-			return Some(((tok, match.group()), input[match.end():]))
+			return Some((Lexeme(tok, match.group(), line_no, col_no, col_no + match.end()), input[match.end():]))


-def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] = []) -> Result[List[Tuple[A, str]], str]:
+def tokenize(
+	lex_table: Collection[Tuple[Pattern[str], A]],
+	drop_tokens: Collection[A],
+	input: str
+) -> Result[List[Lexeme[A]], str]:
 	"""
 	Attempt to lex an entire input string.

@@ -41,30 +55,39 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
 	is returned containing the section of the input that failed to match.

 	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
-	Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
-	    (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
-	    (CloseP, ')'), (Identifier, 'person')])
+	Ok([[ClausesSection: 'Clauses:']@(1, 1-9), [Newline: '\\n']@(1, 10-11),
+	    [Negate: '!']@(2, 1-2), [Identifier: 'man']@(2, 2-5), [OpenP: '(']@(2, 5-6),
+	    [Identifier: 'x5']@(2, 6-8), [CloseP: ')']@(2, 8-9),
+	    [Identifier: 'person']@(2, 10-16)])

 	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
 	Err('🍆 !man(x5)')
 	"""
-	if len(input):
-		try:
-			lexeme, rest_input = next(
-				unwrap_opt(maybe_lexeme)
-				for maybe_lexeme in (
-					try_lex1(regex, tok, input)
-					for (regex, tok) in lex_table
+	def inner(input: str, line_no: int, col_no: int, prefix: List[Lexeme[A]]) -> Result[List[Lexeme[A]], str]:
+		if len(input):
+			try:
+				lexeme, rest_input = next(
+					unwrap_opt(maybe_lexeme)
+					for maybe_lexeme in (
+						try_lex1(regex, tok, input, line_no, col_no)
+						for (regex, tok) in lex_table
+					)
+					if isinstance(maybe_lexeme, Some)
 				)
-				if isinstance(maybe_lexeme, Some)
+			except StopIteration:
+				return Err(input)
+			if lexeme.token not in drop_tokens:
+				prefix.append(lexeme)
+			newline_count = len(list(filter(p(eq, '\n'), lexeme.matched_string)))
+			new_col_no = (
+				len(lexeme.matched_string) - lexeme.matched_string.rfind('\n')
+				if newline_count else
+				col_no + len(lexeme.matched_string)
 			)
-		except StopIteration:
-			return Err(input)
-		if lexeme[0] not in drop_tokens:
-			prefix.append(lexeme)
-		return tokenize(lex_table, drop_tokens, rest_input, prefix)
-	else:
-		return Ok(prefix)
+			return inner(rest_input, line_no+newline_count, new_col_no, prefix)
+		else:
+			return Ok(prefix)
+	return inner(input, 1, 1, [])


 if __name__ == '__main__':