Genericize the lexer
parent 03ff0d800e
commit 00043d27dd
lex.py (new file, 115 lines added)
@@ -0,0 +1,115 @@
from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from operator import is_not
from re import compile, Pattern

from typing import Collection, Tuple, List, NewType

class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_

LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches. All unicode strings should be matched by at least one token.
"""

def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
    r"""
    Attempt to recognize a single token against a full input string

    If successful, returns the token provided as an argument, the part of the input which
    matched, and the rest of the input. Otherwise, returns `None`

    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
    Some((('NUMBER', '123'), 'abc'))

    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
    True
    """
    match regex.match(input):
        case None:
            return None
        case match:
            assert match is not None
            return Some(((tok, match.group()), input[match.end():]))


def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collection[A], input: str, prefix: List[Tuple[A, str]] | None = None) -> Result[List[Tuple[A, str]], str]:
    """
    Attempt to lex an entire input string.

    Will be lexed into tokens according to `lex_table`. Tokens earlier on in the list
    should be regarded as higher priority, even if a match lower on the list also matches.

    Any tokens in the `drop_tokens` list will be dropped from the output.

    If the lexer is unable to match the input string with any of the tokens, then an `Err`
    is returned containing the section of the input that failed to match.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
        (CloseP, ')'), (Identifier, 'person')])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
    Err('🍆 !man(x5)')
    """
    if prefix is None:
        # Use a fresh accumulator rather than a mutable default argument, which
        # would leak lexemes between separate calls to `tokenize`
        prefix = []
    if len(input):
        try:
            # Try every pattern in priority order and take the first that matches
            lexeme, rest_input = next(
                unwrap_opt(maybe_lexeme)
                for maybe_lexeme in (
                    try_lex1(regex, tok, input)
                    for (regex, tok) in lex_table
                )
                if isinstance(maybe_lexeme, Some)
            )
        except StopIteration:
            # No pattern matched the remaining input
            return Err(input)
        if lexeme[0] not in drop_tokens:
            prefix.append(lexeme)
        # Recurse on the rest of the input, one level of recursion per lexeme
        return tokenize(lex_table, drop_tokens, rest_input, prefix)
    else:
        return Ok(prefix)


if __name__ == '__main__':
    # print(tokenize(LEX_TABLE, [Tok.Whitespace], open('sample.cnf').read()))
    import doctest
    doctest.testmod()
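
A minimal sketch of what the genericized lexer enables (hypothetical, not part of this commit): `tokenize` no longer assumes `Tok`, so the same machinery can drive an unrelated grammar with its own token type and lex table. The `MathTok` enum and `MATH_LEX_TABLE` names below are made up for illustration.

from enum import IntEnum, auto
from re import compile

from lex import tokenize

class MathTok(IntEnum):
    Number = auto()
    Plus = auto()
    Space = auto()

    def __repr__(self):
        return self._name_

MATH_LEX_TABLE = [
    (compile(r"\d+"), MathTok.Number),
    (compile(r"\+"), MathTok.Plus),
    (compile(r"[ \t]+"), MathTok.Space),
]

# Dropping MathTok.Space mirrors how Tok.Whitespace is dropped above; the call
# should yield Ok([(Number, '1'), (Plus, '+'), (Number, '23')])
tokenize(MATH_LEX_TABLE, [MathTok.Space], "1 + 23")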