# JSON-Lang/lex.py

from emis_funky_funktions import *

from dataclasses import dataclass
from enum import auto, IntEnum
from operator import eq, is_not
from re import Pattern

from typing import Collection, Tuple, List, NewType

@dataclass(frozen=True)
class Lexeme(Generic[B]):
	"""
	One recognised token together with the text it matched and its source position.

	Instances are immutable.  The `repr` has the form
	`[<token>: <matched text>]@(<line>, <col_start>-<col_end>)`.
	"""
	# The token value supplied by the lexer rule that produced this lexeme
	token: B
	# The exact text consumed by the match
	matched_string: str
	# Line number recorded for the match (`tokenize` starts counting at 1)
	line: int
	# Column at which the match begins (`tokenize` starts counting at 1)
	col_start: int
	# `try_lex1` sets this to `col_start` plus the match length, i.e. one past the last
	# matched character
	col_end: int

	def __repr__(self) -> str:
		position = f'({self.line}, {self.col_start}-{self.col_end})'
		return f'[{self.token!r}: {self.matched_string!r}]@{position}'

def try_lex1(regex: Pattern[str], tok: A, input: str, line_no: int, col_no: int) -> Option[Tuple[Lexeme[A], str]]:
	r"""
	Attempt to recognize a single token at the very start of an input string

	`regex` is anchored at the beginning of `input` (via `Pattern.match`), so text further
	into the string is never considered.

	If successful, returns `Some((lexeme, rest))`, where `lexeme` records `tok`, the matched
	text, `line_no`, and the column span `col_no` to `col_no + <match length>`, and `rest` is
	the unconsumed remainder of `input`.  Otherwise, returns `None`.

	>>> try_lex1(compile(r'\d+'), "NUMBER", "123abc", 1, 1)
	Some((['NUMBER': '123']@(1, 1-4), 'abc'))

	>>> try_lex1(compile(r'\d+'), "NUMBER", "abc123", 1, 1) is None
	True
	"""
	found = regex.match(input)
	if found is None:
		return None
	# `Pattern.match` anchors at index 0, so `end()` is also the length of the matched text
	consumed = found.end()
	lexeme = Lexeme(tok, found.group(), line_no, col_no, col_no + consumed)
	return Some((lexeme, input[consumed:]))


def tokenize(
	lex_table: Collection[Tuple[Pattern[str], A]],
	drop_tokens: Collection[A],
	input: str
) -> Result[List[Lexeme[A]], str]:
	"""
	Attempt to lex an entire input string.

	Will be lexed into `Tok`s according to `lex_table`. Tokens earlier on in the list
	should be regarded as higher priority, even if a match lower on the list also matches.

	Any tokens in the `drop_tokens` list will be dropped from the output.  Dropped tokens
	still advance the line and column counters.

	A rule which matches the empty string at the current position is skipped, since
	accepting it would consume nothing and the lexer could never make progress.

	If the lexer is unable to match the input string with any of the tokens, then an `Err`
	is returned containing the section of the input that failed to match.

	Lines and columns are both counted starting from 1.

	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
	Ok([[ClausesSection: 'Clauses:']@(1, 1-9), [Newline: '\\n']@(1, 10-11),
	    [Negate: '!']@(2, 1-2), [Identifier: 'man']@(2, 2-5), [OpenP: '(']@(2, 5-6),
	    [Identifier: 'x5']@(2, 6-8), [CloseP: ')']@(2, 8-9),
	    [Identifier: 'person']@(2, 10-16)])

	>>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
	Err('🍆 !man(x5)')
	"""
	lexemes = []
	remaining = input
	line_no = 1
	col_no = 1
	# Iterate rather than recurse: one stack frame per lexeme would exhaust Python's
	# recursion limit on inputs with more than about a thousand tokens.
	while remaining:
		for regex, tok in lex_table:
			maybe_lexeme = try_lex1(regex, tok, remaining, line_no, col_no)
			if not isinstance(maybe_lexeme, Some):
				continue
			lexeme, rest_input = unwrap_opt(maybe_lexeme)
			# Only accept matches which consume input; an empty match would stall the lexer
			if lexeme.matched_string:
				break
		else:
			# No rule consumed anything at this position
			return Err(remaining)
		if lexeme.token not in drop_tokens:
			lexemes.append(lexeme)
		matched = lexeme.matched_string
		newline_count = matched.count('\n')
		if newline_count:
			line_no += newline_count
			# Column of the next character, counted from the last newline in the match
			col_no = len(matched) - matched.rfind('\n')
		else:
			col_no += len(matched)
		remaining = rest_input
	return Ok(lexemes)


if __name__ == '__main__':
	# Running this file directly executes the doctests embedded in the docstrings above.
	# NOTE(review): the commented-out call below is stale -- `tokenize` also requires
	# `lex_table` and `drop_tokens` arguments.
	# print(tokenize(open('sample.cnf').read()))
	import doctest
	# The doctests call `compile` and reference `Tok` / `LEX_TABLE`.  Importing them here
	# binds them as module globals, which is where `doctest.testmod()` looks names up.
	from re import compile
	from grammar import Tok, LEX_TABLE
	doctest.testmod()