JSON-Lang/lex.py

from emis_funky_funktions import *
from dataclasses import dataclass
from enum import auto, IntEnum
from operator import is_not
from re import Pattern
from typing import Collection, Tuple, List, NewType, Optional


def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
    r"""
    Attempt to recognize a single token against a full input string.

    If successful, returns the token provided as an argument, the part of the input
    which matched, and the rest of the input. Otherwise, returns `None`.

    >>> try_lex1(compile(r'\d+'), "NUMBER", "123abc")
    Some((('NUMBER', '123'), 'abc'))

    >>> try_lex1(compile(r'\d+'), "NUMBER", "abc123") is None
    True
    """
    match regex.match(input):
        case None:
            # The regex did not match at the start of the input
            return None
        case match:
            # `match` cannot be None here; the assert narrows the type for static checkers
            assert match is not None
            return Some(((tok, match.group()), input[match.end():]))
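
# Illustrative sketch (the names below are hypothetical, not part of this module): the
# Option returned by `try_lex1` lets a caller peel one token off the front of the input
# and thread the leftover string into the next call, mirroring how `tokenize` below
# consumes it:
#
#     from re import compile
#     maybe_lexeme = try_lex1(compile(r'\d+'), 'NUMBER', '123abc')
#     if isinstance(maybe_lexeme, Some):
#         (tok, text), rest = unwrap_opt(maybe_lexeme)
#         # tok == 'NUMBER', text == '123', rest == 'abc'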


def tokenize(
    lex_table: Collection[Tuple[Pattern[str], A]],
    drop_tokens: Collection[A],
    input: str,
    prefix: Optional[List[Tuple[A, str]]] = None,
) -> Result[List[Tuple[A, str]], str]:
    """
    Attempt to lex an entire input string.

    The input will be lexed into `Tok`s according to `lex_table`. Patterns earlier in
    the list take priority, even if a pattern later in the list also matches. Any
    tokens in the `drop_tokens` collection are dropped from the output.

    If the lexer is unable to match the input string against any of the patterns, an
    `Err` is returned containing the section of the input that failed to match.

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n!man(x5) person') #doctest: +NORMALIZE_WHITESPACE
    Ok([(ClausesSection, 'Clauses:'), (Newline, '\\n'), (Negate, '!'),
        (Identifier, 'man'), (OpenP, '('), (Identifier, 'x5'),
        (CloseP, ')'), (Identifier, 'person')])

    >>> tokenize(LEX_TABLE, [Tok.Whitespace], 'Clauses: \\n🍆 !man(x5)')
    Err('🍆 !man(x5)')
    """
    # Use a fresh accumulator on each top-level call; a mutable default argument
    # would be shared between calls and leak tokens from one run into the next.
    if prefix is None:
        prefix = []
    if len(input):
        try:
            # Try each pattern in priority order and take the first one that matches
            lexeme, rest_input = next(
                unwrap_opt(maybe_lexeme)
                for maybe_lexeme in (
                    try_lex1(regex, tok, input)
                    for (regex, tok) in lex_table
                )
                if isinstance(maybe_lexeme, Some)
            )
        except StopIteration:
            # No pattern matched the remaining input
            return Err(input)
        if lexeme[0] not in drop_tokens:
            prefix.append(lexeme)
        return tokenize(lex_table, drop_tokens, rest_input, prefix)
    else:
        return Ok(prefix)
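
# Illustrative sketch of a lex table (hypothetical, not the real `LEX_TABLE` from
# `grammar.py`; plain strings stand in for the `Tok` enum): earlier entries win, so a
# keyword pattern placed before the general identifier pattern takes priority, and
# whitespace can be matched but discarded via `drop_tokens`:
#
#     from re import compile
#     example_table = [
#         (compile(r'Clauses:'),     'ClausesSection'),  # checked first
#         (compile(r'[A-Za-z]\w*'),  'Identifier'),      # only if nothing above matched
#         (compile(r'\s+'),          'Whitespace'),
#     ]
#     tokenize(example_table, ['Whitespace'], 'Clauses: man')
#     # -> Ok([('ClausesSection', 'Clauses:'), ('Identifier', 'man')])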


if __name__ == '__main__':
    # print(tokenize(LEX_TABLE, [Tok.Whitespace], open('sample.cnf').read()))
    import doctest
    # Imported here so the doctests above can see `compile`, `Tok`, and `LEX_TABLE`
    # in the module namespace when `doctest.testmod()` runs.
    from re import compile
    from grammar import Tok, LEX_TABLE
    doctest.testmod()