39 lines
993 B
Python
39 lines
993 B
Python
from enum import auto, IntEnum
|
|
from typing import Collection, Tuple
|
|
from re import compile, Pattern
|
|
|
|
class Tok(IntEnum):
    """
    All possible tokens used in the grammar.

    Members are numbered by ``auto()`` in declaration order, starting at 1,
    so ``Whitespace`` is 1 and ``Eof`` is 10. Because this is an ``IntEnum``,
    members compare and order like plain integers.
    """

    Whitespace = auto()
    OpenCurly = auto()
    CloseCurly = auto()
    OpenSquare = auto()
    CloseSquare = auto()
    Comma = auto()
    Colon = auto()
    String = auto()
    Number = auto()
    Eof = auto()

    def __repr__(self) -> str:
        """Return just the member name (e.g. ``OpenCurly``) for compact debug output."""
        # `name` is the public accessor for the same value stored in `_name_`.
        return self.name
|
|
|
|
LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    # `\s` already covers newlines, so no separate `\n` is needed in the class.
    (compile(r"\s+"), Tok.Whitespace),
    (compile(r"{"), Tok.OpenCurly),
    (compile(r"}"), Tok.CloseCurly),
    (compile(r"\["), Tok.OpenSquare),
    (compile(r"\]"), Tok.CloseSquare),
    (compile(r","), Tok.Comma),
    (compile(r":"), Tok.Colon),
    # A string runs to the next double quote; this pattern has no escape
    # mechanism, so a string cannot contain a double quote.
    (compile(r'"[^"]*"'), Tok.String),
    # Only unsigned runs of decimal digits are recognised as numbers.
    (compile(r"\d+"), Tok.Number),
]
"""
A mapping of regexes to the tokens they identify.

Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches. All unicode strings should be matched by at least one token.

NOTE(review): the table has no catch-all entry, so some input (for example a bare
letter) matches none of the patterns; confirm how the consumer of this table handles that.
"""