Move grammar and lex table to their own file

Emi Simpson 2023-03-04 12:42:43 -05:00
parent 00043d27dd
commit f813d91736
Signed by: Emi
GPG key ID: A12F2C2FFDC3D847
5 changed files with 144 additions and 191 deletions


@@ -330,8 +330,7 @@ def print_oracle_table_enum(
 if __name__ == '__main__':
     import doctest
-    from lex import Tok
-    from parse import GRAMMAR, Variable
+    from grammar import GRAMMAR, Tok, Variable
     failure_count, test_count = doctest.testmod()
     if failure_count:
         print('\n\nRefusing to build oracle table due to test failures')


@@ -1,8 +1,7 @@
 #!/bin/bash
 cat << EOF > oracle_table.py
-from lex import Tok
-from parse import Variable
+from grammar import Tok, Variable
 oracle_table = (
 EOF
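For orientation, the generated oracle_table.py therefore begins with the header written by this heredoc; the table body between the parentheses is appended by the build step. Its exact format comes from print_oracle_table_enum and is not shown in this diff, so the body below is only a placeholder:

from grammar import Tok, Variable

oracle_table = (
    # ...entries emitted by print_oracle_table_enum, presumably one per
    # (variable, lookahead) pair; the real format is not shown here...
)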

grammar.py  (new file, 139 lines)

@@ -0,0 +1,139 @@
from enum import auto, IntEnum
from re import compile, Pattern
from typing import Collection, Mapping, Sequence, Tuple

class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_
LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Patterns earlier in the list take priority: when more than one pattern matches,
the earliest match in the list wins. Every unicode string should be matched by
at least one pattern.
"""
class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'
GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),
    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),
    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),
    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),
    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),
    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),
    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),
    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),
    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
"""
Implements the following grammar:
Start := PredicateSection <Idents> Newline
VariablesSection <Idents> Newline
ConstantsSection <Idents> Newline
FunctionsSection <Idents> Newline
ClausesSection <Clauses> Eof
Idents := Identifier <Idents>
:= ε
Clauses := Newline <Clauses'>
:= ε
Clauses' := <Clause> <Clauses>
:= ε
Clause := <Term> <Clause'>
Clause' := <Clause>
:= ε
Term := Negate <Term>
:= Identifier <Func?>
Func? := OpenP <Term> <CSTerms> CloseP
:= ε
CSTerms := Comma <Term> <CSTerms>
:= ε
"""

lex.py  (46 lines changed)

@@ -3,52 +3,10 @@ from emis_funky_funktions import *
 from dataclasses import dataclass
 from enum import auto, IntEnum
 from operator import is_not
-from re import compile, Pattern
+from re import Pattern
 from typing import Collection, Tuple, List, NewType
-
-class Tok(IntEnum):
-    """
-    All possible tokens used in the grammar
-    """
-    Newline = auto()
-    Whitespace = auto()
-    PredicateSection = auto()
-    VariablesSection = auto()
-    ConstantsSection = auto()
-    FunctionsSection = auto()
-    ClausesSection = auto()
-    Negate = auto()
-    OpenP = auto()
-    CloseP = auto()
-    Comma = auto()
-    Identifier = auto()
-    Eof = auto()
-
-    def __repr__(self):
-        return self._name_
-
-LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
-    (compile(r"\n"), Tok.Newline),
-    (compile(r"[ \t]+"), Tok.Whitespace),
-    (compile("Predicates:"), Tok.PredicateSection),
-    (compile("Variables:"), Tok.VariablesSection),
-    (compile("Constants:"), Tok.ConstantsSection),
-    (compile("Functions:"), Tok.FunctionsSection),
-    (compile("Clauses:"), Tok.ClausesSection),
-    (compile("!"), Tok.Negate),
-    (compile(r"\("), Tok.OpenP),
-    (compile(r"\)"), Tok.CloseP),
-    (compile(","), Tok.Comma),
-    (compile(r"\w+"), Tok.Identifier),
-]
-"""
-A mapping of regexs to the tokens the identify
-
-Tokens earlier on in the list should be regarded as higher priority, even if a match lower
-on the list also matches. All unicode strings should be matched by at least one token.
-"""
 
 def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
     """
     Attempt to recognize a single token against a full input string
@@ -112,4 +70,6 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
 if __name__ == '__main__':
     # print(tokenize(open('sample.cnf').read()))
     import doctest
+    from re import compile
+    from grammar import Tok, LEX_TABLE
     doctest.testmod()

parse.py  (file deleted, 144 lines)

@@ -1,144 +0,0 @@
from emis_funky_funktions import *

from enum import auto, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard

from lex import Tok

"""
Implements a parser for the following grammar:

Start    := PredicateSection <Idents> Newline
            VariablesSection <Idents> Newline
            ConstantsSection <Idents> Newline
            FunctionsSection <Idents> Newline
            ClausesSection <Clauses> Eof
Idents   := Identifier <Idents>
         := ε
Clauses  := Newline <Clauses'>
         := ε
Clauses' := <Clause> <Clauses>
         := ε
Clause   := <Term> <Clause'>
Clause'  := <Clause>
         := ε
Term     := Negate <Term>
         := Identifier <Func?>
Func?    := OpenP <Term> <CSTerms> CloseP
         := ε
CSTerms  := Comma <Term> <CSTerms>
         := ε
"""
class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'
GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),
    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),
    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),
    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),
    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),
    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),
    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),
    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),
    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
# ### FIRST Table ###
#
# Start    : PredicateSection
# Idents   : Identifier, ε
# Clauses  : Newline, ε
# Clauses' : Negate, Identifier, ε
# Clause   : Negate, Identifier
# Clause'  : Negate, Identifier, ε
# Term     : Negate, Identifier
# Func?    : OpenP
# CSTerms  : Comma, ε
#
# ### FOLLOW Table ###
#
# Idents   : Newline
# Clauses  : Eof
# Clauses' : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
#
# ### PREDICT Table ###
#
# Idents   : Identifier
#          : Newline
# Clauses  : Newline
#          : Eof
# Clauses' : Negate, Identifier
#          : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
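A predict table like the one above is exactly what a table-driven LL(1) parser consults: pop the top of the stack, match terminals against the input, and for a variable look up which production to expand from the (variable, lookahead) pair. The sketch below assumes a hypothetical predict(variable, tok) lookup standing in for the generated oracle_table, whose concrete shape this diff does not show.

from typing import Callable, Optional

from grammar import GRAMMAR, Tok, Variable

def ll1_parse(tokens: list[Tok],
              predict: Callable[[Variable, Tok], Optional[int]]) -> bool:
    # `predict` returns an index into GRAMMAR, or None if no production
    # applies (hypothetical stand-in for the generated oracle table).
    stack: list[Variable | Tok] = [Variable.Start]
    pos = 0
    while stack:
        top = stack.pop()
        if isinstance(top, Tok):
            if pos >= len(tokens) or tokens[pos] != top:
                return False  # terminal mismatch
            pos += 1
        else:
            if pos >= len(tokens):
                return False  # input exhausted with variables left on the stack
            rule = predict(top, tokens[pos])
            if rule is None:
                return False  # no production predicted for this lookahead
            # Push the production body reversed so its symbols pop left to right
            stack.extend(reversed(GRAMMAR[rule][1]))
    return pos == len(tokens)

The token stream is expected to end with Tok.Eof, which the Start production consumes explicitly.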