Move grammar and lex table to their own file
parent 00043d27dd
commit f813d91736
@@ -330,8 +330,7 @@ def print_oracle_table_enum(
if __name__ == '__main__':
    import doctest
-    from lex import Tok
-    from parse import GRAMMAR, Variable
+    from grammar import GRAMMAR, Tok, Variable

    failure_count, test_count = doctest.testmod()
    if failure_count:
        print('\n\nRefusing to build oracle table due to test failures')
@@ -1,8 +1,7 @@
#!/bin/bash

cat << EOF > oracle_table.py
-from lex import Tok
-from parse import Variable
+from grammar import Tok, Variable

oracle_table = (
EOF
grammar.py (new file, 139 lines)

@@ -0,0 +1,139 @@
from enum import auto, IntEnum
from re import compile, Pattern

from typing import Collection, Mapping, Sequence, Tuple


class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_


LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Tokens earlier in the list should be regarded as higher priority, even if an entry lower
in the list also matches. All unicode strings should be matched by at least one token.
"""


class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'


GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
"""
Implements the following grammar:

Start    := PredicateSection <Idents> Newline
            VariablesSection <Idents> Newline
            ConstantsSection <Idents> Newline
            FunctionsSection <Idents> Newline
            ClausesSection <Clauses> Eof

Idents   := Identifier <Idents>
         := ε

Clauses  := Newline <Clauses'>
         := ε

Clauses' := <Clause> <Clauses>
         := ε

Clause   := <Term> <Clause'>

Clause'  := <Clause>
         := ε

Term     := Negate <Term>
         := Identifier <Func?>

Func?    := OpenP <Term> <CSTerms> CloseP
         := ε

CSTerms  := Comma <Term> <CSTerms>
         := ε
"""
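The docstring on LEX_TABLE fixes the matching policy: scan left to right and always take the earliest table entry that matches at the current position. Purely as an illustration of that policy (not part of this commit, and separate from the repository's own lexer in lex.py), here is a minimal first-match scanner over a table of this shape; it only assumes that grammar.py above is importable, and scan is a throwaway helper, not something defined in the repository.

# Illustrative sketch only -- not part of this commit.  It demonstrates the
# matching policy documented on LEX_TABLE (earlier entries win, every input
# character must be covered) and assumes grammar.py above is importable.
from re import Pattern
from typing import Collection, Iterator, Tuple

from grammar import LEX_TABLE, Tok

def scan(table: Collection[Tuple[Pattern[str], Tok]], text: str) -> Iterator[Tuple[Tok, str]]:
    pos = 0
    while pos < len(text):
        for pattern, tok in table:            # earlier entries take priority
            match = pattern.match(text, pos)
            if match and match.end() > pos:   # ignore zero-width matches
                yield tok, match.group(0)
                pos = match.end()
                break
        else:
            raise ValueError(f'no token matches at position {pos}')

print(list(scan(LEX_TABLE, 'Predicates: P Q\n')))
# [(PredicateSection, 'Predicates:'), (Whitespace, ' '), (Identifier, 'P'),
#  (Whitespace, ' '), (Identifier, 'Q'), (Newline, '\n')]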
lex.py (46 changed lines)

@@ -3,52 +3,10 @@ from emis_funky_funktions import *
from dataclasses import dataclass
from enum import auto, IntEnum
from operator import is_not
-from re import compile, Pattern
+from re import Pattern

from typing import Collection, Tuple, List, NewType

-class Tok(IntEnum):
-    """
-    All possible tokens used in the grammar
-    """
-    Newline = auto()
-    Whitespace = auto()
-    PredicateSection = auto()
-    VariablesSection = auto()
-    ConstantsSection = auto()
-    FunctionsSection = auto()
-    ClausesSection = auto()
-    Negate = auto()
-    OpenP = auto()
-    CloseP = auto()
-    Comma = auto()
-    Identifier = auto()
-    Eof = auto()
-
-    def __repr__(self):
-        return self._name_
-
-LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
-    (compile(r"\n"), Tok.Newline),
-    (compile(r"[ \t]+"), Tok.Whitespace),
-    (compile("Predicates:"), Tok.PredicateSection),
-    (compile("Variables:"), Tok.VariablesSection),
-    (compile("Constants:"), Tok.ConstantsSection),
-    (compile("Functions:"), Tok.FunctionsSection),
-    (compile("Clauses:"), Tok.ClausesSection),
-    (compile("!"), Tok.Negate),
-    (compile(r"\("), Tok.OpenP),
-    (compile(r"\)"), Tok.CloseP),
-    (compile(","), Tok.Comma),
-    (compile(r"\w+"), Tok.Identifier),
-]
-"""
-A mapping of regexes to the tokens they identify
-
-Tokens earlier in the list should be regarded as higher priority, even if an entry lower
-in the list also matches. All unicode strings should be matched by at least one token.
-"""
-
def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
    """
    Attempt to recognize a single token against a full input string

@@ -112,4 +70,6 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
if __name__ == '__main__':
    # print(tokenize(open('sample.cnf').read()))
    import doctest
+    from re import compile
+    from grammar import Tok, LEX_TABLE
    doctest.testmod()
parse.py (deleted, 144 lines)

@@ -1,144 +0,0 @@
from emis_funky_funktions import *

from enum import auto, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard

from lex import Tok

"""
Implements a parser for the following grammar:

Start    := PredicateSection <Idents> Newline
            VariablesSection <Idents> Newline
            ConstantsSection <Idents> Newline
            FunctionsSection <Idents> Newline
            ClausesSection <Clauses> Eof

Idents   := Identifier <Idents>
         := ε

Clauses  := Newline <Clauses'>
         := ε

Clauses' := <Clause> <Clauses>
         := ε

Clause   := <Term> <Clause'>

Clause'  := <Clause>
         := ε

Term     := Negate <Term>
         := Identifier <Func?>

Func?    := OpenP <Term> <CSTerms> CloseP
         := ε

CSTerms  := Comma <Term> <CSTerms>
         := ε
"""

class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'

GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]


# ### FIRST Table ###
#
# Start    : PredicateSection
# Idents   : Identifier, ε
# Clauses  : Newline, ε
# Clauses' : Negate, Identifier, ε
# Clause   : Negate, Identifier
# Clause'  : Negate, Identifier, ε
# Term     : Negate, Identifier
# Func?    : OpenP
# CSTerms  : Comma, ε
#
#
#
# ### FOLLOW Table ###
#
# Idents   : Newline
# Clauses  : Eof
# Clauses' : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
#
#
#
# ### PREDICT Table ###
#
# Idents   : Identifier
#          : Newline
# Clauses  : Newline
#          : Eof
# Clauses' : Negate, Identifier
#          : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
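The FIRST, FOLLOW and PREDICT tables that parse.py carried as hand-maintained comments are presumably the kind of data the oracle_table build above is meant to derive from GRAMMAR. As a cross-check (again not part of this commit), a short fixed-point pass can recompute the FIRST sets directly from the production list; it only assumes that grammar.py introduced by this commit is importable, and first_sets is a hypothetical helper, not something defined in the repository.

# Illustrative sketch only -- not part of this commit.  Recomputes the FIRST
# sets written out by hand in the deleted parse.py, directly from GRAMMAR.
# Assumes grammar.py from this commit is importable; first_sets is a
# throwaway helper, not something defined in the repository.
from grammar import GRAMMAR, Tok, Variable

def first_sets() -> dict[Variable, set[Tok]]:
    nullable: set[Variable] = set()
    first: dict[Variable, set[Tok]] = {v: set() for v in Variable}
    changed = True
    while changed:                            # iterate to a fixed point
        changed = False
        for head, body in GRAMMAR:
            all_nullable = True
            for symbol in body:
                if isinstance(symbol, Tok):   # terminal: starts the production
                    if symbol not in first[head]:
                        first[head].add(symbol)
                        changed = True
                    all_nullable = False
                    break
                # non-terminal: fold in its FIRST set, continue only if nullable
                added = first[symbol] - first[head]
                if added:
                    first[head] |= added
                    changed = True
                if symbol not in nullable:
                    all_nullable = False
                    break
            if all_nullable and head not in nullable:
                nullable.add(head)            # an empty body is trivially nullable
                changed = True
    return first

if __name__ == '__main__':
    for variable, tokens in first_sets().items():
        print(f'{variable!r:10} : {sorted(tokens)}')
    # e.g. <Term> comes out as [Negate, Identifier], matching the old comment table.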