Move grammar and lex table to their own file
This commit is contained in:
parent 00043d27dd
commit f813d91736
@@ -330,8 +330,7 @@ def print_oracle_table_enum(

 if __name__ == '__main__':
     import doctest
-    from lex import Tok
-    from parse import GRAMMAR, Variable
+    from grammar import GRAMMAR, Tok, Variable
     failure_count, test_count = doctest.testmod()
     if failure_count:
         print('\n\nRefusing to build oracle table due to test failures')

@@ -1,8 +1,7 @@
 #!/bin/bash

 cat << EOF > oracle_table.py
-from lex import Tok
-from parse import Variable
+from grammar import Tok, Variable

 oracle_table = (
 EOF

139  grammar.py  Normal file
@@ -0,0 +1,139 @@
from enum import auto, IntEnum
from re import compile, Pattern

from typing import Collection, Mapping, Sequence, Tuple


class Tok(IntEnum):
    """
    All possible tokens used in the grammar
    """
    Newline = auto()
    Whitespace = auto()
    PredicateSection = auto()
    VariablesSection = auto()
    ConstantsSection = auto()
    FunctionsSection = auto()
    ClausesSection = auto()
    Negate = auto()
    OpenP = auto()
    CloseP = auto()
    Comma = auto()
    Identifier = auto()
    Eof = auto()

    def __repr__(self):
        return self._name_

LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
    (compile(r"\n"), Tok.Newline),
    (compile(r"[ \t]+"), Tok.Whitespace),
    (compile("Predicates:"), Tok.PredicateSection),
    (compile("Variables:"), Tok.VariablesSection),
    (compile("Constants:"), Tok.ConstantsSection),
    (compile("Functions:"), Tok.FunctionsSection),
    (compile("Clauses:"), Tok.ClausesSection),
    (compile("!"), Tok.Negate),
    (compile(r"\("), Tok.OpenP),
    (compile(r"\)"), Tok.CloseP),
    (compile(","), Tok.Comma),
    (compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify

Tokens earlier in the list should be regarded as higher priority, even if a pattern
later in the list also matches. All unicode strings should be matched by at least one token.
"""

class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'

GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]
"""
Implements the following grammar:

Start    := PredicateSection <Idents> Newline
            VariablesSection <Idents> Newline
            ConstantsSection <Idents> Newline
            FunctionsSection <Idents> Newline
            ClausesSection <Clauses> Eof

Idents   := Identifier <Idents>
         := ε

Clauses  := Newline <Clauses'>
         := ε

Clauses' := <Clause> <Clauses>
         := ε

Clause   := <Term> <Clause'>

Clause'  := <Clause>
         := ε

Term     := Negate <Term>
         := Identifier <Func?>

Func?    := OpenP <Term> <CSTerms> CloseP
         := ε

CSTerms  := Comma <Term> <CSTerms>
         := ε
"""

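The priority rule described in the LEX_TABLE docstring (earlier entries win over later ones) is easy to exercise by hand. The sketch below is only an illustration written against the new module, not the project's tokenize from lex.py; the helper name lex_first_match is invented here.

from grammar import LEX_TABLE, Tok

def lex_first_match(text: str) -> list[Tok]:
    """Scan left to right, taking the first LEX_TABLE pattern that matches at each position."""
    tokens, pos = [], 0
    while pos < len(text):
        for pattern, tok in LEX_TABLE:
            match = pattern.match(text, pos)
            if match:
                tokens.append(tok)
                pos = match.end()
                break
        else:
            raise ValueError(f'no token matches at position {pos}')
    return tokens + [Tok.Eof]

# "Predicates:" also starts with a \w+ Identifier match, but the PredicateSection
# entry sits earlier in LEX_TABLE, so the section header wins:
assert lex_first_match('Predicates: p q\n') == [
    Tok.PredicateSection, Tok.Whitespace, Tok.Identifier,
    Tok.Whitespace, Tok.Identifier, Tok.Newline, Tok.Eof,
]
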
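Reading the GRAMMAR productions as data is easier with one concrete expansion. Assuming Whitespace tokens are dropped before parsing (no production mentions Tok.Whitespace), a clause line such as "!p q" in the Clauses section can be derived using only the rules listed above:

    <Clauses>  -> Newline <Clauses'>                    (the newline after "Clauses:")
    <Clauses'> -> <Clause> <Clauses>
    <Clause>   -> <Term> <Clause'>
    <Term>     -> Negate <Term> -> Negate Identifier <Func>,  with <Func> -> ε
    <Clause'>  -> <Clause> -> <Term> <Clause'> -> Identifier <Func> <Clause'> -> Identifier
    <Clauses>  -> Newline <Clauses'>,  with <Clauses'> -> ε   (the trailing newline)

which matches the token string: Newline Negate Identifier Identifier Newline.
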
46  lex.py
@@ -3,52 +3,10 @@ from emis_funky_funktions import *
 from dataclasses import dataclass
 from enum import auto, IntEnum
 from operator import is_not
-from re import compile, Pattern
+from re import Pattern

 from typing import Collection, Tuple, List, NewType

-class Tok(IntEnum):
-    """
-    All possible tokens used in the grammar
-    """
-    Newline = auto()
-    Whitespace = auto()
-    PredicateSection = auto()
-    VariablesSection = auto()
-    ConstantsSection = auto()
-    FunctionsSection = auto()
-    ClausesSection = auto()
-    Negate = auto()
-    OpenP = auto()
-    CloseP = auto()
-    Comma = auto()
-    Identifier = auto()
-    Eof = auto()
-
-    def __repr__(self):
-        return self._name_
-
-LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
-    (compile(r"\n"), Tok.Newline),
-    (compile(r"[ \t]+"), Tok.Whitespace),
-    (compile("Predicates:"), Tok.PredicateSection),
-    (compile("Variables:"), Tok.VariablesSection),
-    (compile("Constants:"), Tok.ConstantsSection),
-    (compile("Functions:"), Tok.FunctionsSection),
-    (compile("Clauses:"), Tok.ClausesSection),
-    (compile("!"), Tok.Negate),
-    (compile(r"\("), Tok.OpenP),
-    (compile(r"\)"), Tok.CloseP),
-    (compile(","), Tok.Comma),
-    (compile(r"\w+"), Tok.Identifier),
-]
-"""
-A mapping of regexes to the tokens they identify
-
-Tokens earlier in the list should be regarded as higher priority, even if a pattern
-later in the list also matches. All unicode strings should be matched by at least one token.
-"""
-
 def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]:
     """
     Attempt to recognize a single token against a full input string

@@ -112,4 +70,6 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect
 if __name__ == '__main__':
     # print(tokenize(open('sample.cnf').read()))
     import doctest
+    from re import compile
+    from grammar import Tok, LEX_TABLE
     doctest.testmod()

144  parse.py
@@ -1,144 +0,0 @@
from emis_funky_funktions import *

from enum import auto, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard

from lex import Tok

"""
Implements a parser for the following grammar:

Start    := PredicateSection <Idents> Newline
            VariablesSection <Idents> Newline
            ConstantsSection <Idents> Newline
            FunctionsSection <Idents> Newline
            ClausesSection <Clauses> Eof

Idents   := Identifier <Idents>
         := ε

Clauses  := Newline <Clauses'>
         := ε

Clauses' := <Clause> <Clauses>
         := ε

Clause   := <Term> <Clause'>

Clause'  := <Clause>
         := ε

Term     := Negate <Term>
         := Identifier <Func?>

Func?    := OpenP <Term> <CSTerms> CloseP
         := ε

CSTerms  := Comma <Term> <CSTerms>
         := ε
"""

class Variable(IntEnum):
    Start = auto()
    Idents = auto()
    Clauses = auto()
    Clauses_ = auto()
    Clause = auto()
    Clause_ = auto()
    Term = auto()
    Func = auto()
    CSTerms = auto()

    def __repr__(self) -> str:
        return f'<{self._name_}>'

GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [
    (Variable.Start,
        [ Tok.PredicateSection, Variable.Idents, Tok.Newline
        , Tok.VariablesSection, Variable.Idents, Tok.Newline
        , Tok.ConstantsSection, Variable.Idents, Tok.Newline
        , Tok.FunctionsSection, Variable.Idents, Tok.Newline
        , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ),

    (Variable.Idents,
        [ Tok.Identifier, Variable.Idents ]),
    (Variable.Idents,
        [ ]),

    (Variable.Clauses,
        [ Tok.Newline, Variable.Clauses_ ]),
    (Variable.Clauses,
        [ ]),

    (Variable.Clauses_,
        [ Variable.Clause, Variable.Clauses ]),
    (Variable.Clauses_,
        [ ]),

    (Variable.Clause,
        [ Variable.Term, Variable.Clause_ ]),

    (Variable.Clause_,
        [ Variable.Clause ]),
    (Variable.Clause_,
        [ ]),

    (Variable.Term,
        [ Tok.Negate, Variable.Term ]),
    (Variable.Term,
        [ Tok.Identifier, Variable.Func ]),

    (Variable.Func,
        [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]),
    (Variable.Func,
        [ ]),

    (Variable.CSTerms,
        [ Tok.Comma, Variable.Term, Variable.CSTerms ]),
    (Variable.CSTerms,
        [ ]),
]


# ### FIRST Table ###
#
# Start    : PredicateSection
# Idents   : Identifier, ε
# Clauses  : Newline, ε
# Clauses' : Negate, Identifier, ε
# Clause   : Negate, Identifier
# Clause'  : Negate, Identifier, ε
# Term     : Negate, Identifier
# Func?    : OpenP
# CSTerms  : Comma, ε
#
#
#
# ### FOLLOW Table ###
#
# Idents   : Newline
# Clauses  : Eof
# Clauses' : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
#
#
#
# ### PREDICT Table ###
#
# Idents   : Identifier
#          : Newline
# Clauses  : Newline
#          : Eof
# Clauses' : Negate, Identifier
#          : Eof
# Clause   : Newline, Eof
# Clause'  : Newline, Eof
# Term     : Negate, Identifier, Newline, Eof, Comma
# Func?    : Negate, Identifier, Newline, Eof, Comma
# CSTerms  : CloseP
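The FIRST/FOLLOW/PREDICT notes above are the raw material for the generated oracle table. As a rough sketch of how such a table can drive a table-driven LL(1) parse (purely illustrative: the shape of the real oracle_table is decided by the build script, and PredictTable and ll1_accepts below are names invented here):

from typing import Mapping, Sequence, Tuple

from grammar import GRAMMAR, Tok, Variable

# Assumed shape for illustration: (nonterminal, lookahead token) -> index into GRAMMAR
PredictTable = Mapping[Tuple[Variable, Tok], int]

def ll1_accepts(predict_table: PredictTable, tokens: Sequence[Tok]) -> bool:
    """Table-driven LL(1) recognition over a token stream that ends with Tok.Eof."""
    stack: list[Variable | Tok] = [Variable.Start]
    pos = 0
    while stack:
        expected = stack.pop()
        if isinstance(expected, Tok):
            # Terminal: must match the current input token exactly
            if pos >= len(tokens) or tokens[pos] != expected:
                return False
            pos += 1
        else:
            # Nonterminal: ask the predict table which production to expand
            lookahead = tokens[pos] if pos < len(tokens) else Tok.Eof
            production_index = predict_table.get((expected, lookahead))
            if production_index is None:
                return False
            _, expansion = GRAMMAR[production_index]
            stack.extend(reversed(expansion))  # leftmost symbol ends up on top of the stack
    return pos == len(tokens)

For instance, an entry {(Variable.Idents, Tok.Identifier): 1} would select GRAMMAR[1], i.e. the production Idents -> Identifier <Idents>, whenever an Idents is expected and the lookahead is an Identifier.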