diff --git a/build_oracle.py b/build_oracle.py
index ff61bc7..9bf5a99 100644
--- a/build_oracle.py
+++ b/build_oracle.py
@@ -330,8 +330,7 @@ def print_oracle_table_enum(
 
 if __name__ == '__main__':
     import doctest
-    from lex import Tok
-    from parse import GRAMMAR, Variable
+    from grammar import GRAMMAR, Tok, Variable
     failure_count, test_count = doctest.testmod()
     if failure_count:
         print('\n\nRefusing to build oracle table due to test failures')
diff --git a/build_oracle.sh b/build_oracle.sh
index 54309c8..cf922e7 100644
--- a/build_oracle.sh
+++ b/build_oracle.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 
 cat << EOF > oracle_table.py
-from lex import Tok
-from parse import Variable
+from grammar import Tok, Variable
 
 oracle_table = (
 EOF
diff --git a/grammar.py b/grammar.py
new file mode 100644
index 0000000..88e6598
--- /dev/null
+++ b/grammar.py
@@ -0,0 +1,139 @@
+from enum import auto, IntEnum
+from re import compile, Pattern
+
+from typing import Collection, Mapping, Sequence, Tuple
+
+class Tok(IntEnum):
+    """
+    All possible tokens used in the grammar
+    """
+    Newline = auto()
+    Whitespace = auto()
+    PredicateSection = auto()
+    VariablesSection = auto()
+    ConstantsSection = auto()
+    FunctionsSection = auto()
+    ClausesSection = auto()
+    Negate = auto()
+    OpenP = auto()
+    CloseP = auto()
+    Comma = auto()
+    Identifier = auto()
+    Eof = auto()
+
+    def __repr__(self):
+        return self._name_
+
+LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
+    (compile(r"\n"), Tok.Newline),
+    (compile(r"[ \t]+"), Tok.Whitespace),
+    (compile("Predicates:"), Tok.PredicateSection),
+    (compile("Variables:"), Tok.VariablesSection),
+    (compile("Constants:"), Tok.ConstantsSection),
+    (compile("Functions:"), Tok.FunctionsSection),
+    (compile("Clauses:"), Tok.ClausesSection),
+    (compile("!"), Tok.Negate),
+    (compile(r"\("), Tok.OpenP),
+    (compile(r"\)"), Tok.CloseP),
+    (compile(","), Tok.Comma),
+    (compile(r"\w+"), Tok.Identifier),
+]
+"""
+A mapping of regexes to the tokens they identify
+
+Tokens earlier on in the list should be regarded as higher priority, even if a match lower
+on the list also matches. All unicode strings should be matched by at least one token.
+""" + +class Variable(IntEnum): + Start = auto() + Idents = auto() + Clauses = auto() + Clauses_ = auto() + Clause = auto() + Clause_ = auto() + Term = auto() + Func = auto() + CSTerms = auto() + + def __repr__(self) -> str: + return f'<{self._name_}>' + +GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [ + (Variable.Start, + [ Tok.PredicateSection, Variable.Idents, Tok.Newline + , Tok.VariablesSection, Variable.Idents, Tok.Newline + , Tok.ConstantsSection, Variable.Idents, Tok.Newline + , Tok.FunctionsSection, Variable.Idents, Tok.Newline + , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ), + + (Variable.Idents, + [ Tok.Identifier, Variable.Idents ]), + (Variable.Idents, + [ ]), + + (Variable.Clauses, + [ Tok.Newline, Variable.Clauses_ ]), + (Variable.Clauses, + [ ]), + + (Variable.Clauses_, + [ Variable.Clause, Variable.Clauses ]), + (Variable.Clauses_, + [ ]), + + (Variable.Clause, + [ Variable.Term, Variable.Clause_ ]), + + (Variable.Clause_, + [ Variable.Clause ]), + (Variable.Clause_, + [ ]), + + (Variable.Term, + [ Tok.Negate, Variable.Term ]), + (Variable.Term, + [ Tok.Identifier, Variable.Func ]), + + (Variable.Func, + [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]), + (Variable.Func, + [ ]), + + (Variable.CSTerms, + [ Tok.Comma, Variable.Term, Variable.CSTerms ]), + (Variable.CSTerms, + [ ]), +] +""" +Implements the following grammar: + +Start := PredicateSection Newline + VariablesSection Newline + ConstantsSection Newline + FunctionsSection Newline + ClausesSection Eof + +Idents := Identifier + := ε + +Clauses := Newline + := ε + +Clauses' := + := ε + +Clause := + +Clause' := + := ε + +Term := Negate + := Identifier + +Func? := OpenP CloseP + := ε + +CSTerms := Comma + := ε +""" \ No newline at end of file diff --git a/lex.py b/lex.py index 255d68b..7808f7f 100644 --- a/lex.py +++ b/lex.py @@ -3,52 +3,10 @@ from emis_funky_funktions import * from dataclasses import dataclass from enum import auto, IntEnum from operator import is_not -from re import compile, Pattern +from re import Pattern from typing import Collection, Tuple, List, NewType -class Tok(IntEnum): - """ - All possible tokens used in the grammar - """ - Newline = auto() - Whitespace = auto() - PredicateSection = auto() - VariablesSection = auto() - ConstantsSection = auto() - FunctionsSection = auto() - ClausesSection = auto() - Negate = auto() - OpenP = auto() - CloseP = auto() - Comma = auto() - Identifier = auto() - Eof = auto() - - def __repr__(self): - return self._name_ - -LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [ - (compile(r"\n"), Tok.Newline), - (compile(r"[ \t]+"), Tok.Whitespace), - (compile("Predicates:"), Tok.PredicateSection), - (compile("Variables:"), Tok.VariablesSection), - (compile("Constants:"), Tok.ConstantsSection), - (compile("Functions:"), Tok.FunctionsSection), - (compile("Clauses:"), Tok.ClausesSection), - (compile("!"), Tok.Negate), - (compile(r"\("), Tok.OpenP), - (compile(r"\)"), Tok.CloseP), - (compile(","), Tok.Comma), - (compile(r"\w+"), Tok.Identifier), -] -""" -A mapping of regexs to the tokens the identify - -Tokens earlier on in the list should be regarded as higher priority, even if a match lower -on the list also matches. All unicode strings should be matched by at least one token. 
-""" - def try_lex1(regex: Pattern[str], tok: A, input: str) -> Option[Tuple[Tuple[A, str], str]]: """ Attempt to recognize a single token against a full input string @@ -112,4 +70,6 @@ def tokenize(lex_table: Collection[Tuple[Pattern[str], A]], drop_tokens: Collect if __name__ == '__main__': # print(tokenize(open('sample.cnf').read())) import doctest + from re import compile + from grammar import Tok, LEX_TABLE doctest.testmod() \ No newline at end of file diff --git a/parse.py b/parse.py deleted file mode 100644 index 4ddbc02..0000000 --- a/parse.py +++ /dev/null @@ -1,144 +0,0 @@ -from emis_funky_funktions import * - -from enum import auto, IntEnum -from functools import cache, reduce -from operator import getitem -from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard - -from lex import Tok - -""" -Implements a parser for the following grammar: - -Start := PredicateSection Newline - VariablesSection Newline - ConstantsSection Newline - FunctionsSection Newline - ClausesSection Eof - -Idents := Identifier - := ε - -Clauses := Newline - := ε - -Clauses' := - := ε - -Clause := - -Clause' := - := ε - -Term := Negate - := Identifier - -Func? := OpenP CloseP - := ε - -CSTerms := Comma - := ε -""" - -class Variable(IntEnum): - Start = auto() - Idents = auto() - Clauses = auto() - Clauses_ = auto() - Clause = auto() - Clause_ = auto() - Term = auto() - Func = auto() - CSTerms = auto() - - def __repr__(self) -> str: - return f'<{self._name_}>' - -GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok]]] = [ - (Variable.Start, - [ Tok.PredicateSection, Variable.Idents, Tok.Newline - , Tok.VariablesSection, Variable.Idents, Tok.Newline - , Tok.ConstantsSection, Variable.Idents, Tok.Newline - , Tok.FunctionsSection, Variable.Idents, Tok.Newline - , Tok.ClausesSection, Variable.Clauses, Tok.Eof ] ), - - (Variable.Idents, - [ Tok.Identifier, Variable.Idents ]), - (Variable.Idents, - [ ]), - - (Variable.Clauses, - [ Tok.Newline, Variable.Clauses_ ]), - (Variable.Clauses, - [ ]), - - (Variable.Clauses_, - [ Variable.Clause, Variable.Clauses ]), - (Variable.Clauses_, - [ ]), - - (Variable.Clause, - [ Variable.Term, Variable.Clause_ ]), - - (Variable.Clause_, - [ Variable.Clause ]), - (Variable.Clause_, - [ ]), - - (Variable.Term, - [ Tok.Negate, Variable.Term ]), - (Variable.Term, - [ Tok.Identifier, Variable.Func ]), - - (Variable.Func, - [ Tok.OpenP, Variable.CSTerms, Tok.CloseP ]), - (Variable.Func, - [ ]), - - (Variable.CSTerms, - [ Tok.Comma, Variable.Term, Variable.CSTerms ]), - (Variable.CSTerms, - [ ]), -] - - -# ### FIRST Table ### -# -# Start : PredicateSection -# Idents : Identifier, ε -# Clauses : Newline, ε -# Clauses' : Negate, Identifier, ε -# Clause : Negate, Identifier -# Clause' : Negate, Identifier, ε -# Term : Negate, Identifier -# Func? : OpenP -# CSTerms : Comma, ε -# -# -# -# ### FOLLOW Table ### -# -# Idents : Newline -# Clauses : Eof -# Clauses' : Eof -# Clause : Newline, Eof -# Clause' : Newline, Eof -# Term : Negate, Identifier, Newline, Eof, Comma -# Func? : Negate, Identifier, Newline, Eof, Comma -# CSTerms : CloseP -# -# -# -# ### PREDICT Table ### -# -# Idents : Identifier -# : Newline -# Clauses : Newline -# : Eof -# Clauses' : Negate, Identifier -# : Eof -# Clause : Newline, Eof -# Clause' : Newline, Eof -# Term : Negate, Identifier, Newline, Eof, Comma -# Func? : Negate, Identifier, Newline, Eof, Comma -# CSTerms : CloseP \ No newline at end of file