Start making a grammar then realize there's a builtin json parser

Emi Simpson 2023-03-07 20:42:10 -05:00
parent bc2a2e2a8c
commit 206c0a0ec0
Signed by: Emi
GPG key ID: A12F2C2FFDC3D847
7 changed files with 31 additions and 1248 deletions
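For context, the builtin parser the commit title refers to is presumably Python's standard-library json module, which makes the hand-rolled JSON grammar started below unnecessary. A minimal sketch (the sample input is illustrative only):

import json  # standard library; no custom grammar or oracle table needed
parsed = json.loads('{"clauses": [["dog", "!animal"]]}')
# parsed == {'clauses': [['dog', '!animal']]}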

build_oracle.py
@@ -1,203 +0,0 @@
"""
Tools for building an oracle table
See `grammar` and `build_oracle.sh` for scripts which actually produce python code. This
module only produces an oracle table in python, without outputting it.
NOTE! Doctests in this module use `GRAMMAR` from `grammar.py` and `EGRAMMAR` as a version
of that grammar with the actions erased, with `_erase_actions()`.
"""
from emis_funky_funktions import *
from enum import auto, Enum, IntEnum
from functools import cache, reduce
from operator import getitem
from typing import Any, cast, Collection, Mapping, Sequence, Set, Tuple, TypeGuard, TypeVar
def _erase_actions_h(
handle: Sequence[A | B | C],
is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[A | B]:
"""
Produce an identical handle, but with all the actions removed
"""
return [i for i in handle if is_not_c(i)]
def _erase_actions(
grammar: Sequence[Tuple[A, Sequence[A | B | C]]],
is_not_c: Callable[[A | B | C], TypeGuard[A | B]]
) -> Sequence[Tuple[A, Sequence[A | B]]]:
"""
Produce an identical grammar, but with all the actions removed
"""
return [
(var, _erase_actions_h(handle, is_not_c))
for (var, handle) in grammar
]
def _first(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
sequence: Sequence[A | B]
) -> Tuple[Collection[B], bool]:
"""
Computes all of the possible starting terminals for a handle in a given grammar
Due to pathetic python weaknesses, the first argument you must provide is a type guard
to determine whether a certain thing is a terminal as opposed to a variable.
Then, pass in the grammar and the sequence of terminals and variables in question.
The output contains two values. The first is a set of possible terminals, and the
second is a boolean indicating whether this term can derive epsilon.
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.Clause])
({Negate, Identifier}, False)
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms])
({Comma}, True)
>>> _first(flip(cur2(isinstance))(Tok), EGRAMMAR, [Variable.CSTerms, Tok.CloseP])
({CloseP, Comma}, False)
"""
def inner(vs: Sequence[A | B]) -> Tuple[Set[B], bool]:
match vs:
case []:
return (set(), True)
case [v, *rest] if is_term(v):
return ({v}, False)
case [v, *rest]:
this_variable_first, derives_epsilon = reduce(
lambda acc, result: (acc[0] | result[0], acc[1] or result[1]),
[
inner(handle)
for (other_variable, handle) in grammar
if other_variable == v
]
)
if derives_epsilon:
rest_first, rest_derives_epsilon = inner(rest)
return (rest_first | this_variable_first, rest_derives_epsilon)
else:
return (this_variable_first, False)
raise Exception("UNREACHABLE")
return inner(sequence)
def _follow(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Collection[B]]:
"""
Produce a table indicating exactly which terminals can follow each variable
>>> _follow(flip(cur2(isinstance))(Tok), EGRAMMAR) #doctest: +NORMALIZE_WHITESPACE
{<Start>: set(),
<Idents>: {Newline},
<Clauses>: {Eof},
<Clauses_>: {Eof},
<Clause>: {Newline, Eof},
<Clause_>: {Newline, Eof},
<Term>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
<Func>: {Newline, Negate, CloseP, Comma, Identifier, Eof},
<CSTerms>: {CloseP}}
"""
follow_table: Mapping[A, Set[B]] = {
variable: set()
for (variable, _) in grammar
}
def following_tokens(handle: Sequence[A | B], follows_handle: Set[B]) -> Set[B]:
handle_first, handle_derives_epsilon = _first(is_term, grammar, handle)
return set(handle_first) | (follows_handle if handle_derives_epsilon else set())
def inner(prev_table: Mapping[A, Set[B]]) -> Mapping[A, Set[B]]:
new_table = reduce(
lambda acc, entry: acc | {entry[0]: acc[entry[0]] | entry[1]},
[
(
cast(A, handle[i]),
following_tokens(handle[i+1:], prev_table[variable])
)
for (variable, handle) in grammar
for i in range(len(handle))
if not is_term(handle[i])
],
prev_table
)
if new_table == prev_table:
return new_table
else:
return inner(new_table)
return inner(follow_table)
def _predict(
is_term: Callable[[A | B], TypeGuard[B]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
follow: Mapping[A, Collection[B]],
lhs: A,
rhs: Sequence[A | B]
) -> Collection[B]:
"""
Given a production, identify the terminals which this production would be valid under
>>> is_tok = flip(cur2(isinstance))(Tok)
>>> follow = _follow(is_tok, EGRAMMAR)
>>> _predict(is_tok, EGRAMMAR, follow, Variable.Clause, [Variable.Term, Variable.Clause_])
{Negate, Identifier}
"""
first_rhs, epsilon_rhs = _first(is_term, grammar, rhs)
if epsilon_rhs:
return set(follow[lhs]) | set(first_rhs)
else:
return first_rhs
def oracle_table(
is_term: Callable[[A | B], TypeGuard[B]],
is_var: Callable[[A | B], TypeGuard[A]],
grammar: Sequence[Tuple[A, Sequence[A | B]]],
) -> Mapping[A, Mapping[B, Collection[Sequence[A | B]]]]:
"""
A variant of `_oracle` that generates a table immediately rather than lazily
No significant performance benefit
>>> is_tok = p_instance(Tok)
>>> is_var = p_instance(Variable)
>>> my_oracle_table = oracle_table(is_tok, is_var, EGRAMMAR)
One valid expansion:
>>> my_oracle_table[Variable.Clauses_][Tok.Negate]
[[<Clause>, <Clauses>]]
One valid expansion, but it expands to epsilon:
>>> my_oracle_table[Variable.Clauses_][Tok.Eof]
[[]]
Zero valid expansions:
>>> my_oracle_table[Variable.Term][Tok.Newline]
[]
"""
all_variables = { lhs for (lhs, rhs) in grammar }
all_terminals = { symbol for (lhs, rhs) in grammar for symbol in rhs if is_term(symbol) }
is_not_c: Callable[[A | B | C], TypeGuard[A | B]] = lambda x: is_term(x) or is_var(x) #type:ignore
e_grammar: Sequence[Tuple[A, Sequence[A | B]]] = _erase_actions(grammar, is_not_c) #type:ignore
follow = _follow(is_term, e_grammar)
return {
v: {
t: [
handle
for (lhs, handle) in grammar
if lhs == v
and t in _predict(is_term, e_grammar, follow, lhs, _erase_actions_h(handle, is_not_c)) #type:ignore
]
for t in all_terminals
}
for v in all_variables
}
if __name__ == '__main__':
import doctest
from grammar import GRAMMAR, Tok, Variable
EGRAMMAR = _erase_actions(GRAMMAR, lambda x: not hasattr(x, '__call__')) #type: ignore
doctest.testmod()

grammar.py
@@ -1,300 +1,39 @@
"""
A grammar for parsing CNF files
If this module is run directly in python, it will spit out valid python which produces an
oracle table for the grammar it defines. It's recommended that this be done using
`build_oracle.sh` instead, however, which will build a whole python module containing the
oracle table, complete with imports.
"""
from emis_funky_funktions import *
from dataclasses import dataclass
from enum import auto, IntEnum
from typing import Collection, Tuple
from re import compile, Pattern
from build_oracle import oracle_table
from ir import IRNeg, IRProp, IRTerm, IRVar, KnowledgeBase
from lex import Lexeme, tokenize
from parse import Action, parser
from tokens import *
class Tok(IntEnum):
"""
All possible tokens used in the grammar
"""
Whitespace = auto()
OpenCurly = auto()
CloseCurly = auto()
OpenSquare = auto()
CloseSquare = auto()
Comma = auto()
Colon = auto()
String = auto()
Number = auto()
Eof = auto()
from typing import Any, Callable, cast, Collection, Mapping, Sequence, Tuple, TypeAlias
def __repr__(self):
return self._name_
class Variable(IntEnum):
Start = auto()
Idents = auto()
Clauses = auto()
Clauses_ = auto()
Clause = auto()
Clause_ = auto()
Term = auto()
Func = auto()
CSTerms = auto()
def __repr__(self) -> str:
return f'<{self._name_}>'
ASTTerm: TypeAlias = 'ASTNegated | ASTProp'
class IdentKind(IntEnum):
Function = auto()
Constant = auto()
Variable = auto()
Predicate = auto()
@dataclass(frozen=True)
class CallingNonFunc:
term: Lexeme[Tok]
obj_type: IdentKind
def __str__(self):
return f'Semantic error: Attempted to call {repr(self.term.matched_string)} (a {self.obj_type.name.lower()}) with arguments on line {self.term.line}:{self.term.col_start}-{self.term.col_end}'
@dataclass(frozen=True)
class MissingArguments:
term: Lexeme[Tok]
def __str__(self):
return f'Semantic error: The function {repr(self.term.matched_string)} on line {self.term.line}:{self.term.col_start}-{self.term.col_end} is missing arguments!'
@dataclass(frozen=True)
class UnidentifiedVariable:
term: Lexeme[Tok]
def __str__(self):
return f'Semantic error: Unidentified identifier {repr(self.term.matched_string)} on line {self.term.line}:{self.term.col_start}-{self.term.col_end}'
@dataclass(frozen=True)
class PropUsedInObjectContext:
term: Lexeme[Tok]
def __str__(self):
return f'Semantic error: The proposition {repr(self.term.matched_string)} was used in a context where an object was expected on line {self.term.line}:{self.term.col_start}-{self.term.col_end}'
@dataclass(frozen=True)
class ObjectUsedInPropContext:
term: Lexeme[Tok]
obj_type: IdentKind
def __str__(self):
return f'Semantic error: The {self.obj_type.name.lower()} {repr(self.term.matched_string)} was used in a context where a proposition was expected on line {self.term.line}:{self.term.col_start}-{self.term.col_end}'
@dataclass(frozen=True)
class NegationOfObject:
line: int
col: int
def __str__(self):
return f'Semantic error: Attempted to use negation in a context where an object was expected on line {self.line}:{self.col}'
GenIrError: TypeAlias = CallingNonFunc | MissingArguments | UnidentifiedVariable | PropUsedInObjectContext | ObjectUsedInPropContext | NegationOfObject
@dataclass(frozen=True)
class IdentBindings:
predicate_idents: Sequence[str]
variable_idents: Sequence[str]
const_idents: Sequence[str]
func_idents: Sequence[str]
@dataclass(frozen=True)
class ASTNegated:
neg_lexeme: Lexeme[Tok]
term: ASTTerm
def make_ir(self, idents: IdentBindings, is_prop: bool) -> 'Result[IRTerm, GenIrError]':
if is_prop:
return map_res(IRNeg, self.term.make_ir(idents, True))
else:
return Err(NegationOfObject(self.neg_lexeme.line, self.neg_lexeme.col_start))
@dataclass(frozen=True)
class ASTProp:
ident: Lexeme[Tok]
arguments: Sequence[ASTTerm]
def make_ir(self, idents: IdentBindings, is_pred: bool) -> 'Result[IRTerm, GenIrError]':
bound_type = (
IdentKind.Predicate
if self.ident.matched_string in idents.predicate_idents else
IdentKind.Variable
if self.ident.matched_string in idents.variable_idents else
IdentKind.Constant
if self.ident.matched_string in idents.const_idents else
IdentKind.Function
if self.ident.matched_string in idents.func_idents else
None
)
if bound_type is None:
return Err(UnidentifiedVariable(self.ident))
if is_pred:
if bound_type != IdentKind.Predicate:
return Err(ObjectUsedInPropContext(self.ident, bound_type))
else:
if bound_type == IdentKind.Function:
if not len(self.arguments):
return Err(MissingArguments(self.ident))
elif bound_type == IdentKind.Predicate:
return Err(PropUsedInObjectContext(self.ident))
else:
if len(self.arguments):
return Err(CallingNonFunc(self.ident, bound_type))
if bound_type == IdentKind.Variable:
return Ok(IRVar(self.ident.matched_string))
else:
return (sequence([t.make_ir(idents, False) for t in self.arguments])
<= cast(Callable[[Iterable[IRTerm]], Tuple[IRTerm, ...]], tuple))\
<= p(IRProp, self.ident.matched_string)
@cur2
def make_ir(
idents: IdentBindings,
clauses: Sequence[Sequence[ASTTerm]],
) -> Result[KnowledgeBase, GenIrError]:
return map_res(
lambda kb_: FSet(FSet(clause) for clause in kb_),
sequence([sequence([term.make_ir(idents, True) for term in clause]) for clause in clauses])
)
def cons(stack: Sequence[Any]) -> Sequence[Any]:
match stack:
case [rest, head, *popped_stack]:
return ((head, *rest), *popped_stack)
case bad_stack:
raise Exception("Unexpected stack state!", bad_stack)
nil: Sequence[Any] = tuple()
@cur2
def introduce(
cons: Any,
stack: Sequence[Any]
) -> Sequence[Any]:
return (cons, *stack)
def f_apply(stack: Sequence[Any]) -> Sequence[Any]:
match stack:
case [arg, func, *popped_stack] if hasattr(func, '__call__'):
return (func(arg), *popped_stack)
raise Exception("Unexpected stack state!", stack)
@cur2
def call_func(func: Callable[[Any], Any], stack: Sequence[Any]) -> Sequence[Any]:
match stack:
case [arg, *popped_stack]:
return (func(arg), *popped_stack)
case bad_stack:
raise Exception("Unexpected stack state!", bad_stack)
def drop(stack: Sequence[Any]) -> Sequence[Any]:
return stack[1:]
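A rough illustration of how these stack combinators behave (not part of the original file; it assumes `cur2` from emis_funky_funktions curries a two-argument function, as its partial application in GRAMMAR below suggests; the top of the stack is index 0):

cons((('a',), 'b', 'c'))          # == (('b', 'a'), 'c') -- prepend the element under the top ('b') onto the list on top
introduce(nil)(('x',))            # == ((), 'x')         -- push an empty list onto the stack
f_apply((3, lambda n: n + 1))     # == (4,)              -- apply the function under the top to the top argument
call_func(str)((3,))              # == ('3',)            -- apply a given function to the top of the stack
drop(('x', 'y'))                  # == ('y',)            -- discard the top of the stack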
GRAMMAR: Sequence[Tuple[Variable, Sequence[Variable | Tok | Action]]] = [
(Variable.Start,
[ Tok.PredicateSection, drop, Variable.Idents, call_func(p(p,p,p,IdentBindings)), Tok.Newline, drop
, Tok.VariablesSection, drop, Variable.Idents, f_apply, Tok.Newline, drop
, Tok.ConstantsSection, drop, Variable.Idents, f_apply, Tok.Newline, drop
, Tok.FunctionsSection, drop, Variable.Idents, f_apply, call_func(make_ir), Tok.Newline, drop
, Tok.ClausesSection, drop, Variable.Clauses, f_apply, Tok.Eof, drop] ),
(Variable.Idents,
[ Tok.Identifier, call_func(lambda i: i.matched_string), Variable.Idents, cons ]),
(Variable.Idents,
[ introduce(nil) ]),
(Variable.Clauses,
[ Tok.Newline, drop, Variable.Clauses_ ]),
(Variable.Clauses,
[ introduce(nil) ]),
(Variable.Clauses_,
[ Variable.Clause, Variable.Clauses, cons ]),
(Variable.Clauses_,
[ introduce(nil) ]),
(Variable.Clause,
[ Variable.Term, Variable.Clause_, cons ]),
(Variable.Clause_,
[ Variable.Clause ]),
(Variable.Clause_,
[ introduce(nil) ]),
(Variable.Term,
[ Tok.Negate, call_func(cur2(ASTNegated)), Variable.Term, f_apply ]),
(Variable.Term,
[ Tok.Identifier, call_func(cur2(ASTProp)), Variable.Func, f_apply ]),
(Variable.Func,
[ Tok.OpenP, drop, Variable.Term, Variable.CSTerms, cons, Tok.CloseP, drop ]),
(Variable.Func,
[ introduce(nil) ]),
(Variable.CSTerms,
[ Tok.Comma, drop, Variable.Term, Variable.CSTerms, cons ]),
(Variable.CSTerms,
[ introduce(nil) ]),
LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
(compile(r"[\s\n]+"), Tok.Whitespace),
(compile(r"{"), Tok.OpenCurly),
(compile(r"}"), Tok.CloseCurly),
(compile(r"\["), Tok.OpenSquare),
(compile(r"\]"), Tok.CloseSquare),
(compile(r","), Tok.Comma),
(compile(r":"), Tok.Colon),
(compile(r'"[^"]*"'), Tok.String),
(compile(r'\d+'), Tok.Number),
]
"""
Implements the following grammar:
A mapping of regexes to the tokens they identify
Start := PredicateSection <Idents> Newline
VariablesSection <Idents> Newline
ConstantsSection <Idents> Newline
FunctionsSection <Idents> Newline
ClausesSection <Clauses> Eof
Idents := Identifier <Idents>
:= ε
Clauses := Newline <Clauses'>
:= ε
Clauses' := <Clause> <Clauses>
:= ε
Clause := <Term> <Clause'>
Clause' := <Clause>
:= ε
Term := Negate <Term>
:= Identifier <Func?>
Func? := OpenP <Term> <CSTerms> CloseP
:= ε
CSTerms := Comma <Term> <CSTerms>
:= ε
"""
def lex_and_parse(input: str) -> Result[Result[KnowledgeBase, GenIrError], Tuple[Lexeme[Tok], Collection[Tok]]]:
lexemes = unwrap_r(tokenize(LEX_TABLE, [Tok.Whitespace], Tok.Eof, input))
oracle_table_ = oracle_table(p_instance(Tok), p_instance(Variable), GRAMMAR) #type:ignore
parser_ = parser(oracle_table_, flip(cur2(getattr))('token'), Variable.Start)
match parser_(lexemes):
case Ok([Ok(ir)]):
return Ok(Ok(ir))
case Ok([Err(err)]):
return Ok(Err(err))
case Ok(huh):
raise Exception('Unexpected end result: ', huh)
case Err(e_tup):
return Err(e_tup) #type:ignore
raise Exception('Unreachable')
if __name__ == '__main__':
# from emis_funky_funktions import cur2, flip
# from build_oracle import print_oracle_table_enum, oracle_table
# print(print_oracle_table_enum(oracle_table(flip(cur2(isinstance))(Tok), GRAMMAR))) #type: ignore
import sys
if len(sys.argv) == 2:
with open(sys.argv[1]) as file:
match lex_and_parse(file.read()):
case Ok(Ok(ir)):
print('\n'.join(' or '.join(str(t) for t in c) for c in ir))
case Ok(Err(err)):
print(err)
exit(102)
case Err((Lexeme(token, text, line, col_start, col_end), expected)):
print(f'Parse error! Line {line}:{col_start}-{col_end}\n\nGot: {repr(text)}\nExpected: {expected}')
exit(101)
else:
print(f'Usage: python {sys.argv[0]} <cnf-file-to-parse>')
exit(100)
Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches. All unicode strings should be matched by at least one token.
"""

ir.py
@@ -1,281 +0,0 @@
from emis_funky_funktions import *
from dataclasses import dataclass
from functools import reduce
from typing import Collection, FrozenSet, Sequence, TypeAlias
@dataclass(frozen=True)
class Subst:
"""
A substitution which may be performed on a term.
Only variables may be substituted, but they may be substituted with any term. That
is, `x1/Chris` is valid, but `Chris/x1` is not.
It is strongly recommended to avoid recursive substitutions, that is, substitutions
which contain the variable they are replacing
"""
variable: str
replacement: 'IRTerm'
def __str__(self) -> str:
return repr(self)
def __repr__(self) -> str:
return f'{self.replacement}/{self.variable}'
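A small usage note (illustrative, based on the __repr__ definitions in this file):

Subst('x1', IRProp('Chris'))   # repr: Chris()/x1 -- replace the variable x1 with the constant Chris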
Substitutions: TypeAlias = Sequence[Subst]
@dataclass(frozen=True)
class UnificationMismatch:
"""
Indicates that two terms failed to unify
Contains the two terms, each of which is a valid subterm of one of the two original
terms, and which do not superficially unify.
"""
term1: 'IRTerm'
term2: 'IRTerm'
@dataclass(frozen=True)
class LengthMismatch:
"""
Indicates that two clauses/argument lists failed to unify due to a length mismatch
Contains the first element of the two lists which didn't have a corresponding term on
the other side
"""
term: 'IRTerm'
UnificationError = UnificationMismatch | LengthMismatch
@dataclass(frozen=True)
class IRProp:
"""
Represents a proposition or object for resolution
Can have any number of arguments, each of which should be a `IRTerm`. Note that no
distinction is made between predicates (n-arity, logical statements), functions
(positive-arity functions over objects), and constants (stand-ins for objects)
"""
name: str
"""
The identifier of this thing, including its location in the source
"""
arguments: 'Tuple[IRTerm, ...]' = tuple()
def subst(self, subst: Subst) -> 'IRTerm':
"""
Perform substitution on a proposition
Returns the same proposition, but with any instances of the variable named in the
substitution replaced with the contents of the substitution.
>>> original = IRProp('angry', (IRVar('x1'),))
>>> original
angry(*x1)
>>> original.subst(Subst('x1', IRProp('Alex')))
angry(Alex())
"""
return IRProp(self.name, tuple(arg.subst(subst) for arg in self.arguments))
def __str__(self) -> str:
return repr(self)
def __repr__(self) -> str:
return f'{self.name}({",".join(str(arg) for arg in self.arguments)})'
def __contains__(self, var: str) -> bool:
"""
Test if a variable with a given name exists in this term
>>> 'x1' in IRProp('friends', [IRProp('John'), IRProp('mother_of', [IRVar('x1')])])
True
"""
return any(var in term for term in self.arguments)
@dataclass(frozen=True)
class IRVar:
"""
A variable which may be substituted for any other term
"""
name: str
def subst(self, subst: Subst) -> 'IRTerm':
"""
Perform substitution on a proposition
Returns the same proposition, but with any instances of the variable named in the
substitution replaced with the contents of the substitution.
>>> IRVar('x1').subst(Subst('x1', IRProp('Alex')))
Alex()
>>> IRVar('x1').subst(Subst('x2', IRProp('Alex')))
*x1
"""
if self.name == subst.variable:
return subst.replacement
else:
return self
def __str__(self) -> str:
return repr(self)
def __repr__(self) -> str:
return f'*{self.name}'
def __contains__(self, var: str) -> bool:
"""
Test if a variable with a given name exists in this term
>>> 'x1' in IRVar('x1')
True
>>> 'x1' in IRVar('x2')
False
"""
return var == self.name
@dataclass(frozen=True)
class IRNeg:
"""
A negated proposition
"""
inner: 'IRTerm'
def subst(self, subst: Subst) -> 'IRTerm':
"""
Perform substitution on a proposition
Returns the same proposition, but with any instances of the variable named in the
substitution replaced with the contents of the substitution.
>>> original = IRNeg(IRProp('happy', [IRVar('x1')]))
>>> original
¬happy(*x1)
>>> original.subst(Subst('x1', IRProp('parent', [IRProp('Susie')])))
¬happy(parent(Susie()))
"""
return IRNeg(self.inner.subst(subst))
def __str__(self) -> str:
return repr(self)
def __repr__(self) -> str:
return f'¬{self.inner}'
def __contains__(self, var: str) -> bool:
"""
Test if a variable with a given name exists in this term
>>> 'x1' in IRNeg(IRProp('round', [IRVar('x1')]))
True
"""
return var in self.inner
IRTerm: TypeAlias = IRVar | IRProp | IRNeg
Clause: TypeAlias = FrozenSet[IRTerm]
Clause_: TypeAlias = Collection[IRTerm]
"""
A more general definition of `Clause` which uses a collection rather than a frozen set
Every `Clause` is a `Clause_`, but not vice versa. In other words, `Clause` is effectively a
subtype of `Clause_`.
Due to this generalization, `Clause_` does not necessarily benefit from hashability or
deduplication. It exists mostly to make inputting things easier, e.g. in doctests.
"""
KnowledgeBase: TypeAlias = FrozenSet[Clause]
KnowledgeBase_: TypeAlias = Collection[Clause_]
"""
A more general version of `KnowledgeBase`
`KnowledgeBase_` : `KnowledgeBase` :: `Clause_` : `Clause`
A superclass of `KnowledgeBase`
"""
sub_all: Callable[[Substitutions, IRTerm], IRTerm] = p(reduce, lambda t, s: t.subst(s)) #type:ignore
"""
Perform a series of substitutions on a term
Applies every substitution to the term in order
>>> sub_all(
... [Subst('x1', IRVar('x2')), Subst('x2', IRProp('Karkat'))],
... IRProp('kismesis', [IRVar('x1'), IRVar('x2')]),
... )
kismesis(Karkat(), Karkat())
"""
def unify(t1: IRTerm, t2: IRTerm) -> Result[Substitutions, UnificationError]:
"""
Attempt to find a substitution that unifies two terms
If successful, the returned substitutions will cause both terms to be equal, when
applied to both.
If this method fails, then the pair of subterms which caused the unification to fail
are returned.
>>> unify(
... IRProp('imaginary', [IRProp('Rufio')]),
... IRProp('imaginary', [IRVar('x1')])
... )
Ok((Rufio()/x1,))
>>> unify(
... IRProp('dating', [IRProp('Jade'), IRVar('x1')]),
... IRProp('dating', [IRVar('x1'), IRProp('John')])
... )
Err(UnificationMismatch(term1=Jade(), term2=John()))
>>> unify(
... IRProp('mother_of', [IRVar('x1')]),
... IRVar('x1')
... )
Err(UnificationMismatch(term1=mother_of(*x1), term2=*x1))
"""
match (t1, t2):
case (IRVar(v1), IRVar(v2)) if v1 == v2:
return Ok(tuple())
case (IRVar(v), t_other) | (t_other, IRVar(v)) if v not in t_other: #type: ignore
return Ok((Subst(v, t_other),))
case (IRProp(n1, a1), IRProp(n2, a2)) if n1 == n2 and len(a1) == len(a2):
return unify_lists(a1, a2)
case (IRNeg(i1), IRNeg(i2)):
return unify(i1, i2)
return Err(UnificationMismatch(t1, t2))
def unify_lists(c1: Sequence[IRTerm], c2: Sequence[IRTerm]) -> Result[Substitutions, UnificationError]:
"""
Attempt to perform unification on two term/argument lists
See `unify()` for the details of how this works. When working with lists, the same
rules apply. The substitutions, when applied to every term of both lists, will
cause the lists to become exactly the same.
Lists which are not the same length cannot be unified, and will always fail.
Notice the difference between a list of `IRTerm`s and a `Clause`: Namely, that a
`Clause` is unordered, while a list of `IRTerm`s is ordered.
>>> unify_lists(
... [ IRProp('imaginary', [IRProp('Rufio')]), IRProp('friend', [IRVar('x1'), IRVar('x3')]) ],
... [ IRProp('imaginary', [IRVar('x1')]), IRProp('friend', [IRVar('x2'), IRProp('Tavros')]) ]
... )
Ok((Rufio()/x1, Rufio()/x2, Tavros()/x3))
>>> unify_lists(
... [ IRProp('imaginary', [IRProp('Rufio')]), IRProp('friend', [IRVar('x1'), IRVar('x3')]) ],
... [ IRProp('imaginary', [IRVar('x1')]) ]
... )
Err(LengthMismatch(term=friend(Rufio(),*x3)))
"""
match (c1, c2):
case ([], []):
return Ok(tuple())
case ([h1, *t1], [h2, *t2]):
return unify(h1, h2) << (lambda subs:
unify_lists((*map(p(sub_all,subs),t1),), (*map(p(sub_all,subs),t2),)) <= (
lambda final_subs: (*subs, *final_subs)))
case ([h, *t], []) | ([], [h, *t]):
return Err(LengthMismatch(h))
raise Exception('Unreachable')
if __name__ == '__main__':
import doctest
doctest.testmod()

lab2.py
@@ -1,36 +0,0 @@
from emis_funky_funktions import *
from typing import Sequence
from grammar import lex_and_parse
from lex import Lexeme
from resolution import derives_false
def main(args: Sequence[str]) -> Tuple[str, int]:
match args[1:]:
case [file]:
with open(file) as input:
match lex_and_parse(input.read()):
case Ok(Ok(ir_kb)):
return (
'no'
if derives_false(ir_kb) else
'yes',
0
)
case Ok(Err(gen_ir_err)):
return (str(gen_ir_err), 102)
case Err((Lexeme(token, text, line, col_start, col_end), expected)):
return (
f'Parse error! Line {line}:{col_start}-{col_end}\n\nGot: {repr(text)}\nExpected: {expected}',
101
)
case _:
return (f'Usage: python {args[0]} some_file.cnf', 100)
raise Exception('Unreachable')
if __name__ == '__main__':
from sys import argv
out, code = main(argv)
print(out)
exit(code)

parse.py
@@ -1,141 +0,0 @@
from emis_funky_funktions import *
from dataclasses import dataclass
from functools import wraps
from operator import contains
from typing import Callable, Collection, Mapping, TypeGuard, TypeAlias
def _expected(row: Mapping[B, Collection[Sequence[Any]]]) -> Collection[B]:
"""
Given a single row from an oracle table, identify the expected terminals
"""
return [
terminal
for terminal, expansions in row.items()
if len(expansions)
]
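A small illustration (the row shown is hypothetical; real rows map terminals to expansion lists as built by oracle_table):

_expected({'Comma': [], 'CloseP': [['CloseP']]})   # == ['CloseP'] -- only terminals with a non-empty expansion list are expected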
Action: TypeAlias = Callable[[Sequence[C | D]], Sequence[C | D]]
def parser(
oracle: Mapping[A, Mapping[B, Collection[Sequence[A | B | Action]]]],
identify_lexeme: Callable[[D], B],
start_symbol: A,
) -> Callable[[Sequence[D]], Result[Sequence[C | D], Tuple[D, Collection[B]]]]:
"""
Produces a parser based on a grammar, an oracle, and a start symbol.
The `identify_lexeme` argument should be a function which converts a lexeme into the
token that it represents. This allows for the actual lexemes that are being fed in to
be more complex, and store additional data.
The oracle table may include "action" annotations in its sequences. Actions should be
an instance of `Action`, and should work on the AST stack. Every matched lexeme is
pushed to the AST stack. An action may transform this stack by popping some number of
items off of it, constructing some AST, pushing that AST back to the stack, and then
returning the modified stack.
A couple things to note about this process:
- The stack that is passed to each action is immutable. "Modifications" should be
made by simply constructing and returning a new stack.
- The bottom of the stack is the zero index.
If a parse is successful, the return value will be the AST stack at the end of the
parse. It is up to the caller to verify that this is an expected result.
If a parse fails, the return value will be a tuple containing the erroneous lexeme and
a collection of expected tokens which failed to match it.
### Example:
We generate a simple grammar:
>>> class SimpleVariable(IntEnum):
... S = auto()
... Sum = auto()
... Sum_ = auto()
>>> class SimpleTerminal(IntEnum):
... Number = auto()
... Plus = auto()
... Eof = auto()
... def __repr__(self):
... return self.name
>>> build_S = lambda x: x[1:]
>>> build_Sum = lambda x: (x[0](x[1][1]), *x[2:])
>>> build_Sum_1 = lambda x: (lambda y: x[0] + y, *x[2:])
>>> build_Sum_2 = lambda x: (lambda y: y, *x)
>>> grammar = [
... (SimpleVariable.S, [SimpleVariable.Sum, SimpleTerminal.Eof, build_S]),
... (SimpleVariable.Sum, [SimpleTerminal.Number, SimpleVariable.Sum_, build_Sum]),
... (SimpleVariable.Sum_, [SimpleTerminal.Plus, SimpleVariable.Sum, build_Sum_1]),
... (SimpleVariable.Sum_, [build_Sum_2]),
... ]
>>> is_term = p_instance(SimpleTerminal)
>>> is_var = p_instance(SimpleVariable)
>>> my_oracle_table = oracle_table(is_term, is_var, grammar)
>>> my_parser = parser(my_oracle_table, lambda x: x[0], SimpleVariable.S)
>>> my_parser([
... (SimpleTerminal.Number, 1),
... (SimpleTerminal.Plus,),
... (SimpleTerminal.Number, 3),
... (SimpleTerminal.Plus,),
... (SimpleTerminal.Number, 10),
... (SimpleTerminal.Eof,),
... ])
Ok((14,))
>>> my_parser([
... (SimpleTerminal.Number, 1),
... (SimpleTerminal.Plus,),
... (SimpleTerminal.Number, 3),
... (SimpleTerminal.Number, 10), # <--- this is invalid!
... (SimpleTerminal.Eof,),
... ])
Err(((Number, 10), [Plus, Eof]))
"""
is_var: Callable[[Any], TypeGuard[A]] = p_instance(start_symbol.__class__)
is_tok: Callable[[Any], TypeGuard[B]] = p_instance(next(iter(oracle[start_symbol].keys())).__class__)
def inner(
stack: Sequence[A | B | Action],
ast_stack: Sequence[C | D],
lexemes: Sequence[D],
) -> Result[Sequence[C | D], Tuple[D, Collection[B]]]:
match stack:
# A [Variable]
case [top_of_stack, *popped_stack] if is_var(top_of_stack):
try:
expansions = oracle[top_of_stack][identify_lexeme(lexemes[0])]
except IndexError:
raise Exception('Unexpected end of input. Expected:', _expected(oracle[top_of_stack]))
match expansions:
case []:
return Err((lexemes[0], _expected(oracle[top_of_stack])))
case [expansion]:
return inner((*expansion, *popped_stack), ast_stack, lexemes)
case _:
raise Exception('Not an LL(1) grammar!!!')
# B [Token] (match)
case [top_of_stack, *popped_stack] if is_tok(top_of_stack) and top_of_stack == identify_lexeme(lexemes[0]):
return inner(popped_stack, (lexemes[0], *ast_stack), lexemes[1:])
# B [Token] (no match)
case [top_of_stack, *popped_stack] if is_tok(top_of_stack):
assert is_tok(top_of_stack)
return Err((lexemes[0], (top_of_stack,)))
# Action
case [f, *popped_stack]:
assert hasattr(f, '__call__')
return inner(popped_stack, f(ast_stack), lexemes)
# Empty stack (finished parsing)
case []:
if len(lexemes):
return Err((lexemes[0], []))
else:
return Ok(ast_stack)
raise Exception('Unreachable!')
return wraps(parser)(p(inner, [start_symbol], []))
if __name__ == '__main__':
import doctest
from enum import auto, IntEnum
from build_oracle import oracle_table
doctest.testmod()

resolution.py
@@ -1,250 +0,0 @@
from emis_funky_funktions import *
from itertools import combinations, product
from operator import eq
from typing import Collection, FrozenSet, TypeAlias
from ir import Clause, Clause_, IRNeg, IRProp, IRTerm, IRVar, KnowledgeBase, KnowledgeBase_, Substitutions, sub_all, unify
def terms_cancel(t1: IRTerm, t2: IRTerm) -> Option[Substitutions]:
"""
Determine if two terms could cancel each other out, given the right substitutions
That is, is there some set of substitutions such that t1 = ¬t2 or ¬t1 = t2?
If cancellation is possible and a substitution exists, that substitution is returned.
Otherwise, `None` is returned.
>>> terms_cancel(IRNeg(IRVar('x1')), IRProp('Terezi'))
Some((Terezi()/x1,))
>>> terms_cancel(IRProp('Nepeta'), IRVar('x1'))
Some((¬Nepeta()/x1,))
>>> terms_cancel(IRProp('ancestor', [IRVar('x1')]), IRVar('x1')) is None
True
"""
match (t1, t2):
case (IRNeg(a), b) | (b, IRNeg(a)): #type: ignore
return hush(unify(a, b))
case (IRVar(_) as x, a) | (a, IRVar(_) as x): #type: ignore
return hush(unify(x, IRNeg(a)))
return None
def merge_clauses(c1: Clause_, c2: Clause_) -> KnowledgeBase:
"""
Produce a list of all possible clauses which resolution could derive from c1 and c2
For each term in c1 that could cancel with a term in c2 using a substitution, a
possible clause is produced equal to the union of c1 and c2 with the canceled
terms removed and the substitution applied.
>>> merge_clauses(
... [ IRProp('day'), IRNeg(IRProp('night')) ],
... [ IRProp('night') ]
... )
{ { day() } }
>>> merge_clauses(
... [ IRNeg(IRProp('transgender', [IRVar('x1')])), IRProp('powerful', [IRVar('x1')]) ],
... [ IRNeg(IRProp('powerful', [IRVar('x2')])), IRProp('god', [IRVar('x2')]) ]
... )
{ { god(*x1), ¬transgender(*x1) } }
>>> merge_clauses(
... [ IRNeg(IRProp('day')), IRProp('night') ],
... [ IRVar('x2') ]
... )
{ { night() }, { ¬day() } }
If two clauses cannot merge, an empty set is returned
>>> merge_clauses(
... [ IRProp('day') ],
... [ IRProp('wet') ]
... )
{ }
"""
terms1, terms2 = list(c1), list(c2)
valid_substitutions = drop_none(
map_opt(lambda subs: (subs, i1, i2), terms_cancel(t1, t2))
for ((i1, t1), (i2, t2)) in product(enumerate(terms1), enumerate(terms2))
)
return FSet(
FSet(
sub_all(subs, term)
for term in (*terms1[:i1], *terms1[i1 + 1:], *terms2[:i2], *terms2[i2 + 1:])
)
for (subs, i1, i2) in valid_substitutions
)
def derive(clauses: KnowledgeBase_) -> KnowledgeBase:
"""
All possible clauses which derive in one step of resolution from a knowledge base
Attempts to merge every possible pair of clauses in the knowledge base.
>>> derive([
... [IRNeg(IRProp('animal', [IRProp('Kim')]))],
... [IRNeg(IRProp('dog', [IRVar('x0')])), IRProp('animal', [IRVar('x0')])],
... [IRNeg(IRProp('cat', [IRVar('x1')])), IRProp('animal', [IRVar('x1')])],
... [IRProp('dog', [IRProp('Kim')])],
... ]) #doctest: +NORMALIZE_WHITESPACE
{
{ animal(Kim()) },
{ ¬cat(Kim()) },
{ ¬dog(Kim()) }
}
"""
return FSet(
FSet(clause)
for (c1, c2) in combinations(clauses, 2)
for clause in merge_clauses(c1, c2)
)
def derive2(kb1: KnowledgeBase_, kb2: KnowledgeBase_) -> KnowledgeBase:
"""
All clauses which derive in one step from the combination of two knowledge bases
Each resulting clause is the combination of one clause from the left knowledge base
with exactly one clause from the right knowledge base. Clauses from the same
knowledge base will not be combined.
>>> derive2([
... [IRNeg(IRProp('animal', [IRProp('Kim')]))],
... [IRNeg(IRProp('cat', [IRVar('x1')])), IRProp('animal', [IRVar('x1')])],
... ], [
... [IRNeg(IRProp('dog', [IRVar('x0')])), IRProp('animal', [IRVar('x0')])],
... [IRProp('dog', [IRProp('Kim')])],
... ])
{ { ¬dog(Kim()) } }
"""
return FSet(
FSet(clause)
for (c1, c2) in product(kb1, kb2)
for clause in merge_clauses(c1, c2)
)
false_clause: Clause = FSet()
"""
The clause which represents the logical condition False
i.e. the empty clause
"""
def next_generation(
previous_generations: KnowledgeBase,
current_generation: KnowledgeBase,
) -> KnowledgeBase:
"""
Advance resolution by one step
This performs `derive()` on the current generation as well as `derive2()` between the
current generation and all clauses from previous generations in order to produce a new
generation, hopefully with new and interesting clauses in it.
When using the new generation, the previous generations should be considered to be the sum
of all clauses within the two knowledge bases passed to this function, that is, all of the
clauses which were used to produce the new generation.
### Example
Starting from an example knowledge base, we produce a new generation. Notice that the
empty list for previous generations indicates that this is the zeroth (original)
knowledge base.
>>> next_generation(fset(
... ), fset(
... fset( IRNeg(IRProp('animal', (IRProp('Kim'),))) ),
... fset( IRNeg(IRProp('dog', (IRVar('x0'),))), IRProp('animal', (IRVar('x0'),)) ),
... fset( IRNeg(IRProp('cat', (IRVar('x1'),))), IRProp('animal', (IRVar('x1'),)) ),
... fset( IRProp('dog', (IRProp('Kim'),)) ),
... )) #doctest: +NORMALIZE_WHITESPACE
{
{ animal(Kim()) },
{ ¬cat(Kim()) },
{ ¬dog(Kim()) }
}
Now, we perform the next round of resolution. All of the clauses originally passed in as
the first generation have been moved to the previous-generations knowledge base, and
the current generation contains only the results from the first call.
>>> next_generation(fset(
... fset( IRNeg(IRProp('animal', (IRProp('Kim'),))) ),
... fset( IRNeg(IRProp('dog', (IRVar('x0'),))), IRProp('animal', (IRVar('x0'),)) ),
... fset( IRNeg(IRProp('cat', (IRVar('x1'),))), IRProp('animal', (IRVar('x1'),)) ),
... fset( IRProp('dog', (IRProp('Kim'),)) ),
... ), fset(
... fset( IRNeg(IRProp('dog', (IRProp('Kim'),))) ),
... fset( IRNeg(IRProp('cat', (IRProp('Kim'),))) ),
... fset( IRProp('animal', (IRProp('Kim'),)) ),
... ))
{ { } }
The return of a knowledge base containing false (i.e. the empty clause) indicates that resolution
has found a contradiction. In this case, there are two: One from ¬dog(Kim) and
dog(Kim), and one from animal(Kim) and ¬animal(Kim).
Excluded from the next generation is any clause which already exists in the current or a previous generation.
For example:
>>> next_generation(fset(
... ), fset(
... fset( IRNeg(IRProp('day')), IRProp('day') ),
... fset( IRProp('day') ),
... ))
{ }
This configuration could theoretically derive day() again; however, it does not,
because that fact is already known and is present in the current generation.
"""
return FSet(
(derive(current_generation) | derive2(current_generation, previous_generations))
- current_generation
- previous_generations
)
def derives_false(
knowledge_base: KnowledgeBase,
previous_generations: KnowledgeBase = FSet(),
) -> bool:
"""
Determine if it is possible to derive false from a given knowledge base.
Uses any number of generations of resolution via `next_generation()` to find a
generation which contains the `false_clause`, indicating that the original knowledge
base (and all subsequent generations) are contradictory.
`previous_generations` may be set if the current knowledge base was derived from some
other series of clauses using `derive()`.
>>> derives_false(fset(
... fset( IRNeg(IRProp('animal', (IRProp('Kim'),))) ),
... fset( IRNeg(IRProp('dog', (IRVar('x0'),))), IRProp('animal', (IRVar('x0'),)) ),
... fset( IRNeg(IRProp('cat', (IRVar('x1'),))), IRProp('animal', (IRVar('x1'),)) ),
... fset( IRProp('dog', (IRProp('Kim'),)) ),
... ))
True
>>> derives_false(fset(
... fset( IRNeg(IRProp('animal', (IRProp('Kim'),))) ),
... fset( IRNeg(IRProp('dog', (IRVar('x0'),))), IRProp('animal', (IRVar('x0'),)) ),
... fset( IRNeg(IRProp('cat', (IRVar('x1'),))), IRProp('animal', (IRVar('x1'),)) ),
... ))
False
"""
if false_clause in knowledge_base:
return True
elif len(knowledge_base) == 0:
return False
else:
return derives_false(
next_generation(previous_generations, knowledge_base),
knowledge_base | previous_generations
)
if __name__ == '__main__':
import doctest
doctest.testmod()

tokens.py
@@ -1,45 +0,0 @@
from enum import auto, IntEnum
from typing import Collection, Tuple
from re import compile, Pattern
class Tok(IntEnum):
"""
All possible tokens used in the grammar
"""
Newline = auto()
Whitespace = auto()
PredicateSection = auto()
VariablesSection = auto()
ConstantsSection = auto()
FunctionsSection = auto()
ClausesSection = auto()
Negate = auto()
OpenP = auto()
CloseP = auto()
Comma = auto()
Identifier = auto()
Eof = auto()
def __repr__(self):
return self._name_
LEX_TABLE: Collection[Tuple[Pattern[str], Tok]] = [
(compile(r"\n"), Tok.Newline),
(compile(r"[ \t]+"), Tok.Whitespace),
(compile("Predicates:"), Tok.PredicateSection),
(compile("Variables:"), Tok.VariablesSection),
(compile("Constants:"), Tok.ConstantsSection),
(compile("Functions:"), Tok.FunctionsSection),
(compile("Clauses:"), Tok.ClausesSection),
(compile("!"), Tok.Negate),
(compile(r"\("), Tok.OpenP),
(compile(r"\)"), Tok.CloseP),
(compile(","), Tok.Comma),
(compile(r"\w+"), Tok.Identifier),
]
"""
A mapping of regexes to the tokens they identify
Tokens earlier on in the list should be regarded as higher priority, even if a match lower
on the list also matches. All unicode strings should be matched by at least one token.
"""