Amo/src/token.rs

//! A series of structures and methods for lexing an input
//!
//! Lexing is the process of taking a stream of characters and converting it into a series
//! of tokens (or lexemes).  Lexemes are designed to be the smallest
//! machine-understandable unit of information - for example, a keyword, operator,
//! literal, or symbol.
//!
//! Look through the [`Token`] enum to see a full list of the lexemes used by amo.
//!
//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes
//! a provided string.

use logos::Logos;

/// A single lexeme of amo source code
///
/// The lexer itself is generated by the [`Logos`] derive from the `#[token]` and `#[regex]`
/// attributes attached to each variant.  Whitespace and comments are skipped by the
/// patterns attached to [`Token::Error`] and never appear in the token stream.
#[derive(Logos, Debug, PartialEq, Eq)]
pub enum Token {

	/// A lexer error happened, woops!
	///
	/// Also, this catches and ignores any whitespace that might be encountered, as well
	/// as line (`// ...`) and block (`/* ... */`) comments.
	#[error]
	#[regex(r"[ \t]+", logos::skip)]
	#[regex(r"\n[ \t]+", logos::skip)]
	// `[^\n]*` rather than `.+` so that an empty `//` comment is skipped as well, instead
	// of falling through and lexing as two division operators.
	#[regex(r"//[^\n]*", logos::skip)]
	// NOTE(review): this pattern appears not to accept block comments whose closing `*/`
	// is directly preceded by another `*` (e.g. `/***/`) - confirm and tighten if so.
	#[regex(r"/\*([^*]*(\*[^/])?)+\*/", logos::skip)]
	Error, //d00

	/// The `type` keyword
	///
	/// Used to denote that a new type (called an enum in some languages) is being
	/// declared
	#[token("type")]
	Type, //d01

	/// The `struct` keyword
	///
	/// Used to denote the declaration of a kind of struct
	#[token("struct")]
	Struct, //d02

	/// The `trait` keyword
	///
	/// Denotes the declaration of a new trait
	#[token("trait")]
	Trait, //d03

	/// The `needs` keyword
	///
	/// Used as part of a trait declaration to denote methods that will be required for a
	/// trait
	#[token("needs")]
	Needs, //d04

	/// The `if` keyword
	///
	/// Used to begin an If-Then-Else statement or an If-Is statement
	#[token("if")]
	If, //d05

	/// The `is` keyword
	///
	/// Used as part of an If-Is statement to indicate the start of the case listings
	#[token("is")]
	Is, //d06

	/// The `then` keyword
	///
	/// Indicates the start of the code block for the positive section of an If-Then-Else
	/// statement
	#[token("then")]
	Then, //d07

	/// The `else` keyword
	///
	/// Denotes the end of the positive section of an If-Then-Else block, and the beginning
	/// of the negative section
	#[token("else")]
	Else, //d08

	/// The `impl` keyword
	///
	/// Used to denote the start of a trait implementation
	#[token("impl")]
	Impl, //d09

	/// The `on` keyword
	///
	/// Used in trait implementations to separate the trait being implemented and the type
	/// it's being implemented on.
	#[token("on")]
	On, //d10

	/// The `let` keyword
	///
	/// Allows binding a value to an immutable variable that can be used multiple times
	#[token("let")]
	Let, //d11

	/// The `in` keyword
	///
	/// Used to separate a series of `let` bindings from the expression they're being used
	/// in.
	#[token("in")]
	In, //d12

	/// An `=>` arrow
	///
	/// Used as part of function type annotations as well as in the cases of If-Is blocks
	#[token("=>")]
	DubAro, //d13

	/// An `=` assignment operator
	///
	/// Used to separate the left & right hand sides of an assignment operation
	#[token("=")]
	Assign, //d14

	/// Type Operator
	///
	/// NOTE(review): this is attached to the same literal (`type`) as [`Token::Type`], with
	/// an explicit priority.  That looks like it makes one of the two variants
	/// unreachable - confirm which lexeme the type operator is meant to match.
	#[token("type", priority = 9)]
	TypeOp, //d15

	/// The `_` symbol
	///
	/// Generally used as a placeholder or standin for another type
	#[token("_")]
	Placeholder, //d16

	/// The `:` symbol
	///
	/// Used as a separator in various parts of the language
	#[token(":")]
	Colon, //d17

	/// A rank 1 (applied last) infix binop (binary operator)
	///
	/// i.e. Logical Or
	#[token("||", |_| InfixRank1::LOr)]
	#[token("|", |_| InfixRank1::VBar)]
	R1Infix(InfixRank1), //d18

	/// A rank 2 infix binop (binary operator)
	///
	/// i.e. Logical And
	///
	/// NOTE(review): `,` is mapped to the same [`InfixRank2::Aro`] value as `->` - confirm
	/// that this is intended rather than a placeholder.
	#[token("&&", |_| InfixRank2::LAnd)]
	#[token("->", |_| InfixRank2::Aro)]
	#[token(",", |_| InfixRank2::Aro)]
	R2Infix(InfixRank2), //d19

	/// A rank 3 infix binop (binary operator)
	///
	/// i.e. Comparison operators like == and <
	#[token("==", |_| InfixRank3::Eq)]
	#[token("!=", |_| InfixRank3::NEq)]
	#[token("<", |_| InfixRank3::LessThan)]
	#[token(">", |_| InfixRank3::GreaterThan)]
	R3Infix(InfixRank3), //d20

	/// A rank 4 infix binop (binary operator)
	///
	/// i.e. Range
	#[token("..", |_| InfixRank4::Range)]
	R4Infix(InfixRank4), //d21

	/// A rank 5 infix binop (binary operator)
	///
	/// i.e. Addition & Subtraction
	#[token("+", |_| InfixRank5::Add)]
	#[token("-", |_| InfixRank5::Sub)]
	R5Infix(InfixRank5), //d22

	/// A rank 6 (applied first) infix binop (binary operator)
	///
	/// i.e. Multiplication, Division, and Modulo
	#[token("*", |_| InfixRank6::Mul)]
	#[token("/", |_| InfixRank6::Div)]
	#[token("%", |_| InfixRank6::Mod)]
	R6Infix(InfixRank6), //d23

	/// Some literal (a constant value represented textually)
	///
	/// For example, 100 is an integer literal and "hewwo" is a string literal.  The
	/// matched text is converted by [`Literal::from_string_match`] or
	/// [`Literal::from_int_match`] respectively.
	#[regex("\"(?:[^\"]*(?:\\\\\")?)+\"",  |lex| Literal::from_string_match(lex.slice()))]
	#[regex(r"\d+",  |lex| Literal::from_int_match(lex.slice()))]
	Literal(Literal), //d24

	/// Some symbol, usually a variable or a type
	///
	/// Given the lowest priority so that keywords take precedence over it.
	#[regex(r"[a-zA-Z_][a-zA-Z\d_]*",  |lex| lex.slice().to_string(), priority = 0)]
	Symbol(String), //d25

	/// An opening `[` square bracket
	///
	/// Usually used in arrays and domain restrictions
	#[token("[")]
	OpenSquareBracket, //d26

	/// A closing `]` square bracket
	///
	/// Usually used in arrays and domain restrictions, and the counterpart to the opening
	/// square bracket.
	///
	/// In amo, the opening and closing square brackets are both lesbians, and they're
	/// dating.  The closing square bracket is transgender, also.
	#[token("]")]
	CloseSquareBracket, //d27

	/// An opening `(` paren
	///
	/// Usually used to make explicit the order of operations
	#[token("(")]
	OpenParen, //d28

	/// A closing `)` paren
	///
	/// Usually used to make explicit the order of operations, this is the counterpart to
	/// the open parenthesis.
	#[token(")")]
	CloseParen, //d29

	/// A `.` period
	///
	/// For getting fields of structs
	#[token(".")]
	Dot, //d30

	/// A `;` separator
	///
	/// The age-old and timeless delineator
	///
	/// NOTE(review): the variant is named `Comma`, but the pattern it matches is `;`; a
	/// literal `,` is currently lexed as [`Token::R2Infix`] instead - confirm which is
	/// intended.
	#[token(";")]
	Comma, //d31

	/// A newline NOT followed by whitespace
	///
	/// This means that the following tokens are at the start of a line.  For example
	///
	/// ```text
	/// variable = value
	/// ```
	///
	/// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`,
	/// whereas
	///
	/// ```text
	///     variable = value
	/// ```
	///
	/// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`.  This makes it easy
	/// to identify declarations.
	#[regex(r"(\s*\n)+")]
	DeclarationStart, //d32

	/// Denotes that the parser has reached the end of the input
	///
	/// This is always the last token in a stream, both in that it must be present in all
	/// streams, and in that it will never be followed by any tokens.
	///
	/// No lexer pattern is attached to this variant, so it is never produced by the
	/// generated lexer itself; presumably the consumer of the lexer appends it.
	EOF, //d33
}

/// The operators carried by [`Token::R1Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank1 {
	/// The logical OR operator, lexed from `||`
	///
	/// Takes two boolean values and returns true if either is true
	LOr,

	/// The VBar operator, lexed from `|`
	///
	/// Takes two variant sets and returns their sum
	VBar,
}

/// The operators carried by [`Token::R2Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank2 {
	/// The logical AND operator, lexed from `&&`
	///
	/// Takes two boolean values and returns true iff both values are true.  Otherwise,
	/// returns false.
	LAnd,

	/// The Aro operator, lexed from `->`
	///
	/// The lexer currently also produces this variant for `,`.
	Aro,
}

/// The comparison operators carried by [`Token::R3Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank3 {
	/// The equality operator, lexed from `==`.
	///
	/// Takes two values and returns true iff they are equal
	Eq,

	/// The inequality operator, lexed from `!=`.
	///
	/// Takes two values and returns true iff they are NOT equal
	NEq,

	/// The less-than operator, lexed from `<`.
	///
	/// Takes two numeric values and returns true iff the first is LESS than the second
	LessThan,

	/// The greater-than operator, lexed from `>`.
	///
	/// Takes two numeric values and returns true iff the first is GREATER than the second
	GreaterThan,
}

/// The operators carried by [`Token::R4Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank4 {
	/// The range operator, lexed from `..`.
	///
	/// Takes two numeric values and returns a range from the first to the second
	Range,
}

/// The additive operators carried by [`Token::R5Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank5 {
	/// The additive operator, lexed from `+`.
	///
	/// Takes two numeric values and returns their sum
	Add,

	/// The subtractive operator, lexed from `-`.
	///
	/// Takes two numeric values and returns their difference
	Sub,
}

/// The multiplicative operators carried by [`Token::R6Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank6 {
	/// The multiplicative operator, lexed from `*`.
	///
	/// Takes two numeric values and returns their product
	Mul,

	/// The modulo operator, lexed from `%`.
	///
	/// Takes two numeric values and returns the remainder of their division
	Mod,

	/// The division operator, lexed from `/`.
	///
	/// Takes two numeric values and returns their quotient
	Div,
}

/// A specific type of literal, used for the [`Token::Literal`] token
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Literal {
	/// A string literal
	///
	/// The internal [`String`] is the content of the string, with the surrounding quotes
	/// removed and escape sequences already processed.
	String(String),

	/// An integer literal
	///
	/// Holds the parsed value of the digits as an unsigned 64-bit integer.
	Int(u64)
}

impl Literal {
	/// Create a string literal by parsing a matched regex
	///
	/// The input should be in the form of `"<content>"` WITH THE QUOTES (").  The first
	/// and last characters are trimmed, and character escape sequences in the remaining
	/// content are substituted (see [`Literal::unescape`] for the recognised set).
	///
	/// If the input is too short to contain both quotes, or cannot be trimmed on a
	/// character boundary, a warning is printed to stderr and an empty string literal is
	/// returned instead of panicking.
	pub fn from_string_match(s: &str) -> Self {
		// `str::get` instead of indexing: a malformed match whose first or last character
		// is multi-byte must not panic on a non-character-boundary slice.
		let content = match s.len() {
			0 | 1 => None,
			len => s.get(1..len - 1),
		};

		match content {
			Some(raw) => Self::String(Self::unescape(raw)),
			None => {
				// This should be unreachable, but this is kept just in case.
				eprintln!("[WARN] ---[Ruh roh!]-------------------------------------");
				eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!");
				eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect.");
				eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer.");
				eprintln!("[WARN] --------------------------------------------------");
				Self::String(String::new())
			}
		}
	}

	/// Create an integer literal by parsing a regex match
	///
	/// The input should be in the form of a series of ASCII digits 0-9 of any length.
	/// Any parse errors (including values too large for a `u64`) will result in [`None`]
	/// being returned instead.  These indicate a problem with the user's code, and should
	/// be reported.
	pub fn from_int_match(s: &str) -> Option<Self> {
		s.parse::<u64>().ok().map(Self::Int)
	}

	/// Substitute backslash escape sequences in the content of a string literal
	///
	/// Recognises `\"`, `\\`, `\n`, `\t`, `\r` and `\0`.  Any other escape sequence, and a
	/// lone trailing backslash, is copied through unchanged so that no input is lost.
	fn unescape(raw: &str) -> String {
		let mut out = String::with_capacity(raw.len());
		let mut chars = raw.chars();

		while let Some(c) = chars.next() {
			if c != '\\' {
				out.push(c);
				continue;
			}

			match chars.next() {
				Some('"') => out.push('"'),
				Some('\\') => out.push('\\'),
				Some('n') => out.push('\n'),
				Some('t') => out.push('\t'),
				Some('r') => out.push('\r'),
				Some('0') => out.push('\0'),
				Some(other) => {
					out.push('\\');
					out.push(other);
				}
				None => out.push('\\'),
			}
		}

		out
	}
}