Amo/src/token.rs

//! A series of structures and methods for lexing an input
//!
//! Lexing is the process of taking a stream of characters and converting it into a series
//! of tokens (or lexemes).  Lexemes are designed to be the smallest
//! machine-understandable unit of information - for example, a keyword, operator,
//! literal, or symbol.
//!
//! Look through the [`Token`] enum to see a full list of the lexemes used by amo.
//!
//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes
//! a provided string.

use logos::Logos;

/// Every lexeme that can appear in amo source code
///
/// The lexer itself is generated by the `Logos` derive: each variant carries the
/// `#[token]` / `#[regex]` patterns that produce it.
#[derive(Logos, Debug, PartialEq, Eq)]
pub enum Token {

	/// A lexer error happened, woops!
	///
	/// Also, this catches and ignores anything that should never reach the parser:
	/// runs of spaces/tabs, a newline followed by indentation, `//` line comments and
	/// `/* ... */` block comments.
	#[error]
	#[regex(r"[ \t]+", logos::skip)]
	#[regex(r"\n[ \t]+", logos::skip)]
	// `[^\n]*` (rather than `.+`) so that an empty `//` comment is skipped too instead
	// of lexing as two division operators.
	#[regex(r"//[^\n]*", logos::skip)]
	// Inside a character class `.` is a literal period, so the body of the comment is
	// spelled out explicitly: anything that is not `*`, or a run of `*` that is not
	// followed by `/`.  The comment ends at the first `*/`.
	#[regex(r"/\*(?:[^*]|\*+[^*/])*\*+/", logos::skip)]
	Error,

	/// The `type` keyword
	///
	/// Used to denote that a new type (called an enum in some languages) is being
	/// declared
	#[token("type")]
	Type,

	/// The `struct` keyword
	///
	/// Used to denote the declaration of a kind of struct
	#[token("struct")]
	Struct,

	/// The `trait` keyword
	///
	/// Denotes the declaration of a new trait
	#[token("trait")]
	Trait,

	/// The `needs` keyword
	///
	/// Used as part of a trait declaration to denote methods that will be required for a
	/// trait
	#[token("needs")]
	Needs,

	/// The `if` keyword
	///
	/// Used to begin an If-Then-Else statement or an If-Is statement
	#[token("if")]
	If,

	/// The `is` keyword
	///
	/// Used as part of an If-Is statement to indicate the start of the case listings
	#[token("is")]
	Is,

	/// The `then` keyword
	///
	/// Indicates the start of the code block for the positive section of an If-Then-Else
	/// statement
	#[token("then")]
	Then,

	/// The `else` keyword
	///
	/// Denotes the end of the positive section of an If-Then-Else block, and the
	/// beginning of the negative section
	#[token("else")]
	Else,

	/// The `impl` keyword
	///
	/// Used to denote the start of a trait implementation
	#[token("impl")]
	Impl,

	/// The `on` keyword
	///
	/// Used in trait implementations to separate the trait being implemented and the
	/// type it's being implemented on.
	#[token("on")]
	On,

	/// The `let` keyword
	///
	/// Allows binding a value to an immutable variable that can be used multiple times
	#[token("let")]
	Let,

	/// The `in` keyword
	///
	/// Used to separate a series of `let` bindings from the expression they're being
	/// used in.
	#[token("in")]
	In,

	/// An `->` arrow
	///
	/// Used as part of function type annotations as well as in the cases of If-Is blocks
	#[token("->")]
	Aro,

	/// An `=` assignment operator
	///
	/// Used to separate the left & right hand sides of an assignment operation
	#[token("=")]
	Assign,

	/// The `|` keyword (or punctuation?  idk what it's called)
	///
	/// Used in delineating variants of a type
	#[token("|")]
	VBar,

	/// The `_` symbol
	///
	/// Generally used as a placeholder or standin for another type
	#[token("_")]
	Placeholder,

	/// The `:` symbol
	///
	/// Used as a separator in various parts of the language
	#[token(":")]
	Colon,

	/// Any infix binop (binary operator)
	///
	/// E.g. +, -, >, /, %, etc.
	///
	/// These are operators that take two operands, one on the left, and one on the right,
	/// and produce a single value.  The payload says which operator was matched; see
	/// [`InfixOp`] for the full list.
	#[token("&&", |_| InfixOp::LAnd)]
	#[token("||", |_| InfixOp::LOr)]
	#[token("==", |_| InfixOp::Eq)]
	#[token("!=", |_| InfixOp::NEq)]
	#[token("*",  |_| InfixOp::Mult)]
	#[token("%",  |_| InfixOp::Mod)]
	#[token("/",  |_| InfixOp::Div)]
	#[token("+",  |_| InfixOp::Add)]
	#[token("-",  |_| InfixOp::Sub)]
	#[token("<",  |_| InfixOp::Less)]
	#[token(">",  |_| InfixOp::Greater)]
	Infix(InfixOp),

	/// Some literal (a constant value represented textually)
	///
	/// For example, 100 is an integer literal and "hewwo" is a string literal.
	///
	/// A string literal is a `"`, then any number of characters that are neither `"`,
	/// `\` nor a newline (or a `\` followed by any character, i.e. an escape), then a
	/// closing `"`.  Excluding `"` from the body keeps two strings on the same line from
	/// being merged into one literal, and allowing zero characters makes `""` valid.
	#[regex(r#""(?:[^"\\\n]|\\.)*""#,  |lex| Literal::from_string_match(lex.slice()))]
	#[regex(r"\d+",  |lex| Literal::from_int_match(lex.slice()))]
	Literal(Literal),

	/// Some symbol, usually a variable or a type
	///
	/// Starts with an ASCII letter, followed by any number of letters and digits.  The
	/// explicit `priority = 0` is there so that keywords win over this pattern.
	#[regex(r"[a-zA-Z][a-zA-Z\d]*",  |lex| lex.slice().to_string(), priority = 0)]
	Symbol(String),

	/// An opening `[` square bracket
	///
	/// Usually used in arrays and domain restrictions
	#[token("[")]
	OpenSquareBracket,

	/// A closing `]` square bracket
	///
	/// Usually used in arrays and domain restrictions, and the counterpart to the opening
	/// square bracket.
	///
	/// In amo, the opening and closing square brackets are both lesbians, and they're
	/// dating.  The closing square bracket is transgender, also.
	#[token("]")]
	CloseSquareBracket,

	/// An opening `(` paren
	///
	/// Usually used to make explicit the order of operations
	#[token("(")]
	OpenParen,

	/// A closing `)` paren
	///
	/// The counterpart to the open parenthesis.
	#[token(")")]
	CloseParen,

	/// A `..` range operator
	///
	/// Used to denote, well, a range between the values on the left and the right.
	#[token("..")]
	RangeOp,

	/// A `.` period
	///
	/// For getting fields of structs
	#[token(".")]
	Dot,

	/// A `,` comma
	///
	/// The age-old and timeless delineator
	#[token(",")]
	Comma,

	/// A newline NOT followed by whitespace
	///
	/// This means that the following tokens are at the start of a line.  For example
	/// (on any line after the first, since the pattern needs a newline to match)
	///
	/// ```text
	/// variable = value
	/// ```
	///
	/// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`,
	/// whereas
	///
	/// ```text
	///     variable = value
	/// ```
	///
	/// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`.  This makes it easy
	/// to identify declarations.
	///
	/// A newline followed by spaces or tabs is instead consumed by one of the skip
	/// patterns on [`Token::Error`].
	#[regex(r"\s*\n")]
	DeclarationStart,
}

/// A specific infix operator
///
/// Used to specify the [`Token::Infix`] variant.  The enum carries no data, so it is
/// `Copy`: operators can be passed around and compared without moving them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InfixOp {
	/// The logical AND operator
	///
	/// Takes two boolean values and returns true iff both values are true.  Otherwise,
	/// returns false.
	LAnd,

	/// The logical OR operator
	///
	/// Takes two boolean values and returns true if either is true
	LOr,

	/// The multiplicative operator.
	///
	/// Takes two numeric values and returns their product
	Mult,

	/// The modulo operator.
	///
	/// Takes two numeric values and returns the remainder of their division
	Mod,

	/// The division operator.
	///
	/// Takes two numeric values and returns their quotient
	Div,

	/// The additive operator.
	///
	/// Takes two numeric values and returns their sum
	Add,

	/// The subtractive operator.
	///
	/// Takes two numeric values and returns their difference
	Sub,

	/// The equality operator.
	///
	/// Takes two values and returns true iff they are equal
	Eq,

	/// The inequality operator.
	///
	/// Takes two values and returns true iff they are NOT equal
	NEq,

	/// The less-than operator.
	///
	/// Takes two numeric values and returns true iff the first is LESS than the second
	Less,

	/// The greater-than operator.
	///
	/// Takes two numeric values and returns true iff the first is GREATER than the second
	Greater,
}

/// A specific type of literal, used for the [`Token::Literal`] token
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Literal {
	/// A string literal
	///
	/// The internal [`String`] is the content of the string, with the surrounding
	/// quotes removed and escape sequences already processed (see
	/// [`Literal::from_string_match`]).
	String(String),

	/// An integer literal
	Int(u64),
}

impl Literal {
	/// Create a string literal by parsing a matched regex
	///
	/// The input should be in the form of `"<content>"` WITH THE QUOTES (").  The quotes
	/// are trimmed and the escape sequences in the content are substituted:
	///
	/// | Escape | Result          |
	/// |--------|-----------------|
	/// | `\n`   | line feed       |
	/// | `\t`   | tab             |
	/// | `\r`   | carriage return |
	/// | `\0`   | NUL             |
	/// | `\\`   | `\`             |
	/// | `\"`   | `"`             |
	/// | `\'`   | `'`             |
	///
	/// Any other escape (and a lone trailing backslash) is kept verbatim so that no
	/// input is silently lost.
	///
	/// NOTE(review): the set of escapes above is the conventional one; confirm it against
	/// the intended language definition.
	///
	/// If the input is not wrapped in quotes, a warning is printed to stderr and an
	/// empty string literal is returned.  The lexer should never let that happen.
	pub fn from_string_match(s: &str) -> Self {
		// Strip by pattern instead of slicing on byte offsets: slicing would panic if the
		// first or last byte were part of a multi-byte character.
		let unquoted = s
			.strip_prefix('"')
			.and_then(|rest| rest.strip_suffix('"'));

		match unquoted {
			Some(content) => Self::String(Self::unescape(content)),
			None => {
				// This should be unreachable, but this is kept just in case.
				eprintln!("[WARN] ---[Ruh roh!]-------------------------------------");
				eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!");
				eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect.");
				eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer.");
				eprintln!("[WARN] --------------------------------------------------");
				Self::String(String::new())
			}
		}
	}

	/// Create an integer literal by parsing a regex match
	///
	/// The input should be in the form of a series of ASCII digits 0-9 of any length.
	/// Any parse error (for example a value that does not fit in a [`u64`]) results in
	/// [`None`] being returned instead.  These indicate a problem with the user's code,
	/// and should be reported.
	pub fn from_int_match(s: &str) -> Option<Self> {
		s.parse::<u64>().ok().map(Self::Int)
	}

	/// Replace the backslash escape sequences in `raw` with the characters they denote
	///
	/// Unknown escapes and a trailing lone backslash are copied through unchanged.
	fn unescape(raw: &str) -> String {
		// The result is never longer than the input, so this never reallocates.
		let mut out = String::with_capacity(raw.len());
		let mut chars = raw.chars();

		while let Some(c) = chars.next() {
			if c != '\\' {
				out.push(c);
				continue;
			}

			match chars.next() {
				Some('n') => out.push('\n'),
				Some('t') => out.push('\t'),
				Some('r') => out.push('\r'),
				Some('0') => out.push('\0'),
				Some('\\') => out.push('\\'),
				Some('"') => out.push('"'),
				Some('\'') => out.push('\''),
				Some(other) => {
					out.push('\\');
					out.push(other);
				}
				None => out.push('\\'),
			}
		}

		out
	}
}