Amo/src/token.rs

313 lines
8.2 KiB
Rust

//! A series of structures and methods for lexing an input
//!
//! Lexing is the process of taking a stream of characters and converting it into a series
//! of tokens (or lexemes). Lexemes are designed to be the smallest
//! machine-understandable unit of information - for example, a keyword, operator,
//! literal, or symbol.
//!
//! Look through the [`Token`] enum to see a full list of the lexemes used by amo.
//!
//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes
//! a provided string.
use logos::Logos;
/// A single lexeme of amo source code
///
/// The lexer itself is generated by the [`Logos`] derive; the `#[token]` and `#[regex]`
/// attributes on each variant describe exactly which text produces it.
#[derive(Logos, Debug, PartialEq, Eq)]
pub enum Token {
    /// A lexer error happened, woops!
    ///
    /// Also, this catches and ignores any whitespace and comments that might be
    /// encountered: runs of spaces and tabs, the indentation that follows a newline,
    /// `//` line comments, and `/* ... */` block comments.
    #[error]
    #[regex(r"[ \t]+", logos::skip)]
    #[regex(r"\n[ \t]+", logos::skip)]
    // `[^\n]*` (rather than `.+`) so that an empty `//` comment is skipped too instead
    // of lexing as two division operators.
    #[regex(r"//[^\n]*", logos::skip)]
    // Inside a character class `.` is a literal dot, so `[.\n]+` only ever matched
    // comments made up of dots and newlines. This pattern matches any text up to the
    // first closing `*/`, without relying on non-greedy repetition.
    #[regex(r"/\*([^*]|\*+[^*/])*\*+/", logos::skip)]
    Error,
    /// The `type` keyword
    ///
    /// Used to denote that a new type (called an enum in some languages) is being
    /// declared
    #[token("type")]
    Type,
    /// The `struct` keyword
    ///
    /// Used to denote the declaration of a kind of struct
    #[token("struct")]
    Struct,
    /// The `trait` keyword
    ///
    /// Denotes the declaration of a new trait
    #[token("trait")]
    Trait,
    /// The `needs` keyword
    ///
    /// Used as part of a trait declaration to denote methods that will be required for a
    /// trait
    #[token("needs")]
    Needs,
    /// The `if` keyword
    ///
    /// Used to begin an If-Then-Else statement or an If-Is statement
    #[token("if")]
    If,
    /// The `is` keyword
    ///
    /// Used as part of an If-Is statement to indicate the start of the case listings
    #[token("is")]
    Is,
    /// The `then` keyword
    ///
    /// Indicates the start of the code block for the positive section of an If-Then-Else
    /// statement
    #[token("then")]
    Then,
    /// The `else` keyword
    ///
    /// Denotes the end of the positive section of an If-Then-Else block, and the
    /// beginning of the negative section
    #[token("else")]
    Else,
    /// The `impl` keyword
    ///
    /// Used to denote the start of a trait implementation
    #[token("impl")]
    Impl,
    /// The `on` keyword
    ///
    /// Used in trait implementations to separate the trait being implemented and the
    /// type it's being implemented on.
    #[token("on")]
    On,
    /// An `->` arrow
    ///
    /// Used as part of function type annotations as well as in the cases of If-Is blocks
    #[token("->")]
    Aro,
    /// An `=` assignment operator
    ///
    /// Used to separate the left & right hand sides of an assignment operation
    #[token("=")]
    Assign,
    /// The `|` keyword (or punctuation? idk what it's called)
    ///
    /// Used in delineating variants of a type
    #[token("|")]
    VBar,
    /// The `_` symbol
    ///
    /// Generally used as a placeholder or standin for another type
    #[token("_")]
    Placeholder,
    /// The `:` symbol
    ///
    /// Used as a separator in various parts of the language
    #[token(":")]
    Colon,
    /// Any infix binop (binary operator)
    ///
    /// E.g. +, -, >, /, %, etc.
    ///
    /// These are operators that take two operands, one on the left, and one on the right,
    /// and produce a single value. Most are a single character; the logical operators
    /// (`&&`, `||`) and the (in)equality operators (`==`, `!=`) are two characters long.
    #[token("&&", |_| InfixOp::LAnd)]
    #[token("||", |_| InfixOp::LOr)]
    #[token("==", |_| InfixOp::Eq)]
    #[token("!=", |_| InfixOp::NEq)]
    #[token("*", |_| InfixOp::Mult)]
    #[token("%", |_| InfixOp::Mod)]
    #[token("/", |_| InfixOp::Div)]
    #[token("+", |_| InfixOp::Add)]
    #[token("-", |_| InfixOp::Sub)]
    #[token("<", |_| InfixOp::Less)]
    #[token(">", |_| InfixOp::Greater)]
    Infix(InfixOp),
    /// Some literal (a constant value represented textually)
    ///
    /// For example, 100 is an integer literal and "hewwo" is a string literal.
    ///
    /// A string literal runs from an opening `"` to the first `"` that is not preceded
    /// by a backslash, and may not span lines. An integer literal whose callback returns
    /// [`None`] (see [`Literal::from_int_match`]) does not produce this variant.
    // The previous pattern used a greedy `.+`, so two strings on one line (`"a" + "b"`)
    // were swallowed as a single literal and the empty string `""` did not match at all.
    #[regex(r#""(?:[^"\\\n]|\\[^\n])*""#, |lex| Literal::from_string_match(lex.slice()))]
    #[regex(r"\d+", |lex| Literal::from_int_match(lex.slice()))]
    Literal(Literal),
    /// Some symbol, usually a variable or a type
    ///
    /// A letter followed by any number of letters and digits. The explicit priority of
    /// `0` is presumably there so that keywords win over this rule.
    #[regex(r"[a-zA-Z][a-zA-Z\d]*", |lex| lex.slice().to_string(), priority = 0)]
    Symbol(String),
    /// An opening `[` square bracket
    ///
    /// Usually used in arrays and domain restrictions
    #[token("[")]
    OpenSquareBracket,
    /// A closing `]` square bracket
    ///
    /// Usually used in arrays and domain restrictions, and the counterpart to the opening
    /// square bracket.
    ///
    /// In amo, the opening and closing square brackets are both lesbians, and they're
    /// dating. The closing square bracket is transgender, also.
    #[token("]")]
    CloseSquareBracket,
    /// An opening `(` paren
    ///
    /// Usually used to make explicit the order of operations
    #[token("(")]
    OpenParen,
    /// A closing `)` paren
    ///
    /// Usually used to make explicit the order of operations, this is the counterpart to
    /// the open parenthesis.
    #[token(")")]
    CloseParen,
    /// A `..` range operator
    ///
    /// Used to denote, well, a range between the values on the left and the right.
    #[token("..")]
    RangeOp,
    /// A newline NOT followed by whitespace
    ///
    /// This means that the following tokens are at the start of a line. For example, a
    /// line that starts in the first column
    ///
    /// ```text
    /// variable = value
    /// ```
    ///
    /// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`,
    /// whereas an indented line
    ///
    /// ```text
    ///     variable = value
    /// ```
    ///
    /// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`. This makes it easy
    /// to identify declarations.
    ///
    /// The pattern itself accepts any run of whitespace that ends in a newline; a newline
    /// followed by spaces or tabs is meant to be consumed by the indentation skip rule on
    /// [`Token::Error`] instead.
    #[regex(r"\s*\n")]
    DeclarationStart,
}
/// A specific infix operator
///
/// Used to specify the [`Token::Infix`] variant.
///
/// This is a plain fieldless enum, so it is cheap to copy and can be used as a key in
/// hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum InfixOp {
    /// The logical AND operator
    ///
    /// Takes two boolean values and returns true iff both values are true. Otherwise,
    /// returns false.
    LAnd,
    /// The logical OR operator
    ///
    /// Takes two boolean values and returns true if either is true
    LOr,
    /// The multiplicative operator.
    ///
    /// Takes two numeric values and returns their product
    Mult,
    /// The modulo operator.
    ///
    /// Takes two numeric values and returns the remainder of their division
    Mod,
    /// The division operator.
    ///
    /// Takes two numeric values and returns their quotient
    Div,
    /// The additive operator.
    ///
    /// Takes two numeric values and returns their sum
    Add,
    /// The subtractive operator.
    ///
    /// Takes two numeric values and returns their difference
    Sub,
    /// The equality operator.
    ///
    /// Takes two values and returns true iff they are equal
    Eq,
    /// The inequality operator.
    ///
    /// Takes two values and returns true iff they are NOT equal
    NEq,
    /// The less-than operator.
    ///
    /// Takes two numeric values and returns true iff the first is LESS than the second
    Less,
    /// The greater-than operator.
    ///
    /// Takes two numeric values and returns true iff the first is GREATER than the second
    Greater,
}
/// A specific type of literal, used for the [`Token::Literal`] token
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Literal {
    /// A string literal
    ///
    /// The internal [`String`] is the content of the string, without the surrounding
    /// quotes and with escape sequences already processed.
    String(String),
    /// An integer literal
    Int(u64),
}

impl Literal {
    /// Create a string literal by parsing a matched regex
    ///
    /// The input should be in the form of `"<content>"` WITH THE QUOTES ("). The quotes
    /// will then be trimmed, and character escape sequences will be substituted (see
    /// [`Literal::unescape`] for the recognised sequences).
    ///
    /// If the input is not wrapped in a pair of quotes, a warning is printed to stderr
    /// and an empty string literal is returned; this never panics.
    pub fn from_string_match(s: &str) -> Self {
        // `strip_prefix`/`strip_suffix` work on whole characters, so malformed input
        // starting or ending in a multi-byte character cannot cause a slicing panic.
        let content = s
            .strip_prefix('"')
            .and_then(|rest| rest.strip_suffix('"'));

        match content {
            Some(raw) => Self::String(Self::unescape(raw)),
            None => {
                // This should be unreachable, but this is kept just in case.
                eprintln!("[WARN] ---[Ruh roh!]-------------------------------------");
                eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!");
                eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect.");
                eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer.");
                eprintln!("[WARN] --------------------------------------------------");
                Self::String(String::new())
            }
        }
    }

    /// Create an integer literal by parsing a regex match
    ///
    /// The input should be in the form of a series of ASCII digits 0-9 of any length.
    /// Any parse errors (for example a value too large for a [`u64`]) will result in
    /// [`None`] being returned instead. These indicate a problem with the user's code,
    /// and should be reported.
    pub fn from_int_match(s: &str) -> Option<Self> {
        s.parse().ok().map(Self::Int)
    }

    /// Substitute backslash escape sequences in the body of a string literal
    ///
    /// Recognises `\"`, `\\`, `\n`, `\t` and `\r`. Any other escape, as well as a lone
    /// trailing backslash, is copied through verbatim so that no input is silently lost.
    fn unescape(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        let mut chars = raw.chars();
        while let Some(c) = chars.next() {
            if c != '\\' {
                out.push(c);
                continue;
            }
            match chars.next() {
                Some('"') => out.push('"'),
                Some('\\') => out.push('\\'),
                Some('n') => out.push('\n'),
                Some('t') => out.push('\t'),
                Some('r') => out.push('\r'),
                Some(other) => {
                    // Unknown escape: keep it exactly as written.
                    out.push('\\');
                    out.push(other);
                }
                None => out.push('\\'),
            }
        }
        out
    }
}