//! A series of structures and methods for lexing an input
//!
//! Lexing is the process of taking a stream of characters and converting it into a series
//! of tokens (or lexemes). Lexemes are designed to be the smallest
//! machine-understandable unit of information - for example, a keyword, operator,
//! literal, or symbol.
//!
//! Look through the [`Token`] enum to see a full list of the lexemes used by amo.
//!
//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes
//! a provided string.
|
|
|
|
use logos::Logos;
|
|
|
|
#[derive(Logos, Debug, PartialEq, Eq)]
|
|
pub enum Token {
|
|
|
|
/// A lexer error happened, woops!
|
|
///
|
|
/// Also, this catches and ignores any whitespace that might be encountered
|
|
#[error]
|
|
#[regex(r"[ \t]+", logos::skip)]
|
|
#[regex(r"\n[ \t]+", logos::skip)]
|
|
#[regex(r"//.+", logos::skip)]
|
|
#[regex(r"/\*([^*]*(\*[^/])?)+\*/", logos::skip)]
|
|
Error, //d00
|
|
|
|
/// The `type` keyword
|
|
///
|
|
/// Used to denote that a new type (called an enum in some languages) is being
|
|
/// declared
|
|
#[token("type")]
|
|
Type, //d01
|
|
|
|
/// The `struct` keyword
|
|
///
|
|
/// Used to denote the declaration of a kind of struct
|
|
#[token("struct")]
|
|
Struct, //d02
|
|
|
|
/// The `trait` keywor, Eqd
|
|
///
|
|
/// Denotes the declaration of a new trait
|
|
#[token("trait")]
|
|
Trait, //d03
|
|
|
|
/// The `needs` keyword
|
|
///
|
|
/// Used as part of a trait declaration to denote methods that will be required for a
|
|
/// trait
|
|
#[token("needs")]
|
|
Needs, //d04
|
|
|
|
/// The `if` keyword
|
|
///
|
|
/// Used to begin an If-Then-Else statement or an If-Is statement
|
|
#[token("if")]
|
|
If, //d05
|
|
|
|
/// The `is` keyword
|
|
///
|
|
/// Used as part of an If-Is statement to indictate the start of the case listings
|
|
#[token("is")]
|
|
Is, //d06
|
|
|
|
/// The `then` keyword
|
|
///
|
|
/// Indicates the start of the code block for the positive section of an If-Then-Else
|
|
/// statement
|
|
#[token("then")]
|
|
Then, //d07
|
|
|
|
/// The `else` keyword
|
|
///
|
|
/// Denotes the end of the positive section of an If-Then-Else block, and the begining
|
|
/// of the negative section
|
|
#[token("else")]
|
|
Else, //d08
|
|
|
|
/// the `impl` keyword
|
|
///
|
|
/// Used to denote the start of a trait implementation
|
|
#[token("impl")]
|
|
Impl, //d09
|
|
|
|
/// the `on` keyword
|
|
///
|
|
/// Used in trait implementationsto seperate the trait being implemented and the type
|
|
/// it's being implemented on.
|
|
#[token("on")]
|
|
On, //d10
|
|
|
|
/// the `let` keyword
|
|
///
|
|
/// Allows binding a value to an immutable variable that can be used multiple times
|
|
#[token("let")]
|
|
Let, //d11
|
|
|
|
/// the `in` keyword
|
|
///
|
|
/// Used to seperate a series of `let` bindings from the expression they're being used
|
|
/// in.
|
|
#[token("in")]
|
|
In, //d12
|
|
|
|
/// An `=>` arrow
|
|
///
|
|
/// Used as part of function type annotations as well as in the cases of If-Is blocks
|
|
#[token("=>")]
|
|
DubAro, //d13
|
|
|
|
/// An `=` assignment operator
|
|
///
|
|
/// Used to seperate the left & right hand signs of an assignment operation
|
|
#[token("=")]
|
|
Assign, //d14
|
|
|
|
/// Type Operator
|
|
#[token("type", priority = 9)]
|
|
TypeOp, //d15
|
|
|
|
/// The `_` symbol
|
|
///
|
|
/// Generally used as a placeholder or standin for another type
|
|
#[token("_")]
|
|
Placeholder, //d16
|
|
|
|
/// The `:` symbol
|
|
///
|
|
/// Used as a seperator in various parts of the language
|
|
#[token(":")]
|
|
Colon, //d17
|
|
|
|
/// A rank 1 (applied last) infix binop (binary operator)
|
|
///
|
|
/// i.e. Logical Or
|
|
#[token("||", |_| InfixRank1::LOr)]
|
|
#[token("|", |_| InfixRank1::VBar)]
|
|
R1Infix(InfixRank1), //d18
|
|
|
|
/// A rank 2 infix binop (binary operator)
|
|
///
|
|
/// i.e. Logical And
|
|
#[token("&&", |_| InfixRank2::LAnd)]
|
|
#[token("->", |_| InfixRank2::Aro)]
|
|
#[token(",", |_| InfixRank2::Aro)]
|
|
R2Infix(InfixRank2), //d19
|
|
|
|
/// A rank 3 infix binop (binary operator)
|
|
///
|
|
/// i.e. Comparison operators like == and <
|
|
#[token("==", |_| InfixRank3::Eq)]
|
|
#[token("!=", |_| InfixRank3::NEq)]
|
|
#[token("<", |_| InfixRank3::LessThan)]
|
|
#[token(">", |_| InfixRank3::GreaterThan)]
|
|
R3Infix(InfixRank3), //d20
|
|
|
|
/// A rank 5 infix binop (binary operator)
|
|
///
|
|
/// i.e. Range
|
|
#[token("..", |_| InfixRank4::Range)]
|
|
R4Infix(InfixRank4), //d21
|
|
|
|
/// A rank 6 infix binop (binary operator)
|
|
///
|
|
/// i.e. Addition & Subtraction
|
|
#[token("+", |_| InfixRank5::Add)]
|
|
#[token("-", |_| InfixRank5::Sub)]
|
|
R5Infix(InfixRank5), //d22
|
|
|
|
/// A rank 7 (applied first) infix binop (binary operator)
|
|
///
|
|
/// i.e. Multiplication, Division, and Modulo
|
|
#[token("*", |_| InfixRank6::Mul)]
|
|
#[token("/", |_| InfixRank6::Div)]
|
|
#[token("%", |_| InfixRank6::Mod)]
|
|
R6Infix(InfixRank6), //d23
|
|
|
|
/// Some literal (a constant value represented textually)
|
|
///
|
|
/// For example, 100 is an integer literal, "hewwo" is a string literal, and `true` is
|
|
/// a boolean literal.
|
|
#[regex("\"(?:[^\"]*(?:\\\\\")?)+\"", |lex| Literal::from_string_match(lex.slice()))]
|
|
#[regex(r"\d+", |lex| Literal::from_int_match(lex.slice()))]
|
|
Literal(Literal), //d24
|
|
|
|
/// Some symbol, usually a variable or a type
|
|
#[regex(r"[a-zA-Z_][a-zA-Z\d_]*", |lex| lex.slice().to_string(), priority = 0)]
|
|
Symbol(String), //d25
|
|
|
|
/// An opening `[` square bracket
|
|
///
|
|
/// Usually used in arrays and domain restrictions
|
|
#[token("[")]
|
|
OpenSquareBracket, //d26
|
|
|
|
/// A closing `]` square bracket
|
|
///
|
|
/// Usually used in arrays and domain restrictions, and the counterpart to the opening
|
|
/// square bracket.
|
|
///
|
|
/// In amo, the opening and closing square brackets are both lesbians, and they're
|
|
/// dating. The closing square bracket is transgender, also.
|
|
#[token("]")]
|
|
CloseSquareBracket, //d27
|
|
|
|
/// An opening `(` paren
|
|
///
|
|
/// Usually used to make explicit the order of operations
|
|
#[token("(")]
|
|
OpenParen, //d28
|
|
|
|
/// A closing `)` paren
|
|
///
|
|
/// Usually used in arrays and domain restrictions, this is the counterpart to the
|
|
/// open parenthesis.
|
|
#[token(")")]
|
|
CloseParen, //d29
|
|
|
|
/// A `.` period
|
|
///
|
|
/// For getting fields of structs
|
|
#[token(".")]
|
|
Dot, //d30
|
|
|
|
/// A `,` comma
|
|
///
|
|
/// The age-old and timeless delineator
|
|
#[token(";")]
|
|
Comma, //d31
|
|
|
|
/// A newline NOT followed by whitespace
|
|
///
|
|
/// This means that the following tokens are at the start of a line. For example
|
|
///
|
|
/// ```
|
|
/// variable = value
|
|
/// ```
|
|
///
|
|
/// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`,
|
|
/// whereas
|
|
///
|
|
/// ```
|
|
/// variable = value
|
|
/// ```
|
|
///
|
|
/// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`. This makes it easy
|
|
/// to identify declarations.
|
|
#[regex(r"(\s*\n)+")]
|
|
DeclarationStart, //d32
|
|
|
|
/// Denotes that the parser has reached the end of the input
|
|
///
|
|
/// This is always the last token in a stream, both in that it must be present in all
|
|
/// streams, and in that it will never be followed by any tokens.
|
|
EOF, //d33
|
|
}
|
|
|
|
/// The rank 1 infix operators, carried by [`Token::R1Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank1 {
    /// The logical OR operator (`||`)
    ///
    /// Takes two boolean values and returns true if either is true
    LOr,

    /// The VBar operator (`|`)
    ///
    /// Takes two variant sets and returns their sum
    VBar,
}
|
|
|
|
/// The rank 2 infix operators, carried by [`Token::R2Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank2 {
    /// The logical AND operator (`&&`)
    ///
    /// Takes two boolean values and returns true iff both values are true. Otherwise,
    /// returns false.
    LAnd,

    /// The Aro operator
    ///
    /// Produced by the lexer for both `->` and `,`.
    Aro,
}
|
|
|
|
/// The rank 3 infix operators (comparisons), carried by [`Token::R3Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank3 {
    /// The equality operator (`==`).
    ///
    /// Takes two values and returns true iff they are equal
    Eq,

    /// The inequality operator (`!=`).
    ///
    /// Takes two values and returns true iff they are NOT equal
    NEq,

    /// The less-than operator (`<`).
    ///
    /// Takes two numeric values and returns true iff the first is LESS than the second
    LessThan,

    /// The greater-than operator (`>`).
    ///
    /// Takes two numeric values and returns true iff the first is GREATER than the second
    GreaterThan,
}
|
|
|
|
/// The rank 4 infix operators, carried by [`Token::R4Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank4 {
    /// The range operator (`..`).
    ///
    /// Takes two numeric values and returns a range from the first to the second
    Range,
}
|
|
|
|
/// The rank 5 infix operators, carried by [`Token::R5Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank5 {
    /// The additive operator (`+`).
    ///
    /// Takes two numeric values and returns their sum
    Add,

    /// The subtractive operator (`-`).
    ///
    /// Takes two numeric values and returns their difference
    Sub,
}
|
|
|
|
/// The rank 6 infix operators, carried by [`Token::R6Infix`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InfixRank6 {
    /// The multiplicative operator (`*`).
    ///
    /// Takes two numeric values and returns their product
    Mul,

    /// The modulo operator (`%`).
    ///
    /// Takes two numeric values and returns the remainder of their division
    Mod,

    /// The division operator (`/`).
    ///
    /// Takes two numeric values and returns their quotient
    Div,
}
|
|
|
|
/// A specific type of literal, used for the [`Token::Literal`] token
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Literal {
    /// A string literal
    ///
    /// The internal [`String`] is the text found between the surrounding quotes.
    ///
    /// NOTE(review): [`Literal::from_string_match`] copies that text verbatim, so escape
    /// sequences such as `\"` are not substituted yet.
    String(String),

    /// An integer literal
    Int(u64),
}

impl Literal {
    /// Create a string literal from the text matched by the string-literal regex
    ///
    /// The input should be in the form of `"<content>"` WITH THE QUOTES ("). The first
    /// and the last byte are dropped, without checking that they really are quotes, and
    /// everything in between is copied as-is.
    ///
    /// Input that is too short to hold both quotes, or whose first or last character is
    /// wider than one byte, cannot have come from a well-formed match. It is reported on
    /// stderr and turned into an empty string literal rather than causing a panic.
    pub fn from_string_match(s: &str) -> Self {
        // `checked_sub` covers the empty input, while `get` covers both a lone quote
        // (start > end) and indices that fall inside a multi-byte character.
        let content = s.len().checked_sub(1).and_then(|end| s.get(1..end));

        match content {
            Some(content) => Self::String(content.to_string()),
            None => {
                // This should be unreachable, but this is kept just in case.
                eprintln!("[WARN] ---[Ruh roh!]-------------------------------------");
                eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!");
                eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect.");
                eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer.");
                eprintln!("[WARN] --------------------------------------------------");
                Self::String(String::new())
            }
        }
    }

    /// Create an integer literal by parsing a regex match
    ///
    /// The input should be in the form of a series of ASCII digits 0-9 of any length.
    /// Anything that does not parse as a [`u64`] -- including a number too large to fit
    /// in one -- results in [`None`] being returned instead. These indicate a problem
    /// with the user's code, and should be reported.
    pub fn from_int_match(s: &str) -> Option<Self> {
        s.parse::<u64>().ok().map(Self::Int)
    }
}
|