Added lexing!
This commit is contained in:
parent
2d6f5511b1
commit
efbe872f1e
93
Cargo.lock
generated
Normal file
93
Cargo.lock
generated
Normal file
|
@ -0,0 +1,93 @@
|
||||||
|
# This file is automatically @generated by Cargo.
|
||||||
|
# It is not intended for manual editing.
|
||||||
|
version = 3
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "amo"
|
||||||
|
version = "0.1.0"
|
||||||
|
dependencies = [
|
||||||
|
"logos",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "beef"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "bed554bd50246729a1ec158d08aa3235d1b69d94ad120ebe187e28894787e736"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "fnv"
|
||||||
|
version = "1.0.7"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "logos"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "427e2abca5be13136da9afdbf874e6b34ad9001dd70f2b103b083a85daa7b345"
|
||||||
|
dependencies = [
|
||||||
|
"logos-derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "logos-derive"
|
||||||
|
version = "0.12.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "56a7d287fd2ac3f75b11f19a1c8a874a7d55744bd91f7a1b3e7cf87d4343c36d"
|
||||||
|
dependencies = [
|
||||||
|
"beef",
|
||||||
|
"fnv",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"regex-syntax",
|
||||||
|
"syn",
|
||||||
|
"utf8-ranges",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro2"
|
||||||
|
version = "1.0.36"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "quote"
|
||||||
|
version = "1.0.15"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "regex-syntax"
|
||||||
|
version = "0.6.25"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "syn"
|
||||||
|
version = "1.0.86"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"unicode-xid",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-xid"
|
||||||
|
version = "0.2.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "utf8-ranges"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba"
|
|
@ -6,3 +6,4 @@ edition = "2021"
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
logos = "0.12.0"
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
union SimpleType
|
type SimpleType
|
||||||
= MyVariant1
|
= MyVariant1
|
||||||
| MyVariant2 Integer[u32]
|
| MyVariant2 Integer[u32]
|
||||||
| MyVariant3 Integer[1..20]
|
| MyVariant3 Integer[1..20]
|
||||||
| MyVariant4 Integer String
|
| MyVariant4 Integer String
|
||||||
|
|
||||||
union Option val
|
type Option val
|
||||||
= Some val
|
= Some val
|
||||||
| None
|
| None
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ trait Functor[a, b] on Self _ needs
|
||||||
map : (a -> b), Self a -> Self b
|
map : (a -> b), Self a -> Self b
|
||||||
pure : a -> Self a
|
pure : a -> Self a
|
||||||
|
|
||||||
union ComplexType[a, b : Functor]
|
type ComplexType[a, b : Functor]
|
||||||
= Left a
|
= Left a
|
||||||
| Right b
|
| Right b
|
||||||
|
|
||||||
|
|
18
src/main.rs
18
src/main.rs
|
@ -1,3 +1,17 @@
|
||||||
fn main() {
|
use std::{fs::File, io::Read};
|
||||||
println!("Hello, world!");
|
|
||||||
|
use logos::Logos;
|
||||||
|
|
||||||
|
mod token;
|
||||||
|
|
||||||
|
fn main() -> std::io::Result<()> {
|
||||||
|
let mut input_file = File::open("sample.amo")?;
|
||||||
|
let mut input = String::with_capacity(4096);
|
||||||
|
input_file.read_to_string(&mut input)?;
|
||||||
|
|
||||||
|
for tok in token::Token::lexer(&input) {
|
||||||
|
println!("{tok:?}");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
310
src/token.rs
Normal file
310
src/token.rs
Normal file
|
@ -0,0 +1,310 @@
|
||||||
|
//! A series of structures and methods for lexing an input
|
||||||
|
//!
|
||||||
|
//! Lexing is the process of taking a stream of characters and converting it into a series
|
||||||
|
//! of tokens (or lexemes). Lexemes are designed to be the smallest
|
||||||
|
//! machine-understandable unit of information - for example, a keyword, operator,
|
||||||
|
//! literal, or symbol.
|
||||||
|
//!
|
||||||
|
//! Look through the [`Token`] enum to see a full list of the lexemes used by amo.
|
||||||
|
//!
|
||||||
|
//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes
|
||||||
|
//! a provided string.
|
||||||
|
|
||||||
|
use logos::Logos;
|
||||||
|
|
||||||
|
#[derive(Logos, Debug, PartialEq, Eq)]
|
||||||
|
pub enum Token {
|
||||||
|
|
||||||
|
/// A lexer error happened, woops!
|
||||||
|
///
|
||||||
|
/// Also, this catches and ignores any whitespace that might be encountered
|
||||||
|
#[error]
|
||||||
|
#[regex(r"[ \t]+", logos::skip)]
|
||||||
|
#[regex(r"\n[ \t]+", logos::skip)]
|
||||||
|
Error,
|
||||||
|
|
||||||
|
/// The `type` keyword
|
||||||
|
///
|
||||||
|
/// Used to denote that a new type (called an enum in some languages) is being
|
||||||
|
/// declared
|
||||||
|
#[token("type")]
|
||||||
|
Type,
|
||||||
|
|
||||||
|
/// The `struct` keyword
|
||||||
|
///
|
||||||
|
/// Used to denote the declaration of a kind of struct
|
||||||
|
#[token("struct")]
|
||||||
|
Struct,
|
||||||
|
|
||||||
|
/// The `trait` keywor, Eqd
|
||||||
|
///
|
||||||
|
/// Denotes the declaration of a new trait
|
||||||
|
#[token("trait")]
|
||||||
|
Trait,
|
||||||
|
|
||||||
|
/// The `needs` keyword
|
||||||
|
///
|
||||||
|
/// Used as part of a trait declaration to denote methods that will be required for a
|
||||||
|
/// trait
|
||||||
|
#[token("needs")]
|
||||||
|
Needs,
|
||||||
|
|
||||||
|
/// The `if` keyword
|
||||||
|
///
|
||||||
|
/// Used to begin an If-Then-Else statement or an If-Is statement
|
||||||
|
#[token("if")]
|
||||||
|
If,
|
||||||
|
|
||||||
|
/// The `is` keyword
|
||||||
|
///
|
||||||
|
/// Used as part of an If-Is statement to indictate the start of the case listings
|
||||||
|
#[token("is")]
|
||||||
|
Is,
|
||||||
|
|
||||||
|
/// The `then` keyword
|
||||||
|
///
|
||||||
|
/// Indicates the start of the code block for the positive section of an If-Then-Else
|
||||||
|
/// statement
|
||||||
|
#[token("then")]
|
||||||
|
Then,
|
||||||
|
|
||||||
|
/// The `else` keyword
|
||||||
|
///
|
||||||
|
/// Denotes the end of the positive section of an If-Then-Else block, and the begining
|
||||||
|
/// of the negative section
|
||||||
|
#[token("else")]
|
||||||
|
Else,
|
||||||
|
|
||||||
|
/// the `impl` keyword
|
||||||
|
///
|
||||||
|
/// Used to denote the start of a trait implementation
|
||||||
|
#[token("impl")]
|
||||||
|
Impl,
|
||||||
|
|
||||||
|
/// the `on` keyword
|
||||||
|
///
|
||||||
|
/// Used in trait implementationsto seperate the trait being implemented and the type
|
||||||
|
/// it's being implemented on.
|
||||||
|
#[token("on")]
|
||||||
|
On,
|
||||||
|
|
||||||
|
/// An `->` arrow
|
||||||
|
///
|
||||||
|
/// Used as part of function type annotations as well as in the cases of If-Is blocks
|
||||||
|
#[token("->")]
|
||||||
|
Aro,
|
||||||
|
|
||||||
|
/// An `=` assignment operator
|
||||||
|
///
|
||||||
|
/// Used to seperate the left & right hand signs of an assignment operation
|
||||||
|
#[token("=")]
|
||||||
|
Assign,
|
||||||
|
|
||||||
|
/// The `|` keyword (or punctuation? idk what it's called)
|
||||||
|
///
|
||||||
|
/// Used in deliniating variants of a type
|
||||||
|
#[token("|")]
|
||||||
|
VBar,
|
||||||
|
|
||||||
|
/// The `_` symbol
|
||||||
|
///
|
||||||
|
/// Generally used as a placeholder or standin for another type
|
||||||
|
#[token("_")]
|
||||||
|
Placeholder,
|
||||||
|
|
||||||
|
/// The `:` symbol
|
||||||
|
///
|
||||||
|
/// Used as a seperator in various parts of the language
|
||||||
|
#[token(":")]
|
||||||
|
Colon,
|
||||||
|
|
||||||
|
/// Any infix binop (binary operator)
|
||||||
|
///
|
||||||
|
/// E.g. +, -, >, /, %, etc.
|
||||||
|
///
|
||||||
|
/// These are operators that take two operands, one on the left, and one on the right,
|
||||||
|
/// and produce a single value. I don't think there are any two character
|
||||||
|
#[token("&&", |_| InfixOp::LAnd)]
|
||||||
|
#[token("||", |_| InfixOp::LOr)]
|
||||||
|
#[token("==", |_| InfixOp::Eq)]
|
||||||
|
#[token("!=", |_| InfixOp::NEq)]
|
||||||
|
#[token("*", |_| InfixOp::Mult)]
|
||||||
|
#[token("%", |_| InfixOp::Mod)]
|
||||||
|
#[token("/", |_| InfixOp::Div)]
|
||||||
|
#[token("+", |_| InfixOp::Add)]
|
||||||
|
#[token("-", |_| InfixOp::Sub)]
|
||||||
|
#[token("<", |_| InfixOp::Less)]
|
||||||
|
#[token(">", |_| InfixOp::Greater)]
|
||||||
|
Infix(InfixOp),
|
||||||
|
|
||||||
|
/// Some literal (a constant value represented textually)
|
||||||
|
///
|
||||||
|
/// For example, 100 is an integer literal, "hewwo" is a string literal, and `true` is
|
||||||
|
/// a boolean literal.
|
||||||
|
#[regex("\"(?:.+(?:\\\\\")?)+\"", |lex| Literal::from_string_match(lex.slice()))]
|
||||||
|
#[regex(r"\d+", |lex| Literal::from_int_match(lex.slice()))]
|
||||||
|
Literal(Literal),
|
||||||
|
|
||||||
|
/// Some symbol, usually a variable or a type
|
||||||
|
#[regex(r"[a-zA-Z][a-zA-Z\d]*", |lex| lex.slice().to_string(), priority = 0)]
|
||||||
|
Symbol(String),
|
||||||
|
|
||||||
|
/// An opening `[` square bracket
|
||||||
|
///
|
||||||
|
/// Usually used in arrays and domain restrictions
|
||||||
|
#[token("[")]
|
||||||
|
OpenSquareBracket,
|
||||||
|
|
||||||
|
/// A closing `]` square bracket
|
||||||
|
///
|
||||||
|
/// Usually used in arrays and domain restrictions, and the counterpart to the opening
|
||||||
|
/// square bracket.
|
||||||
|
///
|
||||||
|
/// In amo, the opening and closing square brackets are both lesbians, and they're
|
||||||
|
/// dating. The closing square bracket is transgender, also.
|
||||||
|
#[token("]")]
|
||||||
|
CloseSquareBracket,
|
||||||
|
|
||||||
|
/// An opening `(` paren
|
||||||
|
///
|
||||||
|
/// Usually used to make explicit the order of operations
|
||||||
|
#[token("(")]
|
||||||
|
OpenParen,
|
||||||
|
|
||||||
|
/// A closing `)` paren
|
||||||
|
///
|
||||||
|
/// Usually used in arrays and domain restrictions, this is the counterpart to the
|
||||||
|
/// open parenthesis.
|
||||||
|
#[token(")")]
|
||||||
|
CloseParen,
|
||||||
|
|
||||||
|
/// A `..` range operator
|
||||||
|
///
|
||||||
|
/// Used to denote, well, a range between the values on the left and the right.
|
||||||
|
#[token("..")]
|
||||||
|
RangeOp,
|
||||||
|
|
||||||
|
/// A newline NOT followed by whitespace
|
||||||
|
///
|
||||||
|
/// This means that the following tokens are at the start of a line. For example
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// variable = value
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`,
|
||||||
|
/// whereas
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// variable = value
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`. This makes it easy
|
||||||
|
/// to identify declarations.
|
||||||
|
#[regex(r"\s*\n")]
|
||||||
|
DeclarationStart,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// A specific infix operator
///
/// Used to specify the [`Token::Infix`] variant. The enum carries no data, so it is
/// `Copy` and can be passed around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InfixOp {
    /// The logical AND operator
    ///
    /// Takes two boolean values and returns true iff both values are true. Otherwise,
    /// returns false.
    LAnd,

    /// The logical OR operator
    ///
    /// Takes two boolean values and returns true if either is true
    LOr,

    /// The multiplicative operator.
    ///
    /// Takes two numeric values and returns their product
    Mult,

    /// The modulo operator.
    ///
    /// Takes two numeric values and returns the remainder of their division
    Mod,

    /// The division operator.
    ///
    /// Takes two numeric values and returns their quotient
    Div,

    /// The additive operator.
    ///
    /// Takes two numeric values and returns their sum
    Add,

    /// The subtractive operator.
    ///
    /// Takes two numeric values and returns their difference
    Sub,

    /// The equality operator.
    ///
    /// Takes two values and returns true iff they are equal
    Eq,

    /// The inequality operator.
    ///
    /// Takes two values and returns true iff they are NOT equal
    NEq,

    /// The less-than operator.
    ///
    /// Takes two numeric values and returns true iff the first is LESS than the second
    Less,

    /// The greater-than operator.
    ///
    /// Takes two numeric values and returns true iff the first is GREATER than the second
    Greater,
}
|
||||||
|
|
||||||
|
/// A specific type of literal, used for the [`Token::Literal`] token
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Literal {
    /// A string literal
    ///
    /// The internal [`String`] is the content of the string, with escape characters
    /// already processed.
    String(String),

    /// An integer literal
    Int(u64),
}

impl Literal {
    /// Create a string literal by parsing a matched regex
    ///
    /// The input should be in the form of `"<content>"` WITH THE QUOTES ("). The quotes
    /// will then be trimmed, and character escape sequences will be substituted (see
    /// [`Literal::unescape`] for the recognised set).
    ///
    /// If the input is not wrapped in a pair of double quotes, a warning is written to
    /// stderr and an empty string literal is returned.
    pub fn from_string_match(s: &str) -> Self {
        // `strip_prefix`/`strip_suffix` never split a multi-byte character, unlike byte
        // slicing, and they fail cleanly on a lone `"` (no closing quote left to strip).
        let content = s
            .strip_prefix('"')
            .and_then(|rest| rest.strip_suffix('"'));

        match content {
            Some(raw) => Self::String(Self::unescape(raw)),
            None => {
                // This should be unreachable, but this is kept just in case.
                eprintln!("[WARN] ---[Ruh roh!]-------------------------------------");
                eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!");
                eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect.");
                eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer.");
                eprintln!("[WARN] --------------------------------------------------");
                Self::String(String::new())
            }
        }
    }

    /// Replace backslash escape sequences in `raw` with the characters they stand for.
    ///
    /// Recognised escapes are `\"`, `\\`, `\n`, `\t`, `\r` and `\0`. Any other escape,
    /// and a trailing lone backslash, is copied through unchanged.
    // NOTE(review): the escape set is an assumption; confirm it against the amo
    // language design before relying on it.
    fn unescape(raw: &str) -> String {
        let mut out = String::with_capacity(raw.len());
        let mut chars = raw.chars();

        while let Some(c) = chars.next() {
            if c != '\\' {
                out.push(c);
                continue;
            }

            match chars.next() {
                Some('"') => out.push('"'),
                Some('\\') => out.push('\\'),
                Some('n') => out.push('\n'),
                Some('t') => out.push('\t'),
                Some('r') => out.push('\r'),
                Some('0') => out.push('\0'),
                Some(other) => {
                    out.push('\\');
                    out.push(other);
                }
                None => out.push('\\'),
            }
        }

        out
    }

    /// Create an integer literal by parsing a regex match
    ///
    /// The input should be in the form of a series of ASCII digits 0-9 of any length.
    /// Any parse errors (including a value too large for a `u64`) will result in
    /// [`None`] being returned instead. These indicate a problem with the user's code,
    /// and should be reported.
    pub fn from_int_match(s: &str) -> Option<Self> {
        s.parse().ok().map(Self::Int)
    }
}
|
Loading…
Reference in a new issue