From efbe872f1ec884e3d89fbba5f669f3157b27e10a Mon Sep 17 00:00:00 2001 From: Emi Simpson Date: Thu, 10 Mar 2022 12:56:38 -0500 Subject: [PATCH] Added lexing! --- Cargo.lock | 93 ++++++++++++++++ Cargo.toml | 1 + sample.amo | 6 +- src/main.rs | 18 ++- src/token.rs | 310 +++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 423 insertions(+), 5 deletions(-) create mode 100644 Cargo.lock create mode 100644 src/token.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..bfef976 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,93 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "amo" +version = "0.1.0" +dependencies = [ + "logos", +] + +[[package]] +name = "beef" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bed554bd50246729a1ec158d08aa3235d1b69d94ad120ebe187e28894787e736" + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "logos" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "427e2abca5be13136da9afdbf874e6b34ad9001dd70f2b103b083a85daa7b345" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-derive" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56a7d287fd2ac3f75b11f19a1c8a874a7d55744bd91f7a1b3e7cf87d4343c36d" +dependencies = [ + "beef", + "fnv", + "proc-macro2", + "quote", + "regex-syntax", + "syn", + "utf8-ranges", +] + +[[package]] +name = "proc-macro2" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7342d5883fbccae1cc37a2353b09c87c9b0f3afd73f5fb9bba687a1f733b029" +dependencies = [ + "unicode-xid", +] + +[[package]] +name = "quote" +version = "1.0.15" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "864d3e96a899863136fc6e99f3d7cae289dafe43bf2c5ac19b70df7210c0a145" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "regex-syntax" +version = "0.6.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" + +[[package]] +name = "syn" +version = "1.0.86" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a65b3f4ffa0092e9887669db0eae07941f023991ab58ea44da8fe8e2d511c6b" +dependencies = [ + "proc-macro2", + "quote", + "unicode-xid", +] + +[[package]] +name = "unicode-xid" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" + +[[package]] +name = "utf8-ranges" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ae116fef2b7fea257ed6440d3cfcff7f190865f170cdad00bb6465bf18ecba" diff --git a/Cargo.toml b/Cargo.toml index 2f0c634..c69ccd2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,3 +6,4 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +logos = "0.12.0" diff --git a/sample.amo b/sample.amo index 7fd6e28..7a973d4 100644 --- a/sample.amo +++ b/sample.amo @@ -1,10 +1,10 @@ -union SimpleType +type SimpleType = MyVariant1 | MyVariant2 Integer[u32] | MyVariant3 Integer[1..20] | MyVariant4 Integer String -union Option val +type Option val = Some val | None @@ -19,7 +19,7 @@ trait Functor[a, b] on Self _ needs map : (a -> b), Self a -> Self b pure : a -> Self a -union ComplexType[a, b : Functor] +type ComplexType[a, b : Functor] = Left a | Right b diff --git a/src/main.rs b/src/main.rs index e7a11a9..5cb4ef7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,17 @@ -fn main() { - println!("Hello, world!"); +use std::{fs::File, io::Read}; + 
+use logos::Logos; + +mod token; + +fn main() -> std::io::Result<()> { + let mut input_file = File::open("sample.amo")?; + let mut input = String::with_capacity(4096); + input_file.read_to_string(&mut input)?; + + for tok in token::Token::lexer(&input) { + println!("{tok:?}"); + } + + Ok(()) } diff --git a/src/token.rs b/src/token.rs new file mode 100644 index 0000000..f2abef8 --- /dev/null +++ b/src/token.rs @@ -0,0 +1,310 @@ +//! A series of structures and methods for lexing an input +//! +//! Lexing is the process of taking a stream of characters and converting it into a series +//! of tokens (or lexemes). Lexemes are designed to be the smallest +//! machine-understandable unit of information - for example, a keyword, operator, +//! literal, or symbol. +//! +//! Look through the [`Token`] enum to see a full list of the lexemes used by amo. +//! +//! The main entrypoint to this module is the derived [`Token::lexer`] method, which lexes +//! a provided string. + +use logos::Logos; + +#[derive(Logos, Debug, PartialEq, Eq)] +pub enum Token { + + /// A lexer error happened, woops!
+ /// + /// Also, this catches and ignores any whitespace that might be encountered + #[error] + #[regex(r"[ \t]+", logos::skip)] + #[regex(r"\n[ \t]+", logos::skip)] + Error, + + /// The `type` keyword + /// + /// Used to denote that a new type (called an enum in some languages) is being + /// declared + #[token("type")] + Type, + + /// The `struct` keyword + /// + /// Used to denote the declaration of a kind of struct + #[token("struct")] + Struct, + + /// The `trait` keyword + /// + /// Denotes the declaration of a new trait + #[token("trait")] + Trait, + + /// The `needs` keyword + /// + /// Used as part of a trait declaration to denote methods that will be required for a + /// trait + #[token("needs")] + Needs, + + /// The `if` keyword + /// + /// Used to begin an If-Then-Else statement or an If-Is statement + #[token("if")] + If, + + /// The `is` keyword + /// + /// Used as part of an If-Is statement to indicate the start of the case listings + #[token("is")] + Is, + + /// The `then` keyword + /// + /// Indicates the start of the code block for the positive section of an If-Then-Else + /// statement + #[token("then")] + Then, + + /// The `else` keyword + /// + /// Denotes the end of the positive section of an If-Then-Else block, and the beginning + /// of the negative section + #[token("else")] + Else, + + /// The `impl` keyword + /// + /// Used to denote the start of a trait implementation + #[token("impl")] + Impl, + + /// The `on` keyword + /// + /// Used in trait implementations to separate the trait being implemented and the type + /// it's being implemented on. + #[token("on")] + On, + + /// An `->` arrow + /// + /// Used as part of function type annotations as well as in the cases of If-Is blocks + #[token("->")] + Aro, + + /// An `=` assignment operator + /// + /// Used to separate the left & right hand sides of an assignment operation + #[token("=")] + Assign, + + /// The `|` keyword (or punctuation?
idk what it's called) + /// + /// Used in delineating variants of a type + #[token("|")] + VBar, + + /// The `_` symbol + /// + /// Generally used as a placeholder or stand-in for another type + #[token("_")] + Placeholder, + + /// The `:` symbol + /// + /// Used as a separator in various parts of the language + #[token(":")] + Colon, + + /// Any infix binop (binary operator) + /// + /// E.g. +, -, >, /, %, etc. + /// + /// These are operators that take two operands, one on the left, and one on the right, + /// and produce a single value. Most are one character; `&&`, `||`, `==`, `!=` are two. + #[token("&&", |_| InfixOp::LAnd)] + #[token("||", |_| InfixOp::LOr)] + #[token("==", |_| InfixOp::Eq)] + #[token("!=", |_| InfixOp::NEq)] + #[token("*", |_| InfixOp::Mult)] + #[token("%", |_| InfixOp::Mod)] + #[token("/", |_| InfixOp::Div)] + #[token("+", |_| InfixOp::Add)] + #[token("-", |_| InfixOp::Sub)] + #[token("<", |_| InfixOp::Less)] + #[token(">", |_| InfixOp::Greater)] + Infix(InfixOp), + + /// Some literal (a constant value represented textually) + /// + /// For example, 100 is an integer literal, "hewwo" is a string literal, and `true` is + /// a boolean literal. + #[regex("\"(?:.+(?:\\\\\")?)+\"", |lex| Literal::from_string_match(lex.slice()))] + #[regex(r"\d+", |lex| Literal::from_int_match(lex.slice()))] + Literal(Literal), + + /// Some symbol, usually a variable or a type + #[regex(r"[a-zA-Z][a-zA-Z\d]*", |lex| lex.slice().to_string(), priority = 0)] + Symbol(String), + + /// An opening `[` square bracket + /// + /// Usually used in arrays and domain restrictions + #[token("[")] + OpenSquareBracket, + + /// A closing `]` square bracket + /// + /// Usually used in arrays and domain restrictions, and the counterpart to the opening + /// square bracket. + /// + /// In amo, the opening and closing square brackets are both lesbians, and they're + /// dating. The closing square bracket is transgender, also.
+ #[token("]")] + CloseSquareBracket, + + /// An opening `(` paren + /// + /// Usually used to make explicit the order of operations + #[token("(")] + OpenParen, + + /// A closing `)` paren + /// + /// Usually used to make explicit the order of operations, this is the counterpart + /// to the open parenthesis. + #[token(")")] + CloseParen, + + /// A `..` range operator + /// + /// Used to denote, well, a range between the values on the left and the right. + #[token("..")] + RangeOp, + + /// A newline NOT followed by whitespace + /// + /// This means that the following tokens are at the start of a line. For example + /// + /// ``` + /// variable = value + /// ``` + /// + /// lexes to `DeclarationStart`, `Symbol(variable)`, `Assign`, `Symbol(value)`, + /// whereas + /// + /// ``` + /// variable = value + /// ``` + /// + /// simply lexes to `Symbol(variable)`, `Assign`, `Symbol(value)`. This makes it easy + /// to identify declarations. + #[regex(r"\s*\n")] + DeclarationStart, +} + +#[derive(Debug, PartialEq, Eq)] +/// A specific infix operator +/// +/// Used to specify the [`Token::Infix`] variant. +pub enum InfixOp { + /// The logical AND operator + /// + /// Takes two boolean values and returns true iff both values are true. Otherwise, + /// returns false. + LAnd, + + /// The logical OR operator + /// + /// Takes two boolean values and returns true if either is true + LOr, + + /// The multiplicative operator. + /// + /// Takes two numeric values and returns their product + Mult, + + /// The modulo operator. + /// + /// Takes two numeric values and returns the remainder of their division + Mod, + + /// The division operator. + /// + /// Takes two numeric values and returns their quotient + Div, + + /// The additive operator. + /// + /// Takes two numeric values and returns their sum + Add, + + /// The subtractive operator. + /// + /// Takes two numeric values and returns their difference + Sub, + + /// The equality operator.
+ /// + /// Takes two values and returns true iff they are equal + Eq, + + /// The inequality operator. + /// + /// Takes two values and returns true iff they are NOT equal + NEq, + + /// The less-than operator. + /// + /// Takes two numeric values and returns true iff the first is LESS than the second + Less, + + /// The greater-than operator. + /// + /// Takes two numeric values and returns true iff the first is GREATER than the second + Greater, +} + +#[derive(Debug, PartialEq, Eq)] +/// A specific type of literal, used for the [`Token::Literal`] token +pub enum Literal { + /// A string literal + /// + /// The internal [`String`] is the content of the string, with escape characters + /// already processed. + String(String), + + /// An integer literal + Int(u64) +} + +impl Literal { + /// Create a string literal by parsing a matched regex + /// + /// The input should be in the form of `""` WITH THE QUOTES ("). The quotes + /// will then be trimmed, and character escape sequences will be substituted. + pub fn from_string_match(s: &str) -> Self { + if s.len() < 2 { + // This should be unreachable, but this is kept just in case. + eprintln!("[WARN] ---[Ruh roh!]-------------------------------------"); + eprintln!("[WARN] Unreachable executed in token::Literal::from_string_match!"); + eprintln!("[WARN] This sugguests that the regex or callback for the string literal token is incorrect."); + eprintln!("[WARN] Attempting to proceed anyway, but this indicates a serious problem with the lexer."); + eprintln!("[WARN] --------------------------------------------------"); + Self::String(String::new()) + } else { + Self::String(s[1..s.len()-1].to_string()) + } + } + + /// Create an integer literal by parsing a regex match + /// + /// The input should be in the form of a series of ASCII digits 0-9 of any length. + /// Any parse errors will result in [`None`] being returned instead. These indicate a + /// problem with the user's code, and should be reported. 
+ pub fn from_int_match(s: &str) -> Option { + s.parse().ok().map(Self::Int) + } +}