static LOWEST_PRECEDENCE: usize = 0;

#[derive(Debug, Copy, Clone)]
enum Token<'a> {
    IntegerLiteral(&'a str),
    Plus,
    Star,
    Minus,
    Semicolon,
    LeftParen,
    RightParen,
}

impl Token<'_> {
    // Returns the length of the token in bytes, which is used for advancing
    // the cursor in the lexer.
    pub fn len(&self) -> usize {
        return match self {
            Token::IntegerLiteral(s) => s.len(),
            // Token::Plus, Token::Minus, Token::Semicolon, Token::LeftParen,
            // Token::RightParen, Token::Star
            _ => 1,
        };
    }

    pub fn precedence(&self) -> usize {
        return match self {
            Token::Plus | Token::Minus => 1,
            Token::Star => 2,
            _ => LOWEST_PRECEDENCE,
        };
    }
}

struct Source<'a> {
    source: &'a str,
    cursor: usize,
    last: Option<Token<'a>>,
}

impl<'a> Source<'a> {
    pub fn new(source: &'a str) -> Self {
        return Self {
            source,
            cursor: 0,
            last: None,
        };
    }

    fn skip_whitespace(&mut self) -> bool {
        let mut chars = self.source[self.cursor..].chars();
        let mut skipped = false;
        while let Some(c) = chars.next() {
            if c.is_whitespace() {
                self.cursor += c.len_utf8();
                skipped = true;
            } else {
                return skipped;
            }
        };
        return skipped;
    }

    fn skip_comments(&mut self) -> bool {
        let mut chars = self.source[self.cursor..].chars();
        if let Some('/') = chars.next() {
            if let Some('/') = chars.next() {
                self.cursor += 2;
                while let Some(c) = chars.next() {
                    if c == '\n' {
                        self.cursor += 1;
                        return true;
                    };
                    self.cursor += c.len_utf8();
                };
            };
        };
        return false;
    }

    fn get_next(&mut self) -> Option<Token<'a>> {
        // Skip all possible comments and whitespace.
        while self.skip_comments() || self.skip_whitespace() {};

        let mut chars = self.source[self.cursor..].chars().peekable();
        let token = match chars.next()? {
            '+' => Token::Plus,
            '-' => Token::Minus,
            '*' => Token::Star,
            ';' => Token::Semicolon,
            '(' => Token::LeftParen,
            ')' => Token::RightParen,
            c if c.is_ascii_digit() => {
                let start = self.cursor;
                let mut length = c.len_utf8();
                loop {
                    match chars.next() {
                        // Keep consuming digits; stop at the first non-digit
                        // character or at the end of the source.
                        Some(c) if c.is_ascii_digit() => length += c.len_utf8(),
                        _ => break,
                    };
                };
                Token::IntegerLiteral(&self.source[start..start + length])
            },
            c => todo!("invalid character `{:?}`", c),
        };
        return Some(token);
    }

    pub fn next(&mut self) -> Option<Token<'a>> {
        let token = match self.last {
            Some(t) => {
                self.last = None;
                t
            },
            None => self.get_next()?,
        };
        self.cursor += token.len();
        return Some(token);
    }

    pub fn peek(&mut self) -> Option<Token<'a>> {
        // We unwrap and then wrap it again as it makes more semantic sense,
        // since the Option that get_next returns is not connected to what peek
        // returns. In the future we might want to add more sophisticated error
        // handling to that function, and then it will be easier to refactor.
        // Moreover, we avoid exposing the lexer's internal state to the user.
        self.last = Some(self.get_next()?);
        return self.last;
    }
}

// Represents a dynamic parsing process; will get converted to an ast::Tree
// after it completes.
struct Parser<'a> {
    source: &'a mut Source<'a>,
}

#[derive(Debug)]
enum Expr<'a> {
    Literal(&'a str),
    // Paren(Box<Expr<'a>>),
    Unary(Token<'a>, Box<Expr<'a>>),
    Binary(Token<'a>, Box<Expr<'a>>, Box<Expr<'a>>),
}

// TODO: Add an expect method, which checks whether two tokens are of the same
// kind, without checking for the equality of their (possible) inner values.
// The issue is whether it should advance the parser (the more obvious way), or
// make it up to the caller to do it (less obvious, but maybe useful?). The
// other issue is how the parser should behave on a failed call to expect:
// should it panic, or maybe we have to introduce more sophisticated error
// handling? And then we also would want to use self.peek()? inside its
// implementation; is it even possible?
// So many questions. >_<
impl<'a> Parser<'a> {
    pub fn new(source: &'a mut Source<'a>) -> Self {
        return Self { source };
    }

    #[must_use = "You should always use the return value of the next method, as it mutates the parser."]
    #[inline(always)]
    fn next(&mut self) -> Option<Token<'a>> {
        return self.source.next();
    }

    #[inline(always)]
    fn peek(&mut self) -> Option<Token<'a>> {
        return self.source.peek();
    }

    // A wrapper around the next function, which WILL panic if there is no
    // token left in the source. The intent is to ONLY use it when it is KNOWN
    // that we can safely take a token without causing a panic, such as right
    // after we have peeked a token.
    #[inline(always)]
    fn bump(&mut self) -> Token<'a> {
        return self.next().unwrap();
    }

    fn parse_primary_expr(&mut self) -> Option<Expr<'a>> {
        return match self.next()? {
            Token::IntegerLiteral(s) => Some(Expr::Literal(s)),
            Token::LeftParen => {
                let expr = self.parse_expr(0)?;
                return match self.next()? {
                    // Should we bother keeping the parentheses information in
                    // the AST?
                    // Token::RightParen => Some(Expr::Paren(Box::new(expr))),
                    Token::RightParen => Some(expr),
                    _ => None,
                };
            },
            _ => None,
        };
    }

    fn parse_unary_expr(&mut self) -> Option<Expr<'a>> {
        return match self.peek()? {
            Token::Plus | Token::Minus => {
                let token = self.bump();
                let expr = self.parse_unary_expr()?;
                Some(Expr::Unary(token, Box::new(expr)))
            },
            _ => self.parse_primary_expr(),
        };
    }

    // expr         = unary_expr { op expr }
    // unary_expr   = ('+' | '-') unary_expr | primary_expr
    // primary_expr = literal | '(' expr ')'
    pub fn parse_expr(&mut self, min_precedence: usize) -> Option<Expr<'a>> {
        let mut lhs = self.parse_unary_expr()?;
        loop {
            let token = self.peek()?;
            let prec = token.precedence();
            if prec <= min_precedence {
                return Some(lhs);
            };
            // NOTE: Don't advance the parser before we make sure that the
            // precedence is correct.
            self.bump();
            let rhs = self.parse_expr(prec)?;
            lhs = Expr::Binary(token, Box::new(lhs), Box::new(rhs));
        };
    }
}

fn main() {
    let inline_source = " // hello, world\n 3;";
    // let inline_source = "3 + 2 - -5;";
    // let inline_source = "3 + +5 * +7;";
    // let inline_source = "3 + 5 * 7;";
    // let inline_source = "(3 + 5) + 7;";
    // let inline_source = "3 + (5 + (7 + 11));";
    let mut source = Source::new(inline_source);
    let mut parser = Parser::new(&mut source);
    eprintln!("{:?}", parser.parse_expr(0));
}
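
// The test module below is an added sketch, not part of the original program:
// it exercises the precedence climbing in parse_expr on one of the inputs
// already listed (commented out) in main, "3 + 5 * 7;". Since Star has higher
// precedence than Plus, the multiplication should end up nested on the
// right-hand side of the addition. The test name and the structural assertions
// are illustrative choices.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn multiplication_binds_tighter_than_addition() {
        let mut source = Source::new("3 + 5 * 7;");
        let mut parser = Parser::new(&mut source);
        let expr = parser.parse_expr(0).expect("expected a parsed expression");
        // Expected shape:
        // Binary(Plus, Literal("3"), Binary(Star, Literal("5"), Literal("7"))).
        match expr {
            Expr::Binary(Token::Plus, lhs, rhs) => {
                assert!(matches!(*lhs, Expr::Literal("3")));
                assert!(matches!(*rhs, Expr::Binary(Token::Star, _, _)));
            },
            other => panic!("unexpected parse result: {:?}", other),
        };
    }
}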