// Add simple expression parsing
//
// Origin: patch ba409b9be0634dad2ec0f18008f0902438eac9c0 against `src/main.rs`,
// by Aodhnait Étaín, Sat, 22 May 2021 21:45:42 +0100.
//
// Currently this can only parse addition, and does so in a right-associative way,
// i.e. `1 + 2 + 3` => `1 + (2 + 3)`.
//
// The reason raw pointers are used, in the form of the `OffsetStr` struct, is that the
// author did not see a way to prove to the Rust compiler that the tokens returned by
// `parse_next` are always valid references into the `TokenStream` source string.
// Therefore the borrow checker is bypassed and the invariant is upheld by hand.
//
// NOTE(review): giving `Token` a lifetime and storing `&'a str` would let the compiler
// enforce that invariant; kept as-is here so the existing type names stay unchanged.

fn main() {
    // NOTE(review): the patch context shows only the tail of `main`
    // (`let path = path.unwrap(); eprintln!("compiling `{}`", path);`). The code that
    // produces `path` lies outside the visible hunk and belongs in front of these lines.
    const SOURCE: &str = "23 + 21;";
    let mut tokens = TokenStream::from(SOURCE);
    let expr = parse_expression(&mut tokens);
    eprintln!("{:?}", expr);
}

/// A parsed expression tree.
#[derive(Debug)]
enum Expression {
    /// A single literal token (currently always an integer literal).
    Literal(Token),
    /// A binary operation: the operator token, the left operand and the right operand.
    Binary(Token, Box<Expression>, Box<Expression>),
}

/// Parses one expression from the front of `tokens`.
///
/// The grammar is `integer ('+' expression)?`, so chained additions nest to the right.
///
/// Returns `None` when the input does not start with an integer literal, or when a `+`
/// is not followed by a valid expression.
///
/// If the token after the left operand is not an operator (or the input ends there),
/// the cursor is rewound so that the caller can read that token from `tokens` itself.
///
/// # Panics
///
/// Panics (via `todo!` in [`TokenStream::parse_next`]) on characters the lexer does not
/// support yet.
fn parse_expression(tokens: &mut TokenStream<'_>) -> Option<Expression> {
    let lhs = match tokens.next()? {
        token @ Token::IntegerLiteral(_) => Expression::Literal(token),
        _ => return None,
    };

    // Remember where the lookahead token starts so it can be handed back untouched.
    let checkpoint = tokens.cursor;

    match tokens.next() {
        Some(operator @ Token::Plus) => {
            let rhs = parse_expression(tokens)?;

            Some(Expression::Binary(operator, Box::new(lhs), Box::new(rhs)))
        }
        // Not an operator, or end of input: the left operand is the whole expression and
        // the rest of the input stays in the token stream for the caller.
        _ => {
            tokens.cursor = checkpoint;
            Some(lhs)
        }
    }
}

/// A lexer over a source string that produces [`Token`]s on demand.
struct TokenStream<'a> {
    /// The complete source text being tokenized.
    source: &'a str,
    /// Byte offset into `source` of the next unread character.
    cursor: usize,
}

impl<'a> TokenStream<'a> {
    /// Creates a stream positioned at the start of `source`.
    pub fn from(source: &'a str) -> Self {
        Self { source, cursor: 0 }
    }

    /// Returns an iterator over the characters that have not been consumed yet.
    fn chars(&self) -> std::str::Chars<'a> {
        self.source[self.cursor..].chars()
    }

    /// Advances the cursor past any leading whitespace.
    pub fn skip_whitespace(&mut self) {
        let rest = &self.source[self.cursor..];
        // `trim_start` strips exactly the characters `char::is_whitespace` accepts.
        let skipped = rest.len() - rest.trim_start().len();
        self.cursor += skipped;
    }

    /// Lexes the token that starts exactly at the cursor and advances past it.
    ///
    /// Returns `None` when the cursor is at the end of the source.
    ///
    /// # Panics
    ///
    /// Panics via `todo!` when the next character does not start a supported token;
    /// this includes whitespace, which [`TokenStream::next`] skips beforehand.
    pub fn parse_next(&mut self) -> Option<Token> {
        let mut chars = self.chars();

        let token = match chars.next()? {
            '+' => Token::Plus,
            ';' => Token::Semicolon,
            // NOTE(review): `is_numeric` also accepts non-ASCII numeric characters;
            // confirm whether `is_ascii_digit` is what the language wants.
            c if c.is_numeric() => {
                // The literal ends at the first non-numeric character or at the end of
                // the input, whichever comes first.
                let length = c.len_utf8()
                    + chars
                        .take_while(|c| c.is_numeric())
                        .map(char::len_utf8)
                        .sum::<usize>();
                let start = self.cursor;

                Token::IntegerLiteral(OffsetStr::from(&self.source[start..start + length]))
            }
            c => todo!("character unsupported: `{}`", natural_char_representation(c)),
        };

        self.cursor += token.len();
        Some(token)
    }

    /// Skips leading whitespace and then lexes the next token.
    ///
    /// Returns `None` once the input is exhausted.
    pub fn next(&mut self) -> Option<Token> {
        self.skip_whitespace();
        self.parse_next()
    }
}

/// Maps invisible characters to a visible stand-in for use in diagnostics.
fn natural_char_representation(c: char) -> char {
    match c {
        ' ' => '␣',
        '\t' => '→',
        '\n' => '⏎',
        _ => c,
    }
}

/// A string slice stored as a raw pointer and a byte length, without a lifetime.
///
/// The value does not keep the text it points to alive: whoever holds an `OffsetStr`
/// must make sure the string it was created from outlives every use of it.
#[derive(Copy, Clone)]
struct OffsetStr {
    /// Pointer to the first byte of the slice.
    data: *const u8,
    /// Length of the slice in bytes.
    length: usize,
}

impl std::fmt::Display for OffsetStr {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}

impl std::fmt::Debug for OffsetStr {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, r#""{}""#, self)
    }
}

impl OffsetStr {
    /// Records the address and byte length of `s`.
    pub fn from(s: &str) -> Self {
        Self {
            data: s.as_ptr(),
            length: s.len(),
        }
    }

    /// Reconstructs the string slice this value was created from.
    fn as_str(&self) -> &str {
        // SAFETY: `data` and `length` are only ever set by `OffsetStr::from`, which takes
        // them from a live `&str`, so they describe `length` initialized bytes. The text
        // must still be alive at this point; that is the caller-upheld invariant stated
        // on the type.
        let bytes = unsafe { std::slice::from_raw_parts(self.data, self.length) };
        std::str::from_utf8(bytes).expect("OffsetStr is only ever built from a valid `&str`")
    }
}

/// A lexical token.
#[derive(Debug, Copy, Clone)]
enum Token {
    /// The `+` operator.
    Plus,
    /// The `;` terminator.
    Semicolon,
    /// A run of numeric characters, referring to its text in the source.
    IntegerLiteral(OffsetStr),
}

impl std::fmt::Display for Token {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::Plus => f.write_str("+"),
            Token::Semicolon => f.write_str(";"),
            // Format the literal's text; formatting the token itself would recurse forever.
            Token::IntegerLiteral(text) => write!(f, "{}", text),
        }
    }
}

impl Token {
    /// Returns the length in bytes of the token's source text.
    pub fn len(&self) -> usize {
        match self {
            Token::Plus | Token::Semicolon => 1,
            Token::IntegerLiteral(text) => text.length,
        }
    }
}