Add simple expression parsing

Currently can only parse addition in a right-associative way, i.e.
  1 + 2 + 3 => 1 + (2 + 3).

The reason we are using raw pointers, in the form of the OffsetStr struct,
is that I don't think there is a way to prove to the Rust compiler that the
tokens the parse_next function returns are always valid references into
the TokenStream source string. Therefore we simply have to bypass the
borrow checker and handle this ourselves.
This commit is contained in:
Aodhnait Étaín 2021-05-22 21:45:42 +01:00
parent 4841f7b657
commit ba409b9be0

View file

@ -65,4 +65,168 @@ fn main() {
let path = path.unwrap();
eprintln!("compiling `{}`", path);
#[allow(non_upper_case_globals)] const source: &'static str = "23 + 21;";
let mut tokens = TokenStream::from(source);
let expr = parse_expression(&mut tokens);
eprintln!("{:?}", expr);
}
/// Expression tree produced by `parse_expression`.
#[derive(Debug)]
enum Expression {
/// A single token used as a value; `parse_expression` only builds this from
/// `Token::IntegerLiteral`.
Literal(Token),
/// A binary operation: the operator token, then the left and right operands.
Binary(Token, Box<Expression>, Box<Expression>),
}
/// Parses `integer ('+' expression)?` from `tokens`.
///
/// Addition is right-associative: `1 + 2 + 3` becomes `1 + (2 + 3)`.
///
/// Returns `None` when the stream does not start with an integer literal, or
/// when a `+` is not followed by a valid expression. A token that follows a
/// complete expression but is not an operator is left in the stream so the
/// caller can consume it.
fn parse_expression(tokens: &mut TokenStream<'_>) -> Option<Expression> {
    let lhs = match tokens.next()? {
        token @ Token::IntegerLiteral(_) => Expression::Literal(token),
        _ => return None,
    };

    // Remember the position before looking ahead: if the next token is not an
    // operator (or the input simply ends) the cursor is restored, so nothing
    // past the expression is swallowed.
    let checkpoint = tokens.cursor;
    match tokens.next() {
        Some(operator @ Token::Plus) => {
            let rhs = parse_expression(tokens)?;
            Some(Expression::Binary(operator, Box::new(lhs), Box::new(rhs)))
        },
        _ => {
            tokens.cursor = checkpoint;
            Some(lhs)
        },
    }
}
/// Cursor-based lexer over a borrowed source string.
struct TokenStream<'a> {
    /// The complete text being tokenized.
    source: &'a str,
    /// Byte offset into `source` of the first character not yet consumed.
    cursor: usize,
}

impl<'a> TokenStream<'a> {
    /// Creates a stream positioned at the start of `source`.
    pub fn from(source: &'a str) -> Self {
        Self { source, cursor: 0 }
    }

    /// Iterates over the characters that have not been consumed yet.
    ///
    /// The iterator borrows from the source text, not from the stream, so the
    /// cursor may be updated while it is still alive.
    #[inline(always)]
    fn chars(&self) -> std::str::Chars<'a> {
        self.source[self.cursor..].chars()
    }

    /// Advances the cursor past any leading whitespace.
    ///
    /// Whitespace that runs up to the end of the input is skipped as well, so
    /// a following `parse_next` sees end of input rather than a whitespace
    /// character.
    pub fn skip_whitespace(&mut self) {
        let skipped: usize = self
            .chars()
            .take_while(|c| c.is_whitespace())
            .map(char::len_utf8)
            .sum();
        self.cursor += skipped;
    }

    /// Lexes one token starting exactly at the cursor and advances past it.
    ///
    /// Returns `None` at end of input. Leading whitespace is *not* skipped;
    /// use `next` for that.
    ///
    /// # Panics
    ///
    /// Panics (via `todo!`) on any character that does not start a known
    /// token.
    pub fn parse_next(&mut self) -> Option<Token> {
        let mut chars = self.chars();
        let token = match chars.next()? {
            '+' => Token::Plus,
            ';' => Token::Semicolon,
            // NOTE: `char::is_numeric` accepts every Unicode numeric
            // character, not only ASCII digits.
            first if first.is_numeric() => {
                let start = self.cursor;
                // The literal ends at the first non-numeric character *or* at
                // the end of the input; both simply terminate the run.
                let length = first.len_utf8()
                    + chars
                        .take_while(|c| c.is_numeric())
                        .map(char::len_utf8)
                        .sum::<usize>();
                Token::IntegerLiteral(OffsetStr::from(&self.source[start..start + length]))
            },
            c => todo!("character unsupported: `{}`", natural_char_representation(c)),
        };

        self.cursor += token.len();
        Some(token)
    }

    /// Skips leading whitespace and then lexes the next token.
    pub fn next(&mut self) -> Option<Token> {
        self.skip_whitespace();
        self.parse_next()
    }
}
/// Maps invisible characters to a printable stand-in for diagnostics.
///
/// Space, tab and newline are replaced by `␣`, `→` and `⏎` respectively; every
/// other character is returned unchanged.
fn natural_char_representation(c: char) -> char {
    const STAND_INS: [(char, char); 3] = [(' ', '␣'), ('\t', '→'), ('\n', '⏎')];

    STAND_INS
        .iter()
        .find(|(raw, _)| *raw == c)
        .map_or(c, |&(_, shown)| shown)
}
/// A string slice stored as a raw start pointer plus a byte length.
///
/// Nothing in the type ties a value to the string it was created from, so the
/// borrow checker cannot verify that the text is still alive when the value
/// is formatted; keeping the originating string alive is up to the user.
#[derive(Copy, Clone)]
struct OffsetStr {
    /// Address of the first byte of the text.
    data: *const u8,
    /// Length of the text in bytes.
    length: usize,
}

impl OffsetStr {
    /// Records the address and byte length of `s`.
    pub fn from(s: &str) -> Self {
        let (data, length) = (s.as_ptr(), s.len());
        Self { data, length }
    }
}

impl std::fmt::Display for OffsetStr {
    /// Writes the referenced text verbatim.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // SAFETY: sound only while the string handed to `OffsetStr::from` is
        // still alive and unmodified; the type itself cannot enforce this.
        let bytes = unsafe { std::slice::from_raw_parts(self.data, self.length) };
        let text = std::str::from_utf8(bytes).unwrap();
        write!(f, "{}", text)
    }
}

impl std::fmt::Debug for OffsetStr {
    /// Writes the referenced text wrapped in double quotes (no escaping).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, r#""{}""#, self)
    }
}
/// A lexical token.
#[derive(Debug, Copy, Clone)]
enum Token {
    /// The `+` operator.
    Plus,
    /// The `;` terminator.
    Semicolon,
    /// An integer literal, carrying the text it was written as.
    IntegerLiteral(OffsetStr),
}

impl std::fmt::Display for Token {
    /// Writes the token as its source text.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Token::Plus => f.write_str("+"),
            Token::Semicolon => f.write_str(";"),
            // Format the literal's text, not the token itself: formatting the
            // token here would re-enter this function and never terminate.
            Token::IntegerLiteral(literal) => write!(f, "{}", literal),
        }
    }
}

impl Token {
    /// Length of the token's source text in bytes.
    pub fn len(&self) -> usize {
        match self {
            Token::Plus | Token::Semicolon => 1,
            Token::IntegerLiteral(literal) => literal.length,
        }
    }
}