Add simple expression parsing
Currently it can only parse addition, and only in a right-associative way, i.e. 1 + 2 + 3 => 1 + (2 + 3). The reason we are using raw pointers, in the form of the OffsetStr struct, is that I don't think there is a way to prove to the Rust compiler that the tokens the parse_next function returns are always valid references into the TokenStream source string. Therefore we simply have to bypass the borrow checker and handle this ourselves.
This commit is contained in:
parent
4841f7b657
commit
ba409b9be0
164
src/main.rs
164
src/main.rs
|
@ -65,4 +65,168 @@ fn main() {
|
|||
|
||||
let path = path.unwrap();
|
||||
eprintln!("compiling `{}`", path);
|
||||
|
||||
#[allow(non_upper_case_globals)] const source: &'static str = "23 + 21;";
|
||||
let mut tokens = TokenStream::from(source);
|
||||
let expr = parse_expression(&mut tokens);
|
||||
eprintln!("{:?}", expr);
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
enum Expression {
|
||||
Literal(Token),
|
||||
Binary(Token, Box<Expression>, Box<Expression>),
|
||||
}
|
||||
|
||||
fn parse_expression<'a>(tokens: &'a mut TokenStream<'a>) -> Option<Expression> {
|
||||
let lhs = match tokens.next()? {
|
||||
token @ Token::IntegerLiteral(_) => Expression::Literal(token),
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
return match tokens.next()? {
|
||||
operator @ Token::Plus => {
|
||||
let rhs = parse_expression(tokens)?;
|
||||
|
||||
Some(Expression::Binary(operator, box lhs, box rhs))
|
||||
},
|
||||
// If it's not a valid operator, then caller can get rest of the input in the token stream
|
||||
// it has provided to us.
|
||||
_ => Some(lhs),
|
||||
};
|
||||
}
|
||||
|
||||
/// A cursor over source text from which tokens are read one at a time.
struct TokenStream<'a> {
    /// The complete source text being tokenized.
    source: &'a str,
    /// Byte offset into `source` of the next character that has not been consumed yet.
    cursor: usize,
}
|
||||
|
||||
impl<'a> TokenStream<'a> {
|
||||
pub fn from(source: &'a str) -> Self {
|
||||
return Self {
|
||||
source,
|
||||
cursor: 0,
|
||||
};
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
fn chars(&'a self) -> std::str::Chars<'a> {
|
||||
return self.source[self.cursor..].chars();
|
||||
}
|
||||
|
||||
pub fn skip_whitespace(&mut self) {
|
||||
let mut chars = self.chars().peekable();
|
||||
let mut length = 0;
|
||||
|
||||
loop {
|
||||
match match chars.peek() {
|
||||
None => return,
|
||||
Some(c) => c
|
||||
} {
|
||||
c if c.is_whitespace() => {
|
||||
length += c.len_utf8();
|
||||
chars.next();
|
||||
},
|
||||
_ => break,
|
||||
};
|
||||
};
|
||||
|
||||
self.cursor += length;
|
||||
}
|
||||
|
||||
pub fn parse_next(&mut self) -> Option<Token> {
|
||||
let mut chars = self.chars();
|
||||
|
||||
let token = match chars.next()? {
|
||||
'+' => Token::Plus,
|
||||
';' => Token::Semicolon,
|
||||
c if c.is_numeric() => {
|
||||
let start = self.cursor;
|
||||
let mut length = c.len_utf8();
|
||||
|
||||
loop {
|
||||
match chars.next()? {
|
||||
c if c.is_numeric() => length += c.len_utf8(),
|
||||
_ => break,
|
||||
};
|
||||
};
|
||||
|
||||
Token::IntegerLiteral(OffsetStr::from(&self.source[start..start + length]))
|
||||
},
|
||||
c => todo!("character unsupported: `{}`", natural_char_representation(c)),
|
||||
};
|
||||
|
||||
self.cursor += token.len();
|
||||
return Some(token);
|
||||
}
|
||||
|
||||
pub fn next(&mut self) -> Option<Token> {
|
||||
self.skip_whitespace();
|
||||
return self.parse_next();
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps whitespace characters that are hard to see in a diagnostic message to a visible
/// stand-in glyph. Any other character is returned unchanged.
fn natural_char_representation(c: char) -> char {
    // (raw character, glyph shown in its place)
    const VISIBLE_FORMS: [(char, char); 3] = [(' ', '␣'), ('\t', '→'), ('\n', '⏎')];

    VISIBLE_FORMS
        .iter()
        .find(|(raw, _)| *raw == c)
        .map_or(c, |&(_, shown)| shown)
}
|
||||
|
||||
/// A string slice stored as a raw pointer plus a byte length, with no lifetime attached.
///
/// Nothing in this type keeps the referenced text alive: whoever creates a value must make
/// sure the string it was created from outlives every use of the value.
#[derive(Clone, Copy)]
struct OffsetStr {
    /// Address of the first byte of the text.
    data: *const u8,
    /// Number of bytes of text starting at `data`.
    length: usize,
}
|
||||
|
||||
impl std::fmt::Display for OffsetStr {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
return write!(f, "{}", std::str::from_utf8(unsafe { std::slice::from_raw_parts(self.data, self.length) }).unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for OffsetStr {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
return write!(f, r#""{}""#, self);
|
||||
}
|
||||
}
|
||||
|
||||
impl OffsetStr {
|
||||
pub fn from(s: &str) -> Self {
|
||||
return Self {
|
||||
data: s.as_ptr(),
|
||||
length: s.len(),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
enum Token {
|
||||
Plus,
|
||||
Semicolon,
|
||||
IntegerLiteral(OffsetStr),
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Token {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
return match self {
|
||||
token @ Token::IntegerLiteral(_) => write!(f, "{}", token),
|
||||
token => write!(f, "{}", match token {
|
||||
Token::Plus => "+",
|
||||
Token::Semicolon => ";",
|
||||
_ => unreachable!(),
|
||||
}),
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
impl Token {
|
||||
pub fn len(&self) -> usize {
|
||||
return match self {
|
||||
Token::Plus | Token::Semicolon => 1,
|
||||
Token::IntegerLiteral(i) => i.length,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue