/// A lexical token. Integer literals borrow their text directly from the
/// source string, so a token never outlives the source it was read from.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum Token<'a> {
    /// A run of ASCII digits, stored as the exact slice of the source.
    IntegerLiteral(&'a str),
    /// The `+` operator.
    Plus,
    /// The `;` statement terminator.
    Semicolon,
}

impl Token<'_> {
    /// Returns the number of bytes this token occupies in the source.
    pub fn len(&self) -> usize {
        return match self {
            Token::IntegerLiteral(s) => s.len(),
            Token::Plus | Token::Semicolon => 1,
        };
    }
}

impl std::fmt::Display for Token<'_> {
    /// Writes the token exactly as it is spelled in the source.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // The match is exhaustive on purpose: adding a token variant must
        // fail to compile here rather than panic when it is first printed.
        return match self {
            Token::IntegerLiteral(i) => write!(f, "{}", i),
            Token::Plus => write!(f, "+"),
            Token::Semicolon => write!(f, ";"),
        };
    }
}

/// A cursor over the source text that hands out tokens one at a time, with a
/// single token of lookahead.
struct Source<'a> {
    /// The complete source text.
    source: &'a str,
    /// Byte offset of the first character that has not been consumed yet.
    cursor: usize,
    /// Token produced by `peek` that `next` has not consumed yet.
    last: Option<Token<'a>>,
}

impl<'a> Source<'a> {
    /// Creates a token source positioned at the start of `source`.
    pub fn new(source: &'a str) -> Self {
        return Self {
            source,
            cursor: 0,
            last: None,
        };
    }

    /// Advances the cursor past any whitespace (as defined by
    /// `char::is_whitespace`) at the current position.
    fn skip_whitespace(&mut self) {
        let skipped: usize = self.source[self.cursor..]
            .chars()
            .take_while(|c| c.is_whitespace())
            .map(char::len_utf8)
            .sum();

        self.cursor += skipped;
    }

    /// Lexes the token that starts at the cursor, after skipping whitespace.
    ///
    /// The cursor is moved past the whitespace only, never past the token
    /// itself; consuming the token is the job of `next`. Returns `None` when
    /// only whitespace (or nothing) is left.
    ///
    /// # Panics
    ///
    /// Panics (via `todo!`) on any character that does not start a known
    /// token, as error reporting for those is not implemented yet.
    fn get_next(&mut self) -> Option<Token<'a>> {
        self.skip_whitespace();

        // Copy the reference out so that the slices below borrow from the
        // source text ('a) and not from `self`.
        let source = self.source;
        let rest = &source[self.cursor..];

        let token = match rest.chars().next()? {
            '+' => Token::Plus,
            ';' => Token::Semicolon,
            c if c.is_ascii_digit() => {
                // The literal runs up to the first non-digit, or to the end of
                // the input if there is none. ASCII digits are one byte each,
                // so the byte index returned by `find` is the literal length.
                let length = rest
                    .find(|c: char| !c.is_ascii_digit())
                    .unwrap_or(rest.len());

                Token::IntegerLiteral(&rest[..length])
            },
            c => todo!("invalid character `{:?}`", c),
        };

        return Some(token);
    }

    /// Consumes and returns the next token, or `None` at the end of input.
    pub fn next(&mut self) -> Option<Token<'a>> {
        // Prefer the token cached by `peek`, if any; `take` also clears it.
        let token = match self.last.take() {
            Some(token) => token,
            None => self.get_next()?,
        };

        self.cursor += token.len();
        return Some(token);
    }

    /// Returns the next token without consuming it, or `None` at the end of
    /// input. Repeated calls return the same token until `next` is called.
    pub fn peek(&mut self) -> Option<Token<'a>> {
        if self.last.is_none() {
            self.last = Some(self.get_next()?);
        }

        return self.last;
    }
}

/// A single parsed statement.
#[derive(Debug, PartialEq, Eq)]
enum Statement<'a> {
    /// An expression terminated by `;`.
    Expression(Expression<'a>),
}

/// Parses one statement. Returns `None` if the input does not form one.
///
/// statement = expression ';' .
fn parse_statement<'a>(source: &mut Source<'a>) -> Option<Statement<'a>> {
    // Every expression currently starts with an integer literal.
    let expression = match source.peek()? {
        Token::IntegerLiteral(_) => parse_expression(source)?,
        _ => return None,
    };

    return match source.next()? {
        Token::Semicolon => Some(Statement::Expression(expression)),
        _ => None,
    };
}

/// A parsed expression tree.
#[derive(Debug, PartialEq, Eq)]
enum Expression<'a> {
    /// An integer literal, as spelled in the source.
    Literal(&'a str),
    /// A binary operation: the operator token, then left and right operands.
    Binary(Token<'a>, Box<Expression<'a>>, Box<Expression<'a>>),
}

/// Parses one expression, leaving the terminating `;` unconsumed. Returns
/// `None` if the input does not form an expression followed by `;`.
///
/// The right operand is parsed recursively, so `a + b + c` is grouped as
/// `a + (b + c)`.
///
/// expression = literal | expression '+' expression .
fn parse_expression<'a>(source: &mut Source<'a>) -> Option<Expression<'a>> {
    let lhs = match source.next()? {
        Token::IntegerLiteral(i) => Expression::Literal(i),
        _ => return None,
    };

    let operator = match source.peek()? {
        token @ Token::Plus => token,
        // The terminator belongs to the statement, so it is left in place.
        Token::Semicolon => return Some(lhs),
        _ => return None,
    };
    // Consume the operator that was only peeked at above.
    source.next();

    let rhs = parse_expression(source)?;
    return Some(Expression::Binary(operator, Box::new(lhs), Box::new(rhs)));
}

/// Parses a hard-coded sample statement and prints the result to stderr.
fn main() {
    let inline_source = "3 + 5 + 7;";
    let mut source = Source::new(inline_source);
    eprintln!("{:?}", parse_statement(&mut source));
}