Rewrite to avoid using unsafe

For some reason, the previous version had to use unsafe operations
on string slices because the borrow checker would always complain. Somehow,
after the rewrite, we don't need them anymore.
This commit is contained in:
Aodhnait Étaín 2021-05-25 18:59:12 +01:00
parent 5e2bdfa091
commit 4e339b1f6e
1 changed files with 79 additions and 374 deletions

View File

@ -1,275 +1,39 @@
#![feature(box_syntax)]
// Help text printed for `--help`. Keep it in sync with the argument parsing
// code, otherwise users will be shown options that do not match reality.
static USAGE: &str = "usage: pine [options] input
options:
--help print all options
unstable options:
--unpretty val print un-prettified representation of the source code
valid options for `val` are:
dot, graphdotviz (dot-compatible graph)";
fn main() {
// Throw away the first argument, which usually is the executable name.
let args = std::env::args().skip(1).collect::<Vec<_>>();
// If there are no arguments, we short circuit to avoid having to perform the
// expensive command-line argument generation and parsing step. We can allow
// ourselves to do this since, unlike e.g. rustc, we don't print full usage
// information on invocation of only the binary, but instead we behave more
// like clang or go.
if args.len() == 0 {
eprintln!("pine: \x1b[1;31merror\x1b[0m: no input files");
std::process::exit(1);
}
let mut path: Option<&str> = None;
let mut output_pretty: Option<&str> = None;
// Handle command-line arguments.
let mut i = 0;
loop {
if i == args.len() {
break;
}
let arg = args[i].as_str();
if arg.starts_with("--") {
match &arg[2..] {
"help" => {
println!("{}\n", USAGE);
return;
},
"unpretty" => {
if i + 1 == args.len() {
eprintln!("pine: \x1b[1;31merror\x1b[0m: expected option to '{}'", arg);
std::process::exit(1);
}
output_pretty = match args[i + 1].as_str() {
opt @ ("dot" | "graphdotviz") => Some(opt),
opt => {
eprintln!("pine: \x1b[1;31merror\x1b[0m: invalid option '{}' to '{}'", opt, arg);
std::process::exit(1);
},
};
i += 1;
},
_ => {
eprintln!("pine: \x1b[1;31merror\x1b[0m: unknown argument '{}'", arg);
std::process::exit(1);
},
};
} else if arg.starts_with("-") {
// We don't handle arguments that start with a single dash, this might be
// added later. For now we just exit with an error.
eprintln!("pine: \x1b[1;31merror\x1b[0m: unknown argument '{}'", arg);
std::process::exit(1);
} else {
if path.is_some() {
eprintln!("pine: \x1b[1;31merror\x1b[0m: multiple file names provided (first two are `{}` and `{}`)", path.unwrap(), arg);
std::process::exit(1);
}
// Use this argument as an input file.
path = Some(arg);
}
i += 1;
}
if path.is_none() {
eprintln!("pine: \x1b[1;31merror\x1b[0m: no input files");
std::process::exit(1);
}
let path = path.unwrap();
eprintln!("compiling `{}`", path);
// #[allow(non_upper_case_globals)] const source: &'static str = "+17 + 23 + +21 + 11;";
#[allow(non_upper_case_globals)] const source: &'static str = "11 + 13 * 17 + 19;";
let mut tokens = TokenStream::from(source);
let expr = parse_expression(&mut tokens, 0);
eprintln!("{:?}", expr);
match output_pretty {
Some("dot" | "graphdotviz") => expr.then(|e| {
let graph = e.create_graphviz_graph(unsafe { GRAPHVIZ_NODE_COUNTER.next() });
let graphviz_format = "node [shape = box, style = filled, color = \"#bfd1e5\", fontname = monospace, fontsize = 12]";
eprintln!("digraph {{\n{}\n{}\n}}", graphviz_format, graph);
}),
// This case is validated at the command-line parsing time, and we reject everything
// not specified there. This is why this can never happen, unless a solar flare changes
// a single bit.
Some(_) => unreachable!(),
None => {},
};
// A lexical token. Integer literals borrow their text from the source string,
// which is what the `'a` lifetime tracks.
#[derive(Debug, Copy, Clone)]
enum Token<'a> {
// A run of digits, stored as the slice of source text it was read from.
IntegerLiteral(&'a str),
// The `+` character.
Plus,
// The `;` character.
Semicolon,
}
// A type parameterised by T (either T itself, or a wrapper around it such as
// Option<T>) on which a procedure returning nothing can be invoked.
//
// It resembles the `and_then` method exposed by Option and Result, except that
// it is only meaningful for side-effecting procedures: neither the original
// value nor a new value computed by the procedure is handed back.
//
// The method is named after `then` on bool, since the original intent was to
// run the procedure when the value is Some and to do nothing when it is None.
trait Then<T> {
    // Hands a shared reference to the inner `T` to `f`; nothing is returned.
    fn then<F: FnOnce(&T)>(&self, f: F);
}
impl<T> Then<T> for std::option::Option<T> {
fn then<F>(&self, f: F) where F: FnOnce(&T) {
match self {
None => {},
Some(v) => f(v),
impl Token<'_> {
    // Number of bytes of source text this token occupies: the length of the
    // literal's slice, or 1 for the single-character tokens.
    pub fn len(&self) -> usize {
        match *self {
            Token::Plus | Token::Semicolon => 1,
            Token::IntegerLiteral(text) => text.len(),
        }
    }
}
// Expression tree built by `parse_expression`.
enum Expression {
// A single literal token.
Literal(Token),
// A prefix operator token applied to one operand.
Unary(Token, Box<Expression>),
// A binary operator token followed by its left and right operands, in that order.
Binary(Token, Box<Expression>, Box<Expression>),
}
impl std::fmt::Debug for Expression {
impl std::fmt::Display for Token<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
return match self {
Expression::Literal(token) => write!(f, "{}", token),
Expression::Unary(token, expr) => write!(f, "{}{:?}", token, expr),
Expression::Binary(token, left, right) => write!(f, "({} {:?} {:?})", token, left, right),
Token::IntegerLiteral(i) => write!(f, "{}", i),
Token::Plus => write!(f, "+"),
_ => unreachable!(),
};
}
}
// Hands out consecutive `usize` values, starting from zero.
struct Counter {
    state: usize,
}
impl Counter {
    // Creates a counter whose first `next()` call yields 0. Being `const`, it
    // can be used to initialise a static.
    pub const fn new() -> Self {
        Self { state: 0 }
    }
    // Returns the current value and then advances the counter by one.
    pub fn next(&mut self) -> usize {
        let current = self.state;
        self.state = current + 1;
        current
    }
}
// Used for numbering nodes in GraphViz printer.
// NOTE(review): because this is a `static mut`, every use has to sit inside an
// `unsafe` block; presumably it is only ever touched from a single thread —
// confirm before introducing any concurrency.
static mut GRAPHVIZ_NODE_COUNTER: Counter = Counter::new();
impl Expression {
    // Renders this expression subtree as GraphViz (dot) statements.
    //
    // `id` is the node number to use for the root of this subtree; numbers for
    // child nodes are drawn from GRAPHVIZ_NODE_COUNTER. Child ids are always
    // allocated before the children themselves are rendered, so the numbering
    // of a binary node's operands is adjacent.
    pub fn create_graphviz_graph(&self, id: usize) -> String {
        match self {
            Expression::Literal(Token::IntegerLiteral(i)) => {
                format!("Node{} [label = \"{}\"]", id, i)
            }
            // Only integer literal tokens are expected inside `Literal`.
            Expression::Literal(_) => unreachable!(),
            Expression::Unary(op, expr) => {
                // SAFETY: NOTE(review): relies on the counter never being
                // accessed concurrently — confirm.
                let child_id = unsafe { GRAPHVIZ_NODE_COUNTER.next() };
                let child_graph = expr.create_graphviz_graph(child_id);
                format!("Node{} -> Node{}\nNode{} [label = \"{}\"]\n{}",
                    id, child_id,
                    id, op,
                    child_graph)
            }
            Expression::Binary(op, left, right) => {
                // SAFETY: NOTE(review): relies on the counter never being
                // accessed concurrently — confirm.
                let left_id = unsafe { GRAPHVIZ_NODE_COUNTER.next() };
                let right_id = unsafe { GRAPHVIZ_NODE_COUNTER.next() };
                let left_graph = left.create_graphviz_graph(left_id);
                let right_graph = right.create_graphviz_graph(right_id);
                format!("Node{} -> {{ Node{} Node{} }}\nNode{} [label = \"{}\"]\n{}\n{}",
                    id, left_id, right_id,
                    id, op,
                    left_graph, right_graph)
            }
        }
    }
}
// Precedence of `token` when it is used as a prefix (unary) operator, or `None`
// when the token cannot be used that way. Only `+` is a unary operator.
fn unary_precedence(token: Token) -> Option<usize> {
    if let Token::Plus = token {
        Some(3)
    } else {
        None
    }
}
// Precedence of `token` when it is used as an infix (binary) operator, or
// `None` when it is not one. `*` (2) binds tighter than `+` (1).
fn binary_precedence(token: Token) -> Option<usize> {
    let level = match token {
        Token::Plus => 1,
        Token::Asterisk => 2,
        _ => return None,
    };
    Some(level)
}
// Tells whether `token` can appear between two operands, i.e. is `+` or `*`.
fn is_binary_operator(token: Token) -> bool {
    matches!(token, Token::Plus | Token::Asterisk)
}
// Parses one expression from `tokens`, recursing on operator precedence.
//
// `highest_precedence` is the precedence of the operator this call is the
// operand of (0 at the top level): a binary operator whose precedence is not
// strictly greater than it ends the expression and is left for the caller.
//
// Returns `None` when the stream runs out where a token is required, or when
// the first token is neither an integer literal nor a unary operator.
fn parse_expression<'a, 'b: 'a>(tokens: &'a mut TokenStream<'b>, highest_precedence: usize) -> Option<Expression> {
// Operand position: either a literal, or a unary operator followed by its operand.
let mut lhs = match tokens.next()? {
token @ Token::IntegerLiteral(_) => Expression::Literal(token),
token => {
if let Some(precedence) = unary_precedence(token) {
let expr = parse_expression(tokens, precedence)?;
Expression::Unary(token, box expr)
} else {
return None;
}
},
};
loop {
// NOTE(review): `peek()?` returns `None` from the whole function when no
// token is left, discarding the `lhs` parsed so far — so an expression that
// is not followed by another token (e.g. `;`) appears to fail. Confirm this
// is intended.
let operator = match tokens.peek()? {
operator if is_binary_operator(operator) => operator,
// Anything that is not a binary operator terminates the expression.
_ => return Some(lhs),
};
let precedence = binary_precedence(operator)?;
// The operator does not bind tighter than the one we are an operand of: stop here.
if precedence <= highest_precedence {
return Some(lhs);
}
// NOTE(review): this check looks always true here, since `operator` already
// passed the same test in the match guard above.
if is_binary_operator(operator) {
// Consume the operator that was only peeked at so far.
tokens.next();
let rhs = parse_expression(tokens, precedence)?;
lhs = Expression::Binary(operator, box lhs, box rhs);
// Keep folding while another binary operator follows.
if tokens.peek().map(is_binary_operator).unwrap_or(false) {
continue;
}
}
return Some(lhs);
}
}
struct TokenStream<'a> {
struct Source<'a> {
source: &'a str,
cursor: usize,
last: Option<Token>,
last: Option<Token<'a>>,
}
impl<'a> TokenStream<'a> {
pub fn from(source: &'a str) -> Self {
impl<'a> Source<'a> {
pub fn new(source: &'a str) -> Self {
return Self {
source,
cursor: 0,
@ -277,164 +41,105 @@ impl<'a> TokenStream<'a> {
};
}
// Utility function for creating an iterator over characters of the current source,
// starting at the cursor position, as we use this function in quite a few places.
//
// The receiver is a plain `&self` rather than `&'a self`: the returned iterator
// borrows from the underlying `&'a str`, not from this struct, so there is no need
// to keep the struct itself borrowed for the whole of `'a`. Borrowing it for `'a`
// would prevent any later `&mut self` use while the iterator is alive.
//
// Panics if the cursor does not lie on a character boundary of the source.
#[inline]
fn chars(&self) -> std::str::Chars<'a> {
    // Copy the `&'a str` out first so the slice below is tied to `'a` only.
    let source: &'a str = self.source;
    source[self.cursor..].chars()
}
// Advances the current cursor position past all leading whitespace characters, i.e.
// those having the White_Space property in Unicode [PropList.txt].
//
// [PropList.txt]: https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt
fn skip_whitespace(&mut self) {
let mut chars = self.chars().peekable();
let mut length = 0;
loop {
match match chars.peek() {
None => return,
Some(c) => c
} {
c if c.is_whitespace() => {
length += c.len_utf8();
chars.next();
},
_ => break,
};
};
let mut chars = self.source[self.cursor..].chars();
self.cursor += length;
while let Some(c) = chars.next() {
if c.is_whitespace() {
self.cursor += c.len_utf8();
} else {
return;
}
}
}
fn parse_next(&mut self) -> Option<Token> {
let mut chars = self.chars();
fn get_next(&mut self) -> Option<Token<'a>> {
self.skip_whitespace();
let mut chars = self.source[self.cursor..].chars();
let token = match chars.next()? {
'*' => Token::Asterisk,
'+' => Token::Plus,
';' => Token::Semicolon,
c if c.is_numeric() => {
c if c.is_ascii_digit() => {
let start = self.cursor;
let mut length = c.len_utf8();
loop {
match chars.next()? {
c if c.is_numeric() => length += c.len_utf8(),
c if c.is_ascii_digit() => length += c.len_utf8(),
_ => break,
};
};
Token::IntegerLiteral(OffsetStr::from(&self.source[start..start + length]))
Token::IntegerLiteral(&self.source[start..start + length])
},
c => todo!("character unsupported: `{}`", natural_char_representation(c)),
c => todo!("invalid character `{:?}`", c)
};
return Some(token);
}
pub fn next(&mut self) -> Option<Token> {
pub fn next(&mut self) -> Option<Token<'a>> {
let token = match self.last {
Some(_) => std::mem::take(&mut self.last).unwrap(),
None => {
self.skip_whitespace();
self.parse_next()?
},
Some(t) => t,
None => self.get_next()?,
};
self.last = None;
self.cursor += token.len();
return Some(token);
}
pub fn peek(&mut self) -> Option<Token> {
self.skip_whitespace();
self.last = Some(self.parse_next()?);
pub fn peek(&mut self) -> Option<Token<'a>> {
self.last = Some(self.get_next()?);
return self.last;
}
}
// Returns char representation in a way that is friendly for displaying in terminals.
fn natural_char_representation(c: char) -> char {
return match c {
' ' => '␣',
'\t' => '→',
'\n' => '⏎',
_ => c,
// A single parsed statement. The only kind so far is an expression statement.
#[derive(Debug)]
enum Statement<'a> {
// Wraps the expression that makes up the statement.
Expression(Expression<'a>),
}
// statement = expression ';' .
//
// Returns `None` when the input is exhausted, when the statement does not
// start with an integer literal, when the expression fails to parse, or when
// the token following the expression is not a semicolon.
fn parse_statement<'a>(source: &mut Source<'a>) -> Option<Statement<'a>> {
    // Only an integer literal can begin an expression; look without consuming.
    if !matches!(source.peek()?, Token::IntegerLiteral(_)) {
        return None;
    }
    let expression = parse_expression(source)?;
    // The statement must be terminated by `;`.
    match source.next()? {
        Token::Semicolon => Some(Statement::Expression(expression)),
        _ => None,
    }
}
// This struct is a raw representation of Rust's &str, but one that doesn't have to
// keep track of its lifetime. This allows us to express the notion of a string slice that
// lives only as long as the underlying string does, although in a much more unsafe way.
// Currently this is the only way I have found to implement the parse_next and next
// functions in TokenStream the way they are now (though it might be possible to change
// them and implement them differently, without having to sidestep the borrow checker).
//
// TODO: Do we really need this struct? Is there a way to make the borrow checker accept what
// we want to convey? Or is it impossible due to how it currently works/is implemented?
#[derive(Copy, Clone)]
struct OffsetStr {
data: *const u8,
length: usize,
// Expression tree whose literals borrow their text from the source string.
#[derive(Debug)]
enum Expression<'a> {
// An integer literal, kept as the slice of source text it was read from.
Literal(&'a str),
// A binary operation: the operator token, then the left and right operands.
Binary(Token<'a>, Box<Expression<'a>>, Box<Expression<'a>>),
}
impl std::fmt::Display for OffsetStr {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// SAFETY: Safety of this function relies on the caller to ensure that the data it wants
// to print actually exists, is readable, and is a string. In other words, the entirety
// of this function is unsafe.
return write!(f, "{}", std::str::from_utf8(unsafe { std::slice::from_raw_parts(self.data, self.length) }).unwrap());
}
// expression = literal | expression '+' expression .
//
// The right-hand side is parsed by recursion, so `a + b + c` becomes
// `a + (b + c)`. Returns `None` if the input ends early, does not start with
// an integer literal, or the literal is followed by anything other than `+`
// or `;`.
fn parse_expression<'a>(source: &mut Source<'a>) -> Option<Expression<'a>> {
    let lhs = if let Token::IntegerLiteral(text) = source.next()? {
        Expression::Literal(text)
    } else {
        return None;
    };
    // Look at what follows the literal without consuming it: the semicolon
    // must be left in place for the caller.
    let operator = source.peek()?;
    match operator {
        Token::Plus => {}
        Token::Semicolon => return Some(lhs),
        _ => return None,
    }
    // Consume the `+` that was only peeked at.
    source.next();
    let rhs = parse_expression(source)?;
    Some(Expression::Binary(operator, Box::new(lhs), Box::new(rhs)))
}
impl std::fmt::Debug for OffsetStr {
    // SAFETY: This delegates to the Display implementation, so it inherits the
    // same SAFETY note as Display::fmt and is just as unsafe.
    //
    // Prints the Display output wrapped in double quotes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, r#""{}""#, self)
    }
}
impl OffsetStr {
    // Builds a raw string from a string slice by recording its start pointer
    // and byte length. Nothing ties the result to the lifetime of `s`: it is up
    // to the caller to make sure the underlying data is neither dropped nor
    // reused for something else for as long as the raw string is in use.
    pub fn from(s: &str) -> Self {
        let (data, length) = (s.as_ptr(), s.len());
        Self { data, length }
    }
}
// A lexical token.
#[derive(Debug, Copy, Clone)]
enum Token {
// The `+` character.
Plus,
// The `*` character.
Asterisk,
// The `;` character.
Semicolon,
// An integer literal, referring to its source text through a raw, lifetime-less slice.
IntegerLiteral(OffsetStr),
}
impl std::fmt::Display for Token {
    // Writes the token as it appears in source code: the literal's text, or
    // the single character of a punctuation token.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let symbol = match self {
            Token::IntegerLiteral(s) => return write!(f, "{}", s),
            Token::Plus => "+",
            Token::Asterisk => "*",
            Token::Semicolon => ";",
        };
        write!(f, "{}", symbol)
    }
}
impl Token {
pub fn len(&self) -> usize {
return match self {
Token::Plus | Token::Asterisk | Token::Semicolon => 1,
Token::IntegerLiteral(i) => i.length,
};
}
// Entry point: parses a hard-coded sample statement and dumps the resulting
// tree (or `None` on failure) to stderr.
fn main() {
    let mut source = Source::new("3 + 5 + 7;");
    let statement = parse_statement(&mut source);
    eprintln!("{:?}", statement);
}