Unverified Commit baff926f authored by Adrian Thurston, committed by GitHub

feat: added rust scanner (#3362)

* feat: ported the scanner to rust

To build the rust scanner you need ragel 7.0.1 installed. Consult
Dockerfile_build for instructions on how to install this.

* feat: preparing to add the rust scanner as a separate module

Both scanners can exist simultaneously for some time, and it should be an easy
fix if we need to revert.

* feat: use the rust scanner, can switch back by changing the scanner struct

* fix: moved the rust scanner to the new location in core/src

After rebasing on master, the file names needed to be updated.

* fix: moved all of the rust scanner to a sub-module

Using the rust scanner instead of C is now just a matter of referencing
rust::Scanner instead of Scanner.

* fix: don't need to clone the token defs, can include scanner::*

Also renamed scan2() to scan() and included the rust scanner in the generated targets.

* chore: call the original scanner

Reverting to the original scanner; the rust scanner is disabled again. To
enable it, simply use scanner::rust::Scanner from the parser instead of
scanner::Scanner.
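For illustration (the parser module and its path are assumed, not shown in
this diff), the switch is a one-line import change:

    // Hypothetical parser module: pick one of the two imports.
    use crate::scanner::Scanner;        // C-backed scanner (current default)
    // use crate::scanner::rust::Scanner;  // opt in to the rust scanner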

* fix: added some explanation around errors if ragel-7 install is wrong

* fix: added the generated rust scanner code and removed dependency from Makefile

Checking in the generated code and removing the dependency from the Makefile
means the rust library can be compiled without users needing ragel 7
installed.

* fix: don't check fmt/clippy of the generated rust scanner
parent 0f18e77f
......@@ -124,6 +124,17 @@ vet: libflux-go
bench: libflux-go
$(GO_TEST) -bench=. -run=^$$ ./...
# This requires ragel 7.0.1.
libflux/core/src/scanner/rust/scanner.rs: libflux/core/src/scanner/rust/scanner.rl
ragel-rust -I libflux/core/src/scanner -o $@ $<
rm libflux/core/src/scanner/rust/scanner.ri
# If you see the error:
# ragel: -C is an invalid argument
# from this command, it means you have replaced ragel 6 with ragel 7. Instead,
# install ragel 7 to a unique location and put it on your path *after* ragel 6.
# This way the ragel 6 binary hides the ragel 7 binary, but ragel-rust from
# ragel 7 is still available. See Dockerfile_build for an example.
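# For instance (hypothetical layout): with PATH=/usr/local/bin:/opt/ragel7/bin,
# /usr/local/bin/ragel (6.x) shadows /opt/ragel7/bin/ragel, while ragel-rust
# is still resolved from /opt/ragel7/bin.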
libflux/scanner.c: libflux/core/src/scanner/scanner.rl
ragel -C -o libflux/scanner.c libflux/core/src/scanner/scanner.rl
......
#![cfg_attr(feature = "strict", deny(warnings, missing_docs))]
#![cfg_attr(feature = "strict", allow(warnings, missing_docs))]
//! The flux crate handles the parsing and semantic analysis of flux source
//! code.
......
......@@ -7,6 +7,8 @@ use std::collections::HashMap;
use std::ffi::CString;
use std::str;
pub mod rust;
pub struct Scanner {
data: CString,
ps: *const CChar,
......
#![allow(missing_docs)]
include!(concat!(env!("OUT_DIR"), "/bindings.rs"));
use std::collections::HashMap;
use std::ffi::CString;
use std::str;
use std::vec::Vec;
#[rustfmt::skip]
#[allow(clippy::all)]
mod scanner;
use crate::scanner::*;
pub struct Scanner {
data: Vec<u8>,                // input being scanned
ps: i32,                      // offset of the start of the data (always 0 here)
p: i32,                       // current scan position
pe: i32,                      // offset one past the end of the data
eof: i32,                     // end-of-input marker (equals pe; all data is in memory)
last_newline: i32,            // offset just past the most recent newline
cur_line: i32,                // 1-based line number at the current position
checkpoint: i32,              // saved p, restored by unread()
checkpoint_line: i32,         // saved cur_line, restored by unread()
checkpoint_last_newline: i32, // saved last_newline, restored by unread()
token: TOK,                   // token type produced by the most recent scan
positions: HashMap<Position, u32>, // position -> byte offset, see offset()
pub comments: Option<Box<Token>>,  // pending comment chain for the next token
}
impl Scanner {
// new creates a scanner with the provided input.
pub fn new(data: CString) -> Scanner {
let ptr = data.as_ptr();
let bytes = data.as_bytes();
let end = bytes.len() as i32;
Scanner {
data: data.into_bytes(),
ps: 0,
p: 0,
pe: end,
eof: end,
last_newline: 0,
cur_line: 1,
token: TOK_ILLEGAL,
checkpoint: 0,
checkpoint_line: 1,
checkpoint_last_newline: 0,
positions: HashMap::new(),
comments: None,
}
}
fn scan_with_comments(&mut self, mode: i32) -> Token {
let mut token;
loop {
token = self._scan(mode);
if token.tok != TOK_COMMENT {
break;
}
token.comments = self.comments.take();
self.comments = Some(Box::new(token));
}
token.comments = self.comments.take();
token
}
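// Editorial illustration of the chaining above (assuming comment literals
// keep their trailing newline, per single_line_comment in scanner.rl):
// scanning "// a\n// b\nx" yields the `x` token (TOK_IDENT) whose
// `comments` field holds the "// b\n" token, whose own `comments` field
// in turn holds the "// a\n" token, i.e. most recent comment first.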
// scan produces the next token from the input.
pub fn scan(&mut self) -> Token {
self.scan_with_comments(0)
}
// scan_with_regex produces the next token from the input accounting for regex.
pub fn scan_with_regex(&mut self) -> Token {
self.scan_with_comments(1)
}
// scan_string_expr produces the next token from the input in a string expression.
pub fn scan_string_expr(&mut self) -> Token {
self.scan_with_comments(2)
}
// unread resets the Scanner to its location before the most recent call to
// scan, scan_with_regex, or scan_string_expr. If a scan method returned an
// EOF token, a call to unread will not unread the discarded whitespace.
// Calling unread again without an intervening scan is a no-op.
pub fn unread(&mut self) {
self.p = self.checkpoint;
self.cur_line = self.checkpoint_line;
self.last_newline = self.checkpoint_last_newline;
}
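// Editorial note: this checkpoint/unread contract gives one token of
// lookahead. A hypothetical peek built on it:
//
//     let t = s.scan();  // consume a token and move the checkpoint
//     s.unread();        // rewind to the checkpoint ...
//     let t2 = s.scan(); // ... so the same token is scanned again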
pub fn offset(&self, pos: &Position) -> u32 {
*self.positions.get(pos).expect("position should be in map")
}
fn get_eof_token(&self) -> Token {
let data_len = self.data.len() as u32;
let column = self.eof as u32 - self.last_newline as u32 + 1;
Token {
tok: TOK_EOF,
lit: String::from(""),
start_offset: data_len,
end_offset: data_len,
start_pos: Position {
line: self.cur_line as u32,
column,
},
end_pos: Position {
line: self.cur_line as u32,
column,
},
comments: None,
}
}
fn _scan(&mut self, mode: i32) -> Token {
if self.p == self.eof {
return self.get_eof_token();
}
// Save our state in case we need to unread
self.checkpoint = self.p;
self.checkpoint_line = self.cur_line;
self.checkpoint_last_newline = self.last_newline;
let mut token_start: i32 = 0;
let mut token_start_line: i32 = 0;
let mut token_start_col: i32 = 0;
let mut token_end: i32 = 0;
let mut token_end_line: i32 = 0;
let mut token_end_col: i32 = 0;
let error = {
scanner::scan(
&self.data,
mode,
&mut self.p,
self.ps,
self.pe,
self.eof,
&mut self.last_newline,
&mut self.cur_line,
&mut self.token,
&mut token_start,
&mut token_start_line,
&mut token_start_col,
&mut token_end,
&mut token_end_line,
&mut token_end_col,
)
};
let t = if error != 0 {
// Execution failed, meaning we hit a pattern that we don't support and
// that doesn't produce a token. Decode the next character in the
// sequence so we don't break up any unicode tokens.
let nc = unsafe {
std::str::from_utf8_unchecked(&self.data[(token_start as usize)..])
.chars()
.next()
};
match nc {
Some(nc) => {
// It's possible that the scanner left the data pointer in the middle
// of a character. This resets the pointer to the
// beginning of the token we just failed to scan.
self.p = self.ps + token_start;
let size = nc.len_utf8();
// Advance the data pointer to after the character we just emitted.
self.p += size as i32;
Token {
tok: TOK_ILLEGAL,
lit: nc.to_string(),
start_offset: token_start as u32,
end_offset: (token_start + size as i32) as u32,
start_pos: Position {
line: token_start_line as u32,
column: token_start_col as u32,
},
end_pos: Position {
line: token_start_line as u32,
column: (token_start_col + size as i32) as u32,
},
comments: None,
}
}
// This should be impossible, as we would have produced an EOF token
// instead, but handle it anyway: in this scenario we would otherwise
// enter an infinite loop by continuing to scan past the token.
None => self.get_eof_token(),
}
} else if self.token == TOK_ILLEGAL && self.p == self.eof {
// end of input
self.get_eof_token()
} else {
// No error or EOF, we can process the returned values normally.
let lit = unsafe {
str::from_utf8_unchecked(&self.data[(token_start as usize)..(token_end as usize)])
};
Token {
tok: self.token,
lit: String::from(lit),
start_offset: token_start as u32,
end_offset: token_end as u32,
start_pos: Position {
line: token_start_line as u32,
column: token_start_col as u32,
},
end_pos: Position {
line: token_end_line as u32,
column: token_end_col as u32,
},
comments: None,
}
};
// Record mapping from position to offset so clients
// may later go from position to offset by calling offset()
self.positions.insert(t.start_pos.clone(), t.start_offset);
self.positions.insert(t.end_pos.clone(), t.end_offset);
t
}
}
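For context, a minimal sketch of driving the scanner above (editorial, not
part of the diff; the helper name tokenize is hypothetical):

use std::ffi::CString;

// Collect the literal of every token in src until EOF.
fn tokenize(src: &str) -> Vec<String> {
let mut s = Scanner::new(CString::new(src).expect("source must not contain NUL"));
let mut lits = Vec::new();
loop {
let t = s.scan();
if t.tok == TOK_EOF {
break;
}
lits.push(t.lit);
}
lits
}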
use std::vec::Vec;
use crate::scanner::*;
%%{
machine flux;
alphtype u8;
include WChar "unicode.rl";
action advance_line {
// We do this for every newline we find.
// This allows us to return correct line/column for each token
// back to the caller.
*cur_line += 1;
*last_newline = fpc + 1;
}
action advance_line_between_tokens {
// We do this for each newline we find in the whitespace between tokens,
// so we can record the location of the first byte of a token.
last_newline_before_token = *last_newline;
cur_line_token_start = *cur_line;
}
newline = '\n' @advance_line;
any_count_line = any | newline;
identifier = ( ualpha | "_" ) ( ualnum | "_" )*;
decimal_lit = (digit - "0") digit*;
int_lit = "0" | decimal_lit;
float_lit = (digit+ "." digit*) | ("." digit+);
duration_unit = "y" | "mo" | "w" | "d" | "h" | "m" | "s" | "ms" | "us" | "µs" | "ns";
duration_lit = ( int_lit duration_unit )+;
date = digit{4} "-" digit{2} "-" digit{2};
time_offset = "Z" | (("+" | "-") digit{2} ":" digit{2});
time = digit{2} ":" digit{2} ":" digit{2} ( "." digit* )? time_offset?;
date_time_lit = date ( "T" time )?;
escaped_char = "\\" ( "n" | "r" | "t" | "\\" | '"' | "${" );
unicode_value = (any_count_line - [\\$]) | escaped_char;
byte_value = "\\x" xdigit{2};
dollar_value = "$" ( any_count_line - "{" );
string_lit_char = ( unicode_value | byte_value | dollar_value );
string_lit = '"' string_lit_char* "$"? :> '"';
regex_escaped_char = "\\" ( "/" | "\\");
regex_unicode_value = (any_count_line - "/") | regex_escaped_char;
regex_lit = "/" ( regex_unicode_value | byte_value )+ "/";
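# Editorial examples (not in the original) of inputs the definitions above match:
#   duration_lit:  1h30m, 500ms   (one or more int_lit duration_unit pairs)
#   date_time_lit: 2019-08-19 or 2019-08-19T12:00:00Z
#   regex_lit:     /foo\/bar/     (escaped slash via regex_escaped_char)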
# The newline is optional so that a comment at the end of a file is considered valid.
single_line_comment = "//" [^\n]* newline?;
# Whitespace is standard ws and control codes.
# (Note that newlines are handled separately; see notes above)
whitespace = (space - '\n')+;
# The regex literal is not compatible with division so we need two machines.
# One machine contains the full grammar and is the main one, the other is used to scan when we are
# in the middle of an expression and we are potentially expecting a division operator.
main_with_regex := |*
# If we see a regex literal, we accept that and do not go to the other scanner.
regex_lit => { tok = TOK_REGEX; fbreak; };
# We have to specify whitespace here so that leading whitespace doesn't cause a state transition.
whitespace;
newline => advance_line_between_tokens;
# Any other character we transfer to the main state machine that defines the entire language.
any => { fhold; fgoto main; };
*|;
# This machine does not contain the regex literal.
main := |*
single_line_comment => { tok = TOK_COMMENT; fbreak; };
"and" => { tok = TOK_AND; fbreak; };
"or" => { tok = TOK_OR; fbreak; };
"not" => { tok = TOK_NOT; fbreak; };
"empty" => { tok = TOK_EMPTY; fbreak; };
"in" => { tok = TOK_IN; fbreak; };
"import" => { tok = TOK_IMPORT; fbreak; };
"package" => { tok = TOK_PACKAGE; fbreak; };
"return" => { tok = TOK_RETURN; fbreak; };
"option" => { tok = TOK_OPTION; fbreak; };
"builtin" => { tok = TOK_BUILTIN; fbreak; };
"test" => { tok = TOK_TEST; fbreak; };
"if" => { tok = TOK_IF; fbreak; };
"then" => { tok = TOK_THEN; fbreak; };
"else" => { tok = TOK_ELSE; fbreak; };
"exists" => { tok = TOK_EXISTS; fbreak; };
identifier => { tok = TOK_IDENT; fbreak; };
int_lit => { tok = TOK_INT; fbreak; };
float_lit => { tok = TOK_FLOAT; fbreak; };
duration_lit => { tok = TOK_DURATION; fbreak; };
date_time_lit => { tok = TOK_TIME; fbreak; };
string_lit => { tok = TOK_STRING; fbreak; };
"+" => { tok = TOK_ADD; fbreak; };
"-" => { tok = TOK_SUB; fbreak; };
"*" => { tok = TOK_MUL; fbreak; };
"/" => { tok = TOK_DIV; fbreak; };
"%" => { tok = TOK_MOD; fbreak; };
"^" => { tok = TOK_POW; fbreak; };
"==" => { tok = TOK_EQ; fbreak; };
"<" => { tok = TOK_LT; fbreak; };
">" => { tok = TOK_GT; fbreak; };
"<=" => { tok = TOK_LTE; fbreak; };
">=" => { tok = TOK_GTE; fbreak; };
"!=" => { tok = TOK_NEQ; fbreak; };
"=~" => { tok = TOK_REGEXEQ; fbreak; };
"!~" => { tok = TOK_REGEXNEQ; fbreak; };
"=" => { tok = TOK_ASSIGN; fbreak; };
"=>" => { tok = TOK_ARROW; fbreak; };
"<-" => { tok = TOK_PIPE_RECEIVE; fbreak; };
"(" => { tok = TOK_LPAREN; fbreak; };
")" => { tok = TOK_RPAREN; fbreak; };
"[" => { tok = TOK_LBRACK; fbreak; };
"]" => { tok = TOK_RBRACK; fbreak; };
"{" => { tok = TOK_LBRACE; fbreak; };
"}" => { tok = TOK_RBRACE; fbreak; };
":" => { tok = TOK_COLON; fbreak; };
"|>" => { tok = TOK_PIPE_FORWARD; fbreak; };
"," => { tok = TOK_COMMA; fbreak; };
"." => { tok = TOK_DOT; fbreak; };
'"' => { tok = TOK_QUOTE; fbreak; };
'?' => { tok = TOK_QUESTION_MARK; fbreak; };
whitespace;
newline => advance_line_between_tokens;
*|;
# This is the scanner used when parsing a string expression.
string_expr := |*
"${" => { tok = TOK_STRINGEXPR; fbreak; };
'"' => { tok = TOK_QUOTE; fbreak; };
(string_lit_char - "\"")+ => { tok = TOK_TEXT; fbreak; };
*|;
}%%
%% write data nofinal;
pub fn scan(
data: &[u8],
mode: i32,
pp: &mut i32,
_data: i32,
pe: i32,
eof: i32,
last_newline: &mut i32,
cur_line: &mut i32,
token: &mut u32,
token_start: &mut i32,
token_start_line: &mut i32,
token_start_col: &mut i32,
token_end: &mut i32,
token_end_line: &mut i32,
token_end_col: &mut i32 ) -> u32
{
let mut cs = flux_start;
match mode {
0 => { cs = flux_en_main },
1 => { cs = flux_en_main_with_regex },
2 => { cs = flux_en_string_expr },
_ => {},
}
let mut p: i32 = *pp;
let mut act: i32 = 0;
let mut ts: i32 = 0;
let mut te: i32 = 0;
let mut tok: TOK = TOK_ILLEGAL;
let mut last_newline_before_token: i32 = *last_newline;
let mut cur_line_token_start: i32 = *cur_line;
%% write init nocs;
%% write exec;
// Update output args.
*token = tok;
*token_start = ts - _data;
*token_start_line = cur_line_token_start;
*token_start_col = ts - last_newline_before_token + 1;
*token_end = te - _data;
if *last_newline > te {
// te (the token end pointer) will only be less than last_newline
// (pointer to the last newline the scanner saw) if we are trying
// to find a multi-line token (either string or regex literal)
// but don't find the closing `/` or `"`.
// In that case we need to reset last_newline and cur_line.
*cur_line = cur_line_token_start;
*last_newline = last_newline_before_token;
}
*token_end_line = *cur_line;
*token_end_col = te - *last_newline + 1;
*pp = p;
if cs == flux_error {
1
} else {
0
}
}
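A worked example of the line/column bookkeeping above (editorial; the input
and offsets are hypothetical):

// Scanning the token `b` in the input "a\nb" (here _data == 0):
// the '\n' at offset 1 fires advance_line, leaving *last_newline == 2;
// the tokenizer sets ts == 2 and te == 3 (one past the token's last byte), so
//   *token_start_col = ts - last_newline_before_token + 1 = 2 - 2 + 1 = 1
//   *token_end_col   = te - *last_newline + 1             = 3 - 2 + 1 = 2
// i.e. `b` starts at column 1 and ends, exclusively, at column 2.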
This diff is collapsed.
File suppressed by a .gitattributes entry or the file's encoding is unsupported.