//! rlox/frontend/src/lexer/_lexer.rs — scanner for the rlox frontend.

use phf::phf_map;
use super::{CodePos, LexerError, Token, TokenType};
/*====================================================================================================================*/
/// Reserved words of the language, mapped to their token types.
/// Built at compile time via `phf`; consulted by `try_parse_identifier`
/// to distinguish keywords from plain identifiers.
static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
    "and" => TokenType::And,
    "break" => TokenType::Break,
    "class" => TokenType::Class,
    "else" => TokenType::Else,
    "false" => TokenType::False,
    "for" => TokenType::For,
    "fun" => TokenType::Fun,
    "if" => TokenType::If,
    "nil" => TokenType::Nil,
    "or" => TokenType::Or,
    "print" => TokenType::Print,
    "return" => TokenType::Return,
    "super" => TokenType::Super,
    "this" => TokenType::This,
    "true" => TokenType::True,
    "var" => TokenType::Var,
    "while" => TokenType::While
};
/*====================================================================================================================*/
/// Tokenize `source_code`, returning either the full token stream
/// (terminated by an EOF token) or every lexical error encountered.
pub fn scan_tokens(source_code: &str) -> Result<Vec<Token>, Vec<LexerError>> {
    Lexer::new(source_code).scan_tokens()
}
/*====================================================================================================================*/
/// Internal scanner state; created by `scan_tokens` and consumed by
/// `Lexer::scan_tokens`.
#[derive(Debug)]
struct Lexer {
    source: Vec<char>,       // source text as chars, for O(1) indexed lookahead
    tokens: Vec<Token>,      // tokens produced so far
    start: usize,            // index of the first char of the token in progress
    current: usize,          // index of the next char to consume
    code_pos: CodePos,       // line/column bookkeeping, maintained by `advance`
    errors: Vec<LexerError>, // accumulated errors, reported together at the end
}
impl Lexer {
/// Build a scanner over `source_code`, positioned at its first character.
fn new(source_code: &str) -> Self {
    Lexer {
        source: source_code.chars().collect(),
        tokens: Vec::new(),
        start: 0,
        current: 0,
        code_pos: CodePos::default(),
        errors: Vec::new(),
    }
}
/// Drive the scanner over the whole input, then append the terminating
/// EOF token. Returns the token stream on success, or every accumulated
/// error if any were recorded.
///
/// Taking `mut self` directly replaces the original
/// `let mut me = self;` rebinding — same by-value interface for callers,
/// without the extra binding.
fn scan_tokens(mut self) -> Result<Vec<Token>, Vec<LexerError>> {
    while !self.source_is_empty() {
        self.scan_token();
    }
    self.tokens.push(Token::new(TokenType::EOF, self.code_pos));
    if self.errors.is_empty() {
        Ok(self.tokens)
    } else {
        Err(self.errors)
    }
}
/// Scan exactly one token starting at `self.current`, pushing a token or
/// recording an error. Whitespace produces no token (position bookkeeping
/// happens inside `advance`).
fn scan_token(&mut self) {
    use TokenType::*;
    self.start = self.current;
    let c = self.advance();
    match c {
        '(' => self.push_token(LeftParen),
        ')' => self.push_token(RightParen),
        '{' => self.push_token(LeftBrace),
        '}' => self.push_token(RightBrace),
        ',' => self.push_token(Comma),
        '.' => self.push_token(Dot),
        '+' => self.push_token(Plus),
        '-' => self.push_token(Minus),
        ';' => self.push_token(Semicolon),
        '*' => self.push_token(Star),
        '!' => {
            if self.consume('=') {
                self.push_token(BangEqual)
            } else {
                self.push_token(Bang)
            }
        }
        '=' => {
            if self.consume('=') {
                self.push_token(EqualEqual)
            } else {
                self.push_token(Equal)
            }
        }
        '<' => {
            if self.consume('=') {
                self.push_token(LessEqual)
            } else {
                self.push_token(Less)
            }
        }
        '>' => {
            if self.consume('=') {
                self.push_token(GreaterEqual)
            } else {
                self.push_token(Greater)
            }
        }
        '/' => {
            if self.consume('/') {
                // Line comment: stop *before* the newline so it is not
                // captured in the comment text; the '\n' is then consumed
                // as whitespace by the next scan_token call.
                while self.peek().map_or(false, |c| c != '\n') {
                    self.advance();
                }
                let comment: Box<str> =
                    self.source[self.start + 2..self.current].iter().collect();
                self.push_token(TokenType::Comment(comment));
            } else if self.consume('*') {
                // Block comment, with support for nesting.
                let mut depth = 1;
                while depth > 0 {
                    if self.source_is_empty() {
                        self.errors.push(LexerError::UnterminatedBlockComment {
                            code_pos: self.code_pos,
                        });
                        // Bail out: without a closing "*/" the slice below
                        // would be wrong, and `self.current - 2` can
                        // underflow past `self.start + 2` and panic.
                        return;
                    }
                    if self.peek() == Some('/') && self.peek_two() == Some('*') {
                        // Nested block comment opener: consume "/*".
                        self.advance();
                        self.advance();
                        depth += 1;
                    } else if self.peek() == Some('*') && self.peek_two() == Some('/') {
                        // Closer: consume "*/".
                        self.advance();
                        self.advance();
                        depth -= 1;
                    } else {
                        self.advance();
                    }
                }
                // Strip the surrounding "/*" and "*/" delimiters.
                let comment: Box<str> = self.source[self.start + 2..self.current - 2]
                    .iter()
                    .collect();
                self.push_token(TokenType::Comment(comment));
            } else {
                self.push_token(Slash)
            }
        }
        '"' => self.try_parse_string(),
        '0'..='9' => self.try_parse_number(),
        ' ' | '\r' | '\n' | '\t' => {} // handled automatically in advance()
        // NOTE: a match guard applies to *every* alternative of an
        // or-pattern, so the original `c @ '_' | c if c.is_ascii_alphabetic()`
        // rejected identifiers starting with '_' (the guard is false for '_').
        // Test both conditions explicitly instead.
        c if c == '_' || c.is_ascii_alphabetic() => self.try_parse_identifier(),
        _ => {
            self.errors.push(LexerError::UnexpectedCharacter {
                c,
                code_pos: self.code_pos,
            });
        }
    };
}
/// True once every character of the source has been consumed.
fn source_is_empty(&self) -> bool {
    self.source.len() <= self.current
}
/// Consume and return the next character, updating the line/column
/// position: a tab counts as four columns, a newline resets the column
/// and bumps the line, anything else advances one column.
///
/// Panics if the source is already exhausted (caller must check first).
fn advance(&mut self) -> char {
    assert!(!self.source_is_empty());
    let c = self.source[self.current];
    self.current += 1;
    match c {
        '\n' => {
            self.code_pos.col = 0;
            self.code_pos.line += 1;
        }
        '\t' => self.code_pos.col += 4, // tab rendered as four columns
        _ => self.code_pos.col += 1,
    }
    c
}
/// Look at the next character without consuming it; `None` at end of input.
fn peek(&self) -> Option<char> {
    if self.source_is_empty() {
        None
    } else {
        Some(self.source[self.current])
    }
}
/// Look two characters ahead without consuming anything; `None` past the end.
fn peek_two(&self) -> Option<char> {
    let idx = self.current + 1;
    self.source.get(idx).copied()
}
/// Consume the next character only if it equals `c`; report whether it did.
fn consume(&mut self, c: char) -> bool {
    let matched = self.peek() == Some(c);
    if matched {
        self.advance();
    }
    matched
}
/// Append a token of `token_type` stamped with the current source position.
fn push_token(&mut self, token_type: TokenType) {
    self.tokens.push(Token::new(token_type, self.code_pos));
}
/// Scan a string literal; the opening '"' has already been consumed.
/// Supported escapes: \n, \r, \t, \", \\. Unterminated literals are
/// reported at the position where the string started.
fn try_parse_string(&mut self) {
    let mut s = String::new();
    let starting_pos = self.code_pos;
    loop {
        if self.source_is_empty() {
            self.errors.push(LexerError::UnterminatedStringLiteral {
                code_pos: starting_pos,
            });
            return;
        }
        match self.advance() {
            '"' => break,
            '\\' => {
                // Escape sequence: exactly one more character must follow.
                if self.source_is_empty() {
                    self.errors.push(LexerError::UnterminatedStringLiteral {
                        code_pos: starting_pos,
                    });
                    return;
                }
                match self.advance() {
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    // Without \" there is no way at all to embed a double
                    // quote inside a string literal.
                    '"' => s.push('"'),
                    '\\' => s.push('\\'),
                    c => self.errors.push(LexerError::InvalidEscapeSequence {
                        code_pos: self.code_pos,
                        c,
                    }),
                }
            }
            c => s.push(c),
        }
    }
    self.tokens.push(Token::new_string(s, self.code_pos));
}
/// Scan a number literal; the first digit has already been consumed.
/// Accepts an optional fractional part ("123.45") and an optional
/// exponent with sign ("1e5", "2E-3", "1e+5"). All accepted shapes stay
/// within Rust's `f64::from_str` grammar, so the `parse()` below succeeds.
fn try_parse_number(&mut self) {
    let is_some_digit = |c: Option<char>| c.map_or(false, |c| c.is_ascii_digit());
    // Integer part.
    while is_some_digit(self.peek()) {
        self.advance();
    }
    // Fractional part: only if the dot is followed by a digit, so that
    // e.g. "123.abs" still lexes as Number, Dot, Identifier.
    if self.peek() == Some('.') && is_some_digit(self.peek_two()) {
        self.advance(); // consume the '.'
        while is_some_digit(self.peek()) {
            self.advance();
        }
    }
    // Exponent: 'e' or 'E', optionally signed, and only if a digit follows
    // (otherwise the 'e' is left for the identifier that starts there).
    if matches!(self.peek(), Some('e') | Some('E')) {
        let after_sign = self.source.get(self.current + 2).copied();
        if is_some_digit(self.peek_two()) {
            self.advance(); // consume the 'e'/'E'
            while is_some_digit(self.peek()) {
                self.advance();
            }
        } else if matches!(self.peek_two(), Some('+') | Some('-')) && is_some_digit(after_sign) {
            self.advance(); // consume the 'e'/'E'
            self.advance(); // consume the sign
            while is_some_digit(self.peek()) {
                self.advance();
            }
        }
    }
    let lexeme: String = self.source[self.start..self.current].iter().collect();
    let num: f64 = match lexeme.parse() {
        Ok(num) => num,
        Err(err) => {
            self.errors.push(LexerError::InvalidNumberLiteral {
                lexeme,
                msg: err.to_string(),
                code_pos: self.code_pos,
            });
            return;
        }
    };
    self.tokens.push(Token::new_number(num, self.code_pos));
}
/// Scan an identifier or keyword; the first character has already been
/// consumed. Keywords are resolved through the compile-time KEYWORDS table.
fn try_parse_identifier(&mut self) {
    while self
        .peek()
        .map_or(false, |c| c == '_' || c.is_ascii_alphanumeric())
    {
        self.advance();
    }
    let lexeme: String = self.source[self.start..self.current].iter().collect();
    match KEYWORDS.get(&lexeme) {
        Some(token_type) => self.push_token(token_type.clone()),
        None => self
            .tokens
            .push(Token::new_identifier(lexeme, self.code_pos)),
    }
}
}