//! Lexer for rlox: converts Lox source text into a stream of tokens,
//! collecting every lexing error instead of stopping at the first one.
use phf::phf_map;
use crate::error::LexerError;
use crate::misc::CodePos;
use super::{Token, TokenType};
/*====================================================================================================================*/
/// Reserved words of the Lox language mapped to their token types.
/// Built at compile time with `phf`, so keyword lookup during scanning
/// is a perfect-hash probe with no runtime allocation.
static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
    "and" => TokenType::And,
    "break" => TokenType::Break,
    "class" => TokenType::Class,
    "else" => TokenType::Else,
    "false" => TokenType::False,
    "for" => TokenType::For,
    "fun" => TokenType::Fun,
    "if" => TokenType::If,
    "nil" => TokenType::Nil,
    "or" => TokenType::Or,
    "print" => TokenType::Print,
    "return" => TokenType::Return,
    "super" => TokenType::Super,
    "this" => TokenType::This,
    "true" => TokenType::True,
    "var" => TokenType::Var,
    "while" => TokenType::While
};
/*====================================================================================================================*/
/// Tokenizes `source_code`.
///
/// Returns the full token list (terminated by an EOF token) on success,
/// or every [`LexerError`] encountered — scanning does not stop at the
/// first error.
pub fn scan_tokens(source_code: &str) -> Result<Vec<Token>, Vec<LexerError>> {
    Lexer::new(source_code).scan_tokens()
}
/*====================================================================================================================*/
/// Hand-written scanner state: walks the decoded source one `char` at a
/// time, accumulating tokens and recoverable errors as it goes.
#[derive(Debug)]
struct Lexer {
// Entire source decoded up front so lexemes can be sliced by index.
source: Vec<char>,
// Tokens produced so far; an EOF token is appended at the end of the scan.
tokens: Vec<Token>,
// Index of the first char of the token currently being scanned.
start: usize,
// Index of the next char to consume.
current: usize,
// Human-readable line/column position, maintained by `advance()`.
code_pos: CodePos,
// All errors seen so far; scanning continues after an error.
errors: Vec<LexerError>,
}
impl Lexer {
fn new(source_code: &str) -> Self {
let source = source_code.chars().collect();
Lexer {
source,
tokens: Vec::new(),
start: 0,
current: 0,
code_pos: CodePos::default(),
errors: Vec::new(),
}
}
fn scan_tokens(self) -> Result<Vec<Token>, Vec<LexerError>> {
let mut me = self;
while !me.source_is_empty() {
me.scan_token();
}
2023-01-22 23:33:57 +01:00
me.tokens.push(Token::new(TokenType::EOF, me.code_pos));
2023-01-20 16:10:03 +01:00
if me.errors.is_empty() {
Ok(me.tokens)
} else {
Err(me.errors)
}
}
fn scan_token(&mut self) {
use TokenType::*;
self.start = self.current;
let c = self.advance();
let token_type = match c {
'(' => Some(LeftParen),
')' => Some(RightParen),
'{' => Some(LeftBrace),
'}' => Some(RightBrace),
',' => Some(Comma),
'.' => Some(Dot),
'+' => Some(Plus),
'-' => Some(Minus),
';' => Some(Semicolon),
'*' => Some(Star),
'!' => {
if self.consume('=') {
Some(BangEqual)
} else {
Some(Bang)
}
}
'=' => {
if self.consume('=') {
Some(EqualEqual)
} else {
Some(Equal)
}
}
'<' => {
if self.consume('=') {
Some(LessEqual)
} else {
Some(Less)
}
}
'>' => {
if self.consume('=') {
Some(GreaterEqual)
} else {
Some(Greater)
}
}
'/' => {
if self.consume('/') {
// line comment
// advance until either source is empty or newline if found
while !self.source_is_empty() && self.advance() != '\n' {}
None
} else if self.consume('*') {
// block comment
let mut depth = 1;
loop {
if depth == 0 {
break;
}
if self.source_is_empty() {
self.errors.push(LexerError::UnterminatedBlockComment {
code_pos: self.code_pos,
});
break;
}
if self.peek() == Some('/') && self.peek_two() == Some('*') {
// nested block comment
// consume '/' and '*'
self.advance();
self.advance();
depth += 1;
continue;
}
if self.peek() == Some('*') && self.peek_two() == Some('/') {
// consume '*' and '/'
self.advance();
self.advance();
depth -= 1;
continue;
}
self.advance();
}
None
} else {
Some(Slash)
}
}
'"' => self.try_parse_string(),
'0'..='9' => self.try_parse_number(),
' ' | '\r' | '\n' | '\t' => None, // handled automatically in advance()
c @ '_' | c if c.is_ascii_alphabetic() => self.try_parse_identifier(),
_ => {
self.errors.push(LexerError::UnexpectedCharacter {
c,
code_pos: self.code_pos,
});
None
}
};
if let Some(token_type) = token_type {
self.push_token(token_type);
}
}
fn source_is_empty(&self) -> bool {
self.current >= self.source.len()
}
fn advance(&mut self) -> char {
assert!(!self.source_is_empty());
let c = self.source[self.current];
self.current += 1;
self.code_pos.col += 1;
if c == '\t' {
self.code_pos.col += 3;
} else if c == '\n' {
self.code_pos.col = 0;
self.code_pos.line += 1;
}
c
}
fn peek(&self) -> Option<char> {
self.source.get(self.current).copied()
}
fn peek_two(&self) -> Option<char> {
self.source.get(self.current + 1).copied()
}
fn consume(&mut self, c: char) -> bool {
if self.peek() == Some(c) {
self.advance();
true
} else {
false
}
}
fn push_token(&mut self, token_type: TokenType) {
2023-01-25 19:01:13 +01:00
// let lexeme: String = self.source[self.start..self.current].iter().collect();
2023-01-20 16:10:03 +01:00
2023-01-22 23:33:57 +01:00
self.tokens.push(Token::new(token_type, self.code_pos));
2023-01-20 16:10:03 +01:00
}
fn try_parse_string(&mut self) -> Option<TokenType> {
// advance until second "
while self.advance() != '"' {
if self.source_is_empty() {
self.errors.push(LexerError::UnterminatedStringLiteral {
code_pos: self.code_pos,
});
return None;
}
}
let string_literal = self.source[self.start + 1..self.current - 1].iter().collect();
Some(TokenType::String(string_literal))
}
fn try_parse_number(&mut self) -> Option<TokenType> {
let is_some_digit = |c: Option<char>| c.map_or(false, |c| c.is_ascii_digit());
// eat all digits
while is_some_digit(self.peek()) {
self.advance();
}
// consume separator dot and continue eating digits
if self.peek() == Some('.') && is_some_digit(self.peek_two()) {
// consume the '.'
self.advance();
while is_some_digit(self.peek()) {
self.advance();
}
}
// consume exponential e and continue eating digits
if self.peek() == Some('e') && is_some_digit(self.peek_two()) {
// consume the 'e'
self.advance();
while is_some_digit(self.peek()) {
self.advance();
}
}
let lexeme: String = self.source[self.start..self.current].iter().collect();
let num: f64 = match lexeme.parse() {
Ok(num) => num,
Err(err) => {
self.errors.push(LexerError::InvalidNumberLiteral {
lexeme,
msg: format!("{err}"),
code_pos: self.code_pos,
});
return None;
}
};
Some(TokenType::Number(num))
}
fn try_parse_identifier(&mut self) -> Option<TokenType> {
let is_alpha_num_underscore =
|c: Option<char>| c.map_or(false, |c| matches!(c, '0'..='9' | 'A'..='Z' | '_' | 'a'..='z'));
while is_alpha_num_underscore(self.peek()) {
self.advance();
}
let lexeme: String = self.source[self.start..self.current].iter().collect();
let token_type = KEYWORDS.get(&lexeme).cloned().unwrap_or(TokenType::Identifier(lexeme));
Some(token_type)
}
}