mirror of
https://github.com/MorizzG/rlox.git
synced 2025-12-06 04:12:42 +00:00
375 lines
11 KiB
Rust
375 lines
11 KiB
Rust
use phf::phf_map;
|
|
|
|
use super::{CodePos, LexerError, Token, TokenType};
|
|
|
|
/*====================================================================================================================*/
|
|
|
|
/// Reserved words of the language mapped to their token types.
///
/// Looked up after an identifier has been scanned (see
/// `Lexer::try_parse_identifier`); any lexeme not found here becomes a
/// plain identifier token.
static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
    "and" => TokenType::And,
    "break" => TokenType::Break,
    "class" => TokenType::Class,
    "else" => TokenType::Else,
    "false" => TokenType::False,
    "for" => TokenType::For,
    "fun" => TokenType::Fun,
    "if" => TokenType::If,
    "nil" => TokenType::Nil,
    "or" => TokenType::Or,
    "print" => TokenType::Print,
    "return" => TokenType::Return,
    "super" => TokenType::Super,
    "this" => TokenType::This,
    "true" => TokenType::True,
    "var" => TokenType::Var,
    "while" => TokenType::While
};
|
|
|
|
/*====================================================================================================================*/
|
|
|
|
pub fn scan_tokens(source_code: &str) -> Result<Vec<Token>, Vec<LexerError>> {
|
|
let lexer = Lexer::new(source_code);
|
|
|
|
lexer.scan_tokens()
|
|
}
|
|
|
|
/*====================================================================================================================*/
|
|
|
|
/// Internal scanner state; create with `Lexer::new` and drive with
/// `Lexer::scan_tokens`.
#[derive(Debug)]
struct Lexer {
    /// Source text as individual chars, so scanning can use plain indices.
    source: Vec<char>,

    /// Tokens produced so far.
    tokens: Vec<Token>,

    /// Index of the first char of the token currently being scanned.
    start: usize,
    /// Index of the next char to consume.
    current: usize,

    /// Current line/column position, attached to tokens and errors.
    code_pos: CodePos,

    /// Errors collected during the scan; reported together at the end.
    errors: Vec<LexerError>,
}
|
|
|
|
impl Lexer {
|
|
fn new(source_code: &str) -> Self {
|
|
let source = source_code.chars().collect();
|
|
|
|
Lexer {
|
|
source,
|
|
tokens: Vec::new(),
|
|
start: 0,
|
|
current: 0,
|
|
code_pos: CodePos::default(),
|
|
errors: Vec::new(),
|
|
}
|
|
}
|
|
|
|
fn scan_tokens(self) -> Result<Vec<Token>, Vec<LexerError>> {
|
|
let mut me = self;
|
|
|
|
while !me.source_is_empty() {
|
|
me.scan_token();
|
|
}
|
|
|
|
me.tokens.push(Token::new(TokenType::EOF, me.code_pos));
|
|
|
|
if me.errors.is_empty() {
|
|
Ok(me.tokens)
|
|
} else {
|
|
Err(me.errors)
|
|
}
|
|
}
|
|
|
|
fn scan_token(&mut self) {
|
|
use TokenType::*;
|
|
|
|
self.start = self.current;
|
|
|
|
let c = self.advance();
|
|
|
|
match c {
|
|
'(' => self.push_token(LeftParen),
|
|
')' => self.push_token(RightParen),
|
|
'{' => self.push_token(LeftBrace),
|
|
'}' => self.push_token(RightBrace),
|
|
',' => self.push_token(Comma),
|
|
'.' => self.push_token(Dot),
|
|
'+' => self.push_token(Plus),
|
|
'-' => self.push_token(Minus),
|
|
';' => self.push_token(Semicolon),
|
|
'*' => self.push_token(Star),
|
|
'!' => {
|
|
if self.consume('=') {
|
|
self.push_token(BangEqual)
|
|
} else {
|
|
self.push_token(Bang)
|
|
}
|
|
}
|
|
'=' => {
|
|
if self.consume('=') {
|
|
self.push_token(EqualEqual)
|
|
} else {
|
|
self.push_token(Equal)
|
|
}
|
|
}
|
|
'<' => {
|
|
if self.consume('=') {
|
|
self.push_token(LessEqual)
|
|
} else {
|
|
self.push_token(Less)
|
|
}
|
|
}
|
|
'>' => {
|
|
if self.consume('=') {
|
|
self.push_token(GreaterEqual)
|
|
} else {
|
|
self.push_token(Greater)
|
|
}
|
|
}
|
|
'/' => {
|
|
if self.consume('/') {
|
|
// line comment
|
|
// advance until either source is empty or newline if found
|
|
while !self.source_is_empty() && self.advance() != '\n' {}
|
|
|
|
let comment: Box<str> =
|
|
self.source[self.start + 2..self.current].iter().collect();
|
|
|
|
self.push_token(TokenType::Comment(comment));
|
|
} else if self.consume('*') {
|
|
// block comment
|
|
|
|
let mut depth = 1;
|
|
loop {
|
|
if depth == 0 {
|
|
break;
|
|
}
|
|
|
|
if self.source_is_empty() {
|
|
self.errors.push(LexerError::UnterminatedBlockComment {
|
|
code_pos: self.code_pos,
|
|
});
|
|
break;
|
|
}
|
|
|
|
if self.peek() == Some('/') && self.peek_two() == Some('*') {
|
|
// nested block comment
|
|
// consume '/' and '*'
|
|
self.advance();
|
|
self.advance();
|
|
depth += 1;
|
|
continue;
|
|
}
|
|
|
|
if self.peek() == Some('*') && self.peek_two() == Some('/') {
|
|
// consume '*' and '/'
|
|
self.advance();
|
|
self.advance();
|
|
depth -= 1;
|
|
continue;
|
|
}
|
|
|
|
self.advance();
|
|
}
|
|
|
|
let comment: Box<str> = self.source[self.start + 2..self.current - 2]
|
|
.iter()
|
|
.collect();
|
|
|
|
self.push_token(TokenType::Comment(comment));
|
|
} else {
|
|
self.push_token(Slash)
|
|
}
|
|
}
|
|
'"' => self.try_parse_string(),
|
|
'0'..='9' => self.try_parse_number(),
|
|
' ' | '\r' | '\n' | '\t' => {} // handled automatically in advance()
|
|
c @ '_' | c if c.is_ascii_alphabetic() => self.try_parse_identifier(),
|
|
_ => {
|
|
self.errors.push(LexerError::UnexpectedCharacter {
|
|
c,
|
|
code_pos: self.code_pos,
|
|
});
|
|
}
|
|
};
|
|
}
|
|
|
|
fn source_is_empty(&self) -> bool {
|
|
self.current >= self.source.len()
|
|
}
|
|
|
|
fn advance(&mut self) -> char {
|
|
assert!(!self.source_is_empty());
|
|
|
|
let c = self.source[self.current];
|
|
|
|
self.current += 1;
|
|
self.code_pos.col += 1;
|
|
|
|
if c == '\t' {
|
|
self.code_pos.col += 3;
|
|
} else if c == '\n' {
|
|
self.code_pos.col = 0;
|
|
self.code_pos.line += 1;
|
|
}
|
|
|
|
c
|
|
}
|
|
|
|
fn peek(&self) -> Option<char> {
|
|
self.source.get(self.current).copied()
|
|
}
|
|
|
|
fn peek_two(&self) -> Option<char> {
|
|
self.source.get(self.current + 1).copied()
|
|
}
|
|
|
|
fn consume(&mut self, c: char) -> bool {
|
|
if self.peek() == Some(c) {
|
|
self.advance();
|
|
true
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
fn push_token(&mut self, token_type: TokenType) {
|
|
// let lexeme: String = self.source[self.start..self.current].iter().collect();
|
|
|
|
self.tokens.push(Token::new(token_type, self.code_pos));
|
|
}
|
|
|
|
fn try_parse_string(&mut self) {
|
|
// first '"' already consumed
|
|
|
|
// advance until second "
|
|
/* while self.advance() != '"' {
|
|
if self.source_is_empty() {
|
|
self.errors.push(LexerError::UnterminatedStringLiteral {
|
|
code_pos: self.code_pos,
|
|
});
|
|
return;
|
|
}
|
|
} */
|
|
|
|
let mut s = String::new();
|
|
|
|
let starting_pos = self.code_pos;
|
|
|
|
loop {
|
|
if self.source_is_empty() {
|
|
self.errors.push(LexerError::UnterminatedStringLiteral {
|
|
code_pos: starting_pos,
|
|
});
|
|
return;
|
|
}
|
|
|
|
match self.advance() {
|
|
'"' => break,
|
|
'\\' => {
|
|
// escape sequence -> handle later
|
|
if self.source_is_empty() {
|
|
self.errors.push(LexerError::UnterminatedStringLiteral {
|
|
code_pos: starting_pos,
|
|
});
|
|
return;
|
|
}
|
|
|
|
match self.advance() {
|
|
'n' => s.push('\n'),
|
|
'r' => s.push('\r'),
|
|
'\\' => s.push('\\'),
|
|
c => self.errors.push(LexerError::InvalidEscapeSequence {
|
|
code_pos: self.code_pos,
|
|
c,
|
|
}),
|
|
}
|
|
}
|
|
c => {
|
|
s.push(c);
|
|
}
|
|
}
|
|
}
|
|
|
|
// let string_literal: Box<str> = self.source[self.start + 1..self.current - 1]
|
|
// .iter()
|
|
// .collect();
|
|
|
|
// Some(TokenType::String(Box::new(string_literal)))
|
|
self.tokens.push(Token::new_string(s, self.code_pos));
|
|
}
|
|
|
|
fn try_parse_number(&mut self) {
|
|
let is_some_digit = |c: Option<char>| c.map_or(false, |c| c.is_ascii_digit());
|
|
|
|
// eat all digits
|
|
while is_some_digit(self.peek()) {
|
|
self.advance();
|
|
}
|
|
|
|
// consume separator dot and continue eating digits
|
|
if self.peek() == Some('.') && is_some_digit(self.peek_two()) {
|
|
// consume the '.'
|
|
self.advance();
|
|
|
|
while is_some_digit(self.peek()) {
|
|
self.advance();
|
|
}
|
|
}
|
|
|
|
// consume exponential e and continue eating digits
|
|
if self.peek() == Some('e') && is_some_digit(self.peek_two()) {
|
|
// consume the 'e'
|
|
self.advance();
|
|
|
|
while is_some_digit(self.peek()) {
|
|
self.advance();
|
|
}
|
|
}
|
|
|
|
let lexeme: String = self.source[self.start..self.current].iter().collect();
|
|
|
|
let num: f64 = match lexeme.parse() {
|
|
Ok(num) => num,
|
|
Err(err) => {
|
|
self.errors.push(LexerError::InvalidNumberLiteral {
|
|
lexeme,
|
|
msg: err.to_string(),
|
|
code_pos: self.code_pos,
|
|
});
|
|
return;
|
|
}
|
|
};
|
|
|
|
// Some(TokenType::Number(num))
|
|
self.tokens.push(Token::new_number(num, self.code_pos));
|
|
}
|
|
|
|
fn try_parse_identifier(&mut self) {
|
|
let is_alpha_num_underscore = |c: Option<char>| {
|
|
c.map_or(
|
|
false,
|
|
|c| matches!(c, '0'..='9' | 'A'..='Z' | '_' | 'a'..='z'),
|
|
)
|
|
};
|
|
|
|
while is_alpha_num_underscore(self.peek()) {
|
|
self.advance();
|
|
}
|
|
|
|
let lexeme: String = self.source[self.start..self.current].iter().collect();
|
|
|
|
/* let token_type = KEYWORDS
|
|
.get(&lexeme)
|
|
.cloned()
|
|
.unwrap_or(TokenType::Identifier(Box::new(lexeme))); */
|
|
|
|
if let Some(token_type) = KEYWORDS.get(&lexeme) {
|
|
// Token::new(token_type, self.code_pos)
|
|
self.push_token(token_type.clone());
|
|
} else {
|
|
self.tokens
|
|
.push(Token::new_identifier(lexeme, self.code_pos));
|
|
}
|
|
|
|
// Some(token_type)
|
|
}
|
|
}
|