Chapter 7

2025-12-06 12:22:42 +00:00 · 2023-01-20 16:10:03 +01:00 · 2023-01-20 16:10:03 +01:00 · 42dbe531ad
commit 42dbe531ad
15 changed files with 1112 additions and 0 deletions
--- a/src/lexer/lexer.rs
+++ b/src/lexer/lexer.rs
@ -0,0 +1,315 @@
+use phf::phf_map;
+
+use crate::error::LexerError;
+use crate::misc::CodePos;
+
+use super::{Token, TokenType};
+
+/*====================================================================================================================*/
+
+/// Reserved words of the language, mapped to the token type each one produces.
+///
+/// The table is a compile-time perfect hash map built by `phf_map!`. Identifier lexemes are
+/// looked up here; a lexeme that is not a key is an ordinary identifier.
+static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
+    "and" => TokenType::And,
+    "class" => TokenType::Class,
+    "else" => TokenType::Else,
+    // Previously mapped to `TokenType::Else`, which made the literal `false` lex as `else`.
+    "false" => TokenType::False,
+    "for" => TokenType::For,
+    "fun" => TokenType::Fun,
+    "if" => TokenType::If,
+    "nil" => TokenType::Nil,
+    "or" => TokenType::Or,
+    "print" => TokenType::Print,
+    "return" => TokenType::Return,
+    "super" => TokenType::Super,
+    "this" => TokenType::This,
+    "true" => TokenType::True,
+    "var" => TokenType::Var,
+    "while" => TokenType::While
+};
+
+/*====================================================================================================================*/
+
+/// Tokenizes `source_code`.
+///
+/// Builds a fresh [`Lexer`] over the input and runs it to completion.
+///
+/// # Errors
+///
+/// Returns every [`LexerError`] collected during the scan if at least one was reported;
+/// otherwise returns the scanned tokens, terminated by an `EOF` token.
+pub fn scan_tokens(source_code: &str) -> Result<Vec<Token>, Vec<LexerError>> {
+    Lexer::new(source_code).scan_tokens()
+}
+
+/*====================================================================================================================*/
+
+/// Scanner state for turning source text into a list of tokens.
+#[derive(Debug)]
+struct Lexer {
+    /// The source text, split into `char`s so it can be indexed by position.
+    source: Vec<char>,
+
+    /// Tokens produced so far, in source order.
+    tokens: Vec<Token>,
+
+    /// Index into `source` of the first character of the token currently being scanned.
+    start: usize,
+    /// Index into `source` of the next character that has not been consumed yet.
+    current: usize,
+
+    /// Line/column bookkeeping, updated for every consumed character.
+    code_pos: CodePos,
+
+    /// Errors collected while scanning; scanning continues after an error is recorded.
+    errors: Vec<LexerError>,
+}
+
+impl Lexer {
+    /// Creates a lexer positioned at the start of `source_code`, with no tokens or errors.
+    fn new(source_code: &str) -> Self {
+        Lexer {
+            source: source_code.chars().collect(),
+            tokens: Vec::new(),
+            start: 0,
+            current: 0,
+            code_pos: CodePos::default(),
+            errors: Vec::new(),
+        }
+    }
+
+    /// Consumes the lexer and scans the whole source.
+    ///
+    /// An `EOF` token with an empty lexeme is always appended after the last scanned token.
+    ///
+    /// # Errors
+    ///
+    /// If any error was recorded while scanning, all recorded errors are returned and the
+    /// tokens are discarded.
+    fn scan_tokens(mut self) -> Result<Vec<Token>, Vec<LexerError>> {
+        while !self.source_is_empty() {
+            self.scan_token();
+        }
+
+        self.tokens.push(Token::new(TokenType::EOF, String::new(), self.code_pos));
+
+        if self.errors.is_empty() {
+            Ok(self.tokens)
+        } else {
+            Err(self.errors)
+        }
+    }
+
+    /// Scans exactly one lexical element starting at `current`.
+    ///
+    /// Pushes a token for it when it yields one; whitespace, comments and erroneous input
+    /// yield none. Must only be called while the source is not exhausted.
+    fn scan_token(&mut self) {
+        use TokenType::*;
+
+        self.start = self.current;
+
+        let c = self.advance();
+
+        let token_type = match c {
+            '(' => Some(LeftParen),
+            ')' => Some(RightParen),
+            '{' => Some(LeftBrace),
+            '}' => Some(RightBrace),
+            ',' => Some(Comma),
+            '.' => Some(Dot),
+            '+' => Some(Plus),
+            '-' => Some(Minus),
+            ';' => Some(Semicolon),
+            '*' => Some(Star),
+            '!' => Some(if self.consume('=') { BangEqual } else { Bang }),
+            '=' => Some(if self.consume('=') { EqualEqual } else { Equal }),
+            '<' => Some(if self.consume('=') { LessEqual } else { Less }),
+            '>' => Some(if self.consume('=') { GreaterEqual } else { Greater }),
+            '/' => {
+                if self.consume('/') {
+                    self.skip_line_comment();
+                    None
+                } else if self.consume('*') {
+                    self.skip_block_comment();
+                    None
+                } else {
+                    Some(Slash)
+                }
+            }
+            '"' => self.try_parse_string(),
+            '0'..='9' => self.try_parse_number(),
+            // Whitespace produces no token; position tracking already happened in advance().
+            ' ' | '\r' | '\n' | '\t' => None,
+            // Spelled out as plain patterns: the former `c @ '_' | c if c.is_ascii_alphabetic()`
+            // applied its guard to the '_' alternative too, so a leading underscore was rejected.
+            '_' | 'a'..='z' | 'A'..='Z' => self.try_parse_identifier(),
+            _ => {
+                self.errors.push(LexerError::UnexpectedCharacter {
+                    c,
+                    code_pos: self.code_pos,
+                });
+                None
+            }
+        };
+
+        if let Some(token_type) = token_type {
+            self.push_token(token_type);
+        }
+    }
+
+    /// Consumes the rest of a `//` comment, up to and including the newline or end of input.
+    fn skip_line_comment(&mut self) {
+        while !self.source_is_empty() && self.advance() != '\n' {}
+    }
+
+    /// Consumes a `/* ... */` comment whose opening `/*` has already been consumed.
+    ///
+    /// Block comments nest: every inner `/*` needs its own closing `*/`. Reaching the end of
+    /// the input before the outermost comment is closed records an
+    /// `UnterminatedBlockComment` error.
+    fn skip_block_comment(&mut self) {
+        let mut depth = 1usize;
+
+        while depth > 0 {
+            match (self.peek(), self.peek_two()) {
+                (None, _) => {
+                    self.errors.push(LexerError::UnterminatedBlockComment {
+                        code_pos: self.code_pos,
+                    });
+                    return;
+                }
+                (Some('/'), Some('*')) => {
+                    // nested opener: consume '/' and '*'
+                    self.advance();
+                    self.advance();
+                    depth += 1;
+                }
+                (Some('*'), Some('/')) => {
+                    // closer: consume '*' and '/'
+                    self.advance();
+                    self.advance();
+                    depth -= 1;
+                }
+                _ => {
+                    self.advance();
+                }
+            }
+        }
+    }
+
+    /// Returns `true` once every character of the source has been consumed.
+    fn source_is_empty(&self) -> bool {
+        self.current >= self.source.len()
+    }
+
+    /// Consumes and returns the next character, updating the tracked code position.
+    ///
+    /// A newline starts a new line at column 0, a tab counts as four columns and every
+    /// other character as one.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the source is already exhausted.
+    fn advance(&mut self) -> char {
+        assert!(!self.source_is_empty(), "advance() called at end of input");
+
+        let c = self.source[self.current];
+        self.current += 1;
+
+        match c {
+            '\n' => {
+                self.code_pos.line += 1;
+                self.code_pos.col = 0;
+            }
+            '\t' => self.code_pos.col += 4,
+            _ => self.code_pos.col += 1,
+        }
+
+        c
+    }
+
+    /// Returns the next unconsumed character without consuming it, if there is one.
+    fn peek(&self) -> Option<char> {
+        self.source.get(self.current).copied()
+    }
+
+    /// Returns the character after the next unconsumed one without consuming anything.
+    fn peek_two(&self) -> Option<char> {
+        self.source.get(self.current + 1).copied()
+    }
+
+    /// Consumes the next character only if it equals `c`; reports whether it did.
+    fn consume(&mut self, c: char) -> bool {
+        let matched = self.peek() == Some(c);
+        if matched {
+            self.advance();
+        }
+        matched
+    }
+
+    /// Consumes a run of ASCII digits (possibly empty).
+    fn consume_digits(&mut self) {
+        while matches!(self.peek(), Some(d) if d.is_ascii_digit()) {
+            self.advance();
+        }
+    }
+
+    /// Returns the text consumed so far for the current token (`start..current`).
+    fn current_lexeme(&self) -> String {
+        self.source[self.start..self.current].iter().collect()
+    }
+
+    /// Appends a token of `token_type` whose lexeme is the text consumed since `start`.
+    ///
+    /// The token carries the lexer's code position at the time of the call, i.e. the
+    /// position reached after consuming the lexeme.
+    fn push_token(&mut self, token_type: TokenType) {
+        let lexeme = self.current_lexeme();
+
+        self.tokens.push(Token::new(token_type, lexeme, self.code_pos));
+    }
+
+    /// Scans a string literal whose opening `"` has already been consumed.
+    ///
+    /// Returns the `String` token type holding the text between the quotes. If the input
+    /// ends before the closing `"`, an `UnterminatedStringLiteral` error is recorded and
+    /// `None` is returned.
+    fn try_parse_string(&mut self) -> Option<TokenType> {
+        // Check for end of input *before* advancing: advance() panics on exhausted input,
+        // which used to happen when the opening quote was the very last character.
+        loop {
+            if self.source_is_empty() {
+                self.errors.push(LexerError::UnterminatedStringLiteral {
+                    code_pos: self.code_pos,
+                });
+                return None;
+            }
+
+            if self.advance() == '"' {
+                break;
+            }
+        }
+
+        // strip the surrounding quotes
+        let string_literal = self.source[self.start + 1..self.current - 1].iter().collect();
+
+        Some(TokenType::String(string_literal))
+    }
+
+    /// Scans a number literal whose first digit has already been consumed.
+    ///
+    /// Accepted shape: digits, optionally followed by `.` and digits, optionally followed
+    /// by `e` and digits. A `.` or `e` that is not directly followed by a digit is left in
+    /// the input. The lexeme is parsed as `f64`; a parse failure records an
+    /// `InvalidNumberLiteral` error and yields `None`.
+    fn try_parse_number(&mut self) -> Option<TokenType> {
+        // integer part
+        self.consume_digits();
+
+        // fractional part, then exponent: each needs its marker followed by a digit
+        for marker in ['.', 'e'] {
+            let digit_follows = matches!(self.peek_two(), Some(d) if d.is_ascii_digit());
+
+            if self.peek() == Some(marker) && digit_follows {
+                // consume the marker, then its digits
+                self.advance();
+                self.consume_digits();
+            }
+        }
+
+        let lexeme = self.current_lexeme();
+
+        match lexeme.parse::<f64>() {
+            Ok(num) => Some(TokenType::Number(num)),
+            Err(err) => {
+                self.errors.push(LexerError::InvalidNumberLiteral {
+                    lexeme,
+                    msg: err.to_string(),
+                    code_pos: self.code_pos,
+                });
+                None
+            }
+        }
+    }
+
+    /// Scans an identifier or keyword whose first character has already been consumed.
+    ///
+    /// Consumes ASCII letters, digits and underscores, then returns the keyword token type
+    /// if the lexeme is a key of `KEYWORDS`, or `Identifier(lexeme)` otherwise.
+    fn try_parse_identifier(&mut self) -> Option<TokenType> {
+        while matches!(self.peek(), Some(c) if c == '_' || c.is_ascii_alphanumeric()) {
+            self.advance();
+        }
+
+        let lexeme = self.current_lexeme();
+
+        // Look up by `&str` so the key type matches the map's `&'static str` keys.
+        let keyword = KEYWORDS.get(lexeme.as_str()).cloned();
+
+        Some(match keyword {
+            Some(token_type) => token_type,
+            None => TokenType::Identifier(lexeme),
+        })
+    }
+}