//! rlox/frontend/src/lexer/_lexer.rs — scanner for the rlox frontend.

use phf::phf_map;
use super::{CodePos, LexerError, Token, TokenType};
/*====================================================================================================================*/
/// Reserved words of the language, mapped to their token types.
/// Built at compile time via `phf`; consulted by `try_parse_identifier`
/// to distinguish keywords from plain identifiers.
static KEYWORDS: phf::Map<&'static str, TokenType> = phf_map! {
    "and" => TokenType::And,
    "break" => TokenType::Break,
    "class" => TokenType::Class,
    "else" => TokenType::Else,
    "false" => TokenType::False,
    "for" => TokenType::For,
    "fun" => TokenType::Fun,
    "if" => TokenType::If,
    "nil" => TokenType::Nil,
    "or" => TokenType::Or,
    "print" => TokenType::Print,
    "return" => TokenType::Return,
    "super" => TokenType::Super,
    "this" => TokenType::This,
    "true" => TokenType::True,
    "var" => TokenType::Var,
    "while" => TokenType::While
};
/*====================================================================================================================*/
/// Tokenize `source_code`, returning either the full token stream
/// (terminated by an EOF token) or every lexical error encountered.
pub fn scan_tokens(source_code: &str) -> Result<Vec<Token>, Vec<LexerError>> {
    Lexer::new(source_code).scan_tokens()
}
/*====================================================================================================================*/
/// Internal scanner state; created by `scan_tokens` and consumed by
/// `Lexer::scan_tokens`.
#[derive(Debug)]
struct Lexer {
    source: Vec<char>,       // source text as chars, for O(1) indexed lookahead
    tokens: Vec<Token>,      // tokens produced so far
    start: usize,            // index of the first char of the token in progress
    current: usize,          // index of the next char to consume
    code_pos: CodePos,       // line/column bookkeeping, maintained by `advance`
    errors: Vec<LexerError>, // accumulated errors, reported together at the end
}
impl Lexer {
/// Build a scanner over `source_code`, positioned at its first character.
fn new(source_code: &str) -> Self {
    Lexer {
        source: source_code.chars().collect(),
        tokens: Vec::new(),
        start: 0,
        current: 0,
        code_pos: CodePos::default(),
        errors: Vec::new(),
    }
}
/// Drive the scanner over the whole input, then append the terminating
/// EOF token. Returns the token stream on success, or every accumulated
/// error if any were recorded.
///
/// Taking `mut self` directly replaces the original
/// `let mut me = self;` rebinding — same by-value interface for callers,
/// without the extra binding.
fn scan_tokens(mut self) -> Result<Vec<Token>, Vec<LexerError>> {
    while !self.source_is_empty() {
        self.scan_token();
    }
    self.tokens.push(Token::new(TokenType::EOF, self.code_pos));
    if self.errors.is_empty() {
        Ok(self.tokens)
    } else {
        Err(self.errors)
    }
}
/// Scan exactly one token starting at `self.current`, pushing a token or
/// recording an error. Whitespace produces no token (position bookkeeping
/// happens inside `advance`).
fn scan_token(&mut self) {
    use TokenType::*;
    self.start = self.current;
    let c = self.advance();
    match c {
        '(' => self.push_token(LeftParen),
        ')' => self.push_token(RightParen),
        '{' => self.push_token(LeftBrace),
        '}' => self.push_token(RightBrace),
        ',' => self.push_token(Comma),
        '.' => self.push_token(Dot),
        '+' => self.push_token(Plus),
        '-' => self.push_token(Minus),
        ';' => self.push_token(Semicolon),
        '*' => self.push_token(Star),
        '!' => {
            if self.consume('=') {
                self.push_token(BangEqual)
            } else {
                self.push_token(Bang)
            }
        }
        '=' => {
            if self.consume('=') {
                self.push_token(EqualEqual)
            } else {
                self.push_token(Equal)
            }
        }
        '<' => {
            if self.consume('=') {
                self.push_token(LessEqual)
            } else {
                self.push_token(Less)
            }
        }
        '>' => {
            if self.consume('=') {
                self.push_token(GreaterEqual)
            } else {
                self.push_token(Greater)
            }
        }
        '/' => {
            if self.consume('/') {
                // Line comment: stop *before* the newline so it is not
                // captured in the comment text; the '\n' is then consumed
                // as whitespace by the next scan_token call.
                while self.peek().map_or(false, |c| c != '\n') {
                    self.advance();
                }
                let comment: Box<str> =
                    self.source[self.start + 2..self.current].iter().collect();
                self.push_token(TokenType::Comment(comment));
            } else if self.consume('*') {
                // Block comment, with support for nesting.
                let mut depth = 1;
                while depth > 0 {
                    if self.source_is_empty() {
                        self.errors.push(LexerError::UnterminatedBlockComment {
                            code_pos: self.code_pos,
                        });
                        // Bail out: without a closing "*/" the slice below
                        // would be wrong, and `self.current - 2` can
                        // underflow past `self.start + 2` and panic.
                        return;
                    }
                    if self.peek() == Some('/') && self.peek_two() == Some('*') {
                        // Nested block comment opener: consume "/*".
                        self.advance();
                        self.advance();
                        depth += 1;
                    } else if self.peek() == Some('*') && self.peek_two() == Some('/') {
                        // Closer: consume "*/".
                        self.advance();
                        self.advance();
                        depth -= 1;
                    } else {
                        self.advance();
                    }
                }
                // Strip the surrounding "/*" and "*/" delimiters.
                let comment: Box<str> = self.source[self.start + 2..self.current - 2]
                    .iter()
                    .collect();
                self.push_token(TokenType::Comment(comment));
            } else {
                self.push_token(Slash)
            }
        }
        '"' => self.try_parse_string(),
        '0'..='9' => self.try_parse_number(),
        ' ' | '\r' | '\n' | '\t' => {} // handled automatically in advance()
        // NOTE: a match guard applies to *every* alternative of an
        // or-pattern, so the original `c @ '_' | c if c.is_ascii_alphabetic()`
        // rejected identifiers starting with '_' (the guard is false for '_').
        // Test both conditions explicitly instead.
        c if c == '_' || c.is_ascii_alphabetic() => self.try_parse_identifier(),
        _ => {
            self.errors.push(LexerError::UnexpectedCharacter {
                c,
                code_pos: self.code_pos,
            });
        }
    };
}
/// True once every character of the source has been consumed.
fn source_is_empty(&self) -> bool {
    self.source.len() <= self.current
}
/// Consume and return the next character, updating the line/column
/// position: a tab counts as four columns, a newline resets the column
/// and bumps the line, anything else advances one column.
///
/// Panics if the source is already exhausted (caller must check first).
fn advance(&mut self) -> char {
    assert!(!self.source_is_empty());
    let c = self.source[self.current];
    self.current += 1;
    match c {
        '\n' => {
            self.code_pos.col = 0;
            self.code_pos.line += 1;
        }
        '\t' => self.code_pos.col += 4, // tab rendered as four columns
        _ => self.code_pos.col += 1,
    }
    c
}
/// Look at the next character without consuming it; `None` at end of input.
fn peek(&self) -> Option<char> {
    if self.source_is_empty() {
        None
    } else {
        Some(self.source[self.current])
    }
}
/// Look two characters ahead without consuming anything; `None` past the end.
fn peek_two(&self) -> Option<char> {
    let idx = self.current + 1;
    self.source.get(idx).copied()
}
/// Consume the next character only if it equals `c`; report whether it did.
fn consume(&mut self, c: char) -> bool {
    let matched = self.peek() == Some(c);
    if matched {
        self.advance();
    }
    matched
}
/// Append a token of `token_type` stamped with the current source position.
fn push_token(&mut self, token_type: TokenType) {
    self.tokens.push(Token::new(token_type, self.code_pos));
}
/// Scan a string literal; the opening '"' has already been consumed.
/// Supported escapes: \n, \r, \t, \", \\. Unterminated literals are
/// reported at the position where the string started.
fn try_parse_string(&mut self) {
    let mut s = String::new();
    let starting_pos = self.code_pos;
    loop {
        if self.source_is_empty() {
            self.errors.push(LexerError::UnterminatedStringLiteral {
                code_pos: starting_pos,
            });
            return;
        }
        match self.advance() {
            '"' => break,
            '\\' => {
                // Escape sequence: exactly one more character must follow.
                if self.source_is_empty() {
                    self.errors.push(LexerError::UnterminatedStringLiteral {
                        code_pos: starting_pos,
                    });
                    return;
                }
                match self.advance() {
                    'n' => s.push('\n'),
                    'r' => s.push('\r'),
                    't' => s.push('\t'),
                    // Without \" there is no way at all to embed a double
                    // quote inside a string literal.
                    '"' => s.push('"'),
                    '\\' => s.push('\\'),
                    c => self.errors.push(LexerError::InvalidEscapeSequence {
                        code_pos: self.code_pos,
                        c,
                    }),
                }
            }
            c => s.push(c),
        }
    }
    self.tokens.push(Token::new_string(s, self.code_pos));
}
/// Scan a number literal; the first digit has already been consumed.
/// Accepts an optional fractional part ("123.45") and an optional
/// exponent with sign ("1e5", "2E-3", "1e+5"). All accepted shapes stay
/// within Rust's `f64::from_str` grammar, so the `parse()` below succeeds.
fn try_parse_number(&mut self) {
    let is_some_digit = |c: Option<char>| c.map_or(false, |c| c.is_ascii_digit());
    // Integer part.
    while is_some_digit(self.peek()) {
        self.advance();
    }
    // Fractional part: only if the dot is followed by a digit, so that
    // e.g. "123.abs" still lexes as Number, Dot, Identifier.
    if self.peek() == Some('.') && is_some_digit(self.peek_two()) {
        self.advance(); // consume the '.'
        while is_some_digit(self.peek()) {
            self.advance();
        }
    }
    // Exponent: 'e' or 'E', optionally signed, and only if a digit follows
    // (otherwise the 'e' is left for the identifier that starts there).
    if matches!(self.peek(), Some('e') | Some('E')) {
        let after_sign = self.source.get(self.current + 2).copied();
        if is_some_digit(self.peek_two()) {
            self.advance(); // consume the 'e'/'E'
            while is_some_digit(self.peek()) {
                self.advance();
            }
        } else if matches!(self.peek_two(), Some('+') | Some('-')) && is_some_digit(after_sign) {
            self.advance(); // consume the 'e'/'E'
            self.advance(); // consume the sign
            while is_some_digit(self.peek()) {
                self.advance();
            }
        }
    }
    let lexeme: String = self.source[self.start..self.current].iter().collect();
    let num: f64 = match lexeme.parse() {
        Ok(num) => num,
        Err(err) => {
            self.errors.push(LexerError::InvalidNumberLiteral {
                lexeme,
                msg: err.to_string(),
                code_pos: self.code_pos,
            });
            return;
        }
    };
    self.tokens.push(Token::new_number(num, self.code_pos));
}
/// Scan an identifier or keyword; the first character has already been
/// consumed. Keywords are resolved through the compile-time KEYWORDS table.
fn try_parse_identifier(&mut self) {
    while self
        .peek()
        .map_or(false, |c| c == '_' || c.is_ascii_alphanumeric())
    {
        self.advance();
    }
    let lexeme: String = self.source[self.start..self.current].iter().collect();
    match KEYWORDS.get(&lexeme) {
        Some(token_type) => self.push_token(token_type.clone()),
        None => self
            .tokens
            .push(Token::new_identifier(lexeme, self.code_pos)),
    }
}
}