diff --git a/lib/lexer.ml b/lib/lexer.ml index a15fb4a..5f3718f 100644 --- a/lib/lexer.ml +++ b/lib/lexer.ml @@ -23,6 +23,17 @@ open Error [@@deriving show { with_path = false }] [@@@ocamlformat "enable"] +let keywords = + let keywords = Hashtbl.create 16 in + let insert s tt keywords = + Hashtbl.add keywords s tt; + keywords + in + keywords |> insert "and" And |> insert "class" Class |> insert "else" Else |> insert "false" False + |> insert "for" For |> insert "fun" Fun |> insert "if" If |> insert "nil" Nil |> insert "or" Or + |> insert "print" Print |> insert "return" Return |> insert "super" Super |> insert "this" This + |> insert "true" True |> insert "var" Var |> insert "while" While + type token = { token_type : token_type; pos : code_pos } let show_token (token : token) = @@ -84,21 +95,22 @@ module State = struct | Some c when f c -> advance_while f (snd (advance state)) | _ -> state (* EOF or no match *) - let last_char (state : state) = + let last_char (state : state) : char = assert (state.cur_pos > 0); state.source.[state.cur_pos - 1] - let append_token pos state token_type = + let append_token (pos : code_pos) (token_type : token_type) (state : state) : state = (* let pos = { line = state.line; col = state.col } in *) { state with tokens_rev = { token_type; pos } :: state.tokens_rev } - let append_error pos state msg = + let append_error (pos : code_pos) (msg : string) (state : state) : state = (* let pos = { line = state.line; col = state.col } in *) { state with errors_rev = LexerError.make pos msg :: state.errors_rev } - let parse_number (state : state) = + let parse_number (state : state) : state = let skip c state = snd @@ advance_if c state in - let code_pos = { line = state.line; col = state.col } in + (* since parse_number is only called if the first char was a digit we can col - 1 here *) + let code_pos = { line = state.line; col = state.col - 1 } in let state = state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e' |> advance_while is_digit @@ -106,58 +118,74 @@ module State = struct let lexeme = get_lexeme state state.start_pos state.cur_pos in let f = Float.of_string_opt lexeme in match f with - | None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme) - | Some f -> append_token code_pos state (Number f) + | None -> append_error code_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) state + | Some f -> append_token code_pos (Number f) state + + let parse_keyword_or_identifier (state : state) : state = + let code_pos = { line = state.line; col = state.col - 1 } in + let state = advance_while is_identifier state in + let lexeme = get_lexeme state state.start_pos state.cur_pos in + let tt = lexeme |> Hashtbl.find_opt keywords |> Option.value ~default:(Identifier lexeme) in + append_token code_pos tt state let rec tokenize_rec (state : state) : state = let pos = { line = state.line; col = state.col } in let append_token = append_token pos in let append_error = append_error pos in - if is_at_end state then append_token state Eof + if is_at_end state then append_token Eof state else let state = { state with start_pos = state.cur_pos } in let c, state = advance state in let state = + state + |> match c with - | '(' -> append_token state LeftParen - | ')' -> append_token state RightParen - | '{' -> append_token state LeftBrace - | '}' -> append_token state RightBrace - | ',' -> append_token state Comma - | ';' -> append_token state Semicolon - | '.' -> append_token state Dot - | '+' -> append_token state Plus - | '-' -> append_token state Minus - | '*' -> append_token state Star + | '(' -> append_token LeftParen + | ')' -> append_token RightParen + | '{' -> append_token LeftBrace + | '}' -> append_token RightBrace + | ',' -> append_token Comma + | ';' -> append_token Semicolon + | '.' -> append_token Dot + | '+' -> append_token Plus + | '-' -> append_token Minus + | '*' -> append_token Star | '!' -> - let b, state = advance_if '=' state in - append_token state (if b then BangEqual else Bang) + fun state -> + let b, state = advance_if '=' state in + append_token (if b then BangEqual else Bang) state | '=' -> - let b, state = advance_if '=' state in - append_token state (if b then EqualEqual else Equal) + fun state -> + let b, state = advance_if '=' state in + append_token (if b then EqualEqual else Equal) state | '<' -> - let b, state = advance_if '=' state in - append_token state (if b then LessEqual else Less) + fun state -> + let b, state = advance_if '=' state in + append_token (if b then LessEqual else Less) state | '>' -> - let b, state = advance_if '=' state in - append_token state (if b then GreaterEqual else Greater) + fun state -> + let b, state = advance_if '=' state in + append_token (if b then GreaterEqual else Greater) state | '/' -> - let found, state = advance_if '/' state in - if not found then append_token state Slash - else - let start_pos = state.cur_pos in - let _, state = advance_until '\n' state in - let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in - append_token state (Comment lexeme) + fun state -> + let found, state = advance_if '/' state in + if not found then append_token Slash state + else + let start_pos = state.cur_pos in + let _, state = advance_until '\n' state in + let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in + append_token (Comment lexeme) state | '"' -> - let found, state = advance_until '"' state in - if not found then append_error state "Unterminated string literal" - else - let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in - append_token state (String lexeme) - | '0' .. '9' -> parse_number state - | ' ' | '\t' | '\n' -> parse_number state - | c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c) + fun state -> + let found, state = advance_until '"' state in + if not found then append_error "Unterminated string literal" state + else + let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in + append_token (String lexeme) state + | '0' .. '9' -> parse_number + | c when is_alpha c || c = '_' -> parse_keyword_or_identifier + | ' ' | '\t' | '\n' -> fun state -> state + | c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c) in tokenize_rec state end