open Error

[@@@ocamlformat "disable"]

type token_type =
  | LeftParen | RightParen | LeftBrace | RightBrace
  | Plus | Minus | Star | Slash
  | Bang | Dot | Comma | Semicolon
  | Equal | EqualEqual | BangEqual
  | Greater | Less | GreaterEqual | LessEqual
  | Identifier of string
  | String of string
  | Number of float
  | And | Class | Else | False | Fun | For | If | Nil | Or
  | Print | Return | Super | This | True | Var | While
  | Comment of string
  | Eof
[@@deriving show { with_path = false }]

[@@@ocamlformat "enable"]

let keywords =
  let keywords = Hashtbl.create 16 in
  let insert s tt keywords =
    Hashtbl.add keywords s tt;
    keywords
  in
  keywords
  |> insert "and" And
  |> insert "class" Class
  |> insert "else" Else
  |> insert "false" False
  |> insert "for" For
  |> insert "fun" Fun
  |> insert "if" If
  |> insert "nil" Nil
  |> insert "or" Or
  |> insert "print" Print
  |> insert "return" Return
  |> insert "super" Super
  |> insert "this" This
  |> insert "true" True
  |> insert "var" Var
  |> insert "while" While

type token = { token_type : token_type; pos : code_pos }

let show_token (token : token) =
  let { line; col } = token.pos in
  Printf.sprintf "<%s at %d:%d>" (show_token_type token.token_type) line col

type lexer_result = (token list, lexer_error list) result

type state = {
  (* source code *)
  source : string;
  start_pos : int;
  cur_pos : int;
  (* tokens and errors are stored in reverse so that building the lists stays cheap *)
  tokens_rev : token list;
  errors_rev : lexer_error list;
  (* position of the current char in the source *)
  line : int;
  col : int;
}

module State = struct
  let is_digit c = match c with '0' .. '9' -> true | _ -> false
  let is_alpha c = match c with 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
  let is_alphanum c = is_digit c || is_alpha c
  let is_identifier c = is_alphanum c || c = '_'
  let is_at_end (state : state) : bool = state.cur_pos = String.length state.source

  let get_lexeme (state : state) (first : int) (last : int) =
    String.sub state.source first (last - first)

  let next (state : state) : char * state =
    let c = state.source.[state.cur_pos] in
    let state = { state with cur_pos = state.cur_pos + 1 } in
    let state =
      match c with
      | '\t' -> { state with col = state.col + 4 }
      | '\n' -> { state with line = state.line + 1; col = 0 }
      | _ -> { state with col = state.col + 1 }
    in
    (c, state)

  let advance (state : state) : state =
    let _, state = next state in
    state

  let peek (state : state) : char option =
    if not (is_at_end state) then Some state.source.[state.cur_pos] else None

  let advance_if (c : char) (state : state) : bool * state =
    if peek state = Some c then (true, snd (next state)) else (false, state)

  let rec advance_until (c : char) (state : state) : bool * state =
    if is_at_end state then (false, state)
    else
      let c', state = next state in
      if c' = c then (true, state) else advance_until c state

  let rec advance_while (f : char -> bool) (state : state) : state =
    match peek state with
    | Some c when f c -> advance_while f (snd (next state))
    | _ -> state (* EOF or no match *)

  let append_token (pos : code_pos) (token_type : token_type) (state : state) : state =
    { state with tokens_rev = { token_type; pos } :: state.tokens_rev }

  let append_error (pos : code_pos) (msg : string) (state : state) : state =
    { state with errors_rev = LexerError.make pos msg :: state.errors_rev }

  let parse_number (state : state) : state =
    let skip c state = snd @@ advance_if c state in
    (* parse_number is only called after the first digit has been consumed,
       so the number starts one column back *)
    let code_pos = { line = state.line; col = state.col - 1 } in
    let state =
      state
      |> advance_while is_digit
      |> skip '.'
      |> advance_while is_digit
      |> skip 'e'
      |> advance_while is_digit
    in
    let lexeme = get_lexeme state state.start_pos state.cur_pos in
    let f = Float.of_string_opt lexeme in
    match f with
    | None ->
        append_error code_pos
          (Printf.sprintf "Invalid float literal \"%s\"" lexeme)
          state
    | Some f -> append_token code_pos (Number f) state

  let parse_keyword_or_identifier (state : state) : state =
    (* the first char was already consumed; same col - 1 reasoning as parse_number *)
    let code_pos = { line = state.line; col = state.col - 1 } in
    let state = advance_while is_identifier state in
    let lexeme = get_lexeme state state.start_pos state.cur_pos in
    let tt =
      lexeme
      |> Hashtbl.find_opt keywords
      |> Option.value ~default:(Identifier lexeme)
    in
    append_token code_pos tt state

  let parse_block_comment (state : state) : state =
    (* "- 2" since we already consumed the "/*" *)
    let pos = { line = state.line; col = state.col - 2 } in
    (* keep [pos] fixed while scanning so the token/error points at the start
       of the comment rather than wherever the scan happens to be *)
    let rec scan (state : state) : state =
      let found, state = advance_until '*' state in
      if not found then append_error pos "Unterminated block comment" state
      else if peek state = Some '/' then
        let state = advance state in
        (* strip the surrounding "/*" and "*/" from the lexeme *)
        let lexeme = get_lexeme state (state.start_pos + 2) (state.cur_pos - 2) in
        append_token pos (Comment lexeme) state
      else scan state
    in
    scan state

  let rec tokenize_rec (state : state) : state =
    let pos = { line = state.line; col = state.col } in
    let append_token = append_token pos in
    let append_error = append_error pos in
    if is_at_end state then append_token Eof state
    else
      let state = { state with start_pos = state.cur_pos } in
      let c, state = next state in
      let state =
        state
        |>
        match c with
        | '(' -> append_token LeftParen
        | ')' -> append_token RightParen
        | '{' -> append_token LeftBrace
        | '}' -> append_token RightBrace
        | ',' -> append_token Comma
        | ';' -> append_token Semicolon
        | '.' -> append_token Dot
        | '+' -> append_token Plus
        | '-' -> append_token Minus
        | '*' -> append_token Star
        | '!' ->
            fun state ->
              let b, state = advance_if '=' state in
              append_token (if b then BangEqual else Bang) state
        | '=' ->
            fun state ->
              let b, state = advance_if '=' state in
              append_token (if b then EqualEqual else Equal) state
        | '<' ->
            fun state ->
              let b, state = advance_if '=' state in
              append_token (if b then LessEqual else Less) state
        | '>' ->
            fun state ->
              let b, state = advance_if '=' state in
              append_token (if b then GreaterEqual else Greater) state
        | '/' -> (
            fun state ->
              match peek state with
              | Some '/' ->
                  (* line comment: everything up to the end of the line *)
                  let start_pos = state.cur_pos in
                  let _, state = advance_until '\n' state in
                  let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
                  append_token (Comment lexeme) state
              | Some '*' -> parse_block_comment (advance state)
              | _ -> append_token Slash state)
        | '"' ->
            fun state ->
              let found, state = advance_until '"' state in
              if not found then append_error "Unterminated string literal" state
              else
                let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
                append_token (String lexeme) state
        | '0' .. '9' -> parse_number
        | c when is_alpha c || c = '_' -> parse_keyword_or_identifier
        | ' ' | '\t' | '\n' -> fun state -> state
        | c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
      in
      tokenize_rec state
end

let tokenize (source : string) : lexer_result =
  let state =
    State.tokenize_rec
      { source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
  in
  (* the token/error lists were built in reverse, so flip them back *)
  match state.errors_rev with
  | [] -> Ok (List.rev state.tokens_rev)
  | es -> Error (List.rev es)
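
(* Usage sketch (an illustration, not part of the lexer proper): run [tokenize]
   on a small snippet and print the tokens with [show_token]. The error branch
   only reports a count, since this file only shows [LexerError.make] and no
   printer for lexer errors. The leading underscore marks the binding as
   intentionally unused. *)
let _example_usage () =
  match tokenize "var answer = 42; // the answer" with
  | Ok tokens -> List.iter (fun t -> print_endline (show_token t)) tokens
  | Error errors -> Printf.eprintf "%d lexer error(s)\n" (List.length errors)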