mlox/lib/lexer.ml

open Error
[@@@ocamlformat "disable"]
type token_type =
| LeftParen | RightParen | LeftBrace | RightBrace
| Plus | Minus | Star | Slash | Bang
| Dot | Comma | Semicolon | Equal
| EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
| Identifier of string
| String of string
| Number of float
| And | Class | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
| Var | While
| Comment of string
| Eof
[@@deriving show { with_path = false }]
[@@@ocamlformat "enable"]
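(* Reserved words mapped to their token types; identifier lookups fall back to
   [Identifier] when a lexeme is not in this table. *)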
let keywords =
let keywords = Hashtbl.create 16 in
let insert s tt keywords =
Hashtbl.add keywords s tt;
keywords
in
keywords |> insert "and" And |> insert "class" Class |> insert "else" Else |> insert "false" False
|> insert "for" For |> insert "fun" Fun |> insert "if" If |> insert "nil" Nil |> insert "or" Or
|> insert "print" Print |> insert "return" Return |> insert "super" Super |> insert "this" This
|> insert "true" True |> insert "var" Var |> insert "while" While
type token = { token_type : token_type; pos : code_pos }
let show_token (token : token) =
let { line; col } = token.pos in
Printf.sprintf "<%s at %d:%d>" (show_token_type token.token_type) line col
type lexer_result = (token list, lexer_error list) result
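(* Immutable lexer state threaded through every step; each helper returns an
   updated copy rather than mutating in place. *)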
type state = {
(* source code *)
source : string;
start_pos : int;
cur_pos : int;
(* store tokens and errors in reverse to make building the list more efficient *)
tokens_rev : token list;
errors_rev : lexer_error list;
(* line/column of the next character to be read *)
line : int;
col : int;
}
module State = struct
type t = state
let is_digit c = match c with '0' .. '9' -> true | _ -> false
let is_alpha c = match c with 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
let is_alphanum c = is_digit c || is_alpha c
let is_identifier c = is_alphanum c || c = '_'
let is_at_end (state : state) : bool = state.cur_pos = String.length state.source
let get_lexeme (state : state) (first : int) (last : int) =
String.sub state.source first (last - first)
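(* Consume the next character and update the source position: a tab advances
   the column by 4, a newline starts a new line at column 0. *)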
let advance (state : state) : char * state =
let c = state.source.[state.cur_pos] in
let state = { state with cur_pos = state.cur_pos + 1 } in
let state =
match c with
| '\t' -> { state with col = state.col + 4 }
| '\n' -> { state with line = state.line + 1; col = 0 }
| _ -> { state with col = state.col + 1 }
in
(c, state)
let peek (state : state) : char option =
if not (is_at_end state) then Some state.source.[state.cur_pos] else None
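(* Consume the next character only if it equals [c]; the boolean reports
   whether it was consumed. *)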
let advance_if (c : char) (state : state) : bool * state =
if peek state = Some c then (true, snd (advance state)) else (false, state)
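(* Consume characters up to and including the next occurrence of [c];
   returns [false] if the end of input is reached first. *)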
let rec advance_until (c : char) (state : state) : bool * state =
if is_at_end state then (false, state)
else
let c', state = advance state in
if c' = c then (true, state) else advance_until c state
let rec advance_while (f : char -> bool) (state : state) : state =
match peek state with
| Some c when f c -> advance_while f (snd (advance state))
| _ -> state (* EOF or no match *)
let last_char (state : state) : char =
assert (state.cur_pos > 0);
state.source.[state.cur_pos - 1]
let append_token (pos : code_pos) (token_type : token_type) (state : state) : state =
(* let pos = { line = state.line; col = state.col } in *)
{ state with tokens_rev = { token_type; pos } :: state.tokens_rev }
let append_error (pos : code_pos) (msg : string) (state : state) : state =
(* let pos = { line = state.line; col = state.col } in *)
{ state with errors_rev = LexerError.make pos msg :: state.errors_rev }
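(* Lex a number literal: digits, an optional fractional part and an optional
   exponent part. The collected lexeme is validated with [Float.of_string_opt],
   so malformed literals become lexer errors. *)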
let parse_number (state : state) : state =
let skip c state = snd @@ advance_if c state in
(* parse_number is only called after the first digit has been consumed, so col - 1 is the column of that digit *)
let code_pos = { line = state.line; col = state.col - 1 } in
let state =
state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
|> advance_while is_digit
in
let lexeme = get_lexeme state state.start_pos state.cur_pos in
let f = Float.of_string_opt lexeme in
match f with
| None -> append_error code_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) state
| Some f -> append_token code_pos (Number f) state
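(* Lex a keyword or identifier: read identifier characters, then consult the
   [keywords] table to decide which token to emit. *)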
let parse_keyword_or_identifier (state : state) : state =
let code_pos = { line = state.line; col = state.col - 1 } in
let state = advance_while is_identifier state in
let lexeme = get_lexeme state state.start_pos state.cur_pos in
let tt = lexeme |> Hashtbl.find_opt keywords |> Option.value ~default:(Identifier lexeme) in
append_token code_pos tt state
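(* Main lexer loop: dispatch on the next character, append a token (or an
   error) and recurse until the end of the source, where [Eof] is emitted. *)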
let rec tokenize_rec (state : state) : state =
let pos = { line = state.line; col = state.col } in
let append_token = append_token pos in
let append_error = append_error pos in
if is_at_end state then append_token Eof state
else
let state = { state with start_pos = state.cur_pos } in
let c, state = advance state in
let state =
state
|>
match c with
| '(' -> append_token LeftParen
| ')' -> append_token RightParen
| '{' -> append_token LeftBrace
| '}' -> append_token RightBrace
| ',' -> append_token Comma
| ';' -> append_token Semicolon
| '.' -> append_token Dot
| '+' -> append_token Plus
| '-' -> append_token Minus
| '*' -> append_token Star
| '!' ->
fun state ->
let b, state = advance_if '=' state in
append_token (if b then BangEqual else Bang) state
| '=' ->
fun state ->
let b, state = advance_if '=' state in
append_token (if b then EqualEqual else Equal) state
| '<' ->
fun state ->
let b, state = advance_if '=' state in
append_token (if b then LessEqual else Less) state
| '>' ->
fun state ->
let b, state = advance_if '=' state in
append_token (if b then GreaterEqual else Greater) state
| '/' ->
fun state ->
let found, state = advance_if '/' state in
if not found then append_token Slash state
else
let start_pos = state.cur_pos in
let _, state = advance_until '\n' state in
let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
append_token (Comment lexeme) state
| '"' ->
fun state ->
let found, state = advance_until '"' state in
if not found then append_error "Unterminated string literal" state
else
let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
append_token (String lexeme) state
| '0' .. '9' -> parse_number
| c when is_alpha c || c = '_' -> parse_keyword_or_identifier
| ' ' | '\t' | '\n' -> fun state -> state
| c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
in
tokenize_rec state
end
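(* Entry point: run the lexer over [source], starting at line 1, column 0. *)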
let tokenize (source : string) : lexer_result =
let state =
State.tokenize_rec
{ source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
in
(* reverse the reversed tokens/errors *)
if state.errors_rev = [] then Ok (List.rev state.tokens_rev)
else Error (List.rev state.errors_rev)
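
(* Example usage (a minimal sketch of how the result can be consumed;
   exact output depends on [show_token]):

   match tokenize "var answer = 42;" with
   | Ok tokens -> List.iter (fun t -> print_endline (show_token t)) tokens
   | Error errors -> Printf.eprintf "lexing failed with %d error(s)\n" (List.length errors)
*)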