2024-08-02 00:10:48 +02:00
|
|
|
open Error
|
|
|
|
|
|
|
|
|
|
[@@@ocamlformat "disable"]

(* The kinds of lexical tokens produced by the lexer. *)
type token_type =
  (* grouping *)
  | LeftParen | RightParen | LeftBrace | RightBrace
  (* arithmetic and logical-negation operators *)
  | Plus | Minus | Star | Slash | Bang
  (* punctuation and assignment *)
  | Dot | Comma | Semicolon | Equal
  (* comparison operators *)
  | EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
  (* literals carry their lexed payload *)
  | Identifier of string
  | String of string
  | Number of float
  (* keywords *)
  | And | Break | Class | Continue | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
  | Var | While
  (* comments are preserved as tokens rather than discarded *)
  | Comment of string
  (* end of input *)
  | Eof
[@@deriving show { with_path = false }]

[@@@ocamlformat "enable"]
|
|
|
|
|
|
2024-08-02 01:14:35 +02:00
|
|
|
(* Maps keyword spellings to their token types; any identifier not present
   in this table is lexed as an [Identifier]. *)
let keywords =
  let tbl = Hashtbl.create 16 in
  List.iter
    (fun (spelling, tt) -> Hashtbl.add tbl spelling tt)
    [
      ("and", And); ("break", Break); ("class", Class); ("continue", Continue);
      ("else", Else); ("false", False); ("for", For); ("fun", Fun);
      ("if", If); ("nil", Nil); ("or", Or); ("print", Print);
      ("return", Return); ("super", Super); ("this", This); ("true", True);
      ("var", Var); ("while", While);
    ];
  tbl
|
2024-08-02 01:14:35 +02:00
|
|
|
|
2024-08-02 00:10:48 +02:00
|
|
|
(* A lexed token together with its position in the source. *)
type token = { token_type : token_type; pos : code_pos }
|
|
|
|
|
|
|
|
|
|
(* Render a token as ["<TYPE at line:col>"] for debugging output. *)
let show_token (token : token) =
  Printf.sprintf "<%s at %d:%d>"
    (show_token_type token.token_type)
    token.pos.line token.pos.col
|
|
|
|
|
|
|
|
|
|
(* Outcome of tokenizing: every token on success, every lexer error otherwise. *)
type lexer_result = (token list, lexer_error list) result
|
|
|
|
|
|
|
|
|
|
(* Immutable lexer state threaded through every step of tokenization. *)
type state = {
  source : string;      (* the full source code being lexed *)
  start_pos : int;      (* index in [source] where the current token began *)
  cur_pos : int;        (* index in [source] of the next char to consume *)
  (* store tokens and errors in reverse to make building the list more efficient *)
  tokens_rev : token list;
  errors_rev : lexer_error list;
  (* human-readable position of current char in source *)
  line : int;
  col : int;
}
|
|
|
|
|
|
|
|
|
|
module State = struct
  (* Helpers that thread the immutable lexer [state] through tokenization.
     Every helper takes the state as its last argument so they compose
     naturally with [|>]. *)

  (* type t = state *)

  (* Character-class predicates used to dispatch on the next char. *)
  let is_digit = function '0' .. '9' -> true | _ -> false
  let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
  let is_alphanum c = is_digit c || is_alpha c
  let is_identifier c = is_alphanum c || c = '_'

  (* True once every char of the source has been consumed. *)
  let is_at_end (st : state) : bool = st.cur_pos = String.length st.source

  (* Substring of the source covering the half-open range [first, last). *)
  let get_lexeme (st : state) (first : int) (last : int) =
    String.sub st.source first (last - first)

  (* Consume one char and update line/col bookkeeping.
     Precondition: not [is_at_end st] (indexing would raise otherwise). *)
  let next (st : state) : char * state =
    let c = st.source.[st.cur_pos] in
    let st = { st with cur_pos = st.cur_pos + 1 } in
    let st =
      match c with
      | '\t' -> { st with col = st.col + 4 } (* a tab counts as 4 columns *)
      | '\n' -> { st with line = st.line + 1; col = 0 }
      | _ -> { st with col = st.col + 1 }
    in
    (c, st)

  (* [next], discarding the char, for when only the consumption matters. *)
  let advance (st : state) : state = snd (next st)

  (* Look at the next char without consuming it; [None] at end of input. *)
  let peek (st : state) : char option =
    if is_at_end st then None else Some st.source.[st.cur_pos]

  (* Consume the next char only if it equals [c]; report whether it did. *)
  let advance_if (c : char) (st : state) : bool * state =
    match peek st with
    | Some c' when c' = c -> (true, advance st)
    | _ -> (false, st)

  (* Consume chars up to and including the first occurrence of [c].
     Returns [false] when end of input was reached without finding it. *)
  let rec advance_until (c : char) (st : state) : bool * state =
    if is_at_end st then (false, st)
    else
      let c', st = next st in
      if c' = c then (true, st) else advance_until c st

  (* Consume chars for as long as [pred] holds for the next one. *)
  let rec advance_while (pred : char -> bool) (st : state) : state =
    match peek st with
    | Some c when pred c -> advance_while pred (advance st)
    | _ -> st (* EOF or no match *)

  (* Record a token located at [pos]. *)
  let append_token (pos : code_pos) (token_type : token_type) (st : state) : state =
    { st with tokens_rev = { token_type; pos } :: st.tokens_rev }

  (* Record a lexer error located at [pos]. *)
  let append_error (pos : code_pos) (msg : string) (st : state) : state =
    { st with errors_rev = LexerError.make pos msg :: st.errors_rev }

  (* Lex the remainder of a number literal. *)
  let parse_number (st : state) : state =
    let skip c st = snd @@ advance_if c st in
    (* parse_number is only called after the first digit was consumed, so the
       token started one column earlier *)
    let token_pos = { line = st.line; col = st.col - 1 } in
    let st =
      st
      |> advance_while is_digit
      |> skip '.'
      |> advance_while is_digit
      |> skip 'e'
      |> advance_while is_digit
    in
    let lexeme = get_lexeme st st.start_pos st.cur_pos in
    match Float.of_string_opt lexeme with
    | Some value -> append_token token_pos (Number value) st
    | None ->
        append_error token_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) st

  (* Lex the remainder of a keyword or identifier (first char consumed). *)
  let parse_keyword_or_identifier (st : state) : state =
    let token_pos = { line = st.line; col = st.col - 1 } in
    let st = advance_while is_identifier st in
    let lexeme = get_lexeme st st.start_pos st.cur_pos in
    let tt =
      match Hashtbl.find_opt keywords lexeme with
      | Some keyword -> keyword
      | None -> Identifier lexeme
    in
    append_token token_pos tt st

  (* Lex the remainder of a block comment ("/*" already consumed).
     NOTE(review): on each recursive step the reported position is recomputed
     from the current line/col, so for multi-line comments it points at the
     last '*' seen rather than the opening "/*" — confirm this is intended. *)
  let rec parse_block_comment (st : state) : state =
    (* "- 2" since we already consumed the "/*" *)
    let pos = { line = st.line; col = st.col - 2 } in
    let found, st = advance_until '*' st in
    if not found then append_error pos "Unterminated block comment" st
    else if peek st = Some '/' then
      let st = advance st in
      (* strip the two-char delimiters at both ends *)
      let lexeme = get_lexeme st (st.start_pos + 2) (st.cur_pos - 2) in
      append_token pos (Comment lexeme) st
    else parse_block_comment st

  (* Main lexing loop: consume one token per iteration until end of input,
     then append the final [Eof] token. *)
  let rec tokenize_rec (st : state) : state =
    let pos = { line = st.line; col = st.col } in
    let append_token = append_token pos in
    let append_error = append_error pos in
    if is_at_end st then append_token Eof st
    else
      let st = { st with start_pos = st.cur_pos } in
      let c, st = next st in
      (* two-char operators ("!=", "==", "<=", ">=") vs their one-char forms *)
      let with_equal short long st =
        let matched, st = advance_if '=' st in
        append_token (if matched then long else short) st
      in
      let handle : state -> state =
        match c with
        | '(' -> append_token LeftParen
        | ')' -> append_token RightParen
        | '{' -> append_token LeftBrace
        | '}' -> append_token RightBrace
        | ',' -> append_token Comma
        | ';' -> append_token Semicolon
        | '.' -> append_token Dot
        | '+' -> append_token Plus
        | '-' -> append_token Minus
        | '*' -> append_token Star
        | '!' -> with_equal Bang BangEqual
        | '=' -> with_equal Equal EqualEqual
        | '<' -> with_equal Less LessEqual
        | '>' -> with_equal Greater GreaterEqual
        | '/' -> (
            fun st ->
              match peek st with
              | Some '/' ->
                  (* line comment: take everything up to the newline *)
                  let comment_start = st.cur_pos in
                  let _, st = advance_until '\n' st in
                  let lexeme = String.trim @@ get_lexeme st comment_start st.cur_pos in
                  append_token (Comment lexeme) st
              | Some '*' -> parse_block_comment (advance st)
              | _ -> append_token Slash st)
        | '"' ->
            fun st ->
              let terminated, st = advance_until '"' st in
              if not terminated then append_error "Unterminated string literal" st
              else
                (* drop the surrounding quotes *)
                let lexeme = get_lexeme st (st.start_pos + 1) (st.cur_pos - 1) in
                append_token (String lexeme) st
        | '0' .. '9' -> parse_number
        | c when is_alpha c || c = '_' -> parse_keyword_or_identifier
        | ' ' | '\t' | '\n' -> Fun.id (* whitespace produces no token *)
        | c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
      in
      tokenize_rec (handle st)
end
|
|
|
|
|
|
|
|
|
|
(* Tokenize [source], returning every token on success or every lexer error
   encountered otherwise. *)
let tokenize (source : string) : lexer_result =
  let initial =
    { source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
  in
  let final = State.tokenize_rec initial in
  (* tokens and errors were accumulated in reverse; restore source order *)
  match final.errors_rev with
  | [] -> Ok (List.rev final.tokens_rev)
  | errors -> Error (List.rev errors)
|