mlox/lib/lexer.ml

177 lines
6.3 KiB
OCaml
Raw Normal View History

2024-08-02 00:10:48 +02:00
open Error
[@@@ocamlformat "disable"]
type token_type =
| LeftParen | RightParen | LeftBrace | RightBrace
| Plus | Minus | Star | Slash | Bang
| Dot | Comma | Semicolon | Equal
| EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
| Identifier of string
| String of string
| Number of float
| And | Class | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
| Var | While
| Comment of string
| Eof
[@@deriving show { with_path = false }]
[@@@ocamlformat "enable"]
type token = { token_type : token_type; pos : code_pos }
let show_token (token : token) =
let { line; col } = token.pos in
Printf.sprintf "<%s at %d:%d>" (show_token_type token.token_type) line col
type lexer_result = (token list, lexer_error list) result
type state = {
(* source code *)
source : string;
start_pos : int;
cur_pos : int;
(* store tokens and errors in reverse to make building the list more efficient *)
tokens_rev : token list;
errors_rev : lexer_error list;
(* position of current char in source *)
line : int;
col : int;
}
module State = struct
type t = state
let is_digit c = match c with '0' .. '9' -> true | _ -> false
let is_alpha c = match c with 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
let is_alphanum c = is_digit c || is_alpha c
let is_identifier c = is_alphanum c || c = '_'
let is_at_end (state : state) : bool = state.cur_pos = String.length state.source
let get_lexeme (state : state) (first : int) (last : int) =
String.sub state.source first (last - first)
let advance (state : state) : char * state =
let c = state.source.[state.cur_pos] in
let state = { state with cur_pos = state.cur_pos + 1 } in
let state =
match c with
| '\t' -> { state with col = state.col + 4 }
| '\n' -> { state with line = state.line + 1; col = 0 }
| _ -> { state with col = state.col + 1 }
in
(c, state)
let peek (state : state) : char option =
if not (is_at_end state) then Some state.source.[state.cur_pos] else None
let advance_if (c : char) (state : state) : bool * state =
if peek state = Some c then (true, snd (advance state)) else (false, state)
let rec advance_until (c : char) (state : state) : bool * state =
if is_at_end state then (false, state)
else
let c', state = advance state in
if c' = c then (true, state) else advance_until c state
let rec advance_while (f : char -> bool) (state : state) : state =
match peek state with
| Some c when f c -> advance_while f (snd (advance state))
| _ -> state (* EOF or no match *)
let last_char (state : state) =
assert (state.cur_pos > 0);
state.source.[state.cur_pos - 1]
let append_token pos state token_type =
(* let pos = { line = state.line; col = state.col } in *)
{ state with tokens_rev = { token_type; pos } :: state.tokens_rev }
let append_error pos state msg =
(* let pos = { line = state.line; col = state.col } in *)
{ state with errors_rev = LexerError.make pos msg :: state.errors_rev }
let parse_number (state : state) =
let skip c state = snd @@ advance_if c state in
let code_pos = { line = state.line; col = state.col } in
let state =
state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
|> advance_while is_digit
in
let lexeme = get_lexeme state state.start_pos state.cur_pos in
let f = Float.of_string_opt lexeme in
match f with
| None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme)
| Some f -> append_token code_pos state (Number f)
let rec tokenize_rec (state : state) : state =
let pos = { line = state.line; col = state.col } in
let append_token = append_token pos in
let append_error = append_error pos in
if is_at_end state then append_token state Eof
else
let state = { state with start_pos = state.cur_pos } in
let c, state = advance state in
let state =
match c with
| '(' -> append_token state LeftParen
| ')' -> append_token state RightParen
| '{' -> append_token state LeftBrace
| '}' -> append_token state RightBrace
| ',' -> append_token state Comma
| ';' -> append_token state Semicolon
| '.' -> append_token state Dot
| '+' -> append_token state Plus
| '-' -> append_token state Minus
| '*' -> append_token state Star
| '!' ->
let b, state = advance_if '=' state in
append_token state (if b then BangEqual else Bang)
| '=' ->
let b, state = advance_if '=' state in
append_token state (if b then EqualEqual else Equal)
| '<' ->
let b, state = advance_if '=' state in
append_token state (if b then LessEqual else Less)
| '>' ->
let b, state = advance_if '=' state in
append_token state (if b then GreaterEqual else Greater)
| '/' ->
let found, state = advance_if '/' state in
if not found then append_token state Slash
else
let start_pos = state.cur_pos in
let _, state = advance_until '\n' state in
let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
append_token state (Comment lexeme)
| '"' ->
let found, state = advance_until '"' state in
if not found then append_error state "Unterminated string literal"
else
let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
append_token state (String lexeme)
| '0' .. '9' -> parse_number state
| ' ' | '\t' | '\n' -> parse_number state
| c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
in
tokenize_rec state
end
let tokenize (source : string) : lexer_result =
print_endline "Scanning source";
print_endline "---";
print_endline source;
print_endline "---";
(* Ok [] *)
let state =
State.tokenize_rec
{ source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
in
(* reverse the reversed tokens/errors *)
if List.length state.errors_rev = 0 then Ok (List.rev state.tokens_rev)
else Error (List.rev state.errors_rev)