2024-08-02 00:10:48 +02:00
|
|
|
open Error
|
|
|
|
|
|
|
|
|
|
[@@@ocamlformat "disable"]

(* The kinds of lexical tokens produced by the lexer. *)
type token_type =
  (* grouping *)
  | LeftParen | RightParen | LeftBrace | RightBrace
  (* arithmetic and logical-negation operators *)
  | Plus | Minus | Star | Slash | Bang
  (* punctuation and assignment *)
  | Dot | Comma | Semicolon | Equal
  (* comparison operators *)
  | EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
  (* literals carry their lexed payload *)
  | Identifier of string
  | String of string
  | Number of float
  (* keywords *)
  | And | Break | Class | Continue | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
  | Var | While
  (* comments are preserved as tokens rather than discarded *)
  | Comment of string
  (* end of input *)
  | Eof
[@@deriving show { with_path = false }]

[@@@ocamlformat "enable"]
|
|
|
|
|
|
2024-08-02 01:14:35 +02:00
|
|
|
(* Maps keyword spellings to their token types; any identifier not present
   in this table is lexed as an [Identifier]. *)
let keywords =
  let tbl = Hashtbl.create 16 in
  List.iter
    (fun (spelling, tt) -> Hashtbl.add tbl spelling tt)
    [
      ("and", And); ("break", Break); ("class", Class); ("continue", Continue);
      ("else", Else); ("false", False); ("for", For); ("fun", Fun);
      ("if", If); ("nil", Nil); ("or", Or); ("print", Print);
      ("return", Return); ("super", Super); ("this", This); ("true", True);
      ("var", Var); ("while", While);
    ];
  tbl
|
2024-08-02 01:14:35 +02:00
|
|
|
|
2024-08-02 00:10:48 +02:00
|
|
|
(* A lexed token together with its position in the source. *)
type token = { token_type : token_type; pos : code_pos }
|
|
|
|
|
|
|
|
|
|
(* Render a token as ["<TYPE at line:col>"] for debugging output. *)
let show_token (token : token) =
  Printf.sprintf "<%s at %d:%d>"
    (show_token_type token.token_type)
    token.pos.line token.pos.col
|
|
|
|
|
|
|
|
|
|
(* Outcome of tokenizing: every token on success, every lexer error otherwise. *)
type lexer_result = (token list, lexer_error list) result
|
|
|
|
|
|
|
|
|
|
(* Immutable lexer state threaded through every step of tokenization. *)
type state = {
  source : string;      (* the full source code being lexed *)
  start_pos : int;      (* index in [source] where the current token began *)
  cur_pos : int;        (* index in [source] of the next char to consume *)
  (* store tokens and errors in reverse to make building the list more efficient *)
  tokens_rev : token list;
  errors_rev : lexer_error list;
  (* human-readable position of current char in source *)
  line : int;
  col : int;
}
|
|
|
|
|
|
|
|
|
|
module State = struct
  (* Helpers that thread the immutable lexer [state] through tokenization.
     Every helper takes the state as its last argument so they compose
     naturally with [|>]. *)

  (* type t = state *)

  (* Character-class predicates used to dispatch on the next char. *)
  let is_digit = function '0' .. '9' -> true | _ -> false
  let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
  let is_alphanum c = is_digit c || is_alpha c
  let is_identifier c = is_alphanum c || c = '_'

  (* True once every char of the source has been consumed. *)
  let is_at_end (st : state) : bool = st.cur_pos = String.length st.source

  (* Substring of the source covering the half-open range [first, last). *)
  let get_lexeme (st : state) (first : int) (last : int) =
    String.sub st.source first (last - first)

  (* Consume one char and update line/col bookkeeping.
     Precondition: not [is_at_end st] (indexing would raise otherwise). *)
  let next (st : state) : char * state =
    let c = st.source.[st.cur_pos] in
    let st = { st with cur_pos = st.cur_pos + 1 } in
    let st =
      match c with
      | '\t' -> { st with col = st.col + 4 } (* a tab counts as 4 columns *)
      | '\n' -> { st with line = st.line + 1; col = 0 }
      | _ -> { st with col = st.col + 1 }
    in
    (c, st)

  (* [next], discarding the char, for when only the consumption matters. *)
  let advance (st : state) : state = snd (next st)

  (* Look at the next char without consuming it; [None] at end of input. *)
  let peek (st : state) : char option =
    if is_at_end st then None else Some st.source.[st.cur_pos]

  (* Consume the next char only if it equals [c]; report whether it did. *)
  let advance_if (c : char) (st : state) : bool * state =
    match peek st with
    | Some c' when c' = c -> (true, advance st)
    | _ -> (false, st)

  (* Consume chars up to and including the first occurrence of [c].
     Returns [false] when end of input was reached without finding it. *)
  let rec advance_until (c : char) (st : state) : bool * state =
    if is_at_end st then (false, st)
    else
      let c', st = next st in
      if c' = c then (true, st) else advance_until c st

  (* Consume chars for as long as [pred] holds for the next one. *)
  let rec advance_while (pred : char -> bool) (st : state) : state =
    match peek st with
    | Some c when pred c -> advance_while pred (advance st)
    | _ -> st (* EOF or no match *)

  (* Record a token located at [pos]. *)
  let append_token (pos : code_pos) (token_type : token_type) (st : state) : state =
    { st with tokens_rev = { token_type; pos } :: st.tokens_rev }

  (* Record a lexer error located at [pos]. *)
  let append_error (pos : code_pos) (msg : string) (st : state) : state =
    { st with errors_rev = LexerError.make pos msg :: st.errors_rev }

  (* Lex the remainder of a number literal. *)
  let parse_number (st : state) : state =
    let skip c st = snd @@ advance_if c st in
    (* parse_number is only called after the first digit was consumed, so the
       token started one column earlier *)
    let token_pos = { line = st.line; col = st.col - 1 } in
    let st =
      st
      |> advance_while is_digit
      |> skip '.'
      |> advance_while is_digit
      |> skip 'e'
      |> advance_while is_digit
    in
    let lexeme = get_lexeme st st.start_pos st.cur_pos in
    match Float.of_string_opt lexeme with
    | Some value -> append_token token_pos (Number value) st
    | None ->
        append_error token_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) st

  (* Lex the remainder of a keyword or identifier (first char consumed). *)
  let parse_keyword_or_identifier (st : state) : state =
    let token_pos = { line = st.line; col = st.col - 1 } in
    let st = advance_while is_identifier st in
    let lexeme = get_lexeme st st.start_pos st.cur_pos in
    let tt =
      match Hashtbl.find_opt keywords lexeme with
      | Some keyword -> keyword
      | None -> Identifier lexeme
    in
    append_token token_pos tt st

  (* Lex the remainder of a block comment ("/*" already consumed).
     NOTE(review): on each recursive step the reported position is recomputed
     from the current line/col, so for multi-line comments it points at the
     last '*' seen rather than the opening "/*" — confirm this is intended. *)
  let rec parse_block_comment (st : state) : state =
    (* "- 2" since we already consumed the "/*" *)
    let pos = { line = st.line; col = st.col - 2 } in
    let found, st = advance_until '*' st in
    if not found then append_error pos "Unterminated block comment" st
    else if peek st = Some '/' then
      let st = advance st in
      (* strip the two-char delimiters at both ends *)
      let lexeme = get_lexeme st (st.start_pos + 2) (st.cur_pos - 2) in
      append_token pos (Comment lexeme) st
    else parse_block_comment st

  (* Main lexing loop: consume one token per iteration until end of input,
     then append the final [Eof] token. *)
  let rec tokenize_rec (st : state) : state =
    let pos = { line = st.line; col = st.col } in
    let append_token = append_token pos in
    let append_error = append_error pos in
    if is_at_end st then append_token Eof st
    else
      let st = { st with start_pos = st.cur_pos } in
      let c, st = next st in
      (* two-char operators ("!=", "==", "<=", ">=") vs their one-char forms *)
      let with_equal short long st =
        let matched, st = advance_if '=' st in
        append_token (if matched then long else short) st
      in
      let handle : state -> state =
        match c with
        | '(' -> append_token LeftParen
        | ')' -> append_token RightParen
        | '{' -> append_token LeftBrace
        | '}' -> append_token RightBrace
        | ',' -> append_token Comma
        | ';' -> append_token Semicolon
        | '.' -> append_token Dot
        | '+' -> append_token Plus
        | '-' -> append_token Minus
        | '*' -> append_token Star
        | '!' -> with_equal Bang BangEqual
        | '=' -> with_equal Equal EqualEqual
        | '<' -> with_equal Less LessEqual
        | '>' -> with_equal Greater GreaterEqual
        | '/' -> (
            fun st ->
              match peek st with
              | Some '/' ->
                  (* line comment: take everything up to the newline *)
                  let comment_start = st.cur_pos in
                  let _, st = advance_until '\n' st in
                  let lexeme = String.trim @@ get_lexeme st comment_start st.cur_pos in
                  append_token (Comment lexeme) st
              | Some '*' -> parse_block_comment (advance st)
              | _ -> append_token Slash st)
        | '"' ->
            fun st ->
              let terminated, st = advance_until '"' st in
              if not terminated then append_error "Unterminated string literal" st
              else
                (* drop the surrounding quotes *)
                let lexeme = get_lexeme st (st.start_pos + 1) (st.cur_pos - 1) in
                append_token (String lexeme) st
        | '0' .. '9' -> parse_number
        | c when is_alpha c || c = '_' -> parse_keyword_or_identifier
        | ' ' | '\t' | '\n' -> Fun.id (* whitespace produces no token *)
        | c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
      in
      tokenize_rec (handle st)
end
|
|
|
|
|
|
|
|
|
|
(* Tokenize [source], returning every token on success or every lexer error
   encountered otherwise. *)
let tokenize (source : string) : lexer_result =
  let initial =
    { source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
  in
  let final = State.tokenize_rec initial in
  (* tokens and errors were accumulated in reverse; restore source order *)
  match final.errors_rev with
  | [] -> Ok (List.rev final.tokens_rev)
  | errors -> Error (List.rev errors)
|