mlox/lib/lexer.ml

open Error

[@@@ocamlformat "disable"]
  type token_type =
  | LeftParen | RightParen | LeftBrace | RightBrace

  | Plus | Minus | Star | Slash | Bang

  | Dot | Comma | Semicolon | Equal

  | EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
  
  | Identifier of string
  | String of string
  | Number of float

  | And | Class | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
  | Var | While

  | Comment of string
  
  | Eof
[@@deriving show { with_path = false }]
[@@@ocamlformat "enable"]

type token = { token_type : token_type; pos : code_pos }

let show_token (token : token) =
  let { line; col } = token.pos in
  Printf.sprintf "<%s at %d:%d>" (show_token_type token.token_type) line col

type lexer_result = (token list, lexer_error list) result

type state = {
  (* source code *)
  source : string;
  start_pos : int;
  cur_pos : int;
  (* store tokens and errors in reverse to make building the list more efficient *)
  tokens_rev : token list;
  errors_rev : lexer_error list;
  (* position of current char in source *)
  line : int;
  col : int;
}

module State = struct
  type t = state

  let is_digit c = match c with '0' .. '9' -> true | _ -> false
  let is_alpha c = match c with 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
  let is_alphanum c = is_digit c || is_alpha c
  let is_identifier c = is_alphanum c || c = '_'
  let is_at_end (state : state) : bool = state.cur_pos = String.length state.source

  let get_lexeme (state : state) (first : int) (last : int) =
    String.sub state.source first (last - first)

  let advance (state : state) : char * state =
    let c = state.source.[state.cur_pos] in
    let state = { state with cur_pos = state.cur_pos + 1 } in
    let state =
      match c with
      | '\t' -> { state with col = state.col + 4 }
      | '\n' -> { state with line = state.line + 1; col = 0 }
      | _ -> { state with col = state.col + 1 }
    in
    (c, state)

  let peek (state : state) : char option =
    if not (is_at_end state) then Some state.source.[state.cur_pos] else None

  let advance_if (c : char) (state : state) : bool * state =
    if peek state = Some c then (true, snd (advance state)) else (false, state)

  let rec advance_until (c : char) (state : state) : bool * state =
    if is_at_end state then (false, state)
    else
      let c', state = advance state in
      if c' = c then (true, state) else advance_until c state

  let rec advance_while (f : char -> bool) (state : state) : state =
    match peek state with
    | Some c when f c -> advance_while f (snd (advance state))
    | _ -> state (* EOF or no match *)

  let last_char (state : state) =
    assert (state.cur_pos > 0);
    state.source.[state.cur_pos - 1]

  let append_token pos state token_type =
    (* let pos = { line = state.line; col = state.col } in *)
    { state with tokens_rev = { token_type; pos } :: state.tokens_rev }

  let append_error pos state msg =
    (* let pos = { line = state.line; col = state.col } in *)
    { state with errors_rev = LexerError.make pos msg :: state.errors_rev }

  let parse_number (state : state) =
    let skip c state = snd @@ advance_if c state in
    let code_pos = { line = state.line; col = state.col } in
    let state =
      state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
      |> advance_while is_digit
    in
    let lexeme = get_lexeme state state.start_pos state.cur_pos in
    let f = Float.of_string_opt lexeme in
    match f with
    | None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme)
    | Some f -> append_token code_pos state (Number f)

  let rec tokenize_rec (state : state) : state =
    let pos = { line = state.line; col = state.col } in
    let append_token = append_token pos in
    let append_error = append_error pos in
    if is_at_end state then append_token state Eof
    else
      let state = { state with start_pos = state.cur_pos } in
      let c, state = advance state in
      let state =
        match c with
        | '(' -> append_token state LeftParen
        | ')' -> append_token state RightParen
        | '{' -> append_token state LeftBrace
        | '}' -> append_token state RightBrace
        | ',' -> append_token state Comma
        | ';' -> append_token state Semicolon
        | '.' -> append_token state Dot
        | '+' -> append_token state Plus
        | '-' -> append_token state Minus
        | '*' -> append_token state Star
        | '!' ->
            let b, state = advance_if '=' state in
            append_token state (if b then BangEqual else Bang)
        | '=' ->
            let b, state = advance_if '=' state in
            append_token state (if b then EqualEqual else Equal)
        | '<' ->
            let b, state = advance_if '=' state in
            append_token state (if b then LessEqual else Less)
        | '>' ->
            let b, state = advance_if '=' state in
            append_token state (if b then GreaterEqual else Greater)
        | '/' ->
            let found, state = advance_if '/' state in
            if not found then append_token state Slash
            else
              let start_pos = state.cur_pos in
              let _, state = advance_until '\n' state in
              let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
              append_token state (Comment lexeme)
        | '"' ->
            let found, state = advance_until '"' state in
            if not found then append_error state "Unterminated string literal"
            else
              let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
              append_token state (String lexeme)
        | '0' .. '9' -> parse_number state
        | ' ' | '\t' | '\n' -> parse_number state
        | c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
      in
      tokenize_rec state
end

let tokenize (source : string) : lexer_result =
  print_endline "Scanning source";
  print_endline "---";
  print_endline source;
  print_endline "---";
  (* Ok [] *)
  let state =
    State.tokenize_rec
      { source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
  in
  (* reverse the reversed tokens/errors *)
  if List.length state.errors_rev = 0 then Ok (List.rev state.tokens_rev)
  else Error (List.rev state.errors_rev)
init commit 2024-08-02 00:10:48 +02:00			`open Error`

			`[@@@ocamlformat "disable"]`
			`type token_type =`
			`\| LeftParen \| RightParen \| LeftBrace \| RightBrace`

			`\| Plus \| Minus \| Star \| Slash \| Bang`

			`\| Dot \| Comma \| Semicolon \| Equal`

			`\| EqualEqual \| BangEqual \| Greater \| Less \| GreaterEqual \| LessEqual`

			`\| Identifier of string`
			`\| String of string`
			`\| Number of float`

			`\| And \| Class \| Else \| False \| Fun \| For \| If \| Nil \| Or \| Print \| Return \| Super \| This \| True`
			`\| Var \| While`

			`\| Comment of string`

			`\| Eof`
			`[@@deriving show { with_path = false }]`
			`[@@@ocamlformat "enable"]`

			`type token = { token_type : token_type; pos : code_pos }`

			`let show_token (token : token) =`
			`let { line; col } = token.pos in`
			`Printf.sprintf "<%s at %d:%d>" (show_token_type token.token_type) line col`

			`type lexer_result = (token list, lexer_error list) result`

			`type state = {`
			`(* source code *)`
			`source : string;`
			`start_pos : int;`
			`cur_pos : int;`
			`(* store tokens and errors in reverse to make building the list more efficient *)`
			`tokens_rev : token list;`
			`errors_rev : lexer_error list;`
			`(* position of current char in source *)`
			`line : int;`
			`col : int;`
			`}`

			`module State = struct`
			`type t = state`

			`let is_digit c = match c with '0' .. '9' -> true \| _ -> false`
			`let is_alpha c = match c with 'a' .. 'z' \| 'A' .. 'Z' -> true \| _ -> false`
			`let is_alphanum c = is_digit c \|\| is_alpha c`
			`let is_identifier c = is_alphanum c \|\| c = '_'`
			`let is_at_end (state : state) : bool = state.cur_pos = String.length state.source`

			`let get_lexeme (state : state) (first : int) (last : int) =`
			`String.sub state.source first (last - first)`

			`let advance (state : state) : char * state =`
			`let c = state.source.[state.cur_pos] in`
			`let state = { state with cur_pos = state.cur_pos + 1 } in`
			`let state =`
			`match c with`
			`\| '\t' -> { state with col = state.col + 4 }`
			`\| '\n' -> { state with line = state.line + 1; col = 0 }`
			`\| _ -> { state with col = state.col + 1 }`
			`in`
			`(c, state)`

			`let peek (state : state) : char option =`
			`if not (is_at_end state) then Some state.source.[state.cur_pos] else None`

			`let advance_if (c : char) (state : state) : bool * state =`
			`if peek state = Some c then (true, snd (advance state)) else (false, state)`

			`let rec advance_until (c : char) (state : state) : bool * state =`
			`if is_at_end state then (false, state)`
			`else`
			`let c', state = advance state in`
			`if c' = c then (true, state) else advance_until c state`

			`let rec advance_while (f : char -> bool) (state : state) : state =`
			`match peek state with`
			`\| Some c when f c -> advance_while f (snd (advance state))`
			`\| _ -> state (* EOF or no match *)`

			`let last_char (state : state) =`
			`assert (state.cur_pos > 0);`
			`state.source.[state.cur_pos - 1]`

			`let append_token pos state token_type =`
			`(* let pos = { line = state.line; col = state.col } in *)`
			`{ state with tokens_rev = { token_type; pos } :: state.tokens_rev }`

			`let append_error pos state msg =`
			`(* let pos = { line = state.line; col = state.col } in *)`
			`{ state with errors_rev = LexerError.make pos msg :: state.errors_rev }`

			`let parse_number (state : state) =`
			`let skip c state = snd @@ advance_if c state in`
			`let code_pos = { line = state.line; col = state.col } in`
			`let state =`
			`state \|> advance_while is_digit \|> skip '.' \|> advance_while is_digit \|> skip 'e'`
			`\|> advance_while is_digit`
			`in`
			`let lexeme = get_lexeme state state.start_pos state.cur_pos in`
			`let f = Float.of_string_opt lexeme in`
			`match f with`
			`\| None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme)`
			`\| Some f -> append_token code_pos state (Number f)`

			`let rec tokenize_rec (state : state) : state =`
			`let pos = { line = state.line; col = state.col } in`
			`let append_token = append_token pos in`
			`let append_error = append_error pos in`
			`if is_at_end state then append_token state Eof`
			`else`
			`let state = { state with start_pos = state.cur_pos } in`
			`let c, state = advance state in`
			`let state =`
			`match c with`
			`\| '(' -> append_token state LeftParen`
			`\| ')' -> append_token state RightParen`
			`\| '{' -> append_token state LeftBrace`
			`\| '}' -> append_token state RightBrace`
			`\| ',' -> append_token state Comma`
			`\| ';' -> append_token state Semicolon`
			`\| '.' -> append_token state Dot`
			`\| '+' -> append_token state Plus`
			`\| '-' -> append_token state Minus`
			`\| '*' -> append_token state Star`
			`\| '!' ->`
			`let b, state = advance_if '=' state in`
			`append_token state (if b then BangEqual else Bang)`
			`\| '=' ->`
			`let b, state = advance_if '=' state in`
			`append_token state (if b then EqualEqual else Equal)`
			`\| '<' ->`
			`let b, state = advance_if '=' state in`
			`append_token state (if b then LessEqual else Less)`
			`\| '>' ->`
			`let b, state = advance_if '=' state in`
			`append_token state (if b then GreaterEqual else Greater)`
			`\| '/' ->`
			`let found, state = advance_if '/' state in`
			`if not found then append_token state Slash`
			`else`
			`let start_pos = state.cur_pos in`
			`let _, state = advance_until '\n' state in`
			`let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in`
			`append_token state (Comment lexeme)`
			`\| '"' ->`
			`let found, state = advance_until '"' state in`
			`if not found then append_error state "Unterminated string literal"`
			`else`
			`let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in`
			`append_token state (String lexeme)`
			`\| '0' .. '9' -> parse_number state`
			`\| ' ' \| '\t' \| '\n' -> parse_number state`
			`\| c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)`
			`in`
			`tokenize_rec state`
			`end`

			`let tokenize (source : string) : lexer_result =`
			`print_endline "Scanning source";`
			`print_endline "---";`
			`print_endline source;`
			`print_endline "---";`
			`(* Ok [] *)`
			`let state =`
			`State.tokenize_rec`
			`{ source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }`
			`in`
			`(* reverse the reversed tokens/errors *)`
			`if List.length state.errors_rev = 0 then Ok (List.rev state.tokens_rev)`
			`else Error (List.rev state.errors_rev)`