added identifiers and keywords

This commit is contained in:
Moritz Gmeiner 2024-08-02 01:14:35 +02:00
commit 821f5c62bc

View file

@ -23,6 +23,17 @@ open Error
[@@deriving show { with_path = false }] [@@deriving show { with_path = false }]
[@@@ocamlformat "enable"] [@@@ocamlformat "enable"]
(* Table of reserved words: maps each keyword's spelling to the token type
   it lexes to. The table is filled once, when this binding is evaluated. *)
let keywords =
  let reserved =
    [
      ("and", And);
      ("class", Class);
      ("else", Else);
      ("false", False);
      ("for", For);
      ("fun", Fun);
      ("if", If);
      ("nil", Nil);
      ("or", Or);
      ("print", Print);
      ("return", Return);
      ("super", Super);
      ("this", This);
      ("true", True);
      ("var", Var);
      ("while", While);
    ]
  in
  (* 16 initial buckets: one per reserved word listed above. *)
  let table = Hashtbl.create 16 in
  List.iter (fun (word, token_type) -> Hashtbl.add table word token_type) reserved;
  table
type token = { token_type : token_type; pos : code_pos } type token = { token_type : token_type; pos : code_pos }
let show_token (token : token) = let show_token (token : token) =
@ -84,21 +95,22 @@ module State = struct
| Some c when f c -> advance_while f (snd (advance state)) | Some c when f c -> advance_while f (snd (advance state))
| _ -> state (* EOF or no match *) | _ -> state (* EOF or no match *)
let last_char (state : state) = let last_char (state : state) : char =
assert (state.cur_pos > 0); assert (state.cur_pos > 0);
state.source.[state.cur_pos - 1] state.source.[state.cur_pos - 1]
let append_token pos state token_type = let append_token (pos : code_pos) (token_type : token_type) (state : state) : state =
(* let pos = { line = state.line; col = state.col } in *) (* let pos = { line = state.line; col = state.col } in *)
{ state with tokens_rev = { token_type; pos } :: state.tokens_rev } { state with tokens_rev = { token_type; pos } :: state.tokens_rev }
let append_error pos state msg = let append_error (pos : code_pos) (msg : string) (state : state) : state =
(* let pos = { line = state.line; col = state.col } in *) (* let pos = { line = state.line; col = state.col } in *)
{ state with errors_rev = LexerError.make pos msg :: state.errors_rev } { state with errors_rev = LexerError.make pos msg :: state.errors_rev }
let parse_number (state : state) = let parse_number (state : state) : state =
let skip c state = snd @@ advance_if c state in let skip c state = snd @@ advance_if c state in
let code_pos = { line = state.line; col = state.col } in (* since parse_number is only called if the first char was a digit we can col - 1 here *)
let code_pos = { line = state.line; col = state.col - 1 } in
let state = let state =
state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e' state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
|> advance_while is_digit |> advance_while is_digit
@ -106,58 +118,74 @@ module State = struct
let lexeme = get_lexeme state state.start_pos state.cur_pos in let lexeme = get_lexeme state state.start_pos state.cur_pos in
let f = Float.of_string_opt lexeme in let f = Float.of_string_opt lexeme in
match f with match f with
| None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme) | None -> append_error code_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) state
| Some f -> append_token code_pos state (Number f) | Some f -> append_token code_pos (Number f) state
(* Lexes an identifier or a reserved word and appends the resulting token.

   The lexeme runs from [state.start_pos] to the position reached after
   consuming every following character accepted by [is_identifier]. If the
   lexeme is a key of [keywords] the associated token type is used, otherwise
   the token is [Identifier lexeme].

   The token position uses [col - 1], the same convention as [parse_number]:
   it assumes the caller has already consumed the first character of the
   lexeme before calling this function. *)
let parse_keyword_or_identifier (state : state) : state =
  let code_pos = { line = state.line; col = state.col - 1 } in
  let state = advance_while is_identifier state in
  let lexeme = get_lexeme state state.start_pos state.cur_pos in
  let token_type =
    match Hashtbl.find_opt keywords lexeme with
    | Some keyword -> keyword
    | None -> Identifier lexeme
  in
  append_token code_pos token_type state
let rec tokenize_rec (state : state) : state = let rec tokenize_rec (state : state) : state =
let pos = { line = state.line; col = state.col } in let pos = { line = state.line; col = state.col } in
let append_token = append_token pos in let append_token = append_token pos in
let append_error = append_error pos in let append_error = append_error pos in
if is_at_end state then append_token state Eof if is_at_end state then append_token Eof state
else else
let state = { state with start_pos = state.cur_pos } in let state = { state with start_pos = state.cur_pos } in
let c, state = advance state in let c, state = advance state in
let state = let state =
state
|>
match c with match c with
| '(' -> append_token state LeftParen | '(' -> append_token LeftParen
| ')' -> append_token state RightParen | ')' -> append_token RightParen
| '{' -> append_token state LeftBrace | '{' -> append_token LeftBrace
| '}' -> append_token state RightBrace | '}' -> append_token RightBrace
| ',' -> append_token state Comma | ',' -> append_token Comma
| ';' -> append_token state Semicolon | ';' -> append_token Semicolon
| '.' -> append_token state Dot | '.' -> append_token Dot
| '+' -> append_token state Plus | '+' -> append_token Plus
| '-' -> append_token state Minus | '-' -> append_token Minus
| '*' -> append_token state Star | '*' -> append_token Star
| '!' -> | '!' ->
let b, state = advance_if '=' state in fun state ->
append_token state (if b then BangEqual else Bang) let b, state = advance_if '=' state in
append_token (if b then BangEqual else Bang) state
| '=' -> | '=' ->
let b, state = advance_if '=' state in fun state ->
append_token state (if b then EqualEqual else Equal) let b, state = advance_if '=' state in
append_token (if b then EqualEqual else Equal) state
| '<' -> | '<' ->
let b, state = advance_if '=' state in fun state ->
append_token state (if b then LessEqual else Less) let b, state = advance_if '=' state in
append_token (if b then LessEqual else Less) state
| '>' -> | '>' ->
let b, state = advance_if '=' state in fun state ->
append_token state (if b then GreaterEqual else Greater) let b, state = advance_if '=' state in
append_token (if b then GreaterEqual else Greater) state
| '/' -> | '/' ->
let found, state = advance_if '/' state in fun state ->
if not found then append_token state Slash let found, state = advance_if '/' state in
else if not found then append_token Slash state
let start_pos = state.cur_pos in else
let _, state = advance_until '\n' state in let start_pos = state.cur_pos in
let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in let _, state = advance_until '\n' state in
append_token state (Comment lexeme) let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
append_token (Comment lexeme) state
| '"' -> | '"' ->
let found, state = advance_until '"' state in fun state ->
if not found then append_error state "Unterminated string literal" let found, state = advance_until '"' state in
else if not found then append_error "Unterminated string literal" state
let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in else
append_token state (String lexeme) let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
| '0' .. '9' -> parse_number state append_token (String lexeme) state
| ' ' | '\t' | '\n' -> parse_number state | '0' .. '9' -> parse_number
| c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c) | c when is_alpha c || c = '_' -> parse_keyword_or_identifier
| ' ' | '\t' | '\n' -> fun state -> state
| c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
in in
tokenize_rec state tokenize_rec state
end end