mirror of
https://github.com/MorizzG/MLox.git
synced 2025-12-06 04:22:41 +00:00
added identifiers and keywords
This commit is contained in:
parent
d33023f435
commit
821f5c62bc
1 changed file with 69 additions and 41 deletions
110
lib/lexer.ml
110
lib/lexer.ml
|
|
@ -23,6 +23,17 @@ open Error
|
||||||
[@@deriving show { with_path = false }]
|
[@@deriving show { with_path = false }]
|
||||||
[@@@ocamlformat "enable"]
|
[@@@ocamlformat "enable"]
|
||||||
|
|
||||||
|
let keywords =
|
||||||
|
let keywords = Hashtbl.create 16 in
|
||||||
|
let insert s tt keywords =
|
||||||
|
Hashtbl.add keywords s tt;
|
||||||
|
keywords
|
||||||
|
in
|
||||||
|
keywords |> insert "and" And |> insert "class" Class |> insert "else" Else |> insert "false" False
|
||||||
|
|> insert "for" For |> insert "fun" Fun |> insert "if" If |> insert "nil" Nil |> insert "or" Or
|
||||||
|
|> insert "print" Print |> insert "return" Return |> insert "super" Super |> insert "this" This
|
||||||
|
|> insert "true" True |> insert "var" Var |> insert "while" While
|
||||||
|
|
||||||
type token = { token_type : token_type; pos : code_pos }
|
type token = { token_type : token_type; pos : code_pos }
|
||||||
|
|
||||||
let show_token (token : token) =
|
let show_token (token : token) =
|
||||||
|
|
@ -84,21 +95,22 @@ module State = struct
|
||||||
| Some c when f c -> advance_while f (snd (advance state))
|
| Some c when f c -> advance_while f (snd (advance state))
|
||||||
| _ -> state (* EOF or no match *)
|
| _ -> state (* EOF or no match *)
|
||||||
|
|
||||||
let last_char (state : state) =
|
let last_char (state : state) : char =
|
||||||
assert (state.cur_pos > 0);
|
assert (state.cur_pos > 0);
|
||||||
state.source.[state.cur_pos - 1]
|
state.source.[state.cur_pos - 1]
|
||||||
|
|
||||||
let append_token pos state token_type =
|
let append_token (pos : code_pos) (token_type : token_type) (state : state) : state =
|
||||||
(* let pos = { line = state.line; col = state.col } in *)
|
(* let pos = { line = state.line; col = state.col } in *)
|
||||||
{ state with tokens_rev = { token_type; pos } :: state.tokens_rev }
|
{ state with tokens_rev = { token_type; pos } :: state.tokens_rev }
|
||||||
|
|
||||||
let append_error pos state msg =
|
let append_error (pos : code_pos) (msg : string) (state : state) : state =
|
||||||
(* let pos = { line = state.line; col = state.col } in *)
|
(* let pos = { line = state.line; col = state.col } in *)
|
||||||
{ state with errors_rev = LexerError.make pos msg :: state.errors_rev }
|
{ state with errors_rev = LexerError.make pos msg :: state.errors_rev }
|
||||||
|
|
||||||
let parse_number (state : state) =
|
let parse_number (state : state) : state =
|
||||||
let skip c state = snd @@ advance_if c state in
|
let skip c state = snd @@ advance_if c state in
|
||||||
let code_pos = { line = state.line; col = state.col } in
|
(* parse_number is only called after the first digit has already been consumed, so the token starts at col - 1 *)
|
||||||
|
let code_pos = { line = state.line; col = state.col - 1 } in
|
||||||
let state =
|
let state =
|
||||||
state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
|
state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
|
||||||
|> advance_while is_digit
|
|> advance_while is_digit
|
||||||
|
|
@ -106,58 +118,74 @@ module State = struct
|
||||||
let lexeme = get_lexeme state state.start_pos state.cur_pos in
|
let lexeme = get_lexeme state state.start_pos state.cur_pos in
|
||||||
let f = Float.of_string_opt lexeme in
|
let f = Float.of_string_opt lexeme in
|
||||||
match f with
|
match f with
|
||||||
| None -> append_error code_pos state (Printf.sprintf "Invalid float literal %s" lexeme)
|
| None -> append_error code_pos (Printf.sprintf "Invalid float literal \"%s\"" lexeme) state
|
||||||
| Some f -> append_token code_pos state (Number f)
|
| Some f -> append_token code_pos (Number f) state
|
||||||
|
|
||||||
|
let parse_keyword_or_identifier (state : state) : state =
|
||||||
|
let code_pos = { line = state.line; col = state.col - 1 } in
|
||||||
|
let state = advance_while is_identifier state in
|
||||||
|
let lexeme = get_lexeme state state.start_pos state.cur_pos in
|
||||||
|
let tt = lexeme |> Hashtbl.find_opt keywords |> Option.value ~default:(Identifier lexeme) in
|
||||||
|
append_token code_pos tt state
|
||||||
|
|
||||||
let rec tokenize_rec (state : state) : state =
|
let rec tokenize_rec (state : state) : state =
|
||||||
let pos = { line = state.line; col = state.col } in
|
let pos = { line = state.line; col = state.col } in
|
||||||
let append_token = append_token pos in
|
let append_token = append_token pos in
|
||||||
let append_error = append_error pos in
|
let append_error = append_error pos in
|
||||||
if is_at_end state then append_token state Eof
|
if is_at_end state then append_token Eof state
|
||||||
else
|
else
|
||||||
let state = { state with start_pos = state.cur_pos } in
|
let state = { state with start_pos = state.cur_pos } in
|
||||||
let c, state = advance state in
|
let c, state = advance state in
|
||||||
let state =
|
let state =
|
||||||
|
state
|
||||||
|
|>
|
||||||
match c with
|
match c with
|
||||||
| '(' -> append_token state LeftParen
|
| '(' -> append_token LeftParen
|
||||||
| ')' -> append_token state RightParen
|
| ')' -> append_token RightParen
|
||||||
| '{' -> append_token state LeftBrace
|
| '{' -> append_token LeftBrace
|
||||||
| '}' -> append_token state RightBrace
|
| '}' -> append_token RightBrace
|
||||||
| ',' -> append_token state Comma
|
| ',' -> append_token Comma
|
||||||
| ';' -> append_token state Semicolon
|
| ';' -> append_token Semicolon
|
||||||
| '.' -> append_token state Dot
|
| '.' -> append_token Dot
|
||||||
| '+' -> append_token state Plus
|
| '+' -> append_token Plus
|
||||||
| '-' -> append_token state Minus
|
| '-' -> append_token Minus
|
||||||
| '*' -> append_token state Star
|
| '*' -> append_token Star
|
||||||
| '!' ->
|
| '!' ->
|
||||||
let b, state = advance_if '=' state in
|
fun state ->
|
||||||
append_token state (if b then BangEqual else Bang)
|
let b, state = advance_if '=' state in
|
||||||
|
append_token (if b then BangEqual else Bang) state
|
||||||
| '=' ->
|
| '=' ->
|
||||||
let b, state = advance_if '=' state in
|
fun state ->
|
||||||
append_token state (if b then EqualEqual else Equal)
|
let b, state = advance_if '=' state in
|
||||||
|
append_token (if b then EqualEqual else Equal) state
|
||||||
| '<' ->
|
| '<' ->
|
||||||
let b, state = advance_if '=' state in
|
fun state ->
|
||||||
append_token state (if b then LessEqual else Less)
|
let b, state = advance_if '=' state in
|
||||||
|
append_token (if b then LessEqual else Less) state
|
||||||
| '>' ->
|
| '>' ->
|
||||||
let b, state = advance_if '=' state in
|
fun state ->
|
||||||
append_token state (if b then GreaterEqual else Greater)
|
let b, state = advance_if '=' state in
|
||||||
|
append_token (if b then GreaterEqual else Greater) state
|
||||||
| '/' ->
|
| '/' ->
|
||||||
let found, state = advance_if '/' state in
|
fun state ->
|
||||||
if not found then append_token state Slash
|
let found, state = advance_if '/' state in
|
||||||
else
|
if not found then append_token Slash state
|
||||||
let start_pos = state.cur_pos in
|
else
|
||||||
let _, state = advance_until '\n' state in
|
let start_pos = state.cur_pos in
|
||||||
let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
|
let _, state = advance_until '\n' state in
|
||||||
append_token state (Comment lexeme)
|
let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
|
||||||
|
append_token (Comment lexeme) state
|
||||||
| '"' ->
|
| '"' ->
|
||||||
let found, state = advance_until '"' state in
|
fun state ->
|
||||||
if not found then append_error state "Unterminated string literal"
|
let found, state = advance_until '"' state in
|
||||||
else
|
if not found then append_error "Unterminated string literal" state
|
||||||
let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
|
else
|
||||||
append_token state (String lexeme)
|
let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
|
||||||
| '0' .. '9' -> parse_number state
|
append_token (String lexeme) state
|
||||||
| ' ' | '\t' | '\n' -> parse_number state
|
| '0' .. '9' -> parse_number
|
||||||
| c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
|
| c when is_alpha c || c = '_' -> parse_keyword_or_identifier
|
||||||
|
| ' ' | '\t' | '\n' -> fun state -> state
|
||||||
|
| c -> append_error (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
|
||||||
in
|
in
|
||||||
tokenize_rec state
|
tokenize_rec state
|
||||||
end
|
end
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue