mirror of
https://github.com/MorizzG/MLox.git
synced 2025-12-06 04:22:41 +00:00
init commit
This commit is contained in:
commit
d33023f435
13 changed files with 325 additions and 0 deletions
177
lib/lexer.ml
Normal file
177
lib/lexer.ml
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
open Error
|
||||
|
||||
[@@@ocamlformat "disable"]

(* The kinds of tokens the lexer can produce.
   [show_token_type] is generated by [@@deriving show]. *)
type token_type =
  (* single-character grouping tokens *)
  | LeftParen | RightParen | LeftBrace | RightBrace
  (* arithmetic and unary operators *)
  | Plus | Minus | Star | Slash | Bang
  (* punctuation and assignment *)
  | Dot | Comma | Semicolon | Equal
  (* one- and two-character comparison operators *)
  | EqualEqual | BangEqual | Greater | Less | GreaterEqual | LessEqual
  (* literals, carrying their parsed payload *)
  | Identifier of string
  | String of string
  | Number of float
  (* keywords *)
  | And | Class | Else | False | Fun | For | If | Nil | Or | Print | Return | Super | This | True
  | Var | While
  (* a line comment, carrying its (trimmed) text *)
  | Comment of string
  (* end of input *)
  | Eof
[@@deriving show { with_path = false }]

[@@@ocamlformat "enable"]
|
||||
|
||||
(* A lexed token paired with its source position ([code_pos] comes from [Error]). *)
type token = { token_type : token_type; pos : code_pos }
|
||||
|
||||
(* Render a token for debugging, e.g. "<Plus at 1:3>". *)
let show_token (token : token) =
  let { token_type; pos = { line; col } } = token in
  Printf.sprintf "<%s at %d:%d>" (show_token_type token_type) line col
|
||||
|
||||
(* Overall outcome of lexing: every token on success, otherwise all collected errors. *)
type lexer_result = (token list, lexer_error list) result
|
||||
|
||||
(* Immutable lexer state, threaded through the tokenizer. *)
type state = {
  (* source code being lexed *)
  source : string;
  (* byte offset where the current lexeme starts *)
  start_pos : int;
  (* byte offset of the next unread character (equals length of [source] at EOF) *)
  cur_pos : int;
  (* store tokens and errors in reverse to make building the list more efficient *)
  tokens_rev : token list;
  errors_rev : lexer_error list;
  (* position of current char in source (line is 1-based; col starts at 0) *)
  line : int;
  col : int;
}
|
||||
|
||||
module State = struct
  type t = state

  (* Character classes. *)
  let is_digit c = match c with '0' .. '9' -> true | _ -> false
  let is_alpha c = match c with 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
  let is_alphanum c = is_digit c || is_alpha c
  let is_identifier c = is_alphanum c || c = '_'

  (* True when every character of [source] has been consumed. *)
  let is_at_end (state : state) : bool = state.cur_pos = String.length state.source

  (* Substring of the source from byte offset [first] (inclusive) to [last] (exclusive). *)
  let get_lexeme (state : state) (first : int) (last : int) =
    String.sub state.source first (last - first)

  (* Consume one character, updating [cur_pos] and the line/col tracking.
     Precondition: not [is_at_end state]. *)
  let advance (state : state) : char * state =
    let c = state.source.[state.cur_pos] in
    let state = { state with cur_pos = state.cur_pos + 1 } in
    let state =
      match c with
      | '\t' -> { state with col = state.col + 4 } (* a tab counts as 4 columns *)
      | '\n' -> { state with line = state.line + 1; col = 0 }
      | _ -> { state with col = state.col + 1 }
    in
    (c, state)

  (* Next unread character without consuming it; [None] at end of input. *)
  let peek (state : state) : char option =
    if not (is_at_end state) then Some state.source.[state.cur_pos] else None

  (* Consume the next character iff it equals [c]; the bool reports whether it did. *)
  let advance_if (c : char) (state : state) : bool * state =
    if peek state = Some c then (true, snd (advance state)) else (false, state)

  (* Consume characters up to and including the next [c]. The bool is [false]
     when end of input was reached without finding [c]. *)
  let rec advance_until (c : char) (state : state) : bool * state =
    if is_at_end state then (false, state)
    else
      let c', state = advance state in
      if c' = c then (true, state) else advance_until c state

  (* Consume characters as long as [f] holds on the next one. *)
  let rec advance_while (f : char -> bool) (state : state) : state =
    match peek state with
    | Some c when f c -> advance_while f (snd (advance state))
    | _ -> state (* EOF or no match *)

  (* Most recently consumed character. Precondition: at least one [advance] happened. *)
  let last_char (state : state) =
    assert (state.cur_pos > 0);
    state.source.[state.cur_pos - 1]

  (* Record a token located at [pos]. *)
  let append_token pos state token_type =
    { state with tokens_rev = { token_type; pos } :: state.tokens_rev }

  (* Record a lexer error located at [pos]. *)
  let append_error pos state msg =
    { state with errors_rev = LexerError.make pos msg :: state.errors_rev }

  (* Lex the rest of a number literal whose first digit was already consumed.
     [pos] is the position of that first digit, so Number tokens carry the same
     token-start position as every other token (previously the position was
     captured here, one character late). Accepts digits, an optional '.' part
     and an optional 'e' part, then validates the whole lexeme at once. *)
  let parse_number pos (state : state) =
    let skip c state = snd @@ advance_if c state in
    let state =
      state |> advance_while is_digit |> skip '.' |> advance_while is_digit |> skip 'e'
      |> advance_while is_digit
    in
    let lexeme = get_lexeme state state.start_pos state.cur_pos in
    match Float.of_string_opt lexeme with
    | None -> append_error pos state (Printf.sprintf "Invalid float literal %s" lexeme)
    | Some f -> append_token pos state (Number f)

  (* Lex the rest of an identifier whose first character was already consumed,
     mapping reserved words to their keyword tokens. *)
  let parse_identifier pos (state : state) =
    let state = advance_while is_identifier state in
    let lexeme = get_lexeme state state.start_pos state.cur_pos in
    let token_type =
      match lexeme with
      | "and" -> And
      | "class" -> Class
      | "else" -> Else
      | "false" -> False
      | "fun" -> Fun
      | "for" -> For
      | "if" -> If
      | "nil" -> Nil
      | "or" -> Or
      | "print" -> Print
      | "return" -> Return
      | "super" -> Super
      | "this" -> This
      | "true" -> True
      | "var" -> Var
      | "while" -> While
      | id -> Identifier id
    in
    append_token pos state token_type

  (* Main tokenizer loop: emits one token (or error) per iteration until EOF,
     then appends the final [Eof] token. Tail-recursive. *)
  let rec tokenize_rec (state : state) : state =
    (* position of the token about to be lexed *)
    let pos = { line = state.line; col = state.col } in
    let append_token = append_token pos in
    let append_error = append_error pos in
    if is_at_end state then append_token state Eof
    else
      let state = { state with start_pos = state.cur_pos } in
      let c, state = advance state in
      let state =
        match c with
        | '(' -> append_token state LeftParen
        | ')' -> append_token state RightParen
        | '{' -> append_token state LeftBrace
        | '}' -> append_token state RightBrace
        | ',' -> append_token state Comma
        | ';' -> append_token state Semicolon
        | '.' -> append_token state Dot
        | '+' -> append_token state Plus
        | '-' -> append_token state Minus
        | '*' -> append_token state Star
        | '!' ->
            let b, state = advance_if '=' state in
            append_token state (if b then BangEqual else Bang)
        | '=' ->
            let b, state = advance_if '=' state in
            append_token state (if b then EqualEqual else Equal)
        | '<' ->
            let b, state = advance_if '=' state in
            append_token state (if b then LessEqual else Less)
        | '>' ->
            let b, state = advance_if '=' state in
            append_token state (if b then GreaterEqual else Greater)
        | '/' ->
            let found, state = advance_if '/' state in
            if not found then append_token state Slash
            else
              (* line comment: keep its trimmed text as a token *)
              let start_pos = state.cur_pos in
              let _, state = advance_until '\n' state in
              let lexeme = String.trim @@ get_lexeme state start_pos state.cur_pos in
              append_token state (Comment lexeme)
        | '"' ->
            let found, state = advance_until '"' state in
            if not found then append_error state "Unterminated string literal"
            else
              (* strip the surrounding quotes *)
              let lexeme = get_lexeme state (state.start_pos + 1) (state.cur_pos - 1) in
              append_token state (String lexeme)
        | '0' .. '9' -> parse_number pos state
        | 'a' .. 'z' | 'A' .. 'Z' | '_' -> parse_identifier pos state
        (* whitespace produces no token; [advance] already updated line/col.
           (Previously this fell into [parse_number], turning every space into
           an "Invalid float literal" error.) *)
        | ' ' | '\t' | '\n' -> state
        | c -> append_error state (String.escaped @@ Printf.sprintf "Unexpected character '%c'" c)
      in
      tokenize_rec state
end
|
||||
|
||||
(* Tokenize [source]. Returns [Ok tokens] (ending in [Eof]) when no lexer
   error occurred, otherwise [Error errors] with every collected error.
   Also echoes the source to stdout for debugging. *)
let tokenize (source : string) : lexer_result =
  print_endline "Scanning source";
  print_endline "---";
  print_endline source;
  print_endline "---";
  let state =
    State.tokenize_rec
      { source; start_pos = 0; cur_pos = 0; tokens_rev = []; errors_rev = []; line = 1; col = 0 }
  in
  (* tokens/errors were accumulated in reverse; restore source order.
     Pattern match instead of [List.length ... = 0]: emptiness is O(1). *)
  match state.errors_rev with
  | [] -> Ok (List.rev state.tokens_rev)
  | errors -> Error (List.rev errors)
||||
Loading…
Add table
Add a link
Reference in a new issue