finishing up parser

main
Carsten Kragelund 2023-06-17 03:13:03 +02:00
parent bbbd19eac7
commit de30d605f6
7 changed files with 258 additions and 38 deletions

@ -1,3 +1,3 @@
(library
(public_name hcfg)
(libraries fmt))
(libraries base fmt))

@ -1,2 +1,3 @@
module Token = Token
module Lexer = Lexer
module Lexer = Lexer
module Parser = Parser

@ -1,6 +1,3 @@
let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
let is_digit = function '0' .. '9' -> true | _ -> false
module Lexer = struct
include Token
@ -22,7 +19,23 @@ module Lexer = struct
(* [peek_char lexer] returns the character at [read_position] without
   consuming it, or the null byte once the end of input is reached.
   The stale pre-change body (indexing at [read_position + 1]) that was
   left interleaved by the diff has been dropped.
   NOTE(review): the guard treats index (length - 1) as end-of-input even
   though it is a valid index — confirm against [advance]'s invariants. *)
let peek_char lexer =
  let read_to_end = lexer.read_position >= String.length(lexer.input) - 1 in
  if read_to_end then null_byte else String.get lexer.input (lexer.read_position)
(* [advance_to ch lexer] advances until [peek_char] sees [ch], then consumes
   one more character; used to skip to the end of a // line comment.
   Uses structural [<>] instead of physical [!=] — on chars they happen to
   coincide, but [<>] is the intended operator. *)
let rec advance_to ch lexer =
  if peek_char lexer <> ch then advance lexer |> advance_to ch
  else advance lexer
(* [advance_to2 ch nch lexer] advances to the two-character sequence
   [ch][nch] (e.g. "*/" for block comments); returns the lexer positioned so
   [peek_char] sees [nch]. The debug [print_endline] calls that polluted
   stdout on every comment have been removed. *)
let rec advance_to2 ch nch lexer =
  let lex = advance_to ch lexer in
  if peek_char lex = nch then lex else advance_to2 ch nch lex
(* Skip consecutive whitespace characters. '\r' is included so that CRLF
   (Windows) line endings do not surface as ILLEGAL tokens. *)
let rec skip_whitespace lexer =
  match lexer.ch with
  | ' ' | '\n' | '\t' | '\r' -> advance lexer |> skip_whitespace
  | _ -> lexer
let new_lexer input_string =
@ -44,33 +57,57 @@ module Lexer = struct
(* Read an identifier-like run of [0-9a-zA-Z_], then keyword-check the
   result so that "null", "true" and "false" lex as NULL/BOOL rather than
   IDENT. The stale pre-change catch-all (`| _ -> (lxr, Token.IDENT str)`)
   that preceded — and therefore shadowed — the keyword match has been
   removed; the inner match is parenthesized for clarity. *)
let rec read_ident lxr str =
  match lxr.ch with
  | '0'..'9' | 'a'..'z' | 'A'..'Z' | '_' as c -> read_ident (advance lxr) (str ^ Char.escaped c)
  | _ ->
    (match str with
     | "null" -> (lxr, Token.NULL)
     | "true" | "false" -> (lxr, Token.BOOL (bool_of_string str))
     | _ -> (lxr, Token.IDENT str))
(* Lex a numeric literal: a run of digits, an optional '.' fraction, and an
   optional 'f' suffix that forces a FLOAT token. Returns the advanced
   lexer paired with the token. *)
let read_number lexer =
  (* digits after the decimal point; always yields a FLOAT *)
  let rec fraction lx acc =
    match lx.ch with
    | '0'..'9' as d -> fraction (advance lx) (acc ^ Char.escaped d)
    | 'f' -> (advance lx, Token.FLOAT (float_of_string acc))
    | _ -> (lx, Token.FLOAT (float_of_string acc))
  in
  (* integral part; '.' switches to the fraction reader, 'f' forces FLOAT *)
  let rec whole lx acc =
    match lx.ch with
    | '0'..'9' as d -> whole (advance lx) (acc ^ Char.escaped d)
    | 'f' -> (advance lx, Token.FLOAT (float_of_string acc))
    | '.' -> fraction (advance lx) (acc ^ Char.escaped '.')
    | _ -> (lx, Token.INTEGER (int_of_string acc))
  in
  whole lexer ""
(* Single-character token dispatch over the current lexer character.
   NOTE(review): this appears to be superseded by [next_token] below, which
   additionally handles brackets, comments, binary literals and whitespace
   — likely dead pre-change code left visible by the diff; confirm against
   the committed file and remove if so. *)
let next_char lexer = match lexer.ch with
| ':' -> (advance lexer, Token.COLON)
| ',' -> (advance lexer, Token.COMMA)
| '{' -> (advance lexer, Token.LBRACE)
| '}' -> (advance lexer, Token.RBRACE)
| '"' -> advance lexer |> read_string
| ' ' -> (advance lexer, Token.SPACE)
| 'a'..'z' | 'A'..'Z' as c -> read_ident (advance lexer) (Char.escaped c)
| '0'..'9' -> read_number lexer
| '\x00' -> (lexer, Token.EOF)
| _ -> (advance lexer, Token.ILLEGAL)
(* Read the payload of a b"..." binary literal up to the closing quote.
   Bytes are copied verbatim with [String.make 1 c]: the previous
   [Char.escaped c] corrupted payloads by expanding bytes such as '\\',
   '\n' or '\'' into multi-character escape sequences. An unterminated
   literal now yields ILLEGAL at the end-of-input null byte instead of
   running off the end of the input. *)
let read_binary lexer =
  let rec read_bin lxr str =
    match lxr.ch with
    | '"' -> (advance lxr, Token.BINARY (Bytes.of_string str))
    | '\x00' -> (lxr, Token.ILLEGAL)  (* unterminated b"..." literal *)
    | c -> read_bin (advance lxr) (str ^ String.make 1 c) in
  read_bin lexer ""
(* Produce the next token from the input, first skipping whitespace, then
   // line comments and /* ... */ block comments (both recurse into
   [next_token] after skipping). Returns the advanced lexer and the token;
   the null byte marks end of input. *)
let rec next_token lexer =
let lxr = skip_whitespace lexer in
match lxr.ch with
| ':' -> (advance lxr, Token.COLON)
| ',' -> (advance lxr, Token.COMMA)
| '{' -> (advance lxr, Token.LBRACE)
| '}' -> (advance lxr, Token.RBRACE)
| '[' -> (advance lxr, Token.LBRACKET)
| ']' -> (advance lxr, Token.RBRACKET)
(* line comment: skip to end of line, lex again *)
| '/' when peek_char lxr = '/' -> let l = advance_to '\n' lxr in next_token l
(* block comment: skip past the closing "*/", lex again *)
| '/' when peek_char lxr = '*' -> let l = advance lxr |> advance |> advance_to2 '*' '/' in advance l |> advance |> next_token
| '"' -> advance lxr |> read_string
(* b"..." introduces a binary literal; skip the b and the opening quote *)
| 'b' when peek_char lxr = '"' -> advance lxr |> advance |> read_binary
| 'a'..'z' | 'A'..'Z' as c -> read_ident (advance lxr) (Char.escaped c)
| '0'..'9' -> read_number lxr
| '\x00' -> (lxr, Token.EOF)
| _ -> (advance lxr, Token.ILLEGAL)
(* Run the lexer over [input_string], returning all tokens in order with a
   trailing EOF. The stale pre-change lines (a [next_char] dispatch plus a
   SPACE-filtering EOF case) that the diff left interleaved with the
   current implementation have been removed. *)
let generate_tokens input_string =
  let lexer = new_lexer input_string in
  let rec gen lxr tokens =
    match next_token lxr with
    | (_, Token.EOF) -> List.rev_append tokens [Token.EOF]
    | (l, tok) -> gen l (tok :: tokens)
  in gen lexer []
end

@ -0,0 +1,143 @@
include Token
(* [split_at1 l n] splits [l] into its first [n] elements and the rest.
   If [l] has fewer than [n] elements the second component is [[]].
   Rewritten tail-recursively (accumulator + final reverse) so long token
   lists cannot overflow the stack; behavior is unchanged, including for
   n <= 0 with a non-empty prefix semantics of the original. *)
let split_at1 l n =
  let rec go acc rest k =
    if k = 0 then (List.rev acc, rest)
    else
      match rest with
      | [] -> (List.rev acc, [])
      | h :: t -> go (h :: acc) t (k - 1)
  in
  go [] l n
(* [safe_list xs] returns (head, tail) as options: (None, None) for [],
   (Some x, None) for a singleton, (Some x, Some rest) otherwise.
   Rewritten as a direct pattern match: the original walked the whole list
   with [List.length] (O(n) per call) and then used the partial [List.hd]
   / [List.tl]; this form is total, O(1), and exhaustive. *)
let safe_list xs =
  match xs with
  | [] -> (None, None)
  | [x] -> (Some x, None)
  | x :: rest -> (Some x, Some rest)
(* [list_skip xs n] drops the first [n] elements of [xs], delegating the
   split to [split_at1] and discarding the prefix. *)
let list_skip xs n =
  let (_prefix, tail) = split_at1 xs n in
  tail
(* [if_list_peek xs n v] is true iff element [n] of [xs] exists and equals
   [v]; out-of-range indices are simply false. *)
let if_list_peek xs n v =
  List.nth_opt xs n
  |> Option.fold ~none:false ~some:(fun x -> x = v)
module Parser = struct
  include Token
  open Base

  (* [split_with l left right] assumes one [left] delimiter is already open
     and scans [l] for the index just past the matching [right], tracking
     nesting depth via the fold's [level]; returns Some (inside, rest).
     If no matching [right] exists the whole list ends up "inside". *)
  let split_with (l:Token.token_type list) left right =
    let f (length, level) (i, tk) =
      if level < 1 && i < length then
        (i, level)
      else
        match tk with
        | a when Token.equal a left -> (length, level + 1)
        | a when Token.equal a right -> (length, level - 1)
        | _ -> (length, level)
    in
    match List.fold_left ~f:f ~init:(List.length l, 1) (List.mapi ~f:(fun i e -> (i,e)) l) with
    | (i, _) -> Some (split_at1 l i)

  (* Parsed hcfg value tree. *)
  type hvalue =
    | INTEGER of int
    | FLOAT of float
    | STRING of string
    | BINARY of Bytes.t
    | NULL
    | ARRAY of hvalue List.t
    | OBJECT of (string, hvalue, String.comparator_witness) Map.t

  (* Structural equality over value trees (recursive for ARRAY/OBJECT). *)
  let rec equal a b =
    match (a,b) with
    | (INTEGER a, INTEGER b) -> Int.equal a b
    | (FLOAT a, FLOAT b) -> Float.equal a b
    | (STRING a, STRING b) -> String.equal a b
    | (BINARY a, BINARY b) -> Bytes.equal a b
    | (NULL, NULL) -> true
    | (ARRAY a, ARRAY b) -> List.equal equal a b
    | (OBJECT a, OBJECT b) -> Map.equal equal a b
    | _ -> false

  (* Debug rendering of a value tree. Fixed copy-paste labels: the FLOAT
     and STRING cases previously printed "INTEGER: ". *)
  let rec hvalue_to_string = function
    | INTEGER a -> "INTEGER: " ^ Int.to_string a
    | FLOAT a -> "FLOAT: " ^ Float.to_string a
    | STRING a -> "STRING: " ^ a
    | BINARY a -> "BINARY " ^ Bytes.fold a ~init:"" ~f:(fun acc b -> acc ^ Printf.sprintf "\\x%2x" (Char.to_int b))
    | NULL -> "NULL"
    | ARRAY a -> "ARRAY [" ^ (String.concat ~sep:", " (List.map a ~f:hvalue_to_string)) ^ "]"
    | OBJECT a -> "OBJECT {" ^ (String.concat ~sep:", " (List.map (Map.to_alist a) ~f:(fun (k,v) -> k ^ ": (" ^ (hvalue_to_string v) ^ ")"))) ^ "}"

  (* Alcotest-compatible pretty printer. *)
  let pretty_print ppf tok = Fmt.pf ppf "HValue %s" (hvalue_to_string tok)

  (* Map a literal token to its hvalue. Raises Failure on non-literals. *)
  let parse_primitive token =
    match token with
    | Token.INTEGER a -> INTEGER a
    | Token.FLOAT a -> FLOAT a
    | Token.STRING a -> STRING a
    | Token.BINARY a -> BINARY a
    | Token.NULL -> NULL
    | _ -> failwith "Not a primitive token"

  (* Join token names for diagnostics. *)
  let print_token_array tokens =
    List.map tokens ~f:Token.token_to_string |> String.concat ~sep:"; "

  (* Entry point: dispatch on the first token — a bare literal, an object,
     or an array. Raises Failure on anything else. *)
  let rec parse tokens =
    match List.hd tokens with
    | Some(Token.INTEGER _ | Token.FLOAT _ | Token.STRING _ | Token.BINARY _ | Token.NULL as t) -> parse_primitive t
    | Some(Token.LBRACE) -> (match List.tl tokens with
      | Some(tl) -> parse_object tl
      | _ -> failwith "Not a valid hcfg file")
    | Some(Token.LBRACKET) -> (match List.tl tokens with
      | Some(tl) -> parse_arr tl
      | _ -> failwith "Not a valid hcfg file")
    | Some(Token.EOF) -> failwith "Empty tree"
    | _ -> failwith "Not a valid hcfg file"
  (* Parse the interior of an object (after LBRACE), accumulating a
     string-keyed Map. Primitive values are consumed by peeking at the
     COMMA/RBRACE that follows; nested arrays/objects are carved out with
     [split_with] and parsed recursively. *)
  and parse_object tokens =
    let rec parse_object_i tkns acc =
      match safe_list tkns with
      | (Some hd, Some tl) -> (match hd with
        | Token.IDENT i when if_list_peek tl 0 Token.COLON -> (match List.nth tl 1 with
          | Some(Token.INTEGER _ | Token.FLOAT _ | Token.STRING _ | Token.BINARY _ | Token.NULL as t) when if_list_peek tl 2 Token.COMMA -> let (_arr, rest) = split_at1 tl 3 in parse_object_i rest (Map.set acc ~key:i ~data:(parse_primitive t))
          | Some(Token.INTEGER _ | Token.FLOAT _ | Token.STRING _ | Token.BINARY _ | Token.NULL as t) when if_list_peek tl 2 Token.RBRACE -> let (_arr, rest) = split_at1 tl 2 in parse_object_i rest (Map.set acc ~key:i ~data:(parse_primitive t))
          | Some Token.LBRACKET -> (match split_with (list_skip tl 2) Token.LBRACKET Token.RBRACKET with
            | Some (arr, rest) ->
              parse_object_i (rest) (Map.set acc ~key:i ~data:(parse_arr arr))
            | None -> failwith "uhhh arr in obj")
          | Some Token.LBRACE -> (match split_with (list_skip tl 2) Token.LBRACE Token.RBRACE with
            | Some (arr, rest) -> parse_object_i (rest) (Map.set acc ~key:i ~data:(parse_object arr))
            | None -> failwith "uhhh arr in obj")
          | _ -> "TODO: Unhandled token for object value" ^ token_to_string hd ^ "(" ^ print_token_array tl ^ ")" |> failwith
          )
        | Token.COMMA when not (if_list_peek tl 0 Token.RBRACE) -> parse_object_i tl acc
        | Token.COMMA when if_list_peek tl 0 Token.RBRACE -> acc
        | Token.RBRACE -> acc
        | _ -> "TODO: Unhandled token in object: " ^ (Token.token_to_string hd) ^ " (" ^ (print_token_array tl) ^ ")"|> failwith)
      | (Some Token.RBRACE, _) -> acc
      | _ -> (String.concat ~sep:", " (List.map (Map.to_alist acc) ~f:(fun (k,v) -> k ^ ": (" ^ (hvalue_to_string v) ^ ")"))) |> failwith
    in OBJECT (parse_object_i tokens (Map.empty (module String)))
  (* Parse the interior of an array (after LBRACKET); elements are
     accumulated in reverse and re-reversed at the end. *)
  and parse_arr tokens =
    let rec parse_arr_i tkns acc =
      match safe_list tkns with
      | (Some hd, Some tl) -> (match hd with
        | Token.INTEGER _ | Token.FLOAT _ | Token.STRING _ | Token.BINARY _ | Token.NULL as t -> (match List.hd tl with
          | Some Token.COMMA when List.length tl >= 2 -> parse_arr_i (List.tl_exn tl) ((parse_primitive t) :: acc)
          | Some Token.RBRACKET -> ((parse_primitive t) :: acc)
          | _ -> failwith "Invalid array"
          )
        | Token.LBRACKET -> (match split_with tl Token.LBRACKET Token.RBRACKET with
          | Some (arr, rest) -> parse_arr_i (rest) (parse_arr arr :: acc)
          | None -> failwith "uhhh arr in arr")
        | Token.LBRACE -> (match split_with tl Token.LBRACE Token.RBRACE with
          | Some (arr, rest) -> parse_arr_i (rest) (parse_object arr :: acc)
          | None -> failwith "uhhh obj in arr")
        | Token.RBRACKET -> acc
        | t -> "TODO: Unhandled token in array: " ^ (Token.token_to_string t) ^ " (" ^ (print_token_array tl) ^ ")"|> failwith
        )
      | (Some Token.EOF, None) -> acc
      | _ -> "TODO: parse_arr (" ^ (print_token_array tkns) ^ ")"|> failwith
    in ARRAY (List.rev (parse_arr_i tokens []))
end

@ -1,31 +1,42 @@
module Token = struct
  open Base

  (* Lexical tokens produced by the hcfg lexer.
     NOTE(review): SPACE appears to be unused by the new [next_token] path;
     retained for compatibility. *)
  type token_type =
    | ILLEGAL
    | EOF
    | SPACE
    (* Identifiers and literals *)
    | IDENT of string
    | INTEGER of int
    | FLOAT of float
    | BOOL of bool
    | STRING of string
    | BINARY of Bytes.t
    | NULL
    (* -- Delimiters *)
    | COLON
    | COMMA
    | LBRACE
    | RBRACE
    | LBRACKET
    | RBRACKET

  (* Structural equality; Poly.equal escapes Base's monomorphic (=). *)
  let equal (a:token_type) b = Poly.equal a b

  (* Human-readable rendering for diagnostics and tests. The stale
     pre-change [string_of_int]/[string_of_float] branches that preceded —
     and made unreachable — the Base-style branches have been removed. *)
  let token_to_string = function
    | ILLEGAL -> "ILLEGAL"
    | EOF -> "EOF"
    | SPACE -> "SPACE"
    | IDENT a -> "IDENT " ^ a
    | INTEGER a -> "INTEGER " ^ Int.to_string a
    | FLOAT a -> "FLOAT " ^ Float.to_string a
    | BOOL a -> "BOOL " ^ Bool.to_string a
    | STRING a -> "STRING " ^ a
    | BINARY a -> "BINARY " ^ Bytes.fold a ~init:"" ~f:(fun acc b -> acc ^ Printf.sprintf "\\x%2x" (Char.to_int b))
    | NULL -> "NULL"
    | COMMA -> "COMMA"
    | COLON -> "COLON"
    | LBRACE -> "LBRACE"
    | RBRACE -> "RBRACE"
    | LBRACKET -> "LBRACKET"
    | RBRACKET -> "RBRACKET"

  (* Alcotest-compatible pretty printer. *)
  let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
end

@ -0,0 +1,11 @@
{
// test
key: "value",
/* this is a comment */
other_key: 100,
floating: 123f,
other_float: 123.3,
bin: b"test",
arr: [1,2],
nested: { key: "trailing", },
}

@ -1,29 +1,46 @@
open Hcfg
include Lexer
include Token
include Lexer
include Parser
open Base
let token_testable = Alcotest.testable Token.pretty_print (=)
let token_testable = Alcotest.testable Token.pretty_print Token.equal
let hvalue_testable = Alcotest.testable Parser.pretty_print Parser.equal
let test_lexer_delimiters () =
(* Entire contents of the test fixture, read once at module init.
   open_in_bin works correctly on Unix and Windows. *)
let read_test_file =
  let ch = Stdlib.open_in_bin "test/test.hcfg" in
  (* Ensure the channel is closed even if really_input_string raises
     (e.g. truncated file); the original leaked it on that path. *)
  Stdlib.Fun.protect
    ~finally:(fun () -> Stdlib.close_in_noerr ch)
    (fun () -> Stdlib.really_input_string ch (Stdlib.in_channel_length ch))
(* Lexes the on-disk fixture and checks the full expected token stream.
   NOTE(review): this block still contains what look like stale pre-change
   lines left visible by the diff: the one-token-per-line entries
   duplicating key/other_key near the top of the list, and the first
   [Lexer.generate_tokens {|...|}] argument before the [read_test_file]
   one. Confirm against the committed file and delete the stale copies —
   as written the list has duplicate entries and [check] receives two
   actual-value arguments. *)
let test_lexer() =
Alcotest.(check (list token_testable))
"same token types" [
Token.LBRACE
; Token.IDENT "key"
; Token.COLON
; Token.STRING "value"
; Token.COMMA
; Token.IDENT "other_key"
; Token.COLON
; Token.INTEGER 100
; Token.IDENT "key" ; Token.COLON ; Token.STRING "value" ; Token.COMMA
; Token.IDENT "other_key" ; Token.COLON ; Token.INTEGER 100 ; Token.COMMA
; Token.IDENT "floating" ; Token.COLON ; Token.FLOAT 123.0 ; Token.COMMA
; Token.IDENT "other_float" ; Token.COLON ; Token.FLOAT 123.3 ; Token.COMMA
; Token.IDENT "bin" ; Token.COLON ; Token.BINARY (Bytes.of_string "test") ; Token.COMMA
; Token.IDENT "arr" ; Token.COLON ; Token.LBRACKET ; Token.INTEGER 1 ; Token.COMMA ; Token.INTEGER 2; Token.RBRACKET ; Token.COMMA
; Token.IDENT "nested" ; Token.COLON ; Token.LBRACE ; Token.IDENT "key" ; Token.COLON ; Token.STRING "trailing" ; Token.COMMA ; Token.RBRACE ; Token.COMMA
; Token.RBRACE
; Token.EOF
]
(Lexer.generate_tokens {|{key: "value", other_key: 100}|})
(Lexer.generate_tokens read_test_file)
(* Lexes a small inline document and checks the parsed value tree:
   a float with 'f' suffix, an int array, and a nested object. *)
let test_ast() =
Alcotest.(check (hvalue_testable))
"same token types" (Parser.OBJECT (Base.Map.of_alist_exn (module String) [("a", Parser.FLOAT 1.); ("b", Parser.ARRAY [Parser.INTEGER 1; Parser.INTEGER 2]); ("c", Parser.OBJECT (Base.Map.of_alist_exn (module String) [("a", Parser.INTEGER 2)]))]))
(Lexer.generate_tokens {|{a: 1f,b: [1,2], c: { a: 2}}|} |> Parser.parse)
(* Alcotest entry point running the lexer and parser suites.
   NOTE(review): the "list-delimiters" case references
   [test_lexer_delimiters], whose definition appears to be removed by this
   change — this entry looks like a stale pre-change line pair left by the
   diff; confirm and drop it, otherwise the file will not compile. *)
let () =
(* Lexer.generate_tokens {|{key: "value",}|} |> List.map Token.token_to_string |> String.concat ", " |> print_endline *)
Alcotest.run "Lexer"
[
( "list-delimiters",
[ Alcotest.test_case "first case" `Slow test_lexer_delimiters ] );
( "lex test file",
[ Alcotest.test_case "" `Slow test_lexer] );
( "parse test file",
[ Alcotest.test_case "" `Slow test_ast]);
]