(* src/lexer.ml — lexing functions introduced/updated by this patch.
   NOTE(review): reconstructed from a whitespace-mangled git patch. The
   [lexer] record type, [null_byte], [advance], [new_lexer] and
   [read_string] live outside the visible hunks and are NOT reproduced
   here — confirm against the full file. *)

(* Look at the next input character without consuming it.
   NOTE(review): returns [null_byte] when [read_position >= length - 1],
   which also masks the last valid index — presumably compensated by
   [advance]'s bookkeeping; verify against that (hidden) definition. *)
let peek_char lexer =
  let read_to_end = lexer.read_position >= String.length lexer.input - 1 in
  if read_to_end then null_byte else String.get lexer.input lexer.read_position

(* Advance until the character [ch] has just been consumed.
   Fix: structural (<>) instead of physical (!=) comparison — (!=) is
   pointer identity in OCaml and only works here by accident because
   chars are immediates. *)
let rec advance_to ch lexer =
  if peek_char lexer <> ch then advance lexer |> advance_to ch
  else advance lexer

(* Advance until the two-character sequence [ch][nch] has been passed,
   e.g. the "*/" closing a block comment.
   Fix: removed leftover debug [print_endline] calls that wrote to
   stdout on every block-comment scan. *)
let rec advance_to2 ch nch lexer =
  let lex = advance_to ch lexer in
  if peek_char lex = nch then lex else advance_to2 ch nch lex

(* Skip insignificant whitespace before the next token.
   Fix: also skip '\r' so CRLF files lex cleanly — the test suite reads
   the fixture with [open_in_bin], so carriage returns reach the lexer
   verbatim on Windows checkouts. *)
let rec skip_whitespace lexer =
  match lexer.ch with
  | ' ' | '\n' | '\t' | '\r' -> advance lexer |> skip_whitespace
  | _ -> lexer

(* Read an identifier starting from its already-consumed first char [str].
   The keywords "null", "true" and "false" become dedicated tokens. *)
let rec read_ident lxr str =
  match lxr.ch with
  | ('0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '_') as c ->
      read_ident (advance lxr) (str ^ Char.escaped c)
  | _ -> (
      match str with
      | "null" -> (lxr, Token.NULL)
      | "true" | "false" -> (lxr, Token.BOOL (bool_of_string str))
      | _ -> (lxr, Token.IDENT str))

(* Read a numeric literal: "123" -> INTEGER, "123f" / "123.3" -> FLOAT.
   A trailing 'f' is consumed; the fractional part is parsed by the
   inner helper once a '.' is seen. *)
let read_number lexer =
  let rec read_decimals lxr str =
    match lxr.ch with
    | '0' .. '9' as c -> read_decimals (advance lxr) (str ^ Char.escaped c)
    | 'f' -> (advance lxr, Token.FLOAT (float_of_string str))
    | _ -> (lxr, Token.FLOAT (float_of_string str))
  in
  let rec read_num lxr str =
    match lxr.ch with
    | '0' .. '9' as c -> read_num (advance lxr) (str ^ Char.escaped c)
    | 'f' -> (advance lxr, Token.FLOAT (float_of_string str))
    | '.' -> read_decimals (advance lxr) (str ^ Char.escaped '.')
    | _ -> (lxr, Token.INTEGER (int_of_string str))
  in
  read_num lexer ""

(* Read the payload of a b"..." binary literal up to the closing quote. *)
let read_binary lexer =
  let rec read_bin lxr str =
    match lxr.ch with
    | '"' -> (advance lxr, Token.BINARY (Bytes.of_string str))
    | c -> read_bin (advance lxr) (str ^ Char.escaped c)
  in
  read_bin lexer ""

(* Produce the next token, skipping whitespace, // line comments and
   /* */ block comments.  Returns the lexer state after the token. *)
let rec next_token lexer =
  let lxr = skip_whitespace lexer in
  match lxr.ch with
  | ':' -> (advance lxr, Token.COLON)
  | ',' -> (advance lxr, Token.COMMA)
  | '{' -> (advance lxr, Token.LBRACE)
  | '}' -> (advance lxr, Token.RBRACE)
  | '[' -> (advance lxr, Token.LBRACKET)
  | ']' -> (advance lxr, Token.RBRACKET)
  | '/' when peek_char lxr = '/' ->
      (* line comment: discard through the newline *)
      advance_to '\n' lxr |> next_token
  | '/' when peek_char lxr = '*' ->
      (* block comment: discard through the closing "*/" *)
      let l = advance lxr |> advance |> advance_to2 '*' '/' in
      advance l |> advance |> next_token
  | '"' -> advance lxr |> read_string
  | 'b' when peek_char lxr = '"' -> advance lxr |> advance |> read_binary
  | ('a' .. 'z' | 'A' .. 'Z') as c -> read_ident (advance lxr) (Char.escaped c)
  | '0' .. '9' -> read_number lxr
  | '\x00' -> (lxr, Token.EOF)
  | _ -> (advance lxr, Token.ILLEGAL)

(* Lex the whole input into a token list terminated by a single EOF. *)
let generate_tokens input_string =
  let lexer = new_lexer input_string in
  let rec gen lxr tokens =
    match next_token lxr with
    | _, Token.EOF -> List.rev_append tokens [ Token.EOF ]
    | l, tok -> gen l (tok :: tokens)
  in
  gen lexer []
(* src/parser.ml — turns the token stream produced by Lexer into an
   [hvalue] tree.  Reconstructed from a whitespace-mangled git patch. *)
include Token

(* Split [l] into its first [n] elements and the remainder.
   Total: returns ([l], []) when [l] is shorter than [n]. *)
let rec split_at1 l n =
  if n = 0 then ([], l)
  else
    match l with
    | [] -> ([], [])
    | h :: t ->
        let l1, l2 = split_at1 t (n - 1) in
        (h :: l1, l2)

(* (head option, tail option) of a list; the tail is [None] for lists of
   fewer than two elements. Defined before [open Base] so it uses the
   stdlib (exception-raising) List.hd/List.tl — safe here because the
   length is checked first. *)
let safe_list xs =
  match List.length xs with
  | 0 -> (None, None)
  | 1 -> (Some (List.hd xs), None)
  | _ -> (Some (List.hd xs), Some (List.tl xs))

(* Drop the first [n] elements of [xs]. *)
let list_skip xs n = match split_at1 xs n with _, tl -> tl

(* True iff element [n] of [xs] exists and equals [v]. *)
let if_list_peek xs n v =
  match List.nth_opt xs n with Some x -> x = v | _ -> false

module Parser = struct
  include Token
  open Base

  (* Split [l] at the index where the nesting level opened by [left]
     (starting at 1) returns to zero via matching [right] tokens.
     NOTE(review): the fold latches the first index seen after the level
     drops below 1 — intricate; logic kept verbatim from the patch. *)
  let split_with (l : Token.token_type list) left right =
    let f (length, level) (i, tk) =
      if level < 1 && i < length then (i, level)
      else
        match tk with
        | a when Token.equal a left -> (length, level + 1)
        | a when Token.equal a right -> (length, level - 1)
        | _ -> (length, level)
    in
    match
      List.fold_left ~f ~init:(List.length l, 1)
        (List.mapi ~f:(fun i e -> (i, e)) l)
    with
    | i, _ -> Some (split_at1 l i)

  (* Parsed hcfg value. *)
  type hvalue =
    | INTEGER of int
    | FLOAT of float
    | STRING of string
    | BINARY of Bytes.t
    | NULL
    | ARRAY of hvalue List.t
    | OBJECT of (string, hvalue, String.comparator_witness) Map.t

  (* Structural equality, recursing through arrays and objects. *)
  let rec equal a b =
    match (a, b) with
    | INTEGER a, INTEGER b -> Int.equal a b
    | FLOAT a, FLOAT b -> Float.equal a b
    | STRING a, STRING b -> String.equal a b
    | BINARY a, BINARY b -> Bytes.equal a b
    | NULL, NULL -> true
    | ARRAY a, ARRAY b -> List.equal equal a b
    | OBJECT a, OBJECT b -> Map.equal equal a b
    | _ -> false

  (* Debug rendering of an [hvalue].
     Fix: FLOAT and STRING were labelled "INTEGER: " (copy-paste bug);
     fix: "%02x" zero-pads hex bytes — "%2x" space-padded them. *)
  let rec hvalue_to_string = function
    | INTEGER a -> "INTEGER: " ^ Int.to_string a
    | FLOAT a -> "FLOAT: " ^ Float.to_string a
    | STRING a -> "STRING: " ^ a
    | BINARY a ->
        "BINARY "
        ^ Bytes.fold a ~init:"" ~f:(fun acc b ->
              acc ^ Printf.sprintf "\\x%02x" (Char.to_int b))
    | NULL -> "NULL"
    | ARRAY a ->
        "ARRAY [" ^ String.concat ~sep:", " (List.map a ~f:hvalue_to_string) ^ "]"
    | OBJECT a ->
        "OBJECT {"
        ^ String.concat ~sep:", "
            (List.map (Map.to_alist a) ~f:(fun (k, v) ->
                 k ^ ": (" ^ hvalue_to_string v ^ ")"))
        ^ "}"

  (* Alcotest-compatible pretty printer. *)
  let pretty_print ppf tok = Fmt.pf ppf "HValue %s" (hvalue_to_string tok)

  (* Map a literal token to its hvalue.
     @raise Failure on any non-literal token. *)
  let parse_primitive token =
    match token with
    | Token.INTEGER a -> INTEGER a
    | Token.FLOAT a -> FLOAT a
    | Token.STRING a -> STRING a
    | Token.BINARY a -> BINARY a
    | Token.NULL -> NULL
    | _ -> failwith "Not a primitive token"

  (* Render a token list for error messages. *)
  let print_token_array tokens =
    List.map tokens ~f:Token.token_to_string |> String.concat ~sep:"; "

  (* Parse a token list into an hvalue: a bare primitive, or an object /
     array introduced by LBRACE / LBRACKET.
     @raise Failure on empty or malformed input. *)
  let rec parse tokens =
    match List.hd tokens with
    | Some ((Token.INTEGER _ | Token.FLOAT _ | Token.STRING _
            | Token.BINARY _ | Token.NULL) as t) ->
        parse_primitive t
    | Some Token.LBRACE -> (
        match List.tl tokens with
        | Some tl -> parse_object tl
        | _ -> failwith "Not a valid hcfg file")
    | Some Token.LBRACKET -> (
        match List.tl tokens with
        | Some tl -> parse_arr tl
        | _ -> failwith "Not a valid hcfg file")
    | Some Token.EOF -> failwith "Empty tree"
    | _ -> failwith "Not a valid hcfg file"

  (* Parse the body of an object (opening brace already consumed):
     IDENT COLON value pairs separated by commas, trailing comma allowed. *)
  and parse_object tokens =
    let rec parse_object_i tkns acc =
      match safe_list tkns with
      | Some hd, Some tl -> (
          match hd with
          | Token.IDENT i when if_list_peek tl 0 Token.COLON -> (
              match List.nth tl 1 with
              | Some ((Token.INTEGER _ | Token.FLOAT _ | Token.STRING _
                      | Token.BINARY _ | Token.NULL) as t)
                when if_list_peek tl 2 Token.COMMA ->
                  (* primitive value followed by a comma: consume 3 tokens *)
                  let _arr, rest = split_at1 tl 3 in
                  parse_object_i rest (Map.set acc ~key:i ~data:(parse_primitive t))
              | Some ((Token.INTEGER _ | Token.FLOAT _ | Token.STRING _
                      | Token.BINARY _ | Token.NULL) as t)
                when if_list_peek tl 2 Token.RBRACE ->
                  (* last pair: leave the RBRACE for the caller loop *)
                  let _arr, rest = split_at1 tl 2 in
                  parse_object_i rest (Map.set acc ~key:i ~data:(parse_primitive t))
              | Some Token.LBRACKET -> (
                  match split_with (list_skip tl 2) Token.LBRACKET Token.RBRACKET with
                  | Some (arr, rest) ->
                      parse_object_i rest (Map.set acc ~key:i ~data:(parse_arr arr))
                  | None -> failwith "uhhh arr in obj")
              | Some Token.LBRACE -> (
                  match split_with (list_skip tl 2) Token.LBRACE Token.RBRACE with
                  | Some (arr, rest) ->
                      parse_object_i rest (Map.set acc ~key:i ~data:(parse_object arr))
                  (* fix: message said "arr in obj" for the object case *)
                  | None -> failwith "uhhh obj in obj")
              | _ ->
                  "TODO: Unhandled token for object value" ^ token_to_string hd
                  ^ "(" ^ print_token_array tl ^ ")"
                  |> failwith)
          | Token.COMMA when not (if_list_peek tl 0 Token.RBRACE) ->
              parse_object_i tl acc
          | Token.COMMA when if_list_peek tl 0 Token.RBRACE -> acc
          | Token.RBRACE -> acc
          | _ ->
              "TODO: Unhandled token in object: " ^ Token.token_to_string hd
              ^ " (" ^ print_token_array tl ^ ")"
              |> failwith)
      | Some Token.RBRACE, _ -> acc
      | _ ->
          String.concat ~sep:", "
            (List.map (Map.to_alist acc) ~f:(fun (k, v) ->
                 k ^ ": (" ^ hvalue_to_string v ^ ")"))
          |> failwith
    in
    OBJECT (parse_object_i tokens (Map.empty (module String)))

  (* Parse the body of an array (opening bracket already consumed):
     comma-separated primitives, nested arrays and nested objects. *)
  and parse_arr tokens =
    let rec parse_arr_i tkns acc =
      match safe_list tkns with
      | Some hd, Some tl -> (
          match hd with
          | (Token.INTEGER _ | Token.FLOAT _ | Token.STRING _
            | Token.BINARY _ | Token.NULL) as t -> (
              match List.hd tl with
              | Some Token.COMMA when List.length tl >= 2 ->
                  parse_arr_i (List.tl_exn tl) (parse_primitive t :: acc)
              | Some Token.RBRACKET -> parse_primitive t :: acc
              | _ -> failwith "Invalid array")
          | Token.LBRACKET -> (
              match split_with tl Token.LBRACKET Token.RBRACKET with
              | Some (arr, rest) -> parse_arr_i rest (parse_arr arr :: acc)
              | None -> failwith "uhhh arr in arr")
          | Token.LBRACE -> (
              match split_with tl Token.LBRACE Token.RBRACE with
              | Some (arr, rest) -> parse_arr_i rest (parse_object arr :: acc)
              | None -> failwith "uhhh obj in arr")
          | Token.RBRACKET -> acc
          | t ->
              "TODO: Unhandled token in array: " ^ Token.token_to_string t
              ^ " (" ^ print_token_array tl ^ ")"
              |> failwith)
      | Some Token.EOF, None -> acc
      | _ -> "TODO: parse_arr (" ^ print_token_array tkns ^ ")" |> failwith
    in
    (* elements are accumulated in reverse *)
    ARRAY (List.rev (parse_arr_i tokens []))
end
(* src/token.ml — token type shared by the lexer and parser. *)
module Token = struct
  open Base

  type token_type =
    | ILLEGAL
    | EOF
    (* Identifiers and literals *)
    | IDENT of string
    | INTEGER of int
    | FLOAT of float
    | BOOL of bool
    | STRING of string
    | BINARY of Bytes.t
    | NULL
    (* Delimiters *)
    | COLON
    | COMMA
    | LBRACE
    | RBRACE
    | LBRACKET
    | RBRACKET

  (* Structural equality; Base shadows the polymorphic (=), so go
     through the explicit Poly escape hatch. *)
  let equal (a : token_type) b = Poly.equal a b

  (* Human-readable rendering, used by tests and error messages.
     Fix: "%02x" zero-pads binary bytes — "%2x" space-padded them, so
     byte 0x05 printed as "\x 5" instead of "\x05". *)
  let token_to_string = function
    | ILLEGAL -> "ILLEGAL"
    | EOF -> "EOF"
    | IDENT a -> "IDENT " ^ a
    | INTEGER a -> "INTEGER " ^ Int.to_string a
    | FLOAT a -> "FLOAT " ^ Float.to_string a
    | BOOL a -> "BOOL " ^ Bool.to_string a
    | STRING a -> "STRING " ^ a
    | BINARY a ->
        "BINARY "
        ^ Bytes.fold a ~init:"" ~f:(fun acc b ->
              acc ^ Printf.sprintf "\\x%02x" (Char.to_int b))
    | NULL -> "NULL"
    | COMMA -> "COMMA"
    | COLON -> "COLON"
    | LBRACE -> "LBRACE"
    | RBRACE -> "RBRACE"
    | LBRACKET -> "LBRACKET"
    | RBRACKET -> "RBRACKET"

  (* Alcotest-compatible pretty printer. *)
  let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
end
(* test/test.ml — lexer and parser tests over test/test.hcfg.
   NOTE(review): [token_testable] / [hvalue_testable] are defined just
   above this span and are not reproduced here. *)

(* Read the whole fixture file as a string.
   open_in_bin behaves the same on Unix and Windows. *)
let read_test_file =
  let ch = Stdlib.open_in_bin "test/test.hcfg" in
  let s = Stdlib.really_input_string ch (Stdlib.in_channel_length ch) in
  Stdlib.close_in ch;
  s

(* Lex the fixture and compare against the full expected token stream. *)
let test_lexer () =
  Alcotest.(check (list token_testable))
    "same token types"
    [ Token.LBRACE
    ; Token.IDENT "key" ; Token.COLON ; Token.STRING "value" ; Token.COMMA
    ; Token.IDENT "other_key" ; Token.COLON ; Token.INTEGER 100 ; Token.COMMA
    ; Token.IDENT "floating" ; Token.COLON ; Token.FLOAT 123.0 ; Token.COMMA
    ; Token.IDENT "other_float" ; Token.COLON ; Token.FLOAT 123.3 ; Token.COMMA
    ; Token.IDENT "bin" ; Token.COLON ; Token.BINARY (Bytes.of_string "test") ; Token.COMMA
    ; Token.IDENT "arr" ; Token.COLON ; Token.LBRACKET ; Token.INTEGER 1
    ; Token.COMMA ; Token.INTEGER 2 ; Token.RBRACKET ; Token.COMMA
    ; Token.IDENT "nested" ; Token.COLON ; Token.LBRACE ; Token.IDENT "key"
    ; Token.COLON ; Token.STRING "trailing" ; Token.COMMA ; Token.RBRACE
    ; Token.COMMA
    ; Token.RBRACE
    ; Token.EOF
    ]
    (Lexer.generate_tokens read_test_file)

(* Parse a small inline document and compare the resulting AST.
   Fix: the check description was the copy-pasted "same token types". *)
let test_ast () =
  Alcotest.(check hvalue_testable)
    "same parsed value"
    (Parser.OBJECT
       (Base.Map.of_alist_exn
          (module String)
          [ ("a", Parser.FLOAT 1.)
          ; ("b", Parser.ARRAY [ Parser.INTEGER 1; Parser.INTEGER 2 ])
          ; ("c", Parser.OBJECT
                    (Base.Map.of_alist_exn (module String)
                       [ ("a", Parser.INTEGER 2) ]))
          ]))
    (Lexer.generate_tokens {|{a: 1f,b: [1,2], c: { a: 2}}|} |> Parser.parse)

(* Fix: suite renamed from "Lexer" — it now covers the parser too. *)
let () =
  Alcotest.run "Hcfg"
    [ ("lex test file", [ Alcotest.test_case "" `Slow test_lexer ])
    ; ("parse test file", [ Alcotest.test_case "" `Slow test_ast ])
    ]