From bbbd19eac77c14774a951c3fdb6f7b7bfaf19731 Mon Sep 17 00:00:00 2001 From: Carsten Kragelund Date: Sat, 17 Jun 2023 03:13:03 +0200 Subject: [PATCH] feat: basic lexer and test --- .gitignore | 1 + dune-project | 3 +++ hcfg.opam | 7 +++++ src/dune | 3 +++ src/hcfg.ml | 2 ++ src/lexer.ml | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/token.ml | 31 +++++++++++++++++++++ test/dune | 3 +++ test/test.ml | 29 ++++++++++++++++++++ 9 files changed, 155 insertions(+) create mode 100644 .gitignore create mode 100644 dune-project create mode 100644 hcfg.opam create mode 100644 src/dune create mode 100644 src/hcfg.ml create mode 100644 src/lexer.ml create mode 100644 src/token.ml create mode 100644 test/dune create mode 100644 test/test.ml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c5f578 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +_build \ No newline at end of file diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..3d0d0bb --- /dev/null +++ b/dune-project @@ -0,0 +1,3 @@ +(lang dune 3.8) + +(name hcfg) diff --git a/hcfg.opam b/hcfg.opam new file mode 100644 index 0000000..bd36fcd --- /dev/null +++ b/hcfg.opam @@ -0,0 +1,7 @@ +opam-version: "2.0.0" +maintainer: "carsten@kragelund.me" +authors: ["Carsten Kragelund"] +license: "MIT" +build: [ + ["dune" "build" "--only" "hcfg" "--root" "." "-j" jobs "@install"] +] \ No newline at end of file diff --git a/src/dune b/src/dune new file mode 100644 index 0000000..a1cfcbb --- /dev/null +++ b/src/dune @@ -0,0 +1,3 @@ +(library + (public_name hcfg) + (libraries fmt)) diff --git a/src/hcfg.ml b/src/hcfg.ml new file mode 100644 index 0000000..c092e8f --- /dev/null +++ b/src/hcfg.ml @@ -0,0 +1,2 @@ +module Token = Token +module Lexer = Lexer \ No newline at end of file diff --git a/src/lexer.ml b/src/lexer.ml new file mode 100644 index 0000000..c67d0b4 --- /dev/null +++ b/src/lexer.ml @@ -0,0 +1,76 @@ +let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false +let is_digit = function '0' .. '9' -> true | _ -> false + +module Lexer = struct + include Token + + type lexer = { + input : string; + position : int; + read_position : int; + ch : char; + } + + let null_byte = '\x00' + + + let advance lexer = + let read_to_end = lexer.read_position >= String.length(lexer.input) in + let new_ch = if read_to_end then null_byte else String.get lexer.input lexer.read_position in + let new_lexer = {lexer with position = lexer.read_position; read_position = lexer.read_position + 1; ch = new_ch} in + new_lexer + + let peek_char lexer = + let read_to_end = lexer.read_position >= String.length(lexer.input) - 1 in + if read_to_end then null_byte else String.get lexer.input (lexer.read_position + 1) + + + let new_lexer input_string = + let lexer = { + input = input_string; + position = 0; + read_position = 0; + ch = null_byte; + } in + advance lexer + + let read_string lexer = + let rec read_str lxr str = + match lxr.ch with + | '"' -> (advance lxr, Token.STRING str) + | c -> read_str (advance lxr) (str ^ Char.escaped c) in + read_str lexer "" + + let rec read_ident lxr str = + match lxr.ch with + | '0'..'9' | 'a'..'z' | 'A'..'Z' | '_' as c -> read_ident (advance lxr) (str ^ Char.escaped c) + | _ -> (lxr, Token.IDENT str) + + let read_number lexer = + let rec read_num lxr str = + match lxr.ch with + | '0'..'9' as c -> read_num (advance lxr) (str ^ Char.escaped c) + | _ -> (lxr, Token.INTEGER (int_of_string str)) + in + read_num lexer "" + + let next_char lexer = match lexer.ch with + | ':' -> (advance lexer, Token.COLON) + | ',' -> (advance lexer, Token.COMMA) + | '{' -> (advance lexer, Token.LBRACE) + | '}' -> (advance lexer, Token.RBRACE) + | '"' -> advance lexer |> read_string + | ' ' -> (advance lexer, Token.SPACE) + | 'a'..'z' | 'A'..'Z' as c -> read_ident (advance lexer) (Char.escaped c) + | '0'..'9' -> read_number lexer + | '\x00' -> (lexer, Token.EOF) + | _ -> (advance lexer, Token.ILLEGAL) + + let generate_tokens input_string = + let lexer = new_lexer input_string in + let rec gen lxr tokens = + match next_char lxr with + | (_, Token.EOF) -> List.rev_append (List.filter (fun t -> t != Token.SPACE) tokens) [Token.EOF] + | (l, tok) -> gen l (tok :: tokens) + in gen lexer [] +end \ No newline at end of file diff --git a/src/token.ml b/src/token.ml new file mode 100644 index 0000000..f5bc7ad --- /dev/null +++ b/src/token.ml @@ -0,0 +1,31 @@ +module Token = struct + type token_type = + | ILLEGAL + | EOF + | SPACE + (* Identifiers and literals *) + | IDENT of string + | INTEGER of int + | FLOAT of float + | STRING of string + (* -- Delimiters *) + | COLON + | COMMA + | LBRACE + | RBRACE + + let token_to_string = function + | ILLEGAL -> "ILLEGAL" + | EOF -> "EOF" + | SPACE -> "SPACE" + | IDENT a -> "IDENT " ^ a + | INTEGER a -> "INTEGER " ^ string_of_int a + | FLOAT a -> "FLOAT " ^ string_of_float a + | STRING a -> "STRING " ^ a + | COMMA -> "COMMA" + | COLON -> "COLON" + | LBRACE -> "LBRACE" + | RBRACE -> "RBRACE" + + let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok) +end diff --git a/test/dune b/test/dune new file mode 100644 index 0000000..f2cf0da --- /dev/null +++ b/test/dune @@ -0,0 +1,3 @@ +(test + (name test) + (libraries alcotest hcfg fmt)) diff --git a/test/test.ml b/test/test.ml new file mode 100644 index 0000000..630be08 --- /dev/null +++ b/test/test.ml @@ -0,0 +1,29 @@ +open Hcfg +include Lexer +include Token + +let token_testable = Alcotest.testable Token.pretty_print (=) + +let test_lexer_delimiters () = + Alcotest.(check (list token_testable)) + "same token types" [ + Token.LBRACE + ; Token.IDENT "key" + ; Token.COLON + ; Token.STRING "value" + ; Token.COMMA + ; Token.IDENT "other_key" + ; Token.COLON + ; Token.INTEGER 100 + ; Token.RBRACE + ; Token.EOF + ] + (Lexer.generate_tokens {|{key: "value", other_key: 100}|}) + +let () = + (* Lexer.generate_tokens {|{key: "value",}|} |> List.map Token.token_to_string |> String.concat ", " |> print_endline *) + Alcotest.run "Lexer" + [ + ( "list-delimiters", + [ Alcotest.test_case "first case" `Slow test_lexer_delimiters ] ); + ] \ No newline at end of file