feat: basic lexer and test
commit
bbbd19eac7
@ -0,0 +1 @@
|
||||
_build
|
@ -0,0 +1,3 @@
|
||||
(lang dune 3.8)
|
||||
|
||||
(name hcfg)
|
@ -0,0 +1,7 @@
|
||||
opam-version: "2.0"
|
||||
maintainer: "carsten@kragelund.me"
|
||||
authors: ["Carsten Kragelund"]
|
||||
license: "MIT"
|
||||
build: [
|
||||
["dune" "build" "--only" "hcfg" "--root" "." "-j" jobs "@install"]
|
||||
]
|
@ -0,0 +1,2 @@
|
||||
module Token = Token
|
||||
module Lexer = Lexer
|
@ -0,0 +1,76 @@
|
||||
(* [is_alpha c] is [true] exactly for ASCII letters. *)
let is_alpha c = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
|
||||
(* [is_digit c] is [true] exactly for ASCII decimal digits. *)
let is_digit c = '0' <= c && c <= '9'
|
||||
|
||||
module Lexer = struct
  include Token

  (* Immutable lexer state over [input]. [ch] is the character at
     [position]; [read_position] indexes the next character to consume. *)
  type lexer = {
    input : string;
    position : int;
    read_position : int;
    ch : char;
  }

  (* Sentinel meaning "no more input". *)
  let null_byte = '\x00'

  (** [advance lexer] consumes one character: [ch] becomes the character at
      [read_position], or [null_byte] once past the end of [input]. *)
  let advance lexer =
    let read_to_end = lexer.read_position >= String.length lexer.input in
    let new_ch =
      if read_to_end then null_byte
      else String.get lexer.input lexer.read_position
    in
    { lexer with
      position = lexer.read_position;
      read_position = lexer.read_position + 1;
      ch = new_ch }

  (** [peek_char lexer] returns the next unconsumed character without
      advancing, or [null_byte] at end of input.
      Fixed: the previous version read [read_position + 1] (the character
      *after* next) with a [length - 1] bound check; since [advance] sets
      [ch = input.[position]] and [read_position = position + 1], the next
      unconsumed character is [input.[read_position]]. *)
  let peek_char lexer =
    if lexer.read_position >= String.length lexer.input then null_byte
    else String.get lexer.input lexer.read_position

  (** [new_lexer input_string] builds a lexer positioned on the first
      character of [input_string]. *)
  let new_lexer input_string =
    let lexer = {
      input = input_string;
      position = 0;
      read_position = 0;
      ch = null_byte;
    } in
    advance lexer

  (** Reads a string-literal body up to (and consuming) the closing ['"'].
      Fixed: an unterminated literal previously looped forever, because at
      end of input [ch] stays [null_byte] and was appended indefinitely;
      it now stops at end of input and returns what was read. Characters
      are accumulated verbatim in a [Buffer] (the old [str ^ Char.escaped c]
      both mangled backslashes and was O(n^2)). *)
  let read_string lexer =
    let buf = Buffer.create 16 in
    let rec read_str lxr =
      match lxr.ch with
      | '"' -> (advance lxr, Token.STRING (Buffer.contents buf))
      | '\x00' -> (lxr, Token.STRING (Buffer.contents buf))
      | c ->
        Buffer.add_char buf c;
        read_str (advance lxr)
    in
    read_str lexer

  (** Accumulates an identifier ([A-Za-z0-9_]*) onto [str] and returns the
      lexer positioned on the first non-identifier character. *)
  let rec read_ident lxr str =
    match lxr.ch with
    | ('0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '_') as c ->
      read_ident (advance lxr) (str ^ String.make 1 c)
    | _ -> (lxr, Token.IDENT str)

  (** Reads a run of decimal digits as an [INTEGER] token. [next_char]
      guarantees [lexer.ch] is a digit on entry, so [int_of_string] on a
      non-empty digit string cannot raise. *)
  let read_number lexer =
    let rec read_num lxr str =
      match lxr.ch with
      | '0' .. '9' as c -> read_num (advance lxr) (str ^ String.make 1 c)
      | _ -> (lxr, Token.INTEGER (int_of_string str))
    in
    read_num lexer ""

  (** [next_char lexer] produces the next token together with the lexer
      state positioned after it. *)
  let next_char lexer =
    match lexer.ch with
    | ':' -> (advance lexer, Token.COLON)
    | ',' -> (advance lexer, Token.COMMA)
    | '{' -> (advance lexer, Token.LBRACE)
    | '}' -> (advance lexer, Token.RBRACE)
    | '"' -> advance lexer |> read_string
    | ' ' -> (advance lexer, Token.SPACE)
    | ('a' .. 'z' | 'A' .. 'Z') as c ->
      read_ident (advance lexer) (String.make 1 c)
    | '0' .. '9' -> read_number lexer
    | '\x00' -> (lexer, Token.EOF)
    | _ -> (advance lexer, Token.ILLEGAL)

  (** [generate_tokens input_string] lexes the whole string, drops SPACE
      tokens, and terminates the result with [EOF]. [tokens] is accumulated
      in reverse, so [rev_append] restores source order. *)
  let generate_tokens input_string =
    let lexer = new_lexer input_string in
    let rec gen lxr tokens =
      match next_char lxr with
      | _, Token.EOF ->
        (* Fixed: structural (<>) replaces physical (!=) inequality — the
           physical test only coincidentally worked for constant
           constructors like SPACE. *)
        List.rev_append
          (List.filter (fun t -> t <> Token.SPACE) tokens)
          [ Token.EOF ]
      | l, tok -> gen l (tok :: tokens)
    in
    gen lexer []
end
|
@ -0,0 +1,31 @@
|
||||
module Token = struct
  (* Every lexical token the hcfg lexer can produce. *)
  type token_type =
    | ILLEGAL
    | EOF
    | SPACE
    (* Identifiers and literals *)
    | IDENT of string
    | INTEGER of int
    | FLOAT of float
    | STRING of string
    (* -- Delimiters *)
    | COLON
    | COMMA
    | LBRACE
    | RBRACE

  (* Human-readable rendering of a token, used by [pretty_print]. *)
  let token_to_string tok =
    match tok with
    | ILLEGAL -> "ILLEGAL"
    | EOF -> "EOF"
    | SPACE -> "SPACE"
    | IDENT a -> Printf.sprintf "IDENT %s" a
    | INTEGER a -> Printf.sprintf "INTEGER %d" a
    | FLOAT a -> Printf.sprintf "FLOAT %s" (string_of_float a)
    | STRING a -> Printf.sprintf "STRING %s" a
    | COLON -> "COLON"
    | COMMA -> "COMMA"
    | LBRACE -> "LBRACE"
    | RBRACE -> "RBRACE"

  (* Formatter hook (e.g. for Alcotest testables). *)
  let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
end
|
@ -0,0 +1,3 @@
|
||||
(test
|
||||
(name test)
|
||||
(libraries alcotest hcfg fmt))
|
@ -0,0 +1,29 @@
|
||||
open Hcfg
|
||||
include Lexer
|
||||
include Token
|
||||
|
||||
(* Alcotest testable for tokens; structural equality compares payloads. *)
let token_testable = Alcotest.testable Token.pretty_print (fun a b -> a = b)
|
||||
|
||||
(* Lexes a small object literal and checks the full token stream,
   including the trailing EOF; SPACE tokens are dropped by the lexer. *)
let test_lexer_delimiters () =
  let expected =
    [
      Token.LBRACE;
      Token.IDENT "key";
      Token.COLON;
      Token.STRING "value";
      Token.COMMA;
      Token.IDENT "other_key";
      Token.COLON;
      Token.INTEGER 100;
      Token.RBRACE;
      Token.EOF;
    ]
  in
  let actual = Lexer.generate_tokens {|{key: "value", other_key: 100}|} in
  Alcotest.(check (list token_testable)) "same token types" expected actual
|
||||
|
||||
(* Test-suite entry point. *)
let () =
  let delimiter_cases =
    [ Alcotest.test_case "first case" `Slow test_lexer_delimiters ]
  in
  Alcotest.run "Lexer" [ ("list-delimiters", delimiter_cases) ]
|
Loading…
Reference in New Issue