feat: basic lexer and test

2023-06-17 03:13:03 +02:00 · 2023-06-17 03:13:03 +02:00 · bbbd19eac7
commit bbbd19eac7
9 changed files with 155 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 _build
--- a/3
+++ b/3
@ -0,0 +1,3 @@
 (lang dune 3.8)
 (name hcfg)
--- a/hcfg.opam
+++ b/hcfg.opam
@ -0,0 +1,7 @@
 opam-version: "2.0.0"
 maintainer: "carsten@kragelund.me"
 authors: ["Carsten Kragelund"]
 license: "MIT"
 build: [
  ["dune" "build" "--only" "hcfg" "--root" "." "-j" jobs "@install"]
 ]
--- a/src/dune
+++ b/src/dune
@ -0,0 +1,3 @@
 (library
 (public_name hcfg)
 (libraries fmt))
--- a/src/hcfg.ml
+++ b/src/hcfg.ml
@ -0,0 +1,2 @@
 module Token = Token
 module Lexer = Lexer
--- a/src/lexer.ml
+++ b/src/lexer.ml
@ -0,0 +1,76 @@
 let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
 let is_digit = function '0' .. '9' -> true | _ -> false
 module Lexer = struct
  include Token
  type lexer = {
    input : string;
    position : int;
    read_position : int;
    ch : char;
  }
  let null_byte = '\x00'
  let advance lexer =
    let read_to_end = lexer.read_position >= String.length(lexer.input) in
    let new_ch = if read_to_end then null_byte else String.get lexer.input lexer.read_position in
    let new_lexer = {lexer with position = lexer.read_position; read_position = lexer.read_position + 1; ch = new_ch} in
    new_lexer
  let peek_char lexer =
    let read_to_end = lexer.read_position >= String.length(lexer.input) - 1 in
    if read_to_end then null_byte else String.get lexer.input (lexer.read_position + 1)
  let new_lexer input_string =
    let lexer = {
      input = input_string;
      position = 0;
      read_position = 0;
      ch = null_byte;
    } in
    advance lexer
  let read_string lexer =
    let rec read_str lxr str =
      match lxr.ch with
      | '"' -> (advance lxr, Token.STRING str)
      | c -> read_str (advance lxr) (str ^ Char.escaped c) in
    read_str lexer ""
  let rec read_ident lxr str = 
    match lxr.ch with
    | '0'..'9' | 'a'..'z' | 'A'..'Z' | '_' as c -> read_ident (advance lxr) (str ^ Char.escaped c)
    | _ -> (lxr, Token.IDENT str)
  let read_number lexer =
    let rec read_num lxr str =
      match lxr.ch with
      | '0'..'9' as c -> read_num (advance lxr) (str ^ Char.escaped c)
      | _ -> (lxr, Token.INTEGER (int_of_string str))
    in
    read_num lexer ""
  let next_char lexer = match lexer.ch with
    | ':' -> (advance lexer, Token.COLON)
    | ',' -> (advance lexer, Token.COMMA)
    | '{' -> (advance lexer, Token.LBRACE)
    | '}' -> (advance lexer, Token.RBRACE)
    | '"' -> advance lexer |> read_string
    | ' ' -> (advance lexer, Token.SPACE)
    | 'a'..'z' | 'A'..'Z' as c -> read_ident (advance lexer) (Char.escaped c)
    | '0'..'9' -> read_number lexer
    | '\x00' -> (lexer, Token.EOF)
    | _ -> (advance lexer, Token.ILLEGAL)
  let generate_tokens input_string =
    let lexer = new_lexer input_string in
    let rec gen lxr tokens =
      match next_char lxr with
      | (_, Token.EOF) -> List.rev_append (List.filter (fun t -> t != Token.SPACE) tokens) [Token.EOF]
      | (l, tok) -> gen l (tok :: tokens)
    in gen lexer []
 end
--- a/src/token.ml
+++ b/src/token.ml
@ -0,0 +1,31 @@
 module Token = struct
  type token_type =
    | ILLEGAL
    | EOF
    | SPACE
    (* Identifiers and literals *)
    | IDENT of string
    | INTEGER of int
    | FLOAT of float
    | STRING of string
    (* -- Delimiters *)
    | COLON
    | COMMA
    | LBRACE
    | RBRACE
  let token_to_string = function
    | ILLEGAL -> "ILLEGAL"
    | EOF -> "EOF"
    | SPACE -> "SPACE"
    | IDENT a -> "IDENT " ^ a
    | INTEGER a -> "INTEGER " ^ string_of_int a
    | FLOAT a -> "FLOAT " ^ string_of_float a
    | STRING a -> "STRING " ^ a
    | COMMA -> "COMMA"
    | COLON -> "COLON"
    | LBRACE -> "LBRACE"
    | RBRACE -> "RBRACE"
  let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
 end
--- a/test/dune
+++ b/test/dune
@ -0,0 +1,3 @@
 (test
 (name test)
 (libraries alcotest hcfg fmt))
--- a/test/test.ml
+++ b/test/test.ml
@ -0,0 +1,29 @@
 open Hcfg
 include Lexer
 include Token
 let token_testable = Alcotest.testable Token.pretty_print (=)
 let test_lexer_delimiters () =
  Alcotest.(check (list token_testable))
    "same token types" [
        Token.LBRACE
      ; Token.IDENT "key"
      ; Token.COLON
      ; Token.STRING "value"
      ; Token.COMMA
      ; Token.IDENT "other_key"
      ; Token.COLON
      ; Token.INTEGER 100
      ; Token.RBRACE
      ; Token.EOF
      ]
      (Lexer.generate_tokens {|{key: "value", other_key: 100}|})
 let () =
  (* Lexer.generate_tokens {|{key: "value",}|} |> List.map Token.token_to_string |> String.concat ", " |> print_endline *)
  Alcotest.run "Lexer"
    [
      ( "list-delimiters",
        [ Alcotest.test_case "first case" `Slow test_lexer_delimiters ] );
    ]
		`@ -0,0 +1,2 @@`
							`module Token = Token`
							`module Lexer = Lexer`