feat: basic lexer and test

main
Carsten Kragelund 2023-06-17 03:13:03 +02:00
commit bbbd19eac7
9 changed files with 155 additions and 0 deletions

1
.gitignore vendored

@ -0,0 +1 @@
_build

@ -0,0 +1,3 @@
(lang dune 3.8)
(name hcfg)

@ -0,0 +1,7 @@
opam-version: "2.0.0"
maintainer: "carsten@kragelund.me"
authors: ["Carsten Kragelund"]
license: "MIT"
build: [
["dune" "build" "--only" "hcfg" "--root" "." "-j" jobs "@install"]
]

@ -0,0 +1,3 @@
(library
(public_name hcfg)
(libraries fmt))

@ -0,0 +1,2 @@
module Token = Token
module Lexer = Lexer

@ -0,0 +1,76 @@
let is_alpha = function 'a' .. 'z' | 'A' .. 'Z' -> true | _ -> false
let is_digit = function '0' .. '9' -> true | _ -> false
module Lexer = struct
include Token
type lexer = {
input : string;
position : int;
read_position : int;
ch : char;
}
let null_byte = '\x00'
let advance lexer =
let read_to_end = lexer.read_position >= String.length(lexer.input) in
let new_ch = if read_to_end then null_byte else String.get lexer.input lexer.read_position in
let new_lexer = {lexer with position = lexer.read_position; read_position = lexer.read_position + 1; ch = new_ch} in
new_lexer
let peek_char lexer =
let read_to_end = lexer.read_position >= String.length(lexer.input) - 1 in
if read_to_end then null_byte else String.get lexer.input (lexer.read_position + 1)
let new_lexer input_string =
let lexer = {
input = input_string;
position = 0;
read_position = 0;
ch = null_byte;
} in
advance lexer
let read_string lexer =
let rec read_str lxr str =
match lxr.ch with
| '"' -> (advance lxr, Token.STRING str)
| c -> read_str (advance lxr) (str ^ Char.escaped c) in
read_str lexer ""
let rec read_ident lxr str =
match lxr.ch with
| '0'..'9' | 'a'..'z' | 'A'..'Z' | '_' as c -> read_ident (advance lxr) (str ^ Char.escaped c)
| _ -> (lxr, Token.IDENT str)
let read_number lexer =
let rec read_num lxr str =
match lxr.ch with
| '0'..'9' as c -> read_num (advance lxr) (str ^ Char.escaped c)
| _ -> (lxr, Token.INTEGER (int_of_string str))
in
read_num lexer ""
let next_char lexer = match lexer.ch with
| ':' -> (advance lexer, Token.COLON)
| ',' -> (advance lexer, Token.COMMA)
| '{' -> (advance lexer, Token.LBRACE)
| '}' -> (advance lexer, Token.RBRACE)
| '"' -> advance lexer |> read_string
| ' ' -> (advance lexer, Token.SPACE)
| 'a'..'z' | 'A'..'Z' as c -> read_ident (advance lexer) (Char.escaped c)
| '0'..'9' -> read_number lexer
| '\x00' -> (lexer, Token.EOF)
| _ -> (advance lexer, Token.ILLEGAL)
let generate_tokens input_string =
let lexer = new_lexer input_string in
let rec gen lxr tokens =
match next_char lxr with
| (_, Token.EOF) -> List.rev_append (List.filter (fun t -> t != Token.SPACE) tokens) [Token.EOF]
| (l, tok) -> gen l (tok :: tokens)
in gen lexer []
end

@ -0,0 +1,31 @@
module Token = struct
type token_type =
| ILLEGAL
| EOF
| SPACE
(* Identifiers and literals *)
| IDENT of string
| INTEGER of int
| FLOAT of float
| STRING of string
(* -- Delimiters *)
| COLON
| COMMA
| LBRACE
| RBRACE
let token_to_string = function
| ILLEGAL -> "ILLEGAL"
| EOF -> "EOF"
| SPACE -> "SPACE"
| IDENT a -> "IDENT " ^ a
| INTEGER a -> "INTEGER " ^ string_of_int a
| FLOAT a -> "FLOAT " ^ string_of_float a
| STRING a -> "STRING " ^ a
| COMMA -> "COMMA"
| COLON -> "COLON"
| LBRACE -> "LBRACE"
| RBRACE -> "RBRACE"
let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
end

@ -0,0 +1,3 @@
(test
(name test)
(libraries alcotest hcfg fmt))

@ -0,0 +1,29 @@
open Hcfg
include Lexer
include Token
let token_testable = Alcotest.testable Token.pretty_print (=)
let test_lexer_delimiters () =
Alcotest.(check (list token_testable))
"same token types" [
Token.LBRACE
; Token.IDENT "key"
; Token.COLON
; Token.STRING "value"
; Token.COMMA
; Token.IDENT "other_key"
; Token.COLON
; Token.INTEGER 100
; Token.RBRACE
; Token.EOF
]
(Lexer.generate_tokens {|{key: "value", other_key: 100}|})
let () =
(* Lexer.generate_tokens {|{key: "value",}|} |> List.map Token.token_to_string |> String.concat ", " |> print_endline *)
Alcotest.run "Lexer"
[
( "list-delimiters",
[ Alcotest.test_case "first case" `Slow test_lexer_delimiters ] );
]