feat: basic lexer and test
commit
bbbd19eac7
@ -0,0 +1 @@
|
||||
_build
|
@ -0,0 +1,3 @@
|
||||
(lang dune 3.8)
|
||||
|
||||
(name hcfg)
|
@ -0,0 +1,7 @@
|
||||
opam-version: "2.0"
|
||||
maintainer: "carsten@kragelund.me"
|
||||
authors: ["Carsten Kragelund"]
|
||||
license: "MIT"
|
||||
build: [
|
||||
["dune" "build" "--only" "hcfg" "--root" "." "-j" jobs "@install"]
|
||||
]
|
@ -0,0 +1,2 @@
|
||||
module Token = Token
|
||||
module Lexer = Lexer
|
@ -0,0 +1,76 @@
|
||||
(* [is_alpha c] is [true] exactly for ASCII letters. *)
let is_alpha c = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
|
||||
(* [is_digit c] is [true] exactly for ASCII decimal digits. *)
let is_digit c = '0' <= c && c <= '9'
|
||||
|
||||
module Lexer = struct
  include Token

  (* Immutable lexer state over [input]. [ch] is the character at
     [position]; [read_position] indexes the next character to consume. *)
  type lexer = {
    input : string;
    position : int;
    read_position : int;
    ch : char;
  }

  (* Sentinel meaning "no more input". *)
  let null_byte = '\x00'

  (** [advance lexer] consumes one character: [ch] becomes the character at
      [read_position], or [null_byte] once past the end of [input]. *)
  let advance lexer =
    let read_to_end = lexer.read_position >= String.length lexer.input in
    let new_ch =
      if read_to_end then null_byte
      else String.get lexer.input lexer.read_position
    in
    { lexer with
      position = lexer.read_position;
      read_position = lexer.read_position + 1;
      ch = new_ch }

  (** [peek_char lexer] returns the next unconsumed character without
      advancing, or [null_byte] at end of input.
      Fixed: the previous version read [read_position + 1] (the character
      *after* next) with a [length - 1] bound check; since [advance] sets
      [ch = input.[position]] and [read_position = position + 1], the next
      unconsumed character is [input.[read_position]]. *)
  let peek_char lexer =
    if lexer.read_position >= String.length lexer.input then null_byte
    else String.get lexer.input lexer.read_position

  (** [new_lexer input_string] builds a lexer positioned on the first
      character of [input_string]. *)
  let new_lexer input_string =
    let lexer = {
      input = input_string;
      position = 0;
      read_position = 0;
      ch = null_byte;
    } in
    advance lexer

  (** Reads a string-literal body up to (and consuming) the closing ['"'].
      Fixed: an unterminated literal previously looped forever, because at
      end of input [ch] stays [null_byte] and was appended indefinitely;
      it now stops at end of input and returns what was read. Characters
      are accumulated verbatim in a [Buffer] (the old [str ^ Char.escaped c]
      both mangled backslashes and was O(n^2)). *)
  let read_string lexer =
    let buf = Buffer.create 16 in
    let rec read_str lxr =
      match lxr.ch with
      | '"' -> (advance lxr, Token.STRING (Buffer.contents buf))
      | '\x00' -> (lxr, Token.STRING (Buffer.contents buf))
      | c ->
        Buffer.add_char buf c;
        read_str (advance lxr)
    in
    read_str lexer

  (** Accumulates an identifier ([A-Za-z0-9_]*) onto [str] and returns the
      lexer positioned on the first non-identifier character. *)
  let rec read_ident lxr str =
    match lxr.ch with
    | ('0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' | '_') as c ->
      read_ident (advance lxr) (str ^ String.make 1 c)
    | _ -> (lxr, Token.IDENT str)

  (** Reads a run of decimal digits as an [INTEGER] token. [next_char]
      guarantees [lexer.ch] is a digit on entry, so [int_of_string] on a
      non-empty digit string cannot raise. *)
  let read_number lexer =
    let rec read_num lxr str =
      match lxr.ch with
      | '0' .. '9' as c -> read_num (advance lxr) (str ^ String.make 1 c)
      | _ -> (lxr, Token.INTEGER (int_of_string str))
    in
    read_num lexer ""

  (** [next_char lexer] produces the next token together with the lexer
      state positioned after it. *)
  let next_char lexer =
    match lexer.ch with
    | ':' -> (advance lexer, Token.COLON)
    | ',' -> (advance lexer, Token.COMMA)
    | '{' -> (advance lexer, Token.LBRACE)
    | '}' -> (advance lexer, Token.RBRACE)
    | '"' -> advance lexer |> read_string
    | ' ' -> (advance lexer, Token.SPACE)
    | ('a' .. 'z' | 'A' .. 'Z') as c ->
      read_ident (advance lexer) (String.make 1 c)
    | '0' .. '9' -> read_number lexer
    | '\x00' -> (lexer, Token.EOF)
    | _ -> (advance lexer, Token.ILLEGAL)

  (** [generate_tokens input_string] lexes the whole string, drops SPACE
      tokens, and terminates the result with [EOF]. [tokens] is accumulated
      in reverse, so [rev_append] restores source order. *)
  let generate_tokens input_string =
    let lexer = new_lexer input_string in
    let rec gen lxr tokens =
      match next_char lxr with
      | _, Token.EOF ->
        (* Fixed: structural (<>) replaces physical (!=) inequality — the
           physical test only coincidentally worked for constant
           constructors like SPACE. *)
        List.rev_append
          (List.filter (fun t -> t <> Token.SPACE) tokens)
          [ Token.EOF ]
      | l, tok -> gen l (tok :: tokens)
    in
    gen lexer []
end
|
@ -0,0 +1,31 @@
|
||||
module Token = struct
  (* Every lexical token the hcfg lexer can produce. *)
  type token_type =
    | ILLEGAL
    | EOF
    | SPACE
    (* Identifiers and literals *)
    | IDENT of string
    | INTEGER of int
    | FLOAT of float
    | STRING of string
    (* -- Delimiters *)
    | COLON
    | COMMA
    | LBRACE
    | RBRACE

  (* Human-readable rendering of a token, used by [pretty_print]. *)
  let token_to_string tok =
    match tok with
    | ILLEGAL -> "ILLEGAL"
    | EOF -> "EOF"
    | SPACE -> "SPACE"
    | IDENT a -> Printf.sprintf "IDENT %s" a
    | INTEGER a -> Printf.sprintf "INTEGER %d" a
    | FLOAT a -> Printf.sprintf "FLOAT %s" (string_of_float a)
    | STRING a -> Printf.sprintf "STRING %s" a
    | COLON -> "COLON"
    | COMMA -> "COMMA"
    | LBRACE -> "LBRACE"
    | RBRACE -> "RBRACE"

  (* Formatter hook (e.g. for Alcotest testables). *)
  let pretty_print ppf tok = Fmt.pf ppf "Token %s" (token_to_string tok)
end
|
@ -0,0 +1,3 @@
|
||||
(test
|
||||
(name test)
|
||||
(libraries alcotest hcfg fmt))
|
@ -0,0 +1,29 @@
|
||||
open Hcfg
|
||||
include Lexer
|
||||
include Token
|
||||
|
||||
(* Alcotest testable for tokens; structural equality compares payloads. *)
let token_testable = Alcotest.testable Token.pretty_print (fun a b -> a = b)
|
||||
|
||||
(* Lexes a small object literal and checks the full token stream,
   including the trailing EOF; SPACE tokens are dropped by the lexer. *)
let test_lexer_delimiters () =
  let expected =
    [
      Token.LBRACE;
      Token.IDENT "key";
      Token.COLON;
      Token.STRING "value";
      Token.COMMA;
      Token.IDENT "other_key";
      Token.COLON;
      Token.INTEGER 100;
      Token.RBRACE;
      Token.EOF;
    ]
  in
  let actual = Lexer.generate_tokens {|{key: "value", other_key: 100}|} in
  Alcotest.(check (list token_testable)) "same token types" expected actual
|
||||
|
||||
(* Test-suite entry point. *)
let () =
  let delimiter_cases =
    [ Alcotest.test_case "first case" `Slow test_lexer_delimiters ]
  in
  Alcotest.run "Lexer" [ ("list-delimiters", delimiter_cases) ]
|
Loading…
Reference in New Issue