in order to improve myself, I would like to know if you find something wrong with the following function or what I can improve in it:
Edit: thanks to @jeffsco, the following function has been modified so that it no longer raises Invalid_argument "index out of bounds" on empty or one-character strings (length <= 1).
(** [camel_case_to_capitalized_snake_case str] converts a CamelCase identifier
    to Capitalized_snake_case.

    A new word starts wherever a character that is not an ASCII uppercase
    letter is immediately followed by an ASCII uppercase letter. The first
    word is capitalized, every following word is lowercased, and the words are
    joined with ["_"]. Runs of uppercase letters and trailing digits stay in
    the same word, e.g. ["PollFD"] gives ["Poll_fd"] and ["DoubleIEEE754"]
    gives ["Double_ieee754"].

    Strings of length 0 or 1 are returned capitalized. *)
let camel_case_to_capitalized_snake_case str =
  let len = String.length str in
  (* [word start stop] is the slice [start..stop] (both inclusive) with the
     casing required by its position in the result. *)
  let word start stop =
    let sub = String.sub str start (stop - start + 1) in
    if start = 0 then String.capitalize_ascii sub
    else String.lowercase_ascii sub
  in
  (* Only real uppercase letters differ from their lowercase form; digits and
     punctuation do not, so they can never open a new word. *)
  let is_upper c = c <> Char.lowercase_ascii c in
  let rec parse start acc index =
    if index + 1 = len then
      (* Last character: flush the pending word and build the result. *)
      String.concat "_" (List.rev (word start index :: acc))
    else if (not (is_upper str.[index])) && is_upper str.[index + 1] then
      (* Word boundary between [index] and [index + 1]. *)
      parse (index + 1) (word start index :: acc) (index + 1)
    else parse start acc (index + 1)
  in
  if len <= 1 then String.capitalize_ascii str else parse 0 [] 0
In fact, it seems that the context in which my function is used is important. This function will be used to convert the names of C structures, unions, enums, GObject objects or interfaces to their corresponding OCaml module names in Capitalized_snake_case. In my case, I think that I should not separate every capitalized character with an underscore. For example:
SList : SList
IConv : IConv.
IOChannel : IOChannel
PollFD : Poll_fd
MemVTable : Mem_vtable
DoubleIEEE754 : Double_ieee754
Please let me know if you have better ideas for the output that my function should generate.
I haven’t looked closely at your implementation but I would try to come up with a set of rules first that capture how to split a camel-cased identifier into words. I probably would implement this using OCamlLex (the scanner generator - based on regular expressions). I understand that you set yourself the constraint not to use regular expressions but I believe they are the right tool for the job here.
I wrote a blog post Recipes for OCamlLex. Edit: and here is a sketch. The idea is to use a scanner to split a string into a list of words. It would now be easy to add special cases, for example for common abbreviations. The file below is camel.mll for OCamlLex.
{
  (* Raised when the scanner meets a character that no rule accepts. *)
  exception Error of string

  (* [error fmt ...] formats a message and raises [Error] carrying it.
     [Printf.ksprintf] is used instead of its deprecated synonym
     [Printf.kprintf]. *)
  let error fmt = Printf.ksprintf (fun msg -> raise (Error msg)) fmt

  (* The text matched by the rule currently being executed. *)
  let get = Lexing.lexeme
}

let digit = ['0'-'9']
let lower = ['a'-'z']
let upper = ['A'-'Z']
(* Not referenced by any rule below, so '_' in the input currently ends up in
   the "illegal character" case. *)
let punct = ['_']

(* [split lexbuf] returns the list of words found in the input, in order.
   A word is one or more uppercase letters followed by any number of
   lowercase letters or digits. Any other character (including a leading
   lowercase letter or digit) raises [Error]. *)
rule split = parse
  | upper+ (lower|digit)* { let word = get lexbuf in word :: split lexbuf }
  | eof { [] }
  | _ { get lexbuf |> error "illegal character '%s'" }

{
  (* [snake_case str] splits [str] into words with [split], keeps the first
     word unchanged, lowercases the others and joins them with "_".
     Raises [Error] when [split] rejects a character of [str]. *)
  let snake_case str =
    Lexing.from_string str
    |> split
    |> ( function
         | [] -> []
         | x :: xs -> x :: List.map String.lowercase_ascii xs
       )
    |> String.concat "_"

  (* Convert every command-line argument (the program name is dropped by
     [List.tl]) and print one result per line. *)
  let main () =
    Array.to_list Sys.argv
    |> List.tl
    |> List.map snake_case
    |> List.iter print_endline

  let () = main (); exit 0
}
You can do what you want in a more functional way (without OCamlLex) like this:
(** [is_lower c] is [true] when [c] is unchanged by ASCII lowercasing. This
    holds for lowercase letters and also for digits, punctuation and any
    other character that has no distinct lowercase form. *)
let is_lower c = Char.equal (Char.lowercase_ascii c) c
(** [next_index s offset] scans [s] from [offset] and returns the index at
    which the word starting at [offset] ends (exclusive).

    The scan first walks over characters for which [is_lower] is [false],
    then over characters for which it is [true], and stops at the first
    character rejected by [is_lower] after at least one accepted character,
    or at the end of the string. An [offset] outside of [s] is returned
    unchanged. *)
let next_index s offset =
  let len = String.length s in
  let rec loop quit idx =
    (* Check the bounds explicitly instead of catching the exception raised
       by [s.[idx]]: [Invalid_argument] denotes a programming error, a
       catch-all handler would hide unrelated failures, and bounds checks
       disappear altogether when compiling with [-unsafe]. *)
    if idx < 0 || idx >= len then idx
    else if is_lower s.[idx] then loop true (succ idx)
    else if quit then idx
    else loop false (succ idx)
  in
  loop false offset
(** [decompose s] cuts [s] into consecutive words, using [next_index] to find
    where each word ends. The words are returned in their order of appearance
    in [s]; the result is empty when [next_index] does not advance from
    position 0. *)
let decompose s =
  let rec collect pieces pos =
    let stop = next_index s pos in
    (* No progress means there is nothing left to cut. *)
    if stop - pos = 0 then List.rev pieces
    else collect (String.sub s pos (stop - pos) :: pieces) stop
  in
  collect [] 0
(** [camel_case_to_capitalized_snake_case s] splits [s] with [decompose],
    keeps the first word as it is, lowercases every other word and joins
    them with ["_"]. *)
let camel_case_to_capitalized_snake_case s =
  match decompose s with
  | [] -> String.concat "_" []
  | head :: tail ->
    String.concat "_" (head :: List.map String.lowercase_ascii tail)
(* Toplevel check: run the conversion on a list of sample identifiers. *)
[ "SList" ; "IConv"; "IOChannel"; "PollFD"; "MemVTable"; "DoubleIEEE754"]
|> List.map camel_case_to_capitalized_snake_case;;
- : string list = ["SList"; "IConv"; "IOChannel"; "Poll_fd"; "Mem_vtable"; "Double_ieee754"]
Thanks for taking the time to write this little example. Could you add information about how to use the lexer in a lib? For example, it was not obvious to me that I just needed to add this part in a lib/lexer.ml file:
{
  (* Raised when the scanner meets a character that no rule accepts. *)
  exception Error of string

  (* [error fmt ...] formats a message and raises [Error] carrying it.
     [Printf.ksprintf] is used instead of its deprecated synonym
     [Printf.kprintf]. *)
  let error fmt = Printf.ksprintf (fun msg -> raise (Error msg)) fmt

  (* The text matched by the rule currently being executed. *)
  let get = Lexing.lexeme
}

let digit = ['0'-'9']
let lower = ['a'-'z']
let upper = ['A'-'Z']
(* Not referenced by any rule below, so '_' in the input currently ends up in
   the "illegal character" case. *)
let punct = ['_']

(* [split lexbuf] returns the list of words found in the input, in order.
   A word is one or more uppercase letters followed by any number of
   lowercase letters or digits. Any other character (including a leading
   lowercase letter or digit) raises [Error]. *)
rule split = parse
  | upper+ (lower|digit)* { let word = get lexbuf in word :: split lexbuf }
  | eof { [] }
  | _ { get lexbuf |> error "illegal character '%s'" }

{
  (* [snake_case str] splits [str] into words with [split], keeps the first
     word unchanged, lowercases the others and joins them with "_".
     Raises [Error] when [split] rejects a character of [str]. *)
  let snake_case str =
    Lexing.from_string str
    |> split
    |> ( function
         | [] -> []
         | x :: xs -> x :: List.map String.lowercase_ascii xs
       )
    |> String.concat "_"
}
Out-of-bounds errors raise Invalid_argument. In OCaml, these exceptions are not meant to be caught; they denote a programming error.
Thanks for the clarification. When you say these exceptions denote a programming error, do you mean that a program should never raise such an exception, and that it is the caller's duty to ensure that the callee will never raise an Invalid_argument exception? Is this a tacit convention among OCaml programmers?
I tried to compile my code with -unsafe and -unsafe-string. In the first case I got an uncaught Invalid_argument exception, but from String.sub: I suppose it came from its use in decompose, but I don’t understand why. In the second case there is no problem. In both cases, I never saw a segfault.
I replaced the next_index function with:
(** [next_index s offset] returns the index at which the word of [s] starting
    at [offset] ends (exclusive). The scan walks over characters rejected by
    [is_lower], then over characters accepted by it, and stops at the first
    rejected character that follows an accepted one, or at the end of [s]. *)
let next_index s offset =
  let n = String.length s in
  (* [seen_lower] records whether a character accepted by [is_lower] has
     already been met in the current word. *)
  let rec scan seen_lower i =
    if i >= n then i
    else if is_lower s.[i] then scan true (i + 1)
    else if seen_lower then i
    else scan false (i + 1)
  in
  scan false offset
and it works fine with both -unsafe and -unsafe-string options.