I wrote an awk script a little while back to convert some chess data to CSV. It handles about 160 MB of chess PGN data, piped in via stdin, in about 1.9 seconds. I built an equivalent OCaml version yesterday and stripped it down to more or less the same functionality (the awk version does a bit more) so that I could compare performance. I’ve included the code below.
I’m running it as native code (built with dune), piping in the same file. It processes the file in 2.7 seconds versus 1.9 seconds with AWK (run using mawk). I was really hoping to be able to substantially beat the awk version.
Is there something that stands out that I can do to improve this code in order to beat my awk version? I’d like to keep it immutable though please.
OCaml version:
(* Persistent map with string keys; used to collect the tags of one game. *)
module StringMap = Map.Make (String)

(* Group 1: a run of ASCII letters. Group 2: the non-quote characters that
   follow the space and the opening double quote after it. *)
let re_pairs = Re2.create_exn {|([A-Za-z]+) "([^"]+)|}

(* A line containing at least one character. *)
let re_line = Re2.create_exn {|^(.+)$|}
(* [to_pairs x] classifies one input line [x].

   - If [re_pairs] matches, returns [Some (k, v)] where [k] and [v] are
     capture groups 1 and 2 (a group that did not participate becomes [""]).
   - Otherwise, if [re_line] matches, returns [Some ("Game", s)] where [s] is
     the text matched by [re_line].
   - Otherwise returns [None].

   "No match" is handled as an [Error _] value from the result-returning Re2
   functions, so this function no longer contains catch-all [with _]
   handlers. *)
let to_pairs x =
  match Re2.first_match re_pairs x with
  | Ok match_ ->
    let group i =
      Option.value (Re2.Match.get match_ ~sub:(`Index i)) ~default:""
    in
    Some (group 1, group 2)
  | Error _ ->
    (* Not a tag line: any other matching line is reported under "Game". *)
    (match Re2.find_first re_line x with
     | Ok line -> Some ("Game", line)
     | Error _ -> None)
(* [to_map ?m f] calls [f ()] repeatedly and folds the results into a map,
   starting from [m] (empty by default).

   - [None] is ignored and reading continues.
   - [Some (k, v)] with [k <> "Game"] binds [k] to [v] and reading continues.
   - [Some ("Game", v)] binds ["Game"] to [v] and returns the map.

   Any exception raised by [f] propagates to the caller. *)
let to_map ?(m = StringMap.empty) f =
  let rec collect acc =
    match f () with
    | Some ("Game", moves) -> StringMap.add "Game" moves acc
    | Some (tag, value) -> collect (StringMap.add tag value acc)
    | None -> collect acc
  in
  collect m
(* [to_csv m] writes one CSV row to stdout with the values bound in [m] to
   "White", "Black", "TimeControl", "Result", "UTCDate", "WhiteElo" and
   "BlackElo", in that order, followed by a newline.

   A key that is absent from [m] is written as an empty field instead of
   raising [Not_found], so a game that lacks one of the tags does not abort
   the whole run. Values are written as-is; no CSV quoting is applied. *)
let to_csv m =
  let field k = Option.value (StringMap.find_opt k m) ~default:"" in
  let columns =
    [ "White"; "Black"; "TimeControl"; "Result"; "UTCDate"; "WhiteElo";
      "BlackElo" ]
  in
  (* Plain string output: no format-string interpretation per row, and no
     flush per line. *)
  print_string (String.concat "," (List.map field columns));
  print_char '\n'
(* [parse ()] reads stdin line by line, building one map per game with
   [to_map] and writing it with [to_csv], until [read_line] raises
   [End_of_file]. Tags collected for an unfinished game at end of input are
   discarded.

   The [End_of_file] handler covers only the [to_map read] call. The
   recursive call therefore sits in tail position, and the stack does not
   grow with the number of games (inside a [try] body it would not be a tail
   call). *)
let rec parse () =
  let read () = to_pairs (read_line ()) in
  match to_map read with
  | game ->
    to_csv game;
    parse ()
  | exception End_of_file -> ()

(* Program entry point. *)
let () = parse ()
AWK reference version:
# Runs for every input line. The text between the first character and the
# first space is taken as the tag name, and the text after that space as its
# value, with double quotes, closing brackets and dots removed.
# When the "Termination" tag is seen, one CSV row is printed from the tags
# collected so far and the collection is cleared.
{
    sep = index($0, " ")
    tag = substr($0, 2, sep - 2)
    val = substr($0, sep + 1)
    gsub(/["\]\.]/, "", val)
    tags[tag] = val

    if (tag == "Termination") {
        # Turn "+" and "-" in the time control into commas.
        clock = tags["TimeControl"]
        gsub(/[\+\-]/, ",", clock)

        # Map the result string to 1 / -1, anything else to 0.
        outcome = tags["Result"]
        if (outcome == "1-0")
            score = "1"
        else if (outcome == "0-1")
            score = "-1"
        else
            score = "0"

        printf "%s,%s,%s,%s,%s,%s,%s\n",
            tags["White"], tags["Black"], clock, score,
            tags["UTCDate"], tags["WhiteElo"], tags["BlackElo"]

        delete tags
    }
}