It shouldn’t make a difference to the timing. On *nix systems at least the pipe is established straightaway so “time” ends up running more or less immediately wherever you put it in the pipe. The other statistics – other than the clock – however may be more useful if associated to the exe in question though.
That’s interesting. I’m compiling the OCaml programme under version 4.13.1. What about you, and might that make a difference?
4.14, alpine musl (but mawk links to glibc because nix). Heard there were nice perf improvements to the gc on this release (they added prefetching, was it this or 4.13?)
I’ve made an over-optimised OCaml version. On my machine it is more than three times as fast as the mawk version, although you only see it if you first uncompress the data (otherwise the cost of bzcat
dominates everything else). Of course, I’ve cheated a bit (I don’t even build maps/dictionaries), but otherwise it’s very reasonable. The only place where I use mutable state is for printing.
Here is the code (it needs OCaml 4.14 for a few library functions):
module Parser (Input : sig val input : string end) = struct
let s = Input.input
let len = String.length s
let[@inline] get pos =
String.unsafe_get s pos
let rec index_rec lim i c =
if i >= lim then raise Not_found else
if get i = c then i else index_rec lim (i + 1) c
let sub_matches pos reference =
let len_ref = String.length reference in
let rec aux i =
if i = len_ref then true
else if get (i + pos) <> String.unsafe_get reference i then false
else aux (i + 1)
in len >= len_ref + pos && aux 0
let print_buf = Buffer.create 64
let print_game
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
=
Buffer.clear print_buf;
if white_found then Buffer.add_substring print_buf s white_start white_length;
Buffer.add_char print_buf ',';
if black_found then Buffer.add_substring print_buf s black_start black_length;
Buffer.add_char print_buf ',';
if time_control_found then Buffer.add_substring print_buf s time_control_start time_control_length;
Buffer.add_char print_buf ',';
if result_found then Buffer.add_substring print_buf s result_start result_length;
Buffer.add_char print_buf ',';
if utcdate_found then Buffer.add_substring print_buf s utcdate_start utcdate_length;
Buffer.add_char print_buf ',';
if white_elo_found then Buffer.add_substring print_buf s white_elo_start white_elo_length;
Buffer.add_char print_buf ',';
if black_elo_found then Buffer.add_substring print_buf s black_elo_start black_elo_length;
Buffer.add_char print_buf '\n';
Buffer.output_buffer stdout print_buf;
()
let parse () =
let rec parse_game current_pos =
parse_lines current_pos
false 0 0
false 0 0
false 0 0
false 0 0
false 0 0
false 0 0
false 0 0
and parse_lines line_start
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
=
let[@local] finish_game line_end =
print_game
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length;
parse_game (line_end + 1)
in
let[@local] ignore_line line_end =
parse_lines (line_end + 1)
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
in
let[@local] parse_line line_end =
match get line_start with
| '[' ->
let pos = line_start + 1 in
let first_space = index_rec line_end line_start ' ' in
if first_space = pos + 5 &&
sub_matches pos "White"
then
parse_lines line_end
true (first_space + 2) (line_end - first_space - 3)
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 5 &&
sub_matches pos "Black"
then
parse_lines line_end
white_found white_start white_length
true (first_space + 2) (line_end - first_space - 3)
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 10 &&
sub_matches pos "TimeControl"
then
parse_lines line_end
white_found white_start white_length
black_found black_start black_length
true (first_space + 2) (line_end - first_space - 3)
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 6 &&
sub_matches pos "Result"
then
parse_lines line_end
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
true (first_space + 2) (line_end - first_space - 3)
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 7 &&
sub_matches pos "UTCDate"
then
parse_lines line_end
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
true (first_space + 2) (line_end - first_space - 3)
white_elo_found white_elo_start white_elo_length
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 8 &&
sub_matches pos "WhiteElo"
then
parse_lines line_end
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
true (first_space + 2) (line_end - first_space - 3)
black_elo_found black_elo_start black_elo_length
else if first_space = pos + 8 &&
sub_matches pos "BlackElo"
then
parse_lines line_end
white_found white_start white_length
black_found black_start black_length
time_control_found time_control_start time_control_length
result_found result_start result_length
utcdate_found utcdate_start utcdate_length
white_elo_found white_elo_start white_elo_length
true (first_space + 2) (line_end - first_space - 3)
else ignore_line line_end
| _ -> finish_game line_end
in
if line_start >= len then ()
else match index_rec len line_start '\n' with
| exception Not_found ->
(* End of file: if the file does not end with a newline
character then handle the last line *)
if line_start < len - 1 then parse_line len else ()
| line_end ->
if line_end = line_start then
(* Empty line: ignore *)
ignore_line line_end
else
parse_line line_end
in
parse_game 0
end
let input = In_channel.input_all stdin
module M = (Parser[@inlined])(struct let input = input end)
let () = M.parse ()
And here are the performance numbers I get:
# bzcat lichess_db_standard_rated_2013-01.pgn.bz2 > uncompressed
# time mawk -f parse.awk < uncompressed | wc -l
121332
real 0m1.063s
user 0m1.075s
sys 0m0.016s
# time ./parse.ocaml < uncompressed | wc -l
121332
real 0m0.289s
user 0m0.253s
sys 0m0.044s
huh i guess bzcat was the bottleneck in both your ridiculously optimized and normal version. Normal ocaml version is already twice as fast as mawk on my machine when the input is uncompressed ahead of time!
The things we do in the name of science Thank you for the experiment, its quite useful to see where the performance is.
It turns out the compiler version matters a lot. My original results were under 4.11.1 it turns out. I switched to 4.14. In the numbers below I also removed bzcat from the mix by uncompressing the file. The scripts process one line at a time, so the rate at which the lines are fed does not matter but it does mean that there is a bzcat attributable constant to remove. Much easier to just remove bzcat Anyway, results on 4.14
awk
real 0m0.642s
user 0m0.667s
sys 0m0.097s
ocaml
real 0m0.381s
user 0m0.308s
sys 0m0.277s
I’ve very very pleased by this result – about twice as fast as my reference implementation. Exactly what I was hoping for. Thank you all for the help getting here.
I guess this is why, once people invested enough time on working on the optimizer middle-end within the compiler, we don’t allow them to write code anymore
Huh, I don’t see this speedup. I uncompressed the input, then sent the output to /dev/null, and mawk runs in .99sec, ocaml in 2.25sec. I compiled with “-O3”, and of course with ocamlopt. I wonder what else is different? I compiled thus:
ocamlfind ocamlopt -O3 -package re2 -linkall -linkpkg -o lichess lichess.ml
Did you do something different?
So I was running it on 4.11.1 before. I switched to 4.14.0 – that’s what made all the difference. Otherwise the parse.awk code is as above.
The latest OCaml code is:
module StringMap = Map.Make(String)
let to_pairs x =
let n = String.length x in
match String.sub x 0 1 with
| exception _ -> None
| "[" ->
(match String.index_from_opt x 2 ' ' with
| Some i -> Some (String.sub x 1 (i-1), String.sub x (i+2) (n-i-4))
| None -> None)
| "1" -> Some ("Game", "")
| _ -> None
let to_csv m =
let f k = StringMap.find k m in
Printf.printf "%s,%s,%s,%s,%s,%s,%s\n"
(f "White") (f "Black") (f "TimeControl")
(f "Result") (f "UTCDate") (f "WhiteElo")
(f "BlackElo")
let () =
let rec loop m = match read_line () with
| exception End_of_file -> ()
| line ->
match to_pairs line with
| None -> loop m
| Some ("Game",_) -> (to_csv m; loop StringMap.empty)
| Some (k, v) -> loop (StringMap.add k v m)
in
loop StringMap.empty
Aha, you switched from Re to straight string-ops. Interesting. I’ve always found that pcre is quite competitive with direct string-manipulation. Wonder why Re2 wasn’t (or whether pcre wouldn’t be competitive in this case).
That’s because we’re comparing apples to oranges here. One version just looks for a space and splits. The other version (the original version with regular expressions) looks for a moderately complex regular expression. I think that if you implemented the space split with a regular expression it wouldn’t be much more expensive than the pure String version, but it seems overkill to use a regular expression for that.