Can performance be improved for this code?

It shouldn’t make a difference to the timing. On *nix systems at least the pipe is established straightaway so “time” ends up running more or less immediately wherever you put it in the pipe. The other statistics – other than the clock – however may be more useful if associated to the exe in question though.

That’s interesting. I’m compiling the OCaml programme under version 4.13.1. What about you, and might that make a difference?

4.14, alpine musl (but mawk links to glibc because nix). Heard there were nice perf improvements to the gc on this release (they added prefetching, was it this or 4.13?)

I’ve made an over-optimised OCaml version. On my machine it is more than three times as fast as the mawk version, although you only see it if you first uncompress the data (otherwise the cost of bzcat dominates everything else). Of course, I’ve cheated a bit (I don’t even build maps/dictionaries), but otherwise it’s very reasonable. The only place where I use mutable state is for printing.
Here is the code (it needs OCaml 4.14 for a few library functions):

module Parser (Input : sig val input : string end) = struct
  let s = Input.input
  let len = String.length s

  let[@inline] get pos =
    String.unsafe_get s pos

  let rec index_rec lim i c =
    if i >= lim then raise Not_found else
    if get i = c then i else index_rec lim (i + 1) c

  let sub_matches pos reference =
    let len_ref = String.length reference in
    let rec aux i =
      if i = len_ref then true
      else if get (i + pos) <> String.unsafe_get reference i then false
      else aux (i + 1)
    in len >= len_ref + pos && aux 0

  let print_buf = Buffer.create 64

  let print_game
      white_found white_start white_length
      black_found black_start black_length
      time_control_found time_control_start time_control_length
      result_found result_start result_length
      utcdate_found utcdate_start utcdate_length
      white_elo_found white_elo_start white_elo_length
      black_elo_found black_elo_start black_elo_length
    =
    Buffer.clear print_buf;
    if white_found then Buffer.add_substring print_buf s white_start white_length;
    Buffer.add_char print_buf ',';
    if black_found then Buffer.add_substring print_buf s black_start black_length;
    Buffer.add_char print_buf ',';
    if time_control_found then Buffer.add_substring print_buf s time_control_start time_control_length;
    Buffer.add_char print_buf ',';
    if result_found then Buffer.add_substring print_buf s result_start result_length;
    Buffer.add_char print_buf ',';
    if utcdate_found then Buffer.add_substring print_buf s utcdate_start utcdate_length;
    Buffer.add_char print_buf ',';
    if white_elo_found then Buffer.add_substring print_buf s white_elo_start white_elo_length;
    Buffer.add_char print_buf ',';
    if black_elo_found then Buffer.add_substring print_buf s black_elo_start black_elo_length;
    Buffer.add_char print_buf '\n';
    Buffer.output_buffer stdout print_buf;
    ()

  let parse () =
    let rec parse_game current_pos =
      parse_lines current_pos
        false 0 0
        false 0 0
        false 0 0
        false 0 0
        false 0 0
        false 0 0
        false 0 0
    and parse_lines line_start
        white_found white_start white_length
        black_found black_start black_length
        time_control_found time_control_start time_control_length
        result_found result_start result_length
        utcdate_found utcdate_start utcdate_length
        white_elo_found white_elo_start white_elo_length
        black_elo_found black_elo_start black_elo_length
      =
      let[@local] finish_game line_end =
        print_game
          white_found white_start white_length
          black_found black_start black_length
          time_control_found time_control_start time_control_length
          result_found result_start result_length
          utcdate_found utcdate_start utcdate_length
          white_elo_found white_elo_start white_elo_length
          black_elo_found black_elo_start black_elo_length;
        parse_game (line_end + 1)
      in
      let[@local] ignore_line line_end =
        parse_lines (line_end + 1)
          white_found white_start white_length
          black_found black_start black_length
          time_control_found time_control_start time_control_length
          result_found result_start result_length
          utcdate_found utcdate_start utcdate_length
          white_elo_found white_elo_start white_elo_length
          black_elo_found black_elo_start black_elo_length
      in
      let[@local] parse_line line_end =
        match get line_start with
        | '[' ->
          let pos = line_start + 1 in
          let first_space = index_rec line_end line_start ' ' in
          if first_space = pos + 5 &&
             sub_matches pos "White"
          then
            parse_lines line_end
              true (first_space + 2) (line_end - first_space - 3)
              black_found black_start black_length
              time_control_found time_control_start time_control_length
              result_found result_start result_length
              utcdate_found utcdate_start utcdate_length
              white_elo_found white_elo_start white_elo_length
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 5 &&
                  sub_matches pos "Black"
          then
            parse_lines line_end
              white_found white_start white_length
              true (first_space + 2) (line_end - first_space - 3)
              time_control_found time_control_start time_control_length
              result_found result_start result_length
              utcdate_found utcdate_start utcdate_length
              white_elo_found white_elo_start white_elo_length
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 10 &&
                  sub_matches pos "TimeControl"
          then
            parse_lines line_end
              white_found white_start white_length
              black_found black_start black_length
              true (first_space + 2) (line_end - first_space - 3)
              result_found result_start result_length
              utcdate_found utcdate_start utcdate_length
              white_elo_found white_elo_start white_elo_length
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 6 &&
                  sub_matches pos "Result"
          then
            parse_lines line_end
              white_found white_start white_length
              black_found black_start black_length
              time_control_found time_control_start time_control_length
              true (first_space + 2) (line_end - first_space - 3)
              utcdate_found utcdate_start utcdate_length
              white_elo_found white_elo_start white_elo_length
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 7 &&
                  sub_matches pos "UTCDate"
          then
            parse_lines line_end
              white_found white_start white_length
              black_found black_start black_length
              time_control_found time_control_start time_control_length
              result_found result_start result_length
              true (first_space + 2) (line_end - first_space - 3)
              white_elo_found white_elo_start white_elo_length
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 8 &&
                  sub_matches pos "WhiteElo"
          then
            parse_lines line_end
              white_found white_start white_length
              black_found black_start black_length
              time_control_found time_control_start time_control_length
              result_found result_start result_length
              utcdate_found utcdate_start utcdate_length
              true (first_space + 2) (line_end - first_space - 3)
              black_elo_found black_elo_start black_elo_length
          else if first_space = pos + 8 &&
                  sub_matches pos "BlackElo"
          then
            parse_lines line_end
              white_found white_start white_length
              black_found black_start black_length
              time_control_found time_control_start time_control_length
              result_found result_start result_length
              utcdate_found utcdate_start utcdate_length
              white_elo_found white_elo_start white_elo_length
              true (first_space + 2) (line_end - first_space - 3)
          else ignore_line line_end
        | _ -> finish_game line_end
      in
      if line_start >= len then ()
      else match index_rec len line_start '\n' with
        | exception Not_found ->
          (* End of file: if the file does not end with a newline
             character then handle the last line *)
          if line_start < len - 1 then parse_line len else ()
        | line_end ->
          if line_end = line_start then
            (* Empty line: ignore *)
            ignore_line line_end
          else
            parse_line line_end
    in
    parse_game 0
end

let input = In_channel.input_all stdin

module M = (Parser[@inlined])(struct let input = input end)

let () = M.parse ()

And here are the performance numbers I get:

# bzcat lichess_db_standard_rated_2013-01.pgn.bz2 > uncompressed
# time mawk -f parse.awk < uncompressed | wc -l
121332

real    0m1.063s
user    0m1.075s
sys     0m0.016s
# time ./parse.ocaml < uncompressed | wc -l
121332

real    0m0.289s
user    0m0.253s
sys     0m0.044s
3 Likes

huh i guess bzcat was the bottleneck in both your ridiculously optimized and normal version. Normal ocaml version is already twice as fast as mawk on my machine when the input is uncompressed ahead of time!

3 Likes

The things we do in the name of science :slight_smile: Thank you for the experiment, its quite useful to see where the performance is.

It turns out the compiler version matters a lot. My original results were under 4.11.1 it turns out. I switched to 4.14. In the numbers below I also removed bzcat from the mix by uncompressing the file. The scripts process one line at a time, so the rate at which the lines are fed does not matter but it does mean that there is a bzcat attributable constant to remove. Much easier to just remove bzcat :slight_smile: Anyway, results on 4.14

awk

real 0m0.642s
user 0m0.667s
sys  0m0.097s

ocaml

real 0m0.381s
user 0m0.308s
sys  0m0.277s

I’ve very very pleased by this result – about twice as fast as my reference implementation. Exactly what I was hoping for. Thank you all for the help getting here.

3 Likes

I guess this is why, once people invested enough time on working on the optimizer middle-end within the compiler, we don’t allow them to write code anymore :slight_smile:

10 Likes

Huh, I don’t see this speedup. I uncompressed the input, then sent the output to /dev/null, and mawk runs in .99sec, ocaml in 2.25sec. I compiled with “-O3”, and of course with ocamlopt. I wonder what else is different? I compiled thus:

ocamlfind ocamlopt -O3 -package re2 -linkall -linkpkg -o lichess lichess.ml

Did you do something different?

So I was running it on 4.11.1 before. I switched to 4.14.0 – that’s what made all the difference. Otherwise the parse.awk code is as above.

The latest OCaml code is:

module StringMap = Map.Make(String)

let to_pairs x =
  let n = String.length x in
  match String.sub x 0 1 with
  | exception _ -> None
  | "[" ->
     (match String.index_from_opt x 2 ' ' with
     | Some i -> Some (String.sub x 1 (i-1), String.sub x (i+2) (n-i-4))
     | None   -> None)
  | "1" -> Some ("Game", "")
  | _ -> None

let to_csv m =
  let f k = StringMap.find k m in
  Printf.printf "%s,%s,%s,%s,%s,%s,%s\n"
    (f "White") (f "Black") (f "TimeControl")
    (f "Result") (f "UTCDate") (f "WhiteElo")
    (f "BlackElo")

let () =
  let rec loop m = match read_line () with
  | exception End_of_file -> ()
  | line -> 
       match to_pairs line with
       | None -> loop m
       | Some ("Game",_) -> (to_csv m; loop StringMap.empty)
       | Some (k, v) -> loop (StringMap.add k v m)
  in
  loop StringMap.empty

Aha, you switched from Re to straight string-ops. Interesting. I’ve always found that pcre is quite competitive with direct string-manipulation. Wonder why Re2 wasn’t (or whether pcre wouldn’t be competitive in this case).

1 Like

That’s because we’re comparing apples to oranges here. One version just looks for a space and splits. The other version (the original version with regular expressions) looks for a moderately complex regular expression. I think that if you implemented the space split with a regular expression it wouldn’t be much more expensive than the pure String version, but it seems overkill to use a regular expression for that.