Back from the trenches: a tale of custom runtime events and being faster than faster

Last week I decided to see what was possible to do with custom runtime events.

I wrote this piece of code:

(** Operation to profile *)
let my_slow_op res i =
  res := !res + i;
  Unix.sleepf 0.001

(** Operation to profile *)
let my_op res i = res := !res + i

let my_op =
  match Sys.argv.(4) with "slow" -> my_slow_op | _ | (exception _) -> my_op

(** Profile with a [time] function (usually Unix.gettimeofday or Sys.time *)
let profile time _n times =
  let total_time = ref 0. in
  let res = ref 0 in
  for i = 1 to times do
    let start = time () in
    my_op res i;
    let stop = time () in
    total_time := !total_time +. (stop -. start)
  done;
  (!total_time, !res)

let sys_profile = profile Sys.time
let unix_profile = profile Unix.gettimeofday

(** No profiling, control function*)
let no_profile = profile (fun () -> 0.)

(** Profile with [mclock] elapsed function *)
let clock_profile _n times =
  let total_time = ref 0L in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.elapsed_ns () in
    my_op res i;
    let stop = Mtime_clock.elapsed_ns () in
    total_time := Int64.add !total_time (Int64.sub stop start)
  done;
  (!total_time, !res)

(** Profile with [mclock] counter function *)
let counter_profile _n times =
  let counter = Mtime_clock.counter () in
  let total_time = ref Mtime.Span.zero in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.count counter in
    my_op res i;
    let stop = Mtime_clock.count counter in
    total_time := Mtime.Span.add !total_time (Mtime.Span.abs_diff stop start)
  done;
  (!total_time, !res)

(** Custom runtime events *)
type Runtime_events.User.tag += CustomSpan

type span = Begin | End

let to_int = function Begin -> 0 | End -> 1
let of_int = function 0 -> Begin | 1 -> End | _ -> assert false

(** Our custom event is an (int64 * span) pair *)
let custom_span_encoding =
  let encode buffer (count, span) =
    Bytes.set_int64_ne buffer 0 count;
    Bytes.set_int8 buffer 8 (to_int span);
    9
  in
  let decode buffer _size =
    let count = Bytes.get_int64_ne buffer 0 in
    let span = Bytes.get_int8 buffer 8 |> of_int in
    (count, span)
  in
  Runtime_events.Type.register ~encode ~decode

(** Register our custom event *)
let count_span =
  Runtime_events.User.register "count.span" CustomSpan custom_span_encoding

(** Handle our custom event:
    - If the event associated value is Begin, register the prev_time
    - If the event associated value is End, add to the total_time the current
      time minus prev_time *)
let span_event_handler, get_time =
  let prev_time = ref 0L in
  let total_time = ref 0L in
  let prev_count = ref 0L in
  ( (fun _domain_id ts event value ->
      match (Runtime_events.User.tag event, value) with
      | CustomSpan, (count, Begin) ->
          prev_time := Runtime_events.Timestamp.to_int64 ts;
          prev_count := count
      | CustomSpan, (_count, End) ->
          (* if not @@ Int64.equal !prev_count count then *)
          (*     failwith *)
          (*       (Printf.sprintf "Prev: %d, Current: %d" (Int64.to_int !prev_count) *)
          (*          (Int64.to_int count)); *)
          total_time :=
            Int64.add !total_time
              (Int64.sub (Runtime_events.Timestamp.to_int64 ts) !prev_time)
      | _ -> ()),
    fun () -> !total_time )

(** Profile with custom runtime events *)
let runtime_event_profile _n times =
  let res = ref 0 in
  for i = 1 to times do
    Runtime_events.User.write count_span (Int64.of_int i, Begin);
    my_op res i;
    Runtime_events.User.write count_span (Int64.of_int i, End)
  done;
  !res

let finished = ref false

(** This function periodically polls events from the ring buffer containing all
    emitted events and triggers their callbacks. It should be executed in a
    parallel process *)
let read_poll _pid _path =
  let cursor = Runtime_events.create_cursor None in
  let callbacks =
    Runtime_events.(
      Callbacks.create ()
      |> Callbacks.add_user_event custom_span_encoding span_event_handler)
  in
  fun () ->
    Fun.protect
      ~finally:(fun () ->
        Runtime_events.read_poll cursor callbacks None |> ignore;
        Runtime_events.free_cursor cursor)
      (fun () ->
        while not !finished do
          Runtime_events.read_poll cursor callbacks None |> ignore
        done;
        get_time ())

(** Gather the results *)
let () =
  Runtime_events.start ();
  let times = try int_of_string Sys.argv.(2) with _ -> 100 in
  let n = try int_of_string Sys.argv.(3) with _ -> 40 in
  let total_time =
    match Sys.argv.(1) with
    | "unix" ->
        let total_time, _ = unix_profile n times in
        total_time *. 1_000_000_000.
    | "sys" ->
        let total_time, _ = sys_profile n times in
        total_time *. 1_000_000_000.
    | "clock" ->
        let total_time, _ = clock_profile n times in
        let total_time = Int64.to_float total_time in
        total_time
    | "counter" ->
        let total_time, _ = counter_profile n times in
        Mtime.Span.to_float_ns total_time
    | "runtime" ->
        let main_pid = Unix.getpid () in
        let path = Runtime_events.path () |> Option.get in
        let d = Domain.spawn (read_poll main_pid path) in
        let _ = runtime_event_profile n times in
        finished := true;
        let total_time = Domain.join d in
        total_time |> Int64.to_float
    | _ | (exception _) -> no_profile n times |> fst
  in
  let mean_time = total_time /. float times in
  Format.eprintf "total: %.2f, mean: %.2f@." total_time mean_time

Basically it creates two functions, a fast and a slow one that I want to profile and it profiles them with Unix.gettimofday, Sys.time, Mclock functions and Runtime custom events.

Let’s see what happens with the slow one:

❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 500 30 slow'
Benchmark 1: ./_build/default/bin/main.exe no 500 30 slow
  Time (mean ± σ):     531.9 ms ±   0.4 ms    [User: 0.5 ms, System: 5.7 ms]
  Range (min … max):   531.3 ms … 532.7 ms    10 runs
 
Benchmark 2: ./_build/default/bin/main.exe unix 500 30 slow
  Time (mean ± σ):     532.0 ms ±   0.3 ms    [User: 0.2 ms, System: 6.0 ms]
  Range (min … max):   531.4 ms … 532.5 ms    10 runs
 
Benchmark 3: ./_build/default/bin/main.exe clock 500 30 slow
  Time (mean ± σ):     532.3 ms ±   0.6 ms    [User: 0.4 ms, System: 6.0 ms]
  Range (min … max):   531.4 ms … 533.3 ms    10 runs
 
Benchmark 4: ./_build/default/bin/main.exe runtime 500 30 slow
  Time (mean ± σ):     533.3 ms ±   2.1 ms    [User: 525.3 ms, System: 5.7 ms]
  Range (min … max):   531.9 ms … 539.2 ms    10 runs
❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 50000 30 slow'
Benchmark 1: ./_build/default/bin/main.exe no 50000 30 slow
  Time (mean ± σ):     52.633 s ±  0.001 s    [User: 0.009 s, System: 0.070 s]
  Range (min … max):   52.630 s … 52.634 s    10 runs
 
Benchmark 2: ./_build/default/bin/main.exe unix 50000 30 slow
  Time (mean ± σ):     52.636 s ±  0.002 s    [User: 0.003 s, System: 0.076 s]
  Range (min … max):   52.634 s … 52.639 s    10 runs
 
Benchmark 3: ./_build/default/bin/main.exe clock 50000 30 slow
  Time (mean ± σ):     52.639 s ±  0.005 s    [User: 0.005 s, System: 0.076 s]
  Range (min … max):   52.635 s … 52.654 s    10 runs
 
Benchmark 4: ./_build/default/bin/main.exe runtime 50000 30 slow
  Time (mean ± σ):     52.729 s ±  0.023 s    [User: 52.336 s, System: 0.121 s]
  Range (min … max):   52.689 s … 52.761 s    10 runs

And the results returned by each function:

❯ ./_build/default/bin/main.exe unix 500 30 slow
total: 526319503.78, mean: 1052639.01
❯ ./_build/default/bin/main.exe clock 500 30 slow
total: 526429191.00, mean: 1052858.38
❯ ./_build/default/bin/main.exe counter 500 30 slow
total: 526507983.00, mean: 1053015.97
❯ ./_build/default/bin/main.exe runtime 500 30 slow
total: 525626739.00, mean: 1051253.48

Ok, so my function is slow enough so that my profiling functions impact is negligible and all the profiling tools are returning the same result, that’s good.

What happens with the fast one?

❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 50000 30'
Benchmark 1: ./_build/default/bin/main.exe no 50000 30
  Time (mean ± σ):       4.9 ms ±   0.5 ms    [User: 0.3 ms, System: 4.5 ms]
  Range (min … max):     3.8 ms …   7.0 ms    561 runs
 
Benchmark 2: ./_build/default/bin/main.exe unix 50000 30
  Time (mean ± σ):       7.5 ms ±   0.6 ms    [User: 2.8 ms, System: 4.5 ms]
  Range (min … max):     6.2 ms …  12.1 ms    387 runs
 
Benchmark 3: ./_build/default/bin/main.exe clock 50000 30
  Time (mean ± σ):       7.7 ms ±   0.8 ms    [User: 2.7 ms, System: 4.8 ms]
  Range (min … max):     6.2 ms …  13.1 ms    478 runs
 
Benchmark 4: ./_build/default/bin/main.exe runtime 50000 30
Error: Command terminated with non-zero exit code 2 in benchmark iteration 19. Use the '-i'/'--ignore-failure' option if you want to ignore this. Alternatively, use the '--show-output' option to debug what went wrong.

Oh, the runtime one is failing. Let’s see what happens:

❯ ./_build/default/bin/main.exe runtime 50000 30
total: 6709119.00, mean: 134.18
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4446581.00, mean: 88.93
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4496382.00, mean: 89.93
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 5004347.00, mean: 100.09
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Failure("Runtime_events: corrupt stream")

The results are inconsistent and it fails randomly? Is one result at least correct?

❯ ./_build/default/bin/main.exe unix 50000 30
total: 1240968.70, mean: 24.82
❯ ./_build/default/bin/main.exe clock 50000 30
total: 1310707.00, mean: 26.21
❯ ./_build/default/bin/main.exe counter 50000 30
total: 1191367.00, mean: 23.83

We’re far from the results returned by the other profiling tools.

If I uncomment

if not @@ Int64.equal !prev_count count then
  failwith
    (Printf.sprintf "Prev: %d, Current: %d" (Int64.to_int !prev_count)
       (Int64.to_int count));

Here’s what I get:

❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4606128.00, mean: 92.12
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4442051.00, mean: 88.84
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Fun.Finally_raised: Failure("Prev: 33894, Current: 33973")
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Fun.Finally_raised: Failure("Prev: 20291, Current: 20380")
❯ ./_build/default/bin/main.exe runtime 50000 30

If you read the link about runtime events you know they’re stored in a ring buffer. Looks like the events are written faster than they’re polled and this messes with the ring buffer content.

I tried polling after emitting an End event but this increased the execution time drastically so that’s not a solution.

They lived happily everafter with a lot of small monads.

The end