Last week I decided to see what was possible to do with custom runtime events.
I wrote this piece of code:
(** Operation to profile *)
let my_slow_op res i =
res := !res + i;
Unix.sleepf 0.001
(** Operation to profile *)
let my_op res i = res := !res + i
let my_op =
match Sys.argv.(4) with "slow" -> my_slow_op | _ | (exception _) -> my_op
(** Profile with a [time] function (usually Unix.gettimeofday or Sys.time *)
let profile time _n times =
let total_time = ref 0. in
let res = ref 0 in
for i = 1 to times do
let start = time () in
my_op res i;
let stop = time () in
total_time := !total_time +. (stop -. start)
done;
(!total_time, !res)
let sys_profile = profile Sys.time
let unix_profile = profile Unix.gettimeofday
(** No profiling, control function*)
let no_profile = profile (fun () -> 0.)
(** Profile with [mclock] elapsed function *)
let clock_profile _n times =
let total_time = ref 0L in
let res = ref 0 in
for i = 1 to times do
let start = Mtime_clock.elapsed_ns () in
my_op res i;
let stop = Mtime_clock.elapsed_ns () in
total_time := Int64.add !total_time (Int64.sub stop start)
done;
(!total_time, !res)
(** Profile with [mclock] counter function *)
let counter_profile _n times =
let counter = Mtime_clock.counter () in
let total_time = ref Mtime.Span.zero in
let res = ref 0 in
for i = 1 to times do
let start = Mtime_clock.count counter in
my_op res i;
let stop = Mtime_clock.count counter in
total_time := Mtime.Span.add !total_time (Mtime.Span.abs_diff stop start)
done;
(!total_time, !res)
(** Custom runtime events *)
type Runtime_events.User.tag += CustomSpan
type span = Begin | End
let to_int = function Begin -> 0 | End -> 1
let of_int = function 0 -> Begin | 1 -> End | _ -> assert false
(** Our custom event is an (int64 * span) pair *)
let custom_span_encoding =
let encode buffer (count, span) =
Bytes.set_int64_ne buffer 0 count;
Bytes.set_int8 buffer 8 (to_int span);
9
in
let decode buffer _size =
let count = Bytes.get_int64_ne buffer 0 in
let span = Bytes.get_int8 buffer 8 |> of_int in
(count, span)
in
Runtime_events.Type.register ~encode ~decode
(** Register our custom event *)
let count_span =
Runtime_events.User.register "count.span" CustomSpan custom_span_encoding
(** Handle our custom event:
- If the event associated value is Begin, register the prev_time
- If the event associated value is End, add to the total_time the current
time minus prev_time *)
let span_event_handler, get_time =
let prev_time = ref 0L in
let total_time = ref 0L in
let prev_count = ref 0L in
( (fun _domain_id ts event value ->
match (Runtime_events.User.tag event, value) with
| CustomSpan, (count, Begin) ->
prev_time := Runtime_events.Timestamp.to_int64 ts;
prev_count := count
| CustomSpan, (_count, End) ->
(* if not @@ Int64.equal !prev_count count then *)
(* failwith *)
(* (Printf.sprintf "Prev: %d, Current: %d" (Int64.to_int !prev_count) *)
(* (Int64.to_int count)); *)
total_time :=
Int64.add !total_time
(Int64.sub (Runtime_events.Timestamp.to_int64 ts) !prev_time)
| _ -> ()),
fun () -> !total_time )
(** Profile with custom runtime events *)
let runtime_event_profile _n times =
let res = ref 0 in
for i = 1 to times do
Runtime_events.User.write count_span (Int64.of_int i, Begin);
my_op res i;
Runtime_events.User.write count_span (Int64.of_int i, End)
done;
!res
let finished = ref false
(** This function periodically polls events from the ring buffer containing all
emitted events and triggers their callbacks. It should be executed in a
parallel process *)
let read_poll _pid _path =
let cursor = Runtime_events.create_cursor None in
let callbacks =
Runtime_events.(
Callbacks.create ()
|> Callbacks.add_user_event custom_span_encoding span_event_handler)
in
fun () ->
Fun.protect
~finally:(fun () ->
Runtime_events.read_poll cursor callbacks None |> ignore;
Runtime_events.free_cursor cursor)
(fun () ->
while not !finished do
Runtime_events.read_poll cursor callbacks None |> ignore
done;
get_time ())
(** Gather the results *)
let () =
Runtime_events.start ();
let times = try int_of_string Sys.argv.(2) with _ -> 100 in
let n = try int_of_string Sys.argv.(3) with _ -> 40 in
let total_time =
match Sys.argv.(1) with
| "unix" ->
let total_time, _ = unix_profile n times in
total_time *. 1_000_000_000.
| "sys" ->
let total_time, _ = sys_profile n times in
total_time *. 1_000_000_000.
| "clock" ->
let total_time, _ = clock_profile n times in
let total_time = Int64.to_float total_time in
total_time
| "counter" ->
let total_time, _ = counter_profile n times in
Mtime.Span.to_float_ns total_time
| "runtime" ->
let main_pid = Unix.getpid () in
let path = Runtime_events.path () |> Option.get in
let d = Domain.spawn (read_poll main_pid path) in
let _ = runtime_event_profile n times in
finished := true;
let total_time = Domain.join d in
total_time |> Int64.to_float
| _ | (exception _) -> no_profile n times |> fst
in
let mean_time = total_time /. float times in
Format.eprintf "total: %.2f, mean: %.2f@." total_time mean_time
Basically it creates two functions, a fast and a slow one that I want to profile and it profiles them with Unix.gettimofday, Sys.time, Mclock functions and Runtime custom events.
Let’s see what happens with the slow one:
❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 500 30 slow'
Benchmark 1: ./_build/default/bin/main.exe no 500 30 slow
Time (mean ± σ): 531.9 ms ± 0.4 ms [User: 0.5 ms, System: 5.7 ms]
Range (min … max): 531.3 ms … 532.7 ms 10 runs
Benchmark 2: ./_build/default/bin/main.exe unix 500 30 slow
Time (mean ± σ): 532.0 ms ± 0.3 ms [User: 0.2 ms, System: 6.0 ms]
Range (min … max): 531.4 ms … 532.5 ms 10 runs
Benchmark 3: ./_build/default/bin/main.exe clock 500 30 slow
Time (mean ± σ): 532.3 ms ± 0.6 ms [User: 0.4 ms, System: 6.0 ms]
Range (min … max): 531.4 ms … 533.3 ms 10 runs
Benchmark 4: ./_build/default/bin/main.exe runtime 500 30 slow
Time (mean ± σ): 533.3 ms ± 2.1 ms [User: 525.3 ms, System: 5.7 ms]
Range (min … max): 531.9 ms … 539.2 ms 10 runs
❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 50000 30 slow'
Benchmark 1: ./_build/default/bin/main.exe no 50000 30 slow
Time (mean ± σ): 52.633 s ± 0.001 s [User: 0.009 s, System: 0.070 s]
Range (min … max): 52.630 s … 52.634 s 10 runs
Benchmark 2: ./_build/default/bin/main.exe unix 50000 30 slow
Time (mean ± σ): 52.636 s ± 0.002 s [User: 0.003 s, System: 0.076 s]
Range (min … max): 52.634 s … 52.639 s 10 runs
Benchmark 3: ./_build/default/bin/main.exe clock 50000 30 slow
Time (mean ± σ): 52.639 s ± 0.005 s [User: 0.005 s, System: 0.076 s]
Range (min … max): 52.635 s … 52.654 s 10 runs
Benchmark 4: ./_build/default/bin/main.exe runtime 50000 30 slow
Time (mean ± σ): 52.729 s ± 0.023 s [User: 52.336 s, System: 0.121 s]
Range (min … max): 52.689 s … 52.761 s 10 runs
And the results returned by each function:
❯ ./_build/default/bin/main.exe unix 500 30 slow
total: 526319503.78, mean: 1052639.01
❯ ./_build/default/bin/main.exe clock 500 30 slow
total: 526429191.00, mean: 1052858.38
❯ ./_build/default/bin/main.exe counter 500 30 slow
total: 526507983.00, mean: 1053015.97
❯ ./_build/default/bin/main.exe runtime 500 30 slow
total: 525626739.00, mean: 1051253.48
Ok, so my function is slow enough so that my profiling functions impact is negligible and all the profiling tools are returning the same result, that’s good.
What happens with the fast one?
❯ hyperfine --warmup 3 -N -L profiler no,unix,clock,runtime './_build/default/bin/main.exe {profiler} 50000 30'
Benchmark 1: ./_build/default/bin/main.exe no 50000 30
Time (mean ± σ): 4.9 ms ± 0.5 ms [User: 0.3 ms, System: 4.5 ms]
Range (min … max): 3.8 ms … 7.0 ms 561 runs
Benchmark 2: ./_build/default/bin/main.exe unix 50000 30
Time (mean ± σ): 7.5 ms ± 0.6 ms [User: 2.8 ms, System: 4.5 ms]
Range (min … max): 6.2 ms … 12.1 ms 387 runs
Benchmark 3: ./_build/default/bin/main.exe clock 50000 30
Time (mean ± σ): 7.7 ms ± 0.8 ms [User: 2.7 ms, System: 4.8 ms]
Range (min … max): 6.2 ms … 13.1 ms 478 runs
Benchmark 4: ./_build/default/bin/main.exe runtime 50000 30
Error: Command terminated with non-zero exit code 2 in benchmark iteration 19. Use the '-i'/'--ignore-failure' option if you want to ignore this. Alternatively, use the '--show-output' option to debug what went wrong.
Oh, the runtime one is failing. Let’s see what happens:
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 6709119.00, mean: 134.18
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4446581.00, mean: 88.93
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4496382.00, mean: 89.93
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 5004347.00, mean: 100.09
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Failure("Runtime_events: corrupt stream")
The results are inconsistent and it fails randomly? Is one result at least correct?
❯ ./_build/default/bin/main.exe unix 50000 30
total: 1240968.70, mean: 24.82
❯ ./_build/default/bin/main.exe clock 50000 30
total: 1310707.00, mean: 26.21
❯ ./_build/default/bin/main.exe counter 50000 30
total: 1191367.00, mean: 23.83
We’re far from the results returned by the other profiling tools.
If I uncomment
if not @@ Int64.equal !prev_count count then
failwith
(Printf.sprintf "Prev: %d, Current: %d" (Int64.to_int !prev_count)
(Int64.to_int count));
Here’s what I get:
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4606128.00, mean: 92.12
❯ ./_build/default/bin/main.exe runtime 50000 30
total: 4442051.00, mean: 88.84
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Fun.Finally_raised: Failure("Prev: 33894, Current: 33973")
❯ ./_build/default/bin/main.exe runtime 50000 30
Fatal error: exception Fun.Finally_raised: Failure("Prev: 20291, Current: 20380")
❯ ./_build/default/bin/main.exe runtime 50000 30
If you read the link about runtime events you know they’re stored in a ring buffer. Looks like the events are written faster than they’re polled and this messes with the ring buffer content.
I tried polling after emitting an End event but this increased the execution time drastically so that’s not a solution.
They lived happily everafter with a lot of small monads.
The end