Profiling an OCaml program

I have the following code:

let profile time _n times =
  let total_time = ref 0. in
  let res = ref 0 in
  for i = 1 to times do
    let start = time () in
    res := !res + i;
    let stop = time () in
    total_time := !total_time +. (stop -. start)
  done;
  (!total_time, !res)

let sys_profile = profile Sys.time
let unix_profile = profile Unix.gettimeofday
let no_profile = profile (fun () -> 0.)

let clock_profile _n times =
  let total_time = ref 0L in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.elapsed_ns () in
    res := !res + i;
    let stop = Mtime_clock.elapsed_ns () in
    total_time := Int64.add !total_time (Int64.sub stop start)
  done;
  (!total_time, !res)

let counter_profile _n times =
  let counter = Mtime_clock.counter () in
  let total_time = ref Mtime.Span.zero in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.count counter in
    res := !res + i;
    let stop = Mtime_clock.count counter in
    total_time := Mtime.Span.add !total_time (Mtime.Span.abs_diff stop start)
  done;
  (!total_time, !res)

type Runtime_events.User.tag += CustomSpan

let count_span =
  Runtime_events.User.register "count.span" CustomSpan Runtime_events.Type.span

let span_event_handler, get_time =
  let time = ref 0L in
  ( (fun _domain_id ts event value ->
      match (Runtime_events.User.tag event, value) with
      | CustomSpan, Runtime_events.Type.Begin ->
          time := Runtime_events.Timestamp.to_int64 ts
      | CustomSpan, End ->
          time := Int64.sub (Runtime_events.Timestamp.to_int64 ts) !time
      | _ -> ()),
    fun () -> !time )

let runtime_event_profile _n times =
  Runtime_events.start ();
  let cursor = Runtime_events.create_cursor None in
  let callbacks =
    Runtime_events.(
      Callbacks.create ()
      |> Callbacks.add_user_event Type.span span_event_handler)
  in
  let total_time = ref 0L in
  let res = ref 0 in
  for i = 1 to times do
    Runtime_events.User.write count_span Begin;
    res := !res + i;
    Runtime_events.User.write count_span End;
    ignore (Runtime_events.read_poll cursor callbacks None);
    total_time := Int64.add !total_time (get_time ())
  done;
  (!total_time, !res)

let () =
  let times = try int_of_string Sys.argv.(2) with _ -> 100 in
  let n = try int_of_string Sys.argv.(3) with _ -> 40 in
  let total_time =
    match Sys.argv.(1) with
    | "unix" ->
        let total_time, _ = unix_profile n times in
        total_time *. 1_000_000_000.
    | "sys" ->
        let total_time, _ = sys_profile n times in
        total_time *. 1_000_000_000.
    | "clock" ->
        let total_time, _ = clock_profile n times in
        let total_time = Int64.to_float total_time in
        total_time
    | "counter" ->
        let total_time, _ = counter_profile n times in
        Mtime.Span.to_float_ns total_time
    | "runtime" ->
        let total_time, _ = runtime_event_profile n times in
        let total_time = Int64.to_float total_time in
        total_time
    | _ -> no_profile n times |> fst
  in
  let mean_time = total_time /. float times in
  Format.eprintf "total: %.2f, mean: %.2f@." total_time mean_time

Where I’m trying to profile the same simple addition to see which time computation is the fastest. Here are the results:

❯ hyperfine --warmup 3 -N -L profiler no,clock,counter,unix,runtime './_build/default/bin/main.exe {profiler} 5000000 30'
Benchmark 1: ./_build/default/bin/main.exe no 5000000 30
  Time (mean ± σ):      14.6 ms ±   0.3 ms    [User: 13.7 ms, System: 0.7 ms]
  Range (min … max):    14.2 ms …  16.4 ms    202 runs
 
Benchmark 2: ./_build/default/bin/main.exe clock 5000000 30
  Time (mean ± σ):     229.8 ms ±   1.7 ms    [User: 227.2 ms, System: 0.8 ms]
  Range (min … max):   227.7 ms … 232.8 ms    13 runs
 
Benchmark 3: ./_build/default/bin/main.exe counter 5000000 30
  Time (mean ± σ):     243.8 ms ±   2.1 ms    [User: 241.0 ms, System: 0.7 ms]
  Range (min … max):   240.9 ms … 247.9 ms    12 runs
 
Benchmark 4: ./_build/default/bin/main.exe unix 5000000 30
  Time (mean ± σ):     259.2 ms ±   1.3 ms    [User: 256.0 ms, System: 1.0 ms]
  Range (min … max):   257.6 ms … 261.7 ms    11 runs

Benchmark 5: ./_build/default/bin/main.exe sys 5000000 30
  Time (mean ± σ):      3.632 s ±  0.028 s    [User: 0.542 s, System: 3.069 s]
  Range (min … max):    3.604 s …  3.688 s    10 runs
 
Benchmark 5: ./_build/default/bin/main.exe runtime 5000000 30
  Time (mean ± σ):      2.791 s ±  0.041 s    [User: 2.755 s, System: 0.017 s]
  Range (min … max):    2.753 s …  2.850 s    10 runs

I may be doing it wrong but using custom runtime events is quite slow, I expected to not pay such a toll when using them. Is there a way of improving it or is this expected when using runtime events?

Not an answer to your question but a section on profiling was recently added to the manual. I don’t know if it exists in a rendered state somewhere yet but it will as soon as 5.4 is released.

So the issue here is that Runtime_events isn’t really designed for you to read from the ring buffer in a loop like that. The idea is that you write events frequently and then only read them periodically or in a different domain or even process. If you remove the read from within the loop then the performance is roughly the same as unix/clock/counter (though sys is still faster):

  ./_build/default/prog.exe no 500000 30 ran
  135.18 ± 19.54 times faster than ./_build/default/prog.exe sys 500000 30
  397.38 ± 57.87 times faster than ./_build/default/prog.exe runtime 500000 30
  402.06 ± 58.32 times faster than ./_build/default/prog.exe unix 500000 30
  405.97 ± 58.75 times faster than ./_build/default/prog.exe clock 500000 30
  413.68 ± 59.74 times faster than ./_build/default/prog.exe counter 500000 30

Profiling shows that nearly all the time in runtime is spent in clock_gettime so the similarity is understandable.

Interestingly my numbers look quite different from the ones you have. What OS are you doing this on? (I am running this on an Ubuntu 24.04/Linux 6.11 system)

Funny that Sys.time is faster for you.

I’m running on CachyOS/Linux 6.15.3-2

I’ll try moving the polling to another process and see how it behaves, thanks!

Ok so it looks like my polling is not fast enough with domains which leads to wrong results. The issue here is that I have a spawned domain that’s supposed to poll continuously but my operation is too fast and the ring-buffer reaches its end before I can pull it.

let my_op res i = res := !res + i
(* Unix.sleepf 0.01 *)

let profile time _n times =
  let total_time = ref 0. in
  let res = ref 0 in
  for i = 1 to times do
    let start = time () in
    my_op res i;
    let stop = time () in
    total_time := !total_time +. (stop -. start)
  done;
  (!total_time, !res)

let sys_profile = profile Sys.time
let unix_profile = profile Unix.gettimeofday
let no_profile = profile (fun () -> 0.)

let clock_profile _n times =
  let total_time = ref 0L in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.elapsed_ns () in
    my_op res i;
    let stop = Mtime_clock.elapsed_ns () in
    total_time := Int64.add !total_time (Int64.sub stop start)
  done;
  (!total_time, !res)

let counter_profile _n times =
  let counter = Mtime_clock.counter () in
  let total_time = ref Mtime.Span.zero in
  let res = ref 0 in
  for i = 1 to times do
    let start = Mtime_clock.count counter in
    my_op res i;
    let stop = Mtime_clock.count counter in
    total_time := Mtime.Span.add !total_time (Mtime.Span.abs_diff stop start)
  done;
  (!total_time, !res)

type Runtime_events.User.tag += CustomSpan
type span = Begin | End

let to_int = function Begin -> 0 | End -> 1
let of_int = function 0 -> Begin | 1 -> End | _ -> assert false

let custom_span_encoding =
  let encode buffer (count, span) =
    Bytes.set_int64_ne buffer 0 count;
    Bytes.set_int8 buffer 8 (to_int span);
    9
  in
  let decode buffer _size =
    let count = Bytes.get_int64_ne buffer 0 in
    let span = Bytes.get_int8 buffer 8 |> of_int in
    (count, span)
  in
  Runtime_events.Type.register ~encode ~decode

let count_span =
  Runtime_events.User.register "count.span" CustomSpan custom_span_encoding

let span_event_handler, get_time =
  let prev_time = ref 0L in
  let total_time = ref 0L in
  let prev_count = ref 0L in
  ( (fun _domain_id ts event value ->
      match (Runtime_events.User.tag event, value) with
      | CustomSpan, (count, Begin) ->
          prev_time := Runtime_events.Timestamp.to_int64 ts;
          prev_count := count
      | CustomSpan, (count, End) ->
          if not @@ Int64.equal !prev_count count then
            failwith
              (Printf.sprintf "Prev: %d, Current: %d" (Int64.to_int !prev_count)
                 (Int64.to_int count));
          total_time :=
            Int64.add !total_time
              (Int64.sub (Runtime_events.Timestamp.to_int64 ts) !prev_time)
      | _ -> ()),
    fun () -> !total_time )

let runtime_event_profile _n times =
  let res = ref 0 in
  for i = 1 to times do
    Runtime_events.User.write count_span (Int64.of_int i, Begin);
    my_op res i;
    Runtime_events.User.write count_span (Int64.of_int i, End)
  done;
  !res

let finished = ref false

let read_poll _pid _path =
  let cursor = Runtime_events.create_cursor None in
  let callbacks =
    Runtime_events.(
      Callbacks.create ()
      |> Callbacks.add_user_event custom_span_encoding span_event_handler)
  in
  fun () ->
    Fun.protect
      ~finally:(fun () ->
        Runtime_events.read_poll cursor callbacks None |> ignore;
        Runtime_events.free_cursor cursor)
      (fun () ->
        while not !finished do
          Runtime_events.read_poll cursor callbacks None |> ignore
        done;
        get_time ())

let () =
  Runtime_events.start ();
  let times = try int_of_string Sys.argv.(2) with _ -> 100 in
  let n = try int_of_string Sys.argv.(3) with _ -> 40 in
  let total_time =
    match Sys.argv.(1) with
    | "unix" ->
        let total_time, _ = unix_profile n times in
        total_time *. 1_000_000_000.
    | "sys" ->
        let total_time, _ = sys_profile n times in
        total_time *. 1_000_000_000.
    | "clock" ->
        let total_time, _ = clock_profile n times in
        let total_time = Int64.to_float total_time in
        total_time
    | "counter" ->
        let total_time, _ = counter_profile n times in
        Mtime.Span.to_float_ns total_time
    | "runtime" ->
        let main_pid = Unix.getpid () in
        let path = Runtime_events.path () |> Option.get in
        let d = Domain.spawn (read_poll main_pid path) in
        let _ = runtime_event_profile n times in
        finished := true;
        let total_time = Domain.join d in
        total_time |> Int64.to_float
    | _ -> no_profile n times |> fst
  in
  let mean_time = total_time /. float times in
  Format.eprintf "total: %.2f, mean: %.2f@." total_time mean_time