Help with Jane Street extension performance

I am trying to understand how to use the unboxing and local features from the Jane Street fork to get more performant code. This is from the link here: Jane Street OCaml extensions. When I benchmark this, the first function is much faster than the second. I used the -dcmm flag to dump the result, but the final code looks identical to me, so I’m stumped as to why one is slower than the other. The only difference I can see is that the second function gets inlined into the main expression at the very end, and the first does not. I tried to use the [@inline never] annotation to prevent the second function from being inlined, but from my inspection of the cmm output this had no effect, so I don’t know whether this annotation is implemented in the new pass or whether there’s a bug.

@dkalinichenko do you have any thoughts?

module M = Stdlib_upstream_compatible.Float_u

(* [pi_approx n] returns the sum of [1 / k^2] for [k] counting down from [n]
   to [1], accumulated in an unboxed float ([float#]) and converted to a
   regular [float] at the end. For [n <= 0] the result is [0.]. *)
let pi_approx (n : int) : float =
  (* Tail-recursive helper: [total] carries the running sum as a [float#]. *)
  let rec go i (total : float#) : float# =
    if i <= 0 then total
    else
      let x = M.of_float (float_of_int i) in
      let term = M.div #1. (M.mul x x) in
      go (i - 1) (M.add total term)
  in
  M.to_float (go n #0.)

(* A single mutable cell holding an unboxed float ([float#]). *)
type float_ref = { mutable field : float# }

(* [pi_approx_while n] returns the sum of [1 / k^2] for [k] counting down from
   [n] to [1], using a [while] loop. Both the accumulator record and the loop
   counter are declared [local_]. For [n <= 0] the result is [0.]. *)
let pi_approx_while n =
  let local_ sum : float_ref = { field = #0.0 } in
  let local_ i = ref n in
  while !i > 0 do
    (* Contribution of the current index: 1 / i^2. *)
    let x = M.of_float (float_of_int !i) in
    let term = M.div #1. (M.mul x x) in
    sum.field <- M.add sum.field term;
    i := local_ (!i - 1)
  done;
  M.to_float sum.field
;;
 
(* Benchmark driver: times one call of [pi_approx] and then one call of
   [pi_approx_while], each with 10_000_000 iterations. For each it prints the
   elapsed wall-clock seconds followed by the computed value, flushing stdout
   after every line. *)
let () =
  (* Recursive version. *)
  let start = Unix.gettimeofday () in
  let recursive_result = pi_approx 10_000_000 in
  let stop = Unix.gettimeofday () in
  Printf.printf "%f %f\n" (stop -. start) recursive_result;
  flush stdout;
  (* While-loop version. *)
  let start = Unix.gettimeofday () in
  let loop_result = pi_approx_while 10_000_000 in
  let stop = Unix.gettimeofday () in
  Printf.printf "%f %f\n" (stop -. start) loop_result;
  flush stdout
;;

Output:

0.023381 1.644934
0.062211 1.644934