Function call not inlined

Consider the following program:

let selection_sort cmp a left right =
  for i=left to right-2 do
    let leftmost = Array.get a i in
    let minimum_pos = ref i
    and minimum = ref leftmost in
    for j=i+1 to right-1 do
      let current = Array.get a j in
      if (cmp[@inlined always]) current !minimum < 0 then
        ( minimum := current;
          minimum_pos := j)
    done;
    Array.set a !minimum_pos leftmost;
    Array.set a i !minimum
  done;;

let mycmp x y = x - y;;

let n = 107 in
    let t = Array.init n (fun i -> i lxor 0x45) in
    selection_sort mycmp t 0 n;
    Array.iter (fun x -> Printf.printf "%d " x) t;;

Neither the classic inliner neither flambda at -O3 seem to be able to inline the call to cmp.

File "polymorph.ml", line 8, characters 9-48:
8 |       if (cmp[@inlined always]) current !minimum < 0 then
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Warning 55 [inlining-impossible]: Cannot inline: [@inlined] attribute was not used on this function application (the optimizer did not know what function was being applied)

I would have thought flambda would specialize the selection sort function for that particular cmp?!

Try adding an inline annotation to selection_sort as well. The compiler can’t really inline cmp without inlining selection_sort as a whole, but I’m guessing the inner annotation alone won’t trigger that. So the optimizer is deciding not to inline selection_sort and warns you it can’t satisfy the inline always constraint.

Right. Merely specifying that selection_sort should be inlined is not enough to make the warning disappear, because the compiler still emits it because of the generic version of the code. But if I make the selection_sort function private by using a module interface, indeed, it seems mycmp is not emitted.

Moving on now to the (more interesting) case of nested calls to be specialized.

Now for this example:

let selection_sort cmp a left right =
  for i=left to right-2 do
    let leftmost = Array.get a i in
    let minimum_pos = ref i
    and minimum = ref leftmost in
    for j=i+1 to right-1 do
      let current = Array.get a j in
      if (cmp [@inlined hint]) current !minimum < 0 then
        ( minimum := current;
          minimum_pos := j)
    done;
    Array.set a !minimum_pos leftmost;
    Array.set a i !minimum
  done
  [@@inline always];;

let pivot (cmp : 'a -> 'a -> int) (a : 'a array) (left : int) (right : int) =
  assert(0 <= left);
  assert(left <= right);
  assert(right <= Array.length a);
  let mid_start = ref left
  and mid_end = ref (left + 1)
  and current_pos = ref (right - 1)
  and pivot = a.(left) in
  while !mid_end <= !current_pos do
    assert (left <= !mid_start);
    assert (!mid_start < !mid_end);
    assert (!current_pos < right);
    let current = a.(!current_pos) in
    match (cmp [@inlined hint]) current pivot with
    | 0 ->
       a.(!current_pos) <- a.(!mid_end);
       mid_end := !mid_end + 1
    | n when n < 0 ->
       a.(!mid_start) <- current;
       a.(!current_pos) <- a.(!mid_end);
       mid_start := !mid_start+1;
       mid_end := !mid_end+1
    | _ ->
       current_pos :=  !current_pos - 1
  done;
  for i= !mid_start to !mid_end-1 do
    a.(i) <- pivot
  done;
  (!mid_start, !mid_end)
  [@@inline always];;

let selection_thresh = 10

let rec quicksort (cmp : 'a -> 'a -> int) (a : 'a array) (left : int) (right : int) =
  assert(0 <= left);
  assert(left <= right);
  assert(right <= Array.length a);
  if right - left <= selection_thresh
  then selection_sort cmp a left right
  else
    let (mid_start, mid_end) = pivot cmp a left right in
    let len_left = mid_start - left
    and len_right = right - mid_end in
    if len_left < len_right then
      (quicksort cmp a left mid_start;
       quicksort cmp a mid_end right)
    else
      (quicksort cmp a mid_end right;
       quicksort cmp a left mid_start)
    [@@inline always];;

It seems that quicksort is not specialized (one sees it being called with a closure).

Note that for Flambda at least, you can pass the -inlining-report flag to the compiler, and it will generate some human-readable description of the inlining decisions. In particular, it will show that the size of the selection_sort function is significantly bigger than any benefit from specialising the comparison, which is why it’s not inlined (or specialised, which is almost the same for non-recursive functions).

We have thought about a few features that would have helped in this case (the initial one), most notably an annotation on parameters that triggers specialisation/inlining if the parameter is “known” at the call site (in your case, you could have added the annotation on the cmp parameter), but it’s not implemented yet.
We do have in internship proposition still open on the subject (in french),

In the quicksort example, what would be needed would be the ability to specialize functions without inlining. Is this feasible?

Yes, it only works for recursive functions but you can use [@@specialise always] instead of [@@inline always].

[@@specialise always] does not seem to have an effect on quicksort, or maybe I used it wrong?

Did you check what happens when the function is used, outside of its definition ? The definition itself cannot be specialised, since you don’t know its parameters yet. The specialise annotation should ensure that whenever you call quicksort (outside its definition), a new version is created which is specialised to the comparison function you gave.

My bad, I took the wrong version of the .s file, and indeed the mycmp function disappeared, meaning it was inlined.

Now getting the same problem with this version. Normally quicksort and its sub-calls should be specialized, but they are not (the mycmp function is passed as a closure). Is this an issue with cross-module specialization/inlining?

It looks like a silly typo: the attribute is [@@specialise], not [@@specialize]. Could you check again with this fix ?

OH DEAR. American vs British spelling, and I tend to write American. Thanks!

Indeed, it seems that [@@specialise] works. I’ll see if I can make the big arrays polymorphic as well.

1 Like

Let us consider now the specialization of accesses to big arrays.

open Bigarray

type array_type = (int, int_elt, c_layout) Array1.t

let selection_sort  (cmp : 'a -> 'a -> int)
      (a : array_type) left right =
  for i=left to right-2 do
    let leftmost = Array1.get a i in
    let minimum_pos = ref i
    and minimum = ref leftmost in
    for j=i+1 to right-1 do
      let current = Array1.get a j in
      if (cmp[@inlined hint]) current !minimum < 0 then
        ( minimum := current;
          minimum_pos := j)
    done;
    Array1.set a !minimum_pos leftmost;
    Array1.set a i !minimum
  done
  [@@inline always];;

let mycmp x y = x - y;;

let n = 107 in
    let t = Array1.init Int C_layout n (fun i -> i lxor 0x45) in
    selection_sort mycmp t 0 n;
    for i=0 to n-1 do
      Printf.printf "%d " (Array1.get t i)
    done;;

Compiling this code yields no caml_ba function calls. Now compile a empty .mli file (to ensure that no generic version of the sorting code is produced), and remove the type annotation in the signature of selection_sort. The code then produced contains caml_ba calls. It looks like function specialization does not specialize Bigarray accesses regarding to layout and kind.

Indeed. Array and Bigarray specialisation relies on the typing information, while inlining and specialisation of functions occur after most of the types have been discarded. So even when the function is specialised, we don’t have access to the typing information that would allow us to specialise the Bigarray accesses.
We’ve managed to rebuild some amount of Array specialisation after inlining in our experimental Flambda 2 branch, but we haven’t done the same for Bigarrays yet. Hopefully by the time Flambda 2 is merged Bigarrays will be handled too, but this is likely not going to happen anytime soon.

Thank you for taking the time to answer my naive questions.

By the way, if you want your Bigarray example to work, you can cheat by passing the get/set functions as parameters:

let selection_sort get set (cmp : 'a -> 'a -> int)
      a left right =
  for i=left to right-2 do
    let leftmost = get a i in
    let minimum_pos = ref i
    and minimum = ref leftmost in
    for j=i+1 to right-1 do
      let current = get a j in
      if (cmp[@inlined hint]) current !minimum < 0 then
        ( minimum := current;
          minimum_pos := j)
    done;
    set a !minimum_pos leftmost;
    set a i !minimum
  done
  [@@inline always];;

let mycmp x y = x - y;;

open Bigarray

type array_type = (int, int_elt, c_layout) Array1.t

let () =
  let n = 107 in
  let t = Array1.init Int C_layout n (fun i -> i lxor 0x45) in
  let[@inline] get (a : array_type) i = Array1.get a i in
  let[@inline] set (a : array_type) i v = Array1.set a i v in
    selection_sort get set mycmp t 0 n;
    for i=0 to n-1 do
      Printf.printf "%d " (Array1.get t i)
    done;;

This trick means that you never generate non-specialised bigarray operations, so you should get the result you want. (And as a bonus, you can reuse the same code for arrays, bigarrays, and whatever other datatypes you might think of that implements get and set.)

Oh, indeed. (For the record: I was trying to evaluate the how Domainslib could help us speed up some computations, and I tried implementing parallel quicksort first as a test, then I got sucked into issues that did not have anything to do with concurrency.)

1 Like