Any suggestions on making this piece of OCaml numerical code more efficient?

Dear OCaml hackers,

I have this little piece of code whose performance is critical.
If you have some ideas on how to accelerate it, you are very welcome.

If people are interested, I can create a small repository where the code
would be executable.

I am thinking about giving a try to BER MetaOCaml.

Thanks a lot,
Francois.

  (* initialize x.grid as a vdW grid for ligand atomic number [l_a] *)
  let vdW_grid g prot_atoms l_a =
    let x_min, y_min, z_min = V3.to_triplet g.low in
    let dx = g.step in
    let n = A.length prot_atoms in
    for i = 0 to g.nx - 1 do
      let x = x_min +. (float i) *. dx in
      for j = 0 to g.ny - 1 do
        let y = y_min +. (float j) *. dx in
        for k = 0 to g.nz - 1 do
          let z = z_min +. (float k) *. dx in
          let l_p = V3.make x y z in (* ligand atom position *)
          for m = 0 to n - 1 do (* over all protein atoms *)
            let p_p, p_a = A.unsafe_get prot_atoms m in
            let r_ij = Math.non_zero_dist (V3.dist l_p p_p) in
            let vdw = UFF.vdW_xiDi l_a p_a in
            let p6 = FF.pow6 (vdw.x_ij /. r_ij) in
            (* g.grid.{i,j,k} <-
               g.grid.{i,j,k} +.
               (vdw.d_ij *. ((-2.0 *. p6) +. (p6 *. p6))) *)
            BA3.unsafe_set g.grid i j k
              (BA3.unsafe_get g.grid i j k +.
               (vdw.d_ij *. ((-2.0 *. p6) +. (p6 *. p6))))
          done
        done
      done
    done;
    g
2 Likes

I suspect @vlaviron might have some good ideas.
Maybe @ryanrhymes also.

I don’t know how expensive this operation is, but it looks like you could lift it out of the inner loop and pre-compute an array so it doesn’t have to be repeated x*y*z times.

This is just fetching a value from a 2D array.

This seems pretty close to optimal.
Have you looked at the code compilation in godbolt?
If the floats allocate, you can try using Owl_base.

1 Like

If n is not too large so that you can accommodate with n times float errors
(n = 1000 seems ok to me)
you can replace
let x = x_min +. (float i) *. dx
by
x := !x +. dx

On a little test, the gains are significant: about 40% speedup!

let x_min = 1.1
let y_min = 0.2
let z_min = -. 1.
let dx = 0.0001

let loop_mult n =
  for i = 0 to n do
    let x = x_min +. (float i) *. dx in
    for j = 0 to n do
      let y = y_min +. (float j) *. dx in
      for k = 0 to n do
        let z = z_min +. (float k) *. dx in
        ignore ([| x; y; z|])
      done
    done
  done

let loop_add n =
  let x = ref x_min in
  let y = ref y_min in
  let z = ref z_min in
  for i = 0 to n do
    y := y_min;
    for j = 0 to n do
      z := z_min;
      for k = 0 to n do
        ignore ([| !x; !y; !z|]);
        z := !z +. dx
      done;
      y := !y +. dx
    done;
    x := !x +. dx
  done

let test f s =
  let t1 = Sys.time () in
  ignore (f s);
  let t2 = Sys.time () in
  Printf.printf "Elapsed time = %f\n" (t2 -. t1);;

let () =
  test loop_add 1000;
  test loop_mult 1000

(* ocamlfind ocamlopt -o loops loops.ml *)

(* result:

Elapsed time = 1.626832
Elapsed time = 2.665006

*)
1 Like

I don’t think I can give very good advice with just that piece of code.
Knowing what the expected values are for g.nx, g.ny, g.nz and n would help, but I can’t say much about the inner loop without knowing how the various functions are implemented.
A small improvement, maybe: ((-2.0 *. p6) +. (p6 *. p6)) is equivalent (up to rounding errors) to ((p6 -. 2.0) *. p6), which saves one floating-point multiplication.
I might be able to find some time to investigate further if you setup a repository.

1 Like

I also suspect that in V3.dist l_p p_p
you have a square root that is not necessary, since afterwards you compute the 6th power.
You should let r_ij be the norm squared
and then
p6 = FF.pow3 (vdw.x_ij *. vdw.x_ij /. r_ij)

2 Likes

We can do further and precompute all x,y,z

let x_tbl = Array.init g.nx (fun i -> x_min +. (float i) *. dx) in ...

But I guess most of the time are spent in distance and p6 functions.

this is a good and natural idea but somehow this doesn’t seem to speed up the loops:

let loop_array n =
  let x_tbl = Array.init (n+1) (fun i -> x_min +. (float i) *. dx) in
  let y_tbl = Array.init (n+1) (fun i -> y_min +. (float i) *. dx) in
  let z_tbl = Array.init (n+1) (fun i -> z_min +. (float i) *. dx) in
  for i = 0 to n do
    let x = Array.unsafe_get x_tbl i in
    for j = 0 to n do
      let y = Array.unsafe_get y_tbl j in
      for k = 0 to n do
        let z = Array.unsafe_get z_tbl k in
        ignore ([| x; y; z|])
      done
    done
  done

I get Elapsed time = 1.679322 versus Elapsed time = 1.631657 for loop_add

It’s hard to know without knowing what numbers you are dealing with. E.g. if g.nx*g.ny*g.nz is somewhat
large (and n is somewhat small) then the cost of allocating in V3.make just to deconstruct it afterward (in V3.dist) might be significant. So you could maybe expose V3.dist6 taking 6 floats.
Also checking that small floating point functions are properly inlined (like Math.non_zero_dist) since if they are not, it may result in boxing their floating point argument.

I also don’t think it makes that much of a difference but pre computing g.ny - 1, g.nz - 1 and g.grid in local variables could help ? (but maybe not, again depend of whether the loops bound are large etc…).

These are just random thought, not sure it improves anything…

Yes, I wrote in this way ( let x = x_min +. (float i) *. dx) to avoid accumulation of FP errors. But even my biggest grid is quite small (100 x 100 x 100), so your proposition is reasonable.

Ok, I’ll prepare a self-contained git repository w/ the relevant code and a runnable example.

You could try using gccjit to jit-compile the hot loop. If it’s not too overwhelming, you can look at my array language implementation here: ocannl/arrayjit/lib/gccjit_backend.ml.

What’s non_zero_dist?

P.S. I’m hoping to announce OCANNL next week.

2 Likes
(* avoid division by 0 in case of too small atomic distance *)
let non_zero_dist x =
  if x < 0.01 then
    0.01 (* trick used in D. Horvath's S4MPLE docking program *)
  else
    x

0.01 was chosen because of the problem domain (i.e. this function is not meant to be generic)

Ok, I managed to create an independent piece of code which is runnable
and with some realistic data sample:

There is now a runnable example in there:

Thanks for any improvements!

There is a runnable example in there:

cf. GitHub - UnixJunkie/vdW_grid_bench: Van der Waals 3D grid initialization benchmark for a runnable exemple

1 Like

Quick early summary: don’t forget to put --profile=release in your dune invocation, and adding [@@inline] annotations on V3.dist and V3.dist2 helps.

1 Like