I have this little piece of code whose performance is critical.
If you have some ideas on how to accelerate it, you are very welcome.

If people are interested, I can create a small repository where the code
would be executable.

I am thinking about giving a try to BER MetaOCaml.

Thanks a lot,
Francois.

(* initialize x.grid as a vdW grid for ligand atomic number [l_a] *)
let vdW_grid g prot_atoms l_a =
let x_min, y_min, z_min = V3.to_triplet g.low in
let dx = g.step in
let n = A.length prot_atoms in
for i = 0 to g.nx - 1 do
let x = x_min +. (float i) *. dx in
for j = 0 to g.ny - 1 do
let y = y_min +. (float j) *. dx in
for k = 0 to g.nz - 1 do
let z = z_min +. (float k) *. dx in
let l_p = V3.make x y z in (* ligand atom position *)
for m = 0 to n - 1 do (* over all protein atoms *)
let p_p, p_a = A.unsafe_get prot_atoms m in
let r_ij = Math.non_zero_dist (V3.dist l_p p_p) in
let vdw = UFF.vdW_xiDi l_a p_a in
let p6 = FF.pow6 (vdw.x_ij /. r_ij) in
(* g.grid.{i,j,k} <-
g.grid.{i,j,k} +.
(vdw.d_ij *. ((-2.0 *. p6) +. (p6 *. p6))) *)
BA3.unsafe_set g.grid i j k
(BA3.unsafe_get g.grid i j k +.
(vdw.d_ij *. ((-2.0 *. p6) +. (p6 *. p6))))
done
done
done
done;
g

I donâ€™t know how expensive this operation is, but it looks like you could lift it out of the inner loop and pre-compute an array so it doesnâ€™t have to be repeated x*y*z times.

If n is not too large so that you can accommodate with n times float errors
(n = 1000 seems ok to me)
you can replace let x = x_min +. (float i) *. dx
by x := !x +. dx

On a little test, the gains are significant: about 40% speedup!

let x_min = 1.1
let y_min = 0.2
let z_min = -. 1.
let dx = 0.0001
let loop_mult n =
for i = 0 to n do
let x = x_min +. (float i) *. dx in
for j = 0 to n do
let y = y_min +. (float j) *. dx in
for k = 0 to n do
let z = z_min +. (float k) *. dx in
ignore ([| x; y; z|])
done
done
done
let loop_add n =
let x = ref x_min in
let y = ref y_min in
let z = ref z_min in
for i = 0 to n do
y := y_min;
for j = 0 to n do
z := z_min;
for k = 0 to n do
ignore ([| !x; !y; !z|]);
z := !z +. dx
done;
y := !y +. dx
done;
x := !x +. dx
done
let test f s =
let t1 = Sys.time () in
ignore (f s);
let t2 = Sys.time () in
Printf.printf "Elapsed time = %f\n" (t2 -. t1);;
let () =
test loop_add 1000;
test loop_mult 1000
(* ocamlfind ocamlopt -o loops loops.ml *)
(* result:
Elapsed time = 1.626832
Elapsed time = 2.665006
*)

I donâ€™t think I can give very good advice with just that piece of code.
Knowing what the expected values are for g.nx, g.ny, g.nz and n would help, but I canâ€™t say much about the inner loop without knowing how the various functions are implemented.
A small improvement, maybe: ((-2.0 *. p6) +. (p6 *. p6)) is equivalent (up to rounding errors) to ((p6 -. 2.0) *. p6), which saves one floating-point multiplication.
I might be able to find some time to investigate further if you setup a repository.

I also suspect that in V3.dist l_p p_p
you have a square root that is not necessary, since afterwards you compute the 6th power.
You should let r_ij be the norm squared
and then p6 = FF.pow3 (vdw.x_ij *. vdw.x_ij /. r_ij)

this is a good and natural idea but somehow this doesnâ€™t seem to speed up the loops:

let loop_array n =
let x_tbl = Array.init (n+1) (fun i -> x_min +. (float i) *. dx) in
let y_tbl = Array.init (n+1) (fun i -> y_min +. (float i) *. dx) in
let z_tbl = Array.init (n+1) (fun i -> z_min +. (float i) *. dx) in
for i = 0 to n do
let x = Array.unsafe_get x_tbl i in
for j = 0 to n do
let y = Array.unsafe_get y_tbl j in
for k = 0 to n do
let z = Array.unsafe_get z_tbl k in
ignore ([| x; y; z|])
done
done
done

I get Elapsed time = 1.679322 versus Elapsed time = 1.631657 for loop_add

Itâ€™s hard to know without knowing what numbers you are dealing with. E.g. if g.nx*g.ny*g.nz is somewhat
large (and n is somewhat small) then the cost of allocating in V3.make just to deconstruct it afterward (in V3.dist) might be significant. So you could maybe expose V3.dist6 taking 6 floats.
Also checking that small floating point functions are properly inlined (like Math.non_zero_dist) since if they are not, it may result in boxing their floating point argument.

I also donâ€™t think it makes that much of a difference but pre computing g.ny - 1, g.nz - 1 and g.grid in local variables could help ? (but maybe not, again depend of whether the loops bound are large etcâ€¦).

These are just random thought, not sure it improves anythingâ€¦

Yes, I wrote in this way ( let x = x_min +. (float i) *. dx) to avoid accumulation of FP errors. But even my biggest grid is quite small (100 x 100 x 100), so your proposition is reasonable.

You could try using gccjit to jit-compile the hot loop. If itâ€™s not too overwhelming, you can look at my array language implementation here: ocannl/arrayjit/lib/gccjit_backend.ml.

(* avoid division by 0 in case of too small atomic distance *)
let non_zero_dist x =
if x < 0.01 then
0.01 (* trick used in D. Horvath's S4MPLE docking program *)
else
x

0.01 was chosen because of the problem domain (i.e. this function is not meant to be generic)