type ('a, 'b) tree =
| Empty
| Node of ('a, 'b) tree * 'a * 'b * ('a, 'b) tree * int (* height *)
let rec find ~cmp t k =
match t with
| Empty -> None
| Node (l, nk, nv, r, _) ->
let cr = cmp k nk in
if cr = 0 then Some nv else if cr < 0 then find ~cmp l k else find ~cmp r k
;;

to

type ('a, 'b) tree =
| Empty
| Node of
{ left : ('a, 'b) tree
; right : ('a, 'b) tree
; key : 'a
; value : 'b
; height : int
}
let rec find ~cmp t k =
match t with
| Empty -> None
| Node n ->
let cr = cmp k n.key in
if cr < 0
then find ~cmp n.left k
else if cr > 0
then find ~cmp n.right k
else Some n.value
;;

It seems ocaml failed to optimize the first one to move the field fetching(s) after the branches.

I noticed that the Map in stdlib also written in first way, which perform worse than latter.

As you point out, the two codes are not equivalent; in the one using inline records you are doing the memory read later than in the one using tuples. If you were to destructure the record eagerly, ie

let rec find ~cmp t k =
match t with
| Empty -> None
| Node {key; value; left; right; _} ->
let cr = cmp k key in
if cr < 0
then find ~cmp left k
else if cr > 0
then find ~cmp right k
else Some value

then both versions should perform equivalently. The advantage of using inline records is the lightweight “dot” syntax that allows accessing the fields of the block in a way that is not possible using tuples.

How are you benchmarking them? Note that you have changed the order of comparisons between the two snippets, so branch prediction might be performing better in the second one, depending on the generated code. Try benchmarking the exact same code first.