Functorial vs "comparator" container interfaces?

We did such micro benchmarking. We compared the standard library Map and Hash functors, with Base’s comparator-style maps and hashtables, and the same structures that were created using the Core functors.

We were using OCaml 4.09.0, Core v0.12.0, and tried three different compilation options, (1) without any optimization, (2) with -O3 and (3) with -O3 and flambda switch.

The main takeaways. The standard library Maps and Hashtables are consistently slightly faster (~10%) than their Janestreet counterparts without flambda and are significantly faster when flambda is enabled (nearly twice as faster).

The experiment

We did a very simple test (that was representative for our problem) that inserts N integers into the map, then lookups the same integers. Each test case is represented as <Source>.<Structure>.<Size>, where <Source> is either Caml (the standard library), Base the Base library, or Core the functor interface, e.g., Map.Make(Int) (we also tried the Map.Int module but didn’t find a significant difference with Map.Make(Int)).

Here are the numbers:

1) vanilla compiler, no specific optimization options.

┌────────────────────┬─────────────┬───────────────┬───────────────┬───────────────┬────────────┐
│ Name               │    Time/Run │       mWd/Run │      mjWd/Run │      Prom/Run │ Percentage │
├────────────────────┼─────────────┼───────────────┼───────────────┼───────────────┼────────────┤
│ Caml.Map:16        │      2.52us │       371.02w │               │               │            │
│ Caml.Map:256       │     90.51us │    12_737.58w │        42.08w │        42.08w │      0.10% │
│ Caml.Map:65536     │ 74_726.08us │ 6_440_940.17w │ 1_733_335.64w │ 1_733_335.64w │     80.61% │
│ Base.Map:16        │      2.39us │       563.03w │         0.13w │         0.13w │            │
│ Base.Map:256       │     91.12us │    19_004.83w │        53.00w │        53.00w │      0.10% │
│ Base.Map:65536     │ 82_342.61us │ 9_626_675.28w │ 1_786_030.13w │ 1_786_030.13w │     88.82% │
│ Core.Map:16        │      2.78us │       557.03w │         0.14w │         0.14w │            │
│ Core.Map:256       │    103.61us │    18_998.83w │        53.22w │        53.22w │      0.11% │
│ Core.Map:65536     │ 92_704.59us │ 9_626_671.81w │ 1_784_322.55w │ 1_784_322.55w │    100.00% │
│ Caml.Hashtbl:16    │      2.01us │       177.01w │               │               │            │
│ Caml.Hashtbl:256   │     43.79us │     2_815.08w │        12.40w │        12.40w │      0.05% │
│ Caml.Hashtbl:65536 │ 17_783.56us │   590_934.33w │   392_225.13w │   262_163.13w │     19.18% │
│ Base.Hashtbl:16    │      2.16us │       108.00w │               │               │            │
│ Base.Hashtbl:256   │     50.56us │     2_547.07w │        16.21w │        16.21w │      0.05% │
│ Base.Hashtbl:65536 │ 21_912.52us │   522_319.67w │   651_433.70w │   520_865.70w │     23.64% │
│ Core.Hashtbl:16    │      2.15us │       100.00w │               │               │            │
│ Core.Hashtbl:256   │     51.14us │     2_539.07w │        16.84w │        16.84w │      0.06% │
│ Core.Hashtbl:65536 │ 22_198.09us │   522_311.89w │   651_436.44w │   520_868.44w │     23.94% │
└────────────────────┴─────────────┴───────────────┴───────────────┴───────────────┴────────────┘

2) vanilla compiler, -O3 option

┌────────────────────┬─────────────┬───────────────┬───────────────┬───────────────┬────────────┐
│ Name               │    Time/Run │       mWd/Run │      mjWd/Run │      Prom/Run │ Percentage │
├────────────────────┼─────────────┼───────────────┼───────────────┼───────────────┼────────────┤
│ Caml.Map:16        │      2.63us │       371.02w │               │               │            │
│ Caml.Map:256       │     95.79us │    12_737.58w │        41.88w │        41.88w │      0.11% │
│ Caml.Map:65536     │ 77_185.38us │ 6_440_940.17w │ 1_733_335.64w │ 1_733_335.64w │     89.80% │
│ Base.Map:16        │      2.44us │       563.03w │         0.13w │         0.13w │            │
│ Base.Map:256       │     90.29us │    19_004.83w │        52.90w │        52.90w │      0.11% │
│ Base.Map:65536     │ 81_299.52us │ 9_626_675.08w │ 1_786_544.39w │ 1_786_544.39w │     94.59% │
│ Core.Map:16        │      2.69us │       557.03w │         0.14w │         0.14w │            │
│ Core.Map:256       │    101.39us │    18_998.83w │        52.94w │        52.94w │      0.12% │
│ Core.Map:65536     │ 85_951.43us │ 9_626_672.41w │ 1_782_706.67w │ 1_782_706.67w │    100.00% │
│ Caml.Hashtbl:16    │      1.95us │       177.01w │               │               │            │
│ Caml.Hashtbl:256   │     43.34us │     2_815.08w │        12.42w │        12.42w │      0.05% │
│ Caml.Hashtbl:65536 │ 18_014.40us │   590_933.91w │   392_225.34w │   262_163.34w │     20.96% │
│ Base.Hashtbl:16    │      2.16us │       108.00w │               │               │            │
│ Base.Hashtbl:256   │     50.72us │     2_547.07w │        16.13w │        16.13w │      0.06% │
│ Base.Hashtbl:65536 │ 22_109.95us │   522_319.57w │   651_427.30w │   520_859.30w │     25.72% │
│ Core.Hashtbl:16    │      2.14us │       100.00w │               │               │            │
│ Core.Hashtbl:256   │     49.79us │     2_539.07w │        16.84w │        16.84w │      0.06% │
│ Core.Hashtbl:65536 │ 21_654.90us │   522_311.75w │   651_438.06w │   520_870.06w │     25.19% │
└────────────────────┴─────────────┴───────────────┴───────────────┴───────────────┴────────────┘

3) flambda compiler, -O3
┌────────────────────┬─────────────────┬───────────────┬───────────────┬───────────────┬────────────┐
│ Name               │        Time/Run │       mWd/Run │      mjWd/Run │      Prom/Run │ Percentage │
├────────────────────┼─────────────────┼───────────────┼───────────────┼───────────────┼────────────┤
│ Caml.Map:16        │        882.79ns │       360.02w │               │               │            │
│ Caml.Map:256       │     39_562.08ns │    12_726.58w │        41.68w │        41.68w │      0.05% │
│ Caml.Map:65536     │ 53_809_874.70ns │ 6_440_930.79w │ 1_735_117.84w │ 1_735_117.84w │     64.61% │
│ Base.Map:16        │      1_815.46ns │       546.03w │         0.12w │         0.12w │            │
│ Base.Map:256       │     79_654.15ns │    18_987.81w │        53.00w │        53.00w │      0.10% │
│ Base.Map:65536     │ 79_160_991.15ns │ 9_626_659.73w │ 1_787_850.52w │ 1_787_850.52w │     95.04% │
│ Core.Map:16        │      2_102.74ns │       546.03w │         0.12w │         0.12w │            │
│ Core.Map:256       │     90_244.58ns │    18_987.81w │        52.78w │        52.78w │      0.11% │
│ Core.Map:65536     │ 83_290_490.19ns │ 9_626_659.10w │ 1_786_810.45w │ 1_786_810.45w │    100.00% │
│ Caml.Hashtbl:16    │        457.38ns │       150.00w │               │               │            │
│ Caml.Hashtbl:256   │     13_812.19ns │     2_548.06w │        11.49w │        11.49w │      0.02% │
│ Caml.Hashtbl:65536 │ 10_470_655.56ns │   525_380.79w │   392_212.18w │   262_150.18w │     12.57% │
│ Base.Hashtbl:16    │      1_513.58ns │        89.00w │               │               │            │
│ Base.Hashtbl:256   │     40_135.58ns │     2_528.09w │        15.58w │        15.58w │      0.05% │
│ Base.Hashtbl:65536 │ 19_374_446.48ns │   522_301.02w │   650_583.30w │   520_015.30w │     23.26% │
│ Core.Hashtbl:16    │      1_534.87ns │        89.00w │               │               │            │
│ Core.Hashtbl:256   │     40_766.55ns │     2_528.09w │        15.58w │        15.58w │      0.05% │
│ Core.Hashtbl:65536 │ 19_377_124.52ns │   522_300.77w │   650_544.29w │   519_976.29w │     23.26% │
└────────────────────┴─────────────────┴───────────────┴───────────────┴───────────────┴────────────┘

Conclusions

As we can see, with flambda both Map and Hash tables of the standard library are significantly faster, 53 ms vs 92 ms for maps and 10 ms vs 19 ms for Hashtbls. We can also see that Janestreet times didn’t change between flambda and non-flambda version hence the increased difference. We can’t really tell whether it is because flambda has more opportunities for inlining for functors or because the flambda version of the JS library was compiled without the optimization information, while the standard library build system is somehow specialized for flambda. (It is important to notice, that when an flambda switch is used but -ON is not specified with N>=1 the generated code doesn’t differ in performance from the code generated by the normal compiler, therefore it is usually a good idea to set -O2 by default when packaging your project).

Source code

open! Core
open Core_bench

let benchmark = Bench.Test.create_indexed ~args:[ 16; 256; 65536 ]

module type Ints = sig
  type t
  val empty : unit -> t

  val add : int -> t -> t
  val mem : int -> t -> bool
end

module type Tests = sig
  type  t
  val create : int array -> t
  val count : int array -> t -> int
end

module MakeTest(Map : Ints) : Tests = struct
  type t = Map.t
  let create inputs =
    let init = Map.empty () in
    Array.fold inputs ~init ~f:(fun map x -> Map.add x map)

  let count inputs map =
    Array.fold inputs ~init:0 ~f:(fun sum x ->
        if Map.mem x map then sum + 1 else sum)
end

module SM = MakeTest(struct
    module Map = Caml.Map.Make(Int)
    type t = unit Map.t
    let empty () = Map.empty
    let add k m = Map.add k () m
    let mem = Map.mem
  end)

module CM = MakeTest(struct
    module Map = Map.Make(Int)
    type t = unit Map.t
    let empty () = Map.empty
    let add k m = Map.set ~key:k ~data:() m
    let mem k m = Map.mem m k
  end)

module BM = MakeTest(struct
    type t = unit Base.Map.M(Int).t
    let empty () = Base.Map.empty (module Int)
    let add k m = Base.Map.set ~key:k ~data:() m
    let mem k m = Map.mem m k
  end)

module BH = MakeTest(struct
    type t = unit Base.Hashtbl.M(Int).t
    let empty () = Base.Hashtbl.create ~size:16 (module Int)
    let add k m = Base.Hashtbl.set m ~key:k ~data:(); m
    let mem k m = Base.Hashtbl.mem m k
  end)

module SH = MakeTest(struct
    module H = Caml.Hashtbl.Make(Int)
    type t = unit H.t
    let empty () = H.create 16
    let add k m = H.add m k (); m
    let mem k m = H.mem m k
  end)

module CH = MakeTest(struct
    module H = Hashtbl.Make(Int)
    type t = unit H.t
    let empty () = H.create ~size:16 ()
    let add k m = H.set m k (); m
    let mem k m = H.mem m k
  end)

let (>:) name (module Test : Tests) =
  benchmark ~name @@ fun len ->
  let input = Array.init len ~f:ident in
  let random_state = Base.Random.State.make [|len|] in
  Array.permute ~random_state input;
  stage (fun () ->
      let map = Test.create input in
      assert (Test.count input map = Array.length input))


let () =
  Command.run @@ Bench.make_command [
    "Caml.Map" >: (module SM);
    "Base.Map" >: (module BM);
    "Core.Map" >: (module CM);
    "Caml.Hashtbl" >: (module SH);
    "Base.Hashtbl" >: (module BH);
    "Core.Hashtbl" >: (module CH);
  ]

Compiled with

ocamlfind ocamlopt -O3 -thread -package core_bench bench.ml -linkpkg -o bench
8 Likes