I am trying to use Jane Street’s Rpc_parallel library in OCaml to create multiple workers on different remote servers. Starting from the simple example that Jane Street provides, I wrote a program that computes the sum of the numbers from 0 to max on the host 172.17.0.3. The code can be seen below:
open Core
open Async
(* A bare bones use case of the [Rpc_parallel] library. This demonstrates how to
define a simple worker type that implements some functions. The master then spawns a
worker of this type and calls a function to run on this worker *)
module Sum_worker = struct
  module T = struct
    (* The record of functions a worker of this type exposes. Here there is
       exactly one, [sum : int -> int]. The ['worker] parameter ties each
       function to the worker type it belongs to, so it can only be run on a
       [Sum_worker] worker. *)
    type 'worker functions = { sum : ('worker, int, int) Rpc_parallel.Function.t }

    (* Workers carry no state: the init argument and the state are both [unit]. *)
    module Worker_state = struct
      type init_arg = unit [@@deriving bin_io]
      type t = unit
    end

    (* Connections carry no state either. *)
    module Connection_state = struct
      type init_arg = unit [@@deriving bin_io]
      type t = unit
    end

    module Functions
        (C : Rpc_parallel.Creator
         with type worker_state := Worker_state.t
          and type connection_state := Connection_state.t) =
    struct
      (* Implementation behind the [sum] function: adds up the integers
         [0; 1; ...; upper - 1], logs and prints the total, then returns it. *)
      let sum_impl ~worker_state:() ~conn_state:() upper =
        let total = List.init upper ~f:Fn.id |> List.fold ~init:0 ~f:( + ) in
        Log.Global.info "Sum_worker.sum: %i\n" total;
        Core.printf !"%d" total;
        return total
      ;;

      (* Turn [sum_impl] into a [Rpc_parallel.Function.t], using [Int.bin_t] for
         both the argument and the result. *)
      let sum = C.create_rpc ~f:sum_impl ~bin_input:Int.bin_t ~bin_output:Int.bin_t ()

      (* Must have the shape of the ['worker functions] record declared above. *)
      let functions = { sum }

      (* Nothing to set up for either kind of state. *)
      let init_worker_state () = return ()
      let init_connection_state ~connection:_ ~worker_state:_ () = return ()
    end
  end

  include Rpc_parallel.Make (T)
end
(* [main max log_dir ()] spawns one [Sum_worker] from the executable
   "/app/simple.exe" on the remote host "root@172.17.0.3", asks it to run [sum]
   with argument [max], and prints the reply on this process's stdout.

   [log_dir] is forwarded to [spawn] as [?cd]. When it is [Some _] the worker's
   stdout and stderr are appended to "sum.out" and "sum.err"; when it is [None]
   both are sent to [`Dev_null].

   Returns an [Or_error] deferred: an error from [spawn] or from running the
   function short-circuits the rest of the pipeline. *)
let main max log_dir () =
  let worker_location =
    Rpc_parallel.Executable_location.Remote
      (Rpc_parallel.Remote_executable.existing_on_host
         ~executable_path:"/app/simple.exe"
         ~strict_host_key_checking:`No
         "root@172.17.0.3")
  in
  let redirect_stdout, redirect_stderr =
    match log_dir with
    | Some _ -> `File_append "sum.out", `File_append "sum.err"
    | None -> `Dev_null, `Dev_null
  in
  Sum_worker.spawn
    ~on_failure:Error.raise
    ?cd:log_dir
    ~shutdown_on:Disconnect
    ~redirect_stdout
    ~redirect_stderr
    ~where:worker_location
    ~connection_state_init_arg:()
    ()
  >>=? fun worker ->
  Sum_worker.Connection.run worker ~f:Sum_worker.functions.sum ~arg:max
  >>|? fun total -> Core.Printf.printf "sum_worker: %d\n%!" total
;;
(* Command-line interface of the master: a required [-max INT] flag and an
   optional [-log-dir] flag, both handed to [main] in that order. *)
let command =
  let spec =
    let open Command.Spec in
    empty
    (* The worker sums [0 .. max - 1], hence "exclusive". The leading space
       matches the style of the [log-dir] doc string below. *)
    +> flag "max" (required int) ~doc:" Exclusive upper bound of the integers to sum"
    +> flag "log-dir" (optional string) ~doc:" Folder to write worker logs to"
  in
  (* Make sure to always use [Command.async] *)
  Command.async_spec_or_error ~summary:"Simple use of Async Rpc_parallel V2" spec main
;;
(* Program entry point. The call to [Rpc_parallel.start_app] must stay at top
   level. NOTE(review): the [Scheduler.go] call only runs if [start_app]
   returns; with an async command it presumably never does -- confirm. *)
let () =
  Rpc_parallel.start_app command;
  Scheduler.go () |> never_returns
;;
The master node tries to spawn a worker on the remote host. When I run simple.exe, it asks me for permission to ssh into 172.17.0.3, and the ssh connection succeeds. However, I get the following error:
[WORKER STDERR]: (((pid 283) (thread_id 0)) "2018-07-27 22:17:22.611147668Z"
[WORKER STDERR]: "unhandled exception in Async scheduler"
[WORKER STDERR]: ("unhandled exception"
[WORKER STDERR]: ((monitor.ml.Error
[WORKER STDERR]: ("Worker failed to register"
[WORKER STDERR]: (monitor.ml.Error
[WORKER STDERR]: (src/core_unix.ml.Inet_addr.Get_inet_addr ubuntu "host not found")
[WORKER STDERR]: ("Raised at file \"src/import0.ml\" (inlined), line 351, characters 22-32"
[WORKER STDERR]: "Called from file \"src/result.ml\", line 168, characters 17-26"
[WORKER STDERR]: "Called from file \"src/deferred1.ml\", line 20, characters 40-45"
[WORKER STDERR]: "Called from file \"src/job_queue.ml\", line 159, characters 6-47"))
[WORKER STDERR]: src/parallel.ml:1365:19)
[WORKER STDERR]: ("Raised at file \"src/import0.ml\" (inlined), line 351, characters 22-32"
[WORKER STDERR]: "Called from file \"src/error.ml\" (inlined), line 9, characters 14-30"
[WORKER STDERR]: "Called from file \"src/error.ml\" (inlined), line 5, characters 2-50"
[WORKER STDERR]: "Called from file \"src/parallel.ml\", line 1365, characters 19-78"
[WORKER STDERR]: "Called from file \"src/deferred1.ml\", line 20, characters 40-45"
[WORKER STDERR]: "Called from file \"src/job_queue.ml\", line 159, characters 6-47"
[WORKER STDERR]: "Caught by monitor main"))
[WORKER STDERR]: ((pid 283) (thread_id 2)))))
It says that the host is not found, which I find strange given that I ssh’d into the host without any problem. Does anybody have a clue why this is happening?