Hello, I experimented with different ways of replicating the Unix command `cat infile > outfile` for a large-ish file (~1 GB), using the `Lwt_io`, `Lwt_process`, and `Lwt_stream` modules. I am only using `cat` here as a simplification for repeatable testing - the actual command producing the > 1 GB of data is different in my actual use-case.
My first attempt to do this was:
open Lwt.Syntax
(* File whose contents are copied by this experiment. *)
let infile = "perf.blob"

(* Command in the (program, argv) form taken by [Lwt_process]. *)
let cmd = ("", [|"cat"; infile|])

(* [using_stream outfile] runs [cmd] and copies its standard output into
   [outfile], going through a character stream. *)
let using_stream outfile =
  (* Start the command first, then open the destination file. *)
  let chars = Lwt_process.pread_chars cmd in
  let write_all och = Lwt_io.write_chars och chars in
  let* () = Lwt_io.with_file ~mode:Output outfile write_all in
  Lwt.return_unit
This turned out to take a long time, and after running `callgrind` on it, it became obvious that this was a bad idea - a character at a time, the whole 1 GB makes it into the heap, requiring several allocations and collections.
So I thought using a channel would be more appropriate, with a single buffer allocated once, used for reading into and writing from, and I came up with this code:
open Lwt.Syntax
(* File whose contents are copied by this experiment. *)
let infile = "perf.blob"

(* Command in the (program, argv) form taken by [Lwt_process]. *)
let cmd = ("", [|"cat"; infile|])

(* [using_channel outfile] runs [cmd] and copies its standard output into
   [outfile] chunk by chunk, reusing one buffer for every read and write. *)
let using_channel outfile =
  (* performance not sensitive to this size *)
  let chunk_size = 1024 in
  let chunk = Bytes.make chunk_size '\000' in
  (* Read up to [chunk_size] bytes from [ich], write exactly the bytes that
     were read to [och], and repeat until a read returns no bytes. *)
  let copy_all ich och =
    let rec pump () =
      let* got = Lwt_io.read_into ich chunk 0 chunk_size in
      if got <= 0 then Lwt.return_unit (* end of input *)
      else
        let* () = Lwt_io.write_from_exactly och chunk 0 got in
        pump ()
    in
    pump ()
  in
  let* () =
    Lwt_io.with_file ~mode:Lwt_io.output outfile (fun och ->
        Lwt_process.with_process_in cmd (fun proc ->
            copy_all proc#stdout och))
  in
  Lwt.return_unit
Then, I compared this to using the `system` command:
open Lwt.Syntax
(* File whose contents are copied by this experiment. *)
let infile = "perf.blob"

(* [using_system outfile] copies [infile] to [outfile] by handing
   [cat infile > outfile] to the shell through [Lwt_unix.system].

   Both paths go through [Filename.quote] so that names containing spaces
   or shell metacharacters are passed to the shell as single words instead
   of being split or interpreted.

   The exit status of the command is deliberately ignored. *)
let using_system outfile =
  let command =
    Printf.sprintf "cat %s > %s" (Filename.quote infile)
      (Filename.quote outfile)
  in
  (* ignore status *)
  let* _status = Lwt_unix.system command in
  Lwt.return ()
as well as using the redirection functionality of `Lwt_process.exec`:
open Lwt.Syntax
(* File whose contents are copied by this experiment. *)
let infile = "perf.blob"

(* Command in the (program, argv) form taken by [Lwt_process]. *)
let cmd = ("", [|"cat"; infile|])

(* [using_redirection outfile] runs [cmd] with its standard output
   redirected straight to [outfile], so no data passes through this
   process.

   The file is opened write-only and truncated, matching what the shell
   does for [> outfile]. Without [O_TRUNC], an existing [outfile] longer
   than the new output would keep its old trailing bytes.

   The exit status of the command is deliberately ignored. *)
let using_redirection outfile =
  let fd = Unix.openfile outfile Unix.[O_WRONLY; O_CREAT; O_TRUNC] 0o640 in
  (* NOTE(review): per the Lwt_process documentation, [`FD_move] closes
     [fd] in the parent once the child is spawned, which is why there is
     no explicit [Unix.close] here; confirm for the Lwt version in use. *)
  let* _status = Lwt_process.exec ~stdout:(`FD_move fd) cmd in
  (* ignore status *)
  Lwt.return ()
The `using_stream` version was so slow that I never had the patience to wait for it to finish. The stats (measured using `Unix.gettimeofday ()`) for the other methods look consistently like this:
using_redirection took 0.635495 seconds
using_system took 1.689260 seconds
using_channel took 7.045363 seconds
I am wondering:
a) Is there a better/faster way to redirect the output of a command to file / input to another command?
b) Am I doing something obviously wrong in any of these implementations?
c) Why might `redirection` be faster than `system`?
Thanks in advance for your help!