Decoding many Unicode strings with uutf

Uutf’s decoder and encoder abstractions do a lot of stuff for you that you are absolutely not using here.

  1. Use Uutf.String.fold_utf_8. It doesn’t go through the decoder abstraction.
  2. If you are on OCaml >= 4.06, use Stdlib.Buffer.add_utf_8_uchar directly.

And once you can afford 4.14, I suggest you ditch uutf for the Stdlib UTF decoders which do not allocate at all.

Incidentally, I just wrote such a loop using them (warning: the code is untested; note that it also replaces U+0000 by Uchar.rep). It will not allocate a new string if the input has no decoding error and no U+0000.

(** [cleanup_input s] is [s] in which every NUL byte and every malformed
    UTF-8 byte sequence is replaced by the UTF-8 encoding of {!Uchar.rep}
    (U+FFFD). If [s] contains neither, [s] itself is returned and no new
    string is built. Relies on [String.get_utf_8_uchar] (OCaml >= 4.14). *)
let cleanup_input s =
  (* [span s i] classifies what starts at byte [i] of [s]. A positive
     result [n] is a well-formed, non-NUL character spanning [n] bytes; a
     negative result [-n] is [n] bytes that have to be replaced. An [int]
     is used rather than a variant so that scanning does not allocate. *)
  let span s i =
    match String.unsafe_get s i with
    | '\x00' -> -1
    | '\x01' .. '\x7F' (* US-ASCII *) -> 1
    | _ ->
        let dec = String.get_utf_8_uchar s i in
        let n = Uchar.utf_decode_length dec in
        if Uchar.utf_decode_is_valid dec then n else -n
  in
  (* [rewrite s len first_bad] builds the cleaned-up copy of [s], knowing
     that bytes [0 .. first_bad - 1] are fine and that a span needing
     replacement starts at [first_bad]. *)
  let rewrite s len first_bad =
    (* U+FFFD takes 3 bytes in UTF-8: [len + 2] is exact when a single
       one-byte error is replaced; the buffer grows by itself otherwise. *)
    let buf = Buffer.create (len + 2) in
    (* Copies the pending run of good bytes [from .. until - 1], if any. *)
    let keep from until =
      if from < len then Buffer.add_substring buf s from (until - from)
    in
    let rec go from i =
      if i >= len then begin
        keep from i;
        Buffer.contents buf
      end else
        let n = span s i in
        if n > 0 then go from (i + n)
        else begin
          (* [n] is negative here, so this skips past the bad bytes. *)
          let resume = i - n in
          keep from i;
          Buffer.add_utf_8_uchar buf Uchar.rep;
          go resume resume
        end
    in
    keep 0 first_bad;
    go first_bad first_bad
  in
  (* Fast path: walk [s] without building anything and only switch to
     [rewrite] at the first byte that needs replacing. *)
  let rec scan s len i =
    if i >= len then s
    else
      let n = span s i in
      if n > 0 then scan s len (i + n) else rewrite s len i
  in
  scan s (String.length s) 0