Uutf’s decoder and encoder abstractions do a lot of stuff for you that you are absolutely not using here.
- Use `Uutf.String.fold_utf_8`. It doesn’t go through the decoder abstraction.
- If you are on OCaml >= 4.06, directly use `Stdlib.Buffer.add_utf_8_uchar`.
And once you can afford 4.14, I suggest you ditch uutf for the Stdlib UTF decoders which do not allocate at all.
Incidentally, I just wrote such a loop using them (warning: the code is untested; also, it replaces U+0000 by Uchar.rep). It will not allocate a new string if there is no decoding error and no U+0000 in the input.
(* [cleanup_input s] is [s] in which every U+0000 byte and every UTF-8
   decoding error is replaced by the UTF-8 encoding of [Uchar.rep].

   The string is first scanned without building anything; if no offending
   byte is found, [s] itself is returned. Only when one is found is a
   buffer created and the repaired string assembled. *)
let cleanup_input s =
  let last = String.length s - 1 in
  (* Slow path: [first_bad] is the index of the first byte that must be
     replaced; everything before it is already known to be fine. *)
  let repair first_bad =
    let out = Buffer.create (String.length s + 2 (* assume only one error *)) in
    (* [run_start] is where the current not-yet-copied clean run begins,
       [pos] is the index of the byte being examined. *)
    let run_start = ref first_bad in
    let pos = ref first_bad in
    (* Copies the clean run [run_start, upto) to [out], if there is one. *)
    let copy_run upto =
      if !run_start <= last
      then Buffer.add_substring out s !run_start (upto - !run_start)
    in
    (* Emits the pending clean run, then [Uchar.rep] in place of the
       [width] offending bytes at [pos], and resumes right after them. *)
    let substitute width =
      copy_run !pos;
      Buffer.add_utf_8_uchar out Uchar.rep;
      pos := !pos + width;
      run_start := !pos
    in
    (* The prefix validated by the fast path is copied verbatim. *)
    if 0 <= last then Buffer.add_substring out s 0 first_bad;
    while !pos <= last do
      match String.unsafe_get s !pos with
      | '\x00' -> substitute 1
      | '\x01' .. '\x7F' (* US-ASCII *) -> incr pos
      | _ ->
          let dec = String.get_utf_8_uchar s !pos in
          let width = Uchar.utf_decode_length dec in
          if Uchar.utf_decode_is_valid dec
          then pos := !pos + width
          else substitute width
    done;
    (* Trailing clean run, if any. *)
    copy_run !pos;
    Buffer.contents out
  in
  (* Fast path: walk the string until the end or the first offending byte. *)
  let rec scan i =
    if i > last then s
    else
      match String.unsafe_get s i with
      | '\x00' -> repair i
      | '\x01' .. '\x7F' (* US-ASCII *) -> scan (i + 1)
      | _ ->
          let dec = String.get_utf_8_uchar s i in
          if not (Uchar.utf_decode_is_valid dec)
          then repair i
          else scan (i + Uchar.utf_decode_length dec)
  in
  scan 0