I have a string that contains HTML-special characters such as “&” (and others, such as quotes), and I would like to escape these to their HTML equivalents. How could I do this?
You could implement a scanner (module) using OCamlLex to do that: based on regular expressions, it scans the string and builds a buffer with the critical characters escaped. It’s a bit unusual but quite efficient.
{
(* ocamllex scan.mll && ocamlc -o scan scan.ml *)
module L = Lexing
module B = Buffer
let get = L.lexeme
let sprintf = Printf.sprintf
(* [position lexbuf] renders the current position of [lexbuf] as
   file:line:column, where the column is [pos_cnum - pos_bol]. *)
let position lexbuf =
  let { L.pos_fname; pos_lnum; pos_cnum; pos_bol } = lexbuf.L.lex_curr_p in
  sprintf "%s:%d:%d" pos_fname pos_lnum (pos_cnum - pos_bol)
(* Raised by [error]. The payload is the lexer position, a space, and the
   formatted message. *)
exception Error of string

(* [error lexbuf fmt arg ...] formats a message printf-style, prefixes it
   with [position lexbuf] and raises [Error]. It never returns normally.

   Uses [Printf.ksprintf]; [Printf.kprintf] is its deprecated synonym. *)
let error lexbuf fmt =
  Printf.ksprintf
    (fun msg -> raise (Error (position lexbuf ^ " " ^ msg)))
    fmt
}
(* [escape b lexbuf] scans all of [lexbuf], accumulating into buffer [b] a
   copy of the input in which the five HTML-special characters are replaced
   by character references. At end of input it returns the accumulated
   string and clears [b].

   The replacement strings must be the entity references themselves; writing
   the raw character back would leave the input unescaped. The apostrophe
   uses the numeric reference because the named one is not part of HTML 4. *)
rule escape b = parse
  | '&'   { B.add_string b "&amp;";  escape b lexbuf }
  | '"'   { B.add_string b "&quot;"; escape b lexbuf }
  | '\''  { B.add_string b "&#39;";  escape b lexbuf }
  | '>'   { B.add_string b "&gt;";   escape b lexbuf }
  | '<'   { B.add_string b "&lt;";   escape b lexbuf }
  (* Longest run of characters that need no escaping, copied verbatim. *)
  | [^ '&' '"' '\'' '>' '<']+
          { B.add_string b (get lexbuf); escape b lexbuf }
  | eof   { let x = B.contents b in B.clear b; x }
  (* Defensive fallback: unreachable as long as the character class above
     is the exact complement of the escaped characters. *)
  | _     { error lexbuf "don't know how to quote: %s" (get lexbuf) }
{
(* [escape str] runs the [escape] lexer rule over [str] with a fresh
   100-byte buffer and returns the rule's result. *)
let escape str =
  let lexbuf = L.from_string str in
  let buf = B.create 100 in
  escape buf lexbuf
(* [main ()] drops the first element of [Sys.argv] and prints the escaped
   form of every remaining argument, one per line. *)
let main () =
  match Array.to_list Sys.argv with
  | [] -> failwith "tl"
  | _ :: args -> List.iter (fun arg -> print_endline (escape arg)) args

(* Run [main] at program startup. *)
let () = main ()
}
Here’s a function that escapes certain characters of a string and appends the result to a Buffer.t value. It should be easy to adapt to your wishes as long as your escapes are of the form char -> string:
(** [add_esc b s] appends [s] to [b], replacing the characters [&], [<],
    [>], single quote, double quote and [@] by HTML character references.
    All other characters are copied unchanged.

    Single quote and [@] use numeric references ([&#39;] and [&#64;]),
    which are valid in every HTML version. To escape another character,
    add a case to [entity]. *)
let add_esc : Buffer.t -> string -> unit = fun b s ->
  let len = String.length s in
  (* [entity c] is the replacement text for [c], or [None] if [c] is
     copied as is. *)
  let entity = function
    | '&' -> Some "&amp;"
    | '<' -> Some "&lt;"
    | '>' -> Some "&gt;"
    | '\'' -> Some "&#39;"
    | '"' -> Some "&quot;"
    | '@' -> Some "&#64;"
    | _ -> None
  in
  (* Append the pending run of unescaped characters, i.e. indices
     [start] to [i - 1], in a single call. *)
  let flush start i =
    if i > start then Buffer.add_substring b s start (i - start)
  in
  (* [start] is the first index not yet written to [b]; [i] is the index
     being examined. *)
  let rec loop start i =
    if i >= len then flush start i
    else
      let next = i + 1 in
      match entity s.[i] with
      | None -> loop start next
      | Some esc ->
        flush start i;
        Buffer.add_string b esc;
        loop next next
  in
  loop 0 0
Are you looking to produce an HTML encoding, that is, text that, when viewed in a browser as HTML, will display the source text? Or are you looking to do URL-encoding, that is, an encoding from which the component parts of the URL can be taken apart and, once decoded, have the values you started with?
In the latter case, there’s Netencoding.Url, which has two utility functions to encode and decode URLs and their parameters.
Thank you, everybody, for the suggested solutions! I was looking for a library function that I can call, and the Netencoding solution seems to have worked.