Html encoding of string

I have a string that contains HTML-special characters such as “&” (and others, such as quotes), and I would like to escape them to their HTML equivalents. How can I do this?

Example: a string like this (note link is not meant to work) https://www.youtube.com/watch?v=something&list=somethingelse&index=2&t=0s

You can use ocamlnet: http://projects.camlcity.org/projects/dl/ocamlnet-4.1.6/doc/html-main/Netencoding.Html.html

You could implement a scanner (module) using OCamlLex to do that: based on regular expressions, it scans the string and builds a buffer with the critical characters escaped. It’s a bit unusual but quite efficient.

{
    (* ocamllex scan.mll &&  ocamlc -o scan scan.ml *)

    (* Short aliases for the standard-library modules used by the scanner. *)
    module L = Lexing 
    module B = Buffer

(* [get lexbuf] is [Lexing.lexeme lexbuf]: the text matched by the lexer. *)
let get      = L.lexeme
(* Local shorthand for [Printf.sprintf]. *)
let sprintf  = Printf.sprintf

(* [position lexbuf] renders the current position of [lexbuf] as
   "file:line:column", where the column is the offset of the current
   character from the beginning of its line. *)
let position lexbuf =
    let { L.pos_fname; pos_lnum; pos_cnum; pos_bol } = lexbuf.L.lex_curr_p in
    let column = pos_cnum - pos_bol in
    sprintf "%s:%d:%d" pos_fname pos_lnum column

(** Raised by [error]; the payload is the lexer position followed by a
    space and the formatted message. *)
exception Error of string

(** [error lexbuf fmt ...] formats a message printf-style, prefixes it with
    [position lexbuf] and raises [Error]. It never returns normally.
    @raise Error always. *)
let error lexbuf fmt =
    (* [Printf.ksprintf] replaces [Printf.kprintf], its deprecated synonym. *)
    Printf.ksprintf (fun msg ->
        raise (Error (position lexbuf ^ " " ^ msg))) fmt

}

(* [escape b lexbuf] scans [lexbuf] to its end, appending the input to the
   buffer [b] with the characters & " ' > < replaced by the entity
   references &amp; &quot; &apos; &gt; &lt;. At end of input it returns the
   contents of [b] and clears [b]. *)
rule escape b = parse
| '&'       { B.add_string b "&amp;";  escape b lexbuf }
| '"'       { B.add_string b "&quot;"; escape b lexbuf }
| '\''      { B.add_string b "&apos;"; escape b lexbuf }
| '>'       { B.add_string b "&gt;";   escape b lexbuf }
| '<'       { B.add_string b "&lt;";   escape b lexbuf }
| [^'&' '"' '\'' '>' '<']+
            { (* copy a whole run of ordinary characters in one step *)
              B.add_string b @@ get lexbuf
            ; escape b lexbuf
            }
| eof       { let x = B.contents b in B.clear b; x }
| _         { (* defensive fallback: the clauses above already match every
                 single character, so this should not be reached *)
              error lexbuf
                "don't know how to quote: %s" (get lexbuf) }

{
(* [escape str] is [str] with its HTML-special characters escaped. It runs
   the generated [escape] lexer rule over [str] with a fresh buffer
   (initial capacity 100 bytes) and shadows that rule from here on. *)
let escape str =
  let lexbuf = L.from_string str in
  let buf = B.create 100 in
  escape buf lexbuf

(* Escape every command-line argument (skipping the program name in
   [Sys.argv.(0)]) and print each result on its own line. *)
let main () =
  Sys.argv
  |> Array.to_list
  |> List.tl
  |> List.iter (fun arg -> print_endline (escape arg))

(* Run [main] when the program starts. *)
let () = main ()
}
2 Likes

Here’s a function that escapes certain characters of a string and adds the result in a Buffer.t value. It should be easy to adapt to your wishes as long as your escapes are of the form char -> string:

(* [add_esc b s] appends [s] to [b], replacing each of the characters
   & < > ' " @ by its entity reference. Runs of characters that need no
   escaping are copied from [s] in a single [Buffer.add_substring] call. *)
let add_esc : Buffer.t -> string -> unit = fun b s ->
  let n = String.length s in
  (* Replacement text for a character that must be escaped, if any. *)
  let entity = function
    | '&' -> Some "&amp;"
    | '<' -> Some "&lt;"
    | '>' -> Some "&gt;"
    | '\'' -> Some "&apos;"
    | '"' -> Some "&quot;"
    | '@' -> Some "&commat;"
    | _ -> None
  in
  (* Copy the pending unescaped run s.[from .. upto - 1] verbatim. *)
  let copy_run from upto =
    if from < n then Buffer.add_substring b s from (upto - from)
  in
  (* [from] is where the pending run starts, [pos] the character examined. *)
  let rec scan from pos =
    if pos >= n then copy_run from pos
    else
      match entity s.[pos] with
      | None -> scan from (pos + 1)
      | Some replacement ->
        copy_run from pos;
        Buffer.add_string b replacement;
        scan (pos + 1) (pos + 1)
  in
  scan 0 0
4 Likes

Are you looking to produce an HTML encoding, that is, text that, when viewed in a browser as HTML, will display the source text? Or are you looking to do URL encoding, that is, an encoding from which the component parts of the URL can be taken apart and, once decoded, will have the values you started with?

In the latter case, there’s Netencoding.Url, which has two utility functions to encode and decode URLs and their parameters.

1 Like

Thank you. this is exactly what i was looking for!

Thank you everybody for the suggested solutions! I was looking for a library function that I can call, and the Netencoding solution seems to have worked.