Please comment my code ... thanks!

Hello!

I’m quite new to OCaml, and my brain is still stuck in C-world. Reading questions that I don’t even understand at all is proof enough.
So I ask you to comment my code and to get it more OCaml-ish. I encourage you to nitpick. Indentation, naming, … as you like.

This is only part of a self-assigned small project.

type tag_variant = XML_tag_empty | XML_tag_open | XML_tag_close | XML_no_tag (* <x/> | <x> | </x> | no tag found *)
exception XML_attrib_failure
let escape_pair_list = [("&amp;", "&"); ("&lt;", "<"); ("&gt;", ">"); ("&apos;", "'"); ("&quot;", "\"")]

type tag_attr_rcrd_t = {
  attrib : string;
  value  : string;
}

type xml_element_rcrd_t = {
  tag_str   : string;
  tag_type  : tag_variant;
  tag_attrs : tag_attr_rcrd_t list;
  value     : string;
  children  : xml_element_rcrd_t list;
}

let rec xml_format ?(indent = 2) ?(level = 0) element =
  let flat = indent = 0 in

    let get_fmt tag_type : _ format =
      match tag_type with 
      | XML_tag_empty -> if flat then "%s<%s/>" else "\n%s<%s/>" 
      | XML_tag_open  -> if flat then "%s<%s" else "\n%s<%s"
      | XML_tag_close -> "%s<%s/>"
      | XML_no_tag    -> "\n%s error %s"
    in

    let buff = Buffer.create 100 in
      bprintf buff (get_fmt element.tag_type) (String.make (indent * level) ' ') element.tag_str;
      List.iter (fun attr -> (bprintf buff " %s='%s'" attr.attrib attr.value)) element.tag_attrs;
      bprintf buff ">";

      (* No value may follow when children follow *)
      if element.children = [] then
        bprintf buff  "\"%s\"" element.value;

      List.iter (fun child -> bprintf buff "%s" (xml_format ~indent:indent ~level:(level + 1) child)) element.children;

      if element.tag_type = XML_tag_open then begin
        if element.children <> [] && not flat then
          bprintf buff "\n%s" (String.make (indent * level) ' ');

        bprintf buff (get_fmt XML_tag_close) "" element.tag_str;
      end;

      Buffer.contents buff

(* ------------------------------------------------------------------------- *)
let get_rest_str in_str start : string =
  if start >= String.length in_str then
    ""
  else
    String.sub in_str start ((String.length in_str) - start)

(* ------------------------------------------------------------------------- *)

let rec unescape_str ?(ndx = 0) ?(pair_list = escape_pair_list) str : string =

  let matches_at_index pair_list str index =
    try
      Some (List.find (fun pair -> match pair with (l, _) -> String.starts_with ~prefix:l (get_rest_str str index)) pair_list)

    with Not_found -> None
  in
    if ndx >= String.length str then
      str
    else begin
      let ref_str = ref str in
      try
        let amp_ndx = String.index_from !ref_str ndx '&' in
          match (matches_at_index pair_list !ref_str amp_ndx) with
          | Some (l, r) -> ref_str := String.sub str 0 amp_ndx ^ r ^ String.sub str (amp_ndx + String.length l) (String.length str - amp_ndx - String.length l);
                           unescape_str ~ndx:(amp_ndx + 1) ~pair_list:(pair_list) !ref_str
          | None -> !ref_str
      with Not_found -> !ref_str
      end

(* ------------------------------------------------------------------------- *)
let rec trim_left in_str : string =
  if String.length in_str = 0 then
    ""
  else 
    match in_str.[0] with
      | '\r'
      | '\n'
      | '\t'
      | '\012'
      | ' ' -> trim_left (String.sub in_str 1 ((String.length in_str) - 1))
      | _ -> in_str

type state = Start | Att_start | Equal | Value_start | Assemble

(* ------------------------------------------------------------------------- *)
type gatt_att_rcrd_t = {
  attr : string;
  rest : string;
}

let get_att_att in_str : gatt_att_rcrd_t =
  if (Str.string_match (Str.regexp "[A-Za-z][A-Za-z0-9]*") in_str 0) then begin  (* BUGBUG: Verify charset *)
    let attr = String.sub in_str 0 (Str.match_end ()) in
      {rest = get_rest_str in_str (String.length attr); attr = attr}
  end
  else
    raise XML_attrib_failure

(* ------------------------------------------------------------------------- *)
type gatt_equ_rcrd_t = {
  attr : string;
  quote_char : char;
  rest : string;
}

let get_att_equ (prev : gatt_att_rcrd_t) : gatt_equ_rcrd_t =
  let in_str = trim_left prev.rest in
    if String.length in_str = 0 then
      raise XML_attrib_failure
    else if in_str.[0] = '=' then begin
      let in_str = trim_left (get_rest_str in_str 1) in
        if String.length in_str = 0 then
          raise XML_attrib_failure
        else if (in_str.[0] = '\'') || (in_str.[0] = '"') then
          {attr = prev.attr; quote_char = in_str.[0]; rest = get_rest_str in_str 1}
        else
          raise XML_attrib_failure
    end
    else
      raise XML_attrib_failure

(* ------------------------------------------------------------------------- *)
type gatt_val_rcrd_t = {
  attr : string;
  value : string;
  rest : string;
}

let get_att_val (prev : gatt_equ_rcrd_t) : gatt_val_rcrd_t =
  let in_str = trim_left prev.rest in (* BUGBUG: really trim left? Then we would need a trim right too *)
    try
      let right = String.index in_str prev.quote_char in
        {attr = prev.attr; value = (String.sub in_str 0 right); rest = trim_left (get_rest_str in_str (right + 1))}
    
    with Not_found -> raise XML_attrib_failure
(* ------------------------------------------------------------------------- *)
let get_attributes in_str : tag_attr_rcrd_t list =
  let in_str = trim_left in_str in
  
    let rec loop in_str in_list =
      let res = get_att_att in_str |> get_att_equ |> get_att_val in
        let in_list = in_list @ [{attrib = res.attr; value = res.value}] in (* BUGBUG: unescape res.value *)
          if String.length res.rest = 0 then (* get_att_val returns a left-trimmed rest-string *)
            in_list
          else
            loop res.rest in_list
    in
      loop in_str []
    
(* ------------------------------------------------------------------------- *)

let get_tag_attrs in_tag_wrapped : wrapped_tag_rcrd_t =
  (* we are looking for a pattern like "tag_str attr1='value1' attr2="value2" ..." *) (* BUGBUG: Not sure about spaces around the '=' *)

  (* Are there any blanks? *)
  let in_tag_str = in_tag_wrapped.tag_rcrd.tag_str in (* just for convenience *)
    try
      let left = String.index in_tag_str ' ' in begin
        let tag_str = String.sub in_tag_str 0 left
        and attr_str = String.sub in_tag_str left (String.length in_tag_str - left) in
          printf "\ngetTagAttrs '%s':'%s'\n" tag_str attr_str;
          {tag_rcrd = {tag_type = in_tag_wrapped.tag_rcrd.tag_type; tag_str = tag_str; tag_attrs = (get_attributes attr_str)}; rest = in_tag_wrapped.rest; failed = in_tag_wrapped.failed}
      end
    with Not_found -> in_tag_wrapped (* Nothing to do and tag_attrs is already [] *)


Thanks for any input or tips.

1 Like

A quick critique…

Writing comments to explain each type and function would help.

My C++ code uses Javadoc-style comments, since I learned java first. Likewise for other programming languages that I develop computer programs and software with, such as Fortran, MATLAB, Ruby, Python, Perl, Tcl, Racket, and Haskell.

If you want, you can use Doxygen-style comments to enable automated documentation generation.

The main things that are missing are:

  • program (functional) specifications
    • E.g., what is the program supposed to do?
  • your assumptions, since we cannot read your mind
  • conditions to run the computer program, so that we can avoid exceptions when executing the program
  • What does the type/class model?
  • What does the function do, accepts as inputs, and provides as outputs?

Providing a test suite, or complementary program, to test the program and each function automatically for a range of conditions/inputs would be helpful.

If you realize that C++, Java, Python, C#, and JavaScript support functional programming to varying degrees, you can practice implementing your computer program in these programming languages, using functional programming, before porting them to OCaml. You cannot do likewise for C, since it is not a programming language that supports functional programming nor object-oriented programming well.

You don’t need Doxygen, OCaml has odoc comments for documentation. Eg

type tag_variant =
| XML_tag_empty (** <x/> *)
| XML_tag_open (** <x> *)
| XML_tag_close (** </x> *)
| XML_no_tag (** no tag found *)

Then use dune and odoc to generate the HTML.

1 Like

I think you might find the following 2 articles useful regarding coding style and namespacing in OCaml:

  1. Dromedary and a half - The Sense of Coding Style
  2. Namespacing Variants in ML
2 Likes

Ahhhh :frowning:
I missed the most obvious thing. An explanaition about what is what.
Overall, these are parts of an XML-parser.
It consists of three snippets:
1.)
xml_format that returns a string of a parsed XML-file that could be parsed again, lossless. The XML-tree is stored in xml_element_rcrd_t

get_rest_str is just a helper for the following functions, for completeness.

2.) unescape_str unescapes escapes in XML. Like “>”, “<” etc. Also known from HTML. The unescape can’t be done with regex (at least, I can’t). Think of something like “&lt;” that should translate to “<” and might no be reparsed. So unescape_str looks a bit odd.

3.) get_tag_attrs parses the attribute fields of an XML tag. How that looks like is described in the comment of that (last function). in_tag_wrapped (that I forgot) just provides the string, wrapped_tag_rcrd as result is the completed tag with the list of attribute/value and the remaining string to be processe by the following functions (like get the value of that tag or get its children).

So comments are missing, and I am aware of ocamldoc and use(d) it. I am using Doxygen extensively. So I am aware how much this helps. I left it out to keep the code short for posting here. My pitty!

I do know other languages and used them, not just C. Just to give you some background: Pascal, Modula-2, EIffel, C, C++, ObjC, Perl, NewtonScript, Tcl plus some oddities and even own languages that I wrote “compilers” (they do work with an OO evaluation-tree) for. I do not intend to translate C to C++ and then to OCaml, because that only ends in an abuse of OCaml. I started this little project from scrap (actually, it is the first part of something much bigger), albeit I do have a C-version of it. Just to avoid getting trapped in other languages.
I want to learn to think in OCaml and not know how to tweak other languages into OCaml. That’s my rationale behind this post.

I divided get_tag_attrs into several functions to increase readability. I know this set of 4 functions could be crammed into one. But I don’t think this would help. Also eases UnitTest if I can test them seperately.

Thank you!

1 Like