OCaml Parser combinator library as powerful as fastparse(Scala)?

@aryx

As promised the working definition of a combinator parser based on the library
fmlib.

First the abstract syntax (which I have slightly modified from your post):

(* Abstract syntax *)
type expression =
    | Literal of int
    | Variable of string
    | Plus of expression * expression
    | Lambda of string list * statement list

and statement =
    | Return of expression
    | If of expression * statement list * statement list
    | Assign of string * expression

and definition =
    string * string list * statement list


(* We want to parse a definition, therefore the final type is a definition *)
module Final =
struct
    type t = definition
end

Now the parser module

module Parse =
struct
    include Fmlib_parse.Character.Make (Unit) (Final) (String)

    module String_set = Set.Make (String)

    let keywords: String_set.t =
        String_set.(empty |> add "if" |> add "return" |> add "lambda")


    let whitespace: int t =
        skip_zero_or_more (char ' ' </> char '\n' </> char '\r')


    let keyword (kw: string): string t =
        let* str = backtrack (string kw) kw in
        let* _   = whitespace in
        return str


    (* A raw identifier is a string starting with letter, followed by zero or
     * more letters, digits or underscores. *)
    let raw_name: string t =
        let* first = letter in
        zero_or_more
            (String.make 1 first)
            (fun c str -> str ^ String.make 1 c)
            (letter </> range '0' '9'  </> char '_')


    (* An identifier is a raw identifier which is not a keyword. Whitespace is
     * stripped. *)
    let identifier: string t =
        let* id =
            backtrack
                (
                    let* name = raw_name in
                    if String_set.mem name keywords then
                        unexpected "identifier"
                    else
                        return name
                )
                "identifier"
        in
        let* _ = whitespace in
        return id


    (* Digit sequence with whitespace stripped. *)
    let number: expression t =
        let* n =
            one_or_more
                Fun.id
                (fun n d -> 10 * n + d)
                digit
        in
        let* _ = whitespace in
        return (Literal n)


    (* [p1] and [p2] are the opening and closing parentheses. *)
    let parenthesized (p1: char) (p2: char) (p: unit -> 'a t): 'a t =
        let* _ = char p1 in
        let* _ = whitespace in
        let* a = p () in
        let* _ = whitespace in
        let* _ = char p2 in
        let* _ = whitespace in
        return a


    (* '(' name1, name2, ... ')' *)
    let formal_arguments: string list t =
        let separator =
            let* _ = char ',' in
            whitespace
        in
        parenthesized
            '('
            ')'
            (fun () ->
                 let* lst =
                     one_or_more_separated
                         (fun first -> [first])
                         (fun lst _ nxt -> nxt :: lst)
                         identifier
                         separator
                     </>
                     return []
                 in
                 return (List.rev lst))


    (* Now the mutual recursive definition of [expression], atomic expression,
     * statement and statement list. *)
    let rec expression (): expression t =
        let plus =
            let* _ = char '+' in
            whitespace
        in
        one_or_more_separated
            Fun.id
            (fun e1 _ e2 -> Plus (e1, e2))
            (atomic ())
            plus


    and atomic (): expression t =
        number
        </>
        (
            let* id = identifier in
            return (Variable id)
        )
        </>
        (
            let* _ = keyword "lambda" in
            let* fargs = formal_arguments in
            let* stmts = statements () in
            return (Lambda (fargs, stmts))
        )
        </>
        parenthesized '(' ')' expression

    and statement (): statement t =
        (
            let* _ = keyword "return" in
            let* e = expression () in
            return (Return e)
        )
        </>
        (
            let* _ = keyword "if" in
            let* bexp = parenthesized '(' ')' expression in
            let* stmts1 = statements () in
            let* _ = keyword "else" in
            let* stmts2 = statements ()
            in
            return (If (bexp, stmts1, stmts2))
        )
        </>
        (
            let* id = identifier in
            let* _  = char '=' in
            let* _  = whitespace in
            let* expr = expression () in
            return (Assign (id, expr))
        )


    and statements (): statement list t =
        parenthesized
            '{'
            '}'
            (fun () ->
                 let* statements =
                     one_or_more_separated
                         (fun stmt -> [stmt])
                         (fun stmts _ nxt -> nxt :: stmts)
                         (statement ())
                         (let* _ = char ';' in whitespace)
                 in
                 return (List.rev statements))


    (* name (arg1, arg2, ... ) { body } *)
    let definition: definition t =
        let* id = identifier in
        let* fargs = formal_arguments in
        let* stmts = statements () in
        return (id, fargs, stmts)
end

Finally an inline test which proves that the parser works as expected.

let%test _ =
    let open Parse in
    let p =
        Parser.run_on_string
            "f1 (a,b,c) { return (lambda (i,j) { a = 10; return i }) }"
            (make () definition)
    in
    Printf.printf "pos %d, %d\n" (Parser.line p) (Parser.column p);
    Parser.has_succeeded p
3 Likes