How does one match strings inside quotes using a lexer in OCamlex?

I am trying to satisfy all of the following cases for my lexer:

# get_all_tokens "\"some string\"";;
- : Common.token list = [STRING "some string"]
# get_all_tokens "\" she said, \\\"hello\\\"\"";;
- : Common.token list = [STRING " she said, \"hello\""]
# get_all_tokens "\" \\100 \\001 \"";;
- : Common.token list = [STRING " d \001 "]
# get_all_tokens
"\"a line \\n starts here; indent \\t starts here next string\" \"starts here\"";;
- : Common.token list =
[STRING "a line \n starts here; indent \t starts here next string";
STRING "starts here"]

I tried many things and this is my current lexer:

{
open Common;;

}

(* You can assign names to commonly-used regular expressions in this part
   of the code, to save the trouble of re-typing them each time they are used *)

let numeric = ['0' - '9']
let lowercase = ['a' - 'z']
let alpha = ['a' - 'z' 'A' - 'Z']
let alphanum = alpha | numeric
let hexadec = ['a' - 'f'] | numeric

let whitespace = [' ' '\t' '\n']

let ddd = ['0' - '2'] ['0' - '5'] ['0' - '5']

rule token = parse
  | [' ' '\t'] { token lexbuf }  (* skip over whitespace *)
  | ['\n']     { token lexbuf }  (* skip over whitespace *)
  | eof        { EOF          }

  | "~"     { NEG }
  | "-"     { MINUS  }
  | "*"     { TIMES  }
  | "/"     { DIV  }
  | "+."    { DPLUS  }
  | "-."    { DMINUS  }
  | "*."    { DTIMES  }
  | "/."    { DDIV  }
  | "^"     { CARAT  }
  | "<"     { LT  }
  | "<="    { LEQ  }
  | ">="    { GEQ  }
  | "="     { EQUALS  }
  | "<>"    { NEQ }
  | "|"     { PIPE  }
  | "->"    { ARROW  }
  | "::"    { DCOLON  }
  | ";"     { SEMI }
  | ";;"    { DSEMI }
  | "@"     { AT  }
  | "[]"    { NIL }
  | "let"   { LET  }
  | "and"   { AND}
  | "end"   { END}
  | "in"    { IN  }
  | "if"    { IF  }
  | "then"  { THEN  }
  | "else"  { ELSE  }
  | "mod"   { MOD  }
  | "try"   { TRY }
  | "with"  { WITH }
  | "not"   { NOT  }
  | "&&"    { LOGICALAND}
  | "||"    { LOGICALOR}
  | "["     { LBRAC  }
  | "()"    { UNIT }
  | "("     { LPAREN  }
  | ")"     { RPAREN  }
  | ","     { COMMA  }
  | "_"     { UNDERSCORE }
  | "true"  { TRUE }
  | "false" { FALSE }
  | "()"    { UNIT }

  | numeric+ as s         { INT (int_of_string s) }
  | ("0x"(hexadec)+) as s { INT (int_of_string s) }
  | ((numeric+)'.'(numeric*)('e'(numeric)+)?) as s       { FLOAT (float_of_string s) }
(* your rules go here *)

  | "+" { PLUS }
  | ">" { GT }
  | "]" { RBRAC }
  | "rec" { REC }
  | "fun" { FUN }
  | "raise" { RAISE }

  | "0b" ['0' - '1']+ as s { INT (int_of_string s)  }

  | lowercase (alphanum | "\'" | "_")* as s { IDENT s }

  | "\"" { get_str "" lexbuf }

(* and get_str accumulator:string = parse *)
and get_str accumulator = parse
  | "\"" { STRING accumulator }
  | '\\' { print_string ("accumulator="^accumulator);print_string ("-get_str\n"); get_stuff_after_backslash accumulator lexbuf }
  | (alphanum|whitespace|",")* as s { print_string ("accumulator="^accumulator);print_string (s^" -get_str\n");get_str (accumulator^s) lexbuf }

and get_stuff_after_backslash accumulator = parse
  | ddd as s {
      let n=int_of_string s in
      let c=char_of_int n in
      let s=String.make 1 c in
      get_str (accumulator^s) lexbuf
    }
  | ['\\' '\'' '\"' '\t' '\n' '\r' '\b' ' '] as c { let s=(String.make 1 c) in print_string ("accumulator="^accumulator);print_string (s^"- get_stuff_after_backslash\n");get_str (s^accumulator) lexbuf }

(*
get_all_tokens "\" \"";;
get_all_tokens "\\";;
get_all_tokens "\"some string\"";;
get_all_tokens "\" she said, \\\"hello\\\"\"";;
get_all_tokens "\" \\100 \\001 \"";;
get_all_tokens "\" \\100 \\001 \"";;
*)

{(* do not modify this function: *)
 let lextest s = token (Lexing.from_string s)

 let get_all_tokens s =
     let b = Lexing.from_string (s^"\n") in
     let rec g () =
     match token b with EOF -> []
     | t -> t :: g () in
     g ()

let try_get_all_tokens s =
    try (Some (get_all_tokens s), true)
    with Failure "unmatched open comment" -> (None, true)
       | Failure "unmatched closed comment" -> (None, false)
 }

which looks very sensible but its not working. Look:

utop # get_all_tokens "\" she said, \\\"hello\\\"\"";;
accumulator= she said,  -get_str
accumulator= she said, -get_str
accumulator= she said, "- get_stuff_after_backslash
accumulator=" she said, hello -get_str
accumulator=" she said, hello-get_str
accumulator=" she said, hello"- get_stuff_after_backslash
- : token list = [STRING "\"\" she said, hello"]

I have no idea why and I feel like giving up :frowning:

What am I missing?


Missing common file:

(* File: common.ml *)

type token =
  | INT of (int)
  | FLOAT of (float)
  | BOOL of (bool)
  | STRING of (string)
  | IDENT of (string)
  | NEG
  | PLUS
  | MINUS
  | TIMES
  | DIV
  | DPLUS
  | DMINUS
  | DTIMES
  | DDIV
  | CARAT
  | LT
  | GT
  | LEQ
  | GEQ
  | EQUALS
  | NEQ
  | PIPE
  | ARROW
  | SEMI
  | DSEMI
  | DCOLON
  | AT
  | NIL
  | LET
  | REC
  | AND
  | END
  | IN
  | IF
  | THEN
  | ELSE
  | FUN
  | MOD
  | RAISE
  | TRY
  | WITH
  | NOT
  | LOGICALAND
  | LOGICALOR
  | LBRAC
  | RBRAC
  | LPAREN
  | RPAREN
  | COMMA
  | UNDERSCORE
  | TRUE
  | FALSE
  | UNIT
  | EOF

module Scanf = struct end

crossposted:

Perhaps you should back up, and, instead of writing your quoted-string rule as a function with an accumulator and such, just write it as a straight-up regular expression ?

1 Like