Infinite loop when I try to save and re-throw an exception using caml_raise


#1

I’m working with bindings for a C library (FreeTDS) that does error handling in callbacks, and I want to handle errors without blowing up FreeTDS’s current stack whenever on occurs, but for some reason when I call caml_raise, my code blocks forever in what seems to be an infinite loop. I’m guessing I’m doing something wrong with CAMLlocal1 or how I’m calling caml_raise, or maybe the way I’m preventing the exception from being GC’d.

The code I have now looks like this:

typedef struct User_data {
  value latest_exception;
} User_data;

static void userdata_free(DBPROCESS* proc)
{
  User_data* data = (User_data*)dbgetuserdata(proc);
  if (data == NULL)
    return;

  caml_remove_global_root(&(data->latest_exception));

  dbsetuserdata(proc, NULL);
  caml_stat_free(data);
}

static void userdata_setup(DBPROCESS* proc)
{
  userdata_free(proc);

  User_data* data = caml_stat_alloc(sizeof(User_data));
  data->latest_exception = Val_unit;
  caml_register_global_root(&(data->latest_exception));
  dbsetuserdata(proc, (BYTE*)data);
}

static void userdata_set_latest_exception(DBPROCESS *proc, value exn)
{
  CAMLparam1(exn);
  User_data* data = (User_data*)dbgetuserdata(proc);
  if(data == NULL)
    caml_raise(exn);

  data->latest_exception = exn;
  CAMLreturn0;
}

static void maybe_raise_userdata_exn(DBPROCESS* proc)
{
  CAMLparam0();
  CAMLlocal1(vexn);

  User_data* data = (User_data*)dbgetuserdata(proc);
  if(data == NULL || data->latest_exception == Val_unit)
    CAMLreturn0;

  vexn = data->latest_exception;
  data->latest_exception = Val_unit;
  caml_raise(vexn);
}

But if I actually raise an exception, I get an infinite loop in caml_raise:

(gdb) bt
#0  0x0000000000b835b3 in caml_raise (v=140737336655744) at fail.c:72
#1  0x0000000000b6f5ea in maybe_raise_userdata_exn (proc=proc@entry=0x7fffdc0090c0) at dblib_stubs.c:100
#2  0x0000000000b70113 in ocaml_freetds_dbresults (vdbproc=<optimized out>) at dblib_stubs.c:487
#3  0x0000000000682774 in camlMssql__Client__result_set_loop_9138 () at mssql/src/client.ml:58
#4  0x0000000000682668 in camlMssql__Client__run_query_8733 () at mssql/src/client.ml:85
#5  0x0000000000681aad in camlMssql__Mssql_error__with_wrap_inner_9163 () at mssql/src/mssql_error.ml:43
#6  0x0000000000a8f175 in camlBase__Result__try_with_2876 () at src/result.ml:161
#7  0x00000000006f3f22 in camlAsync_unix__In_thread__doit_2854 () at src/in_thread.ml:12
#8  0x00000000006deca2 in camlAsync_unix__Thread_pool__loop_49809 () at src/thread_pool.ml:397
#9  0x000000000084d536 in camlCore__Core_thread__f_3838 () at src/core_thread.ml:12
#10 0x00000000008a4999 in camlThread__fun_1571 () at thread.ml:39
#11 0x0000000000b9cfa2 in caml_start_program ()
#12 0x0000000000b79620 in caml_thread_start ()
#13 0x00007ffff7e3558e in start_thread () from /lib64/libpthread.so.0
#14 0x00007ffff7bce6a3 in clone () from /lib64/libc.so.6

Relevant code is here in the version of OCaml I’m using:

void caml_raise(value v)
{
  Unlock_exn();
  if (caml_exception_pointer == NULL) caml_fatal_uncaught_exception(v);

#ifndef Stack_grows_upwards
#define PUSHED_AFTER <
#else
#define PUSHED_AFTER >
#endif
  while (caml_local_roots != NULL &&
         (char *) caml_local_roots PUSHED_AFTER caml_exception_pointer) {
    caml_local_roots = caml_local_roots->next; // line 72
  }
#undef PUSHED_AFTER

  caml_raise_exception(v);
}

If I change the code to call caml_raise_exception instead of caml_raise, this doesn’t block, but I do get random segfaults later on caml_do_local_roots (which isn’t too surprising since caml_raise_exception is a private function and presumably this code that’s blocking is doing something important).

Does anyone know what I should be doing here?


#2

If I print caml_local_roots in gdb, it looks like there’s a very small cycle, where caml_local_roots == caml_local_roots->next. Is this something my code is somehow messing up? I don’t understand why anything I did would have caused that.

(gdb) print caml_local_roots
$2 = (struct caml__roots_block *) 0x7fffe56ea5d0
(gdb) print *caml_local_roots
$3 = {next = 0x7fffe56ea5d0, ntables = 4, nitems = 1, tables = {0x7fffe56ea5b0, 0x7fffe56ea5b8, 0x7fffe56ea5c0, 0x7fffe56ea5c8, 0x7fffc4000020}}

Also I can confirm that this code works if I compile it in byte mode, just not in native mode.


#3

That’s… suspicious.


#4

It might help to have a driver that shows the problem. I put together a driver and I don’t see the error you describe.

I added these lines at the beginning of your C code:

#include <caml/mlvalues.h>
#include <caml/memory.h>
#include <caml/fail.h>

typedef unsigned char BYTE;

typedef struct dbprocess {
    void *user_data;
} DBPROCESS;

void *dbgetuserdata(DBPROCESS *proc)
{
    return proc->user_data;
}

void dbsetuserdata(DBPROCESS *proc, void *data)
{
    proc->user_data = data;
}

Then I added these lines after your C code:

static DBPROCESS g_proc = { NULL };

value setup(value unitval)
{
    CAMLparam1(unitval);
    userdata_setup(&g_proc);
    CAMLreturn(Val_unit);
}

value set_latest_exception(value exn)
{
    CAMLparam1(exn);
    userdata_set_latest_exception(&g_proc, exn);
    CAMLreturn(Val_unit);
}

value raise_exn(value unitval)
{
    CAMLparam1(unitval);
    maybe_raise_userdata_exn(&g_proc);
    CAMLreturn(Val_unit);
}

My ocaml driver looks like this:

$ cat try.ml
external setup : unit -> unit = "setup"
external set_latest_exception : exn -> unit = "set_latest_exception"
external raise_exn : unit -> unit = "raise_exn"

exception Local_exn of int * string

let main () =
    setup ();
    set_latest_exception (Local_exn (99, "failure message"));
    raise_exn ()

let () = main ()

When I run it I see this:

$ ./try
Fatal error: exception Try.Local_exn(99, "failure message")
$

No doubt this is because your code doesn’t look like my simple driver.

It might also be interesting to know if the behavior seems completely deterministic or if it varies.


#5

After thinking about it more this weekend, I think this code is probably doing something wrong in the multithreading sections (not shown). I bet we’re messing with the ocaml variable list while the runtime lock is released. I’m going to try to fix that and I’ll check back here with a bigger example if that doesn’t work.


#6

Ok, sorry to waste your time. This was definitely caused by calling CAMLparam, CAMLlocal, or CAMLreturn when the runtime lock was released.

A side question though: Is there a way to do the equivalent of CAMLreturn without immediately returning? I have an error callback that needs to handle OCaml locals but has to release the runtime lock before returning. I solved it by turning it into two functions (one calls the other function and then releases the runtime lock) but I’m curious if there’s a lower-level function to “do what CAMLreturn does but don’t return”.


#7

In memory.h there is a definition CAMLdrop that does this.


#8

Thanks, that made changing my code to do this correctly a lot simpler!