Change approach to GHC-8 support
On initialisation just reserve the memory block that will be required by the CUDA driver, and release it only once the user calls 'cuInit'.

This still doesn't work with ghci, but it feels like a step in the right direction. (Now 'cuInit' crashes with 'SIGBUS' (macOS) or 'SIGSEGV' (Ubuntu), rather than giving the same "out of memory" error that we got even when 'cuInit' had already been called via the previous LD_PRELOAD/DYLD_INSERT_LIBRARIES method before the RTS initialised.)

towards: #39
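
In outline, the approach is roughly the following sketch ('reserve_range' and 'initialise_cuda' are illustrative names; the full patch below additionally shares the reserved flag between processes via POSIX shared memory and interposes on 'cuInit' with dlsym):

/* Sketch of the reserve-then-release idea: the constructor claims the address
 * range the CUDA driver will later want, but commits no memory to it
 * (PROT_NONE + MAP_NORESERVE), so the GHC RTS places its heap elsewhere; the
 * range is handed back just before the driver initialises. */
#include <cuda.h>
#include <stdio.h>
#include <sys/mman.h>

#define RESERVED_BASE ((void *) 0x200000000)    /* same range as the patch below */
#define RESERVED_SIZE ((size_t) 0x1000000000)

__attribute__((constructor))
static void reserve_range(void)
{
  if ( MAP_FAILED == mmap(RESERVED_BASE, RESERVED_SIZE, PROT_NONE,
                          MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS, -1, 0) )
    perror("reserve_range");
}

CUresult initialise_cuda(unsigned int flags)
{
  /* release the reservation just before the driver tries to map it */
  if ( 0 != munmap(RESERVED_BASE, RESERVED_SIZE) )
    perror("release_range");
  return cuInit(flags);
}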
tmcdonell committed Aug 9, 2016
1 parent bfa5b02 commit 8b2e48c
1 changed file: cbits/init.c (135 additions, 17 deletions)
@@ -1,5 +1,16 @@
#include "cbits/stubs.h"
#define _GNU_SOURCE

#include <cuda.h>

#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <dlfcn.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>


/*
* Make sure that the linker always touches this module so that it notices the
@@ -19,34 +30,141 @@ void enable_constructors() { }
* that address at the time we call 'cuInit', driver initialisation will fail
* with an "out of memory" error.
*
* One workaround is to call 'cuInit' before initialising the RTS. Then the
* RTS's allocation will avoid CUDA's allocation, since the RTS doesn't care
* where in the address space it gets that memory. Embedding the following
* __attribute__((constructor)) function in the library does the trick nicely,
* and the linker will ensure that this gets executed when the shared library is
* loaded (during program startup).
*
* However, this forces us to initialise the CUDA driver even if we would never
* have used it, and precludes setting custom initialisation options (even
* though the option flags to 'cuInit' are currently ignored). Instead, we
* simply reserve the memory region that 'cuInit' requires in the constructor
* function so that the RTS avoids it, and override the 'cuInit' symbol with our
* own version that releases that region before calling the original 'cuInit'.
* The disadvantage of course is that this is a little fragile, since the CUDA
* driver is closed and we don't know exactly which regions to reserve.
*
* See: https://github.com/tmcdonell/cuda/issues/39
*/
static int* __reserved = NULL;
static uintptr_t __reserved_regions[1][2] = {{0x200000000, 0x1000000000}};

/*
* Reserve the CUDA memory range so that the GHC RTS avoids it.
*
* The magic numbers in '__reserved_regions' were found by running under gdb and
* calling 'info proc mappings' before and after the call to 'cuInit'. These may
* not be the only regions that need to be reserved...
*/
void reserve_cuda_memory_region()
{
  msync(__reserved, sizeof(int), MS_SYNC);

  if ( !(*__reserved) ) {
    int i;
    for (i = 0; i < 1; ++i) {
      void *result = mmap( (void*) __reserved_regions[i][0], __reserved_regions[i][1]
                         , PROT_NONE
                         , MAP_PRIVATE | MAP_FIXED | MAP_NORESERVE | MAP_ANONYMOUS
                         , -1
                         , 0
                         );

      if ( MAP_FAILED == result ) {
        perror("Failed to reserve CUDA memory region");
        return;
      }
    }
    *__reserved = 1;
  }
}

/*
* Once the GHC RTS has been initialised, we can release the memory region and
* 'cuInit' should proceed as usual
*/
void release_cuda_memory_region()
{
  msync(__reserved, sizeof(int), MS_SYNC);

  if ( *__reserved ) {
    int i;
    for (i = 0; i < 1; ++i) {
      int result = munmap( (void*) __reserved_regions[i][0], __reserved_regions[i][1] );

      if ( 0 != result ) {
        perror("Failed to release reserved CUDA memory region");
        return;
      }
    }
    *__reserved = 0;
  }
}

/*
* Transparently replace NVIDIA's implementation of 'cuInit' with our own, which
* releases the reserved memory region, if necessary, before calling the real
* 'cuInit'.
*
* Sadly this doesn't appear to override the call to 'cuInit' from deep within
* the (statically linked) runtime API, so users of the runtime API will need to
* explicitly 'initialise' their programs now.
*/
CUresult CUDAAPI cuInit(unsigned int Flags)
{
  release_cuda_memory_region();

  CUresult CUDAAPI (*original_cuInit)(unsigned int) = dlsym(RTLD_NEXT, "cuInit");
  return original_cuInit(Flags);
}


#define SHM_FILE "/hscuda.reserved.smkey"

/*
* Every process linked against a library will get its own copy of the
* global/static variables defined by that library. In this case this is not what
* we want, since we need to signal to _all_ processes whether or not the CUDA
* regions have been reserved, so instead we create a shared variable to
* signal this. Then, only the first loaded instance of the library reserves the
* memory block, and only when [our version of] 'cuInit' is called is that memory
* released for use by the CUDA driver.
*
* We don't expect this to be run concurrently, so a mutex is unnecessary.
*/
__attribute__((constructor)) void __hscuda_setup()
{
  int fd = shm_open(SHM_FILE, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);

  if ( fd < 0 ) {
    perror("Failed to create shared memory object");
    exit(EXIT_FAILURE);
  }

  int err = ftruncate(fd, sizeof(int));
  if ( err < 0 ) {
    perror("Failed to create shared memory object");
    exit(EXIT_FAILURE);
  }

  __reserved = mmap(NULL, sizeof(int), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
  if ( MAP_FAILED == __reserved ) {
    perror("Failed to reserve shared memory");
    exit(EXIT_FAILURE);
  }

  reserve_cuda_memory_region();
}

__attribute__((destructor)) void __hscuda_teardown()
{
  release_cuda_memory_region();

  munmap(__reserved, sizeof(int));
  shm_unlink(SHM_FILE);
}

#undef SHM_FILE
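
The 'info proc mappings' step mentioned in the comment above can also be reproduced without gdb on Linux; a small standalone probe along these lines (assuming a working libcuda, built with e.g. 'cc probe.c -lcuda -o probe') dumps /proc/self/maps before and after 'cuInit' to show which address ranges the driver claims:

/* Diagnostic sketch: print the process memory map before and after driver
 * initialisation to see which regions the CUDA driver reserves. */
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>

static void dump_mappings(const char *label)
{
  char line[512];
  FILE *maps = fopen("/proc/self/maps", "r");
  if ( NULL == maps ) {
    perror("fopen /proc/self/maps");
    return;
  }
  printf("==== %s ====\n", label);
  while ( fgets(line, sizeof(line), maps) )
    fputs(line, stdout);
  fclose(maps);
}

int main(void)
{
  dump_mappings("before cuInit");

  CUresult status = cuInit(0);
  if ( status != CUDA_SUCCESS ) {
    fprintf(stderr, "cuInit failed (%d)\n", status);
    return EXIT_FAILURE;
  }

  dump_mappings("after cuInit");
  return EXIT_SUCCESS;
}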
