[gomp4] remove use of CUDA unified memory in libgomp

Message ID 239c8d27-7b8f-130e-8e06-d2007053164c@codesourcery.com

Commit Message

Cesar Philippidis Nov. 19, 2016, 12:56 a.m. UTC
This patch eliminates the use of CUDA unified shared memory via cuMemcpy
inside nvptx_exec. The major problem with unified memory is that the
CUDA driver needs to copy all of the host addresses to a special data
region prior to transferring the data to and from the accelerator. I'm
not sure why the use of CUDA unified shared memory is necessary here,
because libgomp already has the machinery to manage the data mappings
itself directly.
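
For contrast, here is a minimal sketch of the two driver-API styles
involved, assuming an already-initialized CUDA context; the helper
names are mine, not the plugin's, and error propagation is reduced to
returning the CUresult.

  #include <stddef.h>
  #include <cuda.h>

  /* Unified-memory style used before this patch: one pinned host
     buffer aliased into the device address space.  */
  static CUresult
  stage_unified (void **h, CUdeviceptr *d, size_t size)
  {
    CUresult r = cuMemAllocHost (h, size);
    if (r != CUDA_SUCCESS)
      return r;
    /* D becomes a device-side alias of *H.  */
    return cuMemHostGetDevicePointer (d, *h, 0);
  }

  /* Explicit style used after this patch: ordinary device memory
     plus an explicit host-to-device copy.  */
  static CUresult
  stage_device (const void *h, CUdeviceptr *d, size_t size)
  {
    CUresult r = cuMemAlloc (d, size);
    if (r != CUDA_SUCCESS)
      return r;
    return cuMemcpyHtoD (*d, h, size);
  }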

The only usage of CUDA unified memory occurs inside nvptx_exec.
Specifically, that function uses 'dp', which is managed by the map_*
functions, as a staging area to upload the omp struct arguments to the
PTX kernel. This use of unified memory is particularly unfortunate
because it imposes a synchronization barrier each time a PTX kernel is
launched.
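
Concretely, each launch stages an array of device pointers, one per
data mapping, into 'dp', and 'dp' then becomes the kernel's single
argument.  Roughly, the launch path looks as follows with this patch
applied (simplified from nvptx_exec: error handling and locking are
omitted, and the launch-dimension variables are placeholders):

  /* Gather the device addresses of the data mappings on the host...  */
  void **hp = alloca (mapnum * sizeof (void *));
  for (size_t i = 0; i < mapnum; i++)
    hp[i] = devaddrs[i];

  /* ...copy them into device memory reserved by map_push...  */
  CUdeviceptr dp = map_push (dev_str, mapnum * sizeof (void *));
  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, mapnum * sizeof (void *));

  /* ...and pass DP as the kernel's only argument.  */
  void *kargs[1] = { &dp };
  CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    num_gangs, 1, 1, num_workers, vector_length, 1,
                    0, dev_str->stream, kargs, NULL);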

At first I tried to eliminate those map_* functions altogether, but that
caused failures in asyncwait-1.c. This happens because the device memory
used by async PTX kernel X may be overwritten by async PTX kernel Y.
Apparently, the map functions try to rectify this situation by
implementing a pseudo circular queue in a page-sized block of memory.
However, that queue implementation is fragile because map_pop 'frees'
all of the allocations in the queue, not just the memory belonging to
the kernel that just completed.

This patch does two things. First, it replaces the call to cuMemcpy
inside nvptx_exec with cuMemcpyHtoD. Second, it teaches the map routines
to use a linked-list data structure to manage asynchronous PTX stream
arguments. As an optimization, map_init reserves a page-sized block of
device memory for the first cuda_map so that non-async kernel launches
do not have to waste time allocating device memory. Additional cuda_maps
of the requested SIZE are created as necessary for async PTX streams
with pending launches.
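
The queue behavior is easiest to see in isolation, so here is a
stand-alone, host-only model of it: plain malloc/free stand in for
cuMemAlloc/cuMemFree, and the stream is reduced to a bare list head.
The function names mirror the patch, but this is an illustrative
sketch, not the plugin source.

  #include <stdbool.h>
  #include <stdio.h>
  #include <stdlib.h>

  struct cuda_map
  {
    void *d;                    /* stands in for the CUdeviceptr */
    size_t size;
    bool active;                /* still owned by a pending launch */
    struct cuda_map *next;
  };

  static struct cuda_map *
  cuda_map_create (size_t size)
  {
    struct cuda_map *map = malloc (sizeof *map);
    map->d = malloc (size);
    map->size = size;
    map->active = false;
    map->next = NULL;
    return map;
  }

  static void
  cuda_map_destroy (struct cuda_map *map)
  {
    free (map->d);
    free (map);
  }

  /* Reserve space for one kernel launch.  An idle head map is reused
     (and regrown if too small); otherwise a fresh map is appended to
     the tail of the queue.  */
  static void *
  map_push (struct cuda_map **head, size_t size)
  {
    struct cuda_map *map, *t;

    if ((*head)->active)
      {
        map = cuda_map_create (size);
        for (t = *head; t->next != NULL; t = t->next)
          ;
        t->next = map;
      }
    else if ((*head)->size < size)
      {
        cuda_map_destroy (*head);
        *head = map = cuda_map_create (size);
      }
    else
      map = *head;

    map->active = true;
    return map->d;
  }

  /* Called when the oldest pending kernel completes: release its map,
     but keep the final map cached for the next launch.  */
  static void
  map_pop (struct cuda_map **head)
  {
    if ((*head)->next == NULL)
      (*head)->active = false;
    else
      {
        struct cuda_map *next = (*head)->next;
        cuda_map_destroy (*head);
        *head = next;
      }
  }

  int
  main (void)
  {
    struct cuda_map *q = cuda_map_create (4096); /* as in map_init */
    void *a = map_push (&q, 128);  /* head is idle: reused */
    void *b = map_push (&q, 256);  /* head is busy: new map appended */
    printf ("distinct buffers: %s\n", a != b ? "yes" : "no");
    map_pop (&q);                  /* kernel A done: its map is freed */
    map_pop (&q);                  /* kernel B done: last map cached */
    cuda_map_destroy (q);
    return 0;
  }

Since map_pop is driven by the event machinery as each kernel
completes, device buffers are released in launch order, and the final
map stays cached for the next launch.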

Because of this asynchrony, the map routines need to be guarded by a
pthread mutex. map_init and map_fini are already guarded by
ptx_dev_lock, but map_push and map_pop need to be guarded by
ptx_event_lock.

I don't really like the use of malloc for the cuda_maps, but that can be
optimized with a different data structure later.

I've applied this patch to gomp-4_0-branch.

Cesar

Patch

2016-11-18  Cesar Philippidis  <cesar@codesourcery.com>

	libgomp/
	* plugin/plugin-nvptx.c (struct cuda_map): New.
	(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
	h_tail with (cuda_map *) map.
	(cuda_map_create): New function.
	(cuda_map_destroy): New function.
	(map_init): Update to use a linked list of cuda_map objects.
	(map_fini): Likewise.
	(map_pop): Likewise.
	(map_push): Likewise.  Return CUdeviceptr instead of void.
	(init_streams_for_device): Remove stale references to ptx_stream
	members.
	(select_stream_for_async): Likewise.
	(nvptx_exec): Update call to map_push.  Use cuMemcpyHtoD
	instead of cuMemcpy.


diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index e4fcc0e..c435012 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -95,20 +95,20 @@  cuda_error (CUresult r)
 static unsigned int instantiated_devices = 0;
 static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
 
+struct cuda_map
+{
+  CUdeviceptr d;
+  size_t size;
+  bool active;
+  struct cuda_map *next;
+};
+
 struct ptx_stream
 {
   CUstream stream;
   pthread_t host_thread;
   bool multithreaded;
-
-  CUdeviceptr d;
-  void *h;
-  void *h_begin;
-  void *h_end;
-  void *h_next;
-  void *h_prev;
-  void *h_tail;
-
+  struct cuda_map *map;
   struct ptx_stream *next;
 };
 
@@ -120,101 +120,114 @@  struct nvptx_thread
   struct ptx_device *ptx_dev;
 };
 
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+  struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+  assert (map);
+
+  map->next = NULL;
+  map->size = size;
+  map->active = false;
+
+  CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+  assert (map->d);
+
+  return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+  CUDA_CALL_ASSERT (cuMemFree, map->d);
+  free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+   contains the data mapping arguments for cuLaunchKernel.  Each
+   asynchronous PTX stream may have multiple pending kernel
+   invocations, which are launched in a FIFO order.  As such, the map
+   routines maintain a queue of cuLaunchKernel arguments.
+
+   Calls to map_push and map_pop must be guarded by ptx_event_lock.
+   Likewise, calls to map_init and map_fini are guarded by
+   ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+   GOMP_OFFLOAD_fini_device, respectively.  */
+
 static bool
 map_init (struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
-  assert (!s->d);
-  assert (!s->h);
-
-  CUDA_CALL (cuMemAllocHost, &s->h, size);
-  CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
 
-  assert (s->h);
+  s->map = cuda_map_create (size);
 
-  s->h_begin = s->h;
-  s->h_end = s->h_begin + size;
-  s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
-  assert (s->h_next);
-  assert (s->h_end);
-  return true;
+  return s->map != NULL;
 }
 
 static bool
 map_fini (struct ptx_stream *s)
 {
-  CUDA_CALL (cuMemFreeHost, s->h);
+  assert (s->map->next == NULL);
+  assert (!s->map->active);
+
+  cuda_map_destroy (s->map);
+
   return true;
 }
 
 static void
 map_pop (struct ptx_stream *s)
 {
-  assert (s != NULL);
-  assert (s->h_next);
-  assert (s->h_prev);
-  assert (s->h_tail);
-
-  s->h_tail = s->h_next;
-
-  if (s->h_tail >= s->h_end)
-    s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+  struct cuda_map *next;
 
-  if (s->h_next == s->h_tail)
-    s->h_prev = s->h_next;
+  assert (s != NULL);
 
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
+  if (s->map->next == NULL)
+    {
+      s->map->active = false;
+      return;
+    }
 
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  next = s->map->next;
+  cuda_map_destroy (s->map);
+  s->map = next;
 }
 
-static void
-map_push (struct ptx_stream *s, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
 {
-  int left;
-  int offset;
+  struct cuda_map *map = NULL, *t = NULL;
 
-  assert (s != NULL);
+  assert (s);
+  assert (s->map);
 
-  left = s->h_end - s->h_next;
+  /* The current map is still owned by a pending kernel launch, so it
+     cannot be reused.  Allocate a new cuda_map and append it to the
+     end of the queue.  */
+  if (s->map->active)
+    {
+      map = cuda_map_create (size);
 
-  assert (s->h_prev);
-  assert (s->h_next);
+      for (t = s->map; t->next != NULL; t = t->next)
+	;
 
-  if (size >= left)
+      t->next = map;
+    }
+  else if (s->map->size < size)
     {
-      assert (s->h_next == s->h_prev);
-      s->h_next = s->h_prev = s->h_tail = s->h_begin;
+      cuda_map_destroy (s->map);
+      s->map = map = cuda_map_create (size);
     }
+  else
+    map = s->map;
 
-  assert (s->h_next);
-
-  offset = s->h_next - s->h;
-
-  *d = (void *)(s->d + offset);
-  *h = (void *)(s->h + offset);
-
-  s->h_prev = s->h_next;
-  s->h_next += size;
-
-  assert (s->h_prev);
-  assert (s->h_next);
-
-  assert (s->h_next >= s->h_begin);
-  assert (s->h_tail >= s->h_begin);
-  assert (s->h_prev >= s->h_begin);
-  assert (s->h_next <= s->h_end);
-  assert (s->h_tail <= s->h_end);
-  assert (s->h_prev <= s->h_end);
+  /* Mark the reserved map busy until its kernel completes.  */
+  map->active = true;
 
-  return;
+  return map->d;
 }
 
 /* Target data function launch information.  */
@@ -335,8 +348,6 @@  init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  null_stream->d = (CUdeviceptr) NULL;
-  null_stream->h = NULL;
   if (!map_init (null_stream))
     return false;
 
@@ -470,8 +481,6 @@  select_stream_for_async (int async, pthread_t thread, bool create,
 	  s->host_thread = thread;
 	  s->multithreaded = false;
 
-	  s->d = (CUdeviceptr) NULL;
-	  s->h = NULL;
 	  if (!map_init (s))
 	    {
 	      pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -889,7 +898,8 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
   int i;
   struct ptx_stream *dev_str;
   void *kargs[1];
-  void *hp, *dp;
+  void *hp;
+  CUdeviceptr dp;
   struct nvptx_thread *nvthd = nvptx_thread ();
   const char *maybe_abort_msg = "(perhaps abort was called)";
 
@@ -999,17 +1009,18 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
-  /* This reserves a chunk of a pre-allocated page of memory mapped on both
-     the host and the device. HP is a host pointer to the new chunk, and DP is
-     the corresponding device pointer.  */
-  map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
+  /* Reserve a chunk of device memory to hold the kernel launch
+     arguments; DP is the device pointer to the new chunk.  */
+  pthread_mutex_lock (&ptx_event_lock);
+  dp = map_push (dev_str, mapnum * sizeof (void *));
+  pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
 
-  /* Copy the array of arguments to the mapped page.  */
+  /* Copy the array of arguments into a temporary host buffer.  */
+  hp = alloca (mapnum * sizeof (void *));
   for (i = 0; i < mapnum; i++)
     ((void **) hp)[i] = devaddrs[i];
 
-  /* Copy the (device) pointers to arguments to the device (dp and hp might in
-     fact have the same value on a unified-memory system).  */
+  /* Copy the (device) pointers to arguments to the device.  */
-  CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+  CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
 		    mapnum * sizeof (void *));
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",