2016-11-18 Cesar Philippidis <cesar@codesourcery.com>
libgomp/
* plugin/plugin-nvptx.c (struct cuda_map): New.
(struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev,
h_tail with (cuda_map *) map.
(cuda_map_create): New function.
(cuda_map_destroy): New function.
(map_init): Update to use a linked list of cuda_map objects.
(map_fini): Likewise.
(map_pop): Likewise.
(map_push): Likewise. Return CUdeviceptr instead of void.
(init_streams_for_device): Remove stales references to ptx_stream
members.
(select_stream_for_async): Likewise.
(nvptx_exec): Update call to map_init.
@@ -95,20 +95,20 @@ cuda_error (CUresult r)
static unsigned int instantiated_devices = 0;
static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER;
+struct cuda_map
+{
+ CUdeviceptr d;
+ size_t size;
+ bool active;
+ struct cuda_map *next;
+};
+
struct ptx_stream
{
CUstream stream;
pthread_t host_thread;
bool multithreaded;
-
- CUdeviceptr d;
- void *h;
- void *h_begin;
- void *h_end;
- void *h_next;
- void *h_prev;
- void *h_tail;
-
+ struct cuda_map *map;
struct ptx_stream *next;
};
@@ -120,101 +120,114 @@ struct nvptx_thread
struct ptx_device *ptx_dev;
};
+static struct cuda_map *
+cuda_map_create (size_t size)
+{
+ struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
+
+ assert (map);
+
+ map->next = NULL;
+ map->size = size;
+ map->active = false;
+
+ CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
+ assert (map->d);
+
+ return map;
+}
+
+static void
+cuda_map_destroy (struct cuda_map *map)
+{
+ CUDA_CALL_ASSERT (cuMemFree, map->d);
+ free (map);
+}
+
+/* The following map_* routines manage the CUDA device memory that
+ contains the data mapping arguments for cuLaunchKernel. Each
+ asynchronous PTX stream may have multiple pending kernel
+ invocations, which are launched in a FIFO order. As such, the map
+ routines maintains a queue of cuLaunchKernel arguments.
+
+ Calls to map_push and map_pop must be guarded by ptx_event_lock.
+ Likewise, calls to map_init and map_fini are guarded by
+ ptx_dev_lock inside GOMP_OFFLOAD_init_device and
+ GOMP_OFFLOAD_fini_device, respectively. */
+
static bool
map_init (struct ptx_stream *s)
{
int size = getpagesize ();
assert (s);
- assert (!s->d);
- assert (!s->h);
-
- CUDA_CALL (cuMemAllocHost, &s->h, size);
- CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0);
- assert (s->h);
+ s->map = cuda_map_create (size);
- s->h_begin = s->h;
- s->h_end = s->h_begin + size;
- s->h_next = s->h_prev = s->h_tail = s->h_begin;
-
- assert (s->h_next);
- assert (s->h_end);
return true;
}
static bool
map_fini (struct ptx_stream *s)
{
- CUDA_CALL (cuMemFreeHost, s->h);
+ assert (s->map->next == NULL);
+ assert (!s->map->active);
+
+ cuda_map_destroy (s->map);
+
return true;
}
static void
map_pop (struct ptx_stream *s)
{
- assert (s != NULL);
- assert (s->h_next);
- assert (s->h_prev);
- assert (s->h_tail);
-
- s->h_tail = s->h_next;
-
- if (s->h_tail >= s->h_end)
- s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end);
+ struct cuda_map *next;
- if (s->h_next == s->h_tail)
- s->h_prev = s->h_next;
+ assert (s != NULL);
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
+ if (s->map->next == NULL)
+ {
+ s->map->active = false;
+ return;
+ }
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
+ next = s->map->next;
+ cuda_map_destroy (s->map);
+ s->map = next;
}
-static void
-map_push (struct ptx_stream *s, size_t size, void **h, void **d)
+static CUdeviceptr
+map_push (struct ptx_stream *s, size_t size)
{
- int left;
- int offset;
+ struct cuda_map *map = NULL, *t = NULL;
- assert (s != NULL);
+ assert (s);
+ assert (s->map);
- left = s->h_end - s->h_next;
+ /* Each PTX stream requires a separate data region to store the
+ launch arguments for cuLaunchKernel. Allocate a new
+ cuda_map and push it to the end of the list. */
+ if (s->map->active)
+ {
+ map = cuda_map_create (size);
- assert (s->h_prev);
- assert (s->h_next);
+ for (t = s->map; t->next != NULL; t = t->next)
+ ;
- if (size >= left)
+ t->next = map;
+ }
+ else if (s->map->size < size)
{
- assert (s->h_next == s->h_prev);
- s->h_next = s->h_prev = s->h_tail = s->h_begin;
+ cuda_map_destroy (s->map);
+ map = cuda_map_create (size);
}
+ else
+ map = s->map;
- assert (s->h_next);
-
- offset = s->h_next - s->h;
-
- *d = (void *)(s->d + offset);
- *h = (void *)(s->h + offset);
-
- s->h_prev = s->h_next;
- s->h_next += size;
-
- assert (s->h_prev);
- assert (s->h_next);
-
- assert (s->h_next >= s->h_begin);
- assert (s->h_tail >= s->h_begin);
- assert (s->h_prev >= s->h_begin);
- assert (s->h_next <= s->h_end);
- assert (s->h_tail <= s->h_end);
- assert (s->h_prev <= s->h_end);
+ s->map = map;
+ s->map->active = true;
- return;
+ return s->map->d;
}
/* Target data function launch information. */
@@ -335,8 +348,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency)
null_stream->stream = NULL;
null_stream->host_thread = pthread_self ();
null_stream->multithreaded = true;
- null_stream->d = (CUdeviceptr) NULL;
- null_stream->h = NULL;
if (!map_init (null_stream))
return false;
@@ -470,8 +481,6 @@ select_stream_for_async (int async, pthread_t thread, bool create,
s->host_thread = thread;
s->multithreaded = false;
- s->d = (CUdeviceptr) NULL;
- s->h = NULL;
if (!map_init (s))
{
pthread_mutex_unlock (&ptx_dev->stream_lock);
@@ -889,7 +898,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
int i;
struct ptx_stream *dev_str;
void *kargs[1];
- void *hp, *dp;
+ void *hp;
+ CUdeviceptr dp;
struct nvptx_thread *nvthd = nvptx_thread ();
const char *maybe_abort_msg = "(perhaps abort was called)";
@@ -999,17 +1009,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
/* This reserves a chunk of a pre-allocated page of memory mapped on both
the host and the device. HP is a host pointer to the new chunk, and DP is
the corresponding device pointer. */
- map_push (dev_str, mapnum * sizeof (void *), &hp, &dp);
+ pthread_mutex_lock (&ptx_event_lock);
+ dp = map_push (dev_str, mapnum * sizeof (void *));
+ pthread_mutex_unlock (&ptx_event_lock);
GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__);
/* Copy the array of arguments to the mapped page. */
+ hp = alloca(sizeof(void *) * mapnum);
for (i = 0; i < mapnum; i++)
((void **) hp)[i] = devaddrs[i];
/* Copy the (device) pointers to arguments to the device (dp and hp might in
fact have the same value on a unified-memory system). */
- CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp,
+ CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
mapnum * sizeof (void *));
GOMP_PLUGIN_debug (0, " %s: kernel %s: launch"
" gangs=%u, workers=%u, vectors=%u\n",