[gomp4] remove use of CUDA unified memory in libgomp

Message ID	239c8d27-7b8f-130e-8e06-d2007053164c@codesourcery.com
State	New
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of gcc-patches-return-442032-patch=linaro.org@gcc.gnu.org designates 209.132.180.131 as permitted sender) client-ip=209.132.180.131; DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:from :subject:to:message-id:date:mime-version:content-type; q=dns; s= default; b=nBENjgpL5DbSTrXCcvXyYx2HDREHHrlODiC1nIn2TaWYkc6yfRlD7 +Iv1uKL1cFxjGiNqTUpXwnCJS4yyB6kExHHHCdnDRy09EFl46oarI/7yzJfzlQ9I KGugR5f2ZFKeO4xJtkKWJ9WXZ4vB8ivCO+B3jFFA2ArOtf93g/PDBM= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org From: Cesar Philippidis <cesar@codesourcery.com> Subject: [gomp4] remove use of CUDA unified memory in libgomp To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org> Message-ID: <239c8d27-7b8f-130e-8e06-d2007053164c@codesourcery.com> Date: Fri, 18 Nov 2016 16:56:43 -0800 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.3.0 MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="------------98F0396469C88FC53733FA35"

2016-11-18 Cesar Philippidis <cesar@codesourcery.com> libgomp/ * plugin/plugin-nvptx.c (struct cuda_map): New. (struct ptx_stream): Replace d, h, h_begin, h_end, h_next, h_prev, h_tail with (cuda_map *) map. (cuda_map_create): New function. (cuda_map_destroy): New function. (map_init): Update to use a linked list of cuda_map objects. (map_fini): Likewise. (map_pop): Likewise. (map_push): Likewise. Return CUdeviceptr instead of void. (init_streams_for_device): Remove stale references to ptx_stream members. (select_stream_for_async): Likewise. (nvptx_exec): Update call to map_push. diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index e4fcc0e..c435012 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -95,20 +95,20 @@ cuda_error (CUresult r) static unsigned int instantiated_devices = 0; static pthread_mutex_t ptx_dev_lock = PTHREAD_MUTEX_INITIALIZER; +struct cuda_map +{ + CUdeviceptr d; + size_t size; + bool active; + struct cuda_map *next; +}; + struct ptx_stream { CUstream stream; pthread_t host_thread; bool multithreaded; - - CUdeviceptr d; - void *h; - void *h_begin; - void *h_end; - void *h_next; - void *h_prev; - void *h_tail; - + struct cuda_map *map; struct ptx_stream *next; }; @@ -120,101 +120,114 @@ struct nvptx_thread struct ptx_device *ptx_dev; }; +static struct cuda_map * +cuda_map_create (size_t size) +{ + struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); + + assert (map); + + map->next = NULL; + map->size = size; + map->active = false; + + CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); + assert (map->d); + + return map; +} + +static void +cuda_map_destroy (struct cuda_map *map) +{ + CUDA_CALL_ASSERT (cuMemFree, map->d); + free (map); +} + +/* The following map_* routines manage the CUDA device memory that + contains the data mapping arguments for cuLaunchKernel. 
Each + asynchronous PTX stream may have multiple pending kernel + invocations, which are launched in a FIFO order. As such, the map + routines maintain a queue of cuLaunchKernel arguments. + + Calls to map_push and map_pop must be guarded by ptx_event_lock. + Likewise, calls to map_init and map_fini are guarded by + ptx_dev_lock inside GOMP_OFFLOAD_init_device and + GOMP_OFFLOAD_fini_device, respectively. */ + static bool map_init (struct ptx_stream *s) { int size = getpagesize (); assert (s); - assert (!s->d); - assert (!s->h); - - CUDA_CALL (cuMemAllocHost, &s->h, size); - CUDA_CALL (cuMemHostGetDevicePointer, &s->d, s->h, 0); - assert (s->h); + s->map = cuda_map_create (size); - s->h_begin = s->h; - s->h_end = s->h_begin + size; - s->h_next = s->h_prev = s->h_tail = s->h_begin; - - assert (s->h_next); - assert (s->h_end); return true; } static bool map_fini (struct ptx_stream *s) { - CUDA_CALL (cuMemFreeHost, s->h); + assert (s->map->next == NULL); + assert (!s->map->active); + + cuda_map_destroy (s->map); + return true; } static void map_pop (struct ptx_stream *s) { - assert (s != NULL); - assert (s->h_next); - assert (s->h_prev); - assert (s->h_tail); - - s->h_tail = s->h_next; - - if (s->h_tail >= s->h_end) - s->h_tail = s->h_begin + (int) (s->h_tail - s->h_end); + struct cuda_map *next; - if (s->h_next == s->h_tail) - s->h_prev = s->h_next; + assert (s != NULL); - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); + if (s->map->next == NULL) + { + s->map->active = false; + return; + } - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + next = s->map->next; + cuda_map_destroy (s->map); + s->map = next; } -static void -map_push (struct ptx_stream *s, size_t size, void **h, void **d) +static CUdeviceptr +map_push (struct ptx_stream *s, size_t size) { - int left; - int offset; + struct cuda_map *map = NULL, *t = NULL; - assert (s != NULL); + assert (s); + 
assert (s->map); - left = s->h_end - s->h_next; + /* Each PTX stream requires a separate data region to store the + launch arguments for cuLaunchKernel. Allocate a new + cuda_map and push it to the end of the list. */ + if (s->map->active) + { + map = cuda_map_create (size); - assert (s->h_prev); - assert (s->h_next); + for (t = s->map; t->next != NULL; t = t->next) + ; - if (size >= left) + t->next = map; + } + else if (s->map->size < size) { - assert (s->h_next == s->h_prev); - s->h_next = s->h_prev = s->h_tail = s->h_begin; + cuda_map_destroy (s->map); + map = cuda_map_create (size); } + else + map = s->map; - assert (s->h_next); - - offset = s->h_next - s->h; - - *d = (void *)(s->d + offset); - *h = (void *)(s->h + offset); - - s->h_prev = s->h_next; - s->h_next += size; - - assert (s->h_prev); - assert (s->h_next); - - assert (s->h_next >= s->h_begin); - assert (s->h_tail >= s->h_begin); - assert (s->h_prev >= s->h_begin); - assert (s->h_next <= s->h_end); - assert (s->h_tail <= s->h_end); - assert (s->h_prev <= s->h_end); + s->map = map; + s->map->active = true; - return; + return s->map->d; } /* Target data function launch information. 
*/ @@ -335,8 +348,6 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) null_stream->stream = NULL; null_stream->host_thread = pthread_self (); null_stream->multithreaded = true; - null_stream->d = (CUdeviceptr) NULL; - null_stream->h = NULL; if (!map_init (null_stream)) return false; @@ -470,8 +481,6 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->host_thread = thread; s->multithreaded = false; - s->d = (CUdeviceptr) NULL; - s->h = NULL; if (!map_init (s)) { pthread_mutex_unlock (&ptx_dev->stream_lock); @@ -889,7 +898,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, int i; struct ptx_stream *dev_str; void *kargs[1]; - void *hp, *dp; + void *hp; + CUdeviceptr dp; struct nvptx_thread *nvthd = nvptx_thread (); const char *maybe_abort_msg = "(perhaps abort was called)"; @@ -999,17 +1009,20 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* This reserves a chunk of a pre-allocated page of memory mapped on both the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ - map_push (dev_str, mapnum * sizeof (void *), &hp, &dp); + pthread_mutex_lock (&ptx_event_lock); + dp = map_push (dev_str, mapnum * sizeof (void *)); + pthread_mutex_unlock (&ptx_event_lock); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); /* Copy the array of arguments to the mapped page. */ + hp = alloca(sizeof(void *) * mapnum); for (i = 0; i < mapnum; i++) ((void **) hp)[i] = devaddrs[i]; /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). */ - CUDA_CALL_ASSERT (cuMemcpy, (CUdeviceptr) dp, (CUdeviceptr) hp, + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, mapnum * sizeof (void *)); GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n",

[gomp4] remove use of CUDA unified memory in libgomp

Commit Message

Patch