[openacc] adjust default num_gangs

Message ID	1811a6f1-7d68-0dd8-becb-1d0df3a5894b@codesourcery.com
State	Superseded
Headers	show Delivered-To: patch@linaro.org Received-SPF: pass (google.com: domain of gcc-patches-return-440262-patch=linaro.org@gcc.gnu.org designates 209.132.180.131 as permitted sender) client-ip=209.132.180.131; DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id :list-unsubscribe:list-archive:list-post:list-help:sender:from :subject:to:message-id:date:mime-version:content-type; q=dns; s= default; b=oKLUBUAI9jNahEZQjBziiupnkqt9YA5hlqllQ3tQkS+G42zqpNv+3 gIQT2Sm0IPKiCE7kVzFLKnnL7ZiHM9u/RWtXmYxUEUB4Iw9At/RSZtY9wDlGeSYe 4YITza9bFSDA6fWr3TYFgoJ1xzWevK3q9LrvTfyYShmmPMDo7fz2u8= Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm Precedence: bulk Sender: gcc-patches-owner@gcc.gnu.org From: Cesar Philippidis <cesar@codesourcery.com> Subject: [openacc] adjust default num_gangs To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>, Jakub Jelinek <jakub@redhat.com> Message-ID: <1811a6f1-7d68-0dd8-becb-1d0df3a5894b@codesourcery.com> Date: Wed, 2 Nov 2016 12:34:47 -0700 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Thunderbird/45.3.0 MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="------------101791D3FA0CD19F85AA7BB8"

Message ID

1811a6f1-7d68-0dd8-becb-1d0df3a5894b@codesourcery.com

State

Superseded

Headers

Received-SPF: pass (google.com: domain of
	gcc-patches-return-440262-patch=linaro.org@gcc.gnu.org
	designates 209.132.180.131 as permitted sender)
	client-ip=209.132.180.131; 
DomainKey-Signature: a=rsa-sha1; c=nofws; d=gcc.gnu.org; h=list-id
	:list-unsubscribe:list-archive:list-post:list-help:sender:from
	:subject:to:message-id:date:mime-version:content-type; q=dns; s=
	default; b=oKLUBUAI9jNahEZQjBziiupnkqt9YA5hlqllQ3tQkS+G42zqpNv+3
	gIQT2Sm0IPKiCE7kVzFLKnnL7ZiHM9u/RWtXmYxUEUB4Iw9At/RSZtY9wDlGeSYe
	4YITza9bFSDA6fWr3TYFgoJ1xzWevK3q9LrvTfyYShmmPMDo7fz2u8=
Mailing-List: contact gcc-patches-help@gcc.gnu.org; run by ezmlm
Precedence: bulk
Sender: gcc-patches-owner@gcc.gnu.org
From: Cesar Philippidis <cesar@codesourcery.com>
Subject: [openacc] adjust default num_gangs
To: "gcc-patches@gcc.gnu.org" <gcc-patches@gcc.gnu.org>,
	Jakub Jelinek	<jakub@redhat.com>
Message-ID: <1811a6f1-7d68-0dd8-becb-1d0df3a5894b@codesourcery.com>
Date: Wed, 2 Nov 2016 12:34:47 -0700
User-Agent: Mozilla/5.0 (X11; Linux x86_64;
	rv:45.0) Gecko/20100101 Thunderbird/45.3.0
MIME-Version: 1.0
Content-Type: multipart/mixed;
	boundary="------------101791D3FA0CD19F85AA7BB8"

Commit Message

Cesar Philippidis Nov. 2, 2016, 7:34 p.m. UTC

This patch teaches the libgomp runtime how to probe the CUDA driver to
extract the number of Streaming Multiprocessors that are available on the
graphics hardware and use that as the default value for num_gangs.
Without this patch, libgomp defaults num_gangs to 32, a value which
was chosen arbitrarily. The new default at least maps onto a hardware value.

More details regarding this patch can be found here:

  https://gcc.gnu.org/ml/gcc-patches/2016-08/msg02064.html
  https://gcc.gnu.org/ml/gcc-patches/2016-08/msg02084.html

Is this patch OK for trunk?

Cesar

Comments

Jakub Jelinek Nov. 2, 2016, 7:50 p.m. UTC | #1

On Wed, Nov 02, 2016 at 12:34:47PM -0700, Cesar Philippidis wrote:
> @@ -932,9 +933,84 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,

>  

>    if (seen_zero)

>      {

> +      /* See if the user provided GOMP_OPENACC_DIM environment

> +	 variable to specify runtime defaults. */

> +      static int default_dims[GOMP_DIM_MAX];

> +

> +      if (!default_dims[0])

> +	{


Is this guarded by some lock, or is it just racy if multiple
nvptx_execs are done at the same time?

> +	  /* We only read the environment variable once.  You can't

> +	     change it in the middle of execution.  The sytntax  is


syntax

> +	     the same as for the -fopenacc-dim compilation option.  */

> +	  const char *env_var = getenv ("GOMP_OPENACC_DIM");


> +

> +	  if (CUDA_SUCCESS == cuDeviceGetAttribute

> +	      (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)

> +	      && CUDA_SUCCESS == cuDeviceGetAttribute

> +	      (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)

> +	      && CUDA_SUCCESS == cuDeviceGetAttribute

> +	      (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)

> +	      && CUDA_SUCCESS == cuDeviceGetAttribute

> +	      (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))


The formatting is wrong.  1) the call should be on the lhs of ==,
not the rhs 2) ( should be after cuDeviceGetAttribute, not on the next line
3) the lines are still too long.

	  if (cuDeviceGetAttribute (&block_size,
				    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
				    dev) == CUDA_SUCCESS
	      && cuDeviceGetAttribute (...

CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR
is still way too long, perhaps initialize a temporary const var
to that, or use some macro like
DEV_ATTR (MAX_THREADS_PER_MULTIPROCESSOR)
where
#define DEV_ATTR(x) CU_DEVICE_ATTRIBUTE_##x

Otherwise LGTM.

	Jakub

2016-11-02  Cesar Philippidis  <cesar@codesourcery.com>
	    Nathan Sidwell  <nathan@acm.org>

	gcc/
	* config/nvptx/nvptx.c (PTX_GANG_DEFAULT): Set to zero.

	libgomp/
	* plugin/plugin-nvptx.c (nvptx_exec): Interrogate board attributes
	to determine default geometry.
	* testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c: Set gang
	dimension.

diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index 80fa9ae..782bbde 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -4174,7 +4174,7 @@  nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
 /* Define dimension sizes for known hardware.  */
 #define PTX_VECTOR_LENGTH 32
 #define PTX_WORKER_LENGTH 32
-#define PTX_GANG_DEFAULT  32
+#define PTX_GANG_DEFAULT  0 /* Defer to runtime.  */
 
 /* Validate compute dimensions of an OpenACC offload or routine, fill
    in non-unity defaults.  FN_LEVEL indicates the level at which a
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 327500c..91c1386 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -45,6 +45,7 @@ 
 #include <stdio.h>
 #include <unistd.h>
 #include <assert.h>
+#include <errno.h>
 
 static const char *
 cuda_error (CUresult r)
@@ -932,9 +933,84 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
   if (seen_zero)
     {
+      /* See if the user provided GOMP_OPENACC_DIM environment
+	 variable to specify runtime defaults. */
+      static int default_dims[GOMP_DIM_MAX];
+
+      if (!default_dims[0])
+	{
+	  /* We only read the environment variable once.  You can't
+	     change it in the middle of execution.  The sytntax  is
+	     the same as for the -fopenacc-dim compilation option.  */
+	  const char *env_var = getenv ("GOMP_OPENACC_DIM");
+	  if (env_var)
+	    {
+	      const char *pos = env_var;
+
+	      for (i = 0; *pos && i != GOMP_DIM_MAX; i++)
+		{
+		  if (i && *pos++ != ':')
+		    break;
+		  if (*pos != ':')
+		    {
+		      const char *eptr;
+
+		      errno = 0;
+		      long val = strtol (pos, (char **)&eptr, 10);
+		      if (errno || val < 0 || (unsigned)val != val)
+			break;
+		      default_dims[i] = (int)val;
+		      pos = eptr;
+		    }
+		}
+	    }
+
+	  int warp_size, block_size, dev_size, cpu_size;
+	  CUdevice dev = nvptx_thread()->ptx_dev->dev;
+	  /* 32 is the default for known hardware.  */
+	  int gang = 0, worker = 32, vector = 32;
+
+	  if (CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&block_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&warp_size, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&dev_size, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev)
+	      && CUDA_SUCCESS == cuDeviceGetAttribute
+	      (&cpu_size, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, dev))
+	    {
+	      GOMP_PLUGIN_debug (0, " warp_size=%d, block_size=%d,"
+				 " dev_size=%d, cpu_size=%d\n",
+				 warp_size, block_size, dev_size, cpu_size);
+	      gang = (cpu_size / block_size) * dev_size;
+	      worker = block_size / warp_size;
+	      vector = warp_size;
+	    }
+
+	  /* There is no upper bound on the gang size.  The best size
+	     matches the hardware configuration.  Logical gangs are
+	     scheduled onto physical hardware.  To maximize usage, we
+	     should guess a large number.  */
+	  if (default_dims[GOMP_DIM_GANG] < 1)
+	    default_dims[GOMP_DIM_GANG] = gang ? gang : 1024;
+	  /* The worker size must not exceed the hardware.  */
+	  if (default_dims[GOMP_DIM_WORKER] < 1
+	      || (default_dims[GOMP_DIM_WORKER] > worker && gang))
+	    default_dims[GOMP_DIM_WORKER] = worker;
+	  /* The vector size must exactly match the hardware.  */
+	  if (default_dims[GOMP_DIM_VECTOR] < 1
+	      || (default_dims[GOMP_DIM_VECTOR] != vector && gang))
+	    default_dims[GOMP_DIM_VECTOR] = vector;
+
+	  GOMP_PLUGIN_debug (0, " default dimensions [%d,%d,%d]\n",
+			     default_dims[GOMP_DIM_GANG],
+			     default_dims[GOMP_DIM_WORKER],
+			     default_dims[GOMP_DIM_VECTOR]);
+	}
+
       for (i = 0; i != GOMP_DIM_MAX; i++)
-       if (!dims[i])
-         dims[i] = /* TODO */ 32;
+	if (!dims[i])
+	  dims[i] = default_dims[i];
     }
 
   /* This reserves a chunk of a pre-allocated page of memory mapped on both
@@ -954,8 +1030,8 @@  nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 		    mapnum * sizeof (void *));
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
 		     " gangs=%u, workers=%u, vectors=%u\n",
-		     __FUNCTION__, targ_fn->launch->fn,
-		     dims[0], dims[1], dims[2]);
+		     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
+		     dims[GOMP_DIM_WORKER], dims[GOMP_DIM_VECTOR]);
 
   // OpenACC		CUDA
   //
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
index 8a755b8..3ca9388 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-auto-1.c
@@ -2,6 +2,8 @@ 
    not optimized away at -O0, and then confuses the target assembler.
    { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
 
+/* { dg-additional-options "-fopenacc-dim=32" } */
+
 #include <stdio.h>
 #include <openacc.h>

[openacc] adjust default num_gangs

Commit Message

Comments

Patch