diff mbox

[libstdc++] Optimize selection sampling by using generator to get two values at once

Message ID 20161021155509.GU2922@redhat.com
State New
Headers show

Commit Message

Jonathan Wakely Oct. 21, 2016, 3:55 p.m. UTC
On 19/10/16 12:48 +0200, Eelis van der Weegen wrote:
>This is the same optimization as was recently applied to std::shuffle.

>

>It reduces the runtime of the following program by 20% on my machine:

>

>	int main()

>	{

>		std::vector<uint64_t> a(10000), b(1000);

>		std::mt19937 gen;

>

>		uint64_t c = 0;

>

>		for (int i = 0; i != 1000; ++i)

>		{

>			std::sample(a.begin(), a.end(), b.begin(), b.size(), gen);

>			c += uint64_t(b[32]);

>		}

>

>		std::cout << c;

>	}


Thanks, I've committd this slightly revised version to trunk (tweaking
some whitespace, removing some redundant std:: qualification, and
using foo_t<T> aliases instead of typename foo<T>::type).

Tested powerpc64le-linux. Committed to trunk.
diff mbox

Patch

commit 01535578bc44d810e7cf4c2bfbc3836d7977e229
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Fri Oct 21 16:37:21 2016 +0100

    Optimize RNG use in std::sample selection sampling
    
    2016-10-21  Eelis van der Weegen  <eelis@eelis.net>
    
    	* include/bits/stl_algo.h (__gen_two_uniform_ints): Move logic out
    	of shuffle into new function.
    	(shuffle): Call __gen_two_uniform_ints.
    	(__sample<ForwardIterator, OutputIterator, Cat, Size, URBG>): Use
    	__gen_two_uniform_ints and perform two samples at a time.

diff --git a/libstdc++-v3/include/bits/stl_algo.h b/libstdc++-v3/include/bits/stl_algo.h
index 6c771bb..3ecb33b 100644
--- a/libstdc++-v3/include/bits/stl_algo.h
+++ b/libstdc++-v3/include/bits/stl_algo.h
@@ -3741,6 +3741,37 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
   /**
+   *  @brief Generate two uniformly distributed integers using a
+   *         single distribution invocation.
+   *  @param  __b0    The upper bound for the first integer.
+   *  @param  __b1    The upper bound for the second integer.
+   *  @param  __g     A UniformRandomBitGenerator.
+   *  @return  A pair (i, j) with i and j uniformly distributed
+   *           over [0, __b0) and [0, __b1), respectively.
+   *
+   *  Requires: __b0 * __b1 <= __g.max() - __g.min().
+   *
+   *  Using uniform_int_distribution with a range that is very
+   *  small relative to the range of the generator ends up wasting
+   *  potentially expensively generated randomness, since
+   *  uniform_int_distribution does not store leftover randomness
+   *  between invocations.
+   *
+   *  If we know we want two integers in ranges that are sufficiently
+   *  small, we can compose the ranges, use a single distribution
+   *  invocation, and significantly reduce the waste.
+  */
+  template<typename _IntType, typename _UniformRandomBitGenerator>
+    pair<_IntType, _IntType>
+    __gen_two_uniform_ints(_IntType __b0, _IntType __b1,
+			   _UniformRandomBitGenerator&& __g)
+    {
+      _IntType __x
+	= uniform_int_distribution<_IntType>{0, (__b0 * __b1) - 1}(__g);
+      return std::make_pair(__x / __b1, __x % __b1);
+    }
+
+  /**
    *  @brief Shuffle the elements of a sequence using a uniform random
    *         number generator.
    *  @ingroup mutating_algorithms
@@ -3773,8 +3804,10 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
       typedef typename std::uniform_int_distribution<__ud_type> __distr_type;
       typedef typename __distr_type::param_type __p_type;
 
-      typedef typename std::remove_reference<_UniformRandomNumberGenerator>::type _Gen;
-      typedef typename std::common_type<typename _Gen::result_type, __ud_type>::type __uc_type;
+      typedef typename remove_reference<_UniformRandomNumberGenerator>::type
+	_Gen;
+      typedef typename common_type<typename _Gen::result_type, __ud_type>::type
+	__uc_type;
 
       const __uc_type __urngrange = __g.max() - __g.min();
       const __uc_type __urange = __uc_type(__last - __first);
@@ -3801,13 +3834,12 @@  _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	while (__i != __last)
 	{
 	  const __uc_type __swap_range = __uc_type(__i - __first) + 1;
-	  const __uc_type __comp_range = __swap_range * (__swap_range + 1);
 
-	  std::uniform_int_distribution<__uc_type> __d{0, __comp_range - 1};
-	  const __uc_type __pospos = __d(__g);
+	  const pair<__uc_type, __uc_type> __pospos =
+	    __gen_two_uniform_ints(__swap_range, __swap_range + 1, __g);
 
-	  std::iter_swap(__i++, __first + (__pospos % __swap_range));
-	  std::iter_swap(__i++, __first + (__pospos / __swap_range));
+	  std::iter_swap(__i++, __first + __pospos.first);
+	  std::iter_swap(__i++, __first + __pospos.second);
 	}
 
 	return;
@@ -5695,9 +5727,52 @@  _GLIBCXX_BEGIN_NAMESPACE_ALGO
     {
       using __distrib_type = uniform_int_distribution<_Size>;
       using __param_type = typename __distrib_type::param_type;
+      using _USize = make_unsigned_t<_Size>;
+      using _Gen = remove_reference_t<_UniformRandomBitGenerator>;
+      using __uc_type = common_type_t<typename _Gen::result_type, _USize>;
+
       __distrib_type __d{};
       _Size __unsampled_sz = std::distance(__first, __last);
-      for (__n = std::min(__n, __unsampled_sz); __n != 0; ++__first)
+      __n = std::min(__n, __unsampled_sz);
+
+      // If possible, we use __gen_two_uniform_ints to efficiently produce
+      // two random numbers using a single distribution invocation:
+
+      const __uc_type __urngrange = __g.max() - __g.min();
+      if (__urngrange / __uc_type(__unsampled_sz) >= __uc_type(__unsampled_sz))
+        // I.e. (__urngrange >= __unsampled_sz * __unsampled_sz) but without
+	// wrapping issues.
+        {
+	  while (__n != 0 && __unsampled_sz >= 2)
+	    {
+	      const pair<_Size, _Size> p =
+		__gen_two_uniform_ints(__unsampled_sz, __unsampled_sz - 1, __g);
+
+	      --__unsampled_sz;
+	      if (p.first < __n)
+		{
+		  *__out++ = *__first;
+		  --__n;
+		}
+
+	      ++__first;
+
+	      if (__n == 0) break;
+
+	      --__unsampled_sz;
+	      if (p.second < __n)
+		{
+		  *__out++ = *__first;
+		  --__n;
+		}
+
+	      ++__first;
+	    }
+        }
+
+      // The loop above is otherwise equivalent to this one-at-a-time version:
+
+      for (; __n != 0; ++__first)
 	if (__d(__g, __param_type{0, --__unsampled_sz}) < __n)
 	  {
 	    *__out++ = *__first;