[tip/core/rcu,12/23] rcu: Prevent force_quiescent_state() memory contention

Message ID 1346350718-30937-12-git-send-email-paulmck@linux.vnet.ibm.com
State Superseded

Commit Message

Paul E. McKenney Aug. 30, 2012, 6:18 p.m. UTC
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>

Large systems running RCU_FAST_NO_HZ kernels see extreme memory
contention on the rcu_state structure's ->fqslock field.  This
can be avoided by disabling RCU_FAST_NO_HZ, either at compile time
or at boot time (via the nohz kernel boot parameter), but large
systems will no doubt become sensitive to energy consumption.
This commit therefore uses a combining-tree approach to spread the
memory contention across new cache lines in the leaf rcu_node structures.
This can be thought of as a tournament lock that has only a try-lock
acquisition primitive.

The effect on small systems is minimal, because such systems have
an rcu_node "tree" consisting of a single node.  In addition, this
functionality is not used on fastpaths.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c |   47 +++++++++++++++++++++++++++++++++++++----------
 kernel/rcutree.h |    1 +
 2 files changed, 38 insertions(+), 10 deletions(-)
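
The following user-space sketch illustrates the funnel idea described in the
commit message: a caller try-locks its way from a leaf of a small combining
tree toward the root, dropping the previous level's lock once the next level
is acquired, and gives up as soon as a try-lock fails or the work has already
been requested.  All names here (funnel_node, request_work, and so on) are
hypothetical; this is only an illustration of the pattern, not the kernel code.

/*
 * Tournament-lock funnel sketch: a caller climbs toward the root only while
 * it keeps winning try-locks, so most losers drop out on a low-level node
 * instead of hammering the root.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct funnel_node {
	pthread_mutex_t trylock;	/* plays the role of ->fqslock */
	struct funnel_node *parent;	/* NULL at the root */
};

static struct funnel_node root = {
	.trylock = PTHREAD_MUTEX_INITIALIZER,
	.parent = NULL,
};
static struct funnel_node leaves[4] = {
	{ PTHREAD_MUTEX_INITIALIZER, &root },
	{ PTHREAD_MUTEX_INITIALIZER, &root },
	{ PTHREAD_MUTEX_INITIALIZER, &root },
	{ PTHREAD_MUTEX_INITIALIZER, &root },
};
static atomic_bool work_requested;	/* plays the role of RCU_GP_FLAG_FQS */

/* Funnel from @leaf toward the root; return true if we requested the work. */
static bool request_work(struct funnel_node *leaf)
{
	struct funnel_node *np, *np_old = NULL;

	for (np = leaf; np != NULL; np = np->parent) {
		bool lost = atomic_load(&work_requested) ||
			    pthread_mutex_trylock(&np->trylock) != 0;
		if (np_old != NULL)
			pthread_mutex_unlock(&np_old->trylock);
		if (lost)
			return false;	/* someone else is already ahead of us */
		np_old = np;
	}

	/* Only the root's try-lock is held here: request the work, drop it. */
	if (atomic_exchange(&work_requested, true)) {
		pthread_mutex_unlock(&np_old->trylock);
		return false;	/* someone beat us to it */
	}
	pthread_mutex_unlock(&np_old->trylock);
	return true;
}

static void *worker(void *arg)
{
	if (request_work(arg))
		puts("this thread won the tournament");
	return NULL;
}

int main(void)
{
	pthread_t tid[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, worker, &leaves[i]);
	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return 0;
}

Exactly one thread prints the message per run: the flag check turns late
arrivals away, and the per-node try-locks keep most callers from ever
reaching the root.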

Comments

Josh Triplett Sept. 2, 2012, 10:47 a.m. UTC | #1
On Thu, Aug 30, 2012 at 11:18:27AM -0700, Paul E. McKenney wrote:
> --- a/kernel/rcutree.c
> +++ b/kernel/rcutree.c
[...]
> @@ -1824,16 +1825,35 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
>  static void force_quiescent_state(struct rcu_state *rsp)
>  {
>  	unsigned long flags;
> -	struct rcu_node *rnp = rcu_get_root(rsp);
> +	bool ret;
> +	struct rcu_node *rnp;
> +	struct rcu_node *rnp_old = NULL;
> +
> +	/* Funnel through hierarchy to reduce memory contention. */
> +	rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;

What makes this use of raw_smp_processor_id() safe?  (And, could you
document the answer here?)

> +	for (; rnp != NULL; rnp = rnp->parent) {
> +		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
> +		      !raw_spin_trylock(&rnp->fqslock);

So, the root lock will still get trylocked by one CPU per second-level
tree node, just not by every CPU?

> @@ -2721,10 +2741,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
>  static void __init rcu_init_one(struct rcu_state *rsp,
>  		struct rcu_data __percpu *rda)
>  {
> -	static char *buf[] = { "rcu_node_level_0",
> -			       "rcu_node_level_1",
> -			       "rcu_node_level_2",
> -			       "rcu_node_level_3" };  /* Match MAX_RCU_LVLS */
> +	static char *buf[] = { "rcu_node_0",
> +			       "rcu_node_1",
> +			       "rcu_node_2",
> +			       "rcu_node_3" };  /* Match MAX_RCU_LVLS */

Why rename these?

- Josh Triplett

Patch

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index cce73ff..ed1be62 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -61,6 +61,7 @@ 
 /* Data structures. */
 
 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
 
 #define RCU_STATE_INITIALIZER(sname, cr) { \
 	.level = { &sname##_state.node[0] }, \
@@ -1824,16 +1825,35 @@  static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
 static void force_quiescent_state(struct rcu_state *rsp)
 {
 	unsigned long flags;
-	struct rcu_node *rnp = rcu_get_root(rsp);
+	bool ret;
+	struct rcu_node *rnp;
+	struct rcu_node *rnp_old = NULL;
+
+	/* Funnel through hierarchy to reduce memory contention. */
+	rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+	for (; rnp != NULL; rnp = rnp->parent) {
+		ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+		      !raw_spin_trylock(&rnp->fqslock);
+		if (rnp_old != NULL)
+			raw_spin_unlock(&rnp_old->fqslock);
+		if (ret) {
+			rsp->n_force_qs_lh++;
+			return;
+		}
+		rnp_old = rnp;
+	}
+	/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
 
-	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS)
+	/* Reached the root of the rcu_node tree, acquire lock. */
+	raw_spin_lock_irqsave(&rnp_old->lock, flags);
+	raw_spin_unlock(&rnp_old->fqslock);
+	if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+		rsp->n_force_qs_lh++;
+		raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 		return;  /* Someone beat us to it. */
-	if (!raw_spin_trylock_irqsave(&rnp->lock, flags)) {
-		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
-		return;
 	}
 	rsp->gp_flags |= RCU_GP_FLAG_FQS;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
 	wake_up(&rsp->gp_wq);  /* Memory barrier implied by wake_up() path. */
 }
 
@@ -2721,10 +2741,14 @@  static void __init rcu_init_levelspread(struct rcu_state *rsp)
 static void __init rcu_init_one(struct rcu_state *rsp,
 		struct rcu_data __percpu *rda)
 {
-	static char *buf[] = { "rcu_node_level_0",
-			       "rcu_node_level_1",
-			       "rcu_node_level_2",
-			       "rcu_node_level_3" };  /* Match MAX_RCU_LVLS */
+	static char *buf[] = { "rcu_node_0",
+			       "rcu_node_1",
+			       "rcu_node_2",
+			       "rcu_node_3" };  /* Match MAX_RCU_LVLS */
+	static char *fqs[] = { "rcu_node_fqs_0",
+			       "rcu_node_fqs_1",
+			       "rcu_node_fqs_2",
+			       "rcu_node_fqs_3" };  /* Match MAX_RCU_LVLS */
 	int cpustride = 1;
 	int i;
 	int j;
@@ -2749,6 +2773,9 @@  static void __init rcu_init_one(struct rcu_state *rsp,
 			raw_spin_lock_init(&rnp->lock);
 			lockdep_set_class_and_name(&rnp->lock,
 						   &rcu_node_class[i], buf[i]);
+			raw_spin_lock_init(&rnp->fqslock);
+			lockdep_set_class_and_name(&rnp->fqslock,
+						   &rcu_fqs_class[i], fqs[i]);
 			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 36916df..2d4cc18 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,7 @@  struct rcu_node {
 				/*  per-CPU kthreads as needed. */
 	unsigned int node_kthread_status;
 				/* State of node_kthread_task for tracing. */
+	raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
 } ____cacheline_internodealigned_in_smp;
 
 /*