diff mbox series

[v1,4/4] cpufreq: intel_pstate: Implement the ->adjust_perf() callback

Message ID 3342398.tGQZsKHvNY@kreacher
State Superseded
Headers show
Series [v1,1/4] cpufreq: schedutil: Add util to struct sg_cpu | expand

Commit Message

Rafael J. Wysocki Dec. 7, 2020, 4:38 p.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Make intel_pstate expose the ->adjust_perf() callback when it
operates in the passive mode with HWP enabled, which causes the
schedutil governor to use that callback instead of ->fast_switch().

The minimum and target performance-level values passed by the
governor to ->adjust_perf() are converted to HWP.REQ.MIN and
HWP.REQ.DESIRED, respectively, which allows the processor to
adjust its configuration to maximize energy efficiency while
providing sufficient capacity.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---

Changes with respect to the RFC:
 - Drop the code related to the dropped "busy" argument of
   ->adjust_perf().
 - Update the changelog accordingly.

---
 drivers/cpufreq/intel_pstate.c |   70 +++++++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 12 deletions(-)

Comments

Peter Zijlstra Dec. 8, 2020, 12:43 p.m. UTC | #1
On Mon, Dec 07, 2020 at 05:38:58PM +0100, Rafael J. Wysocki wrote:

> +static void intel_cpufreq_adjust_perf(unsigned int cpunum,

> +				      unsigned long min_perf,

> +				      unsigned long target_perf,

> +				      unsigned long capacity)

> +{

> +	struct cpudata *cpu = all_cpu_data[cpunum];

> +	int old_pstate = cpu->pstate.current_pstate;

> +	int cap_pstate, min_pstate, max_pstate, target_pstate;

> +

> +	update_turbo_state();

> +	cap_pstate = global.turbo_disabled ? cpu->pstate.max_pstate :

> +					     cpu->pstate.turbo_pstate;

> +

> +	/* Optimization: Avoid unnecessary divisions. */

> +

> +	target_pstate = cap_pstate;

> +	if (target_perf < capacity)

> +		target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);

> +

> +	min_pstate = cap_pstate;

> +	if (min_perf < capacity)

> +		min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);

> +

> +	if (min_pstate < cpu->pstate.min_pstate)

> +		min_pstate = cpu->pstate.min_pstate;

> +

> +	if (min_pstate < cpu->min_perf_ratio)

> +		min_pstate = cpu->min_perf_ratio;

> +

> +	max_pstate = min(cap_pstate, cpu->max_perf_ratio);

> +	if (max_pstate < min_pstate)

> +		max_pstate = min_pstate;

> +

> +	target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);

> +

> +	intel_cpufreq_adjust_hwp(cpu, min_pstate, max_pstate, target_pstate, true);


I'm confused... HWP doesn't do pstate, yet everything here is now called
pstate, help?

> +

> +	cpu->pstate.current_pstate = target_pstate;

> +	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);

> +}
Rafael J. Wysocki Dec. 8, 2020, 5:10 p.m. UTC | #2
On Tue, Dec 8, 2020 at 1:44 PM Peter Zijlstra <peterz@infradead.org> wrote:
>

> On Mon, Dec 07, 2020 at 05:38:58PM +0100, Rafael J. Wysocki wrote:

>

> > +static void intel_cpufreq_adjust_perf(unsigned int cpunum,

> > +                                   unsigned long min_perf,

> > +                                   unsigned long target_perf,

> > +                                   unsigned long capacity)

> > +{

> > +     struct cpudata *cpu = all_cpu_data[cpunum];

> > +     int old_pstate = cpu->pstate.current_pstate;

> > +     int cap_pstate, min_pstate, max_pstate, target_pstate;

> > +

> > +     update_turbo_state();

> > +     cap_pstate = global.turbo_disabled ? cpu->pstate.max_pstate :

> > +                                          cpu->pstate.turbo_pstate;

> > +

> > +     /* Optimization: Avoid unnecessary divisions. */

> > +

> > +     target_pstate = cap_pstate;

> > +     if (target_perf < capacity)

> > +             target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);

> > +

> > +     min_pstate = cap_pstate;

> > +     if (min_perf < capacity)

> > +             min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);

> > +

> > +     if (min_pstate < cpu->pstate.min_pstate)

> > +             min_pstate = cpu->pstate.min_pstate;

> > +

> > +     if (min_pstate < cpu->min_perf_ratio)

> > +             min_pstate = cpu->min_perf_ratio;

> > +

> > +     max_pstate = min(cap_pstate, cpu->max_perf_ratio);

> > +     if (max_pstate < min_pstate)

> > +             max_pstate = min_pstate;

> > +

> > +     target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);

> > +

> > +     intel_cpufreq_adjust_hwp(cpu, min_pstate, max_pstate, target_pstate, true);

>

> I'm confused... HWP doesn't do pstate, yet everything here is now called

> pstate, help?


HWP.REQ.MIN, HWP.REQ.MAX and HWP.REQ.DESIRED all are in the same space
of values as the original PERF_CTL MSR, which is P-states, at least
effectively.

> > +

> > +     cpu->pstate.current_pstate = target_pstate;

> > +     intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);

> > +}
diff mbox series

Patch

Index: linux-pm/drivers/cpufreq/intel_pstate.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/intel_pstate.c
+++ linux-pm/drivers/cpufreq/intel_pstate.c
@@ -2526,20 +2526,19 @@  static void intel_cpufreq_trace(struct c
 		fp_toint(cpu->iowait_boost * 100));
 }
 
-static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 target_pstate,
-				     bool strict, bool fast_switch)
+static void intel_cpufreq_adjust_hwp(struct cpudata *cpu, u32 min, u32 max,
+				     u32 desired, bool fast_switch)
 {
 	u64 prev = READ_ONCE(cpu->hwp_req_cached), value = prev;
 
 	value &= ~HWP_MIN_PERF(~0L);
-	value |= HWP_MIN_PERF(target_pstate);
+	value |= HWP_MIN_PERF(min);
 
-	/*
-	 * The entire MSR needs to be updated in order to update the HWP min
-	 * field in it, so opportunistically update the max too if needed.
-	 */
 	value &= ~HWP_MAX_PERF(~0L);
-	value |= HWP_MAX_PERF(strict ? target_pstate : cpu->max_perf_ratio);
+	value |= HWP_MAX_PERF(max);
+
+	value &= ~HWP_DESIRED_PERF(~0L);
+	value |= HWP_DESIRED_PERF(desired);
 
 	if (value == prev)
 		return;
@@ -2569,11 +2568,15 @@  static int intel_cpufreq_update_pstate(s
 	int old_pstate = cpu->pstate.current_pstate;
 
 	target_pstate = intel_pstate_prepare_request(cpu, target_pstate);
-	if (hwp_active)
-		intel_cpufreq_adjust_hwp(cpu, target_pstate,
-					 policy->strict_target, fast_switch);
-	else if (target_pstate != old_pstate)
+	if (hwp_active) {
+		int max_pstate = policy->strict_target ?
+					target_pstate : cpu->max_perf_ratio;
+
+		intel_cpufreq_adjust_hwp(cpu, target_pstate, max_pstate, 0,
+					 fast_switch);
+	} else if (target_pstate != old_pstate) {
 		intel_cpufreq_adjust_perf_ctl(cpu, target_pstate, fast_switch);
+	}
 
 	cpu->pstate.current_pstate = target_pstate;
 
@@ -2634,6 +2637,47 @@  static unsigned int intel_cpufreq_fast_s
 	return target_pstate * cpu->pstate.scaling;
 }
 
+static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+				      unsigned long min_perf,
+				      unsigned long target_perf,
+				      unsigned long capacity)
+{
+	struct cpudata *cpu = all_cpu_data[cpunum];
+	int old_pstate = cpu->pstate.current_pstate;
+	int cap_pstate, min_pstate, max_pstate, target_pstate;
+
+	update_turbo_state();
+	cap_pstate = global.turbo_disabled ? cpu->pstate.max_pstate :
+					     cpu->pstate.turbo_pstate;
+
+	/* Optimization: Avoid unnecessary divisions. */
+
+	target_pstate = cap_pstate;
+	if (target_perf < capacity)
+		target_pstate = DIV_ROUND_UP(cap_pstate * target_perf, capacity);
+
+	min_pstate = cap_pstate;
+	if (min_perf < capacity)
+		min_pstate = DIV_ROUND_UP(cap_pstate * min_perf, capacity);
+
+	if (min_pstate < cpu->pstate.min_pstate)
+		min_pstate = cpu->pstate.min_pstate;
+
+	if (min_pstate < cpu->min_perf_ratio)
+		min_pstate = cpu->min_perf_ratio;
+
+	max_pstate = min(cap_pstate, cpu->max_perf_ratio);
+	if (max_pstate < min_pstate)
+		max_pstate = min_pstate;
+
+	target_pstate = clamp_t(int, target_pstate, min_pstate, max_pstate);
+
+	intel_cpufreq_adjust_hwp(cpu, min_pstate, max_pstate, target_pstate, true);
+
+	cpu->pstate.current_pstate = target_pstate;
+	intel_cpufreq_trace(cpu, INTEL_PSTATE_TRACE_FAST_SWITCH, old_pstate);
+}
+
 static int intel_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
 	int max_state, turbo_max, min_freq, max_freq, ret;
@@ -3032,6 +3076,8 @@  static int __init intel_pstate_init(void
 			intel_pstate.attr = hwp_cpufreq_attrs;
 			intel_cpufreq.attr = hwp_cpufreq_attrs;
 			intel_cpufreq.flags |= CPUFREQ_NEED_UPDATE_LIMITS;
+			intel_cpufreq.fast_switch = NULL;
+			intel_cpufreq.adjust_perf = intel_cpufreq_adjust_perf;
 			if (!default_driver)
 				default_driver = &intel_pstate;