devfreq: Register devfreq as a cooling device

Message ID 20210304125034.28404-1-daniel.lezcano@linaro.org
State New
Headers show
Series
  • devfreq: Register devfreq as a cooling device
Related show

Commit Message

Daniel Lezcano March 4, 2021, 12:50 p.m.
Currently the default behavior is to manually having the devfreq
backend to register themselves as a devfreq cooling device.

There are no so many and actually it makes more sense to register the
devfreq device when adding it.

Consequently, every devfreq becomes a cooling device like cpufreq is.

Having a devfreq being registered as a cooling device can not mitigate
a thermal zone if it is not bound to this one. Thus, the current
configurations are not impacted by this change.

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

---
 drivers/devfreq/devfreq.c                   |  8 ++++++++
 drivers/gpu/drm/lima/lima_devfreq.c         | 13 -------------
 drivers/gpu/drm/lima/lima_devfreq.h         |  2 --
 drivers/gpu/drm/msm/msm_gpu.c               | 11 -----------
 drivers/gpu/drm/msm/msm_gpu.h               |  2 --
 drivers/gpu/drm/panfrost/panfrost_devfreq.c | 13 -------------
 include/linux/devfreq.h                     |  3 +++
 7 files changed, 11 insertions(+), 41 deletions(-)

-- 
2.17.1

Comments

Chanwoo Choi March 4, 2021, 3:06 p.m. | #1
Hi Daniel,

As Lukasz's comment, actually some devfreq devices like memory bus
might not affect the thermal critically. In the mainline,
there are four types devfreq as following:
1. GPU
2. UFS Storage
3. DMC (Memory Controller)
4. Memory bus like AMBA AXI

I think that you can specify this devfreq device will be used
for cooling device by editing the devfreq_dev_profile structure.

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index bf3047896e41..77966a17d03f 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -935,6 +935,13 @@ struct devfreq *devfreq_add_device(struct device *dev,

         mutex_unlock(&devfreq_list_lock);

+       if (devfreq->profile->is_cooling_device) {
+               devfreq->cdev = devfreq_cooling_em_register(devfreq, NULL);
+               if (IS_ERR(devfreq->cdev))
+                       dev_info(dev,
+                               "Failed to register devfreq cooling 
device\n");
+       }
+
         return devfreq;

  err_init:
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 26ea0850be9b..26dc69f1047b 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -103,6 +103,7 @@ struct devfreq_dev_profile {
         unsigned long initial_freq;
         unsigned int polling_ms;
         enum devfreq_timer timer;
+       bool is_cooling_device;

         int (*target)(struct device *dev, unsigned long *freq, u32 flags);
         int (*get_dev_status)(struct device *dev,


Thanks
Chanwoo Choi

On 21. 3. 4. 오후 9:50, Daniel Lezcano wrote:
> Currently the default behavior is to manually having the devfreq
> backend to register themselves as a devfreq cooling device.
> 
> There are no so many and actually it makes more sense to register the
> devfreq device when adding it.
> 
> Consequently, every devfreq becomes a cooling device like cpufreq is.
> 
> Having a devfreq being registered as a cooling device can not mitigate
> a thermal zone if it is not bound to this one. Thus, the current
> configurations are not impacted by this change.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>   drivers/devfreq/devfreq.c                   |  8 ++++++++
>   drivers/gpu/drm/lima/lima_devfreq.c         | 13 -------------
>   drivers/gpu/drm/lima/lima_devfreq.h         |  2 --
>   drivers/gpu/drm/msm/msm_gpu.c               | 11 -----------
>   drivers/gpu/drm/msm/msm_gpu.h               |  2 --
>   drivers/gpu/drm/panfrost/panfrost_devfreq.c | 13 -------------
>   include/linux/devfreq.h                     |  3 +++
>   7 files changed, 11 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
> index b6d63f02d293..19149b31b000 100644
> --- a/drivers/devfreq/devfreq.c
> +++ b/drivers/devfreq/devfreq.c
> @@ -11,6 +11,7 @@
>   #include <linux/kmod.h>
>   #include <linux/sched.h>
>   #include <linux/debugfs.h>
> +#include <linux/devfreq_cooling.h>
>   #include <linux/errno.h>
>   #include <linux/err.h>
>   #include <linux/init.h>
> @@ -26,6 +27,7 @@
>   #include <linux/hrtimer.h>
>   #include <linux/of.h>
>   #include <linux/pm_qos.h>
> +#include <linux/thermal.h>
>   #include <linux/units.h>
>   #include "governor.h"
>   
> @@ -935,6 +937,10 @@ struct devfreq *devfreq_add_device(struct device *dev,
>   
>   	mutex_unlock(&devfreq_list_lock);
>   
> +	devfreq->cdev = devfreq_cooling_em_register(devfreq, NULL);
> +	if (IS_ERR(devfreq->cdev))
> +		dev_info(dev, "Failed to register devfreq cooling device\n");
> +
>   	return devfreq;
>   
>   err_init:
> @@ -960,6 +966,8 @@ int devfreq_remove_device(struct devfreq *devfreq)
>   	if (!devfreq)
>   		return -EINVAL;
>   
> +	thermal_cooling_device_unregister(devfreq->cdev);
> +
>   	if (devfreq->governor) {
>   		devfreq->governor->event_handler(devfreq,
>   						 DEVFREQ_GOV_STOP, NULL);
> diff --git a/drivers/gpu/drm/lima/lima_devfreq.c b/drivers/gpu/drm/lima/lima_devfreq.c
> index 5686ad4aaf7c..a696eff1642c 100644
> --- a/drivers/gpu/drm/lima/lima_devfreq.c
> +++ b/drivers/gpu/drm/lima/lima_devfreq.c
> @@ -7,7 +7,6 @@
>    */
>   #include <linux/clk.h>
>   #include <linux/devfreq.h>
> -#include <linux/devfreq_cooling.h>
>   #include <linux/device.h>
>   #include <linux/platform_device.h>
>   #include <linux/pm_opp.h>
> @@ -90,11 +89,6 @@ void lima_devfreq_fini(struct lima_device *ldev)
>   {
>   	struct lima_devfreq *devfreq = &ldev->devfreq;
>   
> -	if (devfreq->cooling) {
> -		devfreq_cooling_unregister(devfreq->cooling);
> -		devfreq->cooling = NULL;
> -	}
> -
>   	if (devfreq->devfreq) {
>   		devm_devfreq_remove_device(ldev->dev, devfreq->devfreq);
>   		devfreq->devfreq = NULL;
> @@ -110,7 +104,6 @@ void lima_devfreq_fini(struct lima_device *ldev)
>   
>   int lima_devfreq_init(struct lima_device *ldev)
>   {
> -	struct thermal_cooling_device *cooling;
>   	struct device *dev = ldev->dev;
>   	struct opp_table *opp_table;
>   	struct devfreq *devfreq;
> @@ -173,12 +166,6 @@ int lima_devfreq_init(struct lima_device *ldev)
>   
>   	ldevfreq->devfreq = devfreq;
>   
> -	cooling = of_devfreq_cooling_register(dev->of_node, devfreq);
> -	if (IS_ERR(cooling))
> -		dev_info(dev, "Failed to register cooling device\n");
> -	else
> -		ldevfreq->cooling = cooling;
> -
>   	return 0;
>   
>   err_fini:
> diff --git a/drivers/gpu/drm/lima/lima_devfreq.h b/drivers/gpu/drm/lima/lima_devfreq.h
> index 2d9b3008ce77..c43a2069e5d3 100644
> --- a/drivers/gpu/drm/lima/lima_devfreq.h
> +++ b/drivers/gpu/drm/lima/lima_devfreq.h
> @@ -9,7 +9,6 @@
>   
>   struct devfreq;
>   struct opp_table;
> -struct thermal_cooling_device;
>   
>   struct lima_device;
>   
> @@ -17,7 +16,6 @@ struct lima_devfreq {
>   	struct devfreq *devfreq;
>   	struct opp_table *clkname_opp_table;
>   	struct opp_table *regulators_opp_table;
> -	struct thermal_cooling_device *cooling;
>   
>   	ktime_t busy_time;
>   	ktime_t idle_time;
> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
> index ab7c167b0623..d7f80ebfe9df 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.c
> +++ b/drivers/gpu/drm/msm/msm_gpu.c
> @@ -14,7 +14,6 @@
>   #include <generated/utsrelease.h>
>   #include <linux/string_helpers.h>
>   #include <linux/devfreq.h>
> -#include <linux/devfreq_cooling.h>
>   #include <linux/devcoredump.h>
>   #include <linux/sched/task.h>
>   
> @@ -112,14 +111,6 @@ static void msm_devfreq_init(struct msm_gpu *gpu)
>   	}
>   
>   	devfreq_suspend_device(gpu->devfreq.devfreq);
> -
> -	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
> -			gpu->devfreq.devfreq);
> -	if (IS_ERR(gpu->cooling)) {
> -		DRM_DEV_ERROR(&gpu->pdev->dev,
> -				"Couldn't register GPU cooling device\n");
> -		gpu->cooling = NULL;
> -	}
>   }
>   
>   static int enable_pwrrail(struct msm_gpu *gpu)
> @@ -1056,6 +1047,4 @@ void msm_gpu_cleanup(struct msm_gpu *gpu)
>   	if (gpu->worker) {
>   		kthread_destroy_worker(gpu->worker);
>   	}
> -
> -	devfreq_cooling_unregister(gpu->cooling);
>   }
> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
> index d7cd02cd2109..93419368bac8 100644
> --- a/drivers/gpu/drm/msm/msm_gpu.h
> +++ b/drivers/gpu/drm/msm/msm_gpu.h
> @@ -155,8 +155,6 @@ struct msm_gpu {
>   	struct msm_gpu_state *crashstate;
>   	/* True if the hardware supports expanded apriv (a650 and newer) */
>   	bool hw_apriv;
> -
> -	struct thermal_cooling_device *cooling;
>   };
>   
>   static inline struct msm_gpu *dev_to_gpu(struct device *dev)
> diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> index 56b3f5935703..2cb6300de1f1 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> @@ -3,7 +3,6 @@
>   
>   #include <linux/clk.h>
>   #include <linux/devfreq.h>
> -#include <linux/devfreq_cooling.h>
>   #include <linux/platform_device.h>
>   #include <linux/pm_opp.h>
>   
> @@ -90,7 +89,6 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
>   	struct device *dev = &pfdev->pdev->dev;
>   	struct devfreq *devfreq;
>   	struct opp_table *opp_table;
> -	struct thermal_cooling_device *cooling;
>   	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
>   
>   	opp_table = dev_pm_opp_set_regulators(dev, pfdev->comp->supply_names,
> @@ -139,12 +137,6 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
>   	}
>   	pfdevfreq->devfreq = devfreq;
>   
> -	cooling = devfreq_cooling_em_register(devfreq, NULL);
> -	if (IS_ERR(cooling))
> -		DRM_DEV_INFO(dev, "Failed to register cooling device\n");
> -	else
> -		pfdevfreq->cooling = cooling;
> -
>   	return 0;
>   
>   err_fini:
> @@ -156,11 +148,6 @@ void panfrost_devfreq_fini(struct panfrost_device *pfdev)
>   {
>   	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
>   
> -	if (pfdevfreq->cooling) {
> -		devfreq_cooling_unregister(pfdevfreq->cooling);
> -		pfdevfreq->cooling = NULL;
> -	}
> -
>   	if (pfdevfreq->opp_of_table_added) {
>   		dev_pm_opp_of_remove_table(&pfdev->pdev->dev);
>   		pfdevfreq->opp_of_table_added = false;
> diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
> index 26ea0850be9b..690bd4affe18 100644
> --- a/include/linux/devfreq.h
> +++ b/include/linux/devfreq.h
> @@ -198,6 +198,9 @@ struct devfreq {
>   
>   	struct srcu_notifier_head transition_notifier_list;
>   
> +	/* Pointer to the cooling device if used for thermal mitigation */
> +	struct thermal_cooling_device *cdev;
> +
>   	struct notifier_block nb_min;
>   	struct notifier_block nb_max;
>   };
>
Daniel Lezcano March 4, 2021, 4:53 p.m. | #2
Hi Lukasz,

thanks for commenting this patch,

On 04/03/2021 14:47, Lukasz Luba wrote:
> Hi Daniel,
> 
> On 3/4/21 12:50 PM, Daniel Lezcano wrote:
>> Currently the default behavior is to manually having the devfreq
>> backend to register themselves as a devfreq cooling device.
>>
>> There are no so many and actually it makes more sense to register the
>> devfreq device when adding it.
>>
>> Consequently, every devfreq becomes a cooling device like cpufreq is.
>>
>> Having a devfreq being registered as a cooling device can not mitigate
>> a thermal zone if it is not bound to this one. Thus, the current
>> configurations are not impacted by this change.
> 
> There are also different type of devices, which register into devfreq
> framework like NoC buses, UFS/eMMC, jpeg and video accelerators, ISP,
> etc.
> In some platforms there are plenty of those devices and they all would
> occupy memory due to private freq_table in devfreq_cooling, function:
> devfreq_cooling_gen_tables().
> 
> IIRC in OdroidXU4 there are ~20 devfreq devs for NoC buses.

I'm curious, do you have a pointer to such kernels having all those
devfreq ?

> It's true that they will not affect thermal zones, but unnecessarily,
> they all will show up in the /sys/class/thermal/ as
> thermal-devfreq-X
>
>
> IMO the devfreq shouldn't be tight with devfreq cooling thermal.

The energy model is tied with a cooling device initialization.

So if we want to do power limitation, the energy model must be
initialized with the device, thus the cooling device also.

That is the reason why I'm ending up with this change. Chanwoo
suggestion makes sense and that will allow to move the initialization to
devfreq which is more sane but it does not solve the initial problem
with the energy model.
Lukasz Luba March 4, 2021, 5:12 p.m. | #3
On 3/4/21 4:53 PM, Daniel Lezcano wrote:
> 
> Hi Lukasz,
> 
> thanks for commenting this patch,
> 
> On 04/03/2021 14:47, Lukasz Luba wrote:
>> Hi Daniel,
>>
>> On 3/4/21 12:50 PM, Daniel Lezcano wrote:
>>> Currently the default behavior is to manually having the devfreq
>>> backend to register themselves as a devfreq cooling device.
>>>
>>> There are no so many and actually it makes more sense to register the
>>> devfreq device when adding it.
>>>
>>> Consequently, every devfreq becomes a cooling device like cpufreq is.
>>>
>>> Having a devfreq being registered as a cooling device can not mitigate
>>> a thermal zone if it is not bound to this one. Thus, the current
>>> configurations are not impacted by this change.
>>
>> There are also different type of devices, which register into devfreq
>> framework like NoC buses, UFS/eMMC, jpeg and video accelerators, ISP,
>> etc.
>> In some platforms there are plenty of those devices and they all would
>> occupy memory due to private freq_table in devfreq_cooling, function:
>> devfreq_cooling_gen_tables().
>>
>> IIRC in OdroidXU4 there are ~20 devfreq devs for NoC buses.
> 
> I'm curious, do you have a pointer to such kernels having all those
> devfreq ?

Sure, it's mainline code, you can build it with exynos_defconfig.
You can check the DT files to find them arch/arm/boot/dts/exynos*.
(this particular OdroidXU4 is Exynos5422, but it grabs some generic dt
files).

Here is the mainline kernel content of /sys/class/devfreq/
----------------------------------------------------------
sys/class/devfreq /
10c20000.memory-controller  soc:bus-g2d          soc:bus-mfc
11800000.gpu                soc:bus-g2d-acp      soc:bus-mscl
soc:bus-disp1               soc:bus-gen          soc:bus-noc
soc:bus-disp1-fimd          soc:bus-gscl-scaler  soc:bus-peri
soc:bus-fsys-apb            soc:bus-jpeg         soc:bus-wcore
soc:bus-fsys2               soc:bus-jpeg-apb
----------------------------------------------------------

IIRC some Odroid kernel maintained by Hardkernel had more devices
in this dir.


Here is how these bus devices print themselves during boot:
----------------------------------------------------------
[    8.674840] exynos-bus: new bus device registered: soc:bus-wcore ( 
88700 KHz ~ 532000 KHz)
[    8.686412] exynos-bus: new bus device registered: soc:bus-noc ( 
66600 KHz ~ 111000 KHz)
[    8.696080] exynos-bus: new bus device registered: soc:bus-fsys-apb 
(111000 KHz ~ 222000 KHz)
[    8.706590] exynos-bus: new bus device registered: soc:bus-fsys2 ( 
75000 KHz ~ 200000 KHz)
[    8.717661] exynos-bus: new bus device registered: soc:bus-mfc ( 
83250 KHz ~ 333000 KHz)
[    8.728139] exynos-bus: new bus device registered: soc:bus-gen ( 
88700 KHz ~ 266000 KHz)
[    8.737551] exynos-bus: new bus device registered: soc:bus-peri ( 
66600 KHz ~  66600 KHz)
[    8.748625] exynos-bus: new bus device registered: soc:bus-g2d ( 
83250 KHz ~ 333000 KHz)
[    8.759427] exynos-bus: new bus device registered: soc:bus-g2d-acp ( 
66500 KHz ~ 266000 KHz)
[    8.770366] exynos-bus: new bus device registered: soc:bus-jpeg ( 
75000 KHz ~ 300000 KHz)
[    8.781135] exynos-bus: new bus device registered: soc:bus-jpeg-apb ( 
83250 KHz ~ 166500 KHz)
[    8.791366] exynos-bus: new bus device registered: soc:bus-disp1-fimd 
(120000 KHz ~ 200000 KHz)
[    8.802418] exynos-bus: new bus device registered: soc:bus-disp1 
(120000 KHz ~ 300000 KHz)
[    8.813050] exynos-bus: new bus device registered: 
soc:bus-gscl-scaler (150000 KHz ~ 300000 KHz)
[    8.825308] exynos-bus: new bus device registered: soc:bus-mscl ( 
84000 KHz ~ 666000 KHz)

----------------------------------------------------------


> 
>> It's true that they will not affect thermal zones, but unnecessarily,
>> they all will show up in the /sys/class/thermal/ as
>> thermal-devfreq-X
>>
>>
>> IMO the devfreq shouldn't be tight with devfreq cooling thermal.
> 
> The energy model is tied with a cooling device initialization.
> 
> So if we want to do power limitation, the energy model must be
> initialized with the device, thus the cooling device also.
> 
> That is the reason why I'm ending up with this change. Chanwoo
> suggestion makes sense and that will allow to move the initialization to
> devfreq which is more sane but it does not solve the initial problem
> with the energy model.

Make sense, the 'is_cooling_device' does the job.

Regards,
Lukasz
Steven Price March 5, 2021, 8:12 a.m. | #4
On 04/03/2021 12:50, Daniel Lezcano wrote:
> Currently the default behavior is to manually having the devfreq
> backend to register themselves as a devfreq cooling device.
> 
> There are no so many and actually it makes more sense to register the
> devfreq device when adding it.
> 
> Consequently, every devfreq becomes a cooling device like cpufreq is.
> 
> Having a devfreq being registered as a cooling device can not mitigate
> a thermal zone if it is not bound to this one. Thus, the current
> configurations are not impacted by this change.
> 
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
>   drivers/devfreq/devfreq.c                   |  8 ++++++++
>   drivers/gpu/drm/lima/lima_devfreq.c         | 13 -------------
>   drivers/gpu/drm/lima/lima_devfreq.h         |  2 --
>   drivers/gpu/drm/msm/msm_gpu.c               | 11 -----------
>   drivers/gpu/drm/msm/msm_gpu.h               |  2 --
>   drivers/gpu/drm/panfrost/panfrost_devfreq.c | 13 -------------
>   include/linux/devfreq.h                     |  3 +++
>   7 files changed, 11 insertions(+), 41 deletions(-)
> 
[...]
> diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> index 56b3f5935703..2cb6300de1f1 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
> @@ -3,7 +3,6 @@
>   
>   #include <linux/clk.h>
>   #include <linux/devfreq.h>
> -#include <linux/devfreq_cooling.h>
>   #include <linux/platform_device.h>
>   #include <linux/pm_opp.h>
>   
> @@ -90,7 +89,6 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
>   	struct device *dev = &pfdev->pdev->dev;
>   	struct devfreq *devfreq;
>   	struct opp_table *opp_table;
> -	struct thermal_cooling_device *cooling;
>   	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
>   
>   	opp_table = dev_pm_opp_set_regulators(dev, pfdev->comp->supply_names,
> @@ -139,12 +137,6 @@ int panfrost_devfreq_init(struct panfrost_device *pfdev)
>   	}
>   	pfdevfreq->devfreq = devfreq;
>   
> -	cooling = devfreq_cooling_em_register(devfreq, NULL);
> -	if (IS_ERR(cooling))
> -		DRM_DEV_INFO(dev, "Failed to register cooling device\n");
> -	else
> -		pfdevfreq->cooling = cooling;
> -
>   	return 0;
>   
>   err_fini:
> @@ -156,11 +148,6 @@ void panfrost_devfreq_fini(struct panfrost_device *pfdev)
>   {
>   	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
>   
> -	if (pfdevfreq->cooling) {
> -		devfreq_cooling_unregister(pfdevfreq->cooling);
> -		pfdevfreq->cooling = NULL;
> -	}
> -
>   	if (pfdevfreq->opp_of_table_added) {
>   		dev_pm_opp_of_remove_table(&pfdev->pdev->dev);
>   		pfdevfreq->opp_of_table_added = false;

You've removed all references to pfdevfreq->cooling, so please also 
remove the member from struct panfrost_devfreq (as already done with 
lima and msm).

Thanks,

Steve
Daniel Lezcano March 5, 2021, 9:10 a.m. | #5
On 05/03/2021 09:12, Steven Price wrote:
> On 04/03/2021 12:50, Daniel Lezcano wrote:
>> Currently the default behavior is to manually having the devfreq
>> backend to register themselves as a devfreq cooling device.
>>
>> There are no so many and actually it makes more sense to register the
>> devfreq device when adding it.
>>
>> Consequently, every devfreq becomes a cooling device like cpufreq is.
>>
>> Having a devfreq being registered as a cooling device can not mitigate
>> a thermal zone if it is not bound to this one. Thus, the current
>> configurations are not impacted by this change.
>>
>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
>> ---

[ ... ]

>>       if (pfdevfreq->opp_of_table_added) {
>>           dev_pm_opp_of_remove_table(&pfdev->pdev->dev);
>>           pfdevfreq->opp_of_table_added = false;
> 
> You've removed all references to pfdevfreq->cooling, so please also
> remove the member from struct panfrost_devfreq (as already done with
> lima and msm).

Sure, thanks for spotting this.

Patch

diff --git a/drivers/devfreq/devfreq.c b/drivers/devfreq/devfreq.c
index b6d63f02d293..19149b31b000 100644
--- a/drivers/devfreq/devfreq.c
+++ b/drivers/devfreq/devfreq.c
@@ -11,6 +11,7 @@ 
 #include <linux/kmod.h>
 #include <linux/sched.h>
 #include <linux/debugfs.h>
+#include <linux/devfreq_cooling.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/init.h>
@@ -26,6 +27,7 @@ 
 #include <linux/hrtimer.h>
 #include <linux/of.h>
 #include <linux/pm_qos.h>
+#include <linux/thermal.h>
 #include <linux/units.h>
 #include "governor.h"
 
@@ -935,6 +937,10 @@  struct devfreq *devfreq_add_device(struct device *dev,
 
 	mutex_unlock(&devfreq_list_lock);
 
+	devfreq->cdev = devfreq_cooling_em_register(devfreq, NULL);
+	if (IS_ERR(devfreq->cdev))
+		dev_info(dev, "Failed to register devfreq cooling device\n");
+
 	return devfreq;
 
 err_init:
@@ -960,6 +966,8 @@  int devfreq_remove_device(struct devfreq *devfreq)
 	if (!devfreq)
 		return -EINVAL;
 
+	thermal_cooling_device_unregister(devfreq->cdev);
+
 	if (devfreq->governor) {
 		devfreq->governor->event_handler(devfreq,
 						 DEVFREQ_GOV_STOP, NULL);
diff --git a/drivers/gpu/drm/lima/lima_devfreq.c b/drivers/gpu/drm/lima/lima_devfreq.c
index 5686ad4aaf7c..a696eff1642c 100644
--- a/drivers/gpu/drm/lima/lima_devfreq.c
+++ b/drivers/gpu/drm/lima/lima_devfreq.c
@@ -7,7 +7,6 @@ 
  */
 #include <linux/clk.h>
 #include <linux/devfreq.h>
-#include <linux/devfreq_cooling.h>
 #include <linux/device.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
@@ -90,11 +89,6 @@  void lima_devfreq_fini(struct lima_device *ldev)
 {
 	struct lima_devfreq *devfreq = &ldev->devfreq;
 
-	if (devfreq->cooling) {
-		devfreq_cooling_unregister(devfreq->cooling);
-		devfreq->cooling = NULL;
-	}
-
 	if (devfreq->devfreq) {
 		devm_devfreq_remove_device(ldev->dev, devfreq->devfreq);
 		devfreq->devfreq = NULL;
@@ -110,7 +104,6 @@  void lima_devfreq_fini(struct lima_device *ldev)
 
 int lima_devfreq_init(struct lima_device *ldev)
 {
-	struct thermal_cooling_device *cooling;
 	struct device *dev = ldev->dev;
 	struct opp_table *opp_table;
 	struct devfreq *devfreq;
@@ -173,12 +166,6 @@  int lima_devfreq_init(struct lima_device *ldev)
 
 	ldevfreq->devfreq = devfreq;
 
-	cooling = of_devfreq_cooling_register(dev->of_node, devfreq);
-	if (IS_ERR(cooling))
-		dev_info(dev, "Failed to register cooling device\n");
-	else
-		ldevfreq->cooling = cooling;
-
 	return 0;
 
 err_fini:
diff --git a/drivers/gpu/drm/lima/lima_devfreq.h b/drivers/gpu/drm/lima/lima_devfreq.h
index 2d9b3008ce77..c43a2069e5d3 100644
--- a/drivers/gpu/drm/lima/lima_devfreq.h
+++ b/drivers/gpu/drm/lima/lima_devfreq.h
@@ -9,7 +9,6 @@ 
 
 struct devfreq;
 struct opp_table;
-struct thermal_cooling_device;
 
 struct lima_device;
 
@@ -17,7 +16,6 @@  struct lima_devfreq {
 	struct devfreq *devfreq;
 	struct opp_table *clkname_opp_table;
 	struct opp_table *regulators_opp_table;
-	struct thermal_cooling_device *cooling;
 
 	ktime_t busy_time;
 	ktime_t idle_time;
diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
index ab7c167b0623..d7f80ebfe9df 100644
--- a/drivers/gpu/drm/msm/msm_gpu.c
+++ b/drivers/gpu/drm/msm/msm_gpu.c
@@ -14,7 +14,6 @@ 
 #include <generated/utsrelease.h>
 #include <linux/string_helpers.h>
 #include <linux/devfreq.h>
-#include <linux/devfreq_cooling.h>
 #include <linux/devcoredump.h>
 #include <linux/sched/task.h>
 
@@ -112,14 +111,6 @@  static void msm_devfreq_init(struct msm_gpu *gpu)
 	}
 
 	devfreq_suspend_device(gpu->devfreq.devfreq);
-
-	gpu->cooling = of_devfreq_cooling_register(gpu->pdev->dev.of_node,
-			gpu->devfreq.devfreq);
-	if (IS_ERR(gpu->cooling)) {
-		DRM_DEV_ERROR(&gpu->pdev->dev,
-				"Couldn't register GPU cooling device\n");
-		gpu->cooling = NULL;
-	}
 }
 
 static int enable_pwrrail(struct msm_gpu *gpu)
@@ -1056,6 +1047,4 @@  void msm_gpu_cleanup(struct msm_gpu *gpu)
 	if (gpu->worker) {
 		kthread_destroy_worker(gpu->worker);
 	}
-
-	devfreq_cooling_unregister(gpu->cooling);
 }
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index d7cd02cd2109..93419368bac8 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -155,8 +155,6 @@  struct msm_gpu {
 	struct msm_gpu_state *crashstate;
 	/* True if the hardware supports expanded apriv (a650 and newer) */
 	bool hw_apriv;
-
-	struct thermal_cooling_device *cooling;
 };
 
 static inline struct msm_gpu *dev_to_gpu(struct device *dev)
diff --git a/drivers/gpu/drm/panfrost/panfrost_devfreq.c b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
index 56b3f5935703..2cb6300de1f1 100644
--- a/drivers/gpu/drm/panfrost/panfrost_devfreq.c
+++ b/drivers/gpu/drm/panfrost/panfrost_devfreq.c
@@ -3,7 +3,6 @@ 
 
 #include <linux/clk.h>
 #include <linux/devfreq.h>
-#include <linux/devfreq_cooling.h>
 #include <linux/platform_device.h>
 #include <linux/pm_opp.h>
 
@@ -90,7 +89,6 @@  int panfrost_devfreq_init(struct panfrost_device *pfdev)
 	struct device *dev = &pfdev->pdev->dev;
 	struct devfreq *devfreq;
 	struct opp_table *opp_table;
-	struct thermal_cooling_device *cooling;
 	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
 
 	opp_table = dev_pm_opp_set_regulators(dev, pfdev->comp->supply_names,
@@ -139,12 +137,6 @@  int panfrost_devfreq_init(struct panfrost_device *pfdev)
 	}
 	pfdevfreq->devfreq = devfreq;
 
-	cooling = devfreq_cooling_em_register(devfreq, NULL);
-	if (IS_ERR(cooling))
-		DRM_DEV_INFO(dev, "Failed to register cooling device\n");
-	else
-		pfdevfreq->cooling = cooling;
-
 	return 0;
 
 err_fini:
@@ -156,11 +148,6 @@  void panfrost_devfreq_fini(struct panfrost_device *pfdev)
 {
 	struct panfrost_devfreq *pfdevfreq = &pfdev->pfdevfreq;
 
-	if (pfdevfreq->cooling) {
-		devfreq_cooling_unregister(pfdevfreq->cooling);
-		pfdevfreq->cooling = NULL;
-	}
-
 	if (pfdevfreq->opp_of_table_added) {
 		dev_pm_opp_of_remove_table(&pfdev->pdev->dev);
 		pfdevfreq->opp_of_table_added = false;
diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h
index 26ea0850be9b..690bd4affe18 100644
--- a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -198,6 +198,9 @@  struct devfreq {
 
 	struct srcu_notifier_head transition_notifier_list;
 
+	/* Pointer to the cooling device if used for thermal mitigation */
+	struct thermal_cooling_device *cdev;
+
 	struct notifier_block nb_min;
 	struct notifier_block nb_max;
 };