[v4,4/4] thermal: core: Add notifications call in the framework

Message ID 20200706105538.2159-4-daniel.lezcano@linaro.org
State New
Headers show
Series
  • [v4,1/4] thermal: core: Add helpers to browse the cdev, tz and governor list
Related show

Commit Message

Daniel Lezcano July 6, 2020, 10:55 a.m.
The generic netlink protocol is implemented but the different
notification functions are not yet connected to the core code.

These changes add the notification calls in the different
corresponding places.

Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

---
  v4:
     - Fixed missing static declaration, reported by kbuild-bot
     - Removed max state notification
---
 drivers/thermal/thermal_core.c    | 21 +++++++++++++++++++++
 drivers/thermal/thermal_helpers.c | 13 +++++++++++--
 drivers/thermal/thermal_sysfs.c   | 15 ++++++++++++++-
 3 files changed, 46 insertions(+), 3 deletions(-)

-- 
2.17.1

Comments

Marek Szyprowski July 6, 2020, 1:17 p.m. | #1
Hi Daniel,

On 06.07.2020 12:55, Daniel Lezcano wrote:
> The generic netlink protocol is implemented but the different

> notification functions are not yet connected to the core code.

>

> These changes add the notification calls in the different

> corresponding places.

>

> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>


This patch landed in today's linux-next 20200706 as commit 5df786e46560 
("thermal: core: Add notifications call in the framework"). Sadly it 
breaks booting various Samsung Exynos based boards. Here is an example 
log from Odroid U3 board:

Unable to handle kernel NULL pointer dereference at virtual address 00000010
pgd = (ptrval)
[00000010] *pgd=00000000
Internal error: Oops: 5 [#1] PREEMPT SMP ARM
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560 
#1146
Hardware name: Samsung Exynos (Flattened Device Tree)
PC is at kmem_cache_alloc+0x13c/0x418
LR is at kmem_cache_alloc+0x48/0x418
pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053
...
Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none
Control: 10c5387d  Table: 4000404a  DAC: 00000051
Process swapper/0 (pid: 1, stack limit = 0x(ptrval))
Stack: (0xee8f1cf8 to 0xee8f2000)
...
[<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] (__alloc_skb+0x5c/0x170)
[<c08cd170>] (__alloc_skb) from [<c07ec19c>] 
(thermal_genl_send_event+0x24/0x174)
[<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>] 
(thermal_notify_tz_create+0x58/0x74)
[<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>] 
(thermal_zone_device_register+0x358/0x650)
[<c07e9058>] (thermal_zone_device_register) from [<c1028d34>] 
(of_parse_thermal_zones+0x304/0x7a4)
[<c1028d34>] (of_parse_thermal_zones) from [<c1028964>] 
(thermal_init+0xdc/0x154)
[<c1028964>] (thermal_init) from [<c0102378>] (do_one_initcall+0x8c/0x424)
[<c0102378>] (do_one_initcall) from [<c1001158>] 
(kernel_init_freeable+0x190/0x204)
[<c1001158>] (kernel_init_freeable) from [<c0ab85f4>] 
(kernel_init+0x8/0x118)
[<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

Reverting it on top of linux-next fixes the boot issue. I will 
investigate it further soon.



> ---

>    v4:

>       - Fixed missing static declaration, reported by kbuild-bot

>       - Removed max state notification

> ---

>   drivers/thermal/thermal_core.c    | 21 +++++++++++++++++++++

>   drivers/thermal/thermal_helpers.c | 13 +++++++++++--

>   drivers/thermal/thermal_sysfs.c   | 15 ++++++++++++++-

>   3 files changed, 46 insertions(+), 3 deletions(-)

>

> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c

> index 5fae1621fb01..25ef29123f72 100644

> --- a/drivers/thermal/thermal_core.c

> +++ b/drivers/thermal/thermal_core.c

> @@ -215,6 +215,8 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *tz,

>   	mutex_unlock(&tz->lock);

>   	mutex_unlock(&thermal_governor_lock);

>   

> +	thermal_notify_tz_gov_change(tz->id, policy);

> +

>   	return ret;

>   }

>   

> @@ -406,12 +408,25 @@ static void handle_critical_trips(struct thermal_zone_device *tz,

>   static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)

>   {

>   	enum thermal_trip_type type;

> +	int trip_temp, hyst = 0;

>   

>   	/* Ignore disabled trip points */

>   	if (test_bit(trip, &tz->trips_disabled))

>   		return;

>   

> +	tz->ops->get_trip_temp(tz, trip, &trip_temp);

>   	tz->ops->get_trip_type(tz, trip, &type);

> +	if (tz->ops->get_trip_hyst)

> +		tz->ops->get_trip_hyst(tz, trip, &hyst);

> +

> +	if (tz->last_temperature != THERMAL_TEMP_INVALID) {

> +		if (tz->last_temperature < trip_temp &&

> +		    tz->temperature >= trip_temp)

> +			thermal_notify_tz_trip_up(tz->id, trip);

> +		if (tz->last_temperature >= trip_temp &&

> +		    tz->temperature < (trip_temp - hyst))

> +			thermal_notify_tz_trip_down(tz->id, trip);

> +	}

>   

>   	if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)

>   		handle_critical_trips(tz, trip, type);

> @@ -443,6 +458,8 @@ static void update_temperature(struct thermal_zone_device *tz)

>   	mutex_unlock(&tz->lock);

>   

>   	trace_thermal_temperature(tz);

> +

> +	thermal_genl_sampling_temp(tz->id, temp);

>   }

>   

>   static void thermal_zone_device_init(struct thermal_zone_device *tz)

> @@ -1405,6 +1422,8 @@ thermal_zone_device_register(const char *type, int trips, int mask,

>   	if (atomic_cmpxchg(&tz->need_update, 1, 0))

>   		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

>   

> +	thermal_notify_tz_create(tz->id, tz->type);

> +

>   	return tz;

>   

>   unregister:

> @@ -1476,6 +1495,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)

>   	ida_destroy(&tz->ida);

>   	mutex_destroy(&tz->lock);

>   	device_unregister(&tz->device);

> +

> +	thermal_notify_tz_delete(tz->id);

>   }

>   EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);

>   

> diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c

> index 87b1256fa2f2..c94bc824e5d3 100644

> --- a/drivers/thermal/thermal_helpers.c

> +++ b/drivers/thermal/thermal_helpers.c

> @@ -175,6 +175,16 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)

>   	mutex_unlock(&tz->lock);

>   }

>   

> +static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,

> +				       int target)

> +{

> +	if (cdev->ops->set_cur_state(cdev, target))

> +		return;

> +

> +	thermal_notify_cdev_state_update(cdev->id, target);

> +	thermal_cooling_device_stats_update(cdev, target);

> +}

> +

>   void thermal_cdev_update(struct thermal_cooling_device *cdev)

>   {

>   	struct thermal_instance *instance;

> @@ -197,8 +207,7 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)

>   			target = instance->target;

>   	}

>   

> -	if (!cdev->ops->set_cur_state(cdev, target))

> -		thermal_cooling_device_stats_update(cdev, target);

> +	thermal_cdev_set_cur_state(cdev, target);

>   

>   	cdev->updated = true;

>   	mutex_unlock(&cdev->lock);

> diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c

> index aa99edb4dff7..ff449943f757 100644

> --- a/drivers/thermal/thermal_sysfs.c

> +++ b/drivers/thermal/thermal_sysfs.c

> @@ -124,7 +124,8 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,

>   {

>   	struct thermal_zone_device *tz = to_thermal_zone(dev);

>   	int trip, ret;

> -	int temperature;

> +	int temperature, hyst = 0;

> +	enum thermal_trip_type type;

>   

>   	if (!tz->ops->set_trip_temp)

>   		return -EPERM;

> @@ -139,6 +140,18 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,

>   	if (ret)

>   		return ret;

>   

> +	if (tz->ops->get_trip_hyst) {

> +		ret = tz->ops->get_trip_hyst(tz, trip, &hyst);

> +		if (ret)

> +			return ret;

> +	}

> +

> +	ret = tz->ops->get_trip_type(tz, trip, &type);

> +	if (ret)

> +		return ret;

> +

> +	thermal_notify_tz_trip_change(tz->id, trip, type, temperature, hyst);

> +

>   	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

>   

>   	return count;


Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland
Daniel Lezcano July 6, 2020, 1:46 p.m. | #2
On 06/07/2020 15:17, Marek Szyprowski wrote:
> Hi Daniel,

> 

> On 06.07.2020 12:55, Daniel Lezcano wrote:

>> The generic netlink protocol is implemented but the different

>> notification functions are not yet connected to the core code.

>>

>> These changes add the notification calls in the different

>> corresponding places.

>>

>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

> 

> This patch landed in today's linux-next 20200706 as commit 5df786e46560 

> ("thermal: core: Add notifications call in the framework"). Sadly it 

> breaks booting various Samsung Exynos based boards. Here is an example 

> log from Odroid U3 board:

> 

> Unable to handle kernel NULL pointer dereference at virtual address 00000010

> pgd = (ptrval)

> [00000010] *pgd=00000000

> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

> Modules linked in:

> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560 

> #1146

> Hardware name: Samsung Exynos (Flattened Device Tree)

> PC is at kmem_cache_alloc+0x13c/0x418

> LR is at kmem_cache_alloc+0x48/0x418

> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

> ...

> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

> Control: 10c5387d  Table: 4000404a  DAC: 00000051

> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

> Stack: (0xee8f1cf8 to 0xee8f2000)

> ...

> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] (__alloc_skb+0x5c/0x170)

> [<c08cd170>] (__alloc_skb) from [<c07ec19c>] 

> (thermal_genl_send_event+0x24/0x174)

> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>] 

> (thermal_notify_tz_create+0x58/0x74)

> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>] 

> (thermal_zone_device_register+0x358/0x650)

> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>] 

> (of_parse_thermal_zones+0x304/0x7a4)

> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>] 

> (thermal_init+0xdc/0x154)

> [<c1028964>] (thermal_init) from [<c0102378>] (do_one_initcall+0x8c/0x424)

> [<c0102378>] (do_one_initcall) from [<c1001158>] 

> (kernel_init_freeable+0x190/0x204)

> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>] 

> (kernel_init+0x8/0x118)

> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

> 

> Reverting it on top of linux-next fixes the boot issue. I will 

> investigate it further soon.


Thanks for reporting this.

Can you send the addr2line result and code it points to ?


>> ---

>>    v4:

>>       - Fixed missing static declaration, reported by kbuild-bot

>>       - Removed max state notification

>> ---

>>   drivers/thermal/thermal_core.c    | 21 +++++++++++++++++++++

>>   drivers/thermal/thermal_helpers.c | 13 +++++++++++--

>>   drivers/thermal/thermal_sysfs.c   | 15 ++++++++++++++-

>>   3 files changed, 46 insertions(+), 3 deletions(-)

>>

>> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c

>> index 5fae1621fb01..25ef29123f72 100644

>> --- a/drivers/thermal/thermal_core.c

>> +++ b/drivers/thermal/thermal_core.c

>> @@ -215,6 +215,8 @@ int thermal_zone_device_set_policy(struct thermal_zone_device *tz,

>>   	mutex_unlock(&tz->lock);

>>   	mutex_unlock(&thermal_governor_lock);

>>   

>> +	thermal_notify_tz_gov_change(tz->id, policy);

>> +

>>   	return ret;

>>   }

>>   

>> @@ -406,12 +408,25 @@ static void handle_critical_trips(struct thermal_zone_device *tz,

>>   static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)

>>   {

>>   	enum thermal_trip_type type;

>> +	int trip_temp, hyst = 0;

>>   

>>   	/* Ignore disabled trip points */

>>   	if (test_bit(trip, &tz->trips_disabled))

>>   		return;

>>   

>> +	tz->ops->get_trip_temp(tz, trip, &trip_temp);

>>   	tz->ops->get_trip_type(tz, trip, &type);

>> +	if (tz->ops->get_trip_hyst)

>> +		tz->ops->get_trip_hyst(tz, trip, &hyst);

>> +

>> +	if (tz->last_temperature != THERMAL_TEMP_INVALID) {

>> +		if (tz->last_temperature < trip_temp &&

>> +		    tz->temperature >= trip_temp)

>> +			thermal_notify_tz_trip_up(tz->id, trip);

>> +		if (tz->last_temperature >= trip_temp &&

>> +		    tz->temperature < (trip_temp - hyst))

>> +			thermal_notify_tz_trip_down(tz->id, trip);

>> +	}

>>   

>>   	if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)

>>   		handle_critical_trips(tz, trip, type);

>> @@ -443,6 +458,8 @@ static void update_temperature(struct thermal_zone_device *tz)

>>   	mutex_unlock(&tz->lock);

>>   

>>   	trace_thermal_temperature(tz);

>> +

>> +	thermal_genl_sampling_temp(tz->id, temp);

>>   }

>>   

>>   static void thermal_zone_device_init(struct thermal_zone_device *tz)

>> @@ -1405,6 +1422,8 @@ thermal_zone_device_register(const char *type, int trips, int mask,

>>   	if (atomic_cmpxchg(&tz->need_update, 1, 0))

>>   		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

>>   

>> +	thermal_notify_tz_create(tz->id, tz->type);

>> +

>>   	return tz;

>>   

>>   unregister:

>> @@ -1476,6 +1495,8 @@ void thermal_zone_device_unregister(struct thermal_zone_device *tz)

>>   	ida_destroy(&tz->ida);

>>   	mutex_destroy(&tz->lock);

>>   	device_unregister(&tz->device);

>> +

>> +	thermal_notify_tz_delete(tz->id);

>>   }

>>   EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);

>>   

>> diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c

>> index 87b1256fa2f2..c94bc824e5d3 100644

>> --- a/drivers/thermal/thermal_helpers.c

>> +++ b/drivers/thermal/thermal_helpers.c

>> @@ -175,6 +175,16 @@ void thermal_zone_set_trips(struct thermal_zone_device *tz)

>>   	mutex_unlock(&tz->lock);

>>   }

>>   

>> +static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,

>> +				       int target)

>> +{

>> +	if (cdev->ops->set_cur_state(cdev, target))

>> +		return;

>> +

>> +	thermal_notify_cdev_state_update(cdev->id, target);

>> +	thermal_cooling_device_stats_update(cdev, target);

>> +}

>> +

>>   void thermal_cdev_update(struct thermal_cooling_device *cdev)

>>   {

>>   	struct thermal_instance *instance;

>> @@ -197,8 +207,7 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev)

>>   			target = instance->target;

>>   	}

>>   

>> -	if (!cdev->ops->set_cur_state(cdev, target))

>> -		thermal_cooling_device_stats_update(cdev, target);

>> +	thermal_cdev_set_cur_state(cdev, target);

>>   

>>   	cdev->updated = true;

>>   	mutex_unlock(&cdev->lock);

>> diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c

>> index aa99edb4dff7..ff449943f757 100644

>> --- a/drivers/thermal/thermal_sysfs.c

>> +++ b/drivers/thermal/thermal_sysfs.c

>> @@ -124,7 +124,8 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,

>>   {

>>   	struct thermal_zone_device *tz = to_thermal_zone(dev);

>>   	int trip, ret;

>> -	int temperature;

>> +	int temperature, hyst = 0;

>> +	enum thermal_trip_type type;

>>   

>>   	if (!tz->ops->set_trip_temp)

>>   		return -EPERM;

>> @@ -139,6 +140,18 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,

>>   	if (ret)

>>   		return ret;

>>   

>> +	if (tz->ops->get_trip_hyst) {

>> +		ret = tz->ops->get_trip_hyst(tz, trip, &hyst);

>> +		if (ret)

>> +			return ret;

>> +	}

>> +

>> +	ret = tz->ops->get_trip_type(tz, trip, &type);

>> +	if (ret)

>> +		return ret;

>> +

>> +	thermal_notify_tz_trip_change(tz->id, trip, type, temperature, hyst);

>> +

>>   	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

>>   

>>   	return count;

> 

> Best regards

> 



-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Zhang Rui July 7, 2020, 1:55 a.m. | #3
On Mon, 2020-07-06 at 12:55 +0200, Daniel Lezcano wrote:
> The generic netlink protocol is implemented but the different

> notification functions are not yet connected to the core code.

> 

> These changes add the notification calls in the different

> corresponding places.

> 

> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>


Acked-by: Zhang Rui <rui.zhang@intel.com>

> ---

>   v4:

>      - Fixed missing static declaration, reported by kbuild-bot

>      - Removed max state notification

> ---

>  drivers/thermal/thermal_core.c    | 21 +++++++++++++++++++++

>  drivers/thermal/thermal_helpers.c | 13 +++++++++++--

>  drivers/thermal/thermal_sysfs.c   | 15 ++++++++++++++-

>  3 files changed, 46 insertions(+), 3 deletions(-)

> 

> diff --git a/drivers/thermal/thermal_core.c

> b/drivers/thermal/thermal_core.c

> index 5fae1621fb01..25ef29123f72 100644

> --- a/drivers/thermal/thermal_core.c

> +++ b/drivers/thermal/thermal_core.c

> @@ -215,6 +215,8 @@ int thermal_zone_device_set_policy(struct

> thermal_zone_device *tz,

>  	mutex_unlock(&tz->lock);

>  	mutex_unlock(&thermal_governor_lock);

>  

> +	thermal_notify_tz_gov_change(tz->id, policy);

> +

>  	return ret;

>  }

>  

> @@ -406,12 +408,25 @@ static void handle_critical_trips(struct

> thermal_zone_device *tz,

>  static void handle_thermal_trip(struct thermal_zone_device *tz, int

> trip)

>  {

>  	enum thermal_trip_type type;

> +	int trip_temp, hyst = 0;

>  

>  	/* Ignore disabled trip points */

>  	if (test_bit(trip, &tz->trips_disabled))

>  		return;

>  

> +	tz->ops->get_trip_temp(tz, trip, &trip_temp);

>  	tz->ops->get_trip_type(tz, trip, &type);

> +	if (tz->ops->get_trip_hyst)

> +		tz->ops->get_trip_hyst(tz, trip, &hyst);

> +

> +	if (tz->last_temperature != THERMAL_TEMP_INVALID) {

> +		if (tz->last_temperature < trip_temp &&

> +		    tz->temperature >= trip_temp)

> +			thermal_notify_tz_trip_up(tz->id, trip);

> +		if (tz->last_temperature >= trip_temp &&

> +		    tz->temperature < (trip_temp - hyst))

> +			thermal_notify_tz_trip_down(tz->id, trip);

> +	}

>  

>  	if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)

>  		handle_critical_trips(tz, trip, type);

> @@ -443,6 +458,8 @@ static void update_temperature(struct

> thermal_zone_device *tz)

>  	mutex_unlock(&tz->lock);

>  

>  	trace_thermal_temperature(tz);

> +

> +	thermal_genl_sampling_temp(tz->id, temp);

>  }

>  

>  static void thermal_zone_device_init(struct thermal_zone_device *tz)

> @@ -1405,6 +1422,8 @@ thermal_zone_device_register(const char *type,

> int trips, int mask,

>  	if (atomic_cmpxchg(&tz->need_update, 1, 0))

>  		thermal_zone_device_update(tz,

> THERMAL_EVENT_UNSPECIFIED);

>  

> +	thermal_notify_tz_create(tz->id, tz->type);

> +

>  	return tz;

>  

>  unregister:

> @@ -1476,6 +1495,8 @@ void thermal_zone_device_unregister(struct

> thermal_zone_device *tz)

>  	ida_destroy(&tz->ida);

>  	mutex_destroy(&tz->lock);

>  	device_unregister(&tz->device);

> +

> +	thermal_notify_tz_delete(tz->id);

>  }

>  EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);

>  

> diff --git a/drivers/thermal/thermal_helpers.c

> b/drivers/thermal/thermal_helpers.c

> index 87b1256fa2f2..c94bc824e5d3 100644

> --- a/drivers/thermal/thermal_helpers.c

> +++ b/drivers/thermal/thermal_helpers.c

> @@ -175,6 +175,16 @@ void thermal_zone_set_trips(struct

> thermal_zone_device *tz)

>  	mutex_unlock(&tz->lock);

>  }

>  

> +static void thermal_cdev_set_cur_state(struct thermal_cooling_device

> *cdev,

> +				       int target)

> +{

> +	if (cdev->ops->set_cur_state(cdev, target))

> +		return;

> +

> +	thermal_notify_cdev_state_update(cdev->id, target);

> +	thermal_cooling_device_stats_update(cdev, target);

> +}

> +

>  void thermal_cdev_update(struct thermal_cooling_device *cdev)

>  {

>  	struct thermal_instance *instance;

> @@ -197,8 +207,7 @@ void thermal_cdev_update(struct

> thermal_cooling_device *cdev)

>  			target = instance->target;

>  	}

>  

> -	if (!cdev->ops->set_cur_state(cdev, target))

> -		thermal_cooling_device_stats_update(cdev, target);

> +	thermal_cdev_set_cur_state(cdev, target);

>  

>  	cdev->updated = true;

>  	mutex_unlock(&cdev->lock);

> diff --git a/drivers/thermal/thermal_sysfs.c

> b/drivers/thermal/thermal_sysfs.c

> index aa99edb4dff7..ff449943f757 100644

> --- a/drivers/thermal/thermal_sysfs.c

> +++ b/drivers/thermal/thermal_sysfs.c

> @@ -124,7 +124,8 @@ trip_point_temp_store(struct device *dev, struct

> device_attribute *attr,

>  {

>  	struct thermal_zone_device *tz = to_thermal_zone(dev);

>  	int trip, ret;

> -	int temperature;

> +	int temperature, hyst = 0;

> +	enum thermal_trip_type type;

>  

>  	if (!tz->ops->set_trip_temp)

>  		return -EPERM;

> @@ -139,6 +140,18 @@ trip_point_temp_store(struct device *dev, struct

> device_attribute *attr,

>  	if (ret)

>  		return ret;

>  

> +	if (tz->ops->get_trip_hyst) {

> +		ret = tz->ops->get_trip_hyst(tz, trip, &hyst);

> +		if (ret)

> +			return ret;

> +	}

> +

> +	ret = tz->ops->get_trip_type(tz, trip, &type);

> +	if (ret)

> +		return ret;

> +

> +	thermal_notify_tz_trip_change(tz->id, trip, type, temperature,

> hyst);

> +

>  	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);

>  

>  	return count;
Marek Szyprowski July 7, 2020, 9:15 a.m. | #4
Hi Daniel,

On 06.07.2020 15:46, Daniel Lezcano wrote:
> On 06/07/2020 15:17, Marek Szyprowski wrote:

>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>> The generic netlink protocol is implemented but the different

>>> notification functions are not yet connected to the core code.

>>>

>>> These changes add the notification calls in the different

>>> corresponding places.

>>>

>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>> ("thermal: core: Add notifications call in the framework"). Sadly it

>> breaks booting various Samsung Exynos based boards. Here is an example

>> log from Odroid U3 board:

>>

>> Unable to handle kernel NULL pointer dereference at virtual address 00000010

>> pgd = (ptrval)

>> [00000010] *pgd=00000000

>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>> Modules linked in:

>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>> #1146

>> Hardware name: Samsung Exynos (Flattened Device Tree)

>> PC is at kmem_cache_alloc+0x13c/0x418

>> LR is at kmem_cache_alloc+0x48/0x418

>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>> ...

>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>> Stack: (0xee8f1cf8 to 0xee8f2000)

>> ...

>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] (__alloc_skb+0x5c/0x170)

>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>> (thermal_genl_send_event+0x24/0x174)

>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>> (thermal_notify_tz_create+0x58/0x74)

>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>> (thermal_zone_device_register+0x358/0x650)

>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>> (of_parse_thermal_zones+0x304/0x7a4)

>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>> (thermal_init+0xdc/0x154)

>> [<c1028964>] (thermal_init) from [<c0102378>] (do_one_initcall+0x8c/0x424)

>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>> (kernel_init_freeable+0x190/0x204)

>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>> (kernel_init+0x8/0x118)

>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>

>> Reverting it on top of linux-next fixes the boot issue. I will

>> investigate it further soon.

> Thanks for reporting this.

>

> Can you send the addr2line result and code it points to ?


addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to mm/slub.c 
+2839, but I'm not sure if we can trust it. imho it looks like some 
trashed memory somewhere, but I don't have time right now to analyze it 
further now...

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland
Marek Szyprowski July 13, 2020, 9:31 a.m. | #5
Hi

On 07.07.2020 11:15, Marek Szyprowski wrote:
> On 06.07.2020 15:46, Daniel Lezcano wrote:

>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>> The generic netlink protocol is implemented but the different

>>>> notification functions are not yet connected to the core code.

>>>>

>>>> These changes add the notification calls in the different

>>>> corresponding places.

>>>>

>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>> breaks booting various Samsung Exynos based boards. Here is an example

>>> log from Odroid U3 board:

>>>

>>> Unable to handle kernel NULL pointer dereference at virtual address 

>>> 00000010

>>> pgd = (ptrval)

>>> [00000010] *pgd=00000000

>>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>>> Modules linked in:

>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>>> #1146

>>> Hardware name: Samsung Exynos (Flattened Device Tree)

>>> PC is at kmem_cache_alloc+0x13c/0x418

>>> LR is at kmem_cache_alloc+0x48/0x418

>>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>>> ...

>>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>>> Stack: (0xee8f1cf8 to 0xee8f2000)

>>> ...

>>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] 

>>> (__alloc_skb+0x5c/0x170)

>>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>>> (thermal_genl_send_event+0x24/0x174)

>>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>>> (thermal_notify_tz_create+0x58/0x74)

>>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>>> (thermal_zone_device_register+0x358/0x650)

>>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>>> (of_parse_thermal_zones+0x304/0x7a4)

>>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>>> (thermal_init+0xdc/0x154)

>>> [<c1028964>] (thermal_init) from [<c0102378>] 

>>> (do_one_initcall+0x8c/0x424)

>>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>>> (kernel_init_freeable+0x190/0x204)

>>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>>> (kernel_init+0x8/0x118)

>>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>>

>>> Reverting it on top of linux-next fixes the boot issue. I will

>>> investigate it further soon.

>> Thanks for reporting this.

>>

>> Can you send the addr2line result and code it points to ?

>

> addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to 

> mm/slub.c +2839, but I'm not sure if we can trust it. imho it looks 

> like some trashed memory somewhere, but I don't have time right now to 

> analyze it further now...


Just one more thing I've noticed. The crash happens only if the kernel 
is compiled with old GCC (tested with arm-linux-gnueabi-gcc (Linaro GCC 
4.9-2017.01) 4.9.4). If I compile kernel with newed GCC (like 
arm-linux-gnueabi-gcc (Linaro GCC 6.4-2017.11) 6.4.1 20171012), it works 
fine...

This happens also with Linux next-20200710, which again got this commit.

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland
Daniel Lezcano July 13, 2020, 9:45 a.m. | #6
Added Arnd in Cc.

On 13/07/2020 11:31, Marek Szyprowski wrote:
> Hi

> 

> On 07.07.2020 11:15, Marek Szyprowski wrote:

>> On 06.07.2020 15:46, Daniel Lezcano wrote:

>>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>>> The generic netlink protocol is implemented but the different

>>>>> notification functions are not yet connected to the core code.

>>>>>

>>>>> These changes add the notification calls in the different

>>>>> corresponding places.

>>>>>

>>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>>> breaks booting various Samsung Exynos based boards. Here is an example

>>>> log from Odroid U3 board:

>>>>

>>>> Unable to handle kernel NULL pointer dereference at virtual address 

>>>> 00000010

>>>> pgd = (ptrval)

>>>> [00000010] *pgd=00000000

>>>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>>>> Modules linked in:

>>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>>>> #1146

>>>> Hardware name: Samsung Exynos (Flattened Device Tree)

>>>> PC is at kmem_cache_alloc+0x13c/0x418

>>>> LR is at kmem_cache_alloc+0x48/0x418

>>>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>>>> ...

>>>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>>>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>>>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>>>> Stack: (0xee8f1cf8 to 0xee8f2000)

>>>> ...

>>>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] 

>>>> (__alloc_skb+0x5c/0x170)

>>>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>>>> (thermal_genl_send_event+0x24/0x174)

>>>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>>>> (thermal_notify_tz_create+0x58/0x74)

>>>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>>>> (thermal_zone_device_register+0x358/0x650)

>>>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>>>> (of_parse_thermal_zones+0x304/0x7a4)

>>>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>>>> (thermal_init+0xdc/0x154)

>>>> [<c1028964>] (thermal_init) from [<c0102378>] 

>>>> (do_one_initcall+0x8c/0x424)

>>>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>>>> (kernel_init_freeable+0x190/0x204)

>>>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>>>> (kernel_init+0x8/0x118)

>>>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>>>

>>>> Reverting it on top of linux-next fixes the boot issue. I will

>>>> investigate it further soon.

>>> Thanks for reporting this.

>>>

>>> Can you send the addr2line result and code it points to ?

>>

>> addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to 

>> mm/slub.c +2839, but I'm not sure if we can trust it. imho it looks 

>> like some trashed memory somewhere, but I don't have time right now to 

>> analyze it further now...

> 

> Just one more thing I've noticed. The crash happens only if the kernel 

> is compiled with old GCC (tested with arm-linux-gnueabi-gcc (Linaro GCC 

> 4.9-2017.01) 4.9.4). If I compile kernel with newed GCC (like 

> arm-linux-gnueabi-gcc (Linaro GCC 6.4-2017.11) 6.4.1 20171012), it works 

> fine...

> 

> This happens also with Linux next-20200710, which again got this commit.

Arnd,

are you aware of any issue with this gcc version which can explain this
kernel panic ? Sounds like the problem does not appear with more recent
version.

-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Daniel Lezcano July 13, 2020, 10:28 a.m. | #7
On 13/07/2020 11:31, Marek Szyprowski wrote:
> Hi

> 

> On 07.07.2020 11:15, Marek Szyprowski wrote:

>> On 06.07.2020 15:46, Daniel Lezcano wrote:

>>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>>> The generic netlink protocol is implemented but the different

>>>>> notification functions are not yet connected to the core code.

>>>>>

>>>>> These changes add the notification calls in the different

>>>>> corresponding places.

>>>>>

>>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>>> breaks booting various Samsung Exynos based boards. Here is an example

>>>> log from Odroid U3 board:


Does it break also arm64 platforms?


-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Daniel Lezcano July 13, 2020, 8:32 p.m. | #8
On 13/07/2020 11:31, Marek Szyprowski wrote:
> Hi

> 

> On 07.07.2020 11:15, Marek Szyprowski wrote:

>> On 06.07.2020 15:46, Daniel Lezcano wrote:

>>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>>> The generic netlink protocol is implemented but the different

>>>>> notification functions are not yet connected to the core code.

>>>>>

>>>>> These changes add the notification calls in the different

>>>>> corresponding places.

>>>>>

>>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>>> breaks booting various Samsung Exynos based boards. Here is an example

>>>> log from Odroid U3 board:

>>>>

>>>> Unable to handle kernel NULL pointer dereference at virtual address 

>>>> 00000010

>>>> pgd = (ptrval)

>>>> [00000010] *pgd=00000000

>>>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>>>> Modules linked in:

>>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>>>> #1146

>>>> Hardware name: Samsung Exynos (Flattened Device Tree)

>>>> PC is at kmem_cache_alloc+0x13c/0x418

>>>> LR is at kmem_cache_alloc+0x48/0x418

>>>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>>>> ...

>>>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>>>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>>>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>>>> Stack: (0xee8f1cf8 to 0xee8f2000)

>>>> ...

>>>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] 

>>>> (__alloc_skb+0x5c/0x170)

>>>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>>>> (thermal_genl_send_event+0x24/0x174)

>>>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>>>> (thermal_notify_tz_create+0x58/0x74)

>>>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>>>> (thermal_zone_device_register+0x358/0x650)

>>>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>>>> (of_parse_thermal_zones+0x304/0x7a4)

>>>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>>>> (thermal_init+0xdc/0x154)

>>>> [<c1028964>] (thermal_init) from [<c0102378>] 

>>>> (do_one_initcall+0x8c/0x424)

>>>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>>>> (kernel_init_freeable+0x190/0x204)

>>>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>>>> (kernel_init+0x8/0x118)

>>>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>>>

>>>> Reverting it on top of linux-next fixes the boot issue. I will

>>>> investigate it further soon.

>>> Thanks for reporting this.

>>>

>>> Can you send the addr2line result and code it points to ?

>>

>> addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to 

>> mm/slub.c +2839, but I'm not sure if we can trust it. imho it looks 

>> like some trashed memory somewhere, but I don't have time right now to 

>> analyze it further now...

> 

> Just one more thing I've noticed. The crash happens only if the kernel 

> is compiled with old GCC (tested with arm-linux-gnueabi-gcc (Linaro GCC 

> 4.9-2017.01) 4.9.4). If I compile kernel with newed GCC (like 

> arm-linux-gnueabi-gcc (Linaro GCC 6.4-2017.11) 6.4.1 20171012), it works 

> fine...

> 

> This happens also with Linux next-20200710, which again got this commit.


So I finally succeed to reproduce on an ARM64 with a recent compiler,
earlycon, and the option CONFIG_INIT_ON_ALLOC_DEFAULT_ON.



-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Daniel Lezcano July 14, 2020, 11:20 p.m. | #9
On 13/07/2020 22:32, Daniel Lezcano wrote:
> On 13/07/2020 11:31, Marek Szyprowski wrote:

>> Hi

>>

>> On 07.07.2020 11:15, Marek Szyprowski wrote:

>>> On 06.07.2020 15:46, Daniel Lezcano wrote:

>>>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>>>> The generic netlink protocol is implemented but the different

>>>>>> notification functions are not yet connected to the core code.

>>>>>>

>>>>>> These changes add the notification calls in the different

>>>>>> corresponding places.

>>>>>>

>>>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>>>> breaks booting various Samsung Exynos based boards. Here is an example

>>>>> log from Odroid U3 board:

>>>>>

>>>>> Unable to handle kernel NULL pointer dereference at virtual address 

>>>>> 00000010

>>>>> pgd = (ptrval)

>>>>> [00000010] *pgd=00000000

>>>>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>>>>> Modules linked in:

>>>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>>>>> #1146

>>>>> Hardware name: Samsung Exynos (Flattened Device Tree)

>>>>> PC is at kmem_cache_alloc+0x13c/0x418

>>>>> LR is at kmem_cache_alloc+0x48/0x418

>>>>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>>>>> ...

>>>>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>>>>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>>>>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>>>>> Stack: (0xee8f1cf8 to 0xee8f2000)

>>>>> ...

>>>>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>] 

>>>>> (__alloc_skb+0x5c/0x170)

>>>>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>>>>> (thermal_genl_send_event+0x24/0x174)

>>>>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>>>>> (thermal_notify_tz_create+0x58/0x74)

>>>>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>>>>> (thermal_zone_device_register+0x358/0x650)

>>>>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>>>>> (of_parse_thermal_zones+0x304/0x7a4)

>>>>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>>>>> (thermal_init+0xdc/0x154)

>>>>> [<c1028964>] (thermal_init) from [<c0102378>] 

>>>>> (do_one_initcall+0x8c/0x424)

>>>>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>>>>> (kernel_init_freeable+0x190/0x204)

>>>>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>>>>> (kernel_init+0x8/0x118)

>>>>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>>>>

>>>>> Reverting it on top of linux-next fixes the boot issue. I will

>>>>> investigate it further soon.

>>>> Thanks for reporting this.

>>>>

>>>> Can you send the addr2line result and code it points to ?

>>>

>>> addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to 

>>> mm/slub.c +2839, but I'm not sure if we can trust it. imho it looks 

>>> like some trashed memory somewhere, but I don't have time right now to 

>>> analyze it further now...

>>

>> Just one more thing I've noticed. The crash happens only if the kernel 

>> is compiled with old GCC (tested with arm-linux-gnueabi-gcc (Linaro GCC 

>> 4.9-2017.01) 4.9.4). If I compile kernel with newed GCC (like 

>> arm-linux-gnueabi-gcc (Linaro GCC 6.4-2017.11) 6.4.1 20171012), it works 

>> fine...

>>

>> This happens also with Linux next-20200710, which again got this commit.

> 

> So I finally succeed to reproduce on an ARM64 with a recent compiler,

> earlycon, and the option CONFIG_INIT_ON_ALLOC_DEFAULT_ON.



Finally, narrowed down the issue.

 - genetlink initialization is done at subsys initcall.
 - thermal netlink init is done at core initcall
 - netlink is done at core initcall

By changing the order:

 - netlink and genetlink at core initcall
 - thermal init at postcore initcall

That fixes the problem. The genetlink initcall order is from 2005 and
IMO it makes sense to come right after the netlink initialization.

It is acceptable to have the thermal init at the postcore initcall. It
is very recently we moved from fs_initcall to core_initcall.

Thanks to Arnd who give me a direction to look at.

-- 
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog
Marek Szyprowski July 15, 2020, 6:09 a.m. | #10
Hi Daniel,

On 15.07.2020 01:20, Daniel Lezcano wrote:
> On 13/07/2020 22:32, Daniel Lezcano wrote:

>> On 13/07/2020 11:31, Marek Szyprowski wrote:

>>> On 07.07.2020 11:15, Marek Szyprowski wrote:

>>>> On 06.07.2020 15:46, Daniel Lezcano wrote:

>>>>> On 06/07/2020 15:17, Marek Szyprowski wrote:

>>>>>> On 06.07.2020 12:55, Daniel Lezcano wrote:

>>>>>>> The generic netlink protocol is implemented but the different

>>>>>>> notification functions are not yet connected to the core code.

>>>>>>>

>>>>>>> These changes add the notification calls in the different

>>>>>>> corresponding places.

>>>>>>>

>>>>>>> Reviewed-by: Amit Kucheria <amit.kucheria@linaro.org>

>>>>>>> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>

>>>>>> This patch landed in today's linux-next 20200706 as commit 5df786e46560

>>>>>> ("thermal: core: Add notifications call in the framework"). Sadly it

>>>>>> breaks booting various Samsung Exynos based boards. Here is an example

>>>>>> log from Odroid U3 board:

>>>>>>

>>>>>> Unable to handle kernel NULL pointer dereference at virtual address

>>>>>> 00000010

>>>>>> pgd = (ptrval)

>>>>>> [00000010] *pgd=00000000

>>>>>> Internal error: Oops: 5 [#1] PREEMPT SMP ARM

>>>>>> Modules linked in:

>>>>>> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.8.0-rc3-00015-g5df786e46560

>>>>>> #1146

>>>>>> Hardware name: Samsung Exynos (Flattened Device Tree)

>>>>>> PC is at kmem_cache_alloc+0x13c/0x418

>>>>>> LR is at kmem_cache_alloc+0x48/0x418

>>>>>> pc : [<c02b5cac>]    lr : [<c02b5bb8>]    psr: 20000053

>>>>>> ...

>>>>>> Flags: nzCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment none

>>>>>> Control: 10c5387d  Table: 4000404a  DAC: 00000051

>>>>>> Process swapper/0 (pid: 1, stack limit = 0x(ptrval))

>>>>>> Stack: (0xee8f1cf8 to 0xee8f2000)

>>>>>> ...

>>>>>> [<c02b5cac>] (kmem_cache_alloc) from [<c08cd170>]

>>>>>> (__alloc_skb+0x5c/0x170)

>>>>>> [<c08cd170>] (__alloc_skb) from [<c07ec19c>]

>>>>>> (thermal_genl_send_event+0x24/0x174)

>>>>>> [<c07ec19c>] (thermal_genl_send_event) from [<c07ec648>]

>>>>>> (thermal_notify_tz_create+0x58/0x74)

>>>>>> [<c07ec648>] (thermal_notify_tz_create) from [<c07e9058>]

>>>>>> (thermal_zone_device_register+0x358/0x650)

>>>>>> [<c07e9058>] (thermal_zone_device_register) from [<c1028d34>]

>>>>>> (of_parse_thermal_zones+0x304/0x7a4)

>>>>>> [<c1028d34>] (of_parse_thermal_zones) from [<c1028964>]

>>>>>> (thermal_init+0xdc/0x154)

>>>>>> [<c1028964>] (thermal_init) from [<c0102378>]

>>>>>> (do_one_initcall+0x8c/0x424)

>>>>>> [<c0102378>] (do_one_initcall) from [<c1001158>]

>>>>>> (kernel_init_freeable+0x190/0x204)

>>>>>> [<c1001158>] (kernel_init_freeable) from [<c0ab85f4>]

>>>>>> (kernel_init+0x8/0x118)

>>>>>> [<c0ab85f4>] (kernel_init) from [<c0100114>] (ret_from_fork+0x14/0x20)

>>>>>>

>>>>>> Reverting it on top of linux-next fixes the boot issue. I will

>>>>>> investigate it further soon.

>>>>> Thanks for reporting this.

>>>>>

>>>>> Can you send the addr2line result and code it points to ?

>>>> addr2line of c02b5cac (kmem_cache_alloc+0x13c/0x418) points to

>>>> mm/slub.c +2839, but I'm not sure if we can trust it. imho it looks

>>>> like some trashed memory somewhere, but I don't have time right now to

>>>> analyze it further now...

>>> Just one more thing I've noticed. The crash happens only if the kernel

>>> is compiled with old GCC (tested with arm-linux-gnueabi-gcc (Linaro GCC

>>> 4.9-2017.01) 4.9.4). If I compile kernel with newed GCC (like

>>> arm-linux-gnueabi-gcc (Linaro GCC 6.4-2017.11) 6.4.1 20171012), it works

>>> fine...

>>>

>>> This happens also with Linux next-20200710, which again got this commit.

>> So I finally succeed to reproduce on an ARM64 with a recent compiler,

>> earlycon, and the option CONFIG_INIT_ON_ALLOC_DEFAULT_ON.

>

> Finally, narrowed down the issue.

>

>   - genetlink initialization is done at subsys initcall.

>   - thermal netlink init is done at core initcall

>   - netlink is done at core initcall

>

> By changing the order:

>

>   - netlink and genetlink at core initcall

>   - thermal init at postcore initcall

>

> That fixes the problem.

I confirm that such change fixes the issue! Feel free to add:

Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>

Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>


to the final patch.

> The genetlink initcall order is from 2005 and

> IMO it makes sense to come right after the netlink initialization.

>

> It is acceptable to have the thermal init at the postcore initcall. It

> is very recently we moved from fs_initcall to core_initcall.

>

> Thanks to Arnd who give me a direction to look at.

Best regards

-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland

Patch

diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 5fae1621fb01..25ef29123f72 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -215,6 +215,8 @@  int thermal_zone_device_set_policy(struct thermal_zone_device *tz,
 	mutex_unlock(&tz->lock);
 	mutex_unlock(&thermal_governor_lock);
 
+	thermal_notify_tz_gov_change(tz->id, policy);
+
 	return ret;
 }
 
@@ -406,12 +408,25 @@  static void handle_critical_trips(struct thermal_zone_device *tz,
 static void handle_thermal_trip(struct thermal_zone_device *tz, int trip)
 {
 	enum thermal_trip_type type;
+	int trip_temp, hyst = 0;
 
 	/* Ignore disabled trip points */
 	if (test_bit(trip, &tz->trips_disabled))
 		return;
 
+	tz->ops->get_trip_temp(tz, trip, &trip_temp);
 	tz->ops->get_trip_type(tz, trip, &type);
+	if (tz->ops->get_trip_hyst)
+		tz->ops->get_trip_hyst(tz, trip, &hyst);
+
+	if (tz->last_temperature != THERMAL_TEMP_INVALID) {
+		if (tz->last_temperature < trip_temp &&
+		    tz->temperature >= trip_temp)
+			thermal_notify_tz_trip_up(tz->id, trip);
+		if (tz->last_temperature >= trip_temp &&
+		    tz->temperature < (trip_temp - hyst))
+			thermal_notify_tz_trip_down(tz->id, trip);
+	}
 
 	if (type == THERMAL_TRIP_CRITICAL || type == THERMAL_TRIP_HOT)
 		handle_critical_trips(tz, trip, type);
@@ -443,6 +458,8 @@  static void update_temperature(struct thermal_zone_device *tz)
 	mutex_unlock(&tz->lock);
 
 	trace_thermal_temperature(tz);
+
+	thermal_genl_sampling_temp(tz->id, temp);
 }
 
 static void thermal_zone_device_init(struct thermal_zone_device *tz)
@@ -1405,6 +1422,8 @@  thermal_zone_device_register(const char *type, int trips, int mask,
 	if (atomic_cmpxchg(&tz->need_update, 1, 0))
 		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
+	thermal_notify_tz_create(tz->id, tz->type);
+
 	return tz;
 
 unregister:
@@ -1476,6 +1495,8 @@  void thermal_zone_device_unregister(struct thermal_zone_device *tz)
 	ida_destroy(&tz->ida);
 	mutex_destroy(&tz->lock);
 	device_unregister(&tz->device);
+
+	thermal_notify_tz_delete(tz->id);
 }
 EXPORT_SYMBOL_GPL(thermal_zone_device_unregister);
 
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 87b1256fa2f2..c94bc824e5d3 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -175,6 +175,16 @@  void thermal_zone_set_trips(struct thermal_zone_device *tz)
 	mutex_unlock(&tz->lock);
 }
 
+static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
+				       int target)
+{
+	if (cdev->ops->set_cur_state(cdev, target))
+		return;
+
+	thermal_notify_cdev_state_update(cdev->id, target);
+	thermal_cooling_device_stats_update(cdev, target);
+}
+
 void thermal_cdev_update(struct thermal_cooling_device *cdev)
 {
 	struct thermal_instance *instance;
@@ -197,8 +207,7 @@  void thermal_cdev_update(struct thermal_cooling_device *cdev)
 			target = instance->target;
 	}
 
-	if (!cdev->ops->set_cur_state(cdev, target))
-		thermal_cooling_device_stats_update(cdev, target);
+	thermal_cdev_set_cur_state(cdev, target);
 
 	cdev->updated = true;
 	mutex_unlock(&cdev->lock);
diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
index aa99edb4dff7..ff449943f757 100644
--- a/drivers/thermal/thermal_sysfs.c
+++ b/drivers/thermal/thermal_sysfs.c
@@ -124,7 +124,8 @@  trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 {
 	struct thermal_zone_device *tz = to_thermal_zone(dev);
 	int trip, ret;
-	int temperature;
+	int temperature, hyst = 0;
+	enum thermal_trip_type type;
 
 	if (!tz->ops->set_trip_temp)
 		return -EPERM;
@@ -139,6 +140,18 @@  trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
+	if (tz->ops->get_trip_hyst) {
+		ret = tz->ops->get_trip_hyst(tz, trip, &hyst);
+		if (ret)
+			return ret;
+	}
+
+	ret = tz->ops->get_trip_type(tz, trip, &type);
+	if (ret)
+		return ret;
+
+	thermal_notify_tz_trip_change(tz->id, trip, type, temperature, hyst);
+
 	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return count;