diff mbox series

[v3,1/2] thermal/debugfs: Add thermal cooling device debugfs information

Message ID 20231219092539.3655172-1-daniel.lezcano@linaro.org
State New
Headers show
Series [v3,1/2] thermal/debugfs: Add thermal cooling device debugfs information | expand

Commit Message

Daniel Lezcano Dec. 19, 2023, 9:25 a.m. UTC
The thermal framework does not have any debug information except a
sysfs stat which is a bit controversial. This one allocates big chunks
of memory for every cooling devices with a high number of states and
could represent on some systems in production several megabytes of
memory for just a portion of it. As the sysfs is limited to a page
size, the output is not exploitable with large data array and gets
truncated.

The patch provides the same information than sysfs except the
transitions are dynamically allocated, thus they won't show more
events than the ones which actually occurred. There is no longer a
size limitation and it opens the field for more debugging information
where the debugfs is designed for, not sysfs.

The thermal debugfs directory structure tries to stay consistent with
the sysfs one but in a very simplified way:

thermal/
 -- cooling_devices
    |-- 0
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 1
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 2
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    |-- 3
    |   |-- clear
    |   |-- time_in_state_ms
    |   |-- total_trans
    |   `-- trans_table
    `-- 4
        |-- clear
        |-- time_in_state_ms
        |-- total_trans
        `-- trans_table

The content of the files in the cooling devices directory is the same
as the sysfs one except for the trans_table which has the following
format:

Transition	Hits
1->0      	246
0->1      	246
2->1      	632
1->2      	632
3->2      	98
2->3      	98

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
Changelog:
  - v3
    - Fixed kerneldoc description (kbuild)
    - Made some functions static
  - v2
    - Added parameter names to fix kbuild report
    - Renamed 'reset' to 'clear' to avoid confusion (Rafael)
    - Fixed several typos and rephrased some sentences (Rafael)
    - Renamed structure field name s/list/node/ (Rafael)
    - Documented structures and exported functions (Rafael)
    - s/trans_list/transitions/ (Rafael)
    - s/duration_list/durations/ (Rafael)
    - Folded 'alloc' and 'insert' into a single function (Rafael)
    - s/list/lists/ as it is an array of lists (Rafael)
    - s/pos/entry/ (Rafael)
    - Introduced a locking in the 'clear' callback function (Rafael)
    - s/to/new_state/ and s/from/old_state/ (Rafael)
    - Added some comments in thermal_debug_cdev_transition() (Rafael)
    - Explained why char[11] (Rafael)
    - s/Hits/Occurrences/ (Rafael)
    - s/Time/Residency/ (Rafael)
    - Constified structure pointer passed to thermal_debug_cdev_transition()
    - s/thermal_debug_cdev_transition()/thermal_debug_cdev_state_update()/
  - v1 (from RFC):
    - Fixed typo "occurred"
    - Changed Kconfig option name and description
    - Removed comment in the Makefile
    - Renamed exported function name s/debugfs/debug/
    - Replaced thermal_debug_cdev_[unregister|register] by [add|remove]
---
 drivers/thermal/Kconfig           |   7 +
 drivers/thermal/Makefile          |   2 +
 drivers/thermal/thermal_core.c    |   6 +
 drivers/thermal/thermal_core.h    |   1 +
 drivers/thermal/thermal_debugfs.c | 424 ++++++++++++++++++++++++++++++
 drivers/thermal/thermal_debugfs.h |  14 +
 drivers/thermal/thermal_helpers.c |  27 +-
 include/linux/thermal.h           |   7 +
 8 files changed, 482 insertions(+), 6 deletions(-)
 create mode 100644 drivers/thermal/thermal_debugfs.c
 create mode 100644 drivers/thermal/thermal_debugfs.h

Comments

Rafael J. Wysocki Dec. 21, 2023, 5:19 p.m. UTC | #1
On Tue, Dec 19, 2023 at 10:25 AM Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
>
> The thermal framework does not have any debug information except a
> sysfs stat which is a bit controversial. This one allocates big chunks
> of memory for every cooling devices with a high number of states and
> could represent on some systems in production several megabytes of
> memory for just a portion of it. As the sysfs is limited to a page
> size, the output is not exploitable with large data array and gets
> truncated.
>
> The patch provides the same information than sysfs except the
> transitions are dynamically allocated, thus they won't show more
> events than the ones which actually occurred. There is no longer a
> size limitation and it opens the field for more debugging information
> where the debugfs is designed for, not sysfs.
>
> The thermal debugfs directory structure tries to stay consistent with
> the sysfs one but in a very simplified way:
>
> thermal/
>  -- cooling_devices
>     |-- 0
>     |   |-- clear
>     |   |-- time_in_state_ms
>     |   |-- total_trans
>     |   `-- trans_table
>     |-- 1
>     |   |-- clear
>     |   |-- time_in_state_ms
>     |   |-- total_trans
>     |   `-- trans_table
>     |-- 2
>     |   |-- clear
>     |   |-- time_in_state_ms
>     |   |-- total_trans
>     |   `-- trans_table
>     |-- 3
>     |   |-- clear
>     |   |-- time_in_state_ms
>     |   |-- total_trans
>     |   `-- trans_table
>     `-- 4
>         |-- clear
>         |-- time_in_state_ms
>         |-- total_trans
>         `-- trans_table
>
> The content of the files in the cooling devices directory is the same
> as the sysfs one except for the trans_table which has the following
> format:
>
> Transition      Hits
> 1->0            246
> 0->1            246
> 2->1            632
> 1->2            632
> 3->2            98
> 2->3            98
>
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
> Changelog:
>   - v3
>     - Fixed kerneldoc description (kbuild)
>     - Made some functions static

There are a few minor nits still, as far as I'm concerned (see below).

>   - v2
>     - Added parameter names to fix kbuild report
>     - Renamed 'reset' to 'clear' to avoid confusion (Rafael)
>     - Fixed several typos and rephrased some sentences (Rafael)
>     - Renamed structure field name s/list/node/ (Rafael)
>     - Documented structures and exported functions (Rafael)
>     - s/trans_list/transitions/ (Rafael)
>     - s/duration_list/durations/ (Rafael)
>     - Folded 'alloc' and 'insert' into a single function (Rafael)
>     - s/list/lists/ as it is an array of lists (Rafael)
>     - s/pos/entry/ (Rafael)
>     - Introduced a locking in the 'clear' callback function (Rafael)
>     - s/to/new_state/ and s/from/old_state/ (Rafael)
>     - Added some comments in thermal_debug_cdev_transition() (Rafael)
>     - Explained why char[11] (Rafael)
>     - s/Hits/Occurrences/ (Rafael)
>     - s/Time/Residency/ (Rafael)
>     - Constified structure pointer passed to thermal_debug_cdev_transition()
>     - s/thermal_debug_cdev_transition()/thermal_debug_cdev_state_update()/
>   - v1 (from RFC):
>     - Fixed typo "occurred"
>     - Changed Kconfig option name and description
>     - Removed comment in the Makefile
>     - Renamed exported function name s/debugfs/debug/
>     - Replaced thermal_debug_cdev_[unregister|register] by [add|remove]
> ---
>  drivers/thermal/Kconfig           |   7 +
>  drivers/thermal/Makefile          |   2 +
>  drivers/thermal/thermal_core.c    |   6 +
>  drivers/thermal/thermal_core.h    |   1 +
>  drivers/thermal/thermal_debugfs.c | 424 ++++++++++++++++++++++++++++++
>  drivers/thermal/thermal_debugfs.h |  14 +
>  drivers/thermal/thermal_helpers.c |  27 +-
>  include/linux/thermal.h           |   7 +
>  8 files changed, 482 insertions(+), 6 deletions(-)
>  create mode 100644 drivers/thermal/thermal_debugfs.c
>  create mode 100644 drivers/thermal/thermal_debugfs.h

[cut]

All of the above changes look good to me.

> +
> +/**
> + * struct cdev_debugfs - per cooling device statistics structure
> + * A cooling device can have a high number of states. Showing the
> + * transitions on a matrix based representation can be overkill given
> + * most of the transitions won't happen and we end up with a matrix
> + * filled with zero. Instead, we show the transitions which actually
> + * happened.
> + *
> + * Every transition updates the current_state and the timestamp. The
> + * transitions and the durations are stored in lists.
> + *
> + * @total: the number of transitions for this cooling device
> + * @current_state: the current cooling device state
> + * @timestamp: the state change timestamp
> + * @durations: an array of lists containing the residencies of each state
> + * @transitions: an array of lists containing the state transitions

I would use the same ordering as for the fields below.

> + */
> +struct cdev_debugfs {
> +       u32 total;
> +       int current_state;
> +       ktime_t timestamp;
> +       struct list_head transitions[CDEVSTATS_HASH_SIZE];
> +       struct list_head durations[CDEVSTATS_HASH_SIZE];
> +};
> +
> +/**
> + * struct cdev_value - Common structure for cooling device entry
> + *
> + * The following common structure allows to store the information
> + * related to the transitions and to the state residencies. They are
> + * identified with a id which is associated to a value. It is used as
> + * nodes for the "transitions" and "durations" above.
> + *
> + * @node: node to insert the structure in a list
> + * @id: identifier of the value which can be a state or a transition
> + * @value: the id associated value which can be a duration or an occurrence

s/an occurrence/a number of occurrences/  IIUC.

> + */
> +struct cdev_value {

I'm not sure about the name here.  I would rather call it cdev_record,
because it consists of two items, the id and the value.

> +       struct list_head node;
> +       int id;
> +       u64 value;

This is kind of a union, but sort of in disguise.

Why not make it a union proper, that is

struct cdev_record {
        struct list_head node;
        int id;
        union {
                krime_t residency; /* for duration records */
                u64 count; /* for occurrences records */
        } data;
};

which then would result in a bit cleaner code in some places below, if
I'm not mistaken?

> +};
> +
> +/**
> + * struct thermal_debugfs - High level structure for a thermal object in debugfs
> + *
> + * The thermal_debugfs structure is the common structure used by the
> + * cooling device to compute the statistics.
> + *
> + * @d_top: top directory of the thermal object directory
> + * @lock: per object lock to protect the internals
> + *
> + * @cdev: a cooling device debug structure
> + */
> +struct thermal_debugfs {
> +       struct dentry *d_top;
> +       struct mutex lock;
> +       union {
> +               struct cdev_debugfs cdev;

I would call this cdev_dbg so it is not confused with a cooling device object.

> +       };
> +};
> +
> +void thermal_debug_init(void)
> +{
> +       d_root = debugfs_create_dir("thermal", NULL);
> +       if (!d_root)
> +               return;
> +
> +       d_cdev = debugfs_create_dir("cooling_devices", d_root);
> +}
> +
> +static struct thermal_debugfs *thermal_debugfs_add_id(struct dentry *d, int id)
> +{
> +       struct thermal_debugfs *dfs;

I'm not sure about the "fs" parts of the variable names here and
below.  It doesn't seem to mean anything in particular, so it would be
better to use something more meaningful IMV.  For example, this
particular pointer could be called thermdbg and pointers to the cdev
or thermal zone (in the next patch) data sets could be called cdev_dbg
and tz_dbg, respectively.

> +       char ids[IDSLENGTH];
> +
> +       dfs = kzalloc(sizeof(*dfs), GFP_KERNEL);
> +       if (!dfs)
> +               return NULL;
> +
> +       mutex_init(&dfs->lock);
> +
> +       snprintf(ids, IDSLENGTH, "%d", id);
> +
> +       dfs->d_top = debugfs_create_dir(ids, d);
> +       if (!dfs->d_top) {
> +               kfree(dfs);
> +               return NULL;
> +       }
> +
> +       return dfs;
> +}
> +
> +static void thermal_debugfs_remove_id(struct thermal_debugfs *dfs)
> +{
> +       if (!dfs)
> +               return;
> +
> +       debugfs_remove(dfs->d_top);
> +
> +       kfree(dfs);
> +}
> +
> +static struct cdev_value *thermal_debugfs_cdev_value_alloc(struct thermal_debugfs *dfs,
> +                                                          struct list_head *list, int id)

The analogous list_head pointer in the next function is called lists,
so this one could be called lists either.

> +{
> +       struct cdev_value *cdev_value;
> +
> +       cdev_value = kzalloc(sizeof(*cdev_value), GFP_KERNEL);
> +       if (cdev_value) {
> +               cdev_value->id = id;
> +               INIT_LIST_HEAD(&cdev_value->node);
> +               list_add_tail(&cdev_value->node, &list[cdev_value->id % CDEVSTATS_HASH_SIZE]);
> +       }

What about (using the names suggested above)

cdev_record = kzalloc(sizeof(*cdev_record), GFP_KERNEL);
if (!cdev_record)
        return NULL;

cdev_record->id = id;
list_add_tail(&cdev_record->node, &lists[cdev_record->id %
CDEVSTATS_HASH_SIZE]);

> +
> +       return cdev_value;
> +}

BTW, a list_head need not be initialized before adding it to an existing list.

> +
> +static struct cdev_value *thermal_debugfs_cdev_value_find(struct thermal_debugfs *dfs,
> +                                                         struct list_head *lists, int id)
> +{
> +       struct cdev_value *entry;
> +
> +       list_for_each_entry(entry, &lists[id % CDEVSTATS_HASH_SIZE], node)
> +               if (entry->id == id)
> +                       return entry;
> +
> +       return NULL;
> +}
> +
> +static struct cdev_value *thermal_debugfs_cdev_value_get(struct thermal_debugfs *dfs,
> +                                                        struct list_head *list, int id)

And here?

> +{
> +       struct cdev_value *cdev_value;
> +
> +       cdev_value = thermal_debugfs_cdev_value_find(dfs, list, id);
> +       if (cdev_value)
> +               return cdev_value;
> +
> +       return thermal_debugfs_cdev_value_alloc(dfs, list, id);
> +}
> +
> +static void thermal_debugfs_cdev_clear(struct cdev_debugfs *cfs)
> +{
> +       int i;
> +       struct cdev_value *entry, *tmp;
> +
> +       for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
> +
> +               list_for_each_entry_safe(entry, tmp, &cfs->transitions[i], node) {
> +                       list_del(&entry->node);
> +                       kfree(entry);
> +               }
> +
> +               list_for_each_entry_safe(entry, tmp, &cfs->durations[i], node) {
> +                       list_del(&entry->node);
> +                       kfree(entry);
> +               }
> +       }
> +
> +       cfs->total = 0;
> +}
> +
> +static void *cdev_seq_start(struct seq_file *s, loff_t *pos)
> +{
> +       struct thermal_debugfs *dfs = s->private;
> +
> +       mutex_lock(&dfs->lock);
> +
> +       return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
> +}
> +
> +static void *cdev_seq_next(struct seq_file *s, void *v, loff_t *pos)
> +{
> +       (*pos)++;
> +
> +       return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
> +}
> +
> +static void cdev_seq_stop(struct seq_file *s, void *v)
> +{
> +       struct thermal_debugfs *dfs = s->private;
> +
> +       mutex_unlock(&dfs->lock);
> +}
> +
> +static int cdev_tt_seq_show(struct seq_file *s, void *v)
> +{
> +       struct thermal_debugfs *dfs = s->private;
> +       struct cdev_debugfs *cfs = &dfs->cdev;
> +       struct list_head *transitions = cfs->transitions;
> +       struct cdev_value *entry;
> +       int i = *(loff_t *)v;
> +
> +       if (!i)
> +               seq_puts(s, "Transition\tOccurences\n");
> +
> +       list_for_each_entry(entry, &transitions[i], node) {
> +               /*
> +                * Assuming maximum cdev states is 1024, the longer
> +                * string for a transition would be "1024->1024\0"
> +                */
> +               char buffer[11];
> +
> +               snprintf(buffer, ARRAY_SIZE(buffer), "%d->%d",
> +                        entry->id >> 16, entry->id & 0xFFFF);
> +
> +               seq_printf(s, "%-10s\t%-10llu\n", buffer, entry->value);
> +       }
> +
> +       return 0;
> +}
> +
> +static const struct seq_operations tt_sops = {
> +       .start = cdev_seq_start,
> +       .next = cdev_seq_next,
> +       .stop = cdev_seq_stop,
> +       .show = cdev_tt_seq_show,
> +};
> +
> +DEFINE_SEQ_ATTRIBUTE(tt);
> +
> +static int cdev_dt_seq_show(struct seq_file *s, void *v)
> +{
> +       struct thermal_debugfs *dfs = s->private;
> +       struct cdev_debugfs *cfs = &dfs->cdev;
> +       struct list_head *durations = cfs->durations;
> +       struct cdev_value *entry;
> +       int i = *(loff_t *)v;
> +
> +       if (!i)
> +               seq_puts(s, "State\tResidency\n");
> +
> +       list_for_each_entry(entry, &durations[i], node) {
> +               s64 duration = entry->value;
> +
> +               if (entry->id == cfs->current_state)
> +                       duration += ktime_ms_delta(ktime_get(), cfs->timestamp);
> +
> +               seq_printf(s, "%-5d\t%-10llu\n", entry->id, duration);
> +       }
> +
> +       return 0;
> +}
> +
> +static const struct seq_operations dt_sops = {
> +       .start = cdev_seq_start,
> +       .next = cdev_seq_next,
> +       .stop = cdev_seq_stop,
> +       .show = cdev_dt_seq_show,
> +};
> +
> +DEFINE_SEQ_ATTRIBUTE(dt);
> +
> +static int cdev_clear_set(void *data, u64 val)
> +{
> +       struct thermal_debugfs *dfs = data;
> +
> +       if (!val)
> +               return -EINVAL;
> +
> +       mutex_lock(&dfs->lock);
> +
> +       thermal_debugfs_cdev_clear(&dfs->cdev);
> +
> +       mutex_unlock(&dfs->lock);
> +
> +       return 0;
> +}
> +
> +DEFINE_DEBUGFS_ATTRIBUTE(cdev_clear_fops, NULL, cdev_clear_set, "%llu\n");
> +
> +/**
> + * thermal_debug_cdev_state_update - Update a cooling device state change
> + *
> + * Computes a transition and the duration of the previous state residency.
> + *
> + * @cdev : a pointer to a cooling device
> + * @new_state: an integer corresponding to the new cooling device state
> + */
> +void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
> +                                    int new_state)
> +{
> +       struct thermal_debugfs *dfs = cdev->debugfs;
> +       struct cdev_debugfs *cfs;
> +       struct cdev_value *cdev_value;
> +       ktime_t now = ktime_get();

I'd call ktime_get() right before using now for the first time.

> +       int transition, old_state;
> +
> +       if (!dfs || (dfs->cdev.current_state == new_state))
> +               return;
> +
> +       mutex_lock(&dfs->lock);
> +
> +       cfs = &dfs->cdev;
> +
> +       old_state = cfs->current_state;

Because new_state and transition are only used later, after the
duration record has been processed, I would move the next two lines
after the if () below.

> +       cfs->current_state = new_state;
> +       transition = (old_state << 16) | new_state;
> +
> +       /*
> +        * Get the old state information in the durations list. If
> +        * this one does not exist, a new allocated one will be
> +        * returned. Recompute the total duration in the old state and
> +        * get a new timestamp for the new state.
> +        */
> +       cdev_value = thermal_debugfs_cdev_value_get(dfs, cfs->durations, old_state);
> +       if (cdev_value) {
> +               cdev_value->value += ktime_ms_delta(now, cfs->timestamp);

Use ktime_add() here and convert to ms when printing the record?

> +               cfs->timestamp = now;
> +       }
> +
> +       /*
> +        * Get the transition in the transitions list. If this one
> +        * does not exist, a new allocated one will be returned.
> +        * Increment the occurrence of this transition which is stored
> +        * in the value field.
> +        */
> +       cdev_value = thermal_debugfs_cdev_value_get(dfs, cfs->transitions,
> +                                                   transition);
> +       if (cdev_value)
> +               cdev_value->value++;
> +
> +       cfs->total++;
> +
> +       mutex_unlock(&dfs->lock);
> +}
> +
> +/**
> + * thermal_debug_cdev_add - Add a cooling device debugfs entry
> + *
> + * Allocates a cooling device object for debug, initializes the
> + * statistics and create the entries in sysfs.
> + * @cdev: a pointer to a cooling device
> + */
> +void thermal_debug_cdev_add(struct thermal_cooling_device *cdev)
> +{
> +       struct thermal_debugfs *dfs;
> +       struct cdev_debugfs *cfs;
> +       int i;
> +
> +       dfs = thermal_debugfs_add_id(d_cdev, cdev->id);
> +       if (!dfs)
> +               return;
> +
> +       cfs = &dfs->cdev;
> +
> +       for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
> +               INIT_LIST_HEAD(&cfs->transitions[i]);
> +               INIT_LIST_HEAD(&cfs->durations[i]);
> +       }
> +
> +       cfs->current_state = 0;
> +       cfs->timestamp = ktime_get();
> +
> +       debugfs_create_file("trans_table", 0400, dfs->d_top, dfs, &tt_fops);
> +
> +       debugfs_create_file("time_in_state_ms", 0400, dfs->d_top, dfs, &dt_fops);
> +
> +       debugfs_create_file("clear", 0200, dfs->d_top, dfs, &cdev_clear_fops);
> +
> +       debugfs_create_u32("total_trans", 0400, dfs->d_top, &cfs->total);
> +
> +       cdev->debugfs = dfs;
> +}
> +
> +/**
> + * thermal_debug_cdev_remove - Remove a cooling device debugfs entry
> + *
> + * Frees the statistics memory data and remove the debugfs entry
> + *
> + * @cdev: a pointer to a cooling device
> + */
> +void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev)
> +{
> +       struct thermal_debugfs *dfs = cdev->debugfs;
> +
> +       if (!dfs)
> +               return;
> +
> +       mutex_lock(&dfs->lock);
> +
> +       thermal_debugfs_cdev_clear(&dfs->cdev);
> +       cdev->debugfs = NULL;
> +       thermal_debugfs_remove_id(dfs);
> +
> +       mutex_unlock(&dfs->lock);
> +}
> diff --git a/drivers/thermal/thermal_debugfs.h b/drivers/thermal/thermal_debugfs.h
> new file mode 100644
> index 000000000000..341499388448
> --- /dev/null
> +++ b/drivers/thermal/thermal_debugfs.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifdef CONFIG_THERMAL_DEBUGFS
> +void thermal_debug_init(void);
> +void thermal_debug_cdev_add(struct thermal_cooling_device *cdev);
> +void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev);
> +void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev, int state);
> +#else
> +static inline void thermal_debug_init(void) {}
> +static inline void thermal_debug_cdev_add(struct thermal_cooling_device *cdev) {}
> +static inline void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev) {}
> +static inline void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
> +                                                  int state) {}
> +#endif /* CONFIG_THERMAL_DEBUGFS */
> diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
> index 69e8ea4aa908..435c123b721b 100644
> --- a/drivers/thermal/thermal_helpers.c
> +++ b/drivers/thermal/thermal_helpers.c
> @@ -151,14 +151,29 @@ int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
>  }
>  EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
>
> -static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
> -                                      int target)
> +void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms)
>  {
> -       if (cdev->ops->set_cur_state(cdev, target))
> -               return;
> +       *delay_jiffies = msecs_to_jiffies(delay_ms);
> +       if (delay_ms > 1000)
> +               *delay_jiffies = round_jiffies(*delay_jiffies);
> +}

This adds a function that's never used AFAICS.  I suppose the next
patch will use it?

The rest of the patch LGTM.

Thanks!
Rafael J. Wysocki Dec. 21, 2023, 7:26 p.m. UTC | #2
On Tue, Dec 19, 2023 at 10:25 AM Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
>
> The mitigation episodes are recorded. A mitigation episode happens
> when the first trip point is crossed the way up and then the way
> down. During this episode other trip points can be crossed also and
> are accounted for this mitigation episode. The interesting information
> is the average temperature at the trip point, the undershot and the
> overshot. The standard deviation of the mitigated temperature will be
> added later.
>
> The thermal debugfs directory structure tries to stay consistent with
> the sysfs one but in a very simplified way:
>
> thermal/
>  `-- thermal_zones
>      |-- 0
>      |   `-- mitigations
>      `-- 1
>          `-- mitigations
>
> The content of the mitigations file has the following format:
>
> ,-Mitigation at 349988258us, duration=130136ms
> | trip |     type | temp(°mC) | hyst(°mC) |  duration  |  avg(°mC) |  min(°mC) |  max(°mC) |
> |    0 |  passive |     65000 |      2000 |     130136 |     68227 |     62500 |     75625 |
> |    1 |  passive |     75000 |      2000 |     104209 |     74857 |     71666 |     77500 |
> ,-Mitigation at 272451637us, duration=75000ms
> | trip |     type | temp(°mC) | hyst(°mC) |  duration  |  avg(°mC) |  min(°mC) |  max(°mC) |
> |    0 |  passive |     65000 |      2000 |      75000 |     68561 |     62500 |     75000 |
> |    1 |  passive |     75000 |      2000 |      60714 |     74820 |     70555 |     77500 |
> ,-Mitigation at 238184119us, duration=27316ms
> | trip |     type | temp(°mC) | hyst(°mC) |  duration  |  avg(°mC) |  min(°mC) |  max(°mC) |
> |    0 |  passive |     65000 |      2000 |      27316 |     73377 |     62500 |     75000 |
> |    1 |  passive |     75000 |      2000 |      19468 |     75284 |     69444 |     77500 |
> ,-Mitigation at 39863713us, duration=136196ms
> | trip |     type | temp(°mC) | hyst(°mC) |  duration  |  avg(°mC) |  min(°mC) |  max(°mC) |
> |    0 |  passive |     65000 |      2000 |     136196 |     73922 |     62500 |     75000 |
> |    1 |  passive |     75000 |      2000 |      91721 |     74386 |     69444 |     78125 |
>
> More information for a better understanding of the thermal behavior
> will be added after. The idea is to give detailed statistics
> information about the undershots and overshots, the temperature speed,
> etc... As all the information in a single file is too much, the idea
> would be to create a directory named with the mitigation timestamp
> where all data could be added.
>
> Please note this code is immune against trip ordering.
>
> Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
> ---
> Changelog:
>   - v3
>     - Fixed kerneldoc (kbuild)
>     - Fixed wrong indentation s/<space>/<tab>/
>   - v2
>     - Applied changes based on comments from patch 1/2
>     - Constified structure in function parameters
>   - v1 (from RFC):
>     - Replaced exported function name s/debugfs/debug/
>     - Used "struct thermal_trip" parameter instead of "trip_id"
>     - Renamed handle_way_[up|down] by tz_trip_[up|down]
>     - Replaced thermal_debug_tz_[unregister|register] by [add|remove]
> ---
>  drivers/thermal/thermal_core.c    |   7 +
>  drivers/thermal/thermal_debugfs.c | 367 +++++++++++++++++++++++++++++-
>  drivers/thermal/thermal_debugfs.h |  14 ++
>  3 files changed, 387 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
> index 33332d401b13..a0cbe8d7b945 100644
> --- a/drivers/thermal/thermal_core.c
> +++ b/drivers/thermal/thermal_core.c

[cut]

The changes above LGTM.

> diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
> index 8fceddb5f6d2..5fd2553260b2 100644
> --- a/drivers/thermal/thermal_debugfs.c
> +++ b/drivers/thermal/thermal_debugfs.c
> @@ -14,8 +14,11 @@
>  #include <linux/mutex.h>
>  #include <linux/thermal.h>
>
> +#include "thermal_core.h"
> +
>  static struct dentry *d_root;
>  static struct dentry *d_cdev;
> +static struct dentry *d_tz;
>
>  /*
>   * Length of the string containing the thermal zone id or the cooling
> @@ -77,22 +80,84 @@ struct cdev_value {
>         u64 value;
>  };
>
> +/**
> + * struct trip_stats - Thermal trip statistics
> + *
> + * The trip_stats structure has the relevant information to show the
> + * statistics related to a trip point violation during a mitigation
> + * episode.

I wouldn't use the term "violation" here, it feels  too strong.

I looked for a replacement word, but I couldn't find a suitable one.
In the sentence above I would just say "related to going above a trip
point".

> + *
> + * @timestamp: the trip crossing timestamp
> + * @duration: the total duration of trip point violation

"total time when the zone temperature was above the trip point"

> + * @count: the number of occurrences of the trip point violation

"the number of times the zone temperature was above the trip point"

> + * @max: maximum temperature during the trip point violation

"maximum recorded temperature above the trip point"

> + * @min: min temperature during the trip point violation

"minimum recorded temperature above the trip point"

> + * @avg: average temperature during the trip point violation

"average temperature above the trip point"

> + */
> +struct trip_stats {
> +       ktime_t timestamp;
> +       ktime_t duration;
> +       int count;
> +       int max;
> +       int min;
> +       int avg;
> +};
> +
> +/**
> + * struct tz_events - Store all events related to a mitigation episode
> + *
> + * The tz_events structure describes a mitigation episode.

So why not call it tz_mitigation?

> A
> + * mitigation episode is when the mitigation begins and ends. During
> + * this episode we can have multiple trip points crossed the way up
> + * and down if there are multiple trip describes in the firmware.
> + *
> + * @node: a list element to be added to the list of tz events
> + * @trip_stats: per trip point statistics
> + * @timestamp: First trip point crossed the way up
> + * @duration: total duration of the mitigation episode
> + */
> +struct tz_events {
> +       struct list_head node;
> +       struct trip_stats *trip_stats;
> +       ktime_t timestamp;
> +       ktime_t duration;
> +};
> +
> +/**
> + * struct tz_debugfs - Store all mitigations episodes for a thermal zone

"Collection of all mitigation episodes for a thermal zone"

> + *
> + * The tz_debugfs structure contains the list of the mitigation
> + * episodes and has to track which trip point has been crossed in
> + * order to handle correctly nested trip point mitigation episodes.
> + *
> + * @tz_events: a list of thermal mitigation episodes
> + * @trips_crossed: an array of trip point crossed by id

This requires a bit more explanation IMV.
Daniel Lezcano Dec. 22, 2023, 3:03 p.m. UTC | #3
On 21/12/2023 18:19, Rafael J. Wysocki wrote:

[ ... ]

>> +struct cdev_value {
> 
> I'm not sure about the name here.  I would rather call it cdev_record,
> because it consists of two items, the id and the value.
> 
>> +       struct list_head node;
>> +       int id;
>> +       u64 value;
> 
> This is kind of a union, but sort of in disguise.
> 
> Why not make it a union proper, that is
> 
> struct cdev_record {
>          struct list_head node;
>          int id;
>          union {
>                  krime_t residency; /* for duration records */
>                  u64 count; /* for occurrences records */
>          } data;
> };
> 
> which then would result in a bit cleaner code in some places below, if
> I'm not mistaken?

Can we stick to

struct cdev_record {
          struct list_head node;
          int id;
          union {
                  u64 residency_ms;  <----- ?
                  u64 count;
          };
};

?

The usage of the ktime_t will have a more important impact in the code.
Rafael J. Wysocki Dec. 22, 2023, 3:48 p.m. UTC | #4
On Fri, Dec 22, 2023 at 4:03 PM Daniel Lezcano
<daniel.lezcano@linaro.org> wrote:
>
> On 21/12/2023 18:19, Rafael J. Wysocki wrote:
>
> [ ... ]
>
> >> +struct cdev_value {
> >
> > I'm not sure about the name here.  I would rather call it cdev_record,
> > because it consists of two items, the id and the value.
> >
> >> +       struct list_head node;
> >> +       int id;
> >> +       u64 value;
> >
> > This is kind of a union, but sort of in disguise.
> >
> > Why not make it a union proper, that is
> >
> > struct cdev_record {
> >          struct list_head node;
> >          int id;
> >          union {
> >                  krime_t residency; /* for duration records */
> >                  u64 count; /* for occurrences records */
> >          } data;
> > };
> >
> > which then would result in a bit cleaner code in some places below, if
> > I'm not mistaken?
>
> Can we stick to
>
> struct cdev_record {
>           struct list_head node;
>           int id;
>           union {
>                   u64 residency_ms;  <----- ?
>                   u64 count;
>           };
> };
>
> ?
>
> The usage of the ktime_t will have a more important impact in the code.

OK, but patch [2/2] uses ktime_t for duration computations regarding
trip points.  I'm not sure why this is different.
Daniel Lezcano Dec. 23, 2023, 11:41 a.m. UTC | #5
Hi Rafael,

On 21/12/2023 20:26, Rafael J. Wysocki wrote:

[ ... ]


>> +/**
>> + * struct tz_events - Store all events related to a mitigation episode
>> + *
>> + * The tz_events structure describes a mitigation episode.
> 
> So why not call it tz_mitigation?

A mitigation episode = N x tz_events

eg.
trip A = passive cooling - cpufreq cluster0
trip B = passive cooling - cpufreq cluster0 + cluster1
trip C = active cooling + fan

temperature trip A < trip B < trip C

The mitigation episode, as defined, begins at trip A, and we can have 
multiple events (eg. trip B crossed several times, trip C, then trip B 
again etc ...).

[ ... ]

>> +       if (dfs->tz.trip_index < 0) {
>> +               tze = thermal_debugfs_tz_event_alloc(tz, now);
>> +               if (!tze)
>> +                       return;
>> +
>> +               list_add(&tze->node, &dfs->tz.tz_events);
>> +       }
>> +
>> +       dfs->tz.trip_index++;
>> +       dfs->tz.trips_crossed[dfs->tz.trip_index] = trip_id;
> 
> So trip_index is an index into trips_crossed[] and the value is the ID
> of the trip passed by thermal_debug_tz_trip_up() IIUC, so the trip IDs
> in trips_crossed[] are always sorted by the trip temperature, in the
> ascending order.
> 
> It would be good to write this down somewhere in a comment.
> 
> And what if trip temperatures change during a mitigation episode such
> that the order by the trip temperature changes?

Changing a trip point temperature during a mitigation is a general 
question about the thermal framework.

How the governors will behave with such a change on the fly while they 
are in action?

IMO, we should prevent to change a trip point temperature when this one 
is crossed and has a cooling device bound to it.

>> +
>> +       tze = list_first_entry(&dfs->tz.tz_events, struct tz_events, node);
>> +       tze->trip_stats[trip_id].timestamp = now;
>> +       tze->trip_stats[trip_id].max = max(tze->trip_stats[trip_id].max, temperature);
>> +       tze->trip_stats[trip_id].min = min(tze->trip_stats[trip_id].min, temperature);
>> +       tze->trip_stats[trip_id].avg = tze->trip_stats[trip_id].avg +
>> +               (temperature - tze->trip_stats[trip_id].avg) /
>> +               tze->trip_stats[trip_id].count;
>> +
>> +       mutex_unlock(&dfs->lock);
>> +}
diff mbox series

Patch

diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index c81a00fbca7d..3ff7add3fb7c 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -33,6 +33,13 @@  config THERMAL_STATISTICS
 
 	  If in doubt, say N.
 
+config THERMAL_DEBUGFS
+	bool "Thermal subsystem debug support"
+	depends on DEBUG_FS
+	help
+	  Say Y to allow the thermal subsystem to collect diagnostic
+	  information that can be accessed via debugfs.
+
 config THERMAL_EMERGENCY_POWEROFF_DELAY_MS
 	int "Emergency poweroff delay in milli-seconds"
 	default 0
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index c934cab309ae..0f65ae86a9c6 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -10,6 +10,8 @@  thermal_sys-y			+= thermal_trip.o thermal_helpers.o
 # netlink interface to manage the thermal framework
 thermal_sys-$(CONFIG_THERMAL_NETLINK)		+= thermal_netlink.o
 
+thermal_sys-$(CONFIG_THERMAL_DEBUGFS)	+= thermal_debugfs.o
+
 # interface to/from other layers providing sensors
 thermal_sys-$(CONFIG_THERMAL_HWMON)		+= thermal_hwmon.o
 thermal_sys-$(CONFIG_THERMAL_OF)		+= thermal_of.o
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index 625ba07cbe2f..33332d401b13 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -945,6 +945,8 @@  __thermal_cooling_device_register(struct device_node *np,
 
 	mutex_unlock(&thermal_list_lock);
 
+	thermal_debug_cdev_add(cdev);
+	
 	return cdev;
 
 out_cooling_dev:
@@ -1151,6 +1153,8 @@  void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev)
 	if (!cdev)
 		return;
 
+	thermal_debug_cdev_remove(cdev);
+
 	mutex_lock(&thermal_list_lock);
 
 	if (!thermal_cooling_device_present(cdev)) {
@@ -1570,6 +1574,8 @@  static int __init thermal_init(void)
 {
 	int result;
 
+	thermal_debug_init();
+
 	result = thermal_netlink_init();
 	if (result)
 		goto error;
diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
index 7dfe6c8deb8e..4b3452a65a2f 100644
--- a/drivers/thermal/thermal_core.h
+++ b/drivers/thermal/thermal_core.h
@@ -13,6 +13,7 @@ 
 #include <linux/thermal.h>
 
 #include "thermal_netlink.h"
+#include "thermal_debugfs.h"
 
 /* Default Thermal Governor */
 #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE)
diff --git a/drivers/thermal/thermal_debugfs.c b/drivers/thermal/thermal_debugfs.c
new file mode 100644
index 000000000000..8fceddb5f6d2
--- /dev/null
+++ b/drivers/thermal/thermal_debugfs.c
@@ -0,0 +1,424 @@ 
+
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023 Linaro Limited
+ *
+ * Author: Daniel Lezcano <daniel.lezcano@linaro.org>
+ *
+ * Thermal subsystem debug support
+ */
+#include <linux/debugfs.h>
+#include <linux/ktime.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
+#include <linux/thermal.h>
+
+static struct dentry *d_root;
+static struct dentry *d_cdev;
+
+/*
+ * Length of the string containing the thermal zone id or the cooling
+ * device id, including the ending nul character. We can reasonably
+ * assume there won't be more than 256 thermal zones as the maximum
+ * observed today is around 32.
+ */
+#define IDSLENGTH 4
+
+/*
+ * The cooling device transition list is stored in a hash table where
+ * the size is CDEVSTATS_HASH_SIZE. The majority of cooling devices
+ * have dozen of states but some can have much more, so a hash table
+ * is more adequate in this case, because the cost of browsing the entire
+ * list when storing the transitions may not be negligible.
+ */
+#define CDEVSTATS_HASH_SIZE 16
+
+/**
+ * struct cdev_debugfs - per cooling device statistics structure
+ * A cooling device can have a high number of states. Showing the
+ * transitions on a matrix based representation can be overkill given
+ * most of the transitions won't happen and we end up with a matrix
+ * filled with zero. Instead, we show the transitions which actually
+ * happened.
+ *
+ * Every transition updates the current_state and the timestamp. The
+ * transitions and the durations are stored in lists.
+ *
+ * @total: the number of transitions for this cooling device
+ * @current_state: the current cooling device state
+ * @timestamp: the state change timestamp
+ * @durations: an array of lists containing the residencies of each state
+ * @transitions: an array of lists containing the state transitions
+ */
+struct cdev_debugfs {
+	u32 total;
+	int current_state;
+	ktime_t timestamp;
+	struct list_head transitions[CDEVSTATS_HASH_SIZE];
+	struct list_head durations[CDEVSTATS_HASH_SIZE];
+};
+
+/**
+ * struct cdev_value - Common structure for cooling device entry
+ *
+ * The following common structure allows to store the information
+ * related to the transitions and to the state residencies. They are
+ * identified with a id which is associated to a value. It is used as
+ * nodes for the "transitions" and "durations" above.
+ *
+ * @node: node to insert the structure in a list
+ * @id: identifier of the value which can be a state or a transition
+ * @value: the id associated value which can be a duration or an occurrence
+ */
+struct cdev_value {
+	struct list_head node;
+	int id;
+	u64 value;
+};
+
+/**
+ * struct thermal_debugfs - High level structure for a thermal object in debugfs
+ *
+ * The thermal_debugfs structure is the common structure used by the
+ * cooling device to compute the statistics.
+ *
+ * @d_top: top directory of the thermal object directory
+ * @lock: per object lock to protect the internals
+ *
+ * @cdev: a cooling device debug structure
+ */
+struct thermal_debugfs {
+	struct dentry *d_top;
+	struct mutex lock;
+	union {
+		struct cdev_debugfs cdev;
+	};
+};
+
+void thermal_debug_init(void)
+{
+	d_root = debugfs_create_dir("thermal", NULL);
+	if (!d_root)
+		return;
+
+	d_cdev = debugfs_create_dir("cooling_devices", d_root);
+}
+
+static struct thermal_debugfs *thermal_debugfs_add_id(struct dentry *d, int id)
+{
+	struct thermal_debugfs *dfs;
+	char ids[IDSLENGTH];
+
+	dfs = kzalloc(sizeof(*dfs), GFP_KERNEL);
+	if (!dfs)
+		return NULL;
+
+	mutex_init(&dfs->lock);
+	
+	snprintf(ids, IDSLENGTH, "%d", id);
+
+	dfs->d_top = debugfs_create_dir(ids, d);
+	if (!dfs->d_top) {
+		kfree(dfs);
+		return NULL;
+	}
+
+	return dfs;
+}
+
+static void thermal_debugfs_remove_id(struct thermal_debugfs *dfs)
+{
+	if (!dfs)
+		return;
+
+	debugfs_remove(dfs->d_top);
+
+	kfree(dfs);
+}
+
+static struct cdev_value *thermal_debugfs_cdev_value_alloc(struct thermal_debugfs *dfs,
+							   struct list_head *list, int id)
+{
+	struct cdev_value *cdev_value;
+
+	cdev_value = kzalloc(sizeof(*cdev_value), GFP_KERNEL);
+	if (cdev_value) {
+		cdev_value->id = id;
+		INIT_LIST_HEAD(&cdev_value->node);
+		list_add_tail(&cdev_value->node, &list[cdev_value->id % CDEVSTATS_HASH_SIZE]);
+	}
+
+	return cdev_value;
+}
+
+static struct cdev_value *thermal_debugfs_cdev_value_find(struct thermal_debugfs *dfs,
+							  struct list_head *lists, int id)
+{
+	struct cdev_value *entry;
+
+	list_for_each_entry(entry, &lists[id % CDEVSTATS_HASH_SIZE], node)
+		if (entry->id == id)
+			return entry;
+
+	return NULL;
+}
+
+static struct cdev_value *thermal_debugfs_cdev_value_get(struct thermal_debugfs *dfs,
+							 struct list_head *list, int id)
+{
+	struct cdev_value *cdev_value;
+
+	cdev_value = thermal_debugfs_cdev_value_find(dfs, list, id);
+	if (cdev_value)
+		return cdev_value;
+
+	return thermal_debugfs_cdev_value_alloc(dfs, list, id);
+}
+
+static void thermal_debugfs_cdev_clear(struct cdev_debugfs *cfs)
+{
+	int i;
+	struct cdev_value *entry, *tmp;
+
+	for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
+
+		list_for_each_entry_safe(entry, tmp, &cfs->transitions[i], node) {
+			list_del(&entry->node);
+			kfree(entry);
+		}
+
+		list_for_each_entry_safe(entry, tmp, &cfs->durations[i], node) {
+			list_del(&entry->node);
+			kfree(entry);
+		}
+	}
+
+	cfs->total = 0;
+}
+
+static void *cdev_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct thermal_debugfs *dfs = s->private;
+
+	mutex_lock(&dfs->lock);
+
+	return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
+}
+
+static void *cdev_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return (*pos < CDEVSTATS_HASH_SIZE) ? pos : NULL;
+}
+
+static void cdev_seq_stop(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *dfs = s->private;
+
+	mutex_unlock(&dfs->lock);
+}
+
+static int cdev_tt_seq_show(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *dfs = s->private;	
+	struct cdev_debugfs *cfs = &dfs->cdev;
+	struct list_head *transitions = cfs->transitions;
+	struct cdev_value *entry;
+	int i = *(loff_t *)v;
+
+	if (!i)
+		seq_puts(s, "Transition\tOccurences\n");
+
+	list_for_each_entry(entry, &transitions[i], node) {
+		/*
+		 * Assuming maximum cdev states is 1024, the longer
+		 * string for a transition would be "1024->1024\0"
+		 */
+		char buffer[11];
+		
+		snprintf(buffer, ARRAY_SIZE(buffer), "%d->%d",
+			 entry->id >> 16, entry->id & 0xFFFF);
+
+		seq_printf(s, "%-10s\t%-10llu\n", buffer, entry->value);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations tt_sops = {
+	.start = cdev_seq_start,
+	.next = cdev_seq_next,
+	.stop = cdev_seq_stop,
+	.show = cdev_tt_seq_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(tt);
+
+static int cdev_dt_seq_show(struct seq_file *s, void *v)
+{
+	struct thermal_debugfs *dfs = s->private;
+	struct cdev_debugfs *cfs = &dfs->cdev;
+	struct list_head *durations = cfs->durations;
+	struct cdev_value *entry;
+	int i = *(loff_t *)v;
+
+	if (!i)
+		seq_puts(s, "State\tResidency\n");
+
+	list_for_each_entry(entry, &durations[i], node) {
+		s64 duration = entry->value;
+
+		if (entry->id == cfs->current_state)
+			duration += ktime_ms_delta(ktime_get(), cfs->timestamp);
+
+		seq_printf(s, "%-5d\t%-10llu\n", entry->id, duration);
+	}
+
+	return 0;
+}
+
+static const struct seq_operations dt_sops = {
+	.start = cdev_seq_start,
+	.next = cdev_seq_next,
+	.stop = cdev_seq_stop,
+	.show = cdev_dt_seq_show,
+};
+
+DEFINE_SEQ_ATTRIBUTE(dt);
+
+static int cdev_clear_set(void *data, u64 val)
+{
+	struct thermal_debugfs *dfs = data;
+
+	if (!val)
+		return -EINVAL;
+
+	mutex_lock(&dfs->lock);
+	
+	thermal_debugfs_cdev_clear(&dfs->cdev);
+
+	mutex_unlock(&dfs->lock);
+
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(cdev_clear_fops, NULL, cdev_clear_set, "%llu\n");
+
+/**
+ * thermal_debug_cdev_state_update - Update a cooling device state change
+ *
+ * Computes a transition and the duration of the previous state residency.
+ *
+ * @cdev : a pointer to a cooling device
+ * @new_state: an integer corresponding to the new cooling device state
+ */
+void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
+				     int new_state)
+{
+	struct thermal_debugfs *dfs = cdev->debugfs;
+	struct cdev_debugfs *cfs;
+	struct cdev_value *cdev_value;
+	ktime_t now = ktime_get();
+	int transition, old_state;
+
+	if (!dfs || (dfs->cdev.current_state == new_state))
+		return;
+
+	mutex_lock(&dfs->lock);
+
+	cfs = &dfs->cdev;
+
+	old_state = cfs->current_state;
+	cfs->current_state = new_state;
+	transition = (old_state << 16) | new_state;
+
+	/*
+	 * Get the old state information in the durations list. If
+	 * this one does not exist, a new allocated one will be
+	 * returned. Recompute the total duration in the old state and
+	 * get a new timestamp for the new state.
+	 */
+	cdev_value = thermal_debugfs_cdev_value_get(dfs, cfs->durations, old_state);
+	if (cdev_value) {
+		cdev_value->value += ktime_ms_delta(now, cfs->timestamp);
+		cfs->timestamp = now;
+	}
+
+	/*
+	 * Get the transition in the transitions list. If this one
+	 * does not exist, a new allocated one will be returned.
+	 * Increment the occurrence of this transition which is stored
+	 * in the value field.
+	 */
+	cdev_value = thermal_debugfs_cdev_value_get(dfs, cfs->transitions,
+						    transition);
+	if (cdev_value)
+		cdev_value->value++;
+
+	cfs->total++;
+
+	mutex_unlock(&dfs->lock);
+}
+
+/**
+ * thermal_debug_cdev_add - Add a cooling device debugfs entry
+ *
+ * Allocates a cooling device object for debug, initializes the
+ * statistics and create the entries in sysfs.
+ * @cdev: a pointer to a cooling device
+ */
+void thermal_debug_cdev_add(struct thermal_cooling_device *cdev)
+{
+	struct thermal_debugfs *dfs;
+	struct cdev_debugfs *cfs;
+	int i;
+
+	dfs = thermal_debugfs_add_id(d_cdev, cdev->id);
+	if (!dfs)
+		return;
+
+	cfs = &dfs->cdev;
+
+	for (i = 0; i < CDEVSTATS_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&cfs->transitions[i]);
+		INIT_LIST_HEAD(&cfs->durations[i]);
+	}
+
+	cfs->current_state = 0;
+	cfs->timestamp = ktime_get();
+
+	debugfs_create_file("trans_table", 0400, dfs->d_top, dfs, &tt_fops);
+
+	debugfs_create_file("time_in_state_ms", 0400, dfs->d_top, dfs, &dt_fops);
+
+	debugfs_create_file("clear", 0200, dfs->d_top, dfs, &cdev_clear_fops);
+
+	debugfs_create_u32("total_trans", 0400, dfs->d_top, &cfs->total);
+
+	cdev->debugfs = dfs;
+}
+
+/**
+ * thermal_debug_cdev_remove - Remove a cooling device debugfs entry
+ *
+ * Frees the statistics memory data and remove the debugfs entry
+ *
+ * @cdev: a pointer to a cooling device
+ */
+void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev)
+{
+	struct thermal_debugfs *dfs = cdev->debugfs;
+
+	if (!dfs)
+		return;
+
+	mutex_lock(&dfs->lock);
+	
+	thermal_debugfs_cdev_clear(&dfs->cdev);
+	cdev->debugfs = NULL;
+	thermal_debugfs_remove_id(dfs);
+
+	mutex_unlock(&dfs->lock);
+}
diff --git a/drivers/thermal/thermal_debugfs.h b/drivers/thermal/thermal_debugfs.h
new file mode 100644
index 000000000000..341499388448
--- /dev/null
+++ b/drivers/thermal/thermal_debugfs.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifdef CONFIG_THERMAL_DEBUGFS
+void thermal_debug_init(void);
+void thermal_debug_cdev_add(struct thermal_cooling_device *cdev);
+void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev);
+void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev, int state);
+#else
+static inline void thermal_debug_init(void) {}
+static inline void thermal_debug_cdev_add(struct thermal_cooling_device *cdev) {}
+static inline void thermal_debug_cdev_remove(struct thermal_cooling_device *cdev) {}
+static inline void thermal_debug_cdev_state_update(const struct thermal_cooling_device *cdev,
+						   int state) {}
+#endif /* CONFIG_THERMAL_DEBUGFS */
diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
index 69e8ea4aa908..435c123b721b 100644
--- a/drivers/thermal/thermal_helpers.c
+++ b/drivers/thermal/thermal_helpers.c
@@ -151,14 +151,29 @@  int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp)
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
-static void thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev,
-				       int target)
+void thermal_set_delay_jiffies(unsigned long *delay_jiffies, int delay_ms)
 {
-	if (cdev->ops->set_cur_state(cdev, target))
-		return;
+	*delay_jiffies = msecs_to_jiffies(delay_ms);
+	if (delay_ms > 1000)
+		*delay_jiffies = round_jiffies(*delay_jiffies);
+}
+
+static int thermal_cdev_set_cur_state(struct thermal_cooling_device *cdev, int state)
+{
+	int ret;
 
-	thermal_notify_cdev_state_update(cdev->id, target);
-	thermal_cooling_device_stats_update(cdev, target);
+	/*
+	 * No check is needed for the ops->set_cur_state as the
+	 * registering function checked the ops are correctly set
+	 */
+	ret = cdev->ops->set_cur_state(cdev, state);
+	if (!ret) {
+		thermal_notify_cdev_state_update(cdev->id, state);
+		thermal_cooling_device_stats_update(cdev, state);
+		thermal_debug_cdev_state_update(cdev, state);
+	}
+
+	return ret;
 }
 
 void __thermal_cdev_update(struct thermal_cooling_device *cdev)
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index 0ea99f50d57c..8461f008c3de 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -32,6 +32,7 @@ 
 struct thermal_zone_device;
 struct thermal_cooling_device;
 struct thermal_instance;
+struct thermal_debugfs;
 struct thermal_attr;
 
 enum thermal_trend {
@@ -110,6 +111,9 @@  struct thermal_cooling_device {
 	struct mutex lock; /* protect thermal_instances list */
 	struct list_head thermal_instances;
 	struct list_head node;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 };
 
 /**
@@ -183,6 +187,9 @@  struct thermal_zone_device {
 	struct list_head node;
 	struct delayed_work poll_queue;
 	enum thermal_notify_event notify_event;
+#ifdef CONFIG_THERMAL_DEBUGFS
+	struct thermal_debugfs *debugfs;
+#endif
 };
 
 /**