diff mbox series

[v3,6/7] drm: Add fdinfo memory stats

Message ID 20230411225725.2032862-7-robdclark@gmail.com
State Superseded
Headers show
Series drm: fdinfo memory stats | expand

Commit Message

Rob Clark April 11, 2023, 10:56 p.m. UTC
From: Rob Clark <robdclark@chromium.org>

Add support to dump GEM stats to fdinfo.

v2: Fix typos, change size units to match docs, use div_u64
v3: Do it in core

Signed-off-by: Rob Clark <robdclark@chromium.org>
Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
---
 Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
 drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
 include/drm/drm_file.h                |  1 +
 include/drm/drm_gem.h                 | 19 +++++++
 4 files changed, 117 insertions(+)

Comments

Daniel Vetter April 12, 2023, 8:01 a.m. UTC | #1
On Tue, Apr 11, 2023 at 03:56:11PM -0700, Rob Clark wrote:
> From: Rob Clark <robdclark@chromium.org>
> 
> Add support to dump GEM stats to fdinfo.
> 
> v2: Fix typos, change size units to match docs, use div_u64
> v3: Do it in core
> 
> Signed-off-by: Rob Clark <robdclark@chromium.org>
> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> ---
>  Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
>  drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
>  include/drm/drm_file.h                |  1 +
>  include/drm/drm_gem.h                 | 19 +++++++
>  4 files changed, 117 insertions(+)
> 
> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> index b46327356e80..b5e7802532ed 100644
> --- a/Documentation/gpu/drm-usage-stats.rst
> +++ b/Documentation/gpu/drm-usage-stats.rst
> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
>  Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
>  indicating kibi- or mebi-bytes.
>  
> +- drm-shared-memory: <uint> [KiB|MiB]
> +
> +The total size of buffers that are shared with another file (ie. have more
> +than a single handle).
> +
> +- drm-private-memory: <uint> [KiB|MiB]
> +
> +The total size of buffers that are not shared with another file.
> +
> +- drm-resident-memory: <uint> [KiB|MiB]
> +
> +The total size of buffers that are resident in system memory.
> +
> +- drm-purgeable-memory: <uint> [KiB|MiB]
> +
> +The total size of buffers that are purgeable.
> +
> +- drm-active-memory: <uint> [KiB|MiB]
> +
> +The total size of buffers that are active on one or more rings.
> +
>  - drm-cycles-<str> <uint>
>  
>  Engine identifier string must be the same as the one specified in the
> diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
> index 37dfaa6be560..46fdd843bb3a 100644
> --- a/drivers/gpu/drm/drm_file.c
> +++ b/drivers/gpu/drm/drm_file.c
> @@ -42,6 +42,7 @@
>  #include <drm/drm_client.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_file.h>
> +#include <drm/drm_gem.h>
>  #include <drm/drm_print.h>
>  
>  #include "drm_crtc_internal.h"
> @@ -871,6 +872,79 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
>  }
>  EXPORT_SYMBOL(drm_send_event);
>  
> +static void print_size(struct drm_printer *p, const char *stat, size_t sz)
> +{
> +	const char *units[] = {"", " KiB", " MiB"};
> +	unsigned u;
> +
> +	for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
> +		if (sz < SZ_1K)
> +			break;
> +		sz = div_u64(sz, SZ_1K);
> +	}
> +
> +	drm_printf(p, "%s:\t%zu%s\n", stat, sz, units[u]);
> +}
> +
> +static void print_memory_stats(struct drm_printer *p, struct drm_file *file)
> +{
> +	struct drm_gem_object *obj;
> +	struct {
> +		size_t shared;
> +		size_t private;
> +		size_t resident;
> +		size_t purgeable;
> +		size_t active;
> +	} size = {0};
> +	bool has_status = false;
> +	int id;
> +
> +	spin_lock(&file->table_lock);
> +	idr_for_each_entry (&file->object_idr, obj, id) {
> +		enum drm_gem_object_status s = 0;
> +
> +		if (obj->funcs && obj->funcs->status) {
> +			s = obj->funcs->status(obj);
> +			has_status = true;
> +		}
> +
> +		if (obj->handle_count > 1) {
> +			size.shared += obj->size;
> +		} else {
> +			size.private += obj->size;
> +		}
> +
> +		if (s & DRM_GEM_OBJECT_RESIDENT) {
> +			size.resident += obj->size;
> +		} else {
> +			/* If already purged or not yet backed by pages, don't
> +			 * count it as purgeable:
> +			 */
> +			s &= ~DRM_GEM_OBJECT_PURGEABLE;
> +		}
> +
> +		if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) {
> +			size.active += obj->size;
> +
> +			/* If still active, don't count as purgeable: */

Maybe mention this in the kerneldoc for DRM_GEM_OBJECT_PURGEABLE?

Otherwise looks tidy:

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

> +			s &= ~DRM_GEM_OBJECT_PURGEABLE;
> +		}
> +
> +		if (s & DRM_GEM_OBJECT_PURGEABLE)
> +			size.purgeable += obj->size;
> +	}
> +	spin_unlock(&file->table_lock);
> +
> +	print_size(p, "drm-shared-memory", size.shared);
> +	print_size(p, "drm-private-memory", size.private);
> +	print_size(p, "drm-active-memory", size.active);
> +
> +	if (has_status) {
> +		print_size(p, "drm-resident-memory", size.resident);
> +		print_size(p, "drm-purgeable-memory", size.purgeable);
> +	}
> +}
> +
>  /**
>   * drm_fop_show_fdinfo - helper for drm file fops
>   * @seq_file: output stream
> @@ -904,6 +978,8 @@ void drm_fop_show_fdinfo(struct seq_file *m, struct file *f)
>  
>  	if (dev->driver->show_fdinfo)
>  		dev->driver->show_fdinfo(&p, file);
> +
> +	print_memory_stats(&p, file);
>  }
>  EXPORT_SYMBOL(drm_fop_show_fdinfo);
>  
> diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
> index dfa995b787e1..e5b40084538f 100644
> --- a/include/drm/drm_file.h
> +++ b/include/drm/drm_file.h
> @@ -41,6 +41,7 @@
>  struct dma_fence;
>  struct drm_file;
>  struct drm_device;
> +struct drm_printer;
>  struct device;
>  struct file;
>  
> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> index 189fd618ca65..213917bb6b11 100644
> --- a/include/drm/drm_gem.h
> +++ b/include/drm/drm_gem.h
> @@ -42,6 +42,14 @@
>  struct iosys_map;
>  struct drm_gem_object;
>  
> +/**
> + * enum drm_gem_object_status - bitmask of object state for fdinfo reporting
> + */
> +enum drm_gem_object_status {
> +	DRM_GEM_OBJECT_RESIDENT  = BIT(0),
> +	DRM_GEM_OBJECT_PURGEABLE = BIT(1),
> +};
> +
>  /**
>   * struct drm_gem_object_funcs - GEM object functions
>   */
> @@ -174,6 +182,17 @@ struct drm_gem_object_funcs {
>  	 */
>  	int (*evict)(struct drm_gem_object *obj);
>  
> +	/**
> +	 * @status:
> +	 *
> +	 * The optional status callback can return additional object state
> +	 * which determines which stats the object is counted against.  The
> +	 * callback is called under table_lock.  Racing against object status
> +	 * change is "harmless", and the callback can expect to not race
> +	 * against object destruction.
> +	 */
> +	enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
> +
>  	/**
>  	 * @vm_ops:
>  	 *
> -- 
> 2.39.2
>
Rob Clark April 12, 2023, 6:42 p.m. UTC | #2
On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> > On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com> wrote:
> > >
> > >
> > > On 11/04/2023 23:56, Rob Clark wrote:
> > > > From: Rob Clark <robdclark@chromium.org>
> > > >
> > > > Add support to dump GEM stats to fdinfo.
> > > >
> > > > v2: Fix typos, change size units to match docs, use div_u64
> > > > v3: Do it in core
> > > >
> > > > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > > > Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> > > > ---
> > > >   Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> > > >   drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> > > >   include/drm/drm_file.h                |  1 +
> > > >   include/drm/drm_gem.h                 | 19 +++++++
> > > >   4 files changed, 117 insertions(+)
> > > >
> > > > diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> > > > index b46327356e80..b5e7802532ed 100644
> > > > --- a/Documentation/gpu/drm-usage-stats.rst
> > > > +++ b/Documentation/gpu/drm-usage-stats.rst
> > > > @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> > > >   Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> > > >   indicating kibi- or mebi-bytes.
> > > >
> > > > +- drm-shared-memory: <uint> [KiB|MiB]
> > > > +
> > > > +The total size of buffers that are shared with another file (ie. have more
> > > > +than a single handle).
> > > > +
> > > > +- drm-private-memory: <uint> [KiB|MiB]
> > > > +
> > > > +The total size of buffers that are not shared with another file.
> > > > +
> > > > +- drm-resident-memory: <uint> [KiB|MiB]
> > > > +
> > > > +The total size of buffers that are resident in system memory.
> > >
> > > I think this naming maybe does not work best with the existing
> > > drm-memory-<region> keys.
> >
> > Actually, it was very deliberate not to conflict with the existing
> > drm-memory-<region> keys ;-)
> >
> > > I would have preferred drm-memory-{active,resident,...} but it
> > could be mis-parsed by existing userspace so my hands were a bit tied.
> >
> > > > How about introducing the concept of a memory region from the start and
> > > > using naming similar to what we do for engines?
> > >
> > > drm-memory-$CATEGORY-$REGION: ...
> > >
> > > Then we document a bunch of categories and their semantics, for instance:
> > >
> > > 'size' - All reachable objects
> > > 'shared' - Subset of 'size' with handle_count > 1
> > > 'resident' - Objects with backing store
> > > 'active' - Objects in use, subset of resident
> > > 'purgeable' - Or inactive? Subset of resident.
> > >
> > > We keep the same semantics as with process memory accounting (if I got
> > > it right) which could be desirable for a simplified mental model.
> > >
> > > (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> > > correctly captured this in the first round it should be equivalent to
> > > > 'resident' above. In any case we can document which category the key
> > > > without a category is equal to, and that at most one of the two must be output.)
> > >
> > > Region names we at most partially standardize. Like we could say
> > > 'system' is to be used where backing store is system RAM and others are
> > > driver defined.
> > >
> > > Then discrete GPUs could emit N sets of key-values, one for each memory
> > > region they support.
> > >
> > > I think this all also works for objects which can be migrated between
> > > memory regions. 'Size' accounts them against all regions while for
> > > 'resident' they only appear in the region of their current placement, etc.
> >
> > I'm not too sure how to rectify different memory regions with this,
> > since drm core doesn't really know about the driver's memory regions.
> > Perhaps we can go back to this being a helper and drivers with vram
> > just don't use the helper?  Or??
>
> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> all works out reasonably consistently?

That is basically what we have now.  I could append -system to each to
make things easier to add vram/etc (from a uabi standpoint)..

BR,
-R

> And ttm could/should perhaps provide a helper to dump the region specific
> version of this. Or we lift the concept of regions out of ttm a bit
> higher, that's kinda needed for cgroups eventually anyway I think.
> -Daniel
>
> >
> > BR,
> > -R
> >
> > > Userspace can aggregate if it wishes to do so but kernel side should not.
> > >
> > > > +
> > > > +- drm-purgeable-memory: <uint> [KiB|MiB]
> > > > +
> > > > +The total size of buffers that are purgeable.
> > > > +
> > > > +- drm-active-memory: <uint> [KiB|MiB]
> > > > +
> > > > +The total size of buffers that are active on one or more rings.
> > > > +
> > > >   - drm-cycles-<str> <uint>
> > > >
> > > >   Engine identifier string must be the same as the one specified in the
> > > > diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
> > > > index 37dfaa6be560..46fdd843bb3a 100644
> > > > --- a/drivers/gpu/drm/drm_file.c
> > > > +++ b/drivers/gpu/drm/drm_file.c
> > > > @@ -42,6 +42,7 @@
> > > >   #include <drm/drm_client.h>
> > > >   #include <drm/drm_drv.h>
> > > >   #include <drm/drm_file.h>
> > > > +#include <drm/drm_gem.h>
> > > >   #include <drm/drm_print.h>
> > > >
> > > >   #include "drm_crtc_internal.h"
> > > > @@ -871,6 +872,79 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
> > > >   }
> > > >   EXPORT_SYMBOL(drm_send_event);
> > > >
> > > > +static void print_size(struct drm_printer *p, const char *stat, size_t sz)
> > > > +{
> > > > +     const char *units[] = {"", " KiB", " MiB"};
> > > > +     unsigned u;
> > > > +
> > > > +     for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
> > > > +             if (sz < SZ_1K)
> > > > +                     break;
> > > > +             sz = div_u64(sz, SZ_1K);
> > > > +     }
> > > > +
> > > > +     drm_printf(p, "%s:\t%zu%s\n", stat, sz, units[u]);
> > > > +}
> > > > +
> > > > +static void print_memory_stats(struct drm_printer *p, struct drm_file *file)
> > > > +{
> > > > +     struct drm_gem_object *obj;
> > > > +     struct {
> > > > +             size_t shared;
> > > > +             size_t private;
> > > > +             size_t resident;
> > > > +             size_t purgeable;
> > > > +             size_t active;
> > > > +     } size = {0};
> > > > +     bool has_status = false;
> > > > +     int id;
> > > > +
> > > > +     spin_lock(&file->table_lock);
> > > > +     idr_for_each_entry (&file->object_idr, obj, id) {
> > > > +             enum drm_gem_object_status s = 0;
> > > > +
> > > > +             if (obj->funcs && obj->funcs->status) {
> > > > +                     s = obj->funcs->status(obj);
> > > > +                     has_status = true;
> > > > +             }
> > > > +
> > > > +             if (obj->handle_count > 1) {
> > > > +                     size.shared += obj->size;
> > > > +             } else {
> > > > +                     size.private += obj->size;
> > > > +             }
> > > > +
> > > > +             if (s & DRM_GEM_OBJECT_RESIDENT) {
> > > > +                     size.resident += obj->size;
> > > > +             } else {
> > > > +                     /* If already purged or not yet backed by pages, don't
> > > > +                      * count it as purgeable:
> > > > +                      */
> > > > +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
> > >
> > > Side question - why couldn't resident buffers be purgeable? Did you mean
> > > for the if branch check to be active here? But then it wouldn't make
> > > sense for a driver to report active _and_ purgeable..
> > >
> > > > +             }
> > > > +
> > > > +             if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) {
> > > > +                     size.active += obj->size;
> > > > +
> > > > +                     /* If still active, don't count as purgeable: */
> > > > +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
> > >
> > > Another side question - I guess this tidies a race in reporting? If so
> > > not sure it matters given the stats are all rather approximate.
> > >
> > > > +             }
> > > > +
> > > > +             if (s & DRM_GEM_OBJECT_PURGEABLE)
> > > > +                     size.purgeable += obj->size;
> > > > +     }
> > >
> > > One concern I have here is that it is all based on obj->size. That is,
> > > there is no provision for drivers to implement page level granularity.
> > > So correct reporting in use cases such as VM BIND in the future wouldn't
> > > work unless it was a driver hook to get almost all of the info above. At
> > > which point common code is just a loop. TBF I don't know if any drivers
> > > do sub obj->size backing store granularity today, but I think it is
> > > > something to be sure of before proceeding.
> > >
> > > Second concern is what I touched upon in the first reply block - if the
> > > common code blindly loops over all objects then on discrete GPUs it
> > > seems we get an 'aggregate' value here which is not what I think we
> > > want. We rather want to have the ability for drivers to list stats per
> > > individual memory region.
> > >
> > > > +     spin_unlock(&file->table_lock);
> > > > +
> > > > +     print_size(p, "drm-shared-memory", size.shared);
> > > > +     print_size(p, "drm-private-memory", size.private);
> > > > +     print_size(p, "drm-active-memory", size.active);
> > > > +
> > > > +     if (has_status) {
> > > > +             print_size(p, "drm-resident-memory", size.resident);
> > > > +             print_size(p, "drm-purgeable-memory", size.purgeable);
> > > > +     }
> > > > +}
> > > > +
> > > >   /**
> > > >    * drm_fop_show_fdinfo - helper for drm file fops
> > > >    * @seq_file: output stream
> > > > @@ -904,6 +978,8 @@ void drm_fop_show_fdinfo(struct seq_file *m, struct file *f)
> > > >
> > > >       if (dev->driver->show_fdinfo)
> > > >               dev->driver->show_fdinfo(&p, file);
> > > > +
> > > > +     print_memory_stats(&p, file);
> > > >   }
> > > >   EXPORT_SYMBOL(drm_fop_show_fdinfo);
> > > >
> > > > diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
> > > > index dfa995b787e1..e5b40084538f 100644
> > > > --- a/include/drm/drm_file.h
> > > > +++ b/include/drm/drm_file.h
> > > > @@ -41,6 +41,7 @@
> > > >   struct dma_fence;
> > > >   struct drm_file;
> > > >   struct drm_device;
> > > > +struct drm_printer;
> > > >   struct device;
> > > >   struct file;
> > > >
> > > > diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> > > > index 189fd618ca65..213917bb6b11 100644
> > > > --- a/include/drm/drm_gem.h
> > > > +++ b/include/drm/drm_gem.h
> > > > @@ -42,6 +42,14 @@
> > > >   struct iosys_map;
> > > >   struct drm_gem_object;
> > > >
> > > > +/**
> > > > + * enum drm_gem_object_status - bitmask of object state for fdinfo reporting
> > > > + */
> > > > +enum drm_gem_object_status {
> > > > +     DRM_GEM_OBJECT_RESIDENT  = BIT(0),
> > > > +     DRM_GEM_OBJECT_PURGEABLE = BIT(1),
> > > > +};
> > > > +
> > > >   /**
> > > >    * struct drm_gem_object_funcs - GEM object functions
> > > >    */
> > > > @@ -174,6 +182,17 @@ struct drm_gem_object_funcs {
> > > >        */
> > > >       int (*evict)(struct drm_gem_object *obj);
> > > >
> > > > +     /**
> > > > +      * @status:
> > > > +      *
> > > > +      * The optional status callback can return additional object state
> > > > +      * which determines which stats the object is counted against.  The
> > > > +      * callback is called under table_lock.  Racing against object status
> > > > +      * change is "harmless", and the callback can expect to not race
> > > > +      * against object destruction.
> > > > +      */
> > > > +     enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
> > >
> > > > Does this need to be in object funcs? Couldn't it be consolidated at
> > > > the driver level?
> > >
> > > Regards,
> > >
> > > Tvrtko
> > >
> > > > +
> > > >       /**
> > > >        * @vm_ops:
> > > >        *
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter April 12, 2023, 7:18 p.m. UTC | #3
On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> > > On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> > > <tvrtko.ursulin@linux.intel.com> wrote:
> > > >
> > > >
> > > > On 11/04/2023 23:56, Rob Clark wrote:
> > > > > From: Rob Clark <robdclark@chromium.org>
> > > > >
> > > > > Add support to dump GEM stats to fdinfo.
> > > > >
> > > > > v2: Fix typos, change size units to match docs, use div_u64
> > > > > v3: Do it in core
> > > > >
> > > > > Signed-off-by: Rob Clark <robdclark@chromium.org>
> > > > > Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> > > > > ---
> > > > >   Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> > > > >   drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> > > > >   include/drm/drm_file.h                |  1 +
> > > > >   include/drm/drm_gem.h                 | 19 +++++++
> > > > >   4 files changed, 117 insertions(+)
> > > > >
> > > > > diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> > > > > index b46327356e80..b5e7802532ed 100644
> > > > > --- a/Documentation/gpu/drm-usage-stats.rst
> > > > > +++ b/Documentation/gpu/drm-usage-stats.rst
> > > > > @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> > > > >   Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> > > > >   indicating kibi- or mebi-bytes.
> > > > >
> > > > > +- drm-shared-memory: <uint> [KiB|MiB]
> > > > > +
> > > > > +The total size of buffers that are shared with another file (ie. have more
> > > > > +than a single handle).
> > > > > +
> > > > > +- drm-private-memory: <uint> [KiB|MiB]
> > > > > +
> > > > > +The total size of buffers that are not shared with another file.
> > > > > +
> > > > > +- drm-resident-memory: <uint> [KiB|MiB]
> > > > > +
> > > > > +The total size of buffers that are resident in system memory.
> > > >
> > > > I think this naming maybe does not work best with the existing
> > > > drm-memory-<region> keys.
> > >
> > > Actually, it was very deliberate not to conflict with the existing
> > > drm-memory-<region> keys ;-)
> > >
> > > > I would have preferred drm-memory-{active,resident,...} but it
> > > could be mis-parsed by existing userspace so my hands were a bit tied.
> > >
> > > > > How about introducing the concept of a memory region from the start and
> > > > > using naming similar to what we do for engines?
> > > >
> > > > drm-memory-$CATEGORY-$REGION: ...
> > > >
> > > > Then we document a bunch of categories and their semantics, for instance:
> > > >
> > > > 'size' - All reachable objects
> > > > 'shared' - Subset of 'size' with handle_count > 1
> > > > 'resident' - Objects with backing store
> > > > 'active' - Objects in use, subset of resident
> > > > 'purgeable' - Or inactive? Subset of resident.
> > > >
> > > > We keep the same semantics as with process memory accounting (if I got
> > > > it right) which could be desirable for a simplified mental model.
> > > >
> > > > (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> > > > correctly captured this in the first round it should be equivalent to
> > > > > 'resident' above. In any case we can document which category the key
> > > > > without a category is equal to, and that at most one of the two must be output.)
> > > >
> > > > Region names we at most partially standardize. Like we could say
> > > > 'system' is to be used where backing store is system RAM and others are
> > > > driver defined.
> > > >
> > > > Then discrete GPUs could emit N sets of key-values, one for each memory
> > > > region they support.
> > > >
> > > > I think this all also works for objects which can be migrated between
> > > > memory regions. 'Size' accounts them against all regions while for
> > > > 'resident' they only appear in the region of their current placement, etc.
> > >
> > > I'm not too sure how to rectify different memory regions with this,
> > > since drm core doesn't really know about the driver's memory regions.
> > > Perhaps we can go back to this being a helper and drivers with vram
> > > just don't use the helper?  Or??
> >
> > I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> > all works out reasonably consistently?
> 
> That is basically what we have now.  I could append -system to each to
> make things easier to add vram/etc (from a uabi standpoint)..

What you have isn't really -system, but everything. So doesn't really make
sense to me to mark this -system, it's only really true for integrated (if
they don't have stolen or something like that).

Also my comment was more in reply to Tvrtko's suggestion.
-Daniel


> 
> BR,
> -R
> 
> > And ttm could/should perhaps provide a helper to dump the region specific
> > version of this. Or we lift the concept of regions out of ttm a bit
> > higher, that's kinda needed for cgroups eventually anyway I think.
> > -Daniel
> >
> > >
> > > BR,
> > > -R
> > >
> > > > Userspace can aggregate if it wishes to do so but kernel side should not.
> > > >
> > > > > +
> > > > > +- drm-purgeable-memory: <uint> [KiB|MiB]
> > > > > +
> > > > > +The total size of buffers that are purgeable.
> > > > > +
> > > > > +- drm-active-memory: <uint> [KiB|MiB]
> > > > > +
> > > > > +The total size of buffers that are active on one or more rings.
> > > > > +
> > > > >   - drm-cycles-<str> <uint>
> > > > >
> > > > >   Engine identifier string must be the same as the one specified in the
> > > > > diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
> > > > > index 37dfaa6be560..46fdd843bb3a 100644
> > > > > --- a/drivers/gpu/drm/drm_file.c
> > > > > +++ b/drivers/gpu/drm/drm_file.c
> > > > > @@ -42,6 +42,7 @@
> > > > >   #include <drm/drm_client.h>
> > > > >   #include <drm/drm_drv.h>
> > > > >   #include <drm/drm_file.h>
> > > > > +#include <drm/drm_gem.h>
> > > > >   #include <drm/drm_print.h>
> > > > >
> > > > >   #include "drm_crtc_internal.h"
> > > > > @@ -871,6 +872,79 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
> > > > >   }
> > > > >   EXPORT_SYMBOL(drm_send_event);
> > > > >
> > > > > +static void print_size(struct drm_printer *p, const char *stat, size_t sz)
> > > > > +{
> > > > > +     const char *units[] = {"", " KiB", " MiB"};
> > > > > +     unsigned u;
> > > > > +
> > > > > +     for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
> > > > > +             if (sz < SZ_1K)
> > > > > +                     break;
> > > > > +             sz = div_u64(sz, SZ_1K);
> > > > > +     }
> > > > > +
> > > > > +     drm_printf(p, "%s:\t%zu%s\n", stat, sz, units[u]);
> > > > > +}
> > > > > +
> > > > > +static void print_memory_stats(struct drm_printer *p, struct drm_file *file)
> > > > > +{
> > > > > +     struct drm_gem_object *obj;
> > > > > +     struct {
> > > > > +             size_t shared;
> > > > > +             size_t private;
> > > > > +             size_t resident;
> > > > > +             size_t purgeable;
> > > > > +             size_t active;
> > > > > +     } size = {0};
> > > > > +     bool has_status = false;
> > > > > +     int id;
> > > > > +
> > > > > +     spin_lock(&file->table_lock);
> > > > > +     idr_for_each_entry (&file->object_idr, obj, id) {
> > > > > +             enum drm_gem_object_status s = 0;
> > > > > +
> > > > > +             if (obj->funcs && obj->funcs->status) {
> > > > > +                     s = obj->funcs->status(obj);
> > > > > +                     has_status = true;
> > > > > +             }
> > > > > +
> > > > > +             if (obj->handle_count > 1) {
> > > > > +                     size.shared += obj->size;
> > > > > +             } else {
> > > > > +                     size.private += obj->size;
> > > > > +             }
> > > > > +
> > > > > +             if (s & DRM_GEM_OBJECT_RESIDENT) {
> > > > > +                     size.resident += obj->size;
> > > > > +             } else {
> > > > > +                     /* If already purged or not yet backed by pages, don't
> > > > > +                      * count it as purgeable:
> > > > > +                      */
> > > > > +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
> > > >
> > > > Side question - why couldn't resident buffers be purgeable? Did you mean
> > > > for the if branch check to be active here? But then it wouldn't make
> > > > sense for a driver to report active _and_ purgeable..
> > > >
> > > > > +             }
> > > > > +
> > > > > +             if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) {
> > > > > +                     size.active += obj->size;
> > > > > +
> > > > > +                     /* If still active, don't count as purgeable: */
> > > > > +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
> > > >
> > > > Another side question - I guess this tidies a race in reporting? If so
> > > > not sure it matters given the stats are all rather approximate.
> > > >
> > > > > +             }
> > > > > +
> > > > > +             if (s & DRM_GEM_OBJECT_PURGEABLE)
> > > > > +                     size.purgeable += obj->size;
> > > > > +     }
> > > >
> > > > One concern I have here is that it is all based on obj->size. That is,
> > > > there is no provision for drivers to implement page level granularity.
> > > > So correct reporting in use cases such as VM BIND in the future wouldn't
> > > > work unless it was a driver hook to get almost all of the info above. At
> > > > which point common code is just a loop. TBF I don't know if any drivers
> > > > do sub obj->size backing store granularity today, but I think it is
> > > > > something to be sure of before proceeding.
> > > >
> > > > Second concern is what I touched upon in the first reply block - if the
> > > > common code blindly loops over all objects then on discrete GPUs it
> > > > seems we get an 'aggregate' value here which is not what I think we
> > > > want. We rather want to have the ability for drivers to list stats per
> > > > individual memory region.
> > > >
> > > > > +     spin_unlock(&file->table_lock);
> > > > > +
> > > > > +     print_size(p, "drm-shared-memory", size.shared);
> > > > > +     print_size(p, "drm-private-memory", size.private);
> > > > > +     print_size(p, "drm-active-memory", size.active);
> > > > > +
> > > > > +     if (has_status) {
> > > > > +             print_size(p, "drm-resident-memory", size.resident);
> > > > > +             print_size(p, "drm-purgeable-memory", size.purgeable);
> > > > > +     }
> > > > > +}
> > > > > +
> > > > >   /**
> > > > >    * drm_fop_show_fdinfo - helper for drm file fops
> > > > >    * @seq_file: output stream
> > > > > @@ -904,6 +978,8 @@ void drm_fop_show_fdinfo(struct seq_file *m, struct file *f)
> > > > >
> > > > >       if (dev->driver->show_fdinfo)
> > > > >               dev->driver->show_fdinfo(&p, file);
> > > > > +
> > > > > +     print_memory_stats(&p, file);
> > > > >   }
> > > > >   EXPORT_SYMBOL(drm_fop_show_fdinfo);
> > > > >
> > > > > diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
> > > > > index dfa995b787e1..e5b40084538f 100644
> > > > > --- a/include/drm/drm_file.h
> > > > > +++ b/include/drm/drm_file.h
> > > > > @@ -41,6 +41,7 @@
> > > > >   struct dma_fence;
> > > > >   struct drm_file;
> > > > >   struct drm_device;
> > > > > +struct drm_printer;
> > > > >   struct device;
> > > > >   struct file;
> > > > >
> > > > > diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
> > > > > index 189fd618ca65..213917bb6b11 100644
> > > > > --- a/include/drm/drm_gem.h
> > > > > +++ b/include/drm/drm_gem.h
> > > > > @@ -42,6 +42,14 @@
> > > > >   struct iosys_map;
> > > > >   struct drm_gem_object;
> > > > >
> > > > > +/**
> > > > > + * enum drm_gem_object_status - bitmask of object state for fdinfo reporting
> > > > > + */
> > > > > +enum drm_gem_object_status {
> > > > > +     DRM_GEM_OBJECT_RESIDENT  = BIT(0),
> > > > > +     DRM_GEM_OBJECT_PURGEABLE = BIT(1),
> > > > > +};
> > > > > +
> > > > >   /**
> > > > >    * struct drm_gem_object_funcs - GEM object functions
> > > > >    */
> > > > > @@ -174,6 +182,17 @@ struct drm_gem_object_funcs {
> > > > >        */
> > > > >       int (*evict)(struct drm_gem_object *obj);
> > > > >
> > > > > +     /**
> > > > > +      * @status:
> > > > > +      *
> > > > > +      * The optional status callback can return additional object state
> > > > > +      * which determines which stats the object is counted against.  The
> > > > > +      * callback is called under table_lock.  Racing against object status
> > > > > +      * change is "harmless", and the callback can expect to not race
> > > > > +      * against object destruction.
> > > > > +      */
> > > > > +     enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
> > > >
> > > > > Does this need to be in object funcs and couldn't be consolidated to
> > > > driver level?
> > > >
> > > > Regards,
> > > >
> > > > Tvrtko
> > > >
> > > > > +
> > > > >       /**
> > > > >        * @vm_ops:
> > > > >        *
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Tvrtko Ursulin April 13, 2023, 12:58 p.m. UTC | #4
On 12/04/2023 20:18, Daniel Vetter wrote:
> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>
>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
>>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>>
>>>>>
>>>>> On 11/04/2023 23:56, Rob Clark wrote:
>>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>>
>>>>>> Add support to dump GEM stats to fdinfo.
>>>>>>
>>>>>> v2: Fix typos, change size units to match docs, use div_u64
>>>>>> v3: Do it in core
>>>>>>
>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
>>>>>> ---
>>>>>>    Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
>>>>>>    drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
>>>>>>    include/drm/drm_file.h                |  1 +
>>>>>>    include/drm/drm_gem.h                 | 19 +++++++
>>>>>>    4 files changed, 117 insertions(+)
>>>>>>
>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
>>>>>> index b46327356e80..b5e7802532ed 100644
>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
>>>>>>    Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
>>>>>>    indicating kibi- or mebi-bytes.
>>>>>>
>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
>>>>>> +
>>>>>> +The total size of buffers that are shared with another file (ie. have more
>>>>>> +than a single handle).
>>>>>> +
>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
>>>>>> +
>>>>>> +The total size of buffers that are not shared with another file.
>>>>>> +
>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
>>>>>> +
>>>>>> +The total size of buffers that are resident in system memory.
>>>>>
>>>>> I think this naming maybe does not work best with the existing
>>>>> drm-memory-<region> keys.
>>>>
>>>> Actually, it was very deliberate not to conflict with the existing
>>>> drm-memory-<region> keys ;-)
>>>>
>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
>>>>
>>>>> How about introduce the concept of a memory region from the start and
>>>>> use naming similar like we do for engines?
>>>>>
>>>>> drm-memory-$CATEGORY-$REGION: ...
>>>>>
>>>>> Then we document a bunch of categories and their semantics, for instance:
>>>>>
>>>>> 'size' - All reachable objects
>>>>> 'shared' - Subset of 'size' with handle_count > 1
>>>>> 'resident' - Objects with backing store
>>>>> 'active' - Objects in use, subset of resident
>>>>> 'purgeable' - Or inactive? Subset of resident.
>>>>>
>>>>> We keep the same semantics as with process memory accounting (if I got
>>>>> it right) which could be desirable for a simplified mental model.
>>>>>
>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
>>>>> correctly captured this in the first round it should be equivalent to
>>>>> 'resident' above. In any case we can document no category is equal to
>>>>> which category, and at most one of the two must be output.)
>>>>>
>>>>> Region names we at most partially standardize. Like we could say
>>>>> 'system' is to be used where backing store is system RAM and others are
>>>>> driver defined.
>>>>>
>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
>>>>> region they support.
>>>>>
>>>>> I think this all also works for objects which can be migrated between
>>>>> memory regions. 'Size' accounts them against all regions while for
>>>>> 'resident' they only appear in the region of their current placement, etc.
>>>>
>>>> I'm not too sure how to rectify different memory regions with this,
>>>> since drm core doesn't really know about the driver's memory regions.
>>>> Perhaps we can go back to this being a helper and drivers with vram
>>>> just don't use the helper?  Or??
>>>
>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
>>> all works out reasonably consistently?
>>
>> That is basically what we have now.  I could append -system to each to
>> make things easier to add vram/etc (from a uabi standpoint)..
> 
> What you have isn't really -system, but everything. So doesn't really make
> sense to me to mark this -system, it's only really true for integrated (if
> they don't have stolen or something like that).
> 
> Also my comment was more in reply to Tvrtko's suggestion.

Right so my proposal was drm-memory-$CATEGORY-$REGION which I think 
aligns with the current drm-memory-$REGION by extending, rather than 
creating confusion with different order of key name components.

AMD currently has (among others) drm-memory-vram, which we could define 
in the spec maps to category X, if category component is not present.

Some examples:

drm-memory-resident-system:
drm-memory-size-lmem0:
drm-memory-active-vram:

Etc.. I think it creates a consistent story.

Other than this, my two (I think significant) open questions which haven't 
been addressed yet are:

1)

Why do we want totals (not per region) when userspace can trivially 
aggregate if they want? What is the use case?

2)

Current proposal limits the value to whole objects and fixates that by 
having it in the common code. If/when some driver is able to support 
sub-BO granularity they will need to opt out of the common printer at 
which point it may be less churn to start with a helper rather than 
mid-layer. Or maybe some drivers already support this, I don't know. 
Given how important VM BIND is I wouldn't be surprised.

Regards,

Tvrtko

>>> And ttm could/should perhaps provide a helper to dump the region specific
>>> version of this. Or we lift the concept of regions out of ttm a bit
>>> higher, that's kinda needed for cgroups eventually anyway I think.
>>> -Daniel
>>>
>>>>
>>>> BR,
>>>> -R
>>>>
>>>>> Userspace can aggregate if it wishes to do so but kernel side should not.
>>>>>
>>>>>> +
>>>>>> +- drm-purgeable-memory: <uint> [KiB|MiB]
>>>>>> +
>>>>>> +The total size of buffers that are purgeable.
>>>>>> +
>>>>>> +- drm-active-memory: <uint> [KiB|MiB]
>>>>>> +
>>>>>> +The total size of buffers that are active on one or more rings.
>>>>>> +
>>>>>>    - drm-cycles-<str> <uint>
>>>>>>
>>>>>>    Engine identifier string must be the same as the one specified in the
>>>>>> diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
>>>>>> index 37dfaa6be560..46fdd843bb3a 100644
>>>>>> --- a/drivers/gpu/drm/drm_file.c
>>>>>> +++ b/drivers/gpu/drm/drm_file.c
>>>>>> @@ -42,6 +42,7 @@
>>>>>>    #include <drm/drm_client.h>
>>>>>>    #include <drm/drm_drv.h>
>>>>>>    #include <drm/drm_file.h>
>>>>>> +#include <drm/drm_gem.h>
>>>>>>    #include <drm/drm_print.h>
>>>>>>
>>>>>>    #include "drm_crtc_internal.h"
>>>>>> @@ -871,6 +872,79 @@ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
>>>>>>    }
>>>>>>    EXPORT_SYMBOL(drm_send_event);
>>>>>>
>>>>>> +static void print_size(struct drm_printer *p, const char *stat, size_t sz)
>>>>>> +{
>>>>>> +     const char *units[] = {"", " KiB", " MiB"};
>>>>>> +     unsigned u;
>>>>>> +
>>>>>> +     for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
>>>>>> +             if (sz < SZ_1K)
>>>>>> +                     break;
>>>>>> +             sz = div_u64(sz, SZ_1K);
>>>>>> +     }
>>>>>> +
>>>>>> +     drm_printf(p, "%s:\t%zu%s\n", stat, sz, units[u]);
>>>>>> +}
>>>>>> +
>>>>>> +static void print_memory_stats(struct drm_printer *p, struct drm_file *file)
>>>>>> +{
>>>>>> +     struct drm_gem_object *obj;
>>>>>> +     struct {
>>>>>> +             size_t shared;
>>>>>> +             size_t private;
>>>>>> +             size_t resident;
>>>>>> +             size_t purgeable;
>>>>>> +             size_t active;
>>>>>> +     } size = {0};
>>>>>> +     bool has_status = false;
>>>>>> +     int id;
>>>>>> +
>>>>>> +     spin_lock(&file->table_lock);
>>>>>> +     idr_for_each_entry (&file->object_idr, obj, id) {
>>>>>> +             enum drm_gem_object_status s = 0;
>>>>>> +
>>>>>> +             if (obj->funcs && obj->funcs->status) {
>>>>>> +                     s = obj->funcs->status(obj);
>>>>>> +                     has_status = true;
>>>>>> +             }
>>>>>> +
>>>>>> +             if (obj->handle_count > 1) {
>>>>>> +                     size.shared += obj->size;
>>>>>> +             } else {
>>>>>> +                     size.private += obj->size;
>>>>>> +             }
>>>>>> +
>>>>>> +             if (s & DRM_GEM_OBJECT_RESIDENT) {
>>>>>> +                     size.resident += obj->size;
>>>>>> +             } else {
>>>>>> +                     /* If already purged or not yet backed by pages, don't
>>>>>> +                      * count it as purgeable:
>>>>>> +                      */
>>>>>> +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
>>>>>
>>>>> Side question - why couldn't resident buffers be purgeable? Did you mean
>>>>> for the if branch check to be active here? But then it wouldn't make
>>>>> sense for a driver to report active _and_ purgeable..
>>>>>
>>>>>> +             }
>>>>>> +
>>>>>> +             if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) {
>>>>>> +                     size.active += obj->size;
>>>>>> +
>>>>>> +                     /* If still active, don't count as purgeable: */
>>>>>> +                     s &= ~DRM_GEM_OBJECT_PURGEABLE;
>>>>>
>>>>> Another side question - I guess this tidies a race in reporting? If so
>>>>> not sure it matters given the stats are all rather approximate.
>>>>>
>>>>>> +             }
>>>>>> +
>>>>>> +             if (s & DRM_GEM_OBJECT_PURGEABLE)
>>>>>> +                     size.purgeable += obj->size;
>>>>>> +     }
>>>>>
>>>>> One concern I have here is that it is all based on obj->size. That is,
>>>>> there is no provision for drivers to implement page level granularity.
>>>>> So correct reporting in use cases such as VM BIND in the future wouldn't
>>>>> work unless it was a driver hook to get almost all of the info above. At
>>>>> which point common code is just a loop. TBF I don't know if any drivers
>>>>> do sub obj->size backing store granularity today, but I think it is
>>>>> something to be sure of before proceeding.
>>>>>
>>>>> Second concern is what I touched upon in the first reply block - if the
>>>>> common code blindly loops over all objects then on discrete GPUs it
>>>>> seems we get an 'aggregate' value here which is not what I think we
>>>>> want. We rather want to have the ability for drivers to list stats per
>>>>> individual memory region.
>>>>>
>>>>>> +     spin_unlock(&file->table_lock);
>>>>>> +
>>>>>> +     print_size(p, "drm-shared-memory", size.shared);
>>>>>> +     print_size(p, "drm-private-memory", size.private);
>>>>>> +     print_size(p, "drm-active-memory", size.active);
>>>>>> +
>>>>>> +     if (has_status) {
>>>>>> +             print_size(p, "drm-resident-memory", size.resident);
>>>>>> +             print_size(p, "drm-purgeable-memory", size.purgeable);
>>>>>> +     }
>>>>>> +}
>>>>>> +
>>>>>>    /**
>>>>>>     * drm_fop_show_fdinfo - helper for drm file fops
>>>>>>     * @seq_file: output stream
>>>>>> @@ -904,6 +978,8 @@ void drm_fop_show_fdinfo(struct seq_file *m, struct file *f)
>>>>>>
>>>>>>        if (dev->driver->show_fdinfo)
>>>>>>                dev->driver->show_fdinfo(&p, file);
>>>>>> +
>>>>>> +     print_memory_stats(&p, file);
>>>>>>    }
>>>>>>    EXPORT_SYMBOL(drm_fop_show_fdinfo);
>>>>>>
>>>>>> diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
>>>>>> index dfa995b787e1..e5b40084538f 100644
>>>>>> --- a/include/drm/drm_file.h
>>>>>> +++ b/include/drm/drm_file.h
>>>>>> @@ -41,6 +41,7 @@
>>>>>>    struct dma_fence;
>>>>>>    struct drm_file;
>>>>>>    struct drm_device;
>>>>>> +struct drm_printer;
>>>>>>    struct device;
>>>>>>    struct file;
>>>>>>
>>>>>> diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
>>>>>> index 189fd618ca65..213917bb6b11 100644
>>>>>> --- a/include/drm/drm_gem.h
>>>>>> +++ b/include/drm/drm_gem.h
>>>>>> @@ -42,6 +42,14 @@
>>>>>>    struct iosys_map;
>>>>>>    struct drm_gem_object;
>>>>>>
>>>>>> +/**
>>>>>> + * enum drm_gem_object_status - bitmask of object state for fdinfo reporting
>>>>>> + */
>>>>>> +enum drm_gem_object_status {
>>>>>> +     DRM_GEM_OBJECT_RESIDENT  = BIT(0),
>>>>>> +     DRM_GEM_OBJECT_PURGEABLE = BIT(1),
>>>>>> +};
>>>>>> +
>>>>>>    /**
>>>>>>     * struct drm_gem_object_funcs - GEM object functions
>>>>>>     */
>>>>>> @@ -174,6 +182,17 @@ struct drm_gem_object_funcs {
>>>>>>         */
>>>>>>        int (*evict)(struct drm_gem_object *obj);
>>>>>>
>>>>>> +     /**
>>>>>> +      * @status:
>>>>>> +      *
>>>>>> +      * The optional status callback can return additional object state
>>>>>> +      * which determines which stats the object is counted against.  The
>>>>>> +      * callback is called under table_lock.  Racing against object status
>>>>>> +      * change is "harmless", and the callback can expect to not race
>>>>>> +      * against object destruction.
>>>>>> +      */
>>>>>> +     enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
>>>>>
>>>>> Does this need to be in object funcs and couldn't be consolidated to
>>>>> driver level?
>>>>>
>>>>> Regards,
>>>>>
>>>>> Tvrtko
>>>>>
>>>>>> +
>>>>>>        /**
>>>>>>         * @vm_ops:
>>>>>>         *
>>>
>>> --
>>> Daniel Vetter
>>> Software Engineer, Intel Corporation
>>> http://blog.ffwll.ch
>
Tvrtko Ursulin April 14, 2023, 8:57 a.m. UTC | #5
On 13/04/2023 21:05, Daniel Vetter wrote:
> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
>>
>> On 13/04/2023 14:27, Daniel Vetter wrote:
>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>
>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>>>>>>
>>>>>>>>>> Add support to dump GEM stats to fdinfo.
>>>>>>>>>>
>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
>>>>>>>>>> v3: Do it in core
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
>>>>>>>>>> ---
>>>>>>>>>>      Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
>>>>>>>>>>      drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
>>>>>>>>>>      include/drm/drm_file.h                |  1 +
>>>>>>>>>>      include/drm/drm_gem.h                 | 19 +++++++
>>>>>>>>>>      4 files changed, 117 insertions(+)
>>>>>>>>>>
>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
>>>>>>>>>>      Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
>>>>>>>>>>      indicating kibi- or mebi-bytes.
>>>>>>>>>>
>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
>>>>>>>>>> +
>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
>>>>>>>>>> +than a single handle).
>>>>>>>>>> +
>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
>>>>>>>>>> +
>>>>>>>>>> +The total size of buffers that are not shared with another file.
>>>>>>>>>> +
>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
>>>>>>>>>> +
>>>>>>>>>> +The total size of buffers that are resident in system memory.
>>>>>>>>>
>>>>>>>>> I think this naming maybe does not work best with the existing
>>>>>>>>> drm-memory-<region> keys.
>>>>>>>>
>>>>>>>> Actually, it was very deliberate not to conflict with the existing
>>>>>>>> drm-memory-<region> keys ;-)
>>>>>>>>
>>>>>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
>>>>>>>>
>>>>>>>>> How about introduce the concept of a memory region from the start and
>>>>>>>>> use naming similar like we do for engines?
>>>>>>>>>
>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
>>>>>>>>>
>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
>>>>>>>>>
>>>>>>>>> 'size' - All reachable objects
>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
>>>>>>>>> 'resident' - Objects with backing store
>>>>>>>>> 'active' - Objects in use, subset of resident
>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
>>>>>>>>>
>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
>>>>>>>>> it right) which could be desirable for a simplified mental model.
>>>>>>>>>
>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
>>>>>>>>> correctly captured this in the first round it should be equivalent to
>>>>>>>>> 'resident' above. In any case we can document no category is equal to
>>>>>>>>> which category, and at most one of the two must be output.)
>>>>>>>>>
>>>>>>>>> Region names we at most partially standardize. Like we could say
>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
>>>>>>>>> driver defined.
>>>>>>>>>
>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
>>>>>>>>> region they support.
>>>>>>>>>
>>>>>>>>> I think this all also works for objects which can be migrated between
>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
>>>>>>>>
>>>>>>>> I'm not too sure how to rectify different memory regions with this,
>>>>>>>> since drm core doesn't really know about the driver's memory regions.
>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
>>>>>>>> just don't use the helper?  Or??
>>>>>>>
>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
>>>>>>> all works out reasonably consistently?
>>>>>>
>>>>>> That is basically what we have now.  I could append -system to each to
>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
>>>>>
>>>>> What you have isn't really -system, but everything. So doesn't really make
>>>>> sense to me to mark this -system, it's only really true for integrated (if
>>>>> they don't have stolen or something like that).
>>>>>
>>>>> Also my comment was more in reply to Tvrtko's suggestion.
>>>>
>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
>>>> with the current drm-memory-$REGION by extending, rather than creating
>>>> confusion with different order of key name components.
>>>
>>> Oh my comment was pretty much just bikeshed, in case someone creates a
>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
>>> So $CATEGORY before the -memory.
>>>
>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
>>> folks like :-) I don't really care much personally.
>>
>> Okay I missed the parsing problem.
>>
>>>> AMD currently has (among others) drm-memory-vram, which we could define in
>>>> the spec maps to category X, if category component is not present.
>>>>
>>>> Some examples:
>>>>
>>>> drm-memory-resident-system:
>>>> drm-memory-size-lmem0:
>>>> drm-memory-active-vram:
>>>>
>>>> Etc.. I think it creates a consistent story.
>>>>
>>>> Other than this, my two I think significant opens which haven't been
>>>> addressed yet are:
>>>>
>>>> 1)
>>>>
>>>> Why do we want totals (not per region) when userspace can trivially
>>>> aggregate if they want. What is the use case?
>>>>
>>>> 2)
>>>>
>>>> Current proposal limits the value to whole objects and fixates that by
>>>> having it in the common code. If/when some driver is able to support sub-BO
>>>> granularity they will need to opt out of the common printer at which point
>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
>>>> some drivers already support this, I don't know. Given how important VM BIND
>>>> is I wouldn't be surprised.
>>>
>>> I feel like for drivers using ttm we want a ttm helper which takes care of
>>> the region printing in hopefully a standard way. And that could then also
>>> take care of all kinds of partial binding and funny rules (like maybe
>>> we want a standard vram region that adds up all the lmem regions on
>>> intel, so that all dgpu have a common vram bucket that generic tools
>>> understand?).
>>
>> First part yes, but for the second I would think we want to avoid any
>> aggregation in the kernel which can be done in userspace just as well. Such
>> total vram bucket would be pretty useless on Intel even since userspace
>> needs to be region aware to make use of all resources. It could even be
>> counter productive I think - "why am I getting out of memory when half of my
>> vram is unused!?".
> 
> This is not for intel-aware userspace. This is for fairly generic "gputop"
> style userspace, which might simply have no clue or interest in what lmemX
> means, but would understand vram.
> 
> Aggregating makes sense.

Lmem vs vram is now an argument not about aggregation but about 
standardizing region names.

One detail also is a change in philosophy compared to engine stats where 
engine names are not centrally prescribed and it was expected userspace 
will have to handle things generically and with some vendor specific 
knowledge.

Like in my gputop patches. It doesn't need to understand what is what, 
it just finds what's there and presents it to the user.

Come some accel driver with local memory it wouldn't be vram any more. 
Or even a headless data center GPU. So I really don't think it is good 
to hardcode 'vram' in the spec, or midlayer, or helpers.

And for aggregation.. again, userspace can do it just as well. If we do 
it in kernel then immediately we have multiple sets of keys to output 
for any driver which wants to show the region view. IMO it is just 
pointless work in the kernel and more code in the kernel, when userspace 
can do it.

Proposal A (on a discrete gpu, one category only):

drm-resident-memory: x KiB
drm-resident-memory-system: x KiB
drm-resident-memory-vram: x KiB

Two loops in the kernel, more parsing in userspace.

Proposal B:

drm-resident-memory-system: x KiB
drm-resident-memory-vram: x KiB

Can be one loop, one helper, less text for userspace to parse and it can 
still trivially show the total if so desired.

For instance a helper (or two) with a common struct containing region 
names and totals, where a callback into the driver tallies under each 
region, as the drm helper is walking objects.

>>> It does mean we walk the bo list twice, but *shrug*. People have been
>>> complaining about procutils for decades, they're still horrible, I think
>>> walking bo lists twice internally in the ttm case is going to be ok. If
>>> not, it's internals, we can change them again.
>>>
>>> Also I'd lean a lot more towards making ttm a helper and not putting that
>>> into core, exactly because it's pretty clear we'll need more flexibility
>>> when it comes to accurate stats for multi-region drivers.
>>
>> Exactly.
>>
>>> But for a first "how much gpu space does this app use" across everything I
>>> think this is a good enough starting point.
>>
>> Okay so we agree this would be better as a helper and not in the core.
> 
> Nope, if you mean with this = Rob's patch. I was talking about a
> hypothetical region-aware extension for ttm-using drivers.
> 
>> On the point are keys/semantics good enough as a starting point I am still
>> not convinced kernel should aggregate and that instead we should start from
>> day one by appending -system (or something) to Rob's proposed keys.
> 
> It should imo. Inflicting driver knowledge on generic userspace makes not
> much sense, we should start with the more generally useful stuff imo.
> That's why there's the drm fdinfo spec and all that so it's not a
> free-for-all.
> 
> Also Rob's stuff is _not_ system. Check on a i915 dgpu if you want :-)

I am well aware it adds up everything, that is beside the point.

Drm-usage-stats.rst text needs to be more precise across all keys at least:

+- drm-resident-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are resident in system memory.

But as said, I don't see the point in providing aggregated values.

Regards,

Tvrtko
Daniel Vetter April 16, 2023, 7:48 a.m. UTC | #6
On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
> >
> >
> > On 13/04/2023 21:05, Daniel Vetter wrote:
> > > On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
> > >>
> > >> On 13/04/2023 14:27, Daniel Vetter wrote:
> > >>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
> > >>>>
> > >>>> On 12/04/2023 20:18, Daniel Vetter wrote:
> > >>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
> > >>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>>>>>>
> > >>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> > >>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> > >>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
> > >>>>>>>>>
> > >>>>>>>>>
> > >>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
> > >>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
> > >>>>>>>>>>
> > >>>>>>>>>> Add support to dump GEM stats to fdinfo.
> > >>>>>>>>>>
> > >>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
> > >>>>>>>>>> v3: Do it in core
> > >>>>>>>>>>
> > >>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> > >>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> > >>>>>>>>>> ---
> > >>>>>>>>>>      Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> > >>>>>>>>>>      drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> > >>>>>>>>>>      include/drm/drm_file.h                |  1 +
> > >>>>>>>>>>      include/drm/drm_gem.h                 | 19 +++++++
> > >>>>>>>>>>      4 files changed, 117 insertions(+)
> > >>>>>>>>>>
> > >>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>> index b46327356e80..b5e7802532ed 100644
> > >>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> > >>>>>>>>>>      Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> > >>>>>>>>>>      indicating kibi- or mebi-bytes.
> > >>>>>>>>>>
> > >>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
> > >>>>>>>>>> +
> > >>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
> > >>>>>>>>>> +than a single handle).
> > >>>>>>>>>> +
> > >>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
> > >>>>>>>>>> +
> > >>>>>>>>>> +The total size of buffers that are not shared with another file.
> > >>>>>>>>>> +
> > >>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
> > >>>>>>>>>> +
> > >>>>>>>>>> +The total size of buffers that are resident in system memory.
> > >>>>>>>>>
> > >>>>>>>>> I think this naming maybe does not work best with the existing
> > >>>>>>>>> drm-memory-<region> keys.
> > >>>>>>>>
> > >>>>>>>> Actually, it was very deliberate not to conflict with the existing
> > >>>>>>>> drm-memory-<region> keys ;-)
> > >>>>>>>>
> > >>>>>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
> > >>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
> > >>>>>>>>
> > >>>>>>>>> How about introduce the concept of a memory region from the start and
> > >>>>>>>>> use naming similar like we do for engines?
> > >>>>>>>>>
> > >>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
> > >>>>>>>>>
> > >>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
> > >>>>>>>>>
> > >>>>>>>>> 'size' - All reachable objects
> > >>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
> > >>>>>>>>> 'resident' - Objects with backing store
> > >>>>>>>>> 'active' - Objects in use, subset of resident
> > >>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
> > >>>>>>>>>
> > >>>>>>>>> We keep the same semantics as with process memory accounting (if I got
> > >>>>>>>>> it right) which could be desirable for a simplified mental model.
> > >>>>>>>>>
> > >>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> > >>>>>>>>> correctly captured this in the first round it should be equivalent to
> > >>>>>>>>> 'resident' above. In any case we can document no category is equal to
> > >>>>>>>>> which category, and at most one of the two must be output.)
> > >>>>>>>>>
> > >>>>>>>>> Region names we at most partially standardize. Like we could say
> > >>>>>>>>> 'system' is to be used where backing store is system RAM and others are
> > >>>>>>>>> driver defined.
> > >>>>>>>>>
> > >>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
> > >>>>>>>>> region they support.
> > >>>>>>>>>
> > >>>>>>>>> I think this all also works for objects which can be migrated between
> > >>>>>>>>> memory regions. 'Size' accounts them against all regions while for
> > >>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
> > >>>>>>>>
> > >>>>>>>> I'm not too sure how to rectify different memory regions with this,
> > >>>>>>>> since drm core doesn't really know about the driver's memory regions.
> > >>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
> > >>>>>>>> just don't use the helper?  Or??
> > >>>>>>>
> > >>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> > >>>>>>> all works out reasonably consistently?
> > >>>>>>
> > >>>>>> That is basically what we have now.  I could append -system to each to
> > >>>>>> make things easier to add vram/etc (from a uabi standpoint)..
> > >>>>>
> > >>>>> What you have isn't really -system, but everything. So doesn't really make
> > >>>>> sense to me to mark this -system, it's only really true for integrated (if
> > >>>>> they don't have stolen or something like that).
> > >>>>>
> > >>>>> Also my comment was more in reply to Tvrtko's suggestion.
> > >>>>
> > >>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
> > >>>> with the current drm-memory-$REGION by extending, rather than creating
> > >>>> confusion with different order of key name components.
> > >>>
> > >>> Oh my comment was pretty much just bikeshed, in case someone creates a
> > >>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
> > >>> So $CATEGORY before the -memory.
> > >>>
> > >>> Otoh I don't think that'll happen, so I guess we can go with whatever more
> > >>> folks like :-) I don't really care much personally.
> > >>
> > >> Okay I missed the parsing problem.
> > >>
> > >>>> AMD currently has (among others) drm-memory-vram, which we could define in
> > >>>> the spec maps to category X, if category component is not present.
> > >>>>
> > >>>> Some examples:
> > >>>>
> > >>>> drm-memory-resident-system:
> > >>>> drm-memory-size-lmem0:
> > >>>> drm-memory-active-vram:
> > >>>>
> > >>>> Etc.. I think it creates a consistent story.
> > >>>>
> > >>>> Other than this, my two I think significant opens which haven't been
> > >>>> addressed yet are:
> > >>>>
> > >>>> 1)
> > >>>>
> > >>>> Why do we want totals (not per region) when userspace can trivially
> > >>>> aggregate if they want. What is the use case?
> > >>>>
> > >>>> 2)
> > >>>>
> > >>>> Current proposal limits the value to whole objects and fixates that by
> > >>>> having it in the common code. If/when some driver is able to support sub-BO
> > >>>> granularity they will need to opt out of the common printer at which point
> > >>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
> > >>>> some drivers already support this, I don't know. Given how important VM BIND
> > >>>> is I wouldn't be surprised.
> > >>>
> > >>> I feel like for drivers using ttm we want a ttm helper which takes care of
> > >>> the region printing in hopefully a standard way. And that could then also
> > >>> take care of all kinds of partial binding and funny rules (like maybe
> > >>> we want a standard vram region that adds up all the lmem regions on
> > >>> intel, so that all dgpu have a common vram bucket that generic tools
> > >>> understand?).
> > >>
> > >> First part yes, but for the second I would think we want to avoid any
> > >> aggregation in the kernel which can be done in userspace just as well. Such
> > >> total vram bucket would be pretty useless on Intel even since userspace
> > >> needs to be region aware to make use of all resources. It could even be
> > >> counter productive I think - "why am I getting out of memory when half of my
> > >> vram is unused!?".
> > >
> > > This is not for intel-aware userspace. This is for fairly generic "gputop"
> > > style userspace, which might simply have no clue or interest in what lmemX
> > > means, but would understand vram.
> > >
> > > Aggregating makes sense.
> >
> > Lmem vs vram is now an argument not about aggregation but about
> > standardizing regions names.
> >
> > One detail also is a change in philosophy compared to engine stats where
> > engine names are not centrally prescribed and it was expected userspace
> > will have to handle things generically and with some vendor specific
> > knowledge.
> >
> > Like in my gputop patches. It doesn't need to understand what is what,
> > it just finds what's there and presents it to the user.
> >
> > Come some accel driver with local memory it wouldn't be vram any more.
> > Or even a headless data center GPU. So I really don't think it is good
> > to hardcode 'vram' in the spec, or midlayer, or helpers.
> >
> > And for aggregation.. again, userspace can do it just as well. If we do
> > it in kernel then immediately we have multiple sets of keys to output
> > for any driver which wants to show the region view. IMO it is just
> > pointless work in the kernel and more code in the kernel, when userspace
> > can do it.
> >
> > Proposal A (on a discrete gpu, one category only):
> >
> > drm-resident-memory: x KiB
> > drm-resident-memory-system: x KiB
> > drm-resident-memory-vram: x KiB
> >
> > Two loops in the kernel, more parsing in userspace.
> 
> why would it be more than one loop, ie.
> 
>     mem.resident += size;
>     mem.category[cat].resident += size;
> 
> At the end of the day, there is limited real-estate to show a million
> different columns of information.  Even the gputop patches I posted
> don't show everything of what is currently there.  And nvtop only
> shows toplevel resident stat.  So I think the "everything" stat is
> going to be what most tools use.

Yeah with enough finesse the double-loop isn't needed, it's just the
simplest possible approach.

Also this is fdinfo, I _really_ want perf data showing that it's a
real-world problem when we conjecture about algorithmic complexity.
procutils have been algorithmically garbage since decades after all :-)

Cheers, Daniel

> 
> BR,
> -R
> 
> > Proposal B:
> >
> > drm-resident-memory-system: x KiB
> > drm-resident-memory-vram: x KiB
> >
> > Can be one loop, one helper, less text for userspace to parse and it can
> > still trivially show the total if so desired.
> >
> > For instance a helper (or two) with a common struct containing region
> > names and totals, where a callback into the driver tallies under each
> > region, as the drm helper is walking objects.
> >
> > >>> It does mean we walk the bo list twice, but *shrug*. People have been
> > >>> complaining about procutils for decades, they're still horrible, I think
> > >>> walking bo lists twice internally in the ttm case is going to be ok. If
> > >>> not, it's internals, we can change them again.
> > >>>
> > >>> Also I'd lean a lot more towards making ttm a helper and not putting that
> > >>> into core, exactly because it's pretty clear we'll need more flexibility
> > >>> when it comes to accurate stats for multi-region drivers.
> > >>
> > >> Exactly.
> > >>
> > >>> But for a first "how much gpu space does this app use" across everything I
> > >>> think this is a good enough starting point.
> > >>
> > >> Okay so we agree this would be better as a helper and not in the core.
> > >
> > > Nope, if you mean with this = Rob's patch. I was talking about a
> > > hypothetical region-aware extension for ttm-using drivers.
> > >
> > >> On the point are keys/semantics good enough as a starting point I am still
> > >> not convinced kernel should aggregate and that instead we should start from
> > >> day one by appending -system (or something) to Rob's proposed keys.
> > >
> > > It should imo. Inflicting driver knowledge on generic userspace makes not
> > > much sense, we should start with the more generally useful stuff imo.
> > > That's why there's the drm fdinfo spec and all that so it's not a
> > > free-for-all.
> > >
> > > Also Rob's stuff is _not_ system. Check on a i915 dgpu if you want :-)
> >
> > I am well aware it adds up everything, that is beside the point.
> >
> > Drm-usage-stats.rst text needs to be more precise across all keys at least:
> >
> > +- drm-resident-memory: <uint> [KiB|MiB]
> > +
> > +The total size of buffers that are resident in system memory.
> >
> > But as said, I don't see the point in providing aggregated values.
> >
> > Regards,
> >
> > Tvrtko
Tvrtko Ursulin April 17, 2023, 11:10 a.m. UTC | #7
On 16/04/2023 08:48, Daniel Vetter wrote:
> On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
>> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>
>>>
>>> On 13/04/2023 21:05, Daniel Vetter wrote:
>>>> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
>>>>>
>>>>> On 13/04/2023 14:27, Daniel Vetter wrote:
>>>>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
>>>>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
>>>>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>>>>
>>>>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
>>>>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
>>>>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
>>>>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>>>>>>>>>
>>>>>>>>>>>>> Add support to dump GEM stats to fdinfo.
>>>>>>>>>>>>>
>>>>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
>>>>>>>>>>>>> v3: Do it in core
>>>>>>>>>>>>>
>>>>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
>>>>>>>>>>>>> ---
>>>>>>>>>>>>>       Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
>>>>>>>>>>>>>       drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
>>>>>>>>>>>>>       include/drm/drm_file.h                |  1 +
>>>>>>>>>>>>>       include/drm/drm_gem.h                 | 19 +++++++
>>>>>>>>>>>>>       4 files changed, 117 insertions(+)
>>>>>>>>>>>>>
>>>>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
>>>>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
>>>>>>>>>>>>>       Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
>>>>>>>>>>>>>       indicating kibi- or mebi-bytes.
>>>>>>>>>>>>>
>>>>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
>>>>>>>>>>>>> +than a single handle).
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +The total size of buffers that are not shared with another file.
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +The total size of buffers that are resident in system memory.
>>>>>>>>>>>>
>>>>>>>>>>>> I think this naming maybe does not work best with the existing
>>>>>>>>>>>> drm-memory-<region> keys.
>>>>>>>>>>>
>>>>>>>>>>> Actually, it was very deliberate not to conflict with the existing
>>>>>>>>>>> drm-memory-<region> keys ;-)
>>>>>>>>>>>
>>>>>>>>>>> I would have preferred drm-memory-{active,resident,...} but it
>>>>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
>>>>>>>>>>>
>>>>>>>>>>>> How about introduce the concept of a memory region from the start and
>>>>>>>>>>>> use naming similar like we do for engines?
>>>>>>>>>>>>
>>>>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
>>>>>>>>>>>>
>>>>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
>>>>>>>>>>>>
>>>>>>>>>>>> 'size' - All reachable objects
>>>>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
>>>>>>>>>>>> 'resident' - Objects with backing store
>>>>>>>>>>>> 'active' - Objects in use, subset of resident
>>>>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
>>>>>>>>>>>>
>>>>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
>>>>>>>>>>>> it right) which could be desirable for a simplified mental model.
>>>>>>>>>>>>
>>>>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
>>>>>>>>>>>> correctly captured this in the first round it should be equivalent to
>>>>>>>>>>>> 'resident' above. In any case we can document no category is equal to
>>>>>>>>>>>> which category, and at most one of the two must be output.)
>>>>>>>>>>>>
>>>>>>>>>>>> Region names we at most partially standardize. Like we could say
>>>>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
>>>>>>>>>>>> driver defined.
>>>>>>>>>>>>
>>>>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
>>>>>>>>>>>> region they support.
>>>>>>>>>>>>
>>>>>>>>>>>> I think this all also works for objects which can be migrated between
>>>>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
>>>>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
>>>>>>>>>>>
>>>>>>>>>>> I'm not too sure how to rectify different memory regions with this,
>>>>>>>>>>> since drm core doesn't really know about the driver's memory regions.
>>>>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
>>>>>>>>>>> just don't use the helper?  Or??
>>>>>>>>>>
>>>>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
>>>>>>>>>> all works out reasonably consistently?
>>>>>>>>>
>>>>>>>>> That is basically what we have now.  I could append -system to each to
>>>>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
>>>>>>>>
>>>>>>>> What you have isn't really -system, but everything. So doesn't really make
>>>>>>>> sense to me to mark this -system, it's only really true for integrated (if
>>>>>>>> they don't have stolen or something like that).
>>>>>>>>
>>>>>>>> Also my comment was more in reply to Tvrtko's suggestion.
>>>>>>>
>>>>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
>>>>>>> with the current drm-memory-$REGION by extending, rather than creating
>>>>>>> confusion with different order of key name components.
>>>>>>
>>>>>> Oh my comment was pretty much just bikeshed, in case someone creates a
>>>>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
>>>>>> So $CATEGORY before the -memory.
>>>>>>
>>>>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
>>>>>> folks like :-) I don't really care much personally.
>>>>>
>>>>> Okay I missed the parsing problem.
>>>>>
>>>>>>> AMD currently has (among others) drm-memory-vram, which we could define in
>>>>>>> the spec maps to category X, if category component is not present.
>>>>>>>
>>>>>>> Some examples:
>>>>>>>
>>>>>>> drm-memory-resident-system:
>>>>>>> drm-memory-size-lmem0:
>>>>>>> drm-memory-active-vram:
>>>>>>>
>>>>>>> Etc.. I think it creates a consistent story.
>>>>>>>
>>>>>>> Other than this, my two I think significant opens which haven't been
>>>>>>> addressed yet are:
>>>>>>>
>>>>>>> 1)
>>>>>>>
>>>>>>> Why do we want totals (not per region) when userspace can trivially
>>>>>>> aggregate if they want. What is the use case?
>>>>>>>
>>>>>>> 2)
>>>>>>>
>>>>>>> Current proposal limits the value to whole objects and fixates that by
>>>>>>> having it in the common code. If/when some driver is able to support sub-BO
>>>>>>> granularity they will need to opt out of the common printer at which point
>>>>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
>>>>>>> some drivers already support this, I don't know. Given how important VM BIND
>>>>>>> is I wouldn't be surprised.
>>>>>>
>>>>>> I feel like for drivers using ttm we want a ttm helper which takes care of
>>>>>> the region printing in hopefully a standard way. And that could then also
>>>>>> take care of all kinds of partial binding and funny rules (like maybe
>>>>>> we want a standard vram region that adds up all the lmem regions on
>>>>>> intel, so that all dgpu have a common vram bucket that generic tools
>>>>>> understand?).
>>>>>
>>>>> First part yes, but for the second I would think we want to avoid any
>>>>> aggregation in the kernel which can be done in userspace just as well. Such
>>>>> total vram bucket would be pretty useless on Intel even since userspace
>>>>> needs to be region aware to make use of all resources. It could even be
>>>>> counter productive I think - "why am I getting out of memory when half of my
>>>>> vram is unused!?".
>>>>
>>>> This is not for intel-aware userspace. This is for fairly generic "gputop"
>>>> style userspace, which might simply have no clue or interest in what lmemX
>>>> means, but would understand vram.
>>>>
>>>> Aggregating makes sense.
>>>
>>> Lmem vs vram is now an argument not about aggregation but about
>>> standardizing regions names.
>>>
>>> One detail also is a change in philosophy compared to engine stats where
>>> engine names are not centrally prescribed and it was expected userspace
>>> will have to handle things generically and with some vendor specific
>>> knowledge.
>>>
>>> Like in my gputop patches. It doesn't need to understand what is what,
>>> it just finds what's there and presents it to the user.
>>>
>>> Come some accel driver with local memory it wouldn't be vram any more.
>>> Or even a headless data center GPU. So I really don't think it is good
>>> to hardcode 'vram' in the spec, or midlayer, or helpers.
>>>
>>> And for aggregation.. again, userspace can do it just as well. If we do
>>> it in kernel then immediately we have multiple sets of keys to output
>>> for any driver which wants to show the region view. IMO it is just
>>> pointless work in the kernel and more code in the kernel, when userspace
>>> can do it.
>>>
>>> Proposal A (on a discrete gpu, one category only):
>>>
>>> drm-resident-memory: x KiB
>>> drm-resident-memory-system: x KiB
>>> drm-resident-memory-vram: x KiB
>>>
>>> Two loops in the kernel, more parsing in userspace.
>>
>> why would it be more than one loop, ie.
>>
>>      mem.resident += size;
>>      mem.category[cat].resident += size;
>>
>> At the end of the day, there is limited real-estate to show a million
>> different columns of information.  Even the gputop patches I posted
>> don't show everything of what is currently there.  And nvtop only
>> shows toplevel resident stat.  So I think the "everything" stat is
>> going to be what most tools use.
> 
> Yeah with enough finesse the double-loop isn't needed, it's just the
> simplest possible approach.
> 
> Also this is fdinfo, I _really_ want perf data showing that it's a
> real-world problem when we conjecture about algorithmic complexity.
> procutils have been algorithmically garbage since decades after all :-)

Just run it. :)

Algorithmic complexity is quite obvious and not a conjecture - to find 
DRM clients you have to walk _all_ pids and _all_ fds under them. So 
amount of work can scale very quickly and even _not_ with the number of 
DRM clients.

It's not too bad on my desktop setup but it is significantly more CPU 
intensive than top(1).

It would be possible to optimise the current code some more by not 
parsing full fdinfo (may become more important as number of keys grow), 
but that's only relevant when number of drm fds is large. It doesn't 
solve the basic pids * open fds search for which we'd need a way to walk 
the list of pids with drm fds directly.

Regards,

Tvrtko
Rob Clark April 17, 2023, 1:42 p.m. UTC | #8
On Mon, Apr 17, 2023 at 4:10 AM Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
>
> On 16/04/2023 08:48, Daniel Vetter wrote:
> > On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
> >> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
> >> <tvrtko.ursulin@linux.intel.com> wrote:
> >>>
> >>>
> >>> On 13/04/2023 21:05, Daniel Vetter wrote:
> >>>> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
> >>>>>
> >>>>> On 13/04/2023 14:27, Daniel Vetter wrote:
> >>>>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
> >>>>>>>
> >>>>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
> >>>>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
> >>>>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>>>>>>>
> >>>>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> >>>>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> >>>>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
> >>>>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Add support to dump GEM stats to fdinfo.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
> >>>>>>>>>>>>> v3: Do it in core
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >>>>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> >>>>>>>>>>>>> ---
> >>>>>>>>>>>>>       Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> >>>>>>>>>>>>>       drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> >>>>>>>>>>>>>       include/drm/drm_file.h                |  1 +
> >>>>>>>>>>>>>       include/drm/drm_gem.h                 | 19 +++++++
> >>>>>>>>>>>>>       4 files changed, 117 insertions(+)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
> >>>>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> >>>>>>>>>>>>>       Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> >>>>>>>>>>>>>       indicating kibi- or mebi-bytes.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>> +
> >>>>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
> >>>>>>>>>>>>> +than a single handle).
> >>>>>>>>>>>>> +
> >>>>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>> +
> >>>>>>>>>>>>> +The total size of buffers that are not shared with another file.
> >>>>>>>>>>>>> +
> >>>>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>> +
> >>>>>>>>>>>>> +The total size of buffers that are resident in system memory.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I think this naming maybe does not work best with the existing
> >>>>>>>>>>>> drm-memory-<region> keys.
> >>>>>>>>>>>
> >>>>>>>>>>> Actually, it was very deliberate not to conflict with the existing
> >>>>>>>>>>> drm-memory-<region> keys ;-)
> >>>>>>>>>>>
> >>>>>>>>>>> I would have preferred drm-memory-{active,resident,...} but it
> >>>>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
> >>>>>>>>>>>
> >>>>>>>>>>>> How about introduce the concept of a memory region from the start and
> >>>>>>>>>>>> use naming similar like we do for engines?
> >>>>>>>>>>>>
> >>>>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
> >>>>>>>>>>>>
> >>>>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
> >>>>>>>>>>>>
> >>>>>>>>>>>> 'size' - All reachable objects
> >>>>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
> >>>>>>>>>>>> 'resident' - Objects with backing store
> >>>>>>>>>>>> 'active' - Objects in use, subset of resident
> >>>>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
> >>>>>>>>>>>>
> >>>>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
> >>>>>>>>>>>> it right) which could be desirable for a simplified mental model.
> >>>>>>>>>>>>
> >>>>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> >>>>>>>>>>>> correctly captured this in the first round it should be equivalent to
> >>>>>>>>>>>> 'resident' above. In any case we can document no category is equal to
> >>>>>>>>>>>> which category, and at most one of the two must be output.)
> >>>>>>>>>>>>
> >>>>>>>>>>>> Region names we at most partially standardize. Like we could say
> >>>>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
> >>>>>>>>>>>> driver defined.
> >>>>>>>>>>>>
> >>>>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
> >>>>>>>>>>>> region they support.
> >>>>>>>>>>>>
> >>>>>>>>>>>> I think this all also works for objects which can be migrated between
> >>>>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
> >>>>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
> >>>>>>>>>>>
> >>>>>>>>>>> I'm not too sure how to rectify different memory regions with this,
> >>>>>>>>>>> since drm core doesn't really know about the driver's memory regions.
> >>>>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
> >>>>>>>>>>> just don't use the helper?  Or??
> >>>>>>>>>>
> >>>>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> >>>>>>>>>> all works out reasonably consistently?
> >>>>>>>>>
> >>>>>>>>> That is basically what we have now.  I could append -system to each to
> >>>>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
> >>>>>>>>
> >>>>>>>> What you have isn't really -system, but everything. So doesn't really make
> >>>>>>>> sense to me to mark this -system, it's only really true for integrated (if
> >>>>>>>> they don't have stolen or something like that).
> >>>>>>>>
> >>>>>>>> Also my comment was more in reply to Tvrtko's suggestion.
> >>>>>>>
> >>>>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
> >>>>>>> with the current drm-memory-$REGION by extending, rather than creating
> >>>>>>> confusion with different order of key name components.
> >>>>>>
> >>>>>> Oh my comment was pretty much just bikeshed, in case someone creates a
> >>>>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
> >>>>>> So $CATEGORY before the -memory.
> >>>>>>
> >>>>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
> >>>>>> folks like :-) I don't really care much personally.
> >>>>>
> >>>>> Okay I missed the parsing problem.
> >>>>>
> >>>>>>> AMD currently has (among others) drm-memory-vram, which we could define in
> >>>>>>> the spec maps to category X, if category component is not present.
> >>>>>>>
> >>>>>>> Some examples:
> >>>>>>>
> >>>>>>> drm-memory-resident-system:
> >>>>>>> drm-memory-size-lmem0:
> >>>>>>> drm-memory-active-vram:
> >>>>>>>
> >>>>>>> Etc.. I think it creates a consistent story.
> >>>>>>>
> >>>>>>> Other than this, my two I think significant opens which haven't been
> >>>>>>> addressed yet are:
> >>>>>>>
> >>>>>>> 1)
> >>>>>>>
> >>>>>>> Why do we want totals (not per region) when userspace can trivially
> >>>>>>> aggregate if they want. What is the use case?
> >>>>>>>
> >>>>>>> 2)
> >>>>>>>
> >>>>>>> Current proposal limits the value to whole objects and fixates that by
> >>>>>>> having it in the common code. If/when some driver is able to support sub-BO
> >>>>>>> granularity they will need to opt out of the common printer at which point
> >>>>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
> >>>>>>> some drivers already support this, I don't know. Given how important VM BIND
> >>>>>>> is I wouldn't be surprised.
> >>>>>>
> >>>>>> I feel like for drivers using ttm we want a ttm helper which takes care of
> >>>>>> the region printing in hopefully a standard way. And that could then also
> >>>>>> take care of all kinds of partial binding and funny rules (like maybe
> >>>>>> we want a standard vram region that adds up all the lmem regions on
> >>>>>> intel, so that all dgpu have a common vram bucket that generic tools
> >>>>>> understand?).
> >>>>>
> >>>>> First part yes, but for the second I would think we want to avoid any
> >>>>> aggregation in the kernel which can be done in userspace just as well. Such
> >>>>> total vram bucket would be pretty useless on Intel even since userspace
> >>>>> needs to be region aware to make use of all resources. It could even be
> >>>>> counter productive I think - "why am I getting out of memory when half of my
> >>>>> vram is unused!?".
> >>>>
> >>>> This is not for intel-aware userspace. This is for fairly generic "gputop"
> >>>> style userspace, which might simply have no clue or interest in what lmemX
> >>>> means, but would understand vram.
> >>>>
> >>>> Aggregating makes sense.
> >>>
> >>> Lmem vs vram is now an argument not about aggregation but about
> >>> standardizing regions names.
> >>>
> >>> One detail also is a change in philosophy compared to engine stats where
> >>> engine names are not centrally prescribed and it was expected userspace
> >>> will have to handle things generically and with some vendor specific
> >>> knowledge.
> >>>
> >>> Like in my gputop patches. It doesn't need to understand what is what,
> >>> it just finds what's there and presents it to the user.
> >>>
> >>> Come some accel driver with local memory it wouldn't be vram any more.
> >>> Or even a headless data center GPU. So I really don't think it is good
> >>> to hardcode 'vram' in the spec, or midlayer, or helpers.
> >>>
> >>> And for aggregation.. again, userspace can do it just as well. If we do
> >>> it in kernel then immediately we have multiple sets of keys to output
> >>> for any driver which wants to show the region view. IMO it is just
> >>> pointless work in the kernel and more code in the kernel, when userspace
> >>> can do it.
> >>>
> >>> Proposal A (on a discrete gpu, one category only):
> >>>
> >>> drm-resident-memory: x KiB
> >>> drm-resident-memory-system: x KiB
> >>> drm-resident-memory-vram: x KiB
> >>>
> >>> Two loops in the kernel, more parsing in userspace.
> >>
> >> why would it be more than one loop, ie.
> >>
> >>      mem.resident += size;
> >>      mem.category[cat].resident += size;
> >>
> >> At the end of the day, there is limited real-estate to show a million
> >> different columns of information.  Even the gputop patches I posted
> >> don't show everything of what is currently there.  And nvtop only
> >> shows toplevel resident stat.  So I think the "everything" stat is
> >> going to be what most tools use.
> >
> > Yeah with enough finesse the double-loop isn't needed, it's just the
> > simplest possible approach.
> >
> > Also this is fdinfo, I _really_ want perf data showing that it's a
> > real-world problem when we conjecture about algorithmic complexity.
> > procutils have been algorithmically garbage since decades after all :-)
>
> Just run it. :)
>
> Algorithmic complexity is quite obvious and not a conjecture - to find
> DRM clients you have to walk _all_ pids and _all_ fds under them. So
> amount of work can scale very quickly and even _not_ with the number of
> DRM clients.
>
> It's not too bad on my desktop setup but it is significantly more CPU
> intensive than top(1).
>
> It would be possible to optimise the current code some more by not
> parsing full fdinfo (may become more important as number of keys grow),
> but that's only relevant when number of drm fds is large. It doesn't
> solve the basic pids * open fds search for which we'd need a way to walk
> the list of pids with drm fds directly.

All of which has (almost[1]) nothing to do with one loop or two
(ignoring for a moment that I already pointed out a single loop is all
that is needed).  If CPU overhead is a problem, we could perhaps come
up with some sysfs which has one file per drm_file and side-step crawling
of all of the proc * fd.  I'll play around with it some but I'm pretty
sure you are trying to optimize the wrong thing.

BR,
-R

[1] generally a single process using drm has multiple fd's pointing at
the same drm_file.. which makes the current approach of having to read
fdinfo to find the client-id sub-optimal.  But still the total # of
proc * fd is much larger
Alex Deucher April 17, 2023, 2:04 p.m. UTC | #9
On Mon, Apr 17, 2023 at 9:43 AM Rob Clark <robdclark@gmail.com> wrote:
>
> On Mon, Apr 17, 2023 at 4:10 AM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
> >
> >
> > On 16/04/2023 08:48, Daniel Vetter wrote:
> > > On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
> > >> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
> > >> <tvrtko.ursulin@linux.intel.com> wrote:
> > >>>
> > >>>
> > >>> On 13/04/2023 21:05, Daniel Vetter wrote:
> > >>>> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
> > >>>>>
> > >>>>> On 13/04/2023 14:27, Daniel Vetter wrote:
> > >>>>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
> > >>>>>>>
> > >>>>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
> > >>>>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
> > >>>>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >>>>>>>>>>
> > >>>>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> > >>>>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> > >>>>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
> > >>>>>>>>>>>>
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
> > >>>>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> Add support to dump GEM stats to fdinfo.
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
> > >>>>>>>>>>>>> v3: Do it in core
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> > >>>>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> > >>>>>>>>>>>>> ---
> > >>>>>>>>>>>>>       Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> > >>>>>>>>>>>>>       drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> > >>>>>>>>>>>>>       include/drm/drm_file.h                |  1 +
> > >>>>>>>>>>>>>       include/drm/drm_gem.h                 | 19 +++++++
> > >>>>>>>>>>>>>       4 files changed, 117 insertions(+)
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
> > >>>>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
> > >>>>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> > >>>>>>>>>>>>>       Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> > >>>>>>>>>>>>>       indicating kibi- or mebi-bytes.
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
> > >>>>>>>>>>>>> +
> > >>>>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
> > >>>>>>>>>>>>> +than a single handle).
> > >>>>>>>>>>>>> +
> > >>>>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
> > >>>>>>>>>>>>> +
> > >>>>>>>>>>>>> +The total size of buffers that are not shared with another file.
> > >>>>>>>>>>>>> +
> > >>>>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
> > >>>>>>>>>>>>> +
> > >>>>>>>>>>>>> +The total size of buffers that are resident in system memory.
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> I think this naming maybe does not work best with the existing
> > >>>>>>>>>>>> drm-memory-<region> keys.
> > >>>>>>>>>>>
> > >>>>>>>>>>> Actually, it was very deliberate not to conflict with the existing
> > >>>>>>>>>>> drm-memory-<region> keys ;-)
> > >>>>>>>>>>>
> > >>>>>>>>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
> > >>>>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
> > >>>>>>>>>>>
> > >>>>>>>>>>>> How about introduce the concept of a memory region from the start and
> > >>>>>>>>>>>> use naming similar like we do for engines?
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> 'size' - All reachable objects
> > >>>>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
> > >>>>>>>>>>>> 'resident' - Objects with backing store
> > >>>>>>>>>>>> 'active' - Objects in use, subset of resident
> > >>>>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
> > >>>>>>>>>>>> it right) which could be desirable for a simplified mental model.
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> > >>>>>>>>>>>> correctly captured this in the first round it should be equivalent to
> > >>>>>>>>>>>> 'resident' above. In any case we can document no category is equal to
> > >>>>>>>>>>>> which category, and at most one of the two must be output.)
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> Region names we at most partially standardize. Like we could say
> > >>>>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
> > >>>>>>>>>>>> driver defined.
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
> > >>>>>>>>>>>> region they support.
> > >>>>>>>>>>>>
> > >>>>>>>>>>>> I think this all also works for objects which can be migrated between
> > >>>>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
> > >>>>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
> > >>>>>>>>>>>
> > >>>>>>>>>>> I'm not too sure how to rectify different memory regions with this,
> > >>>>>>>>>>> since drm core doesn't really know about the driver's memory regions.
> > >>>>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
> > >>>>>>>>>>> just don't use the helper?  Or??
> > >>>>>>>>>>
> > >>>>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> > >>>>>>>>>> all works out reasonably consistently?
> > >>>>>>>>>
> > >>>>>>>>> That is basically what we have now.  I could append -system to each to
> > >>>>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
> > >>>>>>>>
> > >>>>>>>> What you have isn't really -system, but everything. So doesn't really make
> > >>>>>>>> sense to me to mark this -system, it's only really true for integrated (if
> > >>>>>>>> they don't have stolen or something like that).
> > >>>>>>>>
> > >>>>>>>> Also my comment was more in reply to Tvrtko's suggestion.
> > >>>>>>>
> > >>>>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
> > >>>>>>> with the current drm-memory-$REGION by extending, rather than creating
> > >>>>>>> confusion with different order of key name components.
> > >>>>>>
> > >>>>>> Oh my comment was pretty much just bikeshed, in case someone creates a
> > >>>>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
> > >>>>>> So $CATEGORY before the -memory.
> > >>>>>>
> > >>>>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
> > >>>>>> folks like :-) I don't really care much personally.
> > >>>>>
> > >>>>> Okay I missed the parsing problem.
> > >>>>>
> > >>>>>>> AMD currently has (among others) drm-memory-vram, which we could define in
> > >>>>>>> the spec maps to category X, if category component is not present.
> > >>>>>>>
> > >>>>>>> Some examples:
> > >>>>>>>
> > >>>>>>> drm-memory-resident-system:
> > >>>>>>> drm-memory-size-lmem0:
> > >>>>>>> drm-memory-active-vram:
> > >>>>>>>
> > >>>>>>> Etc.. I think it creates a consistent story.
> > >>>>>>>
> > >>>>>>> Other than this, my two I think significant opens which haven't been
> > >>>>>>> addressed yet are:
> > >>>>>>>
> > >>>>>>> 1)
> > >>>>>>>
> > >>>>>>> Why do we want totals (not per region) when userspace can trivially
> > >>>>>>> aggregate if they want. What is the use case?
> > >>>>>>>
> > >>>>>>> 2)
> > >>>>>>>
> > >>>>>>> Current proposal limits the value to whole objects and fixates that by
> > >>>>>>> having it in the common code. If/when some driver is able to support sub-BO
> > >>>>>>> granularity they will need to opt out of the common printer at which point
> > >>>>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
> > >>>>>>> some drivers already support this, I don't know. Given how important VM BIND
> > >>>>>>> is I wouldn't be surprised.
> > >>>>>>
> > >>>>>> I feel like for drivers using ttm we want a ttm helper which takes care of
> > >>>>>> the region printing in hopefully a standard way. And that could then also
> > >>>>>> take care of all kinds of of partial binding and funny rules (like maybe
> > >>>>>> we want a standard vram region that addds up all the lmem regions on
> > >>>>>> intel, so that all dgpu have a common vram bucket that generic tools
> > >>>>>> understand?).
> > >>>>>
> > >>>>> First part yes, but for the second I would think we want to avoid any
> > >>>>> aggregation in the kernel which can be done in userspace just as well. Such
> > >>>>> total vram bucket would be pretty useless on Intel even since userspace
> > >>>>> needs to be region aware to make use of all resources. It could even be
> > >>>>> counter productive I think - "why am I getting out of memory when half of my
> > >>>>> vram is unused!?".
> > >>>>
> > >>>> This is not for intel-aware userspace. This is for fairly generic "gputop"
> > >>>> style userspace, which might simply have no clue or interest in what lmemX
> > >>>> means, but would understand vram.
> > >>>>
> > >>>> Aggregating makes sense.
> > >>>
> > >>> Lmem vs vram is now an argument not about aggregation but about
> > >>> standardizing regions names.
> > >>>
> > >>> One detail also is a change in philosophy compared to engine stats where
> > >>> engine names are not centrally prescribed and it was expected userspace
> > >>> will have to handle things generically and with some vendor specific
> > >>> knowledge.
> > >>>
> > >>> Like in my gputop patches. It doesn't need to understand what is what,
> > >>> it just finds what's there and presents it to the user.
> > >>>
> > >>> Come some accel driver with local memory it wouldn't be vram any more.
> > >>> Or even a headless data center GPU. So I really don't think it is good
> > >>> to hardcode 'vram' in the spec, or midlayer, or helpers.
> > >>>
> > >>> And for aggregation.. again, userspace can do it just as well. If we do
> > >>> it in kernel then immediately we have multiple sets of keys to output
> > >>> for any driver which wants to show the region view. IMO it is just
> > >>> pointless work in the kernel and more code in the kernel, when userspace
> > >>> can do it.
> > >>>
> > >>> Proposal A (one a discrete gpu, one category only):
> > >>>
> > >>> drm-resident-memory: x KiB
> > >>> drm-resident-memory-system: x KiB
> > >>> drm-resident-memory-vram: x KiB
> > >>>
> > >>> Two loops in the kernel, more parsing in userspace.
> > >>
> > >> why would it be more than one loop, ie.
> > >>
> > >>      mem.resident += size;
> > >>      mem.category[cat].resident += size;
> > >>
> > >> At the end of the day, there is limited real-estate to show a million
> > >> different columns of information.  Even the gputop patches I posted
> > >> don't show everything of what is currently there.  And nvtop only
> > >> shows toplevel resident stat.  So I think the "everything" stat is
> > >> going to be what most tools use.
> > >
> > > Yeah with enough finesse the double-loop isn't needed, it's just the
> > > simplest possible approach.
> > >
> > > Also this is fdinfo, I _really_ want perf data showing that it's a
> > > real-world problem when we conjecture about algorithmic complexity.
> > > procutils have been algorithmically garbage since decades after all :-)
> >
> > Just run it. :)
> >
> > Algorithmic complexity is quite obvious and not a conjecture - to find
> > DRM clients you have to walk _all_ pids and _all_ fds under them. So
> > amount of work can scale very quickly and even _not_ with the number of
> > DRM clients.
> >
> > It's not too bad on my desktop setup but it is significantly more CPU
> > intensive than top(1).
> >
> > It would be possible to optimise the current code some more by not
> > parsing full fdinfo (may become more important as number of keys grow),
> > but that's only relevant when number of drm fds is large. It doesn't
> > solve the basic pids * open fds search for which we'd need a way to walk
> > the list of pids with drm fds directly.
>
> All of which has (almost[1]) nothing to do with one loop or two
> (ignoring for a moment that I already pointed out a single loop is all
> that is needed).  If CPU overhead is a problem, we could perhaps come
> up some sysfs which has one file per drm_file and side-step crawling
> of all of the proc * fd.  I'll play around with it some but I'm pretty
> sure you are trying to optimize the wrong thing.

Yeah, we have customers that would like a single interface (IOCTL or
sysfs) to get all of this info rather than having to walk a ton of
files and do effectively two syscalls to accumulate all of this data
for all of the processes on the system.

Alex

>
> BR,
> -R
>
> [1] generally a single process using drm has multiple fd's pointing at
> the same drm_file.. which makes the current approach of having to read
> fdinfo to find the client-id sub-optimal.  But still the total # of
> proc * fd is much larger
Tvrtko Ursulin April 17, 2023, 2:20 p.m. UTC | #10
On 17/04/2023 14:42, Rob Clark wrote:
> On Mon, Apr 17, 2023 at 4:10 AM Tvrtko Ursulin
> <tvrtko.ursulin@linux.intel.com> wrote:
>>
>>
>> On 16/04/2023 08:48, Daniel Vetter wrote:
>>> On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
>>>> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
>>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>>
>>>>>
>>>>> On 13/04/2023 21:05, Daniel Vetter wrote:
>>>>>> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
>>>>>>>
>>>>>>> On 13/04/2023 14:27, Daniel Vetter wrote:
>>>>>>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
>>>>>>>>>
>>>>>>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
>>>>>>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
>>>>>>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
>>>>>>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
>>>>>>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
>>>>>>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Add support to dump GEM stats to fdinfo.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
>>>>>>>>>>>>>>> v3: Do it in core
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
>>>>>>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>        Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
>>>>>>>>>>>>>>>        drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
>>>>>>>>>>>>>>>        include/drm/drm_file.h                |  1 +
>>>>>>>>>>>>>>>        include/drm/drm_gem.h                 | 19 +++++++
>>>>>>>>>>>>>>>        4 files changed, 117 insertions(+)
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
>>>>>>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
>>>>>>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
>>>>>>>>>>>>>>>        Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
>>>>>>>>>>>>>>>        indicating kibi- or mebi-bytes.
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
>>>>>>>>>>>>>>> +than a single handle).
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +The total size of buffers that are not shared with another file.
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +The total size of buffers that are resident in system memory.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I think this naming maybe does not work best with the existing
>>>>>>>>>>>>>> drm-memory-<region> keys.
>>>>>>>>>>>>>
>>>>>>>>>>>>> Actually, it was very deliberate not to conflict with the existing
>>>>>>>>>>>>> drm-memory-<region> keys ;-)
>>>>>>>>>>>>>
>>>>>>>>>>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
>>>>>>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
>>>>>>>>>>>>>
>>>>>>>>>>>>>> How about introduce the concept of a memory region from the start and
>>>>>>>>>>>>>> use naming similar like we do for engines?
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> 'size' - All reachable objects
>>>>>>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
>>>>>>>>>>>>>> 'resident' - Objects with backing store
>>>>>>>>>>>>>> 'active' - Objects in use, subset of resident
>>>>>>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
>>>>>>>>>>>>>> it right) which could be desirable for a simplified mental model.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
>>>>>>>>>>>>>> correctly captured this in the first round it should be equivalent to
>>>>>>>>>>>>>> 'resident' above. In any case we can document no category is equal to
>>>>>>>>>>>>>> which category, and at most one of the two must be output.)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Region names we at most partially standardize. Like we could say
>>>>>>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
>>>>>>>>>>>>>> driver defined.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
>>>>>>>>>>>>>> region they support.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> I think this all also works for objects which can be migrated between
>>>>>>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
>>>>>>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
>>>>>>>>>>>>>
>>>>>>>>>>>>> I'm not too sure how to rectify different memory regions with this,
>>>>>>>>>>>>> since drm core doesn't really know about the driver's memory regions.
>>>>>>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
>>>>>>>>>>>>> just don't use the helper?  Or??
>>>>>>>>>>>>
>>>>>>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
>>>>>>>>>>>> all works out reasonably consistently?
>>>>>>>>>>>
>>>>>>>>>>> That is basically what we have now.  I could append -system to each to
>>>>>>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
>>>>>>>>>>
>>>>>>>>>> What you have isn't really -system, but everything. So doesn't really make
>>>>>>>>>> sense to me to mark this -system, it's only really true for integrated (if
>>>>>>>>>> they don't have stolen or something like that).
>>>>>>>>>>
>>>>>>>>>> Also my comment was more in reply to Tvrtko's suggestion.
>>>>>>>>>
>>>>>>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
>>>>>>>>> with the current drm-memory-$REGION by extending, rather than creating
>>>>>>>>> confusion with different order of key name components.
>>>>>>>>
>>>>>>>> Oh my comment was pretty much just bikeshed, in case someone creates a
>>>>>>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
>>>>>>>> So $CATEGORY before the -memory.
>>>>>>>>
>>>>>>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
>>>>>>>> folks like :-) I don't really care much personally.
>>>>>>>
>>>>>>> Okay I missed the parsing problem.
>>>>>>>
>>>>>>>>> AMD currently has (among others) drm-memory-vram, which we could define in
>>>>>>>>> the spec maps to category X, if category component is not present.
>>>>>>>>>
>>>>>>>>> Some examples:
>>>>>>>>>
>>>>>>>>> drm-memory-resident-system:
>>>>>>>>> drm-memory-size-lmem0:
>>>>>>>>> drm-memory-active-vram:
>>>>>>>>>
>>>>>>>>> Etc.. I think it creates a consistent story.
>>>>>>>>>
>>>>>>>>> Other than this, my two I think significant opens which haven't been
>>>>>>>>> addressed yet are:
>>>>>>>>>
>>>>>>>>> 1)
>>>>>>>>>
>>>>>>>>> Why do we want totals (not per region) when userspace can trivially
>>>>>>>>> aggregate if they want. What is the use case?
>>>>>>>>>
>>>>>>>>> 2)
>>>>>>>>>
>>>>>>>>> Current proposal limits the value to whole objects and fixates that by
>>>>>>>>> having it in the common code. If/when some driver is able to support sub-BO
>>>>>>>>> granularity they will need to opt out of the common printer at which point
>>>>>>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
>>>>>>>>> some drivers already support this, I don't know. Given how important VM BIND
>>>>>>>>> is I wouldn't be surprised.
>>>>>>>>
>>>>>>>> I feel like for drivers using ttm we want a ttm helper which takes care of
>>>>>>>> the region printing in hopefully a standard way. And that could then also
>>>>>>>> take care of all kinds of of partial binding and funny rules (like maybe
>>>>>>>> we want a standard vram region that addds up all the lmem regions on
>>>>>>>> intel, so that all dgpu have a common vram bucket that generic tools
>>>>>>>> understand?).
>>>>>>>
>>>>>>> First part yes, but for the second I would think we want to avoid any
>>>>>>> aggregation in the kernel which can be done in userspace just as well. Such
>>>>>>> total vram bucket would be pretty useless on Intel even since userspace
>>>>>>> needs to be region aware to make use of all resources. It could even be
>>>>>>> counter productive I think - "why am I getting out of memory when half of my
>>>>>>> vram is unused!?".
>>>>>>
>>>>>> This is not for intel-aware userspace. This is for fairly generic "gputop"
>>>>>> style userspace, which might simply have no clue or interest in what lmemX
>>>>>> means, but would understand vram.
>>>>>>
>>>>>> Aggregating makes sense.
>>>>>
>>>>> Lmem vs vram is now an argument not about aggregation but about
>>>>> standardizing regions names.
>>>>>
>>>>> One detail also is a change in philosophy compared to engine stats where
>>>>> engine names are not centrally prescribed and it was expected userspace
>>>>> will have to handle things generically and with some vendor specific
>>>>> knowledge.
>>>>>
>>>>> Like in my gputop patches. It doesn't need to understand what is what,
>>>>> it just finds what's there and presents it to the user.
>>>>>
>>>>> Come some accel driver with local memory it wouldn't be vram any more.
>>>>> Or even a headless data center GPU. So I really don't think it is good
>>>>> to hardcode 'vram' in the spec, or midlayer, or helpers.
>>>>>
>>>>> And for aggregation.. again, userspace can do it just as well. If we do
>>>>> it in kernel then immediately we have multiple sets of keys to output
>>>>> for any driver which wants to show the region view. IMO it is just
>>>>> pointless work in the kernel and more code in the kernel, when userspace
>>>>> can do it.
>>>>>
>>>>> Proposal A (one a discrete gpu, one category only):
>>>>>
>>>>> drm-resident-memory: x KiB
>>>>> drm-resident-memory-system: x KiB
>>>>> drm-resident-memory-vram: x KiB
>>>>>
>>>>> Two loops in the kernel, more parsing in userspace.
>>>>
>>>> why would it be more than one loop, ie.
>>>>
>>>>       mem.resident += size;
>>>>       mem.category[cat].resident += size;
>>>>
>>>> At the end of the day, there is limited real-estate to show a million
>>>> different columns of information.  Even the gputop patches I posted
>>>> don't show everything of what is currently there.  And nvtop only
>>>> shows toplevel resident stat.  So I think the "everything" stat is
>>>> going to be what most tools use.
>>>
>>> Yeah with enough finesse the double-loop isn't needed, it's just the
>>> simplest possible approach.
>>>
>>> Also this is fdinfo, I _really_ want perf data showing that it's a
>>> real-world problem when we conjecture about algorithmic complexity.
>>> procutils have been algorithmically garbage since decades after all :-)
>>
>> Just run it. :)
>>
>> Algorithmic complexity is quite obvious and not a conjecture - to find
>> DRM clients you have to walk _all_ pids and _all_ fds under them. So
>> amount of work can scale very quickly and even _not_ with the number of
>> DRM clients.
>>
>> It's not too bad on my desktop setup but it is significantly more CPU
>> intensive than top(1).
>>
>> It would be possible to optimise the current code some more by not
>> parsing full fdinfo (may become more important as number of keys grow),
>> but that's only relevant when number of drm fds is large. It doesn't
>> solve the basic pids * open fds search for which we'd need a way to walk
>> the list of pids with drm fds directly.
> 
> All of which has (almost[1]) nothing to do with one loop or two

Correct, this was just a side discussion where I understood Daniel to be
asking about the wider performance story. Perhaps I misunderstood.

> (ignoring for a moment that I already pointed out a single loop is all
> that is needed).  If CPU overhead is a problem, we could perhaps come
> up some sysfs which has one file per drm_file and side-step crawling
> of all of the proc * fd.  I'll play around with it some but I'm pretty
> sure you are trying to optimize the wrong thing.

Yes, that's what I meant too by "a way to walk the list of pids with drm
fds directly".

Regards,

Tvrtko

> 
> BR,
> -R
> 
> [1] generally a single process using drm has multiple fd's pointing at
> the same drm_file.. which makes the current approach of having to read
> fdinfo to find the client-id sub-optimal.  But still the total # of
> proc * fd is much larger
Rob Clark April 17, 2023, 4:12 p.m. UTC | #11
On Mon, Apr 17, 2023 at 7:20 AM Tvrtko Ursulin
<tvrtko.ursulin@linux.intel.com> wrote:
>
>
> On 17/04/2023 14:42, Rob Clark wrote:
> > On Mon, Apr 17, 2023 at 4:10 AM Tvrtko Ursulin
> > <tvrtko.ursulin@linux.intel.com> wrote:
> >>
> >>
> >> On 16/04/2023 08:48, Daniel Vetter wrote:
> >>> On Fri, Apr 14, 2023 at 06:40:27AM -0700, Rob Clark wrote:
> >>>> On Fri, Apr 14, 2023 at 1:57 AM Tvrtko Ursulin
> >>>> <tvrtko.ursulin@linux.intel.com> wrote:
> >>>>>
> >>>>>
> >>>>> On 13/04/2023 21:05, Daniel Vetter wrote:
> >>>>>> On Thu, Apr 13, 2023 at 05:40:21PM +0100, Tvrtko Ursulin wrote:
> >>>>>>>
> >>>>>>> On 13/04/2023 14:27, Daniel Vetter wrote:
> >>>>>>>> On Thu, Apr 13, 2023 at 01:58:34PM +0100, Tvrtko Ursulin wrote:
> >>>>>>>>>
> >>>>>>>>> On 12/04/2023 20:18, Daniel Vetter wrote:
> >>>>>>>>>> On Wed, Apr 12, 2023 at 11:42:07AM -0700, Rob Clark wrote:
> >>>>>>>>>>> On Wed, Apr 12, 2023 at 11:17 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>>>>>>>>>>
> >>>>>>>>>>>> On Wed, Apr 12, 2023 at 10:59:54AM -0700, Rob Clark wrote:
> >>>>>>>>>>>>> On Wed, Apr 12, 2023 at 7:42 AM Tvrtko Ursulin
> >>>>>>>>>>>>> <tvrtko.ursulin@linux.intel.com> wrote:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> On 11/04/2023 23:56, Rob Clark wrote:
> >>>>>>>>>>>>>>> From: Rob Clark <robdclark@chromium.org>
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Add support to dump GEM stats to fdinfo.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> v2: Fix typos, change size units to match docs, use div_u64
> >>>>>>>>>>>>>>> v3: Do it in core
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> Signed-off-by: Rob Clark <robdclark@chromium.org>
> >>>>>>>>>>>>>>> Reviewed-by: Emil Velikov <emil.l.velikov@gmail.com>
> >>>>>>>>>>>>>>> ---
> >>>>>>>>>>>>>>>        Documentation/gpu/drm-usage-stats.rst | 21 ++++++++
> >>>>>>>>>>>>>>>        drivers/gpu/drm/drm_file.c            | 76 +++++++++++++++++++++++++++
> >>>>>>>>>>>>>>>        include/drm/drm_file.h                |  1 +
> >>>>>>>>>>>>>>>        include/drm/drm_gem.h                 | 19 +++++++
> >>>>>>>>>>>>>>>        4 files changed, 117 insertions(+)
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>>>> index b46327356e80..b5e7802532ed 100644
> >>>>>>>>>>>>>>> --- a/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>>>> +++ b/Documentation/gpu/drm-usage-stats.rst
> >>>>>>>>>>>>>>> @@ -105,6 +105,27 @@ object belong to this client, in the respective memory region.
> >>>>>>>>>>>>>>>        Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
> >>>>>>>>>>>>>>>        indicating kibi- or mebi-bytes.
> >>>>>>>>>>>>>>>
> >>>>>>>>>>>>>>> +- drm-shared-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>> +The total size of buffers that are shared with another file (ie. have more
> >>>>>>>>>>>>>>> +than a single handle).
> >>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>> +- drm-private-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>> +The total size of buffers that are not shared with another file.
> >>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>> +- drm-resident-memory: <uint> [KiB|MiB]
> >>>>>>>>>>>>>>> +
> >>>>>>>>>>>>>>> +The total size of buffers that are resident in system memory.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> I think this naming maybe does not work best with the existing
> >>>>>>>>>>>>>> drm-memory-<region> keys.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> Actually, it was very deliberate not to conflict with the existing
> >>>>>>>>>>>>> drm-memory-<region> keys ;-)
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I wouldn't have preferred drm-memory-{active,resident,...} but it
> >>>>>>>>>>>>> could be mis-parsed by existing userspace so my hands were a bit tied.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>>> How about introduce the concept of a memory region from the start and
> >>>>>>>>>>>>>> use naming similar like we do for engines?
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> drm-memory-$CATEGORY-$REGION: ...
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Then we document a bunch of categories and their semantics, for instance:
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> 'size' - All reachable objects
> >>>>>>>>>>>>>> 'shared' - Subset of 'size' with handle_count > 1
> >>>>>>>>>>>>>> 'resident' - Objects with backing store
> >>>>>>>>>>>>>> 'active' - Objects in use, subset of resident
> >>>>>>>>>>>>>> 'purgeable' - Or inactive? Subset of resident.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> We keep the same semantics as with process memory accounting (if I got
> >>>>>>>>>>>>>> it right) which could be desirable for a simplified mental model.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> (AMD needs to remind me of their 'drm-memory-...' keys semantics. If we
> >>>>>>>>>>>>>> correctly captured this in the first round it should be equivalent to
> >>>>>>>>>>>>>> 'resident' above. In any case we can document no category is equal to
> >>>>>>>>>>>>>> which category, and at most one of the two must be output.)
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Region names we at most partially standardize. Like we could say
> >>>>>>>>>>>>>> 'system' is to be used where backing store is system RAM and others are
> >>>>>>>>>>>>>> driver defined.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> Then discrete GPUs could emit N sets of key-values, one for each memory
> >>>>>>>>>>>>>> region they support.
> >>>>>>>>>>>>>>
> >>>>>>>>>>>>>> I think this all also works for objects which can be migrated between
> >>>>>>>>>>>>>> memory regions. 'Size' accounts them against all regions while for
> >>>>>>>>>>>>>> 'resident' they only appear in the region of their current placement, etc.
> >>>>>>>>>>>>>
> >>>>>>>>>>>>> I'm not too sure how to rectify different memory regions with this,
> >>>>>>>>>>>>> since drm core doesn't really know about the driver's memory regions.
> >>>>>>>>>>>>> Perhaps we can go back to this being a helper and drivers with vram
> >>>>>>>>>>>>> just don't use the helper?  Or??
> >>>>>>>>>>>>
> >>>>>>>>>>>> I think if you flip it around to drm-$CATEGORY-memory{-$REGION}: then it
> >>>>>>>>>>>> all works out reasonably consistently?
> >>>>>>>>>>>
> >>>>>>>>>>> That is basically what we have now.  I could append -system to each to
> >>>>>>>>>>> make things easier to add vram/etc (from a uabi standpoint)..
> >>>>>>>>>>
> >>>>>>>>>> What you have isn't really -system, but everything. So doesn't really make
> >>>>>>>>>> sense to me to mark this -system, it's only really true for integrated (if
> >>>>>>>>>> they don't have stolen or something like that).
> >>>>>>>>>>
> >>>>>>>>>> Also my comment was more in reply to Tvrtko's suggestion.
> >>>>>>>>>
> >>>>>>>>> Right so my proposal was drm-memory-$CATEGORY-$REGION which I think aligns
> >>>>>>>>> with the current drm-memory-$REGION by extending, rather than creating
> >>>>>>>>> confusion with different order of key name components.
> >>>>>>>>
> >>>>>>>> Oh my comment was pretty much just bikeshed, in case someone creates a
> >>>>>>>> $REGION that other drivers use for $CATEGORY. Kinda Rob's parsing point.
> >>>>>>>> So $CATEGORY before the -memory.
> >>>>>>>>
> >>>>>>>> Otoh I don't think that'll happen, so I guess we can go with whatever more
> >>>>>>>> folks like :-) I don't really care much personally.
> >>>>>>>
> >>>>>>> Okay I missed the parsing problem.
> >>>>>>>
> >>>>>>>>> AMD currently has (among others) drm-memory-vram, which we could define in
> >>>>>>>>> the spec maps to category X, if category component is not present.
> >>>>>>>>>
> >>>>>>>>> Some examples:
> >>>>>>>>>
> >>>>>>>>> drm-memory-resident-system:
> >>>>>>>>> drm-memory-size-lmem0:
> >>>>>>>>> drm-memory-active-vram:
> >>>>>>>>>
> >>>>>>>>> Etc.. I think it creates a consistent story.
> >>>>>>>>>
> >>>>>>>>> Other than this, my two I think significant opens which haven't been
> >>>>>>>>> addressed yet are:
> >>>>>>>>>
> >>>>>>>>> 1)
> >>>>>>>>>
> >>>>>>>>> Why do we want totals (not per region) when userspace can trivially
> >>>>>>>>> aggregate if they want. What is the use case?
> >>>>>>>>>
> >>>>>>>>> 2)
> >>>>>>>>>
> >>>>>>>>> Current proposal limits the value to whole objects and fixates that by
> >>>>>>>>> having it in the common code. If/when some driver is able to support sub-BO
> >>>>>>>>> granularity they will need to opt out of the common printer at which point
> >>>>>>>>> it may be less churn to start with a helper rather than mid-layer. Or maybe
> >>>>>>>>> some drivers already support this, I don't know. Given how important VM BIND
> >>>>>>>>> is I wouldn't be surprised.
> >>>>>>>>
> >>>>>>>> I feel like for drivers using ttm we want a ttm helper which takes care of
> >>>>>>>> the region printing in hopefully a standard way. And that could then also
> >>>>>>>> take care of all kinds of partial binding and funny rules (like maybe
> >>>>>>>> we want a standard vram region that adds up all the lmem regions on
> >>>>>>>> intel, so that all dgpu have a common vram bucket that generic tools
> >>>>>>>> understand?).
> >>>>>>>
> >>>>>>> First part yes, but for the second I would think we want to avoid any
> >>>>>>> aggregation in the kernel which can be done in userspace just as well. Such
> >>>>>>> total vram bucket would be pretty useless on Intel even since userspace
> >>>>>>> needs to be region aware to make use of all resources. It could even be
> >>>>>>> counter productive I think - "why am I getting out of memory when half of my
> >>>>>>> vram is unused!?".
> >>>>>>
> >>>>>> This is not for intel-aware userspace. This is for fairly generic "gputop"
> >>>>>> style userspace, which might simply have no clue or interest in what lmemX
> >>>>>> means, but would understand vram.
> >>>>>>
> >>>>>> Aggregating makes sense.
> >>>>>
> >>>>> Lmem vs vram is now an argument not about aggregation but about
> >>>>> standardizing region names.
> >>>>>
> >>>>> One detail also is a change in philosophy compared to engine stats where
> >>>>> engine names are not centrally prescribed and it was expected userspace
> >>>>> will have to handle things generically and with some vendor specific
> >>>>> knowledge.
> >>>>>
> >>>>> Like in my gputop patches. It doesn't need to understand what is what,
> >>>>> it just finds what's there and presents it to the user.
> >>>>>
> >>>>> Come some accel driver with local memory it wouldn't be vram any more.
> >>>>> Or even a headless data center GPU. So I really don't think it is good
> >>>>> to hardcode 'vram' in the spec, or midlayer, or helpers.
> >>>>>
> >>>>> And for aggregation.. again, userspace can do it just as well. If we do
> >>>>> it in kernel then immediately we have multiple sets of keys to output
> >>>>> for any driver which wants to show the region view. IMO it is just
> >>>>> pointless work in the kernel and more code in the kernel, when userspace
> >>>>> can do it.
> >>>>>
> >>>>> Proposal A (on a discrete gpu, one category only):
> >>>>>
> >>>>> drm-resident-memory: x KiB
> >>>>> drm-resident-memory-system: x KiB
> >>>>> drm-resident-memory-vram: x KiB
> >>>>>
> >>>>> Two loops in the kernel, more parsing in userspace.
> >>>>
> >>>> why would it be more than one loop, ie.
> >>>>
> >>>>       mem.resident += size;
> >>>>       mem.category[cat].resident += size;
> >>>>
> >>>> At the end of the day, there is limited real-estate to show a million
> >>>> different columns of information.  Even the gputop patches I posted
> >>>> don't show everything of what is currently there.  And nvtop only
> >>>> shows toplevel resident stat.  So I think the "everything" stat is
> >>>> going to be what most tools use.
> >>>
> >>> Yeah with enough finesse the double-loop isn't needed, it's just the
> >>> simplest possible approach.
> >>>
> >>> Also this is fdinfo, I _really_ want perf data showing that it's a
> >>> real-world problem when we conjecture about algorithmic complexity.
> >>> procutils have been algorithmically garbage since decades after all :-)
> >>
> >> Just run it. :)
> >>
> >> Algorithmic complexity is quite obvious and not a conjecture - to find
> >> DRM clients you have to walk _all_ pids and _all_ fds under them. So
> >> amount of work can scale very quickly and even _not_ with the number of
> >> DRM clients.
> >>
> >> It's not too bad on my desktop setup but it is significantly more CPU
> >> intensive than top(1).
> >>
> >> It would be possible to optimise the current code some more by not
> >> parsing full fdinfo (may become more important as number of keys grow),
> >> but that's only relevant when number of drm fds is large. It doesn't
> >> solve the basic pids * open fds search for which we'd need a way to walk
> >> the list of pids with drm fds directly.
> >
> > All of which has (almost[1]) nothing to do with one loop or two
>
> Correct, this was just a side discussion where I understood Daniel is
> asking about the wider performance story. Perhaps I misunderstood.
>
> > (ignoring for a moment that I already pointed out a single loop is all
> > that is needed).  If CPU overhead is a problem, we could perhaps come
> > up with some sysfs which has one file per drm_file and side-step crawling
> > of all of the proc * fd.  I'll play around with it some but I'm pretty
> > sure you are trying to optimize the wrong thing.
>
> Yes, that's what I meant too in "a way to walk the list of pids with drm
> fds directly".

Just to follow up, I did a quick hack to loop and print the mem
stats.  At 5x loops I couldn't really measure any increase in gputop CPU
utilization.  At 50x loops I could measure a small increase.  Without
additional looping to artificially increase the cost, nothing drm
related shows up in a perf-record of gputop.

What could be an easy optimization, if it can be accessed, is to parse
/sys/kernel/debug/dri/<n>/clients to get the list of pid's of
processes with the drm device open.  This would cut down quite a bit
the # of pid's to examine.

BR,
-R

> Regards,
>
> Tvrtko
>
> >
> > BR,
> > -R
> >
> > [1] generally a single process using drm has multiple fd's pointing at
> > the same drm_file.. which makes the current approach of having to read
> > fdinfo to find the client-id sub-optimal.  But still the total # of
> > proc * fd is much larger
diff mbox series

Patch

diff --git a/Documentation/gpu/drm-usage-stats.rst b/Documentation/gpu/drm-usage-stats.rst
index b46327356e80..b5e7802532ed 100644
--- a/Documentation/gpu/drm-usage-stats.rst
+++ b/Documentation/gpu/drm-usage-stats.rst
@@ -105,6 +105,27 @@  object belong to this client, in the respective memory region.
 Default unit shall be bytes with optional unit specifiers of 'KiB' or 'MiB'
 indicating kibi- or mebi-bytes.
 
+- drm-shared-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are shared with another file (ie. have more
+than a single handle).
+
+- drm-private-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are not shared with another file.
+
+- drm-resident-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are resident in system memory.
+
+- drm-purgeable-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are purgeable.
+
+- drm-active-memory: <uint> [KiB|MiB]
+
+The total size of buffers that are active on one or more rings.
+
 - drm-cycles-<str> <uint>
 
 Engine identifier string must be the same as the one specified in the
diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 37dfaa6be560..46fdd843bb3a 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -42,6 +42,7 @@ 
 #include <drm/drm_client.h>
 #include <drm/drm_drv.h>
 #include <drm/drm_file.h>
+#include <drm/drm_gem.h>
 #include <drm/drm_print.h>
 
 #include "drm_crtc_internal.h"
@@ -871,6 +872,79 @@  void drm_send_event(struct drm_device *dev, struct drm_pending_event *e)
 }
 EXPORT_SYMBOL(drm_send_event);
 
+static void print_size(struct drm_printer *p, const char *stat, size_t sz)
+{
+	const char *units[] = {"", " KiB", " MiB"};
+	unsigned u;
+
+	for (u = 0; u < ARRAY_SIZE(units) - 1; u++) {
+		if (sz < SZ_1K)
+			break;
+		sz = div_u64(sz, SZ_1K);
+	}
+
+	drm_printf(p, "%s:\t%zu%s\n", stat, sz, units[u]);
+}
+
+static void print_memory_stats(struct drm_printer *p, struct drm_file *file)
+{
+	struct drm_gem_object *obj;
+	struct {
+		size_t shared;
+		size_t private;
+		size_t resident;
+		size_t purgeable;
+		size_t active;
+	} size = {0};
+	bool has_status = false;
+	int id;
+
+	spin_lock(&file->table_lock);
+	idr_for_each_entry (&file->object_idr, obj, id) {
+		enum drm_gem_object_status s = 0;
+
+		if (obj->funcs && obj->funcs->status) {
+			s = obj->funcs->status(obj);
+			has_status = true;
+		}
+
+		if (obj->handle_count > 1) {
+			size.shared += obj->size;
+		} else {
+			size.private += obj->size;
+		}
+
+		if (s & DRM_GEM_OBJECT_RESIDENT) {
+			size.resident += obj->size;
+		} else {
+			/* If already purged or not yet backed by pages, don't
+			 * count it as purgeable:
+			 */
+			s &= ~DRM_GEM_OBJECT_PURGEABLE;
+		}
+
+		if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) {
+			size.active += obj->size;
+
+			/* If still active, don't count as purgeable: */
+			s &= ~DRM_GEM_OBJECT_PURGEABLE;
+		}
+
+		if (s & DRM_GEM_OBJECT_PURGEABLE)
+			size.purgeable += obj->size;
+	}
+	spin_unlock(&file->table_lock);
+
+	print_size(p, "drm-shared-memory", size.shared);
+	print_size(p, "drm-private-memory", size.private);
+	print_size(p, "drm-active-memory", size.active);
+
+	if (has_status) {
+		print_size(p, "drm-resident-memory", size.resident);
+		print_size(p, "drm-purgeable-memory", size.purgeable);
+	}
+}
+
 /**
  * drm_fop_show_fdinfo - helper for drm file fops
  * @seq_file: output stream
@@ -904,6 +978,8 @@  void drm_fop_show_fdinfo(struct seq_file *m, struct file *f)
 
 	if (dev->driver->show_fdinfo)
 		dev->driver->show_fdinfo(&p, file);
+
+	print_memory_stats(&p, file);
 }
 EXPORT_SYMBOL(drm_fop_show_fdinfo);
 
diff --git a/include/drm/drm_file.h b/include/drm/drm_file.h
index dfa995b787e1..e5b40084538f 100644
--- a/include/drm/drm_file.h
+++ b/include/drm/drm_file.h
@@ -41,6 +41,7 @@ 
 struct dma_fence;
 struct drm_file;
 struct drm_device;
+struct drm_printer;
 struct device;
 struct file;
 
diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h
index 189fd618ca65..213917bb6b11 100644
--- a/include/drm/drm_gem.h
+++ b/include/drm/drm_gem.h
@@ -42,6 +42,14 @@ 
 struct iosys_map;
 struct drm_gem_object;
 
+/**
+ * enum drm_gem_object_status - bitmask of object state for fdinfo reporting
+ */
+enum drm_gem_object_status {
+	DRM_GEM_OBJECT_RESIDENT  = BIT(0),
+	DRM_GEM_OBJECT_PURGEABLE = BIT(1),
+};
+
 /**
  * struct drm_gem_object_funcs - GEM object functions
  */
@@ -174,6 +182,17 @@  struct drm_gem_object_funcs {
 	 */
 	int (*evict)(struct drm_gem_object *obj);
 
+	/**
+	 * @status:
+	 *
+	 * The optional status callback can return additional object state
+	 * which determines which stats the object is counted against.  The
+	 * callback is called under table_lock.  Racing against object status
+	 * change is "harmless", and the callback can expect to not race
+	 * against object destruction.
+	 */
+	enum drm_gem_object_status (*status)(struct drm_gem_object *obj);
+
 	/**
 	 * @vm_ops:
 	 *