Message ID | 20201001002709.21361-3-jonathan@marek.ca |
---|---|
State | New |
Headers | show |
Series | drm/msm: support for host-cached BOs | expand |
On Wed, Sep 30, 2020 at 08:27:05PM -0400, Jonathan Marek wrote: > This makes it possible to use the non-coherent cached MSM_BO_CACHED mode, > which otherwise doesn't provide any method for cleaning/invalidating the > cache to sync with the device. > > Signed-off-by: Jonathan Marek <jonathan@marek.ca> > --- > drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++ > drivers/gpu/drm/msm/msm_drv.h | 2 ++ > drivers/gpu/drm/msm/msm_gem.c | 15 +++++++++++++++ > include/uapi/drm/msm_drm.h | 20 ++++++++++++++++++++ > 4 files changed, 58 insertions(+) > > diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c > index 9716210495fc..305db1db1064 100644 > --- a/drivers/gpu/drm/msm/msm_drv.c > +++ b/drivers/gpu/drm/msm/msm_drv.c > @@ -964,6 +964,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data, > return msm_submitqueue_remove(file->driver_priv, id); > } > > +static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data, > + struct drm_file *file) > +{ > + struct drm_msm_gem_sync_cache *args = data; > + struct drm_gem_object *obj; > + > + if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS) > + return -EINVAL; > + > + obj = drm_gem_object_lookup(file, args->handle); > + if (!obj) > + return -ENOENT; > + > + msm_gem_sync_cache(obj, args->flags, args->offset, args->end); > + > + drm_gem_object_put(obj); > + > + return 0; > +} > + > static const struct drm_ioctl_desc msm_ioctls[] = { > DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_RENDER_ALLOW), > @@ -976,6 +996,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = { > DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW, msm_ioctl_submitqueue_new, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW), > DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW), > + DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE, msm_ioctl_gem_sync_cache, DRM_RENDER_ALLOW), > }; > > static const struct vm_operations_struct vm_ops = { > diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h > index 6384844b1696..5e932dae453f 100644 > --- a/drivers/gpu/drm/msm/msm_drv.h > +++ b/drivers/gpu/drm/msm/msm_drv.h > @@ -314,6 +314,8 @@ void msm_gem_move_to_active(struct drm_gem_object *obj, > void msm_gem_move_to_inactive(struct drm_gem_object *obj); > int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); > int msm_gem_cpu_fini(struct drm_gem_object *obj); > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, > + size_t range_start, size_t range_end); > void msm_gem_free_object(struct drm_gem_object *obj); > int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file, > uint32_t size, uint32_t flags, uint32_t *handle, char *name); > diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c > index ad9a627493ae..93da88b3fc50 100644 > --- a/drivers/gpu/drm/msm/msm_gem.c > +++ b/drivers/gpu/drm/msm/msm_gem.c > @@ -8,6 +8,7 @@ > #include <linux/shmem_fs.h> > #include <linux/dma-buf.h> > #include <linux/pfn_t.h> > +#include <linux/dma-noncoherent.h> > > #include <drm/drm_prime.h> > > @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj) > return 0; > } > > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, > + size_t range_start, size_t range_end) > +{ > + struct msm_gem_object *msm_obj = to_msm_bo(obj); > + > + /* TODO: sync only the required range, and don't invalidate on clean */ > + > + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) Curious why you would rename these - I feel like the to_device / to_cpu model is pretty well baked into our thought process. I know from personal experience that I have to stop and think to remember which direction is which. Jordan > + sync_for_device(msm_obj); > + > + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) > + sync_for_cpu(msm_obj); > +} > + > #ifdef CONFIG_DEBUG_FS > static void describe_fence(struct dma_fence *fence, const char *type, > struct seq_file *m) > diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h > index 474497e8743a..1dfafa71fc94 100644 > --- a/include/uapi/drm/msm_drm.h > +++ b/include/uapi/drm/msm_drm.h > @@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query { > __u32 pad; > }; > > +/* > + * Host cache maintenance (relevant for MSM_BO_CACHED) > + * driver may both clean/invalidate (flush) for clean > + */ > + > +#define MSM_GEM_SYNC_CACHE_CLEAN 0x1 > +#define MSM_GEM_SYNC_CACHE_INVALIDATE 0x2 > + > +#define MSM_GEM_SYNC_CACHE_FLAGS (MSM_GEM_SYNC_CACHE_CLEAN | \ > + MSM_GEM_SYNC_CACHE_INVALIDATE) > + > +struct drm_msm_gem_sync_cache { > + __u32 handle; > + __u32 flags; > + __u64 offset; > + __u64 end; /* offset + size */ > +}; > + > #define DRM_MSM_GET_PARAM 0x00 > /* placeholder: > #define DRM_MSM_SET_PARAM 0x01 > @@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query { > #define DRM_MSM_SUBMITQUEUE_NEW 0x0A > #define DRM_MSM_SUBMITQUEUE_CLOSE 0x0B > #define DRM_MSM_SUBMITQUEUE_QUERY 0x0C > +#define DRM_MSM_GEM_SYNC_CACHE 0x0D > > #define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param) > #define DRM_IOCTL_MSM_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new) > @@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query { > #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue) > #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32) > #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query) > +#define DRM_IOCTL_MSM_GEM_SYNC_CACHE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache) > > #if defined(__cplusplus) > } > -- > 2.26.1 > > _______________________________________________ > Freedreno mailing list > Freedreno@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/freedreno
> @@ -8,6 +8,7 @@ > #include <linux/shmem_fs.h> > #include <linux/dma-buf.h> > #include <linux/pfn_t.h> > +#include <linux/dma-noncoherent.h> NAK, dma-noncoherent.h is not for driver use. And will in fact go away in 5.10. > > #include <drm/drm_prime.h> > > @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj) > return 0; > } > > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, > + size_t range_start, size_t range_end) > +{ > + struct msm_gem_object *msm_obj = to_msm_bo(obj); > + > + /* TODO: sync only the required range, and don't invalidate on clean */ > + > + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) > + sync_for_device(msm_obj); > + > + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) > + sync_for_cpu(msm_obj); And make to these ones as well. They are complete abuses of the DMA API, and while we had to live with that for now to not cause regressions they absoutely must not be exposed in a userspace ABI like this.
On 10/2/20 3:53 AM, Christoph Hellwig wrote: >> @@ -8,6 +8,7 @@ >> #include <linux/shmem_fs.h> >> #include <linux/dma-buf.h> >> #include <linux/pfn_t.h> >> +#include <linux/dma-noncoherent.h> > > NAK, dma-noncoherent.h is not for driver use. And will in fact go > away in 5.10. > Not actually used, so can be removed. >> >> #include <drm/drm_prime.h> >> >> @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj) >> return 0; >> } >> >> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, >> + size_t range_start, size_t range_end) >> +{ >> + struct msm_gem_object *msm_obj = to_msm_bo(obj); >> + >> + /* TODO: sync only the required range, and don't invalidate on clean */ >> + >> + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) >> + sync_for_device(msm_obj); >> + >> + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) >> + sync_for_cpu(msm_obj); > > And make to these ones as well. They are complete abuses of the DMA > API, and while we had to live with that for now to not cause regressions > they absoutely must not be exposed in a userspace ABI like this. > How do you propose that cached non-coherent memory be implemented? It is a useful feature for userspace.
On Fri, Oct 02, 2020 at 08:46:35AM -0400, Jonathan Marek wrote: > > > +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, > > > + size_t range_start, size_t range_end) > > > +{ > > > + struct msm_gem_object *msm_obj = to_msm_bo(obj); > > > + > > > + /* TODO: sync only the required range, and don't invalidate on clean */ > > > + > > > + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) > > > + sync_for_device(msm_obj); > > > + > > > + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) > > > + sync_for_cpu(msm_obj); > > > > And make to these ones as well. They are complete abuses of the DMA > > API, and while we had to live with that for now to not cause regressions > > they absoutely must not be exposed in a userspace ABI like this. > > > > How do you propose that cached non-coherent memory be implemented? It is a > useful feature for userspace. If the driver is using the DMA API you need to use dma_alloc_noncoherent and friends as of 5.10 (see the iommu list for the discussion). If you use the raw IOMMU API (which I think the msm drm driver does) you need to work with the maintainers to implement a cache synchronization API that is not tied to the DMA API.
On 10/5/20 4:29 AM, Christoph Hellwig wrote: > On Fri, Oct 02, 2020 at 08:46:35AM -0400, Jonathan Marek wrote: >>>> +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, >>>> + size_t range_start, size_t range_end) >>>> +{ >>>> + struct msm_gem_object *msm_obj = to_msm_bo(obj); >>>> + >>>> + /* TODO: sync only the required range, and don't invalidate on clean */ >>>> + >>>> + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) >>>> + sync_for_device(msm_obj); >>>> + >>>> + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) >>>> + sync_for_cpu(msm_obj); >>> >>> And make to these ones as well. They are complete abuses of the DMA >>> API, and while we had to live with that for now to not cause regressions >>> they absoutely must not be exposed in a userspace ABI like this. >>> >> >> How do you propose that cached non-coherent memory be implemented? It is a >> useful feature for userspace. > > If the driver is using the DMA API you need to use dma_alloc_noncoherent > and friends as of 5.10 (see the iommu list for the discussion). > > If you use the raw IOMMU API (which I think the msm drm driver does) you > need to work with the maintainers to implement a cache synchronization > API that is not tied to the DMA API. > The cache synchronization doesn't have anything to do with IOMMU (for example: cache synchronization would be useful in cases where drm/msm doesn't use IOMMU). What is needed is to call arch_sync_dma_for_{cpu,device} (which is what I went with initially, but then decided to re-use drm/msm's sync_for_{cpu,device}). But you are also saying those functions aren't for driver use, and I doubt IOMMU maintainers will want to add wrappers for these functions just to satisfy this "not for driver use" requirement.
On Mon, Oct 05, 2020 at 10:35:43AM -0400, Jonathan Marek wrote: > The cache synchronization doesn't have anything to do with IOMMU (for > example: cache synchronization would be useful in cases where drm/msm > doesn't use IOMMU). It has to do with doing DMA. And we have two frameworks for doing DMA: either the DMA API which is for general driver use, and which as part of the design includes cache maintainance hidden behind the concept of ownership transfers. And we have the much more bare bones IOMMU API. If people want to use the "raw" IOMMU API with not cache coherent devices we'll need a cache maintainance API that goes along with it. It could either be formally part of the IOMMU API or be separate. > What is needed is to call arch_sync_dma_for_{cpu,device} (which is what I > went with initially, but then decided to re-use drm/msm's > sync_for_{cpu,device}). But you are also saying those functions aren't for > driver use, and I doubt IOMMU maintainers will want to add wrappers for > these functions just to satisfy this "not for driver use" requirement. arch_sync_dma_for_{cpu,device} are low-level helpers (and not very great ones at that). The definitively should not be used by drivers. They would be very useful buildblocks for a IOMMU cache maintainance API. Of course the best outcome would be if we could find a way for the MSM drm driver to just use DMA API and not deal with the lower level abstractions. Do you remember why the driver went for use of the IOMMU API?
On 10/6/20 3:23 AM, Christoph Hellwig wrote: > On Mon, Oct 05, 2020 at 10:35:43AM -0400, Jonathan Marek wrote: >> The cache synchronization doesn't have anything to do with IOMMU (for >> example: cache synchronization would be useful in cases where drm/msm >> doesn't use IOMMU). > > It has to do with doing DMA. And we have two frameworks for doing DMA: > either the DMA API which is for general driver use, and which as part of > the design includes cache maintainance hidden behind the concept of > ownership transfers. And we have the much more bare bones IOMMU API. > > If people want to use the "raw" IOMMU API with not cache coherent > devices we'll need a cache maintainance API that goes along with it. > It could either be formally part of the IOMMU API or be separate. > >> What is needed is to call arch_sync_dma_for_{cpu,device} (which is what I >> went with initially, but then decided to re-use drm/msm's >> sync_for_{cpu,device}). But you are also saying those functions aren't for >> driver use, and I doubt IOMMU maintainers will want to add wrappers for >> these functions just to satisfy this "not for driver use" requirement. > > arch_sync_dma_for_{cpu,device} are low-level helpers (and not very > great ones at that). The definitively should not be used by drivers. > They would be very useful buildblocks for a IOMMU cache maintainance > API. > > Of course the best outcome would be if we could find a way for the MSM > drm driver to just use DMA API and not deal with the lower level > abstractions. Do you remember why the driver went for use of the IOMMU > API? > One example why drm/msm can't use DMA API is multiple page table support (that is landing in 5.10), which is something that definitely couldn't work with DMA API. Another one is being able to choose the address for mappings, which AFAIK DMA API can't do (somewhat related to this: qcom hardware often has ranges of allowed addresses, which the dma_mask mechanism fails to represent, what I see is drivers using dma_mask as a "maximum address", and since addresses are allocated from the top it generally works) But let us imagine drm/msm switches to using DMA API. a2xx GPUs have their own very basic MMU (implemented by msm_gpummu.c), that will need to implement dma_map_ops, which will have to call arch_sync_dma_for_{cpu,device}. So drm/msm still needs to call arch_sync_dma_for_{cpu,device} in that scenario.
On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote: > One example why drm/msm can't use DMA API is multiple page table support > (that is landing in 5.10), which is something that definitely couldn't work > with DMA API. > > Another one is being able to choose the address for mappings, which AFAIK > DMA API can't do (somewhat related to this: qcom hardware often has ranges > of allowed addresses, which the dma_mask mechanism fails to represent, what > I see is drivers using dma_mask as a "maximum address", and since addresses > are allocated from the top it generally works) That sounds like a good enough rason to use the IOMMU API. I just wanted to make sure this really makes sense.
On Tue, Oct 06, 2020 at 08:23:06AM +0100, Christoph Hellwig wrote: > If people want to use the "raw" IOMMU API with not cache coherent > devices we'll need a cache maintainance API that goes along with it. > It could either be formally part of the IOMMU API or be separate. The IOMMU-API does not care about the caching effects of DMA, is manages IO address spaces for devices. I also don't know how this would be going to be implemented, the IOMMU-API does not have the concept of handles for mapped ranges and does not care about CPU virtual addresses (which are needed for cache flushes) of the memory it maps into IO page-tables. So I think a cache management API should be separate from the IOMMU-API. Regards, Joerg
On 2020-10-07 07:25, Christoph Hellwig wrote: > On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote: >> One example why drm/msm can't use DMA API is multiple page table support >> (that is landing in 5.10), which is something that definitely couldn't work >> with DMA API. >> >> Another one is being able to choose the address for mappings, which AFAIK >> DMA API can't do (somewhat related to this: qcom hardware often has ranges >> of allowed addresses, which the dma_mask mechanism fails to represent, what >> I see is drivers using dma_mask as a "maximum address", and since addresses >> are allocated from the top it generally works) > > That sounds like a good enough rason to use the IOMMU API. I just > wanted to make sure this really makes sense. I still think this situation would be best handled with a variant of dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set automatically when attaching to an unmanaged IOMMU domain. That way the device driver can make DMA API calls in the appropriate places that do the right thing either way, and only needs logic to decide whether to use the returned DMA addresses directly or ignore them if it knows they're overridden by its own IOMMU mapping. Robin.
On Tue, Oct 13, 2020 at 6:42 AM Robin Murphy <robin.murphy@arm.com> wrote: > > On 2020-10-07 07:25, Christoph Hellwig wrote: > > On Tue, Oct 06, 2020 at 09:19:32AM -0400, Jonathan Marek wrote: > >> One example why drm/msm can't use DMA API is multiple page table support > >> (that is landing in 5.10), which is something that definitely couldn't work > >> with DMA API. > >> > >> Another one is being able to choose the address for mappings, which AFAIK > >> DMA API can't do (somewhat related to this: qcom hardware often has ranges > >> of allowed addresses, which the dma_mask mechanism fails to represent, what > >> I see is drivers using dma_mask as a "maximum address", and since addresses > >> are allocated from the top it generally works) > > > > That sounds like a good enough rason to use the IOMMU API. I just > > wanted to make sure this really makes sense. > > I still think this situation would be best handled with a variant of > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set > automatically when attaching to an unmanaged IOMMU domain. That way the > device driver can make DMA API calls in the appropriate places that do > the right thing either way, and only needs logic to decide whether to > use the returned DMA addresses directly or ignore them if it knows > they're overridden by its own IOMMU mapping. > That would be pretty ideal from my PoV BR, -R
On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote: > I still think this situation would be best handled with a variant of > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set > automatically when attaching to an unmanaged IOMMU domain. dma_ops_bypass should mostly do the right thing as-is. swiotlb bouncing is triggered of two things: 1) the dma_mask. This is under control of the driver, and obviously if it is too small for a legit reason we can't just proceed 2) force_dma_unencrypted() - we'd need to do an opt-out here, either by a flag or by being smart and looking for an attached iommu on the device > That way the > device driver can make DMA API calls in the appropriate places that do the > right thing either way, and only needs logic to decide whether to use the > returned DMA addresses directly or ignore them if it knows they're > overridden by its own IOMMU mapping. I'd be happy to review patches for this.
On Thu, Oct 15, 2020 at 07:55:32AM +0100, Christoph Hellwig wrote: > On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote: > > I still think this situation would be best handled with a variant of > > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set > > automatically when attaching to an unmanaged IOMMU domain. > > dma_ops_bypass should mostly do the right thing as-is. swiotlb bouncing > is triggered of two things: > > 1) the dma_mask. This is under control of the driver, and obviously > if it is too small for a legit reason we can't just proceed Somewhat related, but is there a way to tell the dma-api to fail instead of falling back to swiotlb? In many case for gpu drivers it's much better if we fall back to dma_alloc_coherent and manage the copying ourselves instead of abstracting this away in the dma-api. Currently that's "solved" rather pessimistically by always allocating from dma_alloc_coherent if swiotlb could be in the picture (at least for ttm based drivers, i915 just falls over). -Daniel > 2) force_dma_unencrypted() - we'd need to do an opt-out here, either > by a flag or by being smart and looking for an attached iommu on > the device > > > That way the > > device driver can make DMA API calls in the appropriate places that do the > > right thing either way, and only needs logic to decide whether to use the > > returned DMA addresses directly or ignore them if it knows they're > > overridden by its own IOMMU mapping. > > I'd be happy to review patches for this.
On Thu, Oct 15, 2020 at 05:33:34PM +0200, Daniel Vetter wrote: > On Thu, Oct 15, 2020 at 07:55:32AM +0100, Christoph Hellwig wrote: > > On Tue, Oct 13, 2020 at 02:42:38PM +0100, Robin Murphy wrote: > > > I still think this situation would be best handled with a variant of > > > dma_ops_bypass that also guarantees to bypass SWIOTLB, and can be set > > > automatically when attaching to an unmanaged IOMMU domain. > > > > dma_ops_bypass should mostly do the right thing as-is. swiotlb bouncing > > is triggered of two things: > > > > 1) the dma_mask. This is under control of the driver, and obviously > > if it is too small for a legit reason we can't just proceed > > Somewhat related, but is there a way to tell the dma-api to fail instead > of falling back to swiotlb? In many case for gpu drivers it's much better > if we fall back to dma_alloc_coherent and manage the copying ourselves > instead of abstracting this away in the dma-api. Currently that's "solved" > rather pessimistically by always allocating from dma_alloc_coherent if > swiotlb could be in the picture (at least for ttm based drivers, i915 just > falls over). Is this for the alloc_pages plus manually map logic in various drivers? They should switch to the new dma_alloc_pages API that I'll send to Linus for 5.10 soon.
On Thu, Oct 15, 2020 at 04:43:01PM +0100, Christoph Hellwig wrote: > > Somewhat related, but is there a way to tell the dma-api to fail instead > > of falling back to swiotlb? In many case for gpu drivers it's much better > > if we fall back to dma_alloc_coherent and manage the copying ourselves > > instead of abstracting this away in the dma-api. Currently that's "solved" > > rather pessimistically by always allocating from dma_alloc_coherent if > > swiotlb could be in the picture (at least for ttm based drivers, i915 just > > falls over). > > Is this for the alloc_pages plus manually map logic in various drivers? > > They should switch to the new dma_alloc_pages API that I'll send to > Linus for 5.10 soon. Daniel, can you clarify this?
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9716210495fc..305db1db1064 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -964,6 +964,26 @@ static int msm_ioctl_submitqueue_close(struct drm_device *dev, void *data, return msm_submitqueue_remove(file->driver_priv, id); } +static int msm_ioctl_gem_sync_cache(struct drm_device *dev, void *data, + struct drm_file *file) +{ + struct drm_msm_gem_sync_cache *args = data; + struct drm_gem_object *obj; + + if (args->flags & ~MSM_GEM_SYNC_CACHE_FLAGS) + return -EINVAL; + + obj = drm_gem_object_lookup(file, args->handle); + if (!obj) + return -ENOENT; + + msm_gem_sync_cache(obj, args->flags, args->offset, args->end); + + drm_gem_object_put(obj); + + return 0; +} + static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_GET_PARAM, msm_ioctl_get_param, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_GEM_NEW, msm_ioctl_gem_new, DRM_RENDER_ALLOW), @@ -976,6 +996,7 @@ static const struct drm_ioctl_desc msm_ioctls[] = { DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_NEW, msm_ioctl_submitqueue_new, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_CLOSE, msm_ioctl_submitqueue_close, DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(MSM_SUBMITQUEUE_QUERY, msm_ioctl_submitqueue_query, DRM_RENDER_ALLOW), + DRM_IOCTL_DEF_DRV(MSM_GEM_SYNC_CACHE, msm_ioctl_gem_sync_cache, DRM_RENDER_ALLOW), }; static const struct vm_operations_struct vm_ops = { diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 6384844b1696..5e932dae453f 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -314,6 +314,8 @@ void msm_gem_move_to_active(struct drm_gem_object *obj, void msm_gem_move_to_inactive(struct drm_gem_object *obj); int msm_gem_cpu_prep(struct drm_gem_object *obj, uint32_t op, ktime_t *timeout); int msm_gem_cpu_fini(struct drm_gem_object *obj); +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, + size_t range_start, size_t range_end); void msm_gem_free_object(struct drm_gem_object *obj); int msm_gem_new_handle(struct drm_device *dev, struct drm_file *file, uint32_t size, uint32_t flags, uint32_t *handle, char *name); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index ad9a627493ae..93da88b3fc50 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -8,6 +8,7 @@ #include <linux/shmem_fs.h> #include <linux/dma-buf.h> #include <linux/pfn_t.h> +#include <linux/dma-noncoherent.h> #include <drm/drm_prime.h> @@ -808,6 +809,20 @@ int msm_gem_cpu_fini(struct drm_gem_object *obj) return 0; } +void msm_gem_sync_cache(struct drm_gem_object *obj, uint32_t flags, + size_t range_start, size_t range_end) +{ + struct msm_gem_object *msm_obj = to_msm_bo(obj); + + /* TODO: sync only the required range, and don't invalidate on clean */ + + if (flags & MSM_GEM_SYNC_CACHE_CLEAN) + sync_for_device(msm_obj); + + if (flags & MSM_GEM_SYNC_CACHE_INVALIDATE) + sync_for_cpu(msm_obj); +} + #ifdef CONFIG_DEBUG_FS static void describe_fence(struct dma_fence *fence, const char *type, struct seq_file *m) diff --git a/include/uapi/drm/msm_drm.h b/include/uapi/drm/msm_drm.h index 474497e8743a..1dfafa71fc94 100644 --- a/include/uapi/drm/msm_drm.h +++ b/include/uapi/drm/msm_drm.h @@ -319,6 +319,24 @@ struct drm_msm_submitqueue_query { __u32 pad; }; +/* + * Host cache maintenance (relevant for MSM_BO_CACHED) + * driver may both clean/invalidate (flush) for clean + */ + +#define MSM_GEM_SYNC_CACHE_CLEAN 0x1 +#define MSM_GEM_SYNC_CACHE_INVALIDATE 0x2 + +#define MSM_GEM_SYNC_CACHE_FLAGS (MSM_GEM_SYNC_CACHE_CLEAN | \ + MSM_GEM_SYNC_CACHE_INVALIDATE) + +struct drm_msm_gem_sync_cache { + __u32 handle; + __u32 flags; + __u64 offset; + __u64 end; /* offset + size */ +}; + #define DRM_MSM_GET_PARAM 0x00 /* placeholder: #define DRM_MSM_SET_PARAM 0x01 @@ -336,6 +354,7 @@ struct drm_msm_submitqueue_query { #define DRM_MSM_SUBMITQUEUE_NEW 0x0A #define DRM_MSM_SUBMITQUEUE_CLOSE 0x0B #define DRM_MSM_SUBMITQUEUE_QUERY 0x0C +#define DRM_MSM_GEM_SYNC_CACHE 0x0D #define DRM_IOCTL_MSM_GET_PARAM DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GET_PARAM, struct drm_msm_param) #define DRM_IOCTL_MSM_GEM_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_GEM_NEW, struct drm_msm_gem_new) @@ -348,6 +367,7 @@ struct drm_msm_submitqueue_query { #define DRM_IOCTL_MSM_SUBMITQUEUE_NEW DRM_IOWR(DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_NEW, struct drm_msm_submitqueue) #define DRM_IOCTL_MSM_SUBMITQUEUE_CLOSE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_CLOSE, __u32) #define DRM_IOCTL_MSM_SUBMITQUEUE_QUERY DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_SUBMITQUEUE_QUERY, struct drm_msm_submitqueue_query) +#define DRM_IOCTL_MSM_GEM_SYNC_CACHE DRM_IOW (DRM_COMMAND_BASE + DRM_MSM_GEM_SYNC_CACHE, struct drm_msm_gem_sync_cache) #if defined(__cplusplus) }
This makes it possible to use the non-coherent cached MSM_BO_CACHED mode, which otherwise doesn't provide any method for cleaning/invalidating the cache to sync with the device. Signed-off-by: Jonathan Marek <jonathan@marek.ca> --- drivers/gpu/drm/msm/msm_drv.c | 21 +++++++++++++++++++++ drivers/gpu/drm/msm/msm_drv.h | 2 ++ drivers/gpu/drm/msm/msm_gem.c | 15 +++++++++++++++ include/uapi/drm/msm_drm.h | 20 ++++++++++++++++++++ 4 files changed, 58 insertions(+)