diff mbox series

[RFC,2/4] ASoC: soc-generic-dmaengine-pcm: Add custom prepare and submit function

Message ID 20201117080354.4309-4-michael.wei.hong.sit@intel.com
State Superseded
Headers show
Series Enable DMA mode on Intel Keem Bay platform | expand

Commit Message

Michael Sit Wei Hong Nov. 17, 2020, 8:03 a.m. UTC
Enabling custom prepare and submit function to overcome DMA limitation.

In the Intel KeemBay solution, the DW AXI-based DMA has a limitation on
the number of DMA blocks per transfer. In the case of 16 bit audio ASoC
would allocate blocks exceeding the DMA block limitation.

The ASoC layers are not aware of such DMA limitation, and the DMA engine
does not provide an API to set the maximum number of blocks per linked link.

This patch suggests an additional callback to let the caller check and modify
the number of blocks per transfer to work-around the limitations.

Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
---
 include/sound/dmaengine_pcm.h         |  6 ++++++
 sound/core/pcm_dmaengine.c            | 30 ++++++++++++++++++++++-----
 sound/soc/soc-generic-dmaengine-pcm.c |  8 ++++++-
 3 files changed, 38 insertions(+), 6 deletions(-)

Comments

Vinod Koul Nov. 17, 2020, 1:56 p.m. UTC | #1
On 17-11-20, 16:03, Michael Sit Wei Hong wrote:
> Enabling custom prepare and submit function to overcome DMA limitation.
> 
> In the Intel KeemBay solution, the DW AXI-based DMA has a limitation on
> the number of DMA blocks per transfer. In the case of 16 bit audio ASoC
> would allocate blocks exceeding the DMA block limitation.

Looking at this and cover does not tell me the limitation of the
hardware. Can you please elaborate what is the special feature this
hardware brings which results in custom solution here..?
> 
> The ASoC layers are not aware of such DMA limitation, and the DMA engine
> does not provide an API to set the maximum number of blocks per linked link.

What does a block refer to here..?

Btw have you looked into dma_slave_caps specifically max_sg_burst field
added recently

Thanks

> This patch suggests an additional callback to let the caller check and modify
> the number of blocks per transfer to work-around the limitations.
> 
> Signed-off-by: Michael Sit Wei Hong <michael.wei.hong.sit@intel.com>
> Reviewed-by: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>
> ---
>  include/sound/dmaengine_pcm.h         |  6 ++++++
>  sound/core/pcm_dmaengine.c            | 30 ++++++++++++++++++++++-----
>  sound/soc/soc-generic-dmaengine-pcm.c |  8 ++++++-
>  3 files changed, 38 insertions(+), 6 deletions(-)
> 
> diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h
> index 8c5e38180fb0..9fae56d39ae2 100644
> --- a/include/sound/dmaengine_pcm.h
> +++ b/include/sound/dmaengine_pcm.h
> @@ -28,6 +28,9 @@ snd_pcm_substream_to_dma_direction(const struct snd_pcm_substream *substream)
>  int snd_hwparams_to_dma_slave_config(const struct snd_pcm_substream *substream,
>  	const struct snd_pcm_hw_params *params, struct dma_slave_config *slave_config);
>  int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd);
> +int snd_dmaengine_pcm_custom_trigger(struct snd_pcm_substream *substream, int cmd,
> +	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream));
> +
>  snd_pcm_uframes_t snd_dmaengine_pcm_pointer(struct snd_pcm_substream *substream);
>  snd_pcm_uframes_t snd_dmaengine_pcm_pointer_no_residue(struct snd_pcm_substream *substream);
>  
> @@ -113,6 +116,8 @@ int snd_dmaengine_pcm_refine_runtime_hwparams(
>   *   which do not use devicetree.
>   * @process: Callback used to apply processing on samples transferred from/to
>   *   user space.
> + * @custom_pcm_prepare_and_submit: Callback used to work-around DMA limitations
> + *   related to link lists.
>   * @compat_filter_fn: Will be used as the filter function when requesting a
>   *  channel for platforms which do not use devicetree. The filter parameter
>   *  will be the DAI's DMA data.
> @@ -138,6 +143,7 @@ struct snd_dmaengine_pcm_config {
>  	int (*process)(struct snd_pcm_substream *substream,
>  		       int channel, unsigned long hwoff,
>  		       void *buf, unsigned long bytes);
> +	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream);
>  	dma_filter_fn compat_filter_fn;
>  	struct device *dma_dev;
>  	const char *chan_names[SNDRV_PCM_STREAM_LAST + 1];
> diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c
> index 4d059ff2b2e4..cbd1429de509 100644
> --- a/sound/core/pcm_dmaengine.c
> +++ b/sound/core/pcm_dmaengine.c
> @@ -170,16 +170,20 @@ static int dmaengine_pcm_prepare_and_submit(struct snd_pcm_substream *substream)
>  }
>  
>  /**
> - * snd_dmaengine_pcm_trigger - dmaengine based PCM trigger implementation
> + * snd_dmaengine_pcm_custom_trigger - customized PCM trigger implementation to
> + *  work-around DMA limitations related to link lists.
>   * @substream: PCM substream
>   * @cmd: Trigger command
> + * @custom_pcm_prepare_and_submit: custom function to deal with DMA limitations
>   *
>   * Returns 0 on success, a negative error code otherwise.
>   *
> - * This function can be used as the PCM trigger callback for dmaengine based PCM
> - * driver implementations.
> + * This function can be used as the PCM trigger callback for customized dmaengine
> + * based PCM driver implementations.
>   */
> -int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
> +
> +int snd_dmaengine_pcm_custom_trigger(struct snd_pcm_substream *substream, int cmd,
> +	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream))
>  {
>  	struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream);
>  	struct snd_pcm_runtime *runtime = substream->runtime;
> @@ -187,7 +191,7 @@ int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
>  
>  	switch (cmd) {
>  	case SNDRV_PCM_TRIGGER_START:
> -		ret = dmaengine_pcm_prepare_and_submit(substream);
> +		ret = custom_pcm_prepare_and_submit(substream);
>  		if (ret)
>  			return ret;
>  		dma_async_issue_pending(prtd->dma_chan);
> @@ -214,6 +218,22 @@ int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
>  
>  	return 0;
>  }
> +EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_custom_trigger);
> +
> +/**
> + * snd_dmaengine_pcm_trigger - dmaengine based PCM trigger implementation
> + * @substream: PCM substream
> + * @cmd: Trigger command
> + *
> + * Returns 0 on success, a negative error code otherwise.
> + *
> + * This function can be used as the PCM trigger callback for dmaengine based PCM
> + * driver implementations.
> + */
> +int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
> +{
> +	return snd_dmaengine_pcm_custom_trigger(substream, cmd, dmaengine_pcm_prepare_and_submit);
> +}
>  EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_trigger);
>  
>  /**
> diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c
> index 9ef80a48707e..88fca6402a36 100644
> --- a/sound/soc/soc-generic-dmaengine-pcm.c
> +++ b/sound/soc/soc-generic-dmaengine-pcm.c
> @@ -173,7 +173,13 @@ static int dmaengine_pcm_close(struct snd_soc_component *component,
>  static int dmaengine_pcm_trigger(struct snd_soc_component *component,
>  				 struct snd_pcm_substream *substream, int cmd)
>  {
> -	return snd_dmaengine_pcm_trigger(substream, cmd);
> +	struct dmaengine_pcm *pcm = soc_component_to_pcm(component);
> +
> +	if (pcm->config && pcm->config->custom_pcm_prepare_and_submit)
> +		return snd_dmaengine_pcm_custom_trigger(substream, cmd,
> +							pcm->config->custom_pcm_prepare_and_submit);
> +	else
> +		return snd_dmaengine_pcm_trigger(substream, cmd);
>  }
>  
>  static struct dma_chan *dmaengine_pcm_compat_request_channel(
> -- 
> 2.17.1
Andy Shevchenko Nov. 17, 2020, 3:50 p.m. UTC | #2
On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei Hong wrote:
> Enabling custom prepare and submit function to overcome DMA limitation.
> 
> In the Intel KeemBay solution, the DW AXI-based DMA has a limitation on
> the number of DMA blocks per transfer. In the case of 16 bit audio ASoC
> would allocate blocks exceeding the DMA block limitation.

I'm still not sure the hardware has such a limitation.

The Synopsys IP supports linked list (LLI) transfers and I hardly can
imagine the list has any limitation. Even though, one can always emulate
LLI in software how it's done in the DesignWare AHB DMA driver.

I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM and didn't
find any errata or limits like this.
Sia, Jee Heng Nov. 18, 2020, 12:34 a.m. UTC | #3
> -----Original Message-----
> From: Andy Shevchenko <andriy.shevchenko@intel.com>
> Sent: 17 November 2020 11:51 PM
> To: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>
> Cc: alsa-devel@alsa-project.org; tiwai@suse.com; broonie@kernel.org; pierre-
> louis.bossart@linux.intel.com; Rojewski, Cezary <cezary.rojewski@intel.com>;
> liam.r.girdwood@linux.intel.com; Sia, Jee Heng <jee.heng.sia@intel.com>;
> vkoul@kernel.org; lars@metafoo.de
> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add custom
> prepare and submit function
> 
> On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei Hong wrote:
> > Enabling custom prepare and submit function to overcome DMA limitation.
> >
> > In the Intel KeemBay solution, the DW AXI-based DMA has a limitation
> > on the number of DMA blocks per transfer. In the case of 16 bit audio
> > ASoC would allocate blocks exceeding the DMA block limitation.
> 
> I'm still not sure the hardware has such a limitation.
> 
> The Synopsys IP supports linked list (LLI) transfers and I hardly can imagine the
> list has any limitation. Even though, one can always emulate LLI in software how
> it's done in the DesignWare AHB DMA driver.
> 
> I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM and didn't find any
> errata or limits like this.
[>>] Intel KeemBay datasheet can be found at below link:
https://www.intel.com/content/www/us/en/secure/design/confidential/products-and-solutions/processors-and-chipsets/keem-bay/technical-library.html?grouping=EMT_Content%20Type&sort=title:asc
Pg783, "Programmable transfer length (block length), max 1024". Sub-list can't exceed 1024 blocks.
BTW, I think the 16bits audio could be a confusion because it is not about the number of bits, but rather the constraint of the block length. Base on my understanding, Audio needs a period larger than 10ms, regardless of the bit depth.
The questions here are:
1. Should DMAEngine expose a new API to constraint the block_length (instead of size in bytes)?
2. Should DMA client re-split the linked-list before passing the linked-list to DMAEngine.
3. Should DMA controller driver re-split the linked-list before execution.
> 
> --
> With Best Regards,
> Andy Shevchenko
>
Vinod Koul Nov. 18, 2020, 4:40 a.m. UTC | #4
On 18-11-20, 00:34, Sia, Jee Heng wrote:
> 
> 
> > -----Original Message-----
> > From: Andy Shevchenko <andriy.shevchenko@intel.com>
> > Sent: 17 November 2020 11:51 PM
> > To: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>
> > Cc: alsa-devel@alsa-project.org; tiwai@suse.com; broonie@kernel.org; pierre-
> > louis.bossart@linux.intel.com; Rojewski, Cezary <cezary.rojewski@intel.com>;
> > liam.r.girdwood@linux.intel.com; Sia, Jee Heng <jee.heng.sia@intel.com>;
> > vkoul@kernel.org; lars@metafoo.de
> > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add custom
> > prepare and submit function
> > 
> > On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei Hong wrote:
> > > Enabling custom prepare and submit function to overcome DMA limitation.
> > >
> > > In the Intel KeemBay solution, the DW AXI-based DMA has a limitation
> > > on the number of DMA blocks per transfer. In the case of 16 bit audio
> > > ASoC would allocate blocks exceeding the DMA block limitation.
> > 
> > I'm still not sure the hardware has such a limitation.
> > 
> > The Synopsys IP supports linked list (LLI) transfers and I hardly can imagine the
> > list has any limitation. Even though, one can always emulate LLI in software how
> > it's done in the DesignWare AHB DMA driver.
> > 
> > I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM and didn't find any
> > errata or limits like this.
> [>>] Intel KeemBay datasheet can be found at below link:

** Please wrap your replies to 80 chars ** I have reflown below


> https://www.intel.com/content/www/us/en/secure/design/confidential/products-and-solutions/processors-and-chipsets/keem-bay/technical-library.html?grouping=EMT_Content%20Type&sort=title:asc

And this link is not accessible freely!

> Pg783, "Programmable transfer length (block length), max 1024".

Okay so block length cant be more than 1024, and IIUC that is 1024 items
and not bytes right

> Sub-list can't exceed 1024 blocks.  BTW, I think the 16bits audio
> could be a confusion because it is not about the number of bits, but
> rather the constraint of the block length.
> Base on my understanding,
> Audio needs a period larger than 10ms, regardless of the bit depth.
> The questions here are:
> 1. Should DMAEngine expose a new API to constraint the block_length
> (instead of size in bytes)?
> 2. Should DMA client re-split the linked-list before passing the
> linked-list to DMAEngine.
> 3. Should DMA controller driver re-split the linked-list before
> execution.

I would go with 3, this is not a fwk issue and can be easily handled in
the dma driver. It knows the limitation on block and can split a period
into multiple blocks and set the interrupt on last block. I do not see
why that will not work
Sia, Jee Heng Nov. 18, 2020, 5:27 a.m. UTC | #5
> -----Original Message-----
> From: Vinod Koul <vkoul@kernel.org>
> Sent: 18 November 2020 12:41 PM
> To: Sia, Jee Heng <jee.heng.sia@intel.com>
> Cc: Shevchenko, Andriy <andriy.shevchenko@intel.com>; Sit, Michael Wei Hong
> <michael.wei.hong.sit@intel.com>; alsa-devel@alsa-project.org; tiwai@suse.com;
> broonie@kernel.org; pierre-louis.bossart@linux.intel.com; Rojewski, Cezary
> <cezary.rojewski@intel.com>; liam.r.girdwood@linux.intel.com; lars@metafoo.de
> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add custom
> prepare and submit function
> 
> On 18-11-20, 00:34, Sia, Jee Heng wrote:
> >
> >
> > > -----Original Message-----
> > > From: Andy Shevchenko <andriy.shevchenko@intel.com>
> > > Sent: 17 November 2020 11:51 PM
> > > To: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>
> > > Cc: alsa-devel@alsa-project.org; tiwai@suse.com; broonie@kernel.org;
> > > pierre- louis.bossart@linux.intel.com; Rojewski, Cezary
> > > <cezary.rojewski@intel.com>; liam.r.girdwood@linux.intel.com; Sia,
> > > Jee Heng <jee.heng.sia@intel.com>; vkoul@kernel.org; lars@metafoo.de
> > > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add
> > > custom prepare and submit function
> > >
> > > On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei Hong wrote:
> > > > Enabling custom prepare and submit function to overcome DMA limitation.
> > > >
> > > > In the Intel KeemBay solution, the DW AXI-based DMA has a
> > > > limitation on the number of DMA blocks per transfer. In the case
> > > > of 16 bit audio ASoC would allocate blocks exceeding the DMA block limitation.
> > >
> > > I'm still not sure the hardware has such a limitation.
> > >
> > > The Synopsys IP supports linked list (LLI) transfers and I hardly
> > > can imagine the list has any limitation. Even though, one can always
> > > emulate LLI in software how it's done in the DesignWare AHB DMA driver.
> > >
> > > I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM and didn't
> > > find any errata or limits like this.
> > [>>] Intel KeemBay datasheet can be found at below link:
> 
> ** Please wrap your replies to 80 chars ** I have reflown below
[>>] Noted.
> 
> 
> > https://www.intel.com/content/www/us/en/secure/design/confidential/pro
> > ducts-and-solutions/processors-and-chipsets/keem-bay/technical-library
> > .html?grouping=EMT_Content%20Type&sort=title:asc
> 
> And this link is not accessible freely!
[>>] Sorry, perhaps need to apply the access.
> 
> > Pg783, "Programmable transfer length (block length), max 1024".
> 
> Okay so block length cant be more than 1024, and IIUC that is 1024 items and not
> bytes right
[>>] Yes, it is 1024 items/blocks
> 
> > Sub-list can't exceed 1024 blocks.  BTW, I think the 16bits audio
> > could be a confusion because it is not about the number of bits, but
> > rather the constraint of the block length.
> > Base on my understanding,
> > Audio needs a period larger than 10ms, regardless of the bit depth.
> > The questions here are:
> > 1. Should DMAEngine expose a new API to constraint the block_length
> > (instead of size in bytes)?
> > 2. Should DMA client re-split the linked-list before passing the
> > linked-list to DMAEngine.
> > 3. Should DMA controller driver re-split the linked-list before
> > execution.
> 
> I would go with 3, this is not a fwk issue and can be easily handled in the dma driver.
> It knows the limitation on block and can split a period into multiple blocks and set
> the interrupt on last block. I do not see why that will not work
[>>] Got it. A separate patch shall be submitted to AxiDMA to split the linked-list. 
Hope the rest of folks are fine with this approach.
> 
> --
> ~Vinod
Sia, Jee Heng Nov. 24, 2020, 3:51 a.m. UTC | #6
> -----Original Message-----
> From: Shevchenko, Andriy <andriy.shevchenko@intel.com>
> Sent: 18 November 2020 10:51 PM
> To: Sia, Jee Heng <jee.heng.sia@intel.com>
> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>; alsa-devel@alsa-
> project.org; tiwai@suse.com; broonie@kernel.org; pierre-
> louis.bossart@linux.intel.com; Rojewski, Cezary <cezary.rojewski@intel.com>;
> liam.r.girdwood@linux.intel.com; vkoul@kernel.org; lars@metafoo.de
> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add custom
> prepare and submit function
> 
> On Wed, Nov 18, 2020 at 02:34:22AM +0200, Sia, Jee Heng wrote:
> > > From: Andy Shevchenko <andriy.shevchenko@intel.com>
> > > Sent: 17 November 2020 11:51 PM
> > > On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei Hong wrote:
> > > > Enabling custom prepare and submit function to overcome DMA limitation.
> > > >
> > > > In the Intel KeemBay solution, the DW AXI-based DMA has a
> > > > limitation on the number of DMA blocks per transfer. In the case
> > > > of 16 bit audio ASoC would allocate blocks exceeding the DMA block limitation.
> > >
> > > I'm still not sure the hardware has such a limitation.
> > >
> > > The Synopsys IP supports linked list (LLI) transfers and I hardly
> > > can imagine the list has any limitation. Even though, one can always
> > > emulate LLI in software how it's done in the DesignWare AHB DMA driver.
> > >
> > > I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM and didn't
> > > find any errata or limits like this.
> > [>>] Intel KeemBay datasheet can be found at below link:
> > https://www.intel.com/content/www/us/en/secure/design/confidential/pro
> > ducts-and-solutions/processors-and-chipsets/keem-bay/technical-library
> > .html?grouping=EMT_Content%20Type&sort=title:asc
> > Pg783, "Programmable transfer length (block length), max 1024". Sub-list can't
> exceed 1024 blocks.
> > BTW, I think the 16bits audio could be a confusion because it is not about the
> number of bits, but rather the constraint of the block length. Base on my
> understanding, Audio needs a period larger than 10ms, regardless of the bit depth.
> > The questions here are:
> > 1. Should DMAEngine expose a new API to constraint the block_length (instead of
> size in bytes)?
> > 2. Should DMA client re-split the linked-list before passing the linked-list to
> DMAEngine.
> > 3. Should DMA controller driver re-split the linked-list before execution.
> 
> Since we have DMA slave capabilities, the consumer may ask for them and prepare
> the SG list accordingly.
> 
> Above limitation is a block size (value of 1023 is a maximum, meaning 1024
> maximum items in the block of given transfer width). So, like DesignWare DMA
> (AHB) has up to 4095 (but I saw hardware with 2047) or iDMA 32- and 64-bit have
> 131071. There is no limitation for amount of blocks of given maximum length in the
> LLI!
> 
> No hacks are needed, really.
[>>] Hi All, can we have the agreement that DMA clients should optimize the linked-list
[>>] by retrieve the DMA capabilities from the DMA controller?
[>>] I noticed that Vinod voted #3 but Andy voted #2. 
[>>] We need your decision to move on.
> 
> --
> With Best Regards,
> Andy Shevchenko
>
Michael Sit Wei Hong Nov. 30, 2020, 9:57 a.m. UTC | #7
> -----Original Message-----
> From: Sia, Jee Heng <jee.heng.sia@intel.com>
> Sent: Tuesday, 24 November, 2020 11:51 AM
> To: Shevchenko, Andriy <andriy.shevchenko@intel.com>
> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;
> alsa-devel@alsa-project.org; tiwai@suse.com;
> broonie@kernel.org; pierre-louis.bossart@linux.intel.com;
> Rojewski, Cezary <cezary.rojewski@intel.com>;
> liam.r.girdwood@linux.intel.com; vkoul@kernel.org;
> lars@metafoo.de
> Subject: RE: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm:
> Add custom prepare and submit function
> 
> 
> 
> > -----Original Message-----
> > From: Shevchenko, Andriy <andriy.shevchenko@intel.com>
> > Sent: 18 November 2020 10:51 PM
> > To: Sia, Jee Heng <jee.heng.sia@intel.com>
> > Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;
> > alsa-devel@alsa- project.org; tiwai@suse.com;
> broonie@kernel.org;
> > pierre- louis.bossart@linux.intel.com; Rojewski, Cezary
> > <cezary.rojewski@intel.com>; liam.r.girdwood@linux.intel.com;
> > vkoul@kernel.org; lars@metafoo.de
> > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-
> pcm: Add
> > custom prepare and submit function
> >
> > On Wed, Nov 18, 2020 at 02:34:22AM +0200, Sia, Jee Heng
> wrote:
> > > > From: Andy Shevchenko <andriy.shevchenko@intel.com>
> > > > Sent: 17 November 2020 11:51 PM
> > > > On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei
> Hong wrote:
> > > > > Enabling custom prepare and submit function to
> overcome DMA limitation.
> > > > >
> > > > > In the Intel KeemBay solution, the DW AXI-based DMA
> has a
> > > > > limitation on the number of DMA blocks per transfer. In
> the case
> > > > > of 16 bit audio ASoC would allocate blocks exceeding the
> DMA block limitation.
> > > >
> > > > I'm still not sure the hardware has such a limitation.
> > > >
> > > > The Synopsys IP supports linked list (LLI) transfers and I
> hardly
> > > > can imagine the list has any limitation. Even though, one can
> > > > always emulate LLI in software how it's done in the
> DesignWare AHB DMA driver.
> > > >
> > > > I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM
> and
> > > > didn't find any errata or limits like this.
> > > [>>] Intel KeemBay datasheet can be found at below link:
> > >
> https://www.intel.com/content/www/us/en/secure/design/con
> fidential/p
> > > ro
> > > ducts-and-solutions/processors-and-chipsets/keem-
> bay/technical-libra
> > > ry .html?grouping=EMT_Content%20Type&sort=title:asc
> > > Pg783, "Programmable transfer length (block length), max
> 1024".
> > > Sub-list can't
> > exceed 1024 blocks.
> > > BTW, I think the 16bits audio could be a confusion because it is
> not
> > > about the
> > number of bits, but rather the constraint of the block length.
> Base on
> > my understanding, Audio needs a period larger than 10ms,
> regardless of the bit depth.
> > > The questions here are:
> > > 1. Should DMAEngine expose a new API to constraint the
> block_length
> > > (instead of
> > size in bytes)?
> > > 2. Should DMA client re-split the linked-list before passing the
> > > linked-list to
> > DMAEngine.
> > > 3. Should DMA controller driver re-split the linked-list before
> execution.
> >
> > Since we have DMA slave capabilities, the consumer may ask
> for them
> > and prepare the SG list accordingly.
> >
> > Above limitation is a block size (value of 1023 is a maximum,
> meaning
> > 1024 maximum items in the block of given transfer width). So,
> like
> > DesignWare DMA
> > (AHB) has up to 4095 (but I saw hardware with 2047) or iDMA
> 32- and
> > 64-bit have 131071. There is no limitation for amount of blocks
> of
> > given maximum length in the LLI!
> >
> > No hacks are needed, really.
> [>>] Hi All, can we have the agreement that DMA clients should
> optimize the linked-list [>>] by retrieve the DMA capabilities from
> the DMA controller?
> [>>] I noticed that Vinod voted #3 but Andy voted #2.
> [>>] We need your decision to move on.
> >
> > --
> > With Best Regards,
> > Andy Shevchenko
> >
Hi everyone,

Is there anymore comment on this RFC?
We will be using the ASoC framework to split the linked-list, since resplitting the linked-list in DMA is a no go.
If there isn't any more comments, we will push these patches for review and merging.

Thanks,
Regards,
Michael
Lars-Peter Clausen Nov. 30, 2020, 10:45 a.m. UTC | #8
On 11/30/20 10:57 AM, Sit, Michael Wei Hong wrote:
>
>> -----Original Message-----
>> From: Sia, Jee Heng <jee.heng.sia@intel.com>
>> Sent: Tuesday, 24 November, 2020 11:51 AM
>> To: Shevchenko, Andriy <andriy.shevchenko@intel.com>
>> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;
>> alsa-devel@alsa-project.org; tiwai@suse.com;
>> broonie@kernel.org; pierre-louis.bossart@linux.intel.com;
>> Rojewski, Cezary <cezary.rojewski@intel.com>;
>> liam.r.girdwood@linux.intel.com; vkoul@kernel.org;
>> lars@metafoo.de
>> Subject: RE: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm:
>> Add custom prepare and submit function
>>
>>
>>
>>> -----Original Message-----
>>> From: Shevchenko, Andriy <andriy.shevchenko@intel.com>
>>> Sent: 18 November 2020 10:51 PM
>>> To: Sia, Jee Heng <jee.heng.sia@intel.com>
>>> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;
>>> alsa-devel@alsa- project.org; tiwai@suse.com;
>> broonie@kernel.org;
>>> pierre- louis.bossart@linux.intel.com; Rojewski, Cezary
>>> <cezary.rojewski@intel.com>; liam.r.girdwood@linux.intel.com;
>>> vkoul@kernel.org; lars@metafoo.de
>>> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-
>> pcm: Add
>>> custom prepare and submit function
>>>
>>> On Wed, Nov 18, 2020 at 02:34:22AM +0200, Sia, Jee Heng
>> wrote:
>>>>> From: Andy Shevchenko <andriy.shevchenko@intel.com>
>>>>> Sent: 17 November 2020 11:51 PM
>>>>> On Tue, Nov 17, 2020 at 04:03:48PM +0800, Michael Sit Wei
>> Hong wrote:
>>>>>> Enabling custom prepare and submit function to
>> overcome DMA limitation.
>>>>>> In the Intel KeemBay solution, the DW AXI-based DMA
>> has a
>>>>>> limitation on the number of DMA blocks per transfer. In
>> the case
>>>>>> of 16 bit audio ASoC would allocate blocks exceeding the
>> DMA block limitation.
>>>>> I'm still not sure the hardware has such a limitation.
>>>>>
>>>>> The Synopsys IP supports linked list (LLI) transfers and I
>> hardly
>>>>> can imagine the list has any limitation. Even though, one can
>>>>> always emulate LLI in software how it's done in the
>> DesignWare AHB DMA driver.
>>>>> I have briefly read chapter 4.6 "AXI_DMA" of Keem Bay TRM
>> and
>>>>> didn't find any errata or limits like this.
>>>> [>>] Intel KeemBay datasheet can be found at below link:
>>>>
>> https://www.intel.com/content/www/us/en/secure/design/con
>> fidential/p
>>>> ro
>>>> ducts-and-solutions/processors-and-chipsets/keem-
>> bay/technical-libra
>>>> ry .html?grouping=EMT_Content%20Type&sort=title:asc
>>>> Pg783, "Programmable transfer length (block length), max
>> 1024".
>>>> Sub-list can't
>>> exceed 1024 blocks.
>>>> BTW, I think the 16bits audio could be a confusion because it is
>> not
>>>> about the
>>> number of bits, but rather the constraint of the block length.
>> Base on
>>> my understanding, Audio needs a period larger than 10ms,
>> regardless of the bit depth.
>>>> The questions here are:
>>>> 1. Should DMAEngine expose a new API to constraint the
>> block_length
>>>> (instead of
>>> size in bytes)?
>>>> 2. Should DMA client re-split the linked-list before passing the
>>>> linked-list to
>>> DMAEngine.
>>>> 3. Should DMA controller driver re-split the linked-list before
>> execution.
>>> Since we have DMA slave capabilities, the consumer may ask
>> for them
>>> and prepare the SG list accordingly.
>>>
>>> Above limitation is a block size (value of 1023 is a maximum,
>> meaning
>>> 1024 maximum items in the block of given transfer width). So,
>> like
>>> DesignWare DMA
>>> (AHB) has up to 4095 (but I saw hardware with 2047) or iDMA
>> 32- and
>>> 64-bit have 131071. There is no limitation for amount of blocks
>> of
>>> given maximum length in the LLI!
>>>
>>> No hacks are needed, really.
>> [>>] Hi All, can we have the agreement that DMA clients should
>> optimize the linked-list [>>] by retrieve the DMA capabilities from
>> the DMA controller?
>> [>>] I noticed that Vinod voted #3 but Andy voted #2.
>> [>>] We need your decision to move on.
>>> --
>>> With Best Regards,
>>> Andy Shevchenko
>>>
> Hi everyone,
>
> Is there anymore comment on this RFC?
> We will be using the ASoC framework to split the linked-list, since resplitting the linked-list in DMA is a no go.
> If there isn't any more comments, we will push these patches for review and merging.

Hi,

Why is splitting the list in the DMAengine framework a no go?

The whole idea of the DMAengine framework is to hide hardware specifics. 
It offers an API with certain semantics and it is up to the driver to 
provide an implementation that implements these semantics. There does 
not necessarily have to be a 1-to-1 mapping to hardware primitives in 
such an implementation.

- Lars
Andy Shevchenko Nov. 30, 2020, 11:09 a.m. UTC | #9
On Mon, Nov 30, 2020 at 11:45:17AM +0100, Lars-Peter Clausen wrote:
> On 11/30/20 10:57 AM, Sit, Michael Wei Hong wrote:

> > Is there anymore comment on this RFC?
> > We will be using the ASoC framework to split the linked-list, since resplitting the linked-list in DMA is a no go.
> > If there isn't any more comments, we will push these patches for review and merging.

> Why is splitting the list in the DMAengine framework a no go?
> 
> The whole idea of the DMAengine framework is to hide hardware specifics. It
> offers an API with certain semantics and it is up to the driver to provide
> an implementation that implements these semantics. There does not
> necessarily have to be a 1-to-1 mapping to hardware primitives in such an
> implementation.

I would say it's not desirable.

Why should we split than resplit if we may do it in one go?
Why then we have DMA capabilities returned to the consumers.

So, I have that we need to optimize DMA SG list preparation in a way that
consumer gets SG list cooked in accordance with DMA limitations / requirements.
Lars-Peter Clausen Dec. 1, 2020, 8:22 a.m. UTC | #10
On 11/30/20 12:09 PM, Shevchenko, Andriy wrote:
> On Mon, Nov 30, 2020 at 11:45:17AM +0100, Lars-Peter Clausen wrote:
>> On 11/30/20 10:57 AM, Sit, Michael Wei Hong wrote:
>>> Is there anymore comment on this RFC?
>>> We will be using the ASoC framework to split the linked-list, since resplitting the linked-list in DMA is a no go.
>>> If there isn't any more comments, we will push these patches for review and merging.
>> Why is splitting the list in the DMAengine framework a no go?
>>
>> The whole idea of the DMAengine framework is to hide hardware specifics. It
>> offers an API with certain semantics and it is up to the driver to provide
>> an implementation that implements these semantics. There does not
>> necessarily have to be a 1-to-1 mapping to hardware primitives in such an
>> implementation.
> I would say it's not desirable.
>
> Why should we split than resplit if we may do it in one go?
> Why then we have DMA capabilities returned to the consumers.
>
> So, I have that we need to optimize DMA SG list preparation in a way that
> consumer gets SG list cooked in accordance with DMA limitations / requirements.

The API that the audio drivers use request a period DMA transfer for 
length N split into M segments with the callback being called after each 
segment.

How that is implemented internally in the DMA does not matter as long as 
it matches those semantics. E.g. if the segment is longer than what the 
DMA can handle it can split it into two segments internally and call the 
callback every second segment.

The way this API works there isn't even really a possibility for the 
client side to split periodic transfers.

Btw. 1024 beats per segment seems excessively small, I don't understand 
how somebody would design such an DMA. Was the assumption that the DMA 
will never transfer more than PAGE_SIZE of contiguous memory? Or do we 
not understand the documentation correctly?

- Lars
Sia, Jee Heng Dec. 1, 2020, 9:10 a.m. UTC | #11
> -----Original Message-----

> From: Lars-Peter Clausen <lars@metafoo.de>

> Sent: 01 December 2020 4:22 PM

> To: Shevchenko, Andriy <andriy.shevchenko@intel.com>

> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>; Sia, Jee

> Heng <jee.heng.sia@intel.com>; pierre-louis.bossart@linux.intel.com;

> Rojewski, Cezary <cezary.rojewski@intel.com>;

> liam.r.girdwood@linux.intel.com; vkoul@kernel.org; tiwai@suse.com;

> broonie@kernel.org; alsa-devel@alsa-project.org

> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add

> custom prepare and submit function

> 

> On 11/30/20 12:09 PM, Shevchenko, Andriy wrote:

> > On Mon, Nov 30, 2020 at 11:45:17AM +0100, Lars-Peter Clausen

> wrote:

> >> On 11/30/20 10:57 AM, Sit, Michael Wei Hong wrote:

> >>> Is there anymore comment on this RFC?

> >>> We will be using the ASoC framework to split the linked-list, since

> resplitting the linked-list in DMA is a no go.

> >>> If there isn't any more comments, we will push these patches for

> review and merging.

> >> Why is splitting the list in the DMAengine framework a no go?

> >>

> >> The whole idea of the DMAengine framework is to hide hardware

> >> specifics. It offers an API with certain semantics and it is up to

> >> the driver to provide an implementation that implements these

> >> semantics. There does not necessarily have to be a 1-to-1 mapping

> to

> >> hardware primitives in such an implementation.

> > I would say it's not desirable.

> >

> > Why should we split than resplit if we may do it in one go?

> > Why then we have DMA capabilities returned to the consumers.

> >

> > So, I have that we need to optimize DMA SG list preparation in a way

> > that consumer gets SG list cooked in accordance with DMA

> limitations / requirements.

> 

> The API that the audio drivers use request a period DMA transfer for

> length N split into M segments with the callback being called after

> each segment.

> 

> How that is implemented internally in the DMA does not matter as

> long as it matches those semantics. E.g. if the segment is longer than

> what the DMA can handle it can split it into two segments internally

> and call the callback every second segment.

> 

> The way this API works there isn't even really a possibility for the client

> side to split periodic transfers.

> 

> Btw. 1024 beats per segment seems excessively small, I don't

> understand how somebody would design such an DMA. Was the

> assumption that the DMA will never transfer more than PAGE_SIZE of

> contiguous memory? Or do we not understand the documentation

> correctly?

[>>] The segment size is 1024 items. The item size could be 8bits,
16bits or 32bits. So, set_max_seg_size() is set to 1024*4bytes.
 

> 

> - Lars
Michael Sit Wei Hong Dec. 5, 2020, 12:55 a.m. UTC | #12
> > -----Original Message-----

> > From: Lars-Peter Clausen <lars@metafoo.de>

> > Sent: 01 December 2020 4:22 PM

> > To: Shevchenko, Andriy <andriy.shevchenko@intel.com>

> > Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;

> Sia, Jee

> > Heng <jee.heng.sia@intel.com>; pierre-

> louis.bossart@linux.intel.com;

> > Rojewski, Cezary <cezary.rojewski@intel.com>;

> > liam.r.girdwood@linux.intel.com; vkoul@kernel.org;

> tiwai@suse.com;

> > broonie@kernel.org; alsa-devel@alsa-project.org

> > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-

> pcm: Add

> > custom prepare and submit function

> >

...
> > > Why should we split than resplit if we may do it in one go?

> > > Why then we have DMA capabilities returned to the

> consumers.

> > >

> > > So, I have that we need to optimize DMA SG list preparation

> in a way

> > > that consumer gets SG list cooked in accordance with DMA

> > limitations / requirements.

> >

> > The API that the audio drivers use request a period DMA

> transfer for

> > length N split into M segments with the callback being called

> after

> > each segment.

> >

> > How that is implemented internally in the DMA does not matter

> as long

> > as it matches those semantics. E.g. if the segment is longer than

> what

> > the DMA can handle it can split it into two segments internally

> and

> > call the callback every second segment.

> >

> > The way this API works there isn't even really a possibility for

> the

> > client side to split periodic transfers.

> >

> > Btw. 1024 beats per segment seems excessively small, I don't

> > understand how somebody would design such an DMA. Was

> the assumption

> > that the DMA will never transfer more than PAGE_SIZE of

> contiguous

> > memory? Or do we not understand the documentation

> correctly?

> [>>] The segment size is 1024 items. The item size could be 8bits,

> 16bits or 32bits. So, set_max_seg_size() is set to 1024*4bytes.

> 

> 

> >

> > - Lars


Hi All,
 
In order to fulfil Andy request on optimizing the linked-list at the DMA client side, we proposed a few changes to the soc-generic- dmaengine and DMAENGINE API due to the AxiDMA's nature operation in number of items.
 
Add New DMAENGINE API:
// For DMA driver to set the max items in a segment 1. dma_set_max_seg_items(struct device *dev, unsigned int size)

// For DMA client to retrieve the max items supported in a segment 2. dma_get_max_seg_items(struct device *dev)

#1 and #2 above are the critical API needed to be exposed to the DMA Clients so that DMA Clients can use it to calculate the appropriate segment length before pass it to the DMA driver.
If #1 and #2 are No Go, then splitting linked-list in DMA client can't be achieve.

ASoC changes:
1. Adding variable to snd_pcm_hardware to store DMA limitation information.
2. soc-generic-dmaengine-pcm to register DMA limitation exposed by DMA driver using API provided above.
3. dmaengine_pcm_prepare_and_submit in pcm_dmaengine.c to check the number of items needed calculated from userspace and configure the dma accordingly if the number of items exceeds the DMA items limitation.
4. dmaengine_pcm_dma_complete in pcm_dmaengine.c to check the number of items needed calculated from userspace and update position according to DMA limitation, to trigger period_elapse appropriately.

All of the above are needed in order to facilitate storing of the DMA limitation info and using the info to configure the DMA appropriately within the DMA limits.
#3 and #4 implements a check against the number of items needed based on userspace provided information and the DMA item limit, if the limit is exceeded, the maximum limit of the DMA is used to configure the DMA transfers.
e.g.
bytes_to_samples returns a value higher than the maximum item limit of the DMA, the driver will pass the maximum usable limit of the DMA using samples_to_bytes to the DMA driver. This will make the DMA driver to use longer linked-list and would not need to do the resplitting at the DMA driver.

Below is the snapshot of the code showing how soc-generic- dmaengine make use of the new API to calculate the segment length.
---------------------------------------------------------------------------------------------------------------------------------------------
Code change in pcm.h

struct snd_pcm_hardware {
	......

	size_t buffer_bytes_max;	/* max buffer size */
	size_t period_bytes_min;	/* min period size */
	size_t period_bytes_max;	/* max period size */
	unsigned int periods_min;	/* min # of periods */
	unsigned int periods_max;	/* max # of periods */
	size_t fifo_size;		/* fifo size in bytes */

--> Add variable for dma max item numbers
	e.g: unsigned int dma_item_max;	/* max # of dma items */

	......
};


---------------------------------------------------------------------------------------------------------------------------------------------
Code change in soc-generic-dmaengine-pcm.c

static int
dmaengine_pcm_set_runtime_hwparams(struct snd_soc_component *component,
				   struct snd_pcm_substream *substream) {
	......

	memset(&hw, 0, sizeof(hw));
	hw.info = SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID |
			SNDRV_PCM_INFO_INTERLEAVED;
	hw.periods_min = 2;
	hw.periods_max = UINT_MAX;
	hw.period_bytes_min = 256;
	hw.period_bytes_max = dma_get_max_seg_size(dma_dev);
	hw.buffer_bytes_max = SIZE_MAX;
	hw.fifo_size = dma_data->fifo_size;

--> Add code to register MAX_DMA_Items limitation using API exposed by 
--> dma
	e.g: hw.dma_item_max = dma_get_max_item_number(dma_dev);

	......
}

---------------------------------------------------------------------------------------------------------------------------------------------
Code change in pcm_dmaengine.c

static void dmaengine_pcm_dma_complete(void *arg) {
	......

	struct snd_pcm_runtime *runtime = substream->runtime;
	int blocks;

-->Add code to convert period bytes to number of DMA items passed down 
-->from user space
	e.g : blocks = bytes_to_samples(runtime, snd_pcm_lib_period_bytes(substream));
--> Add code to check number of DMA items from user space vs DMA 
--> limitation and update position accordingly
	e.g:

	if (blocks > hw.dma_item_max)
		prtd->pos += samples_to_bytes(runtime, MAX_DMA_BLOCKS);
	else
		prtd->pos += snd_pcm_lib_period_bytes(substream);

	......
}

static int dmaengine_pcm_prepare_and_submit(struct snd_pcm_substream *substream) {
	......

	struct snd_pcm_runtime *runtime = substream->runtime;
	int blocks;

--> Add code to convert period bytes to number of DMA items passed down 
--> from user space
	e.g: blocks = bytes_to_samples(runtime, snd_pcm_lib_period_bytes(substream));
	......

--> Add code to check if the number of blocks used exceed the DMA 
--> limitation and prepare DMA according to limitation instead of 
--> information taken from userspace
	e.g:
	if (blocks > hw.dma_item_max)
		desc = dmaengine_prep_dma_cyclic(chan,
			substream->runtime->dma_addr,
			snd_pcm_lib_buffer_bytes(substream),
			samples_to_bytes(runtime, MAX_DMA_BLOCKS), direction, flags);
	else
		desc = dmaengine_prep_dma_cyclic(chan,
			substream->runtime->dma_addr,
			snd_pcm_lib_buffer_bytes(substream),
			snd_pcm_lib_period_bytes(substream), direction, flags);

	......
}
Lars-Peter Clausen Dec. 7, 2020, 10:05 a.m. UTC | #13
On 12/5/20 1:55 AM, Sit, Michael Wei Hong wrote:
>
>>> -----Original Message-----
>>> From: Lars-Peter Clausen <lars@metafoo.de>
>>> Sent: 01 December 2020 4:22 PM
>>> To: Shevchenko, Andriy <andriy.shevchenko@intel.com>
>>> Cc: Sit, Michael Wei Hong <michael.wei.hong.sit@intel.com>;
>> Sia, Jee
>>> Heng <jee.heng.sia@intel.com>; pierre-
>> louis.bossart@linux.intel.com;
>>> Rojewski, Cezary <cezary.rojewski@intel.com>;
>>> liam.r.girdwood@linux.intel.com; vkoul@kernel.org;
>> tiwai@suse.com;
>>> broonie@kernel.org; alsa-devel@alsa-project.org
>>> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-
>> pcm: Add
>>> custom prepare and submit function
>>>
> ...
>>>> Why should we split than resplit if we may do it in one go?
>>>> Why then we have DMA capabilities returned to the
>> consumers.
>>>> So, I have that we need to optimize DMA SG list preparation
>> in a way
>>>> that consumer gets SG list cooked in accordance with DMA
>>> limitations / requirements.
>>>
>>> The API that the audio drivers use request a period DMA
>> transfer for
>>> length N split into M segments with the callback being called
>> after
>>> each segment.
>>>
>>> How that is implemented internally in the DMA does not matter
>> as long
>>> as it matches those semantics. E.g. if the segment is longer than
>> what
>>> the DMA can handle it can split it into two segments internally
>> and
>>> call the callback every second segment.
>>>
>>> The way this API works there isn't even really a possibility for
>> the
>>> client side to split periodic transfers.
>>>
>>> Btw. 1024 beats per segment seems excessively small, I don't
>>> understand how somebody would design such an DMA. Was
>> the assumption
>>> that the DMA will never transfer more than PAGE_SIZE of
>> contiguous
>>> memory? Or do we not understand the documentation
>> correctly?
>> [>>] The segment size is 1024 items. The item size could be 8bits,
>> 16bits or 32bits. So, set_max_seg_size() is set to 1024*4bytes.
>>
>>
>>> - Lars
> Hi All,
>   
> In order to fulfil Andy request on optimizing the linked-list at the DMA client side, we proposed a few changes to the soc-generic- dmaengine and DMAENGINE API due to the AxiDMA's nature operation in number of items.
>   
> Add New DMAENGINE API:
> // For DMA driver to set the max items in a segment 1. dma_set_max_seg_items(struct device *dev, unsigned int size)
>
> // For DMA client to retrieve the max items supported in a segment 2. dma_get_max_seg_items(struct device *dev)
>
> #1 and #2 above are the critical API needed to be exposed to the DMA Clients so that DMA Clients can use it to calculate the appropriate segment length before pass it to the DMA driver.
> If #1 and #2 are No Go, then splitting linked-list in DMA client can't be achieve.
>
> ASoC changes:
> 1. Adding variable to snd_pcm_hardware to store DMA limitation information.
> 2. soc-generic-dmaengine-pcm to register DMA limitation exposed by DMA driver using API provided above.
> 3. dmaengine_pcm_prepare_and_submit in pcm_dmaengine.c to check the number of items needed calculated from userspace and configure the dma accordingly if the number of items exceeds the DMA items limitation.
> 4. dmaengine_pcm_dma_complete in pcm_dmaengine.c to check the number of items needed calculated from userspace and update position according to DMA limitation, to trigger period_elapse appropriately.
>
> All of the above are needed in order to facilitate storing of the DMA limitation info and using the info to configure the DMA appropriately within the DMA limits.
> #3 and #4 implements a check against the number of items needed based on userspace provided information and the DMA item limit, if the limit is exceeded, the maximum limit of the DMA is used to configure the DMA transfers.
> e.g.
> bytes_to_samples returns a value higher than the maximum item limit of the DMA, the driver will pass the maximum usable limit of the DMA using samples_to_bytes to the DMA driver. This will make the DMA driver to use longer linked-list and would not need to do the resplitting at the DMA driver.
>
> Below is the snapshot of the code showing how soc-generic- dmaengine make use of the new API to calculate the segment length.
> ---------------------------------------------------------------------------------------------------------------------------------------------
> Code change in pcm.h
>
> struct snd_pcm_hardware {
> 	......
>
> 	size_t buffer_bytes_max;	/* max buffer size */
> 	size_t period_bytes_min;	/* min period size */
> 	size_t period_bytes_max;	/* max period size */
> 	unsigned int periods_min;	/* min # of periods */
> 	unsigned int periods_max;	/* max # of periods */
> 	size_t fifo_size;		/* fifo size in bytes */
>
> --> Add variable for dma max item numbers
> 	e.g: unsigned int dma_item_max;	/* max # of dma items */
>
> 	......
> };
>
>
> ---------------------------------------------------------------------------------------------------------------------------------------------
> Code change in soc-generic-dmaengine-pcm.c
>
> static int
> dmaengine_pcm_set_runtime_hwparams(struct snd_soc_component *component,
> 				   struct snd_pcm_substream *substream) {
> 	......
>
> 	memset(&hw, 0, sizeof(hw));
> 	hw.info = SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID |
> 			SNDRV_PCM_INFO_INTERLEAVED;
> 	hw.periods_min = 2;
> 	hw.periods_max = UINT_MAX;
> 	hw.period_bytes_min = 256;
> 	hw.period_bytes_max = dma_get_max_seg_size(dma_dev);
> 	hw.buffer_bytes_max = SIZE_MAX;
> 	hw.fifo_size = dma_data->fifo_size;
>
> --> Add code to register MAX_DMA_Items limitation using API exposed by
> --> dma
> 	e.g: hw.dma_item_max = dma_get_max_item_number(dma_dev);
>
> 	......
> }
>
> ---------------------------------------------------------------------------------------------------------------------------------------------
> Code change in pcm_dmaengine.c
>
> static void dmaengine_pcm_dma_complete(void *arg) {
> 	......
>
> 	struct snd_pcm_runtime *runtime = substream->runtime;
> 	int blocks;
>
> -->Add code to convert period bytes to number of DMA items passed down
> -->from user space
> 	e.g : blocks = bytes_to_samples(runtime, snd_pcm_lib_period_bytes(substream));
> --> Add code to check number of DMA items from user space vs DMA
> --> limitation and update position accordingly
> 	e.g:
>
> 	if (blocks > hw.dma_item_max)
> 		prtd->pos += samples_to_bytes(runtime, MAX_DMA_BLOCKS);
> 	else
> 		prtd->pos += snd_pcm_lib_period_bytes(substream);
>
> 	......
> }
>
> static int dmaengine_pcm_prepare_and_submit(struct snd_pcm_substream *substream) {
> 	......
>
> 	struct snd_pcm_runtime *runtime = substream->runtime;
> 	int blocks;
>
> --> Add code to convert period bytes to number of DMA items passed down
> --> from user space
> 	e.g: blocks = bytes_to_samples(runtime, snd_pcm_lib_period_bytes(substream));
> 	......
>
> --> Add code to check if the number of blocks used exceed the DMA
> --> limitation and prepare DMA according to limitation instead of
> --> information taken from userspace
> 	e.g:
> 	if (blocks > hw.dma_item_max)
> 		desc = dmaengine_prep_dma_cyclic(chan,
> 			substream->runtime->dma_addr,
> 			snd_pcm_lib_buffer_bytes(substream),
> 			samples_to_bytes(runtime, MAX_DMA_BLOCKS), direction, flags);
> 	else
> 		desc = dmaengine_prep_dma_cyclic(chan,
> 			substream->runtime->dma_addr,
> 			snd_pcm_lib_buffer_bytes(substream),
> 			snd_pcm_lib_period_bytes(substream), direction, flags);
>
> 	......
> }
>
Hi,

I don't think this approach will work. If you setup a DMA transfer with 
a different configuration that what was requested your audio will glitch.

If you really want to limit your period size you need to install a range 
constraint on the SNDRV_PCM_HW_PARAM_PERIOD_SIZE parameter.

But I'd highly recommend against it and just split the transfer into 
multiple segments in the DMA driver. Needlessly limiting the period size 
will increase the number of interrupts during audio playback/recording 
and hurt the power efficiency of your system.

- Lars
Pierre-Louis Bossart Dec. 7, 2020, 3:36 p.m. UTC | #14
> If you really want to limit your period size you need to install a range 
> constraint on the SNDRV_PCM_HW_PARAM_PERIOD_SIZE parameter.
> 
> But I'd highly recommend against it and just split the transfer into 
> multiple segments in the DMA driver. Needlessly limiting the period size 
> will increase the number of interrupts during audio playback/recording 
> and hurt the power efficiency of your system.

Yes that was also an objection from me, the fix should be in the DMA 
level. The 1024 block limitation would mean restricting the period size 
to be at most 5.3 or 10.6ms (16 and 32-bit cases). That's way to small.
Sia, Jee Heng Dec. 8, 2020, 3:21 a.m. UTC | #15
> -----Original Message-----

> From: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>

> Sent: 07 December 2020 11:36 PM

> To: Lars-Peter Clausen <lars@metafoo.de>; Sit, Michael Wei Hong

> <michael.wei.hong.sit@intel.com>; Sia, Jee Heng

> <jee.heng.sia@intel.com>; Shevchenko, Andriy

> <andriy.shevchenko@intel.com>

> Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-devel@alsa-

> project.org; tiwai@suse.com; liam.r.girdwood@linux.intel.com;

> vkoul@kernel.org; broonie@kernel.org

> Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm: Add

> custom prepare and submit function

> 

> 

> 

> 

> > If you really want to limit your period size you need to install a

> > range constraint on the SNDRV_PCM_HW_PARAM_PERIOD_SIZE

> parameter.

> >

> > But I'd highly recommend against it and just split the transfer into

> > multiple segments in the DMA driver. Needlessly limiting the period

> > size will increase the number of interrupts during audio

> > playback/recording and hurt the power efficiency of your system.

> 

> Yes that was also an objection from me, the fix should be in the DMA

> level. The 1024 block limitation would mean restricting the period size

> to be at most 5.3 or 10.6ms (16 and 32-bit cases). That's way to small.

[>>] Seems like complexity increases if splitting the segments in ASoC. This is not a framework issue nor architecture issue. 
If introducing new API to DMAENGINE to constraint the number of items is not recommended, then lets split the segments in DMA driver.
Michael Sit Wei Hong Dec. 10, 2020, 8:24 a.m. UTC | #16
> -----Original Message-----

> From: Sia, Jee Heng <jee.heng.sia@intel.com>

> Sent: Tuesday, 8 December, 2020 11:21 AM

> To: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>;

> Lars-Peter Clausen <lars@metafoo.de>; Sit, Michael Wei Hong

> <michael.wei.hong.sit@intel.com>; Shevchenko, Andriy

> <andriy.shevchenko@intel.com>

> Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-

> devel@alsa-project.org; tiwai@suse.com;

> liam.r.girdwood@linux.intel.com; vkoul@kernel.org;

> broonie@kernel.org

> Subject: RE: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm:

> Add custom prepare and submit function

> 

> 

> 

> > -----Original Message-----

> > From: Pierre-Louis Bossart <pierre-

> louis.bossart@linux.intel.com>

> > Sent: 07 December 2020 11:36 PM

> > To: Lars-Peter Clausen <lars@metafoo.de>; Sit, Michael Wei

> Hong

> > <michael.wei.hong.sit@intel.com>; Sia, Jee Heng

> > <jee.heng.sia@intel.com>; Shevchenko, Andriy

> > <andriy.shevchenko@intel.com>

> > Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-

> devel@alsa-

> > project.org; tiwai@suse.com; liam.r.girdwood@linux.intel.com;

> > vkoul@kernel.org; broonie@kernel.org

> > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-

> pcm: Add

> > custom prepare and submit function

> >

> >

> >

> >

> > > If you really want to limit your period size you need to install a

> > > range constraint on the

> SNDRV_PCM_HW_PARAM_PERIOD_SIZE

> > parameter.

> > >

> > > But I'd highly recommend against it and just split the transfer

> into

> > > multiple segments in the DMA driver. Needlessly limiting the

> period

> > > size will increase the number of interrupts during audio

> > > playback/recording and hurt the power efficiency of your

> system.

> >

> > Yes that was also an objection from me, the fix should be in the

> DMA

> > level. The 1024 block limitation would mean restricting the

> period

> > size to be at most 5.3 or 10.6ms (16 and 32-bit cases). That's way

> to small.

> [>>] Seems like complexity increases if splitting the segments in

> ASoC. This is not a framework issue nor architecture issue.

> If introducing new API to DMAENGINE to constraint the number

> of items is not recommended, then lets split the segments in

> DMA driver.


With the increased complexity of introducing new APIs can we move the segment splitting to the DMA driver?
Anymore concerns with doing so?
Michael Sit Wei Hong Dec. 14, 2020, 3:23 a.m. UTC | #17
> -----Original Message-----

> From: Sit, Michael Wei Hong

> Sent: Thursday, 10 December, 2020 4:24 PM

> To: Sia, Jee Heng <jee.heng.sia@intel.com>; Pierre-Louis Bossart

> <pierre-louis.bossart@linux.intel.com>; Lars-Peter Clausen

> <lars@metafoo.de>; Shevchenko, Andriy

> <andriy.shevchenko@intel.com>

> Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-

> devel@alsa-project.org; tiwai@suse.com;

> liam.r.girdwood@linux.intel.com; vkoul@kernel.org;

> broonie@kernel.org

> Subject: RE: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm:

> Add custom prepare and submit function

> 

> 

> 

> > -----Original Message-----

> > From: Sia, Jee Heng <jee.heng.sia@intel.com>

> > Sent: Tuesday, 8 December, 2020 11:21 AM

> > To: Pierre-Louis Bossart <pierre-louis.bossart@linux.intel.com>;

> > Lars-Peter Clausen <lars@metafoo.de>; Sit, Michael Wei Hong

> > <michael.wei.hong.sit@intel.com>; Shevchenko, Andriy

> > <andriy.shevchenko@intel.com>

> > Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-

> > devel@alsa-project.org; tiwai@suse.com;

> > liam.r.girdwood@linux.intel.com; vkoul@kernel.org;

> broonie@kernel.org

> > Subject: RE: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-pcm:

> > Add custom prepare and submit function

> >

> >

> >

> > > -----Original Message-----

> > > From: Pierre-Louis Bossart <pierre-

> > louis.bossart@linux.intel.com>

> > > Sent: 07 December 2020 11:36 PM

> > > To: Lars-Peter Clausen <lars@metafoo.de>; Sit, Michael Wei

> > Hong

> > > <michael.wei.hong.sit@intel.com>; Sia, Jee Heng

> > > <jee.heng.sia@intel.com>; Shevchenko, Andriy

> > > <andriy.shevchenko@intel.com>

> > > Cc: Rojewski, Cezary <cezary.rojewski@intel.com>; alsa-

> > devel@alsa-

> > > project.org; tiwai@suse.com;

> liam.r.girdwood@linux.intel.com;

> > > vkoul@kernel.org; broonie@kernel.org

> > > Subject: Re: [RFC PATCH 2/4] ASoC: soc-generic-dmaengine-

> > pcm: Add

> > > custom prepare and submit function

> > >

> > >

> > >

> > >

> > > > If you really want to limit your period size you need to install

> a

> > > > range constraint on the

> > SNDRV_PCM_HW_PARAM_PERIOD_SIZE

> > > parameter.

> > > >

> > > > But I'd highly recommend against it and just split the

> transfer

> > into

> > > > multiple segments in the DMA driver. Needlessly limiting the

> > period

> > > > size will increase the number of interrupts during audio

> > > > playback/recording and hurt the power efficiency of your

> > system.

> > >

> > > Yes that was also an objection from me, the fix should be in

> the

> > DMA

> > > level. The 1024 block limitation would mean restricting the

> > period

> > > size to be at most 5.3 or 10.6ms (16 and 32-bit cases). That's

> way

> > to small.

> > [>>] Seems like complexity increases if splitting the segments in

> > ASoC. This is not a framework issue nor architecture issue.

> > If introducing new API to DMAENGINE to constraint the number

> of items

> > is not recommended, then lets split the segments in DMA driver.

> 

> With the increased complexity of introducing new APIs can we

> move the segment splitting to the DMA driver?

> Anymore concerns with doing so?


If there are no more comments on this, I will start cleaning up the code to remove the DMA splitting in the ASoC layer, and push the code for review soon.
The DMA segment splitting will be done in the DMA driver instead.
diff mbox series

Patch

diff --git a/include/sound/dmaengine_pcm.h b/include/sound/dmaengine_pcm.h
index 8c5e38180fb0..9fae56d39ae2 100644
--- a/include/sound/dmaengine_pcm.h
+++ b/include/sound/dmaengine_pcm.h
@@ -28,6 +28,9 @@  snd_pcm_substream_to_dma_direction(const struct snd_pcm_substream *substream)
 int snd_hwparams_to_dma_slave_config(const struct snd_pcm_substream *substream,
 	const struct snd_pcm_hw_params *params, struct dma_slave_config *slave_config);
 int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd);
+int snd_dmaengine_pcm_custom_trigger(struct snd_pcm_substream *substream, int cmd,
+	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream));
+
 snd_pcm_uframes_t snd_dmaengine_pcm_pointer(struct snd_pcm_substream *substream);
 snd_pcm_uframes_t snd_dmaengine_pcm_pointer_no_residue(struct snd_pcm_substream *substream);
 
@@ -113,6 +116,8 @@  int snd_dmaengine_pcm_refine_runtime_hwparams(
  *   which do not use devicetree.
  * @process: Callback used to apply processing on samples transferred from/to
  *   user space.
+ * @custom_pcm_prepare_and_submit: Callback used to work-around DMA limitations
+ *   related to link lists.
  * @compat_filter_fn: Will be used as the filter function when requesting a
  *  channel for platforms which do not use devicetree. The filter parameter
  *  will be the DAI's DMA data.
@@ -138,6 +143,7 @@  struct snd_dmaengine_pcm_config {
 	int (*process)(struct snd_pcm_substream *substream,
 		       int channel, unsigned long hwoff,
 		       void *buf, unsigned long bytes);
+	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream);
 	dma_filter_fn compat_filter_fn;
 	struct device *dma_dev;
 	const char *chan_names[SNDRV_PCM_STREAM_LAST + 1];
diff --git a/sound/core/pcm_dmaengine.c b/sound/core/pcm_dmaengine.c
index 4d059ff2b2e4..cbd1429de509 100644
--- a/sound/core/pcm_dmaengine.c
+++ b/sound/core/pcm_dmaengine.c
@@ -170,16 +170,20 @@  static int dmaengine_pcm_prepare_and_submit(struct snd_pcm_substream *substream)
 }
 
 /**
- * snd_dmaengine_pcm_trigger - dmaengine based PCM trigger implementation
+ * snd_dmaengine_pcm_custom_trigger - customized PCM trigger implementation to
+ *  work-around DMA limitations related to link lists.
  * @substream: PCM substream
  * @cmd: Trigger command
+ * @custom_pcm_prepare_and_submit: custom function to deal with DMA limitations
  *
  * Returns 0 on success, a negative error code otherwise.
  *
- * This function can be used as the PCM trigger callback for dmaengine based PCM
- * driver implementations.
+ * This function can be used as the PCM trigger callback for customized dmaengine
+ * based PCM driver implementations.
  */
-int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
+
+int snd_dmaengine_pcm_custom_trigger(struct snd_pcm_substream *substream, int cmd,
+	int (*custom_pcm_prepare_and_submit)(struct snd_pcm_substream *substream))
 {
 	struct dmaengine_pcm_runtime_data *prtd = substream_to_prtd(substream);
 	struct snd_pcm_runtime *runtime = substream->runtime;
@@ -187,7 +191,7 @@  int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
 
 	switch (cmd) {
 	case SNDRV_PCM_TRIGGER_START:
-		ret = dmaengine_pcm_prepare_and_submit(substream);
+		ret = custom_pcm_prepare_and_submit(substream);
 		if (ret)
 			return ret;
 		dma_async_issue_pending(prtd->dma_chan);
@@ -214,6 +218,22 @@  int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_custom_trigger);
+
+/**
+ * snd_dmaengine_pcm_trigger - dmaengine based PCM trigger implementation
+ * @substream: PCM substream
+ * @cmd: Trigger command
+ *
+ * Returns 0 on success, a negative error code otherwise.
+ *
+ * This function can be used as the PCM trigger callback for dmaengine based PCM
+ * driver implementations.
+ */
+int snd_dmaengine_pcm_trigger(struct snd_pcm_substream *substream, int cmd)
+{
+	return snd_dmaengine_pcm_custom_trigger(substream, cmd, dmaengine_pcm_prepare_and_submit);
+}
 EXPORT_SYMBOL_GPL(snd_dmaengine_pcm_trigger);
 
 /**
diff --git a/sound/soc/soc-generic-dmaengine-pcm.c b/sound/soc/soc-generic-dmaengine-pcm.c
index 9ef80a48707e..88fca6402a36 100644
--- a/sound/soc/soc-generic-dmaengine-pcm.c
+++ b/sound/soc/soc-generic-dmaengine-pcm.c
@@ -173,7 +173,13 @@  static int dmaengine_pcm_close(struct snd_soc_component *component,
 static int dmaengine_pcm_trigger(struct snd_soc_component *component,
 				 struct snd_pcm_substream *substream, int cmd)
 {
-	return snd_dmaengine_pcm_trigger(substream, cmd);
+	struct dmaengine_pcm *pcm = soc_component_to_pcm(component);
+
+	if (pcm->config && pcm->config->custom_pcm_prepare_and_submit)
+		return snd_dmaengine_pcm_custom_trigger(substream, cmd,
+							pcm->config->custom_pcm_prepare_and_submit);
+	else
+		return snd_dmaengine_pcm_trigger(substream, cmd);
 }
 
 static struct dma_chan *dmaengine_pcm_compat_request_channel(