diff mbox series

[v3] mmc: rtsx: improve performance for multi block rw

Message ID 8e61aed5f64e434abc1d7b6f81859c8a@realtek.com
State New
Headers show
Series [v3] mmc: rtsx: improve performance for multi block rw | expand

Commit Message

Ricky WU Dec. 21, 2021, 12:24 p.m. UTC
Improve performance when the command is a multi-block read/write
(CMD 18/25) and the data is sequential.
sd_check_multi_seq() distinguishes multi-block RW (CMD 18/25) from
normal RW (CMD 17/24); if the command is multi-block and the data is
sequential to the previous transfer, sd_rw_multi_seq() is called.

This patch mainly controls when CMD 12/13 are issued.
Originally, the driver issued CMD 12/13 for every multi-block RW
(CMD 18/25). The new code checks whether a multi-block RW (CMD 18/25)
is sequential to the previous transfer; if so, the driver does not
send CMD 12 and bypasses CMD 13 until an RW command in the opposite
direction arrives or the delayed work fires and sends CMD 12.
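
For illustration, below is a minimal sketch of the sequential-detection
idea described above. The types and names (enum dir, struct rw_state,
is_sequential_multi()) are simplified stand-ins, not the driver's real
structures; the actual check is sd_check_multi_seq() in the diff below.

#include <stdbool.h>
#include <stdint.h>

enum dir { DIR_READ, DIR_WRITE };

struct rw_state {
	enum dir prev_dir;	/* direction of the previous transfer */
	uint64_t prev_addr;	/* first block of the previous transfer */
	uint32_t prev_cnt;	/* number of blocks previously transferred */
};

/*
 * A request continues the previous one only when it is a multi-block
 * command, goes in the same direction, and starts at the block right
 * after the previous transfer ended. Only then may CMD 12/13 be
 * deferred; otherwise the pending transfer is closed with CMD 12 first.
 */
static bool is_sequential_multi(const struct rw_state *st, bool multi_block,
				enum dir cur_dir, uint64_t blk_addr)
{
	if (!multi_block)		/* CMD 17/24: always the normal path */
		return false;
	if (st->prev_dir != cur_dir)	/* direction changed: stop first */
		return false;
	/* contiguous with the previous transfer? */
	return st->prev_addr + st->prev_cnt == blk_addr;
}

If no further sequential request arrives, the patch relies on a delayed
work item (scheduled about 150 ms after the last transfer, per the diff)
to finally send CMD 12 and return to the normal path.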

Benchmark results are below:
SD Card : Samsung Pro Plus 128GB
Number of Samples: 100, Sample Size: 10MB
<Before> Read : 86.9 MB/s, Write : 38.3 MB/s
<After>  Read : 91.5 MB/s, Write : 55.5 MB/s

Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
---
v2:
clarified the commit message
renamed functions for clarity
v3:
expanded the commit message and added benchmark results
---
 drivers/mmc/host/rtsx_pci_sdmmc.c | 185 +++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 5 deletions(-)

Comments

Ricky WU Dec. 23, 2021, 10:26 a.m. UTC | #1
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 21, 2021 8:51 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > Improving performance for the CMD is multi-block read/write and the
> > data is sequential.
> > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > sequential then call to sd_rw_multi_seq()
> >
> > This patch mainly to control the timing of reply at CMD 12/13.
> > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > The new code to distinguish multi-block RW(CMD 18/25) and the data is
> > sequential or not, if the data is sequential RW driver do not send CMD
> > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > trigger the delay_work to sent CMD 12.
> >
> > run benchmark result as below:
> > SD Card : Samsumg Pro Plus 128GB
> > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> 
> A much nicer commit message, thanks a lot! Would you mind running some
> additional tests, like random I/O read/writes?
> 
> Also, please specify the benchmark tool and command you are using. In the
> meantime, I will continue to look at the code.
> 

The tool was just the built-in GUI benchmark in Ubuntu's "Disks" utility,
and it does not offer a random I/O option...

Do you have any suggestions for testing random I/O?
We think random I/O will not change much, though.

BR,
Ricky

> Kind regards
> Uffe
> 
> >
> > Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
> > ---
> > v2:
> > make commit message more clarity
> > change function name for more clarity
> > v3:
> > add more commit message and benchmark result
> > ---
> >  drivers/mmc/host/rtsx_pci_sdmmc.c | 185
> > +++++++++++++++++++++++++++++-
> >  1 file changed, 180 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c
> > b/drivers/mmc/host/rtsx_pci_sdmmc.c
> > index 58cfaffa3c2d..ee2b0eec6422 100644
> > --- a/drivers/mmc/host/rtsx_pci_sdmmc.c
> > +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
> > @@ -22,6 +22,8 @@
> >  #include <asm/unaligned.h>
> >  #include <linux/pm_runtime.h>
> >
> > +enum RW_MODE   {NORMAL_RW, SEQ_RW};
> > +
> >  struct realtek_pci_sdmmc {
> >         struct platform_device  *pdev;
> >         struct rtsx_pcr         *pcr;
> > @@ -31,6 +33,7 @@ struct realtek_pci_sdmmc {
> >
> >         struct work_struct      work;
> >         struct mutex            host_mutex;
> > +       struct delayed_work             rw_idle_work;
> >
> >         u8                      ssc_depth;
> >         unsigned int            clock;
> > @@ -46,6 +49,12 @@ struct realtek_pci_sdmmc {
> >         s32                     cookie;
> >         int                     cookie_sg_count;
> >         bool                    using_cookie;
> > +
> > +       enum RW_MODE            rw_mode;
> > +       u8              prev_dir;
> > +       u8              cur_dir;
> > +       u64             prev_sec_addr;
> > +       u32             prev_sec_cnt;
> >  };
> >
> >  static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios
> > *ios); @@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct
> realtek_pci_sdmmc *host,
> >         dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg =
> 0x%08x\n",
> >                         __func__, cmd_idx, arg);
> >
> > +       if (cmd_idx == MMC_SEND_STATUS && host->rw_mode ==
> SEQ_RW) {
> > +               cmd->resp[0] = R1_READY_FOR_DATA |
> (R1_STATE_TRAN << 9);
> > +               goto out;
> > +       }
> > +
> > +       if (!mmc_op_multi(cmd->opcode))
> > +               host->rw_mode = NORMAL_RW;
> > +
> >         rsp_type = sd_response_type(cmd);
> >         if (rsp_type < 0)
> >                 goto out;
> > @@ -542,6 +559,93 @@ static int sd_write_long_data(struct
> realtek_pci_sdmmc *host,
> >         return 0;
> >  }
> >
> > +static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct
> > +mmc_request *mrq) {
> > +       struct rtsx_pcr *pcr = host->pcr;
> > +       struct mmc_host *mmc = host->mmc;
> > +       struct mmc_card *card = mmc->card;
> > +       struct mmc_data *data = mrq->data;
> > +       int uhs = mmc_card_uhs(card);
> > +       u8 cfg2;
> > +       int err;
> > +       size_t data_len = data->blksz * data->blocks;
> > +
> > +       cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
> > +               SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 |
> SD_RSP_LEN_0;
> > +
> > +       if (!uhs)
> > +               cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
> > +
> > +       rtsx_pci_init_cmd(pcr);
> > +       sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
> > +                       DMA_DONE_INT, DMA_DONE_INT);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
> > +               0xFF, (u8)(data_len >> 24));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
> > +               0xFF, (u8)(data_len >> 16));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
> > +               0xFF, (u8)(data_len >> 8));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF,
> > + (u8)data_len);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> > +                       0x03 | DMA_PACK_SIZE_MASK,
> > +                       DMA_DIR_FROM_CARD | DMA_EN |
> DMA_512);
> > +       else
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> > +                       0x03 | DMA_PACK_SIZE_MASK,
> > +                       DMA_DIR_TO_CARD | DMA_EN | DMA_512);
> > +
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
> > +                       0x01, RING_BUFFER);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER,
> 0xFF,
> > +                               SD_TRANSFER_START |
> SD_TM_AUTO_READ_3);
> > +       else
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER,
> 0xFF,
> > +                               SD_TRANSFER_START |
> > + SD_TM_AUTO_WRITE_3);
> > +
> > +       rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
> > +                       SD_TRANSFER_END, SD_TRANSFER_END);
> > +       rtsx_pci_send_cmd_no_wait(pcr);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               err = rtsx_pci_dma_transfer(pcr, data->sg,
> host->sg_count, 1, 10000);
> > +       else
> > +               err = rtsx_pci_dma_transfer(pcr, data->sg,
> > + host->sg_count, 0, 10000);
> > +
> > +       if (err < 0) {
> > +               sd_clear_error(host);
> > +               return err;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host,
> > +struct mmc_request *mrq) {
> > +       struct rtsx_pcr *pcr = host->pcr;
> > +       struct mmc_command *cmd;
> > +
> > +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> > +
> > +       cmd->opcode = MMC_STOP_TRANSMISSION;
> > +       cmd->arg = 0;
> > +       cmd->busy_timeout = 0;
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 |
> MMC_CMD_AC;
> > +       else
> > +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B |
> MMC_CMD_AC;
> > +       sd_send_cmd_get_rsp(host, cmd);
> > +       udelay(50);
> > +       rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
> > +       kfree(cmd);
> > +       return 0;
> > +}
> > +
> >  static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc
> > *host)  {
> >         rtsx_pci_write_register(host->pcr, SD_CFG1, @@ -796,6 +900,45
> > @@ static inline int sd_rw_cmd(struct mmc_command *cmd)
> >                 (cmd->opcode == MMC_WRITE_BLOCK);  }
> >
> > +static void sd_rw_idle_work(struct work_struct *work) {
> > +       struct delayed_work *dwork = to_delayed_work(work);
> > +       struct realtek_pci_sdmmc *host = container_of(dwork,
> > +                       struct realtek_pci_sdmmc, rw_idle_work);
> > +       struct mmc_command *cmd;
> > +
> > +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> > +
> > +       cmd->opcode = MMC_STOP_TRANSMISSION;
> > +       cmd->arg = 0;
> > +       cmd->busy_timeout = 0;
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 |
> MMC_CMD_AC;
> > +       else
> > +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B |
> > + MMC_CMD_AC;
> > +
> > +       sd_send_cmd_get_rsp(host, cmd);
> > +       host->rw_mode = NORMAL_RW;
> > +       kfree(cmd);
> > +}
> > +
> > +static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct
> > +mmc_request *mrq) {
> > +       struct mmc_command *cmd = mrq->cmd;
> > +       struct mmc_data *data = mrq->data;
> > +
> > +       if (!mmc_op_multi(cmd->opcode))
> > +               return 0;
> > +
> > +       if (host->prev_dir != host->cur_dir)
> > +               return 0;
> > +
> > +       if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
> > +               return 0;
> > +
> > +       return 1;
> > +}
> > +
> >  static void sd_request(struct work_struct *work)  {
> >         struct realtek_pci_sdmmc *host = container_of(work, @@ -841,12
> > +984,36 @@ static void sd_request(struct work_struct *work)
> >         if (!data_size) {
> >                 sd_send_cmd_get_rsp(host, cmd);
> >         } else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> > -               cmd->error = sd_rw_multi(host, mrq);
> > -               if (!host->using_cookie)
> > -                       sdmmc_post_req(host->mmc, host->mrq, 0);
> > +               /* Check multi-block and seq function*/
> > +               if (data->flags & MMC_DATA_READ)
> > +                       host->cur_dir = DMA_DIR_FROM_CARD;
> > +               else
> > +                       host->cur_dir = DMA_DIR_TO_CARD;
> > +
> > +               if (host->rw_mode == SEQ_RW) {
> > +                       cancel_delayed_work(&host->rw_idle_work);
> > +                       if (!sd_check_multi_seq(host, mrq)) {
> > +                               sd_stop_rw_multi_seq(host, mrq);
> > +                               host->rw_mode = NORMAL_RW;
> > +                       }
> > +               }
> > +
> > +               if (host->rw_mode == SEQ_RW)
> > +                       cmd->error = sd_rw_multi_seq(host, mrq);
> > +               else {
> > +                       if (mmc_op_multi(cmd->opcode))
> > +                               host->rw_mode = SEQ_RW;
> > +                       cmd->error = sd_rw_multi(host, mrq);
> > +                       if (!host->using_cookie)
> > +                               sdmmc_post_req(host->mmc,
> host->mrq, 0);
> > +               }
> > +
> > +               if (cmd->error)
> > +                       host->rw_mode = NORMAL_RW;
> > +
> > +               if (mmc_op_multi(cmd->opcode) && host->rw_mode ==
> SEQ_RW)
> > +                       mod_delayed_work(system_wq,
> > + &host->rw_idle_work, msecs_to_jiffies(150));
> >
> > -               if (mmc_op_multi(cmd->opcode) && mrq->stop)
> > -                       sd_send_cmd_get_rsp(host, mrq->stop);
> >         } else {
> >                 sd_normal_rw(host, mrq);
> >         }
> > @@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work)
> >         }
> >
> >         mutex_lock(&host->host_mutex);
> > +       if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> > +               host->prev_dir = host->cur_dir;
> > +               host->prev_sec_addr = data->blk_addr;
> > +               host->prev_sec_cnt = data->blocks;
> > +       }
> >         host->mrq = NULL;
> >         mutex_unlock(&host->host_mutex);
> >
> > @@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct
> platform_device *pdev)
> >         struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
> >
> >         host->cookie = -1;
> > +       host->rw_mode = NORMAL_RW;
> >         mmc_detect_change(host->mmc, 0);  }
> >
> > @@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct
> platform_device *pdev)
> >         host->cookie = -1;
> >         host->power_state = SDMMC_POWER_OFF;
> >         INIT_WORK(&host->work, sd_request);
> > +       INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
> >         platform_set_drvdata(pdev, host);
> >         pcr->slots[RTSX_SD_CARD].p_dev = pdev;
> >         pcr->slots[RTSX_SD_CARD].card_event =
> > rtsx_pci_sdmmc_card_event; @@ -1526,6 +1700,7 @@ static int
> rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
> >                 pm_runtime_disable(&pdev->dev);
> >         }
> >
> > +       cancel_delayed_work_sync(&host->rw_idle_work);
> >         cancel_work_sync(&host->work);
> >
> >         mutex_lock(&host->host_mutex);
> > --
> > 2.25.1
Ulf Hansson Dec. 23, 2021, 10:37 a.m. UTC | #2
On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Tuesday, December 21, 2021 8:51 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > > Improving performance for the CMD is multi-block read/write and the
> > > data is sequential.
> > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > sequential then call to sd_rw_multi_seq()
> > >
> > > This patch mainly to control the timing of reply at CMD 12/13.
> > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > The new code to distinguish multi-block RW(CMD 18/25) and the data is
> > > sequential or not, if the data is sequential RW driver do not send CMD
> > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > trigger the delay_work to sent CMD 12.
> > >
> > > run benchmark result as below:
> > > SD Card : Samsumg Pro Plus 128GB
> > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> >
> > A much nicer commit message, thanks a lot! Would you mind running some
> > additional tests, like random I/O read/writes?
> >
> > Also, please specify the benchmark tool and command you are using. In the
> > meantime, I will continue to look at the code.
> >
>
> The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> and the Tool don't have random I/O to choice...
>
> Do you have any suggestion for testing random I/O
> But we think random I/O will not change much

I would probably look into using fio, https://fio.readthedocs.io/en/latest/

Another option that I use frequently is iozone, https://www.iozone.org.
Here's a command line that I often use for iozone
./iozone -az -i0 -i1 -s 20m -y 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e

[...]

Kind regards
Uffe
Ricky WU Dec. 24, 2021, 7:23 a.m. UTC | #3
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Thursday, December 23, 2021 6:37 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > block rw
> > >
> > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > Improving performance for the CMD is multi-block read/write and
> > > > the data is sequential.
> > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > > sequential then call to sd_rw_multi_seq()
> > > >
> > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > The new code to distinguish multi-block RW(CMD 18/25) and the data
> > > > is sequential or not, if the data is sequential RW driver do not
> > > > send CMD
> > > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > > trigger the delay_work to sent CMD 12.
> > > >
> > > > run benchmark result as below:
> > > > SD Card : Samsumg Pro Plus 128GB
> > > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> > >
> > > A much nicer commit message, thanks a lot! Would you mind running
> > > some additional tests, like random I/O read/writes?
> > >
> > > Also, please specify the benchmark tool and command you are using.
> > > In the meantime, I will continue to look at the code.
> > >
> >
> > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > and the Tool don't have random I/O to choice...
> >
> > Do you have any suggestion for testing random I/O But we think random
> > I/O will not change much
> 
> I would probably look into using fio, https://fio.readthedocs.io/en/latest/
> 

Here is the random I/O data:
Before the patch:
CMD (Randread):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
  read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
    clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
     lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
    clat percentiles (usec):
     |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600], 20.00th=[11600],
     | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731], 60.00th=[11731],
     | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863], 95.00th=[11863],
     | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664], 99.95th=[34341],
     | 99.99th=[34341]
   bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67, stdev=1467.81, samples=24
   iops        : min=   80, max=   86, avg=85.00, stdev= 1.41, samples=24
  lat (msec)   : 20=99.90%, 50=0.10%
  cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec

Disk stats (read/write):
  mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612, util=99.23%

CMD (Randwrite):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite

mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
  write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone resets
    clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
     lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
    clat percentiles (usec):
     |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676], 20.00th=[22938],
     | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462], 60.00th=[23725],
     | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773], 95.00th=[56361],
     | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508], 99.95th=[94897],
     | 99.99th=[94897]
   bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13, stdev=3860.74, samples=53
   iops        : min=   24, max=   42, avg=38.30, stdev= 3.77, samples=53
  lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
  cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
  WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec

Disk stats (read/write):
  mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956, util=99.90%


After the patch:

CMD (Randread):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
  read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
    clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
     lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
    clat percentiles (usec):
     |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469], 20.00th=[11469],
     | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469], 60.00th=[11600],
     | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600], 95.00th=[11600],
     | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627], 99.95th=[32375],
     | 99.99th=[32375]
   bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26, stdev=1410.46, samples=23
   iops        : min=   82, max=   88, avg=86.52, stdev= 1.38, samples=23
  lat (msec)   : 20=99.80%, 50=0.20%
  cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec

Disk stats (read/write):
  mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397, util=99.21%

CMD (Randwrite):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite

mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
  write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone resets
    clat (msec): min=16, max=118, avg=25.37, stdev=16.34
     lat (msec): min=16, max=118, avg=25.44, stdev=16.34
    clat percentiles (msec):
     |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20], 20.00th=[   20],
     | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20], 60.00th=[   20],
     | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52], 95.00th=[   75],
     | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114], 99.95th=[  120],
     | 99.99th=[  120]
   bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69, stdev=10498.00, samples=52
   iops        : min=   20, max=   50, avg=39.27, stdev=10.25, samples=52
  lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
  cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
  WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec

Disk stats (read/write):
  mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144, util=99.89%

BR,
Ricky

> Another option that I use frequently is iozone, https://www.iozone.org.
> Here's a command line that I often use for iozone ./iozone -az -i0 -i1 -s 20m -y
> 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e
> 
> [...]
> 
> Kind regards
> Uffe
Ulf Hansson Dec. 28, 2021, 2:04 p.m. UTC | #4
On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Thursday, December 23, 2021 6:37 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > > > -----Original Message-----
> > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org
> > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > block rw
> > > >
> > > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > > > >
> > > > > Improving performance for the CMD is multi-block read/write and
> > > > > the data is sequential.
> > > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > > > sequential then call to sd_rw_multi_seq()
> > > > >
> > > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > > The new code to distinguish multi-block RW(CMD 18/25) and the data
> > > > > is sequential or not, if the data is sequential RW driver do not
> > > > > send CMD
> > > > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > > > trigger the delay_work to sent CMD 12.
> > > > >
> > > > > run benchmark result as below:
> > > > > SD Card : Samsumg Pro Plus 128GB
> > > > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> > > >
> > > > A much nicer commit message, thanks a lot! Would you mind running
> > > > some additional tests, like random I/O read/writes?
> > > >
> > > > Also, please specify the benchmark tool and command you are using.
> > > > In the meantime, I will continue to look at the code.
> > > >
> > >
> > > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > > and the Tool don't have random I/O to choice...
> > >
> > > Do you have any suggestion for testing random I/O But we think random
> > > I/O will not change much
> >
> > I would probably look into using fio, https://fio.readthedocs.io/en/latest/
> >
>
> Filled random I/O data
> Before the patch:
> CMD (Randread):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

Thanks for running the tests! Overall, I would not expect an impact on
the throughput when using a big blocksize like 1M. This is also pretty
clear from the results you have provided.

However, especially for random writes and reads, we want to try with
smaller blocksizes, like 8k or 16k. Would you mind running another
round of tests to see how that works out?

I haven't yet been able to provide you with comments on the code, but
I am looking into it.

Kind regards
Uffe

>
> mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
>   read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
>     clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
>      lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
>     clat percentiles (usec):
>      |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600], 20.00th=[11600],
>      | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731], 60.00th=[11731],
>      | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863], 95.00th=[11863],
>      | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664], 99.95th=[34341],
>      | 99.99th=[34341]
>    bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67, stdev=1467.81, samples=24
>    iops        : min=   80, max=   86, avg=85.00, stdev= 1.41, samples=24
>   lat (msec)   : 20=99.90%, 50=0.10%
>   cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>    READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec
>
> Disk stats (read/write):
>   mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612, util=99.23%
>
> CMD (Randwrite):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite
>
> mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
>   write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone resets
>     clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
>      lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
>     clat percentiles (usec):
>      |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676], 20.00th=[22938],
>      | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462], 60.00th=[23725],
>      | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773], 95.00th=[56361],
>      | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508], 99.95th=[94897],
>      | 99.99th=[94897]
>    bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13, stdev=3860.74, samples=53
>    iops        : min=   24, max=   42, avg=38.30, stdev= 3.77, samples=53
>   lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
>   cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>   WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec
>
> Disk stats (read/write):
>   mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956, util=99.90%
>
>
> After the patch:
>
> CMD (Randread):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread
>
> mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
>   read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
>     clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
>      lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
>     clat percentiles (usec):
>      |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469], 20.00th=[11469],
>      | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469], 60.00th=[11600],
>      | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600], 95.00th=[11600],
>      | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627], 99.95th=[32375],
>      | 99.99th=[32375]
>    bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26, stdev=1410.46, samples=23
>    iops        : min=   82, max=   88, avg=86.52, stdev= 1.38, samples=23
>   lat (msec)   : 20=99.80%, 50=0.20%
>   cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>    READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec
>
> Disk stats (read/write):
>   mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397, util=99.21%
>
> CMD (Randwrite):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite
>
> mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
>   write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone resets
>     clat (msec): min=16, max=118, avg=25.37, stdev=16.34
>      lat (msec): min=16, max=118, avg=25.44, stdev=16.34
>     clat percentiles (msec):
>      |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20], 20.00th=[   20],
>      | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20], 60.00th=[   20],
>      | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52], 95.00th=[   75],
>      | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114], 99.95th=[  120],
>      | 99.99th=[  120]
>    bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69, stdev=10498.00, samples=52
>    iops        : min=   20, max=   50, avg=39.27, stdev=10.25, samples=52
>   lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
>   cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>   WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec
>
> Disk stats (read/write):
>   mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144, util=99.89%
>
> BR,
> Ricky
Ricky WU Dec. 29, 2021, 12:39 p.m. UTC | #5
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 28, 2021 10:05 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Thursday, December 23, 2021 6:37 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > block rw
> > >
> > > On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > > -----Original Message-----
> > > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org
> > > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > > block rw
> > > > >
> > > > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com>
> wrote:
> > > > > >
> > > > > > Improving performance for the CMD is multi-block read/write
> > > > > > and the data is sequential.
> > > > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25)
> > > > > > or normal RW (CMD 17/24) if the CMD is multi-block and the
> > > > > > data is sequential then call to sd_rw_multi_seq()
> > > > > >
> > > > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > > > The new code to distinguish multi-block RW(CMD 18/25) and the
> > > > > > data is sequential or not, if the data is sequential RW driver
> > > > > > do not send CMD
> > > > > > 12 and bypass CMD 13 until wait the different direction RW CMD
> > > > > > or trigger the delay_work to sent CMD 12.
> > > > > >
> > > > > > run benchmark result as below:
> > > > > > SD Card : Samsumg Pro Plus 128GB Number of Samples:100, Sample
> > > > > > Size:10MB <Before> Read : 86.9 MB/s, Write : 38.3 MB/s <After>
> > > > > > Read : 91.5 MB/s, Write : 55.5 MB/s
> > > > >
> > > > > A much nicer commit message, thanks a lot! Would you mind
> > > > > running some additional tests, like random I/O read/writes?
> > > > >
> > > > > Also, please specify the benchmark tool and command you are using.
> > > > > In the meantime, I will continue to look at the code.
> > > > >
> > > >
> > > > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > > > and the Tool don't have random I/O to choice...
> > > >
> > > > Do you have any suggestion for testing random I/O But we think
> > > > random I/O will not change much
> > >
> > > I would probably look into using fio,
> > > https://fio.readthedocs.io/en/latest/
> > >
> >
> > Filled random I/O data
> > Before the patch:
> > CMD (Randread):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randread
> 
> Thanks for running the tests! Overall, I would not expect an impact on the
> throughput when using a big blocksize like 1M. This is also pretty clear from
> the result you have provided.
> 
> However, especially for random writes and reads, we want to try with smaller
> blocksizes. Like 8k or 16k, would you mind running another round of tests to
> see how that works out?
> 

Here is the random I/O data for 8k/16k block sizes:

Before(randread)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
Disk stats (read/write):
  mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
Disk stats (read/write):
  mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84%

Before(randwrite)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec
Disk stats (read/write):
  mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec
Disk stats (read/write):
  mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81%


After(randread)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
Disk stats (read/write):
  mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
Disk stats (read/write):
  mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87%

After(randwrite)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec
Disk stats (read/write):
  mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec
Disk stats (read/write):
  mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80%

> I haven't yet been able to provide you with comments on the code, but I am
> looking into it.
> 
> Kind regards
> Uffe
> 
> >
> > mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
> >   read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
> >     clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
> >      lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
> >     clat percentiles (usec):
> >      |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600],
> 20.00th=[11600],
> >      | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731],
> 60.00th=[11731],
> >      | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863],
> 95.00th=[11863],
> >      | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664],
> 99.95th=[34341],
> >      | 99.99th=[34341]
> >    bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67,
> stdev=1467.81, samples=24
> >    iops        : min=   80, max=   86, avg=85.00, stdev= 1.41,
> samples=24
> >   lat (msec)   : 20=99.90%, 50=0.10%
> >   cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >    READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s
> > (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612,
> > util=99.23%
> >
> > CMD (Randwrite):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randwrite
> >
> > mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
> >   write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone
> resets
> >     clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
> >      lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
> >     clat percentiles (usec):
> >      |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676],
> 20.00th=[22938],
> >      | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462],
> 60.00th=[23725],
> >      | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773],
> 95.00th=[56361],
> >      | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508],
> 99.95th=[94897],
> >      | 99.99th=[94897]
> >    bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13,
> stdev=3860.74, samples=53
> >    iops        : min=   24, max=   42, avg=38.30, stdev= 3.77,
> samples=53
> >   lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
> >   cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >   WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s
> > (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956,
> > util=99.90%
> >
> >
> > After the patch:
> >
> > CMD (Randread):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randread
> >
> > mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
> >   read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
> >     clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
> >      lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
> >     clat percentiles (usec):
> >      |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469],
> 20.00th=[11469],
> >      | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469],
> 60.00th=[11600],
> >      | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600],
> 95.00th=[11600],
> >      | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627],
> 99.95th=[32375],
> >      | 99.99th=[32375]
> >    bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26,
> stdev=1410.46, samples=23
> >    iops        : min=   82, max=   88, avg=86.52, stdev= 1.38,
> samples=23
> >   lat (msec)   : 20=99.80%, 50=0.20%
> >   cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >    READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s
> > (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397,
> > util=99.21%
> >
> > CMD (Randwrite):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randwrite
> >
> > mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
> >   write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone
> resets
> >     clat (msec): min=16, max=118, avg=25.37, stdev=16.34
> >      lat (msec): min=16, max=118, avg=25.44, stdev=16.34
> >     clat percentiles (msec):
> >      |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20],
> 20.00th=[   20],
> >      | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20],
> 60.00th=[   20],
> >      | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52],
> 95.00th=[   75],
> >      | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114],
> 99.95th=[  120],
> >      | 99.99th=[  120]
> >    bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69,
> stdev=10498.00, samples=52
> >    iops        : min=   20, max=   50, avg=39.27, stdev=10.25,
> samples=52
> >   lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
> >   cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >   WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s
> > (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144,
> > util=99.89%
> >
> > BR,
> > Ricky
Ulf Hansson Feb. 7, 2022, 11:11 a.m. UTC | #6
[...]

> > > > >
> > > > > Do you have any suggestion for testing random I/O But we think
> > > > > random I/O will not change much
> > > >
> > > > I would probably look into using fio,
> > > > https://fio.readthedocs.io/en/latest/
> > > >
> > >
> > > Filled random I/O data
> > > Before the patch:
> > > CMD (Randread):
> > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=1M -rw=randread
> >
> > Thanks for running the tests! Overall, I would not expect an impact on the
> > throughput when using a big blocksize like 1M. This is also pretty clear from
> > the result you have provided.
> >
> > However, especially for random writes and reads, we want to try with smaller
> > blocksizes. Like 8k or 16k, would you mind running another round of tests to
> > see how that works out?
> >
>
> Filled random I/O data(8k/16k)

Hi Ricky,

Apologies for the delay! Thanks for running the tests. Let me comment
on them below.

>
> Before(randread)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
> Disk stats (read/write):
>   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
> Disk stats (read/write):
>   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84%
>
> Before(randrwrite)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec
> Disk stats (read/write):
>   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec
> Disk stats (read/write):
>   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81%
>
>
> After(randread)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
> Disk stats (read/write):
>   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
> Disk stats (read/write):
>   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87%
>
> After(randwrite)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec
> Disk stats (read/write):
>   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec
> Disk stats (read/write):
>   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80%

It looks like the rand-read tests above are degrading with the new
changes, while the rand-write results are mixed, improving in some
cases and degrading in others.

To summarize my view from all the tests you have done at this point
(thanks a lot); it looks like the block I/O merging isn't really
happening at the common block layer, at least not to the extent that
would benefit us. You have clearly shown that with the suggested change
in the mmc host driver: by detecting whether the "next" request is
sequential to the previous one, it allows us to skip a CMD12 and
minimize some command overhead.
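
(To illustrate the idea, a minimal sketch of such a check could look
like the below. The function name here is made up, and only the prev_*
bookkeeping fields are taken from the patch itself; this is not the
patch's exact code.)

static bool sd_req_is_sequential(struct realtek_pci_sdmmc *host,
				 struct mmc_request *mrq)
{
	struct mmc_data *data = mrq->data;
	u8 dir = (data->flags & MMC_DATA_READ) ?
		 DMA_DIR_FROM_CARD : DMA_DIR_TO_CARD;

	/* Same transfer direction, and the new start block follows
	 * directly on from where the previous request ended (assumes a
	 * block-addressed card, where cmd->arg is the start block).
	 */
	return host->prev_dir == dir &&
	       host->prev_sec_addr + host->prev_sec_cnt == mrq->cmd->arg;
}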

However, according to the latest tests above, you have also proved
that the changes in the mmc host driver don't come without a cost.
In particular, small random reads would degrade in performance with
these changes.

That said, it looks to me that rather than trying to improve things
for one specific mmc host driver, it would be better to look at this
from the generic block layer point of view - and investigate why
sequential reads/writes aren't getting merged often enough for the
MMC/SD case. If we can fix the problem there, all mmc host drivers
would benefit I assume.
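
(As a quick way to check this, the merge counters are visible both in
fio's "Disk stats" lines above, i.e. the merge=0/0 figures, and
directly via sysfs, for example:

  cat /sys/block/mmcblk0/stat

where the second and sixth fields are the accumulated read and write
merges.)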

BTW, have you tried with different I/O schedulers? If you haven't
tried BFQ, I suggest you do as it's a good fit for MMC/SD.

[...]

Kind regards
Uffe
Ricky WU Feb. 10, 2022, 6:43 a.m. UTC | #7
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Monday, February 7, 2022 7:11 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> [...]
> 
> > > > > >
> > > > > > Do you have any suggestion for testing random I/O But we think
> > > > > > random I/O will not change much
> > > > >
> > > > > I would probably look into using fio,
> > > > > https://fio.readthedocs.io/en/latest/
> > > > >
> > > >
> > > > Filled random I/O data
> > > > Before the patch:
> > > > CMD (Randread):
> > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=1M -rw=randread
> > >
> > > Thanks for running the tests! Overall, I would not expect an impact
> > > on the throughput when using a big blocksize like 1M. This is also
> > > pretty clear from the result you have provided.
> > >
> > > However, especially for random writes and reads, we want to try with
> > > smaller blocksizes. Like 8k or 16k, would you mind running another
> > > round of tests to see how that works out?
> > >
> >
> > Filled random I/O data(8k/16k)
> 
> Hi Ricky,
> 
> Apologize for the delay! Thanks for running the tests. Let me comment on
> them below.
> 
> >
> > Before(randread)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=8k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk
> stats (read/write):
> >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > util=99.89%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=16k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk
> stats (read/write):
> >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > util=99.84%
> >
> > Before(randrwrite)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=8k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234,
> > util=99.90%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=16k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > util=99.81%
> >
> >
> > After(randread)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=8k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk
> stats (read/write):
> >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > util=99.94%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=16k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk
> stats (read/write):
> >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > util=99.87%
> >
> > After(randwrite)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=8k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats
> (read/write):
> >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267,
> > util=99.92%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=16k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > util=99.80%
> 
> It looks like the rand-read tests above are degrading with the new changes,
> while rand-writes are both improving and degrading.
> 
> To summarize my view from all the tests you have done at this point (thanks a
> lot); it looks like the block I/O merging isn't really happening at common
> blocklayer, at least to that extent that would benefit us. Clearly you have shown
> that by the suggested change in the mmc host driver, by detecting whether the
> "next" request is sequential to the previous one, which allows us to skip a
> CMD12 and minimize some command overhead.
> 
> However, according to the latest tests above, you have also proved that the
> changes in the mmc host driver doesn't come without a cost.
> In particular, small random-reads would degrade in performance from these
> changes.
> 
> That said, it looks to me that rather than trying to improve things for one
> specific mmc host driver, it would be better to look at this from the generic
> block layer point of view - and investigate why sequential reads/writes aren't
> getting merged often enough for the MMC/SD case. If we can fix the problem
> there, all mmc host drivers would benefit I assume.
> 

So you are thinking about how to patch this in the MMC/SD core?
I don't know whether this method is compatible with other MMC hosts, or whether they would need to patch other code in their host drivers.

> BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I
> suggest you do as it's a good fit for MMC/SD.
> 

I'm not sure what you mean by different I/O schedulers.

> [...]
> 
> Kind regards
> Uffe
Ulf Hansson Feb. 10, 2022, 2:56 p.m. UTC | #8
On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Monday, February 7, 2022 7:11 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > [...]
> >
> > > > > > >
> > > > > > > Do you have any suggestion for testing random I/O But we think
> > > > > > > random I/O will not change much
> > > > > >
> > > > > > I would probably look into using fio,
> > > > > > https://fio.readthedocs.io/en/latest/
> > > > > >
> > > > >
> > > > > Filled random I/O data
> > > > > Before the patch:
> > > > > CMD (Randread):
> > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=1M -rw=randread
> > > >
> > > > Thanks for running the tests! Overall, I would not expect an impact
> > > > on the throughput when using a big blocksize like 1M. This is also
> > > > pretty clear from the result you have provided.
> > > >
> > > > However, especially for random writes and reads, we want to try with
> > > > smaller blocksizes. Like 8k or 16k, would you mind running another
> > > > round of tests to see how that works out?
> > > >
> > >
> > > Filled random I/O data(8k/16k)
> >
> > Hi Ricky,
> >
> > Apologize for the delay! Thanks for running the tests. Let me comment on
> > them below.
> >
> > >
> > > Before(randread)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=8k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > > util=99.89%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=16k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > > util=99.84%
> > >
> > > Before(randrwrite)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=8k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234,
> > > util=99.90%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=16k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > > util=99.81%
> > >
> > >
> > > After(randread)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=8k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > > util=99.94%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=16k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > > util=99.87%
> > >
> > > After(randwrite)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=8k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267,
> > > util=99.92%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=16k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > > util=99.80%
> >
> > It looks like the rand-read tests above are degrading with the new changes,
> > while rand-writes are both improving and degrading.
> >
> > To summarize my view from all the tests you have done at this point (thanks a
> > lot); it looks like the block I/O merging isn't really happening at common
> > blocklayer, at least to that extent that would benefit us. Clearly you have shown
> > that by the suggested change in the mmc host driver, by detecting whether the
> > "next" request is sequential to the previous one, which allows us to skip a
> > CMD12 and minimize some command overhead.
> >
> > However, according to the latest tests above, you have also proved that the
> > changes in the mmc host driver doesn't come without a cost.
> > In particular, small random-reads would degrade in performance from these
> > changes.
> >
> > That said, it looks to me that rather than trying to improve things for one
> > specific mmc host driver, it would be better to look at this from the generic
> > block layer point of view - and investigate why sequential reads/writes aren't
> > getting merged often enough for the MMC/SD case. If we can fix the problem
> > there, all mmc host drivers would benefit I assume.
> >
>
> So you are thinking about how to patch this in MMC/SD?
> I don't know if this method is compatible with other MMC Hosts? Or they need to patch other code on their host driver

I would not limit this to the core layer of MMC/SD. The point I was
trying to make was that it doesn't look like the generic block layer
is merging the sequential I/O requests in the most efficient way, at
least for the eMMC/SD devices. Why this is the case, I can't tell. It
looks like we need to do some more in-depth analysis to understand why
merging isn't efficient for us.

>
> > BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I
> > suggest you do as it's a good fit for MMC/SD.
> >
>
> I don’t know what is different I/O schedulers means?

What I/O scheduler did you use when running the test?

For MMC/SD the only one that makes sense to use is BFQ, however that
needs to be configured via sysfs after boot. There is no way,
currently, to make it the default, I think. You may look at
Documentation/block/bfq-iosched.rst, if you are more interested.
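
(For reference, assuming the card enumerates as mmcblk0 as in your
tests, and that BFQ support is built into the kernel, the active
scheduler can be checked and switched at runtime via sysfs:

  cat /sys/block/mmcblk0/queue/scheduler
  echo bfq | sudo tee /sys/block/mmcblk0/queue/scheduler

The first command lists the available schedulers, with the active one
shown in brackets.)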

Kind regards
Uffe
Ricky WU Oct. 11, 2023, 5:36 a.m. UTC | #9
Hi Ulf Hansson,

May I ask what the status of this patch is, or whether there are any remaining concerns about it?

Ricky
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Thursday, February 10, 2022 10:57 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Monday, February 7, 2022 7:11 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > block rw
> > >
> > > [...]
> > >
> > > > > > > >
> > > > > > > > Do you have any suggestion for testing random I/O But we
> > > > > > > > think random I/O will not change much
> > > > > > >
> > > > > > > I would probably look into using fio,
> > > > > > > https://fio.readthedocs.io/en/latest/
> > > > > > >
> > > > > >
> > > > > > Filled random I/O data
> > > > > > Before the patch:
> > > > > > CMD (Randread):
> > > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G
> > > > > > -name=mytest -bs=1M -rw=randread
> > > > >
> > > > > Thanks for running the tests! Overall, I would not expect an
> > > > > impact on the throughput when using a big blocksize like 1M.
> > > > > This is also pretty clear from the result you have provided.
> > > > >
> > > > > However, especially for random writes and reads, we want to try
> > > > > with smaller blocksizes. Like 8k or 16k, would you mind running
> > > > > another round of tests to see how that works out?
> > > > >
> > > >
> > > > Filled random I/O data(8k/16k)
> > >
> > > Hi Ricky,
> > >
> > > Apologize for the delay! Thanks for running the tests. Let me
> > > comment on them below.
> > >
> > > >
> > > > Before(randread)
> > > > 8k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=8k -rw=randread
> > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
> Disk
> > > stats (read/write):
> > > >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > > > util=99.89%
> > > >
> > > > 16k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=16k -rw=randread
> > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
> Disk
> > > stats (read/write):
> > > >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > > > util=99.84%
> > > >
> > > > Before(randrwrite)
> > > > 8k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > -name=mytest -bs=8k -rw=randwrite
> > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk
> > > > stats
> > > (read/write):
> > > >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154,
> > > > in_queue=24234, util=99.90%
> > > >
> > > > 16k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > -name=mytest -bs=16k -rw=randwrite
> > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk
> > > > stats
> > > (read/write):
> > > >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > > > util=99.81%
> > > >
> > > >
> > > > After(randread)
> > > > 8k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=8k -rw=randread
> > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
> Disk
> > > stats (read/write):
> > > >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > > > util=99.94%
> > > >
> > > > 16k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=16k -rw=randread
> > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
> Disk
> > > stats (read/write):
> > > >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > > > util=99.87%
> > > >
> > > > After(randwrite)
> > > > 8k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > -name=mytest -bs=8k -rw=randwrite
> > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk
> > > > stats
> > > (read/write):
> > > >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182,
> > > > in_queue=23267, util=99.92%
> > > >
> > > > 16k:
> > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > -name=mytest -bs=16k -rw=randwrite
> > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > result:
> > > > Run status group 0 (all jobs):
> > > >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk
> > > > stats
> > > (read/write):
> > > >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > > > util=99.80%
> > >
> > > It looks like the rand-read tests above are degrading with the new
> > > changes, while rand-writes are both improving and degrading.
> > >
> > > To summarize my view from all the tests you have done at this point
> > > (thanks a lot); it looks like the block I/O merging isn't really
> > > happening at common blocklayer, at least to that extent that would
> > > benefit us. Clearly you have shown that by the suggested change in
> > > the mmc host driver, by detecting whether the "next" request is
> > > sequential to the previous one, which allows us to skip a
> > > CMD12 and minimize some command overhead.
> > >
> > > However, according to the latest tests above, you have also proved
> > > that the changes in the mmc host driver doesn't come without a cost.
> > > In particular, small random-reads would degrade in performance from
> > > these changes.
> > >
> > > That said, it looks to me that rather than trying to improve things
> > > for one specific mmc host driver, it would be better to look at this
> > > from the generic block layer point of view - and investigate why
> > > sequential reads/writes aren't getting merged often enough for the
> > > MMC/SD case. If we can fix the problem there, all mmc host drivers would
> benefit I assume.
> > >
> >
> > So you are thinking about how to patch this in MMC/SD?
> > I don't know if this method is compatible with other MMC Hosts? Or
> > they need to patch other code on their host driver
> 
> I would not limit this to the core layer of MMC/SD. The point I was trying to
> make was that it doesn't look like the generic block layer is merging the
> sequential I/O requests in the most efficient way, at least for the eMMC/SD
> devices. Why this is the case, I can't tell. It looks like we need to do some more
> in-depth analysis to understand why merging isn't efficient for us.
> 
> >
> > > BTW, have you tried with different I/O schedulers? If you haven't
> > > tried BFQ, I suggest you do as it's a good fit for MMC/SD.
> > >
> >
> > I don’t know what is different I/O schedulers means?
> 
> What I/O scheduler did you use when running the test?
> 
> For MMC/SD the only one that makes sense to use is BFQ, however that needs
> to be configured via sysfs after boot. There is no way, currently, to make it the
> default, I think. You may look at Documentation/block/bfq-iosched.rst, if you
> are more interested.
> 
> Kind regards
> Uffe
Ulf Hansson Oct. 12, 2023, 1:40 p.m. UTC | #10
On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
>
> Hi Ulf Hansson,
>
> Can I know what is this patch status or has some concern on this patch?

Didn't you read my earlier replies?

Kind regards
Uffe

>
> Ricky
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Thursday, February 10, 2022 10:57 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > Sent: Monday, February 7, 2022 7:11 PM
> > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org
> > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > block rw
> > > >
> > > > [...]
> > > >
> > > > > > > > >
> > > > > > > > > Do you have any suggestion for testing random I/O But we
> > > > > > > > > think random I/O will not change much
> > > > > > > >
> > > > > > > > I would probably look into using fio,
> > > > > > > > https://fio.readthedocs.io/en/latest/
> > > > > > > >
> > > > > > >
> > > > > > > Filled random I/O data
> > > > > > > Before the patch:
> > > > > > > CMD (Randread):
> > > > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G
> > > > > > > -name=mytest -bs=1M -rw=randread
> > > > > >
> > > > > > Thanks for running the tests! Overall, I would not expect an
> > > > > > impact on the throughput when using a big blocksize like 1M.
> > > > > > This is also pretty clear from the result you have provided.
> > > > > >
> > > > > > However, especially for random writes and reads, we want to try
> > > > > > with smaller blocksizes. Like 8k or 16k, would you mind running
> > > > > > another round of tests to see how that works out?
> > > > > >
> > > > >
> > > > > Filled random I/O data(8k/16k)
> > > >
> > > > Hi Ricky,
> > > >
> > > > Apologize for the delay! Thanks for running the tests. Let me
> > > > comment on them below.
> > > >
> > > > >
> > > > > Before(randread)
> > > > > 8k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=8k -rw=randread
> > > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > > > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
> > Disk
> > > > stats (read/write):
> > > > >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > > > > util=99.89%
> > > > >
> > > > > 16k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=16k -rw=randread
> > > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > > > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
> > Disk
> > > > stats (read/write):
> > > > >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > > > > util=99.84%
> > > > >
> > > > > Before(randrwrite)
> > > > > 8k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > -name=mytest -bs=8k -rw=randwrite
> > > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > > > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk
> > > > > stats
> > > > (read/write):
> > > > >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154,
> > > > > in_queue=24234, util=99.90%
> > > > >
> > > > > 16k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > -name=mytest -bs=16k -rw=randwrite
> > > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > > > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk
> > > > > stats
> > > > (read/write):
> > > > >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > > > > util=99.81%
> > > > >
> > > > >
> > > > > After(randread)
> > > > > 8k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=8k -rw=randread
> > > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > > > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
> > Disk
> > > > stats (read/write):
> > > > >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > > > > util=99.94%
> > > > >
> > > > > 16k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=16k -rw=randread
> > > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > > > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
> > Disk
> > > > stats (read/write):
> > > > >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > > > > util=99.87%
> > > > >
> > > > > After(randwrite)
> > > > > 8k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > -name=mytest -bs=8k -rw=randwrite
> > > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > > > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk
> > > > > stats
> > > > (read/write):
> > > > >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182,
> > > > > in_queue=23267, util=99.92%
> > > > >
> > > > > 16k:
> > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > -name=mytest -bs=16k -rw=randwrite
> > > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > result:
> > > > > Run status group 0 (all jobs):
> > > > >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > > > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk
> > > > > stats
> > > > (read/write):
> > > > >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > > > > util=99.80%
> > > >
> > > > It looks like the rand-read tests above are degrading with the new
> > > > changes, while rand-writes are both improving and degrading.
> > > >
> > > > To summarize my view from all the tests you have done at this point
> > > > (thanks a lot); it looks like the block I/O merging isn't really
> > > > happening at common blocklayer, at least to that extent that would
> > > > benefit us. Clearly you have shown that by the suggested change in
> > > > the mmc host driver, by detecting whether the "next" request is
> > > > sequential to the previous one, which allows us to skip a
> > > > CMD12 and minimize some command overhead.
> > > >
> > > > However, according to the latest tests above, you have also proved
> > > > that the changes in the mmc host driver doesn't come without a cost.
> > > > In particular, small random-reads would degrade in performance from
> > > > these changes.
> > > >
> > > > That said, it looks to me that rather than trying to improve things
> > > > for one specific mmc host driver, it would be better to look at this
> > > > from the generic block layer point of view - and investigate why
> > > > sequential reads/writes aren't getting merged often enough for the
> > > > MMC/SD case. If we can fix the problem there, all mmc host drivers would
> > benefit I assume.
> > > >
> > >
> > > So you are thinking about how to patch this in MMC/SD?
> > > I don't know if this method is compatible with other MMC Hosts? Or
> > > they need to patch other code on their host driver
> >
> > I would not limit this to the core layer of MMC/SD. The point I was trying to
> > make was that it doesn't look like the generic block layer is merging the
> > sequential I/O requests in the most efficient way, at least for the eMMC/SD
> > devices. Why this is the case, I can't tell. It looks like we need to do some more
> > in-depth analysis to understand why merging isn't efficient for us.
> >
> > >
> > > > BTW, have you tried with different I/O schedulers? If you haven't
> > > > tried BFQ, I suggest you do as it's a good fit for MMC/SD.
> > > >
> > >
> > > I don’t know what is different I/O schedulers means?
> >
> > What I/O scheduler did you use when running the test?
> >
> > For MMC/SD the only one that makes sense to use is BFQ, however that needs
> > to be configured via sysfs after boot. There is no way, currently, to make it the
> > default, I think. You may look at Documentation/block/bfq-iosched.rst, if you
> > are more interested.
> >
> > Kind regards
> > Uffe
>
Ricky WU Oct. 13, 2023, 2:27 a.m. UTC | #11
> On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > Hi Ulf Hansson,
> >
> > Can I know what is this patch status or has some concern on this patch?
> 
> Didn't you read my earlier replies?
> 

Are you talking about testing the speed with BFQ?
We tested the read/write speed and it is better than before, and our customer that uses our reader in their product also tested the read/write speed; they want us to get this patch merged.


> Kind regards
> Uffe
> 
> >
> > Ricky
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Thursday, February 10, 2022 10:57 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> > >
> > > On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > > Sent: Monday, February 7, 2022 7:11 PM
> > > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org
> > > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > > block rw
> > > > >
> > > > > [...]
> > > > >
> > > > > > > > > >
> > > > > > > > > > Do you have any suggestion for testing random I/O But we
> > > > > > > > > > think random I/O will not change much
> > > > > > > > >
> > > > > > > > > I would probably look into using fio,
> > > > > > > > > https://fio.readthedocs.io/en/latest/
> > > > > > > > >
> > > > > > > >
> > > > > > > > Filled random I/O data
> > > > > > > > Before the patch:
> > > > > > > > CMD (Randread):
> > > > > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G
> > > > > > > > -name=mytest -bs=1M -rw=randread
> > > > > > >
> > > > > > > Thanks for running the tests! Overall, I would not expect an
> > > > > > > impact on the throughput when using a big blocksize like 1M.
> > > > > > > This is also pretty clear from the result you have provided.
> > > > > > >
> > > > > > > However, especially for random writes and reads, we want to try
> > > > > > > with smaller blocksizes. Like 8k or 16k, would you mind running
> > > > > > > another round of tests to see how that works out?
> > > > > > >
> > > > > >
> > > > > > Filled random I/O data(8k/16k)
> > > > >
> > > > > Hi Ricky,
> > > > >
> > > > > Apologize for the delay! Thanks for running the tests. Let me
> > > > > comment on them below.
> > > > >
> > > > > >
> > > > > > Before(randread)
> > > > > > 8k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > > -bs=8k -rw=randread
> > > > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > > > > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB),
> run=62019-62019msec
> > > Disk
> > > > > stats (read/write):
> > > > > >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0,
> in_queue=57751,
> > > > > > util=99.89%
> > > > > >
> > > > > > 16k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > > -bs=16k -rw=randread
> > > > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > > > > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB),
> run=44034-44034msec
> > > Disk
> > > > > stats (read/write):
> > > > > >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0,
> in_queue=39420,
> > > > > > util=99.84%
> > > > > >
> > > > > > Before(randrwrite)
> > > > > > 8k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > > -name=mytest -bs=8k -rw=randwrite
> > > > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > > > > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec
> Disk
> > > > > > stats
> > > > > (read/write):
> > > > > >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154,
> > > > > > in_queue=24234, util=99.90%
> > > > > >
> > > > > > 16k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > > -name=mytest -bs=16k -rw=randwrite
> > > > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > > > > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec
> Disk
> > > > > > stats
> > > > > (read/write):
> > > > > >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647,
> in_queue=13728,
> > > > > > util=99.81%
> > > > > >
> > > > > >
> > > > > > After(randread)
> > > > > > 8k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > > -bs=8k -rw=randread
> > > > > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > > > > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB),
> run=82397-82397msec
> > > Disk
> > > > > stats (read/write):
> > > > > >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0,
> in_queue=74125,
> > > > > > util=99.94%
> > > > > >
> > > > > > 16k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > > -bs=16k -rw=randread
> > > > > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > > > > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB),
> run=51076-51076msec
> > > Disk
> > > > > stats (read/write):
> > > > > >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0,
> in_queue=46254,
> > > > > > util=99.87%
> > > > > >
> > > > > > After(randwrite)
> > > > > > 8k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > > -name=mytest -bs=8k -rw=randwrite
> > > > > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B,
> > > > > > (T) 8192B-8192B, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > > > > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec
> Disk
> > > > > > stats
> > > > > (read/write):
> > > > > >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182,
> > > > > > in_queue=23267, util=99.92%
> > > > > >
> > > > > > 16k:
> > > > > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > > -group_reporting -ioengine=psync -iodepth=1 -size=100M
> > > > > > -name=mytest -bs=16k -rw=randwrite
> > > > > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > > > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > > > > result:
> > > > > > Run status group 0 (all jobs):
> > > > > >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > > > > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec
> Disk
> > > > > > stats
> > > > > (read/write):
> > > > > >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120,
> in_queue=15204,
> > > > > > util=99.80%
> > > > >
> > > > > It looks like the rand-read tests above are degrading with the new
> > > > > changes, while rand-writes are both improving and degrading.
> > > > >
> > > > > To summarize my view from all the tests you have done at this point
> > > > > (thanks a lot); it looks like the block I/O merging isn't really
> > > > > happening at common blocklayer, at least to that extent that would
> > > > > benefit us. Clearly you have shown that by the suggested change in
> > > > > the mmc host driver, by detecting whether the "next" request is
> > > > > sequential to the previous one, which allows us to skip a
> > > > > CMD12 and minimize some command overhead.
> > > > >
> > > > > However, according to the latest tests above, you have also proved
> > > > > that the changes in the mmc host driver doesn't come without a cost.
> > > > > In particular, small random-reads would degrade in performance from
> > > > > these changes.
> > > > >
> > > > > That said, it looks to me that rather than trying to improve things
> > > > > for one specific mmc host driver, it would be better to look at this
> > > > > from the generic block layer point of view - and investigate why
> > > > > sequential reads/writes aren't getting merged often enough for the
> > > > > MMC/SD case. If we can fix the problem there, all mmc host drivers
> would
> > > benefit I assume.
> > > > >
> > > >
> > > > So you are thinking about how to patch this in MMC/SD?
> > > > I don't know if this method is compatible with other MMC Hosts? Or
> > > > they need to patch other code on their host driver
> > >
> > > I would not limit this to the core layer of MMC/SD. The point I was trying to
> > > make was that it doesn't look like the generic block layer is merging the
> > > sequential I/O requests in the most efficient way, at least for the eMMC/SD
> > > devices. Why this is the case, I can't tell. It looks like we need to do some
> more
> > > in-depth analysis to understand why merging isn't efficient for us.
> > >
> > > >
> > > > > BTW, have you tried with different I/O schedulers? If you haven't
> > > > > tried BFQ, I suggest you do as it's a good fit for MMC/SD.
> > > > >
> > > >
> > > > I don’t know what is different I/O schedulers means?
> > >
> > > What I/O scheduler did you use when running the test?
> > >
> > > For MMC/SD the only one that makes sense to use is BFQ, however that
> needs
> > > to be configured via sysfs after boot. There is no way, currently, to make it
> the
> > > default, I think. You may look at Documentation/block/bfq-iosched.rst, if
> you
> > > are more interested.
> > >
> > > Kind regards
> > > Uffe
> >
Ulf Hansson Oct. 17, 2023, 9:28 p.m. UTC | #12
On Fri, 13 Oct 2023 at 04:27, Ricky WU <ricky_wu@realtek.com> wrote:
>
> > On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > > Hi Ulf Hansson,
> > >
> > > Can I know what is this patch status or has some concern on this patch?
> >
> > Didn't you read my earlier replies?
> >
>
> Are you talking about BFQ for testing speed?
> Because we tested the Read/Write speed are better than before and our customer that uses our reader on their product also tested the Read/Write speed, they want us to push this patch on

It's certainly a very positive thing that your target is to upstream
solutions that improve performance. We all appreciate this!

In this regard, I believe I have tried to guide you on how to move
forward with this. This particular optimization doesn't belong in an
mmc host driver, but rather at the common upper block device driver
layer, such that it can benefit more than one particular mmc host
driver.

I fully understand that making that kind of improvement is way more
difficult and requires in-depth analysis to understand what is
happening on those layers too. On the other hand it could be something
that may benefit a lot of devices/platforms. Unfortunately, I am
currently not in a position where I have the bandwidth to dive deeper
into this.

If you decide to pursue your investigations, I think we need to
involve the experts from the common block community (linux-block
mailing list) to get their advice.

So to be clear, I am not going to apply the $subject patch - or
anything similar - to an mmc host driver.

[...]

Kind regards
Uffe
Ricky WU Oct. 25, 2023, 10:30 a.m. UTC | #13
> >
> > > On Wed, 11 Oct 2023 at 07:36, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > Hi Ulf Hansson,
> > > >
> > > > Can I know what is this patch status or has some concern on this patch?
> > >
> > > Didn't you read my earlier replies?
> > >
> >
> > Are you talking about BFQ for testing speed?
> > Because we tested the Read/Write speed are better than before and our
> customer that uses our reader on their product also tested the Read/Write
> speed, they want us to push this patch on
> 
> It's certainly a very positive thing that your target is to upstream
> solutions that improve performance. We all appreciate this!
> 
> In this regard, I believe I have tried to guide you on how to move
> forward with this. This particular optimization doesn't belong in an
> mmc host driver, but rather at the common upper block device driver
> layer, such that it can benefit more than one particular mmc host
> driver.
> 
> I fully understand that making that kind of improvement is way more
> difficult and requires in-depth analysis to understand what is
> happening on those layers too. On the other hand it could be something
> that may benefit a lot of devices/platforms. Unfortunately, I am
> currently not in a position where I have the bandwidth to dive deeper
> into this.
> 
> If you decide to pursue your investigations, I think we need to
> involve the experts from the common block community (linux-block
> mailing list) to get their advice.
> 
> So to be clear, I am not going to apply $subject patch - or anything
> similar to an mmc host driver.
> 

This performance improvement was developed around our HW design.

We discussed this internally. The CMD12 response timing depends on the
HW design, so this solution may not fit all devices. The core of this
mechanism is that, when we get sequential data, we control our DMA
registers for the read/write transfer; this is implemented differently
on different devices, so it is not easy to push the same approach into
the mmc core.

> 
> Kind regards
> Uffe

Patch

diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c
index 58cfaffa3c2d..ee2b0eec6422 100644
--- a/drivers/mmc/host/rtsx_pci_sdmmc.c
+++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
@@ -22,6 +22,8 @@ 
 #include <asm/unaligned.h>
 #include <linux/pm_runtime.h>
 
+enum RW_MODE	{NORMAL_RW, SEQ_RW};
+
 struct realtek_pci_sdmmc {
 	struct platform_device	*pdev;
 	struct rtsx_pcr		*pcr;
@@ -31,6 +33,7 @@  struct realtek_pci_sdmmc {
 
 	struct work_struct	work;
 	struct mutex		host_mutex;
+	struct delayed_work		rw_idle_work;
 
 	u8			ssc_depth;
 	unsigned int		clock;
@@ -46,6 +49,12 @@  struct realtek_pci_sdmmc {
 	s32			cookie;
 	int			cookie_sg_count;
 	bool			using_cookie;
+
+	enum RW_MODE		rw_mode;
+	u8		prev_dir;
+	u8		cur_dir;
+	u64		prev_sec_addr;
+	u32		prev_sec_cnt;
 };
 
 static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios);
@@ -226,6 +235,14 @@  static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host,
 	dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg = 0x%08x\n",
 			__func__, cmd_idx, arg);
 
+	if (cmd_idx == MMC_SEND_STATUS && host->rw_mode == SEQ_RW) {
+		cmd->resp[0] = R1_READY_FOR_DATA | (R1_STATE_TRAN << 9);
+		goto out;
+	}
+
+	if (!mmc_op_multi(cmd->opcode))
+		host->rw_mode = NORMAL_RW;
+
 	rsp_type = sd_response_type(cmd);
 	if (rsp_type < 0)
 		goto out;
@@ -542,6 +559,93 @@  static int sd_write_long_data(struct realtek_pci_sdmmc *host,
 	return 0;
 }
 
+static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_host *mmc = host->mmc;
+	struct mmc_card *card = mmc->card;
+	struct mmc_data *data = mrq->data;
+	int uhs = mmc_card_uhs(card);
+	u8 cfg2;
+	int err;
+	size_t data_len = data->blksz * data->blocks;
+
+	cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
+		SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 | SD_RSP_LEN_0;
+
+	if (!uhs)
+		cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
+
+	rtsx_pci_init_cmd(pcr);
+	sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
+			DMA_DONE_INT, DMA_DONE_INT);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
+		0xFF, (u8)(data_len >> 24));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
+		0xFF, (u8)(data_len >> 16));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
+		0xFF, (u8)(data_len >> 8));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF, (u8)data_len);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+			0x03 | DMA_PACK_SIZE_MASK,
+			DMA_DIR_FROM_CARD | DMA_EN | DMA_512);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+			0x03 | DMA_PACK_SIZE_MASK,
+			DMA_DIR_TO_CARD | DMA_EN | DMA_512);
+
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
+			0x01, RING_BUFFER);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_READ_3);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_WRITE_3);
+
+	rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
+			SD_TRANSFER_END, SD_TRANSFER_END);
+	rtsx_pci_send_cmd_no_wait(pcr);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 1, 10000);
+	else
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 0, 10000);
+
+	if (err < 0) {
+		sd_clear_error(host);
+		return err;
+	}
+
+	return 0;
+}
+
+static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+	sd_send_cmd_get_rsp(host, cmd);
+	udelay(50);
+	rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
+	kfree(cmd);
+	return 0;
+}
+
 static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host)
 {
 	rtsx_pci_write_register(host->pcr, SD_CFG1,
@@ -796,6 +900,45 @@  static inline int sd_rw_cmd(struct mmc_command *cmd)
 		(cmd->opcode == MMC_WRITE_BLOCK);
 }
 
+static void sd_rw_idle_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct realtek_pci_sdmmc *host = container_of(dwork,
+			struct realtek_pci_sdmmc, rw_idle_work);
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+
+	sd_send_cmd_get_rsp(host, cmd);
+	host->rw_mode = NORMAL_RW;
+	kfree(cmd);
+}
+
+static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct mmc_command *cmd = mrq->cmd;
+	struct mmc_data *data = mrq->data;
+
+	if (!mmc_op_multi(cmd->opcode))
+		return 0;
+
+	if (host->prev_dir != host->cur_dir)
+		return 0;
+
+	if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
+		return 0;
+
+	return 1;
+}
+
 static void sd_request(struct work_struct *work)
 {
 	struct realtek_pci_sdmmc *host = container_of(work,
@@ -841,12 +984,36 @@  static void sd_request(struct work_struct *work)
 	if (!data_size) {
 		sd_send_cmd_get_rsp(host, cmd);
 	} else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
-		cmd->error = sd_rw_multi(host, mrq);
-		if (!host->using_cookie)
-			sdmmc_post_req(host->mmc, host->mrq, 0);
+		/* Check multi-block and seq function*/
+		if (data->flags & MMC_DATA_READ)
+			host->cur_dir = DMA_DIR_FROM_CARD;
+		else
+			host->cur_dir = DMA_DIR_TO_CARD;
+
+		if (host->rw_mode == SEQ_RW) {
+			cancel_delayed_work(&host->rw_idle_work);
+			if (!sd_check_multi_seq(host, mrq)) {
+				sd_stop_rw_multi_seq(host, mrq);
+				host->rw_mode = NORMAL_RW;
+			}
+		}
+
+		if (host->rw_mode == SEQ_RW)
+			cmd->error = sd_rw_multi_seq(host, mrq);
+		else {
+			if (mmc_op_multi(cmd->opcode))
+				host->rw_mode = SEQ_RW;
+			cmd->error = sd_rw_multi(host, mrq);
+			if (!host->using_cookie)
+				sdmmc_post_req(host->mmc, host->mrq, 0);
+		}
+
+		if (cmd->error)
+			host->rw_mode = NORMAL_RW;
+
+		if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
+			mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150));
 
-		if (mmc_op_multi(cmd->opcode) && mrq->stop)
-			sd_send_cmd_get_rsp(host, mrq->stop);
 	} else {
 		sd_normal_rw(host, mrq);
 	}
@@ -867,6 +1034,11 @@  static void sd_request(struct work_struct *work)
 	}
 
 	mutex_lock(&host->host_mutex);
+	if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
+		host->prev_dir = host->cur_dir;
+		host->prev_sec_addr = data->blk_addr;
+		host->prev_sec_cnt = data->blocks;
+	}
 	host->mrq = NULL;
 	mutex_unlock(&host->host_mutex);
 
@@ -1457,6 +1629,7 @@  static void rtsx_pci_sdmmc_card_event(struct platform_device *pdev)
 	struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
 
 	host->cookie = -1;
+	host->rw_mode = NORMAL_RW;
 	mmc_detect_change(host->mmc, 0);
 }
 
@@ -1487,6 +1660,7 @@  static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev)
 	host->cookie = -1;
 	host->power_state = SDMMC_POWER_OFF;
 	INIT_WORK(&host->work, sd_request);
+	INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
 	platform_set_drvdata(pdev, host);
 	pcr->slots[RTSX_SD_CARD].p_dev = pdev;
 	pcr->slots[RTSX_SD_CARD].card_event = rtsx_pci_sdmmc_card_event;
@@ -1526,6 +1700,7 @@  static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
 		pm_runtime_disable(&pdev->dev);
 	}
 
+	cancel_delayed_work_sync(&host->rw_idle_work);
 	cancel_work_sync(&host->work);
 
 	mutex_lock(&host->host_mutex);
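
Taken together, the hunks above change the CMD12/CMD13 timing as follows
(a simplified restatement of the patch, not additional code to apply):
while a sequential stream is in progress the driver answers CMD13 locally,
skips CMD12 after each CMD18/CMD25, and re-arms a 150 ms delayed work; the
deferred MMC_STOP_TRANSMISSION is only issued when a non-contiguous or
opposite-direction request arrives (sd_stop_rw_multi_seq()) or when the
idle timer fires (sd_rw_idle_work()). Condensed view of the new
sd_request() path for CMD18/CMD25, with error handling and
sdmmc_post_req() omitted for brevity:

	if (host->rw_mode == SEQ_RW) {
		cancel_delayed_work(&host->rw_idle_work);
		if (!sd_check_multi_seq(host, mrq)) {
			/* stream broken: send the deferred CMD12 now */
			sd_stop_rw_multi_seq(host, mrq);
			host->rw_mode = NORMAL_RW;
		}
	}

	if (host->rw_mode == SEQ_RW) {
		/* continue the stream: no CMD12, CMD13 answered locally */
		cmd->error = sd_rw_multi_seq(host, mrq);
	} else {
		if (mmc_op_multi(cmd->opcode))
			host->rw_mode = SEQ_RW;	/* start a new stream */
		cmd->error = sd_rw_multi(host, mrq);
	}

	/* if the stream stays idle for 150 ms, sd_rw_idle_work() sends CMD12 */
	if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
		mod_delayed_work(system_wq, &host->rw_idle_work,
				 msecs_to_jiffies(150));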