[PATCH v3] mmc: rtsx: improve performance for multi block rw

Ricky WU posted 1 patch 4 years, 6 months ago
drivers/mmc/host/rtsx_pci_sdmmc.c | 185 +++++++++++++++++++++++++++++-
1 file changed, 180 insertions(+), 5 deletions(-)
[PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ricky WU 4 years, 6 months ago
Improve performance when the command is a multi-block read/write
and the data is sequential.
sd_check_multi_seq() distinguishes multi-block RW (CMD 18/25)
from normal RW (CMD 17/24). If the command is multi-block and the data is
sequential, sd_rw_multi_seq() is called.

This patch mainly controls the timing of the handling of CMD 12/13.
Originally, the driver handled CMD 12/13 on every RW (CMD 18/25).
The new code checks whether a multi-block RW (CMD 18/25) is
sequential or not. If the data is sequential, the driver does not send CMD 12
and bypasses CMD 13 until an RW command in a different direction arrives
or the delayed work is triggered to send CMD 12.

Benchmark results are as below:
SD Card : Samsung Pro Plus 128GB
Number of Samples:100, Sample Size:10MB
<Before> Read : 86.9 MB/s, Write : 38.3 MB/s
<After>  Read : 91.5 MB/s, Write : 55.5 MB/s

Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
---
v2:
make the commit message clearer
rename functions for clarity
v3:
expand the commit message and add benchmark results
---
 drivers/mmc/host/rtsx_pci_sdmmc.c | 185 +++++++++++++++++++++++++++++-
 1 file changed, 180 insertions(+), 5 deletions(-)

diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c
index 58cfaffa3c2d..ee2b0eec6422 100644
--- a/drivers/mmc/host/rtsx_pci_sdmmc.c
+++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
@@ -22,6 +22,8 @@
 #include <asm/unaligned.h>
 #include <linux/pm_runtime.h>
 
+enum RW_MODE	{NORMAL_RW, SEQ_RW};
+
 struct realtek_pci_sdmmc {
 	struct platform_device	*pdev;
 	struct rtsx_pcr		*pcr;
@@ -31,6 +33,7 @@ struct realtek_pci_sdmmc {
 
 	struct work_struct	work;
 	struct mutex		host_mutex;
+	struct delayed_work		rw_idle_work;
 
 	u8			ssc_depth;
 	unsigned int		clock;
@@ -46,6 +49,12 @@ struct realtek_pci_sdmmc {
 	s32			cookie;
 	int			cookie_sg_count;
 	bool			using_cookie;
+
+	enum RW_MODE		rw_mode;
+	u8		prev_dir;
+	u8		cur_dir;
+	u64		prev_sec_addr;
+	u32		prev_sec_cnt;
 };
 
 static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios);
@@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host,
 	dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg = 0x%08x\n",
 			__func__, cmd_idx, arg);
 
+	if (cmd_idx == MMC_SEND_STATUS && host->rw_mode == SEQ_RW) {
+		cmd->resp[0] = R1_READY_FOR_DATA | (R1_STATE_TRAN << 9);
+		goto out;
+	}
+
+	if (!mmc_op_multi(cmd->opcode))
+		host->rw_mode = NORMAL_RW;
+
 	rsp_type = sd_response_type(cmd);
 	if (rsp_type < 0)
 		goto out;
@@ -542,6 +559,93 @@ static int sd_write_long_data(struct realtek_pci_sdmmc *host,
 	return 0;
 }
 
+static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_host *mmc = host->mmc;
+	struct mmc_card *card = mmc->card;
+	struct mmc_data *data = mrq->data;
+	int uhs = mmc_card_uhs(card);
+	u8 cfg2;
+	int err;
+	size_t data_len = data->blksz * data->blocks;
+
+	cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
+		SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 | SD_RSP_LEN_0;
+
+	if (!uhs)
+		cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
+
+	rtsx_pci_init_cmd(pcr);
+	sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
+			DMA_DONE_INT, DMA_DONE_INT);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
+		0xFF, (u8)(data_len >> 24));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
+		0xFF, (u8)(data_len >> 16));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
+		0xFF, (u8)(data_len >> 8));
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF, (u8)data_len);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+			0x03 | DMA_PACK_SIZE_MASK,
+			DMA_DIR_FROM_CARD | DMA_EN | DMA_512);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
+			0x03 | DMA_PACK_SIZE_MASK,
+			DMA_DIR_TO_CARD | DMA_EN | DMA_512);
+
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
+			0x01, RING_BUFFER);
+	rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_READ_3);
+	else
+		rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
+				SD_TRANSFER_START | SD_TM_AUTO_WRITE_3);
+
+	rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
+			SD_TRANSFER_END, SD_TRANSFER_END);
+	rtsx_pci_send_cmd_no_wait(pcr);
+
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 1, 10000);
+	else
+		err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 0, 10000);
+
+	if (err < 0) {
+		sd_clear_error(host);
+		return err;
+	}
+
+	return 0;
+}
+
+static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct rtsx_pcr *pcr = host->pcr;
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+	sd_send_cmd_get_rsp(host, cmd);
+	udelay(50);
+	rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
+	kfree(cmd);
+	return 0;
+}
+
 static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host)
 {
 	rtsx_pci_write_register(host->pcr, SD_CFG1,
@@ -796,6 +900,45 @@ static inline int sd_rw_cmd(struct mmc_command *cmd)
 		(cmd->opcode == MMC_WRITE_BLOCK);
 }
 
+static void sd_rw_idle_work(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct realtek_pci_sdmmc *host = container_of(dwork,
+			struct realtek_pci_sdmmc, rw_idle_work);
+	struct mmc_command *cmd;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+
+	cmd->opcode = MMC_STOP_TRANSMISSION;
+	cmd->arg = 0;
+	cmd->busy_timeout = 0;
+	if (host->cur_dir == DMA_DIR_FROM_CARD)
+		cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
+	else
+		cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
+
+	sd_send_cmd_get_rsp(host, cmd);
+	host->rw_mode = NORMAL_RW;
+	kfree(cmd);
+}
+
+static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
+{
+	struct mmc_command *cmd = mrq->cmd;
+	struct mmc_data *data = mrq->data;
+
+	if (!mmc_op_multi(cmd->opcode))
+		return 0;
+
+	if (host->prev_dir != host->cur_dir)
+		return 0;
+
+	if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
+		return 0;
+
+	return 1;
+}
+
 static void sd_request(struct work_struct *work)
 {
 	struct realtek_pci_sdmmc *host = container_of(work,
@@ -841,12 +984,36 @@ static void sd_request(struct work_struct *work)
 	if (!data_size) {
 		sd_send_cmd_get_rsp(host, cmd);
 	} else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
-		cmd->error = sd_rw_multi(host, mrq);
-		if (!host->using_cookie)
-			sdmmc_post_req(host->mmc, host->mrq, 0);
+		/* Check multi-block and seq function*/
+		if (data->flags & MMC_DATA_READ)
+			host->cur_dir = DMA_DIR_FROM_CARD;
+		else
+			host->cur_dir = DMA_DIR_TO_CARD;
+
+		if (host->rw_mode == SEQ_RW) {
+			cancel_delayed_work(&host->rw_idle_work);
+			if (!sd_check_multi_seq(host, mrq)) {
+				sd_stop_rw_multi_seq(host, mrq);
+				host->rw_mode = NORMAL_RW;
+			}
+		}
+
+		if (host->rw_mode == SEQ_RW)
+			cmd->error = sd_rw_multi_seq(host, mrq);
+		else {
+			if (mmc_op_multi(cmd->opcode))
+				host->rw_mode = SEQ_RW;
+			cmd->error = sd_rw_multi(host, mrq);
+			if (!host->using_cookie)
+				sdmmc_post_req(host->mmc, host->mrq, 0);
+		}
+
+		if (cmd->error)
+			host->rw_mode = NORMAL_RW;
+
+		if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
+			mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150));
 
-		if (mmc_op_multi(cmd->opcode) && mrq->stop)
-			sd_send_cmd_get_rsp(host, mrq->stop);
 	} else {
 		sd_normal_rw(host, mrq);
 	}
@@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work)
 	}
 
 	mutex_lock(&host->host_mutex);
+	if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
+		host->prev_dir = host->cur_dir;
+		host->prev_sec_addr = data->blk_addr;
+		host->prev_sec_cnt = data->blocks;
+	}
 	host->mrq = NULL;
 	mutex_unlock(&host->host_mutex);
 
@@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct platform_device *pdev)
 	struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
 
 	host->cookie = -1;
+	host->rw_mode = NORMAL_RW;
 	mmc_detect_change(host->mmc, 0);
 }
 
@@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev)
 	host->cookie = -1;
 	host->power_state = SDMMC_POWER_OFF;
 	INIT_WORK(&host->work, sd_request);
+	INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
 	platform_set_drvdata(pdev, host);
 	pcr->slots[RTSX_SD_CARD].p_dev = pdev;
 	pcr->slots[RTSX_SD_CARD].card_event = rtsx_pci_sdmmc_card_event;
@@ -1526,6 +1700,7 @@ static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
 		pm_runtime_disable(&pdev->dev);
 	}
 
+	cancel_delayed_work_sync(&host->rw_idle_work);
 	cancel_work_sync(&host->work);
 
 	mutex_lock(&host->host_mutex);
-- 
2.25.1
Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ulf Hansson 4 years, 6 months ago
On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
>
> Improving performance for the CMD is multi-block read/write
> and the data is sequential.
> sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25)
> or normal RW (CMD 17/24) if the CMD is multi-block and the data is
> sequential then call to sd_rw_multi_seq()
>
> This patch mainly to control the timing of reply at CMD 12/13.
> Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> The new code to distinguish multi-block RW(CMD 18/25) and the data is
> sequential or not, if the data is sequential RW driver do not send CMD 12
> and bypass CMD 13 until wait the different direction RW CMD
> or trigger the delay_work to sent CMD 12.
>
> run benchmark result as below:
> SD Card : Samsumg Pro Plus 128GB
> Number of Samples:100, Sample Size:10MB
> <Before> Read : 86.9 MB/s, Write : 38.3 MB/s
> <After>  Read : 91.5 MB/s, Write : 55.5 MB/s

A much nicer commit message, thanks a lot! Would you mind running some
additional tests, like random I/O read/writes?

Also, please specify the benchmark tool and command you are using. In
the meantime, I will continue to look at the code.

Kind regards
Uffe

>
> Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
> ---
> v2:
> make commit message more clarity
> change function name for more clarity
> v3:
> add more commit message and benchmark result
> ---
>  drivers/mmc/host/rtsx_pci_sdmmc.c | 185 +++++++++++++++++++++++++++++-
>  1 file changed, 180 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c
> index 58cfaffa3c2d..ee2b0eec6422 100644
> --- a/drivers/mmc/host/rtsx_pci_sdmmc.c
> +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
> @@ -22,6 +22,8 @@
>  #include <asm/unaligned.h>
>  #include <linux/pm_runtime.h>
>
> +enum RW_MODE   {NORMAL_RW, SEQ_RW};
> +
>  struct realtek_pci_sdmmc {
>         struct platform_device  *pdev;
>         struct rtsx_pcr         *pcr;
> @@ -31,6 +33,7 @@ struct realtek_pci_sdmmc {
>
>         struct work_struct      work;
>         struct mutex            host_mutex;
> +       struct delayed_work             rw_idle_work;
>
>         u8                      ssc_depth;
>         unsigned int            clock;
> @@ -46,6 +49,12 @@ struct realtek_pci_sdmmc {
>         s32                     cookie;
>         int                     cookie_sg_count;
>         bool                    using_cookie;
> +
> +       enum RW_MODE            rw_mode;
> +       u8              prev_dir;
> +       u8              cur_dir;
> +       u64             prev_sec_addr;
> +       u32             prev_sec_cnt;
>  };
>
>  static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios *ios);
> @@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct realtek_pci_sdmmc *host,
>         dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg = 0x%08x\n",
>                         __func__, cmd_idx, arg);
>
> +       if (cmd_idx == MMC_SEND_STATUS && host->rw_mode == SEQ_RW) {
> +               cmd->resp[0] = R1_READY_FOR_DATA | (R1_STATE_TRAN << 9);
> +               goto out;
> +       }
> +
> +       if (!mmc_op_multi(cmd->opcode))
> +               host->rw_mode = NORMAL_RW;
> +
>         rsp_type = sd_response_type(cmd);
>         if (rsp_type < 0)
>                 goto out;
> @@ -542,6 +559,93 @@ static int sd_write_long_data(struct realtek_pci_sdmmc *host,
>         return 0;
>  }
>
> +static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
> +{
> +       struct rtsx_pcr *pcr = host->pcr;
> +       struct mmc_host *mmc = host->mmc;
> +       struct mmc_card *card = mmc->card;
> +       struct mmc_data *data = mrq->data;
> +       int uhs = mmc_card_uhs(card);
> +       u8 cfg2;
> +       int err;
> +       size_t data_len = data->blksz * data->blocks;
> +
> +       cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
> +               SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 | SD_RSP_LEN_0;
> +
> +       if (!uhs)
> +               cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
> +
> +       rtsx_pci_init_cmd(pcr);
> +       sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
> +                       DMA_DONE_INT, DMA_DONE_INT);
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
> +               0xFF, (u8)(data_len >> 24));
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
> +               0xFF, (u8)(data_len >> 16));
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
> +               0xFF, (u8)(data_len >> 8));
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF, (u8)data_len);
> +
> +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> +                       0x03 | DMA_PACK_SIZE_MASK,
> +                       DMA_DIR_FROM_CARD | DMA_EN | DMA_512);
> +       else
> +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> +                       0x03 | DMA_PACK_SIZE_MASK,
> +                       DMA_DIR_TO_CARD | DMA_EN | DMA_512);
> +
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
> +                       0x01, RING_BUFFER);
> +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
> +
> +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
> +                               SD_TRANSFER_START | SD_TM_AUTO_READ_3);
> +       else
> +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER, 0xFF,
> +                               SD_TRANSFER_START | SD_TM_AUTO_WRITE_3);
> +
> +       rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
> +                       SD_TRANSFER_END, SD_TRANSFER_END);
> +       rtsx_pci_send_cmd_no_wait(pcr);
> +
> +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> +               err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 1, 10000);
> +       else
> +               err = rtsx_pci_dma_transfer(pcr, data->sg, host->sg_count, 0, 10000);
> +
> +       if (err < 0) {
> +               sd_clear_error(host);
> +               return err;
> +       }
> +
> +       return 0;
> +}
> +
> +static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
> +{
> +       struct rtsx_pcr *pcr = host->pcr;
> +       struct mmc_command *cmd;
> +
> +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> +
> +       cmd->opcode = MMC_STOP_TRANSMISSION;
> +       cmd->arg = 0;
> +       cmd->busy_timeout = 0;
> +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
> +       else
> +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
> +       sd_send_cmd_get_rsp(host, cmd);
> +       udelay(50);
> +       rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
> +       kfree(cmd);
> +       return 0;
> +}
> +
>  static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc *host)
>  {
>         rtsx_pci_write_register(host->pcr, SD_CFG1,
> @@ -796,6 +900,45 @@ static inline int sd_rw_cmd(struct mmc_command *cmd)
>                 (cmd->opcode == MMC_WRITE_BLOCK);
>  }
>
> +static void sd_rw_idle_work(struct work_struct *work)
> +{
> +       struct delayed_work *dwork = to_delayed_work(work);
> +       struct realtek_pci_sdmmc *host = container_of(dwork,
> +                       struct realtek_pci_sdmmc, rw_idle_work);
> +       struct mmc_command *cmd;
> +
> +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> +
> +       cmd->opcode = MMC_STOP_TRANSMISSION;
> +       cmd->arg = 0;
> +       cmd->busy_timeout = 0;
> +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_AC;
> +       else
> +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
> +
> +       sd_send_cmd_get_rsp(host, cmd);
> +       host->rw_mode = NORMAL_RW;
> +       kfree(cmd);
> +}
> +
> +static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct mmc_request *mrq)
> +{
> +       struct mmc_command *cmd = mrq->cmd;
> +       struct mmc_data *data = mrq->data;
> +
> +       if (!mmc_op_multi(cmd->opcode))
> +               return 0;
> +
> +       if (host->prev_dir != host->cur_dir)
> +               return 0;
> +
> +       if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
> +               return 0;
> +
> +       return 1;
> +}
> +
>  static void sd_request(struct work_struct *work)
>  {
>         struct realtek_pci_sdmmc *host = container_of(work,
> @@ -841,12 +984,36 @@ static void sd_request(struct work_struct *work)
>         if (!data_size) {
>                 sd_send_cmd_get_rsp(host, cmd);
>         } else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> -               cmd->error = sd_rw_multi(host, mrq);
> -               if (!host->using_cookie)
> -                       sdmmc_post_req(host->mmc, host->mrq, 0);
> +               /* Check multi-block and seq function*/
> +               if (data->flags & MMC_DATA_READ)
> +                       host->cur_dir = DMA_DIR_FROM_CARD;
> +               else
> +                       host->cur_dir = DMA_DIR_TO_CARD;
> +
> +               if (host->rw_mode == SEQ_RW) {
> +                       cancel_delayed_work(&host->rw_idle_work);
> +                       if (!sd_check_multi_seq(host, mrq)) {
> +                               sd_stop_rw_multi_seq(host, mrq);
> +                               host->rw_mode = NORMAL_RW;
> +                       }
> +               }
> +
> +               if (host->rw_mode == SEQ_RW)
> +                       cmd->error = sd_rw_multi_seq(host, mrq);
> +               else {
> +                       if (mmc_op_multi(cmd->opcode))
> +                               host->rw_mode = SEQ_RW;
> +                       cmd->error = sd_rw_multi(host, mrq);
> +                       if (!host->using_cookie)
> +                               sdmmc_post_req(host->mmc, host->mrq, 0);
> +               }
> +
> +               if (cmd->error)
> +                       host->rw_mode = NORMAL_RW;
> +
> +               if (mmc_op_multi(cmd->opcode) && host->rw_mode == SEQ_RW)
> +                       mod_delayed_work(system_wq, &host->rw_idle_work, msecs_to_jiffies(150));
>
> -               if (mmc_op_multi(cmd->opcode) && mrq->stop)
> -                       sd_send_cmd_get_rsp(host, mrq->stop);
>         } else {
>                 sd_normal_rw(host, mrq);
>         }
> @@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work)
>         }
>
>         mutex_lock(&host->host_mutex);
> +       if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> +               host->prev_dir = host->cur_dir;
> +               host->prev_sec_addr = data->blk_addr;
> +               host->prev_sec_cnt = data->blocks;
> +       }
>         host->mrq = NULL;
>         mutex_unlock(&host->host_mutex);
>
> @@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct platform_device *pdev)
>         struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
>
>         host->cookie = -1;
> +       host->rw_mode = NORMAL_RW;
>         mmc_detect_change(host->mmc, 0);
>  }
>
> @@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct platform_device *pdev)
>         host->cookie = -1;
>         host->power_state = SDMMC_POWER_OFF;
>         INIT_WORK(&host->work, sd_request);
> +       INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
>         platform_set_drvdata(pdev, host);
>         pcr->slots[RTSX_SD_CARD].p_dev = pdev;
>         pcr->slots[RTSX_SD_CARD].card_event = rtsx_pci_sdmmc_card_event;
> @@ -1526,6 +1700,7 @@ static int rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
>                 pm_runtime_disable(&pdev->dev);
>         }
>
> +       cancel_delayed_work_sync(&host->rw_idle_work);
>         cancel_work_sync(&host->work);
>
>         mutex_lock(&host->host_mutex);
> --
> 2.25.1
RE: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ricky WU 4 years, 6 months ago
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 21, 2021 8:51 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > Improving performance for the CMD is multi-block read/write and the
> > data is sequential.
> > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > sequential then call to sd_rw_multi_seq()
> >
> > This patch mainly to control the timing of reply at CMD 12/13.
> > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > The new code to distinguish multi-block RW(CMD 18/25) and the data is
> > sequential or not, if the data is sequential RW driver do not send CMD
> > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > trigger the delay_work to sent CMD 12.
> >
> > run benchmark result as below:
> > SD Card : Samsumg Pro Plus 128GB
> > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> 
> A much nicer commit message, thanks a lot! Would you mind running some
> additional tests, like random I/O read/writes?
> 
> Also, please specify the benchmark tool and command you are using. In the
> meantime, I will continue to look at the code.
> 

The tool is just the GUI benchmark built into Ubuntu's "Disks" utility,
and it does not offer a random I/O option...

Do you have any suggestions for testing random I/O?
We think random I/O performance will not change much, though.

BR,
Ricky

> Kind regards
> Uffe
> 
> >
> > Signed-off-by: Ricky Wu <ricky_wu@realtek.com>
> > ---
> > v2:
> > make commit message more clarity
> > change function name for more clarity
> > v3:
> > add more commit message and benchmark result
> > ---
> >  drivers/mmc/host/rtsx_pci_sdmmc.c | 185
> > +++++++++++++++++++++++++++++-
> >  1 file changed, 180 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c
> > b/drivers/mmc/host/rtsx_pci_sdmmc.c
> > index 58cfaffa3c2d..ee2b0eec6422 100644
> > --- a/drivers/mmc/host/rtsx_pci_sdmmc.c
> > +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c
> > @@ -22,6 +22,8 @@
> >  #include <asm/unaligned.h>
> >  #include <linux/pm_runtime.h>
> >
> > +enum RW_MODE   {NORMAL_RW, SEQ_RW};
> > +
> >  struct realtek_pci_sdmmc {
> >         struct platform_device  *pdev;
> >         struct rtsx_pcr         *pcr;
> > @@ -31,6 +33,7 @@ struct realtek_pci_sdmmc {
> >
> >         struct work_struct      work;
> >         struct mutex            host_mutex;
> > +       struct delayed_work             rw_idle_work;
> >
> >         u8                      ssc_depth;
> >         unsigned int            clock;
> > @@ -46,6 +49,12 @@ struct realtek_pci_sdmmc {
> >         s32                     cookie;
> >         int                     cookie_sg_count;
> >         bool                    using_cookie;
> > +
> > +       enum RW_MODE            rw_mode;
> > +       u8              prev_dir;
> > +       u8              cur_dir;
> > +       u64             prev_sec_addr;
> > +       u32             prev_sec_cnt;
> >  };
> >
> >  static int sdmmc_init_sd_express(struct mmc_host *mmc, struct mmc_ios
> > *ios); @@ -226,6 +235,14 @@ static void sd_send_cmd_get_rsp(struct
> realtek_pci_sdmmc *host,
> >         dev_dbg(sdmmc_dev(host), "%s: SD/MMC CMD %d, arg =
> 0x%08x\n",
> >                         __func__, cmd_idx, arg);
> >
> > +       if (cmd_idx == MMC_SEND_STATUS && host->rw_mode ==
> SEQ_RW) {
> > +               cmd->resp[0] = R1_READY_FOR_DATA |
> (R1_STATE_TRAN << 9);
> > +               goto out;
> > +       }
> > +
> > +       if (!mmc_op_multi(cmd->opcode))
> > +               host->rw_mode = NORMAL_RW;
> > +
> >         rsp_type = sd_response_type(cmd);
> >         if (rsp_type < 0)
> >                 goto out;
> > @@ -542,6 +559,93 @@ static int sd_write_long_data(struct
> realtek_pci_sdmmc *host,
> >         return 0;
> >  }
> >
> > +static int sd_rw_multi_seq(struct realtek_pci_sdmmc *host, struct
> > +mmc_request *mrq) {
> > +       struct rtsx_pcr *pcr = host->pcr;
> > +       struct mmc_host *mmc = host->mmc;
> > +       struct mmc_card *card = mmc->card;
> > +       struct mmc_data *data = mrq->data;
> > +       int uhs = mmc_card_uhs(card);
> > +       u8 cfg2;
> > +       int err;
> > +       size_t data_len = data->blksz * data->blocks;
> > +
> > +       cfg2 = SD_NO_CALCULATE_CRC7 | SD_CHECK_CRC16 |
> > +               SD_NO_WAIT_BUSY_END | SD_NO_CHECK_CRC7 |
> SD_RSP_LEN_0;
> > +
> > +       if (!uhs)
> > +               cfg2 |= SD_NO_CHECK_WAIT_CRC_TO;
> > +
> > +       rtsx_pci_init_cmd(pcr);
> > +       sd_cmd_set_data_len(pcr, data->blocks, data->blksz);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, IRQSTAT0,
> > +                       DMA_DONE_INT, DMA_DONE_INT);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC3,
> > +               0xFF, (u8)(data_len >> 24));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC2,
> > +               0xFF, (u8)(data_len >> 16));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC1,
> > +               0xFF, (u8)(data_len >> 8));
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMATC0, 0xFF,
> > + (u8)data_len);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> > +                       0x03 | DMA_PACK_SIZE_MASK,
> > +                       DMA_DIR_FROM_CARD | DMA_EN |
> DMA_512);
> > +       else
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, DMACTL,
> > +                       0x03 | DMA_PACK_SIZE_MASK,
> > +                       DMA_DIR_TO_CARD | DMA_EN | DMA_512);
> > +
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, CARD_DATA_SOURCE,
> > +                       0x01, RING_BUFFER);
> > +       rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_CFG2, 0xFF, cfg2);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER,
> 0xFF,
> > +                               SD_TRANSFER_START |
> SD_TM_AUTO_READ_3);
> > +       else
> > +               rtsx_pci_add_cmd(pcr, WRITE_REG_CMD, SD_TRANSFER,
> 0xFF,
> > +                               SD_TRANSFER_START |
> > + SD_TM_AUTO_WRITE_3);
> > +
> > +       rtsx_pci_add_cmd(pcr, CHECK_REG_CMD, SD_TRANSFER,
> > +                       SD_TRANSFER_END, SD_TRANSFER_END);
> > +       rtsx_pci_send_cmd_no_wait(pcr);
> > +
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               err = rtsx_pci_dma_transfer(pcr, data->sg,
> host->sg_count, 1, 10000);
> > +       else
> > +               err = rtsx_pci_dma_transfer(pcr, data->sg,
> > + host->sg_count, 0, 10000);
> > +
> > +       if (err < 0) {
> > +               sd_clear_error(host);
> > +               return err;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> > +static int sd_stop_rw_multi_seq(struct realtek_pci_sdmmc *host,
> > +struct mmc_request *mrq) {
> > +       struct rtsx_pcr *pcr = host->pcr;
> > +       struct mmc_command *cmd;
> > +
> > +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> > +
> > +       cmd->opcode = MMC_STOP_TRANSMISSION;
> > +       cmd->arg = 0;
> > +       cmd->busy_timeout = 0;
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 |
> MMC_CMD_AC;
> > +       else
> > +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B |
> MMC_CMD_AC;
> > +       sd_send_cmd_get_rsp(host, cmd);
> > +       udelay(50);
> > +       rtsx_pci_write_register(pcr, RBCTL, RB_FLUSH, RB_FLUSH);
> > +       kfree(cmd);
> > +       return 0;
> > +}
> > +
> >  static inline void sd_enable_initial_mode(struct realtek_pci_sdmmc
> > *host)  {
> >         rtsx_pci_write_register(host->pcr, SD_CFG1, @@ -796,6 +900,45
> > @@ static inline int sd_rw_cmd(struct mmc_command *cmd)
> >                 (cmd->opcode == MMC_WRITE_BLOCK);  }
> >
> > +static void sd_rw_idle_work(struct work_struct *work) {
> > +       struct delayed_work *dwork = to_delayed_work(work);
> > +       struct realtek_pci_sdmmc *host = container_of(dwork,
> > +                       struct realtek_pci_sdmmc, rw_idle_work);
> > +       struct mmc_command *cmd;
> > +
> > +       cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
> > +
> > +       cmd->opcode = MMC_STOP_TRANSMISSION;
> > +       cmd->arg = 0;
> > +       cmd->busy_timeout = 0;
> > +       if (host->cur_dir == DMA_DIR_FROM_CARD)
> > +               cmd->flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 |
> MMC_CMD_AC;
> > +       else
> > +               cmd->flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B |
> > + MMC_CMD_AC;
> > +
> > +       sd_send_cmd_get_rsp(host, cmd);
> > +       host->rw_mode = NORMAL_RW;
> > +       kfree(cmd);
> > +}
> > +
> > +static int sd_check_multi_seq(struct realtek_pci_sdmmc *host, struct
> > +mmc_request *mrq) {
> > +       struct mmc_command *cmd = mrq->cmd;
> > +       struct mmc_data *data = mrq->data;
> > +
> > +       if (!mmc_op_multi(cmd->opcode))
> > +               return 0;
> > +
> > +       if (host->prev_dir != host->cur_dir)
> > +               return 0;
> > +
> > +       if ((host->prev_sec_addr + host->prev_sec_cnt) != data->blk_addr)
> > +               return 0;
> > +
> > +       return 1;
> > +}
> > +
> >  static void sd_request(struct work_struct *work)  {
> >         struct realtek_pci_sdmmc *host = container_of(work, @@ -841,12
> > +984,36 @@ static void sd_request(struct work_struct *work)
> >         if (!data_size) {
> >                 sd_send_cmd_get_rsp(host, cmd);
> >         } else if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> > -               cmd->error = sd_rw_multi(host, mrq);
> > -               if (!host->using_cookie)
> > -                       sdmmc_post_req(host->mmc, host->mrq, 0);
> > +               /* Check multi-block and seq function*/
> > +               if (data->flags & MMC_DATA_READ)
> > +                       host->cur_dir = DMA_DIR_FROM_CARD;
> > +               else
> > +                       host->cur_dir = DMA_DIR_TO_CARD;
> > +
> > +               if (host->rw_mode == SEQ_RW) {
> > +                       cancel_delayed_work(&host->rw_idle_work);
> > +                       if (!sd_check_multi_seq(host, mrq)) {
> > +                               sd_stop_rw_multi_seq(host, mrq);
> > +                               host->rw_mode = NORMAL_RW;
> > +                       }
> > +               }
> > +
> > +               if (host->rw_mode == SEQ_RW)
> > +                       cmd->error = sd_rw_multi_seq(host, mrq);
> > +               else {
> > +                       if (mmc_op_multi(cmd->opcode))
> > +                               host->rw_mode = SEQ_RW;
> > +                       cmd->error = sd_rw_multi(host, mrq);
> > +                       if (!host->using_cookie)
> > +                               sdmmc_post_req(host->mmc,
> host->mrq, 0);
> > +               }
> > +
> > +               if (cmd->error)
> > +                       host->rw_mode = NORMAL_RW;
> > +
> > +               if (mmc_op_multi(cmd->opcode) && host->rw_mode ==
> SEQ_RW)
> > +                       mod_delayed_work(system_wq,
> > + &host->rw_idle_work, msecs_to_jiffies(150));
> >
> > -               if (mmc_op_multi(cmd->opcode) && mrq->stop)
> > -                       sd_send_cmd_get_rsp(host, mrq->stop);
> >         } else {
> >                 sd_normal_rw(host, mrq);
> >         }
> > @@ -867,6 +1034,11 @@ static void sd_request(struct work_struct *work)
> >         }
> >
> >         mutex_lock(&host->host_mutex);
> > +       if (sd_rw_cmd(cmd) || sdio_extblock_cmd(cmd, data)) {
> > +               host->prev_dir = host->cur_dir;
> > +               host->prev_sec_addr = data->blk_addr;
> > +               host->prev_sec_cnt = data->blocks;
> > +       }
> >         host->mrq = NULL;
> >         mutex_unlock(&host->host_mutex);
> >
> > @@ -1457,6 +1629,7 @@ static void rtsx_pci_sdmmc_card_event(struct
> platform_device *pdev)
> >         struct realtek_pci_sdmmc *host = platform_get_drvdata(pdev);
> >
> >         host->cookie = -1;
> > +       host->rw_mode = NORMAL_RW;
> >         mmc_detect_change(host->mmc, 0);  }
> >
> > @@ -1487,6 +1660,7 @@ static int rtsx_pci_sdmmc_drv_probe(struct
> platform_device *pdev)
> >         host->cookie = -1;
> >         host->power_state = SDMMC_POWER_OFF;
> >         INIT_WORK(&host->work, sd_request);
> > +       INIT_DELAYED_WORK(&host->rw_idle_work, sd_rw_idle_work);
> >         platform_set_drvdata(pdev, host);
> >         pcr->slots[RTSX_SD_CARD].p_dev = pdev;
> >         pcr->slots[RTSX_SD_CARD].card_event =
> > rtsx_pci_sdmmc_card_event; @@ -1526,6 +1700,7 @@ static int
> rtsx_pci_sdmmc_drv_remove(struct platform_device *pdev)
> >                 pm_runtime_disable(&pdev->dev);
> >         }
> >
> > +       cancel_delayed_work_sync(&host->rw_idle_work);
> >         cancel_work_sync(&host->work);
> >
> >         mutex_lock(&host->host_mutex);
> > --
> > 2.25.1
> ------Please consider the environment before printing this e-mail.
Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ulf Hansson 4 years, 6 months ago
On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Tuesday, December 21, 2021 8:51 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > > Improving performance for the CMD is multi-block read/write and the
> > > data is sequential.
> > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > sequential then call to sd_rw_multi_seq()
> > >
> > > This patch mainly to control the timing of reply at CMD 12/13.
> > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > The new code to distinguish multi-block RW(CMD 18/25) and the data is
> > > sequential or not, if the data is sequential RW driver do not send CMD
> > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > trigger the delay_work to sent CMD 12.
> > >
> > > run benchmark result as below:
> > > SD Card : Samsumg Pro Plus 128GB
> > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> >
> > A much nicer commit message, thanks a lot! Would you mind running some
> > additional tests, like random I/O read/writes?
> >
> > Also, please specify the benchmark tool and command you are using. In the
> > meantime, I will continue to look at the code.
> >
>
> The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> and the Tool don't have random I/O to choice...
>
> Do you have any suggestion for testing random I/O
> But we think random I/O will not change much

I would probably look into using fio, https://fio.readthedocs.io/en/latest/

Another option that I use frequently is iozone, https://www.iozone.org.
Here's a command line that I often use for iozone
./iozone -az -i0 -i1 -s 20m -y 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e

[...]

Kind regards
Uffe
RE: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ricky WU 4 years, 6 months ago
> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Thursday, December 23, 2021 6:37 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > block rw
> > >
> > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > Improving performance for the CMD is multi-block read/write and
> > > > the data is sequential.
> > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > > sequential then call to sd_rw_multi_seq()
> > > >
> > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > The new code to distinguish multi-block RW(CMD 18/25) and the data
> > > > is sequential or not, if the data is sequential RW driver do not
> > > > send CMD
> > > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > > trigger the delay_work to sent CMD 12.
> > > >
> > > > run benchmark result as below:
> > > > SD Card : Samsumg Pro Plus 128GB
> > > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> > >
> > > A much nicer commit message, thanks a lot! Would you mind running
> > > some additional tests, like random I/O read/writes?
> > >
> > > Also, please specify the benchmark tool and command you are using.
> > > In the meantime, I will continue to look at the code.
> > >
> >
> > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > and the Tool don't have random I/O to choice...
> >
> > Do you have any suggestion for testing random I/O But we think random
> > I/O will not change much
> 
> I would probably look into using fio, https://fio.readthedocs.io/en/latest/
> 

Filled in the random I/O data below.
Before the patch:
CMD (Randread):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
  read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
    clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
     lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
    clat percentiles (usec):
     |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600], 20.00th=[11600],
     | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731], 60.00th=[11731],
     | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863], 95.00th=[11863],
     | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664], 99.95th=[34341],
     | 99.99th=[34341]
   bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67, stdev=1467.81, samples=24
   iops        : min=   80, max=   86, avg=85.00, stdev= 1.41, samples=24
  lat (msec)   : 20=99.90%, 50=0.10%
  cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec

Disk stats (read/write):
  mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612, util=99.23%

CMD (Randwrite):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite

mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
  write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone resets
    clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
     lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
    clat percentiles (usec):
     |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676], 20.00th=[22938],
     | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462], 60.00th=[23725],
     | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773], 95.00th=[56361],
     | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508], 99.95th=[94897],
     | 99.99th=[94897]
   bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13, stdev=3860.74, samples=53
   iops        : min=   24, max=   42, avg=38.30, stdev= 3.77, samples=53
  lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
  cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
  WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec

Disk stats (read/write):
  mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956, util=99.90%


After the patch:

CMD (Randread):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
  read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
    clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
     lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
    clat percentiles (usec):
     |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469], 20.00th=[11469],
     | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469], 60.00th=[11600],
     | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600], 95.00th=[11600],
     | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627], 99.95th=[32375],
     | 99.99th=[32375]
   bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26, stdev=1410.46, samples=23
   iops        : min=   82, max=   88, avg=86.52, stdev= 1.38, samples=23
  lat (msec)   : 20=99.80%, 50=0.20%
  cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
   READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec

Disk stats (read/write):
  mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397, util=99.21%

CMD (Randwrite):
sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite

mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
fio-3.16
Starting 1 thread
Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
  write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone resets
    clat (msec): min=16, max=118, avg=25.37, stdev=16.34
     lat (msec): min=16, max=118, avg=25.44, stdev=16.34
    clat percentiles (msec):
     |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20], 20.00th=[   20],
     | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20], 60.00th=[   20],
     | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52], 95.00th=[   75],
     | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114], 99.95th=[  120],
     | 99.99th=[  120]
   bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69, stdev=10498.00, samples=52
   iops        : min=   20, max=   50, avg=39.27, stdev=10.25, samples=52
  lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
  cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
  IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
     issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
     latency   : target=0, window=0, percentile=100.00%, depth=1

Run status group 0 (all jobs):
  WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec

Disk stats (read/write):
  mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144, util=99.89%

BR,
Ricky

> Another option that I use frequently is iozone, https://www.iozone.org.
> Here's a command line that I often use for iozone ./iozone -az -i0 -i1 -s 20m -y
> 16k -q 4m -I -f /mnt/sdcard/iozone.tmp -e
> 
> [...]
> 
> Kind regards
> Uffe
> ------Please consider the environment before printing this e-mail.
Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ulf Hansson 4 years, 6 months ago
On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Thursday, December 23, 2021 6:37 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> > >
> > > > -----Original Message-----
> > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > linux-kernel@vger.kernel.org
> > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > block rw
> > > >
> > > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com> wrote:
> > > > >
> > > > > Improving performance for the CMD is multi-block read/write and
> > > > > the data is sequential.
> > > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25) or
> > > > > normal RW (CMD 17/24) if the CMD is multi-block and the data is
> > > > > sequential then call to sd_rw_multi_seq()
> > > > >
> > > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > > The new code to distinguish multi-block RW(CMD 18/25) and the data
> > > > > is sequential or not, if the data is sequential RW driver do not
> > > > > send CMD
> > > > > 12 and bypass CMD 13 until wait the different direction RW CMD or
> > > > > trigger the delay_work to sent CMD 12.
> > > > >
> > > > > run benchmark result as below:
> > > > > SD Card : Samsumg Pro Plus 128GB
> > > > > Number of Samples:100, Sample Size:10MB <Before> Read : 86.9 MB/s,
> > > > > Write : 38.3 MB/s <After>  Read : 91.5 MB/s, Write : 55.5 MB/s
> > > >
> > > > A much nicer commit message, thanks a lot! Would you mind running
> > > > some additional tests, like random I/O read/writes?
> > > >
> > > > Also, please specify the benchmark tool and command you are using.
> > > > In the meantime, I will continue to look at the code.
> > > >
> > >
> > > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > > and the Tool don't have random I/O to choice...
> > >
> > > Do you have any suggestion for testing random I/O But we think random
> > > I/O will not change much
> >
> > I would probably look into using fio, https://fio.readthedocs.io/en/latest/
> >
>
> Filled random I/O data
> Before the patch:
> CMD (Randread):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread

Thanks for running the tests! Overall, I would not expect an impact on
the throughput when using a big blocksize like 1M. This is also pretty
clear from the result you have provided.

However, especially for random writes and reads, we want to try with
smaller blocksizes. Like 8k or 16k, would you mind running another
round of tests to see how that works out?

I haven't yet been able to provide you with comments on the code, but
I am looking into it.

Kind regards
Uffe

>
> mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
>   read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
>     clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
>      lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
>     clat percentiles (usec):
>      |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600], 20.00th=[11600],
>      | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731], 60.00th=[11731],
>      | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863], 95.00th=[11863],
>      | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664], 99.95th=[34341],
>      | 99.99th=[34341]
>    bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67, stdev=1467.81, samples=24
>    iops        : min=   80, max=   86, avg=85.00, stdev= 1.41, samples=24
>   lat (msec)   : 20=99.90%, 50=0.10%
>   cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>    READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec
>
> Disk stats (read/write):
>   mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612, util=99.23%
>
> CMD (Randwrite):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite
>
> mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
>   write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone resets
>     clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
>      lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
>     clat percentiles (usec):
>      |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676], 20.00th=[22938],
>      | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462], 60.00th=[23725],
>      | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773], 95.00th=[56361],
>      | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508], 99.95th=[94897],
>      | 99.99th=[94897]
>    bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13, stdev=3860.74, samples=53
>    iops        : min=   24, max=   42, avg=38.30, stdev= 3.77, samples=53
>   lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
>   cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>   WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec
>
> Disk stats (read/write):
>   mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956, util=99.90%
>
>
> After the patch:
>
> CMD (Randread):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randread
>
> mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
>   read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
>     clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
>      lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
>     clat percentiles (usec):
>      |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469], 20.00th=[11469],
>      | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469], 60.00th=[11600],
>      | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600], 95.00th=[11600],
>      | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627], 99.95th=[32375],
>      | 99.99th=[32375]
>    bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26, stdev=1410.46, samples=23
>    iops        : min=   82, max=   88, avg=86.52, stdev= 1.38, samples=23
>   lat (msec)   : 20=99.80%, 50=0.20%
>   cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>    READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec
>
> Disk stats (read/write):
>   mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397, util=99.21%
>
> CMD (Randwrite):
> sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=1M -rw=randwrite
>
> mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W) 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> fio-3.16
> Starting 1 thread
> Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
> mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
>   write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone resets
>     clat (msec): min=16, max=118, avg=25.37, stdev=16.34
>      lat (msec): min=16, max=118, avg=25.44, stdev=16.34
>     clat percentiles (msec):
>      |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20], 20.00th=[   20],
>      | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20], 60.00th=[   20],
>      | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52], 95.00th=[   75],
>      | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114], 99.95th=[  120],
>      | 99.99th=[  120]
>    bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69, stdev=10498.00, samples=52
>    iops        : min=   20, max=   50, avg=39.27, stdev=10.25, samples=52
>   lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
>   cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
>   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%, >=64=0.0%
>      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
>      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
>      latency   : target=0, window=0, percentile=100.00%, depth=1
>
> Run status group 0 (all jobs):
>   WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec
>
> Disk stats (read/write):
>   mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144, util=99.89%
>
> BR,
> Ricky
RE: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ricky WU 4 years, 6 months ago

> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Tuesday, December 28, 2021 10:05 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> On Fri, 24 Dec 2021 at 08:23, Ricky WU <ricky_wu@realtek.com> wrote:
> >
> > > -----Original Message-----
> > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > Sent: Thursday, December 23, 2021 6:37 PM
> > > To: Ricky WU <ricky_wu@realtek.com>
> > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > block rw
> > >
> > > On Thu, 23 Dec 2021 at 11:27, Ricky WU <ricky_wu@realtek.com> wrote:
> > > >
> > > > > -----Original Message-----
> > > > > From: Ulf Hansson <ulf.hansson@linaro.org>
> > > > > Sent: Tuesday, December 21, 2021 8:51 PM
> > > > > To: Ricky WU <ricky_wu@realtek.com>
> > > > > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > > > > linux-kernel@vger.kernel.org
> > > > > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi
> > > > > block rw
> > > > >
> > > > > On Tue, 21 Dec 2021 at 13:24, Ricky WU <ricky_wu@realtek.com>
> wrote:
> > > > > >
> > > > > > Improving performance for the CMD is multi-block read/write
> > > > > > and the data is sequential.
> > > > > > sd_check_multi_seq() to distinguish multi-block RW (CMD 18/25)
> > > > > > or normal RW (CMD 17/24) if the CMD is multi-block and the
> > > > > > data is sequential then call to sd_rw_multi_seq()
> > > > > >
> > > > > > This patch mainly to control the timing of reply at CMD 12/13.
> > > > > > Originally code driver reply CMD 12/13 at every RW (CMD 18/25).
> > > > > > The new code to distinguish multi-block RW(CMD 18/25) and the
> > > > > > data is sequential or not, if the data is sequential RW driver
> > > > > > do not send CMD
> > > > > > 12 and bypass CMD 13 until wait the different direction RW CMD
> > > > > > or trigger the delay_work to sent CMD 12.
> > > > > >
> > > > > > run benchmark result as below:
> > > > > > SD Card : Samsumg Pro Plus 128GB Number of Samples:100, Sample
> > > > > > Size:10MB <Before> Read : 86.9 MB/s, Write : 38.3 MB/s <After>
> > > > > > Read : 91.5 MB/s, Write : 55.5 MB/s
> > > > >
> > > > > A much nicer commit message, thanks a lot! Would you mind
> > > > > running some additional tests, like random I/O read/writes?
> > > > >
> > > > > Also, please specify the benchmark tool and command you are using.
> > > > > In the meantime, I will continue to look at the code.
> > > > >
> > > >
> > > > The Tool just use Ubuntu internal GUI benchmark Tool in the "Disks"
> > > > and the Tool don't have random I/O to choice...
> > > >
> > > > Do you have any suggestion for testing random I/O But we think
> > > > random I/O will not change much
> > >
> > > I would probably look into using fio,
> > > https://fio.readthedocs.io/en/latest/
> > >
> >
> > Filled random I/O data
> > Before the patch:
> > CMD (Randread):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randread
> 
> Thanks for running the tests! Overall, I would not expect an impact on the
> throughput when using a big blocksize like 1M. This is also pretty clear from
> the result you have provided.
> 
> However, especially for random writes and reads, we want to try with smaller
> blocksizes. Like 8k or 16k, would you mind running another round of tests to
> see how that works out?
> 

Filled in the random I/O data (8k/16k) below.

Before(randread)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
Disk stats (read/write):
  mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
Disk stats (read/write):
  mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84%

Before(randwrite)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec
Disk stats (read/write):
  mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec
Disk stats (read/write):
  mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81%


After(randread)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
Disk stats (read/write):
  mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
   READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
Disk stats (read/write):
  mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87%

After(randwrite)
8k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec
Disk stats (read/write):
  mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92%

16k:
Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
result:
Run status group 0 (all jobs):
  WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec
Disk stats (read/write):
  mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80%

> I haven't yet been able to provide you with comments on the code, but I am
> looking into it.
> 
> Kind regards
> Uffe
> 
> >
> > mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [r(1)][100.0%][r=86.0MiB/s][r=86 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=2663: Fri Dec 24 14:28:33 2021
> >   read: IOPS=85, BW=85.1MiB/s (89.3MB/s)(1024MiB/12026msec)
> >     clat (usec): min=11253, max=34579, avg=11735.57, stdev=742.16
> >      lat (usec): min=11254, max=34580, avg=11736.34, stdev=742.16
> >     clat percentiles (usec):
> >      |  1.00th=[11338],  5.00th=[11469], 10.00th=[11600],
> 20.00th=[11600],
> >      | 30.00th=[11600], 40.00th=[11600], 50.00th=[11731],
> 60.00th=[11731],
> >      | 70.00th=[11863], 80.00th=[11863], 90.00th=[11863],
> 95.00th=[11863],
> >      | 99.00th=[11863], 99.50th=[12518], 99.90th=[15664],
> 99.95th=[34341],
> >      | 99.99th=[34341]
> >    bw (  KiB/s): min=81920, max=88064, per=99.91%, avg=87110.67,
> stdev=1467.81, samples=24
> >    iops        : min=   80, max=   86, avg=85.00, stdev= 1.41,
> samples=24
> >   lat (msec)   : 20=99.90%, 50=0.10%
> >   cpu          : usr=0.17%, sys=1.26%, ctx=2048, majf=0, minf=256
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >    READ: bw=85.1MiB/s (89.3MB/s), 85.1MiB/s-85.1MiB/s
> > (89.3MB/s-89.3MB/s), io=1024MiB (1074MB), run=12026-12026msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=2026/0, merge=0/0, ticks=17612/0, in_queue=17612,
> > util=99.23%
> >
> > CMD (Randwrite):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randwrite
> >
> > mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [w(1)][100.0%][w=41.0MiB/s][w=41 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=2738: Fri Dec 24 14:30:05 2021
> >   write: IOPS=38, BW=38.4MiB/s (40.2MB/s)(1024MiB/26695msec); 0 zone
> resets
> >     clat (usec): min=18862, max=94708, avg=25990.34, stdev=9227.22
> >      lat (usec): min=18910, max=94781, avg=26061.91, stdev=9228.04
> >     clat percentiles (usec):
> >      |  1.00th=[20579],  5.00th=[22414], 10.00th=[22676],
> 20.00th=[22938],
> >      | 30.00th=[23200], 40.00th=[23462], 50.00th=[23462],
> 60.00th=[23725],
> >      | 70.00th=[23725], 80.00th=[23987], 90.00th=[24773],
> 95.00th=[56361],
> >      | 99.00th=[59507], 99.50th=[64226], 99.90th=[86508],
> 99.95th=[94897],
> >      | 99.99th=[94897]
> >    bw (  KiB/s): min=24576, max=43008, per=99.85%, avg=39221.13,
> stdev=3860.74, samples=53
> >    iops        : min=   24, max=   42, avg=38.30, stdev= 3.77,
> samples=53
> >   lat (msec)   : 20=0.98%, 50=92.38%, 100=6.64%
> >   cpu          : usr=0.50%, sys=0.31%, ctx=1024, majf=0, minf=0
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >   WRITE: bw=38.4MiB/s (40.2MB/s), 38.4MiB/s-38.4MiB/s
> > (40.2MB/s-40.2MB/s), io=1024MiB (1074MB), run=26695-26695msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=52/2043, merge=0/0, ticks=81/39874, in_queue=39956,
> > util=99.90%
> >
> >
> > After the patch:
> >
> > CMD (Randread):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randread
> >
> > mytest: (g=0): rw=randread, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [r(1)][100.0%][r=87.0MiB/s][r=87 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=11614: Fri Dec 24 14:07:06 2021
> >   read: IOPS=86, BW=86.6MiB/s (90.8MB/s)(1024MiB/11828msec)
> >     clat (usec): min=11068, max=32423, avg=11543.12, stdev=733.86
> >      lat (usec): min=11069, max=32424, avg=11543.85, stdev=733.87
> >     clat percentiles (usec):
> >      |  1.00th=[11076],  5.00th=[11338], 10.00th=[11469],
> 20.00th=[11469],
> >      | 30.00th=[11469], 40.00th=[11469], 50.00th=[11469],
> 60.00th=[11600],
> >      | 70.00th=[11600], 80.00th=[11600], 90.00th=[11600],
> 95.00th=[11600],
> >      | 99.00th=[11600], 99.50th=[11731], 99.90th=[21627],
> 99.95th=[32375],
> >      | 99.99th=[32375]
> >    bw (  KiB/s): min=83968, max=90112, per=99.94%, avg=88598.26,
> stdev=1410.46, samples=23
> >    iops        : min=   82, max=   88, avg=86.52, stdev= 1.38,
> samples=23
> >   lat (msec)   : 20=99.80%, 50=0.20%
> >   cpu          : usr=0.09%, sys=1.40%, ctx=2048, majf=0, minf=256
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=1024,0,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >    READ: bw=86.6MiB/s (90.8MB/s), 86.6MiB/s-86.6MiB/s
> > (90.8MB/s-90.8MB/s), io=1024MiB (1074MB), run=11828-11828msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=2016/0, merge=0/0, ticks=17397/0, in_queue=17397,
> > util=99.21%
> >
> > CMD (Randwrite):
> > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=1M -rw=randwrite
> >
> > mytest: (g=0): rw=randwrite, bs=(R) 1024KiB-1024KiB, (W)
> > 1024KiB-1024KiB, (T) 1024KiB-1024KiB, ioengine=psync, iodepth=1
> > fio-3.16
> > Starting 1 thread
> > Jobs: 1 (f=1): [w(1)][100.0%][w=50.0MiB/s][w=50 IOPS][eta 00m:00s]
> > mytest: (groupid=0, jobs=1): err= 0: pid=11668: Fri Dec 24 14:08:36 2021
> >   write: IOPS=39, BW=39.3MiB/s (41.2MB/s)(1024MiB/26059msec); 0 zone
> resets
> >     clat (msec): min=16, max=118, avg=25.37, stdev=16.34
> >      lat (msec): min=16, max=118, avg=25.44, stdev=16.34
> >     clat percentiles (msec):
> >      |  1.00th=[   17],  5.00th=[   20], 10.00th=[   20],
> 20.00th=[   20],
> >      | 30.00th=[   20], 40.00th=[   20], 50.00th=[   20],
> 60.00th=[   20],
> >      | 70.00th=[   21], 80.00th=[   21], 90.00th=[   52],
> 95.00th=[   75],
> >      | 99.00th=[   78], 99.50th=[  104], 99.90th=[  114],
> 99.95th=[  120],
> >      | 99.99th=[  120]
> >    bw (  KiB/s): min=20480, max=51200, per=99.93%, avg=40211.69,
> stdev=10498.00, samples=52
> >    iops        : min=   20, max=   50, avg=39.27, stdev=10.25,
> samples=52
> >   lat (msec)   : 20=72.95%, 50=16.80%, 100=9.57%, 250=0.68%
> >   cpu          : usr=0.41%, sys=0.38%, ctx=1024, majf=0, minf=0
> >   IO depths    : 1=100.0%, 2=0.0%, 4=0.0%, 8=0.0%, 16=0.0%, 32=0.0%,
> >=64=0.0%
> >      submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%,
> >=64=0.0%
> >      issued rwts: total=0,1024,0,0 short=0,0,0,0 dropped=0,0,0,0
> >      latency   : target=0, window=0, percentile=100.00%, depth=1
> >
> > Run status group 0 (all jobs):
> >   WRITE: bw=39.3MiB/s (41.2MB/s), 39.3MiB/s-39.3MiB/s
> > (41.2MB/s-41.2MB/s), io=1024MiB (1074MB), run=26059-26059msec
> >
> > Disk stats (read/write):
> >   mmcblk0: ios=51/2031, merge=0/0, ticks=84/40061, in_queue=40144,
> > util=99.89%
> >
> > BR,
> > Ricky
> ------Please consider the environment before printing this e-mail.
Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ulf Hansson 4 years, 4 months ago
[...]

> > > > >
> > > > > Do you have any suggestion for testing random I/O But we think
> > > > > random I/O will not change much
> > > >
> > > > I would probably look into using fio,
> > > > https://fio.readthedocs.io/en/latest/
> > > >
> > >
> > > Filled random I/O data
> > > Before the patch:
> > > CMD (Randread):
> > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=1M -rw=randread
> >
> > Thanks for running the tests! Overall, I would not expect an impact on the
> > throughput when using a big blocksize like 1M. This is also pretty clear from
> > the result you have provided.
> >
> > However, especially for random writes and reads, we want to try with smaller
> > blocksizes. Like 8k or 16k, would you mind running another round of tests to
> > see how that works out?
> >
>
> Filled random I/O data(8k/16k)

Hi Ricky,

Apologies for the delay! Thanks for running the tests. Let me comment
on them below.

>
> Before(randread)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec
> Disk stats (read/write):
>   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751, util=99.89%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec
> Disk stats (read/write):
>   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420, util=99.84%
>
> Before(randrwrite)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec
> Disk stats (read/write):
>   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234, util=99.90%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec
> Disk stats (read/write):
>   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728, util=99.81%
>
>
> After(randread)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=8k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec
> Disk stats (read/write):
>   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125, util=99.94%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest -bs=16k -rw=randread
> mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec
> Disk stats (read/write):
>   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254, util=99.87%
>
> After(randwrite)
> 8k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=8k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec
> Disk stats (read/write):
>   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267, util=99.92%
>
> 16k:
> Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest -bs=16k -rw=randwrite
> mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W) 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> result:
> Run status group 0 (all jobs):
>   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec
> Disk stats (read/write):
>   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204, util=99.80%

It looks like the rand-read tests above are degrading with the new
changes, while rand-writes are both improving and degrading.

To summarize my view from all the tests you have done at this point
(thanks a lot): it looks like block I/O merging isn't really
happening at the common block layer, at least not to the extent that
would benefit us. You have clearly shown that with the suggested change
in the mmc host driver, which detects whether the "next" request is
sequential to the previous one; that allows us to skip a CMD12 and
minimize some command overhead.

However, according to the latest tests above, you have also proved
that the changes in the mmc host driver don't come without a cost.
In particular, small random-reads would degrade in performance from
these changes.

That said, it looks to me that rather than trying to improve things
for one specific mmc host driver, it would be better to look at this
from the generic block layer point of view - and investigate why
sequential reads/writes aren't getting merged often enough for the
MMC/SD case. If we can fix the problem there, all mmc host drivers
would benefit I assume.

BTW, have you tried with different I/O schedulers? If you haven't
tried BFQ, I suggest you do as it's a good fit for MMC/SD.

[...]

Kind regards
Uffe
RE: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ricky WU 4 years, 4 months ago

> -----Original Message-----
> From: Ulf Hansson <ulf.hansson@linaro.org>
> Sent: Monday, February 7, 2022 7:11 PM
> To: Ricky WU <ricky_wu@realtek.com>
> Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> 
> [...]
> 
> > > > > >
> > > > > > Do you have any suggestion for testing random I/O But we think
> > > > > > random I/O will not change much
> > > > >
> > > > > I would probably look into using fio,
> > > > > https://fio.readthedocs.io/en/latest/
> > > > >
> > > >
> > > > Filled random I/O data
> > > > Before the patch:
> > > > CMD (Randread):
> > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > -bs=1M -rw=randread
> > >
> > > Thanks for running the tests! Overall, I would not expect an impact
> > > on the throughput when using a big blocksize like 1M. This is also
> > > pretty clear from the result you have provided.
> > >
> > > However, especially for random writes and reads, we want to try with
> > > smaller blocksizes. Like 8k or 16k, would you mind running another
> > > round of tests to see how that works out?
> > >
> >
> > Filled random I/O data(8k/16k)
> 
> Hi Ricky,
> 
> Apologize for the delay! Thanks for running the tests. Let me comment on
> them below.
> 
> >
> > Before(randread)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=8k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk
> stats (read/write):
> >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > util=99.89%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=16k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk
> stats (read/write):
> >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > util=99.84%
> >
> > Before(randrwrite)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=8k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234,
> > util=99.90%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=16k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > util=99.81%
> >
> >
> > After(randread)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=8k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk
> stats (read/write):
> >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > util=99.94%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > -bs=16k -rw=randread
> > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk
> stats (read/write):
> >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > util=99.87%
> >
> > After(randwrite)
> > 8k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=8k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > 8192B-8192B, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats
> (read/write):
> >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267,
> > util=99.92%
> >
> > 16k:
> > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > -bs=16k -rw=randwrite
> > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > result:
> > Run status group 0 (all jobs):
> >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats
> (read/write):
> >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > util=99.80%
> 
> It looks like the rand-read tests above are degrading with the new changes,
> while rand-writes are both improving and degrading.
> 
> To summarize my view from all the tests you have done at this point (thanks a
> lot); it looks like the block I/O merging isn't really happening at common
> blocklayer, at least to that extent that would benefit us. Clearly you have shown
> that by the suggested change in the mmc host driver, by detecting whether the
> "next" request is sequential to the previous one, which allows us to skip a
> CMD12 and minimize some command overhead.
> 
> However, according to the latest tests above, you have also proved that the
> changes in the mmc host driver doesn't come without a cost.
> In particular, small random-reads would degrade in performance from these
> changes.
> 
> That said, it looks to me that rather than trying to improve things for one
> specific mmc host driver, it would be better to look at this from the generic
> block layer point of view - and investigate why sequential reads/writes aren't
> getting merged often enough for the MMC/SD case. If we can fix the problem
> there, all mmc host drivers would benefit I assume.
> 

So you are thinking about how to patch this in MMC/SD?
I don't know whether this method is compatible with other MMC hosts, or whether they would need to patch other code in their host drivers.

> BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I
> suggest you do as it's a good fit for MMC/SD.
> 

I don't know what "different I/O schedulers" means.

> [...]
> 
> Kind regards
> Uffe
> ------Please consider the environment before printing this e-mail.
Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
Posted by Ulf Hansson 4 years, 4 months ago
On Thu, 10 Feb 2022 at 07:43, Ricky WU <ricky_wu@realtek.com> wrote:
>
>
>
> > -----Original Message-----
> > From: Ulf Hansson <ulf.hansson@linaro.org>
> > Sent: Monday, February 7, 2022 7:11 PM
> > To: Ricky WU <ricky_wu@realtek.com>
> > Cc: tommyhebb@gmail.com; linux-mmc@vger.kernel.org;
> > linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH v3] mmc: rtsx: improve performance for multi block rw
> >
> > [...]
> >
> > > > > > >
> > > > > > > Do you have any suggestion for testing random I/O But we think
> > > > > > > random I/O will not change much
> > > > > >
> > > > > > I would probably look into using fio,
> > > > > > https://fio.readthedocs.io/en/latest/
> > > > > >
> > > > >
> > > > > Filled random I/O data
> > > > > Before the patch:
> > > > > CMD (Randread):
> > > > > sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > > > -bs=1M -rw=randread
> > > >
> > > > Thanks for running the tests! Overall, I would not expect an impact
> > > > on the throughput when using a big blocksize like 1M. This is also
> > > > pretty clear from the result you have provided.
> > > >
> > > > However, especially for random writes and reads, we want to try with
> > > > smaller blocksizes. Like 8k or 16k, would you mind running another
> > > > round of tests to see how that works out?
> > > >
> > >
> > > Filled random I/O data(8k/16k)
> >
> > Hi Ricky,
> >
> > Apologize for the delay! Thanks for running the tests. Let me comment on
> > them below.
> >
> > >
> > > Before(randread)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=8k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=16.5MiB/s (17.3MB/s), 16.5MiB/s-16.5MiB/s
> > > (17.3MB/s-17.3MB/s), io=1024MiB (1074MB), run=62019-62019msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=130757/0, merge=0/0, ticks=57751/0, in_queue=57751,
> > > util=99.89%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=16k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=23.3MiB/s (24.4MB/s), 23.3MiB/s-23.3MiB/s
> > > (24.4MB/s-24.4MB/s), io=1024MiB (1074MB), run=44034-44034msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=65333/0, merge=0/0, ticks=39420/0, in_queue=39420,
> > > util=99.84%
> > >
> > > Before(randrwrite)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=8k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=4060KiB/s (4158kB/s), 4060KiB/s-4060KiB/s
> > > (4158kB/s-4158kB/s), io=100MiB (105MB), run=25220-25220msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/12759, merge=0/0, ticks=80/24154, in_queue=24234,
> > > util=99.90%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=16k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=7201KiB/s (7373kB/s), 7201KiB/s-7201KiB/s
> > > (7373kB/s-7373kB/s), io=100MiB (105MB), run=14221-14221msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/6367, merge=0/0, ticks=82/13647, in_queue=13728,
> > > util=99.81%
> > >
> > >
> > > After(randread)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=8k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=12.4MiB/s (13.0MB/s), 12.4MiB/s-12.4MiB/s
> > > (13.0MB/s-13.0MB/s), io=1024MiB (1074MB), run=82397-82397msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=130640/0, merge=0/0, ticks=74125/0, in_queue=74125,
> > > util=99.94%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=1G -name=mytest
> > > -bs=16k -rw=randread
> > > mytest: (g=0): rw=randread, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >    READ: bw=20.0MiB/s (21.0MB/s), 20.0MiB/s-20.0MiB/s
> > > (21.0MB/s-21.0MB/s), io=1024MiB (1074MB), run=51076-51076msec Disk
> > stats (read/write):
> > >   mmcblk0: ios=65282/0, merge=0/0, ticks=46255/0, in_queue=46254,
> > > util=99.87%
> > >
> > > After(randwrite)
> > > 8k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=8k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T)
> > > 8192B-8192B, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=4215KiB/s (4317kB/s), 4215KiB/s-4215KiB/s
> > > (4317kB/s-4317kB/s), io=100MiB (105MB), run=24292-24292msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=52/12717, merge=0/0, ticks=86/23182, in_queue=23267,
> > > util=99.92%
> > >
> > > 16k:
> > > Cmd: sudo fio -filename=/dev/mmcblk0 -direct=1 -numjobs=1 -thread
> > > -group_reporting -ioengine=psync -iodepth=1 -size=100M -name=mytest
> > > -bs=16k -rw=randwrite
> > > mytest: (g=0): rw=randwrite, bs=(R) 16.0KiB-16.0KiB, (W)
> > > 16.0KiB-16.0KiB, (T) 16.0KiB-16.0KiB, ioengine=psync, iodepth=1
> > > result:
> > > Run status group 0 (all jobs):
> > >   WRITE: bw=6499KiB/s (6655kB/s), 6499KiB/s-6499KiB/s
> > > (6655kB/s-6655kB/s), io=100MiB (105MB), run=15756-15756msec Disk stats
> > (read/write):
> > >   mmcblk0: ios=51/6347, merge=0/0, ticks=84/15120, in_queue=15204,
> > > util=99.80%
> >
> > It looks like the rand-read tests above are degrading with the new changes,
> > while rand-writes are both improving and degrading.
> >
> > To summarize my view from all the tests you have done at this point (thanks a
> > lot); it looks like the block I/O merging isn't really happening at the common
> > block layer, at least not to the extent that would benefit us. Clearly you have shown
> > that by the suggested change in the mmc host driver, by detecting whether the
> > "next" request is sequential to the previous one, which allows us to skip a
> > CMD12 and minimize some command overhead.
> >
> > However, according to the latest tests above, you have also proved that the
> > changes in the mmc host driver don't come without a cost.
> > In particular, small random-reads would degrade in performance from these
> > changes.
> >
> > That said, it looks to me that rather than trying to improve things for one
> > specific mmc host driver, it would be better to look at this from the generic
> > block layer point of view - and investigate why sequential reads/writes aren't
> > getting merged often enough for the MMC/SD case. If we can fix the problem
> > there, all mmc host drivers would benefit I assume.
> >
>
> So you are thinking about how to patch this in MMC/SD?
> I don't know if this method is compatible with other MMC hosts. Or would they need to patch other code in their host drivers?

I would not limit this to the core layer of MMC/SD. The point I was
trying to make was that it doesn't look like the generic block layer
is merging the sequential I/O requests in the most efficient way, at
least for the eMMC/SD devices. Why this is the case, I can't tell. It
looks like we need to do some more in-depth analysis to understand why
merging isn't efficient for us.

>
> > BTW, have you tried with different I/O schedulers? If you haven't tried BFQ, I
> > suggest you do as it's a good fit for MMC/SD.
> >
>
> I don’t know what "different I/O schedulers" means.

What I/O scheduler did you use when running the test?

For MMC/SD the only one that makes sense to use is BFQ, however that
needs to be configured via sysfs after boot. There is no way,
currently, to make it the default, I think. You may look at
Documentation/block/bfq-iosched.rst, if you are more interested.

Kind regards
Uffe