[RFC PATCH v2 14/27] NTB: ntb_transport: Move TX memory window setup into setup_qp_mw()

Koichiro Den posted 27 patches 2 months, 1 week ago
[RFC PATCH v2 14/27] NTB: ntb_transport: Move TX memory window setup into setup_qp_mw()
Posted by Koichiro Den 2 months, 1 week ago
Historically both TX and RX have assumed the same per-QP MW slice
(tx_max_entry == remote rx_max_entry), while those are calculated
separately in different places (pre and post the link-up negotiation
point). This has been safe because nt->link_is_up is never set to true
unless the pre-determined qp_count values are the same on both sides, and
qp_count is typically limited to nt->mw_count, which should be carefully
configured by the admin.

However, setup_qp_mw() can actually split an MW and properly handle
multiple QPs in one MW, so qp_count need not be limited by nt->mw_count.
Once we relax that limitation, the pre-determined qp_count can differ
between the host side and the endpoint, and link-up negotiation can
easily fail.

Move the TX MW configuration (per-QP offset and size) into
ntb_transport_setup_qp_mw() so that both RX and TX layout decisions are
centralized in a single helper. ntb_transport_init_queue() now deals
only with per-QP software state, not with MW layout.

This keeps the previous behaviour, while preparing for relaxing the
qp_count limitation and improving readability.

No functional change is intended.

Signed-off-by: Koichiro Den <den@valinux.co.jp>
---
 drivers/ntb/ntb_transport.c | 67 ++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 41 deletions(-)

diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
index 57b4c0511927..79063e2f911b 100644
--- a/drivers/ntb/ntb_transport.c
+++ b/drivers/ntb/ntb_transport.c
@@ -569,7 +569,8 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
 	struct ntb_transport_mw *mw;
 	struct ntb_dev *ndev = nt->ndev;
 	struct ntb_queue_entry *entry;
-	unsigned int rx_size, num_qps_mw;
+	unsigned int num_qps_mw;
+	unsigned int mw_size, mw_size_per_qp, qp_offset, rx_info_offset;
 	unsigned int mw_num, mw_count, qp_count;
 	unsigned int i;
 	int node;
@@ -588,15 +589,33 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
 	else
 		num_qps_mw = qp_count / mw_count;
 
-	rx_size = (unsigned int)mw->xlat_size / num_qps_mw;
-	qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
-	rx_size -= sizeof(struct ntb_rx_info);
+	mw_size = min(nt->mw_vec[mw_num].phys_size, mw->xlat_size);
+	if (max_mw_size && mw_size > max_mw_size)
+		mw_size = max_mw_size;
 
-	qp->remote_rx_info = qp->rx_buff + rx_size;
+	/* Split this MW evenly among the queue pairs mapped to it. */
+	mw_size_per_qp = (unsigned int)mw_size / num_qps_mw;
+	qp_offset = mw_size_per_qp * (qp_num / mw_count);
+
+	/* Place remote_rx_info at the end of the per-QP region. */
+	rx_info_offset = mw_size_per_qp - sizeof(struct ntb_rx_info);
+
+	qp->tx_mw_size = mw_size_per_qp;
+	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
+	if (!qp->tx_mw)
+		return -EINVAL;
+	qp->tx_mw_phys = nt->mw_vec[mw_num].phys_addr + qp_offset;
+	if (!qp->tx_mw_phys)
+		return -EINVAL;
+	qp->rx_info = qp->tx_mw + rx_info_offset;
+	qp->rx_buff = mw->virt_addr + qp_offset;
+	qp->remote_rx_info = qp->rx_buff + rx_info_offset;
 
 	/* Due to housekeeping, there must be atleast 2 buffs */
-	qp->rx_max_frame = min(transport_mtu, rx_size / 2);
-	qp->rx_max_entry = rx_size / qp->rx_max_frame;
+	qp->tx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
+	qp->tx_max_entry = mw_size_per_qp / qp->tx_max_frame;
+	qp->rx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
+	qp->rx_max_entry = mw_size_per_qp / qp->rx_max_frame;
 	qp->rx_index = 0;
 
 	/*
@@ -1133,11 +1152,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 				    unsigned int qp_num)
 {
 	struct ntb_transport_qp *qp;
-	phys_addr_t mw_base;
-	resource_size_t mw_size;
-	unsigned int num_qps_mw, tx_size;
 	unsigned int mw_num, mw_count, qp_count;
-	u64 qp_offset;
 
 	mw_count = nt->mw_count;
 	qp_count = nt->qp_count;
@@ -1152,36 +1167,6 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
 	qp->event_handler = NULL;
 	ntb_qp_link_context_reset(qp);
 
-	if (mw_num < qp_count % mw_count)
-		num_qps_mw = qp_count / mw_count + 1;
-	else
-		num_qps_mw = qp_count / mw_count;
-
-	mw_base = nt->mw_vec[mw_num].phys_addr;
-	mw_size = nt->mw_vec[mw_num].phys_size;
-
-	if (max_mw_size && mw_size > max_mw_size)
-		mw_size = max_mw_size;
-
-	tx_size = (unsigned int)mw_size / num_qps_mw;
-	qp_offset = tx_size * (qp_num / mw_count);
-
-	qp->tx_mw_size = tx_size;
-	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
-	if (!qp->tx_mw)
-		return -EINVAL;
-
-	qp->tx_mw_phys = mw_base + qp_offset;
-	if (!qp->tx_mw_phys)
-		return -EINVAL;
-
-	tx_size -= sizeof(struct ntb_rx_info);
-	qp->rx_info = qp->tx_mw + tx_size;
-
-	/* Due to housekeeping, there must be atleast 2 buffs */
-	qp->tx_max_frame = min(transport_mtu, tx_size / 2);
-	qp->tx_max_entry = tx_size / qp->tx_max_frame;
-
 	if (nt->debugfs_node_dir) {
 		char debugfs_name[8];
 
-- 
2.48.1
Re: [RFC PATCH v2 14/27] NTB: ntb_transport: Move TX memory window setup into setup_qp_mw()
Posted by Frank Li 2 months, 1 week ago
On Sun, Nov 30, 2025 at 01:03:52AM +0900, Koichiro Den wrote:
> Historically both TX and RX have assumed the same per-QP MW slice
> (tx_max_entry == remote rx_max_entry), while those are calculated
> separately in different places (pre and post the link-up negotiation
> point). This has been safe because nt->link_is_up is never set to true
> unless the pre-determined qp_count are the same among them, and qp_count
> is typically limited to nt->mw_count, which should be carefully
> configured by admin.
>
> However, setup_qp_mw can actually split mw and handle multi-qps in one
> MW properly, so qp_count needs not to be limited by nt->mw_count. Once
> we relaxing the limitation, pre-determined qp_count can differ among
> host side and endpoint, and link-up negotiation can easily fail.
>
> Move the TX MW configuration (per-QP offset and size) into
> ntb_transport_setup_qp_mw() so that both RX and TX layout decisions are
> centralized in a single helper. ntb_transport_init_queue() now deals
> only with per-QP software state, not with MW layout.
>
> This keeps the previous behaviour, while preparing for relaxing the
> qp_count limitation and improving readibility.
>
> No functional change is intended.
>
> Signed-off-by: Koichiro Den <den@valinux.co.jp>
> ---
>  drivers/ntb/ntb_transport.c | 67 ++++++++++++++-----------------------
>  1 file changed, 26 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> index 57b4c0511927..79063e2f911b 100644
> --- a/drivers/ntb/ntb_transport.c
> +++ b/drivers/ntb/ntb_transport.c
> @@ -569,7 +569,8 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
>  	struct ntb_transport_mw *mw;
>  	struct ntb_dev *ndev = nt->ndev;
>  	struct ntb_queue_entry *entry;
> -	unsigned int rx_size, num_qps_mw;
> +	unsigned int num_qps_mw;
> +	unsigned int mw_size, mw_size_per_qp, qp_offset, rx_info_offset;
>  	unsigned int mw_num, mw_count, qp_count;
>  	unsigned int i;
>  	int node;
> @@ -588,15 +589,33 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
>  	else
>  		num_qps_mw = qp_count / mw_count;
>
> -	rx_size = (unsigned int)mw->xlat_size / num_qps_mw;
> -	qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
> -	rx_size -= sizeof(struct ntb_rx_info);
> +	mw_size = min(nt->mw_vec[mw_num].phys_size, mw->xlat_size);
> +	if (max_mw_size && mw_size > max_mw_size)
> +		mw_size = max_mw_size;
>
> -	qp->remote_rx_info = qp->rx_buff + rx_size;
> +	/* Split this MW evenly among the queue pairs mapped to it. */
> +	mw_size_per_qp = (unsigned int)mw_size / num_qps_mw;

Can you keep the same variable name at first, to make review easier?

tx_size = (unsigned int)mw_size / num_qps_mw;

Otherwise it is hard to make sure the code logic is the same as the old one.

Frank

> +	qp_offset = mw_size_per_qp * (qp_num / mw_count);
> +
> +	/* Place remote_rx_info at the end of the per-QP region. */
> +	rx_info_offset = mw_size_per_qp - sizeof(struct ntb_rx_info);
> +
> +	qp->tx_mw_size = mw_size_per_qp;
> +	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
> +	if (!qp->tx_mw)
> +		return -EINVAL;
> +	qp->tx_mw_phys = nt->mw_vec[mw_num].phys_addr + qp_offset;
> +	if (!qp->tx_mw_phys)
> +		return -EINVAL;
> +	qp->rx_info = qp->tx_mw + rx_info_offset;
> +	qp->rx_buff = mw->virt_addr + qp_offset;
> +	qp->remote_rx_info = qp->rx_buff + rx_info_offset;
>
>  	/* Due to housekeeping, there must be atleast 2 buffs */
> -	qp->rx_max_frame = min(transport_mtu, rx_size / 2);
> -	qp->rx_max_entry = rx_size / qp->rx_max_frame;
> +	qp->tx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
> +	qp->tx_max_entry = mw_size_per_qp / qp->tx_max_frame;
> +	qp->rx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
> +	qp->rx_max_entry = mw_size_per_qp / qp->rx_max_frame;
>  	qp->rx_index = 0;
>
>  	/*
> @@ -1133,11 +1152,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
>  				    unsigned int qp_num)
>  {
>  	struct ntb_transport_qp *qp;
> -	phys_addr_t mw_base;
> -	resource_size_t mw_size;
> -	unsigned int num_qps_mw, tx_size;
>  	unsigned int mw_num, mw_count, qp_count;
> -	u64 qp_offset;
>
>  	mw_count = nt->mw_count;
>  	qp_count = nt->qp_count;
> @@ -1152,36 +1167,6 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
>  	qp->event_handler = NULL;
>  	ntb_qp_link_context_reset(qp);
>
> -	if (mw_num < qp_count % mw_count)
> -		num_qps_mw = qp_count / mw_count + 1;
> -	else
> -		num_qps_mw = qp_count / mw_count;
> -
> -	mw_base = nt->mw_vec[mw_num].phys_addr;
> -	mw_size = nt->mw_vec[mw_num].phys_size;
> -
> -	if (max_mw_size && mw_size > max_mw_size)
> -		mw_size = max_mw_size;
> -
> -	tx_size = (unsigned int)mw_size / num_qps_mw;
> -	qp_offset = tx_size * (qp_num / mw_count);
> -
> -	qp->tx_mw_size = tx_size;
> -	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
> -	if (!qp->tx_mw)
> -		return -EINVAL;
> -
> -	qp->tx_mw_phys = mw_base + qp_offset;
> -	if (!qp->tx_mw_phys)
> -		return -EINVAL;
> -
> -	tx_size -= sizeof(struct ntb_rx_info);
> -	qp->rx_info = qp->tx_mw + tx_size;
> -
> -	/* Due to housekeeping, there must be atleast 2 buffs */
> -	qp->tx_max_frame = min(transport_mtu, tx_size / 2);
> -	qp->tx_max_entry = tx_size / qp->tx_max_frame;
> -
>  	if (nt->debugfs_node_dir) {
>  		char debugfs_name[8];
>
> --
> 2.48.1
>
Re: [RFC PATCH v2 14/27] NTB: ntb_transport: Move TX memory window setup into setup_qp_mw()
Posted by Koichiro Den 2 months, 1 week ago
On Mon, Dec 01, 2025 at 03:02:40PM -0500, Frank Li wrote:
> On Sun, Nov 30, 2025 at 01:03:52AM +0900, Koichiro Den wrote:
> > Historically both TX and RX have assumed the same per-QP MW slice
> > (tx_max_entry == remote rx_max_entry), while those are calculated
> > separately in different places (pre and post the link-up negotiation
> > point). This has been safe because nt->link_is_up is never set to true
> > unless the pre-determined qp_count are the same among them, and qp_count
> > is typically limited to nt->mw_count, which should be carefully
> > configured by admin.
> >
> > However, setup_qp_mw can actually split mw and handle multi-qps in one
> > MW properly, so qp_count needs not to be limited by nt->mw_count. Once
> > we relaxing the limitation, pre-determined qp_count can differ among
> > host side and endpoint, and link-up negotiation can easily fail.
> >
> > Move the TX MW configuration (per-QP offset and size) into
> > ntb_transport_setup_qp_mw() so that both RX and TX layout decisions are
> > centralized in a single helper. ntb_transport_init_queue() now deals
> > only with per-QP software state, not with MW layout.
> >
> > This keeps the previous behaviour, while preparing for relaxing the
> > qp_count limitation and improving readibility.
> >
> > No functional change is intended.
> >
> > Signed-off-by: Koichiro Den <den@valinux.co.jp>
> > ---
> >  drivers/ntb/ntb_transport.c | 67 ++++++++++++++-----------------------
> >  1 file changed, 26 insertions(+), 41 deletions(-)
> >
> > diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c
> > index 57b4c0511927..79063e2f911b 100644
> > --- a/drivers/ntb/ntb_transport.c
> > +++ b/drivers/ntb/ntb_transport.c
> > @@ -569,7 +569,8 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
> >  	struct ntb_transport_mw *mw;
> >  	struct ntb_dev *ndev = nt->ndev;
> >  	struct ntb_queue_entry *entry;
> > -	unsigned int rx_size, num_qps_mw;
> > +	unsigned int num_qps_mw;
> > +	unsigned int mw_size, mw_size_per_qp, qp_offset, rx_info_offset;
> >  	unsigned int mw_num, mw_count, qp_count;
> >  	unsigned int i;
> >  	int node;
> > @@ -588,15 +589,33 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt,
> >  	else
> >  		num_qps_mw = qp_count / mw_count;
> >
> > -	rx_size = (unsigned int)mw->xlat_size / num_qps_mw;
> > -	qp->rx_buff = mw->virt_addr + rx_size * (qp_num / mw_count);
> > -	rx_size -= sizeof(struct ntb_rx_info);
> > +	mw_size = min(nt->mw_vec[mw_num].phys_size, mw->xlat_size);
> > +	if (max_mw_size && mw_size > max_mw_size)
> > +		mw_size = max_mw_size;
> >
> > -	qp->remote_rx_info = qp->rx_buff + rx_size;
> > +	/* Split this MW evenly among the queue pairs mapped to it. */
> > +	mw_size_per_qp = (unsigned int)mw_size / num_qps_mw;
> 
> Can you use the same variable firstly to make review easily?
> 
> tx_size = (unsigned int)mw_size / num_qps_mw;
> 
> It is hard to make sure code logic is the same as old one.

I'll do so. Thank you!

Koichiro

> 
> Frank
> 
> > +	qp_offset = mw_size_per_qp * (qp_num / mw_count);
> > +
> > +	/* Place remote_rx_info at the end of the per-QP region. */
> > +	rx_info_offset = mw_size_per_qp - sizeof(struct ntb_rx_info);
> > +
> > +	qp->tx_mw_size = mw_size_per_qp;
> > +	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
> > +	if (!qp->tx_mw)
> > +		return -EINVAL;
> > +	qp->tx_mw_phys = nt->mw_vec[mw_num].phys_addr + qp_offset;
> > +	if (!qp->tx_mw_phys)
> > +		return -EINVAL;
> > +	qp->rx_info = qp->tx_mw + rx_info_offset;
> > +	qp->rx_buff = mw->virt_addr + qp_offset;
> > +	qp->remote_rx_info = qp->rx_buff + rx_info_offset;
> >
> >  	/* Due to housekeeping, there must be atleast 2 buffs */
> > -	qp->rx_max_frame = min(transport_mtu, rx_size / 2);
> > -	qp->rx_max_entry = rx_size / qp->rx_max_frame;
> > +	qp->tx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
> > +	qp->tx_max_entry = mw_size_per_qp / qp->tx_max_frame;
> > +	qp->rx_max_frame = min(transport_mtu, mw_size_per_qp / 2);
> > +	qp->rx_max_entry = mw_size_per_qp / qp->rx_max_frame;
> >  	qp->rx_index = 0;
> >
> >  	/*
> > @@ -1133,11 +1152,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
> >  				    unsigned int qp_num)
> >  {
> >  	struct ntb_transport_qp *qp;
> > -	phys_addr_t mw_base;
> > -	resource_size_t mw_size;
> > -	unsigned int num_qps_mw, tx_size;
> >  	unsigned int mw_num, mw_count, qp_count;
> > -	u64 qp_offset;
> >
> >  	mw_count = nt->mw_count;
> >  	qp_count = nt->qp_count;
> > @@ -1152,36 +1167,6 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt,
> >  	qp->event_handler = NULL;
> >  	ntb_qp_link_context_reset(qp);
> >
> > -	if (mw_num < qp_count % mw_count)
> > -		num_qps_mw = qp_count / mw_count + 1;
> > -	else
> > -		num_qps_mw = qp_count / mw_count;
> > -
> > -	mw_base = nt->mw_vec[mw_num].phys_addr;
> > -	mw_size = nt->mw_vec[mw_num].phys_size;
> > -
> > -	if (max_mw_size && mw_size > max_mw_size)
> > -		mw_size = max_mw_size;
> > -
> > -	tx_size = (unsigned int)mw_size / num_qps_mw;
> > -	qp_offset = tx_size * (qp_num / mw_count);
> > -
> > -	qp->tx_mw_size = tx_size;
> > -	qp->tx_mw = nt->mw_vec[mw_num].vbase + qp_offset;
> > -	if (!qp->tx_mw)
> > -		return -EINVAL;
> > -
> > -	qp->tx_mw_phys = mw_base + qp_offset;
> > -	if (!qp->tx_mw_phys)
> > -		return -EINVAL;
> > -
> > -	tx_size -= sizeof(struct ntb_rx_info);
> > -	qp->rx_info = qp->tx_mw + tx_size;
> > -
> > -	/* Due to housekeeping, there must be atleast 2 buffs */
> > -	qp->tx_max_frame = min(transport_mtu, tx_size / 2);
> > -	qp->tx_max_entry = tx_size / qp->tx_max_frame;
> > -
> >  	if (nt->debugfs_node_dir) {
> >  		char debugfs_name[8];
> >
> > --
> > 2.48.1
> >