The current SMC-R implementation relies on global per-device CQs
and manual polling within tasklets, which introduces severe
scalability bottlenecks due to global lock contention and tasklet
scheduling overhead, resulting in poor performance as concurrency
increases.
Refactor the completion handling to utilize the ib_cqe API and
standard RDMA core CQ pooling. This transition provides several key
advantages:
1. Multi-CQ: Shift from a single shared per-device CQ to multiple
link-specific CQs via the CQ pool. This allows completion processing
to be parallelized across multiple CPU cores, effectively eliminating
the global CQ bottleneck.
2. Leverage DIM: Utilizing the standard CQ pool with IB_POLL_SOFTIRQ
enables Dynamic Interrupt Moderation from the RDMA core, optimizing
interrupt frequency and reducing CPU load under high pressure.
3. O(1) Context Retrieval: Replaces the expensive wr_id based lookup
logic (e.g., smc_wr_tx_find_pending_index) with direct context retrieval
using container_of() on the embedded ib_cqe.
4. Code Simplification: This refactoring results in a reduction of
~150 lines of code. It removes redundant sequence tracking, complex lookup
helpers, and manual CQ management, significantly improving maintainability.
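
To make point 3 concrete, here is a minimal, self-contained sketch of the
ib_cqe completion pattern this series adopts; the names (my_tx_pend,
my_tx_done, my_post_send) are illustrative placeholders, not the actual
SMC symbols:

#include <linux/printk.h>
#include <rdma/ib_verbs.h>

/* pending-send context with the completion hook embedded in it */
struct my_tx_pend {
        struct ib_cqe cqe;      /* ->done is set once at init time */
        u32 idx;                /* slot index, recovered in O(1) below */
};

/* runs from the RDMA core CQ polling context (softirq in this series) */
static void my_tx_done(struct ib_cq *cq, struct ib_wc *wc)
{
        /* direct context retrieval, no wr_id table walk */
        struct my_tx_pend *pend =
                container_of(wc->wr_cqe, struct my_tx_pend, cqe);

        if (unlikely(wc->status != IB_WC_SUCCESS))
                pr_debug("tx slot %u failed with status %d\n",
                         pend->idx, wc->status);
}

/* posting side: point the WR at the embedded CQE instead of a wr_id */
static int my_post_send(struct ib_qp *qp, struct ib_send_wr *wr,
                        struct my_tx_pend *pend)
{
        pend->cqe.done = my_tx_done;
        wr->wr_cqe = &pend->cqe;
        return ib_post_send(qp, wr, NULL);
}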
Performance Test: redis-benchmark with max 32 connections per QP
Data format: Requests Per Second (RPS), Percentage in brackets
represents the gain/loss compared to TCP.
| Clients | TCP      | SMC (original)      | SMC (cq_pool)       |
|---------|----------|---------------------|---------------------|
| c = 1   | 24449    | 31172 (+27%)        | 34039 (+39%)        |
| c = 2   | 46420    | 53216 (+14%)        | 64391 (+38%)        |
| c = 16  | 159673   | 83668 (-48%)  <--   | 216947 (+36%)       |
| c = 32  | 164956   | 97631 (-41%)  <--   | 249376 (+51%)       |
| c = 64  | 166322   | 118192 (-29%) <--   | 249488 (+50%)       |
| c = 128 | 167700   | 121497 (-27%) <--   | 249480 (+48%)       |
| c = 256 | 175021   | 146109 (-16%) <--   | 240384 (+37%)       |
| c = 512 | 168987   | 101479 (-40%) <--   | 226634 (+34%)       |
The results demonstrate that this optimization effectively resolves the
scalability bottleneck, with RPS increasing by over 110% at c=64
compared to the original implementation.
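
For reference, below is a condensed sketch of the per-link CQ pool
lifecycle described in points 1 and 2, mirroring what the diff does in
smc_ib_create_queue_pair()/smc_ib_destroy_queue_pair(); error handling is
trimmed and the helper names (my_create_qp, my_destroy_qp) are hypothetical:

#include <linux/err.h>
#include <rdma/ib_verbs.h>

/* grab a CQ slice from the RDMA core pool and attach it to a new RC QP */
static struct ib_qp *my_create_qp(struct ib_device *ibdev, struct ib_pd *pd,
                                  int max_send_wr, int max_recv_wr)
{
        struct ib_qp_init_attr attr = {};
        struct ib_cq *cq;
        struct ib_qp *qp;

        /* IB_POLL_SOFTIRQ: the core polls the CQ and applies DIM for us */
        cq = ib_cq_pool_get(ibdev, max_send_wr + max_recv_wr, -1,
                            IB_POLL_SOFTIRQ);
        if (IS_ERR(cq))
                return ERR_CAST(cq);

        attr.send_cq = cq;
        attr.recv_cq = cq;
        attr.cap.max_send_wr = max_send_wr;
        attr.cap.max_recv_wr = max_recv_wr;
        attr.cap.max_send_sge = 1;
        attr.cap.max_recv_sge = 1;
        attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        attr.qp_type = IB_QPT_RC;

        qp = ib_create_qp(pd, &attr);
        if (IS_ERR(qp))
                ib_cq_pool_put(cq, max_send_wr + max_recv_wr);
        return qp;
}

/* teardown order matters: destroy the QP first, then return the CQ slice */
static void my_destroy_qp(struct ib_qp *qp, struct ib_cq *cq, int nr_cqe)
{
        ib_destroy_qp(qp);
        ib_cq_pool_put(cq, nr_cqe);
}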
Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
net/smc/smc_core.c | 8 +-
net/smc/smc_core.h | 16 ++-
net/smc/smc_ib.c | 114 ++++++-------------
net/smc/smc_ib.h | 5 -
net/smc/smc_tx.c | 1 -
net/smc/smc_wr.c | 267 ++++++++++++++++-----------------------------
net/smc/smc_wr.h | 38 ++-----
7 files changed, 150 insertions(+), 299 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 8aca5dc54be7..9590c8aed3dd 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -815,17 +815,11 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
lnk->lgr = lgr;
smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
lnk->link_idx = link_idx;
- lnk->wr_rx_id_compl = 0;
smc_ibdev_cnt_inc(lnk);
smcr_copy_dev_info_to_link(lnk);
atomic_set(&lnk->conn_cnt, 0);
smc_llc_link_set_uid(lnk);
INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
- if (!lnk->smcibdev->initialized) {
- rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
- if (rc)
- goto out;
- }
get_random_bytes(rndvec, sizeof(rndvec));
lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
(rndvec[2] << 16);
@@ -1373,7 +1367,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
smc_llc_link_clear(lnk, log);
smcr_buf_unmap_lgr(lnk);
smcr_rtoken_clear_link(lnk);
- smc_ib_modify_qp_error(lnk);
+ smc_wr_drain_qp(lnk);
smc_wr_free_link(lnk);
smc_ib_destroy_queue_pair(lnk);
smc_ib_dealloc_protection_domain(lnk);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 5c18f08a4c8a..00468b7a279f 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -92,6 +92,12 @@ struct smc_rdma_wr { /* work requests per message
struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
};
+struct smc_ib_recv_wr {
+ struct ib_cqe cqe;
+ struct ib_recv_wr wr;
+ int index;
+};
+
#define SMC_LGR_ID_SIZE 4
struct smc_link {
@@ -100,6 +106,8 @@ struct smc_link {
struct ib_pd *roce_pd; /* IB protection domain,
* unique for every RoCE QP
*/
+ int nr_cqe;
+ struct ib_cq *ib_cq;
struct ib_qp *roce_qp; /* IB queue pair */
struct ib_qp_attr qp_attr; /* IB queue pair attributes */
@@ -107,6 +115,7 @@ struct smc_link {
struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
struct ib_sge *wr_tx_sges; /* WR send gather meta data */
struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
+ struct ib_cqe tx_rdma_cqe; /* CQE RDMA WRITE */
struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
struct completion *wr_tx_compl; /* WR send CQE completion */
@@ -116,7 +125,6 @@ struct smc_link {
struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */
dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/
- atomic_long_t wr_tx_id; /* seq # of last sent WR */
unsigned long *wr_tx_mask; /* bit mask of used indexes */
u32 wr_tx_cnt; /* number of WR send buffers */
wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
@@ -126,7 +134,7 @@ struct smc_link {
struct completion tx_ref_comp;
u8 *wr_rx_bufs; /* WR recv payload buffers */
- struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
+ struct smc_ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
/* above three vectors have wr_rx_cnt elements and use the same index */
int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */
@@ -135,13 +143,11 @@ struct smc_link {
*/
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
- u64 wr_rx_id; /* seq # of last recv WR */
- u64 wr_rx_id_compl; /* seq # of last completed WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
- wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */
struct ib_reg_wr wr_reg; /* WR register memory region */
+ struct ib_cqe wr_reg_cqe;
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
struct {
struct percpu_ref wr_reg_refs;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 67211d44a1db..77047ad7d452 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -112,15 +112,6 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
IB_QP_MAX_QP_RD_ATOMIC);
}
-int smc_ib_modify_qp_error(struct smc_link *lnk)
-{
- struct ib_qp_attr qp_attr;
-
- memset(&qp_attr, 0, sizeof(qp_attr));
- qp_attr.qp_state = IB_QPS_ERR;
- return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
-}
-
int smc_ib_ready_link(struct smc_link *lnk)
{
struct smc_link_group *lgr = smc_get_lgr(lnk);
@@ -134,10 +125,7 @@ int smc_ib_ready_link(struct smc_link *lnk)
if (rc)
goto out;
smc_wr_remember_qp_attr(lnk);
- rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
- IB_CQ_SOLICITED_MASK);
- if (rc)
- goto out;
+
rc = smc_wr_rx_post_init(lnk);
if (rc)
goto out;
@@ -658,38 +646,60 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
if (lnk->roce_qp)
ib_destroy_qp(lnk->roce_qp);
lnk->roce_qp = NULL;
+ if (lnk->ib_cq) {
+ ib_cq_pool_put(lnk->ib_cq, lnk->nr_cqe);
+ lnk->ib_cq = NULL;
+ }
}
/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
+ int max_send_wr, max_recv_wr, rc;
+ struct ib_cq *cq;
+
+ /* include unsolicited rdma_writes as well,
+ * there are max. 2 RDMA_WRITE per 1 WR_SEND.
+ * +1 for ib_drain_qp()
+ */
+ max_send_wr = 3 * lnk->lgr->max_send_wr + 1;
+ max_recv_wr = lnk->lgr->max_recv_wr + 1;
+
+ cq = ib_cq_pool_get(lnk->smcibdev->ibdev, max_send_wr + max_recv_wr, -1,
+ IB_POLL_SOFTIRQ);
+
+ if (IS_ERR(cq)) {
+ rc = PTR_ERR(cq);
+ return rc;
+ }
+
struct ib_qp_init_attr qp_attr = {
.event_handler = smc_ib_qp_event_handler,
.qp_context = lnk,
- .send_cq = lnk->smcibdev->roce_cq_send,
- .recv_cq = lnk->smcibdev->roce_cq_recv,
+ .send_cq = cq,
+ .recv_cq = cq,
.srq = NULL,
.cap = {
.max_send_sge = SMC_IB_MAX_SEND_SGE,
.max_recv_sge = lnk->wr_rx_sge_cnt,
+ .max_send_wr = max_send_wr,
+ .max_recv_wr = max_recv_wr,
.max_inline_data = 0,
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
.qp_type = IB_QPT_RC,
};
- int rc;
- /* include unsolicited rdma_writes as well,
- * there are max. 2 RDMA_WRITE per 1 WR_SEND
- */
- qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr;
- qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr;
lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
- if (IS_ERR(lnk->roce_qp))
+ if (IS_ERR(lnk->roce_qp)) {
lnk->roce_qp = NULL;
- else
+ ib_cq_pool_put(cq, max_send_wr + max_recv_wr);
+ } else {
smc_wr_remember_qp_attr(lnk);
+ lnk->nr_cqe = max_send_wr + max_recv_wr;
+ lnk->ib_cq = cq;
+ }
return rc;
}
@@ -855,62 +865,6 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk,
buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
}
-long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
-{
- struct ib_cq_init_attr cqattr = {
- .cqe = SMC_MAX_CQE, .comp_vector = 0 };
- int cqe_size_order, smc_order;
- long rc;
-
- mutex_lock(&smcibdev->mutex);
- rc = 0;
- if (smcibdev->initialized)
- goto out;
- /* the calculated number of cq entries fits to mlx5 cq allocation */
- cqe_size_order = cache_line_size() == 128 ? 7 : 6;
- smc_order = MAX_PAGE_ORDER - cqe_size_order;
- if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
- cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
- smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
- smc_wr_tx_cq_handler, NULL,
- smcibdev, &cqattr);
- rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
- if (IS_ERR(smcibdev->roce_cq_send)) {
- smcibdev->roce_cq_send = NULL;
- goto out;
- }
- smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
- smc_wr_rx_cq_handler, NULL,
- smcibdev, &cqattr);
- rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
- if (IS_ERR(smcibdev->roce_cq_recv)) {
- smcibdev->roce_cq_recv = NULL;
- goto err;
- }
- smc_wr_add_dev(smcibdev);
- smcibdev->initialized = 1;
- goto out;
-
-err:
- ib_destroy_cq(smcibdev->roce_cq_send);
-out:
- mutex_unlock(&smcibdev->mutex);
- return rc;
-}
-
-static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
-{
- mutex_lock(&smcibdev->mutex);
- if (!smcibdev->initialized)
- goto out;
- smcibdev->initialized = 0;
- ib_destroy_cq(smcibdev->roce_cq_recv);
- ib_destroy_cq(smcibdev->roce_cq_send);
- smc_wr_remove_dev(smcibdev);
-out:
- mutex_unlock(&smcibdev->mutex);
-}
-
static struct ib_client smc_ib_client;
static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
@@ -969,7 +923,6 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
atomic_set(&smcibdev->lnk_cnt, 0);
init_waitqueue_head(&smcibdev->lnks_deleted);
- mutex_init(&smcibdev->mutex);
mutex_lock(&smc_ib_devices.mutex);
list_add_tail(&smcibdev->list, &smc_ib_devices.list);
mutex_unlock(&smc_ib_devices.mutex);
@@ -1018,7 +971,6 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
pr_warn_ratelimited("smc: removing ib device %s\n",
smcibdev->ibdev->name);
smc_smcr_terminate_all(smcibdev);
- smc_ib_cleanup_per_ibdev(smcibdev);
ib_unregister_event_handler(&smcibdev->event_handler);
cancel_work_sync(&smcibdev->port_event_work);
kfree(smcibdev);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index ef8ac2b7546d..c5a0d773b73f 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -37,10 +37,6 @@ struct smc_ib_device { /* ib-device infos for smc */
struct ib_device *ibdev;
struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
struct ib_event_handler event_handler; /* global ib_event handler */
- struct ib_cq *roce_cq_send; /* send completion queue */
- struct ib_cq *roce_cq_recv; /* recv completion queue */
- struct tasklet_struct send_tasklet; /* called by send cq handler */
- struct tasklet_struct recv_tasklet; /* called by recv cq handler */
char mac[SMC_MAX_PORTS][ETH_ALEN];
/* mac address per port*/
u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
@@ -96,7 +92,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk);
int smc_ib_create_queue_pair(struct smc_link *lnk);
int smc_ib_ready_link(struct smc_link *lnk);
int smc_ib_modify_qp_rts(struct smc_link *lnk);
-int smc_ib_modify_qp_error(struct smc_link *lnk);
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
struct smc_buf_desc *buf_slot, u8 link_idx);
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 3144b4b1fe29..d301df9ed58b 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -321,7 +321,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
struct smc_link *link = conn->lnk;
int rc;
- rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
rdma_wr->wr.num_sge = num_sges;
rdma_wr->remote_addr =
lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 5feafa98ab1a..3a361aa020ab 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -38,7 +38,7 @@ static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
struct smc_wr_tx_pend { /* control data for a pending send request */
- u64 wr_id; /* work request id sent */
+ struct ib_cqe cqe;
smc_wr_tx_handler handler;
enum ib_wc_status wc_status; /* CQE status */
struct smc_link *link;
@@ -63,62 +63,51 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
}
-static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
+static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
{
- u32 i;
+ struct smc_link *link = wc->qp->qp_context;
- for (i = 0; i < link->wr_tx_cnt; i++) {
- if (link->wr_tx_pends[i].wr_id == wr_id)
- return i;
- }
- return link->wr_tx_cnt;
+ /* terminate link */
+ if (unlikely(wc->status))
+ smcr_link_down_cond_sched(link);
+}
+
+static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct smc_link *link = wc->qp->qp_context;
+
+ if (wc->status)
+ link->wr_reg_state = FAILED;
+ else
+ link->wr_reg_state = CONFIRMED;
+ smc_wr_wakeup_reg_wait(link);
}
-static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
+static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smc_wr_tx_pend pnd_snd;
+ struct smc_wr_tx_pend *tx_pend, pnd_snd;
struct smc_link *link;
u32 pnd_snd_idx;
link = wc->qp->qp_context;
- if (wc->opcode == IB_WC_REG_MR) {
- if (wc->status)
- link->wr_reg_state = FAILED;
- else
- link->wr_reg_state = CONFIRMED;
- smc_wr_wakeup_reg_wait(link);
- return;
- }
+ tx_pend = container_of(wc->wr_cqe, struct smc_wr_tx_pend, cqe);
+ pnd_snd_idx = tx_pend->idx;
+
+ tx_pend->wc_status = wc->status;
+ memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
- pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
if (pnd_snd_idx == link->wr_tx_cnt) {
- if (link->lgr->smc_version != SMC_V2 ||
- link->wr_tx_v2_pend->wr_id != wc->wr_id)
- return;
- link->wr_tx_v2_pend->wc_status = wc->status;
- memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
- /* clear the full struct smc_wr_tx_pend including .priv */
- memset(link->wr_tx_v2_pend, 0,
- sizeof(*link->wr_tx_v2_pend));
- memset(link->lgr->wr_tx_buf_v2, 0,
- sizeof(*link->lgr->wr_tx_buf_v2));
+ memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2));
} else {
- link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
- if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+ if (tx_pend->compl_requested)
complete(&link->wr_tx_compl[pnd_snd_idx]);
- memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
- sizeof(pnd_snd));
- /* clear the full struct smc_wr_tx_pend including .priv */
- memset(&link->wr_tx_pends[pnd_snd_idx], 0,
- sizeof(link->wr_tx_pends[pnd_snd_idx]));
- memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
- sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+ memset(&link->wr_tx_bufs[tx_pend->idx], 0, sizeof(link->wr_tx_bufs[tx_pend->idx]));
if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
return;
}
- if (wc->status) {
+ if (unlikely(wc->status)) {
if (link->lgr->smc_version == SMC_V2) {
memset(link->wr_tx_v2_pend, 0,
sizeof(*link->wr_tx_v2_pend));
@@ -128,44 +117,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
/* terminate link */
smcr_link_down_cond_sched(link);
}
+
if (pnd_snd.handler)
pnd_snd.handler(&pnd_snd.priv, link, wc->status);
wake_up(&link->wr_tx_wait);
}
-static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
-{
- struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
- struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
- int i = 0, rc;
- int polled = 0;
-
-again:
- polled++;
- do {
- memset(&wc, 0, sizeof(wc));
- rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
- if (polled == 1) {
- ib_req_notify_cq(dev->roce_cq_send,
- IB_CQ_NEXT_COMP |
- IB_CQ_REPORT_MISSED_EVENTS);
- }
- if (!rc)
- break;
- for (i = 0; i < rc; i++)
- smc_wr_tx_process_cqe(&wc[i]);
- } while (rc > 0);
- if (polled == 1)
- goto again;
-}
-
-void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
-{
- struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
-
- tasklet_schedule(&dev->send_tasklet);
-}
-
/*---------------------------- request submission ---------------------------*/
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
@@ -202,7 +159,6 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
struct smc_wr_tx_pend *wr_pend;
u32 idx = link->wr_tx_cnt;
struct ib_send_wr *wr_ib;
- u64 wr_id;
int rc;
*wr_buf = NULL;
@@ -226,14 +182,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
if (idx == link->wr_tx_cnt)
return -EPIPE;
}
- wr_id = smc_wr_tx_get_next_wr_id(link);
+
wr_pend = &link->wr_tx_pends[idx];
- wr_pend->wr_id = wr_id;
wr_pend->handler = handler;
wr_pend->link = link;
wr_pend->idx = idx;
wr_ib = &link->wr_tx_ibs[idx];
- wr_ib->wr_id = wr_id;
+ wr_ib->wr_cqe = &wr_pend->cqe;
*wr_buf = &link->wr_tx_bufs[idx];
if (wr_rdma_buf)
*wr_rdma_buf = &link->wr_tx_rdmas[idx];
@@ -248,21 +203,18 @@ int smc_wr_tx_get_v2_slot(struct smc_link *link,
{
struct smc_wr_tx_pend *wr_pend;
struct ib_send_wr *wr_ib;
- u64 wr_id;
if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
return -EBUSY;
*wr_buf = NULL;
*wr_pend_priv = NULL;
- wr_id = smc_wr_tx_get_next_wr_id(link);
wr_pend = link->wr_tx_v2_pend;
- wr_pend->wr_id = wr_id;
wr_pend->handler = handler;
wr_pend->link = link;
wr_pend->idx = link->wr_tx_cnt;
wr_ib = link->wr_tx_v2_ib;
- wr_ib->wr_id = wr_id;
+ wr_ib->wr_cqe = &wr_pend->cqe;
*wr_buf = link->lgr->wr_tx_buf_v2;
*wr_pend_priv = &wr_pend->priv;
return 0;
@@ -306,8 +258,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
struct smc_wr_tx_pend *pend;
int rc;
- ib_req_notify_cq(link->smcibdev->roce_cq_send,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
pend = container_of(priv, struct smc_wr_tx_pend, priv);
rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
if (rc) {
@@ -323,8 +273,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
int rc;
link->wr_tx_v2_ib->sg_list[0].length = len;
- ib_req_notify_cq(link->smcibdev->roce_cq_send,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
if (rc) {
smc_wr_tx_put_slot(link, priv);
@@ -367,10 +315,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
{
int rc;
- ib_req_notify_cq(link->smcibdev->roce_cq_send,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
link->wr_reg_state = POSTED;
- link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
link->wr_reg.mr = mr;
link->wr_reg.key = mr->rkey;
rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
@@ -431,94 +376,76 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
{
struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
struct smc_wr_rx_handler *handler;
+ struct smc_ib_recv_wr *recv_wr;
struct smc_wr_rx_hdr *wr_rx;
- u64 temp_wr_id;
- u32 index;
if (wc->byte_len < sizeof(*wr_rx))
return; /* short message */
- temp_wr_id = wc->wr_id;
- index = do_div(temp_wr_id, link->wr_rx_cnt);
- wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen);
+
+ recv_wr = container_of(wc->wr_cqe, struct smc_ib_recv_wr, cqe);
+
+ wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + recv_wr->index * link->wr_rx_buflen);
hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
if (handler->type == wr_rx->type)
handler->handler(wc, wr_rx);
}
}
-static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
+static void smc_wr_rx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
{
- struct smc_link *link;
- int i;
-
- for (i = 0; i < num; i++) {
- link = wc[i].qp->qp_context;
- link->wr_rx_id_compl = wc[i].wr_id;
- if (wc[i].status == IB_WC_SUCCESS) {
- link->wr_rx_tstamp = jiffies;
- smc_wr_rx_demultiplex(&wc[i]);
- smc_wr_rx_post(link); /* refill WR RX */
- } else {
- /* handle status errors */
- switch (wc[i].status) {
- case IB_WC_RETRY_EXC_ERR:
- case IB_WC_RNR_RETRY_EXC_ERR:
- case IB_WC_WR_FLUSH_ERR:
- smcr_link_down_cond_sched(link);
- if (link->wr_rx_id_compl == link->wr_rx_id)
- wake_up(&link->wr_rx_empty_wait);
- break;
- default:
- smc_wr_rx_post(link); /* refill WR RX */
- break;
- }
+ struct smc_link *link = wc->qp->qp_context;
+
+ if (wc->status == IB_WC_SUCCESS) {
+ link->wr_rx_tstamp = jiffies;
+ smc_wr_rx_demultiplex(wc);
+ smc_wr_rx_post(link, wc->wr_cqe); /* refill WR RX */
+ } else {
+ /* handle status errors */
+ switch (wc->status) {
+ case IB_WC_RETRY_EXC_ERR:
+ case IB_WC_RNR_RETRY_EXC_ERR:
+ case IB_WC_WR_FLUSH_ERR:
+ smcr_link_down_cond_sched(link);
+ break;
+ default:
+ smc_wr_rx_post(link, wc->wr_cqe); /* refill WR RX */
+ break;
}
}
}
-static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
+int smc_wr_rx_post_init(struct smc_link *link)
{
- struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
- struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
- int polled = 0;
- int rc;
+ u32 i;
+ int rc = 0;
-again:
- polled++;
- do {
- memset(&wc, 0, sizeof(wc));
- rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
- if (polled == 1) {
- ib_req_notify_cq(dev->roce_cq_recv,
- IB_CQ_SOLICITED_MASK
- | IB_CQ_REPORT_MISSED_EVENTS);
- }
- if (!rc)
- break;
- smc_wr_rx_process_cqes(&wc[0], rc);
- } while (rc > 0);
- if (polled == 1)
- goto again;
+ for (i = 0; i < link->wr_rx_cnt; i++)
+ rc = smc_wr_rx_post(link, &link->wr_rx_ibs[i].cqe);
+ return rc;
}
-void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
-{
- struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
+/***************************** init, exit, misc ******************************/
+
- tasklet_schedule(&dev->recv_tasklet);
+static inline void smc_wr_reg_init_cqe(struct ib_cqe *cqe)
+{
+ cqe->done = smc_wr_reg_process_cqe;
}
-int smc_wr_rx_post_init(struct smc_link *link)
+static inline void smc_wr_tx_init_cqe(struct ib_cqe *cqe)
{
- u32 i;
- int rc = 0;
+ cqe->done = smc_wr_tx_process_cqe;
+}
- for (i = 0; i < link->wr_rx_cnt; i++)
- rc = smc_wr_rx_post(link);
- return rc;
+static inline void smc_wr_rx_init_cqe(struct ib_cqe *cqe)
+{
+ cqe->done = smc_wr_rx_process_cqe;
}
-/***************************** init, exit, misc ******************************/
+static inline void smc_wr_tx_rdma_init_cqe(struct ib_cqe *cqe)
+{
+ cqe->done = smc_wr_tx_rdma_process_cqe;
+}
void smc_wr_remember_qp_attr(struct smc_link *lnk)
{
@@ -548,9 +475,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
&init_attr);
lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr,
- lnk->qp_attr.cap.max_send_wr);
+ lnk->qp_attr.cap.max_send_wr - 1);
lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr,
- lnk->qp_attr.cap.max_recv_wr);
+ lnk->qp_attr.cap.max_recv_wr - 1);
}
static void smc_wr_init_sge(struct smc_link *lnk)
@@ -585,6 +512,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.wr_cqe = &lnk->tx_rdma_cqe;
+ lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.wr_cqe = &lnk->tx_rdma_cqe;
}
if (lnk->lgr->smc_version == SMC_V2) {
@@ -622,10 +551,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
lnk->wr_rx_sges[x + 1].lkey =
lnk->roce_pd->local_dma_lkey;
}
- lnk->wr_rx_ibs[i].next = NULL;
- lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
- lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt;
+ lnk->wr_rx_ibs[i].wr.next = NULL;
+ lnk->wr_rx_ibs[i].wr.sg_list = &lnk->wr_rx_sges[x];
+ lnk->wr_rx_ibs[i].wr.num_sge = lnk->wr_rx_sge_cnt;
}
+
+ smc_wr_reg_init_cqe(&lnk->wr_reg_cqe);
+ lnk->wr_reg.wr.wr_cqe = &lnk->wr_reg_cqe;
lnk->wr_reg.wr.next = NULL;
lnk->wr_reg.wr.num_sge = 0;
lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
@@ -641,7 +573,6 @@ void smc_wr_free_link(struct smc_link *lnk)
return;
ibdev = lnk->smcibdev->ibdev;
- smc_wr_drain_cq(lnk);
smc_wr_wakeup_reg_wait(lnk);
smc_wr_wakeup_tx_wait(lnk);
@@ -758,11 +689,19 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
GFP_KERNEL);
if (!link->wr_rx_ibs)
goto no_mem_wr_tx_ibs;
+ /* init wr_rx_ibs cqe */
+ for (int i = 0; i < link->max_recv_wr; i++) {
+ smc_wr_rx_init_cqe(&link->wr_rx_ibs[i].cqe);
+ link->wr_rx_ibs[i].wr.wr_cqe = &link->wr_rx_ibs[i].cqe;
+ link->wr_rx_ibs[i].index = i;
+ }
link->wr_tx_rdmas = kcalloc(link->max_send_wr,
sizeof(link->wr_tx_rdmas[0]),
GFP_KERNEL);
if (!link->wr_tx_rdmas)
goto no_mem_wr_rx_ibs;
+
+ smc_wr_tx_rdma_init_cqe(&link->tx_rdma_cqe);
link->wr_tx_rdma_sges = kcalloc(link->max_send_wr,
sizeof(link->wr_tx_rdma_sges[0]),
GFP_KERNEL);
@@ -785,6 +724,8 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
GFP_KERNEL);
if (!link->wr_tx_pends)
goto no_mem_wr_tx_mask;
+ for (int i = 0; i < link->max_send_wr; i++)
+ smc_wr_tx_init_cqe(&link->wr_tx_pends[i].cqe);
link->wr_tx_compl = kcalloc(link->max_send_wr,
sizeof(link->wr_tx_compl[0]),
GFP_KERNEL);
@@ -804,6 +745,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
GFP_KERNEL);
if (!link->wr_tx_v2_pend)
goto no_mem_v2_sge;
+ smc_wr_tx_init_cqe(&link->wr_tx_v2_pend->cqe);
}
return 0;
@@ -837,18 +779,6 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
return -ENOMEM;
}
-void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
-{
- tasklet_kill(&smcibdev->recv_tasklet);
- tasklet_kill(&smcibdev->send_tasklet);
-}
-
-void smc_wr_add_dev(struct smc_ib_device *smcibdev)
-{
- tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
- tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
-}
-
static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
{
struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);
@@ -868,8 +798,6 @@ int smc_wr_create_link(struct smc_link *lnk)
struct ib_device *ibdev = lnk->smcibdev->ibdev;
int rc = 0;
- smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
- lnk->wr_rx_id = 0;
lnk->wr_rx_dma_addr = ib_dma_map_single(
ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
@@ -917,7 +845,6 @@ int smc_wr_create_link(struct smc_link *lnk)
if (rc)
goto cancel_ref;
init_completion(&lnk->reg_ref_comp);
- init_waitqueue_head(&lnk->wr_rx_empty_wait);
return rc;
cancel_ref:
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index aa4533af9122..4268dfcd84d3 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -44,19 +44,6 @@ struct smc_wr_rx_handler {
u8 type;
};
-/* Only used by RDMA write WRs.
- * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
- */
-static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
-{
- return atomic_long_inc_return(&link->wr_tx_id);
-}
-
-static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
-{
- atomic_long_set(wr_tx_id, val);
-}
-
static inline bool smc_wr_tx_link_hold(struct smc_link *link)
{
if (!smc_link_sendable(link))
@@ -70,9 +57,10 @@ static inline void smc_wr_tx_link_put(struct smc_link *link)
percpu_ref_put(&link->wr_tx_refs);
}
-static inline void smc_wr_drain_cq(struct smc_link *lnk)
+static inline void smc_wr_drain_qp(struct smc_link *lnk)
{
- wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
+ if (lnk->qp_attr.cur_qp_state != IB_QPS_RESET)
+ ib_drain_qp(lnk->roce_qp);
}
static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
@@ -86,18 +74,12 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
}
/* post a new receive work request to fill a completed old work request entry */
-static inline int smc_wr_rx_post(struct smc_link *link)
+static inline int smc_wr_rx_post(struct smc_link *link, struct ib_cqe *cqe)
{
- int rc;
- u64 wr_id, temp_wr_id;
- u32 index;
-
- wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
- temp_wr_id = wr_id;
- index = do_div(temp_wr_id, link->wr_rx_cnt);
- link->wr_rx_ibs[index].wr_id = wr_id;
- rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
- return rc;
+ struct smc_ib_recv_wr *recv_wr;
+
+ recv_wr = container_of(cqe, struct smc_ib_recv_wr, cqe);
+ return ib_post_recv(link->roce_qp, &recv_wr->wr, NULL);
}
int smc_wr_create_link(struct smc_link *lnk);
@@ -107,8 +89,6 @@ void smc_wr_free_link(struct smc_link *lnk);
void smc_wr_free_link_mem(struct smc_link *lnk);
void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
void smc_wr_remember_qp_attr(struct smc_link *lnk);
-void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
-void smc_wr_add_dev(struct smc_ib_device *smcibdev);
int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
struct smc_wr_buf **wr_buf,
@@ -126,12 +106,10 @@ int smc_wr_tx_v2_send(struct smc_link *link,
struct smc_wr_tx_pend_priv *priv, int len);
int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
unsigned long timeout);
-void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
int smc_wr_rx_post_init(struct smc_link *link);
-void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
#endif /* SMC_WR_H */
--
2.45.0
On 02/02/26 3:18 pm, D. Wythe wrote:
> The current SMC-R implementation relies on global per-device CQs
> and manual polling within tasklets, which introduces severe
> scalability bottlenecks due to global lock contention and tasklet
> scheduling overhead, resulting in poor performance as concurrency
> increases.
>
> Refactor the completion handling to utilize the ib_cqe API and
> standard RDMA core CQ pooling. This transition provides several key
> advantages:
>
> 1. Multi-CQ: Shift from a single shared per-device CQ to multiple
> link-specific CQs via the CQ pool. This allows completion processing
> to be parallelized across multiple CPU cores, effectively eliminating
> the global CQ bottleneck.
>
> 2. Leverage DIM: Utilizing the standard CQ pool with IB_POLL_SOFTIRQ
> enables Dynamic Interrupt Moderation from the RDMA core, optimizing
> interrupt frequency and reducing CPU load under high pressure.
>
> 3. O(1) Context Retrieval: Replaces the expensive wr_id based lookup
> logic (e.g., smc_wr_tx_find_pending_index) with direct context retrieval
> using container_of() on the embedded ib_cqe.
>
> 4. Code Simplification: This refactoring results in a reduction of
> ~150 lines of code. It removes redundant sequence tracking, complex lookup
> helpers, and manual CQ management, significantly improving maintainability.
>
> Performance Test: redis-benchmark with max 32 connections per QP
> Data format: Requests Per Second (RPS), Percentage in brackets
> represents the gain/loss compared to TCP.
>
> | Clients | TCP      | SMC (original)      | SMC (cq_pool)       |
> |---------|----------|---------------------|---------------------|
> | c = 1   | 24449    | 31172 (+27%)        | 34039 (+39%)        |
> | c = 2   | 46420    | 53216 (+14%)        | 64391 (+38%)        |
> | c = 16  | 159673   | 83668 (-48%)  <--   | 216947 (+36%)       |
> | c = 32  | 164956   | 97631 (-41%)  <--   | 249376 (+51%)       |
> | c = 64  | 166322   | 118192 (-29%) <--   | 249488 (+50%)       |
> | c = 128 | 167700   | 121497 (-27%) <--   | 249480 (+48%)       |
> | c = 256 | 175021   | 146109 (-16%) <--   | 240384 (+37%)       |
> | c = 512 | 168987   | 101479 (-40%) <--   | 226634 (+34%)       |
>
> The results demonstrate that this optimization effectively resolves the
> scalability bottleneck, with RPS increasing by over 110% at c=64
> compared to the original implementation.

I applied your patch to the latest kernel (6.19-rc8) and saw the
performance results below:

1) In my evaluation, I ran several *uperf*-based workloads using a
request/response (RR) pattern, and I observed performance *degradation*
ranging from *4%* to *59%*, depending on the specific read/write sizes
used. For example, with a TCP RR workload using 50 parallel clients
(nprocs=50) sending a 200-byte request and reading a 1000-byte response
over a 60-second run, I measured approximately 59% degradation compared
to the original SMC-R performance.

2) In contrast, with uperf *streaming-type* workloads, your patch shows
clear gains. I observed performance *improvement* ranging from *11%* to
*75%*, again depending on the specific streaming parameters. One
representative case is a TCP streaming/bulk-receive workload with 250
parallel clients (nprocs=250) performing 640 reads per burst with 30 KB
per read, running continuously for 60 seconds, where I measured
approximately *75%* improvement over the original SMC-R performance.

Note: I ran the above tests with the default WR (work request) buffers
and the default receive and transmit buffer sizes with smc_run.

I am looking for additional details regarding the redis-benchmark
performance results you previously shared. I would like to understand
whether the workload behaved more like a traditional request/response
(RR) pattern or a streaming-type workload, and what SMC-R configuration
was used during the tests?

1) SMC Work Request (WR) settings
   - Did your test environment use the default SMC-R work request buffers?
     net.smc.smcr_max_recv_wr = 48
     net.smc.smcr_max_send_wr = 16

2) SMC-R buffer sizes used via smc_run
   - Did you use the default transmit and receive buffer sizes
     (smc_run -r <recv_size> -t <send_size>)?

3) Additional system or network tuning, e.g. CPU affinity, NIC offload
   settings, etc.?
On Mon, Feb 02, 2026 at 05:48:00PM +0800, D. Wythe wrote:
> The current SMC-R implementation relies on global per-device CQs
> and manual polling within tasklets, which introduces severe
> scalability bottlenecks due to global lock contention and tasklet
> scheduling overhead, resulting in poor performance as concurrency
> increases.
>
> Refactor the completion handling to utilize the ib_cqe API and
> standard RDMA core CQ pooling. This transition provides several key
> advantages:
>
> 1. Multi-CQ: Shift from a single shared per-device CQ to multiple
> link-specific CQs via the CQ pool. This allows completion processing
> to be parallelized across multiple CPU cores, effectively eliminating
> the global CQ bottleneck.
>
> 2. Leverage DIM: Utilizing the standard CQ pool with IB_POLL_SOFTIRQ
> enables Dynamic Interrupt Moderation from the RDMA core, optimizing
> interrupt frequency and reducing CPU load under high pressure.
>
> 3. O(1) Context Retrieval: Replaces the expensive wr_id based lookup
> logic (e.g., smc_wr_tx_find_pending_index) with direct context retrieval
> using container_of() on the embedded ib_cqe.
>
> 4. Code Simplification: This refactoring results in a reduction of
> ~150 lines of code. It removes redundant sequence tracking, complex lookup
> helpers, and manual CQ management, significantly improving maintainability.
>
> Performance Test: redis-benchmark with max 32 connections per QP
> Data format: Requests Per Second (RPS), Percentage in brackets
> represents the gain/loss compared to TCP.
>
> | Clients | TCP | SMC (original) | SMC (cq_pool) |
> |---------|----------|---------------------|---------------------|
> | c = 1 | 24449 | 31172 (+27%) | 34039 (+39%) |
> | c = 2 | 46420 | 53216 (+14%) | 64391 (+38%) |
> | c = 16 | 159673 | 83668 (-48%) <-- | 216947 (+36%) |
> | c = 32 | 164956 | 97631 (-41%) <-- | 249376 (+51%) |
> | c = 64 | 166322 | 118192 (-29%) <-- | 249488 (+50%) |
> | c = 128 | 167700 | 121497 (-27%) <-- | 249480 (+48%) |
> | c = 256 | 175021 | 146109 (-16%) <-- | 240384 (+37%) |
> | c = 512 | 168987 | 101479 (-40%) <-- | 226634 (+34%) |
>
> The results demonstrate that this optimization effectively resolves the
> scalability bottleneck, with RPS increasing by over 110% at c=64
> compared to the original implementation.
Thanks for the conversion, it looks great. I just wonder, why did you
mark this as an RFC?
Thanks
>
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> ---
> net/smc/smc_core.c | 8 +-
> net/smc/smc_core.h | 16 ++-
> net/smc/smc_ib.c | 114 ++++++-------------
> net/smc/smc_ib.h | 5 -
> net/smc/smc_tx.c | 1 -
> net/smc/smc_wr.c | 267 ++++++++++++++++-----------------------------
> net/smc/smc_wr.h | 38 ++-----
> 7 files changed, 150 insertions(+), 299 deletions(-)
>
> diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
> index 8aca5dc54be7..9590c8aed3dd 100644
> --- a/net/smc/smc_core.c
> +++ b/net/smc/smc_core.c
> @@ -815,17 +815,11 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
> lnk->lgr = lgr;
> smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */
> lnk->link_idx = link_idx;
> - lnk->wr_rx_id_compl = 0;
> smc_ibdev_cnt_inc(lnk);
> smcr_copy_dev_info_to_link(lnk);
> atomic_set(&lnk->conn_cnt, 0);
> smc_llc_link_set_uid(lnk);
> INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
> - if (!lnk->smcibdev->initialized) {
> - rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
> - if (rc)
> - goto out;
> - }
> get_random_bytes(rndvec, sizeof(rndvec));
> lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
> (rndvec[2] << 16);
> @@ -1373,7 +1367,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
> smc_llc_link_clear(lnk, log);
> smcr_buf_unmap_lgr(lnk);
> smcr_rtoken_clear_link(lnk);
> - smc_ib_modify_qp_error(lnk);
> + smc_wr_drain_qp(lnk);
> smc_wr_free_link(lnk);
> smc_ib_destroy_queue_pair(lnk);
> smc_ib_dealloc_protection_domain(lnk);
> diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
> index 5c18f08a4c8a..00468b7a279f 100644
> --- a/net/smc/smc_core.h
> +++ b/net/smc/smc_core.h
> @@ -92,6 +92,12 @@ struct smc_rdma_wr { /* work requests per message
> struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES];
> };
>
> +struct smc_ib_recv_wr {
> + struct ib_cqe cqe;
> + struct ib_recv_wr wr;
> + int index;
> +};
> +
> #define SMC_LGR_ID_SIZE 4
>
> struct smc_link {
> @@ -100,6 +106,8 @@ struct smc_link {
> struct ib_pd *roce_pd; /* IB protection domain,
> * unique for every RoCE QP
> */
> + int nr_cqe;
> + struct ib_cq *ib_cq;
> struct ib_qp *roce_qp; /* IB queue pair */
> struct ib_qp_attr qp_attr; /* IB queue pair attributes */
>
> @@ -107,6 +115,7 @@ struct smc_link {
> struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
> struct ib_sge *wr_tx_sges; /* WR send gather meta data */
> struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/
> + struct ib_cqe tx_rdma_cqe; /* CQE RDMA WRITE */
> struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */
> struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
> struct completion *wr_tx_compl; /* WR send CQE completion */
> @@ -116,7 +125,6 @@ struct smc_link {
> struct smc_wr_tx_pend *wr_tx_v2_pend; /* WR send v2 waiting for CQE */
> dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
> dma_addr_t wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/
> - atomic_long_t wr_tx_id; /* seq # of last sent WR */
> unsigned long *wr_tx_mask; /* bit mask of used indexes */
> u32 wr_tx_cnt; /* number of WR send buffers */
> wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
> @@ -126,7 +134,7 @@ struct smc_link {
> struct completion tx_ref_comp;
>
> u8 *wr_rx_bufs; /* WR recv payload buffers */
> - struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
> + struct smc_ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
> struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
> /* above three vectors have wr_rx_cnt elements and use the same index */
> int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */
> @@ -135,13 +143,11 @@ struct smc_link {
> */
> dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
> dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
> - u64 wr_rx_id; /* seq # of last recv WR */
> - u64 wr_rx_id_compl; /* seq # of last completed WR */
> u32 wr_rx_cnt; /* number of WR recv buffers */
> unsigned long wr_rx_tstamp; /* jiffies when last buf rx */
> - wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */
>
> struct ib_reg_wr wr_reg; /* WR register memory region */
> + struct ib_cqe wr_reg_cqe;
> wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
> struct {
> struct percpu_ref wr_reg_refs;
> diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
> index 67211d44a1db..77047ad7d452 100644
> --- a/net/smc/smc_ib.c
> +++ b/net/smc/smc_ib.c
> @@ -112,15 +112,6 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
> IB_QP_MAX_QP_RD_ATOMIC);
> }
>
> -int smc_ib_modify_qp_error(struct smc_link *lnk)
> -{
> - struct ib_qp_attr qp_attr;
> -
> - memset(&qp_attr, 0, sizeof(qp_attr));
> - qp_attr.qp_state = IB_QPS_ERR;
> - return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
> -}
> -
> int smc_ib_ready_link(struct smc_link *lnk)
> {
> struct smc_link_group *lgr = smc_get_lgr(lnk);
> @@ -134,10 +125,7 @@ int smc_ib_ready_link(struct smc_link *lnk)
> if (rc)
> goto out;
> smc_wr_remember_qp_attr(lnk);
> - rc = ib_req_notify_cq(lnk->smcibdev->roce_cq_recv,
> - IB_CQ_SOLICITED_MASK);
> - if (rc)
> - goto out;
> +
> rc = smc_wr_rx_post_init(lnk);
> if (rc)
> goto out;
> @@ -658,38 +646,60 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
> if (lnk->roce_qp)
> ib_destroy_qp(lnk->roce_qp);
> lnk->roce_qp = NULL;
> + if (lnk->ib_cq) {
> + ib_cq_pool_put(lnk->ib_cq, lnk->nr_cqe);
> + lnk->ib_cq = NULL;
> + }
> }
>
> /* create a queue pair within the protection domain for a link */
> int smc_ib_create_queue_pair(struct smc_link *lnk)
> {
> + int max_send_wr, max_recv_wr, rc;
> + struct ib_cq *cq;
> +
> + /* include unsolicited rdma_writes as well,
> + * there are max. 2 RDMA_WRITE per 1 WR_SEND.
> + * +1 for ib_drain_qp()
> + */
> + max_send_wr = 3 * lnk->lgr->max_send_wr + 1;
> + max_recv_wr = lnk->lgr->max_recv_wr + 1;
> +
> + cq = ib_cq_pool_get(lnk->smcibdev->ibdev, max_send_wr + max_recv_wr, -1,
> + IB_POLL_SOFTIRQ);
> +
> + if (IS_ERR(cq)) {
> + rc = PTR_ERR(cq);
> + return rc;
> + }
> +
> struct ib_qp_init_attr qp_attr = {
> .event_handler = smc_ib_qp_event_handler,
> .qp_context = lnk,
> - .send_cq = lnk->smcibdev->roce_cq_send,
> - .recv_cq = lnk->smcibdev->roce_cq_recv,
> + .send_cq = cq,
> + .recv_cq = cq,
> .srq = NULL,
> .cap = {
> .max_send_sge = SMC_IB_MAX_SEND_SGE,
> .max_recv_sge = lnk->wr_rx_sge_cnt,
> + .max_send_wr = max_send_wr,
> + .max_recv_wr = max_recv_wr,
> .max_inline_data = 0,
> },
> .sq_sig_type = IB_SIGNAL_REQ_WR,
> .qp_type = IB_QPT_RC,
> };
> - int rc;
>
> - /* include unsolicited rdma_writes as well,
> - * there are max. 2 RDMA_WRITE per 1 WR_SEND
> - */
> - qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr;
> - qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr;
> lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
> rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
> - if (IS_ERR(lnk->roce_qp))
> + if (IS_ERR(lnk->roce_qp)) {
> lnk->roce_qp = NULL;
> - else
> + ib_cq_pool_put(cq, max_send_wr + max_recv_wr);
> + } else {
> smc_wr_remember_qp_attr(lnk);
> + lnk->nr_cqe = max_send_wr + max_recv_wr;
> + lnk->ib_cq = cq;
> + }
> return rc;
> }
>
> @@ -855,62 +865,6 @@ void smc_ib_buf_unmap_sg(struct smc_link *lnk,
> buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0;
> }
>
> -long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev)
> -{
> - struct ib_cq_init_attr cqattr = {
> - .cqe = SMC_MAX_CQE, .comp_vector = 0 };
> - int cqe_size_order, smc_order;
> - long rc;
> -
> - mutex_lock(&smcibdev->mutex);
> - rc = 0;
> - if (smcibdev->initialized)
> - goto out;
> - /* the calculated number of cq entries fits to mlx5 cq allocation */
> - cqe_size_order = cache_line_size() == 128 ? 7 : 6;
> - smc_order = MAX_PAGE_ORDER - cqe_size_order;
> - if (SMC_MAX_CQE + 2 > (0x00000001 << smc_order) * PAGE_SIZE)
> - cqattr.cqe = (0x00000001 << smc_order) * PAGE_SIZE - 2;
> - smcibdev->roce_cq_send = ib_create_cq(smcibdev->ibdev,
> - smc_wr_tx_cq_handler, NULL,
> - smcibdev, &cqattr);
> - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_send);
> - if (IS_ERR(smcibdev->roce_cq_send)) {
> - smcibdev->roce_cq_send = NULL;
> - goto out;
> - }
> - smcibdev->roce_cq_recv = ib_create_cq(smcibdev->ibdev,
> - smc_wr_rx_cq_handler, NULL,
> - smcibdev, &cqattr);
> - rc = PTR_ERR_OR_ZERO(smcibdev->roce_cq_recv);
> - if (IS_ERR(smcibdev->roce_cq_recv)) {
> - smcibdev->roce_cq_recv = NULL;
> - goto err;
> - }
> - smc_wr_add_dev(smcibdev);
> - smcibdev->initialized = 1;
> - goto out;
> -
> -err:
> - ib_destroy_cq(smcibdev->roce_cq_send);
> -out:
> - mutex_unlock(&smcibdev->mutex);
> - return rc;
> -}
> -
> -static void smc_ib_cleanup_per_ibdev(struct smc_ib_device *smcibdev)
> -{
> - mutex_lock(&smcibdev->mutex);
> - if (!smcibdev->initialized)
> - goto out;
> - smcibdev->initialized = 0;
> - ib_destroy_cq(smcibdev->roce_cq_recv);
> - ib_destroy_cq(smcibdev->roce_cq_send);
> - smc_wr_remove_dev(smcibdev);
> -out:
> - mutex_unlock(&smcibdev->mutex);
> -}
> -
> static struct ib_client smc_ib_client;
>
> static void smc_copy_netdev_ifindex(struct smc_ib_device *smcibdev, int port)
> @@ -969,7 +923,6 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
> INIT_WORK(&smcibdev->port_event_work, smc_ib_port_event_work);
> atomic_set(&smcibdev->lnk_cnt, 0);
> init_waitqueue_head(&smcibdev->lnks_deleted);
> - mutex_init(&smcibdev->mutex);
> mutex_lock(&smc_ib_devices.mutex);
> list_add_tail(&smcibdev->list, &smc_ib_devices.list);
> mutex_unlock(&smc_ib_devices.mutex);
> @@ -1018,7 +971,6 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data)
> pr_warn_ratelimited("smc: removing ib device %s\n",
> smcibdev->ibdev->name);
> smc_smcr_terminate_all(smcibdev);
> - smc_ib_cleanup_per_ibdev(smcibdev);
> ib_unregister_event_handler(&smcibdev->event_handler);
> cancel_work_sync(&smcibdev->port_event_work);
> kfree(smcibdev);
> diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
> index ef8ac2b7546d..c5a0d773b73f 100644
> --- a/net/smc/smc_ib.h
> +++ b/net/smc/smc_ib.h
> @@ -37,10 +37,6 @@ struct smc_ib_device { /* ib-device infos for smc */
> struct ib_device *ibdev;
> struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
> struct ib_event_handler event_handler; /* global ib_event handler */
> - struct ib_cq *roce_cq_send; /* send completion queue */
> - struct ib_cq *roce_cq_recv; /* recv completion queue */
> - struct tasklet_struct send_tasklet; /* called by send cq handler */
> - struct tasklet_struct recv_tasklet; /* called by recv cq handler */
> char mac[SMC_MAX_PORTS][ETH_ALEN];
> /* mac address per port*/
> u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
> @@ -96,7 +92,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk);
> int smc_ib_create_queue_pair(struct smc_link *lnk);
> int smc_ib_ready_link(struct smc_link *lnk);
> int smc_ib_modify_qp_rts(struct smc_link *lnk);
> -int smc_ib_modify_qp_error(struct smc_link *lnk);
> long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
> int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
> struct smc_buf_desc *buf_slot, u8 link_idx);
> diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
> index 3144b4b1fe29..d301df9ed58b 100644
> --- a/net/smc/smc_tx.c
> +++ b/net/smc/smc_tx.c
> @@ -321,7 +321,6 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset,
> struct smc_link *link = conn->lnk;
> int rc;
>
> - rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link);
> rdma_wr->wr.num_sge = num_sges;
> rdma_wr->remote_addr =
> lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr +
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 5feafa98ab1a..3a361aa020ab 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
> @@ -38,7 +38,7 @@ static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS);
> static DEFINE_SPINLOCK(smc_wr_rx_hash_lock);
>
> struct smc_wr_tx_pend { /* control data for a pending send request */
> - u64 wr_id; /* work request id sent */
> + struct ib_cqe cqe;
> smc_wr_tx_handler handler;
> enum ib_wc_status wc_status; /* CQE status */
> struct smc_link *link;
> @@ -63,62 +63,51 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
> wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
> }
>
> -static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
> +static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> {
> - u32 i;
> + struct smc_link *link = wc->qp->qp_context;
>
> - for (i = 0; i < link->wr_tx_cnt; i++) {
> - if (link->wr_tx_pends[i].wr_id == wr_id)
> - return i;
> - }
> - return link->wr_tx_cnt;
> + /* terminate link */
> + if (unlikely(wc->status))
> + smcr_link_down_cond_sched(link);
> +}
> +
> +static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> +{
> + struct smc_link *link = wc->qp->qp_context;
> +
> + if (wc->status)
> + link->wr_reg_state = FAILED;
> + else
> + link->wr_reg_state = CONFIRMED;
> + smc_wr_wakeup_reg_wait(link);
> }
>
> -static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
> +static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> {
> - struct smc_wr_tx_pend pnd_snd;
> + struct smc_wr_tx_pend *tx_pend, pnd_snd;
> struct smc_link *link;
> u32 pnd_snd_idx;
>
> link = wc->qp->qp_context;
>
> - if (wc->opcode == IB_WC_REG_MR) {
> - if (wc->status)
> - link->wr_reg_state = FAILED;
> - else
> - link->wr_reg_state = CONFIRMED;
> - smc_wr_wakeup_reg_wait(link);
> - return;
> - }
> + tx_pend = container_of(wc->wr_cqe, struct smc_wr_tx_pend, cqe);
> + pnd_snd_idx = tx_pend->idx;
> +
> + tx_pend->wc_status = wc->status;
> + memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
>
> - pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
> if (pnd_snd_idx == link->wr_tx_cnt) {
> - if (link->lgr->smc_version != SMC_V2 ||
> - link->wr_tx_v2_pend->wr_id != wc->wr_id)
> - return;
> - link->wr_tx_v2_pend->wc_status = wc->status;
> - memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
> - /* clear the full struct smc_wr_tx_pend including .priv */
> - memset(link->wr_tx_v2_pend, 0,
> - sizeof(*link->wr_tx_v2_pend));
> - memset(link->lgr->wr_tx_buf_v2, 0,
> - sizeof(*link->lgr->wr_tx_buf_v2));
> + memset(link->lgr->wr_tx_buf_v2, 0, sizeof(*link->lgr->wr_tx_buf_v2));
> } else {
> - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
> - if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
> + if (tx_pend->compl_requested)
> complete(&link->wr_tx_compl[pnd_snd_idx]);
> - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
> - sizeof(pnd_snd));
> - /* clear the full struct smc_wr_tx_pend including .priv */
> - memset(&link->wr_tx_pends[pnd_snd_idx], 0,
> - sizeof(link->wr_tx_pends[pnd_snd_idx]));
> - memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
> - sizeof(link->wr_tx_bufs[pnd_snd_idx]));
> + memset(&link->wr_tx_bufs[tx_pend->idx], 0, sizeof(link->wr_tx_bufs[tx_pend->idx]));
> if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
> return;
> }
>
> - if (wc->status) {
> + if (unlikely(wc->status)) {
> if (link->lgr->smc_version == SMC_V2) {
> memset(link->wr_tx_v2_pend, 0,
> sizeof(*link->wr_tx_v2_pend));
> @@ -128,44 +117,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
> /* terminate link */
> smcr_link_down_cond_sched(link);
> }
> +
> if (pnd_snd.handler)
> pnd_snd.handler(&pnd_snd.priv, link, wc->status);
> wake_up(&link->wr_tx_wait);
> }
>
> -static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
> -{
> - struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
> - struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
> - int i = 0, rc;
> - int polled = 0;
> -
> -again:
> - polled++;
> - do {
> - memset(&wc, 0, sizeof(wc));
> - rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
> - if (polled == 1) {
> - ib_req_notify_cq(dev->roce_cq_send,
> - IB_CQ_NEXT_COMP |
> - IB_CQ_REPORT_MISSED_EVENTS);
> - }
> - if (!rc)
> - break;
> - for (i = 0; i < rc; i++)
> - smc_wr_tx_process_cqe(&wc[i]);
> - } while (rc > 0);
> - if (polled == 1)
> - goto again;
> -}
> -
> -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
> -{
> - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
> -
> - tasklet_schedule(&dev->send_tasklet);
> -}
> -
> /*---------------------------- request submission ---------------------------*/
>
> static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
> @@ -202,7 +159,6 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
> struct smc_wr_tx_pend *wr_pend;
> u32 idx = link->wr_tx_cnt;
> struct ib_send_wr *wr_ib;
> - u64 wr_id;
> int rc;
>
> *wr_buf = NULL;
> @@ -226,14 +182,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
> if (idx == link->wr_tx_cnt)
> return -EPIPE;
> }
> - wr_id = smc_wr_tx_get_next_wr_id(link);
> +
> wr_pend = &link->wr_tx_pends[idx];
> - wr_pend->wr_id = wr_id;
> wr_pend->handler = handler;
> wr_pend->link = link;
> wr_pend->idx = idx;
> wr_ib = &link->wr_tx_ibs[idx];
> - wr_ib->wr_id = wr_id;
> + wr_ib->wr_cqe = &wr_pend->cqe;
> *wr_buf = &link->wr_tx_bufs[idx];
> if (wr_rdma_buf)
> *wr_rdma_buf = &link->wr_tx_rdmas[idx];
> @@ -248,21 +203,18 @@ int smc_wr_tx_get_v2_slot(struct smc_link *link,
> {
> struct smc_wr_tx_pend *wr_pend;
> struct ib_send_wr *wr_ib;
> - u64 wr_id;
>
> if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
> return -EBUSY;
>
> *wr_buf = NULL;
> *wr_pend_priv = NULL;
> - wr_id = smc_wr_tx_get_next_wr_id(link);
> wr_pend = link->wr_tx_v2_pend;
> - wr_pend->wr_id = wr_id;
> wr_pend->handler = handler;
> wr_pend->link = link;
> wr_pend->idx = link->wr_tx_cnt;
> wr_ib = link->wr_tx_v2_ib;
> - wr_ib->wr_id = wr_id;
> + wr_ib->wr_cqe = &wr_pend->cqe;
> *wr_buf = link->lgr->wr_tx_buf_v2;
> *wr_pend_priv = &wr_pend->priv;
> return 0;
> @@ -306,8 +258,6 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
> struct smc_wr_tx_pend *pend;
> int rc;
>
> - ib_req_notify_cq(link->smcibdev->roce_cq_send,
> - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
> pend = container_of(priv, struct smc_wr_tx_pend, priv);
> rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL);
> if (rc) {
> @@ -323,8 +273,6 @@ int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
> int rc;
>
> link->wr_tx_v2_ib->sg_list[0].length = len;
> - ib_req_notify_cq(link->smcibdev->roce_cq_send,
> - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
> rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
> if (rc) {
> smc_wr_tx_put_slot(link, priv);
> @@ -367,10 +315,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
> {
> int rc;
>
> - ib_req_notify_cq(link->smcibdev->roce_cq_send,
> - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
> link->wr_reg_state = POSTED;
> - link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr;
> link->wr_reg.mr = mr;
> link->wr_reg.key = mr->rkey;
> rc = ib_post_send(link->roce_qp, &link->wr_reg.wr, NULL);
> @@ -431,94 +376,76 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
> {
> struct smc_link *link = (struct smc_link *)wc->qp->qp_context;
> struct smc_wr_rx_handler *handler;
> + struct smc_ib_recv_wr *recv_wr;
> struct smc_wr_rx_hdr *wr_rx;
> - u64 temp_wr_id;
> - u32 index;
>
> if (wc->byte_len < sizeof(*wr_rx))
> return; /* short message */
> - temp_wr_id = wc->wr_id;
> - index = do_div(temp_wr_id, link->wr_rx_cnt);
> - wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen);
> +
> + recv_wr = container_of(wc->wr_cqe, struct smc_ib_recv_wr, cqe);
> +
> + wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + recv_wr->index * link->wr_rx_buflen);
> hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
> if (handler->type == wr_rx->type)
> handler->handler(wc, wr_rx);
> }
> }
>
> -static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
> +static void smc_wr_rx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> {
> - struct smc_link *link;
> - int i;
> -
> - for (i = 0; i < num; i++) {
> - link = wc[i].qp->qp_context;
> - link->wr_rx_id_compl = wc[i].wr_id;
> - if (wc[i].status == IB_WC_SUCCESS) {
> - link->wr_rx_tstamp = jiffies;
> - smc_wr_rx_demultiplex(&wc[i]);
> - smc_wr_rx_post(link); /* refill WR RX */
> - } else {
> - /* handle status errors */
> - switch (wc[i].status) {
> - case IB_WC_RETRY_EXC_ERR:
> - case IB_WC_RNR_RETRY_EXC_ERR:
> - case IB_WC_WR_FLUSH_ERR:
> - smcr_link_down_cond_sched(link);
> - if (link->wr_rx_id_compl == link->wr_rx_id)
> - wake_up(&link->wr_rx_empty_wait);
> - break;
> - default:
> - smc_wr_rx_post(link); /* refill WR RX */
> - break;
> - }
> + struct smc_link *link = wc->qp->qp_context;
> +
> + if (wc->status == IB_WC_SUCCESS) {
> + link->wr_rx_tstamp = jiffies;
> + smc_wr_rx_demultiplex(wc);
> + smc_wr_rx_post(link, wc->wr_cqe); /* refill WR RX */
> + } else {
> + /* handle status errors */
> + switch (wc->status) {
> + case IB_WC_RETRY_EXC_ERR:
> + case IB_WC_RNR_RETRY_EXC_ERR:
> + case IB_WC_WR_FLUSH_ERR:
> + smcr_link_down_cond_sched(link);
> + break;
> + default:
> + smc_wr_rx_post(link, wc->wr_cqe); /* refill WR RX */
> + break;
> }
> }
> }
>
> -static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
> +int smc_wr_rx_post_init(struct smc_link *link)
> {
> - struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
> - struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
> - int polled = 0;
> - int rc;
> + u32 i;
> + int rc = 0;
>
> -again:
> - polled++;
> - do {
> - memset(&wc, 0, sizeof(wc));
> - rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
> - if (polled == 1) {
> - ib_req_notify_cq(dev->roce_cq_recv,
> - IB_CQ_SOLICITED_MASK
> - | IB_CQ_REPORT_MISSED_EVENTS);
> - }
> - if (!rc)
> - break;
> - smc_wr_rx_process_cqes(&wc[0], rc);
> - } while (rc > 0);
> - if (polled == 1)
> - goto again;
> + for (i = 0; i < link->wr_rx_cnt; i++)
> + rc = smc_wr_rx_post(link, &link->wr_rx_ibs[i].cqe);
> + return rc;
> }
>
> -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
> -{
> - struct smc_ib_device *dev = (struct smc_ib_device *)cq_context;
> +/***************************** init, exit, misc ******************************/
> +
>
> - tasklet_schedule(&dev->recv_tasklet);
> +static inline void smc_wr_reg_init_cqe(struct ib_cqe *cqe)
> +{
> + cqe->done = smc_wr_reg_process_cqe;
> }
>
> -int smc_wr_rx_post_init(struct smc_link *link)
> +static inline void smc_wr_tx_init_cqe(struct ib_cqe *cqe)
> {
> - u32 i;
> - int rc = 0;
> + cqe->done = smc_wr_tx_process_cqe;
> +}
>
> - for (i = 0; i < link->wr_rx_cnt; i++)
> - rc = smc_wr_rx_post(link);
> - return rc;
> +static inline void smc_wr_rx_init_cqe(struct ib_cqe *cqe)
> +{
> + cqe->done = smc_wr_rx_process_cqe;
> }
>
> -/***************************** init, exit, misc ******************************/
> +static inline void smc_wr_tx_rdma_init_cqe(struct ib_cqe *cqe)
> +{
> + cqe->done = smc_wr_tx_rdma_process_cqe;
> +}
>
> void smc_wr_remember_qp_attr(struct smc_link *lnk)
> {
> @@ -548,9 +475,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
> &init_attr);
>
> lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr,
> - lnk->qp_attr.cap.max_send_wr);
> + lnk->qp_attr.cap.max_send_wr - 1);
> lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr,
> - lnk->qp_attr.cap.max_recv_wr);
> + lnk->qp_attr.cap.max_recv_wr - 1);
> }
>
> static void smc_wr_init_sge(struct smc_link *lnk)
> @@ -585,6 +512,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge;
> lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
> lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
> + lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.wr_cqe = &lnk->tx_rdma_cqe;
> + lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.wr_cqe = &lnk->tx_rdma_cqe;
> }
>
> if (lnk->lgr->smc_version == SMC_V2) {
> @@ -622,10 +551,13 @@ static void smc_wr_init_sge(struct smc_link *lnk)
> lnk->wr_rx_sges[x + 1].lkey =
> lnk->roce_pd->local_dma_lkey;
> }
> - lnk->wr_rx_ibs[i].next = NULL;
> - lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
> - lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt;
> + lnk->wr_rx_ibs[i].wr.next = NULL;
> + lnk->wr_rx_ibs[i].wr.sg_list = &lnk->wr_rx_sges[x];
> + lnk->wr_rx_ibs[i].wr.num_sge = lnk->wr_rx_sge_cnt;
> }
> +
> + smc_wr_reg_init_cqe(&lnk->wr_reg_cqe);
> + lnk->wr_reg.wr.wr_cqe = &lnk->wr_reg_cqe;
> lnk->wr_reg.wr.next = NULL;
> lnk->wr_reg.wr.num_sge = 0;
> lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED;
> @@ -641,7 +573,6 @@ void smc_wr_free_link(struct smc_link *lnk)
> return;
> ibdev = lnk->smcibdev->ibdev;
>
> - smc_wr_drain_cq(lnk);
> smc_wr_wakeup_reg_wait(lnk);
> smc_wr_wakeup_tx_wait(lnk);
>
> @@ -758,11 +689,19 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
> GFP_KERNEL);
> if (!link->wr_rx_ibs)
> goto no_mem_wr_tx_ibs;
> + /* init wr_rx_ibs cqe */
> + for (int i = 0; i < link->max_recv_wr; i++) {
> + smc_wr_rx_init_cqe(&link->wr_rx_ibs[i].cqe);
> + link->wr_rx_ibs[i].wr.wr_cqe = &link->wr_rx_ibs[i].cqe;
> + link->wr_rx_ibs[i].index = i;
> + }
> link->wr_tx_rdmas = kcalloc(link->max_send_wr,
> sizeof(link->wr_tx_rdmas[0]),
> GFP_KERNEL);
> if (!link->wr_tx_rdmas)
> goto no_mem_wr_rx_ibs;
> +
> + smc_wr_tx_rdma_init_cqe(&link->tx_rdma_cqe);
> link->wr_tx_rdma_sges = kcalloc(link->max_send_wr,
> sizeof(link->wr_tx_rdma_sges[0]),
> GFP_KERNEL);
> @@ -785,6 +724,8 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
> GFP_KERNEL);
> if (!link->wr_tx_pends)
> goto no_mem_wr_tx_mask;
> + for (int i = 0; i < link->max_send_wr; i++)
> + smc_wr_tx_init_cqe(&link->wr_tx_pends[i].cqe);
> link->wr_tx_compl = kcalloc(link->max_send_wr,
> sizeof(link->wr_tx_compl[0]),
> GFP_KERNEL);
> @@ -804,6 +745,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
> GFP_KERNEL);
> if (!link->wr_tx_v2_pend)
> goto no_mem_v2_sge;
> + smc_wr_tx_init_cqe(&link->wr_tx_v2_pend->cqe);
> }
> return 0;
>
> @@ -837,18 +779,6 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
> return -ENOMEM;
> }
>
> -void smc_wr_remove_dev(struct smc_ib_device *smcibdev)
> -{
> - tasklet_kill(&smcibdev->recv_tasklet);
> - tasklet_kill(&smcibdev->send_tasklet);
> -}
> -
> -void smc_wr_add_dev(struct smc_ib_device *smcibdev)
> -{
> - tasklet_setup(&smcibdev->recv_tasklet, smc_wr_rx_tasklet_fn);
> - tasklet_setup(&smcibdev->send_tasklet, smc_wr_tx_tasklet_fn);
> -}
> -
> static void smcr_wr_tx_refs_free(struct percpu_ref *ref)
> {
> struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs);
> @@ -868,8 +798,6 @@ int smc_wr_create_link(struct smc_link *lnk)
> struct ib_device *ibdev = lnk->smcibdev->ibdev;
> int rc = 0;
>
> - smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
> - lnk->wr_rx_id = 0;
> lnk->wr_rx_dma_addr = ib_dma_map_single(
> ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt,
> DMA_FROM_DEVICE);
> @@ -917,7 +845,6 @@ int smc_wr_create_link(struct smc_link *lnk)
> if (rc)
> goto cancel_ref;
> init_completion(&lnk->reg_ref_comp);
> - init_waitqueue_head(&lnk->wr_rx_empty_wait);
> return rc;
>
> cancel_ref:
> diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
> index aa4533af9122..4268dfcd84d3 100644
> --- a/net/smc/smc_wr.h
> +++ b/net/smc/smc_wr.h
> @@ -44,19 +44,6 @@ struct smc_wr_rx_handler {
> u8 type;
> };
>
> -/* Only used by RDMA write WRs.
> - * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
> - */
> -static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
> -{
> - return atomic_long_inc_return(&link->wr_tx_id);
> -}
> -
> -static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
> -{
> - atomic_long_set(wr_tx_id, val);
> -}
> -
> static inline bool smc_wr_tx_link_hold(struct smc_link *link)
> {
> if (!smc_link_sendable(link))
> @@ -70,9 +57,10 @@ static inline void smc_wr_tx_link_put(struct smc_link *link)
> percpu_ref_put(&link->wr_tx_refs);
> }
>
> -static inline void smc_wr_drain_cq(struct smc_link *lnk)
> +static inline void smc_wr_drain_qp(struct smc_link *lnk)
> {
> - wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id);
> + if (lnk->qp_attr.cur_qp_state != IB_QPS_RESET)
> + ib_drain_qp(lnk->roce_qp);
> }
>
> static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
> @@ -86,18 +74,12 @@ static inline void smc_wr_wakeup_reg_wait(struct smc_link *lnk)
> }
>
> /* post a new receive work request to fill a completed old work request entry */
> -static inline int smc_wr_rx_post(struct smc_link *link)
> +static inline int smc_wr_rx_post(struct smc_link *link, struct ib_cqe *cqe)
> {
> - int rc;
> - u64 wr_id, temp_wr_id;
> - u32 index;
> -
> - wr_id = ++link->wr_rx_id; /* tasklet context, thus not atomic */
> - temp_wr_id = wr_id;
> - index = do_div(temp_wr_id, link->wr_rx_cnt);
> - link->wr_rx_ibs[index].wr_id = wr_id;
> - rc = ib_post_recv(link->roce_qp, &link->wr_rx_ibs[index], NULL);
> - return rc;
> + struct smc_ib_recv_wr *recv_wr;
> +
> + recv_wr = container_of(cqe, struct smc_ib_recv_wr, cqe);
> + return ib_post_recv(link->roce_qp, &recv_wr->wr, NULL);
> }
>
> int smc_wr_create_link(struct smc_link *lnk);
> @@ -107,8 +89,6 @@ void smc_wr_free_link(struct smc_link *lnk);
> void smc_wr_free_link_mem(struct smc_link *lnk);
> void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
> void smc_wr_remember_qp_attr(struct smc_link *lnk);
> -void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
> -void smc_wr_add_dev(struct smc_ib_device *smcibdev);
>
> int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
> struct smc_wr_buf **wr_buf,
> @@ -126,12 +106,10 @@ int smc_wr_tx_v2_send(struct smc_link *link,
> struct smc_wr_tx_pend_priv *priv, int len);
> int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
> unsigned long timeout);
> -void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
> void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
>
> int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
> int smc_wr_rx_post_init(struct smc_link *link);
> -void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
> int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr);
>
> #endif /* SMC_WR_H */
> --
> 2.45.0
>
>
On Mon, Feb 02, 2026 at 02:30:32PM +0200, Leon Romanovsky wrote:
> On Mon, Feb 02, 2026 at 05:48:00PM +0800, D. Wythe wrote:
> > The current SMC-R implementation relies on global per-device CQs
> > and manual polling within tasklets, which introduces severe
> > scalability bottlenecks due to global lock contention and tasklet
> > scheduling overhead, resulting in poor performance as concurrency
> > increases.
> >
> > Refactor the completion handling to utilize the ib_cqe API and
> > standard RDMA core CQ pooling. This transition provides several key
> > advantages:
> >
> > 1. Multi-CQ: Shift from a single shared per-device CQ to multiple
> > link-specific CQs via the CQ pool. This allows completion processing
> > to be parallelized across multiple CPU cores, effectively eliminating
> > the global CQ bottleneck.
> >
> > 2. Leverage DIM: Utilizing the standard CQ pool with IB_POLL_SOFTIRQ
> > enables Dynamic Interrupt Moderation from the RDMA core, optimizing
> > interrupt frequency and reducing CPU load under high pressure.
> >
> > 3. O(1) Context Retrieval: Replaces the expensive wr_id based lookup
> > logic (e.g., smc_wr_tx_find_pending_index) with direct context retrieval
> > using container_of() on the embedded ib_cqe.
> >
> > 4. Code Simplification: This refactoring results in a reduction of
> > ~150 lines of code. It removes redundant sequence tracking, complex lookup
> > helpers, and manual CQ management, significantly improving maintainability.
> >
> > Performance Test: redis-benchmark with max 32 connections per QP
> > Data format: Requests Per Second (RPS), Percentage in brackets
> > represents the gain/loss compared to TCP.
> >
> > | Clients | TCP | SMC (original) | SMC (cq_pool) |
> > |---------|----------|---------------------|---------------------|
> > | c = 1 | 24449 | 31172 (+27%) | 34039 (+39%) |
> > | c = 2 | 46420 | 53216 (+14%) | 64391 (+38%) |
> > | c = 16 | 159673 | 83668 (-48%) <-- | 216947 (+36%) |
> > | c = 32 | 164956 | 97631 (-41%) <-- | 249376 (+51%) |
> > | c = 64 | 166322 | 118192 (-29%) <-- | 249488 (+50%) |
> > | c = 128 | 167700 | 121497 (-27%) <-- | 249480 (+48%) |
> > | c = 256 | 175021 | 146109 (-16%) <-- | 240384 (+37%) |
> > | c = 512 | 168987 | 101479 (-40%) <-- | 226634 (+34%) |
> >
> > The results demonstrate that this optimization effectively resolves the
> > scalability bottleneck, with RPS increasing by over 110% at c=64
> > compared to the original implementation.
>
> Thanks for the conversion, it looks great. I just wonder, why did you
> mark this as an RFC?
>
> Thanks
>
I usually mark patches involving complex changes as RFC first to solicit
early feedback from the community. No other reason.

Best regards,
D. Wythe