[PATCH v5 for-next 2/2] RDMA/hns: Disassociate mmap pages for all uctx when HW is being reset

Junxian Huang posted 2 patches 6 days, 3 hours ago
[PATCH v5 for-next 2/2] RDMA/hns: Disassociate mmap pages for all uctx when HW is being reset
Posted by Junxian Huang 6 days, 3 hours ago
From: Chengchang Tang <tangchengchang@huawei.com>

When HW is being reset, userspace should not ring doorbell otherwise
it may lead to abnormal consequence such as RAS.

Disassociate mmap pages for all uctx to prevent userspace from ringing
doorbell to HW. Since all resources will be destroyed during HW reset,
no new mmap is allowed after HW reset is completed.

Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
---
 drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +++++++++
 drivers/infiniband/hw/hns/hns_roce_main.c  | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
index 24e906b9d3ae..4e374b2da101 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
@@ -7017,6 +7017,12 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
 
 	handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
 }
+
+static void hns_roce_v2_reset_notify_user(struct hns_roce_dev *hr_dev)
+{
+	rdma_user_mmap_disassociate(&hr_dev->ib_dev);
+}
+
 static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 {
 	struct hns_roce_dev *hr_dev;
@@ -7035,6 +7041,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
 
 	hr_dev->active = false;
 	hr_dev->dis_db = true;
+
+	hns_roce_v2_reset_notify_user(hr_dev);
+
 	hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN;
 
 	return 0;
diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 4cb0af733587..49315f39361d 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -466,6 +466,11 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
 	pgprot_t prot;
 	int ret;
 
+	if (hr_dev->dis_db) {
+		atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]);
+		return -EPERM;
+	}
+
 	rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff);
 	if (!rdma_entry) {
 		atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]);
-- 
2.33.0
Re: [PATCH v5 for-next 2/2] RDMA/hns: Disassociate mmap pages for all uctx when HW is being reset
Posted by Leon Romanovsky 3 days, 7 hours ago
On Fri, Sep 13, 2024 at 08:29:55PM +0800, Junxian Huang wrote:
> From: Chengchang Tang <tangchengchang@huawei.com>
> 
> When HW is being reset, userspace should not ring doorbell otherwise
> it may lead to abnormal consequence such as RAS.
> 
> Disassociate mmap pages for all uctx to prevent userspace from ringing
> doorbell to HW. Since all resources will be destroyed during HW reset,
> no new mmap is allowed after HW reset is completed.
> 
> Fixes: 9a4435375cd1 ("IB/hns: Add driver files for hns RoCE driver")
> Signed-off-by: Chengchang Tang <tangchengchang@huawei.com>
> Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
> ---
>  drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 +++++++++
>  drivers/infiniband/hw/hns/hns_roce_main.c  | 5 +++++
>  2 files changed, 14 insertions(+)
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> index 24e906b9d3ae..4e374b2da101 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
> @@ -7017,6 +7017,12 @@ static void hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
>  
>  	handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
>  }
> +
> +static void hns_roce_v2_reset_notify_user(struct hns_roce_dev *hr_dev)
> +{
> +	rdma_user_mmap_disassociate(&hr_dev->ib_dev);
> +}

There is no need in one line function, please inline it.

> +
>  static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
>  {
>  	struct hns_roce_dev *hr_dev;
> @@ -7035,6 +7041,9 @@ static int hns_roce_hw_v2_reset_notify_down(struct hnae3_handle *handle)
>  
>  	hr_dev->active = false;
>  	hr_dev->dis_db = true;
> +
> +	hns_roce_v2_reset_notify_user(hr_dev);
> +
>  	hr_dev->state = HNS_ROCE_DEVICE_STATE_RST_DOWN;
>  
>  	return 0;
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
> index 4cb0af733587..49315f39361d 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_main.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -466,6 +466,11 @@ static int hns_roce_mmap(struct ib_ucontext *uctx, struct vm_area_struct *vma)
>  	pgprot_t prot;
>  	int ret;
>  
> +	if (hr_dev->dis_db) {

How do you clear dis_db after calling to hns_roce_hw_v2_reset_notify_down()? Does it have any locking protection?

> +		atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]);
> +		return -EPERM;
> +	}
> +
>  	rdma_entry = rdma_user_mmap_entry_get_pgoff(uctx, vma->vm_pgoff);
>  	if (!rdma_entry) {
>  		atomic64_inc(&hr_dev->dfx_cnt[HNS_ROCE_DFX_MMAP_ERR_CNT]);
> -- 
> 2.33.0
>