[PATCH net 2/2] net: hns3: fix null pointer in debugfs issue

Jijie Shao posted 2 patches 3 months, 2 weeks ago
There is a newer version of this series
[PATCH net 2/2] net: hns3: fix null pointer in debugfs issue
Posted by Jijie Shao 3 months, 2 weeks ago
Currently, when debugfs and reset are executed concurrently,
some resources are released during the reset process,
which may cause debugfs to read null pointers or other anomalies.

Therefore, this patch adds a device-state check to the debugfs operations
that are sensitive to reset, so that they return -EBUSY while the device
is not initialized or is resetting.

Fixes: eced3d1c41db ("net: hns3: use seq_file for files in queue/ in debugfs")
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
 .../ethernet/hisilicon/hns3/hns3_debugfs.c    | 67 ++++++++++++++-----
 .../hisilicon/hns3/hns3pf/hclge_debugfs.c     |  6 ++
 2 files changed, 57 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
index 4cce4f4ba6b0..aa0f8a6cd9d6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c
@@ -383,6 +383,15 @@ static const char * const dim_state_str[] = { "START", "IN_PROG", "APPLY" };
 static const char * const
 dim_tune_stat_str[] = { "ON_TOP", "TIRED", "RIGHT", "LEFT" };
 
+static bool hns3_dbg_is_device_busy(struct hns3_nic_priv *priv)
+{
+	if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
+	    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
+		return true;
+
+	return false;
+}
+
 static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector,
 			       struct seq_file *s, int i, bool is_tx)
 {
@@ -428,13 +437,16 @@ static void hns3_get_coal_info(struct hns3_enet_tqp_vector *tqp_vector,
 	}
 }
 
-static void hns3_dump_coal_info(struct seq_file *s, bool is_tx)
+static int hns3_dump_coal_info(struct seq_file *s, bool is_tx)
 {
 	struct hnae3_handle *h = hnae3_seq_file_to_handle(s);
 	struct hns3_enet_tqp_vector *tqp_vector;
 	struct hns3_nic_priv *priv = h->priv;
 	unsigned int i;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	seq_printf(s, "%s interrupt coalesce info:\n", is_tx ? "tx" : "rx");
 
 	seq_puts(s, "VEC_ID  ALGO_STATE  PROFILE_ID  CQE_MODE  TUNE_STATE  ");
@@ -442,18 +454,26 @@ static void hns3_dump_coal_info(struct seq_file *s, bool is_tx)
 	seq_puts(s, "HW_GL  HW_QL\n");
 
 	for (i = 0; i < priv->vector_num; i++) {
+		if (hns3_dbg_is_device_busy(priv))
+			return -EBUSY;
+
 		tqp_vector = &priv->tqp_vector[i];
 		hns3_get_coal_info(tqp_vector, s, i, is_tx);
 	}
+
+	return 0;
 }
 
 static int hns3_dbg_coal_info(struct seq_file *s, void *data)
 {
-	hns3_dump_coal_info(s, true);
-	seq_puts(s, "\n");
-	hns3_dump_coal_info(s, false);
+	int ret;
 
-	return 0;
+	ret = hns3_dump_coal_info(s, true);
+	if (ret)
+		return ret;
+
+	seq_puts(s, "\n");
+	return hns3_dump_coal_info(s, false);
 }
 
 static void hns3_dump_rx_queue_info(struct hns3_enet_ring *ring,
@@ -498,6 +518,9 @@ static int hns3_dbg_rx_queue_info(struct seq_file *s, void *data)
 	struct hns3_enet_ring *ring;
 	u32 i;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	if (!priv->ring) {
 		dev_err(&h->pdev->dev, "priv->ring is NULL\n");
 		return -EFAULT;
@@ -511,8 +534,7 @@ static int hns3_dbg_rx_queue_info(struct seq_file *s, void *data)
 		 * to prevent reference to invalid memory. And need to ensure
 		 * that the following code is executed within 100ms.
 		 */
-		if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
-		    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
+		if (hns3_dbg_is_device_busy(priv))
 			return -EPERM;
 
 		ring = &priv->ring[(u32)(i + h->kinfo.num_tqps)];
@@ -563,6 +585,9 @@ static int hns3_dbg_tx_queue_info(struct seq_file *s, void *data)
 	struct hns3_enet_ring *ring;
 	u32 i;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	if (!priv->ring) {
 		dev_err(&h->pdev->dev, "priv->ring is NULL\n");
 		return -EFAULT;
@@ -576,8 +601,7 @@ static int hns3_dbg_tx_queue_info(struct seq_file *s, void *data)
 		 * to prevent reference to invalid memory. And need to ensure
 		 * that the following code is executed within 100ms.
 		 */
-		if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
-		    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
+		if (hns3_dbg_is_device_busy(priv))
 			return -EPERM;
 
 		ring = &priv->ring[i];
@@ -596,6 +620,9 @@ static int hns3_dbg_queue_map(struct seq_file *s, void *data)
 	if (!h->ae_algo->ops->get_global_queue_id)
 		return -EOPNOTSUPP;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	seq_puts(s, "local_queue_id  global_queue_id  vector_id\n");
 
 	for (i = 0; i < h->kinfo.num_tqps; i++) {
@@ -643,6 +670,9 @@ static int hns3_dbg_rx_bd_info(struct seq_file *s, void *private)
 	struct hns3_desc *desc;
 	unsigned int i;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	if (data->qid >= h->kinfo.num_tqps) {
 		dev_err(&h->pdev->dev, "queue%u is not in use\n", data->qid);
 		return -EINVAL;
@@ -655,8 +685,10 @@ static int hns3_dbg_rx_bd_info(struct seq_file *s, void *private)
 
 	ring = &priv->ring[data->qid + data->handle->kinfo.num_tqps];
 	for (i = 0; i < ring->desc_num; i++) {
-		desc = &ring->desc[i];
+		if (hns3_dbg_is_device_busy(priv))
+			return -EBUSY;
 
+		desc = &ring->desc[i];
 		hns3_dump_rx_bd_info(priv, desc, s, i);
 	}
 
@@ -688,6 +720,9 @@ static int hns3_dbg_tx_bd_info(struct seq_file *s, void *private)
 	struct hns3_desc *desc;
 	unsigned int i;
 
+	if (hns3_dbg_is_device_busy(priv))
+		return -EBUSY;
+
 	if (data->qid >= h->kinfo.num_tqps) {
 		dev_err(&h->pdev->dev, "queue%u is not in use\n", data->qid);
 		return -EINVAL;
@@ -700,8 +735,10 @@ static int hns3_dbg_tx_bd_info(struct seq_file *s, void *private)
 
 	ring = &priv->ring[data->qid];
 	for (i = 0; i < ring->desc_num; i++) {
-		desc = &ring->desc[i];
+		if (hns3_dbg_is_device_busy(priv))
+			return -EBUSY;
 
+		desc = &ring->desc[i];
 		hns3_dump_tx_bd_info(desc, s, i);
 	}
 
@@ -804,9 +841,8 @@ static int hns3_dbg_page_pool_info(struct seq_file *s, void *data)
 	seq_puts(s, "POOL_SIZE(PAGE_NUM)  ORDER  NUMA_ID  MAX_LEN\n");
 
 	for (i = 0; i < h->kinfo.num_tqps; i++) {
-		if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
-		    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
-			return -EPERM;
+		if (hns3_dbg_is_device_busy(priv))
+			return -EBUSY;
 
 		ring = &priv->ring[(u32)(i + h->kinfo.num_tqps)];
 		hns3_dump_page_pool_info(ring, s, i);
@@ -821,8 +857,7 @@ static int hns3_dbg_bd_info_show(struct seq_file *s, void *private)
 	struct hnae3_handle *h = data->handle;
 	struct hns3_nic_priv *priv = h->priv;
 
-	if (!test_bit(HNS3_NIC_STATE_INITED, &priv->state) ||
-	    test_bit(HNS3_NIC_STATE_RESETTING, &priv->state))
+	if (hns3_dbg_is_device_busy(priv))
 		return -EBUSY;
 
 	if (data->cmd == HNAE3_DBG_CMD_TX_BD)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
index b76d25074e99..b658077b9d50 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c
@@ -2470,6 +2470,9 @@ static int hclge_dbg_dump_umv_info(struct seq_file *s, void *data)
 	struct hclge_vport *vport;
 	u8 i;
 
+	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
+		return -EBUSY;
+
 	seq_printf(s, "num_alloc_vport   : %u\n", hdev->num_alloc_vport);
 	seq_printf(s, "max_umv_size     : %u\n", hdev->max_umv_size);
 	seq_printf(s, "wanted_umv_size  : %u\n", hdev->wanted_umv_size);
@@ -2680,6 +2683,9 @@ static int hclge_dbg_dump_vlan_offload_config(struct hclge_dev *hdev,
 	int ret;
 	u8 i;
 
+	if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
+		return -EBUSY;
+
 	seq_puts(s, "FUNC_ID  PVID  ACCEPT_TAG1  ACCEPT_TAG2 ACCEPT_UNTAG1  ");
 	seq_puts(s, "ACCEPT_UNTAG2  INSERT_TAG1  INSERT_TAG2  SHIFT_TAG  ");
 	seq_puts(s, "STRIP_TAG1  STRIP_TAG2  DROP_TAG1  DROP_TAG2  ");
-- 
2.33.0
Re: [PATCH net 2/2] net: hns3: fix null pointer in debugfs issue
Posted by Jakub Kicinski 3 months, 2 weeks ago
On Thu, 23 Oct 2025 21:13:38 +0800 Jijie Shao wrote:
> Currently, when debugfs and reset are executed concurrently,
> some resources are released during the reset process,
> which may cause debugfs to read null pointers or other anomalies.
> 
> Therefore, in this patch, interception protection has been added
> to debugfs operations that are sensitive to reset.

You need to explain what prevents the state from changing immediately
after you did the bit check. With no obvious locking in place I don't
see how this reliably fixes the issue.
Re: [PATCH net 2/2] net: hns3: fix null pointer in debugfs issue
Posted by Jijie Shao 3 months, 2 weeks ago
on 2025/10/28 8:54, Jakub Kicinski wrote:
> On Thu, 23 Oct 2025 21:13:38 +0800 Jijie Shao wrote:
>> Currently, when debugfs and reset are executed concurrently,
>> some resources are released during the reset process,
>> which may cause debugfs to read null pointers or other anomalies.
>>
>> Therefore, in this patch, interception protection has been added
>> to debugfs operations that are sensitive to reset.
> You need to explain what prevents the state from changing immediately
> after you did the bit check. With no obvious locking in place I don't
> see how this reliably fixes the issue.

In July, we refactored debugfs to use seq_file.

Before the refactoring, all debugfs operations would check the reset status
(HNS3_NIC_STATE_INITED and HNS3_NIC_STATE_RESETTING) in the entry function.
After the refactoring, the entry function was removed, which led to the loss of protection.

This patch restores the protection behavior that existed before the refactoring.
Without this protection, our tests have already hit the null pointer issue.

As for the problem you mentioned, we have been discussing it recently.
There is still a small race window: checking the state before reading from debugfs works,
but problems could still occur if the device enters the reset state while the read is in progress:

check state pass
	debugfs read start...
		do reset
			debugfs read end
			
Currently, we are still assessing the risk and discussing solutions for this issue.
After adding the entry protection, executing debugfs and reset concurrently has not
resulted in null pointers or other exceptions.

Thanks,
Jijie Shao