[PATCH v6 5/5] hisi_acc_vfio_pci: bugfix live migration function without VF device driver

Longfang Liu posted 5 patches 9 months ago
There is a newer version of this series
[PATCH v6 5/5] hisi_acc_vfio_pci: bugfix live migration function without VF device driver
Posted by Longfang Liu 9 months ago
If the VF device driver is not loaded in the Guest OS and we attempt to
perform device data migration, the address of the migrated data will
be NULL.
The live migration recovery operation on the destination side will
access a null address value, which will cause access errors.

Therefore, live migration of VMs without added VF device drivers
does not require device data migration.
In addition, when the queue address data obtained by the destination
is empty, device queue recovery processing will not be performed.

Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
Signed-off-by: Longfang Liu <liulongfang@huawei.com>
Reviewed-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
---
 .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c    | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index cadc82419dca..68b1c7204cad 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -426,13 +426,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 		return -EINVAL;
 	}
 
-	ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
-	if (ret) {
-		dev_err(dev, "failed to write QM_VF_STATE\n");
-		return ret;
-	}
-
-	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
 	hisi_acc_vdev->match_done = true;
 	return 0;
 }
@@ -498,6 +491,13 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 	if (migf->total_length < sizeof(struct acc_vf_data))
 		return -EINVAL;
 
+	ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
+	if (ret) {
+		dev_err(dev, "failed to write QM_VF_STATE\n");
+		return -EINVAL;
+	}
+	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
 	qm->eqe_dma = vf_data->eqe_dma;
 	qm->aeqe_dma = vf_data->aeqe_dma;
 	qm->sqc_dma = vf_data->sqc_dma;
@@ -506,6 +506,12 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 	qm->qp_base = vf_data->qp_base;
 	qm->qp_num = vf_data->qp_num;
 
+	if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
+	    !vf_data->sqc_dma || !vf_data->cqc_dma) {
+		dev_err(dev, "resume dma addr is NULL!\n");
+		return -EINVAL;
+	}
+
 	ret = qm_set_regs(qm, vf_data);
 	if (ret) {
 		dev_err(dev, "set VF regs failed\n");
@@ -1531,6 +1537,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
 	hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
 	hisi_acc_vdev->pf_qm = pf_qm;
 	hisi_acc_vdev->vf_dev = pdev;
+	hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
 	mutex_init(&hisi_acc_vdev->state_mutex);
 	mutex_init(&hisi_acc_vdev->open_mutex);
 
-- 
2.24.0
Re: [PATCH v6 5/5] hisi_acc_vfio_pci: bugfix live migration function without VF device driver
Posted by Alex Williamson 9 months ago
On Tue, 18 Mar 2025 14:45:48 +0800
Longfang Liu <liulongfang@huawei.com> wrote:

> If the VF device driver is not loaded in the Guest OS and we attempt to
> perform device data migration, the address of the migrated data will
> be NULL.
> The live migration recovery operation on the destination side will
> access a null address value, which will cause access errors.
> 
> Therefore, live migration of VMs without added VF device drivers
> does not require device data migration.
> In addition, when the queue address data obtained by the destination
> is empty, device queue recovery processing will not be performed.
> 
> Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
> Signed-off-by: Longfang Liu <liulongfang@huawei.com>
> Reviewed-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
> ---
>  .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c    | 21 ++++++++++++-------
>  1 file changed, 14 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
> index cadc82419dca..68b1c7204cad 100644
> --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
> +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
> @@ -426,13 +426,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>  		return -EINVAL;
>  	}
>  
> -	ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
> -	if (ret) {
> -		dev_err(dev, "failed to write QM_VF_STATE\n");
> -		return ret;
> -	}
> -
> -	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
>  	hisi_acc_vdev->match_done = true;
>  	return 0;
>  }
> @@ -498,6 +491,13 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>  	if (migf->total_length < sizeof(struct acc_vf_data))
>  		return -EINVAL;
>  
> +	ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
> +	if (ret) {
> +		dev_err(dev, "failed to write QM_VF_STATE\n");
> +		return -EINVAL;
> +	}
> +	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
> +
>  	qm->eqe_dma = vf_data->eqe_dma;
>  	qm->aeqe_dma = vf_data->aeqe_dma;
>  	qm->sqc_dma = vf_data->sqc_dma;
> @@ -506,6 +506,12 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>  	qm->qp_base = vf_data->qp_base;
>  	qm->qp_num = vf_data->qp_num;
>  
> +	if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
> +	    !vf_data->sqc_dma || !vf_data->cqc_dma) {
> +		dev_err(dev, "resume dma addr is NULL!\n");
> +		return -EINVAL;
> +	}
> +

I'm not sure how this fits in based on the commit log.  IIUC, we're
actually rejecting the migration data here, which will cause a
migration failure.  We're also testing the validity of the data *after*
we've actually applied it to the hisi_qm object, which seems backwards.

Are we just not processing the migration data because there's no driver
or are we failing the migration?  There shouldn't be a requirement on
the state of the guest driver for a successful migration.  Thanks,

Alex

>  	ret = qm_set_regs(qm, vf_data);
>  	if (ret) {
>  		dev_err(dev, "set VF regs failed\n");
> @@ -1531,6 +1537,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
>  	hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
>  	hisi_acc_vdev->pf_qm = pf_qm;
>  	hisi_acc_vdev->vf_dev = pdev;
> +	hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
>  	mutex_init(&hisi_acc_vdev->state_mutex);
>  	mutex_init(&hisi_acc_vdev->open_mutex);
>
Re: [PATCH v6 5/5] hisi_acc_vfio_pci: bugfix live migration function without VF device driver
Posted by liulongfang 8 months, 3 weeks ago
On 2025/3/21 23:52, Alex Williamson wrote:
> On Tue, 18 Mar 2025 14:45:48 +0800
> Longfang Liu <liulongfang@huawei.com> wrote:
> 
>> If the VF device driver is not loaded in the Guest OS and we attempt to
>> perform device data migration, the address of the migrated data will
>> be NULL.
>> The live migration recovery operation on the destination side will
>> access a null address value, which will cause access errors.
>>
>> Therefore, live migration of VMs without added VF device drivers
>> does not require device data migration.
>> In addition, when the queue address data obtained by the destination
>> is empty, device queue recovery processing will not be performed.
>>
>> Fixes: b0eed085903e ("hisi_acc_vfio_pci: Add support for VFIO live migration")
>> Signed-off-by: Longfang Liu <liulongfang@huawei.com>
>> Reviewed-by: Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>
>> ---
>>  .../vfio/pci/hisilicon/hisi_acc_vfio_pci.c    | 21 ++++++++++++-------
>>  1 file changed, 14 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
>> index cadc82419dca..68b1c7204cad 100644
>> --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
>> +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
>> @@ -426,13 +426,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>>  		return -EINVAL;
>>  	}
>>  
>> -	ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
>> -	if (ret) {
>> -		dev_err(dev, "failed to write QM_VF_STATE\n");
>> -		return ret;
>> -	}
>> -
>> -	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
>>  	hisi_acc_vdev->match_done = true;
>>  	return 0;
>>  }
>> @@ -498,6 +491,13 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>>  	if (migf->total_length < sizeof(struct acc_vf_data))
>>  		return -EINVAL;
>>  
>> +	ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
>> +	if (ret) {
>> +		dev_err(dev, "failed to write QM_VF_STATE\n");
>> +		return -EINVAL;
>> +	}
>> +	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
>> +
>>  	qm->eqe_dma = vf_data->eqe_dma;
>>  	qm->aeqe_dma = vf_data->aeqe_dma;
>>  	qm->sqc_dma = vf_data->sqc_dma;
>> @@ -506,6 +506,12 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
>>  	qm->qp_base = vf_data->qp_base;
>>  	qm->qp_num = vf_data->qp_num;
>>  
>> +	if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
>> +	    !vf_data->sqc_dma || !vf_data->cqc_dma) {
>> +		dev_err(dev, "resume dma addr is NULL!\n");
>> +		return -EINVAL;
>> +	}
>> +
> 
> I'm not sure how this fits in based on the commit log.  IIUC, we're
> actually rejecting the migration data here, which will cause a
> migration failure.  We're also testing the validity of the data *after*
> we've actually applied it to the hisi_qm object, which seems backwards.
> 
> Are we just not processing the migration data because there's no driver
> or are we failing the migration?  There shouldn't be a requirement on
> the state of the guest driver for a successful migration.  Thanks,
>

Therefore, this shouldn't be about exiting the migration operation,
but rather continuing the migration process while skipping these empty
address write operations.
Consequently, this shouldn't return an error,
it should simply return 0.

Thanks.
Longfang.

> Alex
> 
>>  	ret = qm_set_regs(qm, vf_data);
>>  	if (ret) {
>>  		dev_err(dev, "set VF regs failed\n");
>> @@ -1531,6 +1537,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
>>  	hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
>>  	hisi_acc_vdev->pf_qm = pf_qm;
>>  	hisi_acc_vdev->vf_dev = pdev;
>> +	hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
>>  	mutex_init(&hisi_acc_vdev->state_mutex);
>>  	mutex_init(&hisi_acc_vdev->open_mutex);
>>  
> 
> 
> 
> .
>