Live Update Orchestrator: PCI subsystem

[PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Chris Li 2 months, 1 week ago

The liveupdate devices are already initialized by the kernel before the
kexec. During the kexec the device is still running. Avoid write to the
liveupdate devices during the new kernel boot up.

Signed-off-by: Chris Li <chrisl@kernel.org>
---
 drivers/pci/ats.c            |  7 ++--
 drivers/pci/iov.c            | 58 ++++++++++++++++++------------
 drivers/pci/msi/msi.c        | 32 ++++++++++++-----
 drivers/pci/msi/pcidev_msi.c |  4 +--
 drivers/pci/pci-acpi.c       |  3 ++
 drivers/pci/pci.c            | 85 +++++++++++++++++++++++++++++---------------
 drivers/pci/pci.h            |  9 ++++-
 drivers/pci/pcie/aspm.c      |  7 ++--
 drivers/pci/pcie/pme.c       | 11 ++++--
 drivers/pci/probe.c          | 43 +++++++++++++++-------
 drivers/pci/setup-bus.c      | 10 +++++-
 11 files changed, 184 insertions(+), 85 deletions(-)

diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index ec6c8dbdc5e9c9959e822e016ab301bf483713a5..284f43c82593903058dee58ce64b82bad8aed710 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -75,7 +75,9 @@ int pci_prepare_ats(struct pci_dev *dev, int ps)
 
 	dev->ats_stu = ps;
 	ctrl = PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
-	pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
+
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(pci_prepare_ats);
@@ -114,7 +116,8 @@ int pci_enable_ats(struct pci_dev *dev, int ps)
 		dev->ats_stu = ps;
 		ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
 	}
-	pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
 
 	dev->ats_enabled = 1;
 	return 0;
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 10693b5d7eb66bbbfb9b70ffe6e89eb89c8dc3a3..df27bcf840d9fc0dbce29810e288c1c2b74a70c9 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -85,7 +85,8 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
 {
 	struct pci_sriov *iov = dev->sriov;
 
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &iov->offset);
 	pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &iov->stride);
 }
@@ -694,10 +695,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 
 	pci_iov_set_numvfs(dev, nr_virtfn);
 	iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
-	pci_cfg_access_lock(dev);
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
-	msleep(100);
-	pci_cfg_access_unlock(dev);
+	if (!pci_lu_adopt(dev)) {
+		pci_cfg_access_lock(dev);
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+		msleep(100);
+		pci_cfg_access_unlock(dev);
+	}
 
 	rc = sriov_add_vfs(dev, initial);
 	if (rc)
@@ -710,10 +713,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 
 err_pcibios:
 	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
-	pci_cfg_access_lock(dev);
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
-	ssleep(1);
-	pci_cfg_access_unlock(dev);
+	if (!pci_lu_adopt(dev)) {
+		pci_cfg_access_lock(dev);
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+		ssleep(1);
+		pci_cfg_access_unlock(dev);
+	}
 
 	pcibios_sriov_disable(dev);
 
@@ -741,11 +746,13 @@ static void sriov_disable(struct pci_dev *dev)
 		return;
 
 	sriov_del_vfs(dev);
-	iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
-	pci_cfg_access_lock(dev);
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
-	ssleep(1);
-	pci_cfg_access_unlock(dev);
+	if (!pci_lu_adopt(dev)) {
+		iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+		pci_cfg_access_lock(dev);
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+		ssleep(1);
+		pci_cfg_access_unlock(dev);
+	}
 
 	pcibios_sriov_disable(dev);
 
@@ -770,7 +777,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
 	u32 sriovbars[PCI_SRIOV_NUM_BARS];
 
 	pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
-	if (ctrl & PCI_SRIOV_CTRL_VFE) {
+	if (!pci_lu_adopt(dev) && ctrl & PCI_SRIOV_CTRL_VFE) {
 		pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
 		ssleep(1);
 	}
@@ -785,7 +792,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
 		ctrl |= PCI_SRIOV_CTRL_ARI;
 
 found:
-	pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
 
 	pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
 	if (!total)
@@ -798,7 +806,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
 		return -EIO;
 
 	pgsz &= ~(pgsz - 1);
-	pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
 
 	iov = kzalloc(sizeof(*iov), GFP_KERNEL);
 	if (!iov)
@@ -904,14 +913,17 @@ static void sriov_restore_state(struct pci_dev *dev)
 	 */
 	ctrl &= ~PCI_SRIOV_CTRL_ARI;
 	ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI;
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
 		pci_update_resource(dev, i + PCI_IOV_RESOURCES);
 
-	pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
 	pci_iov_set_numvfs(dev, iov->num_VFs);
-	pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
 	if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
 		msleep(100);
 }
@@ -1013,10 +1025,12 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno)
 	new |= res->flags & ~PCI_BASE_ADDRESS_MEM_MASK;
 
 	reg = iov->pos + PCI_SRIOV_BAR + 4 * vf_bar;
-	pci_write_config_dword(dev, reg, new);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_dword(dev, reg, new);
 	if (res->flags & IORESOURCE_MEM_64) {
 		new = region.start >> 16 >> 16;
-		pci_write_config_dword(dev, reg + 4, new);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_dword(dev, reg + 4, new);
 	}
 }
 
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev)
 
 void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
 {
-	raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock;
+	struct pci_dev *pci_dev = to_pci_dev(desc->dev);
+	raw_spinlock_t *lock = &pci_dev->msi_lock;
 	unsigned long flags;
 
 	if (!desc->pci.msi_attrib.can_mask)
@@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
 	raw_spin_lock_irqsave(lock, flags);
 	desc->pci.msi_mask &= ~clear;
 	desc->pci.msi_mask |= set;
-	pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
-			       desc->pci.msi_mask);
+	if (!pci_lu_adopt(pci_dev))
+		pci_write_config_dword(pci_dev, desc->pci.mask_pos,
+				       desc->pci.msi_mask);
 	raw_spin_unlock_irqrestore(lock, flags);
 }
 
@@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc,
 	int pos = dev->msi_cap;
 	u16 msgctl;
 
+	if (pci_lu_adopt(dev))
+		return;
+
 	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
 	msgctl &= ~PCI_MSI_FLAGS_QSIZE;
 	msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple);
@@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg
 
 	if (desc->pci.msi_attrib.is_virtual)
 		return;
+	if (pci_lu_adopt(to_pci_dev(desc->dev)))
+		return;
 	/*
 	 * The specification mandates that the entry is masked
 	 * when the message is modified:
@@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
 	control &= ~PCI_MSI_FLAGS_ENABLE;
 	if (enable)
 		control |= PCI_MSI_FLAGS_ENABLE;
-	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
 }
 
 static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
@@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
 {
 	u16 ctrl;
 
+	BUG_ON(pci_lu_adopt(dev));
 	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
 	ctrl &= ~clear;
 	ctrl |= set;
@@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 	 * registers can be accessed.  Mask all the vectors to prevent
 	 * interrupts coming in before they're fully set up.
 	 */
-	pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
-				    PCI_MSIX_FLAGS_ENABLE);
+	if (!pci_lu_adopt(dev))
+		pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
+					    PCI_MSIX_FLAGS_ENABLE);
 
 	/* Mark it enabled so setup functions can query it */
 	dev->msix_enabled = 1;
@@ -753,14 +763,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
 		 */
 		msix_mask_all(dev->msix_base, tsize);
 	}
-	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+	if (!pci_lu_adopt(dev))
+		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
 
 	pcibios_free_irq(dev);
 	return 0;
 
 out_disable:
 	dev->msix_enabled = 0;
-	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0);
+	if (!pci_lu_adopt(dev))
+		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0);
 
 	return ret;
 }
@@ -864,6 +876,7 @@ void __pci_restore_msix_state(struct pci_dev *dev)
 	if (!dev->msix_enabled)
 		return;
 
+	BUG_ON(pci_lu_adopt(dev));
 	/* route the table */
 	pci_intx_for_msi(dev, 0);
 	pci_msix_clear_and_set_ctrl(dev, 0,
@@ -898,7 +911,8 @@ void pci_msix_shutdown(struct pci_dev *dev)
 	msi_for_each_desc(desc, &dev->dev, MSI_DESC_ALL)
 		pci_msix_mask(desc);
 
-	pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+	if (!pci_lu_adopt(dev))
+		pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
 	pci_intx_for_msi(dev, 1);
 	dev->msix_enabled = 0;
 	pcibios_alloc_irq(dev);
diff --git a/drivers/pci/msi/pcidev_msi.c b/drivers/pci/msi/pcidev_msi.c
index 5520aff53b5670e70311c63f0f358228bf03c309..f9f682a84a05ef47ff4d85e7d0e724cc7c2f5cdc 100644
--- a/drivers/pci/msi/pcidev_msi.c
+++ b/drivers/pci/msi/pcidev_msi.c
@@ -18,7 +18,7 @@ void pci_msi_init(struct pci_dev *dev)
 		return;
 
 	pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl);
-	if (ctrl & PCI_MSI_FLAGS_ENABLE) {
+	if (!pci_lu_adopt(dev) && ctrl & PCI_MSI_FLAGS_ENABLE) {
 		pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS,
 				      ctrl & ~PCI_MSI_FLAGS_ENABLE);
 	}
@@ -36,7 +36,7 @@ void pci_msix_init(struct pci_dev *dev)
 		return;
 
 	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
-	if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
+	if (!pci_lu_adopt(dev) && ctrl & PCI_MSIX_FLAGS_ENABLE) {
 		pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
 				      ctrl & ~PCI_MSIX_FLAGS_ENABLE);
 	}
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index af370628e58393aa0cbdf6d283b3afe33e5effb5..b9e42a1352c87443dd5c4ee9f03bc8a0d343d714 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -172,6 +172,9 @@ static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx)
 		hpx = &pci_default_type0;
 	}
 
+	if (pci_lu_adopt(dev))
+		return;
+
 	pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size);
 	pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer);
 	pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 46fb80dbca590c251fcad3bf2f011a16f6898810..c1cc723f979ae881cf07ad06e1fa0d472e8b89c6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -218,7 +218,7 @@ int pci_status_get_and_clear_errors(struct pci_dev *pdev)
 		return -EIO;
 
 	status &= PCI_STATUS_ERROR_BITS;
-	if (status)
+	if (status && !pci_lu_adopt(pdev))
 		pci_write_config_word(pdev, PCI_STATUS, status);
 
 	return status;
@@ -628,7 +628,7 @@ u64 pci_get_dsn(struct pci_dev *dev)
 	int pos;
 
 	pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN);
-	if (!pos)
+	if (!pos && !pci_lu_adopt(dev))
 		return 0;
 
 	/*
@@ -1103,7 +1103,8 @@ static void pci_enable_acs(struct pci_dev *dev)
 			 ~(PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC));
 	__pci_config_acs(dev, &caps, config_acs_param, 0, 0);
 
-	pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl);
 }
 
 /**
@@ -1394,7 +1395,8 @@ int pci_power_up(struct pci_dev *dev)
 	 * Force the entire word to 0. This doesn't affect PME_Status, disables
 	 * PME_En, and sets PowerState to 0.
 	 */
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, 0);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, 0);
 
 	/* Mandatory transition delays; see PCI PM 1.2. */
 	if (state == PCI_D3hot)
@@ -1552,7 +1554,8 @@ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state, bool
 	pmcsr |= state;
 
 	/* Enter specified state */
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
 
 	/* Mandatory power management transition delays; see PCI PM 1.2. */
 	if (state == PCI_D3hot)
@@ -1781,7 +1784,8 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
 		return;
 	cap = (u16 *)&save_state->cap.data[0];
 
-	pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]);
 }
 
 /**
@@ -2090,7 +2094,7 @@ static int do_pci_enable_device(struct pci_dev *dev, int bars)
 	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
 	if (pin) {
 		pci_read_config_word(dev, PCI_COMMAND, &cmd);
-		if (cmd & PCI_COMMAND_INTX_DISABLE)
+		if (!pci_lu_adopt(dev) && cmd & PCI_COMMAND_INTX_DISABLE)
 			pci_write_config_word(dev, PCI_COMMAND,
 					      cmd & ~PCI_COMMAND_INTX_DISABLE);
 	}
@@ -2248,7 +2252,8 @@ static void do_pci_disable_device(struct pci_dev *dev)
 	pci_read_config_word(dev, PCI_COMMAND, &pci_command);
 	if (pci_command & PCI_COMMAND_MASTER) {
 		pci_command &= ~PCI_COMMAND_MASTER;
-		pci_write_config_word(dev, PCI_COMMAND, pci_command);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, pci_command);
 	}
 
 	pcibios_disable_device(dev);
@@ -2369,7 +2374,8 @@ bool pci_check_pme_status(struct pci_dev *dev)
 		ret = true;
 	}
 
-	pci_write_config_word(dev, pmcsr_pos, pmcsr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, pmcsr_pos, pmcsr);
 
 	return ret;
 }
@@ -2484,7 +2490,8 @@ static void __pci_pme_active(struct pci_dev *dev, bool enable)
 	if (!enable)
 		pmcsr &= ~PCI_PM_CTRL_PME_ENABLE;
 
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
 }
 
 /**
@@ -2506,7 +2513,8 @@ void pci_pme_restore(struct pci_dev *dev)
 		pmcsr &= ~PCI_PM_CTRL_PME_ENABLE;
 		pmcsr |= PCI_PM_CTRL_PME_STATUS;
 	}
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
 }
 
 /**
@@ -3587,12 +3595,14 @@ void pci_configure_ari(struct pci_dev *dev)
 		return;
 
 	if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI)) {
-		pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
-					 PCI_EXP_DEVCTL2_ARI);
+		if (!pci_lu_adopt(dev))
+			pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
+						 PCI_EXP_DEVCTL2_ARI);
 		bridge->ari_enabled = 1;
 	} else {
-		pcie_capability_clear_word(bridge, PCI_EXP_DEVCTL2,
-					   PCI_EXP_DEVCTL2_ARI);
+		if (!pci_lu_adopt(dev))
+			pcie_capability_clear_word(bridge, PCI_EXP_DEVCTL2,
+						   PCI_EXP_DEVCTL2_ARI);
 		bridge->ari_enabled = 0;
 	}
 }
@@ -4286,7 +4296,8 @@ static void __pci_set_master(struct pci_dev *dev, bool enable)
 	if (cmd != old_cmd) {
 		pci_dbg(dev, "%s bus mastering\n",
 			enable ? "enabling" : "disabling");
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 	dev->is_busmaster = enable;
 }
@@ -4416,7 +4427,8 @@ int pci_set_mwi(struct pci_dev *dev)
 	if (!(cmd & PCI_COMMAND_INVALIDATE)) {
 		pci_dbg(dev, "enabling Mem-Wr-Inval\n");
 		cmd |= PCI_COMMAND_INVALIDATE;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 	return 0;
 #endif
@@ -4456,7 +4468,8 @@ void pci_clear_mwi(struct pci_dev *dev)
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	if (cmd & PCI_COMMAND_INVALIDATE) {
 		cmd &= ~PCI_COMMAND_INVALIDATE;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 #endif
 }
@@ -4475,7 +4488,8 @@ void pci_disable_parity(struct pci_dev *dev)
 	pci_read_config_word(dev, PCI_COMMAND, &cmd);
 	if (cmd & PCI_COMMAND_PARITY) {
 		cmd &= ~PCI_COMMAND_PARITY;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 }
 
@@ -4500,7 +4514,8 @@ void pci_intx(struct pci_dev *pdev, int enable)
 	if (new == pci_command)
 		return;
 
-	pci_write_config_word(pdev, PCI_COMMAND, new);
+	if (!pci_lu_adopt(pdev))
+			pci_write_config_word(pdev, PCI_COMMAND, new);
 }
 EXPORT_SYMBOL_GPL(pci_intx);
 
@@ -4648,12 +4663,14 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
 
 	csr &= ~PCI_PM_CTRL_STATE_MASK;
 	csr |= PCI_D3hot;
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
 	pci_dev_d3_sleep(dev);
 
 	csr &= ~PCI_PM_CTRL_STATE_MASK;
 	csr |= PCI_D0;
-	pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
 	pci_dev_d3_sleep(dev);
 
 	return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
@@ -4959,6 +4976,7 @@ void pci_reset_secondary_bus(struct pci_dev *dev)
 {
 	u16 ctrl;
 
+	BUG_ON(pci_lu_adopt(dev));
 	pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
 	ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
 	pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
@@ -5186,7 +5204,8 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
 	 * DMA from the device including MSI/MSI-X interrupts.  For PCI 2.3
 	 * compliant devices, INTx-disable prevents legacy interrupts.
 	 */
-	pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
 }
 
 static void pci_dev_restore(struct pci_dev *dev)
@@ -5897,8 +5916,9 @@ int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc)
 
 		cmd &= ~PCI_X_CMD_MAX_READ;
 		cmd |= FIELD_PREP(PCI_X_CMD_MAX_READ, v);
-		if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd))
-			return -EIO;
+		if (!pci_lu_adopt(dev))
+			if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd))
+				return -EIO;
 	}
 	return 0;
 }
@@ -5960,6 +5980,8 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
 		}
 	}
 
+	if (pci_lu_adopt(dev))
+		return 0;
 	ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
 						  PCI_EXP_DEVCTL_READRQ, v);
 
@@ -6004,6 +6026,8 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
 		return -EINVAL;
 	v = FIELD_PREP(PCI_EXP_DEVCTL_PAYLOAD, v);
 
+	if (pci_lu_adopt(dev))
+		return 0;
 	ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
 						  PCI_EXP_DEVCTL_PAYLOAD, v);
 
@@ -6304,7 +6328,8 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
 			cmd |= command_bits;
 		else
 			cmd &= ~command_bits;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
+		if (!pci_lu_adopt(dev))
+			pci_write_config_word(dev, PCI_COMMAND, cmd);
 	}
 
 	if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
@@ -6320,8 +6345,9 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
 				cmd |= PCI_BRIDGE_CTL_VGA;
 			else
 				cmd &= ~PCI_BRIDGE_CTL_VGA;
-			pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
-					      cmd);
+			if (!pci_lu_adopt(bridge))
+				pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
+						      cmd);
 		}
 		bus = bus->parent;
 	}
@@ -6621,7 +6647,8 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev)
 
 	pci_read_config_word(dev, PCI_COMMAND, &command);
 	command &= ~PCI_COMMAND_MEMORY;
-	pci_write_config_word(dev, PCI_COMMAND, command);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, PCI_COMMAND, command);
 
 	for (i = 0; i <= PCI_ROM_RESOURCE; i++)
 		pci_request_resource_alignment(dev, i, align, resize);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a8acc986a5aac808ec64395d7d946ee036270f5b..bd198227ae3cf687f4ddae76c2f53125681ca91d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1188,11 +1188,18 @@ static inline int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int inde
 
 void pci_liveupdate_restore(struct pci_dev *dev);
 void pci_liveupdate_override_driver(struct pci_dev *dev);
+static inline struct pci_dev_ser *pci_lu_adopt(struct pci_dev *dev)
+{
+	return dev->dev.lu.requested ? dev->dev.lu.dev_state : NULL;
+}
 #else
 #define PCI_SER_GET(__dev, __var, __def) __def
 
 static inline void pci_liveupdate_restore(struct pci_dev *dev) {}
 static inline void pci_liveupdate_override_driver(struct pci_dev *dev) {}
+static inline struct pci_dev_ser *pci_lu_adopt(struct pci_dev *dev)
+{
+	return NULL;
+}
 #endif
-
 #endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 29fcb0689a918f9cb123691e1680de5a1af2c115..61f9a443f6ad2bad57d3fc5958e8855117f79598 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -788,7 +788,7 @@ static void aspm_l1ss_init(struct pcie_link_state *link)
 		aspm_calc_l12_info(link, parent_l1ss_cap, child_l1ss_cap);
 }
 
-static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
+static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist, bool lu_restore)
 {
 	struct pci_dev *child = link->downstream, *parent = link->pdev;
 	u32 parent_lnkcap, child_lnkcap;
@@ -812,7 +812,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
 		return;
 
 	/* Configure common clock before checking latencies */
-	pcie_aspm_configure_common_clock(link);
+	if (!lu_restore)
+		pcie_aspm_configure_common_clock(link);
 
 	/*
 	 * Re-read upstream/downstream components' register state after
@@ -1130,7 +1131,7 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
 	 * upstream links also because capable state of them can be
 	 * update through pcie_aspm_cap_init().
 	 */
-	pcie_aspm_cap_init(link, blacklist);
+	pcie_aspm_cap_init(link, blacklist, pci_lu_adopt(pdev));
 
 	/* Setup initial Clock PM state */
 	pcie_clkpm_cap_init(link, blacklist);
diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c
index a2daebd9806cd7273ee331406201402a758bd7b8..da093a5ba7ee1f9d20652c71e8e78662fdab176c 100644
--- a/drivers/pci/pcie/pme.c
+++ b/drivers/pci/pcie/pme.c
@@ -53,6 +53,8 @@ struct pcie_pme_service_data {
  */
 void pcie_pme_interrupt_enable(struct pci_dev *dev, bool enable)
 {
+	if (pci_lu_adopt(dev))
+		return;
 	if (enable)
 		pcie_capability_set_word(dev, PCI_EXP_RTCTL,
 					 PCI_EXP_RTCTL_PMEIE);
@@ -344,8 +346,10 @@ static int pcie_pme_probe(struct pcie_device *srv)
 	data->srv = srv;
 	set_service_data(srv, data);
 
-	pcie_pme_interrupt_enable(port, false);
-	pcie_clear_root_pme_status(port);
+	if (!pci_lu_adopt(port)) {
+		pcie_pme_interrupt_enable(port, false);
+		pcie_clear_root_pme_status(port);
+	}
 
 	ret = request_irq(srv->irq, pcie_pme_irq, IRQF_SHARED, "PCIe PME", srv);
 	if (ret) {
@@ -356,7 +360,8 @@ static int pcie_pme_probe(struct pcie_device *srv)
 	pci_info(port, "Signaling with IRQ %d\n", srv->irq);
 
 	pcie_pme_mark_devices(port);
-	pcie_pme_interrupt_enable(port, true);
+	if (!pci_lu_adopt(port))
+		pcie_pme_interrupt_enable(port, true);
 	return 0;
 }
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index d8b80e1c4fb35289208d7c953fb5c1e137a5c1a8..5c30d1d52a96b17a92794756cab5db0972548267 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -358,7 +358,7 @@ static __always_inline void pci_read_bases(struct pci_dev *dev,
 		return;
 
 	/* No printks while decoding is disabled! */
-	if (!dev->mmio_always_on) {
+	if (!pci_lu_adopt(dev) && !dev->mmio_always_on) {
 		pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
 		if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
 			pci_write_config_word(dev, PCI_COMMAND,
@@ -366,11 +366,13 @@ static __always_inline void pci_read_bases(struct pci_dev *dev,
 		}
 	}
 
-	__pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
-	if (rom)
-		__pci_size_rom(dev, rom, &rombar);
+	if (!pci_lu_adopt(dev)) {
+		__pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
+		if (rom)
+			__pci_size_rom(dev, rom, &rombar);
+	}
 
-	if (!dev->mmio_always_on &&
+	if (!pci_lu_adopt(dev) && !dev->mmio_always_on &&
 	    (orig_cmd & PCI_COMMAND_DECODE_ENABLE))
 		pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
 
@@ -1269,8 +1271,9 @@ static void pci_enable_rrs_sv(struct pci_dev *pdev)
 	/* Enable Configuration RRS Software Visibility if supported */
 	pcie_capability_read_word(pdev, PCI_EXP_RTCAP, &root_cap);
 	if (root_cap & PCI_EXP_RTCAP_RRS_SV) {
-		pcie_capability_set_word(pdev, PCI_EXP_RTCTL,
-					 PCI_EXP_RTCTL_RRS_SVE);
+		if (!pci_lu_adopt(pdev))
+			pcie_capability_set_word(pdev, PCI_EXP_RTCTL,
+						 PCI_EXP_RTCTL_RRS_SVE);
 		pdev->config_rrs_sv = 1;
 	}
 }
@@ -1384,8 +1387,9 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 	 * bus errors in some architectures.
 	 */
 	pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
-	pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
-			      bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
+				      bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
 
 	if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
 	    !is_cardbus && !broken) {
@@ -1404,6 +1408,10 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 		 * more than one bridge. The second case can happen with
 		 * the i450NX chipset.
 		 */
+		if (pci_lu_adopt(dev)) {
+			/* Verify bus number here */
+		}
+
 		child = pci_find_bus(pci_domain_nr(bus), secondary);
 		if (!child) {
 			child = pci_add_new_bus(bus, dev, secondary);
@@ -1558,7 +1566,8 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
 	/* Clear errors in the Secondary Status Register */
 	pci_write_config_word(dev, PCI_SEC_STATUS, 0xffff);
 
-	pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
+	if (!pci_lu_adopt(dev))
+		pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
 
 	pm_runtime_put(&dev->dev);
 
@@ -2035,7 +2044,10 @@ int pci_setup_device(struct pci_dev *dev)
 	 * If we are not booted from liveupdate, default
 	 * "Unknown power state".
 	 */
-	dev->current_state = PCI_SER_GET(dev, current_state, PCI_UNKNOWN);
+	if (pci_lu_adopt(dev))
+		dev->current_state = 0; /* FIXME */
+	else
+		dev->current_state = PCI_SER_GET(dev, current_state, PCI_UNKNOWN);
 
 	/* Early fixups, before probing the BARs */
 	pci_fixup_device(pci_fixup_early, dev);
@@ -2075,7 +2087,8 @@ int pci_setup_device(struct pci_dev *dev)
 						   dev->hotplug_user_indicators);
 	dev->pref_window = PCI_SER_GET(dev, pref_window,
 				       dev->pref_window);
-	dev->pref_64_window = PCI_SER_GET(dev, pref_64_window,
+	if (!pci_lu_adopt(dev))
+		dev->pref_64_window = PCI_SER_GET(dev, pref_64_window,
 					  dev->pref_64_window);
 
 	switch (dev->hdr_type) {		    /* header type */
@@ -2269,6 +2282,10 @@ int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
 	if (!host)
 		return 0;
 
+
+	if (pci_lu_adopt(dev))
+		return 0;
+
 	/*
 	 * If some device in the hierarchy doesn't handle Extended Tags
 	 * correctly, make sure they're disabled.
@@ -2373,7 +2390,7 @@ static void pci_configure_serr(struct pci_dev *dev)
 		 * endpoint unless SERR# forwarding is enabled.
 		 */
 		pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &control);
-		if (!(control & PCI_BRIDGE_CTL_SERR)) {
+		if (!pci_lu_adopt(dev) && !(control & PCI_BRIDGE_CTL_SERR)) {
 			control |= PCI_BRIDGE_CTL_SERR;
 			pci_write_config_word(dev, PCI_BRIDGE_CONTROL, control);
 		}
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 07c3d021a47ec794aaae13e1c12a667cfb47cb45..276a62c6957218c0c89d8881b1a4d6f6d419dacf 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -706,6 +706,9 @@ static void pci_setup_bridge_io(struct pci_dev *bridge)
 		io_upper16 = 0;
 		l = 0x00f0;
 	}
+
+	if (pci_lu_adopt(bridge))
+		return;
 	/* Temporarily disable the I/O range before updating PCI_IO_BASE */
 	pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff);
 	/* Update lower 16 bits of I/O base/limit */
@@ -732,6 +735,8 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge)
 	} else {
 		l = 0x0000fff0;
 	}
+	if (pci_lu_adopt(bridge))
+		return;
 	pci_write_config_dword(bridge, PCI_MEMORY_BASE, l);
 }
 
@@ -765,6 +770,8 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev *bridge)
 	} else {
 		l = 0x0000fff0;
 	}
+	if (pci_lu_adopt(bridge))
+		return;
 	pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l);
 
 	/* Set the upper 32 bits of PREF base & limit */
@@ -787,7 +794,8 @@ static void __pci_setup_bridge(struct pci_bus *bus, unsigned long type)
 	if (type & IORESOURCE_PREFETCH)
 		pci_setup_bridge_mmio_pref(bridge);
 
-	pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
+	if (!pci_lu_adopt(bridge))
+		pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
 }
 
 void __weak pcibios_setup_bridge(struct pci_bus *bus, unsigned long type)

-- 
2.50.1.487.gc89ff58d15-goog

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Thomas Gleixner 2 months, 1 week ago

On Mon, Jul 28 2025 at 01:24, Chris Li wrote:
> The liveupdate devices are already initialized by the kernel before the
> kexec. During the kexec the device is still running. Avoid write to the
> liveupdate devices during the new kernel boot up.

This change log is way too meager for this kind of change.

 1) You want to explain in detail how this works.

    "initialized by the kernel before the kexec" is as vague as it gets.

 2) Avoid write ....

    Again this lacks any information how this is supposed to work correctly.

>  drivers/pci/ats.c            |  7 ++--
>  drivers/pci/iov.c            | 58 ++++++++++++++++++------------
>  drivers/pci/msi/msi.c        | 32 ++++++++++++-----
>  drivers/pci/msi/pcidev_msi.c |  4 +--
>  drivers/pci/pci-acpi.c       |  3 ++
>  drivers/pci/pci.c            | 85 +++++++++++++++++++++++++++++---------------
>  drivers/pci/pci.h            |  9 ++++-
>  drivers/pci/pcie/aspm.c      |  7 ++--
>  drivers/pci/pcie/pme.c       | 11 ++++--
>  drivers/pci/probe.c          | 43 +++++++++++++++-------
>  drivers/pci/setup-bus.c      | 10 +++++-

Then you sprinkle this stuff into files, which have completely different
purposes, without any explanation for the particular instances why they
are supposed to be correct and how this works.

I'm just looking at the MSI parts, as I have no expertise with the rest.

> diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644
> --- a/drivers/pci/msi/msi.c
> +++ b/drivers/pci/msi/msi.c
> @@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev)
>  
>  void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
>  {
> -	raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock;
> +	struct pci_dev *pci_dev = to_pci_dev(desc->dev);
> +	raw_spinlock_t *lock = &pci_dev->msi_lock;
>  	unsigned long flags;
>  
>  	if (!desc->pci.msi_attrib.can_mask)
> @@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
>  	raw_spin_lock_irqsave(lock, flags);
>  	desc->pci.msi_mask &= ~clear;
>  	desc->pci.msi_mask |= set;
> -	pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
> -			       desc->pci.msi_mask);
> +	if (!pci_lu_adopt(pci_dev))
> +		pci_write_config_dword(pci_dev, desc->pci.mask_pos,
> +				       desc->pci.msi_mask);

This results in inconsistent state, which is a bad idea to begin
with. How is cached software state and hardware state going to be
brought in sync at some point?

If you analyzed all places, which actually depend on hardware state and
make decisions based on it, for correctness, then you failed to provide
that analysis. If not, no comment.

>  	raw_spin_unlock_irqrestore(lock, flags);
>  }
>  
> @@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc,
>  	int pos = dev->msi_cap;
>  	u16 msgctl;
>  
> +	if (pci_lu_adopt(dev))
> +		return;
> +
>  	pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
>  	msgctl &= ~PCI_MSI_FLAGS_QSIZE;
>  	msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple);
> @@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg
>  
>  	if (desc->pci.msi_attrib.is_virtual)
>  		return;
> +	if (pci_lu_adopt(to_pci_dev(desc->dev)))
> +		return;

So you don't allow the new kernel to write the MSI message, but the
interrupt subsystem has this new message and there are places which
utilize that cached message. How is this supposed to work?

>  	/*
>  	 * The specification mandates that the entry is masked
>  	 * when the message is modified:
> @@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
>  	control &= ~PCI_MSI_FLAGS_ENABLE;
>  	if (enable)
>  		control |= PCI_MSI_FLAGS_ENABLE;
> -	pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
> +	if (!pci_lu_adopt(dev))
> +		pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);

The placement of these conditionals is arbitrary. Some are the begin of
a function, others just block the write. Is that based on some logic or
were the places selected by shabby AI queries?

>  static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> @@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
>  {
>  	u16 ctrl;
>  
> +	BUG_ON(pci_lu_adopt(dev));

Not going to happen. BUG() is only appropriate when there is absolutely
no way to handle a situation. This is as undocumented as everything else
here.

>  	pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
>  	ctrl &= ~clear;
>  	ctrl |= set;
> @@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
>  	 * registers can be accessed.  Mask all the vectors to prevent
>  	 * interrupts coming in before they're fully set up.
>  	 */
> -	pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
> -				    PCI_MSIX_FLAGS_ENABLE);
> +	if (!pci_lu_adopt(dev))
> +		pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
> +					    PCI_MSIX_FLAGS_ENABLE);

And for enhanced annoyance you sprinkle this condition everywhere into
the code and then BUG() when you missed an instance. Because putting it
into the function which is invoked a gazillion of times would be too
obvious, right? That would at least be tasteful, but that's not the
primary problem of all this.

Sprinkling these conditionals all over the place is absolutely
unmaintainable, error prone and burdens everyone with this insanity and
the related hard to chase bugs.

Especially as there is no concept behind this and zero documentation how
any of this should work or even be remotely correct.

Before you start the next hackery, please sit down and write up coherent
explanations:

  What is the general concept of this?

  What is the exact state in which a device is left when the old kernel
  jumps into the new kernel?

  What is the state of the MSI[-X] or legacy PCI interrupts at this
  point?

  Can the device raise interrupts during the transition from the old to
  the new kernel?

  How is the "live" state of the device reflected and restored
  throughout the interrupt subsystem?

  How is the device driver supposed to attach to the same interrupt
  state as before?

  How are the potentially different Linux interrupt numbers mapped to
  the previous state?

Before this materializes and is agreed on, this is not going anywhere.

Thanks,

        tglx

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Chris Li 2 months, 1 week ago

Hi Thomas,

On Mon, Jul 28, 2025 at 10:23 AM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Mon, Jul 28 2025 at 01:24, Chris Li wrote:
> > The liveupdate devices are already initialized by the kernel before the
> > kexec. During the kexec the device is still running. Avoid write to the
> > liveupdate devices during the new kernel boot up.
>
> This change log is way too meager for this kind of change.

I agree with you. I mention it in the cover letter, I do expect this
part of change to be controversial. This RFC series is just to kick
off the discussion for PCI device liveupdate.

>  1) You want to explain in detail how this works.
>     "initialized by the kernel before the kexec" is as vague as it gets.

Agree. Sorry I haven't included more documents in this series. Working on it.

>
>  2) Avoid write ....
>
>     Again this lacks any information how this is supposed to work correctly.

I guess I haven't presented the big picture of how liveupdate works
with a PCI device.

Let's start with the background why we want to do this. We want to
upgrade a host kernel, which has a VM running with a GPU device
attached to the VM via vfio_pci. We want the host kernel upgrade in a
way that the VM can continue without shutting down and restarting the
VM. The VM will pause during the host kernel kexec. The GPU device
will continue running and DMA without pausing. VM will not be able to
run the interrupt until the new kernel is finished booting and resume
the VM.

Pasha's LUO series already have designs on the liveupdate state, with
callback associated with the state.

https://lore.kernel.org/lkml/20250515182322.117840-1-pasha.tatashin@soleen.com/

I copy paste some of Pasha's LUO state here:
==========quote==========
LUO State Machine and Events:

NORMAL:   Default operational state.
PREPARED: Initial preparation complete after LIVEUPDATE_PREPARE
          event. Subsystems have saved initial state.
FROZEN:   Final "blackout window" state after LIVEUPDATE_FREEZE
          event, just before kexec. Workloads must be suspended.
UPDATED:  Next kernel has booted via live update. Awaiting restoration
          and LIVEUPDATE_FINISH.

Events:
LIVEUPDATE_PREPARE: Prepare for reboot, serialize state.
LIVEUPDATE_FREEZE:  Final opportunity to save state before kexec.
LIVEUPDATE_FINISH:  Post-reboot cleanup in the next kernel.
LIVEUPDATE_CANCEL:  Abort prepare or freeze, revert changes.
==========quote ends ===========

The PCI core register will register as a subsystem to LUO and
participate in the LUO callbacks.
1) In NORMAL state:
The PCI device will register to the PCI subsystem by setting the
pci_dev->dev.lu.requested flag.

2) PREPARE callback. The PCI subsystem will build the list of the PCI
devices using the PCI device dependency. VF depends on PF, PCI devices
depend on the parent bridge.

The PCI subsystem will save the struct pci_dev part of the pci device
state. Then forward the prepare callback to the PCI devices to
serialize the PCI devices driver state. The VM  is still running but
with some limitations. e.g. can't create new DMA mapping. can't attach
to an additional new vfio_pci device.

3) FREEZE callback: VM is paused. Last change for PCI device to
serialize the device state.

4) kexec booting up the new kernel.

5) PCI device enumeration and probing. Find the PCI device in the
serialized preserved device list, restore the device serialized data
pointer for PCI device. PF device probe(), restores the number of  VF
and creates the VF, the VF device probe()

6) VM re-attach to the requested PCI device via vfio_pci.

7) FINISH callback. PCI subsystem and PCI devices free their preserved
serialized data. System go back to NORMAL state.

8) VM resume running.

>
> >  drivers/pci/ats.c            |  7 ++--
> >  drivers/pci/iov.c            | 58 ++++++++++++++++++------------
> >  drivers/pci/msi/msi.c        | 32 ++++++++++++-----
> >  drivers/pci/msi/pcidev_msi.c |  4 +--
> >  drivers/pci/pci-acpi.c       |  3 ++
> >  drivers/pci/pci.c            | 85 +++++++++++++++++++++++++++++---------------
> >  drivers/pci/pci.h            |  9 ++++-
> >  drivers/pci/pcie/aspm.c      |  7 ++--
> >  drivers/pci/pcie/pme.c       | 11 ++++--
> >  drivers/pci/probe.c          | 43 +++++++++++++++-------
> >  drivers/pci/setup-bus.c      | 10 +++++-
>
> Then you sprinkle this stuff into files, which have completely different
> purposes, without any explanation for the particular instances why they
> are supposed to be correct and how this works.

They follow a pattern that the original kernel needs to write to the
device and change the device state. The liveupdate device needs to
maintain the previous state not changed, therefore needs to prevent
such write initialization in liveupdate case.

I can certainly split it into more patches and group them by functions
in the later series.
This patch does it in a whole sale just to demonstrate what needs to
happen to make a device live update.

>
> I'm just looking at the MSI parts, as I have no expertise with the rest.

Thank you for your feedback, that is very helpful.

>
> > diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
> > index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644
> > --- a/drivers/pci/msi/msi.c
> > +++ b/drivers/pci/msi/msi.c
> > @@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev)
> >
> >  void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
> >  {
> > -     raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock;
> > +     struct pci_dev *pci_dev = to_pci_dev(desc->dev);
> > +     raw_spinlock_t *lock = &pci_dev->msi_lock;
> >       unsigned long flags;
> >
> >       if (!desc->pci.msi_attrib.can_mask)
> > @@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
> >       raw_spin_lock_irqsave(lock, flags);
> >       desc->pci.msi_mask &= ~clear;
> >       desc->pci.msi_mask |= set;
> > -     pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
> > -                            desc->pci.msi_mask);
> > +     if (!pci_lu_adopt(pci_dev))
> > +             pci_write_config_dword(pci_dev, desc->pci.mask_pos,
> > +                                    desc->pci.msi_mask);
>
> This results in inconsistent state, which is a bad idea to begin
> with. How is cached software state and hardware state going to be
> brought in sync at some point?

Yes, to make the interrupt fully working we need to tell the new
kernel about the previous kernel's interrupt descriptor in IOMMU etc.
As it is, the liveupdate device interrupt is not fully working yet.
David is working on the interrupt and later there will be an interrupt
series to make interrupt working with liveupdate devices. This is just
the first baby step.

>
> If you analyzed all places, which actually depend on hardware state and
> make decisions based on it, for correctness, then you failed to provide
> that analysis. If not, no comment.

Let me clarify. This avoid writing to devices only applies to
liveupdate devices. Only between FREEZE and FINISH. After the LUO
finish(), LUO is back to normal state again. The device can be
writable again as normal, most likely by the VM. We don't want the
device state to change between FREEZE and FINISH.

>
> >       raw_spin_unlock_irqrestore(lock, flags);
> >  }
> >
> > @@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc,
> >       int pos = dev->msi_cap;
> >       u16 msgctl;
> >
> > +     if (pci_lu_adopt(dev))
> > +             return;
> > +
> >       pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
> >       msgctl &= ~PCI_MSI_FLAGS_QSIZE;
> >       msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple);
> > @@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg
> >
> >       if (desc->pci.msi_attrib.is_virtual)
> >               return;
> > +     if (pci_lu_adopt(to_pci_dev(desc->dev)))
> > +             return;
>
> So you don't allow the new kernel to write the MSI message, but the
> interrupt subsystem has this new message and there are places which
> utilize that cached message. How is this supposed to work?

We don't allow the PCI subsystem or driver to write the MSI message
before FINISH.
There are two possible ways. 1) Have someone save the incoming MSI
message somehow, and re-deliver them after the FINISH call. 2) Don't
save the MSI message between FREEZE and FINISH. At finish, deliver one
spurious interrupt to the device driver, so the device driver can have
a chance to check if there is any pending work that needs to be done.
It is possible that no MSI has been dropped, the driver finds out
there is nothing that needs to be done. We expect the driver can
tolerate such one time spurious interruptions. Because spurious
interruptions can happen for other reasons, that should be fine? Let
me know if there is a case where this kind of spurious interrupt can
cause a problem, we are very interested to know.

>
> >       /*
> >        * The specification mandates that the entry is masked
> >        * when the message is modified:
> > @@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
> >       control &= ~PCI_MSI_FLAGS_ENABLE;
> >       if (enable)
> >               control |= PCI_MSI_FLAGS_ENABLE;
> > -     pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
> > +     if (!pci_lu_adopt(dev))
> > +             pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
>
> The placement of these conditionals is arbitrary. Some are the begin of
> a function, others just block the write. Is that based on some logic or
> were the places selected by shabby AI queries?
They all can be converted to the pattern as:
if (!pci_luo_adopt(dev))
      pci_write_config_xxx().

Sometimes I choose to return early if there is multiple write but not
data stored in struct pci_dev. Mostly just try to reduce the number of
if (!pci_luo_adopt(dev)). I am not satisfied with this change yet. The
goal of this patch is to show what effect needs to happen, we can
discuss better ways to do it.

>
> >  static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
> > @@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
> >  {
> >       u16 ctrl;
> >
> > +     BUG_ON(pci_lu_adopt(dev));
>
> Not going to happen. BUG() is only appropriate when there is absolutely
> no way to handle a situation. This is as undocumented as everything else
> here.

Agree. This is some developing/debug stuff left over. I haven't
encountered msix_clear_and_set_ctrl() in my test. I will remove the
bug in the next version.

>
> >       pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
> >       ctrl &= ~clear;
> >       ctrl |= set;
> > @@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
> >        * registers can be accessed.  Mask all the vectors to prevent
> >        * interrupts coming in before they're fully set up.
> >        */
> > -     pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
> > -                                 PCI_MSIX_FLAGS_ENABLE);
> > +     if (!pci_lu_adopt(dev))
> > +             pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
> > +                                         PCI_MSIX_FLAGS_ENABLE);
>
> And for enhanced annoyance you sprinkle this condition everywhere into
> the code and then BUG() when you missed an instance. Because putting it
> into the function which is invoked a gazillion of times would be too
> obvious, right? That would at least be tasteful, but that's not the
> primary problem of all this.
>
> Sprinkling these conditionals all over the place is absolutely
> unmaintainable, error prone and burdens everyone with this insanity and
> the related hard to chase bugs.

If you prefer, I can move them all into the pci_config_write. We
actually start with pci_config_write_xxx(). But that solution has its
own problem as well.  For starters, the function name does not reflect
what the function actually does any more. Also for the complicated
case, where liveupdate does need to write some config register but not
the other. e.g. From the live update point of view, PF devices
shouldn't write to SR-IOV related registers that change the VF devices
number. But PF devices should be able to tolerate some other config
space write, because the VM is not using the PF device. The PF device
state can be changed without impacting the VM.
It is going to be unmaintainable to make a complicated logic inside
pci_config_write_xxx(), depending on which caller and what state, what
is allowed and what is not.

I can discuss and try different approaches to address this problem. I
understand it is a hard problem. I don't have a perfect solution
without cons. This is just the first baby step to demonstrate what is
the resulting effect we want. Then we can shape the code to our
liking. I am happy to explore other approaches as well.

>
> Especially as there is no concept behind this and zero documentation how
> any of this should work or even be remotely correct.

I hope the above description can help you understand better why we
want to do it and the approach we take. I am happy to answer questions
if you have any. Mind you that I don't have all the answers. It is
part of the journey to find the best solution.
>
> Before you start the next hackery, please sit down and write up coherent
> explanations:
>
>   What is the general concept of this?

See above.

>
>   What is the exact state in which a device is left when the old kernel
>   jumps into the new kernel?

The device allows DMA to the mapping region during PREPARE and raise
interrupt. The interrupt handler will not be able to run during kexec
black out period (between freeze and finish). Other than the state
store in the device, there is also a PCI subsystem and device driver
state serialized in the preserved folio for the next kernel to
interrupt.

>   What is the state of the MSI[-X] or legacy PCI interrupts at this
>   point?

The current approach is that, just drop the interrupt during black out
period (between freeze and finish) then deliver a spurious interrupt
to the device at finish(), that gives the device driver a chance to
perform the interrupt handler action which can't happen in black out.

>
>   Can the device raise interrupts during the transition from the old to
>   the new kernel?

Yes, can raise interrupt but interrupt handle won't able to run during
black out.
After finish() it is business as usual.
>
>   How is the "live" state of the device reflected and restored
>   throughout the interrupt subsystem?

Those are very good questions. Current approach just drop them and use
the spurious interrupt to catch up in the end.
>
>   How is the device driver supposed to attach to the same interrupt
>   state as before?

We can't if we did not save the interrupt state changed during black
out. Current approach is just using a spurious interrupt to catch up
in finish().

>
>   How are the potentially different Linux interrupt numbers mapped to
>   the previous state?
The IRQ number will remain the state cross kexec. However the
interrupt descriptor address might have changed in the new kernel. We
need to save some of the interrupt descriptor and interrupt state into
the preserved folio for the next kernel to rebuild. To be continued in
the interrupt series. Not covered by this patch series yet.
>
> Before this materializes and is agreed on, this is not going anywhere.

Those are very good questions. Hopefully I have answered some of it.
Please let me know if you have more questions I can clarify.

Again this is just an RFC to show what was the resulting effect we
want to get from the PCI device livedupate. It is not complete nor
perfect. I am happy to explore different approaches.

Thanks for the questions. I still owe you a write up document for the
PCI device liveupdate. I will work on that.

Hope that helps explain some of the background and approach. It is not
a substitution of the document. I am working on that and will include
it in the next version.

Chris

Chris

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Jason Gunthorpe 2 months ago

On Tue, Jul 29, 2025 at 06:51:27PM -0700, Chris Li wrote:

> They follow a pattern that the original kernel needs to write to the
> device and change the device state. The liveupdate device needs to
> maintain the previous state not changed, therefore needs to prevent
> such write initialization in liveupdate case.

No, I fundamentally reject this position and your testing methodology.

The new kernel *should* be writing to config space and it *should* be
doing things like clearing and gaining control over MSI. It is fully
wrong to be blocking it like you are doing just to satify some
incorrect qemu based test checking for no config access.

Only some config accesse are bad. Each and every "bad" one needs to be
clearly explained *why* it is bad and only then mitigated.

Most mitigation are far harder than just if'ing around the config
write. My ATS/PASID/etc example for instance.

Jason

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Chris Li 2 months ago

On Thu, Jul 31, 2025 at 8:02 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Jul 29, 2025 at 06:51:27PM -0700, Chris Li wrote:
>
> > They follow a pattern that the original kernel needs to write to the
> > device and change the device state. The liveupdate device needs to
> > maintain the previous state not changed, therefore needs to prevent
> > such write initialization in liveupdate case.
>
> No, I fundamentally reject this position and your testing methodology.
>
> The new kernel *should* be writing to config space and it *should* be
> doing things like clearing and gaining control over MSI. It is fully
> wrong to be blocking it like you are doing just to satify some
> incorrect qemu based test checking for no config access.

First of all, let me clarify that the PCI PF and VF tests I mention in
the cover letter are run on the real data center servers, not qemu.
QEMU does not have the correct IOMMU simulation for my workstation
anyway. I do use qemu in development to quickly check if I screwed up
something badly. The real test is always on the real machine. Our
internal test dashboard has reached a high two digit number now, all
with real hardware.

With that out of the way. Let me explain why we did it the way we did.
I believe you and I eventually want the same thing, just different
ways to get there. I am also working on a series that allows fine
grain control of  PCI preservation. It allows the driver to select
exactly what needs to be preserved, rather than the current
"preserved" vs "depended" control. With the fine grain control, it can
basically do what you described, allow new kernel writes to config
space they don't want to preserve. However this RFC series is already
getting very long, that is why I did not include the fine grain
control series in this RFC. Keep in mind that this is just RFC, I want
to demonstrate the problem space, and what source code needs to be
modified in order to preserve all config space. It is not the final
version that gets merged. Your feedback is important to us.

My philosophy is that the LUO PCI subsystem is for service of the PCI
device driver. Ultimately it is the PCI device driver who decides what
part of the config space they want to preserve or overwrite. The PCI
layer is just there to facilitate that service.

Regarding the testing. There are many different tests we can write and
run. Preserving all config space is just one of them.  We also have
other tests that partially preserve the config space and write to some
config as it needs to. That is why I need to have the fine grain
control series.

If you still think it is unjustifiable to have one test try to
preserve all config space for liveupdate. Please elaborate your
reasoning. I am very curious.
With the fine grained control we let the driver decide what the driver
wants to preserve vs not, will that remove your objection?

> Only some config accesse are bad. Each and every "bad" one needs to be
> clearly explained *why* it is bad and only then mitigated.

That is exactly the reason why we have the conservative test that
preserves every config space test as a starting point. It does not
mean that is the ending point.  We also have tests that only partially
preserve the config space driver actually needs. When things break, we
can quickly compare to find out not preserving which register will
break which device. This incremental approach is very effective to
deal with very complex devices.

Another constraint is that the data center servers are dependent on
the network device able to connect to the network appropriately. Take
diorite NIC  for example, if I try only preserving ATS/PASID did not
finish the rest of liveupdate, the nic wasn't able to boot up and
connect to the network all the way. Even if the test passes for the
ATS part, the over test fails because the server is not back online. I
can't include that test into the test dashboard, because it brings
down the server. The only way to recover from that is rebooting the
server, which takes a long time for a big server. I can only keep that
non-passing test as my own private developing test, not the regression
test set.

That is the reason we to have some conservative tests passing first,
then expand to the more risky tests. We are actually quickly expanding
our test metrics for doing more and more interesting(and risky) stuff.

I hope that clarifies the eventual end goal and the development
approach we take.

> Most mitigation are far harder than just if'ing around the config
> write. My ATS/PASID/etc example for instance.

Exactly why we can't add those risky(non working) tests into the
dashboard before the conservative passing one.

Chris

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Jason Gunthorpe 2 months ago

On Fri, Aug 01, 2025 at 04:04:39PM -0700, Chris Li wrote:
> My philosophy is that the LUO PCI subsystem is for service of the PCI
> device driver. Ultimately it is the PCI device driver who decides what
> part of the config space they want to preserve or overwrite. The PCI
> layer is just there to facilitate that service.

I don't think this makes any sense at all. There is nothing the device
driver can contribute here.

> If you still think it is unjustifiable to have one test try to
> preserve all config space for liveupdate. 

I do think it is unjustifiable, it is architecurally wrong. You only
should be preserving the absolute bare minimum of config space bits
and everything else should be rewritten by the next kernel in the
normal way. This MSI is a prime example of a nonsensical outcome if
you take the position the config space should not be written to.

> > Only some config accesse are bad. Each and every "bad" one needs to be
> > clearly explained *why* it is bad and only then mitigated.
> 
> That is exactly the reason why we have the conservative test that
> preserves every config space test as a starting point. 

That is completely the opposite of what I said. Preserving everything
is giving up on the harder job of identifying which bits cannot be
changed, explaining why they can't be changed, and then mitigating
only those things.

> Another constraint is that the data center servers are dependent on
> the network device able to connect to the network appropriately. Take
> diorite NIC  for example, if I try only preserving ATS/PASID did not
> finish the rest of liveupdate, the nic wasn't able to boot up and
> connect to the network all the way. Even if the test passes for the
> ATS part, the over test fails because the server is not back online. I
> can't include that test into the test dashboard, because it brings
> down the server. The only way to recover from that is rebooting the
> server, which takes a long time for a big server. I can only keep that
> non-passing test as my own private developing test, not the regression
> test set.

I have no idea what this is trying to say and it sounds like you also
can't explain exactly what is "wrong" and justify why things are being
preserved.

Again, your series should be starting simpler. Perserve the dumbest
simplest PCI configuration. Certainly no switches, P2P, ATS or
PASID. When that is working you can then add on more complex PCI
features piece by piece.

Jason

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Chris Li 1 month, 4 weeks ago

Hi Jason,

Thanks for your feedback.

On Sat, Aug 2, 2025 at 6:50 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, Aug 01, 2025 at 04:04:39PM -0700, Chris Li wrote:
> > My philosophy is that the LUO PCI subsystem is for service of the PCI
> > device driver. Ultimately it is the PCI device driver who decides what
> > part of the config space they want to preserve or overwrite. The PCI
> > layer is just there to facilitate that service.
>
> I don't think this makes any sense at all. There is nothing the device
> driver can contribute here.

I am considering that the device driver owner will know a lot more
device internal knowledge, e.g. why it needs to reserve this and that
register where the PCI layer might not know much about the internal
device behavior.

> > If you still think it is unjustifiable to have one test try to
> > preserve all config space for liveupdate.
>
> I do think it is unjustifiable, it is architecurally wrong. You only
> should be preserving the absolute bare minimum of config space bits
> and everything else should be rewritten by the next kernel in the
> normal way. This MSI is a prime example of a nonsensical outcome if
> you take the position the config space should not be written to.

OK. Let me rework the V2 with your approach.

>
> > > Only some config accesse are bad. Each and every "bad" one needs to be
> > > clearly explained *why* it is bad and only then mitigated.
> >
> > That is exactly the reason why we have the conservative test that
> > preserves every config space test as a starting point.
>
> That is completely the opposite of what I said. Preserving everything
> is giving up on the harder job of identifying which bits cannot be
> changed, explaining why they can't be changed, and then mitigating
> only those things.

We can still preserve every thing then work backwards to preserve
less.  As I said, I will rework V2 with your approach preserving bare
minimum as the starting place.

> > Another constraint is that the data center servers are dependent on
> > the network device able to connect to the network appropriately. Take
> > diorite NIC  for example, if I try only preserving ATS/PASID did not
> > finish the rest of liveupdate, the nic wasn't able to boot up and
> > connect to the network all the way. Even if the test passes for the
> > ATS part, the over test fails because the server is not back online. I
> > can't include that test into the test dashboard, because it brings
> > down the server. The only way to recover from that is rebooting the
> > server, which takes a long time for a big server. I can only keep that
> > non-passing test as my own private developing test, not the regression
> > test set.
>
> I have no idea what this is trying to say and it sounds like you also
> can't explain exactly what is "wrong" and justify why things are being
> preserved.

I know what register is causing the trouble but I think we are under a
different philosophy of addressing the problem from different ends.
Another consideration is the device testing matrixs. The kexec with
device liveupdate is a rare event. With that many device state
re-initializing might trigger some very rare bug in the device or
firmware. So it might be due to the device internal implementation,
even though PCI spec might say otherwise or undefined.

Anyway, let me do it your way in V2 then.

> Again, your series should be starting simpler. Perserve the dumbest
> simplest PCI configuration. Certainly no switches, P2P, ATS or
> PASID. When that is working you can then add on more complex PCI
> features piece by piece.

With the V1 the patch series deliverable is having an Intel diorite
NVMe device preserve every config space access and pass to the vfio
and iommu people to build the vfio and iommu on top of it. Let's
forget about V1.

With V2 I want to start with the minimal end. No switches,P2P, ATS or
PASID. I need some help to define what is deliverable in such a
minimal preserve. e.g. Do I be able to read back the config value not
changed then call it a day. Or do I expect to see the device fully
initialized, it is able to be used by the user space. Will the device
need to perform any DMA? Interrupt?

I will probably find a device as simple as possible and it is attached
to the root PCI host bridge, not the PCI-PCI bridge.
Maybe no interrupt as the first step. One possibility is using the
Intel DSA device that does the DMA streaming.

If you have any other feedback on the candidate device and deliverable
test for V2, I am looking forward to it.

Thanks.

Chris

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Jason Gunthorpe 2 months, 1 week ago

On Mon, Jul 28, 2025 at 07:23:03PM +0200, Thomas Gleixner wrote:
> On Mon, Jul 28 2025 at 01:24, Chris Li wrote:
> > The liveupdate devices are already initialized by the kernel before the
> > kexec. During the kexec the device is still running. Avoid write to the
> > liveupdate devices during the new kernel boot up.
> 
> This change log is way too meager for this kind of change.
> 
>  1) You want to explain in detail how this works.
> 
>     "initialized by the kernel before the kexec" is as vague as it gets.
> 
>  2) Avoid write ....
> 
>     Again this lacks any information how this is supposed to work correctly.
> 
> >  drivers/pci/ats.c            |  7 ++--
> >  drivers/pci/iov.c            | 58 ++++++++++++++++++------------
> >  drivers/pci/msi/msi.c        | 32 ++++++++++++-----
> >  drivers/pci/msi/pcidev_msi.c |  4 +--
> >  drivers/pci/pci-acpi.c       |  3 ++
> >  drivers/pci/pci.c            | 85 +++++++++++++++++++++++++++++---------------
> >  drivers/pci/pci.h            |  9 ++++-
> >  drivers/pci/pcie/aspm.c      |  7 ++--
> >  drivers/pci/pcie/pme.c       | 11 ++++--
> >  drivers/pci/probe.c          | 43 +++++++++++++++-------
> >  drivers/pci/setup-bus.c      | 10 +++++-
> 
> Then you sprinkle this stuff into files, which have completely different
> purposes, without any explanation for the particular instances why they
> are supposed to be correct and how this works.

Yeah, everyting needs to be very carefully explained.

For instance I'm not sure we should be doing *anything* to the
MSI. Why did you think so?

MSI should be fully cleared by the new kernel and the new VFIO should
re-establish all the MSI routing from scratch as part of adopting the
device. We already accept that any interrupts are lost during the
kexec process so what reason is there to do anything except start up the
new kernel with a fully disabled MSI and cleared MSI?

If otherwise it should be explained why we can't work this way - and
then explain how the new kernel will adopt the inherited operating MSI
(hint: I doubt it can) without disrupting it.

Same remark for everything. Explain in the commits and perhaps a well
placed comment why anything needs to be done and why exactly we can't
use the cold boot flow for each item.

eg "we can't use the cold boot flow for BAR sizing because BAR sizing
requires changing the BAR register and that will break ongoing P2P
DMAs"

"we can't use the cold boot flow for bridge windows because changing
the bridge windows in any way will break ongoing P2P DMAs" (though you
also need to explain why the cold boot flow would change the bridge
windows)

etc etc.

There is also some complication here as the iommu driver technically
owns some of the PCI state, and we really don't want the PCI Core to
change it, but we do need theiommu driver to affirm what the in-use
state should be because it is responsible to clean it up.

This may actually require some restructing of the iommu driver/pci
core interfaces to switch from an enable/disbale language to a 'target
state' language. Ie "ATS shall be on and ATS page size shall be X".

This series is very big, so I would probably try to break it up into
smaller chunks. Like you don't need to preserve bridge windows and
BARs if you don't support P2P. You don't need to worry about ATS and
PASID if you don't support those, etc, etc.

Yes, in the end all needs to be supported, but going bit by bit will
be easier for people to understand. Basic VFIO support with a basic
IOMMU using basic PCI with no P2P is the simplest thing you can do,
and I think it needs surprisingly little preservation.

Jason

Re: [PATCH RFC 20/25] PCI/LUO: Avoid write to liveupdate devices at boot

Posted by Chris Li 2 months, 1 week ago

On Mon, Jul 28, 2025 at 4:50 PM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > Then you sprinkle this stuff into files, which have completely different
> > purposes, without any explanation for the particular instances why they
> > are supposed to be correct and how this works.
>
> Yeah, everyting needs to be very carefully explained.

Agree. I did some explanation in my last email reply to Thomas. Will
add a document for the next version.

>
> For instance I'm not sure we should be doing *anything* to the
> MSI. Why did you think so?
>
> MSI should be fully cleared by the new kernel and the new VFIO should
> re-establish all the MSI routing from scratch as part of adopting the
> device. We already accept that any interrupts are lost during the
> kexec process so what reason is there to do anything except start up the
> new kernel with a fully disabled MSI and cleared MSI?

The current approach is that we fake/inject a spurious interrupt to
the device to allow the device driver to have a chance to process any
pending action for the interrupt. There is also a possibility there is
nothing the device driver needs to do due to no interrupt having ever
triggered in the kexec window.  We expect the driver can tolerate that
spurious interrupt.

The alternative is to try to (partially) process the interrupt during
kexec. e.g. remember which IRQ has the interrupt triggered. It will
make things much more complicated. Invoke interrupt handler in the
early boot stage before IOMMU is very tricky.
>
> If otherwise it should be explained why we can't work this way - and
> then explain how the new kernel will adopt the inherited operating MSI
> (hint: I doubt it can) without disrupting it.

Agree.

>
> Same remark for everything. Explain in the commits and perhaps a well
> placed comment why anything needs to be done and why exactly we can't
> use the cold boot flow for each item.

We certainly can do that.

I am trying to see if we can agree on the VFIO_PCI device used by the
VM. We don't want any config space register to change during the
liveupdate kexec (before finish). We can certainly change what config
space register might or might not break stuff. But it is going to be
very hard to test and verify what can break if we change this.

If we can draw a line and say, there is no config space to write to
the device between freeze and finish. It is much easier to reason from
the device point of view, the device should continue working. The
device has no way of knowing the host kernel has been changed. The
device has only a limited view of their config space, the DMA area it
can read/write to. If we preserve enough stuff, the device should
continue working. For most of the devices, we can reason with the
model that keeping the status quo will not break things.

There is an obvious exception to that, e.g. if the device has a
watchdog timer it needs to kick at regular intervals, if that interval
is shorter than the kexec cycle. It should be pretty rare and we can
deal with those when we actually encounter one.

>
> eg "we can't use the cold boot flow for BAR sizing because BAR sizing
> requires changing the BAR register and that will break ongoing P2P
> DMAs"
>
> "we can't use the cold boot flow for bridge windows because changing
> the bridge windows in any way will break ongoing P2P DMAs" (though you
> also need to explain why the cold boot flow would change the bridge
> windows)
>
> etc etc.

There will be some config space register hard to make sure changing it
will break things or not.
e.g. The base BAR register, if we change to a new memory region, and
all follow up write to the device using a BAR new address, should
things continue working? Will have a lot of corner case like this, it
is much easier to just avoid changing anything to make things
consistent.

>
> There is also some complication here as the iommu driver technically
> owns some of the PCI state, and we really don't want the PCI Core to
> change it, but we do need theiommu driver to affirm what the in-use
> state should be because it is responsible to clean it up.

Yes, there is overlap between PCI and IOMMU, more than just config
space write. The IOMMU needs to know which PCI device participates,
which set of groups it needs to save. CC Samiullah here, he knows more
about the IOMMU side of the liveupdate than I do.

> This may actually require some restructing of the iommu driver/pci
> core interfaces to switch from an enable/disbale language to a 'target
> state' language. Ie "ATS shall be on and ATS page size shall be X".
>
Ack.

I have some ideas to make the PCI initialization cleaner for this
usage as well. Instead of directly initiating and turning on features
if found. We can do in 3 stages:
1) enumerate PCI capability and get the list of capability available
but don't turn them on yet.
2) determine what capability needs to be turned on/off. For the normal
initiation without liveupdate, the current behavior mostly turns on
whatever can be turned on. For liveupdate devices, it would be
inherent the on/off from what the previous kernel hands off to the new
kernel. By either 1) reading the device state (assume reading state is
possible and does not change device state) or 2) previous kernel save
state into preserved folio and new kernel reads the state from
preserved folio.
3) Perform the action to turn on/off the according the result from 2).
For live update devices the most common case is skip write, that will
be noop. For normal initialization without liveupdate, it will turn on
the capability.

> This series is very big, so I would probably try to break it up into
> smaller chunks. Like you don't need to preserve bridge windows and
> BARs if you don't support P2P. You don't need to worry about ATS and
> PASID if you don't support those, etc, etc.

Yes, I can break it to smaller chunks.

One of the deliverables of this patch series is that I can test the
liveupdate with the pci-lu-stub and pci-lub-stub-pf driver. Having
additional patch to verify no PCI config space write has performed on
the requested PCI device during shutdown and kexec boot up.

> Yes, in the end all needs to be supported, but going bit by bit will
> be easier for people to understand. Basic VFIO support with a basic
> IOMMU using basic PCI with no P2P is the simplest thing you can do,
> and I think it needs surprisingly little preservation.

Yes, that is certainly possible ;-)

Because I am working on the PCI side of the liveupdate, there are
other developers working on VFIO and IOMMU depending on my PCI
changes. From the project development point of view the PCI change
needs to happen first, to unblock others. That is how I get here.

I can certainly break it down to smaller chunks.

Chris