The liveupdate devices are already initialized by the kernel before the
kexec. During the kexec the device is still running. Avoid write to the
liveupdate devices during the new kernel boot up.
Signed-off-by: Chris Li <chrisl@kernel.org>
---
drivers/pci/ats.c | 7 ++--
drivers/pci/iov.c | 58 ++++++++++++++++++------------
drivers/pci/msi/msi.c | 32 ++++++++++++-----
drivers/pci/msi/pcidev_msi.c | 4 +--
drivers/pci/pci-acpi.c | 3 ++
drivers/pci/pci.c | 85 +++++++++++++++++++++++++++++---------------
drivers/pci/pci.h | 9 ++++-
drivers/pci/pcie/aspm.c | 7 ++--
drivers/pci/pcie/pme.c | 11 ++++--
drivers/pci/probe.c | 43 +++++++++++++++-------
drivers/pci/setup-bus.c | 10 +++++-
11 files changed, 184 insertions(+), 85 deletions(-)
diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c
index ec6c8dbdc5e9c9959e822e016ab301bf483713a5..284f43c82593903058dee58ce64b82bad8aed710 100644
--- a/drivers/pci/ats.c
+++ b/drivers/pci/ats.c
@@ -75,7 +75,9 @@ int pci_prepare_ats(struct pci_dev *dev, int ps)
dev->ats_stu = ps;
ctrl = PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
- pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
+
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
return 0;
}
EXPORT_SYMBOL_GPL(pci_prepare_ats);
@@ -114,7 +116,8 @@ int pci_enable_ats(struct pci_dev *dev, int ps)
dev->ats_stu = ps;
ctrl |= PCI_ATS_CTRL_STU(dev->ats_stu - PCI_ATS_MIN_STU);
}
- pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->ats_cap + PCI_ATS_CTRL, ctrl);
dev->ats_enabled = 1;
return 0;
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 10693b5d7eb66bbbfb9b70ffe6e89eb89c8dc3a3..df27bcf840d9fc0dbce29810e288c1c2b74a70c9 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -85,7 +85,8 @@ static inline void pci_iov_set_numvfs(struct pci_dev *dev, int nr_virtfn)
{
struct pci_sriov *iov = dev->sriov;
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_NUM_VF, nr_virtfn);
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_OFFSET, &iov->offset);
pci_read_config_word(dev, iov->pos + PCI_SRIOV_VF_STRIDE, &iov->stride);
}
@@ -694,10 +695,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
pci_iov_set_numvfs(dev, nr_virtfn);
iov->ctrl |= PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE;
- pci_cfg_access_lock(dev);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
- msleep(100);
- pci_cfg_access_unlock(dev);
+ if (!pci_lu_adopt(dev)) {
+ pci_cfg_access_lock(dev);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ msleep(100);
+ pci_cfg_access_unlock(dev);
+ }
rc = sriov_add_vfs(dev, initial);
if (rc)
@@ -710,10 +713,12 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
err_pcibios:
iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
- pci_cfg_access_lock(dev);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
- ssleep(1);
- pci_cfg_access_unlock(dev);
+ if (!pci_lu_adopt(dev)) {
+ pci_cfg_access_lock(dev);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ ssleep(1);
+ pci_cfg_access_unlock(dev);
+ }
pcibios_sriov_disable(dev);
@@ -741,11 +746,13 @@ static void sriov_disable(struct pci_dev *dev)
return;
sriov_del_vfs(dev);
- iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
- pci_cfg_access_lock(dev);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
- ssleep(1);
- pci_cfg_access_unlock(dev);
+ if (!pci_lu_adopt(dev)) {
+ iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
+ pci_cfg_access_lock(dev);
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ ssleep(1);
+ pci_cfg_access_unlock(dev);
+ }
pcibios_sriov_disable(dev);
@@ -770,7 +777,7 @@ static int sriov_init(struct pci_dev *dev, int pos)
u32 sriovbars[PCI_SRIOV_NUM_BARS];
pci_read_config_word(dev, pos + PCI_SRIOV_CTRL, &ctrl);
- if (ctrl & PCI_SRIOV_CTRL_VFE) {
+ if (!pci_lu_adopt(dev) && ctrl & PCI_SRIOV_CTRL_VFE) {
pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, 0);
ssleep(1);
}
@@ -785,7 +792,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
ctrl |= PCI_SRIOV_CTRL_ARI;
found:
- pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, pos + PCI_SRIOV_CTRL, ctrl);
pci_read_config_word(dev, pos + PCI_SRIOV_TOTAL_VF, &total);
if (!total)
@@ -798,7 +806,8 @@ static int sriov_init(struct pci_dev *dev, int pos)
return -EIO;
pgsz &= ~(pgsz - 1);
- pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_dword(dev, pos + PCI_SRIOV_SYS_PGSIZE, pgsz);
iov = kzalloc(sizeof(*iov), GFP_KERNEL);
if (!iov)
@@ -904,14 +913,17 @@ static void sriov_restore_state(struct pci_dev *dev)
*/
ctrl &= ~PCI_SRIOV_CTRL_ARI;
ctrl |= iov->ctrl & PCI_SRIOV_CTRL_ARI;
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, ctrl);
for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
pci_update_resource(dev, i + PCI_IOV_RESOURCES);
- pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_dword(dev, iov->pos + PCI_SRIOV_SYS_PGSIZE, iov->pgsz);
pci_iov_set_numvfs(dev, iov->num_VFs);
- pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
if (iov->ctrl & PCI_SRIOV_CTRL_VFE)
msleep(100);
}
@@ -1013,10 +1025,12 @@ void pci_iov_update_resource(struct pci_dev *dev, int resno)
new |= res->flags & ~PCI_BASE_ADDRESS_MEM_MASK;
reg = iov->pos + PCI_SRIOV_BAR + 4 * vf_bar;
- pci_write_config_dword(dev, reg, new);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_dword(dev, reg, new);
if (res->flags & IORESOURCE_MEM_64) {
new = region.start >> 16 >> 16;
- pci_write_config_dword(dev, reg + 4, new);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_dword(dev, reg + 4, new);
}
}
diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c
index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644
--- a/drivers/pci/msi/msi.c
+++ b/drivers/pci/msi/msi.c
@@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev)
void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
{
- raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock;
+ struct pci_dev *pci_dev = to_pci_dev(desc->dev);
+ raw_spinlock_t *lock = &pci_dev->msi_lock;
unsigned long flags;
if (!desc->pci.msi_attrib.can_mask)
@@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set)
raw_spin_lock_irqsave(lock, flags);
desc->pci.msi_mask &= ~clear;
desc->pci.msi_mask |= set;
- pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos,
- desc->pci.msi_mask);
+ if (!pci_lu_adopt(pci_dev))
+ pci_write_config_dword(pci_dev, desc->pci.mask_pos,
+ desc->pci.msi_mask);
raw_spin_unlock_irqrestore(lock, flags);
}
@@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc,
int pos = dev->msi_cap;
u16 msgctl;
+ if (pci_lu_adopt(dev))
+ return;
+
pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
msgctl &= ~PCI_MSI_FLAGS_QSIZE;
msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple);
@@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg
if (desc->pci.msi_attrib.is_virtual)
return;
+ if (pci_lu_adopt(to_pci_dev(desc->dev)))
+ return;
/*
* The specification mandates that the entry is masked
* when the message is modified:
@@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable)
control &= ~PCI_MSI_FLAGS_ENABLE;
if (enable)
control |= PCI_MSI_FLAGS_ENABLE;
- pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control);
}
static int msi_setup_msi_desc(struct pci_dev *dev, int nvec,
@@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set)
{
u16 ctrl;
+ BUG_ON(pci_lu_adopt(dev));
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
ctrl &= ~clear;
ctrl |= set;
@@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
* registers can be accessed. Mask all the vectors to prevent
* interrupts coming in before they're fully set up.
*/
- pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
- PCI_MSIX_FLAGS_ENABLE);
+ if (!pci_lu_adopt(dev))
+ pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL |
+ PCI_MSIX_FLAGS_ENABLE);
/* Mark it enabled so setup functions can query it */
dev->msix_enabled = 1;
@@ -753,14 +763,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
*/
msix_mask_all(dev->msix_base, tsize);
}
- pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
+ if (!pci_lu_adopt(dev))
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL, 0);
pcibios_free_irq(dev);
return 0;
out_disable:
dev->msix_enabled = 0;
- pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0);
+ if (!pci_lu_adopt(dev))
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE, 0);
return ret;
}
@@ -864,6 +876,7 @@ void __pci_restore_msix_state(struct pci_dev *dev)
if (!dev->msix_enabled)
return;
+ BUG_ON(pci_lu_adopt(dev));
/* route the table */
pci_intx_for_msi(dev, 0);
pci_msix_clear_and_set_ctrl(dev, 0,
@@ -898,7 +911,8 @@ void pci_msix_shutdown(struct pci_dev *dev)
msi_for_each_desc(desc, &dev->dev, MSI_DESC_ALL)
pci_msix_mask(desc);
- pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
+ if (!pci_lu_adopt(dev))
+ pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0);
pci_intx_for_msi(dev, 1);
dev->msix_enabled = 0;
pcibios_alloc_irq(dev);
diff --git a/drivers/pci/msi/pcidev_msi.c b/drivers/pci/msi/pcidev_msi.c
index 5520aff53b5670e70311c63f0f358228bf03c309..f9f682a84a05ef47ff4d85e7d0e724cc7c2f5cdc 100644
--- a/drivers/pci/msi/pcidev_msi.c
+++ b/drivers/pci/msi/pcidev_msi.c
@@ -18,7 +18,7 @@ void pci_msi_init(struct pci_dev *dev)
return;
pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &ctrl);
- if (ctrl & PCI_MSI_FLAGS_ENABLE) {
+ if (!pci_lu_adopt(dev) && ctrl & PCI_MSI_FLAGS_ENABLE) {
pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS,
ctrl & ~PCI_MSI_FLAGS_ENABLE);
}
@@ -36,7 +36,7 @@ void pci_msix_init(struct pci_dev *dev)
return;
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl);
- if (ctrl & PCI_MSIX_FLAGS_ENABLE) {
+ if (!pci_lu_adopt(dev) && ctrl & PCI_MSIX_FLAGS_ENABLE) {
pci_write_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS,
ctrl & ~PCI_MSIX_FLAGS_ENABLE);
}
diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index af370628e58393aa0cbdf6d283b3afe33e5effb5..b9e42a1352c87443dd5c4ee9f03bc8a0d343d714 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -172,6 +172,9 @@ static void program_hpx_type0(struct pci_dev *dev, struct hpx_type0 *hpx)
hpx = &pci_default_type0;
}
+ if (pci_lu_adopt(dev))
+ return;
+
pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE, hpx->cache_line_size);
pci_write_config_byte(dev, PCI_LATENCY_TIMER, hpx->latency_timer);
pci_read_config_word(dev, PCI_COMMAND, &pci_cmd);
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 46fb80dbca590c251fcad3bf2f011a16f6898810..c1cc723f979ae881cf07ad06e1fa0d472e8b89c6 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -218,7 +218,7 @@ int pci_status_get_and_clear_errors(struct pci_dev *pdev)
return -EIO;
status &= PCI_STATUS_ERROR_BITS;
- if (status)
+ if (status && !pci_lu_adopt(pdev))
pci_write_config_word(pdev, PCI_STATUS, status);
return status;
@@ -628,7 +628,7 @@ u64 pci_get_dsn(struct pci_dev *dev)
int pos;
pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DSN);
- if (!pos)
+ if (!pos && !pci_lu_adopt(dev))
return 0;
/*
@@ -1103,7 +1103,8 @@ static void pci_enable_acs(struct pci_dev *dev)
~(PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC));
__pci_config_acs(dev, &caps, config_acs_param, 0, 0);
- pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl);
}
/**
@@ -1394,7 +1395,8 @@ int pci_power_up(struct pci_dev *dev)
* Force the entire word to 0. This doesn't affect PME_Status, disables
* PME_En, and sets PowerState to 0.
*/
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, 0);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, 0);
/* Mandatory transition delays; see PCI PM 1.2. */
if (state == PCI_D3hot)
@@ -1552,7 +1554,8 @@ static int pci_set_low_power_state(struct pci_dev *dev, pci_power_t state, bool
pmcsr |= state;
/* Enter specified state */
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
/* Mandatory power management transition delays; see PCI PM 1.2. */
if (state == PCI_D3hot)
@@ -1781,7 +1784,8 @@ static void pci_restore_pcix_state(struct pci_dev *dev)
return;
cap = (u16 *)&save_state->cap.data[0];
- pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, pos + PCI_X_CMD, cap[i++]);
}
/**
@@ -2090,7 +2094,7 @@ static int do_pci_enable_device(struct pci_dev *dev, int bars)
pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
if (pin) {
pci_read_config_word(dev, PCI_COMMAND, &cmd);
- if (cmd & PCI_COMMAND_INTX_DISABLE)
+ if (!pci_lu_adopt(dev) && cmd & PCI_COMMAND_INTX_DISABLE)
pci_write_config_word(dev, PCI_COMMAND,
cmd & ~PCI_COMMAND_INTX_DISABLE);
}
@@ -2248,7 +2252,8 @@ static void do_pci_disable_device(struct pci_dev *dev)
pci_read_config_word(dev, PCI_COMMAND, &pci_command);
if (pci_command & PCI_COMMAND_MASTER) {
pci_command &= ~PCI_COMMAND_MASTER;
- pci_write_config_word(dev, PCI_COMMAND, pci_command);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, pci_command);
}
pcibios_disable_device(dev);
@@ -2369,7 +2374,8 @@ bool pci_check_pme_status(struct pci_dev *dev)
ret = true;
}
- pci_write_config_word(dev, pmcsr_pos, pmcsr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, pmcsr_pos, pmcsr);
return ret;
}
@@ -2484,7 +2490,8 @@ static void __pci_pme_active(struct pci_dev *dev, bool enable)
if (!enable)
pmcsr &= ~PCI_PM_CTRL_PME_ENABLE;
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
}
/**
@@ -2506,7 +2513,8 @@ void pci_pme_restore(struct pci_dev *dev)
pmcsr &= ~PCI_PM_CTRL_PME_ENABLE;
pmcsr |= PCI_PM_CTRL_PME_STATUS;
}
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, pmcsr);
}
/**
@@ -3587,12 +3595,14 @@ void pci_configure_ari(struct pci_dev *dev)
return;
if (pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ARI)) {
- pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
- PCI_EXP_DEVCTL2_ARI);
+ if (!pci_lu_adopt(dev))
+ pcie_capability_set_word(bridge, PCI_EXP_DEVCTL2,
+ PCI_EXP_DEVCTL2_ARI);
bridge->ari_enabled = 1;
} else {
- pcie_capability_clear_word(bridge, PCI_EXP_DEVCTL2,
- PCI_EXP_DEVCTL2_ARI);
+ if (!pci_lu_adopt(dev))
+ pcie_capability_clear_word(bridge, PCI_EXP_DEVCTL2,
+ PCI_EXP_DEVCTL2_ARI);
bridge->ari_enabled = 0;
}
}
@@ -4286,7 +4296,8 @@ static void __pci_set_master(struct pci_dev *dev, bool enable)
if (cmd != old_cmd) {
pci_dbg(dev, "%s bus mastering\n",
enable ? "enabling" : "disabling");
- pci_write_config_word(dev, PCI_COMMAND, cmd);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
}
dev->is_busmaster = enable;
}
@@ -4416,7 +4427,8 @@ int pci_set_mwi(struct pci_dev *dev)
if (!(cmd & PCI_COMMAND_INVALIDATE)) {
pci_dbg(dev, "enabling Mem-Wr-Inval\n");
cmd |= PCI_COMMAND_INVALIDATE;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
}
return 0;
#endif
@@ -4456,7 +4468,8 @@ void pci_clear_mwi(struct pci_dev *dev)
pci_read_config_word(dev, PCI_COMMAND, &cmd);
if (cmd & PCI_COMMAND_INVALIDATE) {
cmd &= ~PCI_COMMAND_INVALIDATE;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
}
#endif
}
@@ -4475,7 +4488,8 @@ void pci_disable_parity(struct pci_dev *dev)
pci_read_config_word(dev, PCI_COMMAND, &cmd);
if (cmd & PCI_COMMAND_PARITY) {
cmd &= ~PCI_COMMAND_PARITY;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
}
}
@@ -4500,7 +4514,8 @@ void pci_intx(struct pci_dev *pdev, int enable)
if (new == pci_command)
return;
- pci_write_config_word(pdev, PCI_COMMAND, new);
+ if (!pci_lu_adopt(pdev))
+ pci_write_config_word(pdev, PCI_COMMAND, new);
}
EXPORT_SYMBOL_GPL(pci_intx);
@@ -4648,12 +4663,14 @@ static int pci_pm_reset(struct pci_dev *dev, bool probe)
csr &= ~PCI_PM_CTRL_STATE_MASK;
csr |= PCI_D3hot;
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
pci_dev_d3_sleep(dev);
csr &= ~PCI_PM_CTRL_STATE_MASK;
csr |= PCI_D0;
- pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, dev->pm_cap + PCI_PM_CTRL, csr);
pci_dev_d3_sleep(dev);
return pci_dev_wait(dev, "PM D3hot->D0", PCIE_RESET_READY_POLL_MS);
@@ -4959,6 +4976,7 @@ void pci_reset_secondary_bus(struct pci_dev *dev)
{
u16 ctrl;
+ BUG_ON(pci_lu_adopt(dev));
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &ctrl);
ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, ctrl);
@@ -5186,7 +5204,8 @@ static void pci_dev_save_and_disable(struct pci_dev *dev)
* DMA from the device including MSI/MSI-X interrupts. For PCI 2.3
* compliant devices, INTx-disable prevents legacy interrupts.
*/
- pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
}
static void pci_dev_restore(struct pci_dev *dev)
@@ -5897,8 +5916,9 @@ int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc)
cmd &= ~PCI_X_CMD_MAX_READ;
cmd |= FIELD_PREP(PCI_X_CMD_MAX_READ, v);
- if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd))
- return -EIO;
+ if (!pci_lu_adopt(dev))
+ if (pci_write_config_word(dev, cap + PCI_X_CMD, cmd))
+ return -EIO;
}
return 0;
}
@@ -5960,6 +5980,8 @@ int pcie_set_readrq(struct pci_dev *dev, int rq)
}
}
+ if (pci_lu_adopt(dev))
+ return 0;
ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
PCI_EXP_DEVCTL_READRQ, v);
@@ -6004,6 +6026,8 @@ int pcie_set_mps(struct pci_dev *dev, int mps)
return -EINVAL;
v = FIELD_PREP(PCI_EXP_DEVCTL_PAYLOAD, v);
+ if (pci_lu_adopt(dev))
+ return 0;
ret = pcie_capability_clear_and_set_word(dev, PCI_EXP_DEVCTL,
PCI_EXP_DEVCTL_PAYLOAD, v);
@@ -6304,7 +6328,8 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
cmd |= command_bits;
else
cmd &= ~command_bits;
- pci_write_config_word(dev, PCI_COMMAND, cmd);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, cmd);
}
if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
@@ -6320,8 +6345,9 @@ int pci_set_vga_state(struct pci_dev *dev, bool decode,
cmd |= PCI_BRIDGE_CTL_VGA;
else
cmd &= ~PCI_BRIDGE_CTL_VGA;
- pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
- cmd);
+ if (!pci_lu_adopt(bridge))
+ pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
+ cmd);
}
bus = bus->parent;
}
@@ -6621,7 +6647,8 @@ void pci_reassigndev_resource_alignment(struct pci_dev *dev)
pci_read_config_word(dev, PCI_COMMAND, &command);
command &= ~PCI_COMMAND_MEMORY;
- pci_write_config_word(dev, PCI_COMMAND, command);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_COMMAND, command);
for (i = 0; i <= PCI_ROM_RESOURCE; i++)
pci_request_resource_alignment(dev, i, align, resize);
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a8acc986a5aac808ec64395d7d946ee036270f5b..bd198227ae3cf687f4ddae76c2f53125681ca91d 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -1188,11 +1188,18 @@ static inline int pci_msix_write_tph_tag(struct pci_dev *pdev, unsigned int inde
void pci_liveupdate_restore(struct pci_dev *dev);
void pci_liveupdate_override_driver(struct pci_dev *dev);
+static inline struct pci_dev_ser *pci_lu_adopt(struct pci_dev *dev)
+{
+ return dev->dev.lu.requested ? dev->dev.lu.dev_state : NULL;
+}
#else
#define PCI_SER_GET(__dev, __var, __def) __def
static inline void pci_liveupdate_restore(struct pci_dev *dev) {}
static inline void pci_liveupdate_override_driver(struct pci_dev *dev) {}
+static inline struct pci_dev_ser *pci_lu_adopt(struct pci_dev *dev)
+{
+ return NULL;
+}
#endif
-
#endif /* DRIVERS_PCI_H */
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 29fcb0689a918f9cb123691e1680de5a1af2c115..61f9a443f6ad2bad57d3fc5958e8855117f79598 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -788,7 +788,7 @@ static void aspm_l1ss_init(struct pcie_link_state *link)
aspm_calc_l12_info(link, parent_l1ss_cap, child_l1ss_cap);
}
-static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
+static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist, bool lu_restore)
{
struct pci_dev *child = link->downstream, *parent = link->pdev;
u32 parent_lnkcap, child_lnkcap;
@@ -812,7 +812,8 @@ static void pcie_aspm_cap_init(struct pcie_link_state *link, int blacklist)
return;
/* Configure common clock before checking latencies */
- pcie_aspm_configure_common_clock(link);
+ if (!lu_restore)
+ pcie_aspm_configure_common_clock(link);
/*
* Re-read upstream/downstream components' register state after
@@ -1130,7 +1131,7 @@ void pcie_aspm_init_link_state(struct pci_dev *pdev)
* upstream links also because capable state of them can be
* update through pcie_aspm_cap_init().
*/
- pcie_aspm_cap_init(link, blacklist);
+ pcie_aspm_cap_init(link, blacklist, pci_lu_adopt(pdev));
/* Setup initial Clock PM state */
pcie_clkpm_cap_init(link, blacklist);
diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c
index a2daebd9806cd7273ee331406201402a758bd7b8..da093a5ba7ee1f9d20652c71e8e78662fdab176c 100644
--- a/drivers/pci/pcie/pme.c
+++ b/drivers/pci/pcie/pme.c
@@ -53,6 +53,8 @@ struct pcie_pme_service_data {
*/
void pcie_pme_interrupt_enable(struct pci_dev *dev, bool enable)
{
+ if (pci_lu_adopt(dev))
+ return;
if (enable)
pcie_capability_set_word(dev, PCI_EXP_RTCTL,
PCI_EXP_RTCTL_PMEIE);
@@ -344,8 +346,10 @@ static int pcie_pme_probe(struct pcie_device *srv)
data->srv = srv;
set_service_data(srv, data);
- pcie_pme_interrupt_enable(port, false);
- pcie_clear_root_pme_status(port);
+ if (!pci_lu_adopt(port)) {
+ pcie_pme_interrupt_enable(port, false);
+ pcie_clear_root_pme_status(port);
+ }
ret = request_irq(srv->irq, pcie_pme_irq, IRQF_SHARED, "PCIe PME", srv);
if (ret) {
@@ -356,7 +360,8 @@ static int pcie_pme_probe(struct pcie_device *srv)
pci_info(port, "Signaling with IRQ %d\n", srv->irq);
pcie_pme_mark_devices(port);
- pcie_pme_interrupt_enable(port, true);
+ if (!pci_lu_adopt(port))
+ pcie_pme_interrupt_enable(port, true);
return 0;
}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index d8b80e1c4fb35289208d7c953fb5c1e137a5c1a8..5c30d1d52a96b17a92794756cab5db0972548267 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -358,7 +358,7 @@ static __always_inline void pci_read_bases(struct pci_dev *dev,
return;
/* No printks while decoding is disabled! */
- if (!dev->mmio_always_on) {
+ if (!pci_lu_adopt(dev) && !dev->mmio_always_on) {
pci_read_config_word(dev, PCI_COMMAND, &orig_cmd);
if (orig_cmd & PCI_COMMAND_DECODE_ENABLE) {
pci_write_config_word(dev, PCI_COMMAND,
@@ -366,11 +366,13 @@ static __always_inline void pci_read_bases(struct pci_dev *dev,
}
}
- __pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
- if (rom)
- __pci_size_rom(dev, rom, &rombar);
+ if (!pci_lu_adopt(dev)) {
+ __pci_size_stdbars(dev, howmany, PCI_BASE_ADDRESS_0, stdbars);
+ if (rom)
+ __pci_size_rom(dev, rom, &rombar);
+ }
- if (!dev->mmio_always_on &&
+ if (!pci_lu_adopt(dev) && !dev->mmio_always_on &&
(orig_cmd & PCI_COMMAND_DECODE_ENABLE))
pci_write_config_word(dev, PCI_COMMAND, orig_cmd);
@@ -1269,8 +1271,9 @@ static void pci_enable_rrs_sv(struct pci_dev *pdev)
/* Enable Configuration RRS Software Visibility if supported */
pcie_capability_read_word(pdev, PCI_EXP_RTCAP, &root_cap);
if (root_cap & PCI_EXP_RTCAP_RRS_SV) {
- pcie_capability_set_word(pdev, PCI_EXP_RTCTL,
- PCI_EXP_RTCTL_RRS_SVE);
+ if (!pci_lu_adopt(pdev))
+ pcie_capability_set_word(pdev, PCI_EXP_RTCTL,
+ PCI_EXP_RTCTL_RRS_SVE);
pdev->config_rrs_sv = 1;
}
}
@@ -1384,8 +1387,9 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
* bus errors in some architectures.
*/
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
- pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
- bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
+ bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
!is_cardbus && !broken) {
@@ -1404,6 +1408,10 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
* more than one bridge. The second case can happen with
* the i450NX chipset.
*/
+ if (pci_lu_adopt(dev)) {
+ /* Verify bus number here */
+ }
+
child = pci_find_bus(pci_domain_nr(bus), secondary);
if (!child) {
child = pci_add_new_bus(bus, dev, secondary);
@@ -1558,7 +1566,8 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,
/* Clear errors in the Secondary Status Register */
pci_write_config_word(dev, PCI_SEC_STATUS, 0xffff);
- pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
+ if (!pci_lu_adopt(dev))
+ pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
pm_runtime_put(&dev->dev);
@@ -2035,7 +2044,10 @@ int pci_setup_device(struct pci_dev *dev)
* If we are not booted from liveupdate, default
* "Unknown power state".
*/
- dev->current_state = PCI_SER_GET(dev, current_state, PCI_UNKNOWN);
+ if (pci_lu_adopt(dev))
+ dev->current_state = 0; /* FIXME */
+ else
+ dev->current_state = PCI_SER_GET(dev, current_state, PCI_UNKNOWN);
/* Early fixups, before probing the BARs */
pci_fixup_device(pci_fixup_early, dev);
@@ -2075,7 +2087,8 @@ int pci_setup_device(struct pci_dev *dev)
dev->hotplug_user_indicators);
dev->pref_window = PCI_SER_GET(dev, pref_window,
dev->pref_window);
- dev->pref_64_window = PCI_SER_GET(dev, pref_64_window,
+ if (!pci_lu_adopt(dev))
+ dev->pref_64_window = PCI_SER_GET(dev, pref_64_window,
dev->pref_64_window);
switch (dev->hdr_type) { /* header type */
@@ -2269,6 +2282,10 @@ int pci_configure_extended_tags(struct pci_dev *dev, void *ign)
if (!host)
return 0;
+
+ if (pci_lu_adopt(dev))
+ return 0;
+
/*
* If some device in the hierarchy doesn't handle Extended Tags
* correctly, make sure they're disabled.
@@ -2373,7 +2390,7 @@ static void pci_configure_serr(struct pci_dev *dev)
* endpoint unless SERR# forwarding is enabled.
*/
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &control);
- if (!(control & PCI_BRIDGE_CTL_SERR)) {
+ if (!pci_lu_adopt(dev) && !(control & PCI_BRIDGE_CTL_SERR)) {
control |= PCI_BRIDGE_CTL_SERR;
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, control);
}
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 07c3d021a47ec794aaae13e1c12a667cfb47cb45..276a62c6957218c0c89d8881b1a4d6f6d419dacf 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -706,6 +706,9 @@ static void pci_setup_bridge_io(struct pci_dev *bridge)
io_upper16 = 0;
l = 0x00f0;
}
+
+ if (pci_lu_adopt(bridge))
+ return;
/* Temporarily disable the I/O range before updating PCI_IO_BASE */
pci_write_config_dword(bridge, PCI_IO_BASE_UPPER16, 0x0000ffff);
/* Update lower 16 bits of I/O base/limit */
@@ -732,6 +735,8 @@ static void pci_setup_bridge_mmio(struct pci_dev *bridge)
} else {
l = 0x0000fff0;
}
+ if (pci_lu_adopt(bridge))
+ return;
pci_write_config_dword(bridge, PCI_MEMORY_BASE, l);
}
@@ -765,6 +770,8 @@ static void pci_setup_bridge_mmio_pref(struct pci_dev *bridge)
} else {
l = 0x0000fff0;
}
+ if (pci_lu_adopt(bridge))
+ return;
pci_write_config_dword(bridge, PCI_PREF_MEMORY_BASE, l);
/* Set the upper 32 bits of PREF base & limit */
@@ -787,7 +794,8 @@ static void __pci_setup_bridge(struct pci_bus *bus, unsigned long type)
if (type & IORESOURCE_PREFETCH)
pci_setup_bridge_mmio_pref(bridge);
- pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
+ if (!pci_lu_adopt(bridge))
+ pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, bus->bridge_ctl);
}
void __weak pcibios_setup_bridge(struct pci_bus *bus, unsigned long type)
--
2.50.1.487.gc89ff58d15-goog
On Mon, Jul 28 2025 at 01:24, Chris Li wrote: > The liveupdate devices are already initialized by the kernel before the > kexec. During the kexec the device is still running. Avoid write to the > liveupdate devices during the new kernel boot up. This change log is way too meager for this kind of change. 1) You want to explain in detail how this works. "initialized by the kernel before the kexec" is as vague as it gets. 2) Avoid write .... Again this lacks any information how this is supposed to work correctly. > drivers/pci/ats.c | 7 ++-- > drivers/pci/iov.c | 58 ++++++++++++++++++------------ > drivers/pci/msi/msi.c | 32 ++++++++++++----- > drivers/pci/msi/pcidev_msi.c | 4 +-- > drivers/pci/pci-acpi.c | 3 ++ > drivers/pci/pci.c | 85 +++++++++++++++++++++++++++++--------------- > drivers/pci/pci.h | 9 ++++- > drivers/pci/pcie/aspm.c | 7 ++-- > drivers/pci/pcie/pme.c | 11 ++++-- > drivers/pci/probe.c | 43 +++++++++++++++------- > drivers/pci/setup-bus.c | 10 +++++- Then you sprinkle this stuff into files, which have completely different purposes, without any explanation for the particular instances why they are supposed to be correct and how this works. I'm just looking at the MSI parts, as I have no expertise with the rest. > diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c > index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644 > --- a/drivers/pci/msi/msi.c > +++ b/drivers/pci/msi/msi.c > @@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev) > > void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) > { > - raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock; > + struct pci_dev *pci_dev = to_pci_dev(desc->dev); > + raw_spinlock_t *lock = &pci_dev->msi_lock; > unsigned long flags; > > if (!desc->pci.msi_attrib.can_mask) > @@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) > raw_spin_lock_irqsave(lock, flags); > desc->pci.msi_mask &= ~clear; > desc->pci.msi_mask |= set; > - pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, > - desc->pci.msi_mask); > + if (!pci_lu_adopt(pci_dev)) > + pci_write_config_dword(pci_dev, desc->pci.mask_pos, > + desc->pci.msi_mask); This results in inconsistent state, which is a bad idea to begin with. How is cached software state and hardware state going to be brought in sync at some point? If you analyzed all places, which actually depend on hardware state and make decisions based on it, for correctness, then you failed to provide that analysis. If not, no comment. > raw_spin_unlock_irqrestore(lock, flags); > } > > @@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc, > int pos = dev->msi_cap; > u16 msgctl; > > + if (pci_lu_adopt(dev)) > + return; > + > pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); > msgctl &= ~PCI_MSI_FLAGS_QSIZE; > msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple); > @@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg > > if (desc->pci.msi_attrib.is_virtual) > return; > + if (pci_lu_adopt(to_pci_dev(desc->dev))) > + return; So you don't allow the new kernel to write the MSI message, but the interrupt subsystem has this new message and there are places which utilize that cached message. How is this supposed to work? > /* > * The specification mandates that the entry is masked > * when the message is modified: > @@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable) > control &= ~PCI_MSI_FLAGS_ENABLE; > if (enable) > control |= PCI_MSI_FLAGS_ENABLE; > - pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control); > + if (!pci_lu_adopt(dev)) > + pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control); The placement of these conditionals is arbitrary. Some are the begin of a function, others just block the write. Is that based on some logic or were the places selected by shabby AI queries? > static int msi_setup_msi_desc(struct pci_dev *dev, int nvec, > @@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set) > { > u16 ctrl; > > + BUG_ON(pci_lu_adopt(dev)); Not going to happen. BUG() is only appropriate when there is absolutely no way to handle a situation. This is as undocumented as everything else here. > pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl); > ctrl &= ~clear; > ctrl |= set; > @@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, > * registers can be accessed. Mask all the vectors to prevent > * interrupts coming in before they're fully set up. > */ > - pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | > - PCI_MSIX_FLAGS_ENABLE); > + if (!pci_lu_adopt(dev)) > + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | > + PCI_MSIX_FLAGS_ENABLE); And for enhanced annoyance you sprinkle this condition everywhere into the code and then BUG() when you missed an instance. Because putting it into the function which is invoked a gazillion of times would be too obvious, right? That would at least be tasteful, but that's not the primary problem of all this. Sprinkling these conditionals all over the place is absolutely unmaintainable, error prone and burdens everyone with this insanity and the related hard to chase bugs. Especially as there is no concept behind this and zero documentation how any of this should work or even be remotely correct. Before you start the next hackery, please sit down and write up coherent explanations: What is the general concept of this? What is the exact state in which a device is left when the old kernel jumps into the new kernel? What is the state of the MSI[-X] or legacy PCI interrupts at this point? Can the device raise interrupts during the transition from the old to the new kernel? How is the "live" state of the device reflected and restored throughout the interrupt subsystem? How is the device driver supposed to attach to the same interrupt state as before? How are the potentially different Linux interrupt numbers mapped to the previous state? Before this materializes and is agreed on, this is not going anywhere. Thanks, tglx
Hi Thomas, On Mon, Jul 28, 2025 at 10:23 AM Thomas Gleixner <tglx@linutronix.de> wrote: > > On Mon, Jul 28 2025 at 01:24, Chris Li wrote: > > The liveupdate devices are already initialized by the kernel before the > > kexec. During the kexec the device is still running. Avoid write to the > > liveupdate devices during the new kernel boot up. > > This change log is way too meager for this kind of change. I agree with you. I mention it in the cover letter, I do expect this part of change to be controversial. This RFC series is just to kick off the discussion for PCI device liveupdate. > 1) You want to explain in detail how this works. > "initialized by the kernel before the kexec" is as vague as it gets. Agree. Sorry I haven't included more documents in this series. Working on it. > > 2) Avoid write .... > > Again this lacks any information how this is supposed to work correctly. I guess I haven't presented the big picture of how liveupdate works with a PCI device. Let's start with the background why we want to do this. We want to upgrade a host kernel, which has a VM running with a GPU device attached to the VM via vfio_pci. We want the host kernel upgrade in a way that the VM can continue without shutting down and restarting the VM. The VM will pause during the host kernel kexec. The GPU device will continue running and DMA without pausing. VM will not be able to run the interrupt until the new kernel is finished booting and resume the VM. Pasha's LUO series already have designs on the liveupdate state, with callback associated with the state. https://lore.kernel.org/lkml/20250515182322.117840-1-pasha.tatashin@soleen.com/ I copy paste some of Pasha's LUO state here: ==========quote========== LUO State Machine and Events: NORMAL: Default operational state. PREPARED: Initial preparation complete after LIVEUPDATE_PREPARE event. Subsystems have saved initial state. FROZEN: Final "blackout window" state after LIVEUPDATE_FREEZE event, just before kexec. Workloads must be suspended. UPDATED: Next kernel has booted via live update. Awaiting restoration and LIVEUPDATE_FINISH. Events: LIVEUPDATE_PREPARE: Prepare for reboot, serialize state. LIVEUPDATE_FREEZE: Final opportunity to save state before kexec. LIVEUPDATE_FINISH: Post-reboot cleanup in the next kernel. LIVEUPDATE_CANCEL: Abort prepare or freeze, revert changes. ==========quote ends =========== The PCI core register will register as a subsystem to LUO and participate in the LUO callbacks. 1) In NORMAL state: The PCI device will register to the PCI subsystem by setting the pci_dev->dev.lu.requested flag. 2) PREPARE callback. The PCI subsystem will build the list of the PCI devices using the PCI device dependency. VF depends on PF, PCI devices depend on the parent bridge. The PCI subsystem will save the struct pci_dev part of the pci device state. Then forward the prepare callback to the PCI devices to serialize the PCI devices driver state. The VM is still running but with some limitations. e.g. can't create new DMA mapping. can't attach to an additional new vfio_pci device. 3) FREEZE callback: VM is paused. Last change for PCI device to serialize the device state. 4) kexec booting up the new kernel. 5) PCI device enumeration and probing. Find the PCI device in the serialized preserved device list, restore the device serialized data pointer for PCI device. PF device probe(), restores the number of VF and creates the VF, the VF device probe() 6) VM re-attach to the requested PCI device via vfio_pci. 7) FINISH callback. PCI subsystem and PCI devices free their preserved serialized data. System go back to NORMAL state. 8) VM resume running. > > > drivers/pci/ats.c | 7 ++-- > > drivers/pci/iov.c | 58 ++++++++++++++++++------------ > > drivers/pci/msi/msi.c | 32 ++++++++++++----- > > drivers/pci/msi/pcidev_msi.c | 4 +-- > > drivers/pci/pci-acpi.c | 3 ++ > > drivers/pci/pci.c | 85 +++++++++++++++++++++++++++++--------------- > > drivers/pci/pci.h | 9 ++++- > > drivers/pci/pcie/aspm.c | 7 ++-- > > drivers/pci/pcie/pme.c | 11 ++++-- > > drivers/pci/probe.c | 43 +++++++++++++++------- > > drivers/pci/setup-bus.c | 10 +++++- > > Then you sprinkle this stuff into files, which have completely different > purposes, without any explanation for the particular instances why they > are supposed to be correct and how this works. They follow a pattern that the original kernel needs to write to the device and change the device state. The liveupdate device needs to maintain the previous state not changed, therefore needs to prevent such write initialization in liveupdate case. I can certainly split it into more patches and group them by functions in the later series. This patch does it in a whole sale just to demonstrate what needs to happen to make a device live update. > > I'm just looking at the MSI parts, as I have no expertise with the rest. Thank you for your feedback, that is very helpful. > > > diff --git a/drivers/pci/msi/msi.c b/drivers/pci/msi/msi.c > > index 6ede55a7c5e652c80b51b10e58f0290eb6556430..7c40fde1ba0f89ad1d72064ac9e80696faeab426 100644 > > --- a/drivers/pci/msi/msi.c > > +++ b/drivers/pci/msi/msi.c > > @@ -113,7 +113,8 @@ static int pci_setup_msi_context(struct pci_dev *dev) > > > > void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) > > { > > - raw_spinlock_t *lock = &to_pci_dev(desc->dev)->msi_lock; > > + struct pci_dev *pci_dev = to_pci_dev(desc->dev); > > + raw_spinlock_t *lock = &pci_dev->msi_lock; > > unsigned long flags; > > > > if (!desc->pci.msi_attrib.can_mask) > > @@ -122,8 +123,9 @@ void pci_msi_update_mask(struct msi_desc *desc, u32 clear, u32 set) > > raw_spin_lock_irqsave(lock, flags); > > desc->pci.msi_mask &= ~clear; > > desc->pci.msi_mask |= set; > > - pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->pci.mask_pos, > > - desc->pci.msi_mask); > > + if (!pci_lu_adopt(pci_dev)) > > + pci_write_config_dword(pci_dev, desc->pci.mask_pos, > > + desc->pci.msi_mask); > > This results in inconsistent state, which is a bad idea to begin > with. How is cached software state and hardware state going to be > brought in sync at some point? Yes, to make the interrupt fully working we need to tell the new kernel about the previous kernel's interrupt descriptor in IOMMU etc. As it is, the liveupdate device interrupt is not fully working yet. David is working on the interrupt and later there will be an interrupt series to make interrupt working with liveupdate devices. This is just the first baby step. > > If you analyzed all places, which actually depend on hardware state and > make decisions based on it, for correctness, then you failed to provide > that analysis. If not, no comment. Let me clarify. This avoid writing to devices only applies to liveupdate devices. Only between FREEZE and FINISH. After the LUO finish(), LUO is back to normal state again. The device can be writable again as normal, most likely by the VM. We don't want the device state to change between FREEZE and FINISH. > > > raw_spin_unlock_irqrestore(lock, flags); > > } > > > > @@ -190,6 +192,9 @@ static inline void pci_write_msg_msi(struct pci_dev *dev, struct msi_desc *desc, > > int pos = dev->msi_cap; > > u16 msgctl; > > > > + if (pci_lu_adopt(dev)) > > + return; > > + > > pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); > > msgctl &= ~PCI_MSI_FLAGS_QSIZE; > > msgctl |= FIELD_PREP(PCI_MSI_FLAGS_QSIZE, desc->pci.msi_attrib.multiple); > > @@ -214,6 +219,8 @@ static inline void pci_write_msg_msix(struct msi_desc *desc, struct msi_msg *msg > > > > if (desc->pci.msi_attrib.is_virtual) > > return; > > + if (pci_lu_adopt(to_pci_dev(desc->dev))) > > + return; > > So you don't allow the new kernel to write the MSI message, but the > interrupt subsystem has this new message and there are places which > utilize that cached message. How is this supposed to work? We don't allow the PCI subsystem or driver to write the MSI message before FINISH. There are two possible ways. 1) Have someone save the incoming MSI message somehow, and re-deliver them after the FINISH call. 2) Don't save the MSI message between FREEZE and FINISH. At finish, deliver one spurious interrupt to the device driver, so the device driver can have a chance to check if there is any pending work that needs to be done. It is possible that no MSI has been dropped, the driver finds out there is nothing that needs to be done. We expect the driver can tolerate such one time spurious interruptions. Because spurious interruptions can happen for other reasons, that should be fine? Let me know if there is a case where this kind of spurious interrupt can cause a problem, we are very interested to know. > > > /* > > * The specification mandates that the entry is masked > > * when the message is modified: > > @@ -279,7 +286,8 @@ static void pci_msi_set_enable(struct pci_dev *dev, int enable) > > control &= ~PCI_MSI_FLAGS_ENABLE; > > if (enable) > > control |= PCI_MSI_FLAGS_ENABLE; > > - pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control); > > + if (!pci_lu_adopt(dev)) > > + pci_write_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, control); > > The placement of these conditionals is arbitrary. Some are the begin of > a function, others just block the write. Is that based on some logic or > were the places selected by shabby AI queries? They all can be converted to the pattern as: if (!pci_luo_adopt(dev)) pci_write_config_xxx(). Sometimes I choose to return early if there is multiple write but not data stored in struct pci_dev. Mostly just try to reduce the number of if (!pci_luo_adopt(dev)). I am not satisfied with this change yet. The goal of this patch is to show what effect needs to happen, we can discuss better ways to do it. > > > static int msi_setup_msi_desc(struct pci_dev *dev, int nvec, > > @@ -553,6 +561,7 @@ static void pci_msix_clear_and_set_ctrl(struct pci_dev *dev, u16 clear, u16 set) > > { > > u16 ctrl; > > > > + BUG_ON(pci_lu_adopt(dev)); > > Not going to happen. BUG() is only appropriate when there is absolutely > no way to handle a situation. This is as undocumented as everything else > here. Agree. This is some developing/debug stuff left over. I haven't encountered msix_clear_and_set_ctrl() in my test. I will remove the bug in the next version. > > > pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &ctrl); > > ctrl &= ~clear; > > ctrl |= set; > > @@ -720,8 +729,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, > > * registers can be accessed. Mask all the vectors to prevent > > * interrupts coming in before they're fully set up. > > */ > > - pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | > > - PCI_MSIX_FLAGS_ENABLE); > > + if (!pci_lu_adopt(dev)) > > + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | > > + PCI_MSIX_FLAGS_ENABLE); > > And for enhanced annoyance you sprinkle this condition everywhere into > the code and then BUG() when you missed an instance. Because putting it > into the function which is invoked a gazillion of times would be too > obvious, right? That would at least be tasteful, but that's not the > primary problem of all this. > > Sprinkling these conditionals all over the place is absolutely > unmaintainable, error prone and burdens everyone with this insanity and > the related hard to chase bugs. If you prefer, I can move them all into the pci_config_write. We actually start with pci_config_write_xxx(). But that solution has its own problem as well. For starters, the function name does not reflect what the function actually does any more. Also for the complicated case, where liveupdate does need to write some config register but not the other. e.g. From the live update point of view, PF devices shouldn't write to SR-IOV related registers that change the VF devices number. But PF devices should be able to tolerate some other config space write, because the VM is not using the PF device. The PF device state can be changed without impacting the VM. It is going to be unmaintainable to make a complicated logic inside pci_config_write_xxx(), depending on which caller and what state, what is allowed and what is not. I can discuss and try different approaches to address this problem. I understand it is a hard problem. I don't have a perfect solution without cons. This is just the first baby step to demonstrate what is the resulting effect we want. Then we can shape the code to our liking. I am happy to explore other approaches as well. > > Especially as there is no concept behind this and zero documentation how > any of this should work or even be remotely correct. I hope the above description can help you understand better why we want to do it and the approach we take. I am happy to answer questions if you have any. Mind you that I don't have all the answers. It is part of the journey to find the best solution. > > Before you start the next hackery, please sit down and write up coherent > explanations: > > What is the general concept of this? See above. > > What is the exact state in which a device is left when the old kernel > jumps into the new kernel? The device allows DMA to the mapping region during PREPARE and raise interrupt. The interrupt handler will not be able to run during kexec black out period (between freeze and finish). Other than the state store in the device, there is also a PCI subsystem and device driver state serialized in the preserved folio for the next kernel to interrupt. > What is the state of the MSI[-X] or legacy PCI interrupts at this > point? The current approach is that, just drop the interrupt during black out period (between freeze and finish) then deliver a spurious interrupt to the device at finish(), that gives the device driver a chance to perform the interrupt handler action which can't happen in black out. > > Can the device raise interrupts during the transition from the old to > the new kernel? Yes, can raise interrupt but interrupt handle won't able to run during black out. After finish() it is business as usual. > > How is the "live" state of the device reflected and restored > throughout the interrupt subsystem? Those are very good questions. Current approach just drop them and use the spurious interrupt to catch up in the end. > > How is the device driver supposed to attach to the same interrupt > state as before? We can't if we did not save the interrupt state changed during black out. Current approach is just using a spurious interrupt to catch up in finish(). > > How are the potentially different Linux interrupt numbers mapped to > the previous state? The IRQ number will remain the state cross kexec. However the interrupt descriptor address might have changed in the new kernel. We need to save some of the interrupt descriptor and interrupt state into the preserved folio for the next kernel to rebuild. To be continued in the interrupt series. Not covered by this patch series yet. > > Before this materializes and is agreed on, this is not going anywhere. Those are very good questions. Hopefully I have answered some of it. Please let me know if you have more questions I can clarify. Again this is just an RFC to show what was the resulting effect we want to get from the PCI device livedupate. It is not complete nor perfect. I am happy to explore different approaches. Thanks for the questions. I still owe you a write up document for the PCI device liveupdate. I will work on that. Hope that helps explain some of the background and approach. It is not a substitution of the document. I am working on that and will include it in the next version. Chris Chris
On Tue, Jul 29, 2025 at 06:51:27PM -0700, Chris Li wrote: > They follow a pattern that the original kernel needs to write to the > device and change the device state. The liveupdate device needs to > maintain the previous state not changed, therefore needs to prevent > such write initialization in liveupdate case. No, I fundamentally reject this position and your testing methodology. The new kernel *should* be writing to config space and it *should* be doing things like clearing and gaining control over MSI. It is fully wrong to be blocking it like you are doing just to satify some incorrect qemu based test checking for no config access. Only some config accesse are bad. Each and every "bad" one needs to be clearly explained *why* it is bad and only then mitigated. Most mitigation are far harder than just if'ing around the config write. My ATS/PASID/etc example for instance. Jason
On Thu, Jul 31, 2025 at 8:02 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Tue, Jul 29, 2025 at 06:51:27PM -0700, Chris Li wrote: > > > They follow a pattern that the original kernel needs to write to the > > device and change the device state. The liveupdate device needs to > > maintain the previous state not changed, therefore needs to prevent > > such write initialization in liveupdate case. > > No, I fundamentally reject this position and your testing methodology. > > The new kernel *should* be writing to config space and it *should* be > doing things like clearing and gaining control over MSI. It is fully > wrong to be blocking it like you are doing just to satify some > incorrect qemu based test checking for no config access. First of all, let me clarify that the PCI PF and VF tests I mention in the cover letter are run on the real data center servers, not qemu. QEMU does not have the correct IOMMU simulation for my workstation anyway. I do use qemu in development to quickly check if I screwed up something badly. The real test is always on the real machine. Our internal test dashboard has reached a high two digit number now, all with real hardware. With that out of the way. Let me explain why we did it the way we did. I believe you and I eventually want the same thing, just different ways to get there. I am also working on a series that allows fine grain control of PCI preservation. It allows the driver to select exactly what needs to be preserved, rather than the current "preserved" vs "depended" control. With the fine grain control, it can basically do what you described, allow new kernel writes to config space they don't want to preserve. However this RFC series is already getting very long, that is why I did not include the fine grain control series in this RFC. Keep in mind that this is just RFC, I want to demonstrate the problem space, and what source code needs to be modified in order to preserve all config space. It is not the final version that gets merged. Your feedback is important to us. My philosophy is that the LUO PCI subsystem is for service of the PCI device driver. Ultimately it is the PCI device driver who decides what part of the config space they want to preserve or overwrite. The PCI layer is just there to facilitate that service. Regarding the testing. There are many different tests we can write and run. Preserving all config space is just one of them. We also have other tests that partially preserve the config space and write to some config as it needs to. That is why I need to have the fine grain control series. If you still think it is unjustifiable to have one test try to preserve all config space for liveupdate. Please elaborate your reasoning. I am very curious. With the fine grained control we let the driver decide what the driver wants to preserve vs not, will that remove your objection? > Only some config accesse are bad. Each and every "bad" one needs to be > clearly explained *why* it is bad and only then mitigated. That is exactly the reason why we have the conservative test that preserves every config space test as a starting point. It does not mean that is the ending point. We also have tests that only partially preserve the config space driver actually needs. When things break, we can quickly compare to find out not preserving which register will break which device. This incremental approach is very effective to deal with very complex devices. Another constraint is that the data center servers are dependent on the network device able to connect to the network appropriately. Take diorite NIC for example, if I try only preserving ATS/PASID did not finish the rest of liveupdate, the nic wasn't able to boot up and connect to the network all the way. Even if the test passes for the ATS part, the over test fails because the server is not back online. I can't include that test into the test dashboard, because it brings down the server. The only way to recover from that is rebooting the server, which takes a long time for a big server. I can only keep that non-passing test as my own private developing test, not the regression test set. That is the reason we to have some conservative tests passing first, then expand to the more risky tests. We are actually quickly expanding our test metrics for doing more and more interesting(and risky) stuff. I hope that clarifies the eventual end goal and the development approach we take. > Most mitigation are far harder than just if'ing around the config > write. My ATS/PASID/etc example for instance. Exactly why we can't add those risky(non working) tests into the dashboard before the conservative passing one. Chris
On Fri, Aug 01, 2025 at 04:04:39PM -0700, Chris Li wrote: > My philosophy is that the LUO PCI subsystem is for service of the PCI > device driver. Ultimately it is the PCI device driver who decides what > part of the config space they want to preserve or overwrite. The PCI > layer is just there to facilitate that service. I don't think this makes any sense at all. There is nothing the device driver can contribute here. > If you still think it is unjustifiable to have one test try to > preserve all config space for liveupdate. I do think it is unjustifiable, it is architecurally wrong. You only should be preserving the absolute bare minimum of config space bits and everything else should be rewritten by the next kernel in the normal way. This MSI is a prime example of a nonsensical outcome if you take the position the config space should not be written to. > > Only some config accesse are bad. Each and every "bad" one needs to be > > clearly explained *why* it is bad and only then mitigated. > > That is exactly the reason why we have the conservative test that > preserves every config space test as a starting point. That is completely the opposite of what I said. Preserving everything is giving up on the harder job of identifying which bits cannot be changed, explaining why they can't be changed, and then mitigating only those things. > Another constraint is that the data center servers are dependent on > the network device able to connect to the network appropriately. Take > diorite NIC for example, if I try only preserving ATS/PASID did not > finish the rest of liveupdate, the nic wasn't able to boot up and > connect to the network all the way. Even if the test passes for the > ATS part, the over test fails because the server is not back online. I > can't include that test into the test dashboard, because it brings > down the server. The only way to recover from that is rebooting the > server, which takes a long time for a big server. I can only keep that > non-passing test as my own private developing test, not the regression > test set. I have no idea what this is trying to say and it sounds like you also can't explain exactly what is "wrong" and justify why things are being preserved. Again, your series should be starting simpler. Perserve the dumbest simplest PCI configuration. Certainly no switches, P2P, ATS or PASID. When that is working you can then add on more complex PCI features piece by piece. Jason
Hi Jason, Thanks for your feedback. On Sat, Aug 2, 2025 at 6:50 AM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > On Fri, Aug 01, 2025 at 04:04:39PM -0700, Chris Li wrote: > > My philosophy is that the LUO PCI subsystem is for service of the PCI > > device driver. Ultimately it is the PCI device driver who decides what > > part of the config space they want to preserve or overwrite. The PCI > > layer is just there to facilitate that service. > > I don't think this makes any sense at all. There is nothing the device > driver can contribute here. I am considering that the device driver owner will know a lot more device internal knowledge, e.g. why it needs to reserve this and that register where the PCI layer might not know much about the internal device behavior. > > If you still think it is unjustifiable to have one test try to > > preserve all config space for liveupdate. > > I do think it is unjustifiable, it is architecurally wrong. You only > should be preserving the absolute bare minimum of config space bits > and everything else should be rewritten by the next kernel in the > normal way. This MSI is a prime example of a nonsensical outcome if > you take the position the config space should not be written to. OK. Let me rework the V2 with your approach. > > > > Only some config accesse are bad. Each and every "bad" one needs to be > > > clearly explained *why* it is bad and only then mitigated. > > > > That is exactly the reason why we have the conservative test that > > preserves every config space test as a starting point. > > That is completely the opposite of what I said. Preserving everything > is giving up on the harder job of identifying which bits cannot be > changed, explaining why they can't be changed, and then mitigating > only those things. We can still preserve every thing then work backwards to preserve less. As I said, I will rework V2 with your approach preserving bare minimum as the starting place. > > Another constraint is that the data center servers are dependent on > > the network device able to connect to the network appropriately. Take > > diorite NIC for example, if I try only preserving ATS/PASID did not > > finish the rest of liveupdate, the nic wasn't able to boot up and > > connect to the network all the way. Even if the test passes for the > > ATS part, the over test fails because the server is not back online. I > > can't include that test into the test dashboard, because it brings > > down the server. The only way to recover from that is rebooting the > > server, which takes a long time for a big server. I can only keep that > > non-passing test as my own private developing test, not the regression > > test set. > > I have no idea what this is trying to say and it sounds like you also > can't explain exactly what is "wrong" and justify why things are being > preserved. I know what register is causing the trouble but I think we are under a different philosophy of addressing the problem from different ends. Another consideration is the device testing matrixs. The kexec with device liveupdate is a rare event. With that many device state re-initializing might trigger some very rare bug in the device or firmware. So it might be due to the device internal implementation, even though PCI spec might say otherwise or undefined. Anyway, let me do it your way in V2 then. > Again, your series should be starting simpler. Perserve the dumbest > simplest PCI configuration. Certainly no switches, P2P, ATS or > PASID. When that is working you can then add on more complex PCI > features piece by piece. With the V1 the patch series deliverable is having an Intel diorite NVMe device preserve every config space access and pass to the vfio and iommu people to build the vfio and iommu on top of it. Let's forget about V1. With V2 I want to start with the minimal end. No switches,P2P, ATS or PASID. I need some help to define what is deliverable in such a minimal preserve. e.g. Do I be able to read back the config value not changed then call it a day. Or do I expect to see the device fully initialized, it is able to be used by the user space. Will the device need to perform any DMA? Interrupt? I will probably find a device as simple as possible and it is attached to the root PCI host bridge, not the PCI-PCI bridge. Maybe no interrupt as the first step. One possibility is using the Intel DSA device that does the DMA streaming. If you have any other feedback on the candidate device and deliverable test for V2, I am looking forward to it. Thanks. Chris
On Mon, Jul 28, 2025 at 07:23:03PM +0200, Thomas Gleixner wrote: > On Mon, Jul 28 2025 at 01:24, Chris Li wrote: > > The liveupdate devices are already initialized by the kernel before the > > kexec. During the kexec the device is still running. Avoid write to the > > liveupdate devices during the new kernel boot up. > > This change log is way too meager for this kind of change. > > 1) You want to explain in detail how this works. > > "initialized by the kernel before the kexec" is as vague as it gets. > > 2) Avoid write .... > > Again this lacks any information how this is supposed to work correctly. > > > drivers/pci/ats.c | 7 ++-- > > drivers/pci/iov.c | 58 ++++++++++++++++++------------ > > drivers/pci/msi/msi.c | 32 ++++++++++++----- > > drivers/pci/msi/pcidev_msi.c | 4 +-- > > drivers/pci/pci-acpi.c | 3 ++ > > drivers/pci/pci.c | 85 +++++++++++++++++++++++++++++--------------- > > drivers/pci/pci.h | 9 ++++- > > drivers/pci/pcie/aspm.c | 7 ++-- > > drivers/pci/pcie/pme.c | 11 ++++-- > > drivers/pci/probe.c | 43 +++++++++++++++------- > > drivers/pci/setup-bus.c | 10 +++++- > > Then you sprinkle this stuff into files, which have completely different > purposes, without any explanation for the particular instances why they > are supposed to be correct and how this works. Yeah, everyting needs to be very carefully explained. For instance I'm not sure we should be doing *anything* to the MSI. Why did you think so? MSI should be fully cleared by the new kernel and the new VFIO should re-establish all the MSI routing from scratch as part of adopting the device. We already accept that any interrupts are lost during the kexec process so what reason is there to do anything except start up the new kernel with a fully disabled MSI and cleared MSI? If otherwise it should be explained why we can't work this way - and then explain how the new kernel will adopt the inherited operating MSI (hint: I doubt it can) without disrupting it. Same remark for everything. Explain in the commits and perhaps a well placed comment why anything needs to be done and why exactly we can't use the cold boot flow for each item. eg "we can't use the cold boot flow for BAR sizing because BAR sizing requires changing the BAR register and that will break ongoing P2P DMAs" "we can't use the cold boot flow for bridge windows because changing the bridge windows in any way will break ongoing P2P DMAs" (though you also need to explain why the cold boot flow would change the bridge windows) etc etc. There is also some complication here as the iommu driver technically owns some of the PCI state, and we really don't want the PCI Core to change it, but we do need theiommu driver to affirm what the in-use state should be because it is responsible to clean it up. This may actually require some restructing of the iommu driver/pci core interfaces to switch from an enable/disbale language to a 'target state' language. Ie "ATS shall be on and ATS page size shall be X". This series is very big, so I would probably try to break it up into smaller chunks. Like you don't need to preserve bridge windows and BARs if you don't support P2P. You don't need to worry about ATS and PASID if you don't support those, etc, etc. Yes, in the end all needs to be supported, but going bit by bit will be easier for people to understand. Basic VFIO support with a basic IOMMU using basic PCI with no P2P is the simplest thing you can do, and I think it needs surprisingly little preservation. Jason
On Mon, Jul 28, 2025 at 4:50 PM Jason Gunthorpe <jgg@ziepe.ca> wrote: > > Then you sprinkle this stuff into files, which have completely different > > purposes, without any explanation for the particular instances why they > > are supposed to be correct and how this works. > > Yeah, everyting needs to be very carefully explained. Agree. I did some explanation in my last email reply to Thomas. Will add a document for the next version. > > For instance I'm not sure we should be doing *anything* to the > MSI. Why did you think so? > > MSI should be fully cleared by the new kernel and the new VFIO should > re-establish all the MSI routing from scratch as part of adopting the > device. We already accept that any interrupts are lost during the > kexec process so what reason is there to do anything except start up the > new kernel with a fully disabled MSI and cleared MSI? The current approach is that we fake/inject a spurious interrupt to the device to allow the device driver to have a chance to process any pending action for the interrupt. There is also a possibility there is nothing the device driver needs to do due to no interrupt having ever triggered in the kexec window. We expect the driver can tolerate that spurious interrupt. The alternative is to try to (partially) process the interrupt during kexec. e.g. remember which IRQ has the interrupt triggered. It will make things much more complicated. Invoke interrupt handler in the early boot stage before IOMMU is very tricky. > > If otherwise it should be explained why we can't work this way - and > then explain how the new kernel will adopt the inherited operating MSI > (hint: I doubt it can) without disrupting it. Agree. > > Same remark for everything. Explain in the commits and perhaps a well > placed comment why anything needs to be done and why exactly we can't > use the cold boot flow for each item. We certainly can do that. I am trying to see if we can agree on the VFIO_PCI device used by the VM. We don't want any config space register to change during the liveupdate kexec (before finish). We can certainly change what config space register might or might not break stuff. But it is going to be very hard to test and verify what can break if we change this. If we can draw a line and say, there is no config space to write to the device between freeze and finish. It is much easier to reason from the device point of view, the device should continue working. The device has no way of knowing the host kernel has been changed. The device has only a limited view of their config space, the DMA area it can read/write to. If we preserve enough stuff, the device should continue working. For most of the devices, we can reason with the model that keeping the status quo will not break things. There is an obvious exception to that, e.g. if the device has a watchdog timer it needs to kick at regular intervals, if that interval is shorter than the kexec cycle. It should be pretty rare and we can deal with those when we actually encounter one. > > eg "we can't use the cold boot flow for BAR sizing because BAR sizing > requires changing the BAR register and that will break ongoing P2P > DMAs" > > "we can't use the cold boot flow for bridge windows because changing > the bridge windows in any way will break ongoing P2P DMAs" (though you > also need to explain why the cold boot flow would change the bridge > windows) > > etc etc. There will be some config space register hard to make sure changing it will break things or not. e.g. The base BAR register, if we change to a new memory region, and all follow up write to the device using a BAR new address, should things continue working? Will have a lot of corner case like this, it is much easier to just avoid changing anything to make things consistent. > > There is also some complication here as the iommu driver technically > owns some of the PCI state, and we really don't want the PCI Core to > change it, but we do need theiommu driver to affirm what the in-use > state should be because it is responsible to clean it up. Yes, there is overlap between PCI and IOMMU, more than just config space write. The IOMMU needs to know which PCI device participates, which set of groups it needs to save. CC Samiullah here, he knows more about the IOMMU side of the liveupdate than I do. > This may actually require some restructing of the iommu driver/pci > core interfaces to switch from an enable/disbale language to a 'target > state' language. Ie "ATS shall be on and ATS page size shall be X". > Ack. I have some ideas to make the PCI initialization cleaner for this usage as well. Instead of directly initiating and turning on features if found. We can do in 3 stages: 1) enumerate PCI capability and get the list of capability available but don't turn them on yet. 2) determine what capability needs to be turned on/off. For the normal initiation without liveupdate, the current behavior mostly turns on whatever can be turned on. For liveupdate devices, it would be inherent the on/off from what the previous kernel hands off to the new kernel. By either 1) reading the device state (assume reading state is possible and does not change device state) or 2) previous kernel save state into preserved folio and new kernel reads the state from preserved folio. 3) Perform the action to turn on/off the according the result from 2). For live update devices the most common case is skip write, that will be noop. For normal initialization without liveupdate, it will turn on the capability. > This series is very big, so I would probably try to break it up into > smaller chunks. Like you don't need to preserve bridge windows and > BARs if you don't support P2P. You don't need to worry about ATS and > PASID if you don't support those, etc, etc. Yes, I can break it to smaller chunks. One of the deliverables of this patch series is that I can test the liveupdate with the pci-lu-stub and pci-lub-stub-pf driver. Having additional patch to verify no PCI config space write has performed on the requested PCI device during shutdown and kexec boot up. > Yes, in the end all needs to be supported, but going bit by bit will > be easier for people to understand. Basic VFIO support with a basic > IOMMU using basic PCI with no P2P is the simplest thing you can do, > and I think it needs surprisingly little preservation. Yes, that is certainly possible ;-) Because I am working on the PCI side of the liveupdate, there are other developers working on VFIO and IOMMU depending on my PCI changes. From the project development point of view the PCI change needs to happen first, to unblock others. That is how I get here. I can certainly break it down to smaller chunks. Chris
© 2016 - 2025 Red Hat, Inc.