[PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device

Li Zhijian posted 1 patch 1 month, 1 week ago
tools/testing/cxl/Kbuild      | 2 ++
tools/testing/cxl/mock_cdat.c | 8 ++++++++
2 files changed, 10 insertions(+)
create mode 100644 tools/testing/cxl/mock_cdat.c
[PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Li Zhijian 1 month, 1 week ago
The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
PCI/PCIe CXL device only while cxl_test was implemeneted by platform
device.

Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
for cxl_core so that the cxl_test goes well.

Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
will cause a kernel panic with calltrace:
 platform cxl_host_bridge.3: host supports CXL (restricted)
 Oops: general protection fault, probably for non-canonical address 0x3ef17856fcae4fbd: 0000 [#1] PREEMPT SMP PTI
 CPU: 1 UID: 0 PID: 9167 Comm: cxl Kdump: loaded Tainted: G           OE      6.12.0-rc3-master+ #66
 Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
 Hardware name: LENOVO 90CXCTO1WW/, BIOS FCKT70AUS 04/23/2015
 RIP: 0010:pci_bus_read_config_word+0x1c/0x60
 Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 53 b8 87 00 00 00 48 83 ec 08 c7 44 24 04 00 00 00 00 f6 c2 01 75 29 <48> 8b 87 c0 00 00 00 48 89 cb 4c 8d 44 24 04 b9 02 00 00 00 48 8b
 RSP: 0018:ffffa115034dfbb8 EFLAGS: 00010246
 RAX: 0000000000000087 RBX: 0000000000000012 RCX: ffffa115034dfbfe
 RDX: 0000000000000016 RSI: 000000006f4e2f4e RDI: 3ef17856fcae4efd
 RBP: ffff8cc229121b48 R08: 0000000000000010 R09: 0000000000000000
 R10: 0000000000000001 R11: ffff8cc225434360 R12: ffffa115034dfbfe
 R13: 0000000000000000 R14: ffff8cc2f119a080 R15: ffffa115034dfc50
 FS:  00007f31d93537c0(0000) GS:ffff8cc510a80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f31d95f3370 CR3: 00000001163ea001 CR4: 00000000001726f0
 Call Trace:
  <TASK>
  ? __die_body.cold+0x19/0x27
  ? die_addr+0x38/0x60
  ? exc_general_protection+0x1f5/0x4b0
  ? asm_exc_general_protection+0x22/0x30
  ? pci_bus_read_config_word+0x1c/0x60
  pcie_capability_read_word+0x93/0xb0
  pcie_link_speed_mbps+0x18/0x50
  cxl_pci_get_bandwidth+0x18/0x60 [cxl_core]
  cxl_endpoint_gather_bandwidth.constprop.0+0xf4/0x230 [cxl_core]
  ? xas_store+0x54/0x660
  ? preempt_count_add+0x69/0xa0
  ? _raw_spin_lock+0x13/0x40
  ? __kmalloc_cache_noprof+0xe7/0x270
  cxl_region_shared_upstream_bandwidth_update+0x9c/0x790 [cxl_core]
  cxl_region_attach+0x520/0x7e0 [cxl_core]
  store_targetN+0xf2/0x120 [cxl_core]
  kernfs_fop_write_iter+0x13a/0x1f0
  vfs_write+0x23b/0x410
  ksys_write+0x53/0xd0
  do_syscall_64+0x62/0x180
  entry_SYSCALL_64_after_hwframe+0x76/0x7e

And Ying also reported a KASAN error with similar calltrace.

Reported-by: "Huang, Ying" <ying.huang@intel.com>
Closes: https://lore.kernel.org/linux-cxl/87y12w9vp5.fsf@yhuang6-desk2.ccr.corp.intel.com/
Fixes: a5ab0de0ebaa ("cxl: Calculate region bandwidth of targets with shared upstream link")
Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
 tools/testing/cxl/Kbuild      | 2 ++
 tools/testing/cxl/mock_cdat.c | 8 ++++++++
 2 files changed, 10 insertions(+)
 create mode 100644 tools/testing/cxl/mock_cdat.c

diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
index b1256fee3567..ed9f50dee3f5 100644
--- a/tools/testing/cxl/Kbuild
+++ b/tools/testing/cxl/Kbuild
@@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
 ldflags-y += --wrap=cxl_rcd_component_reg_phys
 ldflags-y += --wrap=cxl_endpoint_parse_cdat
 ldflags-y += --wrap=cxl_dport_init_ras_reporting
+ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
 
 DRIVERS := ../../../drivers
 CXL_SRC := $(DRIVERS)/cxl
@@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
 cxl_core-y += $(CXL_CORE_SRC)/hdm.o
 cxl_core-y += $(CXL_CORE_SRC)/pmu.o
 cxl_core-y += $(CXL_CORE_SRC)/cdat.o
+cxl_core-y += mock_cdat.o
 cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
 cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
 cxl_core-y += config_check.o
diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
new file mode 100644
index 000000000000..99974153b3f6
--- /dev/null
+++ b/tools/testing/cxl/mock_cdat.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
+
+#include <cxl.h>
+
+void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
+{
+}
-- 
2.44.0
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Alison Schofield 1 month, 1 week ago
On Wed, Oct 16, 2024 at 09:52:13AM +0800, Li Zhijian wrote:
> The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
> PCI/PCIe CXL device only while cxl_test was implemeneted by platform
> device.
> 
> Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
> for cxl_core so that the cxl_test goes well.
> 
> Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
> will cause a kernel panic with calltrace:

snip

> ---
>  tools/testing/cxl/Kbuild      | 2 ++
>  tools/testing/cxl/mock_cdat.c | 8 ++++++++
>  2 files changed, 10 insertions(+)
>  create mode 100644 tools/testing/cxl/mock_cdat.c
> 
> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> index b1256fee3567..ed9f50dee3f5 100644
> --- a/tools/testing/cxl/Kbuild
> +++ b/tools/testing/cxl/Kbuild
> @@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
>  ldflags-y += --wrap=cxl_rcd_component_reg_phys
>  ldflags-y += --wrap=cxl_endpoint_parse_cdat
>  ldflags-y += --wrap=cxl_dport_init_ras_reporting
> +ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
>  
>  DRIVERS := ../../../drivers
>  CXL_SRC := $(DRIVERS)/cxl
> @@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
>  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
>  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
>  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> +cxl_core-y += mock_cdat.o
>  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
>  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
>  cxl_core-y += config_check.o
> diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
> new file mode 100644
> index 000000000000..99974153b3f6
> --- /dev/null
> +++ b/tools/testing/cxl/mock_cdat.c
> @@ -0,0 +1,8 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
> +
> +#include <cxl.h>
> +
> +void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
> +{
> +}

The addition of file mock_cdat.c made me wonder why this wrapper couldn't join
all the other __wrap's defined in test/mock.c. I tried, as you probably did,
and see the circular dependency. I mention it here in case anyone else has a
way to simplify this.

Otherwise LGTM:
Reviewed-by: Alison Schofield <alison.schofield@intel.com>



> -- 
> 2.44.0
>
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Dan Williams 1 month, 1 week ago
Alison Schofield wrote:
> On Wed, Oct 16, 2024 at 09:52:13AM +0800, Li Zhijian wrote:
> > The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
> > PCI/PCIe CXL device only while cxl_test was implemeneted by platform
> > device.
> > 
> > Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
> > for cxl_core so that the cxl_test goes well.
> > 
> > Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
> > will cause a kernel panic with calltrace:
> 
> snip
> 
> > ---
> >  tools/testing/cxl/Kbuild      | 2 ++
> >  tools/testing/cxl/mock_cdat.c | 8 ++++++++
> >  2 files changed, 10 insertions(+)
> >  create mode 100644 tools/testing/cxl/mock_cdat.c
> > 
> > diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> > index b1256fee3567..ed9f50dee3f5 100644
> > --- a/tools/testing/cxl/Kbuild
> > +++ b/tools/testing/cxl/Kbuild
> > @@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
> >  ldflags-y += --wrap=cxl_rcd_component_reg_phys
> >  ldflags-y += --wrap=cxl_endpoint_parse_cdat
> >  ldflags-y += --wrap=cxl_dport_init_ras_reporting
> > +ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
> >  
> >  DRIVERS := ../../../drivers
> >  CXL_SRC := $(DRIVERS)/cxl
> > @@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
> >  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
> >  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
> >  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> > +cxl_core-y += mock_cdat.o
> >  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
> >  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
> >  cxl_core-y += config_check.o
> > diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
> > new file mode 100644
> > index 000000000000..99974153b3f6
> > --- /dev/null
> > +++ b/tools/testing/cxl/mock_cdat.c
> > @@ -0,0 +1,8 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
> > +
> > +#include <cxl.h>
> > +
> > +void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
> > +{
> > +}
> 
> The addition of file mock_cdat.c made me wonder why this wrapper couldn't join
> all the other __wrap's defined in test/mock.c. I tried, as you probably did,
> and see the circular dependency. I mention it here in case anyone else has a
> way to simplify this.

Yeah, unfortunately symbols can only be mocked across EXPORT_SYMBOL()
boundaries, but since the caller of this is internal to the CXL core it
is not amenable to the wrap approach.

So, unfortunately what this patch does is break the expectation that
cxl_test can live alongside and not regress any production flows. I.e.
what this patch does is replace 
cxl_region_shared_upstream_bandwidth_update() for all use cases, not
just platform devices.

Compare that to tools/testing/cxl/test/mock.c which arranges for all the
mocked use cases to call back into the real routines in the real device
case.

Given that I think this puts the device type check back in play.

However, instead of checking "dev_is_platform()" check "dev_is_pci()" to
be consistent with all the other CXL core internal functions that exit
early when passed cxl_test devices.
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Zhijian Li (Fujitsu) 1 month ago
Hey Dan,

Thanks for your review...

I want to confirm with you


On 18/10/2024 09:20, Dan Williams wrote:
> Alison Schofield wrote:
>> On Wed, Oct 16, 2024 at 09:52:13AM +0800, Li Zhijian wrote:
>>> The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
>>> PCI/PCIe CXL device only while cxl_test was implemeneted by platform
>>> device.
>>>
>>> Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
>>> for cxl_core so that the cxl_test goes well.
>>>
>>> Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
>>> will cause a kernel panic with calltrace:
>>
>> snip
>>
>>> ---
>>>   tools/testing/cxl/Kbuild      | 2 ++
>>>   tools/testing/cxl/mock_cdat.c | 8 ++++++++
>>>   2 files changed, 10 insertions(+)
>>>   create mode 100644 tools/testing/cxl/mock_cdat.c
>>>
>>> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
>>> index b1256fee3567..ed9f50dee3f5 100644
>>> --- a/tools/testing/cxl/Kbuild
>>> +++ b/tools/testing/cxl/Kbuild
>>> @@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
>>>   ldflags-y += --wrap=cxl_rcd_component_reg_phys
>>>   ldflags-y += --wrap=cxl_endpoint_parse_cdat
>>>   ldflags-y += --wrap=cxl_dport_init_ras_reporting
>>> +ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
>>>   
>>>   DRIVERS := ../../../drivers
>>>   CXL_SRC := $(DRIVERS)/cxl
>>> @@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
>>>   cxl_core-y += $(CXL_CORE_SRC)/hdm.o
>>>   cxl_core-y += $(CXL_CORE_SRC)/pmu.o
>>>   cxl_core-y += $(CXL_CORE_SRC)/cdat.o
>>> +cxl_core-y += mock_cdat.o
>>>   cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
>>>   cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
>>>   cxl_core-y += config_check.o
>>> diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
>>> new file mode 100644
>>> index 000000000000..99974153b3f6
>>> --- /dev/null
>>> +++ b/tools/testing/cxl/mock_cdat.c
>>> @@ -0,0 +1,8 @@
>>> +// SPDX-License-Identifier: GPL-2.0-only
>>> +/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
>>> +
>>> +#include <cxl.h>
>>> +
>>> +void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
>>> +{
>>> +}
>>
>> The addition of file mock_cdat.c made me wonder why this wrapper couldn't join
>> all the other __wrap's defined in test/mock.c. I tried, as you probably did,
>> and see the circular dependency. I mention it here in case anyone else has a
>> way to simplify this.
> 
> Yeah, unfortunately symbols can only be mocked across EXPORT_SYMBOL()
> boundaries, but since the caller of this is internal to the CXL core it
> is not amenable to the wrap approach.
> 
> So, unfortunately what this patch does is break the expectation that
> cxl_test can live alongside and not regress any production flows. I.e.
> what this patch does is replace
> cxl_region_shared_upstream_bandwidth_update() for all use cases, not
> just platform devices.

Yes, that's true.


> 
> Compare that to tools/testing/cxl/test/mock.c which arranges for all the
> mocked use cases to call back into the real routines in the real device
> case.
> 
> Given that I think this puts the device type check back in play.

Just to confirm, do you mean add device type check to drivers/cxl/core/cdat.c

--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -5,6 +5,7 @@
   #include <linux/fw_table.h>
   #include <linux/node.h>
   #include <linux/overflow.h>
+#include <linux/platform_device.h>
   #include "cxlpci.h"
   #include "cxlmem.h"
   #include "core.h"
@@ -641,9 +642,13 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
          void *ptr;
          int rc;

+       if (dev_is_pci(cxlds->dev))
+               return -ENODEV;
+
          if (cxlds->rcd)
                  return -ENODEV;
   

or

tools/testing/cxl/mock_cdat.c

#include <cxl.h>
                                                                                 
void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
{
         for (int i = 0; i < cxlr->params.nr_targets; i++) {
                 struct cxl_endpoint_decoder *cxled = cxlr->params.targets[i];
                 struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
                 struct cxl_dev_state *cxlds = cxlmd->cxlds;
                                                                                 
                 if (!dev_is_pci(cxlds->dev))
                         return;
         }
                                                                                 
         cxl_region_shared_upstream_bandwidth_update(cxlr);
}


Thanks
Zhijian

> 
> However, instead of checking "dev_is_platform()" check "dev_is_pci()" to
> be consistent with all the other CXL core internal functions that exit
> early when passed cxl_test devices.
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Dan Williams 1 month ago
Zhijian Li (Fujitsu) wrote:
> Hey Dan,
> 
> Thanks for your review...
> 
> I want to confirm with you
[..]
> 
> Just to confirm, do you mean add device type check to drivers/cxl/core/cdat.c
> 
> --- a/drivers/cxl/core/cdat.c
> +++ b/drivers/cxl/core/cdat.c
> @@ -5,6 +5,7 @@
>    #include <linux/fw_table.h>
>    #include <linux/node.h>
>    #include <linux/overflow.h>
> +#include <linux/platform_device.h>
>    #include "cxlpci.h"
>    #include "cxlmem.h"
>    #include "core.h"
> @@ -641,9 +642,13 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
>           void *ptr;
>           int rc;
> 
> +       if (dev_is_pci(cxlds->dev))
> +               return -ENODEV;
> +
>           if (cxlds->rcd)
>                   return -ENODEV;
>    

Yes, this one looks good to me. Place the determination as far out into
a leaf function as possible.
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Ira Weiny 1 month ago
Zhijian Li (Fujitsu) wrote:
> Hey Dan,
> 
> Thanks for your review...
> 
> I want to confirm with you
> 
> 
> On 18/10/2024 09:20, Dan Williams wrote:
> > Alison Schofield wrote:
> >> On Wed, Oct 16, 2024 at 09:52:13AM +0800, Li Zhijian wrote:
> >>> The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
> >>> PCI/PCIe CXL device only while cxl_test was implemeneted by platform
> >>> device.
> >>>
> >>> Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
> >>> for cxl_core so that the cxl_test goes well.
> >>>
> >>> Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
> >>> will cause a kernel panic with calltrace:
> >>
> >> snip
> >>
> >>> ---
> >>>   tools/testing/cxl/Kbuild      | 2 ++
> >>>   tools/testing/cxl/mock_cdat.c | 8 ++++++++
> >>>   2 files changed, 10 insertions(+)
> >>>   create mode 100644 tools/testing/cxl/mock_cdat.c
> >>>
> >>> diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> >>> index b1256fee3567..ed9f50dee3f5 100644
> >>> --- a/tools/testing/cxl/Kbuild
> >>> +++ b/tools/testing/cxl/Kbuild
> >>> @@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
> >>>   ldflags-y += --wrap=cxl_rcd_component_reg_phys
> >>>   ldflags-y += --wrap=cxl_endpoint_parse_cdat
> >>>   ldflags-y += --wrap=cxl_dport_init_ras_reporting
> >>> +ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
> >>>   
> >>>   DRIVERS := ../../../drivers
> >>>   CXL_SRC := $(DRIVERS)/cxl
> >>> @@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
> >>>   cxl_core-y += $(CXL_CORE_SRC)/hdm.o
> >>>   cxl_core-y += $(CXL_CORE_SRC)/pmu.o
> >>>   cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> >>> +cxl_core-y += mock_cdat.o
> >>>   cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
> >>>   cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
> >>>   cxl_core-y += config_check.o
> >>> diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
> >>> new file mode 100644
> >>> index 000000000000..99974153b3f6
> >>> --- /dev/null
> >>> +++ b/tools/testing/cxl/mock_cdat.c
> >>> @@ -0,0 +1,8 @@
> >>> +// SPDX-License-Identifier: GPL-2.0-only
> >>> +/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
> >>> +
> >>> +#include <cxl.h>
> >>> +
> >>> +void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
> >>> +{
> >>> +}
> >>
> >> The addition of file mock_cdat.c made me wonder why this wrapper couldn't join
> >> all the other __wrap's defined in test/mock.c. I tried, as you probably did,
> >> and see the circular dependency. I mention it here in case anyone else has a
> >> way to simplify this.
> > 
> > Yeah, unfortunately symbols can only be mocked across EXPORT_SYMBOL()
> > boundaries, but since the caller of this is internal to the CXL core it
> > is not amenable to the wrap approach.
> > 
> > So, unfortunately what this patch does is break the expectation that
> > cxl_test can live alongside and not regress any production flows. I.e.
> > what this patch does is replace
> > cxl_region_shared_upstream_bandwidth_update() for all use cases, not
> > just platform devices.
> 
> Yes, that's true.
> 
> 
> > 
> > Compare that to tools/testing/cxl/test/mock.c which arranges for all the
> > mocked use cases to call back into the real routines in the real device
> > case.
> > 
> > Given that I think this puts the device type check back in play.
> 
> Just to confirm, do you mean add device type check to drivers/cxl/core/cdat.c
> 
> --- a/drivers/cxl/core/cdat.c
> +++ b/drivers/cxl/core/cdat.c
> @@ -5,6 +5,7 @@
>    #include <linux/fw_table.h>
>    #include <linux/node.h>
>    #include <linux/overflow.h>
> +#include <linux/platform_device.h>
>    #include "cxlpci.h"
>    #include "cxlmem.h"
>    #include "core.h"
> @@ -641,9 +642,13 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
>           void *ptr;
>           int rc;
> 
> +       if (dev_is_pci(cxlds->dev))

Did you mean...

	if (!dev_is_pci(cxlds->dev))
???

> +               return -ENODEV;
> +
>           if (cxlds->rcd)
>                   return -ENODEV;
>    
> 
> or
> 
> tools/testing/cxl/mock_cdat.c
> 
> #include <cxl.h>
>                                                                                  
> void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
> {
>          for (int i = 0; i < cxlr->params.nr_targets; i++) {
>                  struct cxl_endpoint_decoder *cxled = cxlr->params.targets[i];
>                  struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>                  struct cxl_dev_state *cxlds = cxlmd->cxlds;
>                                                                                  
>                  if (!dev_is_pci(cxlds->dev))
>                          return;
>          }
>                                                                                  
>          cxl_region_shared_upstream_bandwidth_update(cxlr);
> }

Couldn't this be done on the endpoint call?  (Making it non-static?)

int __wrap_cxl_endpoint_gather_bandwidth(...
                                         struct cxl_endpoint_decoder *cxled,
					...)
{
        struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
        struct cxl_dev_state *cxlds = cxlmd->cxlds;

	if (!dev_is_pci(cxlds->dev))
		return -ENODEV;

	return cxl_endpoint_gather_bandwidth(...);
}

Ira

> 
> 
> Thanks
> Zhijian
> 
> > 
> > However, instead of checking "dev_is_platform()" check "dev_is_pci()" to
> > be consistent with all the other CXL core internal functions that exit
> > early when passed cxl_test devices.
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Dan Williams 1 month ago
Ira Weiny wrote:
[..]
> > --- a/drivers/cxl/core/cdat.c
> > +++ b/drivers/cxl/core/cdat.c
> > @@ -5,6 +5,7 @@
> >    #include <linux/fw_table.h>
> >    #include <linux/node.h>
> >    #include <linux/overflow.h>
> > +#include <linux/platform_device.h>
> >    #include "cxlpci.h"
> >    #include "cxlmem.h"
> >    #include "core.h"
> > @@ -641,9 +642,13 @@ static int cxl_endpoint_gather_bandwidth(struct cxl_region *cxlr,
> >           void *ptr;
> >           int rc;
> > 
> > +       if (dev_is_pci(cxlds->dev))
> 
> Did you mean...
> 
> 	if (!dev_is_pci(cxlds->dev))
> ???

Yes, skip non-PCI.

[..]
> Couldn't this be done on the endpoint call?  (Making it non-static?)
> 
> int __wrap_cxl_endpoint_gather_bandwidth(...
>                                          struct cxl_endpoint_decoder *cxled,
> 					...)
> {
>         struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
>         struct cxl_dev_state *cxlds = cxlmd->cxlds;
> 
> 	if (!dev_is_pci(cxlds->dev))
> 		return -ENODEV;
> 
> 	return cxl_endpoint_gather_bandwidth(...);
> }

No, cxl_endpoint_gather_bandwidth() is called internal to cxl_core,
there is no opportunity to optionally wrap it, only replace it.

In order for this path to be mocked the cxl_region driver would need to
be moved to a module and then
cxl_region_shared_upstream_bandwidth_update() could be unit tested.
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Ira Weiny 1 month ago
Dan Williams wrote:
> Alison Schofield wrote:
> > On Wed, Oct 16, 2024 at 09:52:13AM +0800, Li Zhijian wrote:
> > > The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
> > > PCI/PCIe CXL device only while cxl_test was implemeneted by platform
> > > device.
> > > 
> > > Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
> > > for cxl_core so that the cxl_test goes well.
> > > 
> > > Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
> > > will cause a kernel panic with calltrace:
> > 
> > snip
> > 
> > > ---
> > >  tools/testing/cxl/Kbuild      | 2 ++
> > >  tools/testing/cxl/mock_cdat.c | 8 ++++++++
> > >  2 files changed, 10 insertions(+)
> > >  create mode 100644 tools/testing/cxl/mock_cdat.c
> > > 
> > > diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild
> > > index b1256fee3567..ed9f50dee3f5 100644
> > > --- a/tools/testing/cxl/Kbuild
> > > +++ b/tools/testing/cxl/Kbuild
> > > @@ -15,6 +15,7 @@ ldflags-y += --wrap=devm_cxl_add_rch_dport
> > >  ldflags-y += --wrap=cxl_rcd_component_reg_phys
> > >  ldflags-y += --wrap=cxl_endpoint_parse_cdat
> > >  ldflags-y += --wrap=cxl_dport_init_ras_reporting
> > > +ldflags-y += --wrap=cxl_region_shared_upstream_bandwidth_update
> > >  
> > >  DRIVERS := ../../../drivers
> > >  CXL_SRC := $(DRIVERS)/cxl
> > > @@ -61,6 +62,7 @@ cxl_core-y += $(CXL_CORE_SRC)/pci.o
> > >  cxl_core-y += $(CXL_CORE_SRC)/hdm.o
> > >  cxl_core-y += $(CXL_CORE_SRC)/pmu.o
> > >  cxl_core-y += $(CXL_CORE_SRC)/cdat.o
> > > +cxl_core-y += mock_cdat.o
> > >  cxl_core-$(CONFIG_TRACING) += $(CXL_CORE_SRC)/trace.o
> > >  cxl_core-$(CONFIG_CXL_REGION) += $(CXL_CORE_SRC)/region.o
> > >  cxl_core-y += config_check.o
> > > diff --git a/tools/testing/cxl/mock_cdat.c b/tools/testing/cxl/mock_cdat.c
> > > new file mode 100644
> > > index 000000000000..99974153b3f6
> > > --- /dev/null
> > > +++ b/tools/testing/cxl/mock_cdat.c
> > > @@ -0,0 +1,8 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (c) 2024 FUJITSU LIMITED. All rights reserved. */
> > > +
> > > +#include <cxl.h>
> > > +
> > > +void __wrap_cxl_region_shared_upstream_bandwidth_update(struct cxl_region *cxlr)
> > > +{
> > > +}
> > 
> > The addition of file mock_cdat.c made me wonder why this wrapper couldn't join
> > all the other __wrap's defined in test/mock.c. I tried, as you probably did,
> > and see the circular dependency. I mention it here in case anyone else has a
> > way to simplify this.
> 
> Yeah, unfortunately symbols can only be mocked across EXPORT_SYMBOL()
> boundaries, but since the caller of this is internal to the CXL core it
> is not amenable to the wrap approach.
> 
> So, unfortunately what this patch does is break the expectation that
> cxl_test can live alongside and not regress any production flows. I.e.
> what this patch does is replace 
> cxl_region_shared_upstream_bandwidth_update() for all use cases, not
> just platform devices.
> 
> Compare that to tools/testing/cxl/test/mock.c which arranges for all the
> mocked use cases to call back into the real routines in the real device
> case.
> 
> Given that I think this puts the device type check back in play.
> 
> However, instead of checking "dev_is_platform()" check "dev_is_pci()" to
> be consistent with all the other CXL core internal functions that exit
> early when passed cxl_test devices.

Zhijian,

Looks like we will need a spin of this fix.  Do you have time to do that?

Ira
Re: [PATCH] testing/cxl: Fix abused pci_bus_read_config_word() on platform device
Posted by Huang, Ying 1 month, 1 week ago
Hi, Zhijian,

Li Zhijian <lizhijian@fujitsu.com> writes:

> The cxl_region_shared_upstream_bandwidth_update() in clx_core works on
> PCI/PCIe CXL device only while cxl_test was implemeneted by platform
> device.
>
> Mock a cxl_region_shared_upstream_bandwidth_update() which does nothing
> for cxl_core so that the cxl_test goes well.
>
> Abuse cxl_region_shared_upstream_bandwidth_update() on platform device
> will cause a kernel panic with calltrace:
>  platform cxl_host_bridge.3: host supports CXL (restricted)
>  Oops: general protection fault, probably for non-canonical address 0x3ef17856fcae4fbd: 0000 [#1] PREEMPT SMP PTI
>  CPU: 1 UID: 0 PID: 9167 Comm: cxl Kdump: loaded Tainted: G           OE      6.12.0-rc3-master+ #66
>  Tainted: [O]=OOT_MODULE, [E]=UNSIGNED_MODULE
>  Hardware name: LENOVO 90CXCTO1WW/, BIOS FCKT70AUS 04/23/2015
>  RIP: 0010:pci_bus_read_config_word+0x1c/0x60
>  Code: 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 53 b8 87 00 00 00 48 83 ec 08 c7 44 24 04 00 00 00 00 f6 c2 01 75 29 <48> 8b 87 c0 00 00 00 48 89 cb 4c 8d 44 24 04 b9 02 00 00 00 48 8b
>  RSP: 0018:ffffa115034dfbb8 EFLAGS: 00010246
>  RAX: 0000000000000087 RBX: 0000000000000012 RCX: ffffa115034dfbfe
>  RDX: 0000000000000016 RSI: 000000006f4e2f4e RDI: 3ef17856fcae4efd
>  RBP: ffff8cc229121b48 R08: 0000000000000010 R09: 0000000000000000
>  R10: 0000000000000001 R11: ffff8cc225434360 R12: ffffa115034dfbfe
>  R13: 0000000000000000 R14: ffff8cc2f119a080 R15: ffffa115034dfc50
>  FS:  00007f31d93537c0(0000) GS:ffff8cc510a80000(0000) knlGS:0000000000000000
>  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
>  CR2: 00007f31d95f3370 CR3: 00000001163ea001 CR4: 00000000001726f0
>  Call Trace:
>   <TASK>
>   ? __die_body.cold+0x19/0x27
>   ? die_addr+0x38/0x60
>   ? exc_general_protection+0x1f5/0x4b0
>   ? asm_exc_general_protection+0x22/0x30
>   ? pci_bus_read_config_word+0x1c/0x60
>   pcie_capability_read_word+0x93/0xb0
>   pcie_link_speed_mbps+0x18/0x50
>   cxl_pci_get_bandwidth+0x18/0x60 [cxl_core]
>   cxl_endpoint_gather_bandwidth.constprop.0+0xf4/0x230 [cxl_core]
>   ? xas_store+0x54/0x660
>   ? preempt_count_add+0x69/0xa0
>   ? _raw_spin_lock+0x13/0x40
>   ? __kmalloc_cache_noprof+0xe7/0x270
>   cxl_region_shared_upstream_bandwidth_update+0x9c/0x790 [cxl_core]
>   cxl_region_attach+0x520/0x7e0 [cxl_core]
>   store_targetN+0xf2/0x120 [cxl_core]
>   kernfs_fop_write_iter+0x13a/0x1f0
>   vfs_write+0x23b/0x410
>   ksys_write+0x53/0xd0
>   do_syscall_64+0x62/0x180
>   entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> And Ying also reported a KASAN error with similar calltrace.
>
> Reported-by: "Huang, Ying" <ying.huang@intel.com>
> Closes: https://lore.kernel.org/linux-cxl/87y12w9vp5.fsf@yhuang6-desk2.ccr.corp.intel.com/
> Fixes: a5ab0de0ebaa ("cxl: Calculate region bandwidth of targets with shared upstream link")
> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>

This fixes the KASAN error report in my test too.  Thanks!  Feel free to
add

Tested-by: "Huang, Ying" <ying.huang@intel.com>

--
Best Regards,
Huang, Ying