Makefile | 4 +- arch/arm64/include/asm/exception.h | 5 - arch/arm64/include/asm/kvm_host.h | 2 + arch/arm64/include/asm/kvm_pgtable.h | 26 +-- arch/arm64/kernel/fpsimd.c | 33 ++- arch/arm64/kvm/arch_timer.c | 6 +- arch/arm64/kvm/arm.c | 19 +- arch/arm64/kvm/hyp/pgtable.c | 47 +++- arch/arm64/kvm/mmu.c | 18 +- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- arch/arm64/kvm/vgic/vgic-v4.c | 7 +- arch/arm64/mm/mmu.c | 4 +- arch/arm64/net/bpf_jit_comp.c | 8 +- arch/arm64/tools/sysreg | 12 +- arch/ia64/kernel/sys_ia64.c | 2 +- arch/mips/include/asm/dec/prom.h | 2 +- arch/parisc/kernel/sys_parisc.c | 15 +- block/blk-mq.c | 10 +- drivers/accel/qaic/qaic_control.c | 39 ++-- drivers/acpi/button.c | 9 + drivers/acpi/resource.c | 60 ----- drivers/acpi/video_detect.c | 24 ++ drivers/acpi/x86/utils.c | 26 ++- drivers/base/regmap/regmap-i2c.c | 8 +- drivers/base/regmap/regmap-spi-avmm.c | 2 +- drivers/base/regmap/regmap.c | 6 +- drivers/bluetooth/btusb.c | 1 + drivers/dma-buf/dma-resv.c | 13 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 5 +- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 256 +++++++++------------ drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 7 + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 12 + .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 110 +++++++++ .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.h | 11 + .../amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 5 + .../drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 3 +- .../drm/amd/display/dc/dcn303/dcn303_resource.c | 2 +- .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 +- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 2 +- drivers/gpu/drm/drm_client_modeset.c | 6 + drivers/gpu/drm/i915/i915_perf.c | 1 + drivers/gpu/drm/nouveau/dispnv50/disp.c | 4 + drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h | 4 +- drivers/gpu/drm/nouveau/nvkm/engine/disp/uconn.c | 27 ++- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/base.c | 11 +- drivers/gpu/drm/radeon/radeon_cs.c | 3 +- drivers/gpu/drm/ttm/ttm_resource.c | 5 
+- drivers/hid/hid-ids.h | 1 + drivers/hid/hid-quirks.c | 1 + drivers/iommu/iommu-sva.c | 3 +- drivers/md/md.c | 14 +- drivers/md/raid10.c | 2 + drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 10 +- drivers/net/can/spi/mcp251xfd/mcp251xfd.h | 1 + drivers/net/can/usb/gs_usb.c | 130 ++++++----- drivers/net/dsa/microchip/ksz8795.c | 8 +- drivers/net/dsa/microchip/ksz_common.c | 8 +- drivers/net/dsa/microchip/ksz_common.h | 7 + drivers/net/dsa/mv88e6xxx/chip.c | 7 + drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 33 ++- .../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 29 ++- drivers/net/ethernet/intel/iavf/iavf.h | 16 +- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 39 ++-- drivers/net/ethernet/intel/iavf/iavf_main.c | 223 ++++++++++++------ drivers/net/ethernet/intel/iavf/iavf_txrx.c | 43 ++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 4 - drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 5 +- drivers/net/ethernet/intel/ice/ice_base.c | 2 + drivers/net/ethernet/intel/ice/ice_ethtool.c | 13 +- drivers/net/ethernet/intel/ice/ice_lib.c | 27 --- drivers/net/ethernet/intel/ice/ice_main.c | 10 +- drivers/net/ethernet/intel/igb/igb_main.c | 5 + drivers/net/ethernet/intel/igc/igc_main.c | 12 +- drivers/net/ethernet/litex/litex_liteeth.c | 19 +- .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 5 +- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 29 +-- drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c | 2 +- drivers/net/ethernet/realtek/r8169_main.c | 18 +- drivers/net/ethernet/ti/cpsw_ale.c | 24 +- drivers/net/phy/phy_device.c | 21 +- drivers/net/vrf.c | 12 +- drivers/net/wireless/ath/ath11k/core.c | 53 +++-- drivers/net/wireless/ath/ath11k/mac.c | 3 +- drivers/net/wireless/ath/ath11k/wmi.c | 5 + drivers/net/wireless/ath/ath12k/mac.c | 1 + drivers/net/wireless/intel/iwlwifi/mvm/mld-key.c | 9 +- drivers/net/wireless/intel/iwlwifi/mvm/power.c | 14 +- drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 +- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 4 + 
drivers/net/wireless/realtek/rtw88/sdio.c | 24 +- drivers/net/wireless/virtual/mac80211_hwsim.c | 4 +- drivers/of/platform.c | 2 +- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 28 ++- drivers/pinctrl/renesas/pinctrl-rzv2m.c | 28 ++- drivers/regulator/da9063-regulator.c | 3 + drivers/s390/crypto/zcrypt_msgtype6.c | 33 ++- drivers/scsi/sg.c | 10 + drivers/spi/spi-bcm63xx.c | 2 +- drivers/spi/spi-cadence-quadspi.c | 19 ++ drivers/spi/spi-dw-mmio.c | 22 ++ drivers/spi/spi-s3c64xx.c | 2 + drivers/video/fbdev/au1200fb.c | 3 + drivers/video/fbdev/imxfb.c | 5 +- fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.c | 10 +- fs/btrfs/disk-io.c | 3 + fs/btrfs/extent_io.c | 33 +-- fs/btrfs/inode.c | 35 +-- fs/btrfs/qgroup.c | 1 + fs/btrfs/raid56.c | 11 +- fs/btrfs/volumes.c | 17 +- fs/erofs/zdata.c | 2 +- fs/ext4/xattr.c | 14 ++ fs/fuse/dir.c | 2 +- fs/fuse/inode.c | 8 +- fs/fuse/ioctl.c | 21 +- fs/jbd2/checkpoint.c | 102 +++----- fs/jfs/jfs_dmap.c | 3 + fs/jfs/jfs_txnmgr.c | 5 + fs/jfs/namei.c | 5 + fs/overlayfs/ovl_entry.h | 9 + fs/quota/dquot.c | 5 +- fs/smb/client/connect.c | 19 +- fs/smb/client/dfs.c | 26 +-- fs/smb/client/smb2transport.c | 2 +- fs/udf/unicode.c | 2 +- include/kvm/arm_vgic.h | 2 +- include/linux/psi.h | 5 +- include/linux/psi_types.h | 3 + include/linux/sched/signal.h | 2 +- include/linux/tcp.h | 2 +- include/net/bluetooth/hci_core.h | 5 + include/net/ip.h | 2 +- include/net/tcp.h | 31 ++- include/uapi/linux/fuse.h | 3 + io_uring/io_uring.c | 52 ++--- kernel/bpf/bpf_lru_list.c | 21 +- kernel/bpf/bpf_lru_list.h | 7 +- kernel/bpf/btf.c | 23 +- kernel/bpf/log.c | 3 - kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 32 ++- kernel/cgroup/cgroup.c | 2 +- kernel/kallsyms.c | 5 +- kernel/rcu/tasks.h | 5 +- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 +- kernel/sched/fair.c | 4 +- kernel/sched/psi.c | 29 ++- kernel/sys.c | 10 +- kernel/time/posix-timers.c | 31 +-- kernel/trace/trace_events_hist.c | 3 +- lib/iov_iter.c | 2 +- lib/maple_tree.c | 3 +- mm/mlock.c 
| 9 +- net/bluetooth/hci_conn.c | 14 +- net/bluetooth/hci_core.c | 42 +++- net/bluetooth/hci_event.c | 15 +- net/bluetooth/hci_sync.c | 121 ++++++++-- net/bluetooth/iso.c | 55 +++-- net/bluetooth/mgmt.c | 26 +-- net/bluetooth/sco.c | 23 +- net/bridge/br_stp_if.c | 3 + net/can/bcm.c | 12 +- net/devlink/health.c | 2 +- net/devlink/leftover.c | 5 +- net/ipv4/esp4.c | 2 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 17 +- net/ipv4/inet_timewait_sock.c | 8 +- net/ipv4/ip_output.c | 4 +- net/ipv4/tcp.c | 57 ++--- net/ipv4/tcp_fastopen.c | 6 +- net/ipv4/tcp_ipv4.c | 27 ++- net/ipv4/tcp_minisocks.c | 11 +- net/ipv4/tcp_output.c | 6 +- net/ipv4/udp_offload.c | 16 +- net/ipv6/ip6_gre.c | 3 +- net/ipv6/tcp_ipv6.c | 4 +- net/ipv6/udp_offload.c | 3 +- net/llc/llc_input.c | 3 - net/netfilter/nf_tables_api.c | 12 +- net/netfilter/nft_set_pipapo.c | 6 +- net/sched/cls_bpf.c | 99 ++++---- net/sched/cls_matchall.c | 35 +-- net/sched/cls_u32.c | 48 +++- net/wireless/wext-core.c | 6 + scripts/Makefile.build | 5 +- scripts/Makefile.host | 6 +- scripts/kallsyms.c | 6 +- security/keys/request_key.c | 35 ++- security/keys/trusted-keys/trusted_tpm2.c | 2 +- sound/pci/emu10k1/emufx.c | 112 +-------- sound/pci/hda/patch_realtek.c | 100 +++++++- sound/soc/amd/acp/amd.h | 7 +- sound/soc/codecs/Kconfig | 1 + sound/soc/codecs/cs42l51-i2c.c | 6 + sound/soc/codecs/cs42l51.c | 7 - sound/soc/codecs/cs42l51.h | 1 - sound/soc/codecs/rt5640.c | 12 +- sound/soc/codecs/wcd-mbhc-v2.c | 57 +++-- sound/soc/codecs/wcd934x.c | 12 + sound/soc/codecs/wcd938x.c | 86 ++++++- sound/soc/fsl/fsl_sai.c | 8 +- sound/soc/fsl/fsl_sai.h | 1 + sound/soc/qcom/qdsp6/q6apm.c | 7 +- sound/soc/qcom/qdsp6/topology.c | 4 +- sound/soc/sof/ipc3-dtrace.c | 9 +- sound/soc/tegra/tegra210_adx.c | 34 ++- sound/soc/tegra/tegra210_amx.c | 40 ++-- tools/include/nolibc/stackprotector.h | 5 +- tools/perf/Makefile.config | 4 +- .../tests/shell/test_uprobe_from_different_cu.sh | 77 +++++++ tools/perf/util/dwarf-aux.c | 
4 +- tools/testing/radix-tree/maple.c | 6 +- tools/testing/selftests/mm/mkdirty.c | 2 +- tools/testing/selftests/tc-testing/config | 2 + tools/testing/selftests/tc-testing/settings | 1 + 218 files changed, 2462 insertions(+), 1482 deletions(-)
This is the start of the stable review cycle for the 6.4.7 release.
There are 227 patches in this series; all will be posted as responses
to this one. If anyone has any issues with these being applied, please
let me know.
Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000.
Anything received after that time might be too late.
The whole patch series can be found in one patch at:
https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz
or in the git tree and branch at:
git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y
and the diffstat can be found below.
thanks,
greg k-h
-------------
Pseudo-Shortlog of commits:
Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Linux 6.4.7-rc1
Wayne Lin <wayne.lin@amd.com>
drm/amd/display: Add polling method to handle MST reply packet
Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
drm/amd/display: Clean up errors & warnings in amdgpu_dm.c
Yu Kuai <yukuai3@huawei.com>
scsi: sg: Fix checking return value of blk_get_queue()
Yu Kuai <yukuai3@huawei.com>
scsi/sg: don't grab scsi host module reference
Abe Kohandel <abe.kohandel@intel.com>
spi: dw: Remove misleading comment for Mount Evans SoC
Yunxiang Li <Yunxiang.Li@amd.com>
drm/ttm: fix bulk_move corruption when adding a entry
Mohamed Khalfella <mkhalfella@purestorage.com>
tracing/histograms: Return an error if we fail to add histogram to hist_vars list
Miguel Ojeda <ojeda@kernel.org>
kbuild: rust: avoid creating temporary files
Zhang Yi <yi.zhang@huawei.com>
jbd2: recheck chechpointing non-dirty buffer
Vladimir Oltean <vladimir.oltean@nxp.com>
net: phy: prevent stale pointer dereference in phy_init()
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around fastopenq.max_qlen
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around icsk->icsk_user_timeout
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->notsent_lowat
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around rskq_defer_accept
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->linger2
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around icsk->icsk_syn_retries
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->keepalive_probes
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->keepalive_intvl
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->keepalive_time
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->tsoffset
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tp->tcp_tx_delay
Tomasz Moń <tomasz.mon@nordicsemi.no>
Bluetooth: btusb: Fix bluetooth on Intel Macbook 2014
Pauli Virtanen <pav@iki.fi>
Bluetooth: SCO: fix sco_conn related locking and validity issues
Siddh Raman Pant <code@siddh.me>
Bluetooth: hci_conn: return ERR_PTR instead of NULL when there is no link
Douglas Anderson <dianders@chromium.org>
Bluetooth: hci_sync: Avoid use-after-free in dbg for hci_remove_adv_monitor()
Pauli Virtanen <pav@iki.fi>
Bluetooth: ISO: fix iso_conn related locking and validity issues
Pauli Virtanen <pav@iki.fi>
Bluetooth: hci_event: call disconnect callback before deleting conn
Pauli Virtanen <pav@iki.fi>
Bluetooth: use RCU for hci_conn_params and iterate safely in hci_sync
Pablo Neira Ayuso <pablo@netfilter.org>
netfilter: nf_tables: skip bound chain on rule flush
Pablo Neira Ayuso <pablo@netfilter.org>
netfilter: nf_tables: skip bound chain in netns release path
Florian Westphal <fw@strlen.de>
netfilter: nft_set_pipapo: fix improper element removal
Florian Westphal <fw@strlen.de>
netfilter: nf_tables: can't schedule in nft_chain_validate
Florian Westphal <fw@strlen.de>
netfilter: nf_tables: fix spurious set element insertion failure
Vitaly Rodionov <vitalyr@opensource.cirrus.com>
ALSA: hda/realtek: Fix generic fixup definition for cs35l41 amp
Kuniyuki Iwashima <kuniyu@amazon.com>
llc: Don't drop packet from non-root netns.
Zhang Shurong <zhang_shurong@foxmail.com>
fbdev: au1200fb: Fix missing IRQ check in au1200fb_drv_probe
Daniel Golle <daniel@makrotopia.org>
net: ethernet: mtk_eth_soc: always mtk_get_ib1_pkt_type
Kuniyuki Iwashima <kuniyu@amazon.com>
Revert "tcp: avoid the lookup process failing to get sk in ehash table"
Yuanjun Gong <ruc_gongyuanjun@163.com>
net:ipv6: check return value of pskb_trim()
Wang Ming <machel@vivo.com>
net: ipv4: Use kfree_sensitive instead of kfree
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tcp_rsk(req)->ts_recent
Eric Dumazet <edumazet@google.com>
tcp: annotate data-races around tcp_rsk(req)->txhash
Antoine Tenart <atenart@kernel.org>
net: ipv4: use consistent txhash in TIME_WAIT and SYN_RECV
Florian Kauer <florian.kauer@linutronix.de>
igc: Prevent garbled TX queue with XDP ZEROCOPY
Kurt Kanzenbach <kurt@linutronix.de>
igc: Avoid transmit queue timeout for XDP
Alexander Duyck <alexanderduyck@fb.com>
bpf, arm64: Fix BTI type used for freplace attached functions
Kumar Kartikeya Dwivedi <memxor@gmail.com>
bpf: Repeat check_max_stack_depth for async callbacks
Kumar Kartikeya Dwivedi <memxor@gmail.com>
bpf: Fix subprog idx logic in check_max_stack_depth
Geetha sowjanya <gakula@marvell.com>
octeontx2-pf: Dont allocate BPIDs for LBK interfaces
Ido Schimmel <idosch@nvidia.com>
vrf: Fix lockdep splat in output path
Jiapeng Chong <jiapeng.chong@linux.alibaba.com>
security: keys: Modify mismatched function name
Ahmed Zaki <ahmed.zaki@intel.com>
iavf: fix reset task race with iavf_remove()
Ahmed Zaki <ahmed.zaki@intel.com>
iavf: fix a deadlock caused by rtnl and driver's lock circular dependencies
Marcin Szycik <marcin.szycik@linux.intel.com>
iavf: Wait for reset in callbacks which trigger it
Przemek Kitszel <przemyslaw.kitszel@intel.com>
iavf: make functions static where possible
Ahmed Zaki <ahmed.zaki@intel.com>
iavf: use internal state to free traffic IRQs
Ding Hui <dinghui@sangfor.com.cn>
iavf: Fix out-of-bounds when setting channels on remove
Ding Hui <dinghui@sangfor.com.cn>
iavf: Fix use-after-free in free_netdev
Andrzej Hajda <andrzej.hajda@intel.com>
drm/i915/perf: add sentinel to xehp_oa_b_counters
Heiner Kallweit <hkallweit1@gmail.com>
r8169: fix ASPM-related problem for chip version 42 and 43
Tristram Ha <Tristram.Ha@microchip.com>
net: dsa: microchip: correct KSZ8795 static MAC table access
Victor Nogueira <victor@mojatatu.com>
net: sched: cls_bpf: Undo tcf_bind_filter in case of an error
Victor Nogueira <victor@mojatatu.com>
net: sched: cls_u32: Undo refcount decrement in case update failed
Victor Nogueira <victor@mojatatu.com>
net: sched: cls_u32: Undo tcf_bind_filter if u32_replace_hw_knode
Victor Nogueira <victor@mojatatu.com>
net: sched: cls_matchall: Undo tcf_bind_filter in case of failure after mall_set_parms
Martin Fuzzey <martin.fuzzey@flowbird.group>
regulator: da9063: fix null pointer deref with partial DT config
Dan Carpenter <dan.carpenter@linaro.org>
ASoC: SOF: ipc3-dtrace: uninitialized data in dfsentry_trace_filter_write()
Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
ice: prevent NULL pointer deref during reload
Petr Oros <poros@redhat.com>
ice: Unregister netdev and devlink_port only once
Shyam Prasad N <nspmangalore@gmail.com>
cifs: fix mid leak during reconnection after timeout threshold
Dan Carpenter <error27@gmail.com>
iommu/sva: Fix signedness bug in iommu_sva_alloc_pasid()
Yan Zhai <yan@cloudflare.com>
gso: fix dodgy bit handling for GSO_UDP_L4
Daniel Golle <daniel@makrotopia.org>
net: ethernet: mtk_eth_soc: handle probe deferral
Kuniyuki Iwashima <kuniyu@amazon.com>
bridge: Add extack warning when enabling STP in netns.
Tanmay Patil <t-patil@ti.com>
net: ethernet: ti: cpsw_ale: Fix cpsw_ale_get_field()/cpsw_ale_set_field()
Linus Walleij <linus.walleij@linaro.org>
dsa: mv88e6xxx: Do a final check before timing out
Marc Zyngier <maz@kernel.org>
arm64: Fix HFGxTR_EL2 field naming
Paulo Alcantara <pc@manguebit.com>
smb: client: fix missed ses refcounting
Yonghong Song <yhs@fb.com>
kallsyms: strip LTO-only suffixes from promoted global functions
Jaewon Kim <jaewon02.kim@samsung.com>
spi: s3c64xx: clear loopback bit after loopback test
Christoph Hellwig <hch@lst.de>
btrfs: be a bit more careful when setting mirror_num_ret in btrfs_map_block
James Clark <james.clark@arm.com>
perf build: Fix library not found error when using CSLIBS
Yangtao Li <frank.li@vivo.com>
fbdev: imxfb: Removed unneeded release_mem_region
Martin Kaiser <martin@kaiser.cx>
fbdev: imxfb: warn about invalid left/right margin
Jonas Gorski <jonas.gorski@gmail.com>
spi: bcm63xx: fix max prepend length
Biju Das <biju.das.jz@bp.renesas.com>
pinctrl: renesas: rzg2l: Handle non-unique subnode names
Geert Uytterhoeven <geert+renesas@glider.be>
pinctrl: renesas: rzv2m: Handle non-unique subnode names
Suren Baghdasaryan <surenb@google.com>
sched/psi: use kernfs polling functions for PSI trigger polling
Miaohe Lin <linmiaohe@huawei.com>
sched/fair: Use recent_used_cpu to test p->cpus_ptr
Peter Zijlstra <peterz@infradead.org>
iov_iter: Mark copy_iovec_from_user() noclone
Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
ASoC: qcom: q6apm: do not close GPR port before closing graph
Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
ASoC: codecs: wcd938x: fix dB range for HPHL and HPHR
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix mbhc impedance loglevel
Vijendar Mukunda <Vijendar.Mukunda@amd.com>
ASoC: amd: acp: fix for invalid dai id handling in acp_get_byte_count()
Hao Chen <chenhao418@huawei.com>
net: hns3: fix strncpy() not using dest-buf length as length issue
Ying Hsu <yinghsu@chromium.org>
igb: Fix igb_down hung on surprise removal
Yi Kuo <yi@yikuo.dev>
wifi: iwlwifi: pcie: add device id 51F1 for killer 1675
Johannes Berg <johannes.berg@intel.com>
wifi: iwlwifi: mvm: avoid baid size integer overflow
Mukesh Sisodiya <mukesh.sisodiya@intel.com>
wifi: iwlwifi: Add support for new PCI Id
Gustavo A. R. Silva <gustavoars@kernel.org>
wifi: wext-core: Fix -Wstringop-overflow warning in ioctl_standard_iw_point()
Mukesh Sisodiya <mukesh.sisodiya@intel.com>
wifi: iwlwifi: mvm: Add NULL check before dereferencing the pointer
Petr Oros <poros@redhat.com>
devlink: report devlink_port_type_warn source device
Jisheng Zhang <jszhang@kernel.org>
net: ethernet: litex: add support for 64 bit stats
Gregory Greenman <gregory.greenman@intel.com>
wifi: iwlwifi: mvm: fix potential array out of bounds access
P Praneesh <quic_ppranees@quicinc.com>
wifi: ath11k: fix memory leak in WMI firmware stats
Balamurugan S <quic_bselvara@quicinc.com>
wifi: ath12k: Avoid NULL pointer access during management transmit cleanup
Abe Kohandel <abe.kohandel@intel.com>
spi: dw: Add compatible for Intel Mount Evans SoC
Ilan Peer <ilan.peer@intel.com>
wifi: mac80211_hwsim: Fix possible NULL dereference
Wen Gong <quic_wgong@quicinc.com>
wifi: ath11k: add support default regdb while searching board-2.bin for WCN6855
Jakub Kicinski <kuba@kernel.org>
devlink: make health report on unregistered instance warn just once
Yonghong Song <yhs@fb.com>
bpf: Silence a warning in btf_type_id_size()
Martin Blumenstingl <martin.blumenstingl@googlemail.com>
wifi: rtw88: sdio: Check the HISR RX_REQUEST bit in rtw_sdio_rx_isr()
Aditi Ghag <aditi.ghag@isovalent.com>
bpf: tcp: Avoid taking fast sock lock in iterator
Andrii Nakryiko <andrii@kernel.org>
bpf: drop unnecessary user-triggerable WARN_ONCE in verifierl log
Brad Larson <blarson@amd.com>
spi: cadence-quadspi: Add compatible for AMD Pensando Elba SoC
Martin KaFai Lau <martin.lau@kernel.org>
bpf: Address KCSAN report on bpf_lru_list
Kui-Feng Lee <thinker.li@gmail.com>
bpf: Print a warning only if writing to unprivileged_bpf_disabled.
Maxime Bizon <mbizon@freebox.fr>
wifi: ath11k: fix registration of 6Ghz-only phy without the full channel range
Yicong Yang <yangyicong@hisilicon.com>
sched/fair: Don't balance task to its current running CPU
Thomas Weißschuh <linux@weissschuh.net>
tools/nolibc: ensure stack protector guard is never zero
Paul E. McKenney <paulmck@kernel.org>
rcu: Mark additional concurrent load from ->cpu_no_qs.b.exp
Shigeru Yoshida <syoshida@redhat.com>
rcu-tasks: Avoid pr_info() with spin lock in cblist_init_generic()
Hans de Goede <hdegoede@redhat.com>
ACPI: video: Add backlight=native DMI quirk for Dell Studio 1569
Mark Rutland <mark.rutland@arm.com>
arm64: mm: fix VA-range sanity check
Youngmin Nam <youngmin.nam@samsung.com>
arm64: set __exception_irq_entry with __irq_entry as a default
Mario Limonciello <mario.limonciello@amd.com>
ACPI: resource: Remove "Zen" specific match and quirks
Hans de Goede <hdegoede@redhat.com>
ACPI: video: Add backlight=native DMI quirk for Lenovo ThinkPad X131e (3371 AMD version)
Hans de Goede <hdegoede@redhat.com>
ACPI: video: Add backlight=native DMI quirk for Apple iMac11,3
Hans de Goede <hdegoede@redhat.com>
ACPI: x86: Add ACPI_QUIRK_UART1_SKIP for Lenovo Yoga Book yb1-x90f/l
Hans de Goede <hdegoede@redhat.com>
ACPI: button: Add lid disable DMI quirk for Nextbook Ares 8A
Hans de Goede <hdegoede@redhat.com>
ACPI: x86: Add skip i2c clients quirk for Nextbook Ares 8A
Sandeep Dhavale <dhavale@google.com>
erofs: Fix detection of atomic context
Filipe Manana <fdmanana@suse.com>
btrfs: abort transaction at update_ref_for_cow() when ref count is zero
Christoph Hellwig <hch@lst.de>
btrfs: don't check PageError in __extent_writepage
David Sterba <dsterba@suse.com>
btrfs: add xxhash to fast checksum implementations
Thomas Gleixner <tglx@linutronix.de>
posix-timers: Ensure timer ID search-loop limit is valid
Ming Lei <ming.lei@redhat.com>
blk-mq: fix NULL dereference on q->elevator in blk_mq_elv_switch_none
Yu Kuai <yukuai3@huawei.com>
scsi: sg: fix blktrace debugfs entries leakage
Yu Kuai <yukuai3@huawei.com>
md/raid10: prevent soft lockup while flush writes
Yu Kuai <yukuai3@huawei.com>
md: fix data corruption for raid456 when reshape restart while grow up
Immad Mir <mirimmad17@gmail.com>
FS: JFS: Check for read-only mounted filesystem in txBegin
Immad Mir <mirimmad17@gmail.com>
FS: JFS: Fix null-ptr-deref Read in txBegin
Gustavo A. R. Silva <gustavoars@kernel.org>
MIPS: dec: prom: Address -Warray-bounds warning
Yogesh <yogi.kernel@gmail.com>
fs: jfs: Fix UBSAN: array-index-out-of-bounds in dbAllocDmapLev
Matthew Anderson <ruinairas1992@gmail.com>
ALSA: hda/realtek: Add quirks for ROG ALLY CS35l41 audio
Jan Kara <jack@suse.cz>
udf: Fix uninitialized array access for some pathnames
Christian Brauner <brauner@kernel.org>
ovl: check type and offset of struct vfsmount in ovl_entry
Marco Morandini <marco.morandini@polimi.it>
HID: add quirk for 03f0:464a HP Elite Presenter Mouse
Ye Bin <yebin10@huawei.com>
quota: fix warning in dqgrab()
Jan Kara <jack@suse.cz>
quota: Properly disable quotas when add_dquot_ref() fails
Oswald Buddenhagen <oswald.buddenhagen@gmx.de>
ALSA: emu10k1: roll up loops in DSP setup code for Audigy
hackyzh002 <hackyzh002@gmail.com>
drm/radeon: Fix integer overflow in radeon_cs_parser_init
Eric Whitney <enwlinux@gmail.com>
ext4: correct inline offset when handling xattrs in inode body
Marc Zyngier <maz@kernel.org>
KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption
Marc Zyngier <maz@kernel.org>
KVM: arm64: Disable preemption in kvm_arch_hardware_enable()
Oliver Upton <oliver.upton@linux.dev>
KVM: arm64: Correctly handle page aging notifiers for unaligned memslot
Marc Zyngier <maz@kernel.org>
KVM: arm64: timers: Use CNTHCTL_EL2 when setting non-CNTKCTL_EL1 bits
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix soundwire initialisation race
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix codec initialisation race
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd934x: fix resource leaks on component remove
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix missing mbhc init error handling
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix resource leaks on component remove
Sheetal <sheetal@nvidia.com>
ASoC: tegra: Fix AMX byte map
Johan Hovold <johan+linaro@kernel.org>
ASoC: qdsp6: audioreach: fix topology probe deferral
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd-mbhc-v2: fix resource leaks on component remove
Nathan Chancellor <nathan@kernel.org>
ASoC: cs35l45: Select REGMAP_IRQ
Johan Hovold <johan+linaro@kernel.org>
ASoC: codecs: wcd938x: fix missing clsh ctrl error handling
Thomas Petazzoni <thomas.petazzoni@bootlin.com>
ASoC: cs42l51: fix driver to properly autoload with automatic module loading
Sameer Pujar <spujar@nvidia.com>
ASoC: rt5640: Fix sleep in atomic context
Sheetal <sheetal@nvidia.com>
ASoC: tegra: Fix ADX byte map
Fabio Estevam <festevam@denx.de>
ASoC: fsl_sai: Revert "ASoC: fsl_sai: Enable MCTL_MCLK_EN bit for master mode"
Matus Gajdos <matuszpd@gmail.com>
ASoC: fsl_sai: Disable bit clock with transmitter
Nicholas Kazlauskas <nicholas.kazlauskas@amd.com>
drm/amd/display: Keep PHY active for DP displays on DCN31
Taimur Hassan <syed.hassan@amd.com>
drm/amd/display: check TG is non-null before checking if enabled
Zhikai Zhai <zhikai.zhai@amd.com>
drm/amd/display: Disable MPC split by default on special asic
Simon Ser <contact@emersion.fr>
drm/amd/display: only accept async flips for fast updates
Jocelyn Falempe <jfalempe@redhat.com>
drm/client: Fix memory leak in drm_client_modeset_probe
Jocelyn Falempe <jfalempe@redhat.com>
drm/client: Fix memory leak in drm_client_target_cloned
Ben Skeggs <bskeggs@redhat.com>
drm/nouveau/i2c: fix number of aux event slots
Ben Skeggs <bskeggs@redhat.com>
drm/nouveau/kms/nv50-: init hpd_irq_lock for PIOR DP
Ben Skeggs <bskeggs@redhat.com>
drm/nouveau/disp: PIOR DP uses GPIO for HPD, not PMGR AUX interrupts
Alex Deucher <alexander.deucher@amd.com>
drm/amdgpu/pm: make mclk consistent for smu 13.0.7
Alex Deucher <alexander.deucher@amd.com>
drm/amdgpu/pm: make gfxclock consistent for sienna cichlid
Guchun Chen <guchun.chen@amd.com>
drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel
Ville Syrjälä <ville.syrjala@linux.intel.com>
dma-buf/dma-resv: Stop leaking on krealloc() failure
Dan Carpenter <dan.carpenter@linaro.org>
accel/qaic: Add consistent integer overflow checks
Dan Carpenter <dan.carpenter@linaro.org>
accel/qaic: tighten bounds checking in decode_message()
Dan Carpenter <dan.carpenter@linaro.org>
accel/qaic: tighten bounds checking in encode_message()
Matthieu Baerts <matthieu.baerts@tessares.net>
selftests: tc: add ConnTrack procfs kconfig
Heiner Kallweit <hkallweit1@gmail.com>
Revert "r8169: disable ASPM during NAPI poll"
Marc Kleine-Budde <mkl@pengutronix.de>
can: gs_usb: fix time stamp counter initialization
Marc Kleine-Budde <mkl@pengutronix.de>
can: gs_usb: gs_can_open(): improve error handling
YueHaibing <yuehaibing@huawei.com>
can: bcm: Fix UAF in bcm_proc_show()
Fedor Ross <fedor.ross@ifm.com>
can: mcp251xfd: __mcp251xfd_chip_set_mode(): increase poll timeout
Mark Brown <broonie@kernel.org>
arm64/fpsimd: Ensure SME storage is allocated after SVE VL changes
Helge Deller <deller@gmx.de>
ia64: mmap: Consider pgoff when searching for free mapping
Mark Brown <broonie@kernel.org>
regmap: Account for register length in SMBus I/O limits
Rob Herring <robh@kernel.org>
of: Preserve "of-display" device name for compatibility
Harald Freudenberger <freude@linux.ibm.com>
s390/zcrypt: fix reply buffer calculations for CCA replies
Mark Brown <broonie@kernel.org>
regmap: Drop initial version of maximum transfer length fixes
Matthieu Baerts <matthieu.baerts@tessares.net>
selftests: tc: add 'ct' action kconfig dep
Dan Carpenter <dan.carpenter@linaro.org>
accel/qaic: Fix a leak in map_user_pages()
Matthieu Baerts <matthieu.baerts@tessares.net>
selftests: tc: set timeout to 15 minutes
Josef Bacik <josef@toxicpanda.com>
btrfs: fix race between balance and cancel/pause
Miklos Szeredi <mszeredi@redhat.com>
fuse: ioctl: translate ENOSYS in outarg
Filipe Manana <fdmanana@suse.com>
btrfs: zoned: fix memory leak after finding block group with super blocks
Filipe Manana <fdmanana@suse.com>
btrfs: fix double iput() on inode after an error during orphan cleanup
Josef Bacik <josef@toxicpanda.com>
btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand
Qu Wenruo <wqu@suse.com>
btrfs: raid56: always verify the P/Q contents for scrub
Bernd Schubert <bschubert@ddn.com>
fuse: Apply flags2 only when userspace set the FUSE_INIT_EXT
Miklos Szeredi <mszeredi@redhat.com>
fuse: add feature flag for expire-only
Miklos Szeredi <mszeredi@redhat.com>
fuse: revalidate: don't invalidate if interrupted
Filipe Manana <fdmanana@suse.com>
btrfs: fix warning when putting transaction with qgroups enabled after abort
Filipe Manana <fdmanana@suse.com>
btrfs: fix iput() on error pointer after error during orphan cleanup
Georg Müller <georgmueller@gmx.net>
perf probe: Read DWARF files from the correct CU
Georg Müller <georgmueller@gmx.net>
perf probe: Add test for regression introduced by switch to die_get_decl_file()
Miguel Ojeda <ojeda@kernel.org>
prctl: move PR_GET_AUXV out of PR_MCE_KILL
Petr Pavlu <petr.pavlu@suse.com>
keys: Fix linking a duplicate key to a keyring's assoc_array
Colin Ian King <colin.i.king@gmail.com>
selftests/mm: mkdirty: fix incorrect position of #endif
Liam R. Howlett <Liam.Howlett@oracle.com>
maple_tree: fix node allocation testing on 32 bit
Liam R. Howlett <Liam.Howlett@oracle.com>
mm/mlock: fix vma iterator conversion of apply_vma_lock_flags()
Peng Zhang <zhangpeng.00@bytedance.com>
maple_tree: set the node limit when creating a new root node
Luka Guzenko <l.guzenko@web.de>
ALSA: hda/realtek: Enable Mute LED on HP Laptop 15s-eq2xxx
Christoffer Sandberg <cs@tuxedo.de>
ALSA: hda/realtek: Add quirk for Clevo NS70AU
Kailang Yang <kailang@realtek.com>
ALSA: hda/realtek - remove 3k pull low procedure
Helge Deller <deller@gmx.de>
io_uring: Fix io_uring mmap() by using architecture-provided get_unmapped_area()
Jens Axboe <axboe@kernel.dk>
io_uring: treat -EAGAIN for REQ_F_NOWAIT as final for io-wq
-------------
Diffstat:
Makefile | 4 +-
arch/arm64/include/asm/exception.h | 5 -
arch/arm64/include/asm/kvm_host.h | 2 +
arch/arm64/include/asm/kvm_pgtable.h | 26 +--
arch/arm64/kernel/fpsimd.c | 33 ++-
arch/arm64/kvm/arch_timer.c | 6 +-
arch/arm64/kvm/arm.c | 19 +-
arch/arm64/kvm/hyp/pgtable.c | 47 +++-
arch/arm64/kvm/mmu.c | 18 +-
arch/arm64/kvm/vgic/vgic-v3.c | 2 +-
arch/arm64/kvm/vgic/vgic-v4.c | 7 +-
arch/arm64/mm/mmu.c | 4 +-
arch/arm64/net/bpf_jit_comp.c | 8 +-
arch/arm64/tools/sysreg | 12 +-
arch/ia64/kernel/sys_ia64.c | 2 +-
arch/mips/include/asm/dec/prom.h | 2 +-
arch/parisc/kernel/sys_parisc.c | 15 +-
block/blk-mq.c | 10 +-
drivers/accel/qaic/qaic_control.c | 39 ++--
drivers/acpi/button.c | 9 +
drivers/acpi/resource.c | 60 -----
drivers/acpi/video_detect.c | 24 ++
drivers/acpi/x86/utils.c | 26 ++-
drivers/base/regmap/regmap-i2c.c | 8 +-
drivers/base/regmap/regmap-spi-avmm.c | 2 +-
drivers/base/regmap/regmap.c | 6 +-
drivers/bluetooth/btusb.c | 1 +
drivers/dma-buf/dma-resv.c | 13 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 5 +-
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 256 +++++++++------------
drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 7 +
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 12 +
.../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 110 +++++++++
.../amd/display/amdgpu_dm/amdgpu_dm_mst_types.h | 11 +
.../amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 5 +
.../drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 3 +-
.../drm/amd/display/dc/dcn303/dcn303_resource.c | 2 +-
.../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 +-
.../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 2 +-
drivers/gpu/drm/drm_client_modeset.c | 6 +
drivers/gpu/drm/i915/i915_perf.c | 1 +
drivers/gpu/drm/nouveau/dispnv50/disp.c | 4 +
drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h | 4 +-
drivers/gpu/drm/nouveau/nvkm/engine/disp/uconn.c | 27 ++-
drivers/gpu/drm/nouveau/nvkm/subdev/i2c/base.c | 11 +-
drivers/gpu/drm/radeon/radeon_cs.c | 3 +-
drivers/gpu/drm/ttm/ttm_resource.c | 5 +-
drivers/hid/hid-ids.h | 1 +
drivers/hid/hid-quirks.c | 1 +
drivers/iommu/iommu-sva.c | 3 +-
drivers/md/md.c | 14 +-
drivers/md/raid10.c | 2 +
drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 10 +-
drivers/net/can/spi/mcp251xfd/mcp251xfd.h | 1 +
drivers/net/can/usb/gs_usb.c | 130 ++++++-----
drivers/net/dsa/microchip/ksz8795.c | 8 +-
drivers/net/dsa/microchip/ksz_common.c | 8 +-
drivers/net/dsa/microchip/ksz_common.h | 7 +
drivers/net/dsa/mv88e6xxx/chip.c | 7 +
drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 33 ++-
.../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 29 ++-
drivers/net/ethernet/intel/iavf/iavf.h | 16 +-
drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 39 ++--
drivers/net/ethernet/intel/iavf/iavf_main.c | 223 ++++++++++++------
drivers/net/ethernet/intel/iavf/iavf_txrx.c | 43 ++--
drivers/net/ethernet/intel/iavf/iavf_txrx.h | 4 -
drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 5 +-
drivers/net/ethernet/intel/ice/ice_base.c | 2 +
drivers/net/ethernet/intel/ice/ice_ethtool.c | 13 +-
drivers/net/ethernet/intel/ice/ice_lib.c | 27 ---
drivers/net/ethernet/intel/ice/ice_main.c | 10 +-
drivers/net/ethernet/intel/igb/igb_main.c | 5 +
drivers/net/ethernet/intel/igc/igc_main.c | 12 +-
drivers/net/ethernet/litex/litex_liteeth.c | 19 +-
.../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 5 +-
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 29 +--
drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c | 2 +-
drivers/net/ethernet/realtek/r8169_main.c | 18 +-
drivers/net/ethernet/ti/cpsw_ale.c | 24 +-
drivers/net/phy/phy_device.c | 21 +-
drivers/net/vrf.c | 12 +-
drivers/net/wireless/ath/ath11k/core.c | 53 +++--
drivers/net/wireless/ath/ath11k/mac.c | 3 +-
drivers/net/wireless/ath/ath11k/wmi.c | 5 +
drivers/net/wireless/ath/ath12k/mac.c | 1 +
drivers/net/wireless/intel/iwlwifi/mvm/mld-key.c | 9 +-
drivers/net/wireless/intel/iwlwifi/mvm/power.c | 14 +-
drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 +-
drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 4 +
drivers/net/wireless/realtek/rtw88/sdio.c | 24 +-
drivers/net/wireless/virtual/mac80211_hwsim.c | 4 +-
drivers/of/platform.c | 2 +-
drivers/pinctrl/renesas/pinctrl-rzg2l.c | 28 ++-
drivers/pinctrl/renesas/pinctrl-rzv2m.c | 28 ++-
drivers/regulator/da9063-regulator.c | 3 +
drivers/s390/crypto/zcrypt_msgtype6.c | 33 ++-
drivers/scsi/sg.c | 10 +
drivers/spi/spi-bcm63xx.c | 2 +-
drivers/spi/spi-cadence-quadspi.c | 19 ++
drivers/spi/spi-dw-mmio.c | 22 ++
drivers/spi/spi-s3c64xx.c | 2 +
drivers/video/fbdev/au1200fb.c | 3 +
drivers/video/fbdev/imxfb.c | 5 +-
fs/btrfs/block-group.c | 1 +
fs/btrfs/ctree.c | 10 +-
fs/btrfs/disk-io.c | 3 +
fs/btrfs/extent_io.c | 33 +--
fs/btrfs/inode.c | 35 +--
fs/btrfs/qgroup.c | 1 +
fs/btrfs/raid56.c | 11 +-
fs/btrfs/volumes.c | 17 +-
fs/erofs/zdata.c | 2 +-
fs/ext4/xattr.c | 14 ++
fs/fuse/dir.c | 2 +-
fs/fuse/inode.c | 8 +-
fs/fuse/ioctl.c | 21 +-
fs/jbd2/checkpoint.c | 102 +++-----
fs/jfs/jfs_dmap.c | 3 +
fs/jfs/jfs_txnmgr.c | 5 +
fs/jfs/namei.c | 5 +
fs/overlayfs/ovl_entry.h | 9 +
fs/quota/dquot.c | 5 +-
fs/smb/client/connect.c | 19 +-
fs/smb/client/dfs.c | 26 +--
fs/smb/client/smb2transport.c | 2 +-
fs/udf/unicode.c | 2 +-
include/kvm/arm_vgic.h | 2 +-
include/linux/psi.h | 5 +-
include/linux/psi_types.h | 3 +
include/linux/sched/signal.h | 2 +-
include/linux/tcp.h | 2 +-
include/net/bluetooth/hci_core.h | 5 +
include/net/ip.h | 2 +-
include/net/tcp.h | 31 ++-
include/uapi/linux/fuse.h | 3 +
io_uring/io_uring.c | 52 ++---
kernel/bpf/bpf_lru_list.c | 21 +-
kernel/bpf/bpf_lru_list.h | 7 +-
kernel/bpf/btf.c | 23 +-
kernel/bpf/log.c | 3 -
kernel/bpf/syscall.c | 3 +-
kernel/bpf/verifier.c | 32 ++-
kernel/cgroup/cgroup.c | 2 +-
kernel/kallsyms.c | 5 +-
kernel/rcu/tasks.h | 5 +-
kernel/rcu/tree_exp.h | 2 +-
kernel/rcu/tree_plugin.h | 4 +-
kernel/sched/fair.c | 4 +-
kernel/sched/psi.c | 29 ++-
kernel/sys.c | 10 +-
kernel/time/posix-timers.c | 31 +--
kernel/trace/trace_events_hist.c | 3 +-
lib/iov_iter.c | 2 +-
lib/maple_tree.c | 3 +-
mm/mlock.c | 9 +-
net/bluetooth/hci_conn.c | 14 +-
net/bluetooth/hci_core.c | 42 +++-
net/bluetooth/hci_event.c | 15 +-
net/bluetooth/hci_sync.c | 121 ++++++++--
net/bluetooth/iso.c | 55 +++--
net/bluetooth/mgmt.c | 26 +--
net/bluetooth/sco.c | 23 +-
net/bridge/br_stp_if.c | 3 +
net/can/bcm.c | 12 +-
net/devlink/health.c | 2 +-
net/devlink/leftover.c | 5 +-
net/ipv4/esp4.c | 2 +-
net/ipv4/inet_connection_sock.c | 2 +-
net/ipv4/inet_hashtables.c | 17 +-
net/ipv4/inet_timewait_sock.c | 8 +-
net/ipv4/ip_output.c | 4 +-
net/ipv4/tcp.c | 57 ++---
net/ipv4/tcp_fastopen.c | 6 +-
net/ipv4/tcp_ipv4.c | 27 ++-
net/ipv4/tcp_minisocks.c | 11 +-
net/ipv4/tcp_output.c | 6 +-
net/ipv4/udp_offload.c | 16 +-
net/ipv6/ip6_gre.c | 3 +-
net/ipv6/tcp_ipv6.c | 4 +-
net/ipv6/udp_offload.c | 3 +-
net/llc/llc_input.c | 3 -
net/netfilter/nf_tables_api.c | 12 +-
net/netfilter/nft_set_pipapo.c | 6 +-
net/sched/cls_bpf.c | 99 ++++----
net/sched/cls_matchall.c | 35 +--
net/sched/cls_u32.c | 48 +++-
net/wireless/wext-core.c | 6 +
scripts/Makefile.build | 5 +-
scripts/Makefile.host | 6 +-
scripts/kallsyms.c | 6 +-
security/keys/request_key.c | 35 ++-
security/keys/trusted-keys/trusted_tpm2.c | 2 +-
sound/pci/emu10k1/emufx.c | 112 +--------
sound/pci/hda/patch_realtek.c | 100 +++++++-
sound/soc/amd/acp/amd.h | 7 +-
sound/soc/codecs/Kconfig | 1 +
sound/soc/codecs/cs42l51-i2c.c | 6 +
sound/soc/codecs/cs42l51.c | 7 -
sound/soc/codecs/cs42l51.h | 1 -
sound/soc/codecs/rt5640.c | 12 +-
sound/soc/codecs/wcd-mbhc-v2.c | 57 +++--
sound/soc/codecs/wcd934x.c | 12 +
sound/soc/codecs/wcd938x.c | 86 ++++++-
sound/soc/fsl/fsl_sai.c | 8 +-
sound/soc/fsl/fsl_sai.h | 1 +
sound/soc/qcom/qdsp6/q6apm.c | 7 +-
sound/soc/qcom/qdsp6/topology.c | 4 +-
sound/soc/sof/ipc3-dtrace.c | 9 +-
sound/soc/tegra/tegra210_adx.c | 34 ++-
sound/soc/tegra/tegra210_amx.c | 40 ++--
tools/include/nolibc/stackprotector.h | 5 +-
tools/perf/Makefile.config | 4 +-
.../tests/shell/test_uprobe_from_different_cu.sh | 77 +++++++
tools/perf/util/dwarf-aux.c | 4 +-
tools/testing/radix-tree/maple.c | 6 +-
tools/testing/selftests/mm/mkdirty.c | 2 +-
tools/testing/selftests/tc-testing/config | 2 +
tools/testing/selftests/tc-testing/settings | 1 +
218 files changed, 2462 insertions(+), 1482 deletions(-)
On Tue, Jul 25, 2023 at 12:42:47PM +0200, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > or in the git tree and branch at: > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > and the diffstat can be found below. I saw the warning below when running rcutorture; it happened in the TREE04 configuration. This is likely due to the stuttering issues we are discussing in the other thread. Anyway, I am just making a note here while I continue to look into it. Other than that, all tests pass: Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> [ 1676.206713] ------------[ cut here ]------------ [ 1676.213985] rcutorture_oom_notify invoked upon OOM during forward-progress testing. 
[ 1676.224945] WARNING: CPU: 7 PID: 103 at kernel/rcu/rcutorture.c:2841 rcutorture_oom_notify+0x3c/0x1d0 [ 1676.238323] Modules linked in: [ 1676.242750] CPU: 7 PID: 103 Comm: rcu_torture_fwd Not tainted 6.4.7-rc1-g3c19c5641cce #6 [ 1676.254378] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 1676.268003] RIP: 0010:rcutorture_oom_notify+0x3c/0x1d0 [ 1676.275468] Code: d5 53 e8 e7 23 d4 00 48 8b 1d 70 34 45 02 48 85 db 0f 84 88 01 00 00 48 c7 c6 e0 f6 a0 b2 48 c7 c7 88 91 ee b2 e8 14 25 f7 ff <0f> 0b 8b 35 8c d8 a2 01 85 f6 7e 40 45 31 ed 4d 63 e5 41 83 c5 01 [ 1676.302738] RSP: 0000:ffffa7c6c0397a98 EFLAGS: 00010282 [ 1676.310984] RAX: 0000000000000000 RBX: ffff897a418cc000 RCX: 00000000ffffdfff [ 1676.322207] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000009ffb [ 1676.333232] RBP: ffffa7c6c0397b28 R08: 00000000ffffdfff R09: 00000000ffffdfff [ 1676.342365] R10: ffffffffb32591e0 R11: ffffffffb32591e0 R12: 0000000000000000 [ 1676.352563] R13: ffffa7c6c0397b28 R14: 00000000ffffffff R15: 0000000000000000 [ 1676.362721] FS: 0000000000000000(0000) GS:ffff897a5f5c0000(0000) knlGS:0000000000000000 [ 1676.374816] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1676.383256] CR2: 0000000000000000 CR3: 000000001e22e000 CR4: 00000000000006e0 [ 1676.392499] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1676.401739] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1676.410804] Call Trace: [ 1676.414279] <TASK> [ 1676.417140] ? rcutorture_oom_notify+0x3c/0x1d0 [ 1676.422944] ? __warn+0x7c/0x120 [ 1676.427146] ? rcutorture_oom_notify+0x3c/0x1d0 [ 1676.432902] ? report_bug+0x15d/0x180 [ 1676.437783] ? handle_bug+0x3c/0x70 [ 1676.442369] ? exc_invalid_op+0x17/0x70 [ 1676.447269] ? asm_exc_invalid_op+0x1a/0x20 [ 1676.452574] ? rcutorture_oom_notify+0x3c/0x1d0 [ 1676.458128] ? 
rcutorture_oom_notify+0x3c/0x1d0 [ 1676.463880] notifier_call_chain+0x55/0xb0 [ 1676.469255] blocking_notifier_call_chain+0x3a/0x60 [ 1676.475244] out_of_memory+0x3bc/0x710 [ 1676.480323] __alloc_pages_slowpath.constprop.0+0xbb6/0xd00 [ 1676.487347] __alloc_pages+0x2cb/0x2e0 [ 1676.492200] allocate_slab+0x348/0x3e0 [ 1676.496983] ? sysvec_reschedule_ipi+0x31/0xd0 [ 1676.502607] ___slab_alloc+0x2d8/0x7a0 [ 1676.507406] ? rcu_torture_fwd_prog+0x3d8/0xa60 [ 1676.513157] ? asm_sysvec_apic_timer_interrupt+0x1a/0x20 [ 1676.519767] ? rcu_nocb_do_flush_bypass+0xc6/0x110 [ 1676.525875] ? rcu_torture_fwd_prog+0x3d8/0xa60 [ 1676.531607] __kmem_cache_alloc_node+0x183/0x1a0 [ 1676.537506] kmalloc_trace+0x25/0x90 [ 1676.542240] rcu_torture_fwd_prog+0x3d8/0xa60 [ 1676.547800] ? __pfx_rcu_torture_fwd_prog+0x10/0x10 [ 1676.554051] ? kthread+0xcb/0xf0 [ 1676.558286] ? __pfx_rcu_torture_fwd_prog+0x10/0x10 [ 1676.564594] kthread+0xcb/0xf0 [ 1676.568731] ? __pfx_kthread+0x10/0x10 [ 1676.573590] ret_from_fork+0x2c/0x50 [ 1676.578317] </TASK> [ 1676.581240] ---[ end trace 0000000000000000 ]--- thanks, - Joel > > thanks, > > greg k-h > > ------------- > Pseudo-Shortlog of commits: > > Greg Kroah-Hartman <gregkh@linuxfoundation.org> > Linux 6.4.7-rc1 > > Wayne Lin <wayne.lin@amd.com> > drm/amd/display: Add polling method to handle MST reply packet > > Srinivasan Shanmugam <srinivasan.shanmugam@amd.com> > drm/amd/display: Clean up errors & warnings in amdgpu_dm.c > > Yu Kuai <yukuai3@huawei.com> > scsi: sg: Fix checking return value of blk_get_queue() > > Yu Kuai <yukuai3@huawei.com> > scsi/sg: don't grab scsi host module reference > > Abe Kohandel <abe.kohandel@intel.com> > spi: dw: Remove misleading comment for Mount Evans SoC > > Yunxiang Li <Yunxiang.Li@amd.com> > drm/ttm: fix bulk_move corruption when adding a entry > > Mohamed Khalfella <mkhalfella@purestorage.com> > tracing/histograms: Return an error if we fail to add histogram to hist_vars list > > Miguel Ojeda <ojeda@kernel.org> > 
kbuild: rust: avoid creating temporary files > > Zhang Yi <yi.zhang@huawei.com> > jbd2: recheck chechpointing non-dirty buffer > > Vladimir Oltean <vladimir.oltean@nxp.com> > net: phy: prevent stale pointer dereference in phy_init() > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around fastopenq.max_qlen > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around icsk->icsk_user_timeout > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->notsent_lowat > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around rskq_defer_accept > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->linger2 > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around icsk->icsk_syn_retries > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->keepalive_probes > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->keepalive_intvl > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->keepalive_time > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->tsoffset > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tp->tcp_tx_delay > > Tomasz Moń <tomasz.mon@nordicsemi.no> > Bluetooth: btusb: Fix bluetooth on Intel Macbook 2014 > > Pauli Virtanen <pav@iki.fi> > Bluetooth: SCO: fix sco_conn related locking and validity issues > > Siddh Raman Pant <code@siddh.me> > Bluetooth: hci_conn: return ERR_PTR instead of NULL when there is no link > > Douglas Anderson <dianders@chromium.org> > Bluetooth: hci_sync: Avoid use-after-free in dbg for hci_remove_adv_monitor() > > Pauli Virtanen <pav@iki.fi> > Bluetooth: ISO: fix iso_conn related locking and validity issues > > Pauli Virtanen <pav@iki.fi> > Bluetooth: hci_event: call disconnect callback before deleting conn > > Pauli Virtanen <pav@iki.fi> > Bluetooth: use RCU for hci_conn_params and iterate safely in hci_sync > > Pablo Neira 
Ayuso <pablo@netfilter.org> > netfilter: nf_tables: skip bound chain on rule flush > > Pablo Neira Ayuso <pablo@netfilter.org> > netfilter: nf_tables: skip bound chain in netns release path > > Florian Westphal <fw@strlen.de> > netfilter: nft_set_pipapo: fix improper element removal > > Florian Westphal <fw@strlen.de> > netfilter: nf_tables: can't schedule in nft_chain_validate > > Florian Westphal <fw@strlen.de> > netfilter: nf_tables: fix spurious set element insertion failure > > Vitaly Rodionov <vitalyr@opensource.cirrus.com> > ALSA: hda/realtek: Fix generic fixup definition for cs35l41 amp > > Kuniyuki Iwashima <kuniyu@amazon.com> > llc: Don't drop packet from non-root netns. > > Zhang Shurong <zhang_shurong@foxmail.com> > fbdev: au1200fb: Fix missing IRQ check in au1200fb_drv_probe > > Daniel Golle <daniel@makrotopia.org> > net: ethernet: mtk_eth_soc: always mtk_get_ib1_pkt_type > > Kuniyuki Iwashima <kuniyu@amazon.com> > Revert "tcp: avoid the lookup process failing to get sk in ehash table" > > Yuanjun Gong <ruc_gongyuanjun@163.com> > net:ipv6: check return value of pskb_trim() > > Wang Ming <machel@vivo.com> > net: ipv4: Use kfree_sensitive instead of kfree > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tcp_rsk(req)->ts_recent > > Eric Dumazet <edumazet@google.com> > tcp: annotate data-races around tcp_rsk(req)->txhash > > Antoine Tenart <atenart@kernel.org> > net: ipv4: use consistent txhash in TIME_WAIT and SYN_RECV > > Florian Kauer <florian.kauer@linutronix.de> > igc: Prevent garbled TX queue with XDP ZEROCOPY > > Kurt Kanzenbach <kurt@linutronix.de> > igc: Avoid transmit queue timeout for XDP > > Alexander Duyck <alexanderduyck@fb.com> > bpf, arm64: Fix BTI type used for freplace attached functions > > Kumar Kartikeya Dwivedi <memxor@gmail.com> > bpf: Repeat check_max_stack_depth for async callbacks > > Kumar Kartikeya Dwivedi <memxor@gmail.com> > bpf: Fix subprog idx logic in check_max_stack_depth > > Geetha sowjanya 
<gakula@marvell.com> > octeontx2-pf: Dont allocate BPIDs for LBK interfaces > > Ido Schimmel <idosch@nvidia.com> > vrf: Fix lockdep splat in output path > > Jiapeng Chong <jiapeng.chong@linux.alibaba.com> > security: keys: Modify mismatched function name > > Ahmed Zaki <ahmed.zaki@intel.com> > iavf: fix reset task race with iavf_remove() > > Ahmed Zaki <ahmed.zaki@intel.com> > iavf: fix a deadlock caused by rtnl and driver's lock circular dependencies > > Marcin Szycik <marcin.szycik@linux.intel.com> > iavf: Wait for reset in callbacks which trigger it > > Przemek Kitszel <przemyslaw.kitszel@intel.com> > iavf: make functions static where possible > > Ahmed Zaki <ahmed.zaki@intel.com> > iavf: use internal state to free traffic IRQs > > Ding Hui <dinghui@sangfor.com.cn> > iavf: Fix out-of-bounds when setting channels on remove > > Ding Hui <dinghui@sangfor.com.cn> > iavf: Fix use-after-free in free_netdev > > Andrzej Hajda <andrzej.hajda@intel.com> > drm/i915/perf: add sentinel to xehp_oa_b_counters > > Heiner Kallweit <hkallweit1@gmail.com> > r8169: fix ASPM-related problem for chip version 42 and 43 > > Tristram Ha <Tristram.Ha@microchip.com> > net: dsa: microchip: correct KSZ8795 static MAC table access > > Victor Nogueira <victor@mojatatu.com> > net: sched: cls_bpf: Undo tcf_bind_filter in case of an error > > Victor Nogueira <victor@mojatatu.com> > net: sched: cls_u32: Undo refcount decrement in case update failed > > Victor Nogueira <victor@mojatatu.com> > net: sched: cls_u32: Undo tcf_bind_filter if u32_replace_hw_knode > > Victor Nogueira <victor@mojatatu.com> > net: sched: cls_matchall: Undo tcf_bind_filter in case of failure after mall_set_parms > > Martin Fuzzey <martin.fuzzey@flowbird.group> > regulator: da9063: fix null pointer deref with partial DT config > > Dan Carpenter <dan.carpenter@linaro.org> > ASoC: SOF: ipc3-dtrace: uninitialized data in dfsentry_trace_filter_write() > > Michal Swiatkowski <michal.swiatkowski@linux.intel.com> > ice: prevent 
NULL pointer deref during reload > > Petr Oros <poros@redhat.com> > ice: Unregister netdev and devlink_port only once > > Shyam Prasad N <nspmangalore@gmail.com> > cifs: fix mid leak during reconnection after timeout threshold > > Dan Carpenter <error27@gmail.com> > iommu/sva: Fix signedness bug in iommu_sva_alloc_pasid() > > Yan Zhai <yan@cloudflare.com> > gso: fix dodgy bit handling for GSO_UDP_L4 > > Daniel Golle <daniel@makrotopia.org> > net: ethernet: mtk_eth_soc: handle probe deferral > > Kuniyuki Iwashima <kuniyu@amazon.com> > bridge: Add extack warning when enabling STP in netns. > > Tanmay Patil <t-patil@ti.com> > net: ethernet: ti: cpsw_ale: Fix cpsw_ale_get_field()/cpsw_ale_set_field() > > Linus Walleij <linus.walleij@linaro.org> > dsa: mv88e6xxx: Do a final check before timing out > > Marc Zyngier <maz@kernel.org> > arm64: Fix HFGxTR_EL2 field naming > > Paulo Alcantara <pc@manguebit.com> > smb: client: fix missed ses refcounting > > Yonghong Song <yhs@fb.com> > kallsyms: strip LTO-only suffixes from promoted global functions > > Jaewon Kim <jaewon02.kim@samsung.com> > spi: s3c64xx: clear loopback bit after loopback test > > Christoph Hellwig <hch@lst.de> > btrfs: be a bit more careful when setting mirror_num_ret in btrfs_map_block > > James Clark <james.clark@arm.com> > perf build: Fix library not found error when using CSLIBS > > Yangtao Li <frank.li@vivo.com> > fbdev: imxfb: Removed unneeded release_mem_region > > Martin Kaiser <martin@kaiser.cx> > fbdev: imxfb: warn about invalid left/right margin > > Jonas Gorski <jonas.gorski@gmail.com> > spi: bcm63xx: fix max prepend length > > Biju Das <biju.das.jz@bp.renesas.com> > pinctrl: renesas: rzg2l: Handle non-unique subnode names > > Geert Uytterhoeven <geert+renesas@glider.be> > pinctrl: renesas: rzv2m: Handle non-unique subnode names > > Suren Baghdasaryan <surenb@google.com> > sched/psi: use kernfs polling functions for PSI trigger polling > > Miaohe Lin <linmiaohe@huawei.com> > sched/fair: Use 
recent_used_cpu to test p->cpus_ptr > > Peter Zijlstra <peterz@infradead.org> > iov_iter: Mark copy_iovec_from_user() noclone > > Srinivas Kandagatla <srinivas.kandagatla@linaro.org> > ASoC: qcom: q6apm: do not close GPR port before closing graph > > Srinivas Kandagatla <srinivas.kandagatla@linaro.org> > ASoC: codecs: wcd938x: fix dB range for HPHL and HPHR > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix mbhc impedance loglevel > > Vijendar Mukunda <Vijendar.Mukunda@amd.com> > ASoC: amd: acp: fix for invalid dai id handling in acp_get_byte_count() > > Hao Chen <chenhao418@huawei.com> > net: hns3: fix strncpy() not using dest-buf length as length issue > > Ying Hsu <yinghsu@chromium.org> > igb: Fix igb_down hung on surprise removal > > Yi Kuo <yi@yikuo.dev> > wifi: iwlwifi: pcie: add device id 51F1 for killer 1675 > > Johannes Berg <johannes.berg@intel.com> > wifi: iwlwifi: mvm: avoid baid size integer overflow > > Mukesh Sisodiya <mukesh.sisodiya@intel.com> > wifi: iwlwifi: Add support for new PCI Id > > Gustavo A. R. 
Silva <gustavoars@kernel.org> > wifi: wext-core: Fix -Wstringop-overflow warning in ioctl_standard_iw_point() > > Mukesh Sisodiya <mukesh.sisodiya@intel.com> > wifi: iwlwifi: mvm: Add NULL check before dereferencing the pointer > > Petr Oros <poros@redhat.com> > devlink: report devlink_port_type_warn source device > > Jisheng Zhang <jszhang@kernel.org> > net: ethernet: litex: add support for 64 bit stats > > Gregory Greenman <gregory.greenman@intel.com> > wifi: iwlwifi: mvm: fix potential array out of bounds access > > P Praneesh <quic_ppranees@quicinc.com> > wifi: ath11k: fix memory leak in WMI firmware stats > > Balamurugan S <quic_bselvara@quicinc.com> > wifi: ath12k: Avoid NULL pointer access during management transmit cleanup > > Abe Kohandel <abe.kohandel@intel.com> > spi: dw: Add compatible for Intel Mount Evans SoC > > Ilan Peer <ilan.peer@intel.com> > wifi: mac80211_hwsim: Fix possible NULL dereference > > Wen Gong <quic_wgong@quicinc.com> > wifi: ath11k: add support default regdb while searching board-2.bin for WCN6855 > > Jakub Kicinski <kuba@kernel.org> > devlink: make health report on unregistered instance warn just once > > Yonghong Song <yhs@fb.com> > bpf: Silence a warning in btf_type_id_size() > > Martin Blumenstingl <martin.blumenstingl@googlemail.com> > wifi: rtw88: sdio: Check the HISR RX_REQUEST bit in rtw_sdio_rx_isr() > > Aditi Ghag <aditi.ghag@isovalent.com> > bpf: tcp: Avoid taking fast sock lock in iterator > > Andrii Nakryiko <andrii@kernel.org> > bpf: drop unnecessary user-triggerable WARN_ONCE in verifierl log > > Brad Larson <blarson@amd.com> > spi: cadence-quadspi: Add compatible for AMD Pensando Elba SoC > > Martin KaFai Lau <martin.lau@kernel.org> > bpf: Address KCSAN report on bpf_lru_list > > Kui-Feng Lee <thinker.li@gmail.com> > bpf: Print a warning only if writing to unprivileged_bpf_disabled. 
> > Maxime Bizon <mbizon@freebox.fr> > wifi: ath11k: fix registration of 6Ghz-only phy without the full channel range > > Yicong Yang <yangyicong@hisilicon.com> > sched/fair: Don't balance task to its current running CPU > > Thomas Weißschuh <linux@weissschuh.net> > tools/nolibc: ensure stack protector guard is never zero > > Paul E. McKenney <paulmck@kernel.org> > rcu: Mark additional concurrent load from ->cpu_no_qs.b.exp > > Shigeru Yoshida <syoshida@redhat.com> > rcu-tasks: Avoid pr_info() with spin lock in cblist_init_generic() > > Hans de Goede <hdegoede@redhat.com> > ACPI: video: Add backlight=native DMI quirk for Dell Studio 1569 > > Mark Rutland <mark.rutland@arm.com> > arm64: mm: fix VA-range sanity check > > Youngmin Nam <youngmin.nam@samsung.com> > arm64: set __exception_irq_entry with __irq_entry as a default > > Mario Limonciello <mario.limonciello@amd.com> > ACPI: resource: Remove "Zen" specific match and quirks > > Hans de Goede <hdegoede@redhat.com> > ACPI: video: Add backlight=native DMI quirk for Lenovo ThinkPad X131e (3371 AMD version) > > Hans de Goede <hdegoede@redhat.com> > ACPI: video: Add backlight=native DMI quirk for Apple iMac11,3 > > Hans de Goede <hdegoede@redhat.com> > ACPI: x86: Add ACPI_QUIRK_UART1_SKIP for Lenovo Yoga Book yb1-x90f/l > > Hans de Goede <hdegoede@redhat.com> > ACPI: button: Add lid disable DMI quirk for Nextbook Ares 8A > > Hans de Goede <hdegoede@redhat.com> > ACPI: x86: Add skip i2c clients quirk for Nextbook Ares 8A > > Sandeep Dhavale <dhavale@google.com> > erofs: Fix detection of atomic context > > Filipe Manana <fdmanana@suse.com> > btrfs: abort transaction at update_ref_for_cow() when ref count is zero > > Christoph Hellwig <hch@lst.de> > btrfs: don't check PageError in __extent_writepage > > David Sterba <dsterba@suse.com> > btrfs: add xxhash to fast checksum implementations > > Thomas Gleixner <tglx@linutronix.de> > posix-timers: Ensure timer ID search-loop limit is valid > > Ming Lei <ming.lei@redhat.com> > 
blk-mq: fix NULL dereference on q->elevator in blk_mq_elv_switch_none > > Yu Kuai <yukuai3@huawei.com> > scsi: sg: fix blktrace debugfs entries leakage > > Yu Kuai <yukuai3@huawei.com> > md/raid10: prevent soft lockup while flush writes > > Yu Kuai <yukuai3@huawei.com> > md: fix data corruption for raid456 when reshape restart while grow up > > Immad Mir <mirimmad17@gmail.com> > FS: JFS: Check for read-only mounted filesystem in txBegin > > Immad Mir <mirimmad17@gmail.com> > FS: JFS: Fix null-ptr-deref Read in txBegin > > Gustavo A. R. Silva <gustavoars@kernel.org> > MIPS: dec: prom: Address -Warray-bounds warning > > Yogesh <yogi.kernel@gmail.com> > fs: jfs: Fix UBSAN: array-index-out-of-bounds in dbAllocDmapLev > > Matthew Anderson <ruinairas1992@gmail.com> > ALSA: hda/realtek: Add quirks for ROG ALLY CS35l41 audio > > Jan Kara <jack@suse.cz> > udf: Fix uninitialized array access for some pathnames > > Christian Brauner <brauner@kernel.org> > ovl: check type and offset of struct vfsmount in ovl_entry > > Marco Morandini <marco.morandini@polimi.it> > HID: add quirk for 03f0:464a HP Elite Presenter Mouse > > Ye Bin <yebin10@huawei.com> > quota: fix warning in dqgrab() > > Jan Kara <jack@suse.cz> > quota: Properly disable quotas when add_dquot_ref() fails > > Oswald Buddenhagen <oswald.buddenhagen@gmx.de> > ALSA: emu10k1: roll up loops in DSP setup code for Audigy > > hackyzh002 <hackyzh002@gmail.com> > drm/radeon: Fix integer overflow in radeon_cs_parser_init > > Eric Whitney <enwlinux@gmail.com> > ext4: correct inline offset when handling xattrs in inode body > > Marc Zyngier <maz@kernel.org> > KVM: arm64: vgic-v4: Make the doorbell request robust w.r.t preemption > > Marc Zyngier <maz@kernel.org> > KVM: arm64: Disable preemption in kvm_arch_hardware_enable() > > Oliver Upton <oliver.upton@linux.dev> > KVM: arm64: Correctly handle page aging notifiers for unaligned memslot > > Marc Zyngier <maz@kernel.org> > KVM: arm64: timers: Use CNTHCTL_EL2 when setting 
non-CNTKCTL_EL1 bits > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix soundwire initialisation race > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix codec initialisation race > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd934x: fix resource leaks on component remove > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix missing mbhc init error handling > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix resource leaks on component remove > > Sheetal <sheetal@nvidia.com> > ASoC: tegra: Fix AMX byte map > > Johan Hovold <johan+linaro@kernel.org> > ASoC: qdsp6: audioreach: fix topology probe deferral > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd-mbhc-v2: fix resource leaks on component remove > > Nathan Chancellor <nathan@kernel.org> > ASoC: cs35l45: Select REGMAP_IRQ > > Johan Hovold <johan+linaro@kernel.org> > ASoC: codecs: wcd938x: fix missing clsh ctrl error handling > > Thomas Petazzoni <thomas.petazzoni@bootlin.com> > ASoC: cs42l51: fix driver to properly autoload with automatic module loading > > Sameer Pujar <spujar@nvidia.com> > ASoC: rt5640: Fix sleep in atomic context > > Sheetal <sheetal@nvidia.com> > ASoC: tegra: Fix ADX byte map > > Fabio Estevam <festevam@denx.de> > ASoC: fsl_sai: Revert "ASoC: fsl_sai: Enable MCTL_MCLK_EN bit for master mode" > > Matus Gajdos <matuszpd@gmail.com> > ASoC: fsl_sai: Disable bit clock with transmitter > > Nicholas Kazlauskas <nicholas.kazlauskas@amd.com> > drm/amd/display: Keep PHY active for DP displays on DCN31 > > Taimur Hassan <syed.hassan@amd.com> > drm/amd/display: check TG is non-null before checking if enabled > > Zhikai Zhai <zhikai.zhai@amd.com> > drm/amd/display: Disable MPC split by default on special asic > > Simon Ser <contact@emersion.fr> > drm/amd/display: only accept async flips for fast updates > > Jocelyn Falempe <jfalempe@redhat.com> > drm/client: Fix memory leak in drm_client_modeset_probe 
> > Jocelyn Falempe <jfalempe@redhat.com> > drm/client: Fix memory leak in drm_client_target_cloned > > Ben Skeggs <bskeggs@redhat.com> > drm/nouveau/i2c: fix number of aux event slots > > Ben Skeggs <bskeggs@redhat.com> > drm/nouveau/kms/nv50-: init hpd_irq_lock for PIOR DP > > Ben Skeggs <bskeggs@redhat.com> > drm/nouveau/disp: PIOR DP uses GPIO for HPD, not PMGR AUX interrupts > > Alex Deucher <alexander.deucher@amd.com> > drm/amdgpu/pm: make mclk consistent for smu 13.0.7 > > Alex Deucher <alexander.deucher@amd.com> > drm/amdgpu/pm: make gfxclock consistent for sienna cichlid > > Guchun Chen <guchun.chen@amd.com> > drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel > > Ville Syrjälä <ville.syrjala@linux.intel.com> > dma-buf/dma-resv: Stop leaking on krealloc() failure > > Dan Carpenter <dan.carpenter@linaro.org> > accel/qaic: Add consistent integer overflow checks > > Dan Carpenter <dan.carpenter@linaro.org> > accel/qaic: tighten bounds checking in decode_message() > > Dan Carpenter <dan.carpenter@linaro.org> > accel/qaic: tighten bounds checking in encode_message() > > Matthieu Baerts <matthieu.baerts@tessares.net> > selftests: tc: add ConnTrack procfs kconfig > > Heiner Kallweit <hkallweit1@gmail.com> > Revert "r8169: disable ASPM during NAPI poll" > > Marc Kleine-Budde <mkl@pengutronix.de> > can: gs_usb: fix time stamp counter initialization > > Marc Kleine-Budde <mkl@pengutronix.de> > can: gs_usb: gs_can_open(): improve error handling > > YueHaibing <yuehaibing@huawei.com> > can: bcm: Fix UAF in bcm_proc_show() > > Fedor Ross <fedor.ross@ifm.com> > can: mcp251xfd: __mcp251xfd_chip_set_mode(): increase poll timeout > > Mark Brown <broonie@kernel.org> > arm64/fpsimd: Ensure SME storage is allocated after SVE VL changes > > Helge Deller <deller@gmx.de> > ia64: mmap: Consider pgoff when searching for free mapping > > Mark Brown <broonie@kernel.org> > regmap: Account for register length in SMBus I/O limits > > Rob Herring <robh@kernel.org> > of: 
Preserve "of-display" device name for compatibility > > Harald Freudenberger <freude@linux.ibm.com> > s390/zcrypt: fix reply buffer calculations for CCA replies > > Mark Brown <broonie@kernel.org> > regmap: Drop initial version of maximum transfer length fixes > > Matthieu Baerts <matthieu.baerts@tessares.net> > selftests: tc: add 'ct' action kconfig dep > > Dan Carpenter <dan.carpenter@linaro.org> > accel/qaic: Fix a leak in map_user_pages() > > Matthieu Baerts <matthieu.baerts@tessares.net> > selftests: tc: set timeout to 15 minutes > > Josef Bacik <josef@toxicpanda.com> > btrfs: fix race between balance and cancel/pause > > Miklos Szeredi <mszeredi@redhat.com> > fuse: ioctl: translate ENOSYS in outarg > > Filipe Manana <fdmanana@suse.com> > btrfs: zoned: fix memory leak after finding block group with super blocks > > Filipe Manana <fdmanana@suse.com> > btrfs: fix double iput() on inode after an error during orphan cleanup > > Josef Bacik <josef@toxicpanda.com> > btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand > > Qu Wenruo <wqu@suse.com> > btrfs: raid56: always verify the P/Q contents for scrub > > Bernd Schubert <bschubert@ddn.com> > fuse: Apply flags2 only when userspace set the FUSE_INIT_EXT > > Miklos Szeredi <mszeredi@redhat.com> > fuse: add feature flag for expire-only > > Miklos Szeredi <mszeredi@redhat.com> > fuse: revalidate: don't invalidate if interrupted > > Filipe Manana <fdmanana@suse.com> > btrfs: fix warning when putting transaction with qgroups enabled after abort > > Filipe Manana <fdmanana@suse.com> > btrfs: fix iput() on error pointer after error during orphan cleanup > > Georg Müller <georgmueller@gmx.net> > perf probe: Read DWARF files from the correct CU > > Georg Müller <georgmueller@gmx.net> > perf probe: Add test for regression introduced by switch to die_get_decl_file() > > Miguel Ojeda <ojeda@kernel.org> > prctl: move PR_GET_AUXV out of PR_MCE_KILL > > Petr Pavlu <petr.pavlu@suse.com> > keys: Fix linking a 
duplicate key to a keyring's assoc_array > > Colin Ian King <colin.i.king@gmail.com> > selftests/mm: mkdirty: fix incorrect position of #endif > > Liam R. Howlett <Liam.Howlett@oracle.com> > maple_tree: fix node allocation testing on 32 bit > > Liam R. Howlett <Liam.Howlett@oracle.com> > mm/mlock: fix vma iterator conversion of apply_vma_lock_flags() > > Peng Zhang <zhangpeng.00@bytedance.com> > maple_tree: set the node limit when creating a new root node > > Luka Guzenko <l.guzenko@web.de> > ALSA: hda/realtek: Enable Mute LED on HP Laptop 15s-eq2xxx > > Christoffer Sandberg <cs@tuxedo.de> > ALSA: hda/realtek: Add quirk for Clevo NS70AU > > Kailang Yang <kailang@realtek.com> > ALSA: hda/realtek - remove 3k pull low procedure > > Helge Deller <deller@gmx.de> > io_uring: Fix io_uring mmap() by using architecture-provided get_unmapped_area() > > Jens Axboe <axboe@kernel.dk> > io_uring: treat -EAGAIN for REQ_F_NOWAIT as final for io-wq > > > ------------- > > Diffstat: > > Makefile | 4 +- > arch/arm64/include/asm/exception.h | 5 - > arch/arm64/include/asm/kvm_host.h | 2 + > arch/arm64/include/asm/kvm_pgtable.h | 26 +-- > arch/arm64/kernel/fpsimd.c | 33 ++- > arch/arm64/kvm/arch_timer.c | 6 +- > arch/arm64/kvm/arm.c | 19 +- > arch/arm64/kvm/hyp/pgtable.c | 47 +++- > arch/arm64/kvm/mmu.c | 18 +- > arch/arm64/kvm/vgic/vgic-v3.c | 2 +- > arch/arm64/kvm/vgic/vgic-v4.c | 7 +- > arch/arm64/mm/mmu.c | 4 +- > arch/arm64/net/bpf_jit_comp.c | 8 +- > arch/arm64/tools/sysreg | 12 +- > arch/ia64/kernel/sys_ia64.c | 2 +- > arch/mips/include/asm/dec/prom.h | 2 +- > arch/parisc/kernel/sys_parisc.c | 15 +- > block/blk-mq.c | 10 +- > drivers/accel/qaic/qaic_control.c | 39 ++-- > drivers/acpi/button.c | 9 + > drivers/acpi/resource.c | 60 ----- > drivers/acpi/video_detect.c | 24 ++ > drivers/acpi/x86/utils.c | 26 ++- > drivers/base/regmap/regmap-i2c.c | 8 +- > drivers/base/regmap/regmap-spi-avmm.c | 2 +- > drivers/base/regmap/regmap.c | 6 +- > drivers/bluetooth/btusb.c | 1 + > 
drivers/dma-buf/dma-resv.c | 13 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 5 +- > drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 256 +++++++++------------ > drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 7 + > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 12 + > .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 110 +++++++++ > .../amd/display/amdgpu_dm/amdgpu_dm_mst_types.h | 11 + > .../amd/display/dc/clk_mgr/dcn31/dcn31_clk_mgr.c | 5 + > .../drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c | 3 +- > .../drm/amd/display/dc/dcn303/dcn303_resource.c | 2 +- > .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 8 +- > .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 2 +- > drivers/gpu/drm/drm_client_modeset.c | 6 + > drivers/gpu/drm/i915/i915_perf.c | 1 + > drivers/gpu/drm/nouveau/dispnv50/disp.c | 4 + > drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h | 4 +- > drivers/gpu/drm/nouveau/nvkm/engine/disp/uconn.c | 27 ++- > drivers/gpu/drm/nouveau/nvkm/subdev/i2c/base.c | 11 +- > drivers/gpu/drm/radeon/radeon_cs.c | 3 +- > drivers/gpu/drm/ttm/ttm_resource.c | 5 +- > drivers/hid/hid-ids.h | 1 + > drivers/hid/hid-quirks.c | 1 + > drivers/iommu/iommu-sva.c | 3 +- > drivers/md/md.c | 14 +- > drivers/md/raid10.c | 2 + > drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 10 +- > drivers/net/can/spi/mcp251xfd/mcp251xfd.h | 1 + > drivers/net/can/usb/gs_usb.c | 130 ++++++----- > drivers/net/dsa/microchip/ksz8795.c | 8 +- > drivers/net/dsa/microchip/ksz_common.c | 8 +- > drivers/net/dsa/microchip/ksz_common.h | 7 + > drivers/net/dsa/mv88e6xxx/chip.c | 7 + > drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c | 33 ++- > .../ethernet/hisilicon/hns3/hns3pf/hclge_debugfs.c | 29 ++- > drivers/net/ethernet/intel/iavf/iavf.h | 16 +- > drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 39 ++-- > drivers/net/ethernet/intel/iavf/iavf_main.c | 223 ++++++++++++------ > drivers/net/ethernet/intel/iavf/iavf_txrx.c | 43 ++-- > drivers/net/ethernet/intel/iavf/iavf_txrx.h | 4 - > 
drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 5 +- > drivers/net/ethernet/intel/ice/ice_base.c | 2 + > drivers/net/ethernet/intel/ice/ice_ethtool.c | 13 +- > drivers/net/ethernet/intel/ice/ice_lib.c | 27 --- > drivers/net/ethernet/intel/ice/ice_main.c | 10 +- > drivers/net/ethernet/intel/igb/igb_main.c | 5 + > drivers/net/ethernet/intel/igc/igc_main.c | 12 +- > drivers/net/ethernet/litex/litex_liteeth.c | 19 +- > .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 5 +- > drivers/net/ethernet/mediatek/mtk_eth_soc.c | 29 +-- > drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c | 2 +- > drivers/net/ethernet/realtek/r8169_main.c | 18 +- > drivers/net/ethernet/ti/cpsw_ale.c | 24 +- > drivers/net/phy/phy_device.c | 21 +- > drivers/net/vrf.c | 12 +- > drivers/net/wireless/ath/ath11k/core.c | 53 +++-- > drivers/net/wireless/ath/ath11k/mac.c | 3 +- > drivers/net/wireless/ath/ath11k/wmi.c | 5 + > drivers/net/wireless/ath/ath12k/mac.c | 1 + > drivers/net/wireless/intel/iwlwifi/mvm/mld-key.c | 9 +- > drivers/net/wireless/intel/iwlwifi/mvm/power.c | 14 +- > drivers/net/wireless/intel/iwlwifi/mvm/sta.c | 2 +- > drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 4 + > drivers/net/wireless/realtek/rtw88/sdio.c | 24 +- > drivers/net/wireless/virtual/mac80211_hwsim.c | 4 +- > drivers/of/platform.c | 2 +- > drivers/pinctrl/renesas/pinctrl-rzg2l.c | 28 ++- > drivers/pinctrl/renesas/pinctrl-rzv2m.c | 28 ++- > drivers/regulator/da9063-regulator.c | 3 + > drivers/s390/crypto/zcrypt_msgtype6.c | 33 ++- > drivers/scsi/sg.c | 10 + > drivers/spi/spi-bcm63xx.c | 2 +- > drivers/spi/spi-cadence-quadspi.c | 19 ++ > drivers/spi/spi-dw-mmio.c | 22 ++ > drivers/spi/spi-s3c64xx.c | 2 + > drivers/video/fbdev/au1200fb.c | 3 + > drivers/video/fbdev/imxfb.c | 5 +- > fs/btrfs/block-group.c | 1 + > fs/btrfs/ctree.c | 10 +- > fs/btrfs/disk-io.c | 3 + > fs/btrfs/extent_io.c | 33 +-- > fs/btrfs/inode.c | 35 +-- > fs/btrfs/qgroup.c | 1 + > fs/btrfs/raid56.c | 11 +- > fs/btrfs/volumes.c | 17 +- > fs/erofs/zdata.c 
| 2 +- > fs/ext4/xattr.c | 14 ++ > fs/fuse/dir.c | 2 +- > fs/fuse/inode.c | 8 +- > fs/fuse/ioctl.c | 21 +- > fs/jbd2/checkpoint.c | 102 +++----- > fs/jfs/jfs_dmap.c | 3 + > fs/jfs/jfs_txnmgr.c | 5 + > fs/jfs/namei.c | 5 + > fs/overlayfs/ovl_entry.h | 9 + > fs/quota/dquot.c | 5 +- > fs/smb/client/connect.c | 19 +- > fs/smb/client/dfs.c | 26 +-- > fs/smb/client/smb2transport.c | 2 +- > fs/udf/unicode.c | 2 +- > include/kvm/arm_vgic.h | 2 +- > include/linux/psi.h | 5 +- > include/linux/psi_types.h | 3 + > include/linux/sched/signal.h | 2 +- > include/linux/tcp.h | 2 +- > include/net/bluetooth/hci_core.h | 5 + > include/net/ip.h | 2 +- > include/net/tcp.h | 31 ++- > include/uapi/linux/fuse.h | 3 + > io_uring/io_uring.c | 52 ++--- > kernel/bpf/bpf_lru_list.c | 21 +- > kernel/bpf/bpf_lru_list.h | 7 +- > kernel/bpf/btf.c | 23 +- > kernel/bpf/log.c | 3 - > kernel/bpf/syscall.c | 3 +- > kernel/bpf/verifier.c | 32 ++- > kernel/cgroup/cgroup.c | 2 +- > kernel/kallsyms.c | 5 +- > kernel/rcu/tasks.h | 5 +- > kernel/rcu/tree_exp.h | 2 +- > kernel/rcu/tree_plugin.h | 4 +- > kernel/sched/fair.c | 4 +- > kernel/sched/psi.c | 29 ++- > kernel/sys.c | 10 +- > kernel/time/posix-timers.c | 31 +-- > kernel/trace/trace_events_hist.c | 3 +- > lib/iov_iter.c | 2 +- > lib/maple_tree.c | 3 +- > mm/mlock.c | 9 +- > net/bluetooth/hci_conn.c | 14 +- > net/bluetooth/hci_core.c | 42 +++- > net/bluetooth/hci_event.c | 15 +- > net/bluetooth/hci_sync.c | 121 ++++++++-- > net/bluetooth/iso.c | 55 +++-- > net/bluetooth/mgmt.c | 26 +-- > net/bluetooth/sco.c | 23 +- > net/bridge/br_stp_if.c | 3 + > net/can/bcm.c | 12 +- > net/devlink/health.c | 2 +- > net/devlink/leftover.c | 5 +- > net/ipv4/esp4.c | 2 +- > net/ipv4/inet_connection_sock.c | 2 +- > net/ipv4/inet_hashtables.c | 17 +- > net/ipv4/inet_timewait_sock.c | 8 +- > net/ipv4/ip_output.c | 4 +- > net/ipv4/tcp.c | 57 ++--- > net/ipv4/tcp_fastopen.c | 6 +- > net/ipv4/tcp_ipv4.c | 27 ++- > net/ipv4/tcp_minisocks.c | 11 +- > net/ipv4/tcp_output.c | 6 +- 
> net/ipv4/udp_offload.c | 16 +- > net/ipv6/ip6_gre.c | 3 +- > net/ipv6/tcp_ipv6.c | 4 +- > net/ipv6/udp_offload.c | 3 +- > net/llc/llc_input.c | 3 - > net/netfilter/nf_tables_api.c | 12 +- > net/netfilter/nft_set_pipapo.c | 6 +- > net/sched/cls_bpf.c | 99 ++++---- > net/sched/cls_matchall.c | 35 +-- > net/sched/cls_u32.c | 48 +++- > net/wireless/wext-core.c | 6 + > scripts/Makefile.build | 5 +- > scripts/Makefile.host | 6 +- > scripts/kallsyms.c | 6 +- > security/keys/request_key.c | 35 ++- > security/keys/trusted-keys/trusted_tpm2.c | 2 +- > sound/pci/emu10k1/emufx.c | 112 +-------- > sound/pci/hda/patch_realtek.c | 100 +++++++- > sound/soc/amd/acp/amd.h | 7 +- > sound/soc/codecs/Kconfig | 1 + > sound/soc/codecs/cs42l51-i2c.c | 6 + > sound/soc/codecs/cs42l51.c | 7 - > sound/soc/codecs/cs42l51.h | 1 - > sound/soc/codecs/rt5640.c | 12 +- > sound/soc/codecs/wcd-mbhc-v2.c | 57 +++-- > sound/soc/codecs/wcd934x.c | 12 + > sound/soc/codecs/wcd938x.c | 86 ++++++- > sound/soc/fsl/fsl_sai.c | 8 +- > sound/soc/fsl/fsl_sai.h | 1 + > sound/soc/qcom/qdsp6/q6apm.c | 7 +- > sound/soc/qcom/qdsp6/topology.c | 4 +- > sound/soc/sof/ipc3-dtrace.c | 9 +- > sound/soc/tegra/tegra210_adx.c | 34 ++- > sound/soc/tegra/tegra210_amx.c | 40 ++-- > tools/include/nolibc/stackprotector.h | 5 +- > tools/perf/Makefile.config | 4 +- > .../tests/shell/test_uprobe_from_different_cu.sh | 77 +++++++ > tools/perf/util/dwarf-aux.c | 4 +- > tools/testing/radix-tree/maple.c | 6 +- > tools/testing/selftests/mm/mkdirty.c | 2 +- > tools/testing/selftests/tc-testing/config | 2 + > tools/testing/selftests/tc-testing/settings | 1 + > 218 files changed, 2462 insertions(+), 1482 deletions(-) > >
Hi! > > This is the start of the stable review cycle for the 6.4.7 release. > > There are 227 patches in this series, all will be posted as a response > > to this one. If anyone has any issues with these being applied, please > > let me know. > > > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > > Anything received after that time might be too late. > > > > The whole patch series can be found in one patch at: > > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > > or in the git tree and branch at: > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > > and the diffstat can be found below. > > I saw this when running rcutorture, this one happened in the TREE04 > configuration. This is likely due to the stuttering issues we are discussing > in the other thread. Anyway I am just making a note here while I am > continuing to look into it. So is the stuttering new in 6.4.7? > Other than that, all tests pass: > Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> ...or you still believe 6.4.7 is okay to release? Best regards, Pavel -- DENX Software Engineering GmbH, Managing Director: Erika Unter HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
> On Jul 27, 2023, at 7:35 AM, Pavel Machek <pavel@denx.de> wrote: > > Hi! > >>> This is the start of the stable review cycle for the 6.4.7 release. >>> There are 227 patches in this series, all will be posted as a response >>> to this one. If anyone has any issues with these being applied, please >>> let me know. >>> >>> Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. >>> Anything received after that time might be too late. >>> >>> The whole patch series can be found in one patch at: >>> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz >>> or in the git tree and branch at: >>> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y >>> and the diffstat can be found below. >> >> I saw this when running rcutorture, this one happened in the TREE04 >> configuration. This is likely due to the stuttering issues we are discussing >> in the other thread. Anyway I am just making a note here while I am >> continuing to look into it. > > So is the stuttering new in 6.4.7? No, it is an old feature in RCU torture tests. But it is dependent on timing. Something changed in recent kernels that is making the issues with it more likely. It's hard to bisect as a failure sometimes takes hours. > >> Other than that, all tests pass: >> Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> > > ...or you still believe 6.4.7 is okay to release? As such, it should be Ok. However naturally I am not happy that the RCU testing is intermittently failing. These issues have been seen in the last several 6.4 stable releases so since those were released, maybe this one can be too? The fix for stuttering is currently being reviewed. Thanks, - Joel > > Best regards, > Pavel > -- > DENX Software Engineering GmbH, Managing Director: Erika Unter > HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
On Thu, Jul 27, 2023 at 09:26:52AM -0400, Joel Fernandes wrote: > > > > On Jul 27, 2023, at 7:35 AM, Pavel Machek <pavel@denx.de> wrote: > > > > Hi! > > > >>> This is the start of the stable review cycle for the 6.4.7 release. > >>> There are 227 patches in this series, all will be posted as a response > >>> to this one. If anyone has any issues with these being applied, please > >>> let me know. > >>> > >>> Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > >>> Anything received after that time might be too late. > >>> > >>> The whole patch series can be found in one patch at: > >>> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > >>> or in the git tree and branch at: > >>> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > >>> and the diffstat can be found below. > >> > >> I saw this when running rcutorture, this one happened in the TREE04 > >> configuration. This is likely due to the stuttering issues we are discussing > >> in the other thread. Anyway I am just making a note here while I am > >> continuing to look into it. > > > > So is the stuttering new in 6.4.7? > > No it is an old feature in RCU torture tests. But is dependent on timing. Something > changed in recent kernels that is making the issues with it more likely. Its hard to bisect as failure sometimes takes hours. > > > > >> Other than that, all tests pass: > >> Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> > > > > ...or you still believe 6.4.7 is okay to release? > > As such, it should be Ok. However naturally I am not happy that the RCU testing > is intermittently failing. These issues have been seen in last several 6.4 stable releases > so since those were released, maybe this one can be too? > The fix for stuttering is currently being reviewed. Or, to look at it another way, the stuttering fix is specific to torture testing. 
Would we really want to hold up a -stable release only because rcutorture occasionally gives a false-positive failure on certain types of systems? Thanx, Paul > Thanks, > > - Joel > > > > > > Best regards, > > Pavel > > -- > > DENX Software Engineering GmbH, Managing Director: Erika Unter > > HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
On 7/27/23 07:06, Paul E. McKenney wrote: > On Thu, Jul 27, 2023 at 09:26:52AM -0400, Joel Fernandes wrote: >> >> >>> On Jul 27, 2023, at 7:35 AM, Pavel Machek <pavel@denx.de> wrote: >>> >>> Hi! >>> >>>>> This is the start of the stable review cycle for the 6.4.7 release. >>>>> There are 227 patches in this series, all will be posted as a response >>>>> to this one. If anyone has any issues with these being applied, please >>>>> let me know. >>>>> >>>>> Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. >>>>> Anything received after that time might be too late. >>>>> >>>>> The whole patch series can be found in one patch at: >>>>> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz >>>>> or in the git tree and branch at: >>>>> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y >>>>> and the diffstat can be found below. >>>> >>>> I saw this when running rcutorture, this one happened in the TREE04 >>>> configuration. This is likely due to the stuttering issues we are discussing >>>> in the other thread. Anyway I am just making a note here while I am >>>> continuing to look into it. >>> >>> So is the stuttering new in 6.4.7? >> >> No it is an old feature in RCU torture tests. But is dependent on timing. Something >> changed in recent kernels that is making the issues with it more likely. Its hard to bisect as failure sometimes takes hours. >> >>> >>>> Other than that, all tests pass: >>>> Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> >>> >>> ...or you still believe 6.4.7 is okay to release? >> >> As such, it should be Ok. However naturally I am not happy that the RCU testing >> is intermittently failing. These issues have been seen in last several 6.4 stable releases >> so since those were released, maybe this one can be too? >> The fix for stuttering is currently being reviewed. > > Or, to look at it another way, the stuttering fix is specific to torture > testing. 
Would we really want to hold up a -stable release only because > rcutorture occasionally gives a false-positive failure on certain types > of systems? > No. However, (unrelated) in linux-next, rcu tests sometimes result in apparent hangs or long runtime. [ 0.778841] Mount-cache hash table entries: 512 (order: 0, 4096 bytes, linear) [ 0.779011] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes, linear) [ 0.797998] Running RCU synchronous self tests [ 0.798209] Running RCU synchronous self tests [ 0.912368] smpboot: CPU0: AMD Opteron 63xx class CPU (family: 0x15, model: 0x2, stepping: 0x0) [ 0.923398] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1. [ 0.925419] Running RCU-tasks wait API self tests (hangs until aborted). This is primarily with Opteron CPUs, but also with others such as Haswell, Icelake-Server, and pentium3. It is all but impossible to bisect because it doesn't happen all the time. All I was able to figure out was that it has to do with rcu changes in linux-next. I'd be much more concerned about that. Guenter
On Thu, Jul 27, 2023 at 07:39:54AM -0700, Guenter Roeck wrote: > On 7/27/23 07:06, Paul E. McKenney wrote: > > On Thu, Jul 27, 2023 at 09:26:52AM -0400, Joel Fernandes wrote: > > > > > > > > > > On Jul 27, 2023, at 7:35 AM, Pavel Machek <pavel@denx.de> wrote: > > > > > > > > Hi! > > > > > > > > > > This is the start of the stable review cycle for the 6.4.7 release. > > > > > > There are 227 patches in this series, all will be posted as a response > > > > > > to this one. If anyone has any issues with these being applied, please > > > > > > let me know. > > > > > > > > > > > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > > > > > > Anything received after that time might be too late. > > > > > > > > > > > > The whole patch series can be found in one patch at: > > > > > > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > > > > > > or in the git tree and branch at: > > > > > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > > > > > > and the diffstat can be found below. > > > > > > > > > > I saw this when running rcutorture, this one happened in the TREE04 > > > > > configuration. This is likely due to the stuttering issues we are discussing > > > > > in the other thread. Anyway I am just making a note here while I am > > > > > continuing to look into it. > > > > > > > > So is the stuttering new in 6.4.7? > > > > > > No it is an old feature in RCU torture tests. But is dependent on timing. Something > > > changed in recent kernels that is making the issues with it more likely. Its hard to bisect as failure sometimes takes hours. > > > > > > > > > > > > Other than that, all tests pass: > > > > > Tested-by: Joel Fernandes (Google) <joel@joelfernandes.org> > > > > > > > > ...or you still believe 6.4.7 is okay to release? > > > > > > As such, it should be Ok. However naturally I am not happy that the RCU testing > > > is intermittently failing. 
These issues have been seen in last several 6.4 stable releases > > > so since those were released, maybe this one can be too? > > > The fix for stuttering is currently being reviewed. > > > > Or, to look at it another way, the stuttering fix is specific to torture > > testing. Would we really want to hold up a -stable release only because > > rcutorture occasionally gives a false-positive failure on certain types > > of systems? > > > > No. However, (unrelated) in linux-next, rcu tests sometimes result in apparent hangs > or long runtime. > > [ 0.778841] Mount-cache hash table entries: 512 (order: 0, 4096 bytes, linear) > [ 0.779011] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes, linear) > [ 0.797998] Running RCU synchronous self tests > [ 0.798209] Running RCU synchronous self tests > [ 0.912368] smpboot: CPU0: AMD Opteron 63xx class CPU (family: 0x15, model: 0x2, stepping: 0x0) > [ 0.923398] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1. > [ 0.925419] Running RCU-tasks wait API self tests > > (hangs until aborted). This is primarily with Opteron CPUs, but also with others such as Haswell, > Icelake-Server, and pentium3. It is all but impossible to bisect because it doesn't happen > all the time. All I was able to figure out was that it has to do with rcu changes in linux-next. > I'd be much more concerned about that. First I have heard of this, so thank you for letting me know. About what fraction of the time does this happen? Thanx, Paul
On 7/27/23 09:07, Paul E. McKenney wrote: ...] >> No. However, (unrelated) in linux-next, rcu tests sometimes result in apparent hangs >> or long runtime. >> >> [ 0.778841] Mount-cache hash table entries: 512 (order: 0, 4096 bytes, linear) >> [ 0.779011] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes, linear) >> [ 0.797998] Running RCU synchronous self tests >> [ 0.798209] Running RCU synchronous self tests >> [ 0.912368] smpboot: CPU0: AMD Opteron 63xx class CPU (family: 0x15, model: 0x2, stepping: 0x0) >> [ 0.923398] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1. >> [ 0.925419] Running RCU-tasks wait API self tests >> >> (hangs until aborted). This is primarily with Opteron CPUs, but also with others such as Haswell, >> Icelake-Server, and pentium3. It is all but impossible to bisect because it doesn't happen >> all the time. All I was able to figure out was that it has to do with rcu changes in linux-next. >> I'd be much more concerned about that. > > First I have heard of this, so thank you for letting me know. > > About what fraction of the time does this happen? > Here is a sample test log from yesterday's -next. This is with x86_64. Today's -next always crashes, so no data. Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... passed Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... 
running ....... passed Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .................R........... passed Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ............. passed Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ..................R.......... passed Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ...... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .................R.............. failed (silent) Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .................R................. passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ................... 
passed Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ......... passed Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running .................R... passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ...... passed Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent) Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running .....................R....... passed Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running .................R.............. failed (silent) An earlier test run: Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ........ passed Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running .......... passed Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... 
passed Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ......... passed Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ........ passed Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running .................R..... passed Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running .................R.............. failed (silent) Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ....... passed Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .......... 
passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running .......... passed Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ....... passed Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ....... passed Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ........ passed Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent) Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ....... passed Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ....... passed "R" means retry, and the dots reflect time expired. It looks like it happens most of the time, but not always, on affected CPUs. I don't have specific data for non-Intel CPUs. I don't think I see the problem there, but there is too much interference from other problems to be sure. For comparison, here is the result from the latest mainline: Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... 
running .......... passed Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ........... passed Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running ........ passed Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ....... passed Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ............. passed Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ...... passed Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ......... 
passed Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ......... passed Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ......... passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running ......... passed Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ......... passed Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ............ passed Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running .......... passed Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running .......... passed Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ...... passed Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ...... passed Guenter
On Thu, Jul 27, 2023 at 10:39:17AM -0700, Guenter Roeck wrote:
> On 7/27/23 09:07, Paul E. McKenney wrote:
>
> [ ... ]
>
> > > No. However, (unrelated) in linux-next, rcu tests sometimes result in apparent hangs
> > > or long runtime.
> > >
> > > [ 0.778841] Mount-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
> > > [ 0.779011] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
> > > [ 0.797998] Running RCU synchronous self tests
> > > [ 0.798209] Running RCU synchronous self tests
> > > [ 0.912368] smpboot: CPU0: AMD Opteron 63xx class CPU (family: 0x15, model: 0x2, stepping: 0x0)
> > > [ 0.923398] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
> > > [ 0.925419] Running RCU-tasks wait API self tests
> > >
> > > (hangs until aborted). This is primarily with Opteron CPUs, but also with others such as Haswell,
> > > Icelake-Server, and pentium3. It is all but impossible to bisect because it doesn't happen
> > > all the time. All I was able to figure out was that it has to do with rcu changes in linux-next.
> > > I'd be much more concerned about that.
> >
> > First I have heard of this, so thank you for letting me know.
> >
> > About what fraction of the time does this happen?
> >
>
> Here is a sample test log from yesterday's -next. This is with x86_64.
> Today's -next always crashes, so no data.
>
> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed
> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed
> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed
> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... passed
> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed
> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed
> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .................R........... passed
> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ............. passed
> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ..................R.......... passed
> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ...... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .................R.............. failed (silent)
> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed
> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed
> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .................R................. passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ................... passed
> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ......... passed
> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running .................R... passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ...... passed
> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed
> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed
> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed
> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed
> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent)
> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running .....................R....... passed
> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running .................R.............. failed (silent)
>
> An earlier test run:
>
> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed
> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ........ passed
> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running .......... passed
> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... passed
> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed
> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ......... passed
> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ........ passed
> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed
> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running .................R..... passed
> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running .................R.............. failed (silent)
> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ....... passed
> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed
> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed
> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .......... passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running .......... passed
> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed
> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed
> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ....... passed
> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ....... passed
> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ........ passed
> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed
> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent)
> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ....... passed
> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ....... passed
>
> "R" means retry, and the dots reflect time expired. It looks like it happens most of the time,
> but not always, on affected CPUs. I don't have specific data for non-Intel CPUs. I don't think
> I see the problem there, but there is too much interference from other problems to be sure.
>
> For comparison, here is the result from the latest mainline:
>
> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .......... passed
> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed
> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed
> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ........... passed
> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running ........ passed
> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed
> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed
> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ....... passed
> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ............. passed
> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed
> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ...... passed
> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ......... passed
> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ......... passed
> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ......... passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running ......... passed
> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ......... passed
> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed
> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ............ passed
> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed
> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed
> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed
> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed
> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running .......... passed
> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running .......... passed
> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ...... passed
> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ...... passed
I freely confess that I am having a hard time imagining what would
be CPU dependent in that code. Timing, maybe? Whatever the reason,
I am not seeing these failures in my testing.
So which of the following Kconfig options is defined in your .config?
CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU.
If you have more than one of them, could you please apply this patch
and show me the corresponding console output from the resulting hang?
Thanx, Paul
------------------------------------------------------------------------
commit 709a917710dc01798e01750ea628ece4bfc42b7b
Author: Paul E. McKenney <paulmck@kernel.org>
Date: Thu Jul 27 13:13:46 2023 -0700
rcu-tasks: Add printk()s to localize boot-time self-test hang
Currently, rcu_tasks_initiate_self_tests() prints a message and then
initiates self tests on up to three different RCU Tasks flavors. If one
of the flavors has a grace-period hang, it is not easy to work out which
of the three hung. This commit therefore prints a message prior to each
individual test.
Reported-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 56c470a489c8..427433c90935 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1981,20 +1981,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
static void rcu_tasks_initiate_self_tests(void)
{
- pr_info("Running RCU-tasks wait API self tests\n");
#ifdef CONFIG_TASKS_RCU
+ pr_info("Running RCU Tasks wait API self tests\n");
tests[0].runstart = jiffies;
synchronize_rcu_tasks();
call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_RUDE_RCU
+ pr_info("Running RCU Tasks Rude wait API self tests\n");
tests[1].runstart = jiffies;
synchronize_rcu_tasks_rude();
call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
#endif
#ifdef CONFIG_TASKS_TRACE_RCU
+ pr_info("Running RCU Tasks Trace wait API self tests\n");
tests[2].runstart = jiffies;
synchronize_rcu_tasks_trace();
call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
On 7/27/23 13:33, Paul E. McKenney wrote:
[ ... ]
> So which of the following Kconfig options is defined in your .config?
> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU.
>
Only CONFIG_TASKS_RCU. I added another log message after call_rcu_tasks().
It never returns from that function.
[ 1.168993] Running RCU synchronous self tests
[ 1.169219] Running RCU synchronous self tests
[ 1.285795] smpboot: CPU0: Intel Xeon Processor (Cascadelake) (family: 0x6, model: 0x55, stepping: 0x6)
[ 1.302827] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1.
[ 1.304526] Running RCU Tasks wait API self tests
... and then nothing for at least 10 minutes (then I gave up and stopped the test).
Qemu command line:
qemu-system-x86_64 -kernel \
arch/x86/boot/bzImage -M q35 -cpu Cascadelake-Server -no-reboot \
-snapshot -device e1000e,netdev=net0 -netdev user,id=net0 -m 256 \
-drive file=rootfs.iso,format=raw,if=ide,media=cdrom \
--append "earlycon=uart8250,io,0x3f8,9600n8 panic=-1 slub_debug=FZPUA root=/dev/sr0 rootwait console=ttyS0 noreboot" \
-d unimp,guest_errors -nographic -monitor none
Again, this doesn't happen all the time. With Cascadelake-Server
I see it maybe once every 5 boot attempts. I tried with qemu v8.0
and v8.1. Note that it does seem to happen with various CPU types,
only for some it seems to me more likely to happen (so maybe the
CPU type was a red herring). It does seem to depend on the system
load, and happen more often if the system is under heavy load.
Guenter
On Thu, Jul 27, 2023 at 09:22:52PM -0700, Guenter Roeck wrote: > On 7/27/23 13:33, Paul E. McKenney wrote: > [ ... ] > > > So which of the following Kconfig options is defined in your .config? > > CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. > > > > Only CONFIG_TASKS_RCU. I added another log message after call_rcu_tasks(). > It never returns from that function. > > [ 1.168993] Running RCU synchronous self tests > [ 1.169219] Running RCU synchronous self tests > [ 1.285795] smpboot: CPU0: Intel Xeon Processor (Cascadelake) (family: 0x6, model: 0x55, stepping: 0x6) > [ 1.302827] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. > [ 1.304526] Running RCU Tasks wait API self tests > > ... and then nothing for at least 10 minutes (then I gave up and stopped the test). > > Qemu command line: > > qemu-system-x86_64 -kernel \ > arch/x86/boot/bzImage -M q35 -cpu Cascadelake-Server -no-reboot \ > -snapshot -device e1000e,netdev=net0 -netdev user,id=net0 -m 256 \ > -drive file=rootfs.iso,format=raw,if=ide,media=cdrom \ > --append "earlycon=uart8250,io,0x3f8,9600n8 panic=-1 slub_debug=FZPUA root=/dev/sr0 rootwait console=ttyS0 noreboot" \ > -d unimp,guest_errors -nographic -monitor none > > Again, this doesn't happen all the time. With Cascadelake-Server > I see it maybe once every 5 boot attempts. I tried with qemu v8.0 > and v8.1. Note that it does seem to happen with various CPU types, > only for some it seems to me more likely to happen (so maybe the > CPU type was a red herring). It does seem to depend on the system > load, and happen more often if the system is under heavy load. Hmmm... What kernel are you using as your qemu/KVM hypervisor? And I echo Joel's requests for your .config file. Thanx, Paul
On 7/30/23 20:54, Paul E. McKenney wrote: > On Thu, Jul 27, 2023 at 09:22:52PM -0700, Guenter Roeck wrote: >> On 7/27/23 13:33, Paul E. McKenney wrote: >> [ ... ] >> >>> So which of the following Kconfig options is defined in your .config? >>> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. >>> >> >> Only CONFIG_TASKS_RCU. I added another log message after call_rcu_tasks(). >> It never returns from that function. >> >> [ 1.168993] Running RCU synchronous self tests >> [ 1.169219] Running RCU synchronous self tests >> [ 1.285795] smpboot: CPU0: Intel Xeon Processor (Cascadelake) (family: 0x6, model: 0x55, stepping: 0x6) >> [ 1.302827] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. >> [ 1.304526] Running RCU Tasks wait API self tests >> >> ... and then nothing for at least 10 minutes (then I gave up and stopped the test). >> >> Qemu command line: >> >> qemu-system-x86_64 -kernel \ >> arch/x86/boot/bzImage -M q35 -cpu Cascadelake-Server -no-reboot \ >> -snapshot -device e1000e,netdev=net0 -netdev user,id=net0 -m 256 \ >> -drive file=rootfs.iso,format=raw,if=ide,media=cdrom \ >> --append "earlycon=uart8250,io,0x3f8,9600n8 panic=-1 slub_debug=FZPUA root=/dev/sr0 rootwait console=ttyS0 noreboot" \ >> -d unimp,guest_errors -nographic -monitor none >> >> Again, this doesn't happen all the time. With Cascadelake-Server >> I see it maybe once every 5 boot attempts. I tried with qemu v8.0 >> and v8.1. Note that it does seem to happen with various CPU types, >> only for some it seems to me more likely to happen (so maybe the >> CPU type was a red herring). It does seem to depend on the system >> load, and happen more often if the system is under heavy load. > > Hmmm... What kernel are you using as your qemu/KVM hypervisor? > Not sure I understand the question. KVM is disabled in my systems. The host CPUs are Ryzen 3900X and 5900X, but I don't really see why that would matter. > And I echo Joel's requests for your .config file. 
>
Did you see the e-mail I sent about this problem earlier today?
https://lore.kernel.org/lkml/3da81a5c-700b-8e21-1bde-27dd3a0b8945@roeck-us.net/
I think I'll declare this to be a problem with my test environment and
disable RCU debugging.
Thanks,
Guenter
On Sun, Jul 30, 2023 at 08:54:46PM -0700, Paul E. McKenney wrote: > On Thu, Jul 27, 2023 at 09:22:52PM -0700, Guenter Roeck wrote: > > On 7/27/23 13:33, Paul E. McKenney wrote: > > [ ... ] > > > > > So which of the following Kconfig options is defined in your .config? > > > CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. > > > > > > > Only CONFIG_TASKS_RCU. I added another log message after call_rcu_tasks(). > > It never returns from that function. > > > > [ 1.168993] Running RCU synchronous self tests > > [ 1.169219] Running RCU synchronous self tests > > [ 1.285795] smpboot: CPU0: Intel Xeon Processor (Cascadelake) (family: 0x6, model: 0x55, stepping: 0x6) > > [ 1.302827] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. > > [ 1.304526] Running RCU Tasks wait API self tests > > > > ... and then nothing for at least 10 minutes (then I gave up and stopped the test). > > > > Qemu command line: > > > > qemu-system-x86_64 -kernel \ > > arch/x86/boot/bzImage -M q35 -cpu Cascadelake-Server -no-reboot \ > > -snapshot -device e1000e,netdev=net0 -netdev user,id=net0 -m 256 \ > > -drive file=rootfs.iso,format=raw,if=ide,media=cdrom \ > > --append "earlycon=uart8250,io,0x3f8,9600n8 panic=-1 slub_debug=FZPUA root=/dev/sr0 rootwait console=ttyS0 noreboot" \ > > -d unimp,guest_errors -nographic -monitor none > > > > Again, this doesn't happen all the time. With Cascadelake-Server > > I see it maybe once every 5 boot attempts. I tried with qemu v8.0 > > and v8.1. Note that it does seem to happen with various CPU types, > > only for some it seems to me more likely to happen (so maybe the > > CPU type was a red herring). It does seem to depend on the system > > load, and happen more often if the system is under heavy load. > > Hmmm... What kernel are you using as your qemu/KVM hypervisor? Never mind, I now see your bisection result. Good show, thank you!!! Thanx, Paul > And I echo Joel's requests for your .config file. > > Thanx, Paul
> On Jul 27, 2023, at 4:33 PM, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> On Thu, Jul 27, 2023 at 10:39:17AM -0700, Guenter Roeck wrote:
>> On 7/27/23 09:07, Paul E. McKenney wrote:
>>
>> [ ... ]
>>
>>>> No. However, (unrelated) in linux-next, rcu tests sometimes result in apparent hangs
>>>> or long runtime.
>>>>
>>>> [ 0.778841] Mount-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
>>>> [ 0.779011] Mountpoint-cache hash table entries: 512 (order: 0, 4096 bytes, linear)
>>>> [ 0.797998] Running RCU synchronous self tests
>>>> [ 0.798209] Running RCU synchronous self tests
>>>> [ 0.912368] smpboot: CPU0: AMD Opteron 63xx class CPU (family: 0x15, model: 0x2, stepping: 0x0)
>>>> [ 0.923398] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
>>>> [ 0.925419] Running RCU-tasks wait API self tests
>>>>
>>>> (hangs until aborted). This is primarily with Opteron CPUs, but also with others such as Haswell,
>>>> Icelake-Server, and pentium3. It is all but impossible to bisect because it doesn't happen
>>>> all the time. All I was able to figure out was that it has to do with rcu changes in linux-next.
>>>> I'd be much more concerned about that.
>>>
>>> First I have heard of this, so thank you for letting me know.
>>>
>>> About what fraction of the time does this happen?
>>>
>>
>> Here is a sample test log from yesterday's -next. This is with x86_64.
>> Today's -next always crashes, so no data.
>>
>> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
>> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed
>> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed
>> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed
>> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... passed
>> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed
>> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
>> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed
>> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .................R........... passed
>> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ............. passed
>> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ..................R.......... passed
>> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ...... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .................R.............. failed (silent)
>> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed
>> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed
>> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .................R................. passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ................... passed
>> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ......... passed
>> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running .................R... passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ...... passed
>> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed
>> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed
>> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed
>> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed
>> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent)
>> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running .....................R....... passed
>> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running .................R.............. failed (silent)
>>
>> An earlier test run:
>>
>> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
>> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .................R....... passed
>> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ........ passed
>> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running .......... passed
>> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ....... passed
>> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running .................R.... passed
>> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
>> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ......... passed
>> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ........ passed
>> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed
>> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running .................R..... passed
>> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running .................R.............. failed (silent)
>> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ....... passed
>> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running .......... passed
>> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ........ passed
>> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ...... passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running .......... passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running .......... passed
>> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed
>> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ......... passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed
>> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ....... passed
>> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ....... passed
>> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ........ passed
>> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running ......... passed
>> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running ....................R................. failed (silent)
>> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ....... passed
>> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ....... passed
>>
>> "R" means retry, and the dots reflect time expired. It looks like it happens most of the time,
>> but not always, on affected CPUs. I don't have specific data for non-Intel CPUs. I don't think
>> I see the problem there, but there is too much interference from other problems to be sure.
>>
>> For comparison, here is the result from the latest mainline:
>>
>> Building x86_64:q35:Broadwell-noTSX:defconfig:smp:net,e1000:mem256:ata:hd ... running ....... passed
>> Building x86_64:q35:Cascadelake-Server:defconfig:smp:net,e1000e:mem256:ata:cd ... running .......... passed
>> Building x86_64:q35:IvyBridge:defconfig:smp2:net,i82801:efi:mem512:nvme:hd ... running ...... passed
>> Building x86_64:q35:SandyBridge:defconfig:smp4:net,ne2k_pci:efi32:mem1G:usb:hd ... running ......... passed
>> Building x86_64:q35:SandyBridge:defconfig:smp8:net,ne2k_pci:mem1G:usb-hub:hd ... running ........... passed
>> Building x86_64:q35:Haswell:defconfig:smp:tpm-tis:net,pcnet:mem2G:usb-uas:hd ... running ........ passed
>> Building x86_64:q35:Skylake-Client:defconfig:smp2:tpm-tis:net,rtl8139:efi:mem4G:sdhci:mmc:hd ... running ....... passed
>> Building x86_64:q35:Conroe:defconfig:smp4:net,tulip:efi32:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:Denverton:defconfig:smp2:net,tulip:efi:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:EPYC-Milan:defconfig:smp:tpm-crb:net,tulip:mem256:scsi[DC395]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net:mem512:scsi[AM53C974]:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp:net,virtio-net-old:mem512:scsi[AM53C974]:hd ... running ....... passed
>> Building x86_64:q35:Westmere-IBRS:defconfig:smp2:tpm-crb:net,usb-ohci:efi:mem1G:scsi[53C810]:cd ... running .......... passed
>> Building x86_64:q35:Skylake-Server:defconfig:smp4:tpm-tis:net,e1000-82544gc:efi32:mem2G:scsi[53C895A]:hd ... running ....... passed
>> Building x86_64:pc:EPYC:defconfig:smp:pci-bridge:net,usb-uhci:mem4G:scsi[FUSION]:hd ... running ............. passed
>> Building x86_64:q35:EPYC-IBPB:defconfig:smp2:net,e1000-82545em:efi:mem8G:scsi[MEGASAS]:hd ... running ....... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:efi32:mem256:scsi[MEGASAS2]:hd ... running ....... passed
>> Building x86_64:q35:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ...... passed
>> Building x86_64:pc:Opteron_G5:defconfig:smp4:net,i82559c:mem256:scsi[MEGASAS2]:hd ... running ......... passed
>> Building x86_64:pc:phenom:defconfig:smp:net,i82559er:mem512:initrd ... running ......... passed
>> Building x86_64:q35:Opteron_G1:defconfig:smp2:net,i82562:efi:mem1G:initrd ... running ......... passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci]:hd ... running ......... passed
>> Building x86_64:pc:Opteron_G2:defconfig:smp:net,usb:efi32:mem2G:scsi[virtio-pci-old]:hd ... running ......... passed
>> Building x86_64:q35:core2duo:defconfig:smp2:net,i82559a:mem4G:virtio-pci:hd ... running ...... passed
>> Building x86_64:q35:Broadwell:defconfig:smp4:net,i82558b:efi:mem8G:virtio:hd ... running ....... passed
>> Building x86_64:q35:Nehalem:defconfig:smp2:net,i82558a:efi32:mem1G:virtio:hd ... running ...... passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp4:net,ne2k_pci:efi:mem2G:virtio:cd ... running ............ passed
>> Building x86_64:q35:Icelake-Server:defconfig:preempt:smp8:net,i82557a:mem4G:nvme:hd ... running ....... passed
>> Building x86_64:q35:Skylake-Client-IBRS:defconfig:preempt:smp2:net,i82558b:efi32:mem1G:sdhci:mmc:hd ... running ...... passed
>> Building x86_64:q35:KnightsMill:defconfig:preempt:smp6:net,i82550:mem512:initrd ... running ...... passed
>> Building x86_64:q35:Cooperlake:defconfig:smp2:net,usb-ohci:efi:mem1G:scsi[53C810]:hd ... running ....... passed
>> Building x86_64:q35:EPYC-Rome:defconfig:smp4:net,igb:mem2G:scsi[53C895A]:hd ... running .......... passed
>> Building x86_64:pc:Opteron_G3:defconfig:nosmp:net,e1000:mem1G:usb:hd ... running .......... passed
>> Building x86_64:q35:Opteron_G4:defconfig:nosmp:net,ne2k_pci:efi:mem512:ata:hd ... running ...... passed
>> Building x86_64:q35:Haswell-noTSX-IBRS:defconfig:nosmp:net,pcnet:efi32:mem2G:ata:hd ... running ...... passed
>
> I freely confess that I am having a hard time imagining what would
> be CPU dependent in that code. Timing, maybe? Whatever the reason,
> I am not seeing these failures in my testing.
>
> So which of the following Kconfig options is defined in your .config?
> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU.
>
> If you have more than one of them, could you please apply this patch
> and show me the corresponding console output from the resulting hang?
FWIW, I am not able to repro this issue either. If the .config of the problem system can be shared, I can try it out to see if it can be reproduced on my side.
Cheers,
- Joel
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> commit 709a917710dc01798e01750ea628ece4bfc42b7b
> Author: Paul E. McKenney <paulmck@kernel.org>
> Date: Thu Jul 27 13:13:46 2023 -0700
>
> rcu-tasks: Add printk()s to localize boot-time self-test hang
>
> Currently, rcu_tasks_initiate_self_tests() prints a message and then
> initiates self tests on up to three different RCU Tasks flavors. If one
> of the flavors has a grace-period hang, it is not easy to work out which
> of the three hung. This commit therefore prints a message prior to each
> individual test.
>
> Reported-by: Guenter Roeck <linux@roeck-us.net>
> Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 56c470a489c8..427433c90935 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -1981,20 +1981,22 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp)
>
> static void rcu_tasks_initiate_self_tests(void)
> {
> - pr_info("Running RCU-tasks wait API self tests\n");
> #ifdef CONFIG_TASKS_RCU
> + pr_info("Running RCU Tasks wait API self tests\n");
> tests[0].runstart = jiffies;
> synchronize_rcu_tasks();
> call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback);
> #endif
>
> #ifdef CONFIG_TASKS_RUDE_RCU
> + pr_info("Running RCU Tasks Rude wait API self tests\n");
> tests[1].runstart = jiffies;
> synchronize_rcu_tasks_rude();
> call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback);
> #endif
>
> #ifdef CONFIG_TASKS_TRACE_RCU
> + pr_info("Running RCU Tasks Trace wait API self tests\n");
> tests[2].runstart = jiffies;
> synchronize_rcu_tasks_trace();
> call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback);
On 7/27/23 16:18, Joel Fernandes wrote:
[ ... ]
>> I freely confess that I am having a hard time imagining what would
>> be CPU dependent in that code. Timing, maybe? Whatever the reason,
>> I am not seeing these failures in my testing.
>>
>> So which of the following Kconfig options is defined in your .config?
>> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU.
>>
>> If you have more than one of them, could you please apply this patch
>> and show me the corresponding console output from the resulting hang?
>
> FWIW, I am not able to repro this issue either. If a .config can be shared of the problem system, I can try it out to see if it can be reproduced on my side.
>
I managed to bisect the problem. See bisect log below. Bisect repeated twice,
so it should be reliable. I don't really understand it, but the following
reverts fix the problem. This is on top of next-20230721 because next-20230728
crashes immediately in my tests.
0caafe9b94ab (HEAD) Revert "sched/fair: Remove sched_feat(START_DEBIT)"
518bdbd39fdb Revert "sched/fair: Add lag based placement"
a011162c3e32 Revert "sched/fair: Implement an EEVDF-like scheduling policy"
df579720bf98 Revert "sched/fair: Commit to lag based placement"
aac459a7e738 Revert "sched/smp: Use lag to simplify cross-runqueue placement"
8d686eb173e1 Revert "sched/fair: Commit to EEVDF"
486474c50f95 Revert "sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice"
79e94d67d08a Revert "sched/fair: Propagate enqueue flags into place_entity()"
ae867bc97b71 (tag: next-20230721) Add linux-next specific files for 20230721
For context: x86 images (32 and 64 bit) in -next tend to hang at
[ 2.309323] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1.
[ 2.311634] Running RCU-tasks wait API self tests
The hang is not seen with every boot; it happens roughly once every
10 boot attempts. It is not CPU dependent as I initially thought.
Configuration file is at http://server.roeck-us.net/qemu/x86-next/config.
Example qemu command line:
qemu-system-x86_64 -kernel arch/x86/boot/bzImage -M q35 -cpu Broadwell-noTSX -no-reboot \
-snapshot -device e1000,netdev=net0 -netdev user,id=net0 -m 256 \
-drive file=rootfs.ext2,format=raw,if=ide \
--append "earlycon=uart8250,io,0x3f8,9600n8 root=/dev/sda console=ttyS0" \
-nographic -monitor none
Guenter
---
# bad: [ae867bc97b713121b2a7f5fcac68378a0774739b] Add linux-next specific files for 20230721
# good: [fdf0eaf11452d72945af31804e2a1048ee1b574c] Linux 6.5-rc2
git bisect start 'HEAD' 'v6.5-rc2'
# good: [f09bf8f6c8cbbff6f52523abcda88c86db72e31c] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma.git
git bisect good f09bf8f6c8cbbff6f52523abcda88c86db72e31c
# good: [86374a6210aeebceb927204d80f9e65739134bc3] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tiwai/sound.git
git bisect good 86374a6210aeebceb927204d80f9e65739134bc3
# bad: [d588c93cae9e3dff15d125e755edcba5d842f41a] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
git bisect bad d588c93cae9e3dff15d125e755edcba5d842f41a
# good: [acadcaf8c67062ad4c1a0ad0e05bf429b04740c5] Merge branch 'for-next' of git://git.kernel.dk/linux-block.git
git bisect good acadcaf8c67062ad4c1a0ad0e05bf429b04740c5
# good: [2c73542f4cdc59fd23514f9e963d0b3419bd5e16] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/jarkko/linux-tpmdd.git
git bisect good 2c73542f4cdc59fd23514f9e963d0b3419bd5e16
# good: [be15b91155cd5a6c4ac8f46740ae62e610981b79] Merge remote-tracking branch 'spi/for-6.6' into spi-next
git bisect good be15b91155cd5a6c4ac8f46740ae62e610981b79
# bad: [8f4995b370a57e7ad92c0f66664d171b23234337] Merge branch into tip/master: 'sched/eevdf'
git bisect bad 8f4995b370a57e7ad92c0f66664d171b23234337
# bad: [99d4d26551b56f4e523dd04e4970b94aa796a64e] rbtree: Add rb_add_augmented_cached() helper
git bisect bad 99d4d26551b56f4e523dd04e4970b94aa796a64e
# good: [7ff1693236f5d97a939dbeb660c07671a2d57071] sched/fair: Implement prefer sibling imbalance calculation between asymmetric groups
git bisect good 7ff1693236f5d97a939dbeb660c07671a2d57071
# good: [48b5583719cdfbdee238f9549a6a1a47af2b0469] sched/headers: Rename task_struct::state to task_struct::__state in the comments too
git bisect good 48b5583719cdfbdee238f9549a6a1a47af2b0469
# good: [af4cf40470c22efa3987200fd19478199e08e103] sched/fair: Add cfs_rq::avg_vruntime
git bisect good af4cf40470c22efa3987200fd19478199e08e103
# bad: [86bfbb7ce4f67a88df2639198169b685668e7349] sched/fair: Add lag based placement
git bisect bad 86bfbb7ce4f67a88df2639198169b685668e7349
# bad: [e0c2ff903c320d3fd3c2c604dc401b3b7c0a1d13] sched/fair: Remove sched_feat(START_DEBIT)
git bisect bad e0c2ff903c320d3fd3c2c604dc401b3b7c0a1d13
# first bad commit: [e0c2ff903c320d3fd3c2c604dc401b3b7c0a1d13] sched/fair: Remove sched_feat(START_DEBIT)
On Sat, Jul 29, 2023 at 09:00:02PM -0700, Guenter Roeck wrote: > On 7/27/23 16:18, Joel Fernandes wrote: > > [ ... ] > > > > I freely confess that I am having a hard time imagining what would > > > be CPU dependent in that code. Timing, maybe? Whatever the reason, > > > I am not seeing these failures in my testing. > > > > > > So which of the following Kconfig options is defined in your .config? > > > CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. > > > > > > If you have more than one of them, could you please apply this patch > > > and show me the corresponding console output from the resulting hang? > > > > FWIW, I am not able to repro this issue either. If a .config can be shared of the problem system, I can try it out to see if it can be reproduced on my side. > > > > I managed to bisect the problem. See bisect log below. Bisect repeated twice. > so it should be reliable. I don't really understand it, but the following > reverts fix the problem. This is on top of next-20230721 because next-20230728 > crashes immediately in my tests. > > 0caafe9b94ab (HEAD) Revert "sched/fair: Remove sched_feat(START_DEBIT)" > 518bdbd39fdb Revert "sched/fair: Add lag based placement" > a011162c3e32 Revert "sched/fair: Implement an EEVDF-like scheduling policy" > df579720bf98 Revert "sched/fair: Commit to lag based placement" > aac459a7e738 Revert "sched/smp: Use lag to simplify cross-runqueue placement" > 8d686eb173e1 Revert "sched/fair: Commit to EEVDF" > 486474c50f95 Revert "sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice" > 79e94d67d08a Revert "sched/fair: Propagate enqueue flags into place_entity()" > ae867bc97b71 (tag: next-20230721) Add linux-next specific files for 20230721 > > For context: x86 images (32 and 64 bit) in -next tend to hang at > > [ 2.309323] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. 
> [ 2.311634] Running RCU-tasks wait API self tests > > The hang is not seen with every boot; it happens roughly about once every > 10 boot attempts. It is not CPU dependent as I initially thought. > > Configuration file is at http://server.roeck-us.net/qemu/x86-next/config. > Example qemu command line: Hurmph, let me see if I can reproduce on next-20230731 (not having the older next thingies around).
On Mon, Jul 31, 2023 at 04:19:34PM +0200, Peter Zijlstra wrote: > On Sat, Jul 29, 2023 at 09:00:02PM -0700, Guenter Roeck wrote: > > On 7/27/23 16:18, Joel Fernandes wrote: > > > > [ ... ] > > > > > > I freely confess that I am having a hard time imagining what would > > > > be CPU dependent in that code. Timing, maybe? Whatever the reason, > > > > I am not seeing these failures in my testing. > > > > > > > > So which of the following Kconfig options is defined in your .config? > > > > CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. > > > > > > > > If you have more than one of them, could you please apply this patch > > > > and show me the corresponding console output from the resulting hang? > > > > > > FWIW, I am not able to repro this issue either. If a .config can be shared of the problem system, I can try it out to see if it can be reproduced on my side. > > > > > > > I managed to bisect the problem. See bisect log below. Bisect repeated twice. > > so it should be reliable. I don't really understand it, but the following > > reverts fix the problem. This is on top of next-20230721 because next-20230728 > > crashes immediately in my tests. 
> > > > 0caafe9b94ab (HEAD) Revert "sched/fair: Remove sched_feat(START_DEBIT)" > > 518bdbd39fdb Revert "sched/fair: Add lag based placement" > > a011162c3e32 Revert "sched/fair: Implement an EEVDF-like scheduling policy" > > df579720bf98 Revert "sched/fair: Commit to lag based placement" > > aac459a7e738 Revert "sched/smp: Use lag to simplify cross-runqueue placement" > > 8d686eb173e1 Revert "sched/fair: Commit to EEVDF" > > 486474c50f95 Revert "sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice" > > 79e94d67d08a Revert "sched/fair: Propagate enqueue flags into place_entity()" > > ae867bc97b71 (tag: next-20230721) Add linux-next specific files for 20230721 > > > > For context: x86 images (32 and 64 bit) in -next tend to hang at > > > > [ 2.309323] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. > > [ 2.311634] Running RCU-tasks wait API self tests > > > > The hang is not seen with every boot; it happens roughly about once every > > 10 boot attempts. It is not CPU dependent as I initially thought. > > > > Configuration file is at http://server.roeck-us.net/qemu/x86-next/config. > > Example qemu command line: > > Hurmph, let me see if I can reproduce on next-20230731 (not having the > older next thingies around). I've taken your config above, and the rootfs.ext2 and run-sh from x86/. I've then modified run-sh to use: qemu-system-x86_64 -enable-kvm -cpu host What I'm seeing is that some boots get stuck at: [ 0.608230] Running RCU-tasks wait API self tests Is this the right 'problem' ?
On 7/31/23 07:39, Peter Zijlstra wrote: > On Mon, Jul 31, 2023 at 04:19:34PM +0200, Peter Zijlstra wrote: >> On Sat, Jul 29, 2023 at 09:00:02PM -0700, Guenter Roeck wrote: >>> On 7/27/23 16:18, Joel Fernandes wrote: >>> >>> [ ... ] >>> >>>>> I freely confess that I am having a hard time imagining what would >>>>> be CPU dependent in that code. Timing, maybe? Whatever the reason, >>>>> I am not seeing these failures in my testing. >>>>> >>>>> So which of the following Kconfig options is defined in your .config? >>>>> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. >>>>> >>>>> If you have more than one of them, could you please apply this patch >>>>> and show me the corresponding console output from the resulting hang? >>>> >>>> FWIW, I am not able to repro this issue either. If a .config can be shared of the problem system, I can try it out to see if it can be reproduced on my side. >>>> >>> >>> I managed to bisect the problem. See bisect log below. Bisect repeated twice. >>> so it should be reliable. I don't really understand it, but the following >>> reverts fix the problem. This is on top of next-20230721 because next-20230728 >>> crashes immediately in my tests. 
>>> >>> 0caafe9b94ab (HEAD) Revert "sched/fair: Remove sched_feat(START_DEBIT)" >>> 518bdbd39fdb Revert "sched/fair: Add lag based placement" >>> a011162c3e32 Revert "sched/fair: Implement an EEVDF-like scheduling policy" >>> df579720bf98 Revert "sched/fair: Commit to lag based placement" >>> aac459a7e738 Revert "sched/smp: Use lag to simplify cross-runqueue placement" >>> 8d686eb173e1 Revert "sched/fair: Commit to EEVDF" >>> 486474c50f95 Revert "sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice" >>> 79e94d67d08a Revert "sched/fair: Propagate enqueue flags into place_entity()" >>> ae867bc97b71 (tag: next-20230721) Add linux-next specific files for 20230721 >>> >>> For context: x86 images (32 and 64 bit) in -next tend to hang at >>> >>> [ 2.309323] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. >>> [ 2.311634] Running RCU-tasks wait API self tests >>> >>> The hang is not seen with every boot; it happens roughly about once every >>> 10 boot attempts. It is not CPU dependent as I initially thought. >>> >>> Configuration file is at http://server.roeck-us.net/qemu/x86-next/config. >>> Example qemu command line: >> >> Hurmph, let me see if I can reproduce on next-20230731 (not having the >> older next thingies around). > > I've taken your config above, and the rootfs.ext2 and run-sh from x86/. > I've then modified run-sh to use: > > qemu-system-x86_64 -enable-kvm -cpu host > > What I'm seeing is that some boots get stuck at: > > [ 0.608230] Running RCU-tasks wait API self tests > > Is this the right 'problem' ? > Yes, exactly. Thanks, Guenter
On Mon, Jul 31, 2023 at 07:48:19AM -0700, Guenter Roeck wrote: > > I've taken your config above, and the rootfs.ext2 and run-sh from x86/. > > I've then modified run-sh to use: > > > > qemu-system-x86_64 -enable-kvm -cpu host > > > > What I'm seeing is that some boots get stuck at: > > > > [ 0.608230] Running RCU-tasks wait API self tests > > > > Is this the right 'problem' ? > > > > > Yes, exactly. Excellent! Let me prod that with something sharp, see what comes creeping out.
On Mon, 2023-07-31 at 16:52 +0200, Peter Zijlstra wrote: > On Mon, Jul 31, 2023 at 07:48:19AM -0700, Guenter Roeck wrote: > > > > I've taken your config above, and the rootfs.ext2 and run-sh from x86/. > > > I've then modified run-sh to use: > > > > > > qemu-system-x86_64 -enable-kvm -cpu host > > > > > > What I'm seeing is that some boots get stuck at: > > > > > > [ 0.608230] Running RCU-tasks wait API self tests > > > > > > Is this the right 'problem' ? > > > > > > > > > Yes, exactly. > > Excellent! Let me prod that with something sharp, see what comes > creeping out. In an effort to get up to speed with this area of the kernel, I've been playing around with this too today and managed to reproduce the problem using the same configuration. I'm completely new to this code but I think I may have found the root of the problem. What I've found is that there is a race condition between starting the RCU tasks grace-period thread in rcu_spawn_tasks_kthread_generic() and a subsequent call to synchronize_rcu_tasks_generic(). This results in rtp->tasks_gp_mutex being locked in the initial thread which subsequently blocks the newly started grace- period thread. The problem is that although synchronize_rcu_tasks_generic() checks to see if the grace-period kthread is running, it uses rtp->kthread_ptr to achieve this. This is only set in the thread entry point and not when the thread is created, meaning that it is set only after the creating thread yields or is preempted. If this has not happened before the next call to synchronize_rcu_tasks_generic() then a deadlock occurs. I've created a debug patch that introduces a new flag in rcu_tasks that is set when the kthread is created and used this in synchronize_rcu_tasks_generic() in place of READ_ONCE(rtp->kthread_ptr). This fixes the issue in my test environment. I'm happy to have a go at submitting a patch for this if it helps.
On Mon, Jul 31, 2023 at 05:08:29PM +0100, Roy Hopkins wrote:
> On Mon, 2023-07-31 at 16:52 +0200, Peter Zijlstra wrote:
> > On Mon, Jul 31, 2023 at 07:48:19AM -0700, Guenter Roeck wrote:
> >
> > > > I've taken your config above, and the rootfs.ext2 and run-sh from x86/.
> > > > I've then modified run-sh to use:
> > > >
> > > > qemu-system-x86_64 -enable-kvm -cpu host
> > > >
> > > > What I'm seeing is that some boots get stuck at:
> > > >
> > > > [ 0.608230] Running RCU-tasks wait API self tests
> > > >
> > > > Is this the right 'problem' ?
> > > >
> > >
> > >
> > > Yes, exactly.
> >
> > Excellent! Let me prod that with something sharp, see what comes
> > creeping out.
>
> In an effort to get up to speed with this area of the kernel, I've been playing
> around with this too today and managed to reproduce the problem using the same
> configuration. I'm completely new to this code but I think I may have found the
> root of the problem.
>
> What I've found is that there is a race condition between starting the RCU tasks
> grace-period thread in rcu_spawn_tasks_kthread_generic() and a subsequent call
> to synchronize_rcu_tasks_generic(). This results in rtp->tasks_gp_mutex being
> locked in the initial thread which subsequently blocks the newly started grace-
> period thread.
>
> The problem is that although synchronize_rcu_tasks_generic() checks to see if
> the grace-period kthread is running, it uses rtp->kthread_ptr to achieve this.
> This is only set in the thread entry point and not when the thread is created,
> meaning that it is set only after the creating thread yields or is preempted. If
> this has not happened before the next call to synchronize_rcu_tasks_generic()
> then a deadlock occurs.
>
> I've created a debug patch that introduces a new flag in rcu_tasks that is set
> when the kthread is created and used this in synchronize_rcu_tasks_generic() in
> place of READ_ONCE(rtp->kthread_ptr). This fixes the issue in my test
> environment.
>
> I'm happy to have a go at submitting a patch for this if it helps.
Ha!, I was poking around the same thing. My hack below seems to (so far,
<20 boots) help things.
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 56c470a489c8..b083b5a30025 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -652,7 +658,11 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
return;
- smp_mb(); /* Ensure others see full kthread. */
+ for (;;) {
+ cond_resched();
+ if (smp_load_acquire(&rtp->kthread_ptr))
+ break;
+ }
}
#ifndef CONFIG_TINY_RCU
On 7/31/23 09:14, Peter Zijlstra wrote:
> On Mon, Jul 31, 2023 at 05:08:29PM +0100, Roy Hopkins wrote:
>> On Mon, 2023-07-31 at 16:52 +0200, Peter Zijlstra wrote:
>>> On Mon, Jul 31, 2023 at 07:48:19AM -0700, Guenter Roeck wrote:
>>>
>>>>> I've taken your config above, and the rootfs.ext2 and run-sh from x86/.
>>>>> I've then modified run-sh to use:
>>>>>
>>>>> qemu-system-x86_64 -enable-kvm -cpu host
>>>>>
>>>>> What I'm seeing is that some boots get stuck at:
>>>>>
>>>>> [ 0.608230] Running RCU-tasks wait API self tests
>>>>>
>>>>> Is this the right 'problem' ?
>>>>>
>>>>
>>>>
>>>> Yes, exactly.
>>>
>>> Excellent! Let me prod that with something sharp, see what comes
>>> creeping out.
>>
>> In an effort to get up to speed with this area of the kernel, I've been playing
>> around with this too today and managed to reproduce the problem using the same
>> configuration. I'm completely new to this code but I think I may have found the
>> root of the problem.
>>
>> What I've found is that there is a race condition between starting the RCU tasks
>> grace-period thread in rcu_spawn_tasks_kthread_generic() and a subsequent call
>> to synchronize_rcu_tasks_generic(). This results in rtp->tasks_gp_mutex being
>> locked in the initial thread which subsequently blocks the newly started grace-
>> period thread.
>>
>> The problem is that although synchronize_rcu_tasks_generic() checks to see if
>> the grace-period kthread is running, it uses rtp->kthread_ptr to achieve this.
>> This is only set in the thread entry point and not when the thread is created,
>> meaning that it is set only after the creating thread yields or is preempted. If
>> this has not happened before the next call to synchronize_rcu_tasks_generic()
>> then a deadlock occurs.
>>
>> I've created a debug patch that introduces a new flag in rcu_tasks that is set
>> when the kthread is created and used this in synchronize_rcu_tasks_generic() in
>> place of READ_ONCE(rtp->kthread_ptr). This fixes the issue in my test
>> environment.
>>
>> I'm happy to have a go at submitting a patch for this if it helps.
>
> Ha!, I was poking around the same thing. My hack below seems to (so far,
> <20 boots) help things.
>
So, dumb question:
How come this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ?
Thanks,
Guenter
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 56c470a489c8..b083b5a30025 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -652,7 +658,11 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
> t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
> if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
> return;
> - smp_mb(); /* Ensure others see full kthread. */
> + for (;;) {
> + cond_resched();
> + if (smp_load_acquire(&rtp->kthread_ptr))
> + break;
> + }
> }
>
> #ifndef CONFIG_TINY_RCU
On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote: > > Ha!, I was poking around the same thing. My hack below seems to (so far, > > <20 boots) help things. > > > > So, dumb question: > How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ? That commit changes the timings of things; dumb luck otherwise.
On 7/31/23 14:15, Peter Zijlstra wrote: > On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote: >>> Ha!, I was poking around the same thing. My hack below seems to (so far, >>> <20 boots) help things. >>> >> >> So, dumb question: >> How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ? > > That commit changes the timings of things; dumb luck otherwise. Kind of scary. So I only experienced the problem because the START_DEBIT patch happened to be queued roughly at the same time, and it might otherwise have found its way unnoticed into the upstream kernel. That makes me wonder if this or other similar patches may uncover similar problems elsewhere in the kernel (i.e., either hide new or existing race conditions or expose existing ones). This in turn makes me wonder if it would be possible to define a test which would uncover such problems without the START_DEBIT patch. Any idea ? Guenter
On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> On 7/31/23 14:15, Peter Zijlstra wrote:
> > On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote:
> > > > Ha!, I was poking around the same thing. My hack below seems to (so far,
> > > > <20 boots) help things.
> > > >
> > >
> > > So, dumb question:
> > > How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ?
> >
> > That commit changes the timings of things; dumb luck otherwise.
>
> Kind of scary. So I only experienced the problem because the START_DEBIT patch
> happened to be queued roughly at the same time, and it might otherwise have
> found its way unnoticed into the upstream kernel. That makes me wonder if this
> or other similar patches may uncover similar problems elsewhere in the kernel
> (i.e., either hide new or existing race conditions or expose existing ones).
>
> This in turn makes me wonder if it would be possible to define a test which
> would uncover such problems without the START_DEBIT patch. Any idea ?
Thank you all for tracking this down!
One way is to put a schedule_timeout_idle(100) right before the call to
rcu_tasks_one_gp() from synchronize_rcu_tasks_generic(). That is quite
specific to this particular issue, but it does have the virtue of making
it actually happen in my testing.
There have been a few academic projects that inject delays at points
chosen by various heuristics plus some randomness. But this would be
a bit of a challenge to those because each kernel only passes through
this window once at boot time.
Please see below for my preferred fix. Does this work for you guys?
Back to figuring out why recent kernels occasionally blow up all
rcutorture guest OSes...
Thanx, Paul
------------------------------------------------------------------------
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 7294be62727b..2d5b8385c357 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
if (unlikely(midboot)) {
needgpcb = 0x2;
} else {
+ mutex_unlock(&rtp->tasks_gp_mutex);
set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
rcuwait_wait_event(&rtp->cbs_wait,
(needgpcb = rcu_tasks_need_gpcb(rtp)),
TASK_IDLE);
+ mutex_lock(&rtp->tasks_gp_mutex);
}
if (needgpcb & 0x2) {
Two quick comments, both of them "this code is a bit odd" rather than
anything else.
On Tue, 1 Aug 2023 at 12:11, Paul E. McKenney <paulmck@kernel.org> wrote:
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
Why is this file called "tasks.h"?
It's not a header file. It makes no sense. It's full of C code. It's
included in only one place. It's just _weird_.
However, more relevantly:
> + mutex_unlock(&rtp->tasks_gp_mutex);
> set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
Isn't the tasks_gp_mutex the thing that protects the gp state here?
Shouldn't it be after setting?
> rcuwait_wait_event(&rtp->cbs_wait,
> (needgpcb = rcu_tasks_need_gpcb(rtp)),
> TASK_IDLE);
Also, looking at rcu_tasks_need_gpcb() that is now called outside the
lock, it does something quite odd.
At the very top of the function does
for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
and 'smp_load_acquire()' is all about saying "everything *after* this
load is ordered".
But the way it is done in that loop, it is indeed done at the
beginning of the loop, but then it's done *after* the loop too, so the
last smp_load_acquire seems a bit nonsensical.
If you want to load a value and say "this value is now sensible for
everything that follows", I think you should load it *first*. No?
IOW, wouldn't the whole sequence make more sense as
dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
for (cpu = 0; cpu < dequeue_limit; cpu++) {
and say that everything in rcu_tasks_need_gpcb() is ordered wrt the
initial limit on entry?
I dunno. That use of "smp_load_acquire()" just seems odd. Memory
ordering is hard to understand to begin with, but then when you have
things like loops that do the same ordered load multiple times, it
goes from "hard to understand" to positively confusing.
Linus
On Wed, Aug 02, 2023 at 10:14:51AM -0700, Linus Torvalds wrote:
> Two quick comments, both of them "this code is a bit odd" rather than
> anything else.
Good to get eyes on this code, so thank you very much!!!
> On Tue, 1 Aug 2023 at 12:11, Paul E. McKenney <paulmck@kernel.org> wrote:
> >
> > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
>
> Why is this file called "tasks.h"?
>
> It's not a header file. It makes no sense. It's full of C code. It's
> included in only one place. It's just _weird_.
You are right, it is weird.
This is a holdover from when I was much more concerned about being
criticized for having #ifdef in a .c file, and pretty much every
line in this file is under some combination or another of #ifdefs.
This concern led to kernel/rcu/tree_plugin.h being set up in this way
back when preemptible RCU was introduced, and for good or for bad I just
kept following that pattern.
We could convert this to a .c file, keep the #ifdefs, drop some instances
of "static", add a bunch of declarations, and maybe (or maybe not) push a
function or two into some .h file for performance/inlining reasons. Me, I
would prefer to leave it alone, but we can certainly change it.
> However, more relevantly:
>
> > + mutex_unlock(&rtp->tasks_gp_mutex);
> > set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
>
> Isn't the tasks_gp_mutex the thing that protects the gp state here?
> Shouldn't it be after setting?
Much of the gp state is protected by being accessed only by the gp
kthread. But there is a window in time where the gp might be driven
directly out of the synchronize_rcu_tasks() call. That window in time
does not have a definite end, so this ->tasks_gp_mutex does the needed
mutual exclusion during the transition of gp processing to the newly
created gp kthread.
> > rcuwait_wait_event(&rtp->cbs_wait,
> > (needgpcb = rcu_tasks_need_gpcb(rtp)),
> > TASK_IDLE);
>
> Also, looking at rcu_tasks_need_gpcb() that is now called outside the
> lock, it does something quite odd.
The state of each callback list is protected by the ->lock field of
the rcu_tasks_percpu structure. Yes, rcu_segcblist_n_cbs() is invoked
in rcu_tasks_need_gpcb() outside of the lock, but it is designed for
lockless use. If it is modified just after the check, then there will
be a later wakeup on the one hand or we will just uselessly acquire that
->lock this one time on the other.
Also, ncbs records the number of callbacks seen in that first loop,
then used later, where its value might be stale. This might result in
a collapse back to single-callback-queue operation and a later expansion
back up. Except that at this point we are still in single-CPU mode, so
there should not be any lock contention, which means that there should
still be but a single callback queue. The transition itself is protected
by ->cbs_gbl_lock.
> At the very top of the function does
>
> for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
>
> and 'smp_load_acquire()' is all about saying "everything *after* this
> load is ordered,
>
> But the way it is done in that loop, it is indeed done at the
> beginning of the loop, but then it's done *after* the loop too, so the
> last smp_load_acquire seems a bit nonsensical.
>
> If you want to load a value and say "this value is now sensible for
> everything that follows", I think you should load it *first*. No?
>
> IOW, wouldn't the whole sequence make more sense as
>
> dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
> for (cpu = 0; cpu < dequeue_limit; cpu++) {
>
> and say that everything in rcu_tasks_need_gpcb() is ordered wrt the
> initial limit on entry?
>
> I dunno. That use of "smp_load_acquire()" just seems odd. Memory
> ordering is hard to understand to begin with, but then when you have
> things like loops that do the same ordered load multiple times, it
> goes from "hard to understand" to positively confusing.
Excellent point. I am queueing that change with your Suggested-by.
If testing goes well, it will be as shown below.
Thanx, Paul
------------------------------------------------------------------------
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 83049a893de5..94bb5abdbb37 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -432,6 +432,7 @@ static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp)
static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
{
int cpu;
+ int dequeue_limit;
unsigned long flags;
bool gpdone = poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq);
long n;
@@ -439,7 +440,8 @@ static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp)
long ncbsnz = 0;
int needgpcb = 0;
- for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) {
+ dequeue_limit = smp_load_acquire(&rtp->percpu_dequeue_lim);
+ for (cpu = 0; cpu < dequeue_limit; cpu++) {
struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu);
/* Advance and accelerate any new callbacks. */
On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
> On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
>
>
> Please see below for my preferred fix. Does this work for you guys?
>
> Back to figuring out why recent kernels occasionally to blow up all
> rcutorture guest OSes...
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 7294be62727b..2d5b8385c357 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
> if (unlikely(midboot)) {
> needgpcb = 0x2;
> } else {
> + mutex_unlock(&rtp->tasks_gp_mutex);
> set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> rcuwait_wait_event(&rtp->cbs_wait,
> (needgpcb = rcu_tasks_need_gpcb(rtp)),
> TASK_IDLE);
> + mutex_lock(&rtp->tasks_gp_mutex);
> }
>
> if (needgpcb & 0x2) {
Your preferred fix looks good to me.
With the original code I can quite easily reproduce the problem on my
system every 10 reboots or so. With your fix in place the problem no
longer occurs.
On Wed, Aug 02, 2023 at 02:57:56PM +0100, Roy Hopkins wrote:
> On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
> > On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> >
> >
> > Please see below for my preferred fix. Does this work for you guys?
> >
> > Back to figuring out why recent kernels occasionally to blow up all
> > rcutorture guest OSes...
> >
> > Thanx, Paul
> >
> > ------------------------------------------------------------------------
> >
> > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> > index 7294be62727b..2d5b8385c357 100644
> > --- a/kernel/rcu/tasks.h
> > +++ b/kernel/rcu/tasks.h
> > @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
> > if (unlikely(midboot)) {
> > needgpcb = 0x2;
> > } else {
> > + mutex_unlock(&rtp->tasks_gp_mutex);
> > set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> > rcuwait_wait_event(&rtp->cbs_wait,
> > (needgpcb = rcu_tasks_need_gpcb(rtp)),
> > TASK_IDLE);
> > + mutex_lock(&rtp->tasks_gp_mutex);
> > }
> >
> > if (needgpcb & 0x2) {
>
> Your preferred fix looks good to me.
>
> With the original code I can quite easily reproduce the problem on my
> system every 10 reboots or so. With your fix in place the problem no
> longer occurs.
Very good, thank you! May I add your Tested-by?
Thanx, Paul
On 8/2/23 08:05, Paul E. McKenney wrote:
> On Wed, Aug 02, 2023 at 02:57:56PM +0100, Roy Hopkins wrote:
>> On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
>>> On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
>>>
>>>
>>> Please see below for my preferred fix. Does this work for you guys?
>>>
>>> Back to figuring out why recent kernels occasionally to blow up all
>>> rcutorture guest OSes...
>>>
>>> Thanx, Paul
>>>
>>> ------------------------------------------------------------------------
>>>
>>> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
>>> index 7294be62727b..2d5b8385c357 100644
>>> --- a/kernel/rcu/tasks.h
>>> +++ b/kernel/rcu/tasks.h
>>> @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
>>> if (unlikely(midboot)) {
>>> needgpcb = 0x2;
>>> } else {
>>> + mutex_unlock(&rtp->tasks_gp_mutex);
>>> set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
>>> rcuwait_wait_event(&rtp->cbs_wait,
>>> (needgpcb = rcu_tasks_need_gpcb(rtp)),
>>> TASK_IDLE);
>>> + mutex_lock(&rtp->tasks_gp_mutex);
>>> }
>>>
>>> if (needgpcb & 0x2) {
>>
>> Your preferred fix looks good to me.
>>
>> With the original code I can quite easily reproduce the problem on my
>> system every 10 reboots or so. With your fix in place the problem no
>> longer occurs.
>
> Very good, thank you! May I add your Tested-by?
>
FWIW, I am still working on it. So far I get
[ 8.191589] KTAP version 1
[ 8.191769] # Subtest: kunit_executor_test
[ 8.191972] # module: kunit
[ 8.192012] 1..8
[ 8.197643] ok 1 parse_filter_test
[ 8.201851] ok 2 filter_suites_test
[ 8.206713] ok 3 filter_suites_test_glob_test
[ 8.211806] ok 4 filter_suites_to_empty_test
[ 8.214077] kunit executor: filter operation not found: speed>slow, module!=example
[ 8.217933] # parse_filter_attr_test: ASSERTION FAILED at lib/kunit/executor_test.c:126
[ 8.217933] Expected err == 0, but
[ 8.217933] err == -22 (0xffffffffffffffea)
[ 8.217933]
[ 8.217933] failed to parse filter '(efault)'
[ 8.221266] not ok 5 parse_filter_attr_test
[ 8.224224] kunit executor: filter operation not found: speed>slow
[ 8.225837] # filter_attr_test: ASSERTION FAILED at lib/kunit/executor_test.c:165
[ 8.225837] Expected err == 0, but
[ 8.225837] err == -22 (0xffffffffffffffea)
[ 8.228850] not ok 6 filter_attr_test
[ 8.230942] kunit executor: filter operation not found: module!=dummy
[ 8.232167] # filter_attr_empty_test: ASSERTION FAILED at lib/kunit/executor_test.c:190
[ 8.232167] Expected err == 0, but
[ 8.232167] err == -22 (0xffffffffffffffea)
[ 8.235317] not ok 7 filter_attr_empty_test
[ 8.237065] kunit executor: filter operation not found: speed>slow
[ 8.238796] # filter_attr_skip_test: ASSERTION FAILED at lib/kunit/executor_test.c:209
[ 8.238796] Expected err == 0, but
[ 8.238796] err == -22 (0xffffffffffffffea)
[ 8.241897] not ok 8 filter_attr_skip_test
[ 8.241947] # kunit_executor_test: pass:4 fail:4 skip:0 total:8
[ 8.242144] # Totals: pass:4 fail:4 skip:0 total:8
and it looks like the console no longer works. Most likely this is some other problem
that was introduced while tests were broken. It will take me some time to track that down.
Guenter
On Wed, Aug 02, 2023 at 08:45:06AM -0700, Guenter Roeck wrote:
> On 8/2/23 08:05, Paul E. McKenney wrote:
> > On Wed, Aug 02, 2023 at 02:57:56PM +0100, Roy Hopkins wrote:
> > > On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> > > >
> > > >
> > > > Please see below for my preferred fix. Does this work for you guys?
> > > >
> > > > Back to figuring out why recent kernels occasionally to blow up all
> > > > rcutorture guest OSes...
> > > >
> > > > Thanx, Paul
> > > >
> > > > ------------------------------------------------------------------------
> > > >
> > > > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> > > > index 7294be62727b..2d5b8385c357 100644
> > > > --- a/kernel/rcu/tasks.h
> > > > +++ b/kernel/rcu/tasks.h
> > > > @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
> > > > if (unlikely(midboot)) {
> > > > needgpcb = 0x2;
> > > > } else {
> > > > + mutex_unlock(&rtp->tasks_gp_mutex);
> > > > set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> > > > rcuwait_wait_event(&rtp->cbs_wait,
> > > > (needgpcb = rcu_tasks_need_gpcb(rtp)),
> > > > TASK_IDLE);
> > > > + mutex_lock(&rtp->tasks_gp_mutex);
> > > > }
> > > > if (needgpcb & 0x2) {
> > >
> > > Your preferred fix looks good to me.
> > >
> > > With the original code I can quite easily reproduce the problem on my
> > > system every 10 reboots or so. With your fix in place the problem no
> > > longer occurs.
> >
> > Very good, thank you! May I add your Tested-by?
> >
>
> FWIW, I am still working on it. So far I get
>
> [ 8.191589] KTAP version 1
> [ 8.191769] # Subtest: kunit_executor_test
> [ 8.191972] # module: kunit
> [ 8.192012] 1..8
> [ 8.197643] ok 1 parse_filter_test
> [ 8.201851] ok 2 filter_suites_test
> [ 8.206713] ok 3 filter_suites_test_glob_test
> [ 8.211806] ok 4 filter_suites_to_empty_test
> [ 8.214077] kunit executor: filter operation not found: speed>slow, module!=example
> [ 8.217933] # parse_filter_attr_test: ASSERTION FAILED at lib/kunit/executor_test.c:126
> [ 8.217933] Expected err == 0, but
> [ 8.217933] err == -22 (0xffffffffffffffea)
> [ 8.217933]
> [ 8.217933] failed to parse filter '(efault)'
> [ 8.221266] not ok 5 parse_filter_attr_test
> [ 8.224224] kunit executor: filter operation not found: speed>slow
> [ 8.225837] # filter_attr_test: ASSERTION FAILED at lib/kunit/executor_test.c:165
> [ 8.225837] Expected err == 0, but
> [ 8.225837] err == -22 (0xffffffffffffffea)
> [ 8.228850] not ok 6 filter_attr_test
> [ 8.230942] kunit executor: filter operation not found: module!=dummy
> [ 8.232167] # filter_attr_empty_test: ASSERTION FAILED at lib/kunit/executor_test.c:190
> [ 8.232167] Expected err == 0, but
> [ 8.232167] err == -22 (0xffffffffffffffea)
> [ 8.235317] not ok 7 filter_attr_empty_test
> [ 8.237065] kunit executor: filter operation not found: speed>slow
> [ 8.238796] # filter_attr_skip_test: ASSERTION FAILED at lib/kunit/executor_test.c:209
> [ 8.238796] Expected err == 0, but
> [ 8.238796] err == -22 (0xffffffffffffffea)
> [ 8.241897] not ok 8 filter_attr_skip_test
> [ 8.241947] # kunit_executor_test: pass:4 fail:4 skip:0 total:8
> [ 8.242144] # Totals: pass:4 fail:4 skip:0 total:8
>
> and it looks like the console no longer works. Most likely this is some other problem
> that was introduced while tests were broken. It will take me some time to track that down.
No rush.
Given that this bug is a year old, that it happens only when debug
options are enabled, and that it has only been seen in current -next,
my plan is to submit it into the next merge window.
So this one stays mutable for about another 10 days.
On the strength of Roy's Tested-by, however, I will push this patch into
-next soon, so that should make things a bit easier. Or so I hope.
And again, thank you all for tracking this down!
Thanx, Paul
On Wed, 2023-08-02 at 08:05 -0700, Paul E. McKenney wrote:
> On Wed, Aug 02, 2023 at 02:57:56PM +0100, Roy Hopkins wrote:
> > On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
> > > On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> > >
> > >
> > > Please see below for my preferred fix. Does this work for you guys?
> > >
> > > Back to figuring out why recent kernels occasionally to blow up all
> > > rcutorture guest OSes...
> > >
> > > Thanx, Paul
> > >
> > > ------------------------------------------------------------------------
> > >
> > > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> > > index 7294be62727b..2d5b8385c357 100644
> > > --- a/kernel/rcu/tasks.h
> > > +++ b/kernel/rcu/tasks.h
> > > @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp,
> > > bool midboot)
> > > if (unlikely(midboot)) {
> > > needgpcb = 0x2;
> > > } else {
> > > + mutex_unlock(&rtp->tasks_gp_mutex);
> > > set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> > > rcuwait_wait_event(&rtp->cbs_wait,
> > > (needgpcb = rcu_tasks_need_gpcb(rtp)),
> > > TASK_IDLE);
> > > + mutex_lock(&rtp->tasks_gp_mutex);
> > > }
> > >
> > > if (needgpcb & 0x2) {
> >
> > Your preferred fix looks good to me.
> >
> > With the original code I can quite easily reproduce the problem on my
> > system every 10 reboots or so. With your fix in place the problem no
> > longer occurs.
>
> Very good, thank you! May I add your Tested-by?
>
> Thanx, Paul
Yes, please do.
On Wed, Aug 02, 2023 at 04:31:12PM +0100, Roy Hopkins wrote:
> On Wed, 2023-08-02 at 08:05 -0700, Paul E. McKenney wrote:
> > On Wed, Aug 02, 2023 at 02:57:56PM +0100, Roy Hopkins wrote:
> > > On Tue, 2023-08-01 at 12:11 -0700, Paul E. McKenney wrote:
> > > > On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> > > >
> > > >
> > > > Please see below for my preferred fix. Does this work for you guys?
> > > >
> > > > Back to figuring out why recent kernels occasionally to blow up all
> > > > rcutorture guest OSes...
> > > >
> > > > Thanx, Paul
> > > >
> > > > ------------------------------------------------------------------------
> > > >
> > > > diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> > > > index 7294be62727b..2d5b8385c357 100644
> > > > --- a/kernel/rcu/tasks.h
> > > > +++ b/kernel/rcu/tasks.h
> > > > @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp,
> > > > bool midboot)
> > > > if (unlikely(midboot)) {
> > > > needgpcb = 0x2;
> > > > } else {
> > > > + mutex_unlock(&rtp->tasks_gp_mutex);
> > > > set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> > > > rcuwait_wait_event(&rtp->cbs_wait,
> > > > (needgpcb = rcu_tasks_need_gpcb(rtp)),
> > > > TASK_IDLE);
> > > > + mutex_lock(&rtp->tasks_gp_mutex);
> > > > }
> > > >
> > > > if (needgpcb & 0x2) {
> > >
> > > Your preferred fix looks good to me.
> > >
> > > With the original code I can quite easily reproduce the problem on my
> > > system every 10 reboots or so. With your fix in place the problem no
> > > longer occurs.
> >
> > Very good, thank you! May I add your Tested-by?
> >
> > Thanx, Paul
> Yes, please do.
Thank you again, and I will apply this on my next rebase.
Thanx, Paul
On Tue, Aug 01, 2023 at 12:11:04PM -0700, Paul E. McKenney wrote:
> On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote:
> > On 7/31/23 14:15, Peter Zijlstra wrote:
> > > On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote:
> > > > > Ha!, I was poking around the same thing. My hack below seems to (so far,
> > > > > <20 boots) help things.
> > > > >
> > > >
> > > > So, dumb question:
> > > > How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ?
> > >
> > > That commit changes the timings of things; dumb luck otherwise.
> >
> > Kind of scary. So I only experienced the problem because the START_DEBIT patch
> > happened to be queued roughly at the same time, and it might otherwise have
> > found its way unnoticed into the upstream kernel.
And just to set the record straight, this bug has been in mainline for
about a year, since v5.19.
Thanx, Paul
> > That makes me wonder if this
> > or other similar patches may uncover similar problems elsewhere in the kernel
> > (i.e., either hide new or existing race conditions or expose existing ones).
> >
> > This in turn makes me wonder if it would be possible to define a test which
> > would uncover such problems without the START_DEBIT patch. Any idea ?
>
> Thank you all for tracking this down!
>
> One way is to put a schedule_timeout_idle(100) right before the call to
> rcu_tasks_one_gp() from synchronize_rcu_tasks_generic(). That is quite
> specific to this particular issue, but it does have the virtue of making
> it actually happen in my testing.
>
> There have been a few academic projects that inject delays at points
> chosen by various heuristics plus some randomness. But this would be
> a bit of a challenge to those because each kernel only passes through
> this window once at boot time.
>
> Please see below for my preferred fix. Does this work for you guys?
>
> Back to figuring out why recent kernels occasionally to blow up all
> rcutorture guest OSes...
>
> Thanx, Paul
>
> ------------------------------------------------------------------------
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 7294be62727b..2d5b8385c357 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -570,10 +570,12 @@ static void rcu_tasks_one_gp(struct rcu_tasks *rtp, bool midboot)
> if (unlikely(midboot)) {
> needgpcb = 0x2;
> } else {
> + mutex_unlock(&rtp->tasks_gp_mutex);
> set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
> rcuwait_wait_event(&rtp->cbs_wait,
> (needgpcb = rcu_tasks_need_gpcb(rtp)),
> TASK_IDLE);
> + mutex_lock(&rtp->tasks_gp_mutex);
> }
>
> if (needgpcb & 0x2) {
On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote: > On 7/31/23 14:15, Peter Zijlstra wrote: > > On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote: > > > > Ha!, I was poking around the same thing. My hack below seems to (so far, > > > > <20 boots) help things. > > > > > > > > > > So, dumb question: > > > How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ? > > > > That commit changes the timings of things; dumb luck otherwise. > > Kind of scary. So I only experienced the problem because the START_DEBIT patch > happened to be queued roughly at the same time, and it might otherwise have > found its way unnoticed into the upstream kernel. That makes me wonder if this > or other similar patches may uncover similar problems elsewhere in the kernel > (i.e., either hide new or existing race conditions or expose existing ones). > > This in turn makes me wonder if it would be possible to define a test which > would uncover such problems without the START_DEBIT patch. Any idea ? IIRC some of the thread sanitizers use breakpoints to inject random sleeps, specifically to tickle races.
On Tue, Aug 01, 2023 at 09:08:52PM +0200, Peter Zijlstra wrote: > On Tue, Aug 01, 2023 at 10:32:45AM -0700, Guenter Roeck wrote: > > On 7/31/23 14:15, Peter Zijlstra wrote: > > > On Mon, Jul 31, 2023 at 09:34:29AM -0700, Guenter Roeck wrote: > > > > > Ha!, I was poking around the same thing. My hack below seems to (so far, > > > > > <20 boots) help things. > > > > > > > > > > > > > So, dumb question: > > > > How comes this bisects to "sched/fair: Remove sched_feat(START_DEBIT)" ? > > > > > > That commit changes the timings of things; dumb luck otherwise. > > > > Kind of scary. So I only experienced the problem because the START_DEBIT patch > > happened to be queued roughly at the same time, and it might otherwise have > > found its way unnoticed into the upstream kernel. That makes me wonder if this > > or other similar patches may uncover similar problems elsewhere in the kernel > > (i.e., either hide new or existing race conditions or expose existing ones). > > > > This in turn makes me wonder if it would be possible to define a test which > > would uncover such problems without the START_DEBIT patch. Any idea ? > > IIRC some of the thread sanitizers use breakpoints to inject random > sleeps, specifically to tickle races. I have heard of some of these, arguably including KCSAN, but they would have a tough time on this one. They would have to inject many milliseconds between the check of ->kthread_ptr in synchronize_rcu_tasks_generic() and that mutex_lock() in rcu_tasks_one_gp(). Plus this window only occurs during boot shortly before init is spawned. On the other hand, randomly injecting delay just before acquiring each lock would cover this case. But such a sanitizer would still only get one shot per boot of the kernel for this particular bug. Thanx, Paul
On Mon, 2023-07-31 at 18:14 +0200, Peter Zijlstra wrote:
> Ha!, I was poking around the same thing. My hack below seems to (so far,
> <20 boots) help things.
>
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 56c470a489c8..b083b5a30025 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -652,7 +658,11 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
> t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
> if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
> return;
> - smp_mb(); /* Ensure others see full kthread. */
> + for (;;) {
> + cond_resched();
> + if (smp_load_acquire(&rtp->kthread_ptr))
> + break;
> + }
> }
>
> #ifndef CONFIG_TINY_RCU
FWIW, here's my hack which seems to fix it.
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 9b9ce09f8f35..2e76fbfff9c6 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -52,6 +52,7 @@ struct rcu_tasks_percpu {
* @cbs_gbl_lock: Lock protecting callback list.
* @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
* @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @kthread_started: Flag that indicates whether kthread has been launched.
* @gp_func: This flavor's grace-period-wait function.
* @gp_state: Grace period's most recent state transition (debugging).
* @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -92,6 +93,7 @@ struct rcu_tasks {
unsigned long n_ipis;
unsigned long n_ipis_fails;
struct task_struct *kthread_ptr;
+ int kthread_started;
rcu_tasks_gp_func_t gp_func;
pregp_func_t pregp_func;
pertask_func_t pertask_func;
@@ -582,7 +584,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
return;
// If the grace-period kthread is running, use it.
- if (READ_ONCE(rtp->kthread_ptr)) {
+ if (READ_ONCE(rtp->kthread_started)) {
wait_rcu_gp(rtp->call_func);
return;
}
@@ -595,6 +597,7 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
struct task_struct *t;
t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
+ rtp->kthread_started = 1;
if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
return;
smp_mb(); /* Ensure others see full kthread. */
On 7/31/23 07:19, Peter Zijlstra wrote: > On Sat, Jul 29, 2023 at 09:00:02PM -0700, Guenter Roeck wrote: >> On 7/27/23 16:18, Joel Fernandes wrote: >> >> [ ... ] >> >>>> I freely confess that I am having a hard time imagining what would >>>> be CPU dependent in that code. Timing, maybe? Whatever the reason, >>>> I am not seeing these failures in my testing. >>>> >>>> So which of the following Kconfig options is defined in your .config? >>>> CONFIG_TASKS_RCU, CONFIG_TASKS_RUDE_RCU, and CONFIG_TASKS_TRACE_RCU. >>>> >>>> If you have more than one of them, could you please apply this patch >>>> and show me the corresponding console output from the resulting hang? >>> >>> FWIW, I am not able to repro this issue either. If a .config can be shared of the problem system, I can try it out to see if it can be reproduced on my side. >>> >> >> I managed to bisect the problem. See bisect log below. Bisect repeated twice. >> so it should be reliable. I don't really understand it, but the following >> reverts fix the problem. This is on top of next-20230721 because next-20230728 >> crashes immediately in my tests. >> >> 0caafe9b94ab (HEAD) Revert "sched/fair: Remove sched_feat(START_DEBIT)" >> 518bdbd39fdb Revert "sched/fair: Add lag based placement" >> a011162c3e32 Revert "sched/fair: Implement an EEVDF-like scheduling policy" >> df579720bf98 Revert "sched/fair: Commit to lag based placement" >> aac459a7e738 Revert "sched/smp: Use lag to simplify cross-runqueue placement" >> 8d686eb173e1 Revert "sched/fair: Commit to EEVDF" >> 486474c50f95 Revert "sched/debug: Rename sysctl_sched_min_granularity to sysctl_sched_base_slice" >> 79e94d67d08a Revert "sched/fair: Propagate enqueue flags into place_entity()" >> ae867bc97b71 (tag: next-20230721) Add linux-next specific files for 20230721 >> >> For context: x86 images (32 and 64 bit) in -next tend to hang at >> >> [ 2.309323] RCU Tasks: Setting shift to 0 and lim to 1 rcu_task_cb_adjust=1. 
>> [ 2.311634] Running RCU-tasks wait API self tests >> >> The hang is not seen with every boot; it happens roughly about once every >> 10 boot attempts. It is not CPU dependent as I initially thought. >> >> Configuration file is at http://server.roeck-us.net/qemu/x86-next/config. >> Example qemu command line: > > Hurmph, let me see if I can reproduce on next-20230731 (not having the > older next thingies around). That crashes hard with my configuration. [ 6.353191] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) [ 6.353392] BUG: unable to handle page fault for address: ffff9b10c0013cd0 [ 6.353531] #PF: supervisor instruction fetch in kernel mode [ 6.353624] #PF: error_code(0x0011) - permissions violation [ 6.353751] PGD 1000067 P4D 1000067 PUD 1205067 PMD 1206067 PTE 800000000124e063 [ 6.354011] Oops: 0011 [#1] PREEMPT SMP PTI [ 6.354164] CPU: 0 PID: 182 Comm: kunit_try_catch Tainted: G N 6.5.0-rc4-next-20230731 #1 [ 6.354315] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 6.354525] RIP: 0010:0xffff9b10c0013cd0 [ 6.354793] Code: ff ff 60 64 ce a9 ff ff ff ff 00 00 00 00 00 00 00 00 d1 3a bc a8 ff ff ff ff 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <f0> 00 01 44 10 8a ff ff b8 01 01 44 10 8a ff ff 00 00 00 00 00 00 [ 6.355059] RSP: 0000:ffff9b10c027fd60 EFLAGS: 00000246 [ 6.355157] RAX: ffff9b10c0013cd0 RBX: ffff8a1043bdb400 RCX: 0000000000000000 [ 6.355259] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8a1043bdb400 [ 6.355358] RBP: ffff9b10c027fdc8 R08: 0000000000000001 R09: 0000000000000001 [ 6.355456] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9b10c027fe74 [ 6.355556] R13: ffff8a10440100f0 R14: ffff8a10440101b8 R15: ffff9b10c027fe74 [ 6.355679] FS: 0000000000000000(0000) GS:ffff8a104fc00000(0000) knlGS:0000000000000000 [ 6.355798] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 6.355886] CR2: ffff9b10c0013cd0 CR3: 000000000e048000 CR4: 
00000000003506f0 [ 6.356029] Call Trace: [ 6.356158] <TASK> [ 6.356334] ? __die+0x1f/0x70 [ 6.356472] ? page_fault_oops+0x14a/0x460 [ 6.356547] ? exc_page_fault+0xee/0x1c0 [ 6.356612] ? asm_exc_page_fault+0x26/0x30 [ 6.356703] ? kunit_filter_attr_tests+0xc4/0x2e0 [ 6.356796] kunit_filter_suites+0x2e2/0x460 [ 6.356889] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 [ 6.356979] filter_suites_test+0xea/0x2c0 [ 6.357051] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 [ 6.357148] kunit_generic_run_threadfn_adapter+0x15/0x20 [ 6.357228] kthread+0xef/0x120 [ 6.357282] ? __pfx_kthread+0x10/0x10 [ 6.357343] ret_from_fork+0x2f/0x50 [ 6.357399] ? __pfx_kthread+0x10/0x10 [ 6.357458] ret_from_fork_asm+0x1b/0x30 [ 6.357560] </TASK> [ 6.357632] Modules linked in: [ 6.357786] CR2: ffff9b10c0013cd0 [ 6.358010] ---[ end trace 0000000000000000 ]--- Enabling CONFIG_ZERO_CALL_USED_REGS might fix (hide) this, but I have not tried. Guenter
On Mon, Jul 31, 2023 at 07:35:13AM -0700, Guenter Roeck wrote: > > Hurmph, let me see if I can reproduce on next-20230731 (not having the > > older next thingies around). > > That crashes hard with my configuration. > > [ 6.353191] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) > [ 6.353392] BUG: unable to handle page fault for address: ffff9b10c0013cd0 > [ 6.353531] #PF: supervisor instruction fetch in kernel mode > [ 6.353624] #PF: error_code(0x0011) - permissions violation > [ 6.353751] PGD 1000067 P4D 1000067 PUD 1205067 PMD 1206067 PTE 800000000124e063 > [ 6.354011] Oops: 0011 [#1] PREEMPT SMP PTI > [ 6.354164] CPU: 0 PID: 182 Comm: kunit_try_catch Tainted: G N 6.5.0-rc4-next-20230731 #1 > [ 6.354315] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 > [ 6.354525] RIP: 0010:0xffff9b10c0013cd0 > [ 6.354793] Code: ff ff 60 64 ce a9 ff ff ff ff 00 00 00 00 00 00 00 00 d1 3a bc a8 ff ff ff ff 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <f0> 00 01 44 10 8a ff ff b8 01 01 44 10 8a ff ff 00 00 00 00 00 00 > [ 6.355059] RSP: 0000:ffff9b10c027fd60 EFLAGS: 00000246 > [ 6.355157] RAX: ffff9b10c0013cd0 RBX: ffff8a1043bdb400 RCX: 0000000000000000 > [ 6.355259] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8a1043bdb400 > [ 6.355358] RBP: ffff9b10c027fdc8 R08: 0000000000000001 R09: 0000000000000001 > [ 6.355456] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9b10c027fe74 > [ 6.355556] R13: ffff8a10440100f0 R14: ffff8a10440101b8 R15: ffff9b10c027fe74 > [ 6.355679] FS: 0000000000000000(0000) GS:ffff8a104fc00000(0000) knlGS:0000000000000000 > [ 6.355798] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > [ 6.355886] CR2: ffff9b10c0013cd0 CR3: 000000000e048000 CR4: 00000000003506f0 > [ 6.356029] Call Trace: > [ 6.356158] <TASK> > [ 6.356334] ? __die+0x1f/0x70 > [ 6.356472] ? page_fault_oops+0x14a/0x460 > [ 6.356547] ? exc_page_fault+0xee/0x1c0 > [ 6.356612] ? 
asm_exc_page_fault+0x26/0x30 > [ 6.356703] ? kunit_filter_attr_tests+0xc4/0x2e0 > [ 6.356796] kunit_filter_suites+0x2e2/0x460 > [ 6.356889] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 > [ 6.356979] filter_suites_test+0xea/0x2c0 > [ 6.357051] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 > [ 6.357148] kunit_generic_run_threadfn_adapter+0x15/0x20 > [ 6.357228] kthread+0xef/0x120 > [ 6.357282] ? __pfx_kthread+0x10/0x10 > [ 6.357343] ret_from_fork+0x2f/0x50 > [ 6.357399] ? __pfx_kthread+0x10/0x10 > [ 6.357458] ret_from_fork_asm+0x1b/0x30 > [ 6.357560] </TASK> > [ 6.357632] Modules linked in: > [ 6.357786] CR2: ffff9b10c0013cd0 > [ 6.358010] ---[ end trace 0000000000000000 ]--- I get: [ 2.423691] ------------[ cut here ]------------ [ 2.424994] WARNING: CPU: 0 PID: 184 at mm/slab_common.c:992 free_large_kmalloc+0x4f/0x80 [ 2.426183] Modules linked in: [ 2.426624] CPU: 0 PID: 184 Comm: kunit_try_catch Tainted: G N 6.5.0-rc4-next-20230731 #1 [ 2.427964] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014 [ 2.429265] RIP: 0010:free_large_kmalloc+0x4f/0x80 [ 2.429952] Code: f7 da 48 63 d2 48 8b 03 be 06 00 00 00 48 c1 e8 3a 48 8b 3c c5 60 ba 11 ab e8 0d 52 ff ff 89 ee 48 89 df 5b 5d e9 41 df 03 00 <0f> 0b 80 3d 49 43 e9 01 00 75 [ 2.432511] RSP: 0000:ffffadcb0024bdb8 EFLAGS: 00010246 [ 2.433259] RAX: 0100000000001000 RBX: ffffd16bc018aa40 RCX: ffffadcb0024bd7c [ 2.434262] RDX: ffffd16bc018aa48 RSI: ffffffffa96a9ec7 RDI: ffffd16bc018aa40 [ 2.435265] RBP: ffffadcb0024be60 R08: 0000000000000001 R09: 0000000000000001 [ 2.436269] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8a7084014410 [ 2.437267] R13: ffff8a70840c4000 R14: 0000000000000002 R15: ffff8a70840564a8 [ 2.438271] FS: 0000000000000000(0000) GS:ffff8a708f800000(0000) knlGS:0000000000000000 [ 2.439403] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.440215] CR2: ffff8a7089401000 CR3: 0000000007a48001 CR4: 0000000000170ef0 [ 2.441218] Call Trace: [ 
2.441568] <TASK> [ 2.441883] ? free_large_kmalloc+0x4f/0x80 [ 2.442491] ? __warn+0x80/0x170 [ 2.442988] ? free_large_kmalloc+0x4f/0x80 [ 2.443591] ? report_bug+0x171/0x1a0 [ 2.444145] ? handle_bug+0x3c/0x70 [ 2.444662] ? exc_invalid_op+0x17/0x70 [ 2.445225] ? asm_exc_invalid_op+0x1a/0x20 [ 2.445844] ? kunit_add_action+0xc7/0x140 [ 2.446455] ? free_large_kmalloc+0x4f/0x80 [ 2.447054] kunit_filter_suites+0x468/0x480 [ 2.447662] ? kunit_add_action+0xc7/0x140 [ 2.448258] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 [ 2.449105] filter_suites_test+0xea/0x2c0 [ 2.449702] kunit_generic_run_threadfn_adapter+0x15/0x20 [ 2.450469] kthread+0xf0/0x120 [ 2.450940] ? __pfx_kthread+0x10/0x10 [ 2.451481] ret_from_fork+0x2f/0x50 [ 2.452012] ? __pfx_kthread+0x10/0x10 [ 2.452557] ret_from_fork_asm+0x1b/0x30 [ 2.453146] </TASK> [ 2.453474] irq event stamp: 677 [ 2.453943] hardirqs last enabled at (689): [<ffffffffa911c24a>] console_unlock+0x10a/0x160 [ 2.455151] hardirqs last disabled at (700): [<ffffffffa911c22f>] console_unlock+0xef/0x160 [ 2.456329] softirqs last enabled at (662): [<ffffffffa909179a>] irq_exit_rcu+0x7a/0xa0 [ 2.457474] softirqs last disabled at (657): [<ffffffffa909179a>] irq_exit_rcu+0x7a/0xa0 [ 2.458610] ---[ end trace 0000000000000000 ]--- But then it continues and eventually reaches: Linux version 6.5.0-rc4-next-20230731 (root@ivb-ep) (gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40) #1 SMP PREEMPT_DYNAMIC Mon Jul 31 15:39:05 CEST 2023 Network interface test passed Boot successful. / # Full log attached.
On 7/31/23 07:47, Peter Zijlstra wrote: > On Mon, Jul 31, 2023 at 07:35:13AM -0700, Guenter Roeck wrote: > >>> Hurmph, let me see if I can reproduce on next-20230731 (not having the >>> older next thingies around). >> >> That crashes hard with my configuration. >> >> [ 6.353191] kernel tried to execute NX-protected page - exploit attempt? (uid: 0) >> [ 6.353392] BUG: unable to handle page fault for address: ffff9b10c0013cd0 >> [ 6.353531] #PF: supervisor instruction fetch in kernel mode >> [ 6.353624] #PF: error_code(0x0011) - permissions violation >> [ 6.353751] PGD 1000067 P4D 1000067 PUD 1205067 PMD 1206067 PTE 800000000124e063 >> [ 6.354011] Oops: 0011 [#1] PREEMPT SMP PTI >> [ 6.354164] CPU: 0 PID: 182 Comm: kunit_try_catch Tainted: G N 6.5.0-rc4-next-20230731 #1 >> [ 6.354315] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 >> [ 6.354525] RIP: 0010:0xffff9b10c0013cd0 >> [ 6.354793] Code: ff ff 60 64 ce a9 ff ff ff ff 00 00 00 00 00 00 00 00 d1 3a bc a8 ff ff ff ff 00 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <f0> 00 01 44 10 8a ff ff b8 01 01 44 10 8a ff ff 00 00 00 00 00 00 >> [ 6.355059] RSP: 0000:ffff9b10c027fd60 EFLAGS: 00000246 >> [ 6.355157] RAX: ffff9b10c0013cd0 RBX: ffff8a1043bdb400 RCX: 0000000000000000 >> [ 6.355259] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8a1043bdb400 >> [ 6.355358] RBP: ffff9b10c027fdc8 R08: 0000000000000001 R09: 0000000000000001 >> [ 6.355456] R10: 0000000000000001 R11: 0000000000000001 R12: ffff9b10c027fe74 >> [ 6.355556] R13: ffff8a10440100f0 R14: ffff8a10440101b8 R15: ffff9b10c027fe74 >> [ 6.355679] FS: 0000000000000000(0000) GS:ffff8a104fc00000(0000) knlGS:0000000000000000 >> [ 6.355798] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 >> [ 6.355886] CR2: ffff9b10c0013cd0 CR3: 000000000e048000 CR4: 00000000003506f0 >> [ 6.356029] Call Trace: >> [ 6.356158] <TASK> >> [ 6.356334] ? __die+0x1f/0x70 >> [ 6.356472] ? 
page_fault_oops+0x14a/0x460 >> [ 6.356547] ? exc_page_fault+0xee/0x1c0 >> [ 6.356612] ? asm_exc_page_fault+0x26/0x30 >> [ 6.356703] ? kunit_filter_attr_tests+0xc4/0x2e0 >> [ 6.356796] kunit_filter_suites+0x2e2/0x460 >> [ 6.356889] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 >> [ 6.356979] filter_suites_test+0xea/0x2c0 >> [ 6.357051] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 >> [ 6.357148] kunit_generic_run_threadfn_adapter+0x15/0x20 >> [ 6.357228] kthread+0xef/0x120 >> [ 6.357282] ? __pfx_kthread+0x10/0x10 >> [ 6.357343] ret_from_fork+0x2f/0x50 >> [ 6.357399] ? __pfx_kthread+0x10/0x10 >> [ 6.357458] ret_from_fork_asm+0x1b/0x30 >> [ 6.357560] </TASK> >> [ 6.357632] Modules linked in: >> [ 6.357786] CR2: ffff9b10c0013cd0 >> [ 6.358010] ---[ end trace 0000000000000000 ]--- > > I get: > > [ 2.423691] ------------[ cut here ]------------ > [ 2.424994] WARNING: CPU: 0 PID: 184 at mm/slab_common.c:992 free_large_kmalloc+0x4f/0x80 > [ 2.426183] Modules linked in: > [ 2.426624] CPU: 0 PID: 184 Comm: kunit_try_catch Tainted: G N 6.5.0-rc4-next-20230731 #1 > [ 2.427964] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.0-debian-1.16.0-5 04/01/2014 > [ 2.429265] RIP: 0010:free_large_kmalloc+0x4f/0x80 > [ 2.429952] Code: f7 da 48 63 d2 48 8b 03 be 06 00 00 00 48 c1 e8 3a 48 8b 3c c5 60 ba 11 ab e8 0d 52 ff ff 89 ee 48 89 df 5b 5d e9 41 df 03 00 <0f> 0b 80 3d 49 43 e9 01 00 75 > [ 2.432511] RSP: 0000:ffffadcb0024bdb8 EFLAGS: 00010246 > [ 2.433259] RAX: 0100000000001000 RBX: ffffd16bc018aa40 RCX: ffffadcb0024bd7c > [ 2.434262] RDX: ffffd16bc018aa48 RSI: ffffffffa96a9ec7 RDI: ffffd16bc018aa40 > [ 2.435265] RBP: ffffadcb0024be60 R08: 0000000000000001 R09: 0000000000000001 > [ 2.436269] R10: 0000000000000001 R11: 0000000000000000 R12: ffff8a7084014410 > [ 2.437267] R13: ffff8a70840c4000 R14: 0000000000000002 R15: ffff8a70840564a8 > [ 2.438271] FS: 0000000000000000(0000) GS:ffff8a708f800000(0000) knlGS:0000000000000000 > [ 2.439403] CS: 0010 DS: 0000 
ES: 0000 CR0: 0000000080050033 > [ 2.440215] CR2: ffff8a7089401000 CR3: 0000000007a48001 CR4: 0000000000170ef0 > [ 2.441218] Call Trace: > [ 2.441568] <TASK> > [ 2.441883] ? free_large_kmalloc+0x4f/0x80 > [ 2.442491] ? __warn+0x80/0x170 > [ 2.442988] ? free_large_kmalloc+0x4f/0x80 > [ 2.443591] ? report_bug+0x171/0x1a0 > [ 2.444145] ? handle_bug+0x3c/0x70 > [ 2.444662] ? exc_invalid_op+0x17/0x70 > [ 2.445225] ? asm_exc_invalid_op+0x1a/0x20 > [ 2.445844] ? kunit_add_action+0xc7/0x140 > [ 2.446455] ? free_large_kmalloc+0x4f/0x80 > [ 2.447054] kunit_filter_suites+0x468/0x480 > [ 2.447662] ? kunit_add_action+0xc7/0x140 > [ 2.448258] ? __pfx_kunit_generic_run_threadfn_adapter+0x10/0x10 > [ 2.449105] filter_suites_test+0xea/0x2c0 > [ 2.449702] kunit_generic_run_threadfn_adapter+0x15/0x20 > [ 2.450469] kthread+0xf0/0x120 > [ 2.450940] ? __pfx_kthread+0x10/0x10 > [ 2.451481] ret_from_fork+0x2f/0x50 > [ 2.452012] ? __pfx_kthread+0x10/0x10 > [ 2.452557] ret_from_fork_asm+0x1b/0x30 > [ 2.453146] </TASK> > [ 2.453474] irq event stamp: 677 > [ 2.453943] hardirqs last enabled at (689): [<ffffffffa911c24a>] console_unlock+0x10a/0x160 > [ 2.455151] hardirqs last disabled at (700): [<ffffffffa911c22f>] console_unlock+0xef/0x160 > [ 2.456329] softirqs last enabled at (662): [<ffffffffa909179a>] irq_exit_rcu+0x7a/0xa0 > [ 2.457474] softirqs last disabled at (657): [<ffffffffa909179a>] irq_exit_rcu+0x7a/0xa0 > [ 2.458610] ---[ end trace 0000000000000000 ]--- > Same problem. I see the warning on some architectures, the crash on others. The fix for that problem is at https://lore.kernel.org/linux-kselftest/20230729010003.4058582-1-ruanjinjie@huawei.com/ It is caused by the "kunit: Add test attributes API" patch series. See https://lore.kernel.org/lkml/5205b6aa-c9ea-8f9c-f42c-b840346f740c@roeck-us.net/T/ Guenter
On Tue, Jul 25, 2023 at 12:42:47PM +0200, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > Build results: total: 157 pass: 157 fail: 0 Qemu test results: total: 522 pass: 522 fail: 0 Tested-by: Guenter Roeck <linux@roeck-us.net> Guenter
On Tue, 25 Jul 2023 at 16:19, Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote: > > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > or in the git tree and branch at: > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > and the diffstat can be found below. > > thanks, > > greg k-h Results from Linaro’s test farm. No regressions on arm64, arm, x86_64, and i386. Tested-by: Linux Kernel Functional Testing <lkft@linaro.org> ## Build * kernel: 6.4.7-rc1 * git: https://gitlab.com/Linaro/lkft/mirrors/stable/linux-stable-rc * git branch: linux-6.4.y * git commit: 3c19c5641cce21ec84a7d62be76d53f454531f48 * git describe: v6.4.6-228-g3c19c5641cce * test details: https://qa-reports.linaro.org/lkft/linux-stable-rc-linux-6.4.y/build/v6.4.6-228-g3c19c5641cce ## Test Regressions (compared to v6.4.5) ## Metric Regressions (compared to v6.4.5) ## Test Fixes (compared to v6.4.5) ## Metric Fixes (compared to v6.4.5) ## Test result summary total: 166993, pass: 145128, fail: 2201, skip: 19509, xfail: 155 ## Build Summary * arc: 5 total, 5 passed, 0 failed * arm: 141 total, 141 passed, 0 failed * arm64: 50 total, 50 passed, 0 failed * i386: 37 total, 37 passed, 0 failed * mips: 26 total, 26 passed, 0 failed * parisc: 3 total, 3 passed, 0 failed * powerpc: 34 total, 34 passed, 0 failed * riscv: 22 total, 22 passed, 0 failed * s390: 12 total, 12 passed, 0 failed * sh: 12 total, 12 passed, 0 failed * sparc: 6 total, 6 passed, 0 failed * x86_64: 42 total, 42 passed, 0 failed ## Test suites summary * boot * 
kselftest-android * kselftest-arm64 * kselftest-breakpoints * kselftest-capabilities * kselftest-cgroup * kselftest-clone3 * kselftest-core * kselftest-cpu-hotplug * kselftest-cpufreq * kselftest-drivers-dma-buf * kselftest-efivarfs * kselftest-exec * kselftest-filesystems * kselftest-filesystems-binderfs * kselftest-filesystems-epoll * kselftest-firmware * kselftest-fpu * kselftest-ftrace * kselftest-futex * kselftest-gpio * kselftest-intel_pstate * kselftest-ipc * kselftest-ir * kselftest-kcmp * kselftest-kexec * kselftest-kvm * kselftest-lib * kselftest-livepatch * kselftest-membarrier * kselftest-memfd * kselftest-memory-hotplug * kselftest-mincore * kselftest-mount * kselftest-mqueue * kselftest-net * kselftest-net-forwarding * kselftest-net-mptcp * kselftest-netfilter * kselftest-nsfs * kselftest-openat2 * kselftest-pid_namespace * kselftest-pidfd * kselftest-proc * kselftest-pstore * kselftest-ptrace * kselftest-rseq * kselftest-rtc * kselftest-seccomp * kselftest-sigaltstack * kselftest-size * kselftest-splice * kselftest-static_keys * kselftest-sync * kselftest-sysctl * kselftest-tc-testing * kselftest-timens * kselftest-timers * kselftest-tmpfs * kselftest-tpm2 * kselftest-user * kselftest-user_events * kselftest-vDSO * kselftest-vm * kselftest-watchdog * kselftest-x86 * kselftest-zram * kunit * kvm-unit-tests * libgpiod * libhugetlbfs * log-parser-boot * log-parser-test * ltp-cap_bounds * ltp-commands * ltp-containers * ltp-controllers * ltp-cpuhotplug * ltp-crypto * ltp-cve * ltp-dio * ltp-fcntl-locktests * ltp-filecaps * ltp-fs * ltp-fs_bind * ltp-fs_perms_simple * ltp-fsx * ltp-hugetlb * ltp-io * ltp-ipc * ltp-math * ltp-mm * ltp-nptl * ltp-pty * ltp-sched * ltp-securebits * ltp-smoke * ltp-syscalls * ltp-tracing * network-basic-tests * perf * rcutorture * v4l2-compliance -- Linaro LKFT https://lkft.linaro.org
On 7/25/23 3:42 AM, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > or in the git tree and branch at: > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > and the diffstat can be found below. > > thanks, > > greg k-h Built and booted successfully on RISC-V RV64 (HiFive Unmatched). Tested-by: Ron Economos <re@w6rz.net>
On Tue, Jul 25, 2023 at 12:42:47PM +0200, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. Tested-by: Conor Dooley <conor.dooley@microchip.com> Thanks, Conor.
On Tue, Jul 25, 2023 at 12:42:47PM +0200, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > Successfully compiled and installed bindeb-pkgs on my computer (Acer Aspire E15, Intel Core i3 Haswell). No noticeable regressions. Tested-by: Bagas Sanjaya <bagasdotme@gmail.com> -- An old man doll... just what I always wanted! - Clara
On 7/25/23 03:42, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > or in the git tree and branch at: > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > and the diffstat can be found below. > > thanks, > > greg k-h On ARCH_BRCMSTB using 32-bit and 64-bit ARM kernels, build tested on BMIPS_GENERIC: Tested-by: Florian Fainelli <florian.fainelli@broadcom.com> -- Florian
On 7/25/23 04:42, Greg Kroah-Hartman wrote: > This is the start of the stable review cycle for the 6.4.7 release. > There are 227 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz > or in the git tree and branch at: > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y > and the diffstat can be found below. > > thanks, > > greg k-h > Compiled and booted on my test system. No dmesg regressions. Tested-by: Shuah Khan <skhan@linuxfoundation.org> thanks, -- Shuah
Hi,
On Tue, 25 Jul 2023 12:42:47 +0200 Greg Kroah-Hartman <gregkh@linuxfoundation.org> wrote:
> This is the start of the stable review cycle for the 6.4.7 release.
> There are 227 patches in this series, all will be posted as a response
> to this one. If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz
> or in the git tree and branch at:
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y
> and the diffstat can be found below.
This rc kernel passes the DAMON functionality test[1] on my test machine.
The test results summary is attached below. Please note that I retrieved the
kernel from the linux-stable-rc tree[2].
Tested-by: SeongJae Park <sj@kernel.org>
[1] https://github.com/awslabs/damon-tests/tree/next/corr
[2] commit 3c19c5641cce ("Linux 6.4.7-rc1")
Thanks,
SJ
[...]
---
ok 1 selftests: damon: debugfs_attrs.sh
ok 2 selftests: damon: debugfs_schemes.sh
ok 3 selftests: damon: debugfs_target_ids.sh
ok 4 selftests: damon: debugfs_empty_targets.sh
ok 5 selftests: damon: debugfs_huge_count_read_write.sh
ok 6 selftests: damon: debugfs_duplicate_context_creation.sh
ok 7 selftests: damon: debugfs_rm_non_contexts.sh
ok 8 selftests: damon: sysfs.sh
ok 9 selftests: damon: sysfs_update_removed_scheme_dir.sh
ok 10 selftests: damon: reclaim.sh
ok 11 selftests: damon: lru_sort.sh
ok 1 selftests: damon-tests: kunit.sh
ok 2 selftests: damon-tests: huge_count_read_write.sh
ok 3 selftests: damon-tests: buffer_overflow.sh
ok 4 selftests: damon-tests: rm_contexts.sh
ok 5 selftests: damon-tests: record_null_deref.sh
ok 6 selftests: damon-tests: dbgfs_target_ids_read_before_terminate_race.sh
ok 7 selftests: damon-tests: dbgfs_target_ids_pid_leak.sh
ok 8 selftests: damon-tests: damo_tests.sh
ok 9 selftests: damon-tests: masim-record.sh
ok 10 selftests: damon-tests: build_i386.sh
ok 11 selftests: damon-tests: build_m68k.sh
ok 12 selftests: damon-tests: build_arm64.sh
ok 13 selftests: damon-tests: build_i386_idle_flag.sh
ok 14 selftests: damon-tests: build_i386_highpte.sh
ok 15 selftests: damon-tests: build_nomemcg.sh
PASS
On Tue, 25 Jul 2023 12:42:47 +0200, Greg Kroah-Hartman wrote:
> This is the start of the stable review cycle for the 6.4.7 release.
> There are 227 patches in this series, all will be posted as a response
> to this one. If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Thu, 27 Jul 2023 10:44:26 +0000.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> https://www.kernel.org/pub/linux/kernel/v6.x/stable-review/patch-6.4.7-rc1.gz
> or in the git tree and branch at:
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-6.4.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h
All tests passing for Tegra ...
Test results for stable-v6.4:
11 builds: 11 pass, 0 fail
28 boots: 28 pass, 0 fail
130 tests: 130 pass, 0 fail
Linux version: 6.4.7-rc1-g3c19c5641cce
Boards tested: tegra124-jetson-tk1, tegra186-p2771-0000,
tegra194-p2972-0000, tegra194-p3509-0000+p3668-0000,
tegra20-ventana, tegra210-p2371-2180,
tegra210-p3450-0000, tegra30-cardhu-a04
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Jon
© 2016 - 2026 Red Hat, Inc.