The loopback, recv_own_msgs, fd_frames and xl_frames fields of struct
raw_sock only need to store one bit of information each.

Declare all those members as bitfields of type unsigned int with a
width of one bit.

Add a temporary variable to raw_setsockopt() and raw_getsockopt() to
convert between the stored bits and the integer values exchanged over
the socket interface.

This reduces struct raw_sock by eight bytes.

Statistics before:
$ pahole --class_name=raw_sock net/can/raw.o
struct raw_sock {
struct sock sk __attribute__((__aligned__(8))); /* 0 776 */
/* XXX last struct has 1 bit hole */
/* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */
int bound; /* 776 4 */
int ifindex; /* 780 4 */
struct net_device * dev; /* 784 8 */
netdevice_tracker dev_tracker; /* 792 0 */
struct list_head notifier; /* 792 16 */
int loopback; /* 808 4 */
int recv_own_msgs; /* 812 4 */
int fd_frames; /* 816 4 */
int xl_frames; /* 820 4 */
struct can_raw_vcid_options raw_vcid_opts; /* 824 4 */
canid_t tx_vcid_shifted; /* 828 4 */
/* --- cacheline 13 boundary (832 bytes) --- */
canid_t rx_vcid_shifted; /* 832 4 */
canid_t rx_vcid_mask_shifted; /* 836 4 */
int join_filters; /* 840 4 */
int count; /* 844 4 */
struct can_filter dfilter; /* 848 8 */
struct can_filter * filter; /* 856 8 */
can_err_mask_t err_mask; /* 864 4 */
/* XXX 4 bytes hole, try to pack */
struct uniqframe * uniq; /* 872 8 */
/* size: 880, cachelines: 14, members: 20 */
/* sum members: 876, holes: 1, sum holes: 4 */
/* member types with bit holes: 1, total: 1 */
/* forced alignments: 1 */
/* last cacheline: 48 bytes */
} __attribute__((__aligned__(8)));
...and after:
$ pahole --class_name=raw_sock net/can/raw.o
struct raw_sock {
struct sock sk __attribute__((__aligned__(8))); /* 0 776 */
/* XXX last struct has 1 bit hole */
/* --- cacheline 12 boundary (768 bytes) was 8 bytes ago --- */
int bound; /* 776 4 */
int ifindex; /* 780 4 */
struct net_device * dev; /* 784 8 */
netdevice_tracker dev_tracker; /* 792 0 */
struct list_head notifier; /* 792 16 */
unsigned int loopback:1; /* 808: 0 4 */
unsigned int recv_own_msgs:1; /* 808: 1 4 */
unsigned int fd_frames:1; /* 808: 2 4 */
unsigned int xl_frames:1; /* 808: 3 4 */
/* XXX 4 bits hole, try to pack */
/* Bitfield combined with next fields */
struct can_raw_vcid_options raw_vcid_opts; /* 809 4 */
/* XXX 3 bytes hole, try to pack */
canid_t tx_vcid_shifted; /* 816 4 */
canid_t rx_vcid_shifted; /* 820 4 */
canid_t rx_vcid_mask_shifted; /* 824 4 */
int join_filters; /* 828 4 */
/* --- cacheline 13 boundary (832 bytes) --- */
int count; /* 832 4 */
struct can_filter dfilter; /* 836 8 */
/* XXX 4 bytes hole, try to pack */
struct can_filter * filter; /* 848 8 */
can_err_mask_t err_mask; /* 856 4 */
/* XXX 4 bytes hole, try to pack */
struct uniqframe * uniq; /* 864 8 */
/* size: 872, cachelines: 14, members: 20 */
/* sum members: 860, holes: 3, sum holes: 11 */
/* sum bitfield members: 4 bits, bit holes: 1, sum bit holes: 4 bits */
/* member types with bit holes: 1, total: 1 */
/* forced alignments: 1 */
/* last cacheline: 40 bytes */
} __attribute__((__aligned__(8)));
Signed-off-by: Vincent Mailhol <mailhol@kernel.org>
---
net/can/raw.c | 47 ++++++++++++++++++++++++++++-------------------
1 file changed, 28 insertions(+), 19 deletions(-)
diff --git a/net/can/raw.c b/net/can/raw.c
index db21d8a8c54d1b6a25a72c7a9d11d5c94f3187b5..cec580ecd58e36931d1be05716e6beb9c93aa271 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -87,10 +87,10 @@ struct raw_sock {
struct net_device *dev;
netdevice_tracker dev_tracker;
struct list_head notifier;
- int loopback;
- int recv_own_msgs;
- int fd_frames;
- int xl_frames;
+ unsigned int loopback:1;
+ unsigned int recv_own_msgs:1;
+ unsigned int fd_frames:1;
+ unsigned int xl_frames:1;
struct can_raw_vcid_options raw_vcid_opts;
canid_t tx_vcid_shifted;
canid_t rx_vcid_shifted;
@@ -560,8 +560,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
struct can_filter sfilter; /* single filter */
struct net_device *dev = NULL;
can_err_mask_t err_mask = 0;
- int fd_frames;
int count = 0;
+ int flag;
int err = 0;
if (level != SOL_CAN_RAW)
@@ -682,44 +682,48 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
break;
case CAN_RAW_LOOPBACK:
- if (optlen != sizeof(ro->loopback))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->loopback, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->loopback = !!flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
- if (optlen != sizeof(ro->recv_own_msgs))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->recv_own_msgs = !!flag;
break;
case CAN_RAW_FD_FRAMES:
- if (optlen != sizeof(fd_frames))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&fd_frames, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
/* Enabling CAN XL includes CAN FD */
- if (ro->xl_frames && !fd_frames)
+ if (ro->xl_frames && !flag)
return -EINVAL;
- ro->fd_frames = fd_frames;
+ ro->fd_frames = !!flag;
break;
case CAN_RAW_XL_FRAMES:
- if (optlen != sizeof(ro->xl_frames))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->xl_frames, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->xl_frames = !!flag;
+
/* Enabling CAN XL includes CAN FD */
if (ro->xl_frames)
ro->fd_frames = ro->xl_frames;
@@ -758,6 +762,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
+ int flag;
int len;
void *val;
@@ -806,25 +811,29 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
case CAN_RAW_LOOPBACK:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->loopback;
+ flag = ro->loopback;
+ val = &flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->recv_own_msgs;
+ flag = ro->recv_own_msgs;
+ val = &flag;
break;
case CAN_RAW_FD_FRAMES:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->fd_frames;
+ flag = ro->fd_frames;
+ val = &flag;
break;
case CAN_RAW_XL_FRAMES:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->xl_frames;
+ flag = ro->xl_frames;
+ val = &flag;
break;
case CAN_RAW_XL_VCID_OPTS: {
--
2.49.1
On 15/09/2025 at 11:23, Vincent Mailhol wrote:

[...]

> --- a/net/can/raw.c
> +++ b/net/can/raw.c
> @@ -87,10 +87,10 @@ struct raw_sock {
> struct net_device *dev;
> netdevice_tracker dev_tracker;
> struct list_head notifier;
> - int loopback;
> - int recv_own_msgs;
> - int fd_frames;
> - int xl_frames;
> + unsigned int loopback:1;
> + unsigned int recv_own_msgs:1;
> + unsigned int fd_frames:1;
> + unsigned int xl_frames:1;
> struct can_raw_vcid_options raw_vcid_opts;
> canid_t tx_vcid_shifted;
> canid_t rx_vcid_shifted;

[...]

Hi,

just in case, it looks like bound and join_filters could also be
defined in the bitfield.

just my 2c.

CJ
On 17/09/2025 at 03:35, Christophe JAILLET wrote:
> On 15/09/2025 at 11:23, Vincent Mailhol wrote:

(...)

> Hi,
>
> just in case, it looks like bound and join_filters could also be
> defined in the bitfield.
>
> just my 2c.

You are absolutely right. I will add these two to the bitfield in v2.

Yours sincerely,
Vincent Mailhol
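As a side note on the size argument, here is a standalone sketch (plain
C, not the kernel sources; the flag names are borrowed from raw_sock and
the six-flag layout is only an assumption about what a v2 might contain)
comparing six int flags with six one-bit bitfields:

/* Standalone sketch, not kernel code: hypothetical structs that only
 * illustrate how folding the discussed flags (including bound and
 * join_filters) into one-bit bitfields shrinks the flag storage.
 */
#include <stdio.h>

struct flags_as_int {
	int bound;
	int loopback;
	int recv_own_msgs;
	int fd_frames;
	int xl_frames;
	int join_filters;
};

struct flags_as_bits {
	unsigned int bound:1;
	unsigned int loopback:1;
	unsigned int recv_own_msgs:1;
	unsigned int fd_frames:1;
	unsigned int xl_frames:1;
	unsigned int join_filters:1;
};

int main(void)
{
	/* On common ABIs this prints 24 bytes vs 4 bytes. */
	printf("as int: %zu bytes, as bitfields: %zu bytes\n",
	       sizeof(struct flags_as_int), sizeof(struct flags_as_bits));
	return 0;
}

Compiled and run, it typically reports 24 bytes versus 4 bytes, which is
where the struct size reduction comes from.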
On 15.09.25 11:23, Vincent Mailhol wrote:

[...]

> unsigned int loopback:1; /* 808: 0 4 */
> unsigned int recv_own_msgs:1; /* 808: 1 4 */
> unsigned int fd_frames:1; /* 808: 2 4 */
> unsigned int xl_frames:1; /* 808: 3 4 */

This means that the former data structures (int) are not copied but
bits are set (shifted, ANDed, ORed, etc.), right?

So what's the difference in the code the CPU has to process for this
improvement? Is implementing this bitmap more efficient than, or
similar to, copying the (unsigned) ints as-is?

[...]

> + ro->loopback = !!flag;

This is obviously an additional effort. Instead of a simple copy, the
code makes a copy to an extra variable and then an assignment with bit
(shifting) operations.

Best regards,
Oliver
On 15/09/2025 at 19:16, Oliver Hartkopp wrote:
> On 15.09.25 11:23, Vincent Mailhol wrote:

[...]

> This means that the former data structures (int) are not copied but
> bits are set (shifted, ANDed, ORed, etc.), right?
>
> So what's the difference in the code the CPU has to process for this
> improvement? Is implementing this bitmap more efficient than, or
> similar to, copying the (unsigned) ints as-is?

It will indeed have to add a couple of assembly instructions. But this
is peanuts. In the best case, the out-of-order execution might very
well optimize this so that not even a CPU tick is wasted. In the worst
case, it is a couple of CPU ticks.

On the other hand, reducing the size by 16 bytes lowers the risk of a
cache miss. And removing one cache miss outperforms the penalty of
adding a couple of assembly instructions by an order of magnitude.

Well, I did not benchmark it, but this is a commonly accepted
trade-off.

Yours sincerely,
Vincent Mailhol
On 15.09.25 12:47, Vincent Mailhol wrote:

[...]

> It will indeed have to add a couple of assembly instructions. But this
> is peanuts. In the best case, the out-of-order execution might very
> well optimize this so that not even a CPU tick is wasted. In the worst
> case, it is a couple of CPU ticks.
>
> On the other hand, reducing the size by 16 bytes lowers the risk of a
> cache miss. And removing one cache miss outperforms the penalty of
> adding a couple of assembly instructions by an order of magnitude.
>
> Well, I did not benchmark it, but this is a commonly accepted
> trade-off.

Ok.

Most accesses of those values like ro->fd_frames are read-only anyway,
which might add an additional AND operation with a constant value.
Therefore your suggested changes are not in the hot path anyway, and
the ro->fd_frames = !!flag operation is executed at socket creation
time only.

Generally it is interesting that the compiler can handle bits in this
way.

Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>

Thanks!
Oliver
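For readers following the discussion above, a minimal standalone sketch
(plain C, not the kernel sources; the struct and helper names are made
up for illustration) of the conversion pattern around a one-bit
bitfield: the store is a read-modify-write of the containing word, the
read masks that word, and !! normalizes any non-zero caller value to 1.

/* Standalone sketch, not the kernel sources: hypothetical names that
 * mimic the setsockopt/getsockopt conversion around a one-bit bitfield.
 */
#include <stdio.h>

struct opts {
	unsigned int fd_frames:1;
	unsigned int xl_frames:1;
};

/* setsockopt-style path: squeeze the caller's int into one bit. */
static void set_fd_frames(struct opts *o, int flag)
{
	o->fd_frames = !!flag;
}

/* getsockopt-style path: widen the bit back to an int for the caller. */
static int get_fd_frames(const struct opts *o)
{
	return o->fd_frames;
}

int main(void)
{
	struct opts o = { 0 };

	set_fd_frames(&o, 42);	/* without !!, 42 would truncate to bit 0, i.e. 0 */
	printf("fd_frames = %d\n", get_fd_frames(&o));	/* prints 1 */
	return 0;
}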