Windows guests use the RTC as their clock source device and access it
frequently. Move the RTC memory region outside the BQL to reduce
overhead for Windows guests. Meanwhile, add a new lock so that
different vCPUs cannot access the RTC concurrently.
$ cat strace_c.sh
strace -tt -p $1 -c -o result_$1.log &
sleep $2
pid=$(pidof strace)
kill $pid
cat result_$1.log
Before applying this change:
$ ./strace_c.sh 10528 30
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 93.87    0.119070          30      4000           ppoll
  3.27    0.004148           2      2038           ioctl
  2.66    0.003370           2      2014           futex
  0.09    0.000113           1       106           read
  0.09    0.000109           1       104           io_getevents
  0.02    0.000029           1        30           poll
  0.00    0.000000           0         1           write
------ ----------- ----------- --------- --------- ----------------
100.00    0.126839                  8293           total
After applying the change:
$ ./strace_c.sh 23829 30
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 92.86    0.067441          16      4094           ppoll
  4.85    0.003522           2      2136           ioctl
  1.17    0.000850           4       189           futex
  0.54    0.000395           2       202           read
  0.52    0.000379           2       202           io_getevents
  0.05    0.000037           1        30           poll
------ ----------- ----------- --------- --------- ----------------
100.00    0.072624                  6853           total
The number of futex calls decreases by ~90.6% on an idle Windows 7 guest.
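(From the figures above: futex calls drop from 2014 to 189, i.e.
(2014 - 189) / 2014 ≈ 90.6%.)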
Signed-off-by: Gonglei <arei.gonglei@huawei.com>
---
v1->v2:
 a) Add a new lock so that different vCPUs cannot access
    the RTC concurrently. [Paolo]
 b) Take the BQL before raising the outbound IRQ line. [Peter]
 c) Don't re-take the BQL if it is already held. [Peter]
hw/timer/mc146818rtc.c | 55 ++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 47 insertions(+), 8 deletions(-)
diff --git a/hw/timer/mc146818rtc.c b/hw/timer/mc146818rtc.c
index 35a05a6..f0a2a62 100644
--- a/hw/timer/mc146818rtc.c
+++ b/hw/timer/mc146818rtc.c
@@ -85,6 +85,7 @@ typedef struct RTCState {
uint16_t irq_reinject_on_ack_count;
uint32_t irq_coalesced;
uint32_t period;
+ QemuMutex rtc_lock;
QEMUTimer *coalesced_timer;
Notifier clock_reset_notifier;
LostTickPolicy lost_tick_policy;
@@ -125,6 +126,36 @@ static void rtc_coalesced_timer_update(RTCState *s)
}
}
+static void rtc_rasie_irq(RTCState *s)
+{
+ bool unlocked = !qemu_mutex_iothread_locked();
+
+ if (unlocked) {
+ qemu_mutex_lock_iothread();
+ }
+
+ qemu_irq_raise(s->irq);
+
+ if (unlocked) {
+ qemu_mutex_unlock_iothread();
+ }
+}
+
+static void rtc_lower_irq(RTCState *s)
+{
+ bool unlocked = !qemu_mutex_iothread_locked();
+
+ if (unlocked) {
+ qemu_mutex_lock_iothread();
+ }
+
+ qemu_irq_lower(s->irq);
+
+ if (unlocked) {
+ qemu_mutex_unlock_iothread();
+ }
+}
+
static QLIST_HEAD(, RTCState) rtc_devices =
QLIST_HEAD_INITIALIZER(rtc_devices);
@@ -141,7 +172,7 @@ void qmp_rtc_reset_reinjection(Error **errp)
static bool rtc_policy_slew_deliver_irq(RTCState *s)
{
apic_reset_irq_delivered();
- qemu_irq_raise(s->irq);
+ rtc_rasie_irq(s);
return apic_get_irq_delivered();
}
@@ -277,8 +308,9 @@ static void rtc_periodic_timer(void *opaque)
DPRINTF_C("cmos: coalesced irqs increased to %d\n",
s->irq_coalesced);
}
- } else
- qemu_irq_raise(s->irq);
+ } else {
+ rtc_rasie_irq(s);
+ }
}
}
@@ -459,7 +491,7 @@ static void rtc_update_timer(void *opaque)
s->cmos_data[RTC_REG_C] |= irqs;
if ((new_irqs & s->cmos_data[RTC_REG_B]) != 0) {
s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
- qemu_irq_raise(s->irq);
+ rtc_rasie_irq(s);
}
check_update_timer(s);
}
@@ -471,6 +503,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr,
uint32_t old_period;
bool update_periodic_timer;
+ qemu_mutex_lock(&s->rtc_lock);
if ((addr & 1) == 0) {
s->cmos_index = data & 0x7f;
} else {
@@ -560,10 +593,10 @@ static void cmos_ioport_write(void *opaque, hwaddr addr,
* becomes enabled, raise an interrupt immediately. */
if (data & s->cmos_data[RTC_REG_C] & REG_C_MASK) {
s->cmos_data[RTC_REG_C] |= REG_C_IRQF;
- qemu_irq_raise(s->irq);
+ rtc_rasie_irq(s);
} else {
s->cmos_data[RTC_REG_C] &= ~REG_C_IRQF;
- qemu_irq_lower(s->irq);
+ rtc_lower_irq(s);
}
s->cmos_data[RTC_REG_B] = data;
@@ -583,6 +616,7 @@ static void cmos_ioport_write(void *opaque, hwaddr addr,
break;
}
}
+ qemu_mutex_unlock(&s->rtc_lock);
}
static inline int rtc_to_bcd(RTCState *s, int a)
@@ -710,6 +744,7 @@ static uint64_t cmos_ioport_read(void *opaque, hwaddr addr,
if ((addr & 1) == 0) {
return 0xff;
} else {
+ qemu_mutex_lock(&s->rtc_lock);
switch(s->cmos_index) {
case RTC_IBM_PS2_CENTURY_BYTE:
s->cmos_index = RTC_CENTURY;
@@ -737,7 +772,7 @@ static uint64_t cmos_ioport_read(void *opaque, hwaddr addr,
break;
case RTC_REG_C:
ret = s->cmos_data[s->cmos_index];
- qemu_irq_lower(s->irq);
+ rtc_lower_irq(s);
s->cmos_data[RTC_REG_C] = 0x00;
if (ret & (REG_C_UF | REG_C_AF)) {
check_update_timer(s);
@@ -762,6 +797,7 @@ static uint64_t cmos_ioport_read(void *opaque, hwaddr addr,
}
CMOS_DPRINTF("cmos: read index=0x%02x val=0x%02x\n",
s->cmos_index, ret);
+ qemu_mutex_unlock(&s->rtc_lock);
return ret;
}
}
@@ -909,7 +945,7 @@ static void rtc_reset(void *opaque)
s->cmos_data[RTC_REG_C] &= ~(REG_C_UF | REG_C_IRQF | REG_C_PF | REG_C_AF);
check_update_timer(s);
- qemu_irq_lower(s->irq);
+ rtc_lower_irq(s);
if (s->lost_tick_policy == LOST_TICK_POLICY_SLEW) {
s->irq_coalesced = 0;
@@ -960,6 +996,8 @@ static void rtc_realizefn(DeviceState *dev, Error **errp)
rtc_set_date_from_host(isadev);
+ qemu_mutex_init(&s->rtc_lock);
+
switch (s->lost_tick_policy) {
#ifdef TARGET_I386
case LOST_TICK_POLICY_SLEW:
@@ -986,6 +1024,7 @@ static void rtc_realizefn(DeviceState *dev, Error **errp)
qemu_register_suspend_notifier(&s->suspend_notifier);
memory_region_init_io(&s->io, OBJECT(s), &cmos_ops, s, "rtc", 2);
+ memory_region_clear_global_locking(&s->io);
isa_register_ioport(isadev, &s->io, base);
qdev_set_legacy_instance_id(dev, base, 3);
--
1.8.3.1
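For readers skimming the thread, the locking pattern the patch relies on can
be summarized in device-agnostic form as below. This is only a minimal
sketch built from the same 2018-era QEMU APIs that appear in the diff
(qemu_mutex_iothread_locked(), memory_region_clear_global_locking(), etc.);
the MyDevState / my_dev_* names and the fake "bit 7 raises an interrupt"
register semantics are illustrative and not part of the patch:

#include "qemu/osdep.h"
#include "qemu/main-loop.h"   /* qemu_mutex_iothread_locked(), qemu_mutex_lock_iothread() */
#include "qemu/thread.h"      /* QemuMutex */
#include "exec/memory.h"      /* MemoryRegion, memory_region_clear_global_locking() */
#include "hw/irq.h"           /* qemu_irq_raise() */

typedef struct MyDevState {
    MemoryRegion io;
    QemuMutex lock;     /* serializes vCPU accesses to the device registers */
    qemu_irq irq;
    uint8_t regs[0x80];
} MyDevState;

/* The interrupt controller behind the IRQ line still expects the BQL, so
 * take it around qemu_irq_raise() -- but only if this thread does not
 * already hold it (e.g. a timer callback running in the main loop).
 */
static void my_dev_raise_irq(MyDevState *s)
{
    bool unlocked = !qemu_mutex_iothread_locked();

    if (unlocked) {
        qemu_mutex_lock_iothread();
    }
    qemu_irq_raise(s->irq);
    if (unlocked) {
        qemu_mutex_unlock_iothread();
    }
}

/* Once the region opts out of the global lock, this handler can run on
 * several vCPU threads concurrently, so it serializes on the device mutex.
 */
static void my_dev_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    MyDevState *s = opaque;

    qemu_mutex_lock(&s->lock);
    s->regs[addr & 0x7f] = val;
    if (val & 0x80) {                 /* pretend bit 7 requests an interrupt */
        my_dev_raise_irq(s);
    }
    qemu_mutex_unlock(&s->lock);
}

static const MemoryRegionOps my_dev_ops = {
    .write = my_dev_write,
    /* .read omitted for brevity */
    .endianness = DEVICE_LITTLE_ENDIAN,
};

static void my_dev_init_mmio(MyDevState *s, Object *owner,
                             MemoryRegion *parent, hwaddr base)
{
    qemu_mutex_init(&s->lock);
    memory_region_init_io(&s->io, owner, &my_dev_ops, s, "my-dev", 0x80);
    /* Dispatch accesses to this region without taking the BQL. */
    memory_region_clear_global_locking(&s->io);
    memory_region_add_subregion(parent, base, &s->io);
}

The key point is that the per-device mutex only protects the device's own
state; anything that can re-enter code which still assumes the BQL (here,
the IRQ line) has to take the BQL conditionally, exactly as
rtc_rasie_irq()/rtc_lower_irq() do in the patch.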
On 6 February 2018 at 14:07, Gonglei <arei.gonglei@huawei.com> wrote:
> Windows guests use the RTC as their clock source device and access it
> frequently. Move the RTC memory region outside the BQL to reduce
> overhead for Windows guests. Meanwhile, add a new lock so that
> different vCPUs cannot access the RTC concurrently.
> [...]
> The number of futex calls decreases by ~90.6% on an idle Windows 7 guest.

These are the same figures as from v1 -- it would be interesting
to check whether the additional locking that v2 adds has affected
the results.

Does the patch improve performance in a more interesting use
case than "the guest is just idle"?

> +static void rtc_rasie_irq(RTCState *s)

Typo: should be "raise".

thanks
-- PMM
> -----Original Message-----
> From: Peter Maydell [mailto:peter.maydell@linaro.org]
> Sent: Tuesday, February 06, 2018 10:36 PM
> To: Gonglei (Arei)
> Cc: QEMU Developers; Paolo Bonzini; Huangweidong (C)
> Subject: Re: [PATCH v2] rtc: placing RTC memory region outside BQL
>
> [...]
>
> These are the same figures as from v1 -- it would be interesting
> to check whether the additional locking that v2 adds has affected
> the results.

Oh, yes. The futex count doesn't decline as much with v2 as it did with
v1, because v2 now takes the BQL before raising the outbound IRQ line.

Before applying v2:

# ./strace_c.sh 8776 30
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 78.01    0.164188          26      6436           ppoll
  8.39    0.017650           5      3700        39 futex
  7.68    0.016157           6      2758           ioctl
  5.48    0.011530           3      4586      1113 read
  0.30    0.000640          20        32           io_submit
  0.15    0.000317           4        89           write
------ ----------- ----------- --------- --------- ----------------
100.00    0.210482                 17601      1152 total

After applying v2:

# ./strace_c.sh 15968 30
% time     seconds  usecs/call     calls    errors syscall
------ ----------- ----------- --------- --------- ----------------
 78.28    0.171117          27      6272           ppoll
  8.50    0.018571           5      3663        21 futex
  7.76    0.016973           6      2732           ioctl
  4.85    0.010597           3      4115       853 read
  0.31    0.000672          11        63           io_submit
  0.30    0.000659           4       180           write
------ ----------- ----------- --------- --------- ----------------
100.00    0.218589                 17025       874 total

> Does the patch improve performance in a more interesting use
> case than "the guest is just idle"?

I think so; after all, the scope of the locking is reduced. Besides
this, can we optimize the RTC timers to avoid holding the BQL, for
example by running them in separate threads?

> > +static void rtc_rasie_irq(RTCState *s)
>
> Typo: should be "raise".

Good catch. :)

Thanks,
-Gonglei