The following changes since commit 1214d55d1c41fbab3a9973a05085b8760647e411:

  Merge remote-tracking branch 'remotes/nvme/tags/nvme-next-pull-request' into staging (2021-02-09 13:24:37 +0000)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to eb847c42296497978942f738cd41dc29a35a49b2:

  docs: fix Parallels Image "dirty bitmap" section (2021-02-10 09:23:28 +0000)

----------------------------------------------------------------
Pull request

v4:
 * Add PCI_EXPRESS Kconfig dependency to fix s390x in "multi-process: setup PCI
   host bridge for remote device" [Philippe and Thomas]

----------------------------------------------------------------

Denis V. Lunev (1):
  docs: fix Parallels Image "dirty bitmap" section

Elena Ufimtseva (8):
  multi-process: add configure and usage information
  io: add qio_channel_writev_full_all helper
  io: add qio_channel_readv_full_all_eof & qio_channel_readv_full_all
    helpers
  multi-process: define MPQemuMsg format and transmission functions
  multi-process: introduce proxy object
  multi-process: add proxy communication functions
  multi-process: Forward PCI config space acceses to the remote process
  multi-process: perform device reset in the remote process

Jagannathan Raman (11):
  memory: alloc RAM from file at offset
  multi-process: Add config option for multi-process QEMU
  multi-process: setup PCI host bridge for remote device
  multi-process: setup a machine object for remote device process
  multi-process: Initialize message handler in remote device
  multi-process: Associate fd of a PCIDevice with its object
  multi-process: setup memory manager for remote device
  multi-process: PCI BAR read/write handling for proxy & remote
    endpoints
  multi-process: Synchronize remote memory
  multi-process: create IOHUB object to handle irq
  multi-process: Retrieve PCI info from remote process

John G Johnson (1):
  multi-process: add the concept description to
    docs/devel/qemu-multiprocess

Stefan Hajnoczi (6):
  .github: point Repo Lockdown bot to GitLab repo
  gitmodules: use GitLab repos instead of qemu.org
  gitlab-ci: remove redundant GitLab repo URL command
  docs: update README to use GitLab repo URLs
  pc-bios: update mirror URLs to GitLab
  get_maintainer: update repo URL to GitLab

 MAINTAINERS | 24 +
 README.rst | 4 +-
 docs/devel/index.rst | 1 +
 docs/devel/multi-process.rst | 966 ++++++++++++++++++++++
 docs/system/index.rst | 1 +
 docs/system/multi-process.rst | 64 ++
 docs/interop/parallels.txt | 2 +-
 configure | 10 +
 meson.build | 5 +-
 hw/remote/trace.h | 1 +
 include/exec/memory.h | 2 +
 include/exec/ram_addr.h | 4 +-
 include/hw/pci-host/remote.h | 30 +
 include/hw/pci/pci_ids.h | 3 +
 include/hw/remote/iohub.h | 42 +
 include/hw/remote/machine.h | 38 +
 include/hw/remote/memory.h | 19 +
 include/hw/remote/mpqemu-link.h | 99 +++
 include/hw/remote/proxy-memory-listener.h | 28 +
 include/hw/remote/proxy.h | 48 ++
 include/io/channel.h | 78 ++
 include/qemu/mmap-alloc.h | 4 +-
 include/sysemu/iothread.h | 6 +
 backends/hostmem-memfd.c | 2 +-
 hw/misc/ivshmem.c | 3 +-
 hw/pci-host/remote.c | 75 ++
 hw/remote/iohub.c | 119 +++
 hw/remote/machine.c | 80 ++
 hw/remote/memory.c | 65 ++
 hw/remote/message.c | 230 ++++++
 hw/remote/mpqemu-link.c | 267 ++++++
 hw/remote/proxy-memory-listener.c | 227 +++++
 hw/remote/proxy.c | 379 +++++++++
 hw/remote/remote-obj.c | 203 +++++
 io/channel.c | 116 ++-
 iothread.c | 6 +
 softmmu/memory.c | 3 +-
 softmmu/physmem.c | 12 +-
 util/mmap-alloc.c | 8 +-
 util/oslib-posix.c | 2 +-
 .github/lockdown.yml | 8 +-
 .gitlab-ci.yml | 1 -
 .gitmodules | 44 +-
 Kconfig.host | 4 +
 hw/Kconfig | 1 +
 hw/meson.build | 1 +
 hw/pci-host/Kconfig | 3 +
 hw/pci-host/meson.build | 1 +
 hw/remote/Kconfig | 4 +
 hw/remote/meson.build | 13 +
 hw/remote/trace-events | 4 +
 pc-bios/README | 4 +-
 scripts/get_maintainer.pl | 2 +-
 53 files changed, 3296 insertions(+), 70 deletions(-)
 create mode 100644 docs/devel/multi-process.rst
 create mode 100644 docs/system/multi-process.rst
 create mode 100644 hw/remote/trace.h
 create mode 100644 include/hw/pci-host/remote.h
 create mode 100644 include/hw/remote/iohub.h
 create mode 100644 include/hw/remote/machine.h
 create mode 100644 include/hw/remote/memory.h
 create mode 100644 include/hw/remote/mpqemu-link.h
 create mode 100644 include/hw/remote/proxy-memory-listener.h
 create mode 100644 include/hw/remote/proxy.h
 create mode 100644 hw/pci-host/remote.c
 create mode 100644 hw/remote/iohub.c
 create mode 100644 hw/remote/machine.c
 create mode 100644 hw/remote/memory.c
 create mode 100644 hw/remote/message.c
 create mode 100644 hw/remote/mpqemu-link.c
 create mode 100644 hw/remote/proxy-memory-listener.c
 create mode 100644 hw/remote/proxy.c
 create mode 100644 hw/remote/remote-obj.c
 create mode 100644 hw/remote/Kconfig
 create mode 100644 hw/remote/meson.build
 create mode 100644 hw/remote/trace-events

--
2.29.2
Use the GitLab repo URL as the main repo location in order to reduce
load on qemu.org.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Message-id: 20210111115017.156802-2-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 .github/lockdown.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/lockdown.yml b/.github/lockdown.yml
index XXXXXXX..XXXXXXX 100644
--- a/.github/lockdown.yml
+++ b/.github/lockdown.yml
@@ -XXX,XX +XXX,XX @@ issues:
 comment: |
 Thank you for your interest in the QEMU project.

- This repository is a read-only mirror of the project's master
- repostories hosted on https://git.qemu.org/git/qemu.git.
+ This repository is a read-only mirror of the project's repostories hosted
+ at https://gitlab.com/qemu-project/qemu.git.
 The project does not process issues filed on GitHub.

 The project issues are tracked on Launchpad:
@@ -XXX,XX +XXX,XX @@ pulls:
 comment: |
 Thank you for your interest in the QEMU project.

- This repository is a read-only mirror of the project's master
- repostories hosted on https://git.qemu.org/git/qemu.git.
+ This repository is a read-only mirror of the project's repostories hosted
+ on https://gitlab.com/qemu-project/qemu.git.
 The project does not process merge requests filed on GitHub.

 QEMU welcomes contributions of code (either fixing bugs or adding new
--
2.29.2
qemu.org is running out of bandwidth and the QEMU project is moving
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
(they will become mirrors).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210111115017.156802-3-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 .gitmodules | 44 ++++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index XXXXXXX..XXXXXXX 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -XXX,XX +XXX,XX @@
 [submodule "roms/seabios"]
     path = roms/seabios
-    url = https://git.qemu.org/git/seabios.git/
+    url = https://gitlab.com/qemu-project/seabios.git/
 [submodule "roms/SLOF"]
     path = roms/SLOF
-    url = https://git.qemu.org/git/SLOF.git
+    url = https://gitlab.com/qemu-project/SLOF.git
 [submodule "roms/ipxe"]
     path = roms/ipxe
-    url = https://git.qemu.org/git/ipxe.git
+    url = https://gitlab.com/qemu-project/ipxe.git
 [submodule "roms/openbios"]
     path = roms/openbios
-    url = https://git.qemu.org/git/openbios.git
+    url = https://gitlab.com/qemu-project/openbios.git
 [submodule "roms/qemu-palcode"]
     path = roms/qemu-palcode
-    url = https://git.qemu.org/git/qemu-palcode.git
+    url = https://gitlab.com/qemu-project/qemu-palcode.git
 [submodule "roms/sgabios"]
     path = roms/sgabios
-    url = https://git.qemu.org/git/sgabios.git
+    url = https://gitlab.com/qemu-project/sgabios.git
 [submodule "dtc"]
     path = dtc
-    url = https://git.qemu.org/git/dtc.git
+    url = https://gitlab.com/qemu-project/dtc.git
 [submodule "roms/u-boot"]
     path = roms/u-boot
-    url = https://git.qemu.org/git/u-boot.git
+    url = https://gitlab.com/qemu-project/u-boot.git
 [submodule "roms/skiboot"]
     path = roms/skiboot
-    url = https://git.qemu.org/git/skiboot.git
+    url = https://gitlab.com/qemu-project/skiboot.git
 [submodule "roms/QemuMacDrivers"]
     path = roms/QemuMacDrivers
-    url = https://git.qemu.org/git/QemuMacDrivers.git
+    url = https://gitlab.com/qemu-project/QemuMacDrivers.git
 [submodule "ui/keycodemapdb"]
     path = ui/keycodemapdb
-    url = https://git.qemu.org/git/keycodemapdb.git
+    url = https://gitlab.com/qemu-project/keycodemapdb.git
 [submodule "capstone"]
     path = capstone
-    url = https://git.qemu.org/git/capstone.git
+    url = https://gitlab.com/qemu-project/capstone.git
 [submodule "roms/seabios-hppa"]
     path = roms/seabios-hppa
-    url = https://git.qemu.org/git/seabios-hppa.git
+    url = https://gitlab.com/qemu-project/seabios-hppa.git
 [submodule "roms/u-boot-sam460ex"]
     path = roms/u-boot-sam460ex
-    url = https://git.qemu.org/git/u-boot-sam460ex.git
+    url = https://gitlab.com/qemu-project/u-boot-sam460ex.git
 [submodule "tests/fp/berkeley-testfloat-3"]
     path = tests/fp/berkeley-testfloat-3
-    url = https://git.qemu.org/git/berkeley-testfloat-3.git
+    url = https://gitlab.com/qemu-project/berkeley-testfloat-3.git
 [submodule "tests/fp/berkeley-softfloat-3"]
     path = tests/fp/berkeley-softfloat-3
-    url = https://git.qemu.org/git/berkeley-softfloat-3.git
+    url = https://gitlab.com/qemu-project/berkeley-softfloat-3.git
 [submodule "roms/edk2"]
     path = roms/edk2
-    url = https://git.qemu.org/git/edk2.git
+    url = https://gitlab.com/qemu-project/edk2.git
 [submodule "slirp"]
     path = slirp
-    url = https://git.qemu.org/git/libslirp.git
+    url = https://gitlab.com/qemu-project/libslirp.git
 [submodule "roms/opensbi"]
     path = roms/opensbi
-    url =     https://git.qemu.org/git/opensbi.git
+    url =     https://gitlab.com/qemu-project/opensbi.git
 [submodule "roms/qboot"]
     path = roms/qboot
-    url = https://git.qemu.org/git/qboot.git
+    url = https://gitlab.com/qemu-project/qboot.git
 [submodule "meson"]
     path = meson
-    url = https://git.qemu.org/git/meson.git
+    url = https://gitlab.com/qemu-project/meson.git
 [submodule "roms/vbootrom"]
     path = roms/vbootrom
-    url = https://git.qemu.org/git/vbootrom.git
+    url = https://gitlab.com/qemu-project/vbootrom.git
--
2.29.2
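(A practical note for existing checkouts, not part of the patch: git only
copies URLs out of .gitmodules into .git/config when a submodule is
initialized, so an already-cloned tree keeps the old qemu.org URLs until
"git submodule sync --recursive" is run; following that with
"git submodule update --init --recursive" fetches from the GitLab mirrors.)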
It is no longer necessary to point .gitmodules at GitLab repos when
running in GitLab CI since they are now used all the time.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210111115017.156802-4-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 .gitlab-ci.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index XXXXXXX..XXXXXXX 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -XXX,XX +XXX,XX @@ include:
 image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest
 before_script:
 - JOBS=$(expr $(nproc) + 1)
- - sed -i s,git.qemu.org/git,gitlab.com/qemu-project, .gitmodules
 script:
 - mkdir build
 - cd build
--
2.29.2
qemu.org is running out of bandwidth and the QEMU project is moving
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
(they will become mirrors).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210111115017.156802-5-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 README.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.rst b/README.rst
index XXXXXXX..XXXXXXX 100644
--- a/README.rst
+++ b/README.rst
@@ -XXX,XX +XXX,XX @@ The QEMU source code is maintained under the GIT version control system.

 .. code-block:: shell

- git clone https://git.qemu.org/git/qemu.git
+ git clone https://gitlab.com/qemu-project/qemu.git

 When submitting patches, one common approach is to use 'git
 format-patch' and/or 'git send-email' to format & send the mail to the
@@ -XXX,XX +XXX,XX @@ The QEMU website is also maintained under source control.

 .. code-block:: shell

- git clone https://git.qemu.org/git/qemu-web.git
+ git clone https://gitlab.com/qemu-project/qemu-web.git

 * `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_

--
2.29.2
qemu.org is running out of bandwidth and the QEMU project is moving
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
(they will become mirrors).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210111115017.156802-6-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 pc-bios/README | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pc-bios/README b/pc-bios/README
index XXXXXXX..XXXXXXX 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -XXX,XX +XXX,XX @@
 legacy x86 software to communicate with an attached serial console as
 if a video card were attached. The master sources reside in a subversion
 repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is
- available at https://git.qemu.org/git/sgabios.git.
+ available at https://gitlab.com/qemu-project/sgabios.git.

- The PXE roms come from the iPXE project. Built with BANNER_TIME 0.
 Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping:
@@ -XXX,XX +XXX,XX @@

- The u-boot binary for e500 comes from the upstream denx u-boot project where
 it was compiled using the qemu-ppce500 target.
- A git mirror is available at: https://git.qemu.org/git/u-boot.git
+ A git mirror is available at: https://gitlab.com/qemu-project/u-boot.git
 The hash used to compile the current version is: 2072e72

- Skiboot (https://github.com/open-power/skiboot/) is an OPAL
--
2.29.2
qemu.org is running out of bandwidth and the QEMU project is moving
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
(they will become mirrors).

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
Reviewed-by: Thomas Huth <thuth@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
Message-id: 20210111115017.156802-7-stefanha@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 scripts/get_maintainer.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index XXXXXXX..XXXXXXX 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -XXX,XX +XXX,XX @@ sub vcs_exists {
    warn("$P: No supported VCS found. Add --nogit to options?\n");
    warn("Using a git repository produces better results.\n");
    warn("Try latest git repository using:\n");
-    warn("git clone https://git.qemu.org/git/qemu.git\n");
+    warn("git clone https://gitlab.com/qemu-project/qemu.git\n");
    $printed_novcs = 1;
 }
 return 0;
--
2.29.2
From: John G Johnson <john.g.johnson@oracle.com>

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 02a68adef99f5df6a380bf8fd7b90948777e411c.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS | 7 +
 docs/devel/index.rst | 1 +
 docs/devel/multi-process.rst | 966 +++++++++++++++++++++++++++++
 3 files changed, 974 insertions(+)
 create mode 100644 docs/devel/multi-process.rst

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ S: Maintained
 F: hw/semihosting/
 F: include/hw/semihosting/

+Multi-process QEMU
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
+M: Jagannathan Raman <jag.raman@oracle.com>
+M: John G Johnson <john.g.johnson@oracle.com>
+S: Maintained
+F: docs/devel/multi-process.rst
+
 Build and test automation
 -------------------------
 Build and test automation
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
index XXXXXXX..XXXXXXX 100644
--- a/docs/devel/index.rst
+++ b/docs/devel/index.rst
@@ -XXX,XX +XXX,XX @@ Contents:
 clocks
 qom
 block-coroutine-wrapper
+ multi-process
diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/docs/devel/multi-process.rst
@@ -XXX,XX +XXX,XX @@
+This is the design document for multi-process QEMU. It does not
+necessarily reflect the status of the current implementation, which
+may lack features or be considerably different from what is described
+in this document. This document is still useful as a description of
+the goals and general direction of this feature.
+
+Please refer to the following wiki for latest details:
+https://wiki.qemu.org/Features/MultiProcessQEMU
+
+Multi-process QEMU
+===================
+
+QEMU is often used as the hypervisor for virtual machines running in the
+Oracle cloud. Since one of the advantages of cloud computing is the
+ability to run many VMs from different tenants in the same cloud
+infrastructure, a guest that compromised its hypervisor could
+potentially use the hypervisor's access privileges to access data it is
+not authorized for.
+
+QEMU can be susceptible to security attacks because it is a large,
+monolithic program that provides many features to the VMs it services.
+Many of these features can be configured out of QEMU, but even a reduced
+configuration QEMU has a large amount of code a guest can potentially
+attack. Separating QEMU reduces the attack surface by aiding to
+limit each component in the system to only access the resources that
+it needs to perform its job.
+
+QEMU services
+-------------
+
+QEMU can be broadly described as providing three main services. One is a
+VM control point, where VMs can be created, migrated, re-configured, and
+destroyed. A second is to emulate the CPU instructions within the VM,
+often accelerated by HW virtualization features such as Intel's VT
+extensions. Finally, it provides IO services to the VM by emulating HW
+IO devices, such as disk and network devices.
+
+A multi-process QEMU
+~~~~~~~~~~~~~~~~~~~~
+
+A multi-process QEMU involves separating QEMU services into separate
+host processes. Each of these processes can be given only the privileges
+it needs to provide its service, e.g., a disk service could be given
+access only to the disk images it provides, and not be allowed to
+access other files, or any network devices. An attacker who compromised
+this service would not be able to use this exploit to access files or
+devices beyond what the disk service was given access to.
+
+A QEMU control process would remain, but in multi-process mode, will
+have no direct interfaces to the VM. During VM execution, it would still
+provide the user interface to hot-plug devices or live migrate the VM.
+
+A first step in creating a multi-process QEMU is to separate IO services
+from the main QEMU program, which would continue to provide CPU
+emulation. i.e., the control process would also be the CPU emulation
+process. In a later phase, CPU emulation could be separated from the
+control process.
+
+Separating IO services
+----------------------
+
+Separating IO services into individual host processes is a good place to
+begin for a couple of reasons. One is the sheer number of IO devices QEMU
+can emulate provides a large surface of interfaces which could potentially
+be exploited, and, indeed, have been a source of exploits in the past.
+Another is the modular nature of QEMU device emulation code provides
+interface points where the QEMU functions that perform device emulation
+can be separated from the QEMU functions that manage the emulation of
+guest CPU instructions. The devices emulated in the separate process are
+referred to as remote devices.
+
+QEMU device emulation
+~~~~~~~~~~~~~~~~~~~~~
+
+QEMU uses an object oriented SW architecture for device emulation code.
+Configured objects are all compiled into the QEMU binary, then objects
+are instantiated by name when used by the guest VM. For example, the
+code to emulate a device named "foo" is always present in QEMU, but its
+instantiation code is only run when the device is included in the target
+VM. (e.g., via the QEMU command line as *-device foo*)
+
+The object model is hierarchical, so device emulation code names its
+parent object (such as "pci-device" for a PCI device) and QEMU will
+instantiate a parent object before calling the device's instantiation
+code.
+
+Current separation models
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to separate the device emulation code from the CPU emulation
+code, the device object code must run in a different process. There are
+a couple of existing QEMU features that can run emulation code
+separately from the main QEMU process. These are examined below.
+
+vhost user model
+^^^^^^^^^^^^^^^^
+
+Virtio guest device drivers can be connected to vhost user applications
+in order to perform their IO operations. This model uses special virtio
+device drivers in the guest and vhost user device objects in QEMU, but
+once the QEMU vhost user code has configured the vhost user application,
+mission-mode IO is performed by the application. The vhost user
+application is a daemon process that can be contacted via a known UNIX
+domain socket.
+
+vhost socket
+''''''''''''
+
+As mentioned above, one of the tasks of the vhost device object within
+QEMU is to contact the vhost application and send it configuration
+information about this device instance. As part of the configuration
+process, the application can also be sent other file descriptors over
+the socket, which then can be used by the vhost user application in
+various ways, some of which are described below.
+
+vhost MMIO store acceleration
+'''''''''''''''''''''''''''''
+
+VMs are often run using HW virtualization features via the KVM kernel
+driver. This driver allows QEMU to accelerate the emulation of guest CPU
+instructions by running the guest in a virtual HW mode. When the guest
+executes instructions that cannot be executed by virtual HW mode,
+execution returns to the KVM driver so it can inform QEMU to emulate the
+instructions in SW.
+
+One of the events that can cause a return to QEMU is when a guest device
+driver accesses an IO location. QEMU then dispatches the memory
+operation to the corresponding QEMU device object. In the case of a
+vhost user device, the memory operation would need to be sent over a
+socket to the vhost application. This path is accelerated by the QEMU
+virtio code by setting up an eventfd file descriptor that the vhost
+application can directly receive MMIO store notifications from the KVM
+driver, instead of needing them to be sent to the QEMU process first.
+
+vhost interrupt acceleration
+''''''''''''''''''''''''''''
+
+Another optimization used by the vhost application is the ability to
+directly inject interrupts into the VM via the KVM driver, again,
+bypassing the need to send the interrupt back to the QEMU process first.
+The QEMU virtio setup code configures the KVM driver with an eventfd
+that triggers the device interrupt in the guest when the eventfd is
+written. This irqfd file descriptor is then passed to the vhost user
+application program.
+
+vhost access to guest memory
+''''''''''''''''''''''''''''
+
+The vhost application is also allowed to directly access guest memory,
+instead of needing to send the data as messages to QEMU. This is also
+done with file descriptors sent to the vhost user application by QEMU.
+These descriptors can be passed to ``mmap()`` by the vhost application
+to map the guest address space into the vhost application.
+
+IOMMUs introduce another level of complexity, since the address given to
+the guest virtio device to DMA to or from is not a guest physical
+address. This case is handled by having vhost code within QEMU register
+as a listener for IOMMU mapping changes. The vhost application maintains
+a cache of IOMMMU translations: sending translation requests back to
+QEMU on cache misses, and in turn receiving flush requests from QEMU
+when mappings are purged.
+
+applicability to device separation
+''''''''''''''''''''''''''''''''''
+
+Much of the vhost model can be re-used by separated device emulation. In
+particular, the ideas of using a socket between QEMU and the device
+emulation application, using a file descriptor to inject interrupts into
+the VM via KVM, and allowing the application to ``mmap()`` the guest
+should be re used.
+
+There are, however, some notable differences between how a vhost
+application works and the needs of separated device emulation. The most
+basic is that vhost uses custom virtio device drivers which always
+trigger IO with MMIO stores. A separated device emulation model must
+work with existing IO device models and guest device drivers. MMIO loads
+break vhost store acceleration since they are synchronous - guest
+progress cannot continue until the load has been emulated. By contrast,
+stores are asynchronous, the guest can continue after the store event
+has been sent to the vhost application.
+
+Another difference is that in the vhost user model, a single daemon can
+support multiple QEMU instances. This is contrary to the security regime
+desired, in which the emulation application should only be allowed to
+access the files or devices the VM it's running on behalf of can access.
+#### qemu-io model
+
+Qemu-io is a test harness used to test changes to the QEMU block backend
+object code. (e.g., the code that implements disk images for disk driver
+emulation) Qemu-io is not a device emulation application per se, but it
+does compile the QEMU block objects into a separate binary from the main
+QEMU one. This could be useful for disk device emulation, since its
+emulation applications will need to include the QEMU block objects.
+
+New separation model based on proxy objects
+-------------------------------------------
+
+A different model based on proxy objects in the QEMU program
+communicating with remote emulation programs could provide separation
+while minimizing the changes needed to the device emulation code. The
+rest of this section is a discussion of how a proxy object model would
+work.
+
+Remote emulation processes
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The remote emulation process will run the QEMU object hierarchy without
+modification. The device emulation objects will be also be based on the
+QEMU code, because for anything but the simplest device, it would not be
+a tractable to re-implement both the object model and the many device
+backends that QEMU has.
+
+The processes will communicate with the QEMU process over UNIX domain
+sockets. The processes can be executed either as standalone processes,
+or be executed by QEMU. In both cases, the host backends the emulation
+processes will provide are specified on its command line, as they would
+be for QEMU. For example:
+
+::
+
+ disk-proc -blockdev driver=file,node-name=file0,filename=disk-file0 \
+ -blockdev driver=qcow2,node-name=drive0,file=file0
+
+would indicate process *disk-proc* uses a qcow2 emulated disk named
+*file0* as its backend.
+
+Emulation processes may emulate more than one guest controller. A common
+configuration might be to put all controllers of the same device class
+(e.g., disk, network, etc.) in a single process, so that all backends of
+the same type can be managed by a single QMP monitor.
+
+communication with QEMU
+^^^^^^^^^^^^^^^^^^^^^^^
+
+The first argument to the remote emulation process will be a Unix domain
+socket that connects with the Proxy object. This is a required argument.
+
+::
+
+ disk-proc <socket number> <backend list>
+
+remote process QMP monitor
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Remote emulation processes can be monitored via QMP, similar to QEMU
+itself. The QMP monitor socket is specified the same as for a QEMU
+process:
+
+::
+
+ disk-proc -qmp unix:/tmp/disk-mon,server
+
+can be monitored over the UNIX socket path */tmp/disk-mon*.
+
+QEMU command line
+~~~~~~~~~~~~~~~~~
+
+Each remote device emulated in a remote process on the host is
+represented as a *-device* of type *pci-proxy-dev*. A socket
+sub-option to this option specifies the Unix socket that connects
+to the remote process. An *id* sub-option is required, and it should
+be the same id as used in the remote process.
+
+::
+
+ qemu-system-x86_64 ... -device pci-proxy-dev,id=lsi0,socket=3
+
+can be used to add a device emulated in a remote process
+
+
+QEMU management of remote processes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QEMU is not aware of the type of type of the remote PCI device. It is
+a pass through device as far as QEMU is concerned.
+
+communication with emulation process
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+primary channel
+'''''''''''''''
+
+The primary channel (referred to as com in the code) is used to bootstrap
+the remote process. It is also used to pass on device-agnostic commands
+like reset.
+
+per-device channels
+'''''''''''''''''''
+
+Each remote device communicates with QEMU using a dedicated communication
+channel. The proxy object sets up this channel using the primary
+channel during its initialization.
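A minimal sketch of how such a per-device channel could be created (an
illustration only, not part of this patch; send_fd_over_com_channel() is a
hypothetical helper for the SCM_RIGHTS pass, socketpair() is plain POSIX):

    /* In the QEMU process: make a socket pair, keep one end as the
     * device's dedicated channel, and hand the other end to the remote
     * process as ancillary data on the primary (com) channel. */
    int fds[2];

    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
        return -errno;
    }
    dev->dev_fd = fds[0];              /* QEMU side of the channel */
    send_fd_over_com_channel(fds[1]);  /* hypothetical SCM_RIGHTS send */
    close(fds[1]);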
+
+QEMU device proxy objects
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QEMU has an object model based on sub-classes inherited from the
+"object" super-class. The sub-classes that are of interest here are the
+"device" and "bus" sub-classes whose child sub-classes make up the
+device tree of a QEMU emulated system.
+
+The proxy object model will use device proxy objects to replace the
+device emulation code within the QEMU process. These objects will live
+in the same place in the object and bus hierarchies as the objects they
+replace. i.e., the proxy object for an LSI SCSI controller will be a
+sub-class of the "pci-device" class, and will have the same PCI bus
+parent and the same SCSI bus child objects as the LSI controller object
+it replaces.
+
+It is worth noting that the same proxy object is used to mediate with
+all types of remote PCI devices.
+
+object initialization
+^^^^^^^^^^^^^^^^^^^^^
+
+The Proxy device objects are initialized in the exact same manner in
+which any other QEMU device would be initialized.
+
+In addition, the Proxy objects perform the following two tasks:
+- Parses the "socket" sub option and connects to the remote process
+using this channel
+- Uses the "id" sub-option to connect to the emulated device on the
+separate process
+
+class\_init
+'''''''''''
+
+The ``class_init()`` method of a proxy object will, in general behave
+similarly to the object it replaces, including setting any static
+properties and methods needed by the proxy.
+
+instance\_init / realize
+''''''''''''''''''''''''
+
+The ``instance_init()`` and ``realize()`` functions would only need to
+perform tasks related to being a proxy, such are registering its own
+MMIO handlers, or creating a child bus that other proxy devices can be
+attached to later.
+
+Other tasks will be device-specific. For example, PCI device objects
+will initialize the PCI config space in order to make a valid PCI device
+tree within the QEMU process.
+
+address space registration
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Most devices are driven by guest device driver accesses to IO addresses
+or ports. The QEMU device emulation code uses QEMU's memory region
+function calls (such as ``memory_region_init_io()``) to add callback
+functions that QEMU will invoke when the guest accesses the device's
+areas of the IO address space. When a guest driver does access the
+device, the VM will exit HW virtualization mode and return to QEMU,
+which will then lookup and execute the corresponding callback function.
+
+A proxy object would need to mirror the memory region calls the actual
+device emulator would perform in its initialization code, but with its
+own callbacks. When invoked by QEMU as a result of a guest IO operation,
+they will forward the operation to the device emulation process.
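For example, a proxy might register a BAR-sized MMIO region whose callbacks
marshal the access into a message (a sketch, not part of this patch;
ProxyDevice and the proxy_forward_*() helpers are hypothetical, while
memory_region_init_io() and MemoryRegionOps are the real QEMU API):

    static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
    {
        ProxyDevice *dev = opaque;

        /* Forward the load to the emulation process and wait for the
         * reply; loads are synchronous, so the guest vCPU is blocked
         * until the value comes back. */
        return proxy_forward_read(dev, addr, size);
    }

    static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
                                unsigned size)
    {
        ProxyDevice *dev = opaque;

        proxy_forward_write(dev, addr, val, size);
    }

    static const MemoryRegionOps proxy_bar_ops = {
        .read = proxy_bar_read,
        .write = proxy_bar_write,
        .endianness = DEVICE_NATIVE_ENDIAN,
    };

    memory_region_init_io(&dev->bar_mr, OBJECT(dev), &proxy_bar_ops,
                          dev, "proxy-bar", bar_size);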
+
+PCI config space
+^^^^^^^^^^^^^^^^
+
+PCI devices also have a configuration space that can be accessed by the
+guest driver. Guest accesses to this space is not handled by the device
+emulation object, but by its PCI parent object. Much of this space is
+read-only, but certain registers (especially BAR and MSI-related ones)
+need to be propagated to the emulation process.
+
+PCI parent proxy
+''''''''''''''''
+
+One way to propagate guest PCI config accesses is to create a
+"pci-device-proxy" class that can serve as the parent of a PCI device
+proxy object. This class's parent would be "pci-device" and it would
+override the PCI parent's ``config_read()`` and ``config_write()``
+methods with ones that forward these operations to the emulation
+program.
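In QOM terms the override could be as small as this sketch (not part of the
patch; the forwarding functions are hypothetical, but the config_read and
config_write hooks exist on QEMU's PCIDeviceClass):

    static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
    {
        PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

        /* Forward guest config space accesses to the emulation process. */
        k->config_read = proxy_pci_read_config;
        k->config_write = proxy_pci_write_config;
    }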
+
+interrupt receipt
+^^^^^^^^^^^^^^^^^
+
+A proxy for a device that generates interrupts will need to create a
+socket to receive interrupt indications from the emulation process. An
+incoming interrupt indication would then be sent up to its bus parent to
+be injected into the guest. For example, a PCI device object may use
+``pci_set_irq()``.
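A sketch of the receive side (illustrative only; ProxyDevice and intr_fd
are hypothetical, while pci_set_irq() and qemu_set_fd_handler() are real
QEMU APIs):

    static void proxy_intr_ready(void *opaque)
    {
        ProxyDevice *dev = opaque;
        uint8_t level;

        if (read(dev->intr_fd, &level, sizeof(level)) == sizeof(level)) {
            /* Assert or deassert INTx based on the remote's indication. */
            pci_set_irq(PCI_DEVICE(dev), level);
        }
    }

    qemu_set_fd_handler(dev->intr_fd, proxy_intr_ready, NULL, dev);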
+
+live migration
+^^^^^^^^^^^^^^
+
+The proxy will register to save and restore any *vmstate* it needs over
+a live migration event. The device proxy does not need to manage the
+remote device's *vmstate*; that will be handled by the remote process
+proxy (see below).
+
+QEMU remote device operation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Generic device operations, such as DMA, will be performed by the remote
+process proxy by sending messages to the remote process.
+
+DMA operations
+^^^^^^^^^^^^^^
+
+DMA operations would be handled much like vhost applications do. One of
+the initial messages sent to the emulation process is a guest memory
+table. Each entry in this table consists of a file descriptor and size
+that the emulation process can ``mmap()`` to directly access guest
+memory, similar to ``vhost_user_set_mem_table()``. Note guest memory
+must be backed by file descriptors, such as when QEMU is given the
+*-mem-path* command line option.
460
+
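+A sketch of a table entry under these assumptions (the names are
+illustrative, not an existing QEMU structure; the file descriptors
+themselves travel as ``SCM_RIGHTS`` ancillary data on the UNIX socket,
+one per entry, as in vhost-user)::
+
+    typedef struct {
+        uint64_t gpa;     /* guest physical address of the region */
+        uint64_t size;    /* region size in bytes */
+        uint64_t offset;  /* offset of the region within its fd */
+    } GuestMemEntry;
+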
461
+IOMMU operations
462
+^^^^^^^^^^^^^^^^
463
+
464
+When the emulated system includes an IOMMU, the remote process proxy in
465
+QEMU will need to create a socket for IOMMU requests from the emulation
466
+process. It will handle those requests with an
467
+``address_space_get_iotlb_entry()`` call. In order to handle IOMMU
468
+unmaps, the remote process proxy will also register as a listener on the
469
+device's DMA address space. When an IOMMU memory region is created
470
+within the DMA address space, an IOMMU notifier for unmaps will be added
471
+to the memory region that will forward unmaps to the emulation process
472
+over the IOMMU socket.
473
+
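+A sketch of the QEMU-side request handler, with the message layout and
+helper names assumed (the exact ``address_space_get_iotlb_entry()``
+signature also varies between QEMU versions)::
+
+    static void handle_iommu_request(RemoteProc *p, IOMMURequest *req)
+    {
+        /* Translate the device's DMA address to a guest PA */
+        IOMMUTLBEntry entry = address_space_get_iotlb_entry(
+            p->dma_as, req->addr, req->is_write, MEMTXATTRS_UNSPECIFIED);
+
+        /* Reply with the translated range and its permissions */
+        send_iommu_reply(p, entry.translated_addr, entry.addr_mask,
+                         entry.perm);
+    }
+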
474
+device hot-plug via QMP
475
+^^^^^^^^^^^^^^^^^^^^^^^
476
+
477
+An QMP "device\_add" command can add a device emulated by a remote
478
+process. It will also have "rid" option to the command, just as the
479
+*-device* command line option does. The remote process may either be one
480
+started at QEMU startup, or be one added by the "add-process" QMP
481
+command described above. In either case, the remote process proxy will
482
+forward the new device's JSON description to the corresponding emulation
483
+process.
484
+
485
+live migration
486
+^^^^^^^^^^^^^^
487
+
488
+The remote process proxy will also register for live migration
489
+notifications with ``vmstate_register()``. When called to save state,
490
+the proxy will send the remote process a secondary socket file
491
+descriptor to save the remote process's device *vmstate* over. The
492
+incoming byte stream length and data will be saved as the proxy's
493
+*vmstate*. When the proxy is resumed on its new host, this *vmstate*
494
+will be extracted, and a secondary socket file descriptor will be sent
495
+to the new remote process through which it receives the *vmstate* in
496
+order to restore the devices there.
497
+
498
+device emulation in remote process
499
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
500
+
501
+The parts of QEMU that the emulation program will need include the
502
+object model; the memory emulation objects; the device emulation objects
503
+of the targeted device, and any dependent devices; and the device's
504
+backends. It will also need code to setup the machine environment,
505
+handle requests from the QEMU process, and route machine-level requests
506
+(such as interrupts or IOMMU mappings) back to the QEMU process.
507
+
508
+initialization
509
+^^^^^^^^^^^^^^
510
+
511
+The process initialization sequence will mirror QEMU's: it will
+first initialize the backend objects, then the
513
+device emulation objects. The JSON descriptions sent by the QEMU process
514
+will drive which objects need to be created.
515
+
516
+- address spaces
517
+
518
+Before the device objects are created, the initial address spaces and
519
+memory regions must be configured with ``memory_map_init()``. This
520
+creates a RAM memory region object (*system\_memory*) and an IO memory
521
+region object (*system\_io*).
522
+
523
+- RAM
524
+
525
+RAM memory region creation will follow how ``pc_memory_init()`` creates
526
+them, but must use ``memory_region_init_ram_from_fd()`` instead of
527
+``memory_region_allocate_system_memory()``. The file descriptors needed
528
+will be supplied by the guest memory table from above. Those RAM regions
529
+would then be added to the *system\_memory* memory region with
530
+``memory_region_add_subregion()``.
531
+
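+A sketch of the per-region setup, assuming ``ment`` is one entry of the
+guest memory table received from QEMU (the ``offset`` parameter of
+``memory_region_init_ram_from_fd()`` is added later in this series)::
+
+    MemoryRegion *ram = g_new0(MemoryRegion, 1);
+
+    /* Map this guest RAM region from the fd QEMU sent us */
+    memory_region_init_ram_from_fd(ram, NULL, "remote-ram", ment->size,
+                                   true, ment->fd, ment->offset,
+                                   &error_fatal);
+    memory_region_add_subregion(get_system_memory(), ment->gpa, ram);
+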
532
+- PCI
533
+
534
+IO initialization will be driven by the JSON descriptions sent from the
535
+QEMU process. For a PCI device, a PCI bus will need to be created with
536
+``pci_root_bus_new()``, and a PCI memory region will need to be created
537
+and added to the *system\_memory* memory region with
538
+``memory_region_add_subregion_overlap()``. The overlap version is
539
+required for architectures where PCI memory overlaps with RAM memory.
540
+
541
+MMIO handling
542
+^^^^^^^^^^^^^
543
+
544
+The device emulation objects will use ``memory_region_init_io()`` to
545
+install their MMIO handlers, and ``pci_register_bar()`` to associate
546
+those handlers with a PCI BAR, as they do within QEMU currently.
547
+
548
+In order to use ``address_space_rw()`` in the emulation process to
549
+handle MMIO requests from QEMU, the PCI physical addresses must be the
550
+same in the QEMU process and the device emulation process. In order to
551
+accomplish that, guest BAR programming must also be forwarded from QEMU
552
+to the emulation process.
553
+
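+With that in place, replaying a forwarded MMIO message is a single call
+(the ``msg`` fields are assumptions of this sketch)::
+
+    /* Re-issue the guest access at the same physical address */
+    address_space_rw(&address_space_memory, msg->addr,
+                     MEMTXATTRS_UNSPECIFIED, &msg->data, msg->size,
+                     msg->is_write);
+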
554
+interrupt injection
555
+^^^^^^^^^^^^^^^^^^^
556
+
557
+When device emulation wants to inject an interrupt into the VM, the
558
+request climbs the device's bus object hierarchy until the point where a
559
+bus object knows how to signal the interrupt to the guest. The details
560
+depend on the type of interrupt being raised.
561
+
562
+- PCI pin interrupts
563
+
564
+On x86 systems, there is an emulated IOAPIC object attached to the root
565
+PCI bus object, and the root PCI object forwards interrupt requests to
566
+it. The IOAPIC object, in turn, calls the KVM driver to inject the
567
+corresponding interrupt into the VM. The simplest way to handle this in
568
+an emulation process would be to set up the root PCI bus driver (via
+``pci_bus_irqs()``) to send an interrupt request back to the QEMU
570
+process, and have the device proxy object reflect it up the PCI tree
571
+there.
572
+
573
+- PCI MSI/X interrupts
574
+
575
+PCI MSI/X interrupts are implemented in HW as DMA writes to a
576
+CPU-specific PCI address. In QEMU on x86, a KVM APIC object receives
577
+these DMA writes, then calls into the KVM driver to inject the interrupt
578
+into the VM. A simple emulation process implementation would be to send
579
+the MSI DMA address from QEMU as a message at initialization, then
580
+install an address space handler at that address which forwards the MSI
581
+message back to QEMU.
582
+
583
+DMA operations
584
+^^^^^^^^^^^^^^
585
+
586
+When an emulation object wants to DMA into or out of guest memory, it
+must first use ``dma_memory_map()`` to convert the DMA address to a local
+virtual address. The emulation process memory region objects set up above
589
+will be used to translate the DMA address to a local virtual address the
590
+device emulation code can access.
591
+
592
+IOMMU
593
+^^^^^
594
+
595
+When an IOMMU is in use in QEMU, DMA translation uses IOMMU memory
596
+regions to translate the DMA address to a guest physical address before
597
+that physical address can be translated to a local virtual address. The
598
+emulation process will need similar functionality.
599
+
600
+- IOTLB cache
601
+
602
+The emulation process will maintain a cache of recent IOMMU translations
603
+(the IOTLB). When the translate() callback of an IOMMU memory region is
604
+invoked, the IOTLB cache will be searched for an entry that will map the
605
+DMA address to a guest PA. On a cache miss, a message will be sent back
606
+to QEMU requesting the corresponding translation entry, which will both
+be used to return a guest address and be added to the cache.
608
+
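+A sketch of such a ``translate()`` callback, with the cache helpers
+hypothetical::
+
+    static IOMMUTLBEntry remote_iommu_translate(IOMMUMemoryRegion *mr,
+                                                hwaddr addr,
+                                                IOMMUAccessFlags flag,
+                                                int iommu_idx)
+    {
+        IOMMUTLBEntry *cached = iotlb_cache_lookup(mr, addr);
+
+        if (!cached) {
+            /* Miss: ask QEMU over the IOMMU socket, then cache the reply */
+            cached = iotlb_cache_insert(mr, request_translation(mr, addr));
+        }
+        return *cached;
+    }
+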
609
+- IOTLB purge
610
+
611
+The IOMMU emulation will also need to act on unmap requests from QEMU.
612
+These happen when the guest IOMMU driver purges an entry from the
613
+guest's translation table.
614
+
615
+live migration
616
+^^^^^^^^^^^^^^
617
+
618
+When a remote process receives a live migration indication from QEMU, it
619
+will set up a channel using the received file descriptor with
620
+``qio_channel_socket_new_fd()``. This channel will be used to create a
621
+*QEMUfile* that can be passed to ``qemu_save_device_state()`` to send
622
+the process's device state back to QEMU. This method will be reversed on
623
+restore - the channel will be passed to ``qemu_loadvm_state()`` to
624
+restore the device state.
625
+
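+A sketch of the save side; note the channel-to-``QEMUFile`` helper has
+been renamed across QEMU versions (``qemu_fopen_channel_output()`` is
+the name assumed here)::
+
+    QIOChannelSocket *ioc = qio_channel_socket_new_fd(fd, &error_fatal);
+    QEMUFile *f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
+
+    /* Stream this process's device state back to QEMU */
+    qemu_save_device_state(f);
+    qemu_fclose(f);
+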
626
+Accelerating device emulation
627
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
628
+
629
+The messages that are required to be sent between QEMU and the emulation
630
+process can add considerable latency to IO operations. The optimizations
631
+described below attempt to ameliorate this effect by allowing the
632
+emulation process to communicate directly with the kernel KVM driver.
633
+The KVM file descriptors created would be passed to the emulation process
634
+via initialization messages, much as the guest memory table is.
+
+MMIO acceleration
+^^^^^^^^^^^^^^^^^
636
+
637
+Vhost user applications can receive guest virtio driver stores directly
638
+from KVM. The issue with the eventfd mechanism used by vhost user is
639
+that it does not pass any data with the event indication, so it cannot
640
+handle guest loads or guest stores that carry store data. This concept
641
+could, however, be expanded to cover more cases.
642
+
643
+The expanded idea would require a new type of KVM device:
644
+*KVM\_DEV\_TYPE\_USER*. This device has two file descriptors: a master
645
+descriptor that QEMU can use for configuration, and a slave descriptor
646
+that the emulation process can use to receive MMIO notifications. QEMU
647
+would create both descriptors using the KVM driver, and pass the slave
648
+descriptor to the emulation process via an initialization message.
649
+
650
+data structures
651
+^^^^^^^^^^^^^^^
652
+
653
+- guest physical range
654
+
655
+The guest physical range structure describes the address range that a
656
+device will respond to. It includes the base and length of the range, as
657
+well as which bus the range resides on (e.g., on an x86 machine, it can
658
+specify whether the range refers to memory or IO addresses).
659
+
660
+A device can have multiple physical address ranges it responds to (e.g.,
661
+a PCI device can have multiple BARs), so the structure will also include
662
+an enumerated identifier to specify which of the device's ranges is
663
+being referred to.
664
+
665
++--------+----------------------------+
666
+| Name | Description |
667
++========+============================+
668
+| addr | range base address |
669
++--------+----------------------------+
670
+| len | range length |
671
++--------+----------------------------+
672
+| bus | addr type (memory or IO) |
673
++--------+----------------------------+
674
+| id | range ID (e.g., PCI BAR) |
675
++--------+----------------------------+
676
+
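+Expressed as a C structure, the range might look like the following
+(illustrative only; this is part of the proposed interface, not an
+existing KVM ABI)::
+
+    struct kvm_user_pa_range {
+        __u64 addr;  /* range base address */
+        __u64 len;   /* range length */
+        __u32 bus;   /* address type (memory or IO) */
+        __u32 id;    /* range ID (e.g., PCI BAR number) */
+    };
+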
677
+- MMIO request structure
678
+
679
+This structure describes an MMIO operation. It includes which guest
680
+physical range the MMIO was within, the offset within that range, the
681
+MMIO type (e.g., load or store), and its length and data. It also
682
+includes a sequence number that can be used to reply to the MMIO, and
683
+the CPU that issued the MMIO.
684
+
685
++----------+------------------------+
686
+| Name | Description |
687
++==========+========================+
688
+| rid | range MMIO is within |
689
++----------+------------------------+
690
+| offset   | offset within *rid*    |
691
++----------+------------------------+
692
+| type | e.g., load or store |
693
++----------+------------------------+
694
+| len | MMIO length |
695
++----------+------------------------+
696
+| data | store data |
697
++----------+------------------------+
698
+| seq | sequence ID |
699
++----------+------------------------+
700
+
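+And the corresponding C structure (again illustrative only)::
+
+    struct kvm_user_mmio_req {
+        __u32 rid;     /* which registered range the MMIO hit */
+        __u32 type;    /* load or store */
+        __u64 offset;  /* offset within the range */
+        __u32 len;     /* access length in bytes */
+        __u32 cpu;     /* issuing VCPU */
+        __u64 seq;     /* sequence ID used to match the reply */
+        __u64 data;    /* store data, or load reply data */
+    };
+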
701
+- MMIO request queues
702
+
703
+MMIO request queues are FIFO arrays of MMIO request structures. There
704
+are two queues: the pending queue is for MMIOs that haven't been read by the
705
+emulation program, and the sent queue is for MMIOs that haven't been
706
+acknowledged. The main use of the second queue is to validate MMIO
707
+replies from the emulation program.
708
+
709
+- scoreboard
710
+
711
+Each CPU in the VM is emulated in QEMU by a separate thread, so multiple
712
+MMIOs may be waiting to be consumed by an emulation program and multiple
713
+threads may be waiting for MMIO replies. The scoreboard would contain a
714
+wait queue and sequence number for the per-CPU threads, allowing them to
715
+be individually woken when the MMIO reply is received from the emulation
716
+program. It also tracks the number of posted MMIO stores to the device
717
+that haven't been replied to, in order to satisfy the PCI constraint
718
+that a load to a device will not complete until all previous stores to
719
+that device have been completed.
720
+
721
+- device shadow memory
722
+
723
+Some MMIO loads do not have device side-effects. These MMIOs can be
724
+completed without sending an MMIO request to the emulation program if the
725
+emulation program shares a shadow image of the device's memory image
726
+with the KVM driver.
727
+
728
+The emulation program will ask the KVM driver to allocate memory for the
729
+shadow image, and will then use ``mmap()`` to directly access it. The
730
+emulation program can control KVM access to the shadow image by sending
731
+KVM an access map telling it which areas of the image have no
732
+side-effects (and can be completed immediately), and which require an
733
+MMIO request to the emulation program. The access map can also inform
734
+the KVM driver which size accesses are allowed to the image.
735
+
736
+master descriptor
737
+^^^^^^^^^^^^^^^^^
738
+
739
+The master descriptor is used by QEMU to configure the new KVM device.
740
+The descriptor would be returned by the KVM driver when QEMU issues a
741
+*KVM\_CREATE\_DEVICE* ``ioctl()`` with a *KVM\_DEV\_TYPE\_USER* type.
742
+
743
+KVM\_DEV\_TYPE\_USER device ops
+'''''''''''''''''''''''''''''''
+
746
+The *KVM\_DEV\_TYPE\_USER* operations vector will be registered by a
747
+``kvm_register_device_ops()`` call when the KVM system is initialized by
748
+``kvm_init()``. These device ops are called by the KVM driver when QEMU
749
+executes certain ``ioctl()`` operations on its KVM file descriptor. They
750
+include:
751
+
752
+- create
753
+
754
+This routine is called when QEMU issues a *KVM\_CREATE\_DEVICE*
755
+``ioctl()`` on its per-VM file descriptor. It will allocate and
756
+initialize a KVM user device specific data structure, and point the
+*kvm\_device* private field at it.
758
+
759
+- ioctl
760
+
761
+This routine is invoked when QEMU issues an ``ioctl()`` on the master
762
+descriptor. The ``ioctl()`` commands supported are defined by the KVM
763
+device type. *KVM\_DEV\_TYPE\_USER* ones will need several commands:
764
+
765
+*KVM\_DEV\_USER\_SLAVE\_FD* creates the slave file descriptor that will
766
+be passed to the device emulation program. Only one slave can be created
767
+by each master descriptor. The file operations performed by this
768
+descriptor are described below.
769
+
770
+The *KVM\_DEV\_USER\_PA\_RANGE* command configures a guest physical
771
+address range that the slave descriptor will receive MMIO notifications
772
+for. The range is specified by a guest physical range structure
773
+argument. For buses that assign addresses to devices dynamically, this
774
+command can be executed while the guest is running, as is the case
775
+when a guest changes a device's PCI BAR registers.
776
+
777
+*KVM\_DEV\_USER\_PA\_RANGE* will use ``kvm_io_bus_register_dev()`` to
778
+register *kvm\_io\_device\_ops* callbacks to be invoked when the guest
779
+performs an MMIO operation within the range. When a range is changed,
780
+``kvm_io_bus_unregister_dev()`` is used to remove the previous
781
+instantiation.
782
+
783
+*KVM\_DEV\_USER\_TIMEOUT* will configure a timeout value that specifies
784
+how long KVM will wait for the emulation process to respond to an MMIO
785
+indication.
786
+
787
+- destroy
788
+
789
+This routine is called when the VM instance is destroyed. It will need
790
+to destroy the slave descriptor and free any memory allocated by the
791
+driver, as well as the *kvm\_device* structure itself.
792
+
793
+slave descriptor
794
+^^^^^^^^^^^^^^^^
795
+
796
+The slave descriptor will have its own file operations vector, which
797
+responds to system calls on the descriptor performed by the device
798
+emulation program.
799
+
800
+- read
801
+
802
+A read returns any pending MMIO requests from the KVM driver as MMIO
803
+request structures. Multiple structures can be returned if there are
804
+multiple MMIO operations pending. The MMIO requests are moved from the
805
+pending queue to the sent queue, and if there are threads waiting for
806
+space in the pending queue to add new MMIO operations, they will be woken
807
+here.
808
+
809
+- write
810
+
811
+A write also consists of a set of MMIO requests. They are compared to
812
+the MMIO requests in the sent queue. Matches are removed from the sent
813
+queue, and any threads waiting for the reply are woken. If a store is
814
+removed, then the number of posted stores in the per-CPU scoreboard is
815
+decremented. When the number is zero, and a non side-effect load was
816
+waiting for posted stores to complete, the load is continued.
817
+
818
+- ioctl
819
+
820
+There are several ``ioctl()`` commands that can be performed on the slave
821
+descriptor.
822
+
823
+A *KVM\_DEV\_USER\_SHADOW\_SIZE* ``ioctl()`` causes the KVM driver to
824
+allocate memory for the shadow image. This memory can later be
825
+``mmap()``\ ed by the emulation process to share the emulation's view of
826
+device memory with the KVM driver.
827
+
828
+A *KVM\_DEV\_USER\_SHADOW\_CTRL* ``ioctl()`` controls access to the
829
+shadow image. It will send the KVM driver a shadow control map, which
830
+specifies which areas of the image can complete guest loads without
831
+sending the load request to the emulation program. It will also specify
832
+the size of load operations that are allowed.
833
+
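+Putting those together, the emulation program might set up its shadow
+image roughly as follows (the *KVM\_DEV\_USER\_\** constants and the
+``access_map`` structure are part of the proposal, not an existing KVM
+ABI)::
+
+    __u64 shadow_size = 65536;
+
+    /* Ask the KVM driver to reserve the shadow image */
+    ioctl(slave_fd, KVM_DEV_USER_SHADOW_SIZE, &shadow_size);
+
+    /* Map it so device emulation can update it directly */
+    void *shadow = mmap(NULL, shadow_size, PROT_READ | PROT_WRITE,
+                        MAP_SHARED, slave_fd, 0);
+
+    /* Tell KVM which areas it may satisfy without an MMIO request */
+    ioctl(slave_fd, KVM_DEV_USER_SHADOW_CTRL, &access_map);
+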
834
+- poll
835
+
836
+An emulation program will use the ``poll()`` call with a *POLLIN* flag
837
+to determine if there are MMIO requests waiting to be read. The call
+will return when the pending MMIO request queue is not empty.
839
+
840
+- mmap
841
+
842
+This call allows the emulation program to directly access the shadow
843
+image allocated by the KVM driver. As device emulation updates device
844
+memory, changes with no side-effects will be reflected in the shadow,
845
+and the KVM driver can satisfy guest loads from the shadow image without
846
+needing to wait for the emulation program.
847
+
848
+kvm\_io\_device ops
849
+^^^^^^^^^^^^^^^^^^^
850
+
851
+Each KVM per-CPU thread can handle MMIO operations on behalf of the guest
852
+VM. KVM will use the MMIO's guest physical address to search for a
853
+matching *kvm\_io\_device* to see if the MMIO can be handled by the KVM
854
+driver instead of exiting back to QEMU. If a match is found, the
855
+corresponding callback will be invoked.
856
+
857
+- read
858
+
859
+This callback is invoked when the guest performs a load to the device.
860
+Loads with side-effects must be handled synchronously, with the KVM
861
+driver putting the QEMU thread to sleep waiting for the emulation
862
+process reply before re-starting the guest. Loads that do not have
863
+side-effects may be optimized by satisfying them from the shadow image,
864
+if there are no outstanding stores to the device by this CPU. PCI memory
865
+ordering demands that a load cannot complete before all older stores to
866
+the same device have been completed.
867
+
868
+- write
869
+
870
+Stores can be handled asynchronously unless the pending MMIO request
871
+queue is full. In this case, the QEMU thread must sleep waiting for
872
+space in the queue. Stores will increment the number of posted stores in
873
+the per-CPU scoreboard, in order to implement the PCI ordering
874
+constraint above.
875
+
876
+interrupt acceleration
877
+^^^^^^^^^^^^^^^^^^^^^^
878
+
879
+This performance optimization would work much like a vhost user
880
+application does, where the QEMU process sets up *eventfds* that cause
881
+the device's corresponding interrupt to be triggered by the KVM driver.
882
+These irq file descriptors are sent to the emulation process at
883
+initialization, and are used when the emulation code raises a device
884
+interrupt.
885
+
886
+intx acceleration
887
+'''''''''''''''''
888
+
889
+Traditional PCI pin interrupts are level based, so, in addition to an
890
+irq file descriptor, a re-sampling file descriptor needs to be sent to
891
+the emulation program. This second file descriptor allows multiple
892
+devices sharing an irq to be notified when the interrupt has been
893
+acknowledged by the guest, so they can re-trigger the interrupt if their
894
+device has not de-asserted its interrupt.
895
+
896
+intx irq descriptor
897
+
898
+
899
+The irq descriptors are created by the proxy object
+using ``event_notifier_init()`` to create the irq and re-sampling
+*eventfds*, and ``kvm_vm_ioctl(KVM_IRQFD)`` to bind them to an interrupt.
902
+The interrupt route can be found with
903
+``pci_device_route_intx_to_irq()``.
904
+
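+A sketch of that setup; ``route`` is the result of
+``pci_device_route_intx_to_irq()``, and QEMU's own irqfd wrappers could
+be used instead of the raw ``ioctl()`` layout shown here::
+
+    EventNotifier intr, resample;
+
+    event_notifier_init(&intr, 0);
+    event_notifier_init(&resample, 0);
+
+    /* Bind both eventfds to the guest interrupt the pin routes to */
+    struct kvm_irqfd irqfd = {
+        .fd         = event_notifier_get_fd(&intr),
+        .resamplefd = event_notifier_get_fd(&resample),
+        .flags      = KVM_IRQFD_FLAG_RESAMPLE,
+        .gsi        = route.irq,
+    };
+    kvm_vm_ioctl(kvm_state, KVM_IRQFD, &irqfd);
+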
905
+intx routing changes
906
+
907
+
908
+Intx routing can be changed when the guest programs the APIC the device
909
+pin is connected to. The proxy object in QEMU will use
910
+``pci_device_set_intx_routing_notifier()`` to be informed of any guest
911
+changes to the route. This handler will broadly follow the VFIO
912
+interrupt logic to change the route: de-assigning the existing irq
913
+descriptor from its route, then assigning it the new route. (see
914
+``vfio_intx_update()``)
915
+
916
+MSI/X acceleration
917
+''''''''''''''''''
918
+
919
+MSI/X interrupts are sent as DMA transactions to the host. The interrupt
920
+data contains a vector that is programmed by the guest. A device may have
921
+multiple MSI interrupts associated with it, so multiple irq descriptors
922
+may need to be sent to the emulation program.
923
+
924
+MSI/X irq descriptor
925
+
926
+
927
+This case will also follow the VFIO example. For each MSI/X interrupt,
928
+an *eventfd* is created, a virtual interrupt is allocated by
929
+``kvm_irqchip_add_msi_route()``, and the virtual interrupt is bound to
930
+the eventfd with ``kvm_irqchip_add_irqfd_notifier()``.
931
+
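+A per-vector sketch (the ``kvm_irqchip_add_msi_route()`` signature has
+changed across QEMU versions; the variant taking a ``PCIDevice`` is
+assumed, and ``n`` is the proxy's per-vector notifier)::
+
+    event_notifier_init(n, 0);
+
+    /* Allocate a virtual IRQ for this MSI vector, then bind the eventfd */
+    int virq = kvm_irqchip_add_msi_route(kvm_state, vector, pci_dev);
+    kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, virq);
+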
932
+MSI/X config space changes
933
+
934
+
935
+The guest may dynamically update several MSI-related tables in the
936
+device's PCI config space. These include per-MSI interrupt enables and
937
+vector data. Additionally, MSI/X tables exist in device memory space, not
938
+config space. Much like the BAR case above, the proxy object must look
939
+at guest config space programming to keep the MSI interrupt state
940
+consistent between QEMU and the emulation program.
941
+
942
+--------------
943
+
944
+Disaggregated CPU emulation
945
+---------------------------
946
+
947
+After IO services have been disaggregated, a second phase would be to
948
+separate a process to handle CPU instruction emulation from the main
949
+QEMU control function. There are no object separation points for this
950
+code, so the first task would be to create one.
951
+
952
+Host access controls
953
+--------------------
954
+
955
+Separating QEMU relies on the host OS's access restriction mechanisms to
956
+enforce that the differing processes can only access the objects they
957
+are entitled to. There are a couple of types of mechanisms usually provided
958
+by general purpose OSs.
959
+
960
+Discretionary access control
961
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
962
+
963
+Discretionary access control allows each user to control who can access
964
+their files. In Linux, this type of control is usually too coarse for
965
+QEMU separation, since it only provides three separate access controls:
966
+one for the same user ID, the second for user IDs with the same group
967
+ID, and the third for all other user IDs. Each device instance would
968
+need a separate user ID to provide access control, which is likely to be
969
+unwieldy for dynamically created VMs.
970
+
971
+Mandatory access control
972
+~~~~~~~~~~~~~~~~~~~~~~~~
973
+
974
+Mandatory access control allows the OS to add an additional set of
975
+controls on top of discretionary access. It also
976
+adds other attributes to processes and files such as types, roles, and
977
+categories, and can establish rules for how processes and files can
978
+interact.
979
+
980
+Type enforcement
981
+^^^^^^^^^^^^^^^^
982
+
983
+Type enforcement assigns a *type* attribute to processes and files, and
984
+allows rules to be written on what operations a process with a given
985
+type can perform on a file with a given type. QEMU separation could take
986
+advantage of type enforcement by running the emulation processes with
987
+different types, both from the main QEMU process, and from the emulation
988
+processes of different classes of devices.
989
+
990
+For example, guest disk images and disk emulation processes could have
991
+types separate from the main QEMU process and non-disk emulation
992
+processes, and the type rules could prevent processes other than disk
993
+emulation ones from accessing guest disk images. Similarly, network
994
+emulation processes can have a type separate from the main QEMU process
995
+and non-network emulation processes, and only that type can access the
996
+host tun/tap device used to provide guest networking.
997
+
998
+Category enforcement
999
+^^^^^^^^^^^^^^^^^^^^
1000
+
1001
+Category enforcement assigns a set of numbers within a given range to
1002
+the process or file. The process is granted access to the file if the
1003
+process's set is a superset of the file's set. This enforcement can be
1004
+used to separate multiple instances of devices in the same class.
1005
+
1006
+For example, if there are multiple disk devices provided to a guest,
1007
+each device emulation process could be provisioned with a separate
1008
+category. The different device emulation processes would not be able to
1009
+access each other's backing disk images.
1010
+
1011
+Alternatively, categories could be used in lieu of the type enforcement
1012
+scheme described above. In this scenario, different categories would be
1013
+used to prevent device emulation processes in different classes from
1014
+accessing resources assigned to other classes.
145
--
1015
--
146
2.9.3
1016
2.29.2
147
1017
148
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
2
3
Adds documentation explaining the command-line arguments needed
4
to use multi-process.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
3
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
4
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Message-id: 49f757a84e5dd6fae14b22544897d1124c5fdbad.1611938319.git.jag.raman@oracle.com
5
Reviewed-by: Fam Zheng <famz@redhat.com>
11
6
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
12
[Move orphan docs/multi-process.rst document into docs/system/ and add
7
Message-id: 20170213135235.12274-16-pbonzini@redhat.com
13
it to index.rst to prevent Sphinx "document isn't included in any
14
toctree" error.
15
--Stefan]
16
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
18
---
10
block/archipelago.c | 3 ---
19
MAINTAINERS | 1 +
11
block/block-backend.c | 7 -------
20
docs/system/index.rst | 1 +
12
block/curl.c | 2 +-
21
docs/system/multi-process.rst | 64 +++++++++++++++++++++++++++++++++++
13
block/io.c | 6 +-----
22
3 files changed, 66 insertions(+)
14
block/iscsi.c | 3 ---
23
create mode 100644 docs/system/multi-process.rst
15
block/linux-aio.c | 5 +----
16
block/mirror.c | 12 +++++++++---
17
block/null.c | 8 --------
18
block/qed-cluster.c | 2 ++
19
block/qed-table.c | 12 ++++++++++--
20
block/qed.c | 4 ++--
21
block/rbd.c | 4 ----
22
block/win32-aio.c | 3 ---
23
hw/block/virtio-blk.c | 12 +++++++++++-
24
hw/scsi/scsi-disk.c | 15 +++++++++++++++
25
hw/scsi/scsi-generic.c | 20 +++++++++++++++++---
26
util/thread-pool.c | 4 +++-
27
17 files changed, 72 insertions(+), 50 deletions(-)
28
24
29
diff --git a/block/archipelago.c b/block/archipelago.c
25
diff --git a/MAINTAINERS b/MAINTAINERS
30
index XXXXXXX..XXXXXXX 100644
26
index XXXXXXX..XXXXXXX 100644
31
--- a/block/archipelago.c
27
--- a/MAINTAINERS
32
+++ b/block/archipelago.c
28
+++ b/MAINTAINERS
33
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
29
@@ -XXX,XX +XXX,XX @@ M: Jagannathan Raman <jag.raman@oracle.com>
34
{
30
M: John G Johnson <john.g.johnson@oracle.com>
35
AIORequestData *reqdata = (AIORequestData *) opaque;
31
S: Maintained
36
ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
32
F: docs/devel/multi-process.rst
37
- AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);
33
+F: docs/system/multi-process.rst
38
34
39
- aio_context_acquire(ctx);
35
Build and test automation
40
aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
36
-------------------------
41
- aio_context_release(ctx);
37
diff --git a/docs/system/index.rst b/docs/system/index.rst
42
aio_cb->status = 0;
43
44
qemu_aio_unref(aio_cb);
45
diff --git a/block/block-backend.c b/block/block-backend.c
46
index XXXXXXX..XXXXXXX 100644
38
index XXXXXXX..XXXXXXX 100644
47
--- a/block/block-backend.c
39
--- a/docs/system/index.rst
48
+++ b/block/block-backend.c
40
+++ b/docs/system/index.rst
49
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
41
@@ -XXX,XX +XXX,XX @@ Contents:
50
static void error_callback_bh(void *opaque)
42
pr-manager
51
{
43
targets
52
struct BlockBackendAIOCB *acb = opaque;
44
security
53
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
45
+ multi-process
54
46
deprecated
55
bdrv_dec_in_flight(acb->common.bs);
47
removed-features
56
- aio_context_acquire(ctx);
48
build-platforms
57
acb->common.cb(acb->common.opaque, acb->ret);
49
diff --git a/docs/system/multi-process.rst b/docs/system/multi-process.rst
58
- aio_context_release(ctx);
50
new file mode 100644
59
qemu_aio_unref(acb);
51
index XXXXXXX..XXXXXXX
60
}
52
--- /dev/null
61
53
+++ b/docs/system/multi-process.rst
62
@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
54
@@ -XXX,XX +XXX,XX @@
63
static void blk_aio_complete_bh(void *opaque)
55
+Multi-process QEMU
64
{
56
+==================
65
BlkAioEmAIOCB *acb = opaque;
66
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
67
-
68
assert(acb->has_returned);
69
- aio_context_acquire(ctx);
70
blk_aio_complete(acb);
71
- aio_context_release(ctx);
72
}
73
74
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
75
diff --git a/block/curl.c b/block/curl.c
76
index XXXXXXX..XXXXXXX 100644
77
--- a/block/curl.c
78
+++ b/block/curl.c
79
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
80
curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
81
82
out:
83
+ aio_context_release(ctx);
84
if (ret != -EINPROGRESS) {
85
acb->common.cb(acb->common.opaque, ret);
86
qemu_aio_unref(acb);
87
}
88
- aio_context_release(ctx);
89
}
90
91
static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
92
diff --git a/block/io.c b/block/io.c
93
index XXXXXXX..XXXXXXX 100644
94
--- a/block/io.c
95
+++ b/block/io.c
96
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_io_em_complete(void *opaque, int ret)
97
CoroutineIOCompletion *co = opaque;
98
99
co->ret = ret;
100
- qemu_coroutine_enter(co->coroutine);
101
+ aio_co_wake(co->coroutine);
102
}
103
104
static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
105
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
106
static void bdrv_co_em_bh(void *opaque)
107
{
108
BlockAIOCBCoroutine *acb = opaque;
109
- BlockDriverState *bs = acb->common.bs;
110
- AioContext *ctx = bdrv_get_aio_context(bs);
111
112
assert(!acb->need_bh);
113
- aio_context_acquire(ctx);
114
bdrv_co_complete(acb);
115
- aio_context_release(ctx);
116
}
117
118
static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
119
diff --git a/block/iscsi.c b/block/iscsi.c
120
index XXXXXXX..XXXXXXX 100644
121
--- a/block/iscsi.c
122
+++ b/block/iscsi.c
123
@@ -XXX,XX +XXX,XX @@ static void
124
iscsi_bh_cb(void *p)
125
{
126
IscsiAIOCB *acb = p;
127
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
128
129
qemu_bh_delete(acb->bh);
130
131
g_free(acb->buf);
132
acb->buf = NULL;
133
134
- aio_context_acquire(ctx);
135
acb->common.cb(acb->common.opaque, acb->status);
136
- aio_context_release(ctx);
137
138
if (acb->task != NULL) {
139
scsi_free_scsi_task(acb->task);
140
diff --git a/block/linux-aio.c b/block/linux-aio.c
141
index XXXXXXX..XXXXXXX 100644
142
--- a/block/linux-aio.c
143
+++ b/block/linux-aio.c
144
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
145
*/
146
static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
147
{
148
- LinuxAioState *s = laiocb->ctx;
149
int ret;
150
151
ret = laiocb->ret;
152
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
153
}
154
155
laiocb->ret = ret;
156
- aio_context_acquire(s->aio_context);
157
if (laiocb->co) {
158
/* If the coroutine is already entered it must be in ioq_submit() and
159
* will notice laio->ret has been filled in when it eventually runs
160
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
161
* that!
162
*/
163
if (!qemu_coroutine_entered(laiocb->co)) {
164
- qemu_coroutine_enter(laiocb->co);
165
+ aio_co_wake(laiocb->co);
166
}
167
} else {
168
laiocb->common.cb(laiocb->common.opaque, ret);
169
qemu_aio_unref(laiocb);
170
}
171
- aio_context_release(s->aio_context);
172
}
173
174
/**
175
diff --git a/block/mirror.c b/block/mirror.c
176
index XXXXXXX..XXXXXXX 100644
177
--- a/block/mirror.c
178
+++ b/block/mirror.c
179
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
180
{
181
MirrorOp *op = opaque;
182
MirrorBlockJob *s = op->s;
183
+
57
+
184
+ aio_context_acquire(blk_get_aio_context(s->common.blk));
58
+This document describes how to configure and use multi-process qemu.
185
if (ret < 0) {
59
+For the design document refer to docs/devel/qemu-multiprocess.
186
BlockErrorAction action;
187
188
@@ -XXX,XX +XXX,XX @@ static void mirror_write_complete(void *opaque, int ret)
189
}
190
}
191
mirror_iteration_done(op, ret);
192
+ aio_context_release(blk_get_aio_context(s->common.blk));
193
}
194
195
static void mirror_read_complete(void *opaque, int ret)
196
{
197
MirrorOp *op = opaque;
198
MirrorBlockJob *s = op->s;
199
+
60
+
200
+ aio_context_acquire(blk_get_aio_context(s->common.blk));
61
+1) Configuration
201
if (ret < 0) {
62
+----------------
202
BlockErrorAction action;
203
204
@@ -XXX,XX +XXX,XX @@ static void mirror_read_complete(void *opaque, int ret)
205
}
206
207
mirror_iteration_done(op, ret);
208
- return;
209
+ } else {
210
+ blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
211
+ 0, mirror_write_complete, op);
212
}
213
- blk_aio_pwritev(s->target, op->sector_num * BDRV_SECTOR_SIZE, &op->qiov,
214
- 0, mirror_write_complete, op);
215
+ aio_context_release(blk_get_aio_context(s->common.blk));
216
}
217
218
static inline void mirror_clip_sectors(MirrorBlockJob *s,
219
diff --git a/block/null.c b/block/null.c
220
index XXXXXXX..XXXXXXX 100644
221
--- a/block/null.c
222
+++ b/block/null.c
223
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
224
static void null_bh_cb(void *opaque)
225
{
226
NullAIOCB *acb = opaque;
227
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
228
-
229
- aio_context_acquire(ctx);
230
acb->common.cb(acb->common.opaque, 0);
231
- aio_context_release(ctx);
232
qemu_aio_unref(acb);
233
}
234
235
static void null_timer_cb(void *opaque)
236
{
237
NullAIOCB *acb = opaque;
238
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
239
-
240
- aio_context_acquire(ctx);
241
acb->common.cb(acb->common.opaque, 0);
242
- aio_context_release(ctx);
243
timer_deinit(&acb->timer);
244
qemu_aio_unref(acb);
245
}
246
diff --git a/block/qed-cluster.c b/block/qed-cluster.c
247
index XXXXXXX..XXXXXXX 100644
248
--- a/block/qed-cluster.c
249
+++ b/block/qed-cluster.c
250
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
251
unsigned int index;
252
unsigned int n;
253
254
+ qed_acquire(s);
255
if (ret) {
256
goto out;
257
}
258
@@ -XXX,XX +XXX,XX @@ static void qed_find_cluster_cb(void *opaque, int ret)
259
260
out:
261
find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len);
262
+ qed_release(s);
263
g_free(find_cluster_cb);
264
}
265
266
diff --git a/block/qed-table.c b/block/qed-table.c
267
index XXXXXXX..XXXXXXX 100644
268
--- a/block/qed-table.c
269
+++ b/block/qed-table.c
270
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
271
{
272
QEDReadTableCB *read_table_cb = opaque;
273
QEDTable *table = read_table_cb->table;
274
+ BDRVQEDState *s = read_table_cb->s;
275
int noffsets = read_table_cb->qiov.size / sizeof(uint64_t);
276
int i;
277
278
@@ -XXX,XX +XXX,XX @@ static void qed_read_table_cb(void *opaque, int ret)
279
}
280
281
/* Byteswap offsets */
282
+ qed_acquire(s);
283
for (i = 0; i < noffsets; i++) {
284
table->offsets[i] = le64_to_cpu(table->offsets[i]);
285
}
286
+ qed_release(s);
287
288
out:
289
/* Completion */
290
- trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret);
291
+ trace_qed_read_table_cb(s, read_table_cb->table, ret);
292
gencb_complete(&read_table_cb->gencb, ret);
293
}
294
295
@@ -XXX,XX +XXX,XX @@ typedef struct {
296
static void qed_write_table_cb(void *opaque, int ret)
297
{
298
QEDWriteTableCB *write_table_cb = opaque;
299
+ BDRVQEDState *s = write_table_cb->s;
300
301
- trace_qed_write_table_cb(write_table_cb->s,
302
+ trace_qed_write_table_cb(s,
303
write_table_cb->orig_table,
304
write_table_cb->flush,
305
ret);
306
@@ -XXX,XX +XXX,XX @@ static void qed_write_table_cb(void *opaque, int ret)
307
if (write_table_cb->flush) {
308
/* We still need to flush first */
309
write_table_cb->flush = false;
310
+ qed_acquire(s);
311
bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb,
312
write_table_cb);
313
+ qed_release(s);
314
return;
315
}
316
317
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
318
CachedL2Table *l2_table = request->l2_table;
319
uint64_t l2_offset = read_l2_table_cb->l2_offset;
320
321
+ qed_acquire(s);
322
if (ret) {
323
/* can't trust loaded L2 table anymore */
324
qed_unref_l2_cache_entry(l2_table);
325
@@ -XXX,XX +XXX,XX @@ static void qed_read_l2_table_cb(void *opaque, int ret)
326
request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
327
assert(request->l2_table != NULL);
328
}
329
+ qed_release(s);
330
331
gencb_complete(&read_l2_table_cb->gencb, ret);
332
}
333
diff --git a/block/qed.c b/block/qed.c
334
index XXXXXXX..XXXXXXX 100644
335
--- a/block/qed.c
336
+++ b/block/qed.c
337
@@ -XXX,XX +XXX,XX @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l
338
}
339
340
if (cb->co) {
341
- qemu_coroutine_enter(cb->co);
342
+ aio_co_wake(cb->co);
343
}
344
}
345
346
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qed_co_pwrite_zeroes_cb(void *opaque, int ret)
347
cb->done = true;
348
cb->ret = ret;
349
if (cb->co) {
350
- qemu_coroutine_enter(cb->co);
351
+ aio_co_wake(cb->co);
352
}
353
}
354
355
diff --git a/block/rbd.c b/block/rbd.c
356
index XXXXXXX..XXXXXXX 100644
357
--- a/block/rbd.c
358
+++ b/block/rbd.c
359
@@ -XXX,XX +XXX,XX @@ shutdown:
360
static void qemu_rbd_complete_aio(RADOSCB *rcb)
361
{
362
RBDAIOCB *acb = rcb->acb;
363
- AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
364
int64_t r;
365
366
r = rcb->ret;
367
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
368
qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
369
}
370
qemu_vfree(acb->bounce);
371
-
372
- aio_context_acquire(ctx);
373
acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
374
- aio_context_release(ctx);
375
376
qemu_aio_unref(acb);
377
}
378
diff --git a/block/win32-aio.c b/block/win32-aio.c
379
index XXXXXXX..XXXXXXX 100644
380
--- a/block/win32-aio.c
381
+++ b/block/win32-aio.c
382
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
383
qemu_vfree(waiocb->buf);
384
}
385
386
-
387
- aio_context_acquire(s->aio_ctx);
388
waiocb->common.cb(waiocb->common.opaque, ret);
389
- aio_context_release(s->aio_ctx);
390
qemu_aio_unref(waiocb);
391
}
392
393
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
394
index XXXXXXX..XXXXXXX 100644
395
--- a/hw/block/virtio-blk.c
396
+++ b/hw/block/virtio-blk.c
397
@@ -XXX,XX +XXX,XX @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
398
static void virtio_blk_rw_complete(void *opaque, int ret)
399
{
400
VirtIOBlockReq *next = opaque;
401
+ VirtIOBlock *s = next->dev;
402
403
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
404
while (next) {
405
VirtIOBlockReq *req = next;
406
next = req->mr_next;
407
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_rw_complete(void *opaque, int ret)
408
block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
409
virtio_blk_free_request(req);
410
}
411
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
412
}
413
414
static void virtio_blk_flush_complete(void *opaque, int ret)
415
{
416
VirtIOBlockReq *req = opaque;
417
+ VirtIOBlock *s = req->dev;
418
419
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
420
if (ret) {
421
if (virtio_blk_handle_rw_error(req, -ret, 0)) {
422
- return;
423
+ goto out;
424
}
425
}
426
427
virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
428
block_acct_done(blk_get_stats(req->dev->blk), &req->acct);
429
virtio_blk_free_request(req);
430
+
63
+
431
+out:
64
+multi-process is enabled by default for targets that enable KVM
432
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
433
}
434
435
#ifdef __linux__
436
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
437
virtio_stl_p(vdev, &scsi->data_len, hdr->dxfer_len);
438
439
out:
440
+ aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
441
virtio_blk_req_complete(req, status);
442
virtio_blk_free_request(req);
443
+ aio_context_release(blk_get_aio_context(s->conf.conf.blk));
444
g_free(ioctl_req);
445
}
446
447
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
448
index XXXXXXX..XXXXXXX 100644
449
--- a/hw/scsi/scsi-disk.c
450
+++ b/hw/scsi/scsi-disk.c
451
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
452
453
assert(r->req.aiocb != NULL);
454
r->req.aiocb = NULL;
455
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
456
if (scsi_disk_req_check_error(r, ret, true)) {
457
goto done;
458
}
459
@@ -XXX,XX +XXX,XX @@ static void scsi_aio_complete(void *opaque, int ret)
460
scsi_req_complete(&r->req, GOOD);
461
462
done:
463
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
464
scsi_req_unref(&r->req);
465
}
466
467
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_complete(void *opaque, int ret)
468
assert(r->req.aiocb != NULL);
469
r->req.aiocb = NULL;
470
471
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
472
if (ret < 0) {
473
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
474
} else {
475
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
476
}
477
scsi_dma_complete_noio(r, ret);
478
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
479
}
480
481
static void scsi_read_complete(void * opaque, int ret)
482
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
483
484
assert(r->req.aiocb != NULL);
485
r->req.aiocb = NULL;
486
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
487
if (scsi_disk_req_check_error(r, ret, true)) {
488
goto done;
489
}
490
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
491
492
done:
493
scsi_req_unref(&r->req);
494
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
495
}
496
497
/* Actually issue a read to the block device. */
498
@@ -XXX,XX +XXX,XX @@ static void scsi_do_read_cb(void *opaque, int ret)
499
assert (r->req.aiocb != NULL);
500
r->req.aiocb = NULL;
501
502
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
503
if (ret < 0) {
504
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
505
} else {
506
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
507
}
508
scsi_do_read(opaque, ret);
509
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
510
}
511
512
/* Read more data from scsi device into buffer. */
513
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
514
assert (r->req.aiocb != NULL);
515
r->req.aiocb = NULL;
516
517
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
518
if (ret < 0) {
519
block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
520
} else {
521
block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
522
}
523
scsi_write_complete_noio(r, ret);
524
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
525
}
526
527
static void scsi_write_data(SCSIRequest *req)
528
@@ -XXX,XX +XXX,XX @@ static void scsi_unmap_complete(void *opaque, int ret)
529
{
530
UnmapCBData *data = opaque;
531
SCSIDiskReq *r = data->r;
532
+ SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
533
534
assert(r->req.aiocb != NULL);
535
r->req.aiocb = NULL;
536
537
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
538
scsi_unmap_complete_noio(data, ret);
539
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
540
}
541
542
static void scsi_disk_emulate_unmap(SCSIDiskReq *r, uint8_t *inbuf)
543
@@ -XXX,XX +XXX,XX @@ static void scsi_write_same_complete(void *opaque, int ret)
544
545
assert(r->req.aiocb != NULL);
546
r->req.aiocb = NULL;
547
+ aio_context_acquire(blk_get_aio_context(s->qdev.conf.blk));
548
if (scsi_disk_req_check_error(r, ret, true)) {
549
goto done;
550
}
551
@@ -XXX,XX +XXX,XX @@ done:
552
scsi_req_unref(&r->req);
553
qemu_vfree(data->iov.iov_base);
554
g_free(data);
555
+ aio_context_release(blk_get_aio_context(s->qdev.conf.blk));
556
}
557
558
static void scsi_disk_emulate_write_same(SCSIDiskReq *r, uint8_t *inbuf)
559
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
560
index XXXXXXX..XXXXXXX 100644
561
--- a/hw/scsi/scsi-generic.c
562
+++ b/hw/scsi/scsi-generic.c
563
@@ -XXX,XX +XXX,XX @@ done:
564
static void scsi_command_complete(void *opaque, int ret)
565
{
566
SCSIGenericReq *r = (SCSIGenericReq *)opaque;
567
+ SCSIDevice *s = r->req.dev;
568
569
assert(r->req.aiocb != NULL);
570
r->req.aiocb = NULL;
571
+
65
+
572
+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
573
scsi_command_complete_noio(r, ret);
574
+ aio_context_release(blk_get_aio_context(s->conf.blk));
575
}
576
577
static int execute_command(BlockBackend *blk,
578
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
579
assert(r->req.aiocb != NULL);
580
r->req.aiocb = NULL;
581
582
+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
583
+
66
+
584
if (ret || r->req.io_canceled) {
67
+2) Usage
585
scsi_command_complete_noio(r, ret);
68
+--------
586
- return;
587
+ goto done;
588
}
589
590
len = r->io_header.dxfer_len - r->io_header.resid;
591
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
592
r->len = -1;
593
if (len == 0) {
594
scsi_command_complete_noio(r, 0);
595
- return;
596
+ goto done;
597
}
598
599
/* Snoop READ CAPACITY output to set the blocksize. */
600
@@ -XXX,XX +XXX,XX @@ static void scsi_read_complete(void * opaque, int ret)
601
}
602
scsi_req_data(&r->req, len);
603
scsi_req_unref(&r->req);
604
+
69
+
605
+done:
70
+Multi-process QEMU requires an orchestrator to launch.
606
+ aio_context_release(blk_get_aio_context(s->conf.blk));
607
}
608
609
/* Read more data from scsi device into buffer. */
610
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
611
assert(r->req.aiocb != NULL);
612
r->req.aiocb = NULL;
613
614
+ aio_context_acquire(blk_get_aio_context(s->conf.blk));
615
+
71
+
616
if (ret || r->req.io_canceled) {
72
+Following is a description of the command-line used to launch mpqemu.
617
scsi_command_complete_noio(r, ret);
618
- return;
619
+ goto done;
620
}
621
622
if (r->req.cmd.buf[0] == MODE_SELECT && r->req.cmd.buf[4] == 12 &&
623
@@ -XXX,XX +XXX,XX @@ static void scsi_write_complete(void * opaque, int ret)
624
}
625
626
scsi_command_complete_noio(r, ret);
627
+
73
+
628
+done:
74
+* Orchestrator:
629
+ aio_context_release(blk_get_aio_context(s->conf.blk));
75
+
630
}
76
+ - The orchestrator creates a unix socketpair
631
77
+
632
/* Write data to a scsi device. Returns nonzero on failure.
78
+ - It launches the remote process and passes one of the
633
diff --git a/util/thread-pool.c b/util/thread-pool.c
79
+ sockets to it via command-line.
634
index XXXXXXX..XXXXXXX 100644
80
+
635
--- a/util/thread-pool.c
81
+ - It then launches QEMU and specifies the other socket as an option
636
+++ b/util/thread-pool.c
82
+ to the Proxy device object
637
@@ -XXX,XX +XXX,XX @@ restart:
83
+
638
*/
84
+* Remote Process:
639
qemu_bh_schedule(pool->completion_bh);
85
+
640
86
+ - QEMU can enter remote process mode by using the "remote" machine
641
+ aio_context_release(pool->ctx);
87
+ option.
642
elem->common.cb(elem->common.opaque, elem->ret);
88
+
643
+ aio_context_acquire(pool->ctx);
89
+ - The orchestrator creates a "remote-object" with details about
644
qemu_aio_unref(elem);
90
+ the device and the file descriptor for the device
645
goto restart;
91
+
646
} else {
92
+ - The remaining options are no different from how one launches QEMU with
647
@@ -XXX,XX +XXX,XX @@ static void thread_pool_co_cb(void *opaque, int ret)
93
+ devices.
648
ThreadPoolCo *co = opaque;
94
+
649
95
+ - Example command-line for the remote process is as follows:
650
co->ret = ret;
96
+
651
- qemu_coroutine_enter(co->co);
97
+ /usr/bin/qemu-system-x86_64 \
652
+ aio_co_wake(co->co);
98
+ -machine x-remote \
653
}
99
+ -device lsi53c895a,id=lsi0 \
654
100
+ -drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2 \
655
int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
101
+ -device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0 \
102
+ -object x-remote-object,id=robj1,devid=lsi1,fd=4,
103
+
104
+* QEMU:
105
+
106
+ - Since parts of the RAM are shared between QEMU & remote process, a
107
+ memory-backend-memfd is required to facilitate this, as follows:
108
+
109
+ -object memory-backend-memfd,id=mem,size=2G
110
+
111
+ - A "x-pci-proxy-dev" device is created for each of the PCI devices emulated
112
+ in the remote process. A "socket" sub-option specifies the other end of
113
+ unix channel created by orchestrator. The "id" sub-option must be specified
114
+ and should be the same as the "id" specified for the remote PCI device
115
+
116
+ - Example command-line for QEMU is as follows:
117
+
118
+ -device x-pci-proxy-dev,id=lsi0,socket=3
656
--
119
--
657
2.9.3
120
2.29.2
658
121
659
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
2
3
The AioContext data structures are now protected by list_lock and/or
3
Allow RAM MemoryRegion to be created from an offset in a file, instead
4
they are walked with FOREACH_RCU primitives. There is no need anymore
4
of allocating at offset of 0 by default. This is needed to synchronize
5
to acquire the AioContext for the entire duration of aio_dispatch.
5
RAM between QEMU & remote process.
6
Instead, just acquire it before and after invoking the callbacks.
6
7
The next step is then to push it further down.
7
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
8
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
11
Message-id: 609996697ad8617e3b01df38accc5c208c24d74e.1611938319.git.jag.raman@oracle.com
11
Reviewed-by: Fam Zheng <famz@redhat.com>
12
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
13
Message-id: 20170213135235.12274-12-pbonzini@redhat.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
13
---
16
util/aio-posix.c | 25 +++++++++++--------------
14
include/exec/memory.h | 2 ++
17
util/aio-win32.c | 15 +++++++--------
15
include/exec/ram_addr.h | 4 ++--
18
util/async.c | 2 ++
16
include/qemu/mmap-alloc.h | 4 +++-
19
3 files changed, 20 insertions(+), 22 deletions(-)
17
backends/hostmem-memfd.c | 2 +-
20
18
hw/misc/ivshmem.c | 3 ++-
21
diff --git a/util/aio-posix.c b/util/aio-posix.c
19
softmmu/memory.c | 3 ++-
22
index XXXXXXX..XXXXXXX 100644
20
softmmu/physmem.c | 12 +++++++-----
23
--- a/util/aio-posix.c
21
util/mmap-alloc.c | 8 +++++---
24
+++ b/util/aio-posix.c
22
util/oslib-posix.c | 2 +-
25
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
23
9 files changed, 25 insertions(+), 15 deletions(-)
26
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
24
27
aio_node_check(ctx, node->is_external) &&
25
diff --git a/include/exec/memory.h b/include/exec/memory.h
28
node->io_read) {
26
index XXXXXXX..XXXXXXX 100644
29
+ aio_context_acquire(ctx);
27
--- a/include/exec/memory.h
30
node->io_read(node->opaque);
28
+++ b/include/exec/memory.h
31
+ aio_context_release(ctx);
29
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
32
30
* @size: size of the region.
33
/* aio_notify() does not count as progress */
31
* @share: %true if memory must be mmaped with the MAP_SHARED flag
34
if (node->opaque != &ctx->notifier) {
32
* @fd: the fd to mmap.
35
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
33
+ * @offset: offset within the file referenced by fd
36
(revents & (G_IO_OUT | G_IO_ERR)) &&
34
* @errp: pointer to Error*, to store an error if it happens.
37
aio_node_check(ctx, node->is_external) &&
35
*
38
node->io_write) {
36
* Note that this function does not do anything to cause the data in the
39
+ aio_context_acquire(ctx);
37
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
40
node->io_write(node->opaque);
38
uint64_t size,
41
+ aio_context_release(ctx);
39
bool share,
42
progress = true;
40
int fd,
43
}
41
+ ram_addr_t offset,
44
42
Error **errp);
45
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
43
#endif
44
45
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/include/exec/ram_addr.h
48
+++ b/include/exec/ram_addr.h
49
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
50
uint32_t ram_flags, const char *mem_path,
51
bool readonly, Error **errp);
52
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
53
- uint32_t ram_flags, int fd, bool readonly,
54
- Error **errp);
55
+ uint32_t ram_flags, int fd, off_t offset,
56
+ bool readonly, Error **errp);
57
58
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
59
MemoryRegion *mr, Error **errp);
60
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
61
index XXXXXXX..XXXXXXX 100644
62
--- a/include/qemu/mmap-alloc.h
63
+++ b/include/qemu/mmap-alloc.h
64
@@ -XXX,XX +XXX,XX @@ size_t qemu_mempath_getpagesize(const char *mem_path);
65
* @readonly: true for a read-only mapping, false for read/write.
66
* @shared: map has RAM_SHARED flag.
67
* @is_pmem: map has RAM_PMEM flag.
68
+ * @map_offset: map starts at offset of map_offset from the start of fd
69
*
70
* Return:
71
* On success, return a pointer to the mapped area.
72
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
73
size_t align,
74
bool readonly,
75
bool shared,
76
- bool is_pmem);
77
+ bool is_pmem,
78
+ off_t map_offset);
79
80
void qemu_ram_munmap(int fd, void *ptr, size_t size);
81
82
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
83
index XXXXXXX..XXXXXXX 100644
84
--- a/backends/hostmem-memfd.c
85
+++ b/backends/hostmem-memfd.c
86
@@ -XXX,XX +XXX,XX @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
87
name = host_memory_backend_get_name(backend);
88
memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
89
name, backend->size,
90
- backend->share, fd, errp);
91
+ backend->share, fd, 0, errp);
92
g_free(name);
93
}
94
95
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
96
index XXXXXXX..XXXXXXX 100644
97
--- a/hw/misc/ivshmem.c
98
+++ b/hw/misc/ivshmem.c
99
@@ -XXX,XX +XXX,XX @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
100
101
/* mmap the region and map into the BAR2 */
102
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
103
- "ivshmem.bar2", size, true, fd, &local_err);
104
+ "ivshmem.bar2", size, true, fd, 0,
105
+ &local_err);
106
if (local_err) {
107
error_propagate(errp, local_err);
108
return;
109
diff --git a/softmmu/memory.c b/softmmu/memory.c
110
index XXXXXXX..XXXXXXX 100644
111
--- a/softmmu/memory.c
112
+++ b/softmmu/memory.c
113
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
114
uint64_t size,
115
bool share,
116
int fd,
117
+ ram_addr_t offset,
118
Error **errp)
119
{
120
Error *err = NULL;
121
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
122
mr->destructor = memory_region_destructor_ram;
123
mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
124
share ? RAM_SHARED : 0,
125
- fd, false, &err);
126
+ fd, offset, false, &err);
127
if (err) {
128
mr->size = int128_zero();
129
object_unparent(OBJECT(mr));
130
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
131
index XXXXXXX..XXXXXXX 100644
132
--- a/softmmu/physmem.c
133
+++ b/softmmu/physmem.c
134
@@ -XXX,XX +XXX,XX @@ static void *file_ram_alloc(RAMBlock *block,
135
int fd,
136
bool readonly,
137
bool truncate,
138
+ off_t offset,
139
Error **errp)
140
{
141
void *area;
142
@@ -XXX,XX +XXX,XX @@ static void *file_ram_alloc(RAMBlock *block,
46
}
143
}
47
144
48
/* Run our timers */
145
area = qemu_ram_mmap(fd, memory, block->mr->align, readonly,
49
+ aio_context_acquire(ctx);
146
- block->flags & RAM_SHARED, block->flags & RAM_PMEM);
50
progress |= timerlistgroup_run_timers(&ctx->tlg);
147
+ block->flags & RAM_SHARED, block->flags & RAM_PMEM,
51
+ aio_context_release(ctx);
148
+ offset);
52
149
if (area == MAP_FAILED) {
53
return progress;
150
error_setg_errno(errp, errno,
54
}
151
"unable to map backing store for guest RAM");
55
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
152
@@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
56
int64_t timeout;
153
57
int64_t start = 0;
154
#ifdef CONFIG_POSIX
58
155
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
59
- aio_context_acquire(ctx);
156
- uint32_t ram_flags, int fd, bool readonly,
60
- progress = false;
157
- Error **errp)
61
-
158
+ uint32_t ram_flags, int fd, off_t offset,
62
/* aio_notify can avoid the expensive event_notifier_set if
159
+ bool readonly, Error **errp)
63
* everything (file descriptors, bottom halves, timers) will
160
{
64
* be re-evaluated before the next blocking poll(). This is
161
RAMBlock *new_block;
65
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
162
Error *local_err = NULL;
66
start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
163
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
164
new_block->max_length = size;
165
new_block->flags = ram_flags;
166
new_block->host = file_ram_alloc(new_block, size, fd, readonly,
167
- !file_size, errp);
168
+ !file_size, offset, errp);
169
if (!new_block->host) {
170
g_free(new_block);
171
return NULL;
172
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
173
return NULL;
67
}
174
}
68
175
69
- if (try_poll_mode(ctx, blocking)) {
176
- block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, readonly, errp);
70
- progress = true;
177
+ block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp);
71
- } else {
178
if (!block) {
72
+ aio_context_acquire(ctx);
179
if (created) {
73
+ progress = try_poll_mode(ctx, blocking);
180
unlink(mem_path);
74
+ aio_context_release(ctx);
181
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
75
+
182
index XXXXXXX..XXXXXXX 100644
76
+ if (!progress) {
183
--- a/util/mmap-alloc.c
77
assert(npfd == 0);
184
+++ b/util/mmap-alloc.c
78
185
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
79
/* fill pollfds */
186
size_t align,
80
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
187
bool readonly,
81
timeout = blocking ? aio_compute_timeout(ctx) : 0;
188
bool shared,
82
189
- bool is_pmem)
83
/* wait until next event */
190
+ bool is_pmem,
84
- if (timeout) {
191
+ off_t map_offset)
85
- aio_context_release(ctx);
192
{
86
- }
193
int prot;
87
if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
194
int flags;
88
AioHandler epoll_handler;
195
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
89
196
90
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
197
prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
91
} else {
198
92
ret = qemu_poll_ns(pollfds, npfd, timeout);
199
- ptr = mmap(guardptr + offset, size, prot, flags | map_sync_flags, fd, 0);
93
}
200
+ ptr = mmap(guardptr + offset, size, prot,
94
- if (timeout) {
201
+ flags | map_sync_flags, fd, map_offset);
95
- aio_context_acquire(ctx);
202
96
- }
203
if (ptr == MAP_FAILED && map_sync_flags) {
204
if (errno == ENOTSUP) {
205
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
206
* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
207
* we will remove these flags to handle compatibility.
208
*/
209
- ptr = mmap(guardptr + offset, size, prot, flags, fd, 0);
210
+ ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset);
97
}
211
}
98
212
99
if (blocking) {
213
if (ptr == MAP_FAILED) {
100
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
214
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
101
progress = true;
215
index XXXXXXX..XXXXXXX 100644
102
}
216
--- a/util/oslib-posix.c
103
217
+++ b/util/oslib-posix.c
104
- aio_context_release(ctx);
218
@@ -XXX,XX +XXX,XX @@ void *qemu_memalign(size_t alignment, size_t size)
105
-
219
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
106
return progress;
220
{
107
}
221
size_t align = QEMU_VMALLOC_ALIGN;
108
222
- void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false);
109
diff --git a/util/aio-win32.c b/util/aio-win32.c
223
+ void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false, 0);
110
index XXXXXXX..XXXXXXX 100644
224
111
--- a/util/aio-win32.c
225
if (ptr == MAP_FAILED) {
112
+++ b/util/aio-win32.c
226
return NULL;
113
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
114
(revents || event_notifier_get_handle(node->e) == event) &&
115
node->io_notify) {
116
node->pfd.revents = 0;
117
+ aio_context_acquire(ctx);
118
node->io_notify(node->e);
119
+ aio_context_release(ctx);
120
121
/* aio_notify() does not count as progress */
122
if (node->e != &ctx->notifier) {
123
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
124
(node->io_read || node->io_write)) {
125
node->pfd.revents = 0;
126
if ((revents & G_IO_IN) && node->io_read) {
127
+ aio_context_acquire(ctx);
128
node->io_read(node->opaque);
129
+ aio_context_release(ctx);
130
progress = true;
131
}
132
if ((revents & G_IO_OUT) && node->io_write) {
133
+ aio_context_acquire(ctx);
134
node->io_write(node->opaque);
135
+ aio_context_release(ctx);
136
progress = true;
137
}
138
139
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
140
int count;
141
int timeout;
142
143
- aio_context_acquire(ctx);
144
progress = false;
145
146
/* aio_notify can avoid the expensive event_notifier_set if
147
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
148
149
timeout = blocking && !have_select_revents
150
? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
151
- if (timeout) {
152
- aio_context_release(ctx);
153
- }
154
ret = WaitForMultipleObjects(count, events, FALSE, timeout);
155
if (blocking) {
156
assert(first);
157
atomic_sub(&ctx->notify_me, 2);
158
}
159
- if (timeout) {
160
- aio_context_acquire(ctx);
161
- }
162
163
if (first) {
164
aio_notify_accept(ctx);
165
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
166
progress |= aio_dispatch_handlers(ctx, event);
167
} while (count > 0);
168
169
+ aio_context_acquire(ctx);
170
progress |= timerlistgroup_run_timers(&ctx->tlg);
171
-
172
aio_context_release(ctx);
173
return progress;
174
}
175
diff --git a/util/async.c b/util/async.c
176
index XXXXXXX..XXXXXXX 100644
177
--- a/util/async.c
178
+++ b/util/async.c
179
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
180
ret = 1;
181
}
182
bh->idle = 0;
183
+ aio_context_acquire(ctx);
184
aio_bh_call(bh);
185
+ aio_context_release(ctx);
186
}
187
if (bh->deleted) {
188
deleted = true;
189
--
2.9.3

--
2.29.2
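
[A minimal usage sketch for the extended qemu_ram_mmap() in the patch above;
the helper name and its argument choices are invented for illustration and
are not part of the series. The parameter order follows the new signature
and the call sites shown in the diff:]

    #include "qemu/osdep.h"
    #include "qemu/mmap-alloc.h"

    /* Map 'size' bytes of fd starting at byte offset 'map_offset',
     * instead of always mapping from offset 0 as before this patch. */
    static void *example_map_ram_at(int fd, size_t size, size_t align,
                                    off_t map_offset)
    {
        return qemu_ram_mmap(fd, size, align,
                             false, /* readonly */
                             true,  /* shared */
                             false, /* is_pmem */
                             map_offset);
    }
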
From: Paolo Bonzini <pbonzini@redhat.com>

As a small step towards the introduction of multiqueue, we want
coroutines to remain on the same AioContext that started them,
unless they are moved explicitly with e.g. aio_co_schedule. This patch
prevents coroutines from switching AioContext when they use a CoMutex.
For now it does not make much of a difference, because the CoMutex
is not thread-safe and the AioContext itself is used to protect the
CoMutex from concurrent access. However, this is going to change.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-9-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 util/qemu-coroutine-lock.c | 5 ++---
 util/trace-events          | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)
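
[The diff below wakes queued coroutines with aio_co_wake() instead of
chaining them onto the current coroutine. As a hedged illustration of that
primitive -- the ExampleWaiter type and function names are invented, not
taken from the patch -- a callback can wake a parked coroutine back on the
AioContext it was running on:]

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"

    typedef struct ExampleWaiter {
        Coroutine *co;                   /* parked in example_wait() */
    } ExampleWaiter;

    static void example_ready_cb(void *opaque)
    {
        ExampleWaiter *w = opaque;

        /* Runs in whichever context fires the callback; the coroutine
         * resumes on its own AioContext, not the caller's. */
        aio_co_wake(w->co);
    }

    static void coroutine_fn example_wait(ExampleWaiter *w)
    {
        w->co = qemu_coroutine_self();
        qemu_coroutine_yield();          /* resumed by example_ready_cb() */
    }
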
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
 #include "qemu/queue.h"
+#include "block/aio.h"
 #include "trace.h"

 void qemu_co_queue_init(CoQueue *queue)
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_run_restart(Coroutine *co)

 static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
 {
-    Coroutine *self = qemu_coroutine_self();
     Coroutine *next;

     if (QSIMPLEQ_EMPTY(&queue->entries)) {
@@ -XXX,XX +XXX,XX @@ static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)

     while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
         QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
-        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, next, co_queue_next);
-        trace_qemu_co_queue_next(next);
+        aio_co_wake(next);
         if (single) {
             break;
         }
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"

 # util/qemu-coroutine-lock.c
 qemu_co_queue_run_restart(void *co) "co %p"
-qemu_co_queue_next(void *nxt) "next %p"
 qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
 qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
--
2.9.3

From: Jagannathan Raman <jag.raman@oracle.com>

Add configuration options to enable or disable multiprocess QEMU code

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 6cc37253e35418ebd7b675a31a3df6e3c7a12dc1.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 configure         | 10 ++++++++++
 meson.build       |  4 +++-
 Kconfig.host      |  4 ++++
 hw/Kconfig        |  1 +
 hw/remote/Kconfig |  3 +++
 5 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 hw/remote/Kconfig

diff --git a/configure b/configure
index XXXXXXX..XXXXXXX 100755
--- a/configure
+++ b/configure
@@ -XXX,XX +XXX,XX @@ skip_meson=no
 gettext="auto"
 fuse="auto"
 fuse_lseek="auto"
+multiprocess="no"

 malloc_trim="auto"

@@ -XXX,XX +XXX,XX @@ Linux)
   linux="yes"
   linux_user="yes"
   vhost_user=${default_feature:-yes}
+  multiprocess=${default_feature:-yes}
 ;;
 esac

@@ -XXX,XX +XXX,XX @@ for opt do
   ;;
   --disable-fuse-lseek) fuse_lseek="disabled"
   ;;
+  --enable-multiprocess) multiprocess="yes"
+  ;;
+  --disable-multiprocess) multiprocess="no"
+  ;;
   *)
       echo "ERROR: unknown option $opt"
       echo "Try '$0 --help' for more information"
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available
   libdaxctl       libdaxctl support
   fuse            FUSE block device export
   fuse-lseek      SEEK_HOLE/SEEK_DATA support for FUSE exports
+  multiprocess    Multiprocess QEMU support

 NOTE: The object files are built at the place where configure is launched
 EOF
@@ -XXX,XX +XXX,XX @@ fi
 if test "$have_mlockall" = "yes" ; then
   echo "HAVE_MLOCKALL=y" >> $config_host_mak
 fi
+if test "$multiprocess" = "yes" ; then
+  echo "CONFIG_MULTIPROCESS_ALLOWED=y" >> $config_host_mak
+fi
 if test "$fuzzing" = "yes" ; then
   # If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the
   # needed CFLAGS have already been provided
diff --git a/meson.build b/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/meson.build
+++ b/meson.build
@@ -XXX,XX +XXX,XX @@ host_kconfig = \
   ('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
   (have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
   ('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
-  ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : [])
+  ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \
+  ('CONFIG_MULTIPROCESS_ALLOWED' in config_host ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])

 ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]

@@ -XXX,XX +XXX,XX @@ summary_info += {'libpmem support': config_host.has_key('CONFIG_LIBPMEM')}
 summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')}
 summary_info += {'libudev': libudev.found()}
 summary_info += {'FUSE lseek': fuse_lseek.found()}
+summary_info += {'Multiprocess QEMU': config_host.has_key('CONFIG_MULTIPROCESS_ALLOWED')}
 summary(summary_info, bool_yn: true, section: 'Dependencies')

 if not supported_cpus.contains(cpu)
diff --git a/Kconfig.host b/Kconfig.host
index XXXXXXX..XXXXXXX 100644
--- a/Kconfig.host
+++ b/Kconfig.host
@@ -XXX,XX +XXX,XX @@ config VIRTFS

 config PVRDMA
     bool
+
+config MULTIPROCESS_ALLOWED
+    bool
+    imply MULTIPROCESS
diff --git a/hw/Kconfig b/hw/Kconfig
index XXXXXXX..XXXXXXX 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -XXX,XX +XXX,XX @@ source pci-host/Kconfig
 source pcmcia/Kconfig
 source pci/Kconfig
 source rdma/Kconfig
+source remote/Kconfig
 source rtc/Kconfig
 source scsi/Kconfig
 source sd/Kconfig
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/remote/Kconfig
@@ -XXX,XX +XXX,XX @@
+config MULTIPROCESS
+    bool
+    depends on PCI && KVM
--
2.29.2

From: Jagannathan Raman <jag.raman@oracle.com>

A PCI host bridge is set up for the remote device process. It is
implemented using the remote-pcihost object, an extension of the PCI
host bridge set up by QEMU. Remote-pcihost configures a PCI bus which
the remote PCI devices can latch on to.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 0871ba857abb2eafacde07e7fe66a3f12415bfb2.1611938319.git.jag.raman@oracle.com

[Added PCI_EXPRESS condition in hw/remote/Kconfig since remote-pcihost
needs PCIe. This solves "make check" failure on s390x. Fix suggested by
Philippe Mathieu-Daudé <philmd@redhat.com> and Thomas Huth
<thuth@redhat.com>.
--Stefan]

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                  |  2 +
 include/hw/pci-host/remote.h | 29 ++++++++++++++
 hw/pci-host/remote.c         | 75 ++++++++++++++++++++++++++++++++++++
 hw/pci-host/Kconfig          |  3 ++
 hw/pci-host/meson.build      |  1 +
 hw/remote/Kconfig            |  3 +-
 6 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 include/hw/pci-host/remote.h
 create mode 100644 hw/pci-host/remote.c

From: Paolo Bonzini <pbonzini@redhat.com>

AioContext is fairly self contained, the only dependency is QEMUTimer but
that in turn doesn't need anything else. So move them out of block-obj-y
to avoid introducing a dependency from io/ to block-obj-y.

main-loop and its dependency iohandler also need to be moved, because
later in this series io/ will call iohandler_get_aio_context.

[Changed copyright "the QEMU team" to "other QEMU contributors" as
suggested by Daniel Berrange and agreed by Paolo.
--Stefan]

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-2-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 Makefile.objs                       |  4 ---
 stubs/Makefile.objs                 |  1 +
 tests/Makefile.include              | 11 ++++----
 util/Makefile.objs                  |  6 +++-
 block/io.c                          | 29 -------------------
 stubs/linux-aio.c                   | 32 +++++++++++++++++++++
 stubs/set-fd-handler.c              | 11 --------
 aio-posix.c => util/aio-posix.c     |  2 +-
 aio-win32.c => util/aio-win32.c     |  0
 util/aiocb.c                        | 55 +++++++++++++++++++++++++++++++++++++
 async.c => util/async.c             |  3 +-
 iohandler.c => util/iohandler.c     |  0
 main-loop.c => util/main-loop.c     |  0
 qemu-timer.c => util/qemu-timer.c   |  0
 thread-pool.c => util/thread-pool.c |  2 +-
 trace-events                        | 11 --------
 util/trace-events                   | 11 ++++++++
 17 files changed, 114 insertions(+), 64 deletions(-)
 create mode 100644 stubs/linux-aio.c
 rename aio-posix.c => util/aio-posix.c (99%)
 rename aio-win32.c => util/aio-win32.c (100%)
 create mode 100644 util/aiocb.c
 rename async.c => util/async.c (99%)
 rename iohandler.c => util/iohandler.c (100%)
 rename main-loop.c => util/main-loop.c (100%)
 rename qemu-timer.c => util/qemu-timer.c (100%)
 rename thread-pool.c => util/thread-pool.c (99%)
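
[The new util/aiocb.c added below carries qemu_aio_get()/qemu_aio_unref()
over from block/io.c unchanged. As a hedged sketch of how a driver
typically uses that API -- the MyAIOCB type, its AIOCBInfo and the helper
are invented for illustration, not taken from the patch:]

    #include "qemu/osdep.h"
    #include "block/aio.h"

    typedef struct MyAIOCB {
        BlockAIOCB common;      /* must come first; see qemu_aio_get() */
        int ret;
    } MyAIOCB;

    static const AIOCBInfo my_aiocb_info = {
        .aiocb_size = sizeof(MyAIOCB),
    };

    /* Allocate a refcounted AIOCB; the caller's cb runs on completion
     * and the reference is dropped with qemu_aio_unref(). */
    static BlockAIOCB *my_aio_start(BlockDriverState *bs,
                                    BlockCompletionFunc *cb, void *opaque)
    {
        MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);

        acb->ret = 0;
        return &acb->common;
    }
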
diff --git a/Makefile.objs b/Makefile.objs
33
diff --git a/MAINTAINERS b/MAINTAINERS
48
index XXXXXXX..XXXXXXX 100644
34
index XXXXXXX..XXXXXXX 100644
49
--- a/Makefile.objs
35
--- a/MAINTAINERS
50
+++ b/Makefile.objs
36
+++ b/MAINTAINERS
51
@@ -XXX,XX +XXX,XX @@ chardev-obj-y = chardev/
37
@@ -XXX,XX +XXX,XX @@ M: John G Johnson <john.g.johnson@oracle.com>
52
#######################################################################
38
S: Maintained
53
# block-obj-y is code used by both qemu system emulation and qemu-img
39
F: docs/devel/multi-process.rst
54
40
F: docs/system/multi-process.rst
55
-block-obj-y = async.o thread-pool.o
41
+F: hw/pci-host/remote.c
56
block-obj-y += nbd/
42
+F: include/hw/pci-host/remote.h
57
block-obj-y += block.o blockjob.o
43
58
-block-obj-y += main-loop.o iohandler.o qemu-timer.o
44
Build and test automation
59
-block-obj-$(CONFIG_POSIX) += aio-posix.o
45
-------------------------
60
-block-obj-$(CONFIG_WIN32) += aio-win32.o
46
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
61
block-obj-y += block/
62
block-obj-y += qemu-io-cmds.o
63
block-obj-$(CONFIG_REPLICATION) += replication.o
64
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
65
index XXXXXXX..XXXXXXX 100644
66
--- a/stubs/Makefile.objs
67
+++ b/stubs/Makefile.objs
68
@@ -XXX,XX +XXX,XX @@ stub-obj-y += get-vm-name.o
69
stub-obj-y += iothread.o
70
stub-obj-y += iothread-lock.o
71
stub-obj-y += is-daemonized.o
72
+stub-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
73
stub-obj-y += machine-init-done.o
74
stub-obj-y += migr-blocker.o
75
stub-obj-y += monitor.o
76
diff --git a/tests/Makefile.include b/tests/Makefile.include
77
index XXXXXXX..XXXXXXX 100644
78
--- a/tests/Makefile.include
79
+++ b/tests/Makefile.include
80
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
81
check-unit-y += tests/test-iov$(EXESUF)
82
gcov-files-test-iov-y = util/iov.c
83
check-unit-y += tests/test-aio$(EXESUF)
84
+gcov-files-test-aio-y = util/async.c util/qemu-timer.o
85
+gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
86
+gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
87
check-unit-y += tests/test-throttle$(EXESUF)
88
gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
89
gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
90
@@ -XXX,XX +XXX,XX @@ tests/check-qjson$(EXESUF): tests/check-qjson.o $(test-util-obj-y)
91
tests/check-qom-interface$(EXESUF): tests/check-qom-interface.o $(test-qom-obj-y)
92
tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
93
94
-tests/test-char$(EXESUF): tests/test-char.o qemu-timer.o \
95
-    $(test-util-obj-y) $(qtest-obj-y) $(test-block-obj-y) $(chardev-obj-y)
96
+tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
97
tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
98
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
99
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
100
@@ -XXX,XX +XXX,XX @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
101
    migration/vmstate.o migration/qemu-file.o \
102
migration/qemu-file-channel.o migration/qjson.o \
103
    $(test-io-obj-y)
104
-tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
105
-    $(test-util-obj-y)
106
+tests/test-timed-average$(EXESUF): tests/test-timed-average.o $(test-util-obj-y)
107
tests/test-base64$(EXESUF): tests/test-base64.o \
108
    libqemuutil.a libqemustub.a
109
tests/ptimer-test$(EXESUF): tests/ptimer-test.o tests/ptimer-test-stubs.o hw/core/ptimer.o libqemustub.a
110
@@ -XXX,XX +XXX,XX @@ tests/usb-hcd-ehci-test$(EXESUF): tests/usb-hcd-ehci-test.o $(libqos-usb-obj-y)
111
tests/usb-hcd-xhci-test$(EXESUF): tests/usb-hcd-xhci-test.o $(libqos-usb-obj-y)
112
tests/pc-cpu-test$(EXESUF): tests/pc-cpu-test.o
113
tests/postcopy-test$(EXESUF): tests/postcopy-test.o
114
-tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o qemu-timer.o \
115
+tests/vhost-user-test$(EXESUF): tests/vhost-user-test.o $(test-util-obj-y) \
116
    $(qtest-obj-y) $(test-io-obj-y) $(libqos-virtio-obj-y) $(libqos-pc-obj-y) \
117
    $(chardev-obj-y)
118
tests/qemu-iotests/socket_scm_helper$(EXESUF): tests/qemu-iotests/socket_scm_helper.o
119
diff --git a/util/Makefile.objs b/util/Makefile.objs
120
index XXXXXXX..XXXXXXX 100644
121
--- a/util/Makefile.objs
122
+++ b/util/Makefile.objs
123
@@ -XXX,XX +XXX,XX @@
124
util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
125
util-obj-y += bufferiszero.o
126
util-obj-y += lockcnt.o
127
+util-obj-y += aiocb.o async.o thread-pool.o qemu-timer.o
128
+util-obj-y += main-loop.o iohandler.o
129
+util-obj-$(CONFIG_POSIX) += aio-posix.o
130
util-obj-$(CONFIG_POSIX) += compatfd.o
131
util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
132
util-obj-$(CONFIG_POSIX) += mmap-alloc.o
133
util-obj-$(CONFIG_POSIX) += oslib-posix.o
134
util-obj-$(CONFIG_POSIX) += qemu-openpty.o
135
util-obj-$(CONFIG_POSIX) += qemu-thread-posix.o
136
-util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
137
util-obj-$(CONFIG_POSIX) += memfd.o
138
+util-obj-$(CONFIG_WIN32) += aio-win32.o
139
+util-obj-$(CONFIG_WIN32) += event_notifier-win32.o
140
util-obj-$(CONFIG_WIN32) += oslib-win32.o
141
util-obj-$(CONFIG_WIN32) += qemu-thread-win32.o
142
util-obj-y += envlist.o path.o module.o
143
diff --git a/block/io.c b/block/io.c
144
index XXXXXXX..XXXXXXX 100644
145
--- a/block/io.c
146
+++ b/block/io.c
147
@@ -XXX,XX +XXX,XX @@ BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
148
return &acb->common;
149
}
150
151
-void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
152
- BlockCompletionFunc *cb, void *opaque)
153
-{
154
- BlockAIOCB *acb;
155
-
156
- acb = g_malloc(aiocb_info->aiocb_size);
157
- acb->aiocb_info = aiocb_info;
158
- acb->bs = bs;
159
- acb->cb = cb;
160
- acb->opaque = opaque;
161
- acb->refcnt = 1;
162
- return acb;
163
-}
164
-
165
-void qemu_aio_ref(void *p)
166
-{
167
- BlockAIOCB *acb = p;
168
- acb->refcnt++;
169
-}
170
-
171
-void qemu_aio_unref(void *p)
172
-{
173
- BlockAIOCB *acb = p;
174
- assert(acb->refcnt > 0);
175
- if (--acb->refcnt == 0) {
176
- g_free(acb);
177
- }
178
-}
179
-
180
/**************************************************************/
181
/* Coroutine block device emulation */
182
183
diff --git a/stubs/linux-aio.c b/stubs/linux-aio.c
184
new file mode 100644
47
new file mode 100644
185
index XXXXXXX..XXXXXXX
48
index XXXXXXX..XXXXXXX
186
--- /dev/null
49
--- /dev/null
187
+++ b/stubs/linux-aio.c
50
+++ b/include/hw/pci-host/remote.h
188
@@ -XXX,XX +XXX,XX @@
51
@@ -XXX,XX +XXX,XX @@
189
+/*
52
+/*
190
+ * Linux native AIO support.
53
+ * PCI Host for remote device
191
+ *
54
+ *
192
+ * Copyright (C) 2009 IBM, Corp.
55
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
193
+ * Copyright (C) 2009 Red Hat, Inc.
194
+ *
56
+ *
195
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
57
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
196
+ * See the COPYING file in the top-level directory.
58
+ * See the COPYING file in the top-level directory.
59
+ *
197
+ */
60
+ */
198
+#include "qemu/osdep.h"
199
+#include "block/aio.h"
200
+#include "block/raw-aio.h"
201
+
61
+
202
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
62
+#ifndef REMOTE_PCIHOST_H
203
+{
63
+#define REMOTE_PCIHOST_H
204
+ abort();
205
+}
206
+
64
+
207
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
65
+#include "exec/memory.h"
208
+{
66
+#include "hw/pci/pcie_host.h"
209
+ abort();
210
+}
211
+
67
+
212
+LinuxAioState *laio_init(void)
68
+#define TYPE_REMOTE_PCIHOST "remote-pcihost"
213
+{
69
+OBJECT_DECLARE_SIMPLE_TYPE(RemotePCIHost, REMOTE_PCIHOST)
214
+ abort();
215
+}
216
+
70
+
217
+void laio_cleanup(LinuxAioState *s)
71
+struct RemotePCIHost {
218
+{
72
+ /*< private >*/
219
+ abort();
73
+ PCIExpressHost parent_obj;
220
+}
74
+ /*< public >*/
221
diff --git a/stubs/set-fd-handler.c b/stubs/set-fd-handler.c
75
+
222
index XXXXXXX..XXXXXXX 100644
76
+ MemoryRegion *mr_pci_mem;
223
--- a/stubs/set-fd-handler.c
77
+ MemoryRegion *mr_sys_io;
224
+++ b/stubs/set-fd-handler.c
78
+};
225
@@ -XXX,XX +XXX,XX @@ void qemu_set_fd_handler(int fd,
79
+
226
{
80
+#endif
227
abort();
81
diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c
228
}
229
-
230
-void aio_set_fd_handler(AioContext *ctx,
231
- int fd,
232
- bool is_external,
233
- IOHandler *io_read,
234
- IOHandler *io_write,
235
- AioPollFn *io_poll,
236
- void *opaque)
237
-{
238
- abort();
239
-}
240
diff --git a/aio-posix.c b/util/aio-posix.c
241
similarity index 99%
242
rename from aio-posix.c
243
rename to util/aio-posix.c
244
index XXXXXXX..XXXXXXX 100644
245
--- a/aio-posix.c
246
+++ b/util/aio-posix.c
247
@@ -XXX,XX +XXX,XX @@
248
#include "qemu/rcu_queue.h"
249
#include "qemu/sockets.h"
250
#include "qemu/cutils.h"
251
-#include "trace-root.h"
252
+#include "trace.h"
253
#ifdef CONFIG_EPOLL_CREATE1
254
#include <sys/epoll.h>
255
#endif
256
diff --git a/aio-win32.c b/util/aio-win32.c
257
similarity index 100%
258
rename from aio-win32.c
259
rename to util/aio-win32.c
260
diff --git a/util/aiocb.c b/util/aiocb.c
261
new file mode 100644
82
new file mode 100644
262
index XXXXXXX..XXXXXXX
83
index XXXXXXX..XXXXXXX
263
--- /dev/null
84
--- /dev/null
264
+++ b/util/aiocb.c
85
+++ b/hw/pci-host/remote.c
265
@@ -XXX,XX +XXX,XX @@
86
@@ -XXX,XX +XXX,XX @@
266
+/*
87
+/*
267
+ * BlockAIOCB allocation
88
+ * Remote PCI host device
268
+ *
89
+ *
269
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
90
+ * Unlike PCI host devices that model physical hardware, the purpose
91
+ * of this PCI host is to host multi-process QEMU devices.
270
+ *
92
+ *
271
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
93
+ * Multi-process QEMU extends the PCI host of a QEMU machine into a
272
+ * of this software and associated documentation files (the "Software"), to deal
94
+ * remote process. Any PCI device attached to the remote process is
273
+ * in the Software without restriction, including without limitation the rights
95
+ * visible in the QEMU guest. This allows existing QEMU device models
274
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
96
+ * to be reused in the remote process.
275
+ * copies of the Software, and to permit persons to whom the Software is
276
+ * furnished to do so, subject to the following conditions:
277
+ *
97
+ *
278
+ * The above copyright notice and this permission notice shall be included in
98
+ * This PCI host is purely a container for PCI devices. It's fake in the
279
+ * all copies or substantial portions of the Software.
99
+ * sense that the guest never sees this PCI host and has no way of
100
+ * accessing it. Its job is just to provide the environment that QEMU
101
+ * PCI device models need when running in a remote process.
280
+ *
102
+ *
281
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
103
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
282
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
104
+ *
283
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
105
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
284
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
106
+ * See the COPYING file in the top-level directory.
285
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
107
+ *
286
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
287
+ * THE SOFTWARE.
288
+ */
108
+ */
289
+
109
+
290
+#include "qemu/osdep.h"
110
+#include "qemu/osdep.h"
291
+#include "block/aio.h"
111
+#include "qemu-common.h"
292
+
112
+
293
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
113
+#include "hw/pci/pci.h"
294
+ BlockCompletionFunc *cb, void *opaque)
114
+#include "hw/pci/pci_host.h"
115
+#include "hw/pci/pcie_host.h"
116
+#include "hw/qdev-properties.h"
117
+#include "hw/pci-host/remote.h"
118
+#include "exec/memory.h"
119
+
120
+static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
121
+ PCIBus *rootbus)
295
+{
122
+{
296
+ BlockAIOCB *acb;
123
+ return "0000:00";
297
+
298
+ acb = g_malloc(aiocb_info->aiocb_size);
299
+ acb->aiocb_info = aiocb_info;
300
+ acb->bs = bs;
301
+ acb->cb = cb;
302
+ acb->opaque = opaque;
303
+ acb->refcnt = 1;
304
+ return acb;
305
+}
124
+}
306
+
125
+
307
+void qemu_aio_ref(void *p)
126
+static void remote_pcihost_realize(DeviceState *dev, Error **errp)
308
+{
127
+{
309
+ BlockAIOCB *acb = p;
128
+ PCIHostState *pci = PCI_HOST_BRIDGE(dev);
310
+ acb->refcnt++;
129
+ RemotePCIHost *s = REMOTE_PCIHOST(dev);
130
+
131
+ pci->bus = pci_root_bus_new(DEVICE(s), "remote-pci",
132
+ s->mr_pci_mem, s->mr_sys_io,
133
+ 0, TYPE_PCIE_BUS);
311
+}
134
+}
312
+
135
+
313
+void qemu_aio_unref(void *p)
136
+static void remote_pcihost_class_init(ObjectClass *klass, void *data)
314
+{
137
+{
315
+ BlockAIOCB *acb = p;
138
+ DeviceClass *dc = DEVICE_CLASS(klass);
316
+ assert(acb->refcnt > 0);
139
+ PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
317
+ if (--acb->refcnt == 0) {
140
+
318
+ g_free(acb);
141
+ hc->root_bus_path = remote_pcihost_root_bus_path;
319
+ }
142
+ dc->realize = remote_pcihost_realize;
143
+
144
+ dc->user_creatable = false;
145
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
146
+ dc->fw_name = "pci";
320
+}
147
+}
321
diff --git a/async.c b/util/async.c
148
+
322
similarity index 99%
149
+static const TypeInfo remote_pcihost_info = {
323
rename from async.c
150
+ .name = TYPE_REMOTE_PCIHOST,
324
rename to util/async.c
151
+ .parent = TYPE_PCIE_HOST_BRIDGE,
152
+ .instance_size = sizeof(RemotePCIHost),
153
+ .class_init = remote_pcihost_class_init,
154
+};
155
+
156
+static void remote_pcihost_register(void)
157
+{
158
+ type_register_static(&remote_pcihost_info);
159
+}
160
+
161
+type_init(remote_pcihost_register)
162
diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig
325
index XXXXXXX..XXXXXXX 100644
163
index XXXXXXX..XXXXXXX 100644
326
--- a/async.c
164
--- a/hw/pci-host/Kconfig
327
+++ b/util/async.c
165
+++ b/hw/pci-host/Kconfig
166
@@ -XXX,XX +XXX,XX @@ config PCI_POWERNV
167
select PCI_EXPRESS
168
select MSI_NONBROKEN
169
select PCIE_PORT
170
+
171
+config REMOTE_PCIHOST
172
+ bool
173
diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build
174
index XXXXXXX..XXXXXXX 100644
175
--- a/hw/pci-host/meson.build
176
+++ b/hw/pci-host/meson.build
177
@@ -XXX,XX +XXX,XX @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c'))
178
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
179
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
180
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
181
+pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c'))
182
183
# PPC devices
184
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
185
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
186
index XXXXXXX..XXXXXXX 100644
187
--- a/hw/remote/Kconfig
188
+++ b/hw/remote/Kconfig
328
@@ -XXX,XX +XXX,XX @@
189
@@ -XXX,XX +XXX,XX @@
329
/*
190
config MULTIPROCESS
330
- * QEMU System Emulator
191
bool
331
+ * Data plane event loop
192
- depends on PCI && KVM
332
*
193
+ depends on PCI && PCI_EXPRESS && KVM
333
* Copyright (c) 2003-2008 Fabrice Bellard
194
+ select REMOTE_PCIHOST
334
+ * Copyright (c) 2009-2017 QEMU contributors
335
*
336
* Permission is hereby granted, free of charge, to any person obtaining a copy
337
* of this software and associated documentation files (the "Software"), to deal
338
diff --git a/iohandler.c b/util/iohandler.c
339
similarity index 100%
340
rename from iohandler.c
341
rename to util/iohandler.c
342
diff --git a/main-loop.c b/util/main-loop.c
343
similarity index 100%
344
rename from main-loop.c
345
rename to util/main-loop.c
346
diff --git a/qemu-timer.c b/util/qemu-timer.c
347
similarity index 100%
348
rename from qemu-timer.c
349
rename to util/qemu-timer.c
350
diff --git a/thread-pool.c b/util/thread-pool.c
351
similarity index 99%
352
rename from thread-pool.c
353
rename to util/thread-pool.c
354
index XXXXXXX..XXXXXXX 100644
355
--- a/thread-pool.c
356
+++ b/util/thread-pool.c
357
@@ -XXX,XX +XXX,XX @@
358
#include "qemu/queue.h"
359
#include "qemu/thread.h"
360
#include "qemu/coroutine.h"
361
-#include "trace-root.h"
362
+#include "trace.h"
363
#include "block/thread-pool.h"
364
#include "qemu/main-loop.h"
365
366
diff --git a/trace-events b/trace-events
367
index XXXXXXX..XXXXXXX 100644
368
--- a/trace-events
369
+++ b/trace-events
370
@@ -XXX,XX +XXX,XX @@
371
#
372
# The <format-string> should be a sprintf()-compatible format string.
373
374
-# aio-posix.c
375
-run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
376
-run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
377
-poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
378
-poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
379
-
380
-# thread-pool.c
381
-thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
382
-thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
383
-thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
384
-
385
# ioport.c
386
cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
387
cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
388
diff --git a/util/trace-events b/util/trace-events
389
index XXXXXXX..XXXXXXX 100644
390
--- a/util/trace-events
391
+++ b/util/trace-events
392
@@ -XXX,XX +XXX,XX @@
393
# See docs/tracing.txt for syntax documentation.
394
395
+# util/aio-posix.c
396
+run_poll_handlers_begin(void *ctx, int64_t max_ns) "ctx %p max_ns %"PRId64
397
+run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
398
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
399
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
400
+
401
+# util/thread-pool.c
402
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
403
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
404
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
405
+
406
# util/buffer.c
407
buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
408
buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
409
--
2.9.3

--
2.29.2
From: Jagannathan Raman <jag.raman@oracle.com>

The x-remote-machine object sets up the various subsystems of the remote
device process: it instantiates the PCI host bridge object and initializes
the RAM, IO and PCI memory regions.

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: c537f38d17f90453ca610c6b70cf3480274e0ba1.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                  |  2 ++
 include/hw/pci-host/remote.h |  1 +
 include/hw/remote/machine.h  | 27 ++++++++++++++
 hw/remote/machine.c          | 70 ++++++++++++++++++++++++++++++++++++
 hw/meson.build               |  1 +
 hw/remote/meson.build        |  5 +++
 6 files changed, 106 insertions(+)
 create mode 100644 include/hw/remote/machine.h
 create mode 100644 hw/remote/machine.c
 create mode 100644 hw/remote/meson.build

From: Paolo Bonzini <pbonzini@redhat.com>

This covers both file descriptor callbacks and polling callbacks,
since they execute related code.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-14-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/curl.c          | 16 +++++++++++++---
 block/iscsi.c         |  4 ++++
 block/linux-aio.c     |  4 ++++
 block/nfs.c           |  6 ++++++
 block/sheepdog.c      | 29 +++++++++++++++--------------
 block/ssh.c           | 29 +++++++++--------------------
 block/win32-aio.c     | 10 ++++++----
 hw/block/virtio-blk.c |  5 ++++-
 hw/scsi/virtio-scsi.c |  7 +++++++
 util/aio-posix.c      |  7 -------
 util/aio-win32.c      |  6 ------
 11 files changed, 68 insertions(+), 55 deletions(-)
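
[This patch moves the AioContext lock out of the dispatch loop and into
each callback that still needs it, as the curl/iscsi/nfs hunks below show.
A hedged sketch of the resulting convention -- ExampleDriverState and the
handler are invented, not taken from the patch:]

    #include "qemu/osdep.h"
    #include "block/aio.h"

    typedef struct ExampleDriverState {
        AioContext *aio_context;
        /* ... driver state protected by aio_context ... */
    } ExampleDriverState;

    /* fd read handler: aio_dispatch()/aio_poll() no longer wrap handlers
     * in acquire/release, so the handler takes the lock itself. */
    static void example_process_read(void *opaque)
    {
        ExampleDriverState *s = opaque;

        aio_context_acquire(s->aio_context);
        /* service the socket / completions while holding the lock */
        aio_context_release(s->aio_context);
    }
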
diff --git a/block/curl.c b/block/curl.c
25
diff --git a/MAINTAINERS b/MAINTAINERS
27
index XXXXXXX..XXXXXXX 100644
26
index XXXXXXX..XXXXXXX 100644
28
--- a/block/curl.c
27
--- a/MAINTAINERS
29
+++ b/block/curl.c
28
+++ b/MAINTAINERS
30
@@ -XXX,XX +XXX,XX @@ static void curl_multi_check_completion(BDRVCURLState *s)
29
@@ -XXX,XX +XXX,XX @@ F: docs/devel/multi-process.rst
31
}
30
F: docs/system/multi-process.rst
32
}
31
F: hw/pci-host/remote.c
33
32
F: include/hw/pci-host/remote.h
34
-static void curl_multi_do(void *arg)
33
+F: hw/remote/machine.c
35
+static void curl_multi_do_locked(CURLState *s)
34
+F: include/hw/remote/machine.h
36
{
35
37
- CURLState *s = (CURLState *)arg;
36
Build and test automation
38
CURLSocket *socket, *next_socket;
37
-------------------------
39
int running;
38
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
40
int r;
39
index XXXXXXX..XXXXXXX 100644
41
@@ -XXX,XX +XXX,XX @@ static void curl_multi_do(void *arg)
40
--- a/include/hw/pci-host/remote.h
42
}
41
+++ b/include/hw/pci-host/remote.h
43
}
42
@@ -XXX,XX +XXX,XX @@ struct RemotePCIHost {
44
43
45
+static void curl_multi_do(void *arg)
44
MemoryRegion *mr_pci_mem;
45
MemoryRegion *mr_sys_io;
46
+ MemoryRegion *mr_sys_mem;
47
};
48
49
#endif
50
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
51
new file mode 100644
52
index XXXXXXX..XXXXXXX
53
--- /dev/null
54
+++ b/include/hw/remote/machine.h
55
@@ -XXX,XX +XXX,XX @@
56
+/*
57
+ * Remote machine configuration
58
+ *
59
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
60
+ *
61
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
62
+ * See the COPYING file in the top-level directory.
63
+ *
64
+ */
65
+
66
+#ifndef REMOTE_MACHINE_H
67
+#define REMOTE_MACHINE_H
68
+
69
+#include "qom/object.h"
70
+#include "hw/boards.h"
71
+#include "hw/pci-host/remote.h"
72
+
73
+struct RemoteMachineState {
74
+ MachineState parent_obj;
75
+
76
+ RemotePCIHost *host;
77
+};
78
+
79
+#define TYPE_REMOTE_MACHINE "x-remote-machine"
80
+OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
81
+
82
+#endif
83
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
84
new file mode 100644
85
index XXXXXXX..XXXXXXX
86
--- /dev/null
87
+++ b/hw/remote/machine.c
88
@@ -XXX,XX +XXX,XX @@
89
+/*
90
+ * Machine for remote device
91
+ *
92
+ * This machine type is used by the remote device process in multi-process
93
+ * QEMU. QEMU device models depend on parent busses, interrupt controllers,
94
+ * memory regions, etc. The remote machine type offers this environment so
95
+ * that QEMU device models can be used as remote devices.
96
+ *
97
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
98
+ *
99
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
100
+ * See the COPYING file in the top-level directory.
101
+ *
102
+ */
103
+
104
+#include "qemu/osdep.h"
105
+#include "qemu-common.h"
106
+
107
+#include "hw/remote/machine.h"
108
+#include "exec/address-spaces.h"
109
+#include "exec/memory.h"
110
+#include "qapi/error.h"
111
+
112
+static void remote_machine_init(MachineState *machine)
46
+{
113
+{
47
+ CURLState *s = (CURLState *)arg;
114
+ MemoryRegion *system_memory, *system_io, *pci_memory;
115
+ RemoteMachineState *s = REMOTE_MACHINE(machine);
116
+ RemotePCIHost *rem_host;
48
+
117
+
49
+ aio_context_acquire(s->s->aio_context);
118
+ system_memory = get_system_memory();
50
+ curl_multi_do_locked(s);
119
+ system_io = get_system_io();
51
+ aio_context_release(s->s->aio_context);
120
+
121
+ pci_memory = g_new(MemoryRegion, 1);
122
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
123
+
124
+ rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
125
+
126
+ rem_host->mr_pci_mem = pci_memory;
127
+ rem_host->mr_sys_mem = system_memory;
128
+ rem_host->mr_sys_io = system_io;
129
+
130
+ s->host = rem_host;
131
+
132
+ object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host));
133
+ memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);
134
+
135
+ qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
52
+}
136
+}
53
+
137
+
54
static void curl_multi_read(void *arg)
138
+static void remote_machine_class_init(ObjectClass *oc, void *data)
55
{
139
+{
56
CURLState *s = (CURLState *)arg;
140
+ MachineClass *mc = MACHINE_CLASS(oc);
57
58
- curl_multi_do(arg);
59
+ aio_context_acquire(s->s->aio_context);
60
+ curl_multi_do_locked(s);
61
curl_multi_check_completion(s->s);
62
+ aio_context_release(s->s->aio_context);
63
}
64
65
static void curl_multi_timeout_do(void *arg)
66
diff --git a/block/iscsi.c b/block/iscsi.c
67
index XXXXXXX..XXXXXXX 100644
68
--- a/block/iscsi.c
69
+++ b/block/iscsi.c
70
@@ -XXX,XX +XXX,XX @@ iscsi_process_read(void *arg)
71
IscsiLun *iscsilun = arg;
72
struct iscsi_context *iscsi = iscsilun->iscsi;
73
74
+ aio_context_acquire(iscsilun->aio_context);
75
iscsi_service(iscsi, POLLIN);
76
iscsi_set_events(iscsilun);
77
+ aio_context_release(iscsilun->aio_context);
78
}
79
80
static void
81
@@ -XXX,XX +XXX,XX @@ iscsi_process_write(void *arg)
82
IscsiLun *iscsilun = arg;
83
struct iscsi_context *iscsi = iscsilun->iscsi;
84
85
+ aio_context_acquire(iscsilun->aio_context);
86
iscsi_service(iscsi, POLLOUT);
87
iscsi_set_events(iscsilun);
88
+ aio_context_release(iscsilun->aio_context);
89
}
90
91
static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun)
92
diff --git a/block/linux-aio.c b/block/linux-aio.c
93
index XXXXXXX..XXXXXXX 100644
94
--- a/block/linux-aio.c
95
+++ b/block/linux-aio.c
96
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
97
LinuxAioState *s = container_of(e, LinuxAioState, e);
98
99
if (event_notifier_test_and_clear(&s->e)) {
100
+ aio_context_acquire(s->aio_context);
101
qemu_laio_process_completions_and_submit(s);
102
+ aio_context_release(s->aio_context);
103
}
104
}
105
106
@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
107
return false;
108
}
109
110
+ aio_context_acquire(s->aio_context);
111
qemu_laio_process_completions_and_submit(s);
112
+ aio_context_release(s->aio_context);
113
return true;
114
}
115
116
diff --git a/block/nfs.c b/block/nfs.c
117
index XXXXXXX..XXXXXXX 100644
118
--- a/block/nfs.c
119
+++ b/block/nfs.c
120
@@ -XXX,XX +XXX,XX @@ static void nfs_set_events(NFSClient *client)
121
static void nfs_process_read(void *arg)
122
{
123
NFSClient *client = arg;
124
+
141
+
125
+ aio_context_acquire(client->aio_context);
142
+ mc->init = remote_machine_init;
126
nfs_service(client->context, POLLIN);
143
+ mc->desc = "Experimental remote machine";
127
nfs_set_events(client);
128
+ aio_context_release(client->aio_context);
129
}
130
131
static void nfs_process_write(void *arg)
132
{
133
NFSClient *client = arg;
134
+
135
+ aio_context_acquire(client->aio_context);
136
nfs_service(client->context, POLLOUT);
137
nfs_set_events(client);
138
+ aio_context_release(client->aio_context);
139
}
140
141
static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
142
diff --git a/block/sheepdog.c b/block/sheepdog.c
143
index XXXXXXX..XXXXXXX 100644
144
--- a/block/sheepdog.c
145
+++ b/block/sheepdog.c
146
@@ -XXX,XX +XXX,XX @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data,
147
return ret;
148
}
149
150
-static void restart_co_req(void *opaque)
151
-{
152
- Coroutine *co = opaque;
153
-
154
- qemu_coroutine_enter(co);
155
-}
156
-
157
typedef struct SheepdogReqCo {
158
int sockfd;
159
BlockDriverState *bs;
160
@@ -XXX,XX +XXX,XX @@ typedef struct SheepdogReqCo {
161
unsigned int *rlen;
162
int ret;
163
bool finished;
164
+ Coroutine *co;
165
} SheepdogReqCo;
166
167
+static void restart_co_req(void *opaque)
168
+{
169
+ SheepdogReqCo *srco = opaque;
170
+
171
+ aio_co_wake(srco->co);
172
+}
144
+}
173
+
145
+
174
static coroutine_fn void do_co_req(void *opaque)
146
+static const TypeInfo remote_machine = {
175
{
147
+ .name = TYPE_REMOTE_MACHINE,
176
int ret;
148
+ .parent = TYPE_MACHINE,
177
- Coroutine *co;
149
+ .instance_size = sizeof(RemoteMachineState),
178
SheepdogReqCo *srco = opaque;
150
+ .class_init = remote_machine_class_init,
179
int sockfd = srco->sockfd;
151
+};
180
SheepdogReq *hdr = srco->hdr;
152
+
181
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
153
+static void remote_machine_register_types(void)
182
unsigned int *wlen = srco->wlen;
154
+{
183
unsigned int *rlen = srco->rlen;
155
+ type_register_static(&remote_machine);
184
156
+}
185
- co = qemu_coroutine_self();
157
+
186
+ srco->co = qemu_coroutine_self();
158
+type_init(remote_machine_register_types);
187
aio_set_fd_handler(srco->aio_context, sockfd, false,
159
diff --git a/hw/meson.build b/hw/meson.build
188
- NULL, restart_co_req, NULL, co);
189
+ NULL, restart_co_req, NULL, srco);
190
191
ret = send_co_req(sockfd, hdr, data, wlen);
192
if (ret < 0) {
193
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void do_co_req(void *opaque)
194
}
195
196
aio_set_fd_handler(srco->aio_context, sockfd, false,
197
- restart_co_req, NULL, NULL, co);
198
+ restart_co_req, NULL, NULL, srco);
199
200
ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr));
201
if (ret != sizeof(*hdr)) {
202
@@ -XXX,XX +XXX,XX @@ out:
203
aio_set_fd_handler(srco->aio_context, sockfd, false,
204
NULL, NULL, NULL, NULL);
205
206
+ srco->co = NULL;
207
srco->ret = ret;
208
srco->finished = true;
209
if (srco->bs) {
210
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn aio_read_response(void *opaque)
211
* We've finished all requests which belong to the AIOCB, so
212
* we can switch back to sd_co_readv/writev now.
213
*/
214
- qemu_coroutine_enter(acb->coroutine);
215
+ aio_co_wake(acb->coroutine);
216
}
217
218
return;
219
@@ -XXX,XX +XXX,XX @@ static void co_read_response(void *opaque)
220
s->co_recv = qemu_coroutine_create(aio_read_response, opaque);
221
}
222
223
- qemu_coroutine_enter(s->co_recv);
224
+ aio_co_wake(s->co_recv);
225
}
226
227
static void co_write_request(void *opaque)
228
{
229
BDRVSheepdogState *s = opaque;
230
231
- qemu_coroutine_enter(s->co_send);
232
+ aio_co_wake(s->co_send);
233
}
234
235
/*
236
diff --git a/block/ssh.c b/block/ssh.c
237
index XXXXXXX..XXXXXXX 100644
160
index XXXXXXX..XXXXXXX 100644
238
--- a/block/ssh.c
161
--- a/hw/meson.build
239
+++ b/block/ssh.c
162
+++ b/hw/meson.build
240
@@ -XXX,XX +XXX,XX @@ static void restart_coroutine(void *opaque)
163
@@ -XXX,XX +XXX,XX @@ subdir('moxie')
241
164
subdir('nios2')
242
DPRINTF("co=%p", co);
165
subdir('openrisc')
243
166
subdir('ppc')
244
- qemu_coroutine_enter(co);
167
+subdir('remote')
245
+ aio_co_wake(co);
168
subdir('riscv')
246
}
169
subdir('rx')
247
170
subdir('s390x')
248
-static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
171
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
249
+/* A non-blocking call returned EAGAIN, so yield, ensuring the
172
new file mode 100644
250
+ * handlers are set up so that we'll be rescheduled when there is an
173
index XXXXXXX..XXXXXXX
251
+ * interesting event on the socket.
174
--- /dev/null
252
+ */
175
+++ b/hw/remote/meson.build
253
+static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
176
@@ -XXX,XX +XXX,XX @@
254
{
177
+remote_ss = ss.source_set()
255
int r;
178
+
256
IOHandler *rd_handler = NULL, *wr_handler = NULL;
179
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
257
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs)
180
+
258
181
+softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
259
aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
260
false, rd_handler, wr_handler, NULL, co);
261
-}
262
-
263
-static coroutine_fn void clear_fd_handler(BDRVSSHState *s,
264
- BlockDriverState *bs)
265
-{
266
- DPRINTF("s->sock=%d", s->sock);
267
- aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock,
268
- false, NULL, NULL, NULL, NULL);
269
-}
270
-
271
-/* A non-blocking call returned EAGAIN, so yield, ensuring the
272
- * handlers are set up so that we'll be rescheduled when there is an
273
- * interesting event on the socket.
274
- */
275
-static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs)
276
-{
277
- set_fd_handler(s, bs);
278
qemu_coroutine_yield();
279
- clear_fd_handler(s, bs);
280
+ DPRINTF("s->sock=%d - back", s->sock);
281
+ aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, false,
282
+ NULL, NULL, NULL, NULL);
283
}
284
285
/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position
286
diff --git a/block/win32-aio.c b/block/win32-aio.c
287
index XXXXXXX..XXXXXXX 100644
288
--- a/block/win32-aio.c
289
+++ b/block/win32-aio.c
290
@@ -XXX,XX +XXX,XX @@ struct QEMUWin32AIOState {
291
HANDLE hIOCP;
292
EventNotifier e;
293
int count;
294
- bool is_aio_context_attached;
295
+ AioContext *aio_ctx;
296
};
297
298
typedef struct QEMUWin32AIOCB {
299
@@ -XXX,XX +XXX,XX @@ static void win32_aio_process_completion(QEMUWin32AIOState *s,
300
}
301
302
303
+ aio_context_acquire(s->aio_ctx);
304
waiocb->common.cb(waiocb->common.opaque, ret);
305
+ aio_context_release(s->aio_ctx);
306
qemu_aio_unref(waiocb);
307
}
308
309
@@ -XXX,XX +XXX,XX @@ void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
310
AioContext *old_context)
311
{
312
aio_set_event_notifier(old_context, &aio->e, false, NULL, NULL);
313
- aio->is_aio_context_attached = false;
314
+ aio->aio_ctx = NULL;
315
}
316
317
void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
318
AioContext *new_context)
319
{
320
- aio->is_aio_context_attached = true;
321
+ aio->aio_ctx = new_context;
322
aio_set_event_notifier(new_context, &aio->e, false,
323
win32_aio_completion_cb, NULL);
324
}
325
@@ -XXX,XX +XXX,XX @@ out_free_state:
326
327
void win32_aio_cleanup(QEMUWin32AIOState *aio)
328
{
329
- assert(!aio->is_aio_context_attached);
330
+ assert(!aio->aio_ctx);
331
CloseHandle(aio->hIOCP);
332
event_notifier_cleanup(&aio->e);
333
g_free(aio);
334
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
335
index XXXXXXX..XXXXXXX 100644
336
--- a/hw/block/virtio-blk.c
337
+++ b/hw/block/virtio-blk.c
338
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_ioctl_complete(void *opaque, int status)
339
{
340
VirtIOBlockIoctlReq *ioctl_req = opaque;
341
VirtIOBlockReq *req = ioctl_req->req;
342
- VirtIODevice *vdev = VIRTIO_DEVICE(req->dev);
343
+ VirtIOBlock *s = req->dev;
344
+ VirtIODevice *vdev = VIRTIO_DEVICE(s);
345
struct virtio_scsi_inhdr *scsi;
346
struct sg_io_hdr *hdr;
347
348
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
349
MultiReqBuffer mrb = {};
350
bool progress = false;
351
352
+ aio_context_acquire(blk_get_aio_context(s->blk));
353
blk_io_plug(s->blk);
354
355
do {
356
@@ -XXX,XX +XXX,XX @@ bool virtio_blk_handle_vq(VirtIOBlock *s, VirtQueue *vq)
357
}
358
359
blk_io_unplug(s->blk);
360
+ aio_context_release(blk_get_aio_context(s->blk));
361
return progress;
362
}
363
364
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
365
index XXXXXXX..XXXXXXX 100644
366
--- a/hw/scsi/virtio-scsi.c
367
+++ b/hw/scsi/virtio-scsi.c
368
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
369
VirtIOSCSIReq *req;
370
bool progress = false;
371
372
+ virtio_scsi_acquire(s);
373
while ((req = virtio_scsi_pop_req(s, vq))) {
374
progress = true;
375
virtio_scsi_handle_ctrl_req(s, req);
376
}
377
+ virtio_scsi_release(s);
378
return progress;
379
}
380
381
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
382
383
QTAILQ_HEAD(, VirtIOSCSIReq) reqs = QTAILQ_HEAD_INITIALIZER(reqs);
384
385
+ virtio_scsi_acquire(s);
386
do {
387
virtio_queue_set_notification(vq, 0);
388
389
@@ -XXX,XX +XXX,XX @@ bool virtio_scsi_handle_cmd_vq(VirtIOSCSI *s, VirtQueue *vq)
390
QTAILQ_FOREACH_SAFE(req, &reqs, next, next) {
391
virtio_scsi_handle_cmd_req_submit(s, req);
392
}
393
+ virtio_scsi_release(s);
394
return progress;
395
}
396
397
@@ -XXX,XX +XXX,XX @@ out:
398
399
bool virtio_scsi_handle_event_vq(VirtIOSCSI *s, VirtQueue *vq)
400
{
401
+ virtio_scsi_acquire(s);
402
if (s->events_dropped) {
403
virtio_scsi_push_event(s, NULL, VIRTIO_SCSI_T_NO_EVENT, 0);
404
+ virtio_scsi_release(s);
405
return true;
406
}
407
+ virtio_scsi_release(s);
408
return false;
409
}
410
411
diff --git a/util/aio-posix.c b/util/aio-posix.c
412
index XXXXXXX..XXXXXXX 100644
413
--- a/util/aio-posix.c
414
+++ b/util/aio-posix.c
415
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
416
(revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
417
aio_node_check(ctx, node->is_external) &&
418
node->io_read) {
419
- aio_context_acquire(ctx);
420
node->io_read(node->opaque);
421
- aio_context_release(ctx);
422
423
/* aio_notify() does not count as progress */
424
if (node->opaque != &ctx->notifier) {
425
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
426
(revents & (G_IO_OUT | G_IO_ERR)) &&
427
aio_node_check(ctx, node->is_external) &&
428
node->io_write) {
429
- aio_context_acquire(ctx);
430
node->io_write(node->opaque);
431
- aio_context_release(ctx);
432
progress = true;
433
}
434
435
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
436
start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
437
}
438
439
- aio_context_acquire(ctx);
440
progress = try_poll_mode(ctx, blocking);
441
- aio_context_release(ctx);
442
-
443
if (!progress) {
444
assert(npfd == 0);
445
446
diff --git a/util/aio-win32.c b/util/aio-win32.c
447
index XXXXXXX..XXXXXXX 100644
448
--- a/util/aio-win32.c
449
+++ b/util/aio-win32.c
450
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
451
(revents || event_notifier_get_handle(node->e) == event) &&
452
node->io_notify) {
453
node->pfd.revents = 0;
454
- aio_context_acquire(ctx);
455
node->io_notify(node->e);
456
- aio_context_release(ctx);
457
458
/* aio_notify() does not count as progress */
459
if (node->e != &ctx->notifier) {
460
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
461
(node->io_read || node->io_write)) {
462
node->pfd.revents = 0;
463
if ((revents & G_IO_IN) && node->io_read) {
464
- aio_context_acquire(ctx);
465
node->io_read(node->opaque);
466
- aio_context_release(ctx);
467
progress = true;
468
}
469
if ((revents & G_IO_OUT) && node->io_write) {
470
- aio_context_acquire(ctx);
471
node->io_write(node->opaque);
472
- aio_context_release(ctx);
473
progress = true;
474
}
475
476
--
182
--
477
2.9.3
183
2.29.2
478
184
479
diff view generated by jsdifflib
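
To illustrate the pattern the patch above introduces: once aio_dispatch_handlers() stops wrapping every handler in aio_context_acquire()/release(), each callback that touches block-layer state takes the lock explicitly. A minimal sketch, not taken from the series; MyDevice and process_requests() are hypothetical:

    static void my_device_notify_cb(void *opaque)
    {
        MyDevice *d = opaque;                       /* hypothetical device */
        AioContext *ctx = blk_get_aio_context(d->blk);

        /* dispatch no longer holds the AioContext lock for us,
         * so acquire it around anything that touches BDS state */
        aio_context_acquire(ctx);
        process_requests(d);                        /* hypothetical helper */
        aio_context_release(ctx);
    }
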
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
2
3
This is in preparation for making qio_channel_yield work on
3
Adds qio_channel_writev_full_all() to transmit both data and FDs.
4
AioContexts other than the main one.
4
Refactors existing code to use this helper.
5
5
6
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
7
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
9
Reviewed-by: Fam Zheng <famz@redhat.com>
11
Message-id: 480fbf1fe4152495d60596c9b665124549b426a5.1611938319.git.jag.raman@oracle.com
10
Message-id: 20170213135235.12274-6-pbonzini@redhat.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
---
13
include/io/channel.h | 25 +++++++++++++++++++++++++
14
include/io/channel.h | 25 +++++++++++++++++++++++++
14
io/channel-command.c | 13 +++++++++++++
15
io/channel.c | 15 ++++++++++++++-
15
io/channel-file.c | 11 +++++++++++
16
2 files changed, 39 insertions(+), 1 deletion(-)
16
io/channel-socket.c | 16 +++++++++++-----
17
io/channel-tls.c | 12 ++++++++++++
18
io/channel-watch.c | 6 ++++++
19
io/channel.c | 11 +++++++++++
20
7 files changed, 89 insertions(+), 5 deletions(-)
21
17
22
diff --git a/include/io/channel.h b/include/io/channel.h
18
diff --git a/include/io/channel.h b/include/io/channel.h
23
index XXXXXXX..XXXXXXX 100644
19
index XXXXXXX..XXXXXXX 100644
24
--- a/include/io/channel.h
20
--- a/include/io/channel.h
25
+++ b/include/io/channel.h
21
+++ b/include/io/channel.h
26
@@ -XXX,XX +XXX,XX @@
22
@@ -XXX,XX +XXX,XX @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
27
23
IOHandler *io_write,
28
#include "qemu-common.h"
24
void *opaque);
29
#include "qom/object.h"
30
+#include "block/aio.h"
31
32
#define TYPE_QIO_CHANNEL "qio-channel"
33
#define QIO_CHANNEL(obj) \
34
@@ -XXX,XX +XXX,XX @@ struct QIOChannelClass {
35
off_t offset,
36
int whence,
37
Error **errp);
38
+ void (*io_set_aio_fd_handler)(QIOChannel *ioc,
39
+ AioContext *ctx,
40
+ IOHandler *io_read,
41
+ IOHandler *io_write,
42
+ void *opaque);
43
};
44
45
/* General I/O handling functions */
46
@@ -XXX,XX +XXX,XX @@ void qio_channel_yield(QIOChannel *ioc,
47
void qio_channel_wait(QIOChannel *ioc,
48
GIOCondition condition);
49
25
50
+/**
26
+/**
51
+ * qio_channel_set_aio_fd_handler:
27
+ * qio_channel_writev_full_all:
52
+ * @ioc: the channel object
28
+ * @ioc: the channel object
53
+ * @ctx: the AioContext to set the handlers on
29
+ * @iov: the array of memory regions to write data from
54
+ * @io_read: the read handler
30
+ * @niov: the length of the @iov array
55
+ * @io_write: the write handler
31
+ * @fds: an array of file handles to send
56
+ * @opaque: the opaque value passed to the handler
32
+ * @nfds: number of file handles in @fds
33
+ * @errp: pointer to a NULL-initialized error object
57
+ *
34
+ *
58
+ * This is used internally by qio_channel_yield(). It can
35
+ *
59
+ * be used by channel implementations to forward the handlers
36
+ * Behaves like qio_channel_writev_full but will attempt
60
+ * to another channel (e.g. from #QIOChannelTLS to the
37
+ * to send all data passed (file handles and memory regions).
61
+ * underlying socket).
38
+ * The function will wait for all requested data
39
+ * to be written, yielding from the current coroutine
40
+ * if required.
41
+ *
42
+ * Returns: 0 if all bytes were written, or -1 on error
62
+ */
43
+ */
63
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
44
+
64
+ AioContext *ctx,
45
+int qio_channel_writev_full_all(QIOChannel *ioc,
65
+ IOHandler *io_read,
46
+ const struct iovec *iov,
66
+ IOHandler *io_write,
47
+ size_t niov,
67
+ void *opaque);
48
+ int *fds, size_t nfds,
49
+ Error **errp);
68
+
50
+
69
#endif /* QIO_CHANNEL_H */
51
#endif /* QIO_CHANNEL_H */
70
diff --git a/io/channel-command.c b/io/channel-command.c
71
index XXXXXXX..XXXXXXX 100644
72
--- a/io/channel-command.c
73
+++ b/io/channel-command.c
74
@@ -XXX,XX +XXX,XX @@ static int qio_channel_command_close(QIOChannel *ioc,
75
}
76
77
78
+static void qio_channel_command_set_aio_fd_handler(QIOChannel *ioc,
79
+ AioContext *ctx,
80
+ IOHandler *io_read,
81
+ IOHandler *io_write,
82
+ void *opaque)
83
+{
84
+ QIOChannelCommand *cioc = QIO_CHANNEL_COMMAND(ioc);
85
+ aio_set_fd_handler(ctx, cioc->readfd, false, io_read, NULL, NULL, opaque);
86
+ aio_set_fd_handler(ctx, cioc->writefd, false, NULL, io_write, NULL, opaque);
87
+}
88
+
89
+
90
static GSource *qio_channel_command_create_watch(QIOChannel *ioc,
91
GIOCondition condition)
92
{
93
@@ -XXX,XX +XXX,XX @@ static void qio_channel_command_class_init(ObjectClass *klass,
94
ioc_klass->io_set_blocking = qio_channel_command_set_blocking;
95
ioc_klass->io_close = qio_channel_command_close;
96
ioc_klass->io_create_watch = qio_channel_command_create_watch;
97
+ ioc_klass->io_set_aio_fd_handler = qio_channel_command_set_aio_fd_handler;
98
}
99
100
static const TypeInfo qio_channel_command_info = {
101
diff --git a/io/channel-file.c b/io/channel-file.c
102
index XXXXXXX..XXXXXXX 100644
103
--- a/io/channel-file.c
104
+++ b/io/channel-file.c
105
@@ -XXX,XX +XXX,XX @@ static int qio_channel_file_close(QIOChannel *ioc,
106
}
107
108
109
+static void qio_channel_file_set_aio_fd_handler(QIOChannel *ioc,
110
+ AioContext *ctx,
111
+ IOHandler *io_read,
112
+ IOHandler *io_write,
113
+ void *opaque)
114
+{
115
+ QIOChannelFile *fioc = QIO_CHANNEL_FILE(ioc);
116
+ aio_set_fd_handler(ctx, fioc->fd, false, io_read, io_write, NULL, opaque);
117
+}
118
+
119
static GSource *qio_channel_file_create_watch(QIOChannel *ioc,
120
GIOCondition condition)
121
{
122
@@ -XXX,XX +XXX,XX @@ static void qio_channel_file_class_init(ObjectClass *klass,
123
ioc_klass->io_seek = qio_channel_file_seek;
124
ioc_klass->io_close = qio_channel_file_close;
125
ioc_klass->io_create_watch = qio_channel_file_create_watch;
126
+ ioc_klass->io_set_aio_fd_handler = qio_channel_file_set_aio_fd_handler;
127
}
128
129
static const TypeInfo qio_channel_file_info = {
130
diff --git a/io/channel-socket.c b/io/channel-socket.c
131
index XXXXXXX..XXXXXXX 100644
132
--- a/io/channel-socket.c
133
+++ b/io/channel-socket.c
134
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_set_blocking(QIOChannel *ioc,
135
qemu_set_block(sioc->fd);
136
} else {
137
qemu_set_nonblock(sioc->fd);
138
-#ifdef WIN32
139
- WSAEventSelect(sioc->fd, ioc->event,
140
- FD_READ | FD_ACCEPT | FD_CLOSE |
141
- FD_CONNECT | FD_WRITE | FD_OOB);
142
-#endif
143
}
144
return 0;
145
}
146
@@ -XXX,XX +XXX,XX @@ qio_channel_socket_shutdown(QIOChannel *ioc,
147
return 0;
148
}
149
150
+static void qio_channel_socket_set_aio_fd_handler(QIOChannel *ioc,
151
+ AioContext *ctx,
152
+ IOHandler *io_read,
153
+ IOHandler *io_write,
154
+ void *opaque)
155
+{
156
+ QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
157
+ aio_set_fd_handler(ctx, sioc->fd, false, io_read, io_write, NULL, opaque);
158
+}
159
+
160
static GSource *qio_channel_socket_create_watch(QIOChannel *ioc,
161
GIOCondition condition)
162
{
163
@@ -XXX,XX +XXX,XX @@ static void qio_channel_socket_class_init(ObjectClass *klass,
164
ioc_klass->io_set_cork = qio_channel_socket_set_cork;
165
ioc_klass->io_set_delay = qio_channel_socket_set_delay;
166
ioc_klass->io_create_watch = qio_channel_socket_create_watch;
167
+ ioc_klass->io_set_aio_fd_handler = qio_channel_socket_set_aio_fd_handler;
168
}
169
170
static const TypeInfo qio_channel_socket_info = {
171
diff --git a/io/channel-tls.c b/io/channel-tls.c
172
index XXXXXXX..XXXXXXX 100644
173
--- a/io/channel-tls.c
174
+++ b/io/channel-tls.c
175
@@ -XXX,XX +XXX,XX @@ static int qio_channel_tls_close(QIOChannel *ioc,
176
return qio_channel_close(tioc->master, errp);
177
}
178
179
+static void qio_channel_tls_set_aio_fd_handler(QIOChannel *ioc,
180
+ AioContext *ctx,
181
+ IOHandler *io_read,
182
+ IOHandler *io_write,
183
+ void *opaque)
184
+{
185
+ QIOChannelTLS *tioc = QIO_CHANNEL_TLS(ioc);
186
+
187
+ qio_channel_set_aio_fd_handler(tioc->master, ctx, io_read, io_write, opaque);
188
+}
189
+
190
static GSource *qio_channel_tls_create_watch(QIOChannel *ioc,
191
GIOCondition condition)
192
{
193
@@ -XXX,XX +XXX,XX @@ static void qio_channel_tls_class_init(ObjectClass *klass,
194
ioc_klass->io_close = qio_channel_tls_close;
195
ioc_klass->io_shutdown = qio_channel_tls_shutdown;
196
ioc_klass->io_create_watch = qio_channel_tls_create_watch;
197
+ ioc_klass->io_set_aio_fd_handler = qio_channel_tls_set_aio_fd_handler;
198
}
199
200
static const TypeInfo qio_channel_tls_info = {
201
diff --git a/io/channel-watch.c b/io/channel-watch.c
202
index XXXXXXX..XXXXXXX 100644
203
--- a/io/channel-watch.c
204
+++ b/io/channel-watch.c
205
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_socket_watch(QIOChannel *ioc,
206
GSource *source;
207
QIOChannelSocketSource *ssource;
208
209
+#ifdef WIN32
210
+ WSAEventSelect(socket, ioc->event,
211
+ FD_READ | FD_ACCEPT | FD_CLOSE |
212
+ FD_CONNECT | FD_WRITE | FD_OOB);
213
+#endif
214
+
215
source = g_source_new(&qio_channel_socket_source_funcs,
216
sizeof(QIOChannelSocketSource));
217
ssource = (QIOChannelSocketSource *)source;
218
diff --git a/io/channel.c b/io/channel.c
52
diff --git a/io/channel.c b/io/channel.c
219
index XXXXXXX..XXXXXXX 100644
53
index XXXXXXX..XXXXXXX 100644
220
--- a/io/channel.c
54
--- a/io/channel.c
221
+++ b/io/channel.c
55
+++ b/io/channel.c
222
@@ -XXX,XX +XXX,XX @@ GSource *qio_channel_create_watch(QIOChannel *ioc,
56
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
223
}
57
const struct iovec *iov,
224
58
size_t niov,
225
59
Error **errp)
226
+void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
227
+ AioContext *ctx,
228
+ IOHandler *io_read,
229
+ IOHandler *io_write,
230
+ void *opaque)
231
+{
60
+{
232
+ QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
61
+ return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp);
233
+
234
+ klass->io_set_aio_fd_handler(ioc, ctx, io_read, io_write, opaque);
235
+}
62
+}
236
+
63
+
237
guint qio_channel_add_watch(QIOChannel *ioc,
64
+int qio_channel_writev_full_all(QIOChannel *ioc,
238
GIOCondition condition,
65
+ const struct iovec *iov,
239
QIOChannelFunc func,
66
+ size_t niov,
67
+ int *fds, size_t nfds,
68
+ Error **errp)
69
{
70
int ret = -1;
71
struct iovec *local_iov = g_new(struct iovec, niov);
72
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
73
74
while (nlocal_iov > 0) {
75
ssize_t len;
76
- len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp);
77
+ len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds,
78
+ errp);
79
if (len == QIO_CHANNEL_ERR_BLOCK) {
80
if (qemu_in_coroutine()) {
81
qio_channel_yield(ioc, G_IO_OUT);
82
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
83
}
84
85
iov_discard_front(&local_iov, &nlocal_iov, len);
86
+
87
+ fds = NULL;
88
+ nfds = 0;
89
}
90
91
ret = 0;
240
--
92
--
241
2.9.3
93
2.29.2
242
94
243
diff view generated by jsdifflib
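
The helper added above can be exercised as follows; a minimal sketch, assuming an established QIOChannel *ioc on a transport that supports FD passing, plus a hypothetical header struct hdr and file descriptor fd:

    struct iovec iov = { .iov_base = &hdr, .iov_len = sizeof(hdr) };
    int fds[1] = { fd };
    Error *local_err = NULL;

    /* Waits (or yields, in coroutine context) until both the payload
     * and the file descriptor have been transmitted. */
    if (qio_channel_writev_full_all(ioc, &iov, 1, fds, 1, &local_err) < 0) {
        error_report_err(local_err);
    }
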
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
2
3
Support separate coroutines for reading and writing, and place the
3
Adds qio_channel_readv_full_all_eof() and qio_channel_readv_full_all()
4
read/write handlers on the AioContext that the QIOChannel is registered
4
to read both data and FDs. Refactors existing code to use these helpers.
5
with.
5
6
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
10
Reviewed-by: Fam Zheng <famz@redhat.com>
10
Message-id: b059c4cc0fb741e794d644c144cc21372cad877d.1611938319.git.jag.raman@oracle.com
11
Message-id: 20170213135235.12274-7-pbonzini@redhat.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
12
---
14
include/io/channel.h | 47 ++++++++++++++++++++++++++--
13
include/io/channel.h | 53 +++++++++++++++++++++++
15
io/channel.c | 86 +++++++++++++++++++++++++++++++++++++++-------------
14
io/channel.c | 101 ++++++++++++++++++++++++++++++++++---------
16
2 files changed, 109 insertions(+), 24 deletions(-)
15
2 files changed, 134 insertions(+), 20 deletions(-)
17
16
18
diff --git a/include/io/channel.h b/include/io/channel.h
17
diff --git a/include/io/channel.h b/include/io/channel.h
19
index XXXXXXX..XXXXXXX 100644
18
index XXXXXXX..XXXXXXX 100644
20
--- a/include/io/channel.h
19
--- a/include/io/channel.h
21
+++ b/include/io/channel.h
20
+++ b/include/io/channel.h
22
@@ -XXX,XX +XXX,XX @@
21
@@ -XXX,XX +XXX,XX @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
23
22
IOHandler *io_write,
24
#include "qemu-common.h"
23
void *opaque);
25
#include "qom/object.h"
24
26
+#include "qemu/coroutine.h"
25
+/**
27
#include "block/aio.h"
26
+ * qio_channel_readv_full_all_eof:
28
27
+ * @ioc: the channel object
29
#define TYPE_QIO_CHANNEL "qio-channel"
28
+ * @iov: the array of memory regions to read data to
30
@@ -XXX,XX +XXX,XX @@ struct QIOChannel {
29
+ * @niov: the length of the @iov array
31
Object parent;
30
+ * @fds: an array of file handles to read
32
unsigned int features; /* bitmask of QIOChannelFeatures */
31
+ * @nfds: number of file handles in @fds
33
char *name;
32
+ * @errp: pointer to a NULL-initialized error object
34
+ AioContext *ctx;
33
+ *
35
+ Coroutine *read_coroutine;
34
+ *
36
+ Coroutine *write_coroutine;
35
+ * Performs same function as qio_channel_readv_all_eof.
37
#ifdef _WIN32
36
+ * Additionally, attempts to read file descriptors shared
38
HANDLE event; /* For use with GSource on Win32 */
37
+ * over the channel. The function will wait for all
39
#endif
38
+ * requested data to be read, yielding from the current
40
@@ -XXX,XX +XXX,XX @@ guint qio_channel_add_watch(QIOChannel *ioc,
39
+ * coroutine if required. data refers to both file
41
40
+ * descriptors and the iovs.
42
41
+ *
42
+ * Returns: 1 if all bytes were read, 0 if end-of-file
43
+ * occurs without data, or -1 on error
44
+ */
45
+
46
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
47
+ const struct iovec *iov,
48
+ size_t niov,
49
+ int **fds, size_t *nfds,
50
+ Error **errp);
51
+
52
+/**
53
+ * qio_channel_readv_full_all:
54
+ * @ioc: the channel object
55
+ * @iov: the array of memory regions to read data to
56
+ * @niov: the length of the @iov array
57
+ * @fds: an array of file handles to read
58
+ * @nfds: number of file handles in @fds
59
+ * @errp: pointer to a NULL-initialized error object
60
+ *
61
+ *
62
+ * Performs same function as qio_channel_readv_all_eof.
63
+ * Additionally, attempts to read file descriptors shared
64
+ * over the channel. The function will wait for all
65
+ * requested data to be read, yielding from the current
66
+ * coroutine if required. data refers to both file
67
+ * descriptors and the iovs.
68
+ *
69
+ * Returns: 0 if all bytes were read, or -1 on error
70
+ */
71
+
72
+int qio_channel_readv_full_all(QIOChannel *ioc,
73
+ const struct iovec *iov,
74
+ size_t niov,
75
+ int **fds, size_t *nfds,
76
+ Error **errp);
77
+
43
/**
78
/**
44
+ * qio_channel_attach_aio_context:
79
* qio_channel_writev_full_all:
45
+ * @ioc: the channel object
46
+ * @ctx: the #AioContext to set the handlers on
47
+ *
48
+ * Request that qio_channel_yield() sets I/O handlers on
49
+ * the given #AioContext. If @ctx is %NULL, qio_channel_yield()
50
+ * uses QEMU's main thread event loop.
51
+ *
52
+ * You can move a #QIOChannel from one #AioContext to another even if
53
+ * I/O handlers are set for a coroutine. However, #QIOChannel provides
54
+ * no synchronization between the calls to qio_channel_yield() and
55
+ * qio_channel_attach_aio_context().
56
+ *
57
+ * Therefore you should first call qio_channel_detach_aio_context()
58
+ * to ensure that the coroutine is not entered concurrently. Then,
59
+ * while the coroutine has yielded, call qio_channel_attach_aio_context(),
60
+ * and then aio_co_schedule() to place the coroutine on the new
61
+ * #AioContext. The calls to qio_channel_detach_aio_context()
62
+ * and qio_channel_attach_aio_context() should be protected with
63
+ * aio_context_acquire() and aio_context_release().
64
+ */
65
+void qio_channel_attach_aio_context(QIOChannel *ioc,
66
+ AioContext *ctx);
67
+
68
+/**
69
+ * qio_channel_detach_aio_context:
70
+ * @ioc: the channel object
71
+ *
72
+ * Disable any I/O handlers set by qio_channel_yield(). With the
73
+ * help of aio_co_schedule(), this allows moving a coroutine that was
74
+ * paused by qio_channel_yield() to another context.
75
+ */
76
+void qio_channel_detach_aio_context(QIOChannel *ioc);
77
+
78
+/**
79
* qio_channel_yield:
80
* @ioc: the channel object
80
* @ioc: the channel object
81
* @condition: the I/O condition to wait for
82
*
83
- * Yields execution from the current coroutine until
84
- * the condition indicated by @condition becomes
85
- * available.
86
+ * Yields execution from the current coroutine until the condition
87
+ * indicated by @condition becomes available. @condition must
88
+ * be either %G_IO_IN or %G_IO_OUT; it cannot contain both. In
89
+ * addition, no two coroutines can be waiting on the same condition
90
+ * and channel at the same time.
91
*
92
* This must only be called from coroutine context
93
*/
94
diff --git a/io/channel.c b/io/channel.c
81
diff --git a/io/channel.c b/io/channel.c
95
index XXXXXXX..XXXXXXX 100644
82
index XXXXXXX..XXXXXXX 100644
96
--- a/io/channel.c
83
--- a/io/channel.c
97
+++ b/io/channel.c
84
+++ b/io/channel.c
98
@@ -XXX,XX +XXX,XX @@
85
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
99
#include "qemu/osdep.h"
86
const struct iovec *iov,
100
#include "io/channel.h"
87
size_t niov,
101
#include "qapi/error.h"
88
Error **errp)
102
-#include "qemu/coroutine.h"
89
+{
103
+#include "qemu/main-loop.h"
90
+ return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
104
91
+}
105
bool qio_channel_has_feature(QIOChannel *ioc,
92
+
106
QIOChannelFeature feature)
93
+int qio_channel_readv_all(QIOChannel *ioc,
107
@@ -XXX,XX +XXX,XX @@ off_t qio_channel_io_seek(QIOChannel *ioc,
94
+ const struct iovec *iov,
95
+ size_t niov,
96
+ Error **errp)
97
+{
98
+ return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp);
99
+}
100
+
101
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
102
+ const struct iovec *iov,
103
+ size_t niov,
104
+ int **fds, size_t *nfds,
105
+ Error **errp)
106
{
107
int ret = -1;
108
struct iovec *local_iov = g_new(struct iovec, niov);
109
struct iovec *local_iov_head = local_iov;
110
unsigned int nlocal_iov = niov;
111
+ int **local_fds = fds;
112
+ size_t *local_nfds = nfds;
113
bool partial = false;
114
115
+ if (nfds) {
116
+ *nfds = 0;
117
+ }
118
+
119
+ if (fds) {
120
+ *fds = NULL;
121
+ }
122
+
123
nlocal_iov = iov_copy(local_iov, nlocal_iov,
124
iov, niov,
125
0, iov_size(iov, niov));
126
127
- while (nlocal_iov > 0) {
128
+ while ((nlocal_iov > 0) || local_fds) {
129
ssize_t len;
130
- len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp);
131
+ len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
132
+ local_nfds, errp);
133
if (len == QIO_CHANNEL_ERR_BLOCK) {
134
if (qemu_in_coroutine()) {
135
qio_channel_yield(ioc, G_IO_IN);
136
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
137
qio_channel_wait(ioc, G_IO_IN);
138
}
139
continue;
140
- } else if (len < 0) {
141
- goto cleanup;
142
- } else if (len == 0) {
143
- if (partial) {
144
- error_setg(errp,
145
- "Unexpected end-of-file before all bytes were read");
146
- } else {
147
+ }
148
+
149
+ if (len == 0) {
150
+ if (local_nfds && *local_nfds) {
151
+ /*
152
+ * Got some FDs, but no data yet. This isn't an EOF
153
+ * scenario (yet), so carry on to try to read data
154
+ * on next loop iteration
155
+ */
156
+ goto next_iter;
157
+ } else if (!partial) {
158
+ /* No fds and no data - EOF before any data read */
159
ret = 0;
160
+ goto cleanup;
161
+ } else {
162
+ len = -1;
163
+ error_setg(errp,
164
+ "Unexpected end-of-file before all data were read");
165
+ /* Fallthrough into len < 0 handling */
166
+ }
167
+ }
168
+
169
+ if (len < 0) {
170
+ /* Close any FDs we previously received */
171
+ if (nfds && fds) {
172
+ size_t i;
173
+ for (i = 0; i < (*nfds); i++) {
174
+ close((*fds)[i]);
175
+ }
176
+ g_free(*fds);
177
+ *fds = NULL;
178
+ *nfds = 0;
179
}
180
goto cleanup;
181
}
182
183
+ if (nlocal_iov) {
184
+ iov_discard_front(&local_iov, &nlocal_iov, len);
185
+ }
186
+
187
+next_iter:
188
partial = true;
189
- iov_discard_front(&local_iov, &nlocal_iov, len);
190
+ local_fds = NULL;
191
+ local_nfds = NULL;
192
}
193
194
ret = 1;
195
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
196
return ret;
108
}
197
}
109
198
110
199
-int qio_channel_readv_all(QIOChannel *ioc,
111
-typedef struct QIOChannelYieldData QIOChannelYieldData;
200
- const struct iovec *iov,
112
-struct QIOChannelYieldData {
201
- size_t niov,
113
- QIOChannel *ioc;
202
- Error **errp)
114
- Coroutine *co;
203
+int qio_channel_readv_full_all(QIOChannel *ioc,
115
-};
204
+ const struct iovec *iov,
116
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc);
205
+ size_t niov,
117
206
+ int **fds, size_t *nfds,
118
+static void qio_channel_restart_read(void *opaque)
207
+ Error **errp)
119
+{
120
+ QIOChannel *ioc = opaque;
121
+ Coroutine *co = ioc->read_coroutine;
122
+
123
+ ioc->read_coroutine = NULL;
124
+ qio_channel_set_aio_fd_handlers(ioc);
125
+ aio_co_wake(co);
126
+}
127
128
-static gboolean qio_channel_yield_enter(QIOChannel *ioc,
129
- GIOCondition condition,
130
- gpointer opaque)
131
+static void qio_channel_restart_write(void *opaque)
132
{
208
{
133
- QIOChannelYieldData *data = opaque;
209
- int ret = qio_channel_readv_all_eof(ioc, iov, niov, errp);
134
- qemu_coroutine_enter(data->co);
210
+ int ret = qio_channel_readv_full_all_eof(ioc, iov, niov, fds, nfds, errp);
135
- return FALSE;
211
136
+ QIOChannel *ioc = opaque;
212
if (ret == 0) {
137
+ Coroutine *co = ioc->write_coroutine;
213
- ret = -1;
138
+
214
- error_setg(errp,
139
+ ioc->write_coroutine = NULL;
215
- "Unexpected end-of-file before all bytes were read");
140
+ qio_channel_set_aio_fd_handlers(ioc);
216
- } else if (ret == 1) {
141
+ aio_co_wake(co);
217
- ret = 0;
218
+ error_prepend(errp,
219
+ "Unexpected end-of-file before all data were read.");
220
+ return -1;
221
}
222
+ if (ret == 1) {
223
+ return 0;
224
+ }
225
+
226
return ret;
142
}
227
}
143
228
144
+static void qio_channel_set_aio_fd_handlers(QIOChannel *ioc)
145
+{
146
+ IOHandler *rd_handler = NULL, *wr_handler = NULL;
147
+ AioContext *ctx;
148
+
149
+ if (ioc->read_coroutine) {
150
+ rd_handler = qio_channel_restart_read;
151
+ }
152
+ if (ioc->write_coroutine) {
153
+ wr_handler = qio_channel_restart_write;
154
+ }
155
+
156
+ ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
157
+ qio_channel_set_aio_fd_handler(ioc, ctx, rd_handler, wr_handler, ioc);
158
+}
159
+
160
+void qio_channel_attach_aio_context(QIOChannel *ioc,
161
+ AioContext *ctx)
162
+{
163
+ AioContext *old_ctx;
164
+ if (ioc->ctx == ctx) {
165
+ return;
166
+ }
167
+
168
+ old_ctx = ioc->ctx ? ioc->ctx : iohandler_get_aio_context();
169
+ qio_channel_set_aio_fd_handler(ioc, old_ctx, NULL, NULL, NULL);
170
+ ioc->ctx = ctx;
171
+ qio_channel_set_aio_fd_handlers(ioc);
172
+}
173
+
174
+void qio_channel_detach_aio_context(QIOChannel *ioc)
175
+{
176
+ ioc->read_coroutine = NULL;
177
+ ioc->write_coroutine = NULL;
178
+ qio_channel_set_aio_fd_handlers(ioc);
179
+ ioc->ctx = NULL;
180
+}
181
182
void coroutine_fn qio_channel_yield(QIOChannel *ioc,
183
GIOCondition condition)
184
{
185
- QIOChannelYieldData data;
186
-
187
assert(qemu_in_coroutine());
188
- data.ioc = ioc;
189
- data.co = qemu_coroutine_self();
190
- qio_channel_add_watch(ioc,
191
- condition,
192
- qio_channel_yield_enter,
193
- &data,
194
- NULL);
195
+ if (condition == G_IO_IN) {
196
+ assert(!ioc->read_coroutine);
197
+ ioc->read_coroutine = qemu_coroutine_self();
198
+ } else if (condition == G_IO_OUT) {
199
+ assert(!ioc->write_coroutine);
200
+ ioc->write_coroutine = qemu_coroutine_self();
201
+ } else {
202
+ abort();
203
+ }
204
+ qio_channel_set_aio_fd_handlers(ioc);
205
qemu_coroutine_yield();
206
}
207
208
--
229
--
209
2.9.3
230
2.29.2
210
231
211
diff view generated by jsdifflib
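
The doc comment added above prescribes an ordering for moving a yielded coroutine to another AioContext; a minimal sketch, assuming ioc, the target AioContext *ctx, and the Coroutine *co currently parked in qio_channel_yield(), with the calls protected by aio_context_acquire()/release() as documented:

    qio_channel_detach_aio_context(ioc);       /* clear the old I/O handlers  */
    qio_channel_attach_aio_context(ioc, ctx);  /* future yields land on ctx   */
    aio_co_schedule(ctx, co);                  /* resume the coroutine on ctx */
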
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
2
3
This will avoid forward references in the next patch. It is also
3
Defines MPQemuMsg, which is the message that is sent to the remote
4
more logical because CoQueue is no longer the basic primitive.
4
process. This message is sent over QIOChannel and is used to
5
5
command the remote process to perform various tasks.
6
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
6
Define transmission functions used by proxy and by remote.
7
Reviewed-by: Fam Zheng <famz@redhat.com>
7
8
Message-id: 20170213181244.16297-5-pbonzini@redhat.com
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
10
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 56ca8bcf95195b2b195b08f6b9565b6d7410bce5.1611938319.git.jag.raman@oracle.com
13
14
[Replace struct iovec send[2] = {0} with {} to make clang happy as
15
suggested by Peter Maydell <peter.maydell@linaro.org>.
16
--Stefan]
17
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
19
---
11
include/qemu/coroutine.h | 89 ++++++++++++++++++++++++------------------------
20
MAINTAINERS | 2 +
12
1 file changed, 44 insertions(+), 45 deletions(-)
21
meson.build | 1 +
13
22
hw/remote/trace.h | 1 +
14
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
23
include/hw/remote/mpqemu-link.h | 63 ++++++++++
15
index XXXXXXX..XXXXXXX 100644
24
include/sysemu/iothread.h | 6 +
16
--- a/include/qemu/coroutine.h
25
hw/remote/mpqemu-link.c | 205 ++++++++++++++++++++++++++++++++
17
+++ b/include/qemu/coroutine.h
26
iothread.c | 6 +
18
@@ -XXX,XX +XXX,XX @@ bool qemu_in_coroutine(void);
27
hw/remote/meson.build | 1 +
19
*/
28
hw/remote/trace-events | 4 +
20
bool qemu_coroutine_entered(Coroutine *co);
29
9 files changed, 289 insertions(+)
21
30
create mode 100644 hw/remote/trace.h
22
-
31
create mode 100644 include/hw/remote/mpqemu-link.h
23
-/**
32
create mode 100644 hw/remote/mpqemu-link.c
24
- * CoQueues are a mechanism to queue coroutines in order to continue executing
33
create mode 100644 hw/remote/trace-events
25
- * them later. They provide the fundamental primitives on which coroutine locks
34
26
- * are built.
35
diff --git a/MAINTAINERS b/MAINTAINERS
27
- */
36
index XXXXXXX..XXXXXXX 100644
28
-typedef struct CoQueue {
37
--- a/MAINTAINERS
29
- QSIMPLEQ_HEAD(, Coroutine) entries;
38
+++ b/MAINTAINERS
30
-} CoQueue;
39
@@ -XXX,XX +XXX,XX @@ F: hw/pci-host/remote.c
31
-
40
F: include/hw/pci-host/remote.h
32
-/**
41
F: hw/remote/machine.c
33
- * Initialise a CoQueue. This must be called before any other operation is used
42
F: include/hw/remote/machine.h
34
- * on the CoQueue.
43
+F: hw/remote/mpqemu-link.c
35
- */
44
+F: include/hw/remote/mpqemu-link.h
36
-void qemu_co_queue_init(CoQueue *queue);
45
37
-
46
Build and test automation
38
-/**
47
-------------------------
39
- * Adds the current coroutine to the CoQueue and transfers control to the
48
diff --git a/meson.build b/meson.build
40
- * caller of the coroutine.
49
index XXXXXXX..XXXXXXX 100644
41
- */
50
--- a/meson.build
42
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
51
+++ b/meson.build
43
-
52
@@ -XXX,XX +XXX,XX @@ if have_system
44
-/**
53
'net',
45
- * Restarts the next coroutine in the CoQueue and removes it from the queue.
54
'softmmu',
46
- *
55
'ui',
47
- * Returns true if a coroutine was restarted, false if the queue is empty.
56
+ 'hw/remote',
48
- */
57
]
49
-bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
58
endif
50
-
59
if have_system or have_user
51
-/**
60
diff --git a/hw/remote/trace.h b/hw/remote/trace.h
52
- * Restarts all coroutines in the CoQueue and leaves the queue empty.
61
new file mode 100644
53
- */
62
index XXXXXXX..XXXXXXX
54
-void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
63
--- /dev/null
55
-
64
+++ b/hw/remote/trace.h
56
-/**
65
@@ -0,0 +1 @@
57
- * Enter the next coroutine in the queue
66
+#include "trace/trace-hw_remote.h"
58
- */
67
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
59
-bool qemu_co_enter_next(CoQueue *queue);
68
new file mode 100644
60
-
69
index XXXXXXX..XXXXXXX
61
-/**
70
--- /dev/null
62
- * Checks if the CoQueue is empty.
71
+++ b/include/hw/remote/mpqemu-link.h
63
- */
72
@@ -XXX,XX +XXX,XX @@
64
-bool qemu_co_queue_empty(CoQueue *queue);
73
+/*
65
-
74
+ * Communication channel between QEMU and remote device process
66
-
75
+ *
67
/**
76
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
68
* Provides a mutex that can be used to synchronise coroutines
77
+ *
69
*/
78
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
70
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
79
+ * See the COPYING file in the top-level directory.
71
*/
80
+ *
72
void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
81
+ */
73
82
+
83
+#ifndef MPQEMU_LINK_H
84
+#define MPQEMU_LINK_H
85
+
86
+#include "qom/object.h"
87
+#include "qemu/thread.h"
88
+#include "io/channel.h"
89
+
90
+#define REMOTE_MAX_FDS 8
91
+
92
+#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data.u64)
74
+
93
+
75
+/**
94
+/**
76
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
95
+ * MPQemuCmd:
77
+ * them later.
96
+ *
78
+ */
97
+ * MPQemuCmd enum type to specify the command to be executed on the remote
79
+typedef struct CoQueue {
98
+ * device.
80
+ QSIMPLEQ_HEAD(, Coroutine) entries;
99
+ *
81
+} CoQueue;
100
+ * This uses a private protocol between QEMU and the remote process. vfio-user
101
+ * protocol would supersede this in the future.
102
+ *
103
+ */
104
+typedef enum {
105
+ MPQEMU_CMD_MAX,
106
+} MPQemuCmd;
82
+
107
+
83
+/**
108
+/**
84
+ * Initialise a CoQueue. This must be called before any other operation is used
109
+ * MPQemuMsg:
85
+ * on the CoQueue.
110
+ * @cmd: The remote command
86
+ */
111
+ * @size: Size of the data to be shared
87
+void qemu_co_queue_init(CoQueue *queue);
112
+ * @data: Structured data
88
+
113
+ * @fds: File descriptors to be shared with remote device
89
+/**
114
+ *
90
+ * Adds the current coroutine to the CoQueue and transfers control to the
115
+ * MPQemuMsg Format of the message sent to the remote device from QEMU.
91
+ * caller of the coroutine.
116
+ *
92
+ */
117
+ */
93
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
118
+typedef struct {
94
+
119
+ int cmd;
95
+/**
120
+ size_t size;
96
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
121
+
97
+ *
122
+ union {
98
+ * Returns true if a coroutine was restarted, false if the queue is empty.
123
+ uint64_t u64;
99
+ */
124
+ } data;
100
+bool coroutine_fn qemu_co_queue_next(CoQueue *queue);
125
+
101
+
126
+ int fds[REMOTE_MAX_FDS];
102
+/**
127
+ int num_fds;
103
+ * Restarts all coroutines in the CoQueue and leaves the queue empty.
128
+} MPQemuMsg;
104
+ */
129
+
105
+void coroutine_fn qemu_co_queue_restart_all(CoQueue *queue);
130
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
106
+
131
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
107
+/**
132
+
108
+ * Enter the next coroutine in the queue
133
+bool mpqemu_msg_valid(MPQemuMsg *msg);
109
+ */
134
+
110
+bool qemu_co_enter_next(CoQueue *queue);
135
+#endif
111
+
136
diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
112
+/**
137
index XXXXXXX..XXXXXXX 100644
113
+ * Checks if the CoQueue is empty.
138
--- a/include/sysemu/iothread.h
114
+ */
139
+++ b/include/sysemu/iothread.h
115
+bool qemu_co_queue_empty(CoQueue *queue);
140
@@ -XXX,XX +XXX,XX @@ IOThread *iothread_create(const char *id, Error **errp);
116
+
141
void iothread_stop(IOThread *iothread);
117
+
142
void iothread_destroy(IOThread *iothread);
118
typedef struct CoRwlock {
143
119
bool writer;
144
+/*
120
int reader;
145
+ * Returns true if executing within IOThread context,
146
+ * false otherwise.
147
+ */
148
+bool qemu_in_iothread(void);
149
+
150
#endif /* IOTHREAD_H */
151
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
152
new file mode 100644
153
index XXXXXXX..XXXXXXX
154
--- /dev/null
155
+++ b/hw/remote/mpqemu-link.c
156
@@ -XXX,XX +XXX,XX @@
157
+/*
158
+ * Communication channel between QEMU and remote device process
159
+ *
160
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
161
+ *
162
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
163
+ * See the COPYING file in the top-level directory.
164
+ *
165
+ */
166
+
167
+#include "qemu/osdep.h"
168
+#include "qemu-common.h"
169
+
170
+#include "qemu/module.h"
171
+#include "hw/remote/mpqemu-link.h"
172
+#include "qapi/error.h"
173
+#include "qemu/iov.h"
174
+#include "qemu/error-report.h"
175
+#include "qemu/main-loop.h"
176
+#include "io/channel.h"
177
+#include "sysemu/iothread.h"
178
+#include "trace.h"
179
+
180
+/*
181
+ * Send message over the ioc QIOChannel.
182
+ * This function is safe to call from:
183
+ * - main loop in co-routine context. Will block the main loop if not in
184
+ * co-routine context;
185
+ * - vCPU thread with no co-routine context and if the channel is not part
186
+ * of the main loop handling;
187
+ * - IOThread within co-routine context, outside of co-routine context
188
+ * will block IOThread;
189
+ * Returns true if no errors were encountered, false otherwise.
190
+ */
191
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
192
+{
193
+ ERRP_GUARD();
194
+ bool iolock = qemu_mutex_iothread_locked();
195
+ bool iothread = qemu_in_iothread();
196
+ struct iovec send[2] = {};
197
+ int *fds = NULL;
198
+ size_t nfds = 0;
199
+ bool ret = false;
200
+
201
+ send[0].iov_base = msg;
202
+ send[0].iov_len = MPQEMU_MSG_HDR_SIZE;
203
+
204
+ send[1].iov_base = (void *)&msg->data;
205
+ send[1].iov_len = msg->size;
206
+
207
+ if (msg->num_fds) {
208
+ nfds = msg->num_fds;
209
+ fds = msg->fds;
210
+ }
211
+
212
+ /*
213
+ * Don't use in IOThread out of co-routine context as
214
+ * it will block IOThread.
215
+ */
216
+ assert(qemu_in_coroutine() || !iothread);
217
+
218
+ /*
219
+ * Skip unlocking/locking iothread lock when the IOThread is running
220
+ * in co-routine context. Co-routine context is asserted above
221
+ * for IOThread case.
222
+ * Also skip lock handling while in a co-routine in the main context.
223
+ */
224
+ if (iolock && !iothread && !qemu_in_coroutine()) {
225
+ qemu_mutex_unlock_iothread();
226
+ }
227
+
228
+ if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send),
229
+ fds, nfds, errp)) {
230
+ ret = true;
231
+ } else {
232
+ trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds);
233
+ }
234
+
235
+ if (iolock && !iothread && !qemu_in_coroutine()) {
236
+ /* See above comment why skip locking here. */
237
+ qemu_mutex_lock_iothread();
238
+ }
239
+
240
+ return ret;
241
+}
242
+
243
+/*
244
+ * Read message from the ioc QIOChannel.
245
+ * This function is safe to call from:
246
+ * - From main loop in co-routine context. Will block the main loop if not in
247
+ * co-routine context;
248
+ * - From vCPU thread with no co-routine context and if the channel is not part
249
+ * of the main loop handling;
250
+ * - From IOThread within co-routine context, outside of co-routine context
251
+ * will block IOThread;
252
+ */
253
+static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
254
+ size_t *nfds, Error **errp)
255
+{
256
+ ERRP_GUARD();
257
+ struct iovec iov = { .iov_base = buf, .iov_len = len };
258
+ bool iolock = qemu_mutex_iothread_locked();
259
+ bool iothread = qemu_in_iothread();
260
+ int ret = -1;
261
+
262
+ /*
263
+ * Don't use in IOThread out of co-routine context as
264
+ * it will block IOThread.
265
+ */
266
+ assert(qemu_in_coroutine() || !iothread);
267
+
268
+ if (iolock && !iothread && !qemu_in_coroutine()) {
269
+ qemu_mutex_unlock_iothread();
270
+ }
271
+
272
+ ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp);
273
+
274
+ if (iolock && !iothread && !qemu_in_coroutine()) {
275
+ qemu_mutex_lock_iothread();
276
+ }
277
+
278
+ return (ret <= 0) ? ret : iov.iov_len;
279
+}
280
+
281
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
282
+{
283
+ ERRP_GUARD();
284
+ g_autofree int *fds = NULL;
285
+ size_t nfds = 0;
286
+ ssize_t len;
287
+ bool ret = false;
288
+
289
+ len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp);
290
+ if (len <= 0) {
291
+ goto fail;
292
+ } else if (len != MPQEMU_MSG_HDR_SIZE) {
293
+ error_setg(errp, "Message header corrupted");
294
+ goto fail;
295
+ }
296
+
297
+ if (msg->size > sizeof(msg->data)) {
298
+ error_setg(errp, "Invalid size for message");
299
+ goto fail;
300
+ }
301
+
302
+ if (!msg->size) {
303
+ goto copy_fds;
304
+ }
305
+
306
+ len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp);
307
+ if (len <= 0) {
308
+ goto fail;
309
+ }
310
+ if (len != msg->size) {
311
+ error_setg(errp, "Unable to read full message");
312
+ goto fail;
313
+ }
314
+
315
+copy_fds:
316
+ msg->num_fds = nfds;
317
+ if (nfds > G_N_ELEMENTS(msg->fds)) {
318
+ error_setg(errp,
319
+ "Overflow error: received %zu fds, more than max of %d fds",
320
+ nfds, REMOTE_MAX_FDS);
321
+ goto fail;
322
+ }
323
+ if (nfds) {
324
+ memcpy(msg->fds, fds, nfds * sizeof(int));
325
+ }
326
+
327
+ ret = true;
328
+
329
+fail:
330
+ if (*errp) {
331
+ trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds);
332
+ }
333
+ while (*errp && nfds) {
334
+ close(fds[nfds - 1]);
335
+ nfds--;
336
+ }
337
+
338
+ return ret;
339
+}
340
+
341
+bool mpqemu_msg_valid(MPQemuMsg *msg)
342
+{
343
+ if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) {
344
+ return false;
345
+ }
346
+
347
+ /* Verify FDs. */
348
+ if (msg->num_fds >= REMOTE_MAX_FDS) {
349
+ return false;
350
+ }
351
+
352
+ if (msg->num_fds > 0) {
353
+ for (int i = 0; i < msg->num_fds; i++) {
354
+ if (fcntl(msg->fds[i], F_GETFL) == -1) {
355
+ return false;
356
+ }
357
+ }
358
+ }
359
+
360
+ return true;
361
+}
362
diff --git a/iothread.c b/iothread.c
363
index XXXXXXX..XXXXXXX 100644
364
--- a/iothread.c
365
+++ b/iothread.c
366
@@ -XXX,XX +XXX,XX @@ IOThread *iothread_by_id(const char *id)
367
{
368
return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL));
369
}
370
+
371
+bool qemu_in_iothread(void)
372
+{
373
+ return qemu_get_current_aio_context() != qemu_get_aio_context();
374
375
+}
376
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
377
index XXXXXXX..XXXXXXX 100644
378
--- a/hw/remote/meson.build
379
+++ b/hw/remote/meson.build
380
@@ -XXX,XX +XXX,XX @@
381
remote_ss = ss.source_set()
382
383
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
384
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
385
386
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
387
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
388
new file mode 100644
389
index XXXXXXX..XXXXXXX
390
--- /dev/null
391
+++ b/hw/remote/trace-events
392
@@ -XXX,XX +XXX,XX @@
393
+# multi-process trace events
394
+
395
+mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
396
+mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors from remote process"
121
--
397
--
122
2.9.3
398
2.29.2
123
399
124
diff view generated by jsdifflib
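
A minimal sketch of issuing a command with the transmission functions defined above; MPQEMU_CMD_PING is a hypothetical member of MPQemuCmd (the enum above only defines MPQEMU_CMD_MAX so far), and ioc is the channel to the remote process:

    MPQemuMsg msg = {
        .cmd = MPQEMU_CMD_PING,   /* hypothetical command */
        .size = 0,                /* no payload in msg.data */
        .num_fds = 0,             /* no file descriptors to share */
    };
    Error *local_err = NULL;

    if (!mpqemu_msg_send(&msg, ioc, &local_err)) {
        error_report_err(local_err);
    }
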
1
From: Paolo Bonzini <pbonzini@redhat.com>
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
2
3
This uses the lock-free mutex described in the paper '"Blocking without
3
Initializes the message handler function in the remote process. It is
4
Locking", or LFTHREADS: A lock-free thread library' by Gidenstam and
4
called whenever there's an event pending on the QIOChannel that registers
5
Papatriantafilou. The same technique is used in OSv, and in fact
5
this function.
6
the code is essentially a conversion to C of OSv's code.
7
6
8
[Added missing coroutine_fn in tests/test-aio-multithread.c.
7
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
9
--Stefan]
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
10
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
11
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Reviewed-by: Fam Zheng <famz@redhat.com>
11
Message-id: 99d38d8b93753a6409ac2340e858858cda59ab1b.1611938319.git.jag.raman@oracle.com
13
Message-id: 20170213181244.16297-2-pbonzini@redhat.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
13
---
16
include/qemu/coroutine.h | 17 ++++-
14
MAINTAINERS | 1 +
17
tests/test-aio-multithread.c | 86 ++++++++++++++++++++++++
15
include/hw/remote/machine.h | 9 ++++++
18
util/qemu-coroutine-lock.c | 155 ++++++++++++++++++++++++++++++++++++++++---
16
hw/remote/message.c | 57 +++++++++++++++++++++++++++++++++++++
19
util/trace-events | 1 +
17
hw/remote/meson.build | 1 +
20
4 files changed, 246 insertions(+), 13 deletions(-)
18
4 files changed, 68 insertions(+)
19
create mode 100644 hw/remote/message.c
21
20
22
diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
21
diff --git a/MAINTAINERS b/MAINTAINERS
23
index XXXXXXX..XXXXXXX 100644
22
index XXXXXXX..XXXXXXX 100644
24
--- a/include/qemu/coroutine.h
23
--- a/MAINTAINERS
25
+++ b/include/qemu/coroutine.h
24
+++ b/MAINTAINERS
26
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue);
25
@@ -XXX,XX +XXX,XX @@ F: hw/remote/machine.c
27
/**
26
F: include/hw/remote/machine.h
28
* Provides a mutex that can be used to synchronise coroutines
27
F: hw/remote/mpqemu-link.c
29
*/
28
F: include/hw/remote/mpqemu-link.h
30
+struct CoWaitRecord;
29
+F: hw/remote/message.c
31
typedef struct CoMutex {
30
32
- bool locked;
31
Build and test automation
33
+ /* Count of pending lockers; 0 for a free mutex, 1 for an
32
-------------------------
34
+ * uncontended mutex.
33
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
35
+ */
34
index XXXXXXX..XXXXXXX 100644
36
+ unsigned locked;
35
--- a/include/hw/remote/machine.h
36
+++ b/include/hw/remote/machine.h
37
@@ -XXX,XX +XXX,XX @@
38
#include "qom/object.h"
39
#include "hw/boards.h"
40
#include "hw/pci-host/remote.h"
41
+#include "io/channel.h"
42
43
struct RemoteMachineState {
44
MachineState parent_obj;
45
@@ -XXX,XX +XXX,XX @@ struct RemoteMachineState {
46
RemotePCIHost *host;
47
};
48
49
+/* Used to pass to co-routine device and ioc. */
50
+typedef struct RemoteCommDev {
51
+ PCIDevice *dev;
52
+ QIOChannel *ioc;
53
+} RemoteCommDev;
37
+
54
+
38
+ /* A queue of waiters. Elements are added atomically in front of
55
#define TYPE_REMOTE_MACHINE "x-remote-machine"
39
+ * from_push. to_pop is only populated, and popped from, by whoever
56
OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
40
+ * is in charge of the next wakeup. This can be an unlocker or,
57
41
+ * through the handoff protocol, a locker that is about to go to sleep.
58
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data);
42
+ */
43
+ QSLIST_HEAD(, CoWaitRecord) from_push, to_pop;
44
+
59
+
45
+ unsigned handoff, sequence;
60
#endif
61
diff --git a/hw/remote/message.c b/hw/remote/message.c
62
new file mode 100644
63
index XXXXXXX..XXXXXXX
64
--- /dev/null
65
+++ b/hw/remote/message.c
66
@@ -XXX,XX +XXX,XX @@
67
+/*
68
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
69
+ *
70
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
71
+ *
72
+ * See the COPYING file in the top-level directory.
73
+ *
74
+ */
46
+
75
+
47
Coroutine *holder;
76
+#include "qemu/osdep.h"
48
- CoQueue queue;
77
+#include "qemu-common.h"
49
} CoMutex;
50
51
/**
52
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/tests/test-aio-multithread.c
55
+++ b/tests/test-aio-multithread.c
56
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_schedule_10(void)
57
test_multi_co_schedule(10);
58
}
59
60
+/* CoMutex thread-safety. */
61
+
78
+
62
+static uint32_t atomic_counter;
79
+#include "hw/remote/machine.h"
63
+static uint32_t running;
80
+#include "io/channel.h"
64
+static uint32_t counter;
81
+#include "hw/remote/mpqemu-link.h"
65
+static CoMutex comutex;
82
+#include "qapi/error.h"
83
+#include "sysemu/runstate.h"
66
+
84
+
67
+static void coroutine_fn test_multi_co_mutex_entry(void *opaque)
85
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
68
+{
86
+{
69
+ while (!atomic_mb_read(&now_stopping)) {
87
+ g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
70
+ qemu_co_mutex_lock(&comutex);
88
+ PCIDevice *pci_dev = NULL;
71
+ counter++;
89
+ Error *local_err = NULL;
72
+ qemu_co_mutex_unlock(&comutex);
73
+
90
+
74
+ /* Increase atomic_counter *after* releasing the mutex. Otherwise
91
+ assert(com->ioc);
75
+ * there is a chance (it happens about 1 in 3 runs) that the iothread
76
+ * exits before the coroutine is woken up, causing a spurious
77
+ * assertion failure.
78
+ */
79
+ atomic_inc(&atomic_counter);
80
+ }
81
+ atomic_dec(&running);
82
+}
83
+
92
+
84
+static void test_multi_co_mutex(int threads, int seconds)
93
+ pci_dev = com->dev;
85
+{
94
+ for (; !local_err;) {
86
+ int i;
95
+ MPQemuMsg msg = {0};
87
+
96
+
88
+ qemu_co_mutex_init(&comutex);
97
+ if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
89
+ counter = 0;
90
+ atomic_counter = 0;
91
+ now_stopping = false;
92
+
93
+ create_aio_contexts();
94
+ assert(threads <= NUM_CONTEXTS);
95
+ running = threads;
96
+ for (i = 0; i < threads; i++) {
97
+ Coroutine *co1 = qemu_coroutine_create(test_multi_co_mutex_entry, NULL);
98
+ aio_co_schedule(ctx[i], co1);
99
+ }
100
+
101
+ g_usleep(seconds * 1000000);
102
+
103
+ atomic_mb_set(&now_stopping, true);
104
+ while (running > 0) {
105
+ g_usleep(100000);
106
+ }
107
+
108
+ join_aio_contexts();
109
+ g_test_message("%d iterations/second\n", counter / seconds);
110
+ g_assert_cmpint(counter, ==, atomic_counter);
111
+}
112
+
113
+/* Testing with NUM_CONTEXTS threads focuses on the queue. The mutex however
114
+ * is too contended (and the threads spend too much time in aio_poll)
115
+ * to actually stress the handoff protocol.
116
+ */
117
+static void test_multi_co_mutex_1(void)
118
+{
119
+ test_multi_co_mutex(NUM_CONTEXTS, 1);
120
+}
121
+
122
+static void test_multi_co_mutex_10(void)
123
+{
124
+ test_multi_co_mutex(NUM_CONTEXTS, 10);
125
+}
126
+
127
+/* Testing with fewer threads stresses the handoff protocol too. Still, the
128
+ * case where the locker _can_ pick up a handoff is very rare, happening
129
+ * about 10 times in 1 million, so increase the runtime a bit compared to
130
+ * other "quick" testcases that only run for 1 second.
131
+ */
132
+static void test_multi_co_mutex_2_3(void)
133
+{
134
+ test_multi_co_mutex(2, 3);
135
+}
136
+
137
+static void test_multi_co_mutex_2_30(void)
138
+{
139
+ test_multi_co_mutex(2, 30);
140
+}
141
+
142
/* End of tests. */
143
144
int main(int argc, char **argv)
145
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
146
g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
147
if (g_test_quick()) {
148
g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
149
+ g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
150
+ g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
151
} else {
152
g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
153
+ g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
154
+ g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
155
}
156
return g_test_run();
157
}
158
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
159
index XXXXXXX..XXXXXXX 100644
160
--- a/util/qemu-coroutine-lock.c
161
+++ b/util/qemu-coroutine-lock.c
162
@@ -XXX,XX +XXX,XX @@
163
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
164
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
165
* THE SOFTWARE.
166
+ *
167
+ * The lock-free mutex implementation is based on OSv
168
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
169
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
170
*/
171
172
#include "qemu/osdep.h"
173
@@ -XXX,XX +XXX,XX @@ bool qemu_co_queue_empty(CoQueue *queue)
174
return QSIMPLEQ_FIRST(&queue->entries) == NULL;
175
}
176
177
+/* The wait records are handled with a multiple-producer, single-consumer
178
+ * lock-free queue. There cannot be two concurrent pop_waiter() calls
179
+ * because pop_waiter() can only be called while mutex->handoff is zero.
180
+ * This can happen in three cases:
181
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
182
+ * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
183
+ * not take part in the handoff.
184
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
185
+ * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail
186
+ * the cmpxchg (it will see either 0 or the next sequence value) and
187
+ * exit. The next hand-off cannot begin until qemu_co_mutex_lock has
188
+ * woken up someone.
189
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
190
+ * In this case another iteration starts with mutex->handoff == 0;
191
+ * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
192
+ * qemu_co_mutex_unlock will go back to case (1).
193
+ *
194
+ * The following functions manage this queue.
195
+ */
196
+typedef struct CoWaitRecord {
197
+ Coroutine *co;
198
+ QSLIST_ENTRY(CoWaitRecord) next;
199
+} CoWaitRecord;
200
+
201
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
202
+{
203
+ w->co = qemu_coroutine_self();
204
+ QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
205
+}
206
+
207
+static void move_waiters(CoMutex *mutex)
208
+{
209
+ QSLIST_HEAD(, CoWaitRecord) reversed;
210
+ QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
211
+ while (!QSLIST_EMPTY(&reversed)) {
212
+ CoWaitRecord *w = QSLIST_FIRST(&reversed);
213
+ QSLIST_REMOVE_HEAD(&reversed, next);
214
+ QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
215
+ }
216
+}
217
+
218
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
219
+{
220
+ CoWaitRecord *w;
221
+
222
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
223
+ move_waiters(mutex);
224
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
225
+ return NULL;
226
+ }
227
+ }
228
+ w = QSLIST_FIRST(&mutex->to_pop);
229
+ QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
230
+ return w;
231
+}
232
+
233
+static bool has_waiters(CoMutex *mutex)
234
+{
235
+ return QSLIST_EMPTY(&mutex->to_pop) || QSLIST_EMPTY(&mutex->from_push);
236
+}
237
+
238
void qemu_co_mutex_init(CoMutex *mutex)
239
{
240
memset(mutex, 0, sizeof(*mutex));
241
- qemu_co_queue_init(&mutex->queue);
242
}
243
244
-void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
245
+static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
246
{
247
Coroutine *self = qemu_coroutine_self();
248
+ CoWaitRecord w;
249
+ unsigned old_handoff;
250
251
trace_qemu_co_mutex_lock_entry(mutex, self);
252
+ w.co = self;
253
+ push_waiter(mutex, &w);
254
255
- while (mutex->locked) {
256
- qemu_co_queue_wait(&mutex->queue);
257
+ /* This is the "Responsibility Hand-Off" protocol; a lock() picks from
258
+ * a concurrent unlock() the responsibility of waking somebody up.
259
+ */
260
+ old_handoff = atomic_mb_read(&mutex->handoff);
261
+ if (old_handoff &&
262
+ has_waiters(mutex) &&
263
+ atomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
264
+ /* There can be no concurrent pops, because there can be only
265
+ * one active handoff at a time.
266
+ */
267
+ CoWaitRecord *to_wake = pop_waiter(mutex);
268
+ Coroutine *co = to_wake->co;
269
+ if (co == self) {
270
+ /* We got the lock ourselves! */
271
+ assert(to_wake == &w);
272
+ return;
273
+ }
274
+
275
+ aio_co_wake(co);
276
}
277
278
- mutex->locked = true;
279
- mutex->holder = self;
280
- self->locks_held++;
281
-
282
+ qemu_coroutine_yield();
283
trace_qemu_co_mutex_lock_return(mutex, self);
284
}
285
286
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
287
+{
288
+ Coroutine *self = qemu_coroutine_self();
289
+
290
+ if (atomic_fetch_inc(&mutex->locked) == 0) {
291
+ /* Uncontended. */
292
+ trace_qemu_co_mutex_lock_uncontended(mutex, self);
293
+ } else {
294
+ qemu_co_mutex_lock_slowpath(mutex);
295
+ }
296
+ mutex->holder = self;
297
+ self->locks_held++;
298
+}
299
+
300
void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
301
{
302
Coroutine *self = qemu_coroutine_self();
303
304
trace_qemu_co_mutex_unlock_entry(mutex, self);
305
306
- assert(mutex->locked == true);
307
+ assert(mutex->locked);
308
assert(mutex->holder == self);
309
assert(qemu_in_coroutine());
310
311
- mutex->locked = false;
312
mutex->holder = NULL;
313
self->locks_held--;
314
- qemu_co_queue_next(&mutex->queue);
315
+ if (atomic_fetch_dec(&mutex->locked) == 1) {
316
+ /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */
317
+ return;
318
+ }
319
+
320
+ for (;;) {
321
+ CoWaitRecord *to_wake = pop_waiter(mutex);
322
+ unsigned our_handoff;
323
+
324
+ if (to_wake) {
325
+ Coroutine *co = to_wake->co;
326
+ aio_co_wake(co);
327
+ break;
98
+ break;
328
+ }
99
+ }
329
+
100
+
330
+ /* Some concurrent lock() is in progress (we know this because
101
+ if (!mpqemu_msg_valid(&msg)) {
331
+ * mutex->locked was >1) but it hasn't yet put itself on the wait
102
+ error_setg(&local_err, "Received invalid message from proxy"
332
+ * queue. Pick a sequence number for the handoff protocol (not 0).
103
+ "in remote process pid="FMT_pid"",
333
+ */
104
+ getpid());
334
+ if (++mutex->sequence == 0) {
335
+ mutex->sequence = 1;
336
+ }
337
+
338
+ our_handoff = mutex->sequence;
339
+ atomic_mb_set(&mutex->handoff, our_handoff);
340
+ if (!has_waiters(mutex)) {
341
+ /* The concurrent lock has not added itself yet, so it
342
+ * will be able to pick our handoff.
343
+ */
344
+ break;
105
+ break;
345
+ }
106
+ }
346
+
107
+
347
+ /* Try to do the handoff protocol ourselves; if somebody else has
108
+ switch (msg.cmd) {
348
+ * already taken it, however, we're done and they're responsible.
109
+ default:
349
+ */
110
+ error_setg(&local_err,
350
+ if (atomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
111
+ "Unknown command (%d) received for device %s"
351
+ break;
112
+ " (pid="FMT_pid")",
113
+ msg.cmd, DEVICE(pci_dev)->id, getpid());
352
+ }
114
+ }
353
+ }
115
+ }
354
116
+
355
trace_qemu_co_mutex_unlock_return(mutex, self);
117
+ if (local_err) {
356
}
118
+ error_report_err(local_err);
357
diff --git a/util/trace-events b/util/trace-events
119
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
120
+ } else {
121
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
122
+ }
123
+}
124
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
358
index XXXXXXX..XXXXXXX 100644
125
index XXXXXXX..XXXXXXX 100644
359
--- a/util/trace-events
126
--- a/hw/remote/meson.build
360
+++ b/util/trace-events
127
+++ b/hw/remote/meson.build
361
@@ -XXX,XX +XXX,XX @@ qemu_coroutine_terminate(void *co) "self %p"
128
@@ -XXX,XX +XXX,XX @@ remote_ss = ss.source_set()
362
129
363
# util/qemu-coroutine-lock.c
130
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
364
qemu_co_queue_run_restart(void *co) "co %p"
131
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
365
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
132
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
366
qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
133
367
qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
134
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
368
qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
369
--
135
--
370
2.9.3
136
2.29.2
371
137
372
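To make the locking discipline in the CoMutex patch above concrete, here is a
minimal caller-side sketch (illustrative only: co_worker and shared_counter
are invented names, not part of the patch):

    static CoMutex comutex;   /* zeroed by qemu_co_mutex_init() */
    static int shared_counter;

    static void coroutine_fn co_worker(void *opaque)
    {
        qemu_co_mutex_lock(&comutex);    /* fast path is one atomic_fetch_inc */
        shared_counter++;                /* critical section */
        qemu_co_mutex_unlock(&comutex);  /* wakes a waiter via aio_co_wake */
    }

The hand-off protocol exists so that unlock never busy-waits on a concurrent
lock() that has already incremented mutex->locked but has not yet pushed its
CoWaitRecord: whichever side loses the atomic_cmpxchg race on mutex->handoff
leaves the wakeup to the other side.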
From: Paolo Bonzini <pbonzini@redhat.com>

Once the thread pool starts using aio_co_wake, it will also need
qemu_get_current_aio_context().  Make test-thread-pool create
an AioContext with qemu_init_main_loop, so that stubs/iothread.c
and tests/iothread.c can provide the rest.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-5-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-thread-pool.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-thread-pool.c
+++ b/tests/test-thread-pool.c
@@ -XXX,XX +XXX,XX @@
 #include "qapi/error.h"
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
+#include "qemu/main-loop.h"
 
 static AioContext *ctx;
 static ThreadPool *pool;
@@ -XXX,XX +XXX,XX @@ static void test_cancel_async(void)
 int main(int argc, char **argv)
 {
     int ret;
-    Error *local_error = NULL;
 
-    init_clocks();
-
-    ctx = aio_context_new(&local_error);
-    if (!ctx) {
-        error_reportf_err(local_error, "Failed to create AIO Context: ");
-        exit(1);
-    }
+    qemu_init_main_loop(&error_abort);
+    ctx = qemu_get_current_aio_context();
     pool = aio_get_thread_pool(ctx);
 
     g_test_init(&argc, &argv, NULL);
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
 
     ret = g_test_run();
 
-    aio_context_unref(ctx);
     return ret;
 }
--
2.9.3

From: Jagannathan Raman <jag.raman@oracle.com>

Associate the file descriptor for a PCIDevice in the remote process with
its DeviceState object.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: f405a2ed5d7518b87bea7c59cfdf334d67e5ee51.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS            |   1 +
 hw/remote/remote-obj.c | 203 +++++++++++++++++++++++++++++++++++++++++
 hw/remote/meson.build  |   1 +
 3 files changed, 205 insertions(+)
 create mode 100644 hw/remote/remote-obj.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: include/hw/remote/machine.h
 F: hw/remote/mpqemu-link.c
 F: include/hw/remote/mpqemu-link.h
 F: hw/remote/message.c
+F: hw/remote/remote-obj.c
 
 Build and test automation
 -------------------------
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/remote/remote-obj.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
+ *
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "qemu/error-report.h"
+#include "qemu/notify.h"
+#include "qom/object_interfaces.h"
+#include "hw/qdev-core.h"
+#include "io/channel.h"
+#include "hw/qdev-core.h"
+#include "hw/remote/machine.h"
+#include "io/channel-util.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+#include "hw/pci/pci.h"
+#include "qemu/sockets.h"
+#include "monitor/monitor.h"
+
+#define TYPE_REMOTE_OBJECT "x-remote-object"
+OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
+
+struct RemoteObjectClass {
+    ObjectClass parent_class;
+
+    unsigned int nr_devs;
+    unsigned int max_devs;
+};
+
+struct RemoteObject {
+    /* private */
+    Object parent;
+
+    Notifier machine_done;
+
+    int32_t fd;
+    char *devid;
+
+    QIOChannel *ioc;
+
+    DeviceState *dev;
+    DeviceListener listener;
+};
+
+static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *o = REMOTE_OBJECT(obj);
+    int fd = -1;
+
+    fd = monitor_fd_param(monitor_cur(), str, errp);
+    if (fd == -1) {
+        error_prepend(errp, "Could not parse remote object fd %s:", str);
+        return;
+    }
+
+    if (!fd_is_socket(fd)) {
+        error_setg(errp, "File descriptor '%s' is not a socket", str);
+        close(fd);
+        return;
+    }
+
+    o->fd = fd;
+}
+
+static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
+{
+    RemoteObject *o = REMOTE_OBJECT(obj);
+
+    g_free(o->devid);
+
+    o->devid = g_strdup(str);
+}
+
+static void remote_object_unrealize_listener(DeviceListener *listener,
+                                             DeviceState *dev)
+{
+    RemoteObject *o = container_of(listener, RemoteObject, listener);
+
+    if (o->dev == dev) {
+        object_unref(OBJECT(o));
+    }
+}
+
+static void remote_object_machine_done(Notifier *notifier, void *data)
+{
+    RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
+    DeviceState *dev = NULL;
+    QIOChannel *ioc = NULL;
+    Coroutine *co = NULL;
+    RemoteCommDev *comdev = NULL;
+    Error *err = NULL;
+
+    dev = qdev_find_recursive(sysbus_get_default(), o->devid);
+    if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
+        error_report("%s is not a PCI device", o->devid);
+        return;
+    }
+
+    ioc = qio_channel_new_fd(o->fd, &err);
+    if (!ioc) {
+        error_report_err(err);
+        return;
+    }
+    qio_channel_set_blocking(ioc, false, NULL);
+
+    o->dev = dev;
+
+    o->listener.unrealize = remote_object_unrealize_listener;
+    device_listener_register(&o->listener);
+
+    /* co-routine should free this. */
+    comdev = g_new0(RemoteCommDev, 1);
+    *comdev = (RemoteCommDev) {
+        .ioc = ioc,
+        .dev = PCI_DEVICE(dev),
+    };
+
+    co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev);
+    qemu_coroutine_enter(co);
+}
+
+static void remote_object_init(Object *obj)
+{
+    RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+    RemoteObject *o = REMOTE_OBJECT(obj);
+
+    if (k->nr_devs >= k->max_devs) {
+        error_report("Reached maximum number of devices: %u", k->max_devs);
+        return;
+    }
+
+    o->ioc = NULL;
+    o->fd = -1;
+    o->devid = NULL;
+
+    k->nr_devs++;
+
+    o->machine_done.notify = remote_object_machine_done;
+    qemu_add_machine_init_done_notifier(&o->machine_done);
+}
+
+static void remote_object_finalize(Object *obj)
+{
+    RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
+    RemoteObject *o = REMOTE_OBJECT(obj);
+
+    device_listener_unregister(&o->listener);
+
+    if (o->ioc) {
+        qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+        qio_channel_close(o->ioc, NULL);
+    }
+
+    object_unref(OBJECT(o->ioc));
+
+    k->nr_devs--;
+    g_free(o->devid);
+}
+
+static void remote_object_class_init(ObjectClass *klass, void *data)
+{
+    RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass);
+
+    /*
+     * Limit number of supported devices to 1. This is done to avoid devices
+     * from one VM accessing the RAM of another VM. This is done until we
+     * start using separate address spaces for individual devices.
+     */
+    k->max_devs = 1;
+    k->nr_devs = 0;
+
+    object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
+    object_class_property_add_str(klass, "devid", NULL,
+                                  remote_object_set_devid);
+}
+
+static const TypeInfo remote_object_info = {
+    .name = TYPE_REMOTE_OBJECT,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(RemoteObject),
+    .instance_init = remote_object_init,
+    .instance_finalize = remote_object_finalize,
+    .class_size = sizeof(RemoteObjectClass),
+    .class_init = remote_object_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+static void register_types(void)
+{
+    type_register_static(&remote_object_info);
+}
+
+type_init(register_types);
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -XXX,XX +XXX,XX @@ remote_ss = ss.source_set()
 
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 
 softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
--
2.29.2
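For orientation, the x-remote-object defined above is instantiated in the
device (remote) process with -object, pairing a device id with an
already-open socket fd. A rough command-line sketch, assuming the syntax
described in this series' documentation patch (the fd number is illustrative):

    qemu-system-x86_64 -machine x-remote \
        -device lsi53c895a,id=lsi1 \
        -object x-remote-object,id=robj1,devid=lsi1,fd=4

remote_object_machine_done() then looks up "lsi1", wraps fd 4 in a
QIOChannel, and enters mpqemu_remote_msg_loop_co for that device.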
From: Paolo Bonzini <pbonzini@redhat.com>

This patch prepares for the removal of unnecessary lockcnt inc/dec pairs.
Extract the dispatching loop for file descriptor handlers into a new
function aio_dispatch_handlers, and then inline aio_dispatch into
aio_poll.

aio_dispatch can now become void.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-17-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/block/aio.h |  6 +-----
 util/aio-posix.c    | 44 ++++++++++++++------------------------------
 util/aio-win32.c    | 13 ++++---------
 util/async.c        |  2 +-
 4 files changed, 20 insertions(+), 45 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index XXXXXXX..XXXXXXX 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -XXX,XX +XXX,XX @@ bool aio_pending(AioContext *ctx);
 /* Dispatch any pending callbacks from the GSource attached to the AioContext.
  *
  * This is used internally in the implementation of the GSource.
- *
- * @dispatch_fds: true to process fds, false to skip them
- *                (can be used as an optimization by callers that know there
- *                are no fds ready)
  */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds);
+void aio_dispatch(AioContext *ctx);
 
 /* Progress in completing AIO work to occur.  This can issue new pending
  * aio as a result of executing I/O completion or bh callbacks.
diff --git a/util/aio-posix.c b/util/aio-posix.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
     AioHandler *node, *tmp;
     bool progress = false;
 
-    /*
-     * We have to walk very carefully in case aio_set_fd_handler is
-     * called while we're walking.
-     */
-    qemu_lockcnt_inc(&ctx->list_lock);
-
     QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
         int revents;
 
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx)
         }
     }
 
-    qemu_lockcnt_dec(&ctx->list_lock);
     return progress;
 }
 
-/*
- * Note that dispatch_fds == false has the side-effect of post-poning the
- * freeing of deleted handlers.
- */
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
+    aio_bh_poll(ctx);
 
-    /*
-     * If there are callbacks left that have been queued, we need to call them.
-     * Do not call select in this case, because it is possible that the caller
-     * does not need a complete flush (as is the case for aio_poll loops).
-     */
-    progress = aio_bh_poll(ctx);
+    qemu_lockcnt_inc(&ctx->list_lock);
+    aio_dispatch_handlers(ctx);
+    qemu_lockcnt_dec(&ctx->list_lock);
 
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx);
-    }
-
-    /* Run our timers */
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-
-    return progress;
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 /* These thread-local variables are used only in a small part of aio_poll
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
     npfd = 0;
     qemu_lockcnt_dec(&ctx->list_lock);
 
-    /* Run dispatch even if there were no readable fds to run timers */
-    if (aio_dispatch(ctx, ret > 0)) {
-        progress = true;
+    progress |= aio_bh_poll(ctx);
+
+    if (ret > 0) {
+        qemu_lockcnt_inc(&ctx->list_lock);
+        progress |= aio_dispatch_handlers(ctx);
+        qemu_lockcnt_dec(&ctx->list_lock);
     }
 
+    progress |= timerlistgroup_run_timers(&ctx->tlg);
+
     return progress;
 }
 
diff --git a/util/aio-win32.c b/util/aio-win32.c
index XXXXXXX..XXXXXXX 100644
--- a/util/aio-win32.c
+++ b/util/aio-win32.c
@@ -XXX,XX +XXX,XX @@ static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
     return progress;
 }
 
-bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
+void aio_dispatch(AioContext *ctx)
 {
-    bool progress;
-
-    progress = aio_bh_poll(ctx);
-    if (dispatch_fds) {
-        progress |= aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
-    }
-    progress |= timerlistgroup_run_timers(&ctx->tlg);
-    return progress;
+    aio_bh_poll(ctx);
+    aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+    timerlistgroup_run_timers(&ctx->tlg);
 }
 
 bool aio_poll(AioContext *ctx, bool blocking)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ aio_ctx_dispatch(GSource *source,
     AioContext *ctx = (AioContext *) source;
 
     assert(callback == NULL);
-    aio_dispatch(ctx, true);
+    aio_dispatch(ctx);
     return true;
 }
 
--
2.9.3

From: Jagannathan Raman <jag.raman@oracle.com>

SyncSysMemMsg message format is defined. It is used to send
file descriptors of the RAM regions to the remote device.
RAM on the remote device is configured with a set of file descriptors.
Old RAM regions are deleted and new regions, each with an fd, are
added to the RAM.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 7d2d1831d812e85f681e7a8ab99e032cf4704689.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                     |  2 +
 include/hw/remote/memory.h      | 19 ++++++++++
 include/hw/remote/mpqemu-link.h | 10 +++++
 hw/remote/memory.c              | 65 +++++++++++++++++++++++++++++++++
 hw/remote/mpqemu-link.c         | 11 ++++++
 hw/remote/meson.build           |  2 +
 6 files changed, 109 insertions(+)
 create mode 100644 include/hw/remote/memory.h
 create mode 100644 hw/remote/memory.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: hw/remote/mpqemu-link.c
 F: include/hw/remote/mpqemu-link.h
 F: hw/remote/message.c
 F: hw/remote/remote-obj.c
+F: include/hw/remote/memory.h
+F: hw/remote/memory.c
 
 Build and test automation
 -------------------------
diff --git a/include/hw/remote/memory.h b/include/hw/remote/memory.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/remote/memory.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_MEMORY_H
+#define REMOTE_MEMORY_H
+
+#include "exec/hwaddr.h"
+#include "hw/remote/mpqemu-link.h"
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp);
+
+#endif
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -XXX,XX +XXX,XX @@
 #include "qom/object.h"
 #include "qemu/thread.h"
 #include "io/channel.h"
+#include "exec/hwaddr.h"
 
 #define REMOTE_MAX_FDS 8
 
@@ -XXX,XX +XXX,XX @@
 *
 */
 typedef enum {
+    MPQEMU_CMD_SYNC_SYSMEM,
     MPQEMU_CMD_MAX,
 } MPQemuCmd;
 
+typedef struct {
+    hwaddr gpas[REMOTE_MAX_FDS];
+    uint64_t sizes[REMOTE_MAX_FDS];
+    off_t offsets[REMOTE_MAX_FDS];
+} SyncSysmemMsg;
+
 /**
  * MPQemuMsg:
  * @cmd: The remote command
@@ -XXX,XX +XXX,XX @@ typedef enum {
 * MPQemuMsg Format of the message sent to the remote device from QEMU.
 *
 */
+
 typedef struct {
     int cmd;
     size_t size;
 
     union {
         uint64_t u64;
+        SyncSysmemMsg sync_sysmem;
     } data;
 
     int fds[REMOTE_MAX_FDS];
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/remote/memory.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Memory manager for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/memory.h"
+#include "exec/address-spaces.h"
+#include "exec/ram_addr.h"
+#include "qapi/error.h"
+
+static void remote_sysmem_reset(void)
+{
+    MemoryRegion *sysmem, *subregion, *next;
+
+    sysmem = get_system_memory();
+
+    QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) {
+        if (subregion->ram) {
+            memory_region_del_subregion(sysmem, subregion);
+            object_unparent(OBJECT(subregion));
+        }
+    }
+}
+
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem;
+    MemoryRegion *sysmem, *subregion;
+    static unsigned int suffix;
+    int region;
+
+    sysmem = get_system_memory();
+
+    remote_sysmem_reset();
+
+    for (region = 0; region < msg->num_fds; region++) {
+        g_autofree char *name;
+        subregion = g_new(MemoryRegion, 1);
+        name = g_strdup_printf("remote-mem-%u", suffix++);
+        memory_region_init_ram_from_fd(subregion, NULL,
+                                       name, sysmem_info->sizes[region],
+                                       true, msg->fds[region],
+                                       sysmem_info->offsets[region],
+                                       errp);
+
+        if (*errp) {
+            g_free(subregion);
+            remote_sysmem_reset();
+            return;
+        }
+
+        memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
+                                    subregion);
+
+    }
+}
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/mpqemu-link.c
+++ b/hw/remote/mpqemu-link.c
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
         }
     }
 
+    /* Verify message specific fields. */
+    switch (msg->cmd) {
+    case MPQEMU_CMD_SYNC_SYSMEM:
+        if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
+            return false;
+        }
+        break;
+    default:
+        break;
+    }
+
     return true;
 }
 
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
+
 softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
--
2.29.2
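To illustrate the wire format introduced above: the QEMU side would describe
its RAM with one MPQEMU_CMD_SYNC_SYSMEM message per reconfiguration, roughly
like this (a sketch only; nr_regions, region_fds, gpas, sizes, offsets, ioc
and err are invented locals, and error handling is omitted):

    MPQemuMsg msg = {
        .cmd = MPQEMU_CMD_SYNC_SYSMEM,
        .size = sizeof(SyncSysmemMsg),
        .num_fds = nr_regions,                 /* one fd per RAM region */
    };

    for (int i = 0; i < nr_regions; i++) {
        msg.fds[i] = region_fds[i];            /* memory-backend fd */
        msg.data.sync_sysmem.gpas[i] = gpas[i];      /* guest-physical base */
        msg.data.sync_sysmem.sizes[i] = sizes[i];
        msg.data.sync_sysmem.offsets[i] = offsets[i]; /* offset into the fd */
    }
    mpqemu_msg_send(&msg, ioc, &err);

On receipt, remote_sysmem_reconfig() drops all current RAM subregions and
maps each fd with memory_region_init_ram_from_fd(), as shown in memory.c
above.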
From: Paolo Bonzini <pbonzini@redhat.com>

Add two implementations of the same benchmark as the previous patch,
but using pthreads.  One uses a normal QemuMutex, the other is Linux
only and implements a fair mutex based on MCS locks and futexes.
This shows that the slower performance of the 5-thread case is due to
the fairness of CoMutex, rather than to coroutines.  If fairness does
not matter, as is the case with two threads, CoMutex can actually be
faster than pthreads.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/test-aio-multithread.c | 164 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 164 insertions(+)

diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
index XXXXXXX..XXXXXXX 100644
--- a/tests/test-aio-multithread.c
+++ b/tests/test-aio-multithread.c
@@ -XXX,XX +XXX,XX @@ static void test_multi_co_mutex_2_30(void)
     test_multi_co_mutex(2, 30);
 }
 
+/* Same test with fair mutexes, for performance comparison. */
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* The nodes for the mutex reside in this structure (on which we try to avoid
+ * false sharing).  The head of the mutex is in the "mutex_head" variable.
+ */
+static struct {
+    int next, locked;
+    int padding[14];
+} nodes[NUM_CONTEXTS] __attribute__((__aligned__(64)));
+
+static int mutex_head = -1;
+
+static void mcs_mutex_lock(void)
+{
+    int prev;
+
+    nodes[id].next = -1;
+    nodes[id].locked = 1;
+    prev = atomic_xchg(&mutex_head, id);
+    if (prev != -1) {
+        atomic_set(&nodes[prev].next, id);
+        qemu_futex_wait(&nodes[id].locked, 1);
+    }
+}
+
+static void mcs_mutex_unlock(void)
+{
+    int next;
+    if (nodes[id].next == -1) {
+        if (atomic_read(&mutex_head) == id &&
+            atomic_cmpxchg(&mutex_head, id, -1) == id) {
+            /* Last item in the list, exit.  */
+            return;
+        }
+        while (atomic_read(&nodes[id].next) == -1) {
+            /* mcs_mutex_lock did the xchg, but has not updated
+             * nodes[prev].next yet.
+             */
+        }
+    }
+
+    /* Wake up the next in line.  */
+    next = nodes[id].next;
+    nodes[next].locked = 0;
+    qemu_futex_wake(&nodes[next].locked, 1);
+}
+
+static void test_multi_fair_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        mcs_mutex_lock();
+        counter++;
+        mcs_mutex_unlock();
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_fair_mutex(int threads, int seconds)
+{
+    int i;
+
+    assert(mutex_head == -1);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_fair_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_fair_mutex_1(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_fair_mutex_10(void)
+{
+    test_multi_fair_mutex(NUM_CONTEXTS, 10);
+}
+#endif
+
+/* Same test with pthread mutexes, for performance comparison and
+ * portability. */
+
+static QemuMutex mutex;
+
+static void test_multi_mutex_entry(void *opaque)
+{
+    while (!atomic_mb_read(&now_stopping)) {
+        qemu_mutex_lock(&mutex);
+        counter++;
+        qemu_mutex_unlock(&mutex);
+        atomic_inc(&atomic_counter);
+    }
+    atomic_dec(&running);
+}
+
+static void test_multi_mutex(int threads, int seconds)
+{
+    int i;
+
+    qemu_mutex_init(&mutex);
+    counter = 0;
+    atomic_counter = 0;
+    now_stopping = false;
+
+    create_aio_contexts();
+    assert(threads <= NUM_CONTEXTS);
+    running = threads;
+    for (i = 0; i < threads; i++) {
+        Coroutine *co1 = qemu_coroutine_create(test_multi_mutex_entry, NULL);
+        aio_co_schedule(ctx[i], co1);
+    }
+
+    g_usleep(seconds * 1000000);
+
+    atomic_mb_set(&now_stopping, true);
+    while (running > 0) {
+        g_usleep(100000);
+    }
+
+    join_aio_contexts();
+    g_test_message("%d iterations/second\n", counter / seconds);
+    g_assert_cmpint(counter, ==, atomic_counter);
+}
+
+static void test_multi_mutex_1(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 1);
+}
+
+static void test_multi_mutex_10(void)
+{
+    test_multi_mutex(NUM_CONTEXTS, 10);
+}
+
 /* End of tests.  */
 
 int main(int argc, char **argv)
@@ -XXX,XX +XXX,XX @@ int main(int argc, char **argv)
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_1);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_3);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_1);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_1);
     } else {
         g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
         g_test_add_func("/aio/multi/mutex/contended", test_multi_co_mutex_10);
         g_test_add_func("/aio/multi/mutex/handoff", test_multi_co_mutex_2_30);
+#ifdef CONFIG_LINUX
+        g_test_add_func("/aio/multi/mutex/mcs", test_multi_fair_mutex_10);
+#endif
+        g_test_add_func("/aio/multi/mutex/pthread", test_multi_mutex_10);
     }
     return g_test_run();
 }
--
2.9.3

From: Elena Ufimtseva <elena.ufimtseva@oracle.com>

Defines a PCI Device proxy object as a child of TYPE_PCI_DEVICE.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: b5186ebfedf8e557044d09a768846c59230ad3a7.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS               |  2 +
 include/hw/remote/proxy.h | 33 +++++++++++++
 hw/remote/proxy.c         | 99 +++++++++++++++++++++++++++++++++++++++
 hw/remote/meson.build     |  1 +
 4 files changed, 135 insertions(+)
 create mode 100644 include/hw/remote/proxy.h
 create mode 100644 hw/remote/proxy.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: hw/remote/message.c
 F: hw/remote/remote-obj.c
 F: include/hw/remote/memory.h
 F: hw/remote/memory.c
+F: hw/remote/proxy.c
+F: include/hw/remote/proxy.h
 
 Build and test automation
 -------------------------
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/remote/proxy.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef PROXY_H
+#define PROXY_H
+
+#include "hw/pci/pci.h"
+#include "io/channel.h"
+
+#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
+OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
+
+struct PCIProxyDev {
+    PCIDevice parent_dev;
+    char *fd;
+
+    /*
+     * Mutex used to protect the QIOChannel fd from
+     * the concurrent access by the VCPUs since proxy
+     * blocks while awaiting for the replies from the
+     * process remote.
+     */
+    QemuMutex io_mutex;
+    QIOChannel *ioc;
+    Error *migration_blocker;
+};
+
+#endif /* PROXY_H */
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/remote/proxy.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/remote/proxy.h"
+#include "hw/pci/pci.h"
+#include "qapi/error.h"
+#include "io/channel-util.h"
+#include "hw/qdev-properties.h"
+#include "monitor/monitor.h"
+#include "migration/blocker.h"
+#include "qemu/sockets.h"
+
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
+{
+    ERRP_GUARD();
+    PCIProxyDev *dev = PCI_PROXY_DEV(device);
+    int fd;
+
+    if (!dev->fd) {
+        error_setg(errp, "fd parameter not specified for %s",
+                   DEVICE(device)->id);
+        return;
+    }
+
+    fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
+    if (fd == -1) {
+        error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
+        return;
+    }
+
+    if (!fd_is_socket(fd)) {
+        error_setg(errp, "proxy: fd %d is not a socket", fd);
+        close(fd);
+        return;
+    }
+
+    dev->ioc = qio_channel_new_fd(fd, errp);
+
+    error_setg(&dev->migration_blocker, "%s does not support migration",
+               TYPE_PCI_PROXY_DEV);
+    migrate_add_blocker(dev->migration_blocker, errp);
+
+    qemu_mutex_init(&dev->io_mutex);
+    qio_channel_set_blocking(dev->ioc, true, NULL);
+}
+
+static void pci_proxy_dev_exit(PCIDevice *pdev)
+{
+    PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
+
+    if (dev->ioc) {
+        qio_channel_close(dev->ioc, NULL);
+    }
+
+    migrate_del_blocker(dev->migration_blocker);
+
+    error_free(dev->migration_blocker);
+}
+
+static Property proxy_properties[] = {
+    DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+    k->realize = pci_proxy_dev_realize;
+    k->exit = pci_proxy_dev_exit;
+    device_class_set_props(dc, proxy_properties);
+}
+
+static const TypeInfo pci_proxy_dev_type_info = {
+    .name = TYPE_PCI_PROXY_DEV,
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(PCIProxyDev),
+    .class_init = pci_proxy_dev_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
+        { },
+    },
+};
+
+static void pci_proxy_dev_register_types(void)
+{
+    type_register_static(&pci_proxy_dev_type_info);
+}
+
+type_init(pci_proxy_dev_register_types)
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
 
 specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
 
 softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
--
2.29.2
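The proxy defined above is the counterpart of the remote process'
x-remote-object: in the main VM it stands in for the real PCI device. A
command-line sketch, assuming the syntax from this series' documentation
(the fd number is illustrative):

    qemu-system-x86_64 ... \
        -device x-pci-proxy-dev,id=lsi1,fd=4

The realize function above only validates and adopts the socket; the actual
request/reply traffic over dev->ioc, serialized by io_mutex, is added by the
communication-function patches that follow.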
From: Paolo Bonzini <pbonzini@redhat.com>

All that CoQueue needs in order to become thread-safe is help
from an external mutex.  Add this to the API.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-6-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  8 +++++---
 block/backup.c             |  2 +-
 block/io.c                 |  4 ++--
 block/nbd-client.c         |  2 +-
 block/qcow2-cluster.c      |  4 +---
 block/sheepdog.c           |  2 +-
 block/throttle-groups.c    |  2 +-
 hw/9pfs/9p.c               |  2 +-
 util/qemu-coroutine-lock.c | 24 +++++++++++++++++++++---
 9 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
 
 /**
  * CoQueues are a mechanism to queue coroutines in order to continue executing
- * them later.
+ * them later. They are similar to condition variables, but they need help
+ * from an external mutex in order to maintain thread-safety.
  */
 typedef struct CoQueue {
     QSIMPLEQ_HEAD(, Coroutine) entries;
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue);
 
 /**
  * Adds the current coroutine to the CoQueue and transfers control to the
- * caller of the coroutine.
+ * caller of the coroutine.  The mutex is unlocked during the wait and
+ * locked again afterwards.
  */
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex);
 
 /**
  * Restarts the next coroutine in the CoQueue and removes it from the queue.
diff --git a/block/backup.c b/block/backup.c
index XXXXXXX..XXXXXXX 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
         retry = false;
         QLIST_FOREACH(req, &job->inflight_reqs, list) {
             if (end > req->start && start < req->end) {
-                qemu_co_queue_wait(&req->wait_queue);
+                qemu_co_queue_wait(&req->wait_queue, NULL);
                 retry = true;
                 break;
             }
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
                  * (instead of producing a deadlock in the former case). */
                 if (!req->waiting_for) {
                     self->waiting_for = req;
-                    qemu_co_queue_wait(&req->wait_queue);
+                    qemu_co_queue_wait(&req->wait_queue, NULL);
                     self->waiting_for = NULL;
                     retry = true;
                     waited = true;
@@ -XXX,XX +XXX,XX @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
     /* Wait until any previous flushes are completed */
     while (bs->active_flush_req) {
-        qemu_co_queue_wait(&bs->flush_queue);
+        qemu_co_queue_wait(&bs->flush_queue, NULL);
     }
 
     bs->active_flush_req = true;
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* Poor man semaphore.  The free_sema is locked when no other request
      * can be accepted, and unlocked after receiving one reply.  */
     if (s->in_flight == MAX_NBD_REQUESTS) {
-        qemu_co_queue_wait(&s->free_sema);
+        qemu_co_queue_wait(&s->free_sema, NULL);
         assert(s->in_flight < MAX_NBD_REQUESTS);
     }
     s->in_flight++;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -XXX,XX +XXX,XX @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
             if (bytes == 0) {
                 /* Wait for the dependency to complete. We need to recheck
                  * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests, &s->lock);
                 return -EAGAIN;
             }
         }
diff --git a/block/sheepdog.c b/block/sheepdog.c
index XXXXXXX..XXXXXXX 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -XXX,XX +XXX,XX @@ static void wait_for_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *acb)
 retry:
     QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) {
         if (AIOCBOverlapping(acb, cb)) {
-            qemu_co_queue_wait(&s->overlapping_queue);
+            qemu_co_queue_wait(&s->overlapping_queue, NULL);
             goto retry;
         }
     }
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index XXXXXXX..XXXXXXX 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -XXX,XX +XXX,XX @@ void coroutine_fn throttle_group_co_io_limits_intercept(BlockBackend *blk,
     if (must_wait || blkp->pending_reqs[is_write]) {
         blkp->pending_reqs[is_write]++;
         qemu_mutex_unlock(&tg->lock);
-        qemu_co_queue_wait(&blkp->throttled_reqs[is_write]);
+        qemu_co_queue_wait(&blkp->throttled_reqs[is_write], NULL);
         qemu_mutex_lock(&tg->lock);
         blkp->pending_reqs[is_write]--;
     }
diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn v9fs_flush(void *opaque)
         /*
          * Wait for pdu to complete.
          */
-        qemu_co_queue_wait(&cancel_pdu->complete);
+        qemu_co_queue_wait(&cancel_pdu->complete, NULL);
         cancel_pdu->cancelled = 0;
         pdu_free(cancel_pdu);
     }
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@ void qemu_co_queue_init(CoQueue *queue)
     QSIMPLEQ_INIT(&queue->entries);
 }
 
-void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue, CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+
+    if (mutex) {
+        qemu_co_mutex_unlock(mutex);
+    }
+
+    /* There is no race condition here.  Other threads will call
+     * aio_co_schedule on our AioContext, which can reenter this
+     * coroutine but only after this yield and after the main loop
+     * has gone through the next iteration.
+     */
     qemu_coroutine_yield();
     assert(qemu_in_coroutine());
+
+    /* TODO: OSv implements wait morphing here, where the wakeup
+     * primitive automatically places the woken coroutine on the
+     * mutex's queue.  This avoids the thundering herd effect.
+     */
+    if (mutex) {
+        qemu_co_mutex_lock(mutex);
+    }
 }
 
 /**
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_rdlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->reader++;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void qemu_co_rwlock_wrlock(CoRwlock *lock)
     Coroutine *self = qemu_coroutine_self();
 
     while (lock->writer || lock->reader) {
-        qemu_co_queue_wait(&lock->queue);
+        qemu_co_queue_wait(&lock->queue, NULL);
     }
     lock->writer = true;
     self->locks_held++;
--
2.9.3

From: Elena Ufimtseva <elena.ufimtseva@oracle.com>

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: d54edb4176361eed86b903e8f27058363b6c83b3.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/hw/remote/mpqemu-link.h |  4 ++++
 hw/remote/mpqemu-link.c         | 34 +++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -XXX,XX +XXX,XX @@
 #include "qemu/thread.h"
 #include "io/channel.h"
 #include "exec/hwaddr.h"
+#include "io/channel-socket.h"
+#include "hw/remote/proxy.h"
 
 #define REMOTE_MAX_FDS 8
 
@@ -XXX,XX +XXX,XX @@ typedef struct {
 bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
 bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
 
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+                                         Error **errp);
 bool mpqemu_msg_valid(MPQemuMsg *msg);
 
 #endif
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/mpqemu-link.c
+++ b/hw/remote/mpqemu-link.c
@@ -XXX,XX +XXX,XX @@ fail:
     return ret;
 }
 
+/*
+ * Send msg and wait for a reply with command code RET_MSG.
+ * Returns the message received of size u64 or UINT64_MAX
+ * on error.
+ * Called from VCPU thread in non-coroutine context.
+ * Used by the Proxy object to communicate to remote processes.
+ */
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
+                                         Error **errp)
+{
+    ERRP_GUARD();
+    MPQemuMsg msg_reply = {0};
+    uint64_t ret = UINT64_MAX;
+
+    assert(!qemu_in_coroutine());
+
+    QEMU_LOCK_GUARD(&pdev->io_mutex);
+    if (!mpqemu_msg_send(msg, pdev->ioc, errp)) {
+        return ret;
+    }
+
+    if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) {
+        return ret;
+    }
+
+    if (!mpqemu_msg_valid(&msg_reply)) {
+        error_setg(errp, "ERROR: Invalid reply received for command %d",
+                   msg->cmd);
+        return ret;
+    }
+
+    return msg_reply.data.u64;
+}
+
 bool mpqemu_msg_valid(MPQemuMsg *msg)
 {
     if (msg->cmd >= MPQEMU_CMD_MAX && msg->cmd < 0) {
--
2.29.2
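The qcow2-cluster.c hunk above is the motivating case for the new argument:
it replaces an unlock/wait/lock sequence with a single call. The general
condition-variable-style pattern under the new API looks like this (a sketch;
lock, queue and ready are invented names):

    qemu_co_mutex_lock(&lock);
    while (!ready) {
        /* Drops `lock` while sleeping, re-acquires it before returning. */
        qemu_co_queue_wait(&queue, &lock);
    }
    /* ... consume the condition under `lock` ... */
    qemu_co_mutex_unlock(&lock);

Callers that are still protected by the AioContext lock alone pass NULL,
which is why most hunks above simply gain a ", NULL" argument.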
From: Paolo Bonzini <pbonzini@redhat.com>

In the client, read the reply headers from a coroutine, switching the
read side between the "read header" coroutine and the I/O coroutine that
reads the body of the reply.

In the server, if the server can read more requests it will create a new
"read request" coroutine as soon as a request has been read.  Otherwise,
the new coroutine is created in nbd_request_put.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-8-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/nbd-client.h |   2 +-
 block/nbd-client.c | 117 ++++++++++++++++++++++++-----------------------
 nbd/client.c       |   2 +-
 nbd/common.c       |   9 +----
 nbd/server.c       |  94 +++++++++++++-----------------------
 5 files changed, 83 insertions(+), 141 deletions(-)

diff --git a/block/nbd-client.h b/block/nbd-client.h
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -XXX,XX +XXX,XX @@ typedef struct NBDClientSession {
 
     CoMutex send_mutex;
     CoQueue free_sema;
-    Coroutine *send_coroutine;
+    Coroutine *read_reply_co;
     int in_flight;
 
     Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
diff --git a/block/nbd-client.c b/block/nbd-client.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -XXX,XX +XXX,XX @@
 #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
 #define INDEX_TO_HANDLE(bs, index)  ((index)  ^ ((uint64_t)(intptr_t)bs))
 
-static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
+static void nbd_recv_coroutines_enter_all(BlockDriverState *bs)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i;
 
     for (i = 0; i < MAX_NBD_REQUESTS; i++) {
@@ -XXX,XX +XXX,XX @@ static void nbd_recv_coroutines_enter_all(NBDClientSession *s)
             qemu_coroutine_enter(s->recv_coroutine[i]);
         }
     }
+    BDRV_POLL_WHILE(bs, s->read_reply_co);
 }
 
 static void nbd_teardown_connection(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     qio_channel_shutdown(client->ioc,
                          QIO_CHANNEL_SHUTDOWN_BOTH,
                          NULL);
-    nbd_recv_coroutines_enter_all(client);
+    nbd_recv_coroutines_enter_all(bs);
 
     nbd_client_detach_aio_context(bs);
     object_unref(OBJECT(client->sioc));
@@ -XXX,XX +XXX,XX @@ static void nbd_teardown_connection(BlockDriverState *bs)
     client->ioc = NULL;
 }
 
-static void nbd_reply_ready(void *opaque)
+static coroutine_fn void nbd_read_reply_entry(void *opaque)
 {
-    BlockDriverState *bs = opaque;
-    NBDClientSession *s = nbd_get_client_session(bs);
+    NBDClientSession *s = opaque;
     uint64_t i;
     int ret;
 
-    if (!s->ioc) { /* Already closed */
-        return;
-    }
-
-    if (s->reply.handle == 0) {
-        /* No reply already in flight.  Fetch a header.  It is possible
-         * that another thread has done the same thing in parallel, so
-         * the socket is not readable anymore.
-         */
+    for (;;) {
+        assert(s->reply.handle == 0);
         ret = nbd_receive_reply(s->ioc, &s->reply);
-        if (ret == -EAGAIN) {
-            return;
-        }
         if (ret < 0) {
-            s->reply.handle = 0;
-            goto fail;
+            break;
         }
-    }
 
-    /* There's no need for a mutex on the receive side, because the
-     * handler acts as a synchronization point and ensures that only
-     * one coroutine is called until the reply finishes. */
-    i = HANDLE_TO_INDEX(s, s->reply.handle);
-    if (i >= MAX_NBD_REQUESTS) {
-        goto fail;
-    }
+        /* There's no need for a mutex on the receive side, because the
+         * handler acts as a synchronization point and ensures that only
+         * one coroutine is called until the reply finishes.
+         */
+        i = HANDLE_TO_INDEX(s, s->reply.handle);
+        if (i >= MAX_NBD_REQUESTS || !s->recv_coroutine[i]) {
+            break;
+        }
 
-    if (s->recv_coroutine[i]) {
-        qemu_coroutine_enter(s->recv_coroutine[i]);
-        return;
+        /* We're woken up by the recv_coroutine itself.  Note that there
+         * is no race between yielding and reentering read_reply_co.  This
+         * is because:
+         *
+         * - if recv_coroutine[i] runs on the same AioContext, it is only
+         *   entered after we yield
+         *
+         * - if recv_coroutine[i] runs on a different AioContext, reentering
+         *   read_reply_co happens through a bottom half, which can only
+         *   run after we yield.
+         */
+        aio_co_wake(s->recv_coroutine[i]);
+        qemu_coroutine_yield();
     }
-
-fail:
-    nbd_teardown_connection(bs);
-}
-
-static void nbd_restart_write(void *opaque)
-{
-    BlockDriverState *bs = opaque;
-
-    qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine);
+    s->read_reply_co = NULL;
 }
 
 static int nbd_co_send_request(BlockDriverState *bs,
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
                                QEMUIOVector *qiov)
 {
     NBDClientSession *s = nbd_get_client_session(bs);
-    AioContext *aio_context;
     int rc, ret, i;
 
     qemu_co_mutex_lock(&s->send_mutex);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
         return -EPIPE;
     }
 
-    s->send_coroutine = qemu_coroutine_self();
-    aio_context = bdrv_get_aio_context(bs);
-
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, nbd_restart_write, NULL, bs);
     if (qiov) {
         qio_channel_set_cork(s->ioc, true);
         rc = nbd_send_request(s->ioc, request);
@@ -XXX,XX +XXX,XX @@ static int nbd_co_send_request(BlockDriverState *bs,
     } else {
         rc = nbd_send_request(s->ioc, request);
     }
-    aio_set_fd_handler(aio_context, s->sioc->fd, false,
-                       nbd_reply_ready, NULL, NULL, bs);
-    s->send_coroutine = NULL;
     qemu_co_mutex_unlock(&s->send_mutex);
     return rc;
 }
@@ -XXX,XX +XXX,XX @@ static void nbd_co_receive_reply(NBDClientSession *s,
 {
     int ret;
 
-    /* Wait until we're woken up by the read handler.  TODO: perhaps
-     * peek at the next reply and avoid yielding if it's ours?  */
+    /* Wait until we're woken up by nbd_read_reply_entry.  */
     qemu_coroutine_yield();
     *reply = s->reply;
     if (reply->handle != request->handle ||
@@ -XXX,XX +XXX,XX @@ static void nbd_coroutine_start(NBDClientSession *s,
     /* s->recv_coroutine[i] is set as soon as we get the send_lock.  */
 }
 
-static void nbd_coroutine_end(NBDClientSession *s,
+static void nbd_coroutine_end(BlockDriverState *bs,
                               NBDRequest *request)
 {
+    NBDClientSession *s = nbd_get_client_session(bs);
     int i = HANDLE_TO_INDEX(s, request->handle);
+
     s->recv_coroutine[i] = NULL;
-    if (s->in_flight-- == MAX_NBD_REQUESTS) {
-        qemu_co_queue_next(&s->free_sema);
+    s->in_flight--;
+    qemu_co_queue_next(&s->free_sema);
+
+    /* Kick the read_reply_co to get the next reply.  */
+    if (s->read_reply_co) {
+        aio_co_wake(s->read_reply_co);
     }
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_preadv(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, qiov);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwritev(BlockDriverState *bs, uint64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_flush(BlockDriverState *bs)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 }
 
@@ -XXX,XX +XXX,XX @@ int nbd_client_co_pdiscard(BlockDriverState *bs, int64_t offset, int count)
     } else {
         nbd_co_receive_reply(client, &request, &reply, NULL);
     }
-    nbd_coroutine_end(client, &request);
+    nbd_coroutine_end(bs, &request);
     return -reply.error;
 
 }
 
 void nbd_client_detach_aio_context(BlockDriverState *bs)
 {
-    aio_set_fd_handler(bdrv_get_aio_context(bs),
-                       nbd_get_client_session(bs)->sioc->fd,
-                       false, NULL, NULL, NULL, NULL);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_detach_aio_context(QIO_CHANNEL(client->sioc));
 }
 
 void nbd_client_attach_aio_context(BlockDriverState *bs,
                                    AioContext *new_context)
 {
-    aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd,
-                       false, nbd_reply_ready, NULL, NULL, bs);
+    NBDClientSession *client = nbd_get_client_session(bs);
+    qio_channel_attach_aio_context(QIO_CHANNEL(client->sioc), new_context);
+    aio_co_schedule(new_context, client->read_reply_co);
 }
 
 void nbd_client_close(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ int nbd_client_init(BlockDriverState *bs,
     /* Now that we're connected, set the socket to be non-blocking and
      * kick the reply mechanism.  */
     qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL);
-
+    client->read_reply_co = qemu_coroutine_create(nbd_read_reply_entry, client);
     nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs));
 
     logout("Established connection with NBD server\n");
diff --git a/nbd/client.c b/nbd/client.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_receive_reply(QIOChannel *ioc, NBDReply *reply)
     ssize_t ret;
 
     ret = read_sync(ioc, buf, sizeof(buf));
-    if (ret < 0) {
+    if (ret <= 0) {
         return ret;
     }
 
diff --git a/nbd/common.c b/nbd/common.c
index XXXXXXX..XXXXXXX 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -XXX,XX +XXX,XX @@ ssize_t nbd_wr_syncv(QIOChannel *ioc,
         }
         if (len == QIO_CHANNEL_ERR_BLOCK) {
             if (qemu_in_coroutine()) {

From: Elena Ufimtseva <elena.ufimtseva@oracle.com>

The Proxy Object sends the PCI config space accesses as messages
to the remote process over the communication channel.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: d3c94f4618813234655356c60e6f0d0362ff42d6.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/hw/remote/mpqemu-link.h | 10 ++++++
 hw/remote/message.c             | 60 +++++++++++++++++++++++++++++++++
 hw/remote/mpqemu-link.c         |  8 ++++-
 hw/remote/proxy.c               | 55 ++++++++++++++++++++++++++++++
 4 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -XXX,XX +XXX,XX @@
 */
 typedef enum {
     MPQEMU_CMD_SYNC_SYSMEM,
+    MPQEMU_CMD_RET,
+    MPQEMU_CMD_PCI_CFGWRITE,
+    MPQEMU_CMD_PCI_CFGREAD,
     MPQEMU_CMD_MAX,
 } MPQemuCmd;
 
@@ -XXX,XX +XXX,XX @@ typedef struct {
     off_t offsets[REMOTE_MAX_FDS];
 } SyncSysmemMsg;
 
+typedef struct {
+    uint32_t addr;
+    uint32_t val;
+    int len;
+} PciConfDataMsg;
+
 /**
  * MPQemuMsg:
  * @cmd: The remote command
@@ -XXX,XX +XXX,XX @@ typedef struct {
 
     union {
         uint64_t u64;
+        PciConfDataMsg pci_conf_data;
         SyncSysmemMsg sync_sysmem;
     } data;
 
diff --git a/hw/remote/message.c b/hw/remote/message.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/message.c
+++ b/hw/remote/message.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/remote/mpqemu-link.h"
 #include "qapi/error.h"
 #include "sysemu/runstate.h"
+#include "hw/pci/pci.h"
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+                                 MPQemuMsg *msg, Error **errp);
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+                                MPQemuMsg *msg, Error **errp);
 
 void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
 {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
         }
 
         switch (msg.cmd) {
+        case MPQEMU_CMD_PCI_CFGWRITE:
+            process_config_write(com->ioc, pci_dev, &msg, &local_err);
+            break;
+        case MPQEMU_CMD_PCI_CFGREAD:
+            process_config_read(com->ioc, pci_dev, &msg, &local_err);
+            break;
         default:
             error_setg(&local_err,
                        "Unknown command (%d) received for device %s"
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
         qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
     }
 }
+
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
+                                 MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        pci_default_write_config(dev, conf->addr, conf->val, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+                      getpid());
+    }
+}
+
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
+                                MPQemuMsg *msg, Error **errp)
+{
+    ERRP_GUARD();
+    PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
+    MPQemuMsg ret = { 0 };
+
+    if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
+        error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
+                   getpid());
+        ret.data.u64 = UINT64_MAX;
+    } else {
+        ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+    ret.size = sizeof(ret.data.u64);
+
+    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
+        error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
+                      getpid());
+    }
+}
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/mpqemu-link.c
+++ b/hw/remote/mpqemu-link.c
@@ -XXX,XX +XXX,XX @@ uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
         return ret;
     }
 
-    if (!mpqemu_msg_valid(&msg_reply)) {
+    if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) {
         error_setg(errp, "ERROR: Invalid reply received for command %d",
                    msg->cmd);
         return ret;
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
             return false;
         }
         break;
+    case MPQEMU_CMD_PCI_CFGWRITE:
311
- /* XXX figure out if we can create a variant on
154
+ case MPQEMU_CMD_PCI_CFGREAD:
312
- * qio_channel_yield() that works with AIO contexts
155
+ if (msg->size != sizeof(PciConfDataMsg)) {
313
- * and consider using that in this branch */
156
+ return false;
314
- qemu_coroutine_yield();
157
+ }
315
- } else if (done) {
158
+ break;
316
- /* XXX this is needed by nbd_reply_ready. */
159
default:
317
- qio_channel_wait(ioc,
160
break;
318
- do_read ? G_IO_IN : G_IO_OUT);
161
}
319
+ qio_channel_yield(ioc, do_read ? G_IO_IN : G_IO_OUT);
162
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
320
} else {
163
index XXXXXXX..XXXXXXX 100644
321
return -EAGAIN;
164
--- a/hw/remote/proxy.c
322
}
165
+++ b/hw/remote/proxy.c
323
diff --git a/nbd/server.c b/nbd/server.c
166
@@ -XXX,XX +XXX,XX @@
324
index XXXXXXX..XXXXXXX 100644
167
#include "monitor/monitor.h"
325
--- a/nbd/server.c
168
#include "migration/blocker.h"
326
+++ b/nbd/server.c
169
#include "qemu/sockets.h"
327
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
170
+#include "hw/remote/mpqemu-link.h"
328
CoMutex send_lock;
171
+#include "qemu/error-report.h"
329
Coroutine *send_coroutine;
172
330
173
static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
331
- bool can_read;
174
{
332
-
175
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
333
QTAILQ_ENTRY(NBDClient) next;
176
error_free(dev->migration_blocker);
334
int nb_requests;
335
bool closing;
336
@@ -XXX,XX +XXX,XX @@ struct NBDClient {
337
338
/* That's all folks */
339
340
-static void nbd_set_handlers(NBDClient *client);
341
-static void nbd_unset_handlers(NBDClient *client);
342
-static void nbd_update_can_read(NBDClient *client);
343
+static void nbd_client_receive_next_request(NBDClient *client);
344
345
static gboolean nbd_negotiate_continue(QIOChannel *ioc,
346
GIOCondition condition,
347
@@ -XXX,XX +XXX,XX @@ void nbd_client_put(NBDClient *client)
348
*/
349
assert(client->closing);
350
351
- nbd_unset_handlers(client);
352
+ qio_channel_detach_aio_context(client->ioc);
353
object_unref(OBJECT(client->sioc));
354
object_unref(OBJECT(client->ioc));
355
if (client->tlscreds) {
356
@@ -XXX,XX +XXX,XX @@ static NBDRequestData *nbd_request_get(NBDClient *client)
357
358
assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
359
client->nb_requests++;
360
- nbd_update_can_read(client);
361
362
req = g_new0(NBDRequestData, 1);
363
nbd_client_get(client);
364
@@ -XXX,XX +XXX,XX @@ static void nbd_request_put(NBDRequestData *req)
365
g_free(req);
366
367
client->nb_requests--;
368
- nbd_update_can_read(client);
369
+ nbd_client_receive_next_request(client);
370
+
371
nbd_client_put(client);
372
}
177
}
373
178
374
@@ -XXX,XX +XXX,XX @@ static void blk_aio_attached(AioContext *ctx, void *opaque)
179
+static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
375
exp->ctx = ctx;
180
+ int len, unsigned int op)
376
181
+{
377
QTAILQ_FOREACH(client, &exp->clients, next) {
182
+ MPQemuMsg msg = { 0 };
378
- nbd_set_handlers(client);
183
+ uint64_t ret = -EINVAL;
379
+ qio_channel_attach_aio_context(client->ioc, ctx);
184
+ Error *local_err = NULL;
380
+ if (client->recv_coroutine) {
185
+
381
+ aio_co_schedule(ctx, client->recv_coroutine);
186
+ msg.cmd = op;
382
+ }
187
+ msg.data.pci_conf_data.addr = addr;
383
+ if (client->send_coroutine) {
188
+ msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0;
384
+ aio_co_schedule(ctx, client->send_coroutine);
189
+ msg.data.pci_conf_data.len = len;
385
+ }
190
+ msg.size = sizeof(PciConfDataMsg);
386
}
191
+
192
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
193
+ if (local_err) {
194
+ error_report_err(local_err);
195
+ }
196
+
197
+ if (ret == UINT64_MAX) {
198
+ error_report("Failed to perform PCI config %s operation",
199
+ (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE");
200
+ }
201
+
202
+ if (op == MPQEMU_CMD_PCI_CFGREAD) {
203
+ *val = (uint32_t)ret;
204
+ }
205
+}
206
+
207
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
208
+{
209
+ uint32_t val;
210
+
211
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD);
212
+
213
+ return val;
214
+}
215
+
216
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
217
+ int len)
218
+{
219
+ /*
220
+ * Some of the functions access the copy of the remote device's PCI
+ * config space which is cached in the proxy device. Therefore, keep
+ * it updated.
223
+ */
224
+ pci_default_write_config(d, addr, val, len);
225
+
226
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
227
+}
228
+
229
static Property proxy_properties[] = {
230
DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
231
DEFINE_PROP_END_OF_LIST(),
232
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
233
234
k->realize = pci_proxy_dev_realize;
235
k->exit = pci_proxy_dev_exit;
236
+ k->config_read = pci_proxy_read_config;
237
+ k->config_write = pci_proxy_write_config;
238
+
239
device_class_set_props(dc, proxy_properties);
387
}
240
}
388
241
389
@@ -XXX,XX +XXX,XX @@ static void blk_aio_detach(void *opaque)
390
TRACE("Export %s: Detaching clients from AIO context %p\n", exp->name, exp->ctx);
391
392
QTAILQ_FOREACH(client, &exp->clients, next) {
393
- nbd_unset_handlers(client);
394
+ qio_channel_detach_aio_context(client->ioc);
395
}
396
397
exp->ctx = NULL;
398
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
399
g_assert(qemu_in_coroutine());
400
qemu_co_mutex_lock(&client->send_lock);
401
client->send_coroutine = qemu_coroutine_self();
402
- nbd_set_handlers(client);
403
404
if (!len) {
405
rc = nbd_send_reply(client->ioc, reply);
406
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_send_reply(NBDRequestData *req, NBDReply *reply,
407
}
408
409
client->send_coroutine = NULL;
410
- nbd_set_handlers(client);
411
qemu_co_mutex_unlock(&client->send_lock);
412
return rc;
413
}
414
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
415
ssize_t rc;
416
417
g_assert(qemu_in_coroutine());
418
- client->recv_coroutine = qemu_coroutine_self();
419
- nbd_update_can_read(client);
420
-
421
+ assert(client->recv_coroutine == qemu_coroutine_self());
422
rc = nbd_receive_request(client->ioc, request);
423
if (rc < 0) {
424
if (rc != -EAGAIN) {
425
@@ -XXX,XX +XXX,XX @@ static ssize_t nbd_co_receive_request(NBDRequestData *req,
426
427
out:
428
client->recv_coroutine = NULL;
429
- nbd_update_can_read(client);
430
+ nbd_client_receive_next_request(client);
431
432
return rc;
433
}
434
435
-static void nbd_trip(void *opaque)
436
+/* Owns a reference to the NBDClient passed as opaque. */
437
+static coroutine_fn void nbd_trip(void *opaque)
438
{
439
NBDClient *client = opaque;
440
NBDExport *exp = client->exp;
441
NBDRequestData *req;
442
- NBDRequest request;
443
+ NBDRequest request = { 0 }; /* GCC thinks it can be used uninitialized */
444
NBDReply reply;
445
ssize_t ret;
446
int flags;
447
448
TRACE("Reading request.");
449
if (client->closing) {
450
+ nbd_client_put(client);
451
return;
452
}
453
454
@@ -XXX,XX +XXX,XX @@ static void nbd_trip(void *opaque)
455
456
done:
457
nbd_request_put(req);
458
+ nbd_client_put(client);
459
return;
460
461
out:
462
nbd_request_put(req);
463
client_close(client);
464
+ nbd_client_put(client);
465
}
466
467
-static void nbd_read(void *opaque)
468
+static void nbd_client_receive_next_request(NBDClient *client)
469
{
470
- NBDClient *client = opaque;
471
-
472
- if (client->recv_coroutine) {
473
- qemu_coroutine_enter(client->recv_coroutine);
474
- } else {
475
- qemu_coroutine_enter(qemu_coroutine_create(nbd_trip, client));
476
- }
477
-}
478
-
479
-static void nbd_restart_write(void *opaque)
480
-{
481
- NBDClient *client = opaque;
482
-
483
- qemu_coroutine_enter(client->send_coroutine);
484
-}
485
-
486
-static void nbd_set_handlers(NBDClient *client)
487
-{
488
- if (client->exp && client->exp->ctx) {
489
- aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true,
490
- client->can_read ? nbd_read : NULL,
491
- client->send_coroutine ? nbd_restart_write : NULL,
492
- NULL, client);
493
- }
494
-}
495
-
496
-static void nbd_unset_handlers(NBDClient *client)
497
-{
498
- if (client->exp && client->exp->ctx) {
499
- aio_set_fd_handler(client->exp->ctx, client->sioc->fd, true, NULL,
500
- NULL, NULL, NULL);
501
- }
502
-}
503
-
504
-static void nbd_update_can_read(NBDClient *client)
505
-{
506
- bool can_read = client->recv_coroutine ||
507
- client->nb_requests < MAX_NBD_REQUESTS;
508
-
509
- if (can_read != client->can_read) {
510
- client->can_read = can_read;
511
- nbd_set_handlers(client);
512
-
513
- /* There is no need to invoke aio_notify(), since aio_set_fd_handler()
514
- * in nbd_set_handlers() will have taken care of that */
515
+ if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS) {
516
+ nbd_client_get(client);
517
+ client->recv_coroutine = qemu_coroutine_create(nbd_trip, client);
518
+ aio_co_schedule(client->exp->ctx, client->recv_coroutine);
519
}
520
}
521
522
@@ -XXX,XX +XXX,XX @@ static coroutine_fn void nbd_co_client_start(void *opaque)
523
goto out;
524
}
525
qemu_co_mutex_init(&client->send_lock);
526
- nbd_set_handlers(client);
527
528
if (exp) {
529
QTAILQ_INSERT_TAIL(&exp->clients, client, next);
530
}
531
+
532
+ nbd_client_receive_next_request(client);
533
+
534
out:
535
g_free(data);
536
}
537
@@ -XXX,XX +XXX,XX @@ void nbd_client_new(NBDExport *exp,
538
object_ref(OBJECT(client->sioc));
539
client->ioc = QIO_CHANNEL(sioc);
540
object_ref(OBJECT(client->ioc));
541
- client->can_read = true;
542
client->close = close_fn;
543
544
data->client = client;
545
--
2.9.3

--
2.29.2
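The conversion above hinges on qio_channel_yield(): instead of
registering nbd_reply_ready/nbd_restart_write fd handlers, a coroutine
simply parks itself on the channel until the socket is ready again. A
minimal sketch of such a read loop (coroutine context assumed, error
handling trimmed; this mirrors the new nbd_wr_syncv() behaviour rather
than adding any new API):

    static coroutine_fn ssize_t read_all(QIOChannel *ioc, char *buf,
                                         size_t len)
    {
        size_t done = 0;

        while (done < len) {
            ssize_t n = qio_channel_read(ioc, buf + done, len - done, NULL);

            if (n == QIO_CHANNEL_ERR_BLOCK) {
                /* Parked here; the AioContext fd handler re-enters us. */
                qio_channel_yield(ioc, G_IO_IN);
                continue;
            }
            if (n <= 0) {
                break;
            }
            done += n;
        }
        return done;
    }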
From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-13-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---

From: Jagannathan Raman <jag.raman@oracle.com>

The proxy device object implements handlers for PCI BAR writes and
reads. The handlers use BAR_WRITE/BAR_READ messages to communicate
the BAR address and the value to be written/read to the remote
process. The remote process implements the handlers for the
BAR_WRITE/BAR_READ messages.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: a8b76714a9688be5552c4c92d089bc9e8a4707ff.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
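To make the round trip concrete, here is a sketch of one proxied BAR
read using the message layout and helper introduced below (the values
are illustrative and error handling is elided):

    MPQemuMsg msg = { 0 };
    Error *local_err = NULL;
    uint64_t val;

    msg.cmd = MPQEMU_CMD_BAR_READ;
    msg.size = sizeof(BarAccessMsg);
    msg.data.bar_access.addr = 0x10;   /* illustrative BAR address */
    msg.data.bar_access.size = 4;      /* power of two, at most 8 bytes */
    msg.data.bar_access.memory = true; /* MMIO rather than port I/O */

    val = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);

The remote end validates the size, performs address_space_rw() on its
own memory map, and answers with an MPQEMU_CMD_RET message carrying
the value, or UINT64_MAX on failure.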
10
block/qed.h | 3 +++
16
include/hw/remote/mpqemu-link.h | 10 ++++
11
block/curl.c | 2 ++
17
include/hw/remote/proxy.h | 9 ++++
12
block/io.c | 5 +++++
18
hw/remote/message.c | 83 +++++++++++++++++++++++++++++++++
13
block/iscsi.c | 8 ++++++--
19
hw/remote/mpqemu-link.c | 6 +++
14
block/null.c | 4 ++++
20
hw/remote/proxy.c | 60 ++++++++++++++++++++++++
15
block/qed.c | 12 ++++++++++++
21
5 files changed, 168 insertions(+)
16
block/throttle-groups.c | 2 ++
22
17
util/aio-posix.c | 2 --
23
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
18
util/aio-win32.c | 2 --
24
index XXXXXXX..XXXXXXX 100644
19
util/qemu-coroutine-sleep.c | 2 +-
25
--- a/include/hw/remote/mpqemu-link.h
20
10 files changed, 35 insertions(+), 7 deletions(-)
26
+++ b/include/hw/remote/mpqemu-link.h
21
27
@@ -XXX,XX +XXX,XX @@ typedef enum {
22
diff --git a/block/qed.h b/block/qed.h
28
MPQEMU_CMD_RET,
23
index XXXXXXX..XXXXXXX 100644
29
MPQEMU_CMD_PCI_CFGWRITE,
24
--- a/block/qed.h
30
MPQEMU_CMD_PCI_CFGREAD,
25
+++ b/block/qed.h
31
+ MPQEMU_CMD_BAR_WRITE,
26
@@ -XXX,XX +XXX,XX @@ enum {
32
+ MPQEMU_CMD_BAR_READ,
27
*/
33
MPQEMU_CMD_MAX,
28
typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len);
34
} MPQemuCmd;
29
35
30
+void qed_acquire(BDRVQEDState *s);
36
@@ -XXX,XX +XXX,XX @@ typedef struct {
31
+void qed_release(BDRVQEDState *s);
37
int len;
38
} PciConfDataMsg;
39
40
+typedef struct {
41
+ hwaddr addr;
42
+ uint64_t val;
43
+ unsigned size;
44
+ bool memory;
45
+} BarAccessMsg;
32
+
46
+
33
/**
47
/**
34
* Generic callback for chaining async callbacks
48
* MPQemuMsg:
35
*/
49
* @cmd: The remote command
36
diff --git a/block/curl.c b/block/curl.c
50
@@ -XXX,XX +XXX,XX @@ typedef struct {
37
index XXXXXXX..XXXXXXX 100644
51
uint64_t u64;
38
--- a/block/curl.c
52
PciConfDataMsg pci_conf_data;
39
+++ b/block/curl.c
53
SyncSysmemMsg sync_sysmem;
40
@@ -XXX,XX +XXX,XX @@ static void curl_multi_timeout_do(void *arg)
54
+ BarAccessMsg bar_access;
41
return;
55
} data;
42
}
56
43
57
int fds[REMOTE_MAX_FDS];
44
+ aio_context_acquire(s->aio_context);
58
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
45
curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
59
index XXXXXXX..XXXXXXX 100644
46
60
--- a/include/hw/remote/proxy.h
47
curl_multi_check_completion(s);
61
+++ b/include/hw/remote/proxy.h
48
+ aio_context_release(s->aio_context);
62
@@ -XXX,XX +XXX,XX @@
49
#else
63
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
50
abort();
64
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
51
#endif
65
52
diff --git a/block/io.c b/block/io.c
66
+typedef struct ProxyMemoryRegion {
53
index XXXXXXX..XXXXXXX 100644
67
+ PCIProxyDev *dev;
54
--- a/block/io.c
68
+ MemoryRegion mr;
55
+++ b/block/io.c
69
+ bool memory;
56
@@ -XXX,XX +XXX,XX @@ void bdrv_aio_cancel(BlockAIOCB *acb)
70
+ bool present;
57
if (acb->aiocb_info->get_aio_context) {
71
+ uint8_t type;
58
aio_poll(acb->aiocb_info->get_aio_context(acb), true);
72
+} ProxyMemoryRegion;
59
} else if (acb->bs) {
73
+
60
+ /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
74
struct PCIProxyDev {
61
+ * assert that we're not using an I/O thread. Thread-safe
75
PCIDevice parent_dev;
62
+ * code should use bdrv_aio_cancel_async exclusively.
76
char *fd;
63
+ */
77
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
64
+ assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
78
QemuMutex io_mutex;
65
aio_poll(bdrv_get_aio_context(acb->bs), true);
79
QIOChannel *ioc;
66
} else {
80
Error *migration_blocker;
67
abort();
81
+ ProxyMemoryRegion region[PCI_NUM_REGIONS];
68
diff --git a/block/iscsi.c b/block/iscsi.c
82
};
69
index XXXXXXX..XXXXXXX 100644
83
70
--- a/block/iscsi.c
84
#endif /* PROXY_H */
71
+++ b/block/iscsi.c
85
diff --git a/hw/remote/message.c b/hw/remote/message.c
72
@@ -XXX,XX +XXX,XX @@ static void iscsi_retry_timer_expired(void *opaque)
86
index XXXXXXX..XXXXXXX 100644
73
struct IscsiTask *iTask = opaque;
87
--- a/hw/remote/message.c
74
iTask->complete = 1;
88
+++ b/hw/remote/message.c
75
if (iTask->co) {
89
@@ -XXX,XX +XXX,XX @@
76
- qemu_coroutine_enter(iTask->co);
90
#include "qapi/error.h"
77
+ aio_co_wake(iTask->co);
91
#include "sysemu/runstate.h"
92
#include "hw/pci/pci.h"
93
+#include "exec/memattrs.h"
94
95
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
96
MPQemuMsg *msg, Error **errp);
97
static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
98
MPQemuMsg *msg, Error **errp);
99
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
100
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
101
102
void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
103
{
104
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
105
case MPQEMU_CMD_PCI_CFGREAD:
106
process_config_read(com->ioc, pci_dev, &msg, &local_err);
107
break;
108
+ case MPQEMU_CMD_BAR_WRITE:
109
+ process_bar_write(com->ioc, &msg, &local_err);
110
+ break;
111
+ case MPQEMU_CMD_BAR_READ:
112
+ process_bar_read(com->ioc, &msg, &local_err);
113
+ break;
114
default:
115
error_setg(&local_err,
116
"Unknown command (%d) received for device %s"
117
@@ -XXX,XX +XXX,XX @@ static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
118
getpid());
78
}
119
}
79
}
120
}
80
121
+
81
@@ -XXX,XX +XXX,XX @@ static void iscsi_nop_timed_event(void *opaque)
122
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
82
{
123
+{
83
IscsiLun *iscsilun = opaque;
124
+ ERRP_GUARD();
84
125
+ BarAccessMsg *bar_access = &msg->data.bar_access;
85
+ aio_context_acquire(iscsilun->aio_context);
126
+ AddressSpace *as =
86
if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) {
127
+ bar_access->memory ? &address_space_memory : &address_space_io;
87
error_report("iSCSI: NOP timeout. Reconnecting...");
128
+ MPQemuMsg ret = { 0 };
88
iscsilun->request_timed_out = true;
129
+ MemTxResult res;
89
} else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) {
130
+ uint64_t val;
90
error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages.");
131
+
91
- return;
132
+ if (!is_power_of_2(bar_access->size) ||
92
+ goto out;
133
+ (bar_access->size > sizeof(uint64_t))) {
134
+ ret.data.u64 = UINT64_MAX;
135
+ goto fail;
136
+ }
137
+
138
+ val = cpu_to_le64(bar_access->val);
139
+
140
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
141
+ (void *)&val, bar_access->size, true);
142
+
143
+ if (res != MEMTX_OK) {
144
+ error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
145
+ bar_access->addr, getpid());
146
+ ret.data.u64 = -1;
147
+ }
148
+
149
+fail:
150
+ ret.cmd = MPQEMU_CMD_RET;
151
+ ret.size = sizeof(ret.data.u64);
152
+
153
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
154
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
155
+ getpid());
156
+ }
157
+}
158
+
159
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
160
+{
161
+ ERRP_GUARD();
162
+ BarAccessMsg *bar_access = &msg->data.bar_access;
163
+ MPQemuMsg ret = { 0 };
164
+ AddressSpace *as;
165
+ MemTxResult res;
166
+ uint64_t val = 0;
167
+
168
+ as = bar_access->memory ? &address_space_memory : &address_space_io;
169
+
170
+ if (!is_power_of_2(bar_access->size) ||
171
+ (bar_access->size > sizeof(uint64_t))) {
172
+ val = UINT64_MAX;
173
+ goto fail;
174
+ }
175
+
176
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
177
+ (void *)&val, bar_access->size, false);
178
+
179
+ if (res != MEMTX_OK) {
180
+ error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
181
+ bar_access->addr, getpid());
182
+ val = UINT64_MAX;
183
+ }
184
+
185
+fail:
186
+ ret.cmd = MPQEMU_CMD_RET;
187
+ ret.data.u64 = le64_to_cpu(val);
188
+ ret.size = sizeof(ret.data.u64);
189
+
190
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
191
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
192
+ getpid());
193
+ }
194
+}
195
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
196
index XXXXXXX..XXXXXXX 100644
197
--- a/hw/remote/mpqemu-link.c
198
+++ b/hw/remote/mpqemu-link.c
199
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
200
return false;
201
}
202
break;
203
+ case MPQEMU_CMD_BAR_WRITE:
204
+ case MPQEMU_CMD_BAR_READ:
205
+ if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
206
+ return false;
207
+ }
208
+ break;
209
default:
210
break;
93
}
211
}
94
212
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
95
timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL);
213
index XXXXXXX..XXXXXXX 100644
96
iscsi_set_events(iscsilun);
214
--- a/hw/remote/proxy.c
97
+
215
+++ b/hw/remote/proxy.c
98
+out:
216
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_register_types(void)
99
+ aio_context_release(iscsilun->aio_context);
100
}
217
}
101
218
102
static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp)
219
type_init(pci_proxy_dev_register_types)
103
diff --git a/block/null.c b/block/null.c
220
+
104
index XXXXXXX..XXXXXXX 100644
221
+static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
105
--- a/block/null.c
222
+ bool write, hwaddr addr, uint64_t *val,
106
+++ b/block/null.c
223
+ unsigned size, bool memory)
107
@@ -XXX,XX +XXX,XX @@ static void null_bh_cb(void *opaque)
224
+{
108
static void null_timer_cb(void *opaque)
225
+ MPQemuMsg msg = { 0 };
109
{
226
+ long ret = -EINVAL;
110
NullAIOCB *acb = opaque;
227
+ Error *local_err = NULL;
111
+ AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
228
+
112
+
229
+ msg.size = sizeof(BarAccessMsg);
113
+ aio_context_acquire(ctx);
230
+ msg.data.bar_access.addr = mr->addr + addr;
114
acb->common.cb(acb->common.opaque, 0);
231
+ msg.data.bar_access.size = size;
115
+ aio_context_release(ctx);
232
+ msg.data.bar_access.memory = memory;
116
timer_deinit(&acb->timer);
233
+
117
qemu_aio_unref(acb);
234
+ if (write) {
118
}
235
+ msg.cmd = MPQEMU_CMD_BAR_WRITE;
119
diff --git a/block/qed.c b/block/qed.c
236
+ msg.data.bar_access.val = *val;
120
index XXXXXXX..XXXXXXX 100644
237
+ } else {
121
--- a/block/qed.c
238
+ msg.cmd = MPQEMU_CMD_BAR_READ;
122
+++ b/block/qed.c
239
+ }
123
@@ -XXX,XX +XXX,XX @@ static void qed_need_check_timer_cb(void *opaque)
240
+
124
241
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
125
trace_qed_need_check_timer_cb(s);
242
+ if (local_err) {
126
243
+ error_report_err(local_err);
127
+ qed_acquire(s);
244
+ }
128
qed_plug_allocating_write_reqs(s);
245
+
129
246
+ if (!write) {
130
/* Ensure writes are on disk before clearing flag */
247
+ *val = ret;
131
bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
248
+ }
132
+ qed_release(s);
249
+}
133
+}
250
+
134
+
251
+static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
135
+void qed_acquire(BDRVQEDState *s)
252
+ unsigned size)
136
+{
253
+{
137
+ aio_context_acquire(bdrv_get_aio_context(s->bs));
254
+ ProxyMemoryRegion *pmr = opaque;
138
+}
255
+
139
+
256
+ send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size,
140
+void qed_release(BDRVQEDState *s)
257
+ pmr->memory);
141
+{
258
+}
142
+ aio_context_release(bdrv_get_aio_context(s->bs));
259
+
143
}
260
+static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
144
261
+{
145
static void qed_start_need_check_timer(BDRVQEDState *s)
262
+ ProxyMemoryRegion *pmr = opaque;
146
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
263
+ uint64_t val;
147
index XXXXXXX..XXXXXXX 100644
264
+
148
--- a/block/throttle-groups.c
265
+ send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size,
149
+++ b/block/throttle-groups.c
266
+ pmr->memory);
150
@@ -XXX,XX +XXX,XX @@ static void timer_cb(BlockBackend *blk, bool is_write)
267
+
151
qemu_mutex_unlock(&tg->lock);
268
+ return val;
152
269
+}
153
/* Run the request that was waiting for this timer */
270
+
154
+ aio_context_acquire(blk_get_aio_context(blk));
271
+const MemoryRegionOps proxy_mr_ops = {
155
empty_queue = !qemu_co_enter_next(&blkp->throttled_reqs[is_write]);
272
+ .read = proxy_bar_read,
156
+ aio_context_release(blk_get_aio_context(blk));
273
+ .write = proxy_bar_write,
157
274
+ .endianness = DEVICE_NATIVE_ENDIAN,
158
/* If the request queue was empty then we have to take care of
275
+ .impl = {
159
* scheduling the next one */
276
+ .min_access_size = 1,
160
diff --git a/util/aio-posix.c b/util/aio-posix.c
277
+ .max_access_size = 8,
161
index XXXXXXX..XXXXXXX 100644
278
+ },
162
--- a/util/aio-posix.c
279
+};
163
+++ b/util/aio-posix.c
164
@@ -XXX,XX +XXX,XX @@ bool aio_dispatch(AioContext *ctx, bool dispatch_fds)
165
}
166
167
/* Run our timers */
168
- aio_context_acquire(ctx);
169
progress |= timerlistgroup_run_timers(&ctx->tlg);
170
- aio_context_release(ctx);
171
172
return progress;
173
}
174
diff --git a/util/aio-win32.c b/util/aio-win32.c
175
index XXXXXXX..XXXXXXX 100644
176
--- a/util/aio-win32.c
177
+++ b/util/aio-win32.c
178
@@ -XXX,XX +XXX,XX @@ bool aio_poll(AioContext *ctx, bool blocking)
179
progress |= aio_dispatch_handlers(ctx, event);
180
} while (count > 0);
181
182
- aio_context_acquire(ctx);
183
progress |= timerlistgroup_run_timers(&ctx->tlg);
184
- aio_context_release(ctx);
185
return progress;
186
}
187
188
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
189
index XXXXXXX..XXXXXXX 100644
190
--- a/util/qemu-coroutine-sleep.c
191
+++ b/util/qemu-coroutine-sleep.c
192
@@ -XXX,XX +XXX,XX @@ static void co_sleep_cb(void *opaque)
193
{
194
CoSleepCB *sleep_cb = opaque;
195
196
- qemu_coroutine_enter(sleep_cb->co);
197
+ aio_co_wake(sleep_cb->co);
198
}
199
200
void coroutine_fn co_aio_sleep_ns(AioContext *ctx, QEMUClockType type,
201
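Most hunks in this patch apply a single pattern: aio_dispatch() and
aio_poll() no longer wrap timer dispatch in aio_context_acquire/release
(see the util/aio-posix.c and util/aio-win32.c hunks), so every timer
callback that touches driver state now has to take the lock itself. As
a sketch, with a hypothetical driver:

    static void my_timer_cb(void *opaque)
    {
        MyDriverState *s = opaque;

        aio_context_acquire(s->aio_context);
        do_driver_work(s);              /* e.g. kick pending requests */
        aio_context_release(s->aio_context);
    }

MyDriverState and do_driver_work are placeholders; the block/curl.c and
block/iscsi.c hunks above show the real instances of the same pattern.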
--
2.9.3

--
2.29.2
From: Paolo Bonzini <pbonzini@redhat.com>

aio_co_wake provides the infrastructure to start a coroutine on a "home"
AioContext. It will be used by CoMutex and CoQueue, so that coroutines
don't jump from one context to another when they go to sleep on a
mutex or waitqueue. However, it can also be used as a more efficient
alternative to one-shot bottom halves, and saves the effort of tracking
which AioContext a coroutine is running on.

aio_co_schedule is the part of aio_co_wake that starts a coroutine
on a remote AioContext, but it is also useful to implement e.g.
bdrv_set_aio_context callbacks.

The implementation of aio_co_schedule is based on a lock-free
multiple-producer, single-consumer queue. The multiple producers use
cmpxchg to add to a LIFO stack. The consumer (a per-AioContext bottom
half) grabs all items added so far, inverts the list to make it FIFO,
and goes through it one item at a time until it's empty. The data
structure was inspired by OSv, which uses it in the very code we'll
"port" to QEMU for the thread-safe CoMutex.

Most of the new code is really tests.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 tests/Makefile.include       |   8 +-
 include/block/aio.h          |  32 +++++++
 include/qemu/coroutine_int.h |  11 ++-
 tests/iothread.h             |  25 +++++
 tests/iothread.c             |  91 ++++++++++++++++++
 tests/test-aio-multithread.c | 213 +++++++++++++++++++++++++++++++++++++++++++
 util/async.c                 |  65 +++++++++++++
 util/qemu-coroutine.c        |   8 ++
 util/trace-events            |   4 +
 9 files changed, 453 insertions(+), 4 deletions(-)
 create mode 100644 tests/iothread.h
 create mode 100644 tests/iothread.c
 create mode 100644 tests/test-aio-multithread.c

From: Jagannathan Raman <jag.raman@oracle.com>

Add a ProxyMemoryListener object, which is used to keep the view of
the RAM in sync between QEMU and the remote process. A MemoryListener
is registered for the system-memory AddressSpace; it sends a
SYNC_SYSMEM message to the remote process whenever it commits changes
to memory, and the remote process applies the update in its handler
for the SYNC_SYSMEM message.

Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 04fe4e6a9ca90d4f11ab6f59be7652f5b086a071.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                               |   2 +
 include/hw/remote/proxy-memory-listener.h |  28 +++
 include/hw/remote/proxy.h                 |   2 +
 hw/remote/message.c                       |   4 +
 hw/remote/proxy-memory-listener.c         | 227 ++++++++++++++++++++++
 hw/remote/proxy.c                         |   6 +
 hw/remote/meson.build                     |   1 +
 7 files changed, 270 insertions(+)
 create mode 100644 include/hw/remote/proxy-memory-listener.h
 create mode 100644 hw/remote/proxy-memory-listener.c
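A standalone sketch of the queue described above, in C11 atomics (this
is illustrative only; the real code in util/async.c below uses
QSLIST_INSERT_HEAD_ATOMIC and QSLIST_MOVE_ATOMIC):

    #include <stdatomic.h>
    #include <stddef.h>

    struct node { struct node *next; };
    static _Atomic(struct node *) head;

    /* Many producers: cmpxchg-push onto a LIFO stack. */
    static void push(struct node *n)
    {
        struct node *old = atomic_load(&head);

        do {
            n->next = old;
        } while (!atomic_compare_exchange_weak(&head, &old, n));
    }

    /* Single consumer: grab the whole stack in one shot, then invert
     * it so that the oldest entry is processed first (LIFO -> FIFO). */
    static struct node *pop_all_fifo(void)
    {
        struct node *lifo = atomic_exchange(&head, NULL);
        struct node *fifo = NULL;

        while (lifo) {
            struct node *next = lifo->next;

            lifo->next = fifo;
            fifo = lifo;
            lifo = next;
        }
        return fifo;
    }

The inversion loop is exactly what co_schedule_bh_cb() does with its
"reversed" and "straight" lists in the util/async.c hunk below.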
diff --git a/tests/Makefile.include b/tests/Makefile.include
28
diff --git a/MAINTAINERS b/MAINTAINERS
44
index XXXXXXX..XXXXXXX 100644
29
index XXXXXXX..XXXXXXX 100644
45
--- a/tests/Makefile.include
30
--- a/MAINTAINERS
46
+++ b/tests/Makefile.include
31
+++ b/MAINTAINERS
47
@@ -XXX,XX +XXX,XX @@ check-unit-y += tests/test-aio$(EXESUF)
32
@@ -XXX,XX +XXX,XX @@ F: include/hw/remote/memory.h
48
gcov-files-test-aio-y = util/async.c util/qemu-timer.o
33
F: hw/remote/memory.c
49
gcov-files-test-aio-$(CONFIG_WIN32) += util/aio-win32.c
34
F: hw/remote/proxy.c
50
gcov-files-test-aio-$(CONFIG_POSIX) += util/aio-posix.c
35
F: include/hw/remote/proxy.h
51
+check-unit-y += tests/test-aio-multithread$(EXESUF)
36
+F: hw/remote/proxy-memory-listener.c
52
+gcov-files-test-aio-multithread-y = $(gcov-files-test-aio-y)
37
+F: include/hw/remote/proxy-memory-listener.h
53
+gcov-files-test-aio-multithread-y += util/qemu-coroutine.c tests/iothread.c
38
54
check-unit-y += tests/test-throttle$(EXESUF)
39
Build and test automation
55
-gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
40
-------------------------
56
-gcov-files-test-aio-$(CONFIG_POSIX) = aio-posix.c
41
diff --git a/include/hw/remote/proxy-memory-listener.h b/include/hw/remote/proxy-memory-listener.h
57
check-unit-y += tests/test-thread-pool$(EXESUF)
58
gcov-files-test-thread-pool-y = thread-pool.c
59
gcov-files-test-hbitmap-y = util/hbitmap.c
60
@@ -XXX,XX +XXX,XX @@ test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
61
    $(test-qom-obj-y)
62
test-crypto-obj-y = $(crypto-obj-y) $(test-qom-obj-y)
63
test-io-obj-y = $(io-obj-y) $(test-crypto-obj-y)
64
-test-block-obj-y = $(block-obj-y) $(test-io-obj-y)
65
+test-block-obj-y = $(block-obj-y) $(test-io-obj-y) tests/iothread.o
66
67
tests/check-qint$(EXESUF): tests/check-qint.o $(test-util-obj-y)
68
tests/check-qstring$(EXESUF): tests/check-qstring.o $(test-util-obj-y)
69
@@ -XXX,XX +XXX,XX @@ tests/check-qom-proplist$(EXESUF): tests/check-qom-proplist.o $(test-qom-obj-y)
70
tests/test-char$(EXESUF): tests/test-char.o $(test-util-obj-y) $(qtest-obj-y) $(test-io-obj-y) $(chardev-obj-y)
71
tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
72
tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
73
+tests/test-aio-multithread$(EXESUF): tests/test-aio-multithread.o $(test-block-obj-y)
74
tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
75
tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) $(test-util-obj-y)
76
tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
77
diff --git a/include/block/aio.h b/include/block/aio.h
78
index XXXXXXX..XXXXXXX 100644
79
--- a/include/block/aio.h
80
+++ b/include/block/aio.h
81
@@ -XXX,XX +XXX,XX @@ typedef void QEMUBHFunc(void *opaque);
82
typedef bool AioPollFn(void *opaque);
83
typedef void IOHandler(void *opaque);
84
85
+struct Coroutine;
86
struct ThreadPool;
87
struct LinuxAioState;
88
89
@@ -XXX,XX +XXX,XX @@ struct AioContext {
90
bool notified;
91
EventNotifier notifier;
92
93
+ QSLIST_HEAD(, Coroutine) scheduled_coroutines;
94
+ QEMUBH *co_schedule_bh;
95
+
96
/* Thread pool for performing work and receiving completion callbacks.
97
* Has its own locking.
98
*/
99
@@ -XXX,XX +XXX,XX @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
100
}
101
102
/**
103
+ * aio_co_schedule:
104
+ * @ctx: the aio context
105
+ * @co: the coroutine
106
+ *
107
+ * Start a coroutine on a remote AioContext.
108
+ *
109
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
110
+ * is active. In addition the coroutine must have yielded unless ctx
111
+ * is the context in which the coroutine is running (i.e. the value of
112
+ * qemu_get_current_aio_context() from the coroutine itself).
113
+ */
114
+void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
115
+
116
+/**
117
+ * aio_co_wake:
118
+ * @co: the coroutine
119
+ *
120
+ * Restart a coroutine on the AioContext where it was running last, thus
121
+ * preventing coroutines from jumping from one context to another when they
122
+ * go to sleep.
123
+ *
124
+ * aio_co_wake may be executed either in coroutine or non-coroutine
125
+ * context. The coroutine must not be entered by anyone else while
126
+ * aio_co_wake() is active.
127
+ */
128
+void aio_co_wake(struct Coroutine *co);
129
+
130
+/**
131
* Return the AioContext whose event loop runs in the current thread.
132
*
133
* If called from an IOThread this will be the IOThread's AioContext. If
134
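A usage sketch for the two new entry points (the callers here are
hypothetical):

    /* Completion callback, possibly invoked from another thread:
     * re-enter the coroutine on whatever AioContext it last ran in. */
    static void my_request_done(void *opaque)
    {
        Coroutine *co = opaque;

        aio_co_wake(co);
    }

    /* Hand a yielded coroutine to an explicit context, e.g. from a
     * bdrv_set_aio_context callback: */
    aio_co_schedule(new_context, co);

Both calls require that nobody else enters the coroutine while they
are active, per the contracts documented above.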
diff --git a/include/qemu/coroutine_int.h b/include/qemu/coroutine_int.h
135
index XXXXXXX..XXXXXXX 100644
136
--- a/include/qemu/coroutine_int.h
137
+++ b/include/qemu/coroutine_int.h
138
@@ -XXX,XX +XXX,XX @@ struct Coroutine {
139
CoroutineEntry *entry;
140
void *entry_arg;
141
Coroutine *caller;
142
+
143
+ /* Only used when the coroutine has terminated. */
144
QSLIST_ENTRY(Coroutine) pool_next;
145
+
146
size_t locks_held;
147
148
- /* Coroutines that should be woken up when we yield or terminate */
149
+ /* Coroutines that should be woken up when we yield or terminate.
150
+ * Only used when the coroutine is running.
151
+ */
152
QSIMPLEQ_HEAD(, Coroutine) co_queue_wakeup;
153
+
154
+ /* Only used when the coroutine has yielded. */
155
+ AioContext *ctx;
156
QSIMPLEQ_ENTRY(Coroutine) co_queue_next;
157
+ QSLIST_ENTRY(Coroutine) co_scheduled_next;
158
};
159
160
Coroutine *qemu_coroutine_new(void);
161
diff --git a/tests/iothread.h b/tests/iothread.h
162
new file mode 100644
42
new file mode 100644
163
index XXXXXXX..XXXXXXX
43
index XXXXXXX..XXXXXXX
164
--- /dev/null
44
--- /dev/null
165
+++ b/tests/iothread.h
45
+++ b/include/hw/remote/proxy-memory-listener.h
166
@@ -XXX,XX +XXX,XX @@
46
@@ -XXX,XX +XXX,XX @@
167
+/*
47
+/*
168
+ * Event loop thread implementation for unit tests
48
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
169
+ *
170
+ * Copyright Red Hat Inc., 2013, 2016
171
+ *
172
+ * Authors:
173
+ * Stefan Hajnoczi <stefanha@redhat.com>
174
+ * Paolo Bonzini <pbonzini@redhat.com>
175
+ *
49
+ *
176
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
50
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
177
+ * See the COPYING file in the top-level directory.
51
+ * See the COPYING file in the top-level directory.
52
+ *
178
+ */
53
+ */
179
+#ifndef TEST_IOTHREAD_H
54
+
180
+#define TEST_IOTHREAD_H
55
+#ifndef PROXY_MEMORY_LISTENER_H
181
+
56
+#define PROXY_MEMORY_LISTENER_H
182
+#include "block/aio.h"
57
+
183
+#include "qemu/thread.h"
58
+#include "exec/memory.h"
184
+
59
+#include "io/channel.h"
185
+typedef struct IOThread IOThread;
60
+
186
+
61
+typedef struct ProxyMemoryListener {
187
+IOThread *iothread_new(void);
62
+ MemoryListener listener;
188
+void iothread_join(IOThread *iothread);
63
+
189
+AioContext *iothread_get_aio_context(IOThread *iothread);
64
+ int n_mr_sections;
65
+ MemoryRegionSection *mr_sections;
66
+
67
+ QIOChannel *ioc;
68
+} ProxyMemoryListener;
69
+
70
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
71
+ QIOChannel *ioc);
72
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener);
190
+
73
+
191
+#endif
74
+#endif
192
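proxy-memory-listener.h above only exposes configure/deconfigure; a
condensed sketch of the wiring behind them (the full version is in the
hw/remote/proxy-memory-listener.c hunks below):

    proxy_listener->listener.begin      = proxy_memory_listener_reset;
    proxy_listener->listener.commit     = proxy_memory_listener_commit;
    proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
    proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
    proxy_listener->listener.priority   = 10;
    memory_listener_register(&proxy_listener->listener,
                             &address_space_memory);

begin resets the accumulated section list, each region_add/region_nop
callback appends (or merges) a section, and commit turns the final
list into one SYNC_SYSMEM message carrying the fds of the backing RAM
blocks.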
diff --git a/tests/iothread.c b/tests/iothread.c
75
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/include/hw/remote/proxy.h
78
+++ b/include/hw/remote/proxy.h
79
@@ -XXX,XX +XXX,XX @@
80
81
#include "hw/pci/pci.h"
82
#include "io/channel.h"
83
+#include "hw/remote/proxy-memory-listener.h"
84
85
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
86
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
87
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
88
QemuMutex io_mutex;
89
QIOChannel *ioc;
90
Error *migration_blocker;
91
+ ProxyMemoryListener proxy_listener;
92
ProxyMemoryRegion region[PCI_NUM_REGIONS];
93
};
94
95
diff --git a/hw/remote/message.c b/hw/remote/message.c
96
index XXXXXXX..XXXXXXX 100644
97
--- a/hw/remote/message.c
98
+++ b/hw/remote/message.c
99
@@ -XXX,XX +XXX,XX @@
100
#include "sysemu/runstate.h"
101
#include "hw/pci/pci.h"
102
#include "exec/memattrs.h"
103
+#include "hw/remote/memory.h"
104
105
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
106
MPQemuMsg *msg, Error **errp);
107
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
108
case MPQEMU_CMD_BAR_READ:
109
process_bar_read(com->ioc, &msg, &local_err);
110
break;
111
+ case MPQEMU_CMD_SYNC_SYSMEM:
112
+ remote_sysmem_reconfig(&msg, &local_err);
113
+ break;
114
default:
115
error_setg(&local_err,
116
"Unknown command (%d) received for device %s"
117
diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c
193
new file mode 100644
118
new file mode 100644
194
index XXXXXXX..XXXXXXX
119
index XXXXXXX..XXXXXXX
195
--- /dev/null
120
--- /dev/null
196
+++ b/tests/iothread.c
121
+++ b/hw/remote/proxy-memory-listener.c
197
@@ -XXX,XX +XXX,XX @@
122
@@ -XXX,XX +XXX,XX @@
198
+/*
123
+/*
199
+ * Event loop thread implementation for unit tests
124
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
200
+ *
201
+ * Copyright Red Hat Inc., 2013, 2016
202
+ *
203
+ * Authors:
204
+ * Stefan Hajnoczi <stefanha@redhat.com>
205
+ * Paolo Bonzini <pbonzini@redhat.com>
206
+ *
125
+ *
207
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
126
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
208
+ * See the COPYING file in the top-level directory.
127
+ * See the COPYING file in the top-level directory.
209
+ *
128
+ *
210
+ */
129
+ */
211
+
130
+
212
+#include "qemu/osdep.h"
131
+#include "qemu/osdep.h"
132
+#include "qemu-common.h"
133
+
134
+#include "qemu/compiler.h"
135
+#include "qemu/int128.h"
136
+#include "qemu/range.h"
137
+#include "exec/memory.h"
138
+#include "exec/cpu-common.h"
139
+#include "cpu.h"
140
+#include "exec/ram_addr.h"
141
+#include "exec/address-spaces.h"
213
+#include "qapi/error.h"
142
+#include "qapi/error.h"
214
+#include "block/aio.h"
143
+#include "hw/remote/mpqemu-link.h"
215
+#include "qemu/main-loop.h"
144
+#include "hw/remote/proxy-memory-listener.h"
216
+#include "qemu/rcu.h"
145
+
217
+#include "iothread.h"
218
+
219
+struct IOThread {
220
+ AioContext *ctx;
221
+
222
+ QemuThread thread;
223
+ QemuMutex init_done_lock;
224
+ QemuCond init_done_cond; /* is thread initialization done? */
225
+ bool stopping;
226
+};
227
+
228
+static __thread IOThread *my_iothread;
229
+
230
+AioContext *qemu_get_current_aio_context(void)
231
+{
232
+ return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
233
+}
234
+
235
+static void *iothread_run(void *opaque)
236
+{
237
+ IOThread *iothread = opaque;
238
+
239
+ rcu_register_thread();
240
+
241
+ my_iothread = iothread;
242
+ qemu_mutex_lock(&iothread->init_done_lock);
243
+ iothread->ctx = aio_context_new(&error_abort);
244
+ qemu_cond_signal(&iothread->init_done_cond);
245
+ qemu_mutex_unlock(&iothread->init_done_lock);
246
+
247
+ while (!atomic_read(&iothread->stopping)) {
248
+ aio_poll(iothread->ctx, true);
249
+ }
250
+
251
+ rcu_unregister_thread();
252
+ return NULL;
253
+}
254
+
255
+void iothread_join(IOThread *iothread)
256
+{
257
+ iothread->stopping = true;
258
+ aio_notify(iothread->ctx);
259
+ qemu_thread_join(&iothread->thread);
260
+ qemu_cond_destroy(&iothread->init_done_cond);
261
+ qemu_mutex_destroy(&iothread->init_done_lock);
262
+ aio_context_unref(iothread->ctx);
263
+ g_free(iothread);
264
+}
265
+
266
+IOThread *iothread_new(void)
267
+{
268
+ IOThread *iothread = g_new0(IOThread, 1);
269
+
270
+ qemu_mutex_init(&iothread->init_done_lock);
271
+ qemu_cond_init(&iothread->init_done_cond);
272
+ qemu_thread_create(&iothread->thread, NULL, iothread_run,
273
+ iothread, QEMU_THREAD_JOINABLE);
274
+
275
+ /* Wait for initialization to complete */
276
+ qemu_mutex_lock(&iothread->init_done_lock);
277
+ while (iothread->ctx == NULL) {
278
+ qemu_cond_wait(&iothread->init_done_cond,
279
+ &iothread->init_done_lock);
280
+ }
281
+ qemu_mutex_unlock(&iothread->init_done_lock);
282
+ return iothread;
283
+}
284
+
285
+AioContext *iothread_get_aio_context(IOThread *iothread)
286
+{
287
+ return iothread->ctx;
288
+}
289
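A quick sketch of how this helper is meant to be driven (it mirrors
what the new test below does; the callback name and opaque value are
placeholders):

    IOThread *t = iothread_new();
    AioContext *ctx = iothread_get_aio_context(t);

    /* Runs my_bh_cb(my_opaque) inside the iothread's event loop. */
    aio_bh_schedule_oneshot(ctx, my_bh_cb, my_opaque);

    /* Stops the loop, joins the thread and releases the context. */
    iothread_join(t);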
diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
290
new file mode 100644
291
index XXXXXXX..XXXXXXX
292
--- /dev/null
293
+++ b/tests/test-aio-multithread.c
294
@@ -XXX,XX +XXX,XX @@
295
+/*
146
+/*
296
+ * AioContext multithreading tests
147
+ * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
297
+ *
148
+ * proxy_memory_listener_commit() defined below perform tasks similar to the
298
+ * Copyright Red Hat, Inc. 2016
149
+ * functions defined in vhost-user.c. These functions are good candidates
299
+ *
150
+ * for refactoring.
300
+ * Authors:
151
+ *
301
+ * Paolo Bonzini <pbonzini@redhat.com>
302
+ *
303
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
304
+ * See the COPYING.LIB file in the top-level directory.
305
+ */
152
+ */
306
+
153
+
307
+#include "qemu/osdep.h"
154
+static void proxy_memory_listener_reset(MemoryListener *listener)
308
+#include <glib.h>
155
+{
309
+#include "block/aio.h"
156
+ ProxyMemoryListener *proxy_listener = container_of(listener,
310
+#include "qapi/error.h"
157
+ ProxyMemoryListener,
311
+#include "qemu/coroutine.h"
158
+ listener);
312
+#include "qemu/thread.h"
159
+ int mrs;
313
+#include "qemu/error-report.h"
160
+
314
+#include "iothread.h"
161
+ for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) {
315
+
162
+ memory_region_unref(proxy_listener->mr_sections[mrs].mr);
316
+/* AioContext management */
163
+ }
317
+
164
+
318
+#define NUM_CONTEXTS 5
165
+ g_free(proxy_listener->mr_sections);
319
+
166
+ proxy_listener->mr_sections = NULL;
320
+static IOThread *threads[NUM_CONTEXTS];
167
+ proxy_listener->n_mr_sections = 0;
321
+static AioContext *ctx[NUM_CONTEXTS];
168
+}
322
+static __thread int id = -1;
169
+
323
+
170
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
324
+static QemuEvent done_event;
171
+{
325
+
172
+ MemoryRegion *mr;
326
+/* Run a function synchronously on a remote iothread. */
173
+ ram_addr_t off;
327
+
174
+
328
+typedef struct CtxRunData {
175
+ /**
329
+ QEMUBHFunc *cb;
176
+ * Assumes that the host address is a valid address as it's
330
+ void *arg;
177
+ * coming from the MemoryListener system. In the case host
331
+} CtxRunData;
178
+ * address is not valid, the following call would return
332
+
179
+ * the default subregion of "system_memory" region, and
333
+static void ctx_run_bh_cb(void *opaque)
180
+ * not NULL. So it's not possible to check for NULL here.
334
+{
181
+ */
335
+ CtxRunData *data = opaque;
182
+ mr = memory_region_from_host((void *)(uintptr_t)host, &off);
336
+
183
+
337
+ data->cb(data->arg);
184
+ if (offset) {
338
+ qemu_event_set(&done_event);
185
+ *offset = off;
339
+}
186
+ }
340
+
187
+
341
+static void ctx_run(int i, QEMUBHFunc *cb, void *opaque)
188
+ return memory_region_get_fd(mr);
342
+{
189
+}
343
+ CtxRunData data = {
190
+
344
+ .cb = cb,
191
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
345
+ .arg = opaque
192
+{
346
+ };
193
+ if (((prev_host + size) != host)) {
347
+
348
+ qemu_event_reset(&done_event);
349
+ aio_bh_schedule_oneshot(ctx[i], ctx_run_bh_cb, &data);
350
+ qemu_event_wait(&done_event);
351
+}
352
+
353
+/* Starting the iothreads. */
354
+
355
+static void set_id_cb(void *opaque)
356
+{
357
+ int *i = opaque;
358
+
359
+ id = *i;
360
+}
361
+
362
+static void create_aio_contexts(void)
363
+{
364
+ int i;
365
+
366
+ for (i = 0; i < NUM_CONTEXTS; i++) {
367
+ threads[i] = iothread_new();
368
+ ctx[i] = iothread_get_aio_context(threads[i]);
369
+ }
370
+
371
+ qemu_event_init(&done_event, false);
372
+ for (i = 0; i < NUM_CONTEXTS; i++) {
373
+ ctx_run(i, set_id_cb, &i);
374
+ }
375
+}
376
+
377
+/* Stopping the iothreads. */
378
+
379
+static void join_aio_contexts(void)
380
+{
381
+ int i;
382
+
383
+ for (i = 0; i < NUM_CONTEXTS; i++) {
384
+ aio_context_ref(ctx[i]);
385
+ }
386
+ for (i = 0; i < NUM_CONTEXTS; i++) {
387
+ iothread_join(threads[i]);
388
+ }
389
+ for (i = 0; i < NUM_CONTEXTS; i++) {
390
+ aio_context_unref(ctx[i]);
391
+ }
392
+ qemu_event_destroy(&done_event);
393
+}
394
+
395
+/* Basic test for the stuff above. */
396
+
397
+static void test_lifecycle(void)
398
+{
399
+ create_aio_contexts();
400
+ join_aio_contexts();
401
+}
402
+
403
+/* aio_co_schedule test. */
404
+
405
+static Coroutine *to_schedule[NUM_CONTEXTS];
406
+
407
+static bool now_stopping;
408
+
409
+static int count_retry;
410
+static int count_here;
411
+static int count_other;
412
+
413
+static bool schedule_next(int n)
414
+{
415
+ Coroutine *co;
416
+
417
+ co = atomic_xchg(&to_schedule[n], NULL);
418
+ if (!co) {
419
+ atomic_inc(&count_retry);
420
+ return false;
194
+ return false;
421
+ }
195
+ }
422
+
196
+
423
+ if (n == id) {
197
+ if (get_fd_from_hostaddr(host, NULL) !=
424
+ atomic_inc(&count_here);
198
+ get_fd_from_hostaddr(prev_host, NULL)) {
425
+ } else {
199
+ return false;
426
+ atomic_inc(&count_other);
200
+ }
427
+ }
201
+
428
+
429
+ aio_co_schedule(ctx[n], co);
430
+ return true;
202
+ return true;
431
+}
203
+}
432
+
204
+
433
+static void finish_cb(void *opaque)
205
+static bool try_merge(ProxyMemoryListener *proxy_listener,
434
+{
206
+ MemoryRegionSection *section)
435
+ schedule_next(id);
207
+{
436
+}
208
+ uint64_t mrs_size, mrs_gpa, mrs_page;
437
+
209
+ MemoryRegionSection *prev_sec;
438
+static coroutine_fn void test_multi_co_schedule_entry(void *opaque)
210
+ bool merged = false;
439
+{
211
+ uintptr_t mrs_host;
440
+ g_assert(to_schedule[id] == NULL);
212
+ RAMBlock *mrs_rb;
441
+ atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
213
+
442
+
214
+ if (!proxy_listener->n_mr_sections) {
443
+ while (!atomic_mb_read(&now_stopping)) {
215
+ return false;
444
+ int n;
216
+ }
445
+
217
+
446
+ n = g_test_rand_int_range(0, NUM_CONTEXTS);
218
+ mrs_rb = section->mr->ram_block;
447
+ schedule_next(n);
219
+ mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
448
+ qemu_coroutine_yield();
220
+ mrs_size = int128_get64(section->size);
449
+
221
+ mrs_gpa = section->offset_within_address_space;
450
+ g_assert(to_schedule[id] == NULL);
222
+ mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
451
+ atomic_mb_set(&to_schedule[id], qemu_coroutine_self());
223
+ section->offset_within_region;
452
+ }
224
+
453
+}
225
+ if (get_fd_from_hostaddr(mrs_host, NULL) < 0) {
454
+
226
+ return true;
455
+
227
+ }
456
+static void test_multi_co_schedule(int seconds)
228
+
457
+{
229
+ mrs_host = mrs_host & ~(mrs_page - 1);
458
+ int i;
230
+ mrs_gpa = mrs_gpa & ~(mrs_page - 1);
459
+
231
+ mrs_size = ROUND_UP(mrs_size, mrs_page);
460
+ count_here = count_other = count_retry = 0;
232
+
461
+ now_stopping = false;
233
+ prev_sec = proxy_listener->mr_sections +
462
+
234
+ (proxy_listener->n_mr_sections - 1);
463
+ create_aio_contexts();
235
+ uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
464
+ for (i = 0; i < NUM_CONTEXTS; i++) {
236
+ uint64_t prev_size = int128_get64(prev_sec->size);
465
+ Coroutine *co1 = qemu_coroutine_create(test_multi_co_schedule_entry, NULL);
237
+ uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
466
+ aio_co_schedule(ctx[i], co1);
238
+ uint64_t prev_host_start =
467
+ }
239
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
468
+
240
+ prev_sec->offset_within_region;
469
+ g_usleep(seconds * 1000000);
241
+ uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
470
+
242
+
471
+ atomic_mb_set(&now_stopping, true);
243
+ if (mrs_gpa <= (prev_gpa_end + 1)) {
472
+ for (i = 0; i < NUM_CONTEXTS; i++) {
244
+ g_assert(mrs_gpa > prev_gpa_start);
473
+ ctx_run(i, finish_cb, NULL);
245
+
474
+ to_schedule[i] = NULL;
246
+ if ((section->mr == prev_sec->mr) &&
475
+ }
247
+ proxy_mrs_can_merge(mrs_host, prev_host_start,
476
+
248
+ (mrs_gpa - prev_gpa_start))) {
477
+ join_aio_contexts();
249
+ uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
478
+ g_test_message("scheduled %d, queued %d, retry %d, total %d\n",
250
+ merged = true;
479
+ count_other, count_here, count_retry,
251
+ prev_sec->offset_within_address_space =
480
+ count_here + count_other + count_retry);
252
+ MIN(prev_gpa_start, mrs_gpa);
481
+}
253
+ prev_sec->offset_within_region =
482
+
254
+ MIN(prev_host_start, mrs_host) -
483
+static void test_multi_co_schedule_1(void)
255
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
484
+{
256
+ prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
485
+ test_multi_co_schedule(1);
257
+ mrs_host));
486
+}
258
+ }
487
+
259
+ }
488
+static void test_multi_co_schedule_10(void)
260
+
489
+{
261
+ return merged;
490
+ test_multi_co_schedule(10);
262
+}
491
+}
263
+
492
+
264
+static void proxy_memory_listener_region_addnop(MemoryListener *listener,
493
+/* End of tests. */
265
+ MemoryRegionSection *section)
494
+
266
+{
495
+int main(int argc, char **argv)
267
+ ProxyMemoryListener *proxy_listener = container_of(listener,
496
+{
268
+ ProxyMemoryListener,
497
+ init_clocks();
269
+ listener);
498
+
270
+
499
+ g_test_init(&argc, &argv, NULL);
271
+ if (!memory_region_is_ram(section->mr) ||
500
+ g_test_add_func("/aio/multi/lifecycle", test_lifecycle);
272
+ memory_region_is_rom(section->mr)) {
501
+ if (g_test_quick()) {
273
+ return;
502
+ g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_1);
274
+ }
503
+ } else {
275
+
504
+        g_test_add_func("/aio/multi/schedule", test_multi_co_schedule_10);
+    }
+    return g_test_run();
+}
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
 #include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"

 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -XXX,XX +XXX,XX @@ aio_ctx_finalize(GSource *source)
     }
 #endif

+    assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+    qemu_bh_delete(ctx->co_schedule_bh);
+
     qemu_lockcnt_lock(&ctx->list_lock);
     assert(!qemu_lockcnt_count(&ctx->list_lock));
     while (ctx->first_bh) {
@@ -XXX,XX +XXX,XX @@ static bool event_notifier_poll(void *opaque)
     return atomic_read(&ctx->notified);
 }

+static void co_schedule_bh_cb(void *opaque)
+{
+    AioContext *ctx = opaque;
+    QSLIST_HEAD(, Coroutine) straight, reversed;
+
+    QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+    QSLIST_INIT(&straight);
+
+    while (!QSLIST_EMPTY(&reversed)) {
+        Coroutine *co = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+        QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+    }
+
+    while (!QSLIST_EMPTY(&straight)) {
+        Coroutine *co = QSLIST_FIRST(&straight);
+        QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+        trace_aio_co_schedule_bh_cb(ctx, co);
+        qemu_coroutine_enter(co);
+    }
+}
+
 AioContext *aio_context_new(Error **errp)
 {
     int ret;
@@ -XXX,XX +XXX,XX @@ AioContext *aio_context_new(Error **errp)
     }
     g_source_set_can_recurse(&ctx->source, true);
     qemu_lockcnt_init(&ctx->list_lock);
+
+    ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+    QSLIST_INIT(&ctx->scheduled_coroutines);
+
     aio_set_event_notifier(ctx, &ctx->notifier,
                            false,
                            (EventNotifierHandler *)
@@ -XXX,XX +XXX,XX @@ fail:
     return NULL;
 }

+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+    trace_aio_co_schedule(ctx, co);
+    QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+                              co, co_scheduled_next);
+    qemu_bh_schedule(ctx->co_schedule_bh);
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+    AioContext *ctx;
+
+    /* Read coroutine before co->ctx.  Matches smp_wmb in
+     * qemu_coroutine_enter.
+     */
+    smp_read_barrier_depends();
+    ctx = atomic_read(&co->ctx);
+
+    if (ctx != qemu_get_current_aio_context()) {
+        aio_co_schedule(ctx, co);
+        return;
+    }
+
+    if (qemu_in_coroutine()) {
+        Coroutine *self = qemu_coroutine_self();
+        assert(self != co);
+        QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+    } else {
+        aio_context_acquire(ctx);
+        qemu_coroutine_enter(co);
+        aio_context_release(ctx);
+    }
+}
+
 void aio_context_ref(AioContext *ctx)
 {
     g_source_ref(&ctx->source);
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/atomic.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "block/aio.h"

 enum {
     POOL_BATCH_SIZE = 64,
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     }

     co->caller = self;
+    co->ctx = qemu_get_current_aio_context();
+
+    /* Store co->ctx before anything that stores co.  Matches
+     * barrier in aio_co_wake.
+     */
+    smp_wmb();
+
     ret = qemu_coroutine_switch(self, co, COROUTINE_ENTER);

     qemu_co_queue_run_restart(co);
diff --git a/util/trace-events b/util/trace-events
index XXXXXXX..XXXXXXX 100644
--- a/util/trace-events
+++ b/util/trace-events
@@ -XXX,XX +XXX,XX @@ run_poll_handlers_end(void *ctx, bool progress) "ctx %p progress %d"
 poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
 poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64

+# util/async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
 # util/thread-pool.c
 thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
 thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
--
2.9.3

+    if (try_merge(proxy_listener, section)) {
+        return;
+    }
+
+    ++proxy_listener->n_mr_sections;
+    proxy_listener->mr_sections = g_renew(MemoryRegionSection,
+                                          proxy_listener->mr_sections,
+                                          proxy_listener->n_mr_sections);
+    proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section;
+    proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL;
+    memory_region_ref(section->mr);
+}
+
+static void proxy_memory_listener_commit(MemoryListener *listener)
+{
+    ProxyMemoryListener *proxy_listener = container_of(listener,
+                                                       ProxyMemoryListener,
+                                                       listener);
+    MPQemuMsg msg;
+    MemoryRegionSection *section;
+    ram_addr_t offset;
+    uintptr_t host_addr;
+    int region;
+    Error *local_err = NULL;
+
+    memset(&msg, 0, sizeof(MPQemuMsg));
+
+    msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
+    msg.num_fds = proxy_listener->n_mr_sections;
+    msg.size = sizeof(SyncSysmemMsg);
+    if (msg.num_fds > REMOTE_MAX_FDS) {
+        error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
+        return;
+    }
+
+    for (region = 0; region < proxy_listener->n_mr_sections; region++) {
+        section = &proxy_listener->mr_sections[region];
+        msg.data.sync_sysmem.gpas[region] =
+            section->offset_within_address_space;
+        msg.data.sync_sysmem.sizes[region] = int128_get64(section->size);
+        host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
+                    section->offset_within_region;
+        msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset);
+        msg.data.sync_sysmem.offsets[region] = offset;
+    }
+    if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) {
+        error_report_err(local_err);
+    }
+}
+
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
+{
+    memory_listener_unregister(&proxy_listener->listener);
+
+    proxy_memory_listener_reset(&proxy_listener->listener);
+}
+
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
+                                     QIOChannel *ioc)
+{
+    proxy_listener->n_mr_sections = 0;
+    proxy_listener->mr_sections = NULL;
+
+    proxy_listener->ioc = ioc;
+
+    proxy_listener->listener.begin = proxy_memory_listener_reset;
+    proxy_listener->listener.commit = proxy_memory_listener_commit;
+    proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
+    proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
+    proxy_listener->listener.priority = 10;
+
+    memory_listener_register(&proxy_listener->listener,
+                             &address_space_memory);
+}
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/proxy.c
+++ b/hw/remote/proxy.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/sockets.h"
 #include "hw/remote/mpqemu-link.h"
 #include "qemu/error-report.h"
+#include "hw/remote/proxy-memory-listener.h"
+#include "qom/object.h"

 static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)

     qemu_mutex_init(&dev->io_mutex);
     qio_channel_set_blocking(dev->ioc, true, NULL);
+
+    proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
 }

 static void pci_proxy_dev_exit(PCIDevice *pdev)
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
     migrate_del_blocker(dev->migration_blocker);

     error_free(dev->migration_blocker);
+
+    proxy_memory_listener_deconfigure(&dev->proxy_listener);
 }

 static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))

 specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))

 softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
--
2.29.2
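
A usage sketch may help here (not part of either series; the request
structure and callback names are hypothetical). The point of the
aio_co_schedule()/aio_co_wake() API introduced above is that a
completion callback can run in any thread and still resume a coroutine
in the AioContext it belongs to:

    /* Illustrative only: a completion path built on aio_co_wake(). */
    typedef struct MyRequest {
        Coroutine *co;      /* coroutine that submitted the request */
        int ret;            /* filled in on completion */
    } MyRequest;

    static void my_request_complete(MyRequest *req, int ret)
    {
        req->ret = ret;
        aio_co_wake(req->co);   /* safe to call from any thread */
    }

aio_co_wake() reads co->ctx, which qemu_coroutine_enter() publishes
under a write barrier, so the callback needs no extra locking to find
the right context.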
From: Paolo Bonzini <pbonzini@redhat.com>

qed_aio_start_io and qed_aio_next_io will not have to acquire/release
the AioContext, while qed_aio_next_io_cb will.  Split the functionality
and gain a little type-safety in the process.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-11-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/qed.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static CachedL2Table *qed_new_l2_table(BDRVQEDState *s)
     return l2_table;
 }

-static void qed_aio_next_io(void *opaque, int ret);
+static void qed_aio_next_io(QEDAIOCB *acb, int ret);
+
+static void qed_aio_start_io(QEDAIOCB *acb)
+{
+    qed_aio_next_io(acb, 0);
+}
+
+static void qed_aio_next_io_cb(void *opaque, int ret)
+{
+    QEDAIOCB *acb = opaque;
+
+    qed_aio_next_io(acb, ret);
+}

 static void qed_plug_allocating_write_reqs(BDRVQEDState *s)
 {
@@ -XXX,XX +XXX,XX @@ static void qed_unplug_allocating_write_reqs(BDRVQEDState *s)

     acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
     if (acb) {
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
     }
 }

@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete(QEDAIOCB *acb, int ret)
     QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next);
     acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs);
     if (acb) {
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
     } else if (s->header.features & QED_F_NEED_CHECK) {
         qed_start_need_check_timer(s);
     }
@@ -XXX,XX +XXX,XX @@ static void qed_commit_l2_update(void *opaque, int ret)
     acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset);
     assert(acb->request.l2_table != NULL);

-    qed_aio_next_io(opaque, ret);
+    qed_aio_next_io(acb, ret);
 }

 /**
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset)
     if (need_alloc) {
         /* Write out the whole new L2 table */
         qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true,
-                           qed_aio_write_l1_update, acb);
+                            qed_aio_write_l1_update, acb);
     } else {
         /* Write out only the updated part of the L2 table */
         qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false,
-                           qed_aio_next_io, acb);
+                            qed_aio_next_io_cb, acb);
     }
     return;

@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_main(void *opaque, int ret)
     }

     if (acb->find_cluster_ret == QED_CLUSTER_FOUND) {
-        next_fn = qed_aio_next_io;
+        next_fn = qed_aio_next_io_cb;
     } else {
         if (s->bs->backing) {
             next_fn = qed_aio_write_flush_before_l2_update;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len)
     if (acb->flags & QED_AIOCB_ZERO) {
         /* Skip ahead if the clusters are already zero */
         if (acb->find_cluster_ret == QED_CLUSTER_ZERO) {
-            qed_aio_next_io(acb, 0);
+            qed_aio_start_io(acb);
             return;
         }

@@ -XXX,XX +XXX,XX @@ static void qed_aio_read_data(void *opaque, int ret,
     /* Handle zero cluster and backing file reads */
     if (ret == QED_CLUSTER_ZERO) {
         qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size);
-        qed_aio_next_io(acb, 0);
+        qed_aio_start_io(acb);
         return;
     } else if (ret != QED_CLUSTER_FOUND) {
         qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov,
-                              &acb->backing_qiov, qed_aio_next_io, acb);
+                              &acb->backing_qiov, qed_aio_next_io_cb, acb);
         return;
     }

     BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
     bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE,
                    &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE,
-                   qed_aio_next_io, acb);
+                   qed_aio_next_io_cb, acb);
     return;

 err:
@@ -XXX,XX +XXX,XX @@ err:
 /**
  * Begin next I/O or complete the request
  */
-static void qed_aio_next_io(void *opaque, int ret)
+static void qed_aio_next_io(QEDAIOCB *acb, int ret)
 {
-    QEDAIOCB *acb = opaque;
     BDRVQEDState *s = acb_to_s(acb);
     QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ?
                                 qed_aio_write_data : qed_aio_read_data;
@@ -XXX,XX +XXX,XX @@ static BlockAIOCB *qed_aio_setup(BlockDriverState *bs,
     qemu_iovec_init(&acb->cur_qiov, qiov->niov);

     /* Start request */
-    qed_aio_next_io(acb, 0);
+    qed_aio_start_io(acb);
     return &acb->common;
 }

--
2.9.3

From: Jagannathan Raman <jag.raman@oracle.com>

An IOHUB object is added to manage PCI IRQs. It uses the KVM_IRQFD
ioctl to create an irqfd for injecting PCI interrupts into the guest.
The IOHUB object forwards the irqfd to the remote process, which
uses this fd to send interrupts directly to the guest, bypassing QEMU.

Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 51d5c3d54e28a68b002e3875c59599c9f5a424a1.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 MAINTAINERS                     |   2 +
 include/hw/pci/pci_ids.h        |   3 +
 include/hw/remote/iohub.h       |  42 +++++++++++
 include/hw/remote/machine.h     |   2 +
 include/hw/remote/mpqemu-link.h |   1 +
 include/hw/remote/proxy.h       |   4 ++
 hw/remote/iohub.c               | 119 ++++++++++++++++++++++++++++
 hw/remote/machine.c             |  10 +++
 hw/remote/message.c             |   4 ++
 hw/remote/mpqemu-link.c         |   5 ++
 hw/remote/proxy.c               |  56 +++++++++++++++
 hw/remote/meson.build           |   1 +
 12 files changed, 249 insertions(+)
 create mode 100644 include/hw/remote/iohub.h
 create mode 100644 hw/remote/iohub.c

diff --git a/MAINTAINERS b/MAINTAINERS
index XXXXXXX..XXXXXXX 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -XXX,XX +XXX,XX @@ F: hw/remote/proxy.c
 F: include/hw/remote/proxy.h
 F: hw/remote/proxy-memory-listener.c
 F: include/hw/remote/proxy-memory-listener.h
+F: hw/remote/iohub.c
+F: include/hw/remote/iohub.h

 Build and test automation
 -------------------------
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/pci/pci_ids.h
+++ b/include/hw/pci/pci_ids.h
@@ -XXX,XX +XXX,XX @@
 #define PCI_DEVICE_ID_SUN_SIMBA          0x5000
 #define PCI_DEVICE_ID_SUN_SABRE          0xa000

+#define PCI_VENDOR_ID_ORACLE             0x108e
+#define PCI_DEVICE_ID_REMOTE_IOHUB       0xb000
+
 #define PCI_VENDOR_ID_CMD                0x1095
 #define PCI_DEVICE_ID_CMD_646            0x0646

diff --git a/include/hw/remote/iohub.h b/include/hw/remote/iohub.h
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/include/hw/remote/iohub.h
@@ -XXX,XX +XXX,XX @@
+/*
+ * IO Hub for remote device
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef REMOTE_IOHUB_H
+#define REMOTE_IOHUB_H
+
+#include "hw/pci/pci.h"
+#include "qemu/event_notifier.h"
+#include "qemu/thread-posix.h"
+#include "hw/remote/mpqemu-link.h"
+
+#define REMOTE_IOHUB_NB_PIRQS    PCI_DEVFN_MAX
+
+typedef struct ResampleToken {
+    void *iohub;
+    int pirq;
+} ResampleToken;
+
+typedef struct RemoteIOHubState {
+    PCIDevice d;
+    EventNotifier irqfds[REMOTE_IOHUB_NB_PIRQS];
+    EventNotifier resamplefds[REMOTE_IOHUB_NB_PIRQS];
+    unsigned int irq_level[REMOTE_IOHUB_NB_PIRQS];
+    ResampleToken token[REMOTE_IOHUB_NB_PIRQS];
+    QemuMutex irq_level_lock[REMOTE_IOHUB_NB_PIRQS];
+} RemoteIOHubState;
+
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx);
+void remote_iohub_set_irq(void *opaque, int pirq, int level);
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg);
+
+void remote_iohub_init(RemoteIOHubState *iohub);
+void remote_iohub_finalize(RemoteIOHubState *iohub);
+
+#endif
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/machine.h
+++ b/include/hw/remote/machine.h
@@ -XXX,XX +XXX,XX @@
 #include "hw/boards.h"
 #include "hw/pci-host/remote.h"
 #include "io/channel.h"
+#include "hw/remote/iohub.h"

 struct RemoteMachineState {
     MachineState parent_obj;

     RemotePCIHost *host;
+    RemoteIOHubState iohub;
 };

 /* Used to pass to co-routine device and ioc. */
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
     MPQEMU_CMD_PCI_CFGREAD,
     MPQEMU_CMD_BAR_WRITE,
     MPQEMU_CMD_BAR_READ,
+    MPQEMU_CMD_SET_IRQFD,
     MPQEMU_CMD_MAX,
 } MPQemuCmd;

diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/proxy.h
+++ b/include/hw/remote/proxy.h
@@ -XXX,XX +XXX,XX @@
 #include "hw/pci/pci.h"
 #include "io/channel.h"
 #include "hw/remote/proxy-memory-listener.h"
+#include "qemu/event_notifier.h"

 #define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
 OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
     QIOChannel *ioc;
     Error *migration_blocker;
     ProxyMemoryListener proxy_listener;
+    int virq;
+    EventNotifier intr;
+    EventNotifier resample;
     ProxyMemoryRegion region[PCI_NUM_REGIONS];
 };

diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c
new file mode 100644
index XXXXXXX..XXXXXXX
--- /dev/null
+++ b/hw/remote/iohub.c
@@ -XXX,XX +XXX,XX @@
+/*
+ * Remote IO Hub
+ *
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "hw/pci/pci.h"
+#include "hw/pci/pci_ids.h"
+#include "hw/pci/pci_bus.h"
+#include "qemu/thread.h"
+#include "hw/boards.h"
+#include "hw/remote/machine.h"
+#include "hw/remote/iohub.h"
+#include "qemu/main-loop.h"
+
+void remote_iohub_init(RemoteIOHubState *iohub)
+{
+    int pirq;
+
+    memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
+    memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));
+
+    for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
+        qemu_mutex_init(&iohub->irq_level_lock[pirq]);
+        iohub->irq_level[pirq] = 0;
+        event_notifier_init_fd(&iohub->irqfds[pirq], -1);
+        event_notifier_init_fd(&iohub->resamplefds[pirq], -1);
+    }
+}
+
+void remote_iohub_finalize(RemoteIOHubState *iohub)
+{
+    int pirq;
+
+    for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
+        qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
+                            NULL, NULL, NULL);
+        event_notifier_cleanup(&iohub->irqfds[pirq]);
+        event_notifier_cleanup(&iohub->resamplefds[pirq]);
+        qemu_mutex_destroy(&iohub->irq_level_lock[pirq]);
+    }
+}
+
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
+{
+    return pci_dev->devfn;
+}
+
+void remote_iohub_set_irq(void *opaque, int pirq, int level)
+{
+    RemoteIOHubState *iohub = opaque;
+
+    assert(pirq >= 0);
+    assert(pirq < PCI_DEVFN_MAX);
+
+    QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+    if (level) {
+        if (++iohub->irq_level[pirq] == 1) {
+            event_notifier_set(&iohub->irqfds[pirq]);
+        }
+    } else if (iohub->irq_level[pirq] > 0) {
+        iohub->irq_level[pirq]--;
+    }
+}
+
+static void intr_resample_handler(void *opaque)
+{
+    ResampleToken *token = opaque;
+    RemoteIOHubState *iohub = token->iohub;
+    int pirq, s;
+
+    pirq = token->pirq;
+
+    s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]);
+
+    assert(s >= 0);
+
+    QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
+
+    if (iohub->irq_level[pirq]) {
+        event_notifier_set(&iohub->irqfds[pirq]);
+    }
+}
+
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
+{
+    RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
+    RemoteIOHubState *iohub = &machine->iohub;
+    int pirq, intx;
+
+    intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+    pirq = remote_iohub_map_irq(pci_dev, intx);
+
+    if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) {
+        qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
+                            NULL, NULL, NULL);
+        event_notifier_cleanup(&iohub->irqfds[pirq]);
+        event_notifier_cleanup(&iohub->resamplefds[pirq]);
+        memset(&iohub->token[pirq], 0, sizeof(ResampleToken));
+    }
+
+    event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]);
+    event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]);
+
+    iohub->token[pirq].iohub = iohub;
+    iohub->token[pirq].pirq = pirq;
+
+    qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL,
+                        &iohub->token[pirq]);
+}
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/machine.c
+++ b/hw/remote/machine.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/address-spaces.h"
 #include "exec/memory.h"
 #include "qapi/error.h"
+#include "hw/pci/pci_host.h"
+#include "hw/remote/iohub.h"

 static void remote_machine_init(MachineState *machine)
 {
     MemoryRegion *system_memory, *system_io, *pci_memory;
     RemoteMachineState *s = REMOTE_MACHINE(machine);
     RemotePCIHost *rem_host;
+    PCIHostState *pci_host;

     system_memory = get_system_memory();
     system_io = get_system_io();
@@ -XXX,XX +XXX,XX @@ static void remote_machine_init(MachineState *machine)
     memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);

     qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
+
+    pci_host = PCI_HOST_BRIDGE(rem_host);
+
+    remote_iohub_init(&s->iohub);
+
+    pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
+                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
 }

 static void remote_machine_class_init(ObjectClass *oc, void *data)
diff --git a/hw/remote/message.c b/hw/remote/message.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/message.c
+++ b/hw/remote/message.c
@@ -XXX,XX +XXX,XX @@
 #include "hw/pci/pci.h"
 #include "exec/memattrs.h"
 #include "hw/remote/memory.h"
+#include "hw/remote/iohub.h"

 static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
                                  MPQemuMsg *msg, Error **errp);
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
         case MPQEMU_CMD_SYNC_SYSMEM:
             remote_sysmem_reconfig(&msg, &local_err);
             break;
+        case MPQEMU_CMD_SET_IRQFD:
+            process_set_irqfd_msg(pci_dev, &msg);
+            break;
         default:
             error_setg(&local_err,
                        "Unknown command (%d) received for device %s"
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/mpqemu-link.c
+++ b/hw/remote/mpqemu-link.c
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
             return false;
         }
         break;
+    case MPQEMU_CMD_SET_IRQFD:
+        if (msg->size || (msg->num_fds != 2)) {
+            return false;
+        }
+        break;
     default:
         break;
     }
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/proxy.c
+++ b/hw/remote/proxy.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu/error-report.h"
 #include "hw/remote/proxy-memory-listener.h"
 #include "qom/object.h"
+#include "qemu/event_notifier.h"
+#include "sysemu/kvm.h"
+#include "util/event_notifier-posix.c"
+
+static void proxy_intx_update(PCIDevice *pci_dev)
+{
+    PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
+    PCIINTxRoute route;
+    int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
+
+    if (dev->virq != -1) {
+        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq);
+        dev->virq = -1;
+    }
+
+    route = pci_device_route_intx_to_irq(pci_dev, pin);
+
+    dev->virq = route.irq;
+
+    if (dev->virq != -1) {
+        kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr,
+                                           &dev->resample, dev->virq);
+    }
+}
+
+static void setup_irqfd(PCIProxyDev *dev)
+{
+    PCIDevice *pci_dev = PCI_DEVICE(dev);
+    MPQemuMsg msg;
+    Error *local_err = NULL;
+
+    event_notifier_init(&dev->intr, 0);
+    event_notifier_init(&dev->resample, 0);
+
+    memset(&msg, 0, sizeof(MPQemuMsg));
+    msg.cmd = MPQEMU_CMD_SET_IRQFD;
+    msg.num_fds = 2;
+    msg.fds[0] = event_notifier_get_fd(&dev->intr);
+    msg.fds[1] = event_notifier_get_fd(&dev->resample);
+    msg.size = 0;
+
+    if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) {
+        error_report_err(local_err);
+    }
+
+    dev->virq = -1;
+
+    proxy_intx_update(pci_dev);
+
+    pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
+}

 static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
 {
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
     qio_channel_set_blocking(dev->ioc, true, NULL);

     proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
+
+    setup_irqfd(dev);
 }

 static void pci_proxy_dev_exit(PCIDevice *pdev)
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
     error_free(dev->migration_blocker);

     proxy_memory_listener_deconfigure(&dev->proxy_listener);
+
+    event_notifier_cleanup(&dev->intr);
+    event_notifier_cleanup(&dev->resample);
 }

 static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/meson.build
+++ b/hw/remote/meson.build
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
 remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))

 specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
 specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
--
2.29.2
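
The resample handshake in the IOHUB patch above mirrors KVM's irqfd
model for level-triggered INTx: the remote process raises the line by
signalling the irqfd, and after the guest EOIs, the resamplefd fires,
at which point the line must be re-asserted only if it is still held.
Reduced to its core (a sketch, not code from the series):

    /* On resamplefd: re-inject only while the level is still high. */
    if (event_notifier_test_and_clear(&resamplefd) >= 0 && irq_level > 0) {
        event_notifier_set(&irqfd);
    }

The per-pirq irq_level counter exists because several functions of one
device may share a line; the irqfd is only signalled on the 0 -> 1
transition.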
From: Paolo Bonzini <pbonzini@redhat.com>

Running a very small critical section on pthread_mutex_t and CoMutex
shows that pthread_mutex_t is much faster because it doesn't actually
go to sleep.  What happens is that the critical section is shorter
than the latency of entering the kernel and thus FUTEX_WAIT always
fails.  With CoMutex there is no such latency but you still want to
avoid wait and wakeup.  So introduce it artificially.

This only works with one waiter; because CoMutex is fair, it will
always have more waits and wakeups than a pthread_mutex_t.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213181244.16297-3-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/qemu/coroutine.h   |  5 +++++
 util/qemu-coroutine-lock.c | 51 ++++++++++++++++++++++++++++++++++++++++------
 util/qemu-coroutine.c      |  2 +-
 3 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/include/qemu/coroutine.h b/include/qemu/coroutine.h
index XXXXXXX..XXXXXXX 100644
--- a/include/qemu/coroutine.h
+++ b/include/qemu/coroutine.h
@@ -XXX,XX +XXX,XX @@ typedef struct CoMutex {
      */
     unsigned locked;

+    /* Context that is holding the lock.  Useful to avoid spinning
+     * when two coroutines on the same AioContext try to get the lock. :)
+     */
+    AioContext *ctx;
+
     /* A queue of waiters.  Elements are added atomically in front of
      * from_push.  to_pop is only populated, and popped from, by whoever
      * is in charge of the next wakeup.  This can be an unlocker or,
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine-lock.c
+++ b/util/qemu-coroutine-lock.c
@@ -XXX,XX +XXX,XX @@
 #include "qemu-common.h"
 #include "qemu/coroutine.h"
 #include "qemu/coroutine_int.h"
+#include "qemu/processor.h"
 #include "qemu/queue.h"
 #include "block/aio.h"
 #include "trace.h"
@@ -XXX,XX +XXX,XX @@ void qemu_co_mutex_init(CoMutex *mutex)
     memset(mutex, 0, sizeof(*mutex));
 }

-static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+    /* Read co before co->ctx; pairs with smp_wmb() in
+     * qemu_coroutine_enter().
+     */
+    smp_read_barrier_depends();
+    mutex->ctx = co->ctx;
+    aio_co_wake(co);
+}
+
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
+                                                     CoMutex *mutex)
 {
     Coroutine *self = qemu_coroutine_self();
     CoWaitRecord w;
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)
         if (co == self) {
             /* We got the lock ourselves! */
             assert(to_wake == &w);
+            mutex->ctx = ctx;
             return;
         }

-        aio_co_wake(co);
+        qemu_co_mutex_wake(mutex, co);
     }

     qemu_coroutine_yield();
@@ -XXX,XX +XXX,XX @@ static void coroutine_fn qemu_co_mutex_lock_slowpath(CoMutex *mutex)

 void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
 {
+    AioContext *ctx = qemu_get_current_aio_context();
     Coroutine *self = qemu_coroutine_self();
+    int waiters, i;

-    if (atomic_fetch_inc(&mutex->locked) == 0) {
+    /* Running a very small critical section on pthread_mutex_t and CoMutex
+     * shows that pthread_mutex_t is much faster because it doesn't actually
+     * go to sleep.  What happens is that the critical section is shorter
+     * than the latency of entering the kernel and thus FUTEX_WAIT always
+     * fails.  With CoMutex there is no such latency but you still want to
+     * avoid wait and wakeup.  So introduce it artificially.
+     */
+    i = 0;
+retry_fast_path:
+    waiters = atomic_cmpxchg(&mutex->locked, 0, 1);
+    if (waiters != 0) {
+        while (waiters == 1 && ++i < 1000) {
+            if (atomic_read(&mutex->ctx) == ctx) {
+                break;
+            }
+            if (atomic_read(&mutex->locked) == 0) {
+                goto retry_fast_path;
+            }
+            cpu_relax();
+        }
+        waiters = atomic_fetch_inc(&mutex->locked);
+    }
+
+    if (waiters == 0) {
         /* Uncontended.  */
         trace_qemu_co_mutex_lock_uncontended(mutex, self);
+        mutex->ctx = ctx;
     } else {
-        qemu_co_mutex_lock_slowpath(mutex);
+        qemu_co_mutex_lock_slowpath(ctx, mutex);
     }
     mutex->holder = self;
     self->locks_held++;
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
     assert(mutex->holder == self);
     assert(qemu_in_coroutine());

+    mutex->ctx = NULL;
     mutex->holder = NULL;
     self->locks_held--;
     if (atomic_fetch_dec(&mutex->locked) == 1) {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
         unsigned our_handoff;

         if (to_wake) {
-            Coroutine *co = to_wake->co;
-            aio_co_wake(co);
+            qemu_co_mutex_wake(mutex, to_wake->co);
             break;
         }

diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
index XXXXXXX..XXXXXXX 100644
--- a/util/qemu-coroutine.c
+++ b/util/qemu-coroutine.c
@@ -XXX,XX +XXX,XX @@ void qemu_coroutine_enter(Coroutine *co)
     co->ctx = qemu_get_current_aio_context();

     /* Store co->ctx before anything that stores co.  Matches
-     * barrier in aio_co_wake.
+     * barrier in aio_co_wake and qemu_co_mutex_wake.
      */
     smp_wmb();

--
2.9.3

From: Jagannathan Raman <jag.raman@oracle.com>

Retrieve PCI configuration info about the remote device and
configure the Proxy PCI object based on the returned information.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 85ee367bbb993aa23699b44cfedd83b4ea6d5221.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 hw/remote/proxy.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)

diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/proxy.c
+++ b/hw/remote/proxy.c
@@ -XXX,XX +XXX,XX @@
 #include "sysemu/kvm.h"
 #include "util/event_notifier-posix.c"

+static void probe_pci_info(PCIDevice *dev, Error **errp);
+
 static void proxy_intx_update(PCIDevice *pci_dev)
 {
     PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
 {
     ERRP_GUARD();
     PCIProxyDev *dev = PCI_PROXY_DEV(device);
+    uint8_t *pci_conf = device->config;
     int fd;

     if (!dev->fd) {
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
     qemu_mutex_init(&dev->io_mutex);
     qio_channel_set_blocking(dev->ioc, true, NULL);

+    pci_conf[PCI_LATENCY_TIMER] = 0xff;
+    pci_conf[PCI_INTERRUPT_PIN] = 0x01;
+
     proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);

     setup_irqfd(dev);
+
+    probe_pci_info(PCI_DEVICE(dev), errp);
 }

 static void pci_proxy_dev_exit(PCIDevice *pdev)
@@ -XXX,XX +XXX,XX @@ const MemoryRegionOps proxy_mr_ops = {
         .max_access_size = 8,
     },
 };
+
+static void probe_pci_info(PCIDevice *dev, Error **errp)
+{
+    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
+    uint32_t orig_val, new_val, base_class, val;
+    PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+    DeviceClass *dc = DEVICE_CLASS(pc);
+    uint8_t type;
+    int i, size;
+
+    config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->vendor_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->device_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->class_id = (uint16_t)val;
+
+    config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
+    pc->subsystem_id = (uint16_t)val;
+
+    base_class = pc->class_id >> 4;
+    switch (base_class) {
+    case PCI_BASE_CLASS_BRIDGE:
+        set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_STORAGE:
+        set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
+        break;
+    case PCI_BASE_CLASS_NETWORK:
+        set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
+        break;
+    case PCI_BASE_CLASS_INPUT:
+        set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+        break;
+    case PCI_BASE_CLASS_DISPLAY:
+        set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+        break;
+    case PCI_BASE_CLASS_PROCESSOR:
+        set_bit(DEVICE_CATEGORY_CPU, dc->categories);
+        break;
+    default:
+        set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+        break;
+    }
+
+    for (i = 0; i < PCI_NUM_REGIONS; i++) {
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+                       MPQEMU_CMD_PCI_CFGREAD);
+        new_val = 0xffffffff;
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+                       MPQEMU_CMD_PCI_CFGWRITE);
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
+                       MPQEMU_CMD_PCI_CFGREAD);
+        size = (~(new_val & 0xFFFFFFF0)) + 1;
+        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
+                       MPQEMU_CMD_PCI_CFGWRITE);
+        type = (new_val & 0x1) ?
+                   PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
+
+        if (size) {
+            g_autofree char *name;
+            pdev->region[i].dev = pdev;
+            pdev->region[i].present = true;
+            if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
+                pdev->region[i].memory = true;
+            }
+            name = g_strdup_printf("bar-region-%d", i);
+            memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
+                                  &proxy_mr_ops, &pdev->region[i],
+                                  name, size);
+            pci_register_bar(dev, i, type, &pdev->region[i].mr);
+        }
+    }
+}
--
2.29.2
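
Callers are unaffected by the spinning; it is internal to
qemu_co_mutex_lock(). For reference, the usage pattern the numbers
above describe looks like this (a sketch; the function and lock names
are illustrative):

    static CoMutex lock;    /* qemu_co_mutex_init(&lock) at setup */

    static void coroutine_fn short_critical_section(void *opaque)
    {
        qemu_co_mutex_lock(&lock);
        /* work shorter than a FUTEX_WAIT round-trip */
        qemu_co_mutex_unlock(&lock);
    }

The bounded spin (up to 1000 iterations of cpu_relax()) only pays off
in exactly this case; once the holder runs on the same AioContext, the
loop bails out immediately because spinning there can never succeed.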
From: Paolo Bonzini <pbonzini@redhat.com>

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Reviewed-by: Daniel P. Berrange <berrange@redhat.com>
Message-id: 20170213135235.12274-15-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/archipelago.c   |  3 +++
 block/blkreplay.c     |  2 +-
 block/block-backend.c |  6 ++++++
 block/curl.c          | 26 ++++++++++++++++++--------
 block/gluster.c       |  9 +--------
 block/io.c            |  6 +++++-
 block/iscsi.c         |  6 +++++-
 block/linux-aio.c     | 15 +++++++++------
 block/nfs.c           |  3 ++-
 block/null.c          |  4 ++++
 block/qed.c           |  3 +++
 block/rbd.c           |  4 ++++
 dma-helpers.c         |  2 ++
 hw/block/virtio-blk.c |  2 ++
 hw/scsi/scsi-bus.c    |  2 ++
 util/async.c          |  4 ++--
 util/thread-pool.c    |  2 ++
 17 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/block/archipelago.c b/block/archipelago.c
index XXXXXXX..XXXXXXX 100644
--- a/block/archipelago.c
+++ b/block/archipelago.c
@@ -XXX,XX +XXX,XX @@ static void qemu_archipelago_complete_aio(void *opaque)
 {
     AIORequestData *reqdata = (AIORequestData *) opaque;
     ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb;
+    AioContext *ctx = bdrv_get_aio_context(aio_cb->common.bs);

+    aio_context_acquire(ctx);
     aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret);
+    aio_context_release(ctx);
     aio_cb->status = 0;

     qemu_aio_unref(aio_cb);
diff --git a/block/blkreplay.c b/block/blkreplay.c
index XXXXXXX..XXXXXXX 100755
--- a/block/blkreplay.c
+++ b/block/blkreplay.c
@@ -XXX,XX +XXX,XX @@ static int64_t blkreplay_getlength(BlockDriverState *bs)
 static void blkreplay_bh_cb(void *opaque)
 {
     Request *req = opaque;
-    qemu_coroutine_enter(req->co);
+    aio_co_wake(req->co);
     qemu_bh_delete(req->bh);
     g_free(req);
 }
diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
     struct BlockBackendAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

     bdrv_dec_in_flight(acb->common.bs);
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->ret);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }

@@ -XXX,XX +XXX,XX @@ static void blk_aio_complete(BlkAioEmAIOCB *acb)
 static void blk_aio_complete_bh(void *opaque)
 {
     BlkAioEmAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

     assert(acb->has_returned);
+    aio_context_acquire(ctx);
     blk_aio_complete(acb);
+    aio_context_release(ctx);
 }

 static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
diff --git a/block/curl.c b/block/curl.c
index XXXXXXX..XXXXXXX 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
 {
     CURLState *state;
     int running;
+    int ret = -EINPROGRESS;

     CURLAIOCB *acb = p;
-    BDRVCURLState *s = acb->common.bs->opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVCURLState *s = bs->opaque;
+    AioContext *ctx = bdrv_get_aio_context(bs);

     size_t start = acb->sector_num * BDRV_SECTOR_SIZE;
     size_t end;

+    aio_context_acquire(ctx);
+
     // In case we have the requested data already (e.g. read-ahead),
     // we can just call the callback and be done.
     switch (curl_find_buf(s, start, acb->nb_sectors * BDRV_SECTOR_SIZE, acb)) {
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
         qemu_aio_unref(acb);
         // fall through
     case FIND_RET_WAIT:
-        return;
+        goto out;
     default:
         break;
     }
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     // No cache found, so let's start a new request
     state = curl_init_state(acb->common.bs, s);
     if (!state) {
-        acb->common.cb(acb->common.opaque, -EIO);
-        qemu_aio_unref(acb);
-        return;
+        ret = -EIO;
+        goto out;
     }

     acb->start = 0;
@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)
     state->orig_buf = g_try_malloc(state->buf_len);
     if (state->buf_len && state->orig_buf == NULL) {
         curl_clean_state(state);
-        acb->common.cb(acb->common.opaque, -ENOMEM);
-        qemu_aio_unref(acb);
-        return;
+        ret = -ENOMEM;
+        goto out;
     }
     state->acb[0] = acb;

@@ -XXX,XX +XXX,XX @@ static void curl_readv_bh_cb(void *p)

     /* Tell curl it needs to kick things off */
     curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running);
+
+out:
+    if (ret != -EINPROGRESS) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_unref(acb);
+    }
+    aio_context_release(ctx);
 }

 static BlockAIOCB *curl_aio_readv(BlockDriverState *bs,
diff --git a/block/gluster.c b/block/gluster.c
index XXXXXXX..XXXXXXX 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -XXX,XX +XXX,XX @@ static struct glfs *qemu_gluster_init(BlockdevOptionsGluster *gconf,
     return qemu_gluster_glfs_init(gconf, errp);
 }

-static void qemu_gluster_complete_aio(void *opaque)
-{
-    GlusterAIOCB *acb = (GlusterAIOCB *)opaque;
-
-    qemu_coroutine_enter(acb->coroutine);
-}
-
 /*
  * AIO callback routine called from GlusterFS thread.
  */
@@ -XXX,XX +XXX,XX @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg)
         acb->ret = -EIO; /* Partial read/write - fail it */
     }

-    aio_bh_schedule_oneshot(acb->aio_context, qemu_gluster_complete_aio, acb);
+    aio_co_schedule(acb->aio_context, acb->coroutine);
 }

 static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags)
diff --git a/block/io.c b/block/io.c
index XXXXXXX..XXXXXXX 100644
--- a/block/io.c
+++ b/block/io.c
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_drain_bh_cb(void *opaque)
     bdrv_dec_in_flight(bs);
     bdrv_drained_begin(bs);
     data->done = true;
-    qemu_coroutine_enter(co);
+    aio_co_wake(co);
 }

 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
@@ -XXX,XX +XXX,XX @@ static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
 static void bdrv_co_em_bh(void *opaque)
 {
     BlockAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    AioContext *ctx = bdrv_get_aio_context(bs);

     assert(!acb->need_bh);
+    aio_context_acquire(ctx);
     bdrv_co_complete(acb);
+    aio_context_release(ctx);
 }

 static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
diff --git a/block/iscsi.c b/block/iscsi.c
index XXXXXXX..XXXXXXX 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -XXX,XX +XXX,XX @@ static void
 iscsi_bh_cb(void *p)
 {
     IscsiAIOCB *acb = p;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

     qemu_bh_delete(acb->bh);

     g_free(acb->buf);
     acb->buf = NULL;

+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, acb->status);
+    aio_context_release(ctx);

     if (acb->task != NULL) {
         scsi_free_scsi_task(acb->task);
@@ -XXX,XX +XXX,XX @@ iscsi_schedule_bh(IscsiAIOCB *acb)
 static void iscsi_co_generic_bh_cb(void *opaque)
 {
     struct IscsiTask *iTask = opaque;
+
     iTask->complete = 1;
-    qemu_coroutine_enter(iTask->co);
+    aio_co_wake(iTask->co);
 }

 static void iscsi_retry_timer_expired(void *opaque)
diff --git a/block/linux-aio.c b/block/linux-aio.c
index XXXXXXX..XXXXXXX 100644
--- a/block/linux-aio.c
+++ b/block/linux-aio.c
@@ -XXX,XX +XXX,XX @@ struct LinuxAioState {
     io_context_t ctx;
     EventNotifier e;

-    /* io queue for submit at batch */
+    /* io queue for submit at batch.  Protected by AioContext lock. */
     LaioQueue io_q;

-    /* I/O completion processing */
+    /* I/O completion processing.  Only runs in I/O thread.  */
     QEMUBH *completion_bh;
     int event_idx;
     int event_max;
@@ -XXX,XX +XXX,XX @@ static inline ssize_t io_event_ret(struct io_event *ev)
  */
 static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
 {
+    LinuxAioState *s = laiocb->ctx;
     int ret;

     ret = laiocb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
     }

     laiocb->ret = ret;
+    aio_context_acquire(s->aio_context);
     if (laiocb->co) {
         /* If the coroutine is already entered it must be in ioq_submit() and
          * will notice laio->ret has been filled in when it eventually runs
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completion(struct qemu_laiocb *laiocb)
         laiocb->common.cb(laiocb->common.opaque, ret);
         qemu_aio_unref(laiocb);
     }
+    aio_context_release(s->aio_context);
 }

 /**
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_process_completions(LinuxAioState *s)
 static void qemu_laio_process_completions_and_submit(LinuxAioState *s)
 {
     qemu_laio_process_completions(s);
+
+    aio_context_acquire(s->aio_context);
     if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) {
         ioq_submit(s);
     }
+    aio_context_release(s->aio_context);
 }

 static void qemu_laio_completion_bh(void *opaque)
@@ -XXX,XX +XXX,XX @@ static void qemu_laio_completion_cb(EventNotifier *e)
     LinuxAioState *s = container_of(e, LinuxAioState, e);

     if (event_notifier_test_and_clear(&s->e)) {
-        aio_context_acquire(s->aio_context);
         qemu_laio_process_completions_and_submit(s);
-        aio_context_release(s->aio_context);
     }
 }

@@ -XXX,XX +XXX,XX @@ static bool qemu_laio_poll_cb(void *opaque)
         return false;
     }

-    aio_context_acquire(s->aio_context);
     qemu_laio_process_completions_and_submit(s);
-    aio_context_release(s->aio_context);
     return true;
 }

@@ -XXX,XX +XXX,XX @@ void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context)
 {
     aio_set_event_notifier(old_context, &s->e, false, NULL, NULL);
     qemu_bh_delete(s->completion_bh);
+    s->aio_context = NULL;
 }

 void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context)
diff --git a/block/nfs.c b/block/nfs.c
index XXXXXXX..XXXXXXX 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -XXX,XX +XXX,XX @@ static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 static void nfs_co_generic_bh_cb(void *opaque)
 {
     NFSRPC *task = opaque;
+
     task->complete = 1;
-    qemu_coroutine_enter(task->co);
+    aio_co_wake(task->co);
 }

 static void
diff --git a/block/null.c b/block/null.c
index XXXXXXX..XXXXXXX 100644
--- a/block/null.c
+++ b/block/null.c
@@ -XXX,XX +XXX,XX @@ static const AIOCBInfo null_aiocb_info = {
 static void null_bh_cb(void *opaque)
 {
     NullAIOCB *acb = opaque;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, 0);
+    aio_context_release(ctx);
     qemu_aio_unref(acb);
 }

diff --git a/block/qed.c b/block/qed.c
index XXXXXXX..XXXXXXX 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -XXX,XX +XXX,XX @@ static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index,
 static void qed_aio_complete_bh(void *opaque)
 {
     QEDAIOCB *acb = opaque;
+    BDRVQEDState *s = acb_to_s(acb);
     BlockCompletionFunc *cb = acb->common.cb;
     void *user_opaque = acb->common.opaque;
     int ret = acb->bh_ret;
@@ -XXX,XX +XXX,XX @@ static void qed_aio_complete_bh(void *opaque)
     qemu_aio_unref(acb);

     /* Invoke callback */
+    qed_acquire(s);
     cb(user_opaque, ret);
+    qed_release(s);
 }

 static void qed_aio_complete(QEDAIOCB *acb, int ret)
diff --git a/block/rbd.c b/block/rbd.c
index XXXXXXX..XXXXXXX 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -XXX,XX +XXX,XX @@ shutdown:
 static void qemu_rbd_complete_aio(RADOSCB *rcb)
 {
     RBDAIOCB *acb = rcb->acb;
+    AioContext *ctx = bdrv_get_aio_context(acb->common.bs);
     int64_t r;

     r = rcb->ret;
@@ -XXX,XX +XXX,XX @@ static void qemu_rbd_complete_aio(RADOSCB *rcb)
         qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size);
     }
     qemu_vfree(acb->bounce);
+
+    aio_context_acquire(ctx);
     acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret));
+    aio_context_release(ctx);

     qemu_aio_unref(acb);
 }
diff --git a/dma-helpers.c b/dma-helpers.c
index XXXXXXX..XXXXXXX 100644
--- a/dma-helpers.c
+++ b/dma-helpers.c
@@ -XXX,XX +XXX,XX @@ static void dma_blk_cb(void *opaque, int ret)
                         QEMU_ALIGN_DOWN(dbs->iov.size, dbs->align));
     }

+    aio_context_acquire(dbs->ctx);
     dbs->acb = dbs->io_func(dbs->offset, &dbs->iov,
                             dma_blk_cb, dbs, dbs->io_func_opaque);
+    aio_context_release(dbs->ctx);
     assert(dbs->acb);
 }

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)

     s->rq = NULL;

+    aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
     while (req) {
         VirtIOBlockReq *next = req->next;
         if (virtio_blk_handle_request(req, &mrb)) {
@@ -XXX,XX +XXX,XX @@ static void virtio_blk_dma_restart_bh(void *opaque)
     if (mrb.num_reqs) {
         virtio_blk_submit_multireq(s->blk, &mrb);
     }
+    aio_context_release(blk_get_aio_context(s->conf.conf.blk));
 }

 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
     qemu_bh_delete(s->bh);
     s->bh = NULL;

+    aio_context_acquire(blk_get_aio_context(s->conf.blk));
     QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
         scsi_req_ref(req);
         if (req->retry) {
@@ -XXX,XX +XXX,XX @@ static void scsi_dma_restart_bh(void *opaque)
         }
         scsi_req_unref(req);
     }
+    aio_context_release(blk_get_aio_context(s->conf.blk));
 }

 void scsi_req_retry(SCSIRequest *req)
diff --git a/util/async.c b/util/async.c
index XXXXXXX..XXXXXXX 100644
--- a/util/async.c
+++ b/util/async.c
@@ -XXX,XX +XXX,XX @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
-            aio_context_acquire(ctx);
             aio_bh_call(bh);
-            aio_context_release(ctx);
         }
         if (bh->deleted) {
             deleted = true;
@@ -XXX,XX +XXX,XX @@ static void co_schedule_bh_cb(void *opaque)
         Coroutine *co = QSLIST_FIRST(&straight);
         QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
         trace_aio_co_schedule_bh_cb(ctx, co);
+        aio_context_acquire(ctx);
         qemu_coroutine_enter(co);
+        aio_context_release(ctx);
     }
 }

diff --git a/util/thread-pool.c b/util/thread-pool.c
index XXXXXXX..XXXXXXX 100644
--- a/util/thread-pool.c
+++ b/util/thread-pool.c
@@ -XXX,XX +XXX,XX @@ static void thread_pool_completion_bh(void *opaque)
     ThreadPool *pool = opaque;
     ThreadPoolElement *elem, *next;

+    aio_context_acquire(pool->ctx);
 restart:
     QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
         if (elem->state != THREAD_DONE) {
@@ -XXX,XX +XXX,XX @@ restart:
             qemu_aio_unref(elem);
         }
     }
+    aio_context_release(pool->ctx);
 }

 static void thread_pool_cancel(BlockAIOCB *acb)
--
2.9.3

From: Elena Ufimtseva <elena.ufimtseva@oracle.com>

Perform device reset in the remote process when QEMU performs
device reset. This is required to reset the internal state
(like registers, etc.) of emulated devices.

Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Message-id: 7cb220a51f565dc0817bd76e2f540e89c2d2b850.1611938319.git.jag.raman@oracle.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/hw/remote/mpqemu-link.h |  1 +
 hw/remote/message.c             | 22 ++++++++++++++++++++++
 hw/remote/proxy.c               | 19 +++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
index XXXXXXX..XXXXXXX 100644
--- a/include/hw/remote/mpqemu-link.h
+++ b/include/hw/remote/mpqemu-link.h
@@ -XXX,XX +XXX,XX @@ typedef enum {
     MPQEMU_CMD_BAR_WRITE,
     MPQEMU_CMD_BAR_READ,
     MPQEMU_CMD_SET_IRQFD,
+    MPQEMU_CMD_DEVICE_RESET,
     MPQEMU_CMD_MAX,
 } MPQemuCmd;

diff --git a/hw/remote/message.c b/hw/remote/message.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/message.c
+++ b/hw/remote/message.c
@@ -XXX,XX +XXX,XX @@
 #include "exec/memattrs.h"
 #include "hw/remote/memory.h"
 #include "hw/remote/iohub.h"
+#include "sysemu/reset.h"

 static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
                                  MPQemuMsg *msg, Error **errp);
@@ -XXX,XX +XXX,XX @@ static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
                                 MPQemuMsg *msg, Error **errp);
 static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
 static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+                                     Error **errp);

 void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
 {
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
         case MPQEMU_CMD_SET_IRQFD:
             process_set_irqfd_msg(pci_dev, &msg);
             break;
+        case MPQEMU_CMD_DEVICE_RESET:
+            process_device_reset_msg(com->ioc, pci_dev, &local_err);
+            break;
         default:
             error_setg(&local_err,
                        "Unknown command (%d) received for device %s"
@@ -XXX,XX +XXX,XX @@ fail:
                    getpid());
     }
 }

+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
+                                     Error **errp)
+{
+    DeviceClass *dc = DEVICE_GET_CLASS(dev);
+    DeviceState *s = DEVICE(dev);
+    MPQemuMsg ret = { 0 };
+
+    if (dc->reset) {
+        dc->reset(s);
+    }
+
+    ret.cmd = MPQEMU_CMD_RET;
+
+    mpqemu_msg_send(&ret, ioc, errp);
+}
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/remote/proxy.c
+++ b/hw/remote/proxy.c
@@ -XXX,XX +XXX,XX @@
 #include "util/event_notifier-posix.c"

 static void probe_pci_info(PCIDevice *dev, Error **errp);
+static void proxy_device_reset(DeviceState *dev);

 static void proxy_intx_update(PCIDevice *pci_dev)
 {
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
     k->config_read = pci_proxy_read_config;
     k->config_write = pci_proxy_write_config;

+    dc->reset = proxy_device_reset;
+
     device_class_set_props(dc, proxy_properties);
 }

@@ -XXX,XX +XXX,XX @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
         }
     }
 }
+
+static void proxy_device_reset(DeviceState *dev)
+{
+    PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
+    MPQemuMsg msg = { 0 };
+    Error *local_err = NULL;
+
+    msg.cmd = MPQEMU_CMD_DEVICE_RESET;
+    msg.size = 0;
+
+    mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+    }
+
+}
--
2.29.2

From: Paolo Bonzini <pbonzini@redhat.com>

qcow2_create2 calls this. Do not run a nested event loop, as that
breaks when aio_co_wake tries to queue the coroutine on the co_queue_wakeup
list of the currently running one.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Reviewed-by: Fam Zheng <famz@redhat.com>
Message-id: 20170213135235.12274-4-pbonzini@redhat.com
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/block-backend.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index XXXXXXX..XXXXXXX 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
 {
     QEMUIOVector qiov;
     struct iovec iov;
-    Coroutine *co;
     BlkRwCo rwco;

     iov = (struct iovec) {
@@ -XXX,XX +XXX,XX @@ static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
         .ret    = NOT_DONE,
     };

-    co = qemu_coroutine_create(co_entry, &rwco);
-    qemu_coroutine_enter(co);
-    BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        co_entry(&rwco);
+    } else {
+        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
+        qemu_coroutine_enter(co);
+        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
+    }

     return rwco.ret;
 }
--
2.9.3

From: "Denis V. Lunev" <den@openvz.org>

The original specification says that the L1 table size is 64 * l1_size,
which is obviously wrong. The size of an L1 entry is 64 _bits_, not
bytes. Thus 64 is to be replaced with 8, as the specification speaks
of bytes.

There is also a minor tweak: the field name is renamed from l1 to
l1_table, which matches the later text.

Signed-off-by: Denis V. Lunev <den@openvz.org>
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Message-id: 20210128171313.2210947-1-den@openvz.org
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

[Replace the original commit message "docs: fix mistake in dirty bitmap
feature description" as suggested by Eric Blake.
--Stefan]

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 docs/interop/parallels.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
index XXXXXXX..XXXXXXX 100644
--- a/docs/interop/parallels.txt
+++ b/docs/interop/parallels.txt
@@ -XXX,XX +XXX,XX @@ of its data area are:
   28 - 31:    l1_size
               The number of entries in the L1 table of the bitmap.

-  variable:   l1 (64 * l1_size bytes)
+  variable:   l1_table (8 * l1_size bytes)
               L1 offset table (in bytes)

 A dirty bitmap is stored using a one-level structure for the mapping to host
--
2.29.2
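
One shape recurs throughout the bottom-half patch above and is worth
isolating: with aio_bh_poll() no longer wrapping aio_bh_call() in
aio_context_acquire()/release(), any bottom half that completes
block-layer requests must take the lock around the completion callback
itself. Schematically (the AIOCB type here is illustrative, not from
the series):

    static void my_completion_bh(void *opaque)
    {
        MyAIOCB *acb = opaque;
        AioContext *ctx = bdrv_get_aio_context(acb->common.bs);

        aio_context_acquire(ctx);
        acb->common.cb(acb->common.opaque, acb->ret);
        aio_context_release(ctx);
        qemu_aio_unref(acb);
    }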
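
A quick sanity check of the corrected line above: each L1 entry is a
64-bit offset, i.e. 8 bytes, so a header reporting l1_size = 4 is
followed by a 4 * 8 = 32-byte L1 table; the old "64 * l1_size" text
would have claimed 256 bytes.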