1
The following changes since commit ca4e667dbf431d4a2a5a619cde79d30dd2ac3eb2:
1
The following changes since commit e2c5093c993ef646e4e28f7aa78429853bcc06ac:
2
2
3
Merge remote-tracking branch 'remotes/kraxel/tags/usb-20170717-pull-request' into staging (2017-07-17 17:54:17 +0100)
3
iotests: 30: drop from auto group (and effectively from make check) (2021-02-05 15:16:13 +0000)
4
4
5
are available in the git repository at:
5
are available in the Git repository at:
6
6
7
git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request
7
https://gitlab.com/stefanha/qemu.git tags/block-pull-request
8
8
9
for you to fetch changes up to 8508eee740c78d1465e25dad7c3e06137485dfbc:
9
for you to fetch changes up to b07011f375bda3319cf72eee7cb18d310078387b:
10
10
11
live-block-ops.txt: Rename, rewrite, and improve it (2017-07-18 00:11:01 -0400)
11
docs: fix Parallels Image "dirty bitmap" section (2021-02-05 16:36:36 +0000)
12
12
13
----------------------------------------------------------------
13
----------------------------------------------------------------
14
Block patches (documentation)
14
Pull request
15
16
v3:
17
* Replace {0} array initialization with {} to make clang happy [Peter]
18
15
----------------------------------------------------------------
19
----------------------------------------------------------------
16
20
17
Kashyap Chamarthy (2):
21
Denis V. Lunev (1):
18
bitmaps.md: Convert to rST; move it into 'interop' dir
22
docs: fix Parallels Image "dirty bitmap" section
19
live-block-ops.txt: Rename, rewrite, and improve it
20
23
21
docs/devel/bitmaps.md | 505 ---------------
24
Elena Ufimtseva (8):
22
docs/interop/bitmaps.rst | 555 ++++++++++++++++
25
multi-process: add configure and usage information
23
docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
26
io: add qio_channel_writev_full_all helper
24
docs/live-block-ops.txt | 72 ---
27
io: add qio_channel_readv_full_all_eof & qio_channel_readv_full_all
25
4 files changed, 1643 insertions(+), 577 deletions(-)
28
helpers
26
delete mode 100644 docs/devel/bitmaps.md
29
multi-process: define MPQemuMsg format and transmission functions
27
create mode 100644 docs/interop/bitmaps.rst
30
multi-process: introduce proxy object
28
create mode 100644 docs/interop/live-block-operations.rst
31
multi-process: add proxy communication functions
29
delete mode 100644 docs/live-block-ops.txt
32
multi-process: Forward PCI config space accesses to the remote process
33
multi-process: perform device reset in the remote process
34
35
Jagannathan Raman (11):
36
memory: alloc RAM from file at offset
37
multi-process: Add config option for multi-process QEMU
38
multi-process: setup PCI host bridge for remote device
39
multi-process: setup a machine object for remote device process
40
multi-process: Initialize message handler in remote device
41
multi-process: Associate fd of a PCIDevice with its object
42
multi-process: setup memory manager for remote device
43
multi-process: PCI BAR read/write handling for proxy & remote
44
endpoints
45
multi-process: Synchronize remote memory
46
multi-process: create IOHUB object to handle irq
47
multi-process: Retrieve PCI info from remote process
48
49
John G Johnson (1):
50
multi-process: add the concept description to
51
docs/devel/qemu-multiprocess
52
53
Stefan Hajnoczi (6):
54
.github: point Repo Lockdown bot to GitLab repo
55
gitmodules: use GitLab repos instead of qemu.org
56
gitlab-ci: remove redundant GitLab repo URL command
57
docs: update README to use GitLab repo URLs
58
pc-bios: update mirror URLs to GitLab
59
get_maintainer: update repo URL to GitLab
60
61
MAINTAINERS | 24 +
62
README.rst | 4 +-
63
docs/devel/index.rst | 1 +
64
docs/devel/multi-process.rst | 966 ++++++++++++++++++++++
65
docs/system/index.rst | 1 +
66
docs/system/multi-process.rst | 64 ++
67
docs/interop/parallels.txt | 2 +-
68
configure | 10 +
69
meson.build | 5 +-
70
hw/remote/trace.h | 1 +
71
include/exec/memory.h | 2 +
72
include/exec/ram_addr.h | 4 +-
73
include/hw/pci-host/remote.h | 30 +
74
include/hw/pci/pci_ids.h | 3 +
75
include/hw/remote/iohub.h | 42 +
76
include/hw/remote/machine.h | 38 +
77
include/hw/remote/memory.h | 19 +
78
include/hw/remote/mpqemu-link.h | 99 +++
79
include/hw/remote/proxy-memory-listener.h | 28 +
80
include/hw/remote/proxy.h | 48 ++
81
include/io/channel.h | 78 ++
82
include/qemu/mmap-alloc.h | 4 +-
83
include/sysemu/iothread.h | 6 +
84
backends/hostmem-memfd.c | 2 +-
85
hw/misc/ivshmem.c | 3 +-
86
hw/pci-host/remote.c | 75 ++
87
hw/remote/iohub.c | 119 +++
88
hw/remote/machine.c | 80 ++
89
hw/remote/memory.c | 65 ++
90
hw/remote/message.c | 230 ++++++
91
hw/remote/mpqemu-link.c | 267 ++++++
92
hw/remote/proxy-memory-listener.c | 227 +++++
93
hw/remote/proxy.c | 379 +++++++++
94
hw/remote/remote-obj.c | 203 +++++
95
io/channel.c | 116 ++-
96
iothread.c | 6 +
97
softmmu/memory.c | 3 +-
98
softmmu/physmem.c | 12 +-
99
util/mmap-alloc.c | 8 +-
100
util/oslib-posix.c | 2 +-
101
.github/lockdown.yml | 8 +-
102
.gitlab-ci.yml | 1 -
103
.gitmodules | 44 +-
104
Kconfig.host | 4 +
105
hw/Kconfig | 1 +
106
hw/meson.build | 1 +
107
hw/pci-host/Kconfig | 3 +
108
hw/pci-host/meson.build | 1 +
109
hw/remote/Kconfig | 4 +
110
hw/remote/meson.build | 13 +
111
hw/remote/trace-events | 4 +
112
pc-bios/README | 4 +-
113
scripts/get_maintainer.pl | 2 +-
114
53 files changed, 3296 insertions(+), 70 deletions(-)
115
create mode 100644 docs/devel/multi-process.rst
116
create mode 100644 docs/system/multi-process.rst
117
create mode 100644 hw/remote/trace.h
118
create mode 100644 include/hw/pci-host/remote.h
119
create mode 100644 include/hw/remote/iohub.h
120
create mode 100644 include/hw/remote/machine.h
121
create mode 100644 include/hw/remote/memory.h
122
create mode 100644 include/hw/remote/mpqemu-link.h
123
create mode 100644 include/hw/remote/proxy-memory-listener.h
124
create mode 100644 include/hw/remote/proxy.h
125
create mode 100644 hw/pci-host/remote.c
126
create mode 100644 hw/remote/iohub.c
127
create mode 100644 hw/remote/machine.c
128
create mode 100644 hw/remote/memory.c
129
create mode 100644 hw/remote/message.c
130
create mode 100644 hw/remote/mpqemu-link.c
131
create mode 100644 hw/remote/proxy-memory-listener.c
132
create mode 100644 hw/remote/proxy.c
133
create mode 100644 hw/remote/remote-obj.c
134
create mode 100644 hw/remote/Kconfig
135
create mode 100644 hw/remote/meson.build
136
create mode 100644 hw/remote/trace-events
30
137
31
--
138
--
32
2.9.4
139
2.29.2
33
140
34
diff view generated by jsdifflib
New patch
1
Use the GitLab repo URL as the main repo location in order to reduce
2
load on qemu.org.
1
3
4
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
6
Reviewed-by: Thomas Huth <thuth@redhat.com>
7
Message-id: 20210111115017.156802-2-stefanha@redhat.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
.github/lockdown.yml | 8 ++++----
11
1 file changed, 4 insertions(+), 4 deletions(-)
12
13
diff --git a/.github/lockdown.yml b/.github/lockdown.yml
14
index XXXXXXX..XXXXXXX 100644
15
--- a/.github/lockdown.yml
16
+++ b/.github/lockdown.yml
17
@@ -XXX,XX +XXX,XX @@ issues:
18
comment: |
19
Thank you for your interest in the QEMU project.
20
21
- This repository is a read-only mirror of the project's master
22
- repostories hosted on https://git.qemu.org/git/qemu.git.
23
+ This repository is a read-only mirror of the project's repositories hosted
24
+ at https://gitlab.com/qemu-project/qemu.git.
25
The project does not process issues filed on GitHub.
26
27
The project issues are tracked on Launchpad:
28
@@ -XXX,XX +XXX,XX @@ pulls:
29
comment: |
30
Thank you for your interest in the QEMU project.
31
32
- This repository is a read-only mirror of the project's master
33
- repostories hosted on https://git.qemu.org/git/qemu.git.
34
+ This repository is a read-only mirror of the project's repositories hosted
35
+ on https://gitlab.com/qemu-project/qemu.git.
36
The project does not process merge requests filed on GitHub.
37
38
QEMU welcomes contributions of code (either fixing bugs or adding new
39
--
40
2.29.2
41
diff view generated by jsdifflib
New patch
1
qemu.org is running out of bandwidth and the QEMU project is moving
2
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
3
(they will become mirrors).
1
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
7
Reviewed-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
9
Message-id: 20210111115017.156802-3-stefanha@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
.gitmodules | 44 ++++++++++++++++++++++----------------------
13
1 file changed, 22 insertions(+), 22 deletions(-)
14
15
diff --git a/.gitmodules b/.gitmodules
16
index XXXXXXX..XXXXXXX 100644
17
--- a/.gitmodules
18
+++ b/.gitmodules
19
@@ -XXX,XX +XXX,XX @@
20
[submodule "roms/seabios"]
21
    path = roms/seabios
22
-    url = https://git.qemu.org/git/seabios.git/
23
+    url = https://gitlab.com/qemu-project/seabios.git/
24
[submodule "roms/SLOF"]
25
    path = roms/SLOF
26
-    url = https://git.qemu.org/git/SLOF.git
27
+    url = https://gitlab.com/qemu-project/SLOF.git
28
[submodule "roms/ipxe"]
29
    path = roms/ipxe
30
-    url = https://git.qemu.org/git/ipxe.git
31
+    url = https://gitlab.com/qemu-project/ipxe.git
32
[submodule "roms/openbios"]
33
    path = roms/openbios
34
-    url = https://git.qemu.org/git/openbios.git
35
+    url = https://gitlab.com/qemu-project/openbios.git
36
[submodule "roms/qemu-palcode"]
37
    path = roms/qemu-palcode
38
-    url = https://git.qemu.org/git/qemu-palcode.git
39
+    url = https://gitlab.com/qemu-project/qemu-palcode.git
40
[submodule "roms/sgabios"]
41
    path = roms/sgabios
42
-    url = https://git.qemu.org/git/sgabios.git
43
+    url = https://gitlab.com/qemu-project/sgabios.git
44
[submodule "dtc"]
45
    path = dtc
46
-    url = https://git.qemu.org/git/dtc.git
47
+    url = https://gitlab.com/qemu-project/dtc.git
48
[submodule "roms/u-boot"]
49
    path = roms/u-boot
50
-    url = https://git.qemu.org/git/u-boot.git
51
+    url = https://gitlab.com/qemu-project/u-boot.git
52
[submodule "roms/skiboot"]
53
    path = roms/skiboot
54
-    url = https://git.qemu.org/git/skiboot.git
55
+    url = https://gitlab.com/qemu-project/skiboot.git
56
[submodule "roms/QemuMacDrivers"]
57
    path = roms/QemuMacDrivers
58
-    url = https://git.qemu.org/git/QemuMacDrivers.git
59
+    url = https://gitlab.com/qemu-project/QemuMacDrivers.git
60
[submodule "ui/keycodemapdb"]
61
    path = ui/keycodemapdb
62
-    url = https://git.qemu.org/git/keycodemapdb.git
63
+    url = https://gitlab.com/qemu-project/keycodemapdb.git
64
[submodule "capstone"]
65
    path = capstone
66
-    url = https://git.qemu.org/git/capstone.git
67
+    url = https://gitlab.com/qemu-project/capstone.git
68
[submodule "roms/seabios-hppa"]
69
    path = roms/seabios-hppa
70
-    url = https://git.qemu.org/git/seabios-hppa.git
71
+    url = https://gitlab.com/qemu-project/seabios-hppa.git
72
[submodule "roms/u-boot-sam460ex"]
73
    path = roms/u-boot-sam460ex
74
-    url = https://git.qemu.org/git/u-boot-sam460ex.git
75
+    url = https://gitlab.com/qemu-project/u-boot-sam460ex.git
76
[submodule "tests/fp/berkeley-testfloat-3"]
77
    path = tests/fp/berkeley-testfloat-3
78
-    url = https://git.qemu.org/git/berkeley-testfloat-3.git
79
+    url = https://gitlab.com/qemu-project/berkeley-testfloat-3.git
80
[submodule "tests/fp/berkeley-softfloat-3"]
81
    path = tests/fp/berkeley-softfloat-3
82
-    url = https://git.qemu.org/git/berkeley-softfloat-3.git
83
+    url = https://gitlab.com/qemu-project/berkeley-softfloat-3.git
84
[submodule "roms/edk2"]
85
    path = roms/edk2
86
-    url = https://git.qemu.org/git/edk2.git
87
+    url = https://gitlab.com/qemu-project/edk2.git
88
[submodule "slirp"]
89
    path = slirp
90
-    url = https://git.qemu.org/git/libslirp.git
91
+    url = https://gitlab.com/qemu-project/libslirp.git
92
[submodule "roms/opensbi"]
93
    path = roms/opensbi
94
-    url =     https://git.qemu.org/git/opensbi.git
95
+    url =     https://gitlab.com/qemu-project/opensbi.git
96
[submodule "roms/qboot"]
97
    path = roms/qboot
98
-    url = https://git.qemu.org/git/qboot.git
99
+    url = https://gitlab.com/qemu-project/qboot.git
100
[submodule "meson"]
101
    path = meson
102
-    url = https://git.qemu.org/git/meson.git
103
+    url = https://gitlab.com/qemu-project/meson.git
104
[submodule "roms/vbootrom"]
105
    path = roms/vbootrom
106
-    url = https://git.qemu.org/git/vbootrom.git
107
+    url = https://gitlab.com/qemu-project/vbootrom.git
108
--
109
2.29.2
110
diff view generated by jsdifflib
New patch
1
It is no longer necessary to point .gitmodules at GitLab repos when
2
running in GitLab CI since they are now used all the time.
1
3
4
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
5
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
6
Reviewed-by: Thomas Huth <thuth@redhat.com>
7
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
8
Message-id: 20210111115017.156802-4-stefanha@redhat.com
9
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
10
---
11
.gitlab-ci.yml | 1 -
12
1 file changed, 1 deletion(-)
13
14
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
15
index XXXXXXX..XXXXXXX 100644
16
--- a/.gitlab-ci.yml
17
+++ b/.gitlab-ci.yml
18
@@ -XXX,XX +XXX,XX @@ include:
19
image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest
20
before_script:
21
- JOBS=$(expr $(nproc) + 1)
22
- - sed -i s,git.qemu.org/git,gitlab.com/qemu-project, .gitmodules
23
script:
24
- mkdir build
25
- cd build
26
--
27
2.29.2
28
diff view generated by jsdifflib
New patch
1
qemu.org is running out of bandwidth and the QEMU project is moving
2
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
3
(they will become mirrors).
1
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
7
Reviewed-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
9
Message-id: 20210111115017.156802-5-stefanha@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
README.rst | 4 ++--
13
1 file changed, 2 insertions(+), 2 deletions(-)
14
15
diff --git a/README.rst b/README.rst
16
index XXXXXXX..XXXXXXX 100644
17
--- a/README.rst
18
+++ b/README.rst
19
@@ -XXX,XX +XXX,XX @@ The QEMU source code is maintained under the GIT version control system.
20
21
.. code-block:: shell
22
23
- git clone https://git.qemu.org/git/qemu.git
24
+ git clone https://gitlab.com/qemu-project/qemu.git
25
26
When submitting patches, one common approach is to use 'git
27
format-patch' and/or 'git send-email' to format & send the mail to the
28
@@ -XXX,XX +XXX,XX @@ The QEMU website is also maintained under source control.
29
30
.. code-block:: shell
31
32
- git clone https://git.qemu.org/git/qemu-web.git
33
+ git clone https://gitlab.com/qemu-project/qemu-web.git
34
35
* `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_
36
37
--
38
2.29.2
39
diff view generated by jsdifflib
New patch
1
qemu.org is running out of bandwidth and the QEMU project is moving
2
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
3
(they will become mirrors).
1
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
7
Reviewed-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
9
Message-id: 20210111115017.156802-6-stefanha@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
pc-bios/README | 4 ++--
13
1 file changed, 2 insertions(+), 2 deletions(-)
14
15
diff --git a/pc-bios/README b/pc-bios/README
16
index XXXXXXX..XXXXXXX 100644
17
--- a/pc-bios/README
18
+++ b/pc-bios/README
19
@@ -XXX,XX +XXX,XX @@
20
legacy x86 software to communicate with an attached serial console as
21
if a video card were attached. The master sources reside in a subversion
22
repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is
23
- available at https://git.qemu.org/git/sgabios.git.
24
+ available at https://gitlab.com/qemu-project/sgabios.git.
25
26
- The PXE roms come from the iPXE project. Built with BANNER_TIME 0.
27
Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping:
28
@@ -XXX,XX +XXX,XX @@
29
30
- The u-boot binary for e500 comes from the upstream denx u-boot project where
31
it was compiled using the qemu-ppce500 target.
32
- A git mirror is available at: https://git.qemu.org/git/u-boot.git
33
+ A git mirror is available at: https://gitlab.com/qemu-project/u-boot.git
34
The hash used to compile the current version is: 2072e72
35
36
- Skiboot (https://github.com/open-power/skiboot/) is an OPAL
37
--
38
2.29.2
39
diff view generated by jsdifflib
New patch
1
qemu.org is running out of bandwidth and the QEMU project is moving
2
towards a gating CI on GitLab. Use the GitLab repos instead of qemu.org
3
(they will become mirrors).
1
4
5
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
6
Reviewed-by: Wainer dos Santos Moschetta <wainersm@redhat.com>
7
Reviewed-by: Thomas Huth <thuth@redhat.com>
8
Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com>
9
Message-id: 20210111115017.156802-7-stefanha@redhat.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
scripts/get_maintainer.pl | 2 +-
13
1 file changed, 1 insertion(+), 1 deletion(-)
14
15
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
16
index XXXXXXX..XXXXXXX 100755
17
--- a/scripts/get_maintainer.pl
18
+++ b/scripts/get_maintainer.pl
19
@@ -XXX,XX +XXX,XX @@ sub vcs_exists {
20
    warn("$P: No supported VCS found. Add --nogit to options?\n");
21
    warn("Using a git repository produces better results.\n");
22
    warn("Try latest git repository using:\n");
23
-    warn("git clone https://git.qemu.org/git/qemu.git\n");
24
+    warn("git clone https://gitlab.com/qemu-project/qemu.git\n");
25
    $printed_novcs = 1;
26
}
27
return 0;
28
--
29
2.29.2
30
diff view generated by jsdifflib
1
From: Kashyap Chamarthy <kchamart@redhat.com>
1
From: John G Johnson <john.g.johnson@oracle.com>
2
2
3
This patch documents (including their QMP invocations) all the four
3
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
4
major kinds of live block operations:
4
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
5
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Message-id: 02a68adef99f5df6a380bf8fd7b90948777e411c.1611938319.git.jag.raman@oracle.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
MAINTAINERS | 7 +
11
docs/devel/index.rst | 1 +
12
docs/devel/multi-process.rst | 966 +++++++++++++++++++++++++++++++++++
13
3 files changed, 974 insertions(+)
14
create mode 100644 docs/devel/multi-process.rst
5
15
6
- `block-stream`
16
diff --git a/MAINTAINERS b/MAINTAINERS
7
- `block-commit`
17
index XXXXXXX..XXXXXXX 100644
8
- `drive-mirror` (& `blockdev-mirror`)
18
--- a/MAINTAINERS
9
- `drive-backup` (& `blockdev-backup`)
19
+++ b/MAINTAINERS
10
20
@@ -XXX,XX +XXX,XX @@ S: Maintained
11
Things considered while writing this document:
21
F: hw/semihosting/
12
22
F: include/hw/semihosting/
13
- Use reStructuredText as markup language (with the goal of generating
23
14
the HTML output using the Sphinx Documentation Generator). It is
24
+Multi-process QEMU
15
gentler on the eye, and can be trivially converted to different
25
+M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
16
formats. (Another reason: upstream QEMU is considering switching to
26
+M: Jagannathan Raman <jag.raman@oracle.com>
17
Sphinx, which uses reStructuredText as its markup language.)
27
+M: John G Johnson <john.g.johnson@oracle.com>
18
28
+S: Maintained
19
- Raw QMP JSON output vs. 'qmp-shell'. I debated with myself whether
29
+F: docs/devel/multi-process.rst
20
to only show raw QMP JSON output (as that is the canonical
30
+
21
representation), or use 'qmp-shell', which takes key-value pairs. I
31
Build and test automation
22
settled on the approach of: for the first occurrence of a command,
32
-------------------------
23
use raw JSON; for subsequent occurrences, use 'qmp-shell', with an
33
Build and test automation
24
occasional exception.
34
diff --git a/docs/devel/index.rst b/docs/devel/index.rst
25
35
index XXXXXXX..XXXXXXX 100644
26
- Usage of `-blockdev` command-line.
36
--- a/docs/devel/index.rst
27
37
+++ b/docs/devel/index.rst
28
- Usage of 'node-name' vs. file path to refer to disks. While we have
38
@@ -XXX,XX +XXX,XX @@ Contents:
29
`blockdev-{mirror, backup}` as 'node-name'-alternatives for
39
clocks
30
`drive-{mirror, backup}`, the `block-commit` command still operates
40
qom
31
on file names for parameters 'base' and 'top'. So I added a caveat
41
block-coroutine-wrapper
32
at the beginning to that effect.
42
+ multi-process
33
43
diff --git a/docs/devel/multi-process.rst b/docs/devel/multi-process.rst
34
Refer to this related thread that I started (where I learnt
35
`block-stream` was recently reworked to accept 'node-name' for 'top'
36
and 'base' parameters):
37
https://lists.nongnu.org/archive/html/qemu-devel/2017-05/msg06466.html
38
"[RFC] Making 'block-stream', and 'block-commit' accept node-name"
39
40
All commands shown in this document were tested while documenting.
41
42
Thanks: Eric Blake for the section: "A note on points-in-time vs file
43
names". This useful bit was originally articulated by Eric in his
44
KVMForum 2015 presentation, so I included that specific bit in this
45
document.
46
47
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
48
Reviewed-by: Eric Blake <eblake@redhat.com>
49
Message-id: 20170717105205.32639-3-kchamart@redhat.com
50
Signed-off-by: Jeff Cody <jcody@redhat.com>
51
---
52
docs/interop/live-block-operations.rst | 1088 ++++++++++++++++++++++++++++++++
53
docs/live-block-ops.txt | 72 ---
54
2 files changed, 1088 insertions(+), 72 deletions(-)
55
create mode 100644 docs/interop/live-block-operations.rst
56
delete mode 100644 docs/live-block-ops.txt
57
58
diff --git a/docs/interop/live-block-operations.rst b/docs/interop/live-block-operations.rst
59
new file mode 100644
44
new file mode 100644
60
index XXXXXXX..XXXXXXX
45
index XXXXXXX..XXXXXXX
61
--- /dev/null
46
--- /dev/null
62
+++ b/docs/interop/live-block-operations.rst
47
+++ b/docs/devel/multi-process.rst
63
@@ -XXX,XX +XXX,XX @@
48
@@ -XXX,XX +XXX,XX @@
64
+..
49
+This is the design document for multi-process QEMU. It does not
65
+ Copyright (C) 2017 Red Hat Inc.
50
+necessarily reflect the status of the current implementation, which
66
+
51
+may lack features or be considerably different from what is described
67
+ This work is licensed under the terms of the GNU GPL, version 2 or
52
+in this document. This document is still useful as a description of
68
+ later. See the COPYING file in the top-level directory.
53
+the goals and general direction of this feature.
69
+
54
+
70
+============================
55
+Please refer to the following wiki for latest details:
71
+Live Block Device Operations
56
+https://wiki.qemu.org/Features/MultiProcessQEMU
72
+============================
57
+
73
+
58
+Multi-process QEMU
74
+QEMU Block Layer currently (as of QEMU 2.9) supports four major kinds of
59
+===================
75
+live block device jobs -- stream, commit, mirror, and backup. These can
60
+
76
+be used to manipulate disk image chains to accomplish certain tasks,
61
+QEMU is often used as the hypervisor for virtual machines running in the
77
+namely: live copy data from backing files into overlays; shorten long
62
+Oracle cloud. Since one of the advantages of cloud computing is the
78
+disk image chains by merging data from overlays into backing files; live
63
+ability to run many VMs from different tenants in the same cloud
79
+synchronize data from a disk image chain (including current active disk)
64
+infrastructure, a guest that compromised its hypervisor could
80
+to another target image; and point-in-time (and incremental) backups of
65
+potentially use the hypervisor's access privileges to access data it is
81
+a block device. Below is a description of the said block (QMP)
66
+not authorized for.
82
+primitives, and some (non-exhaustive list of) examples to illustrate
67
+
83
+their use.
68
+QEMU can be susceptible to security attacks because it is a large,
84
+
69
+monolithic program that provides many features to the VMs it services.
85
+.. note::
70
+Many of these features can be configured out of QEMU, but even a reduced
86
+ The file ``qapi/block-core.json`` in the QEMU source tree has the
71
+configuration QEMU has a large amount of code a guest can potentially
87
+ canonical QEMU API (QAPI) schema documentation for the QMP
72
+attack. Separating QEMU reduces the attack surface by helping to
88
+ primitives discussed here.
73
+limit each component in the system to only access the resources that
89
+
74
+it needs to perform its job.
90
+.. todo (kashyapc):: Remove the ".. contents::" directive when Sphinx is
75
+
91
+ integrated.
76
+QEMU services
92
+
77
+-------------
93
+.. contents::
78
+
94
+
79
+QEMU can be broadly described as providing three main services. One is a
95
+Disk image backing chain notation
80
+VM control point, where VMs can be created, migrated, re-configured, and
96
+---------------------------------
81
+destroyed. A second is to emulate the CPU instructions within the VM,
97
+
82
+often accelerated by HW virtualization features such as Intel's VT
98
+A simple disk image chain. (This can be created live using QMP
83
+extensions. Finally, it provides IO services to the VM by emulating HW
99
+``blockdev-snapshot-sync``, or offline via ``qemu-img``)::
84
+IO devices, such as disk and network devices.
100
+
85
+
101
+ (Live QEMU)
86
+A multi-process QEMU
102
+ |
87
+~~~~~~~~~~~~~~~~~~~~
103
+ .
88
+
104
+ V
89
+A multi-process QEMU involves separating QEMU services into separate
105
+
90
+host processes. Each of these processes can be given only the privileges
106
+ [A] <----- [B]
91
+it needs to provide its service, e.g., a disk service could be given
107
+
92
+access only to the disk images it provides, and not be allowed to
108
+ (backing file) (overlay)
93
+access other files, or any network devices. An attacker who compromised
109
+
94
+this service would not be able to use this exploit to access files or
110
+The arrow can be read as: Image [A] is the backing file of disk image
95
+devices beyond what the disk service was given access to.
111
+[B]. And live QEMU is currently writing to image [B], consequently, it
96
+
112
+is also referred to as the "active layer".
97
+A QEMU control process would remain, but in multi-process mode, will
113
+
98
+have no direct interfaces to the VM. During VM execution, it would still
114
+There are two kinds of terminology that are common when referring to
99
+provide the user interface to hot-plug devices or live migrate the VM.
115
+files in a disk image backing chain:
100
+
116
+
101
+A first step in creating a multi-process QEMU is to separate IO services
117
+(1) Directional: 'base' and 'top'. Given the simple disk image chain
102
+from the main QEMU program, which would continue to provide CPU
118
+ above, image [A] can be referred to as 'base', and image [B] as
103
+emulation. i.e., the control process would also be the CPU emulation
119
+ 'top'. (This terminology can be seen in the QAPI schema file,
104
+process. In a later phase, CPU emulation could be separated from the
120
+ block-core.json.)
105
+control process.
121
+
106
+
122
+(2) Relational: 'backing file' and 'overlay'. Again, taking the same
107
+Separating IO services
123
+ simple disk image chain from the above, disk image [A] is referred
108
+----------------------
124
+ to as the backing file, and image [B] as overlay.
109
+
125
+
110
+Separating IO services into individual host processes is a good place to
126
+ Throughout this document, we will use the relational terminology.
111
+begin for a couple of reasons. One is that the sheer number of IO devices QEMU
127
+
112
+can emulate provides a large surface of interfaces which could potentially
128
+.. important::
113
+be exploited, and, indeed, have been a source of exploits in the past.
129
+ The overlay files can generally be any format that supports a
114
+Another is that the modular nature of QEMU device emulation code provides
130
+ backing file, although QCOW2 is the preferred format and the one
115
+interface points where the QEMU functions that perform device emulation
131
+ used in this document.
116
+can be separated from the QEMU functions that manage the emulation of
132
+
117
+guest CPU instructions. The devices emulated in the separate process are
133
+
118
+referred to as remote devices.
134
+Brief overview of live block QMP primitives
119
+
120
+QEMU device emulation
121
+~~~~~~~~~~~~~~~~~~~~~
122
+
123
+QEMU uses an object oriented SW architecture for device emulation code.
124
+Configured objects are all compiled into the QEMU binary, then objects
125
+are instantiated by name when used by the guest VM. For example, the
126
+code to emulate a device named "foo" is always present in QEMU, but its
127
+instantiation code is only run when the device is included in the target
128
+VM. (e.g., via the QEMU command line as *-device foo*)
129
+
130
+The object model is hierarchical, so device emulation code names its
131
+parent object (such as "pci-device" for a PCI device) and QEMU will
132
+instantiate a parent object before calling the device's instantiation
133
+code.
134
+
135
+Current separation models
136
+~~~~~~~~~~~~~~~~~~~~~~~~~
137
+
138
+In order to separate the device emulation code from the CPU emulation
139
+code, the device object code must run in a different process. There are
140
+a couple of existing QEMU features that can run emulation code
141
+separately from the main QEMU process. These are examined below.
142
+
143
+vhost user model
144
+^^^^^^^^^^^^^^^^
145
+
146
+Virtio guest device drivers can be connected to vhost user applications
147
+in order to perform their IO operations. This model uses special virtio
148
+device drivers in the guest and vhost user device objects in QEMU, but
149
+once the QEMU vhost user code has configured the vhost user application,
150
+mission-mode IO is performed by the application. The vhost user
151
+application is a daemon process that can be contacted via a known UNIX
152
+domain socket.
153
+
154
+vhost socket
155
+''''''''''''
156
+
157
+As mentioned above, one of the tasks of the vhost device object within
158
+QEMU is to contact the vhost application and send it configuration
159
+information about this device instance. As part of the configuration
160
+process, the application can also be sent other file descriptors over
161
+the socket, which then can be used by the vhost user application in
162
+various ways, some of which are described below.
163
+
164
+vhost MMIO store acceleration
165
+'''''''''''''''''''''''''''''
166
+
167
+VMs are often run using HW virtualization features via the KVM kernel
168
+driver. This driver allows QEMU to accelerate the emulation of guest CPU
169
+instructions by running the guest in a virtual HW mode. When the guest
170
+executes instructions that cannot be executed by virtual HW mode,
171
+execution returns to the KVM driver so it can inform QEMU to emulate the
172
+instructions in SW.
173
+
174
+One of the events that can cause a return to QEMU is when a guest device
175
+driver accesses an IO location. QEMU then dispatches the memory
176
+operation to the corresponding QEMU device object. In the case of a
177
+vhost user device, the memory operation would need to be sent over a
178
+socket to the vhost application. This path is accelerated by the QEMU
179
+virtio code by setting up an eventfd file descriptor that the vhost
180
+application can directly receive MMIO store notifications from the KVM
181
+driver, instead of needing them to be sent to the QEMU process first.
182
+
183
+vhost interrupt acceleration
184
+''''''''''''''''''''''''''''
185
+
186
+Another optimization used by the vhost application is the ability to
187
+directly inject interrupts into the VM via the KVM driver, again,
188
+bypassing the need to send the interrupt back to the QEMU process first.
189
+The QEMU virtio setup code configures the KVM driver with an eventfd
190
+that triggers the device interrupt in the guest when the eventfd is
191
+written. This irqfd file descriptor is then passed to the vhost user
192
+application program.
193
+
194
+vhost access to guest memory
195
+''''''''''''''''''''''''''''
196
+
197
+The vhost application is also allowed to directly access guest memory,
198
+instead of needing to send the data as messages to QEMU. This is also
199
+done with file descriptors sent to the vhost user application by QEMU.
200
+These descriptors can be passed to ``mmap()`` by the vhost application
201
+to map the guest address space into the vhost application.
202
+
203
+IOMMUs introduce another level of complexity, since the address given to
204
+the guest virtio device to DMA to or from is not a guest physical
205
+address. This case is handled by having vhost code within QEMU register
206
+as a listener for IOMMU mapping changes. The vhost application maintains
207
+a cache of IOMMU translations: sending translation requests back to
208
+QEMU on cache misses, and in turn receiving flush requests from QEMU
209
+when mappings are purged.
210
+
211
+applicability to device separation
212
+''''''''''''''''''''''''''''''''''
213
+
214
+Much of the vhost model can be re-used by separated device emulation. In
215
+particular, the ideas of using a socket between QEMU and the device
216
+emulation application, using a file descriptor to inject interrupts into
217
+the VM via KVM, and allowing the application to ``mmap()`` the guest
218
+should be re-used.
219
+
220
+There are, however, some notable differences between how a vhost
221
+application works and the needs of separated device emulation. The most
222
+basic is that vhost uses custom virtio device drivers which always
223
+trigger IO with MMIO stores. A separated device emulation model must
224
+work with existing IO device models and guest device drivers. MMIO loads
225
+break vhost store acceleration since they are synchronous - guest
226
+progress cannot continue until the load has been emulated. By contrast,
227
+stores are asynchronous, the guest can continue after the store event
228
+has been sent to the vhost application.
229
+
230
+Another difference is that in the vhost user model, a single daemon can
231
+support multiple QEMU instances. This is contrary to the security regime
232
+desired, in which the emulation application should only be allowed to
233
+access the files or devices the VM it's running on behalf of can access.
234
+
+qemu-io model
+^^^^^^^^^^^^^
235
+
236
+Qemu-io is a test harness used to test changes to the QEMU block backend
237
+object code (e.g., the code that implements disk images for disk driver
238
+emulation). Qemu-io is not a device emulation application per se, but it
239
+does compile the QEMU block objects into a separate binary from the main
240
+QEMU one. This could be useful for disk device emulation, since its
241
+emulation applications will need to include the QEMU block objects.
242
+
243
+New separation model based on proxy objects
135
+-------------------------------------------
244
+-------------------------------------------
136
+
245
+
137
+The following are the four different kinds of live block operations that
246
+A different model based on proxy objects in the QEMU program
138
+QEMU block layer supports.
247
+communicating with remote emulation programs could provide separation
139
+
248
+while minimizing the changes needed to the device emulation code. The
140
+(1) ``block-stream``: Live copy of data from backing files into overlay
249
+rest of this section is a discussion of how a proxy object model would
141
+ files.
250
+work.
142
+
251
+
143
+ .. note:: Once the 'stream' operation has finished, three things to
252
+Remote emulation processes
144
+ note:
253
+~~~~~~~~~~~~~~~~~~~~~~~~~~
145
+
254
+
146
+ (a) QEMU rewrites the backing chain to remove
255
+The remote emulation process will run the QEMU object hierarchy without
147
+ reference to the now-streamed and redundant backing
256
+modification. The device emulation objects will also be based on the
148
+ file;
257
+QEMU code, because for anything but the simplest device, it would not be
149
+
258
+tractable to re-implement both the object model and the many device
150
+ (b) the streamed file *itself* won't be removed by QEMU,
259
+backends that QEMU has.
151
+ and must be explicitly discarded by the user;
260
+
152
+
261
+The processes will communicate with the QEMU process over UNIX domain
153
+ (c) the streamed file remains valid -- i.e. further
262
+sockets. The processes can be executed either as standalone processes,
154
+ overlays can be created based on it. Refer to the
263
+or be executed by QEMU. In both cases, the host backends the emulation
155
+ ``block-stream`` section further below for more
264
+processes will provide are specified on its command line, as they would
156
+ details.
265
+be for QEMU. For example:
157
+
266
+
158
+(2) ``block-commit``: Live merge of data from overlay files into backing
267
+::
159
+ files (with the optional goal of removing the overlay file from the
268
+
160
+ chain). Since QEMU 2.0, this includes "active ``block-commit``"
269
+ disk-proc -blockdev driver=file,node-name=file0,filename=disk-file0 \
161
+ (i.e. merge the current active layer into the base image).
270
+ -blockdev driver=qcow2,node-name=drive0,file=file0
162
+
271
+
163
+ .. note:: Once the 'commit' operation has finished, there are three
272
+would indicate process *disk-proc* uses a qcow2 emulated disk named
164
+ things to note here as well:
273
+*file0* as its backend.
165
+
274
+
166
+ (a) QEMU rewrites the backing chain to remove reference
275
+Emulation processes may emulate more than one guest controller. A common
167
+ to now-redundant overlay images that have been
276
+configuration might be to put all controllers of the same device class
168
+ committed into a backing file;
277
+(e.g., disk, network, etc.) in a single process, so that all backends of
169
+
278
+the same type can be managed by a single QMP monitor.
170
+ (b) the committed file *itself* won't be removed by QEMU
279
+
171
+ -- it ought to be manually removed;
280
+communication with QEMU
172
+
281
+^^^^^^^^^^^^^^^^^^^^^^^
173
+ (c) however, unlike in the case of ``block-stream``, the
282
+
174
+ intermediate images will be rendered invalid -- i.e.
283
+The first argument to the remote emulation process will be a Unix domain
175
+ no further overlays can be created based on
284
+socket that connects with the Proxy object. This is a required argument.
176
+ them. Refer to the ``block-commit`` section further
285
+
177
+ below for more details.
286
+::
178
+
287
+
179
+(3) ``drive-mirror`` (and ``blockdev-mirror``): Synchronize a running
288
+ disk-proc <socket number> <backend list>
180
+ disk to another image.
289
+
181
+
290
+remote process QMP monitor
182
+(4) ``drive-backup`` (and ``blockdev-backup``): Point-in-time (live) copy
291
+^^^^^^^^^^^^^^^^^^^^^^^^^^
183
+ of a block device to a destination.
292
+
184
+
293
+Remote emulation processes can be monitored via QMP, similar to QEMU
185
+
294
+itself. The QMP monitor socket is specified the same as for a QEMU
186
+.. _`Interacting with a QEMU instance`:
295
+process:
187
+
296
+
188
+Interacting with a QEMU instance
297
+::
189
+--------------------------------
298
+
190
+
299
+ disk-proc -qmp unix:/tmp/disk-mon,server
191
+To show some example invocations of command-line, we will use the
300
+
192
+following invocation of QEMU, with a QMP server running over UNIX
301
+can be monitored over the UNIX socket path */tmp/disk-mon*.
193
+socket::
302
+
194
+
303
+QEMU command line
195
+ $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
304
+~~~~~~~~~~~~~~~~~
196
+ -M q35 -nodefaults -m 512 \
305
+
197
+ -blockdev node-name=node-A,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./a.qcow2 \
306
+Each remote device emulated in a remote process on the host is
198
+ -device virtio-blk,drive=node-A,id=virtio0 \
307
+represented as a *-device* of type *pci-proxy-dev*. A socket
199
+ -monitor stdio -qmp unix:/tmp/qmp-sock,server,nowait
308
+sub-option to this option specifies the Unix socket that connects
200
+
309
+to the remote process. An *id* sub-option is required, and it should
201
+The ``-blockdev`` command-line option, used above, is available from
310
+be the same id as used in the remote process.
202
+QEMU 2.9 onwards. In the above invocation, notice the ``node-name``
311
+
203
+parameter that is used to refer to the disk image a.qcow2 ('node-A') --
312
+::
204
+this is a cleaner way to refer to a disk image (as opposed to referring
313
+
205
+to it by spelling out file paths). So, we will continue to designate a
314
+ qemu-system-x86_64 ... -device pci-proxy-dev,id=lsi0,socket=3
206
+``node-name`` to each further disk image created (either via
315
+
207
+``blockdev-snapshot-sync``, or ``blockdev-add``) as part of the disk
316
+can be used to add a device emulated in a remote process
208
+image chain, and continue to refer to the disks using their
317
+
209
+``node-name`` (where possible, because ``block-commit`` does not yet, as
318
+
210
+of QEMU 2.9, accept ``node-name`` parameter) when performing various
319
+QEMU management of remote processes
211
+block operations.
212
+
213
+To interact with the QEMU instance launched above, we will use the
214
+``qmp-shell`` utility (located at: ``qemu/scripts/qmp``, as part of the
215
+QEMU source directory), which takes key-value pairs for QMP commands.
216
+Invoke it as below (which will also print out the complete raw JSON
217
+syntax for reference -- examples in the following sections)::
218
+
219
+ $ ./qmp-shell -v -p /tmp/qmp-sock
220
+ (QEMU)
221
+
222
+.. note::
223
+ In the event we have to repeat a certain QMP command, we will: for
224
+ the first occurrence of it, show the ``qmp-shell`` invocation, *and*
225
+ the corresponding raw JSON QMP syntax; but for subsequent
226
+ invocations, present just the ``qmp-shell`` syntax, and omit the
227
+ equivalent JSON output.
228
+
229
+
230
+Example disk image chain
231
+------------------------
232
+
233
+We will use the below disk image chain (and occasionally spelling it
234
+out where appropriate) when discussing various primitives::
235
+
236
+ [A] <-- [B] <-- [C] <-- [D]
237
+
238
+Where [A] is the original base image; [B] and [C] are intermediate
239
+overlay images; image [D] is the active layer -- i.e. live QEMU is
240
+writing to it. (The rule of thumb is: live QEMU will always be pointing
241
+to the rightmost image in a disk image chain.)
242
+
243
+The above image chain can be created by invoking
244
+``blockdev-snapshot-sync`` commands as following (which shows the
245
+creation of overlay image [B]) using the ``qmp-shell`` (our invocation
246
+also prints the raw JSON invocation of it)::
247
+
248
+ (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
249
+ {
250
+ "execute": "blockdev-snapshot-sync",
251
+ "arguments": {
252
+ "node-name": "node-A",
253
+ "snapshot-file": "b.qcow2",
254
+ "format": "qcow2",
255
+ "snapshot-node-name": "node-B"
256
+ }
257
+ }
258
+
259
+Here, "node-A" is the name QEMU internally uses to refer to the base
260
+image [A] -- it is the backing file, based on which the overlay image,
261
+[B], is created.
262
+
263
+To create the rest of the overlay images, [C], and [D] (omitting the raw
264
+JSON output for brevity)::
265
+
266
+ (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
267
+ (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
268
+
269
+
270
+A note on points-in-time vs file names
271
+--------------------------------------
272
+
273
+In our disk image chain::
274
+
275
+ [A] <-- [B] <-- [C] <-- [D]
276
+
277
+We have *three* points in time and an active layer:
278
+
279
+- Point 1: Guest state when [B] was created is contained in file [A]
280
+- Point 2: Guest state when [C] was created is contained in [A] + [B]
281
+- Point 3: Guest state when [D] was created is contained in
282
+ [A] + [B] + [C]
283
+- Active layer: Current guest state is contained in [A] + [B] + [C] +
284
+ [D]
285
+
286
+Therefore, be careful with naming choices:
287
+
288
+- Naming a file after the time it is created is misleading -- the
289
+ guest data for that point in time is *not* contained in that file
290
+ (as explained earlier)
291
+- Rather, think of files as a *delta* from the backing file
292
+
293
+
294
+Live block streaming --- ``block-stream``
295
+-----------------------------------------
296
+
297
+The ``block-stream`` command allows you to live-copy data from backing
298
+files into overlay images.
299
+
300
+Given our original example disk image chain from earlier::
301
+
302
+ [A] <-- [B] <-- [C] <-- [D]
303
+
304
+The disk image chain can be shortened in one of the following different
305
+ways (not an exhaustive list).
306
+
307
+.. _`Case-1`:
308
+
309
+(1) Merge everything into the active layer: I.e. copy all contents from
310
+ the base image, [A], and overlay images, [B] and [C], into [D],
311
+ *while* the guest is running. The resulting chain will be a
312
+ standalone image, [D] -- with contents from [A], [B] and [C] merged
313
+ into it (where live QEMU writes go to)::
314
+
315
+ [D]
316
+
317
+.. _`Case-2`:
318
+
319
+(2) Taking the same example disk image chain mentioned earlier, merge
320
+ only images [B] and [C] into [D], the active layer. The result will
321
+ be that contents of images [B] and [C] will be copied into [D], and the
322
+ backing file pointer of image [D] will be adjusted to point to image
323
+ [A]. The resulting chain will be::
324
+
325
+ [A] <-- [D]
326
+
327
+.. _`Case-3`:
328
+
329
+(3) Intermediate streaming (available since QEMU 2.8): Starting afresh
330
+ with the original example disk image chain, with a total of four
331
+ images, it is possible to copy contents from image [B] into image
332
+ [C]. Once the copy is finished, image [B] can now be (optionally)
333
+ discarded; and the backing file pointer of image [C] will be
334
+ adjusted to point to [A]. I.e. after performing "intermediate
335
+ streaming" of [B] into [C], the resulting image chain will be (where
336
+ live QEMU is writing to [D])::
337
+
338
+ [A] <-- [C] <-- [D]
339
+
340
+
341
+QMP invocation for ``block-stream``
342
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
320
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
343
+
321
+
344
+For `Case-1`_, to merge contents of all the backing files into the
322
+QEMU is not aware of the type of the remote PCI device. It is
345
+active layer, where 'node-D' is the current active image (by default
323
+a pass through device as far as QEMU is concerned.
346
+``block-stream`` will flatten the entire chain); ``qmp-shell`` (and its
324
+
347
+corresponding JSON output)::
325
+communication with emulation process
348
+
326
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
349
+ (QEMU) block-stream device=node-D job-id=job0
327
+
350
+ {
328
+primary channel
351
+ "execute": "block-stream",
329
+'''''''''''''''
352
+ "arguments": {
330
+
353
+ "device": "node-D",
331
+The primary channel (referred to as com in the code) is used to bootstrap
354
+ "job-id": "job0"
332
+the remote process. It is also used to pass on device-agnostic commands
355
+ }
333
+like reset.
356
+ }
334
+
357
+
335
+per-device channels
358
+For `Case-2`_, merge contents of the images [B] and [C] into [D], where
336
+'''''''''''''''''''
359
+image [D] ends up referring to image [A] as its backing file::
337
+
360
+
338
+Each remote device communicates with QEMU using a dedicated communication
361
+ (QEMU) block-stream device=node-D base-node=node-A job-id=job0
339
+channel. The proxy object sets up this channel using the primary
362
+
340
+channel during its initialization.
363
+And for `Case-3`_, of "intermediate streaming", merge contents of
341
+
364
+images [B] into [C], where [C] ends up referring to [A] as its backing
342
+QEMU device proxy objects
365
+image::
343
+~~~~~~~~~~~~~~~~~~~~~~~~~
366
+
344
+
367
+ (QEMU) block-stream device=node-C base-node=node-A job-id=job0
345
+QEMU has an object model based on sub-classes inherited from the
368
+
346
+"object" super-class. The sub-classes that are of interest here are the
369
+Progress of a ``block-stream`` operation can be monitored via the QMP
347
+"device" and "bus" sub-classes whose child sub-classes make up the
370
+command::
348
+device tree of a QEMU emulated system.
371
+
349
+
372
+ (QEMU) query-block-jobs
350
+The proxy object model will use device proxy objects to replace the
373
+ {
351
+device emulation code within the QEMU process. These objects will live
374
+ "execute": "query-block-jobs",
352
+in the same place in the object and bus hierarchies as the objects they
375
+ "arguments": {}
353
+replace. i.e., the proxy object for an LSI SCSI controller will be a
376
+ }
354
+sub-class of the "pci-device" class, and will have the same PCI bus
377
+
355
+parent and the same SCSI bus child objects as the LSI controller object
378
+
356
+it replaces.
379
+Once the ``block-stream`` operation has completed, QEMU will emit an
357
+
380
+event, ``BLOCK_JOB_COMPLETED``. The intermediate overlays remain valid,
358
+It is worth noting that the same proxy object is used to mediate with
381
+and can now be (optionally) discarded, or retained to create further
359
+all types of remote PCI devices.
382
+overlays based on them. Finally, the ``block-stream`` jobs can be
360
+
383
+restarted at any time.
361
+object initialization
384
+
362
+^^^^^^^^^^^^^^^^^^^^^
385
+
363
+
386
+Live block commit --- ``block-commit``
364
+The Proxy device objects are initialized in the exact same manner in
387
+--------------------------------------
365
+which any other QEMU device would be initialized.
388
+
366
+
389
+The ``block-commit`` command lets you merge live data from overlay
367
+In addition, the Proxy objects perform the following two tasks:
390
+images into backing file(s). Since QEMU 2.0, this includes "live active
368
+- Parses the "socket" sub option and connects to the remote process
391
+commit" (i.e. it is possible to merge the "active layer", the right-most
369
+using this channel
392
+image in a disk image chain where live QEMU will be writing to, into the
370
+- Uses the "id" sub-option to connect to the emulated device on the
393
+base image). This is analogous to ``block-stream``, but in the opposite
371
+separate process
394
+direction.
372
+
395
+
373
+class\_init
396
+Again, starting afresh with our example disk image chain, where live
374
+'''''''''''
397
+QEMU is writing to the right-most image in the chain, [D]::
375
+
398
+
376
+The ``class_init()`` method of a proxy object will, in general, behave
399
+ [A] <-- [B] <-- [C] <-- [D]
377
+similarly to the object it replaces, including setting any static
400
+
378
+properties and methods needed by the proxy.
401
+The disk image chain can be shortened in one of the following ways:
379
+
402
+
380
+instance\_init / realize
403
+.. _`block-commit_Case-1`:
381
+''''''''''''''''''''''''
404
+
382
+
405
+(1) Commit content from only image [B] into image [A]. The resulting
383
+The ``instance_init()`` and ``realize()`` functions would only need to
406
+ chain is the following, where image [C] is adjusted to point at [A]
384
+perform tasks related to being a proxy, such as registering its own
407
+ as its new backing file::
385
+MMIO handlers, or creating a child bus that other proxy devices can be
408
+
386
+attached to later.
409
+ [A] <-- [C] <-- [D]
387
+
410
+
388
+Other tasks will be device-specific. For example, PCI device objects
411
+(2) Commit content from images [B] and [C] into image [A]. The
389
+will initialize the PCI config space in order to make a valid PCI device
412
+ resulting chain, where image [D] is adjusted to point to image [A]
390
+tree within the QEMU process.
413
+ as its new backing file::
391
+
414
+
392
+address space registration
415
+ [A] <-- [D]
393
+^^^^^^^^^^^^^^^^^^^^^^^^^^
416
+
394
+
417
+.. _`block-commit_Case-3`:
395
+Most devices are driven by guest device driver accesses to IO addresses
418
+
396
+or ports. The QEMU device emulation code uses QEMU's memory region
419
+(3) Commit content from images [B], [C], and the active layer [D] into
397
+function calls (such as ``memory_region_init_io()``) to add callback
420
+ image [A]. The resulting chain (in this case, a consolidated single
398
+functions that QEMU will invoke when the guest accesses the device's
421
+ image)::
399
+areas of the IO address space. When a guest driver does access the
422
+
400
+device, the VM will exit HW virtualization mode and return to QEMU,
423
+ [A]
401
+which will then look up and execute the corresponding callback function.
424
+
402
+
425
+(4) Commit content from only image [C] into image [B]. The
403
+A proxy object would need to mirror the memory region calls the actual
426
+ resulting chain::
404
+device emulator would perform in its initialization code, but with its
427
+
405
+own callbacks. When invoked by QEMU as a result of a guest IO operation,
428
+    [A] <-- [B] <-- [D]
406
+they will forward the operation to the device emulation process.
429
+
407
+
430
+(5) Commit content from image [C] and the active layer [D] into image
408
+PCI config space
431
+ [B]. The resulting chain::
409
+^^^^^^^^^^^^^^^^
432
+
410
+
433
+    [A] <-- [B]
411
+PCI devices also have a configuration space that can be accessed by the
434
+
412
+guest driver. Guest accesses to this space are not handled by the device
435
+
413
+emulation object, but by its PCI parent object. Much of this space is
436
+QMP invocation for ``block-commit``
414
+read-only, but certain registers (especially BAR and MSI-related ones)
437
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
415
+need to be propagated to the emulation process.
438
+
416
+
439
+For :ref:`Case-1 <block-commit_Case-1>`, to merge contents only from
417
+PCI parent proxy
440
+image [B] into image [A], the invocation is as follows::
418
+''''''''''''''''
441
+
419
+
442
+ (QEMU) block-commit device=node-D base=a.qcow2 top=b.qcow2 job-id=job0
420
+One way to propagate guest PCI config accesses is to create a
443
+ {
421
+"pci-device-proxy" class that can serve as the parent of a PCI device
444
+ "execute": "block-commit",
422
+proxy object. This class's parent would be "pci-device" and it would
445
+ "arguments": {
423
+override the PCI parent's ``config_read()`` and ``config_write()``
446
+ "device": "node-D",
424
+methods with ones that forward these operations to the emulation
447
+ "job-id": "job0",
425
+program.
448
+ "top": "b.qcow2",
426
+
449
+ "base": "a.qcow2"
427
+interrupt receipt
450
+ }
428
+^^^^^^^^^^^^^^^^^
451
+ }
429
+
452
+
430
+A proxy for a device that generates interrupts will need to create a
453
+Once the above ``block-commit`` operation has completed, a
431
+socket to receive interrupt indications from the emulation process. An
454
+``BLOCK_JOB_COMPLETED`` event will be issued, and no further action is
432
+incoming interrupt indication would then be sent up to its bus parent to
455
+required. As the end result, the backing file of image [C] is adjusted
433
+be injected into the guest. For example, a PCI device object may use
456
+to point to image [A], and the original 4-image chain will end up being
434
+``pci_set_irq()``.
457
+transformed to::
435
+
458
+
436
+live migration
459
+ [A] <-- [C] <-- [D]
437
+^^^^^^^^^^^^^^
460
+
438
+
461
+.. note::
439
+The proxy will register to save and restore any *vmstate* it needs over
462
+ The intermediate image [B] is invalid (as in: no further
440
+a live migration event. The device proxy does not need to manage the
463
+ overlays based on it can be created).
441
+remote device's *vmstate*; that will be handled by the remote process
464
+
442
+proxy (see below).
465
+ Reasoning: An intermediate image after a 'stream' operation still
443
+
466
+ represents that old point-in-time, and may be valid in that context.
444
+QEMU remote device operation
467
+ However, an intermediate image after a 'commit' operation no longer
468
+ represents any point-in-time, and is invalid in any context.
469
+
470
+
471
+However, :ref:`Case-3 <block-commit_Case-3>` (also called: "active
472
+``block-commit``") is a *two-phase* operation: In the first phase, the
473
+content from the active overlay, along with the intermediate overlays,
474
+is copied into the backing file (also called the base image). In the
475
+second phase, adjust the said backing file as the current active image
476
+-- possible via issuing the command ``block-job-complete``. Optionally,
477
+the ``block-commit`` operation can be cancelled by issuing the command
478
+``block-job-cancel``, but be careful when doing this.
479
+
480
+Once the ``block-commit`` operation has completed, the event
481
+``BLOCK_JOB_READY`` will be emitted, signalling that the synchronization
482
+has finished. Now the job can be gracefully completed by issuing the
483
+command ``block-job-complete`` -- until such a command is issued, the
484
+'commit' operation remains active.
485
+
486
+The following is the flow for :ref:`Case-3 <block-commit_Case-3>` to
487
+convert a disk image chain such as this::
488
+
489
+ [A] <-- [B] <-- [C] <-- [D]
490
+
491
+Into::
492
+
493
+ [A]
494
+
495
+Where content from all the subsequent overlays, [B], and [C], including
496
+the active layer, [D], is committed back to [A] -- which is where live
497
+QEMU is performing all its current writes.
498
+
499
+Start the "active ``block-commit``" operation::
500
+
501
+ (QEMU) block-commit device=node-D base=a.qcow2 top=d.qcow2 job-id=job0
502
+ {
503
+ "execute": "block-commit",
504
+ "arguments": {
505
+ "device": "node-D",
506
+ "job-id": "job0",
507
+ "top": "d.qcow2",
508
+ "base": "a.qcow2"
509
+ }
510
+ }
511
+
512
+
513
+Once the synchronization has completed, the event ``BLOCK_JOB_READY`` will
514
+be emitted.
515
+
516
+Then, optionally query for the status of the active block operations.
517
+We can see the 'commit' job is now ready to be completed, as indicated
518
+by the line *"ready": true*::
519
+
520
+ (QEMU) query-block-jobs
521
+ {
522
+ "execute": "query-block-jobs",
523
+ "arguments": {}
524
+ }
525
+ {
526
+ "return": [
527
+ {
528
+ "busy": false,
529
+ "type": "commit",
530
+ "len": 1376256,
531
+ "paused": false,
532
+ "ready": true,
533
+ "io-status": "ok",
534
+ "offset": 1376256,
535
+ "device": "job0",
536
+ "speed": 0
537
+ }
538
+ ]
539
+ }
540
+
541
+Gracefully complete the 'commit' block device job::
542
+
543
+ (QEMU) block-job-complete device=job0
544
+ {
545
+ "execute": "block-job-complete",
546
+ "arguments": {
547
+ "device": "job0"
548
+ }
549
+ }
550
+ {
551
+ "return": {}
552
+ }
553
+
554
+Finally, once the above job is completed, an event
555
+``BLOCK_JOB_COMPLETED`` will be emitted.
556
+
557
+.. note::
558
+ The invocation for rest of the cases (2, 4, and 5), discussed in the
559
+ previous section, is omitted for brevity.
560
+
561
+
562
+Live disk synchronization --- ``drive-mirror`` and ``blockdev-mirror``
563
+----------------------------------------------------------------------
564
+
565
+Synchronize a running disk image chain (all or part of it) to a target
566
+image.
567
+
568
+Again, given our familiar disk image chain::
569
+
570
+ [A] <-- [B] <-- [C] <-- [D]
571
+
572
+The ``drive-mirror`` (and its newer equivalent ``blockdev-mirror``) allows
573
+you to copy data from the entire chain into a single target image (which
574
+can be located on a different host).
575
+
576
+Once a 'mirror' job has started, there are two possible actions while a
577
+``drive-mirror`` job is active:
578
+
579
+(1) Issuing the command ``block-job-cancel`` after it emits the event
580
+ ``BLOCK_JOB_READY``: will (after completing synchronization of
581
+ the content from the disk image chain to the target image, [E])
582
+ create a point-in-time (which is at the time of *triggering* the
583
+ cancel command) copy, contained in image [E], of the entire disk
584
+ image chain (or only the top-most image, depending on the ``sync``
585
+ mode).
586
+
587
+(2) Issuing the command ``block-job-complete`` after it emits the event
588
+ ``BLOCK_JOB_READY``: will, after completing synchronization of
589
+ the content, adjust the guest device (i.e. live QEMU) to point to
590
+ the target image, causing all the new writes from this point on
591
+ to happen there. One use case for this is live storage migration.
592
+
593
+About synchronization modes: The synchronization mode determines
594
+*which* part of the disk image chain will be copied to the target.
595
+Currently, there are four different kinds:
596
+
597
+(1) ``full`` -- Synchronize the content of entire disk image chain to
598
+ the target
599
+
600
+(2) ``top`` -- Synchronize only the contents of the top-most disk image
601
+ in the chain to the target
602
+
603
+(3) ``none`` -- Synchronize only the new writes from this point on.
604
+
605
+ .. note:: In the case of ``drive-backup`` (or ``blockdev-backup``),
606
+ the behavior of ``none`` synchronization mode is different.
607
+ Normally, a ``backup`` job consists of two parts: Anything
608
+ that is overwritten by the guest is first copied out to
609
+ the backup, and in the background the whole image is
610
+ copied from start to end. With ``sync=none``, it's only
611
+ the first part.
612
+
613
+(4) ``incremental`` -- Synchronize content that is described by the
614
+ dirty bitmap
615
+
616
+.. note::
617
+ Refer to the :doc:`bitmaps` document in the QEMU source
618
+ tree to learn about the detailed workings of the ``incremental``
619
+ synchronization mode.
620
+
621
+
622
+QMP invocation for ``drive-mirror``
623
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
624
+
625
+To copy the contents of the entire disk image chain, from [A] all the
626
+way to [D], to a new target (``drive-mirror`` will create the destination
627
+file, if it doesn't already exist), call it [E]::
628
+
629
+ (QEMU) drive-mirror device=node-D target=e.qcow2 sync=full job-id=job0
630
+ {
631
+ "execute": "drive-mirror",
632
+ "arguments": {
633
+ "device": "node-D",
634
+ "job-id": "job0",
635
+ "target": "e.qcow2",
636
+ "sync": "full"
637
+ }
638
+ }
639
+
640
+The ``"sync": "full"``, from the above, means: copy the *entire* chain
641
+to the destination.
642
+
643
+Following the above, querying for active block jobs will show that a
644
+'mirror' job is "ready" to be completed (and QEMU will also emit an
645
+event, ``BLOCK_JOB_READY``)::
646
+
647
+ (QEMU) query-block-jobs
648
+ {
649
+ "execute": "query-block-jobs",
650
+ "arguments": {}
651
+ }
652
+ {
653
+ "return": [
654
+ {
655
+ "busy": false,
656
+ "type": "mirror",
657
+ "len": 21757952,
658
+ "paused": false,
659
+ "ready": true,
660
+ "io-status": "ok",
661
+ "offset": 21757952,
662
+ "device": "job0",
663
+ "speed": 0
664
+ }
665
+ ]
666
+ }
667
+
668
+And, as noted in the previous section, there are two possible actions
669
+at this point:
670
+
671
+(a) Create a point-in-time snapshot by ending the synchronization. The
672
+ point-in-time is at the time of *ending* the sync. (The result of
673
+ the following being: the target image, [E], will be populated with
674
+ content from the entire chain, [A] to [D])::
675
+
676
+ (QEMU) block-job-cancel device=job0
677
+ {
678
+ "execute": "block-job-cancel",
679
+ "arguments": {
680
+ "device": "job0"
681
+ }
682
+ }
683
+
684
+(b) Or, complete the operation and pivot the live QEMU to the target
685
+ copy::
686
+
687
+ (QEMU) block-job-complete device=job0
688
+
689
+In either of the above cases, if you once again run the
690
+``query-block-jobs`` command, there should not be any active block
691
+operation.
692
+
693
+Comparing 'commit' and 'mirror': In both the cases, the overlay images
694
+can be discarded. However, with 'commit', the *existing* base image
695
+will be modified (by updating it with contents from overlays); while in
696
+the case of 'mirror', a *new* target image is populated with the data
697
+from the disk image chain.
698
+
699
+
700
+QMP invocation for live storage migration with ``drive-mirror`` + NBD
701
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
702
+
703
+Live storage migration (without shared storage setup) is one of the most
704
+common use-cases that takes advantage of the ``drive-mirror`` primitive
705
+and QEMU's built-in Network Block Device (NBD) server. Here's a quick
706
+walk-through of this setup.
707
+
708
+Given the disk image chain::
709
+
710
+ [A] <-- [B] <-- [C] <-- [D]
711
+
712
+Instead of copying content from the entire chain, synchronize *only* the
713
+contents of the *top*-most disk image (i.e. the active layer), [D], to a
714
+target, say, [TargetDisk].
715
+
716
+.. important::
717
+ The destination host must already have the contents of the backing
718
+ chain, involving images [A], [B], and [C], visible via other means
719
+ -- whether by ``cp``, ``rsync``, or by some storage array-specific
720
+ command.
721
+
722
+Sometimes, this is also referred to as "shallow copy" -- because only
723
+the "active layer", and not the rest of the image chain, is copied to
724
+the destination.
725
+
726
+.. note::
727
+ In this example, for the sake of simplicity, we'll be using the same
728
+ ``localhost`` as both source and destination.
729
+
730
+As noted earlier, on the destination host the contents of the backing
731
+chain -- from images [A] to [C] -- are already expected to exist in some
732
+form (e.g. in a file called, ``Contents-of-A-B-C.qcow2``). Now, on the
733
+destination host, let's create a target overlay image (with the image
734
+``Contents-of-A-B-C.qcow2`` as its backing file), to which the contents
735
+of image [D] (from the source QEMU) will be mirrored::
736
+
737
+ $ qemu-img create -f qcow2 -b ./Contents-of-A-B-C.qcow2 \
738
+ -F qcow2 ./target-disk.qcow2
739
+
740
+And start the destination QEMU (we already have the source QEMU running
741
+-- discussed in the section: `Interacting with a QEMU instance`_)
742
+instance, with the following invocation. (As noted earlier, for
743
+simplicity's sake, the destination QEMU is started on the same host, but
744
+it could be located elsewhere)::
745
+
746
+ $ ./x86_64-softmmu/qemu-system-x86_64 -display none -nodefconfig \
747
+ -M q35 -nodefaults -m 512 \
748
+ -blockdev node-name=node-TargetDisk,driver=qcow2,file.driver=file,file.node-name=file,file.filename=./target-disk.qcow2 \
749
+ -device virtio-blk,drive=node-TargetDisk,id=virtio0 \
750
+ -S -monitor stdio -qmp unix:./qmp-sock2,server,nowait \
751
+ -incoming tcp:localhost:6666
752
+
753
+Given the disk image chain on source QEMU::
754
+
755
+ [A] <-- [B] <-- [C] <-- [D]
756
+
757
+On the destination host, it is expected that the contents of the chain
758
+``[A] <-- [B] <-- [C]`` are *already* present, and therefore copy *only*
759
+the content of image [D].
760
+
761
+(1) [On *destination* QEMU] As part of the first step, start the
762
+ built-in NBD server on a given host (local host, represented by
763
+ ``::``) and port::
764
+
765
+ (QEMU) nbd-server-start addr={"type":"inet","data":{"host":"::","port":"49153"}}
766
+ {
767
+ "execute": "nbd-server-start",
768
+ "arguments": {
769
+ "addr": {
770
+ "data": {
771
+ "host": "::",
772
+ "port": "49153"
773
+ },
774
+ "type": "inet"
775
+ }
776
+ }
777
+ }
778
+
779
+(2) [On *destination* QEMU] And export the destination disk image using
780
+ QEMU's built-in NBD server::
781
+
782
+ (QEMU) nbd-server-add device=node-TargetDisk writable=true
783
+ {
784
+ "execute": "nbd-server-add",
785
+ "arguments": {
786
+ "device": "node-TargetDisk"
787
+ }
788
+ }
789
+
790
+(3) [On *source* QEMU] Then, invoke ``drive-mirror`` (NB: since we're
791
+ running ``drive-mirror`` with ``mode=existing`` (meaning:
792
+ synchronize to a pre-created file, therefore 'existing', file on the
793
+ target host), with the synchronization mode as 'top' (``"sync":
794
+ "top"``)::
795
+
796
+ (QEMU) drive-mirror device=node-D target=nbd:localhost:49153:exportname=node-TargetDisk sync=top mode=existing job-id=job0
797
+ {
798
+ "execute": "drive-mirror",
799
+ "arguments": {
800
+ "device": "node-D",
801
+ "mode": "existing",
802
+ "job-id": "job0",
803
+ "target": "nbd:localhost:49153:exportname=node-TargetDisk",
804
+ "sync": "top"
805
+ }
806
+ }
807
+
808
+(4) [On *source* QEMU] Once ``drive-mirror`` copies the entire data, and the
809
+ event ``BLOCK_JOB_READY`` is emitted, issue ``block-job-cancel`` to
810
+ gracefully end the synchronization, from source QEMU::
811
+
812
+ (QEMU) block-job-cancel device=job0
813
+ {
814
+ "execute": "block-job-cancel",
815
+ "arguments": {
816
+ "device": "job0"
817
+ }
818
+ }
819
+
820
+(5) [On *destination* QEMU] Then, stop the NBD server::
821
+
822
+ (QEMU) nbd-server-stop
823
+ {
824
+ "execute": "nbd-server-stop",
825
+ "arguments": {}
826
+ }
827
+
828
+(6) [On *destination* QEMU] Finally, resume the guest vCPUs by issuing the
829
+ QMP command ``cont``::
830
+
831
+ (QEMU) cont
832
+ {
833
+ "execute": "cont",
834
+ "arguments": {}
835
+ }
836
+
837
+.. note::
838
+ Higher-level libraries (e.g. libvirt) automate the entire above
839
+ process (although note that libvirt does not allow same-host
840
+ migrations to localhost for other reasons).
841
+
842
+
843
+Notes on ``blockdev-mirror``
844
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
445
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
845
+
446
+
846
+The ``blockdev-mirror`` command is equivalent in core functionality to
447
+Generic device operations, such as DMA, will be performed by the remote
847
+``drive-mirror``, except that it operates at node-level in a BDS graph.
448
+process proxy by sending messages to the remote process.
848
+
449
+
849
+Also: for ``blockdev-mirror``, the 'target' image needs to be explicitly
450
+DMA operations
850
+created (using ``qemu-img``) and attached to live QEMU via
451
+^^^^^^^^^^^^^^
851
+``blockdev-add``, which assigns a name to the to-be created target node.
452
+
852
+
453
+DMA operations would be handled much like vhost applications do. One of
853
+E.g. the sequence of actions to create a point-in-time backup of an
454
+the initial messages sent to the emulation process is a guest memory
854
+entire disk image chain, to a target, using ``blockdev-mirror`` would be:
455
+table. Each entry in this table consists of a file descriptor and size
855
+
456
+that the emulation process can ``mmap()`` to directly access guest
856
+(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
457
+memory, similar to ``vhost_user_set_mem_table()``. Note guest memory
857
+ depth
458
+must be backed by file descriptors, such as when QEMU is given the
858
+
459
+*-mem-path* command line option.
859
+(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
460
+
860
+
461
+IOMMU operations
861
+(2) Attach the above created file (``e.qcow2``), run-time, using
462
+^^^^^^^^^^^^^^^^
862
+ ``blockdev-add`` to QEMU
463
+
863
+
464
+When the emulated system includes an IOMMU, the remote process proxy in
864
+(3) Perform ``blockdev-mirror`` (use ``"sync": "full"`` to copy the
465
+QEMU will need to create a socket for IOMMU requests from the emulation
865
+ entire chain to the target). And notice the event
466
+process. It will handle those requests with an
866
+ ``BLOCK_JOB_READY``
467
+``address_space_get_iotlb_entry()`` call. In order to handle IOMMU
867
+
468
+unmaps, the remote process proxy will also register as a listener on the
868
+(4) Optionally, query for active block jobs, there should be a 'mirror'
469
+device's DMA address space. When an IOMMU memory region is created
869
+ job ready to be completed
470
+within the DMA address space, an IOMMU notifier for unmaps will be added
870
+
471
+to the memory region that will forward unmaps to the emulation process
871
+(5) Gracefully complete the 'mirror' block device job, and notice the
472
+over the IOMMU socket.
872
+ event ``BLOCK_JOB_COMPLETED``
473
+
873
+
474
+device hot-plug via QMP
874
+(6) Shutdown the guest by issuing the QMP ``quit`` command so that
475
+^^^^^^^^^^^^^^^^^^^^^^^
875
+ caches are flushed
476
+
876
+
477
+A QMP "device\_add" command can add a device emulated by a remote
877
+(7) Then, finally, compare the contents of the disk image chain, and
478
+process. It will also have a "rid" option to the command, just as the
878
+ the target copy with ``qemu-img compare``. You should notice:
479
+*-device* command line option does. The remote process may either be one
879
+ "Images are identical"
480
+started at QEMU startup, or be one added by the "add-process" QMP
880
+
481
+command described above. In either case, the remote process proxy will
881
+
482
+forward the new device's JSON description to the corresponding emulation
882
+QMP invocation for ``blockdev-mirror``
483
+process.
883
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
484
+
884
+
485
+live migration
885
+Given the disk image chain::
486
+^^^^^^^^^^^^^^
886
+
487
+
887
+ [A] <-- [B] <-- [C] <-- [D]
488
+The remote process proxy will also register for live migration
888
+
489
+notifications with ``vmstate_register()``. When called to save state,
889
+To copy the contents of the entire disk image chain, from [A] all the
490
+the proxy will send the remote process a secondary socket file
890
+way to [D], to a new target, call it [E]. The following is the flow.
491
+descriptor to save the remote process's device *vmstate* over. The
891
+
492
+incoming byte stream length and data will be saved as the proxy's
892
+Create the overlay images, [B], [C], and [D]::
493
+*vmstate*. When the proxy is resumed on its new host, this *vmstate*
893
+
494
+will be extracted, and a secondary socket file descriptor will be sent
894
+ (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
495
+to the new remote process through which it receives the *vmstate* in
895
+ (QEMU) blockdev-snapshot-sync node-name=node-B snapshot-file=c.qcow2 snapshot-node-name=node-C format=qcow2
496
+order to restore the devices there.
896
+ (QEMU) blockdev-snapshot-sync node-name=node-C snapshot-file=d.qcow2 snapshot-node-name=node-D format=qcow2
497
+
897
+
498
+device emulation in remote process
898
+Create the target image, [E]::
499
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
899
+
500
+
900
+ $ qemu-img create -f qcow2 e.qcow2 39M
501
+The parts of QEMU that the emulation program will need include the
901
+
502
+object model; the memory emulation objects; the device emulation objects
902
+Add the above created target image to QEMU, via ``blockdev-add``::
503
+of the targeted device, and any dependent devices; and, the device's
903
+
504
+backends. It will also need code to setup the machine environment,
904
+ (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
505
+handle requests from the QEMU process, and route machine-level requests
905
+ {
506
+(such as interrupts or IOMMU mappings) back to the QEMU process.
906
+ "execute": "blockdev-add",
507
+
907
+ "arguments": {
508
+initialization
908
+ "node-name": "node-E",
509
+^^^^^^^^^^^^^^
909
+ "driver": "qcow2",
510
+
910
+ "file": {
511
+The process initialization sequence will follow the same sequence
911
+ "driver": "file",
512
+followed by QEMU. It will first initialize the backend objects, then
912
+ "filename": "e.qcow2"
513
+device emulation objects. The JSON descriptions sent by the QEMU process
913
+ }
514
+will drive which objects need to be created.
914
+ }
515
+
915
+ }
516
+- address spaces
916
+
517
+
917
+Perform ``blockdev-mirror``, and notice the event ``BLOCK_JOB_READY``::
518
+Before the device objects are created, the initial address spaces and
918
+
519
+memory regions must be configured with ``memory_map_init()``. This
919
+ (QEMU) blockdev-mirror device=node-D target=node-E sync=full job-id=job0
520
+creates a RAM memory region object (*system\_memory*) and an IO memory
920
+ {
521
+region object (*system\_io*).
921
+ "execute": "blockdev-mirror",
522
+
922
+ "arguments": {
523
+- RAM
923
+ "device": "node-D",
524
+
924
+ "job-id": "job0",
525
+RAM memory region creation will follow how ``pc_memory_init()`` creates
925
+ "target": "node-E",
526
+them, but must use ``memory_region_init_ram_from_fd()`` instead of
926
+ "sync": "full"
527
+``memory_region_allocate_system_memory()``. The file descriptors needed
927
+ }
528
+will be supplied by the guest memory table from above. Those RAM regions
928
+ }
529
+would then be added to the *system\_memory* memory region with
929
+
530
+``memory_region_add_subregion()``.
930
+Query for active block jobs, there should be a 'mirror' job ready::
531
+
931
+
532
+- PCI
932
+ (QEMU) query-block-jobs
533
+
933
+ {
534
+IO initialization will be driven by the JSON descriptions sent from the
934
+ "execute": "query-block-jobs",
535
+QEMU process. For a PCI device, a PCI bus will need to be created with
935
+ "arguments": {}
536
+``pci_root_bus_new()``, and a PCI memory region will need to be created
936
+ }
537
+and added to the *system\_memory* memory region with
937
+ {
538
+``memory_region_add_subregion_overlap()``. The overlap version is
938
+ "return": [
539
+required for architectures where PCI memory overlaps with RAM memory.
939
+ {
540
+
940
+ "busy": false,
541
+MMIO handling
941
+ "type": "mirror",
542
+^^^^^^^^^^^^^
942
+ "len": 21561344,
543
+
943
+ "paused": false,
544
+The device emulation objects will use ``memory_region_init_io()`` to
944
+ "ready": true,
545
+install their MMIO handlers, and ``pci_register_bar()`` to associate
945
+ "io-status": "ok",
546
+those handlers with a PCI BAR, as they do within QEMU currently.
946
+ "offset": 21561344,
547
+
947
+ "device": "job0",
548
+In order to use ``address_space_rw()`` in the emulation process to
948
+ "speed": 0
549
+handle MMIO requests from QEMU, the PCI physical addresses must be the
949
+ }
550
+same in the QEMU process and the device emulation process. In order to
950
+ ]
551
+accomplish that, guest BAR programming must also be forwarded from QEMU
951
+ }
552
+to the emulation process.
952
+
553
+
953
+Gracefully complete the block device job operation, and notice the
554
+interrupt injection
954
+event ``BLOCK_JOB_COMPLETED``::
555
+^^^^^^^^^^^^^^^^^^^
955
+
556
+
956
+ (QEMU) block-job-complete device=job0
557
+When device emulation wants to inject an interrupt into the VM, the
957
+ {
558
+request climbs the device's bus object hierarchy until the point where a
958
+ "execute": "block-job-complete",
559
+bus object knows how to signal the interrupt to the guest. The details
959
+ "arguments": {
560
+depend on the type of interrupt being raised.
960
+ "device": "job0"
561
+
961
+ }
562
+- PCI pin interrupts
962
+ }
563
+
963
+ {
564
+On x86 systems, there is an emulated IOAPIC object attached to the root
964
+ "return": {}
565
+PCI bus object, and the root PCI object forwards interrupt requests to
965
+ }
566
+it. The IOAPIC object, in turn, calls the KVM driver to inject the
966
+
567
+corresponding interrupt into the VM. The simplest way to handle this in
967
+Shutdown the guest, by issuing the ``quit`` QMP command::
568
+an emulation process would be to setup the root PCI bus driver (via
968
+
569
+``pci_bus_irqs()``) to send an interrupt request back to the QEMU
969
+ (QEMU) quit
570
+process, and have the device proxy object reflect it up the PCI tree
970
+ {
571
+there.
971
+ "execute": "quit",
572
+
972
+ "arguments": {}
573
+- PCI MSI/X interrupts
973
+ }
574
+
974
+
575
+PCI MSI/X interrupts are implemented in HW as DMA writes to a
975
+
576
+CPU-specific PCI address. In QEMU on x86, a KVM APIC object receives
976
+Live disk backup --- ``drive-backup`` and ``blockdev-backup``
577
+these DMA writes, then calls into the KVM driver to inject the interrupt
977
+-------------------------------------------------------------
578
+into the VM. A simple emulation process implementation would be to send
978
+
579
+the MSI DMA address from QEMU as a message at initialization, then
979
+The ``drive-backup`` (and its newer equivalent ``blockdev-backup``) allows
580
+install an address space handler at that address which forwards the MSI
980
+you to create a point-in-time snapshot.
581
+message back to QEMU.
981
+
582
+
982
+In this case, the point-in-time is when you *start* the ``drive-backup``
583
+DMA operations
983
+(or its newer equivalent ``blockdev-backup``) command.
584
+^^^^^^^^^^^^^^
984
+
585
+
985
+
586
+When an emulation object wants to DMA into or out of guest memory, it
986
+QMP invocation for ``drive-backup``
587
+first must use dma\_memory\_map() to convert the DMA address to a local
987
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
588
+virtual address. The emulation process memory region objects setup above
988
+
589
+will be used to translate the DMA address to a local virtual address the
989
+Yet again, starting afresh with our example disk image chain::
590
+device emulation code can access.
990
+
591
+
991
+ [A] <-- [B] <-- [C] <-- [D]
592
+IOMMU
992
+
593
+^^^^^
993
+To create a target image [E], with content populated from image [A] to
594
+
994
+[D], from the above chain, the following is the syntax. (If the target
595
+When an IOMMU is in use in QEMU, DMA translation uses IOMMU memory
995
+image does not exist, ``drive-backup`` will create it)::
596
+regions to translate the DMA address to a guest physical address before
996
+
597
+that physical address can be translated to a local virtual address. The
997
+ (QEMU) drive-backup device=node-D sync=full target=e.qcow2 job-id=job0
598
+emulation process will need similar functionality.
998
+ {
599
+
999
+ "execute": "drive-backup",
600
+- IOTLB cache
1000
+ "arguments": {
601
+
1001
+ "device": "node-D",
602
+The emulation process will maintain a cache of recent IOMMU translations
1002
+ "job-id": "job0",
603
+(the IOTLB). When the translate() callback of an IOMMU memory region is
1003
+ "sync": "full",
604
+invoked, the IOTLB cache will be searched for an entry that will map the
1004
+ "target": "e.qcow2"
605
+DMA address to a guest PA. On a cache miss, a message will be sent back
1005
+ }
606
+to QEMU requesting the corresponding translation entry, which will both be
1006
+ }
607
+used to return a guest address and be added to the cache.
1007
+
608
+
1008
+Once the above ``drive-backup`` has completed, a ``BLOCK_JOB_COMPLETED`` event
609
+- IOTLB purge
1009
+will be issued, indicating the live block device job operation has
610
+
1010
+completed, and no further action is required.
611
+The IOMMU emulation will also need to act on unmap requests from QEMU.
1011
+
612
+These happen when the guest IOMMU driver purges an entry from the
1012
+
613
+guest's translation table.
1013
+Notes on ``blockdev-backup``
614
+
615
+live migration
616
+^^^^^^^^^^^^^^
617
+
618
+When a remote process receives a live migration indication from QEMU, it
619
+will set up a channel using the received file descriptor with
620
+``qio_channel_socket_new_fd()``. This channel will be used to create a
621
+*QEMUfile* that can be passed to ``qemu_save_device_state()`` to send
622
+the process's device state back to QEMU. This method will be reversed on
623
+restore - the channel will be passed to ``qemu_loadvm_state()`` to
624
+restore the device state.
625
+
626
+Accelerating device emulation
627
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
628
+
629
+The messages that are required to be sent between QEMU and the emulation
630
+process can add considerable latency to IO operations. The optimizations
631
+described below attempt to ameliorate this effect by allowing the
632
+emulation process to communicate directly with the kernel KVM driver.
633
+The KVM file descriptors created would be passed to the emulation process
634
+via initialization messages, much like the guest memory table is done.
635
+
+MMIO acceleration
+^^^^^^^^^^^^^^^^^
636
+
637
+Vhost user applications can receive guest virtio driver stores directly
638
+from KVM. The issue with the eventfd mechanism used by vhost user is
639
+that it does not pass any data with the event indication, so it cannot
640
+handle guest loads or guest stores that carry store data. This concept
641
+could, however, be expanded to cover more cases.
642
+
643
+The expanded idea would require a new type of KVM device:
644
+*KVM\_DEV\_TYPE\_USER*. This device has two file descriptors: a master
645
+descriptor that QEMU can use for configuration, and a slave descriptor
646
+that the emulation process can use to receive MMIO notifications. QEMU
647
+would create both descriptors using the KVM driver, and pass the slave
648
+descriptor to the emulation process via an initialization message.
649
+
650
+data structures
651
+^^^^^^^^^^^^^^^
652
+
653
+- guest physical range
654
+
655
+The guest physical range structure describes the address range that a
656
+device will respond to. It includes the base and length of the range, as
657
+well as which bus the range resides on (e.g., on an x86 machine, it can
658
+specify whether the range refers to memory or IO addresses).
659
+
660
+A device can have multiple physical address ranges it responds to (e.g.,
661
+a PCI device can have multiple BARs), so the structure will also include
662
+an enumerated identifier to specify which of the device's ranges is
663
+being referred to.
664
+
665
++--------+----------------------------+
666
+| Name | Description |
667
++========+============================+
668
+| addr | range base address |
669
++--------+----------------------------+
670
+| len | range length |
671
++--------+----------------------------+
672
+| bus | addr type (memory or IO) |
673
++--------+----------------------------+
674
+| id | range ID (e.g., PCI BAR) |
675
++--------+----------------------------+
676
+
677
+- MMIO request structure
678
+
679
+This structure describes an MMIO operation. It includes which guest
680
+physical range the MMIO was within, the offset within that range, the
681
+MMIO type (e.g., load or store), and its length and data. It also
682
+includes a sequence number that can be used to reply to the MMIO, and
683
+the CPU that issued the MMIO.
684
+
685
++----------+------------------------+
686
+| Name | Description |
687
++==========+========================+
688
+| rid | range MMIO is within |
689
++----------+------------------------+
690
+| offset | offset within *rid* |
691
++----------+------------------------+
692
+| type | e.g., load or store |
693
++----------+------------------------+
694
+| len | MMIO length |
695
++----------+------------------------+
696
+| data | store data |
697
++----------+------------------------+
698
+| seq | sequence ID |
699
++----------+------------------------+
700
+
701
+- MMIO request queues
702
+
703
+MMIO request queues are FIFO arrays of MMIO request structures. There
704
+are two queues: the pending queue is for MMIOs that haven't been read by the
705
+emulation program, and the sent queue is for MMIOs that haven't been
706
+acknowledged. The main use of the second queue is to validate MMIO
707
+replies from the emulation program.
708
+
709
+- scoreboard
710
+
711
+Each CPU in the VM is emulated in QEMU by a separate thread, so multiple
712
+MMIOs may be waiting to be consumed by an emulation program and multiple
713
+threads may be waiting for MMIO replies. The scoreboard would contain a
714
+wait queue and sequence number for the per-CPU threads, allowing them to
715
+be individually woken when the MMIO reply is received from the emulation
716
+program. It also tracks the number of posted MMIO stores to the device
717
+that haven't been replied to, in order to satisfy the PCI constraint
718
+that a load to a device will not complete until all previous stores to
719
+that device have been completed.
720
+
721
+- device shadow memory
722
+
723
+Some MMIO loads do not have device side-effects. These MMIOs can be
724
+completed without sending a MMIO request to the emulation program if the
725
+emulation program shares a shadow image of the device's memory image
726
+with the KVM driver.
727
+
728
+The emulation program will ask the KVM driver to allocate memory for the
729
+shadow image, and will then use ``mmap()`` to directly access it. The
730
+emulation program can control KVM access to the shadow image by sending
731
+KVM an access map telling it which areas of the image have no
732
+side-effects (and can be completed immediately), and which require a
733
+MMIO request to the emulation program. The access map can also inform
734
+the KVM driver which size accesses are allowed to the image.
735
+
736
+master descriptor
737
+^^^^^^^^^^^^^^^^^
738
+
739
+The master descriptor is used by QEMU to configure the new KVM device.
740
+The descriptor would be returned by the KVM driver when QEMU issues a
741
+*KVM\_CREATE\_DEVICE* ``ioctl()`` with a *KVM\_DEV\_TYPE\_USER* type.
742
+
743
+KVM\_DEV\_TYPE\_USER device ops
744
+
745
+
746
+The *KVM\_DEV\_TYPE\_USER* operations vector will be registered by a
747
+``kvm_register_device_ops()`` call when the KVM system is initialized by
748
+``kvm_init()``. These device ops are called by the KVM driver when QEMU
749
+executes certain ``ioctl()`` operations on its KVM file descriptor. They
750
+include:
751
+
752
+- create
753
+
754
+This routine is called when QEMU issues a *KVM\_CREATE\_DEVICE*
755
+``ioctl()`` on its per-VM file descriptor. It will allocate and
756
+initialize a KVM user device specific data structure, and assign the
757
+*kvm\_device* private field to it.
758
+
759
+- ioctl
760
+
761
+This routine is invoked when QEMU issues an ``ioctl()`` on the master
762
+descriptor. The ``ioctl()`` commands supported are defined by the KVM
763
+device type. *KVM\_DEV\_TYPE\_USER* ones will need several commands:
764
+
765
+*KVM\_DEV\_USER\_SLAVE\_FD* creates the slave file descriptor that will
766
+be passed to the device emulation program. Only one slave can be created
767
+by each master descriptor. The file operations performed by this
768
+descriptor are described below.
769
+
770
+The *KVM\_DEV\_USER\_PA\_RANGE* command configures a guest physical
771
+address range that the slave descriptor will receive MMIO notifications
772
+for. The range is specified by a guest physical range structure
773
+argument. For buses that assign addresses to devices dynamically, this
774
+command can be executed while the guest is running, such as the case
775
+when a guest changes a device's PCI BAR registers.
776
+
777
+*KVM\_DEV\_USER\_PA\_RANGE* will use ``kvm_io_bus_register_dev()`` to
778
+register *kvm\_io\_device\_ops* callbacks to be invoked when the guest
779
+performs a MMIO operation within the range. When a range is changed,
780
+``kvm_io_bus_unregister_dev()`` is used to remove the previous
781
+instantiation.
782
+
783
+*KVM\_DEV\_USER\_TIMEOUT* will configure a timeout value that specifies
784
+how long KVM will wait for the emulation process to respond to a MMIO
785
+indication.
786
+
787
+- destroy
788
+
789
+This routine is called when the VM instance is destroyed. It will need
790
+to destroy the slave descriptor; and free any memory allocated by the
791
+driver, as well as the *kvm\_device* structure itself.
792
+
793
+slave descriptor
794
+^^^^^^^^^^^^^^^^
795
+
796
+The slave descriptor will have its own file operations vector, which
797
+responds to system calls on the descriptor performed by the device
798
+emulation program.
799
+
800
+- read
801
+
802
+A read returns any pending MMIO requests from the KVM driver as MMIO
803
+request structures. Multiple structures can be returned if there are
804
+multiple MMIO operations pending. The MMIO requests are moved from the
805
+pending queue to the sent queue, and if there are threads waiting for
806
+space in the pending queue to add new MMIO operations, they will be woken
807
+here.
808
+
809
+- write
810
+
811
+A write also consists of a set of MMIO requests. They are compared to
812
+the MMIO requests in the sent queue. Matches are removed from the sent
813
+queue, and any threads waiting for the reply are woken. If a store is
814
+removed, then the number of posted stores in the per-CPU scoreboard is
815
+decremented. When the number is zero, and a non side-effect load was
816
+waiting for posted stores to complete, the load is continued.
817
+
818
+- ioctl
819
+
820
+There are several ioctl()s that can be performed on the slave
821
+descriptor.
822
+
823
+A *KVM\_DEV\_USER\_SHADOW\_SIZE* ``ioctl()`` causes the KVM driver to
824
+allocate memory for the shadow image. This memory can later be
825
+``mmap()``\ ed by the emulation process to share the emulation's view of
826
+device memory with the KVM driver.
827
+
828
+A *KVM\_DEV\_USER\_SHADOW\_CTRL* ``ioctl()`` controls access to the
829
+shadow image. It will send the KVM driver a shadow control map, which
830
+specifies which areas of the image can complete guest loads without
831
+sending the load request to the emulation program. It will also specify
832
+the size of load operations that are allowed.
833
+
834
+- poll
835
+
836
+An emulation program will use the ``poll()`` call with a *POLLIN* flag
837
+to determine if there are MMIO requests waiting to be read. It will
838
+return if the pending MMIO request queue is not empty.
839
+
840
+- mmap
841
+
842
+This call allows the emulation program to directly access the shadow
843
+image allocated by the KVM driver. As device emulation updates device
844
+memory, changes with no side-effects will be reflected in the shadow,
845
+and the KVM driver can satisfy guest loads from the shadow image without
846
+needing to wait for the emulation program.
847
+
848
+kvm\_io\_device ops
849
+^^^^^^^^^^^^^^^^^^^
850
+
851
+Each KVM per-CPU thread can handle MMIO operation on behalf of the guest
852
+VM. KVM will use the MMIO's guest physical address to search for a
853
+matching *kvm\_io\_device* to see if the MMIO can be handled by the KVM
854
+driver instead of exiting back to QEMU. If a match is found, the
855
+corresponding callback will be invoked.
856
+
857
+- read
858
+
859
+This callback is invoked when the guest performs a load to the device.
860
+Loads with side-effects must be handled synchronously, with the KVM
861
+driver putting the QEMU thread to sleep waiting for the emulation
862
+process reply before re-starting the guest. Loads that do not have
863
+side-effects may be optimized by satisfying them from the shadow image,
864
+if there are no outstanding stores to the device by this CPU. PCI memory
865
+ordering demands that a load cannot complete before all older stores to
866
+the same device have been completed.
867
+
868
+- write
869
+
870
+Stores can be handled asynchronously unless the pending MMIO request
871
+queue is full. In this case, the QEMU thread must sleep waiting for
872
+space in the queue. Stores will increment the number of posted stores in
873
+the per-CPU scoreboard, in order to implement the PCI ordering
874
+constraint above.
875
+
876
+interrupt acceleration
877
+^^^^^^^^^^^^^^^^^^^^^^
878
+
879
+This performance optimization would work much like a vhost user
880
+application does, where the QEMU process sets up *eventfds* that cause
881
+the device's corresponding interrupt to be triggered by the KVM driver.
882
+These irq file descriptors are sent to the emulation process at
883
+initialization, and are used when the emulation code raises a device
884
+interrupt.
885
+
886
+intx acceleration
887
+'''''''''''''''''
888
+
889
+Traditional PCI pin interrupts are level based, so, in addition to an
890
+irq file descriptor, a re-sampling file descriptor needs to be sent to
891
+the emulation program. This second file descriptor allows multiple
892
+devices sharing an irq to be notified when the interrupt has been
893
+acknowledged by the guest, so they can re-trigger the interrupt if their
894
+device has not de-asserted its interrupt.
895
+
896
+intx irq descriptor
897
+
898
+
899
+The irq descriptors are created by the proxy object
900
+using ``event_notifier_init()`` to create the irq and re-sampling
901
+*eventfds*, and ``kvm_vm_ioctl(KVM_IRQFD)`` to bind them to an interrupt.
902
+The interrupt route can be found with
903
+``pci_device_route_intx_to_irq()``.
904
+
905
+intx routing changes
906
+
907
+
908
+Intx routing can be changed when the guest programs the APIC the device
909
+pin is connected to. The proxy object in QEMU will use
910
+``pci_device_set_intx_routing_notifier()`` to be informed of any guest
911
+changes to the route. This handler will broadly follow the VFIO
912
+interrupt logic to change the route: de-assigning the existing irq
913
+descriptor from its route, then assigning it the new route. (see
914
+``vfio_intx_update()``)
915
+
916
+MSI/X acceleration
917
+''''''''''''''''''
918
+
919
+MSI/X interrupts are sent as DMA transactions to the host. The interrupt
920
+data contains a vector that is programmed by the guest. A device may have
921
+multiple MSI interrupts associated with it, so multiple irq descriptors
922
+may need to be sent to the emulation program.
923
+
924
+MSI/X irq descriptor
925
+
926
+
927
+This case will also follow the VFIO example. For each MSI/X interrupt,
928
+an *eventfd* is created, a virtual interrupt is allocated by
929
+``kvm_irqchip_add_msi_route()``, and the virtual interrupt is bound to
930
+the eventfd with ``kvm_irqchip_add_irqfd_notifier()``.
931
+
932
+MSI/X config space changes
933
+
934
+
935
+The guest may dynamically update several MSI-related tables in the
936
+device's PCI config space. These include per-MSI interrupt enables and
937
+vector data. Additionally, MSIX tables exist in device memory space, not
938
+config space. Much like the BAR case above, the proxy object must look
939
+at guest config space programming to keep the MSI interrupt state
940
+consistent between QEMU and the emulation program.
941
+
942
+--------------
943
+
944
+Disaggregated CPU emulation
945
+---------------------------
946
+
947
+After IO services have been disaggregated, a second phase would be to
948
+separate a process to handle CPU instruction emulation from the main
949
+QEMU control function. There are no object separation points for this
950
+code, so the first task would be to create one.
951
+
952
+Host access controls
953
+--------------------
954
+
955
+Separating QEMU relies on the host OS's access restriction mechanisms to
956
+enforce that the differing processes can only access the objects they
957
+are entitled to. There are a couple types of mechanisms usually provided
958
+by general purpose OSs.
959
+
960
+Discretionary access control
1014
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
961
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1015
+
962
+
1016
+The ``blockdev-backup`` command is equivalent in functionality to
963
+Discretionary access control allows each user to control who can access
1017
+``drive-backup``, except that it operates at node-level in a Block Driver
964
+their files. In Linux, this type of control is usually too coarse for
1018
+State (BDS) graph.
965
+QEMU separation, since it only provides three separate access controls:
1019
+
966
+one for the same user ID, the second for user IDs with the same group
1020
+E.g. the sequence of actions to create a point-in-time backup
967
+ID, and the third for all other user IDs. Each device instance would
1021
+of an entire disk image chain, to a target, using ``blockdev-backup``
968
+need a separate user ID to provide access control, which is likely to be
1022
+would be:
969
+unwieldy for dynamically created VMs.
1023
+
970
+
1024
+(0) Create the QCOW2 overlays, to arrive at a backing chain of desired
971
+Mandatory access control
1025
+ depth
972
+~~~~~~~~~~~~~~~~~~~~~~~~
1026
+
973
+
1027
+(1) Create the target image (using ``qemu-img``), say, ``e.qcow2``
974
+Mandatory access control allows the OS to add an additional set of
1028
+
975
+controls on top of discretionary access for the OS to control. It also
1029
+(2) Attach the above created file (``e.qcow2``), run-time, using
976
+adds other attributes to processes and files such as types, roles, and
1030
+ ``blockdev-add`` to QEMU
977
+categories, and can establish rules for how processes and files can
1031
+
978
+interact.
1032
+(3) Perform ``blockdev-backup`` (use ``"sync": "full"`` to copy the
979
+
1033
+ entire chain to the target). And notice the event
980
+Type enforcement
1034
+ ``BLOCK_JOB_COMPLETED``
981
+^^^^^^^^^^^^^^^^
1035
+
982
+
1036
+(4) Shutdown the guest, by issuing the QMP ``quit`` command, so that
983
+Type enforcement assigns a *type* attribute to processes and files, and
1037
+ caches are flushed
984
+allows rules to be written on what operations a process with a given
1038
+
985
+type can perform on a file with a given type. QEMU separation could take
1039
+(5) Then, finally, compare the contents of the disk image chain, and
986
+advantage of type enforcement by running the emulation processes with
1040
+ the target copy with ``qemu-img compare``. You should notice:
987
+different types, both from the main QEMU process, and from the emulation
1041
+ "Images are identical"
988
+processes of different classes of devices.
1042
+
989
+
1043
+The following section shows an example QMP invocation for
990
+For example, guest disk images and disk emulation processes could have
1044
+``blockdev-backup``.
991
+types separate from the main QEMU process and non-disk emulation
1045
+
992
+processes, and the type rules could prevent processes other than disk
1046
+QMP invocation for ``blockdev-backup``
993
+emulation ones from accessing guest disk images. Similarly, network
1047
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
994
+emulation processes can have a type separate from the main QEMU process
1048
+
995
+and non-network emulation process, and only that type can access the
1049
+Given a disk image chain of depth 1 where image [B] is the active
996
+host tun/tap device used to provide guest networking.
1050
+overlay (live QEMU is writing to it)::
997
+
1051
+
998
+Category enforcement
1052
+ [A] <-- [B]
999
+^^^^^^^^^^^^^^^^^^^^
1053
+
1000
+
1054
+The following is the procedure to copy the content from the entire chain
1001
+Category enforcement assigns a set of numbers within a given range to
1055
+to a target image (say, [E]), which has the full content from [A] and
1002
+the process or file. The process is granted access to the file if the
1056
+[B].
1003
+process's set is a superset of the file's set. This enforcement can be
1057
+
1004
+used to separate multiple instances of devices in the same class.
1058
+Create the overlay [B]::
1005
+
1059
+
1006
+For example, if there are multiple disk devices provided to a guest,
1060
+ (QEMU) blockdev-snapshot-sync node-name=node-A snapshot-file=b.qcow2 snapshot-node-name=node-B format=qcow2
1007
+each device emulation process could be provisioned with a separate
1061
+ {
1008
+category. The different device emulation processes would not be able to
1062
+ "execute": "blockdev-snapshot-sync",
1009
+access each other's backing disk images.
1063
+ "arguments": {
1010
+
1064
+ "node-name": "node-A",
1011
+Alternatively, categories could be used in lieu of the type enforcement
1065
+ "snapshot-file": "b.qcow2",
1012
+scheme described above. In this scenario, different categories would be
1066
+ "format": "qcow2",
1013
+used to prevent device emulation processes in different classes from
1067
+ "snapshot-node-name": "node-B"
1014
+accessing resources assigned to other classes.
1068
+ }
1069
+ }
1070
+
1071
+
1072
+Create a target image that will contain the copy::
1073
+
1074
+ $ qemu-img create -f qcow2 e.qcow2 39M
1075
+
1076
+Then add it to QEMU via ``blockdev-add``::
1077
+
1078
+ (QEMU) blockdev-add driver=qcow2 node-name=node-E file={"driver":"file","filename":"e.qcow2"}
1079
+ {
1080
+ "execute": "blockdev-add",
1081
+ "arguments": {
1082
+ "node-name": "node-E",
1083
+ "driver": "qcow2",
1084
+ "file": {
1085
+ "driver": "file",
1086
+ "filename": "e.qcow2"
1087
+ }
1088
+ }
1089
+ }
1090
+
1091
+Then invoke ``blockdev-backup`` to copy the contents from the entire
1092
+image chain, consisting of images [A] and [B] to the target image
1093
+'e.qcow2'::
1094
+
1095
+ (QEMU) blockdev-backup device=node-B target=node-E sync=full job-id=job0
1096
+ {
1097
+ "execute": "blockdev-backup",
1098
+ "arguments": {
1099
+ "device": "node-B",
1100
+ "job-id": "job0",
1101
+ "target": "node-E",
1102
+ "sync": "full"
1103
+ }
1104
+ }
1105
+
1106
+Once the above 'backup' operation has completed, the event,
1107
+``BLOCK_JOB_COMPLETED`` will be emitted, signalling successful
1108
+completion.
1109
+
1110
+Next, query for any active block device jobs (there should be none)::
1111
+
1112
+ (QEMU) query-block-jobs
1113
+ {
1114
+ "execute": "query-block-jobs",
1115
+ "arguments": {}
1116
+ }
1117
+
1118
+Shutdown the guest::
1119
+
1120
+ (QEMU) quit
1121
+ {
1122
+ "execute": "quit",
1123
+ "arguments": {}
1124
+ }
1125
+ "return": {}
1126
+ }
1127
+
1128
+.. note::
1129
+ The above step is really important; if forgotten, an error, "Failed
1130
+ to get shared "write" lock on e.qcow2", will be thrown when you do
1131
+ ``qemu-img compare`` to verify the integrity of the disk image
1132
+ with the backup content.
1133
+
1134
+
1135
+The end result will be the image 'e.qcow2' containing a
1136
+point-in-time backup of the disk image chain -- i.e. contents from
1137
+images [A] and [B] at the time the ``blockdev-backup`` command was
1138
+initiated.
1139
+
1140
+One way to confirm the backup disk image contains the identical content
1141
+with the disk image chain is to compare the backup and the contents of
1142
+the chain, you should see "Images are identical". (NB: this is assuming
1143
+QEMU was launched with ``-S`` option, which will not start the CPUs at
1144
+guest boot up)::
1145
+
1146
+ $ qemu-img compare b.qcow2 e.qcow2
1147
+ Warning: Image size mismatch!
1148
+ Images are identical.
1149
+
1150
+NOTE: The "Warning: Image size mismatch!" is expected, as we created the
1151
+target image (e.qcow2) with 39M size.
1152
diff --git a/docs/live-block-ops.txt b/docs/live-block-ops.txt
1153
deleted file mode 100644
1154
index XXXXXXX..XXXXXXX
1155
--- a/docs/live-block-ops.txt
1156
+++ /dev/null
1157
@@ -XXX,XX +XXX,XX @@
1158
-LIVE BLOCK OPERATIONS
1159
-=====================
1160
-
1161
-High level description of live block operations. Note these are not
1162
-supported for use with the raw format at the moment.
1163
-
1164
-Note also that this document is incomplete and it currently only
1165
-covers the 'stream' operation. Other operations supported by QEMU such
1166
-as 'commit', 'mirror' and 'backup' are not described here yet. Please
1167
-refer to the qapi/block-core.json file for an overview of those.
1168
-
1169
-Snapshot live merge
1170
-===================
1171
-
1172
-Given a snapshot chain, described in this document in the following
1173
-format:
1174
-
1175
-[A] <- [B] <- [C] <- [D] <- [E]
1176
-
1177
-Where the rightmost object ([E] in the example) described is the current
1178
-image which the guest OS has write access to. To the left of it is its base
1179
-image, and so on accordingly until the leftmost image, which has no
1180
-base.
1181
-
1182
-The snapshot live merge operation transforms such a chain into a
1183
-smaller one with fewer elements, such as this transformation relative
1184
-to the first example:
1185
-
1186
-[A] <- [E]
1187
-
1188
-Data is copied in the right direction with destination being the
1189
-rightmost image, but any other intermediate image can be specified
1190
-instead. In this example data is copied from [C] into [D], so [D] can
1191
-be backed by [B]:
1192
-
1193
-[A] <- [B] <- [D] <- [E]
1194
-
1195
-The operation is implemented in QEMU through image streaming facilities.
1196
-
1197
-The basic idea is to execute 'block_stream virtio0' while the guest is
1198
-running. Progress can be monitored using 'info block-jobs'. When the
1199
-streaming operation completes it raises a QMP event. 'block_stream'
1200
-copies data from the backing file(s) into the active image. When finished,
1201
-it adjusts the backing file pointer.
1202
-
1203
-The 'base' parameter specifies an image which data need not be
1204
-streamed from. This image will be used as the backing file for the
1205
-destination image when the operation is finished.
1206
-
1207
-In the first example above, the command would be:
1208
-
1209
-(qemu) block_stream virtio0 file-A.img
1210
-
1211
-In order to specify a destination image different from the active
1212
-(rightmost) one we can use its node name instead.
1213
-
1214
-In the second example above, the command would be:
1215
-
1216
-(qemu) block_stream node-D file-B.img
1217
-
1218
-Live block copy
1219
-===============
1220
-
1221
-To copy an in use image to another destination in the filesystem, one
1222
-should create a live snapshot in the desired destination, then stream
1223
-into that image. Example:
1224
-
1225
-(qemu) snapshot_blkdev ide0-hd0 /new-path/disk.img qcow2
1226
-
1227
-(qemu) block_stream ide0-hd0
1228
-
1229
-
1230
--
1015
--
1231
2.9.4
1016
2.29.2
1232
1017
1233
diff view generated by jsdifflib
New patch
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
1
2
3
Adds documentation explaining the command-line arguments needed
4
to use multi-process.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: 49f757a84e5dd6fae14b22544897d1124c5fdbad.1611938319.git.jag.raman@oracle.com
11
12
[Move orphan docs/multi-process.rst document into docs/system/ and add
13
it to index.rst to prevent Sphinx "document isn't included in any
14
toctree" error.
15
--Stefan]
16
17
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
18
---
19
MAINTAINERS | 1 +
20
docs/system/index.rst | 1 +
21
docs/system/multi-process.rst | 64 +++++++++++++++++++++++++++++++++++
22
3 files changed, 66 insertions(+)
23
create mode 100644 docs/system/multi-process.rst
24
25
diff --git a/MAINTAINERS b/MAINTAINERS
26
index XXXXXXX..XXXXXXX 100644
27
--- a/MAINTAINERS
28
+++ b/MAINTAINERS
29
@@ -XXX,XX +XXX,XX @@ M: Jagannathan Raman <jag.raman@oracle.com>
30
M: John G Johnson <john.g.johnson@oracle.com>
31
S: Maintained
32
F: docs/devel/multi-process.rst
33
+F: docs/system/multi-process.rst
34
35
Build and test automation
36
-------------------------
37
diff --git a/docs/system/index.rst b/docs/system/index.rst
38
index XXXXXXX..XXXXXXX 100644
39
--- a/docs/system/index.rst
40
+++ b/docs/system/index.rst
41
@@ -XXX,XX +XXX,XX @@ Contents:
42
pr-manager
43
targets
44
security
45
+ multi-process
46
deprecated
47
removed-features
48
build-platforms
49
diff --git a/docs/system/multi-process.rst b/docs/system/multi-process.rst
50
new file mode 100644
51
index XXXXXXX..XXXXXXX
52
--- /dev/null
53
+++ b/docs/system/multi-process.rst
54
@@ -XXX,XX +XXX,XX @@
55
+Multi-process QEMU
56
+==================
57
+
58
+This document describes how to configure and use multi-process qemu.
59
+For the design document refer to docs/devel/multi-process.rst.
60
+
61
+1) Configuration
62
+----------------
63
+
64
+Multi-process is enabled by default for targets that enable KVM.
65
+
66
+
67
+2) Usage
68
+--------
69
+
70
+Multi-process QEMU requires an orchestrator to launch.
71
+
72
+The following is a description of the command line used to launch mpqemu.
73
+
74
+* Orchestrator:
75
+
76
+ - The Orchestrator creates a unix socketpair
77
+
78
+ - It launches the remote process and passes one of the
79
+ sockets to it via command-line.
80
+
81
+ - It then launches QEMU and specifies the other socket as an option
82
+ to the Proxy device object
83
+
84
+* Remote Process:
85
+
86
+ - QEMU can enter remote process mode by using the "remote" machine
87
+ option.
88
+
89
+ - The orchestrator creates a "remote-object" with details about
90
+ the device and the file descriptor for the device
91
+
92
+ - The remaining options are no different from how one launches QEMU with
93
+ devices.
94
+
95
+ - Example command-line for the remote process is as follows:
96
+
97
+ /usr/bin/qemu-system-x86_64 \
98
+ -machine x-remote \
99
+ -device lsi53c895a,id=lsi0 \
100
+ -drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2 \
101
+ -device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0 \
102
+ -object x-remote-object,id=robj1,devid=lsi0,fd=4,
103
+
104
+* QEMU:
105
+
106
+ - Since parts of the RAM are shared between QEMU & remote process, a
107
+ memory-backend-memfd is required to facilitate this, as follows:
108
+
109
+ -object memory-backend-memfd,id=mem,size=2G
110
+
111
+ - A "x-pci-proxy-dev" device is created for each of the PCI devices emulated
112
+ in the remote process. A "socket" sub-option specifies the other end of
113
+ unix channel created by orchestrator. The "id" sub-option must be specified
114
+ and should be the same as the "id" specified for the remote PCI device
115
+
116
+ - Example commandline for QEMU is as follows:
117
+
118
+ -device x-pci-proxy-dev,id=lsi0,socket=3
119
--
120
2.29.2
121
diff view generated by jsdifflib
New patch
1
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
3
Allow RAM MemoryRegion to be created from an offset in a file, instead
4
of allocating at offset of 0 by default. This is needed to synchronize
5
RAM between QEMU & remote process.
6
7
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Message-id: 609996697ad8617e3b01df38accc5c208c24d74e.1611938319.git.jag.raman@oracle.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
include/exec/memory.h | 2 ++
15
include/exec/ram_addr.h | 4 ++--
16
include/qemu/mmap-alloc.h | 4 +++-
17
backends/hostmem-memfd.c | 2 +-
18
hw/misc/ivshmem.c | 3 ++-
19
softmmu/memory.c | 3 ++-
20
softmmu/physmem.c | 12 +++++++-----
21
util/mmap-alloc.c | 8 +++++---
22
util/oslib-posix.c | 2 +-
23
9 files changed, 25 insertions(+), 15 deletions(-)
24
25
diff --git a/include/exec/memory.h b/include/exec/memory.h
26
index XXXXXXX..XXXXXXX 100644
27
--- a/include/exec/memory.h
28
+++ b/include/exec/memory.h
29
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
30
* @size: size of the region.
31
* @share: %true if memory must be mmaped with the MAP_SHARED flag
32
* @fd: the fd to mmap.
33
+ * @offset: offset within the file referenced by fd
34
* @errp: pointer to Error*, to store an error if it happens.
35
*
36
* Note that this function does not do anything to cause the data in the
37
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
38
uint64_t size,
39
bool share,
40
int fd,
41
+ ram_addr_t offset,
42
Error **errp);
43
#endif
44
45
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
46
index XXXXXXX..XXXXXXX 100644
47
--- a/include/exec/ram_addr.h
48
+++ b/include/exec/ram_addr.h
49
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
50
uint32_t ram_flags, const char *mem_path,
51
bool readonly, Error **errp);
52
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
53
- uint32_t ram_flags, int fd, bool readonly,
54
- Error **errp);
55
+ uint32_t ram_flags, int fd, off_t offset,
56
+ bool readonly, Error **errp);
57
58
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
59
MemoryRegion *mr, Error **errp);
60
diff --git a/include/qemu/mmap-alloc.h b/include/qemu/mmap-alloc.h
61
index XXXXXXX..XXXXXXX 100644
62
--- a/include/qemu/mmap-alloc.h
63
+++ b/include/qemu/mmap-alloc.h
64
@@ -XXX,XX +XXX,XX @@ size_t qemu_mempath_getpagesize(const char *mem_path);
65
* @readonly: true for a read-only mapping, false for read/write.
66
* @shared: map has RAM_SHARED flag.
67
* @is_pmem: map has RAM_PMEM flag.
68
+ * @map_offset: map starts at offset of map_offset from the start of fd
69
*
70
* Return:
71
* On success, return a pointer to the mapped area.
72
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
73
size_t align,
74
bool readonly,
75
bool shared,
76
- bool is_pmem);
77
+ bool is_pmem,
78
+ off_t map_offset);
79
80
void qemu_ram_munmap(int fd, void *ptr, size_t size);
81
82
diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
83
index XXXXXXX..XXXXXXX 100644
84
--- a/backends/hostmem-memfd.c
85
+++ b/backends/hostmem-memfd.c
86
@@ -XXX,XX +XXX,XX @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
87
name = host_memory_backend_get_name(backend);
88
memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
89
name, backend->size,
90
- backend->share, fd, errp);
91
+ backend->share, fd, 0, errp);
92
g_free(name);
93
}
94
95
diff --git a/hw/misc/ivshmem.c b/hw/misc/ivshmem.c
96
index XXXXXXX..XXXXXXX 100644
97
--- a/hw/misc/ivshmem.c
98
+++ b/hw/misc/ivshmem.c
99
@@ -XXX,XX +XXX,XX @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
100
101
/* mmap the region and map into the BAR2 */
102
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
103
- "ivshmem.bar2", size, true, fd, &local_err);
104
+ "ivshmem.bar2", size, true, fd, 0,
105
+ &local_err);
106
if (local_err) {
107
error_propagate(errp, local_err);
108
return;
109
diff --git a/softmmu/memory.c b/softmmu/memory.c
110
index XXXXXXX..XXXXXXX 100644
111
--- a/softmmu/memory.c
112
+++ b/softmmu/memory.c
113
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
114
uint64_t size,
115
bool share,
116
int fd,
117
+ ram_addr_t offset,
118
Error **errp)
119
{
120
Error *err = NULL;
121
@@ -XXX,XX +XXX,XX @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
122
mr->destructor = memory_region_destructor_ram;
123
mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
124
share ? RAM_SHARED : 0,
125
- fd, false, &err);
126
+ fd, offset, false, &err);
127
if (err) {
128
mr->size = int128_zero();
129
object_unparent(OBJECT(mr));
130
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
131
index XXXXXXX..XXXXXXX 100644
132
--- a/softmmu/physmem.c
133
+++ b/softmmu/physmem.c
134
@@ -XXX,XX +XXX,XX @@ static void *file_ram_alloc(RAMBlock *block,
135
int fd,
136
bool readonly,
137
bool truncate,
138
+ off_t offset,
139
Error **errp)
140
{
141
void *area;
142
@@ -XXX,XX +XXX,XX @@ static void *file_ram_alloc(RAMBlock *block,
143
}
144
145
area = qemu_ram_mmap(fd, memory, block->mr->align, readonly,
146
- block->flags & RAM_SHARED, block->flags & RAM_PMEM);
147
+ block->flags & RAM_SHARED, block->flags & RAM_PMEM,
148
+ offset);
149
if (area == MAP_FAILED) {
150
error_setg_errno(errp, errno,
151
"unable to map backing store for guest RAM");
152
@@ -XXX,XX +XXX,XX @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
153
154
#ifdef CONFIG_POSIX
155
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
156
- uint32_t ram_flags, int fd, bool readonly,
157
- Error **errp)
158
+ uint32_t ram_flags, int fd, off_t offset,
159
+ bool readonly, Error **errp)
160
{
161
RAMBlock *new_block;
162
Error *local_err = NULL;
163
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
164
new_block->max_length = size;
165
new_block->flags = ram_flags;
166
new_block->host = file_ram_alloc(new_block, size, fd, readonly,
167
- !file_size, errp);
168
+ !file_size, offset, errp);
169
if (!new_block->host) {
170
g_free(new_block);
171
return NULL;
172
@@ -XXX,XX +XXX,XX @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
173
return NULL;
174
}
175
176
- block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, readonly, errp);
177
+ block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp);
178
if (!block) {
179
if (created) {
180
unlink(mem_path);
181
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
182
index XXXXXXX..XXXXXXX 100644
183
--- a/util/mmap-alloc.c
184
+++ b/util/mmap-alloc.c
185
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
186
size_t align,
187
bool readonly,
188
bool shared,
189
- bool is_pmem)
190
+ bool is_pmem,
191
+ off_t map_offset)
192
{
193
int prot;
194
int flags;
195
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
196
197
prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
198
199
- ptr = mmap(guardptr + offset, size, prot, flags | map_sync_flags, fd, 0);
200
+ ptr = mmap(guardptr + offset, size, prot,
201
+ flags | map_sync_flags, fd, map_offset);
202
203
if (ptr == MAP_FAILED && map_sync_flags) {
204
if (errno == ENOTSUP) {
205
@@ -XXX,XX +XXX,XX @@ void *qemu_ram_mmap(int fd,
206
* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
207
* we will remove these flags to handle compatibility.
208
*/
209
- ptr = mmap(guardptr + offset, size, prot, flags, fd, 0);
210
+ ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset);
211
}
212
213
if (ptr == MAP_FAILED) {
214
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
215
index XXXXXXX..XXXXXXX 100644
216
--- a/util/oslib-posix.c
217
+++ b/util/oslib-posix.c
218
@@ -XXX,XX +XXX,XX @@ void *qemu_memalign(size_t alignment, size_t size)
219
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
220
{
221
size_t align = QEMU_VMALLOC_ALIGN;
222
- void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false);
223
+ void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false, 0);
224
225
if (ptr == MAP_FAILED) {
226
return NULL;
227
--
228
2.29.2
229
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
Add configuration options to enable or disable multiprocess QEMU code
4
5
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
6
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
7
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Message-id: 6cc37253e35418ebd7b675a31a3df6e3c7a12dc1.1611938319.git.jag.raman@oracle.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
configure | 10 ++++++++++
13
meson.build | 4 +++-
14
Kconfig.host | 4 ++++
15
hw/Kconfig | 1 +
16
hw/remote/Kconfig | 3 +++
17
5 files changed, 21 insertions(+), 1 deletion(-)
18
create mode 100644 hw/remote/Kconfig
19
20
diff --git a/configure b/configure
21
index XXXXXXX..XXXXXXX 100755
22
--- a/configure
23
+++ b/configure
24
@@ -XXX,XX +XXX,XX @@ skip_meson=no
25
gettext="auto"
26
fuse="auto"
27
fuse_lseek="auto"
28
+multiprocess="no"
29
30
malloc_trim="auto"
31
32
@@ -XXX,XX +XXX,XX @@ Linux)
33
linux="yes"
34
linux_user="yes"
35
vhost_user=${default_feature:-yes}
36
+ multiprocess=${default_feature:-yes}
37
;;
38
esac
39
40
@@ -XXX,XX +XXX,XX @@ for opt do
41
;;
42
--disable-fuse-lseek) fuse_lseek="disabled"
43
;;
44
+ --enable-multiprocess) multiprocess="yes"
45
+ ;;
46
+ --disable-multiprocess) multiprocess="no"
47
+ ;;
48
*)
49
echo "ERROR: unknown option $opt"
50
echo "Try '$0 --help' for more information"
51
@@ -XXX,XX +XXX,XX @@ disabled with --disable-FEATURE, default is enabled if available
52
libdaxctl libdaxctl support
53
fuse FUSE block device export
54
fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports
55
+ multiprocess Multiprocess QEMU support
56
57
NOTE: The object files are built at the place where configure is launched
58
EOF
59
@@ -XXX,XX +XXX,XX @@ fi
60
if test "$have_mlockall" = "yes" ; then
61
echo "HAVE_MLOCKALL=y" >> $config_host_mak
62
fi
63
+if test "$multiprocess" = "yes" ; then
64
+ echo "CONFIG_MULTIPROCESS_ALLOWED=y" >> $config_host_mak
65
+fi
66
if test "$fuzzing" = "yes" ; then
67
# If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the
68
# needed CFLAGS have already been provided
69
diff --git a/meson.build b/meson.build
70
index XXXXXXX..XXXXXXX 100644
71
--- a/meson.build
72
+++ b/meson.build
73
@@ -XXX,XX +XXX,XX @@ host_kconfig = \
74
('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
75
(have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
76
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
77
- ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : [])
78
+ ('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \
79
+ ('CONFIG_MULTIPROCESS_ALLOWED' in config_host ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])
80
81
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
82
83
@@ -XXX,XX +XXX,XX @@ summary_info += {'libpmem support': config_host.has_key('CONFIG_LIBPMEM')}
84
summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')}
85
summary_info += {'libudev': libudev.found()}
86
summary_info += {'FUSE lseek': fuse_lseek.found()}
87
+summary_info += {'Multiprocess QEMU': config_host.has_key('CONFIG_MULTIPROCESS_ALLOWED')}
88
summary(summary_info, bool_yn: true, section: 'Dependencies')
89
90
if not supported_cpus.contains(cpu)
91
diff --git a/Kconfig.host b/Kconfig.host
92
index XXXXXXX..XXXXXXX 100644
93
--- a/Kconfig.host
94
+++ b/Kconfig.host
95
@@ -XXX,XX +XXX,XX @@ config VIRTFS
96
97
config PVRDMA
98
bool
99
+
100
+config MULTIPROCESS_ALLOWED
101
+ bool
102
+ imply MULTIPROCESS
103
diff --git a/hw/Kconfig b/hw/Kconfig
104
index XXXXXXX..XXXXXXX 100644
105
--- a/hw/Kconfig
106
+++ b/hw/Kconfig
107
@@ -XXX,XX +XXX,XX @@ source pci-host/Kconfig
108
source pcmcia/Kconfig
109
source pci/Kconfig
110
source rdma/Kconfig
111
+source remote/Kconfig
112
source rtc/Kconfig
113
source scsi/Kconfig
114
source sd/Kconfig
115
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
116
new file mode 100644
117
index XXXXXXX..XXXXXXX
118
--- /dev/null
119
+++ b/hw/remote/Kconfig
120
@@ -XXX,XX +XXX,XX @@
121
+config MULTIPROCESS
122
+ bool
123
+ depends on PCI && KVM
124
--
125
2.29.2
126
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
PCI host bridge is setup for the remote device process. It is
4
implemented using remote-pcihost object. It is an extension of the PCI
5
host bridge setup by QEMU.
6
Remote-pcihost configures a PCI bus which could be used by the remote
7
PCI device to latch on to.
8
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
11
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: 0871ba857abb2eafacde07e7fe66a3f12415bfb2.1611938319.git.jag.raman@oracle.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
16
MAINTAINERS | 2 +
17
include/hw/pci-host/remote.h | 29 ++++++++++++++
18
hw/pci-host/remote.c | 75 ++++++++++++++++++++++++++++++++++++
19
hw/pci-host/Kconfig | 3 ++
20
hw/pci-host/meson.build | 1 +
21
hw/remote/Kconfig | 1 +
22
6 files changed, 111 insertions(+)
23
create mode 100644 include/hw/pci-host/remote.h
24
create mode 100644 hw/pci-host/remote.c
25
26
diff --git a/MAINTAINERS b/MAINTAINERS
27
index XXXXXXX..XXXXXXX 100644
28
--- a/MAINTAINERS
29
+++ b/MAINTAINERS
30
@@ -XXX,XX +XXX,XX @@ M: John G Johnson <john.g.johnson@oracle.com>
31
S: Maintained
32
F: docs/devel/multi-process.rst
33
F: docs/system/multi-process.rst
34
+F: hw/pci-host/remote.c
35
+F: include/hw/pci-host/remote.h
36
37
Build and test automation
38
-------------------------
39
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
40
new file mode 100644
41
index XXXXXXX..XXXXXXX
42
--- /dev/null
43
+++ b/include/hw/pci-host/remote.h
44
@@ -XXX,XX +XXX,XX @@
45
+/*
46
+ * PCI Host for remote device
47
+ *
48
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
49
+ *
50
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
51
+ * See the COPYING file in the top-level directory.
52
+ *
53
+ */
54
+
55
+#ifndef REMOTE_PCIHOST_H
56
+#define REMOTE_PCIHOST_H
57
+
58
+#include "exec/memory.h"
59
+#include "hw/pci/pcie_host.h"
60
+
61
+#define TYPE_REMOTE_PCIHOST "remote-pcihost"
62
+OBJECT_DECLARE_SIMPLE_TYPE(RemotePCIHost, REMOTE_PCIHOST)
63
+
64
+struct RemotePCIHost {
65
+ /*< private >*/
66
+ PCIExpressHost parent_obj;
67
+ /*< public >*/
68
+
69
+ MemoryRegion *mr_pci_mem;
70
+ MemoryRegion *mr_sys_io;
71
+};
72
+
73
+#endif
74
diff --git a/hw/pci-host/remote.c b/hw/pci-host/remote.c
75
new file mode 100644
76
index XXXXXXX..XXXXXXX
77
--- /dev/null
78
+++ b/hw/pci-host/remote.c
79
@@ -XXX,XX +XXX,XX @@
80
+/*
81
+ * Remote PCI host device
82
+ *
83
+ * Unlike PCI host devices that model physical hardware, the purpose
84
+ * of this PCI host is to host multi-process QEMU devices.
85
+ *
86
+ * Multi-process QEMU extends the PCI host of a QEMU machine into a
87
+ * remote process. Any PCI device attached to the remote process is
88
+ * visible in the QEMU guest. This allows existing QEMU device models
89
+ * to be reused in the remote process.
90
+ *
91
+ * This PCI host is purely a container for PCI devices. It's fake in the
92
+ * sense that the guest never sees this PCI host and has no way of
93
+ * accessing it. Its job is just to provide the environment that QEMU
94
+ * PCI device models need when running in a remote process.
95
+ *
96
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
97
+ *
98
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
99
+ * See the COPYING file in the top-level directory.
100
+ *
101
+ */
102
+
103
+#include "qemu/osdep.h"
104
+#include "qemu-common.h"
105
+
106
+#include "hw/pci/pci.h"
107
+#include "hw/pci/pci_host.h"
108
+#include "hw/pci/pcie_host.h"
109
+#include "hw/qdev-properties.h"
110
+#include "hw/pci-host/remote.h"
111
+#include "exec/memory.h"
112
+
113
+static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
114
+ PCIBus *rootbus)
115
+{
116
+ return "0000:00";
117
+}
118
+
119
+static void remote_pcihost_realize(DeviceState *dev, Error **errp)
120
+{
121
+ PCIHostState *pci = PCI_HOST_BRIDGE(dev);
122
+ RemotePCIHost *s = REMOTE_PCIHOST(dev);
123
+
124
+ pci->bus = pci_root_bus_new(DEVICE(s), "remote-pci",
125
+ s->mr_pci_mem, s->mr_sys_io,
126
+ 0, TYPE_PCIE_BUS);
127
+}
128
+
129
+static void remote_pcihost_class_init(ObjectClass *klass, void *data)
130
+{
131
+ DeviceClass *dc = DEVICE_CLASS(klass);
132
+ PCIHostBridgeClass *hc = PCI_HOST_BRIDGE_CLASS(klass);
133
+
134
+ hc->root_bus_path = remote_pcihost_root_bus_path;
135
+ dc->realize = remote_pcihost_realize;
136
+
137
+ dc->user_creatable = false;
138
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
139
+ dc->fw_name = "pci";
140
+}
141
+
142
+static const TypeInfo remote_pcihost_info = {
143
+ .name = TYPE_REMOTE_PCIHOST,
144
+ .parent = TYPE_PCIE_HOST_BRIDGE,
145
+ .instance_size = sizeof(RemotePCIHost),
146
+ .class_init = remote_pcihost_class_init,
147
+};
148
+
149
+static void remote_pcihost_register(void)
150
+{
151
+ type_register_static(&remote_pcihost_info);
152
+}
153
+
154
+type_init(remote_pcihost_register)
155
diff --git a/hw/pci-host/Kconfig b/hw/pci-host/Kconfig
156
index XXXXXXX..XXXXXXX 100644
157
--- a/hw/pci-host/Kconfig
158
+++ b/hw/pci-host/Kconfig
159
@@ -XXX,XX +XXX,XX @@ config PCI_POWERNV
160
select PCI_EXPRESS
161
select MSI_NONBROKEN
162
select PCIE_PORT
163
+
164
+config REMOTE_PCIHOST
165
+ bool
166
diff --git a/hw/pci-host/meson.build b/hw/pci-host/meson.build
167
index XXXXXXX..XXXXXXX 100644
168
--- a/hw/pci-host/meson.build
169
+++ b/hw/pci-host/meson.build
170
@@ -XXX,XX +XXX,XX @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c'))
171
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
172
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
173
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
174
+pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c'))
175
176
# PPC devices
177
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
178
diff --git a/hw/remote/Kconfig b/hw/remote/Kconfig
179
index XXXXXXX..XXXXXXX 100644
180
--- a/hw/remote/Kconfig
181
+++ b/hw/remote/Kconfig
182
@@ -XXX,XX +XXX,XX @@
183
config MULTIPROCESS
184
bool
185
depends on PCI && KVM
186
+ select REMOTE_PCIHOST
187
--
188
2.29.2
189
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
x-remote-machine object sets up various subsystems of the remote
4
device process. Instantiate PCI host bridge object and initialize RAM, IO &
5
PCI memory regions.
6
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Message-id: c537f38d17f90453ca610c6b70cf3480274e0ba1.1611938319.git.jag.raman@oracle.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
MAINTAINERS | 2 ++
15
include/hw/pci-host/remote.h | 1 +
16
include/hw/remote/machine.h | 27 ++++++++++++++
17
hw/remote/machine.c | 70 ++++++++++++++++++++++++++++++++++++
18
hw/meson.build | 1 +
19
hw/remote/meson.build | 5 +++
20
6 files changed, 106 insertions(+)
21
create mode 100644 include/hw/remote/machine.h
22
create mode 100644 hw/remote/machine.c
23
create mode 100644 hw/remote/meson.build
24
25
diff --git a/MAINTAINERS b/MAINTAINERS
26
index XXXXXXX..XXXXXXX 100644
27
--- a/MAINTAINERS
28
+++ b/MAINTAINERS
29
@@ -XXX,XX +XXX,XX @@ F: docs/devel/multi-process.rst
30
F: docs/system/multi-process.rst
31
F: hw/pci-host/remote.c
32
F: include/hw/pci-host/remote.h
33
+F: hw/remote/machine.c
34
+F: include/hw/remote/machine.h
35
36
Build and test automation
37
-------------------------
38
diff --git a/include/hw/pci-host/remote.h b/include/hw/pci-host/remote.h
39
index XXXXXXX..XXXXXXX 100644
40
--- a/include/hw/pci-host/remote.h
41
+++ b/include/hw/pci-host/remote.h
42
@@ -XXX,XX +XXX,XX @@ struct RemotePCIHost {
43
44
MemoryRegion *mr_pci_mem;
45
MemoryRegion *mr_sys_io;
46
+ MemoryRegion *mr_sys_mem;
47
};
48
49
#endif
50
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
51
new file mode 100644
52
index XXXXXXX..XXXXXXX
53
--- /dev/null
54
+++ b/include/hw/remote/machine.h
55
@@ -XXX,XX +XXX,XX @@
56
+/*
57
+ * Remote machine configuration
58
+ *
59
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
60
+ *
61
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
62
+ * See the COPYING file in the top-level directory.
63
+ *
64
+ */
65
+
66
+#ifndef REMOTE_MACHINE_H
67
+#define REMOTE_MACHINE_H
68
+
69
+#include "qom/object.h"
70
+#include "hw/boards.h"
71
+#include "hw/pci-host/remote.h"
72
+
73
+struct RemoteMachineState {
74
+ MachineState parent_obj;
75
+
76
+ RemotePCIHost *host;
77
+};
78
+
79
+#define TYPE_REMOTE_MACHINE "x-remote-machine"
80
+OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
81
+
82
+#endif
83
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
84
new file mode 100644
85
index XXXXXXX..XXXXXXX
86
--- /dev/null
87
+++ b/hw/remote/machine.c
88
@@ -XXX,XX +XXX,XX @@
89
+/*
90
+ * Machine for remote device
91
+ *
92
+ * This machine type is used by the remote device process in multi-process
93
+ * QEMU. QEMU device models depend on parent busses, interrupt controllers,
94
+ * memory regions, etc. The remote machine type offers this environment so
95
+ * that QEMU device models can be used as remote devices.
96
+ *
97
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
98
+ *
99
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
100
+ * See the COPYING file in the top-level directory.
101
+ *
102
+ */
103
+
104
+#include "qemu/osdep.h"
105
+#include "qemu-common.h"
106
+
107
+#include "hw/remote/machine.h"
108
+#include "exec/address-spaces.h"
109
+#include "exec/memory.h"
110
+#include "qapi/error.h"
111
+
112
+static void remote_machine_init(MachineState *machine)
113
+{
114
+ MemoryRegion *system_memory, *system_io, *pci_memory;
115
+ RemoteMachineState *s = REMOTE_MACHINE(machine);
116
+ RemotePCIHost *rem_host;
117
+
118
+ system_memory = get_system_memory();
119
+ system_io = get_system_io();
120
+
121
+ pci_memory = g_new(MemoryRegion, 1);
122
+ memory_region_init(pci_memory, NULL, "pci", UINT64_MAX);
123
+
124
+ rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
125
+
126
+ rem_host->mr_pci_mem = pci_memory;
127
+ rem_host->mr_sys_mem = system_memory;
128
+ rem_host->mr_sys_io = system_io;
129
+
130
+ s->host = rem_host;
131
+
132
+ object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host));
133
+ memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);
134
+
135
+ qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
136
+}
137
+
138
+static void remote_machine_class_init(ObjectClass *oc, void *data)
139
+{
140
+ MachineClass *mc = MACHINE_CLASS(oc);
141
+
142
+ mc->init = remote_machine_init;
143
+ mc->desc = "Experimental remote machine";
144
+}
145
+
146
+static const TypeInfo remote_machine = {
147
+ .name = TYPE_REMOTE_MACHINE,
148
+ .parent = TYPE_MACHINE,
149
+ .instance_size = sizeof(RemoteMachineState),
150
+ .class_init = remote_machine_class_init,
151
+};
152
+
153
+static void remote_machine_register_types(void)
154
+{
155
+ type_register_static(&remote_machine);
156
+}
157
+
158
+type_init(remote_machine_register_types);
159
diff --git a/hw/meson.build b/hw/meson.build
160
index XXXXXXX..XXXXXXX 100644
161
--- a/hw/meson.build
162
+++ b/hw/meson.build
163
@@ -XXX,XX +XXX,XX @@ subdir('moxie')
164
subdir('nios2')
165
subdir('openrisc')
166
subdir('ppc')
167
+subdir('remote')
168
subdir('riscv')
169
subdir('rx')
170
subdir('s390x')
171
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
172
new file mode 100644
173
index XXXXXXX..XXXXXXX
174
--- /dev/null
175
+++ b/hw/remote/meson.build
176
@@ -XXX,XX +XXX,XX @@
177
+remote_ss = ss.source_set()
178
+
179
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
180
+
181
+softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
182
--
183
2.29.2
184
diff view generated by jsdifflib
New patch
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
1
2
3
Adds qio_channel_writev_full_all() to transmit both data and FDs.
4
Refactors existing code to use this helper.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
11
Message-id: 480fbf1fe4152495d60596c9b665124549b426a5.1611938319.git.jag.raman@oracle.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
include/io/channel.h | 25 +++++++++++++++++++++++++
15
io/channel.c | 15 ++++++++++++++-
16
2 files changed, 39 insertions(+), 1 deletion(-)
17
18
diff --git a/include/io/channel.h b/include/io/channel.h
19
index XXXXXXX..XXXXXXX 100644
20
--- a/include/io/channel.h
21
+++ b/include/io/channel.h
22
@@ -XXX,XX +XXX,XX @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
23
IOHandler *io_write,
24
void *opaque);
25
26
+/**
27
+ * qio_channel_writev_full_all:
28
+ * @ioc: the channel object
29
+ * @iov: the array of memory regions to write data from
30
+ * @niov: the length of the @iov array
31
+ * @fds: an array of file handles to send
32
+ * @nfds: number of file handles in @fds
33
+ * @errp: pointer to a NULL-initialized error object
34
+ *
35
+ *
36
+ * Behaves like qio_channel_writev_full but will attempt
37
+ * to send all data passed (file handles and memory regions).
38
+ * The function will wait for all requested data
39
+ * to be written, yielding from the current coroutine
40
+ * if required.
41
+ *
42
+ * Returns: 0 if all bytes were written, or -1 on error
43
+ */
44
+
45
+int qio_channel_writev_full_all(QIOChannel *ioc,
46
+ const struct iovec *iov,
47
+ size_t niov,
48
+ int *fds, size_t nfds,
49
+ Error **errp);
50
+
51
#endif /* QIO_CHANNEL_H */
52
diff --git a/io/channel.c b/io/channel.c
53
index XXXXXXX..XXXXXXX 100644
54
--- a/io/channel.c
55
+++ b/io/channel.c
56
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
57
const struct iovec *iov,
58
size_t niov,
59
Error **errp)
60
+{
61
+ return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp);
62
+}
63
+
64
+int qio_channel_writev_full_all(QIOChannel *ioc,
65
+ const struct iovec *iov,
66
+ size_t niov,
67
+ int *fds, size_t nfds,
68
+ Error **errp)
69
{
70
int ret = -1;
71
struct iovec *local_iov = g_new(struct iovec, niov);
72
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
73
74
while (nlocal_iov > 0) {
75
ssize_t len;
76
- len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp);
77
+ len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds,
78
+ errp);
79
if (len == QIO_CHANNEL_ERR_BLOCK) {
80
if (qemu_in_coroutine()) {
81
qio_channel_yield(ioc, G_IO_OUT);
82
@@ -XXX,XX +XXX,XX @@ int qio_channel_writev_all(QIOChannel *ioc,
83
}
84
85
iov_discard_front(&local_iov, &nlocal_iov, len);
86
+
87
+ fds = NULL;
88
+ nfds = 0;
89
}
90
91
ret = 0;
92
--
93
2.29.2
94
diff view generated by jsdifflib
New patch
1
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
3
Adds qio_channel_readv_full_all_eof() and qio_channel_readv_full_all()
4
to read both data and FDs. Refactors existing code to use these helpers.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Acked-by: Daniel P. Berrangé <berrange@redhat.com>
10
Message-id: b059c4cc0fb741e794d644c144cc21372cad877d.1611938319.git.jag.raman@oracle.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
include/io/channel.h | 53 +++++++++++++++++++++++
14
io/channel.c | 101 ++++++++++++++++++++++++++++++++++---------
15
2 files changed, 134 insertions(+), 20 deletions(-)
16
17
diff --git a/include/io/channel.h b/include/io/channel.h
18
index XXXXXXX..XXXXXXX 100644
19
--- a/include/io/channel.h
20
+++ b/include/io/channel.h
21
@@ -XXX,XX +XXX,XX @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
22
IOHandler *io_write,
23
void *opaque);
24
25
+/**
26
+ * qio_channel_readv_full_all_eof:
27
+ * @ioc: the channel object
28
+ * @iov: the array of memory regions to read data to
29
+ * @niov: the length of the @iov array
30
+ * @fds: an array of file handles to read
31
+ * @nfds: number of file handles in @fds
32
+ * @errp: pointer to a NULL-initialized error object
33
+ *
34
+ *
35
+ * Performs same function as qio_channel_readv_all_eof.
36
+ * Additionally, attempts to read file descriptors shared
37
+ * over the channel. The function will wait for all
38
+ * requested data to be read, yielding from the current
39
+ * coroutine if required. Data refers to both file
40
+ * descriptors and the iovs.
41
+ *
42
+ * Returns: 1 if all bytes were read, 0 if end-of-file
43
+ * occurs without data, or -1 on error
44
+ */
45
+
46
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
47
+ const struct iovec *iov,
48
+ size_t niov,
49
+ int **fds, size_t *nfds,
50
+ Error **errp);
51
+
52
+/**
53
+ * qio_channel_readv_full_all:
54
+ * @ioc: the channel object
55
+ * @iov: the array of memory regions to read data to
56
+ * @niov: the length of the @iov array
57
+ * @fds: an array of file handles to read
58
+ * @nfds: number of file handles in @fds
59
+ * @errp: pointer to a NULL-initialized error object
60
+ *
61
+ *
62
+ * Performs same function as qio_channel_readv_all.
63
+ * Additionally, attempts to read file descriptors shared
64
+ * over the channel. The function will wait for all
65
+ * requested data to be read, yielding from the current
66
+ * coroutine if required. Data refers to both file
67
+ * descriptors and the iovs.
68
+ *
69
+ * Returns: 0 if all bytes were read, or -1 on error
70
+ */
71
+
72
+int qio_channel_readv_full_all(QIOChannel *ioc,
73
+ const struct iovec *iov,
74
+ size_t niov,
75
+ int **fds, size_t *nfds,
76
+ Error **errp);
77
+
78
/**
79
* qio_channel_writev_full_all:
80
* @ioc: the channel object
81
diff --git a/io/channel.c b/io/channel.c
82
index XXXXXXX..XXXXXXX 100644
83
--- a/io/channel.c
84
+++ b/io/channel.c
85
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
86
const struct iovec *iov,
87
size_t niov,
88
Error **errp)
89
+{
90
+ return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
91
+}
92
+
93
+int qio_channel_readv_all(QIOChannel *ioc,
94
+ const struct iovec *iov,
95
+ size_t niov,
96
+ Error **errp)
97
+{
98
+ return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp);
99
+}
100
+
101
+int qio_channel_readv_full_all_eof(QIOChannel *ioc,
102
+ const struct iovec *iov,
103
+ size_t niov,
104
+ int **fds, size_t *nfds,
105
+ Error **errp)
106
{
107
int ret = -1;
108
struct iovec *local_iov = g_new(struct iovec, niov);
109
struct iovec *local_iov_head = local_iov;
110
unsigned int nlocal_iov = niov;
111
+ int **local_fds = fds;
112
+ size_t *local_nfds = nfds;
113
bool partial = false;
114
115
+ if (nfds) {
116
+ *nfds = 0;
117
+ }
118
+
119
+ if (fds) {
120
+ *fds = NULL;
121
+ }
122
+
123
nlocal_iov = iov_copy(local_iov, nlocal_iov,
124
iov, niov,
125
0, iov_size(iov, niov));
126
127
- while (nlocal_iov > 0) {
128
+ while ((nlocal_iov > 0) || local_fds) {
129
ssize_t len;
130
- len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp);
131
+ len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
132
+ local_nfds, errp);
133
if (len == QIO_CHANNEL_ERR_BLOCK) {
134
if (qemu_in_coroutine()) {
135
qio_channel_yield(ioc, G_IO_IN);
136
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
137
qio_channel_wait(ioc, G_IO_IN);
138
}
139
continue;
140
- } else if (len < 0) {
141
- goto cleanup;
142
- } else if (len == 0) {
143
- if (partial) {
144
- error_setg(errp,
145
- "Unexpected end-of-file before all bytes were read");
146
- } else {
147
+ }
148
+
149
+ if (len == 0) {
150
+ if (local_nfds && *local_nfds) {
151
+ /*
152
+ * Got some FDs, but no data yet. This isn't an EOF
153
+ * scenario (yet), so carry on to try to read data
154
+ * on next loop iteration
155
+ */
156
+ goto next_iter;
157
+ } else if (!partial) {
158
+ /* No fds and no data - EOF before any data read */
159
ret = 0;
160
+ goto cleanup;
161
+ } else {
162
+ len = -1;
163
+ error_setg(errp,
164
+ "Unexpected end-of-file before all data were read");
165
+ /* Fallthrough into len < 0 handling */
166
+ }
167
+ }
168
+
169
+ if (len < 0) {
170
+ /* Close any FDs we previously received */
171
+ if (nfds && fds) {
172
+ size_t i;
173
+ for (i = 0; i < (*nfds); i++) {
174
+ close((*fds)[i]);
175
+ }
176
+ g_free(*fds);
177
+ *fds = NULL;
178
+ *nfds = 0;
179
}
180
goto cleanup;
181
}
182
183
+ if (nlocal_iov) {
184
+ iov_discard_front(&local_iov, &nlocal_iov, len);
185
+ }
186
+
187
+next_iter:
188
partial = true;
189
- iov_discard_front(&local_iov, &nlocal_iov, len);
190
+ local_fds = NULL;
191
+ local_nfds = NULL;
192
}
193
194
ret = 1;
195
@@ -XXX,XX +XXX,XX @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
196
return ret;
197
}
198
199
-int qio_channel_readv_all(QIOChannel *ioc,
200
- const struct iovec *iov,
201
- size_t niov,
202
- Error **errp)
203
+int qio_channel_readv_full_all(QIOChannel *ioc,
204
+ const struct iovec *iov,
205
+ size_t niov,
206
+ int **fds, size_t *nfds,
207
+ Error **errp)
208
{
209
- int ret = qio_channel_readv_all_eof(ioc, iov, niov, errp);
210
+ int ret = qio_channel_readv_full_all_eof(ioc, iov, niov, fds, nfds, errp);
211
212
if (ret == 0) {
213
- ret = -1;
214
- error_setg(errp,
215
- "Unexpected end-of-file before all bytes were read");
216
- } else if (ret == 1) {
217
- ret = 0;
218
+ error_prepend(errp,
219
+ "Unexpected end-of-file before all data were read.");
220
+ return -1;
221
}
222
+ if (ret == 1) {
223
+ return 0;
224
+ }
225
+
226
return ret;
227
}
228
229
--
230
2.29.2
231
diff view generated by jsdifflib
New patch
1
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
3
Defines MPQemuMsg, which is the message that is sent to the remote
4
process. This message is sent over QIOChannel and is used to
5
command the remote process to perform various tasks.
6
Define transmission functions used by proxy and by remote.
7
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
10
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 56ca8bcf95195b2b195b08f6b9565b6d7410bce5.1611938319.git.jag.raman@oracle.com
13
14
[Replace struct iovec send[2] = {0} with {} to make clang happy as
15
suggested by Peter Maydell <peter.maydell@linaro.org>.
16
--Stefan]
17
18
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
19
---
20
MAINTAINERS | 2 +
21
meson.build | 1 +
22
hw/remote/trace.h | 1 +
23
include/hw/remote/mpqemu-link.h | 63 ++++++++++
24
include/sysemu/iothread.h | 6 +
25
hw/remote/mpqemu-link.c | 205 ++++++++++++++++++++++++++++++++
26
iothread.c | 6 +
27
hw/remote/meson.build | 1 +
28
hw/remote/trace-events | 4 +
29
9 files changed, 289 insertions(+)
30
create mode 100644 hw/remote/trace.h
31
create mode 100644 include/hw/remote/mpqemu-link.h
32
create mode 100644 hw/remote/mpqemu-link.c
33
create mode 100644 hw/remote/trace-events
34
35
diff --git a/MAINTAINERS b/MAINTAINERS
36
index XXXXXXX..XXXXXXX 100644
37
--- a/MAINTAINERS
38
+++ b/MAINTAINERS
39
@@ -XXX,XX +XXX,XX @@ F: hw/pci-host/remote.c
40
F: include/hw/pci-host/remote.h
41
F: hw/remote/machine.c
42
F: include/hw/remote/machine.h
43
+F: hw/remote/mpqemu-link.c
44
+F: include/hw/remote/mpqemu-link.h
45
46
Build and test automation
47
-------------------------
48
diff --git a/meson.build b/meson.build
49
index XXXXXXX..XXXXXXX 100644
50
--- a/meson.build
51
+++ b/meson.build
52
@@ -XXX,XX +XXX,XX @@ if have_system
53
'net',
54
'softmmu',
55
'ui',
56
+ 'hw/remote',
57
]
58
endif
59
trace_events_subdirs += [
60
diff --git a/hw/remote/trace.h b/hw/remote/trace.h
61
new file mode 100644
62
index XXXXXXX..XXXXXXX
63
--- /dev/null
64
+++ b/hw/remote/trace.h
65
@@ -0,0 +1 @@
66
+#include "trace/trace-hw_remote.h"
67
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
68
new file mode 100644
69
index XXXXXXX..XXXXXXX
70
--- /dev/null
71
+++ b/include/hw/remote/mpqemu-link.h
72
@@ -XXX,XX +XXX,XX @@
73
+/*
74
+ * Communication channel between QEMU and remote device process
75
+ *
76
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
77
+ *
78
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
79
+ * See the COPYING file in the top-level directory.
80
+ *
81
+ */
82
+
83
+#ifndef MPQEMU_LINK_H
84
+#define MPQEMU_LINK_H
85
+
86
+#include "qom/object.h"
87
+#include "qemu/thread.h"
88
+#include "io/channel.h"
89
+
90
+#define REMOTE_MAX_FDS 8
91
+
92
+#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data.u64)
93
+
94
+/**
95
+ * MPQemuCmd:
96
+ *
97
+ * MPQemuCmd enum type to specify the command to be executed on the remote
98
+ * device.
99
+ *
100
+ * This uses a private protocol between QEMU and the remote process. vfio-user
101
+ * protocol would supersede this in the future.
102
+ *
103
+ */
104
+typedef enum {
105
+ MPQEMU_CMD_MAX,
106
+} MPQemuCmd;
107
+
108
+/**
109
+ * MPQemuMsg:
110
+ * @cmd: The remote command
111
+ * @size: Size of the data to be shared
112
+ * @data: Structured data
113
+ * @fds: File descriptors to be shared with remote device
114
+ *
115
+ * MPQemuMsg Format of the message sent to the remote device from QEMU.
116
+ *
117
+ */
118
+typedef struct {
119
+ int cmd;
120
+ size_t size;
121
+
122
+ union {
123
+ uint64_t u64;
124
+ } data;
125
+
126
+ int fds[REMOTE_MAX_FDS];
127
+ int num_fds;
128
+} MPQemuMsg;
129
+
130
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
131
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
132
+
133
+bool mpqemu_msg_valid(MPQemuMsg *msg);
134
+
135
+#endif
136
diff --git a/include/sysemu/iothread.h b/include/sysemu/iothread.h
137
index XXXXXXX..XXXXXXX 100644
138
--- a/include/sysemu/iothread.h
139
+++ b/include/sysemu/iothread.h
140
@@ -XXX,XX +XXX,XX @@ IOThread *iothread_create(const char *id, Error **errp);
141
void iothread_stop(IOThread *iothread);
142
void iothread_destroy(IOThread *iothread);
143
144
+/*
145
+ * Returns true if executing withing IOThread context,
146
+ * false otherwise.
147
+ */
148
+bool qemu_in_iothread(void);
149
+
150
#endif /* IOTHREAD_H */
151
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
152
new file mode 100644
153
index XXXXXXX..XXXXXXX
154
--- /dev/null
155
+++ b/hw/remote/mpqemu-link.c
156
@@ -XXX,XX +XXX,XX @@
157
+/*
158
+ * Communication channel between QEMU and remote device process
159
+ *
160
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
161
+ *
162
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
163
+ * See the COPYING file in the top-level directory.
164
+ *
165
+ */
166
+
167
+#include "qemu/osdep.h"
168
+#include "qemu-common.h"
169
+
170
+#include "qemu/module.h"
171
+#include "hw/remote/mpqemu-link.h"
172
+#include "qapi/error.h"
173
+#include "qemu/iov.h"
174
+#include "qemu/error-report.h"
175
+#include "qemu/main-loop.h"
176
+#include "io/channel.h"
177
+#include "sysemu/iothread.h"
178
+#include "trace.h"
179
+
180
+/*
181
+ * Send message over the ioc QIOChannel.
182
+ * This function is safe to call from:
183
+ * - main loop in co-routine context. Will block the main loop if not in
184
+ * co-routine context;
185
+ * - vCPU thread with no co-routine context and if the channel is not part
186
+ * of the main loop handling;
187
+ * - IOThread within co-routine context, outside of co-routine context
188
+ * will block IOThread;
189
+ * Returns true if no errors were encountered, false otherwise.
190
+ */
191
+bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
192
+{
193
+ ERRP_GUARD();
194
+ bool iolock = qemu_mutex_iothread_locked();
195
+ bool iothread = qemu_in_iothread();
196
+ struct iovec send[2] = {};
197
+ int *fds = NULL;
198
+ size_t nfds = 0;
199
+ bool ret = false;
200
+
201
+ send[0].iov_base = msg;
202
+ send[0].iov_len = MPQEMU_MSG_HDR_SIZE;
203
+
204
+ send[1].iov_base = (void *)&msg->data;
205
+ send[1].iov_len = msg->size;
206
+
207
+ if (msg->num_fds) {
208
+ nfds = msg->num_fds;
209
+ fds = msg->fds;
210
+ }
211
+
212
+ /*
213
+ * Don't use in IOThread out of co-routine context as
214
+ * it will block IOThread.
215
+ */
216
+ assert(qemu_in_coroutine() || !iothread);
217
+
218
+ /*
219
+ * Skip unlocking/locking iothread lock when the IOThread is running
220
+ * in co-routine context. Co-routine context is asserted above
221
+ * for IOThread case.
222
+ * Also skip lock handling while in a co-routine in the main context.
223
+ */
224
+ if (iolock && !iothread && !qemu_in_coroutine()) {
225
+ qemu_mutex_unlock_iothread();
226
+ }
227
+
228
+ if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send),
229
+ fds, nfds, errp)) {
230
+ ret = true;
231
+ } else {
232
+ trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds);
233
+ }
234
+
235
+ if (iolock && !iothread && !qemu_in_coroutine()) {
236
+ /* See above comment why skip locking here. */
237
+ qemu_mutex_lock_iothread();
238
+ }
239
+
240
+ return ret;
241
+}
242
+
243
+/*
244
+ * Read message from the ioc QIOChannel.
245
+ * This function is safe to call from:
246
+ * - From main loop in co-routine context. Will block the main loop if not in
247
+ * co-routine context;
248
+ * - From vCPU thread with no co-routine context and if the channel is not part
249
+ * of the main loop handling;
250
+ * - From IOThread within co-routine context, outside of co-routine context
251
+ * will block IOThread;
252
+ */
253
+static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
254
+ size_t *nfds, Error **errp)
255
+{
256
+ ERRP_GUARD();
257
+ struct iovec iov = { .iov_base = buf, .iov_len = len };
258
+ bool iolock = qemu_mutex_iothread_locked();
259
+ bool iothread = qemu_in_iothread();
260
+ int ret = -1;
261
+
262
+ /*
263
+ * Don't use in IOThread out of co-routine context as
264
+ * it will block IOThread.
265
+ */
266
+ assert(qemu_in_coroutine() || !iothread);
267
+
268
+ if (iolock && !iothread && !qemu_in_coroutine()) {
269
+ qemu_mutex_unlock_iothread();
270
+ }
271
+
272
+ ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp);
273
+
274
+ if (iolock && !iothread && !qemu_in_coroutine()) {
275
+ qemu_mutex_lock_iothread();
276
+ }
277
+
278
+ return (ret <= 0) ? ret : iov.iov_len;
279
+}
280
+
281
+bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
282
+{
283
+ ERRP_GUARD();
284
+ g_autofree int *fds = NULL;
285
+ size_t nfds = 0;
286
+ ssize_t len;
287
+ bool ret = false;
288
+
289
+ len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp);
290
+ if (len <= 0) {
291
+ goto fail;
292
+ } else if (len != MPQEMU_MSG_HDR_SIZE) {
293
+ error_setg(errp, "Message header corrupted");
294
+ goto fail;
295
+ }
296
+
297
+ if (msg->size > sizeof(msg->data)) {
298
+ error_setg(errp, "Invalid size for message");
299
+ goto fail;
300
+ }
301
+
302
+ if (!msg->size) {
303
+ goto copy_fds;
304
+ }
305
+
306
+ len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp);
307
+ if (len <= 0) {
308
+ goto fail;
309
+ }
310
+ if (len != msg->size) {
311
+ error_setg(errp, "Unable to read full message");
312
+ goto fail;
313
+ }
314
+
315
+copy_fds:
316
+ msg->num_fds = nfds;
317
+ if (nfds > G_N_ELEMENTS(msg->fds)) {
318
+ error_setg(errp,
319
+ "Overflow error: received %zu fds, more than max of %d fds",
320
+ nfds, REMOTE_MAX_FDS);
321
+ goto fail;
322
+ }
323
+ if (nfds) {
324
+ memcpy(msg->fds, fds, nfds * sizeof(int));
325
+ }
326
+
327
+ ret = true;
328
+
329
+fail:
330
+ if (*errp) {
331
+ trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds);
332
+ }
333
+ while (*errp && nfds) {
334
+ close(fds[nfds - 1]);
335
+ nfds--;
336
+ }
337
+
338
+ return ret;
339
+}
340
+
341
+bool mpqemu_msg_valid(MPQemuMsg *msg)
342
+{
343
+ if (msg->cmd >= MPQEMU_CMD_MAX && msg->cmd < 0) {
344
+ return false;
345
+ }
346
+
347
+ /* Verify FDs. */
348
+ if (msg->num_fds >= REMOTE_MAX_FDS) {
349
+ return false;
350
+ }
351
+
352
+ if (msg->num_fds > 0) {
353
+ for (int i = 0; i < msg->num_fds; i++) {
354
+ if (fcntl(msg->fds[i], F_GETFL) == -1) {
355
+ return false;
356
+ }
357
+ }
358
+ }
359
+
360
+ return true;
361
+}
362
diff --git a/iothread.c b/iothread.c
363
index XXXXXXX..XXXXXXX 100644
364
--- a/iothread.c
365
+++ b/iothread.c
366
@@ -XXX,XX +XXX,XX @@ IOThread *iothread_by_id(const char *id)
367
{
368
return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL));
369
}
370
+
371
+bool qemu_in_iothread(void)
372
+{
373
+ return qemu_get_current_aio_context() == qemu_get_aio_context() ?
374
+ false : true;
375
+}
376
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
377
index XXXXXXX..XXXXXXX 100644
378
--- a/hw/remote/meson.build
379
+++ b/hw/remote/meson.build
380
@@ -XXX,XX +XXX,XX @@
381
remote_ss = ss.source_set()
382
383
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
384
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
385
386
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
387
diff --git a/hw/remote/trace-events b/hw/remote/trace-events
388
new file mode 100644
389
index XXXXXXX..XXXXXXX
390
--- /dev/null
391
+++ b/hw/remote/trace-events
392
@@ -XXX,XX +XXX,XX @@
393
+# multi-process trace events
394
+
395
+mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
396
+mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process"
397
--
398
2.29.2
399
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
Initializes the message handler function in the remote process. It is
4
called whenever there's an event pending on QIOChannel that registers
5
this function.
6
7
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Message-id: 99d38d8b93753a6409ac2340e858858cda59ab1b.1611938319.git.jag.raman@oracle.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
MAINTAINERS | 1 +
15
include/hw/remote/machine.h | 9 ++++++
16
hw/remote/message.c | 57 +++++++++++++++++++++++++++++++++++++
17
hw/remote/meson.build | 1 +
18
4 files changed, 68 insertions(+)
19
create mode 100644 hw/remote/message.c
20
21
diff --git a/MAINTAINERS b/MAINTAINERS
22
index XXXXXXX..XXXXXXX 100644
23
--- a/MAINTAINERS
24
+++ b/MAINTAINERS
25
@@ -XXX,XX +XXX,XX @@ F: hw/remote/machine.c
26
F: include/hw/remote/machine.h
27
F: hw/remote/mpqemu-link.c
28
F: include/hw/remote/mpqemu-link.h
29
+F: hw/remote/message.c
30
31
Build and test automation
32
-------------------------
33
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
34
index XXXXXXX..XXXXXXX 100644
35
--- a/include/hw/remote/machine.h
36
+++ b/include/hw/remote/machine.h
37
@@ -XXX,XX +XXX,XX @@
38
#include "qom/object.h"
39
#include "hw/boards.h"
40
#include "hw/pci-host/remote.h"
41
+#include "io/channel.h"
42
43
struct RemoteMachineState {
44
MachineState parent_obj;
45
@@ -XXX,XX +XXX,XX @@ struct RemoteMachineState {
46
RemotePCIHost *host;
47
};
48
49
+/* Used to pass the device and ioc to the co-routine. */
50
+typedef struct RemoteCommDev {
51
+ PCIDevice *dev;
52
+ QIOChannel *ioc;
53
+} RemoteCommDev;
54
+
55
#define TYPE_REMOTE_MACHINE "x-remote-machine"
56
OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
57
58
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data);
59
+
60
#endif
61
diff --git a/hw/remote/message.c b/hw/remote/message.c
62
new file mode 100644
63
index XXXXXXX..XXXXXXX
64
--- /dev/null
65
+++ b/hw/remote/message.c
66
@@ -XXX,XX +XXX,XX @@
67
+/*
68
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
69
+ *
70
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
71
+ *
72
+ * See the COPYING file in the top-level directory.
73
+ *
74
+ */
75
+
76
+#include "qemu/osdep.h"
77
+#include "qemu-common.h"
78
+
79
+#include "hw/remote/machine.h"
80
+#include "io/channel.h"
81
+#include "hw/remote/mpqemu-link.h"
82
+#include "qapi/error.h"
83
+#include "sysemu/runstate.h"
84
+
85
+void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
86
+{
87
+ g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
88
+ PCIDevice *pci_dev = NULL;
89
+ Error *local_err = NULL;
90
+
91
+ assert(com->ioc);
92
+
93
+ pci_dev = com->dev;
94
+ for (; !local_err;) {
95
+ MPQemuMsg msg = {0};
96
+
97
+ if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
98
+ break;
99
+ }
100
+
101
+ if (!mpqemu_msg_valid(&msg)) {
102
+ error_setg(&local_err, "Received invalid message from proxy"
103
+ "in remote process pid="FMT_pid"",
104
+ getpid());
105
+ break;
106
+ }
107
+
108
+ switch (msg.cmd) {
109
+ default:
110
+ error_setg(&local_err,
111
+ "Unknown command (%d) received for device %s"
112
+ " (pid="FMT_pid")",
113
+ msg.cmd, DEVICE(pci_dev)->id, getpid());
114
+ }
115
+ }
116
+
117
+ if (local_err) {
118
+ error_report_err(local_err);
119
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
120
+ } else {
121
+ qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
122
+ }
123
+}
124
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
125
index XXXXXXX..XXXXXXX 100644
126
--- a/hw/remote/meson.build
127
+++ b/hw/remote/meson.build
128
@@ -XXX,XX +XXX,XX @@ remote_ss = ss.source_set()
129
130
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
131
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
132
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
133
134
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
135
--
136
2.29.2
137
diff view generated by jsdifflib
New patch
1
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
3
Associate the file descriptor for a PCIDevice in remote process with
4
DeviceState object.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: f405a2ed5d7518b87bea7c59cfdf334d67e5ee51.1611938319.git.jag.raman@oracle.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
MAINTAINERS | 1 +
14
hw/remote/remote-obj.c | 203 +++++++++++++++++++++++++++++++++++++++++
15
hw/remote/meson.build | 1 +
16
3 files changed, 205 insertions(+)
17
create mode 100644 hw/remote/remote-obj.c
18
19
diff --git a/MAINTAINERS b/MAINTAINERS
20
index XXXXXXX..XXXXXXX 100644
21
--- a/MAINTAINERS
22
+++ b/MAINTAINERS
23
@@ -XXX,XX +XXX,XX @@ F: include/hw/remote/machine.h
24
F: hw/remote/mpqemu-link.c
25
F: include/hw/remote/mpqemu-link.h
26
F: hw/remote/message.c
27
+F: hw/remote/remote-obj.c
28
29
Build and test automation
30
-------------------------
31
diff --git a/hw/remote/remote-obj.c b/hw/remote/remote-obj.c
32
new file mode 100644
33
index XXXXXXX..XXXXXXX
34
--- /dev/null
35
+++ b/hw/remote/remote-obj.c
36
@@ -XXX,XX +XXX,XX @@
37
+/*
38
+ * Copyright © 2020, 2021 Oracle and/or its affiliates.
39
+ *
40
+ * This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
41
+ *
42
+ * See the COPYING file in the top-level directory.
43
+ *
44
+ */
45
+
46
+#include "qemu/osdep.h"
47
+#include "qemu-common.h"
48
+
49
+#include "qemu/error-report.h"
50
+#include "qemu/notify.h"
51
+#include "qom/object_interfaces.h"
52
+#include "hw/qdev-core.h"
53
+#include "io/channel.h"
54
+#include "hw/qdev-core.h"
55
+#include "hw/remote/machine.h"
56
+#include "io/channel-util.h"
57
+#include "qapi/error.h"
58
+#include "sysemu/sysemu.h"
59
+#include "hw/pci/pci.h"
60
+#include "qemu/sockets.h"
61
+#include "monitor/monitor.h"
62
+
63
+#define TYPE_REMOTE_OBJECT "x-remote-object"
64
+OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)
65
+
66
+struct RemoteObjectClass {
67
+ ObjectClass parent_class;
68
+
69
+ unsigned int nr_devs;
70
+ unsigned int max_devs;
71
+};
72
+
73
+struct RemoteObject {
74
+ /* private */
75
+ Object parent;
76
+
77
+ Notifier machine_done;
78
+
79
+ int32_t fd;
80
+ char *devid;
81
+
82
+ QIOChannel *ioc;
83
+
84
+ DeviceState *dev;
85
+ DeviceListener listener;
86
+};
87
+
88
+static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
89
+{
90
+ RemoteObject *o = REMOTE_OBJECT(obj);
91
+ int fd = -1;
92
+
93
+ fd = monitor_fd_param(monitor_cur(), str, errp);
94
+ if (fd == -1) {
95
+ error_prepend(errp, "Could not parse remote object fd %s:", str);
96
+ return;
97
+ }
98
+
99
+ if (!fd_is_socket(fd)) {
100
+ error_setg(errp, "File descriptor '%s' is not a socket", str);
101
+ close(fd);
102
+ return;
103
+ }
104
+
105
+ o->fd = fd;
106
+}
107
+
108
+static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
109
+{
110
+ RemoteObject *o = REMOTE_OBJECT(obj);
111
+
112
+ g_free(o->devid);
113
+
114
+ o->devid = g_strdup(str);
115
+}
116
+
117
+static void remote_object_unrealize_listener(DeviceListener *listener,
118
+ DeviceState *dev)
119
+{
120
+ RemoteObject *o = container_of(listener, RemoteObject, listener);
121
+
122
+ if (o->dev == dev) {
123
+ object_unref(OBJECT(o));
124
+ }
125
+}
126
+
127
+static void remote_object_machine_done(Notifier *notifier, void *data)
128
+{
129
+ RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
130
+ DeviceState *dev = NULL;
131
+ QIOChannel *ioc = NULL;
132
+ Coroutine *co = NULL;
133
+ RemoteCommDev *comdev = NULL;
134
+ Error *err = NULL;
135
+
136
+ dev = qdev_find_recursive(sysbus_get_default(), o->devid);
137
+ if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
138
+ error_report("%s is not a PCI device", o->devid);
139
+ return;
140
+ }
141
+
142
+ ioc = qio_channel_new_fd(o->fd, &err);
143
+ if (!ioc) {
144
+ error_report_err(err);
145
+ return;
146
+ }
147
+ qio_channel_set_blocking(ioc, false, NULL);
148
+
149
+ o->dev = dev;
150
+
151
+ o->listener.unrealize = remote_object_unrealize_listener;
152
+ device_listener_register(&o->listener);
153
+
154
+ /* co-routine should free this. */
155
+ comdev = g_new0(RemoteCommDev, 1);
156
+ *comdev = (RemoteCommDev) {
157
+ .ioc = ioc,
158
+ .dev = PCI_DEVICE(dev),
159
+ };
160
+
161
+ co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev);
162
+ qemu_coroutine_enter(co);
163
+}
164
+
165
+static void remote_object_init(Object *obj)
166
+{
167
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
168
+ RemoteObject *o = REMOTE_OBJECT(obj);
169
+
170
+ if (k->nr_devs >= k->max_devs) {
171
+ error_report("Reached maximum number of devices: %u", k->max_devs);
172
+ return;
173
+ }
174
+
175
+ o->ioc = NULL;
176
+ o->fd = -1;
177
+ o->devid = NULL;
178
+
179
+ k->nr_devs++;
180
+
181
+ o->machine_done.notify = remote_object_machine_done;
182
+ qemu_add_machine_init_done_notifier(&o->machine_done);
183
+}
184
+
185
+static void remote_object_finalize(Object *obj)
186
+{
187
+ RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
188
+ RemoteObject *o = REMOTE_OBJECT(obj);
189
+
190
+ device_listener_unregister(&o->listener);
191
+
192
+ if (o->ioc) {
193
+ qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
194
+ qio_channel_close(o->ioc, NULL);
195
+ }
196
+
197
+ object_unref(OBJECT(o->ioc));
198
+
199
+ k->nr_devs--;
200
+ g_free(o->devid);
201
+}
202
+
203
+static void remote_object_class_init(ObjectClass *klass, void *data)
204
+{
205
+ RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass);
206
+
207
+ /*
208
+ * Limit number of supported devices to 1. This is done to avoid devices
209
+ * from one VM accessing the RAM of another VM. This is done until we
210
+ * start using separate address spaces for individual devices.
211
+ */
212
+ k->max_devs = 1;
213
+ k->nr_devs = 0;
214
+
215
+ object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
216
+ object_class_property_add_str(klass, "devid", NULL,
217
+ remote_object_set_devid);
218
+}
219
+
220
+static const TypeInfo remote_object_info = {
221
+ .name = TYPE_REMOTE_OBJECT,
222
+ .parent = TYPE_OBJECT,
223
+ .instance_size = sizeof(RemoteObject),
224
+ .instance_init = remote_object_init,
225
+ .instance_finalize = remote_object_finalize,
226
+ .class_size = sizeof(RemoteObjectClass),
227
+ .class_init = remote_object_class_init,
228
+ .interfaces = (InterfaceInfo[]) {
229
+ { TYPE_USER_CREATABLE },
230
+ { }
231
+ }
232
+};
233
+
234
+static void register_types(void)
235
+{
236
+ type_register_static(&remote_object_info);
237
+}
238
+
239
+type_init(register_types);
240
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
241
index XXXXXXX..XXXXXXX 100644
242
--- a/hw/remote/meson.build
243
+++ b/hw/remote/meson.build
244
@@ -XXX,XX +XXX,XX @@ remote_ss = ss.source_set()
245
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
246
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
247
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
248
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
249
250
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
251
--
252
2.29.2
253
diff view generated by jsdifflib
New patch
1
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
3
SyncSysmemMsg message format is defined. It is used to send
4
file descriptors of the RAM regions to remote device.
5
RAM on the remote device is configured with a set of file descriptors.
6
Old RAM regions are deleted and new regions, each with an fd, are
7
added to the RAM.
8
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
11
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: 7d2d1831d812e85f681e7a8ab99e032cf4704689.1611938319.git.jag.raman@oracle.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
16
MAINTAINERS | 2 +
17
include/hw/remote/memory.h | 19 ++++++++++
18
include/hw/remote/mpqemu-link.h | 10 +++++
19
hw/remote/memory.c | 65 +++++++++++++++++++++++++++++++++
20
hw/remote/mpqemu-link.c | 11 ++++++
21
hw/remote/meson.build | 2 +
22
6 files changed, 109 insertions(+)
23
create mode 100644 include/hw/remote/memory.h
24
create mode 100644 hw/remote/memory.c
25
26
diff --git a/MAINTAINERS b/MAINTAINERS
27
index XXXXXXX..XXXXXXX 100644
28
--- a/MAINTAINERS
29
+++ b/MAINTAINERS
30
@@ -XXX,XX +XXX,XX @@ F: hw/remote/mpqemu-link.c
31
F: include/hw/remote/mpqemu-link.h
32
F: hw/remote/message.c
33
F: hw/remote/remote-obj.c
34
+F: include/hw/remote/memory.h
35
+F: hw/remote/memory.c
36
37
Build and test automation
38
-------------------------
39
diff --git a/include/hw/remote/memory.h b/include/hw/remote/memory.h
40
new file mode 100644
41
index XXXXXXX..XXXXXXX
42
--- /dev/null
43
+++ b/include/hw/remote/memory.h
44
@@ -XXX,XX +XXX,XX @@
45
+/*
46
+ * Memory manager for remote device
47
+ *
48
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
49
+ *
50
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
51
+ * See the COPYING file in the top-level directory.
52
+ *
53
+ */
54
+
55
+#ifndef REMOTE_MEMORY_H
56
+#define REMOTE_MEMORY_H
57
+
58
+#include "exec/hwaddr.h"
59
+#include "hw/remote/mpqemu-link.h"
60
+
61
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp);
62
+
63
+#endif
64
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
65
index XXXXXXX..XXXXXXX 100644
66
--- a/include/hw/remote/mpqemu-link.h
67
+++ b/include/hw/remote/mpqemu-link.h
68
@@ -XXX,XX +XXX,XX @@
69
#include "qom/object.h"
70
#include "qemu/thread.h"
71
#include "io/channel.h"
72
+#include "exec/hwaddr.h"
73
74
#define REMOTE_MAX_FDS 8
75
76
@@ -XXX,XX +XXX,XX @@
77
*
78
*/
79
typedef enum {
80
+ MPQEMU_CMD_SYNC_SYSMEM,
81
MPQEMU_CMD_MAX,
82
} MPQemuCmd;
83
84
+typedef struct {
85
+ hwaddr gpas[REMOTE_MAX_FDS];
86
+ uint64_t sizes[REMOTE_MAX_FDS];
87
+ off_t offsets[REMOTE_MAX_FDS];
88
+} SyncSysmemMsg;
89
+
90
/**
91
* MPQemuMsg:
92
* @cmd: The remote command
93
@@ -XXX,XX +XXX,XX @@ typedef enum {
94
* MPQemuMsg Format of the message sent to the remote device from QEMU.
95
*
96
*/
97
+
98
typedef struct {
99
int cmd;
100
size_t size;
101
102
union {
103
uint64_t u64;
104
+ SyncSysmemMsg sync_sysmem;
105
} data;
106
107
int fds[REMOTE_MAX_FDS];
108
diff --git a/hw/remote/memory.c b/hw/remote/memory.c
109
new file mode 100644
110
index XXXXXXX..XXXXXXX
111
--- /dev/null
112
+++ b/hw/remote/memory.c
113
@@ -XXX,XX +XXX,XX @@
114
+/*
115
+ * Memory manager for remote device
116
+ *
117
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
118
+ *
119
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
120
+ * See the COPYING file in the top-level directory.
121
+ *
122
+ */
123
+
124
+#include "qemu/osdep.h"
125
+#include "qemu-common.h"
126
+
127
+#include "hw/remote/memory.h"
128
+#include "exec/address-spaces.h"
129
+#include "exec/ram_addr.h"
130
+#include "qapi/error.h"
131
+
132
+static void remote_sysmem_reset(void)
133
+{
134
+ MemoryRegion *sysmem, *subregion, *next;
135
+
136
+ sysmem = get_system_memory();
137
+
138
+ QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) {
139
+ if (subregion->ram) {
140
+ memory_region_del_subregion(sysmem, subregion);
141
+ object_unparent(OBJECT(subregion));
142
+ }
143
+ }
144
+}
145
+
146
+void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
147
+{
148
+ ERRP_GUARD();
149
+ SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem;
150
+ MemoryRegion *sysmem, *subregion;
151
+ static unsigned int suffix;
152
+ int region;
153
+
154
+ sysmem = get_system_memory();
155
+
156
+ remote_sysmem_reset();
157
+
158
+ for (region = 0; region < msg->num_fds; region++) {
159
+ g_autofree char *name;
160
+ subregion = g_new(MemoryRegion, 1);
161
+ name = g_strdup_printf("remote-mem-%u", suffix++);
162
+ memory_region_init_ram_from_fd(subregion, NULL,
163
+ name, sysmem_info->sizes[region],
164
+ true, msg->fds[region],
165
+ sysmem_info->offsets[region],
166
+ errp);
167
+
168
+ if (*errp) {
169
+ g_free(subregion);
170
+ remote_sysmem_reset();
171
+ return;
172
+ }
173
+
174
+ memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
175
+ subregion);
176
+
177
+ }
178
+}
179
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
180
index XXXXXXX..XXXXXXX 100644
181
--- a/hw/remote/mpqemu-link.c
182
+++ b/hw/remote/mpqemu-link.c
183
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
184
}
185
}
186
187
+ /* Verify message specific fields. */
188
+ switch (msg->cmd) {
189
+ case MPQEMU_CMD_SYNC_SYSMEM:
190
+ if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
191
+ return false;
192
+ }
193
+ break;
194
+ default:
195
+ break;
196
+ }
197
+
198
return true;
199
}
200
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
201
index XXXXXXX..XXXXXXX 100644
202
--- a/hw/remote/meson.build
203
+++ b/hw/remote/meson.build
204
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
205
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
206
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
207
208
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
209
+
210
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
211
--
212
2.29.2
213
diff view generated by jsdifflib
1
From: Kashyap Chamarthy <kchamart@redhat.com>
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
2
3
This is part of the on-going effort to convert QEMU upstream
3
Defines a PCI Device proxy object as a child of TYPE_PCI_DEVICE.
4
documentation syntax to reStructuredText (rST).
5
4
6
The conversion to rST was done using:
5
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
6
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
9
Message-id: b5186ebfedf8e557044d09a768846c59230ad3a7.1611938319.git.jag.raman@oracle.com
10
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
11
---
12
MAINTAINERS | 2 +
13
include/hw/remote/proxy.h | 33 +++++++++++++
14
hw/remote/proxy.c | 99 +++++++++++++++++++++++++++++++++++++++
15
hw/remote/meson.build | 1 +
16
4 files changed, 135 insertions(+)
17
create mode 100644 include/hw/remote/proxy.h
18
create mode 100644 hw/remote/proxy.c
7
19
8
$ pandoc -f markdown -t rst bitmaps.md -o bitmaps.rst
20
diff --git a/MAINTAINERS b/MAINTAINERS
9
21
index XXXXXXX..XXXXXXX 100644
10
Then, make a couple of small syntactical adjustments. While at it,
22
--- a/MAINTAINERS
11
reword a statement to avoid ambiguity. Addressing the feedback from
23
+++ b/MAINTAINERS
12
this thread:
24
@@ -XXX,XX +XXX,XX @@ F: hw/remote/message.c
13
25
F: hw/remote/remote-obj.c
14
https://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg05428.html
26
F: include/hw/remote/memory.h
15
27
F: hw/remote/memory.c
16
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
28
+F: hw/remote/proxy.c
17
Reviewed-by: John Snow <jsnow@redhat.com>
29
+F: include/hw/remote/proxy.h
18
Reviewed-by: Eric Blake <eblake@redhat.com>
30
19
Message-id: 20170717105205.32639-2-kchamart@redhat.com
31
Build and test automation
20
Signed-off-by: Jeff Cody <jcody@redhat.com>
32
-------------------------
21
---
33
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
22
docs/devel/bitmaps.md | 505 ------------------------------------------
23
docs/interop/bitmaps.rst | 555 +++++++++++++++++++++++++++++++++++++++++++++++
24
2 files changed, 555 insertions(+), 505 deletions(-)
25
delete mode 100644 docs/devel/bitmaps.md
26
create mode 100644 docs/interop/bitmaps.rst
27
28
diff --git a/docs/devel/bitmaps.md b/docs/devel/bitmaps.md
29
deleted file mode 100644
30
index XXXXXXX..XXXXXXX
31
--- a/docs/devel/bitmaps.md
32
+++ /dev/null
33
@@ -XXX,XX +XXX,XX @@
34
-<!--
35
-Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
36
-All rights reserved.
37
-
38
-This file is licensed via The FreeBSD Documentation License, the full text of
39
-which is included at the end of this document.
40
--->
41
-
42
-# Dirty Bitmaps and Incremental Backup
43
-
44
-* Dirty Bitmaps are objects that track which data needs to be backed up for the
45
- next incremental backup.
46
-
47
-* Dirty bitmaps can be created at any time and attached to any node
48
- (not just complete drives.)
49
-
50
-## Dirty Bitmap Names
51
-
52
-* A dirty bitmap's name is unique to the node, but bitmaps attached to different
53
- nodes can share the same name.
54
-
55
-* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
56
- name, but any user-created bitmaps may not be. There can be any number of
57
- anonymous bitmaps per node.
58
-
59
-* The name of a user-created bitmap must not be empty ("").
60
-
61
-## Bitmap Modes
62
-
63
-* A Bitmap can be "frozen," which means that it is currently in-use by a backup
64
- operation and cannot be deleted, renamed, written to, reset,
65
- etc.
66
-
67
-* The normal operating mode for a bitmap is "active."
68
-
69
-## Basic QMP Usage
70
-
71
-### Supported Commands ###
72
-
73
-* block-dirty-bitmap-add
74
-* block-dirty-bitmap-remove
75
-* block-dirty-bitmap-clear
76
-
77
-### Creation
78
-
79
-* To create a new bitmap, enabled, on the drive with id=drive0:
80
-
81
-```json
82
-{ "execute": "block-dirty-bitmap-add",
83
- "arguments": {
84
- "node": "drive0",
85
- "name": "bitmap0"
86
- }
87
-}
88
-```
89
-
90
-* This bitmap will have a default granularity that matches the cluster size of
91
- its associated drive, if available, clamped to between [4KiB, 64KiB].
92
- The current default for qcow2 is 64KiB.
93
-
94
-* To create a new bitmap that tracks changes in 32KiB segments:
95
-
96
-```json
97
-{ "execute": "block-dirty-bitmap-add",
98
- "arguments": {
99
- "node": "drive0",
100
- "name": "bitmap0",
101
- "granularity": 32768
102
- }
103
-}
104
-```
105
-
106
-### Deletion
107
-
108
-* Bitmaps that are frozen cannot be deleted.
109
-
110
-* Deleting the bitmap does not impact any other bitmaps attached to the same
111
- node, nor does it affect any backups already created from this node.
112
-
113
-* Because bitmaps are only unique to the node to which they are attached,
114
- you must specify the node/drive name here, too.
115
-
116
-```json
117
-{ "execute": "block-dirty-bitmap-remove",
118
- "arguments": {
119
- "node": "drive0",
120
- "name": "bitmap0"
121
- }
122
-}
123
-```
124
-
125
-### Resetting
126
-
127
-* Resetting a bitmap will clear all information it holds.
128
-
129
-* An incremental backup created from an empty bitmap will copy no data,
130
- as if nothing has changed.
131
-
132
-```json
133
-{ "execute": "block-dirty-bitmap-clear",
134
- "arguments": {
135
- "node": "drive0",
136
- "name": "bitmap0"
137
- }
138
-}
139
-```
140
-
141
-## Transactions
142
-
143
-### Justification
144
-
145
-Bitmaps can be safely modified when the VM is paused or halted by using
146
-the basic QMP commands. For instance, you might perform the following actions:
147
-
148
-1. Boot the VM in a paused state.
149
-2. Create a full drive backup of drive0.
150
-3. Create a new bitmap attached to drive0.
151
-4. Resume execution of the VM.
152
-5. Incremental backups are ready to be created.
153
-
154
-At this point, the bitmap and drive backup would be correctly in sync,
155
-and incremental backups made from this point forward would be correctly aligned
156
-to the full drive backup.
157
-
158
-This is not particularly useful if we decide we want to start incremental
159
-backups after the VM has been running for a while, for which we will need to
160
-perform actions such as the following:
161
-
162
-1. Boot the VM and begin execution.
163
-2. Using a single transaction, perform the following operations:
164
- * Create bitmap0.
165
- * Create a full drive backup of drive0.
166
-3. Incremental backups are now ready to be created.
167
-
168
-### Supported Bitmap Transactions
169
-
170
-* block-dirty-bitmap-add
171
-* block-dirty-bitmap-clear
172
-
173
-The usages are identical to their respective QMP commands, but see below
174
-for examples.
175
-
176
-### Example: New Incremental Backup
177
-
178
-As outlined in the justification, perhaps we want to create a new incremental
179
-backup chain attached to a drive.
180
-
181
-```json
182
-{ "execute": "transaction",
183
- "arguments": {
184
- "actions": [
185
- {"type": "block-dirty-bitmap-add",
186
- "data": {"node": "drive0", "name": "bitmap0"} },
187
- {"type": "drive-backup",
188
- "data": {"device": "drive0", "target": "/path/to/full_backup.img",
189
- "sync": "full", "format": "qcow2"} }
190
- ]
191
- }
192
-}
193
-```
194
-
195
-### Example: New Incremental Backup Anchor Point
196
-
197
-Maybe we just want to create a new full backup with an existing bitmap and
198
-want to reset the bitmap to track the new chain.
199
-
200
-```json
201
-{ "execute": "transaction",
202
- "arguments": {
203
- "actions": [
204
- {"type": "block-dirty-bitmap-clear",
205
- "data": {"node": "drive0", "name": "bitmap0"} },
206
- {"type": "drive-backup",
207
- "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
208
- "sync": "full", "format": "qcow2"} }
209
- ]
210
- }
211
-}
212
-```
213
-
214
-## Incremental Backups
215
-
216
-The star of the show.
217
-
218
-**Nota Bene!** Only incremental backups of entire drives are supported for now.
219
-So despite the fact that you can attach a bitmap to any arbitrary node, they are
220
-only currently useful when attached to the root node. This is because
221
-drive-backup only supports drives/devices instead of arbitrary nodes.
222
-
223
-### Example: First Incremental Backup
224
-
225
-1. Create a full backup and sync it to the dirty bitmap, as in the transactional
226
-examples above; or with the VM offline, manually create a full copy and then
227
-create a new bitmap before the VM begins execution.
228
-
229
- * Let's assume the full backup is named 'full_backup.img'.
230
- * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'.
231
-
232
-2. Create a destination image for the incremental backup that utilizes the
233
-full backup as a backing image.
234
-
235
- * Let's assume it is named 'incremental.0.img'.
236
-
237
- ```sh
238
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
239
- ```
240
-
241
-3. Issue the incremental backup command:
242
-
243
- ```json
244
- { "execute": "drive-backup",
245
- "arguments": {
246
- "device": "drive0",
247
- "bitmap": "bitmap0",
248
- "target": "incremental.0.img",
249
- "format": "qcow2",
250
- "sync": "incremental",
251
- "mode": "existing"
252
- }
253
- }
254
- ```
255
-
256
-### Example: Second Incremental Backup
257
-
258
-1. Create a new destination image for the incremental backup that points to the
259
- previous one, e.g.: 'incremental.1.img'
260
-
261
- ```sh
262
- # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
263
- ```
264
-
265
-2. Issue a new incremental backup command. The only difference here is that we
266
- have changed the target image below.
267
-
268
- ```json
269
- { "execute": "drive-backup",
270
- "arguments": {
271
- "device": "drive0",
272
- "bitmap": "bitmap0",
273
- "target": "incremental.1.img",
274
- "format": "qcow2",
275
- "sync": "incremental",
276
- "mode": "existing"
277
- }
278
- }
279
- ```
280
-
281
-## Errors
282
-
283
-* In the event of an error that occurs after a backup job is successfully
284
- launched, either by a direct QMP command or a QMP transaction, the user
285
- will receive a BLOCK_JOB_COMPLETE event with a failure message, accompanied
286
- by a BLOCK_JOB_ERROR event.
287
-
288
-* In the case of an event being cancelled, the user will receive a
289
- BLOCK_JOB_CANCELLED event instead of a pair of COMPLETE and ERROR events.
290
-
291
-* In either case, the incremental backup data contained within the bitmap is
292
- safely rolled back, and the data within the bitmap is not lost. The image
293
- file created for the failed attempt can be safely deleted.
294
-
295
-* Once the underlying problem is fixed (e.g. more storage space is freed up),
296
- you can simply retry the incremental backup command with the same bitmap.
297
-
298
-### Example
299
-
300
-1. Create a target image:
301
-
302
- ```sh
303
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
304
- ```
305
-
306
-2. Attempt to create an incremental backup via QMP:
307
-
308
- ```json
309
- { "execute": "drive-backup",
310
- "arguments": {
311
- "device": "drive0",
312
- "bitmap": "bitmap0",
313
- "target": "incremental.0.img",
314
- "format": "qcow2",
315
- "sync": "incremental",
316
- "mode": "existing"
317
- }
318
- }
319
- ```
320
-
321
-3. Receive an event notifying us of failure:
322
-
323
- ```json
324
- { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
325
- "data": { "speed": 0, "offset": 0, "len": 67108864,
326
- "error": "No space left on device",
327
- "device": "drive1", "type": "backup" },
328
- "event": "BLOCK_JOB_COMPLETED" }
329
- ```
330
-
331
-4. Delete the failed incremental, and re-create the image.
332
-
333
- ```sh
334
- # rm incremental.0.img
335
- # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
336
- ```
337
-
338
-5. Retry the command after fixing the underlying problem,
339
- such as freeing up space on the backup volume:
340
-
341
- ```json
342
- { "execute": "drive-backup",
343
- "arguments": {
344
- "device": "drive0",
345
- "bitmap": "bitmap0",
346
- "target": "incremental.0.img",
347
- "format": "qcow2",
348
- "sync": "incremental",
349
- "mode": "existing"
350
- }
351
- }
352
- ```
353
-
354
-6. Receive confirmation that the job completed successfully:
355
-
356
- ```json
357
- { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
358
- "data": { "device": "drive1", "type": "backup",
359
- "speed": 0, "len": 67108864, "offset": 67108864},
360
- "event": "BLOCK_JOB_COMPLETED" }
361
- ```
362
-
363
-### Partial Transactional Failures
364
-
365
-* Sometimes, a transaction will succeed in launching and return success,
366
- but then later the backup jobs themselves may fail. It is possible that
367
- a management application may have to deal with a partial backup failure
368
- after a successful transaction.
369
-
370
-* If multiple backup jobs are specified in a single transaction, when one of
371
- them fails, it will not interact with the other backup jobs in any way.
372
-
373
-* The job(s) that succeeded will clear the dirty bitmap associated with the
374
- operation, but the job(s) that failed will not. It is not "safe" to delete
375
- any incremental backups that were created successfully in this scenario,
376
- even though others failed.
377
-
378
-#### Example
379
-
380
-* QMP example highlighting two backup jobs:
381
-
382
- ```json
383
- { "execute": "transaction",
384
- "arguments": {
385
- "actions": [
386
- { "type": "drive-backup",
387
- "data": { "device": "drive0", "bitmap": "bitmap0",
388
- "format": "qcow2", "mode": "existing",
389
- "sync": "incremental", "target": "d0-incr-1.qcow2" } },
390
- { "type": "drive-backup",
391
- "data": { "device": "drive1", "bitmap": "bitmap1",
392
- "format": "qcow2", "mode": "existing",
393
- "sync": "incremental", "target": "d1-incr-1.qcow2" } },
394
- ]
395
- }
396
- }
397
- ```
398
-
399
-* QMP example response, highlighting one success and one failure:
400
- * Acknowledgement that the Transaction was accepted and jobs were launched:
401
- ```json
402
- { "return": {} }
403
- ```
404
-
405
- * Later, QEMU sends notice that the first job was completed:
406
- ```json
407
- { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
408
- "data": { "device": "drive0", "type": "backup",
409
- "speed": 0, "len": 67108864, "offset": 67108864 },
410
- "event": "BLOCK_JOB_COMPLETED"
411
- }
412
- ```
413
-
414
- * Later yet, QEMU sends notice that the second job has failed:
415
- ```json
416
- { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
417
- "data": { "device": "drive1", "action": "report",
418
- "operation": "read" },
419
- "event": "BLOCK_JOB_ERROR" }
420
- ```
421
-
422
- ```json
423
- { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
424
- "data": { "speed": 0, "offset": 0, "len": 67108864,
425
- "error": "Input/output error",
426
- "device": "drive1", "type": "backup" },
427
- "event": "BLOCK_JOB_COMPLETED" }
428
-
429
-* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
430
- but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
431
- incremental backup of all drives at a point-in-time is to be made,
432
- new backups for both drives will need to be made, taking into account
433
- that a new incremental backup for drive0 needs to be based on top of
434
- "d0-incr-1.qcow2."
435
-
436
-### Grouped Completion Mode
437
-
438
-* While jobs launched by transactions normally complete or fail on their own,
439
- it is possible to instruct them to complete or fail together as a group.
440
-
441
-* QMP transactions take an optional properties structure that can affect
442
- the semantics of the transaction.
443
-
444
-* The "completion-mode" transaction property can be either "individual"
445
- which is the default, legacy behavior described above, or "grouped,"
446
- a new behavior detailed below.
447
-
448
-* Delayed Completion: In grouped completion mode, no jobs will report
449
- success until all jobs are ready to report success.
450
-
451
-* Grouped failure: If any job fails in grouped completion mode, all remaining
452
- jobs will be cancelled. Any incremental backups will restore their dirty
453
- bitmap objects as if no backup command was ever issued.
454
-
455
- * Regardless of if QEMU reports a particular incremental backup job as
456
- CANCELLED or as an ERROR, the in-memory bitmap will be restored.
457
-
458
-#### Example
459
-
460
-* Here's the same example scenario from above with the new property:
461
-
462
- ```json
463
- { "execute": "transaction",
464
- "arguments": {
465
- "actions": [
466
- { "type": "drive-backup",
467
- "data": { "device": "drive0", "bitmap": "bitmap0",
468
- "format": "qcow2", "mode": "existing",
469
- "sync": "incremental", "target": "d0-incr-1.qcow2" } },
470
- { "type": "drive-backup",
471
- "data": { "device": "drive1", "bitmap": "bitmap1",
472
- "format": "qcow2", "mode": "existing",
473
- "sync": "incremental", "target": "d1-incr-1.qcow2" } },
474
- ],
475
- "properties": {
476
- "completion-mode": "grouped"
477
- }
478
- }
479
- }
480
- ```
481
-
482
-* QMP example response, highlighting a failure for drive2:
483
- * Acknowledgement that the Transaction was accepted and jobs were launched:
484
- ```json
485
- { "return": {} }
486
- ```
487
-
488
- * Later, QEMU sends notice that the second job has errored out,
489
- but that the first job was also cancelled:
490
- ```json
491
- { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
492
- "data": { "device": "drive1", "action": "report",
493
- "operation": "read" },
494
- "event": "BLOCK_JOB_ERROR" }
495
- ```
496
-
497
- ```json
498
- { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
499
- "data": { "speed": 0, "offset": 0, "len": 67108864,
500
- "error": "Input/output error",
501
- "device": "drive1", "type": "backup" },
502
- "event": "BLOCK_JOB_COMPLETED" }
503
- ```
504
-
505
- ```json
506
- { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
507
- "data": { "device": "drive0", "type": "backup", "speed": 0,
508
- "len": 67108864, "offset": 16777216 },
509
- "event": "BLOCK_JOB_CANCELLED" }
510
- ```
511
-
512
-<!--
513
-The FreeBSD Documentation License
514
-
515
-Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
516
-PDF, PostScript, RTF and so forth) with or without modification, are permitted
517
-provided that the following conditions are met:
518
-
519
-Redistributions of source code (Markdown) must retain the above copyright
520
-notice, this list of conditions and the following disclaimer of this file
521
-unmodified.
522
-
523
-Redistributions in compiled form (transformed to other DTDs, converted to PDF,
524
-PostScript, RTF and other formats) must reproduce the above copyright notice,
525
-this list of conditions and the following disclaimer in the documentation and/or
526
-other materials provided with the distribution.
527
-
528
-THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
529
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
530
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
531
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
532
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
533
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
534
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
535
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
536
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
537
-THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
538
--->
539
diff --git a/docs/interop/bitmaps.rst b/docs/interop/bitmaps.rst
540
new file mode 100644
34
new file mode 100644
541
index XXXXXXX..XXXXXXX
35
index XXXXXXX..XXXXXXX
542
--- /dev/null
36
--- /dev/null
543
+++ b/docs/interop/bitmaps.rst
37
+++ b/include/hw/remote/proxy.h
544
@@ -XXX,XX +XXX,XX @@
38
@@ -XXX,XX +XXX,XX @@
545
+..
39
+/*
546
+ Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc.
40
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
547
+ All rights reserved.
41
+ *
42
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
43
+ * See the COPYING file in the top-level directory.
44
+ *
45
+ */
548
+
46
+
549
+ This file is licensed via The FreeBSD Documentation License, the full
47
+#ifndef PROXY_H
550
+ text of which is included at the end of this document.
48
+#define PROXY_H
551
+
49
+
552
+====================================
50
+#include "hw/pci/pci.h"
553
+Dirty Bitmaps and Incremental Backup
51
+#include "io/channel.h"
554
+====================================
555
+
52
+
556
+- Dirty Bitmaps are objects that track which data needs to be backed up
53
+#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
557
+ for the next incremental backup.
54
+OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
558
+
55
+
559
+- Dirty bitmaps can be created at any time and attached to any node
56
+struct PCIProxyDev {
560
+ (not just complete drives).
57
+ PCIDevice parent_dev;
58
+ char *fd;
561
+
59
+
562
+.. contents::
60
+ /*
61
+ * Mutex used to protect the QIOChannel fd from
62
+ * the concurrent access by the VCPUs since proxy
63
+ * blocks while awaiting for the replies from the
64
+ * remote process.
65
+ */
66
+ QemuMutex io_mutex;
67
+ QIOChannel *ioc;
68
+ Error *migration_blocker;
69
+};
563
+
70
+
564
+Dirty Bitmap Names
71
+#endif /* PROXY_H */
565
+------------------
72
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
73
new file mode 100644
74
index XXXXXXX..XXXXXXX
75
--- /dev/null
76
+++ b/hw/remote/proxy.c
77
@@ -XXX,XX +XXX,XX @@
78
+/*
79
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
80
+ *
81
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
82
+ * See the COPYING file in the top-level directory.
83
+ *
84
+ */
566
+
85
+
567
+- A dirty bitmap's name is unique to the node, but bitmaps attached to
86
+#include "qemu/osdep.h"
568
+ different nodes can share the same name.
87
+#include "qemu-common.h"
569
+
88
+
570
+- Dirty bitmaps created for internal use by QEMU may be anonymous and
89
+#include "hw/remote/proxy.h"
571
+ have no name, but any user-created bitmaps must have a name. There
90
+#include "hw/pci/pci.h"
572
+ can be any number of anonymous bitmaps per node.
91
+#include "qapi/error.h"
92
+#include "io/channel-util.h"
93
+#include "hw/qdev-properties.h"
94
+#include "monitor/monitor.h"
95
+#include "migration/blocker.h"
96
+#include "qemu/sockets.h"
573
+
97
+
574
+- The name of a user-created bitmap must not be empty ("").
98
+static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
99
+{
100
+ ERRP_GUARD();
101
+ PCIProxyDev *dev = PCI_PROXY_DEV(device);
102
+ int fd;
575
+
103
+
576
+Bitmap Modes
104
+ if (!dev->fd) {
577
+------------
105
+ error_setg(errp, "fd parameter not specified for %s",
578
+
106
+ DEVICE(device)->id);
579
+- A bitmap can be "frozen," which means that it is currently in-use by
107
+ return;
580
+ a backup operation and cannot be deleted, renamed, written to, reset,
581
+ etc.
582
+
583
+- The normal operating mode for a bitmap is "active."
584
+
585
+Basic QMP Usage
586
+---------------
587
+
588
+Supported Commands
589
+~~~~~~~~~~~~~~~~~~
590
+
591
+- ``block-dirty-bitmap-add``
592
+- ``block-dirty-bitmap-remove``
593
+- ``block-dirty-bitmap-clear``
594
+
595
+Creation
596
+~~~~~~~~
597
+
598
+- To create a new bitmap, enabled, on the drive with id=drive0:
599
+
600
+.. code:: json
601
+
602
+ { "execute": "block-dirty-bitmap-add",
603
+ "arguments": {
604
+ "node": "drive0",
605
+ "name": "bitmap0"
606
+ }
607
+ }
108
+ }
608
+
109
+
609
+- This bitmap will have a default granularity that matches the cluster
110
+ fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
610
+ size of its associated drive, if available, clamped to between [4KiB,
111
+ if (fd == -1) {
611
+ 64KiB]. The current default for qcow2 is 64KiB.
112
+ error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
612
+
113
+ return;
613
+- To create a new bitmap that tracks changes in 32KiB segments:
614
+
615
+.. code:: json
616
+
617
+ { "execute": "block-dirty-bitmap-add",
618
+ "arguments": {
619
+ "node": "drive0",
620
+ "name": "bitmap0",
621
+ "granularity": 32768
622
+ }
623
+ }
114
+ }
624
+
115
+
625
+Deletion
116
+ if (!fd_is_socket(fd)) {
626
+~~~~~~~~
117
+ error_setg(errp, "proxy: fd %d is not a socket", fd);
627
+
118
+ close(fd);
628
+- Bitmaps that are frozen cannot be deleted.
119
+ return;
629
+
630
+- Deleting the bitmap does not impact any other bitmaps attached to the
631
+ same node, nor does it affect any backups already created from this
632
+ node.
633
+
634
+- Because bitmaps are only unique to the node to which they are
635
+ attached, you must specify the node/drive name here, too.
636
+
637
+.. code:: json
638
+
639
+ { "execute": "block-dirty-bitmap-remove",
640
+ "arguments": {
641
+ "node": "drive0",
642
+ "name": "bitmap0"
643
+ }
644
+ }
120
+ }
645
+
121
+
646
+Resetting
122
+ dev->ioc = qio_channel_new_fd(fd, errp);
647
+~~~~~~~~~
648
+
123
+
649
+- Resetting a bitmap will clear all information it holds.
124
+ error_setg(&dev->migration_blocker, "%s does not support migration",
125
+ TYPE_PCI_PROXY_DEV);
126
+ migrate_add_blocker(dev->migration_blocker, errp);
650
+
127
+
651
+- An incremental backup created from an empty bitmap will copy no data,
128
+ qemu_mutex_init(&dev->io_mutex);
652
+ as if nothing has changed.
129
+ qio_channel_set_blocking(dev->ioc, true, NULL);
130
+}
653
+
131
+
654
+.. code:: json
132
+static void pci_proxy_dev_exit(PCIDevice *pdev)
133
+{
134
+ PCIProxyDev *dev = PCI_PROXY_DEV(pdev);
655
+
135
+
656
+ { "execute": "block-dirty-bitmap-clear",
136
+ if (dev->ioc) {
657
+ "arguments": {
137
+ qio_channel_close(dev->ioc, NULL);
658
+ "node": "drive0",
659
+ "name": "bitmap0"
660
+ }
661
+ }
138
+ }
662
+
139
+
663
+Transactions
140
+ migrate_del_blocker(dev->migration_blocker);
664
+------------
665
+
141
+
666
+Justification
142
+ error_free(dev->migration_blocker);
667
+~~~~~~~~~~~~~
143
+}
668
+
144
+
669
+Bitmaps can be safely modified when the VM is paused or halted by using
145
+static Property proxy_properties[] = {
670
+the basic QMP commands. For instance, you might perform the following
146
+ DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
671
+actions:
147
+ DEFINE_PROP_END_OF_LIST(),
148
+};
672
+
149
+
673
+1. Boot the VM in a paused state.
150
+static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
674
+2. Create a full drive backup of drive0.
151
+{
675
+3. Create a new bitmap attached to drive0.
152
+ DeviceClass *dc = DEVICE_CLASS(klass);
676
+4. Resume execution of the VM.
153
+ PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
677
+5. Incremental backups are ready to be created.
678
+
154
+
679
+At this point, the bitmap and drive backup would be correctly in sync,
155
+ k->realize = pci_proxy_dev_realize;
680
+and incremental backups made from this point forward would be correctly
156
+ k->exit = pci_proxy_dev_exit;
681
+aligned to the full drive backup.
157
+ device_class_set_props(dc, proxy_properties);
158
+}
682
+
159
+
683
+This is not particularly useful if we decide we want to start
160
+static const TypeInfo pci_proxy_dev_type_info = {
684
+incremental backups after the VM has been running for a while, for which
161
+ .name = TYPE_PCI_PROXY_DEV,
685
+we will need to perform actions such as the following:
162
+ .parent = TYPE_PCI_DEVICE,
163
+ .instance_size = sizeof(PCIProxyDev),
164
+ .class_init = pci_proxy_dev_class_init,
165
+ .interfaces = (InterfaceInfo[]) {
166
+ { INTERFACE_CONVENTIONAL_PCI_DEVICE },
167
+ { },
168
+ },
169
+};
686
+
170
+
687
+1. Boot the VM and begin execution.
171
+static void pci_proxy_dev_register_types(void)
688
+2. Using a single transaction, perform the following operations:
172
+{
173
+ type_register_static(&pci_proxy_dev_type_info);
174
+}
689
+
175
+
690
+ - Create ``bitmap0``.
176
+type_init(pci_proxy_dev_register_types)
691
+ - Create a full drive backup of ``drive0``.
177
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
692
+
178
index XXXXXXX..XXXXXXX 100644
693
+3. Incremental backups are now ready to be created.
179
--- a/hw/remote/meson.build
694
+
180
+++ b/hw/remote/meson.build
695
+Supported Bitmap Transactions
181
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
696
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
182
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
697
+
183
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
698
+- ``block-dirty-bitmap-add``
184
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
699
+- ``block-dirty-bitmap-clear``
185
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
700
+
186
701
+The usages are identical to their respective QMP commands, but see below
187
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
702
+for examples.
188
703
+
704
+Example: New Incremental Backup
705
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
706
+
707
+As outlined in the justification, perhaps we want to create a new
708
+incremental backup chain attached to a drive.
709
+
710
+.. code:: json
711
+
712
+ { "execute": "transaction",
713
+ "arguments": {
714
+ "actions": [
715
+ {"type": "block-dirty-bitmap-add",
716
+ "data": {"node": "drive0", "name": "bitmap0"} },
717
+ {"type": "drive-backup",
718
+ "data": {"device": "drive0", "target": "/path/to/full_backup.img",
719
+ "sync": "full", "format": "qcow2"} }
720
+ ]
721
+ }
722
+ }
723
+
724
+Example: New Incremental Backup Anchor Point
725
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
726
+
727
+Maybe we just want to create a new full backup with an existing bitmap
728
+and want to reset the bitmap to track the new chain.
729
+
730
+.. code:: json
731
+
732
+ { "execute": "transaction",
733
+ "arguments": {
734
+ "actions": [
735
+ {"type": "block-dirty-bitmap-clear",
736
+ "data": {"node": "drive0", "name": "bitmap0"} },
737
+ {"type": "drive-backup",
738
+ "data": {"device": "drive0", "target": "/path/to/new_full_backup.img",
739
+ "sync": "full", "format": "qcow2"} }
740
+ ]
741
+ }
742
+ }
743
+
744
+Incremental Backups
745
+-------------------
746
+
747
+The star of the show.
748
+
749
+**Nota Bene!** Only incremental backups of entire drives are supported
750
+for now. So despite the fact that you can attach a bitmap to any
751
+arbitrary node, they are only currently useful when attached to the root
752
+node. This is because drive-backup only supports drives/devices instead
753
+of arbitrary nodes.
754
+
755
+Example: First Incremental Backup
756
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
757
+
758
+1. Create a full backup and sync it to the dirty bitmap, as in the
759
+ transactional examples above; or with the VM offline, manually create
760
+ a full copy and then create a new bitmap before the VM begins
761
+ execution.
762
+
763
+ - Let's assume the full backup is named ``full_backup.img``.
764
+ - Let's assume the bitmap you created is ``bitmap0`` attached to
765
+ ``drive0``.
766
+
767
+2. Create a destination image for the incremental backup that utilizes
768
+ the full backup as a backing image.
769
+
770
+ - Let's assume the new incremental image is named
771
+ ``incremental.0.img``.
772
+
773
+ .. code:: bash
774
+
775
+ $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
776
+
777
+3. Issue the incremental backup command:
778
+
779
+ .. code:: json
780
+
781
+ { "execute": "drive-backup",
782
+ "arguments": {
783
+ "device": "drive0",
784
+ "bitmap": "bitmap0",
785
+ "target": "incremental.0.img",
786
+ "format": "qcow2",
787
+ "sync": "incremental",
788
+ "mode": "existing"
789
+ }
790
+ }
791
+
792
+Example: Second Incremental Backup
793
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
794
+
795
+1. Create a new destination image for the incremental backup that points
796
+ to the previous one, e.g.: ``incremental.1.img``
797
+
798
+ .. code:: bash
799
+
800
+ $ qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2
801
+
802
+2. Issue a new incremental backup command. The only difference here is
803
+ that we have changed the target image below.
804
+
805
+ .. code:: json
806
+
807
+ { "execute": "drive-backup",
808
+ "arguments": {
809
+ "device": "drive0",
810
+ "bitmap": "bitmap0",
811
+ "target": "incremental.1.img",
812
+ "format": "qcow2",
813
+ "sync": "incremental",
814
+ "mode": "existing"
815
+ }
816
+ }
817
+
818
+Errors
819
+------
820
+
821
+- In the event of an error that occurs after a backup job is
822
+ successfully launched, either by a direct QMP command or a QMP
823
+ transaction, the user will receive a ``BLOCK_JOB_COMPLETED`` event with
824
+ a failure message, accompanied by a ``BLOCK_JOB_ERROR`` event.
825
+
826
+- In the case of a job being cancelled, the user will receive a
827
+ ``BLOCK_JOB_CANCELLED`` event instead of a pair of COMPLETE and ERROR
828
+ events.
829
+
830
+- In either case, the incremental backup data contained within the
831
+ bitmap is safely rolled back, and the data within the bitmap is not
832
+ lost. The image file created for the failed attempt can be safely
833
+ deleted.
834
+
835
+- Once the underlying problem is fixed (e.g. more storage space is
836
+ freed up), you can simply retry the incremental backup command with
837
+ the same bitmap.
838
+
839
+Example
840
+~~~~~~~
841
+
842
+1. Create a target image:
843
+
844
+ .. code:: bash
845
+
846
+ $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
847
+
848
+2. Attempt to create an incremental backup via QMP:
849
+
850
+ .. code:: json
851
+
852
+ { "execute": "drive-backup",
853
+ "arguments": {
854
+ "device": "drive0",
855
+ "bitmap": "bitmap0",
856
+ "target": "incremental.0.img",
857
+ "format": "qcow2",
858
+ "sync": "incremental",
859
+ "mode": "existing"
860
+ }
861
+ }
862
+
863
+3. Receive an event notifying us of failure:
864
+
865
+ .. code:: json
866
+
867
+ { "timestamp": { "seconds": 1424709442, "microseconds": 844524 },
868
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
869
+ "error": "No space left on device",
870
+ "device": "drive1", "type": "backup" },
871
+ "event": "BLOCK_JOB_COMPLETED" }
872
+
873
+4. Delete the failed incremental, and re-create the image.
874
+
875
+ .. code:: bash
876
+
877
+ $ rm incremental.0.img
878
+ $ qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2
879
+
880
+5. Retry the command after fixing the underlying problem, such as
881
+ freeing up space on the backup volume:
882
+
883
+ .. code:: json
884
+
885
+ { "execute": "drive-backup",
886
+ "arguments": {
887
+ "device": "drive0",
888
+ "bitmap": "bitmap0",
889
+ "target": "incremental.0.img",
890
+ "format": "qcow2",
891
+ "sync": "incremental",
892
+ "mode": "existing"
893
+ }
894
+ }
895
+
896
+6. Receive confirmation that the job completed successfully:
897
+
898
+ .. code:: json
899
+
900
+ { "timestamp": { "seconds": 1424709668, "microseconds": 526525 },
901
+ "data": { "device": "drive1", "type": "backup",
902
+ "speed": 0, "len": 67108864, "offset": 67108864},
903
+ "event": "BLOCK_JOB_COMPLETED" }
904
+
905
+Partial Transactional Failures
906
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
907
+
908
+- Sometimes, a transaction will succeed in launching and return
909
+ success, but then later the backup jobs themselves may fail. It is
910
+ possible that a management application may have to deal with a
911
+ partial backup failure after a successful transaction.
912
+
913
+- If multiple backup jobs are specified in a single transaction, when
914
+ one of them fails, it will not interact with the other backup jobs in
915
+ any way.
916
+
917
+- The job(s) that succeeded will clear the dirty bitmap associated with
918
+ the operation, but the job(s) that failed will not. It is not "safe"
919
+ to delete any incremental backups that were created successfully in
920
+ this scenario, even though others failed.
921
+
922
+Example
923
+^^^^^^^
924
+
925
+- QMP example highlighting two backup jobs:
926
+
927
+ .. code:: json
928
+
929
+ { "execute": "transaction",
930
+ "arguments": {
931
+ "actions": [
932
+ { "type": "drive-backup",
933
+ "data": { "device": "drive0", "bitmap": "bitmap0",
934
+ "format": "qcow2", "mode": "existing",
935
+ "sync": "incremental", "target": "d0-incr-1.qcow2" } },
936
+ { "type": "drive-backup",
937
+ "data": { "device": "drive1", "bitmap": "bitmap1",
938
+ "format": "qcow2", "mode": "existing",
939
+ "sync": "incremental", "target": "d1-incr-1.qcow2" } },
940
+ ]
941
+ }
942
+ }
943
+
944
+- QMP example response, highlighting one success and one failure:
945
+
946
+ - Acknowledgement that the Transaction was accepted and jobs were
947
+ launched:
948
+
949
+ .. code:: json
950
+
951
+ { "return": {} }
952
+
953
+ - Later, QEMU sends notice that the first job was completed:
954
+
955
+ .. code:: json
956
+
957
+ { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
958
+ "data": { "device": "drive0", "type": "backup",
959
+ "speed": 0, "len": 67108864, "offset": 67108864 },
960
+ "event": "BLOCK_JOB_COMPLETED"
961
+ }
962
+
963
+ - Later yet, QEMU sends notice that the second job has failed:
964
+
965
+ .. code:: json
966
+
967
+ { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
968
+ "data": { "device": "drive1", "action": "report",
969
+ "operation": "read" },
970
+ "event": "BLOCK_JOB_ERROR" }
971
+
972
+ .. code:: json
973
+
974
+ { "timestamp": { "seconds": 1447192399, "microseconds":
975
+ 685853 }, "data": { "speed": 0, "offset": 0, "len": 67108864,
976
+ "error": "Input/output error", "device": "drive1", "type":
977
+ "backup" }, "event": "BLOCK_JOB_COMPLETED" }
978
+
979
+- In the above example, ``d0-incr-1.qcow2`` is valid and must be kept,
980
+ but ``d1-incr-1.qcow2`` is invalid and should be deleted. If a VM-wide
981
+ incremental backup of all drives at a point-in-time is to be made,
982
+ new backups for both drives will need to be made, taking into account
983
+ that a new incremental backup for drive0 needs to be based on top of
984
+ ``d0-incr-1.qcow2``.
985
+
986
+Grouped Completion Mode
987
+~~~~~~~~~~~~~~~~~~~~~~~
988
+
989
+- While jobs launched by transactions normally complete or fail on
990
+ their own, it is possible to instruct them to complete or fail
991
+ together as a group.
992
+
993
+- QMP transactions take an optional properties structure that can
994
+ affect the semantics of the transaction.
995
+
996
+- The "completion-mode" transaction property can be either "individual"
997
+ which is the default, legacy behavior described above, or "grouped,"
998
+ a new behavior detailed below.
999
+
1000
+- Delayed Completion: In grouped completion mode, no jobs will report
1001
+ success until all jobs are ready to report success.
1002
+
1003
+- Grouped failure: If any job fails in grouped completion mode, all
1004
+ remaining jobs will be cancelled. Any incremental backups will
1005
+ restore their dirty bitmap objects as if no backup command was ever
1006
+ issued.
1007
+
1008
+ - Regardless of whether QEMU reports a particular incremental backup job
1009
+ as CANCELLED or as an ERROR, the in-memory bitmap will be
1010
+ restored.
1011
+
1012
+Example
1013
+^^^^^^^
1014
+
1015
+- Here's the same example scenario from above with the new property:
1016
+
1017
+ .. code:: json
1018
+
1019
+ { "execute": "transaction",
1020
+ "arguments": {
1021
+ "actions": [
1022
+ { "type": "drive-backup",
1023
+ "data": { "device": "drive0", "bitmap": "bitmap0",
1024
+ "format": "qcow2", "mode": "existing",
1025
+ "sync": "incremental", "target": "d0-incr-1.qcow2" } },
1026
+ { "type": "drive-backup",
1027
+ "data": { "device": "drive1", "bitmap": "bitmap1",
1028
+ "format": "qcow2", "mode": "existing",
1029
+ "sync": "incremental", "target": "d1-incr-1.qcow2" } },
1030
+ ],
1031
+ "properties": {
1032
+ "completion-mode": "grouped"
1033
+ }
1034
+ }
1035
+ }
1036
+
1037
+- QMP example response, highlighting a failure for ``drive1``:
1038
+
1039
+ - Acknowledgement that the Transaction was accepted and jobs were
1040
+ launched:
1041
+
1042
+ .. code:: json
1043
+
1044
+ { "return": {} }
1045
+
1046
+ - Later, QEMU sends notice that the second job has errored out, but
1047
+ that the first job was also cancelled:
1048
+
1049
+ .. code:: json
1050
+
1051
+ { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
1052
+ "data": { "device": "drive1", "action": "report",
1053
+ "operation": "read" },
1054
+ "event": "BLOCK_JOB_ERROR" }
1055
+
1056
+ .. code:: json
1057
+
1058
+ { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
1059
+ "data": { "speed": 0, "offset": 0, "len": 67108864,
1060
+ "error": "Input/output error",
1061
+ "device": "drive1", "type": "backup" },
1062
+ "event": "BLOCK_JOB_COMPLETED" }
1063
+
1064
+ .. code:: json
1065
+
1066
+ { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
1067
+ "data": { "device": "drive0", "type": "backup", "speed": 0,
1068
+ "len": 67108864, "offset": 16777216 },
1069
+ "event": "BLOCK_JOB_CANCELLED" }
1070
+
1071
+.. raw:: html
1072
+
1073
+ <!--
1074
+ The FreeBSD Documentation License
1075
+
1076
+ Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML,
1077
+ PDF, PostScript, RTF and so forth) with or without modification, are permitted
1078
+ provided that the following conditions are met:
1079
+
1080
+ Redistributions of source code (Markdown) must retain the above copyright
1081
+ notice, this list of conditions and the following disclaimer of this file
1082
+ unmodified.
1083
+
1084
+ Redistributions in compiled form (transformed to other DTDs, converted to PDF,
1085
+ PostScript, RTF and other formats) must reproduce the above copyright notice,
1086
+ this list of conditions and the following disclaimer in the documentation and/or
1087
+ other materials provided with the distribution.
1088
+
1089
+ THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
1090
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1091
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
1092
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
1093
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
1094
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
1095
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
1096
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
1097
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
1098
+ THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
1099
+ -->
1100
--
189
--
1101
2.9.4
190
2.29.2
1102
191
1103
diff view generated by jsdifflib
New patch
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
1
2
3
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
4
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
5
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
6
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
7
Message-id: d54edb4176361eed86b903e8f27058363b6c83b3.1611938319.git.jag.raman@oracle.com
8
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
9
---
10
include/hw/remote/mpqemu-link.h | 4 ++++
11
hw/remote/mpqemu-link.c | 34 +++++++++++++++++++++++++++++++++
12
2 files changed, 38 insertions(+)
13
14
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
15
index XXXXXXX..XXXXXXX 100644
16
--- a/include/hw/remote/mpqemu-link.h
17
+++ b/include/hw/remote/mpqemu-link.h
18
@@ -XXX,XX +XXX,XX @@
19
#include "qemu/thread.h"
20
#include "io/channel.h"
21
#include "exec/hwaddr.h"
22
+#include "io/channel-socket.h"
23
+#include "hw/remote/proxy.h"
24
25
#define REMOTE_MAX_FDS 8
26
27
@@ -XXX,XX +XXX,XX @@ typedef struct {
28
bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
29
bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
30
31
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
32
+ Error **errp);
33
bool mpqemu_msg_valid(MPQemuMsg *msg);
34
35
#endif
36
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
37
index XXXXXXX..XXXXXXX 100644
38
--- a/hw/remote/mpqemu-link.c
39
+++ b/hw/remote/mpqemu-link.c
40
@@ -XXX,XX +XXX,XX @@ fail:
41
return ret;
42
}
43
44
+/*
45
+ * Send msg and wait for a reply with command code RET_MSG.
46
+ * Returns the message received of size u64 or UINT64_MAX
47
+ * on error.
48
+ * Called from VCPU thread in non-coroutine context.
49
+ * Used by the Proxy object to communicate to remote processes.
50
+ */
51
+uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
52
+ Error **errp)
53
+{
54
+ ERRP_GUARD();
55
+ MPQemuMsg msg_reply = {0};
56
+ uint64_t ret = UINT64_MAX;
57
+
58
+ assert(!qemu_in_coroutine());
59
+
60
+ QEMU_LOCK_GUARD(&pdev->io_mutex);
61
+ if (!mpqemu_msg_send(msg, pdev->ioc, errp)) {
62
+ return ret;
63
+ }
64
+
65
+ if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) {
66
+ return ret;
67
+ }
68
+
69
+ if (!mpqemu_msg_valid(&msg_reply)) {
70
+ error_setg(errp, "ERROR: Invalid reply received for command %d",
71
+ msg->cmd);
72
+ return ret;
73
+ }
74
+
75
+ return msg_reply.data.u64;
76
+}
77
+
78
bool mpqemu_msg_valid(MPQemuMsg *msg)
79
{
80
if (msg->cmd >= MPQEMU_CMD_MAX && msg->cmd < 0) {
81
--
82
2.29.2
83
diff view generated by jsdifflib
New patch
1
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
2
3
The Proxy Object sends the PCI config space accesses as messages
4
to the remote process over the communication channel.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: d3c94f4618813234655356c60e6f0d0362ff42d6.1611938319.git.jag.raman@oracle.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
include/hw/remote/mpqemu-link.h | 10 ++++++
14
hw/remote/message.c | 60 +++++++++++++++++++++++++++++++++
15
hw/remote/mpqemu-link.c | 8 ++++-
16
hw/remote/proxy.c | 55 ++++++++++++++++++++++++++++++
17
4 files changed, 132 insertions(+), 1 deletion(-)
18
19
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
20
index XXXXXXX..XXXXXXX 100644
21
--- a/include/hw/remote/mpqemu-link.h
22
+++ b/include/hw/remote/mpqemu-link.h
23
@@ -XXX,XX +XXX,XX @@
24
*/
25
typedef enum {
26
MPQEMU_CMD_SYNC_SYSMEM,
27
+ MPQEMU_CMD_RET,
28
+ MPQEMU_CMD_PCI_CFGWRITE,
29
+ MPQEMU_CMD_PCI_CFGREAD,
30
MPQEMU_CMD_MAX,
31
} MPQemuCmd;
32
33
@@ -XXX,XX +XXX,XX @@ typedef struct {
34
off_t offsets[REMOTE_MAX_FDS];
35
} SyncSysmemMsg;
36
37
+typedef struct {
38
+ uint32_t addr;
39
+ uint32_t val;
40
+ int len;
41
+} PciConfDataMsg;
42
+
43
/**
44
* MPQemuMsg:
45
* @cmd: The remote command
46
@@ -XXX,XX +XXX,XX @@ typedef struct {
47
48
union {
49
uint64_t u64;
50
+ PciConfDataMsg pci_conf_data;
51
SyncSysmemMsg sync_sysmem;
52
} data;
53
54
diff --git a/hw/remote/message.c b/hw/remote/message.c
55
index XXXXXXX..XXXXXXX 100644
56
--- a/hw/remote/message.c
57
+++ b/hw/remote/message.c
58
@@ -XXX,XX +XXX,XX @@
59
#include "hw/remote/mpqemu-link.h"
60
#include "qapi/error.h"
61
#include "sysemu/runstate.h"
62
+#include "hw/pci/pci.h"
63
+
64
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
65
+ MPQemuMsg *msg, Error **errp);
66
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
67
+ MPQemuMsg *msg, Error **errp);
68
69
void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
70
{
71
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
72
}
73
74
switch (msg.cmd) {
75
+ case MPQEMU_CMD_PCI_CFGWRITE:
76
+ process_config_write(com->ioc, pci_dev, &msg, &local_err);
77
+ break;
78
+ case MPQEMU_CMD_PCI_CFGREAD:
79
+ process_config_read(com->ioc, pci_dev, &msg, &local_err);
80
+ break;
81
default:
82
error_setg(&local_err,
83
"Unknown command (%d) received for device %s"
84
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
85
qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
86
}
87
}
88
+
89
+static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
90
+ MPQemuMsg *msg, Error **errp)
91
+{
92
+ ERRP_GUARD();
93
+ PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
94
+ MPQemuMsg ret = { 0 };
95
+
96
+ if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
97
+ error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
98
+ getpid());
99
+ ret.data.u64 = UINT64_MAX;
100
+ } else {
101
+ pci_default_write_config(dev, conf->addr, conf->val, conf->len);
102
+ }
103
+
104
+ ret.cmd = MPQEMU_CMD_RET;
105
+ ret.size = sizeof(ret.data.u64);
106
+
107
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
108
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
109
+ getpid());
110
+ }
111
+}
112
+
113
+static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
114
+ MPQemuMsg *msg, Error **errp)
115
+{
116
+ ERRP_GUARD();
117
+ PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
118
+ MPQemuMsg ret = { 0 };
119
+
120
+ if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
121
+ error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
122
+ getpid());
123
+ ret.data.u64 = UINT64_MAX;
124
+ } else {
125
+ ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
126
+ }
127
+
128
+ ret.cmd = MPQEMU_CMD_RET;
129
+ ret.size = sizeof(ret.data.u64);
130
+
131
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
132
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
133
+ getpid());
134
+ }
135
+}
136
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
137
index XXXXXXX..XXXXXXX 100644
138
--- a/hw/remote/mpqemu-link.c
139
+++ b/hw/remote/mpqemu-link.c
140
@@ -XXX,XX +XXX,XX @@ uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
141
return ret;
142
}
143
144
- if (!mpqemu_msg_valid(&msg_reply)) {
145
+ if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) {
146
error_setg(errp, "ERROR: Invalid reply received for command %d",
147
msg->cmd);
148
return ret;
149
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
150
return false;
151
}
152
break;
153
+ case MPQEMU_CMD_PCI_CFGWRITE:
154
+ case MPQEMU_CMD_PCI_CFGREAD:
155
+ if (msg->size != sizeof(PciConfDataMsg)) {
156
+ return false;
157
+ }
158
+ break;
159
default:
160
break;
161
}
162
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
163
index XXXXXXX..XXXXXXX 100644
164
--- a/hw/remote/proxy.c
165
+++ b/hw/remote/proxy.c
166
@@ -XXX,XX +XXX,XX @@
167
#include "monitor/monitor.h"
168
#include "migration/blocker.h"
169
#include "qemu/sockets.h"
170
+#include "hw/remote/mpqemu-link.h"
171
+#include "qemu/error-report.h"
172
173
static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
174
{
175
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
176
error_free(dev->migration_blocker);
177
}
178
179
+static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
180
+ int len, unsigned int op)
181
+{
182
+ MPQemuMsg msg = { 0 };
183
+ uint64_t ret = -EINVAL;
184
+ Error *local_err = NULL;
185
+
186
+ msg.cmd = op;
187
+ msg.data.pci_conf_data.addr = addr;
188
+ msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0;
189
+ msg.data.pci_conf_data.len = len;
190
+ msg.size = sizeof(PciConfDataMsg);
191
+
192
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
193
+ if (local_err) {
194
+ error_report_err(local_err);
195
+ }
196
+
197
+ if (ret == UINT64_MAX) {
198
+ error_report("Failed to perform PCI config %s operation",
199
+ (op == MPQEMU_CMD_PCI_CFGREAD) ? "READ" : "WRITE");
200
+ }
201
+
202
+ if (op == MPQEMU_CMD_PCI_CFGREAD) {
203
+ *val = (uint32_t)ret;
204
+ }
205
+}
206
+
207
+static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
208
+{
209
+ uint32_t val;
210
+
211
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGREAD);
212
+
213
+ return val;
214
+}
215
+
216
+static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
217
+ int len)
218
+{
219
+ /*
220
+ * Some of the functions access the copy of remote device's PCI config
221
+ * space which is cached in the proxy device. Therefore, maintain
222
+ * it updated.
223
+ */
224
+ pci_default_write_config(d, addr, val, len);
225
+
226
+ config_op_send(PCI_PROXY_DEV(d), addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
227
+}
228
+
229
static Property proxy_properties[] = {
230
DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
231
DEFINE_PROP_END_OF_LIST(),
232
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
233
234
k->realize = pci_proxy_dev_realize;
235
k->exit = pci_proxy_dev_exit;
236
+ k->config_read = pci_proxy_read_config;
237
+ k->config_write = pci_proxy_write_config;
238
+
239
device_class_set_props(dc, proxy_properties);
240
}
241
242
--
243
2.29.2
244
diff view generated by jsdifflib
New patch
1
1
From: Jagannathan Raman <jag.raman@oracle.com>
2
3
The proxy device object implements handlers for PCI BAR writes and reads.
4
The handlers use BAR_WRITE/BAR_READ messages to communicate with the
5
remote process, passing the BAR address and value to be written/read.
6
The remote process implements handlers for the BAR_WRITE/BAR_READ
7
messages.
8
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
11
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
12
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
13
Message-id: a8b76714a9688be5552c4c92d089bc9e8a4707ff.1611938319.git.jag.raman@oracle.com
14
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
15
---
16
include/hw/remote/mpqemu-link.h | 10 ++++
17
include/hw/remote/proxy.h | 9 ++++
18
hw/remote/message.c | 83 +++++++++++++++++++++++++++++++++
19
hw/remote/mpqemu-link.c | 6 +++
20
hw/remote/proxy.c | 60 ++++++++++++++++++++++++
21
5 files changed, 168 insertions(+)
22
23
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
24
index XXXXXXX..XXXXXXX 100644
25
--- a/include/hw/remote/mpqemu-link.h
26
+++ b/include/hw/remote/mpqemu-link.h
27
@@ -XXX,XX +XXX,XX @@ typedef enum {
28
MPQEMU_CMD_RET,
29
MPQEMU_CMD_PCI_CFGWRITE,
30
MPQEMU_CMD_PCI_CFGREAD,
31
+ MPQEMU_CMD_BAR_WRITE,
32
+ MPQEMU_CMD_BAR_READ,
33
MPQEMU_CMD_MAX,
34
} MPQemuCmd;
35
36
@@ -XXX,XX +XXX,XX @@ typedef struct {
37
int len;
38
} PciConfDataMsg;
39
40
+typedef struct {
41
+ hwaddr addr;
42
+ uint64_t val;
43
+ unsigned size;
44
+ bool memory;
45
+} BarAccessMsg;
46
+
47
/**
48
* MPQemuMsg:
49
* @cmd: The remote command
50
@@ -XXX,XX +XXX,XX @@ typedef struct {
51
uint64_t u64;
52
PciConfDataMsg pci_conf_data;
53
SyncSysmemMsg sync_sysmem;
54
+ BarAccessMsg bar_access;
55
} data;
56
57
int fds[REMOTE_MAX_FDS];
58
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
59
index XXXXXXX..XXXXXXX 100644
60
--- a/include/hw/remote/proxy.h
61
+++ b/include/hw/remote/proxy.h
62
@@ -XXX,XX +XXX,XX @@
63
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
64
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
65
66
+typedef struct ProxyMemoryRegion {
67
+ PCIProxyDev *dev;
68
+ MemoryRegion mr;
69
+ bool memory;
70
+ bool present;
71
+ uint8_t type;
72
+} ProxyMemoryRegion;
73
+
74
struct PCIProxyDev {
75
PCIDevice parent_dev;
76
char *fd;
77
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
78
QemuMutex io_mutex;
79
QIOChannel *ioc;
80
Error *migration_blocker;
81
+ ProxyMemoryRegion region[PCI_NUM_REGIONS];
82
};
83
84
#endif /* PROXY_H */
85
diff --git a/hw/remote/message.c b/hw/remote/message.c
86
index XXXXXXX..XXXXXXX 100644
87
--- a/hw/remote/message.c
88
+++ b/hw/remote/message.c
89
@@ -XXX,XX +XXX,XX @@
90
#include "qapi/error.h"
91
#include "sysemu/runstate.h"
92
#include "hw/pci/pci.h"
93
+#include "exec/memattrs.h"
94
95
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
96
MPQemuMsg *msg, Error **errp);
97
static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
98
MPQemuMsg *msg, Error **errp);
99
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
100
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
101
102
void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
103
{
104
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
105
case MPQEMU_CMD_PCI_CFGREAD:
106
process_config_read(com->ioc, pci_dev, &msg, &local_err);
107
break;
108
+ case MPQEMU_CMD_BAR_WRITE:
109
+ process_bar_write(com->ioc, &msg, &local_err);
110
+ break;
111
+ case MPQEMU_CMD_BAR_READ:
112
+ process_bar_read(com->ioc, &msg, &local_err);
113
+ break;
114
default:
115
error_setg(&local_err,
116
"Unknown command (%d) received for device %s"
117
@@ -XXX,XX +XXX,XX @@ static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
118
getpid());
119
}
120
}
121
+
122
+static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
123
+{
124
+ ERRP_GUARD();
125
+ BarAccessMsg *bar_access = &msg->data.bar_access;
126
+ AddressSpace *as =
127
+ bar_access->memory ? &address_space_memory : &address_space_io;
128
+ MPQemuMsg ret = { 0 };
129
+ MemTxResult res;
130
+ uint64_t val;
131
+
132
+ if (!is_power_of_2(bar_access->size) ||
133
+ (bar_access->size > sizeof(uint64_t))) {
134
+ ret.data.u64 = UINT64_MAX;
135
+ goto fail;
136
+ }
137
+
138
+ val = cpu_to_le64(bar_access->val);
139
+
140
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
141
+ (void *)&val, bar_access->size, true);
142
+
143
+ if (res != MEMTX_OK) {
144
+ error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
145
+ bar_access->addr, getpid());
146
+ ret.data.u64 = -1;
147
+ }
148
+
149
+fail:
150
+ ret.cmd = MPQEMU_CMD_RET;
151
+ ret.size = sizeof(ret.data.u64);
152
+
153
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
154
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
155
+ getpid());
156
+ }
157
+}
158
+
159
+static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
160
+{
161
+ ERRP_GUARD();
162
+ BarAccessMsg *bar_access = &msg->data.bar_access;
163
+ MPQemuMsg ret = { 0 };
164
+ AddressSpace *as;
165
+ MemTxResult res;
166
+ uint64_t val = 0;
167
+
168
+ as = bar_access->memory ? &address_space_memory : &address_space_io;
169
+
170
+ if (!is_power_of_2(bar_access->size) ||
171
+ (bar_access->size > sizeof(uint64_t))) {
172
+ val = UINT64_MAX;
173
+ goto fail;
174
+ }
175
+
176
+ res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
177
+ (void *)&val, bar_access->size, false);
178
+
179
+ if (res != MEMTX_OK) {
180
+ error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
181
+ bar_access->addr, getpid());
182
+ val = UINT64_MAX;
183
+ }
184
+
185
+fail:
186
+ ret.cmd = MPQEMU_CMD_RET;
187
+ ret.data.u64 = le64_to_cpu(val);
188
+ ret.size = sizeof(ret.data.u64);
189
+
190
+ if (!mpqemu_msg_send(&ret, ioc, NULL)) {
191
+ error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
192
+ getpid());
193
+ }
194
+}
195
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
196
index XXXXXXX..XXXXXXX 100644
197
--- a/hw/remote/mpqemu-link.c
198
+++ b/hw/remote/mpqemu-link.c
199
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
200
return false;
201
}
202
break;
203
+ case MPQEMU_CMD_BAR_WRITE:
204
+ case MPQEMU_CMD_BAR_READ:
205
+ if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
206
+ return false;
207
+ }
208
+ break;
209
default:
210
break;
211
}
212
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
213
index XXXXXXX..XXXXXXX 100644
214
--- a/hw/remote/proxy.c
215
+++ b/hw/remote/proxy.c
216
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_register_types(void)
217
}
218
219
type_init(pci_proxy_dev_register_types)
220
+
221
+static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
222
+ bool write, hwaddr addr, uint64_t *val,
223
+ unsigned size, bool memory)
224
+{
225
+ MPQemuMsg msg = { 0 };
226
+ long ret = -EINVAL;
227
+ Error *local_err = NULL;
228
+
229
+ msg.size = sizeof(BarAccessMsg);
230
+ msg.data.bar_access.addr = mr->addr + addr;
231
+ msg.data.bar_access.size = size;
232
+ msg.data.bar_access.memory = memory;
233
+
234
+ if (write) {
235
+ msg.cmd = MPQEMU_CMD_BAR_WRITE;
236
+ msg.data.bar_access.val = *val;
237
+ } else {
238
+ msg.cmd = MPQEMU_CMD_BAR_READ;
239
+ }
240
+
241
+ ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
242
+ if (local_err) {
243
+ error_report_err(local_err);
244
+ }
245
+
246
+ if (!write) {
247
+ *val = ret;
248
+ }
249
+}
250
+
251
+static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
252
+ unsigned size)
253
+{
254
+ ProxyMemoryRegion *pmr = opaque;
255
+
256
+ send_bar_access_msg(pmr->dev, &pmr->mr, true, addr, &val, size,
257
+ pmr->memory);
258
+}
259
+
260
+static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
261
+{
262
+ ProxyMemoryRegion *pmr = opaque;
263
+ uint64_t val;
264
+
265
+ send_bar_access_msg(pmr->dev, &pmr->mr, false, addr, &val, size,
266
+ pmr->memory);
267
+
268
+ return val;
269
+}
270
+
271
+const MemoryRegionOps proxy_mr_ops = {
272
+ .read = proxy_bar_read,
273
+ .write = proxy_bar_write,
274
+ .endianness = DEVICE_NATIVE_ENDIAN,
275
+ .impl = {
276
+ .min_access_size = 1,
277
+ .max_access_size = 8,
278
+ },
279
+};
280
--
281
2.29.2
282
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
Add the ProxyMemoryListener object, which is used to keep the view of RAM
4
in sync between QEMU and the remote process.
5
A MemoryListener is registered for the system-memory AddressSpace. The
6
listener sends a SYNC_SYSMEM message to the remote process when the memory
7
listener commits changes to memory. The remote process receives
8
the message and processes it in the handler for the SYNC_SYSMEM message.
9
10
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
11
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
12
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
13
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
14
Message-id: 04fe4e6a9ca90d4f11ab6f59be7652f5b086a071.1611938319.git.jag.raman@oracle.com
15
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
16
---
17
MAINTAINERS | 2 +
18
include/hw/remote/proxy-memory-listener.h | 28 +++
19
include/hw/remote/proxy.h | 2 +
20
hw/remote/message.c | 4 +
21
hw/remote/proxy-memory-listener.c | 227 ++++++++++++++++++++++
22
hw/remote/proxy.c | 6 +
23
hw/remote/meson.build | 1 +
24
7 files changed, 270 insertions(+)
25
create mode 100644 include/hw/remote/proxy-memory-listener.h
26
create mode 100644 hw/remote/proxy-memory-listener.c
27
28
diff --git a/MAINTAINERS b/MAINTAINERS
29
index XXXXXXX..XXXXXXX 100644
30
--- a/MAINTAINERS
31
+++ b/MAINTAINERS
32
@@ -XXX,XX +XXX,XX @@ F: include/hw/remote/memory.h
33
F: hw/remote/memory.c
34
F: hw/remote/proxy.c
35
F: include/hw/remote/proxy.h
36
+F: hw/remote/proxy-memory-listener.c
37
+F: include/hw/remote/proxy-memory-listener.h
38
39
Build and test automation
40
-------------------------
41
diff --git a/include/hw/remote/proxy-memory-listener.h b/include/hw/remote/proxy-memory-listener.h
42
new file mode 100644
43
index XXXXXXX..XXXXXXX
44
--- /dev/null
45
+++ b/include/hw/remote/proxy-memory-listener.h
46
@@ -XXX,XX +XXX,XX @@
47
+/*
48
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
49
+ *
50
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
51
+ * See the COPYING file in the top-level directory.
52
+ *
53
+ */
54
+
55
+#ifndef PROXY_MEMORY_LISTENER_H
56
+#define PROXY_MEMORY_LISTENER_H
57
+
58
+#include "exec/memory.h"
59
+#include "io/channel.h"
60
+
61
+typedef struct ProxyMemoryListener {
62
+ MemoryListener listener;
63
+
64
+ int n_mr_sections;
65
+ MemoryRegionSection *mr_sections;
66
+
67
+ QIOChannel *ioc;
68
+} ProxyMemoryListener;
69
+
70
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
71
+ QIOChannel *ioc);
72
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener);
73
+
74
+#endif
75
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
76
index XXXXXXX..XXXXXXX 100644
77
--- a/include/hw/remote/proxy.h
78
+++ b/include/hw/remote/proxy.h
79
@@ -XXX,XX +XXX,XX @@
80
81
#include "hw/pci/pci.h"
82
#include "io/channel.h"
83
+#include "hw/remote/proxy-memory-listener.h"
84
85
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
86
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
87
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
88
QemuMutex io_mutex;
89
QIOChannel *ioc;
90
Error *migration_blocker;
91
+ ProxyMemoryListener proxy_listener;
92
ProxyMemoryRegion region[PCI_NUM_REGIONS];
93
};
94
95
diff --git a/hw/remote/message.c b/hw/remote/message.c
96
index XXXXXXX..XXXXXXX 100644
97
--- a/hw/remote/message.c
98
+++ b/hw/remote/message.c
99
@@ -XXX,XX +XXX,XX @@
100
#include "sysemu/runstate.h"
101
#include "hw/pci/pci.h"
102
#include "exec/memattrs.h"
103
+#include "hw/remote/memory.h"
104
105
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
106
MPQemuMsg *msg, Error **errp);
107
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
108
case MPQEMU_CMD_BAR_READ:
109
process_bar_read(com->ioc, &msg, &local_err);
110
break;
111
+ case MPQEMU_CMD_SYNC_SYSMEM:
112
+ remote_sysmem_reconfig(&msg, &local_err);
113
+ break;
114
default:
115
error_setg(&local_err,
116
"Unknown command (%d) received for device %s"
117
diff --git a/hw/remote/proxy-memory-listener.c b/hw/remote/proxy-memory-listener.c
118
new file mode 100644
119
index XXXXXXX..XXXXXXX
120
--- /dev/null
121
+++ b/hw/remote/proxy-memory-listener.c
122
@@ -XXX,XX +XXX,XX @@
123
+/*
124
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
125
+ *
126
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
127
+ * See the COPYING file in the top-level directory.
128
+ *
129
+ */
130
+
131
+#include "qemu/osdep.h"
132
+#include "qemu-common.h"
133
+
134
+#include "qemu/compiler.h"
135
+#include "qemu/int128.h"
136
+#include "qemu/range.h"
137
+#include "exec/memory.h"
138
+#include "exec/cpu-common.h"
139
+#include "cpu.h"
140
+#include "exec/ram_addr.h"
141
+#include "exec/address-spaces.h"
142
+#include "qapi/error.h"
143
+#include "hw/remote/mpqemu-link.h"
144
+#include "hw/remote/proxy-memory-listener.h"
145
+
146
+/*
147
+ * TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
148
+ * proxy_memory_listener_commit() defined below perform tasks similar to the
149
+ * functions defined in vhost-user.c. These functions are good candidates
150
+ * for refactoring.
151
+ *
152
+ */
153
+
154
+static void proxy_memory_listener_reset(MemoryListener *listener)
155
+{
156
+ ProxyMemoryListener *proxy_listener = container_of(listener,
157
+ ProxyMemoryListener,
158
+ listener);
159
+ int mrs;
160
+
161
+ for (mrs = 0; mrs < proxy_listener->n_mr_sections; mrs++) {
162
+ memory_region_unref(proxy_listener->mr_sections[mrs].mr);
163
+ }
164
+
165
+ g_free(proxy_listener->mr_sections);
166
+ proxy_listener->mr_sections = NULL;
167
+ proxy_listener->n_mr_sections = 0;
168
+}
169
+
170
+static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
171
+{
172
+ MemoryRegion *mr;
173
+ ram_addr_t off;
174
+
175
+ /**
176
+ * Assumes that the host address is a valid address as it's
177
+ * coming from the MemoryListener system. In the case host
178
+ * address is not valid, the following call would return
179
+ * the default subregion of "system_memory" region, and
180
+ * not NULL. So it's not possible to check for NULL here.
181
+ */
182
+ mr = memory_region_from_host((void *)(uintptr_t)host, &off);
183
+
184
+ if (offset) {
185
+ *offset = off;
186
+ }
187
+
188
+ return memory_region_get_fd(mr);
189
+}
190
+
191
+static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
192
+{
193
+ if (((prev_host + size) != host)) {
194
+ return false;
195
+ }
196
+
197
+ if (get_fd_from_hostaddr(host, NULL) !=
198
+ get_fd_from_hostaddr(prev_host, NULL)) {
199
+ return false;
200
+ }
201
+
202
+ return true;
203
+}
204
+
205
+static bool try_merge(ProxyMemoryListener *proxy_listener,
206
+ MemoryRegionSection *section)
207
+{
208
+ uint64_t mrs_size, mrs_gpa, mrs_page;
209
+ MemoryRegionSection *prev_sec;
210
+ bool merged = false;
211
+ uintptr_t mrs_host;
212
+ RAMBlock *mrs_rb;
213
+
214
+ if (!proxy_listener->n_mr_sections) {
215
+ return false;
216
+ }
217
+
218
+ mrs_rb = section->mr->ram_block;
219
+ mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
220
+ mrs_size = int128_get64(section->size);
221
+ mrs_gpa = section->offset_within_address_space;
222
+ mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
223
+ section->offset_within_region;
224
+
225
+ if (get_fd_from_hostaddr(mrs_host, NULL) < 0) {
226
+ return true;
227
+ }
228
+
229
+ mrs_host = mrs_host & ~(mrs_page - 1);
230
+ mrs_gpa = mrs_gpa & ~(mrs_page - 1);
231
+ mrs_size = ROUND_UP(mrs_size, mrs_page);
232
+
233
+ prev_sec = proxy_listener->mr_sections +
234
+ (proxy_listener->n_mr_sections - 1);
235
+ uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
236
+ uint64_t prev_size = int128_get64(prev_sec->size);
237
+ uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
238
+ uint64_t prev_host_start =
239
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
240
+ prev_sec->offset_within_region;
241
+ uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
242
+
243
+ if (mrs_gpa <= (prev_gpa_end + 1)) {
244
+ g_assert(mrs_gpa > prev_gpa_start);
245
+
246
+ if ((section->mr == prev_sec->mr) &&
247
+ proxy_mrs_can_merge(mrs_host, prev_host_start,
248
+ (mrs_gpa - prev_gpa_start))) {
249
+ uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
250
+ merged = true;
251
+ prev_sec->offset_within_address_space =
252
+ MIN(prev_gpa_start, mrs_gpa);
253
+ prev_sec->offset_within_region =
254
+ MIN(prev_host_start, mrs_host) -
255
+ (uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
256
+ prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
257
+ mrs_host));
258
+ }
259
+ }
260
+
261
+ return merged;
262
+}
263
+
264
+static void proxy_memory_listener_region_addnop(MemoryListener *listener,
265
+ MemoryRegionSection *section)
266
+{
267
+ ProxyMemoryListener *proxy_listener = container_of(listener,
268
+ ProxyMemoryListener,
269
+ listener);
270
+
271
+ if (!memory_region_is_ram(section->mr) ||
272
+ memory_region_is_rom(section->mr)) {
273
+ return;
274
+ }
275
+
276
+ if (try_merge(proxy_listener, section)) {
277
+ return;
278
+ }
279
+
280
+ ++proxy_listener->n_mr_sections;
281
+ proxy_listener->mr_sections = g_renew(MemoryRegionSection,
282
+ proxy_listener->mr_sections,
283
+ proxy_listener->n_mr_sections);
284
+ proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section;
285
+ proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL;
286
+ memory_region_ref(section->mr);
287
+}
288
+
289
+static void proxy_memory_listener_commit(MemoryListener *listener)
290
+{
291
+ ProxyMemoryListener *proxy_listener = container_of(listener,
292
+ ProxyMemoryListener,
293
+ listener);
294
+ MPQemuMsg msg;
295
+ MemoryRegionSection *section;
296
+ ram_addr_t offset;
297
+ uintptr_t host_addr;
298
+ int region;
299
+ Error *local_err = NULL;
300
+
301
+ memset(&msg, 0, sizeof(MPQemuMsg));
302
+
303
+ msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
304
+ msg.num_fds = proxy_listener->n_mr_sections;
305
+ msg.size = sizeof(SyncSysmemMsg);
306
+ if (msg.num_fds > REMOTE_MAX_FDS) {
307
+ error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
308
+ return;
309
+ }
310
+
311
+ for (region = 0; region < proxy_listener->n_mr_sections; region++) {
312
+ section = &proxy_listener->mr_sections[region];
313
+ msg.data.sync_sysmem.gpas[region] =
314
+ section->offset_within_address_space;
315
+ msg.data.sync_sysmem.sizes[region] = int128_get64(section->size);
316
+ host_addr = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
317
+ section->offset_within_region;
318
+ msg.fds[region] = get_fd_from_hostaddr(host_addr, &offset);
319
+ msg.data.sync_sysmem.offsets[region] = offset;
320
+ }
321
+ if (!mpqemu_msg_send(&msg, proxy_listener->ioc, &local_err)) {
322
+ error_report_err(local_err);
323
+ }
324
+}
325
+
326
+void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
327
+{
328
+ memory_listener_unregister(&proxy_listener->listener);
329
+
330
+ proxy_memory_listener_reset(&proxy_listener->listener);
331
+}
332
+
333
+void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
334
+ QIOChannel *ioc)
335
+{
336
+ proxy_listener->n_mr_sections = 0;
337
+ proxy_listener->mr_sections = NULL;
338
+
339
+ proxy_listener->ioc = ioc;
340
+
341
+ proxy_listener->listener.begin = proxy_memory_listener_reset;
342
+ proxy_listener->listener.commit = proxy_memory_listener_commit;
343
+ proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
344
+ proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
345
+ proxy_listener->listener.priority = 10;
346
+
347
+ memory_listener_register(&proxy_listener->listener,
348
+ &address_space_memory);
349
+}
350
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
351
index XXXXXXX..XXXXXXX 100644
352
--- a/hw/remote/proxy.c
353
+++ b/hw/remote/proxy.c
354
@@ -XXX,XX +XXX,XX @@
355
#include "qemu/sockets.h"
356
#include "hw/remote/mpqemu-link.h"
357
#include "qemu/error-report.h"
358
+#include "hw/remote/proxy-memory-listener.h"
359
+#include "qom/object.h"
360
361
static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
362
{
363
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
364
365
qemu_mutex_init(&dev->io_mutex);
366
qio_channel_set_blocking(dev->ioc, true, NULL);
367
+
368
+ proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
369
}
370
371
static void pci_proxy_dev_exit(PCIDevice *pdev)
372
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
373
migrate_del_blocker(dev->migration_blocker);
374
375
error_free(dev->migration_blocker);
376
+
377
+ proxy_memory_listener_deconfigure(&dev->proxy_listener);
378
}
379
380
static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
381
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
382
index XXXXXXX..XXXXXXX 100644
383
--- a/hw/remote/meson.build
384
+++ b/hw/remote/meson.build
385
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
386
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
387
388
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
389
+specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
390
391
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
392
--
393
2.29.2
394
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
An IOHUB object is added to manage PCI IRQs. It uses the KVM_IRQFD
4
ioctl to create an irqfd for injecting PCI interrupts into the guest.
5
The IOHUB object forwards the irqfd to the remote process. The remote process
6
uses this fd to send interrupts directly to the guest, bypassing QEMU.
7
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
11
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
12
Message-id: 51d5c3d54e28a68b002e3875c59599c9f5a424a1.1611938319.git.jag.raman@oracle.com
13
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
14
---
15
MAINTAINERS | 2 +
16
include/hw/pci/pci_ids.h | 3 +
17
include/hw/remote/iohub.h | 42 +++++++++++
18
include/hw/remote/machine.h | 2 +
19
include/hw/remote/mpqemu-link.h | 1 +
20
include/hw/remote/proxy.h | 4 ++
21
hw/remote/iohub.c | 119 ++++++++++++++++++++++++++++++++
22
hw/remote/machine.c | 10 +++
23
hw/remote/message.c | 4 ++
24
hw/remote/mpqemu-link.c | 5 ++
25
hw/remote/proxy.c | 56 +++++++++++++++
26
hw/remote/meson.build | 1 +
27
12 files changed, 249 insertions(+)
28
create mode 100644 include/hw/remote/iohub.h
29
create mode 100644 hw/remote/iohub.c
30
31
diff --git a/MAINTAINERS b/MAINTAINERS
32
index XXXXXXX..XXXXXXX 100644
33
--- a/MAINTAINERS
34
+++ b/MAINTAINERS
35
@@ -XXX,XX +XXX,XX @@ F: hw/remote/proxy.c
36
F: include/hw/remote/proxy.h
37
F: hw/remote/proxy-memory-listener.c
38
F: include/hw/remote/proxy-memory-listener.h
39
+F: hw/remote/iohub.c
40
+F: include/hw/remote/iohub.h
41
42
Build and test automation
43
-------------------------
44
diff --git a/include/hw/pci/pci_ids.h b/include/hw/pci/pci_ids.h
45
index XXXXXXX..XXXXXXX 100644
46
--- a/include/hw/pci/pci_ids.h
47
+++ b/include/hw/pci/pci_ids.h
48
@@ -XXX,XX +XXX,XX @@
49
#define PCI_DEVICE_ID_SUN_SIMBA 0x5000
50
#define PCI_DEVICE_ID_SUN_SABRE 0xa000
51
52
+#define PCI_VENDOR_ID_ORACLE 0x108e
53
+#define PCI_DEVICE_ID_REMOTE_IOHUB 0xb000
54
+
55
#define PCI_VENDOR_ID_CMD 0x1095
56
#define PCI_DEVICE_ID_CMD_646 0x0646
57
58
diff --git a/include/hw/remote/iohub.h b/include/hw/remote/iohub.h
59
new file mode 100644
60
index XXXXXXX..XXXXXXX
61
--- /dev/null
62
+++ b/include/hw/remote/iohub.h
63
@@ -XXX,XX +XXX,XX @@
64
+/*
65
+ * IO Hub for remote device
66
+ *
67
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
68
+ *
69
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
70
+ * See the COPYING file in the top-level directory.
71
+ *
72
+ */
73
+
74
+#ifndef REMOTE_IOHUB_H
75
+#define REMOTE_IOHUB_H
76
+
77
+#include "hw/pci/pci.h"
78
+#include "qemu/event_notifier.h"
79
+#include "qemu/thread-posix.h"
80
+#include "hw/remote/mpqemu-link.h"
81
+
82
+#define REMOTE_IOHUB_NB_PIRQS PCI_DEVFN_MAX
83
+
84
+typedef struct ResampleToken {
85
+ void *iohub;
86
+ int pirq;
87
+} ResampleToken;
88
+
89
+typedef struct RemoteIOHubState {
90
+ PCIDevice d;
91
+ EventNotifier irqfds[REMOTE_IOHUB_NB_PIRQS];
92
+ EventNotifier resamplefds[REMOTE_IOHUB_NB_PIRQS];
93
+ unsigned int irq_level[REMOTE_IOHUB_NB_PIRQS];
94
+ ResampleToken token[REMOTE_IOHUB_NB_PIRQS];
95
+ QemuMutex irq_level_lock[REMOTE_IOHUB_NB_PIRQS];
96
+} RemoteIOHubState;
97
+
98
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx);
99
+void remote_iohub_set_irq(void *opaque, int pirq, int level);
100
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg);
101
+
102
+void remote_iohub_init(RemoteIOHubState *iohub);
103
+void remote_iohub_finalize(RemoteIOHubState *iohub);
104
+
105
+#endif
106
diff --git a/include/hw/remote/machine.h b/include/hw/remote/machine.h
107
index XXXXXXX..XXXXXXX 100644
108
--- a/include/hw/remote/machine.h
109
+++ b/include/hw/remote/machine.h
110
@@ -XXX,XX +XXX,XX @@
111
#include "hw/boards.h"
112
#include "hw/pci-host/remote.h"
113
#include "io/channel.h"
114
+#include "hw/remote/iohub.h"
115
116
struct RemoteMachineState {
117
MachineState parent_obj;
118
119
RemotePCIHost *host;
120
+ RemoteIOHubState iohub;
121
};
122
123
/* Used to pass to co-routine device and ioc. */
124
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
125
index XXXXXXX..XXXXXXX 100644
126
--- a/include/hw/remote/mpqemu-link.h
127
+++ b/include/hw/remote/mpqemu-link.h
128
@@ -XXX,XX +XXX,XX @@ typedef enum {
129
MPQEMU_CMD_PCI_CFGREAD,
130
MPQEMU_CMD_BAR_WRITE,
131
MPQEMU_CMD_BAR_READ,
132
+ MPQEMU_CMD_SET_IRQFD,
133
MPQEMU_CMD_MAX,
134
} MPQemuCmd;
135
136
diff --git a/include/hw/remote/proxy.h b/include/hw/remote/proxy.h
137
index XXXXXXX..XXXXXXX 100644
138
--- a/include/hw/remote/proxy.h
139
+++ b/include/hw/remote/proxy.h
140
@@ -XXX,XX +XXX,XX @@
141
#include "hw/pci/pci.h"
142
#include "io/channel.h"
143
#include "hw/remote/proxy-memory-listener.h"
144
+#include "qemu/event_notifier.h"
145
146
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
147
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
148
@@ -XXX,XX +XXX,XX @@ struct PCIProxyDev {
149
QIOChannel *ioc;
150
Error *migration_blocker;
151
ProxyMemoryListener proxy_listener;
152
+ int virq;
153
+ EventNotifier intr;
154
+ EventNotifier resample;
155
ProxyMemoryRegion region[PCI_NUM_REGIONS];
156
};
157
158
diff --git a/hw/remote/iohub.c b/hw/remote/iohub.c
159
new file mode 100644
160
index XXXXXXX..XXXXXXX
161
--- /dev/null
162
+++ b/hw/remote/iohub.c
163
@@ -XXX,XX +XXX,XX @@
164
+/*
165
+ * Remote IO Hub
166
+ *
167
+ * Copyright © 2018, 2021 Oracle and/or its affiliates.
168
+ *
169
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
170
+ * See the COPYING file in the top-level directory.
171
+ *
172
+ */
173
+
174
+#include "qemu/osdep.h"
175
+#include "qemu-common.h"
176
+
177
+#include "hw/pci/pci.h"
178
+#include "hw/pci/pci_ids.h"
179
+#include "hw/pci/pci_bus.h"
180
+#include "qemu/thread.h"
181
+#include "hw/boards.h"
182
+#include "hw/remote/machine.h"
183
+#include "hw/remote/iohub.h"
184
+#include "qemu/main-loop.h"
185
+
186
+void remote_iohub_init(RemoteIOHubState *iohub)
187
+{
188
+ int pirq;
189
+
190
+ memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
191
+ memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));
192
+
193
+ for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
194
+ qemu_mutex_init(&iohub->irq_level_lock[pirq]);
195
+ iohub->irq_level[pirq] = 0;
196
+ event_notifier_init_fd(&iohub->irqfds[pirq], -1);
197
+ event_notifier_init_fd(&iohub->resamplefds[pirq], -1);
198
+ }
199
+}
200
+
201
+void remote_iohub_finalize(RemoteIOHubState *iohub)
202
+{
203
+ int pirq;
204
+
205
+ for (pirq = 0; pirq < REMOTE_IOHUB_NB_PIRQS; pirq++) {
206
+ qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
207
+ NULL, NULL, NULL);
208
+ event_notifier_cleanup(&iohub->irqfds[pirq]);
209
+ event_notifier_cleanup(&iohub->resamplefds[pirq]);
210
+ qemu_mutex_destroy(&iohub->irq_level_lock[pirq]);
211
+ }
212
+}
213
+
214
+int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
215
+{
216
+ return pci_dev->devfn;
217
+}
218
+
219
+void remote_iohub_set_irq(void *opaque, int pirq, int level)
220
+{
221
+ RemoteIOHubState *iohub = opaque;
222
+
223
+ assert(pirq >= 0);
224
+ assert(pirq < PCI_DEVFN_MAX);
225
+
226
+ QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
227
+
228
+ if (level) {
229
+ if (++iohub->irq_level[pirq] == 1) {
230
+ event_notifier_set(&iohub->irqfds[pirq]);
231
+ }
232
+ } else if (iohub->irq_level[pirq] > 0) {
233
+ iohub->irq_level[pirq]--;
234
+ }
235
+}
236
+
237
+static void intr_resample_handler(void *opaque)
238
+{
239
+ ResampleToken *token = opaque;
240
+ RemoteIOHubState *iohub = token->iohub;
241
+ int pirq, s;
242
+
243
+ pirq = token->pirq;
244
+
245
+ s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]);
246
+
247
+ assert(s >= 0);
248
+
249
+ QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
250
+
251
+ if (iohub->irq_level[pirq]) {
252
+ event_notifier_set(&iohub->irqfds[pirq]);
253
+ }
254
+}
255
+
256
+void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
257
+{
258
+ RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
259
+ RemoteIOHubState *iohub = &machine->iohub;
260
+ int pirq, intx;
261
+
262
+ intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
263
+
264
+ pirq = remote_iohub_map_irq(pci_dev, intx);
265
+
266
+ if (event_notifier_get_fd(&iohub->irqfds[pirq]) != -1) {
267
+ qemu_set_fd_handler(event_notifier_get_fd(&iohub->resamplefds[pirq]),
268
+ NULL, NULL, NULL);
269
+ event_notifier_cleanup(&iohub->irqfds[pirq]);
270
+ event_notifier_cleanup(&iohub->resamplefds[pirq]);
271
+ memset(&iohub->token[pirq], 0, sizeof(ResampleToken));
272
+ }
273
+
274
+ event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]);
275
+ event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]);
276
+
277
+ iohub->token[pirq].iohub = iohub;
278
+ iohub->token[pirq].pirq = pirq;
279
+
280
+ qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL,
281
+ &iohub->token[pirq]);
282
+}
283
diff --git a/hw/remote/machine.c b/hw/remote/machine.c
284
index XXXXXXX..XXXXXXX 100644
285
--- a/hw/remote/machine.c
286
+++ b/hw/remote/machine.c
287
@@ -XXX,XX +XXX,XX @@
288
#include "exec/address-spaces.h"
289
#include "exec/memory.h"
290
#include "qapi/error.h"
291
+#include "hw/pci/pci_host.h"
292
+#include "hw/remote/iohub.h"
293
294
static void remote_machine_init(MachineState *machine)
295
{
296
MemoryRegion *system_memory, *system_io, *pci_memory;
297
RemoteMachineState *s = REMOTE_MACHINE(machine);
298
RemotePCIHost *rem_host;
299
+ PCIHostState *pci_host;
300
301
system_memory = get_system_memory();
302
system_io = get_system_io();
303
@@ -XXX,XX +XXX,XX @@ static void remote_machine_init(MachineState *machine)
304
memory_region_add_subregion_overlap(system_memory, 0x0, pci_memory, -1);
305
306
qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);
307
+
308
+ pci_host = PCI_HOST_BRIDGE(rem_host);
309
+
310
+ remote_iohub_init(&s->iohub);
311
+
312
+ pci_bus_irqs(pci_host->bus, remote_iohub_set_irq, remote_iohub_map_irq,
313
+ &s->iohub, REMOTE_IOHUB_NB_PIRQS);
314
}
315
316
static void remote_machine_class_init(ObjectClass *oc, void *data)
317
diff --git a/hw/remote/message.c b/hw/remote/message.c
318
index XXXXXXX..XXXXXXX 100644
319
--- a/hw/remote/message.c
320
+++ b/hw/remote/message.c
321
@@ -XXX,XX +XXX,XX @@
322
#include "hw/pci/pci.h"
323
#include "exec/memattrs.h"
324
#include "hw/remote/memory.h"
325
+#include "hw/remote/iohub.h"
326
327
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
328
MPQemuMsg *msg, Error **errp);
329
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
330
case MPQEMU_CMD_SYNC_SYSMEM:
331
remote_sysmem_reconfig(&msg, &local_err);
332
break;
333
+ case MPQEMU_CMD_SET_IRQFD:
334
+ process_set_irqfd_msg(pci_dev, &msg);
335
+ break;
336
default:
337
error_setg(&local_err,
338
"Unknown command (%d) received for device %s"
339
diff --git a/hw/remote/mpqemu-link.c b/hw/remote/mpqemu-link.c
340
index XXXXXXX..XXXXXXX 100644
341
--- a/hw/remote/mpqemu-link.c
342
+++ b/hw/remote/mpqemu-link.c
343
@@ -XXX,XX +XXX,XX @@ bool mpqemu_msg_valid(MPQemuMsg *msg)
344
return false;
345
}
346
break;
347
+ case MPQEMU_CMD_SET_IRQFD:
348
+ if (msg->size || (msg->num_fds != 2)) {
349
+ return false;
350
+ }
351
+ break;
352
default:
353
break;
354
}
355
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
356
index XXXXXXX..XXXXXXX 100644
357
--- a/hw/remote/proxy.c
358
+++ b/hw/remote/proxy.c
359
@@ -XXX,XX +XXX,XX @@
360
#include "qemu/error-report.h"
361
#include "hw/remote/proxy-memory-listener.h"
362
#include "qom/object.h"
363
+#include "qemu/event_notifier.h"
364
+#include "sysemu/kvm.h"
365
+#include "util/event_notifier-posix.c"
366
+
367
+static void proxy_intx_update(PCIDevice *pci_dev)
368
+{
369
+ PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
370
+ PCIINTxRoute route;
371
+ int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
372
+
373
+ if (dev->virq != -1) {
374
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &dev->intr, dev->virq);
375
+ dev->virq = -1;
376
+ }
377
+
378
+ route = pci_device_route_intx_to_irq(pci_dev, pin);
379
+
380
+ dev->virq = route.irq;
381
+
382
+ if (dev->virq != -1) {
383
+ kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &dev->intr,
384
+ &dev->resample, dev->virq);
385
+ }
386
+}
387
+
388
+static void setup_irqfd(PCIProxyDev *dev)
389
+{
390
+ PCIDevice *pci_dev = PCI_DEVICE(dev);
391
+ MPQemuMsg msg;
392
+ Error *local_err = NULL;
393
+
394
+ event_notifier_init(&dev->intr, 0);
395
+ event_notifier_init(&dev->resample, 0);
396
+
397
+ memset(&msg, 0, sizeof(MPQemuMsg));
398
+ msg.cmd = MPQEMU_CMD_SET_IRQFD;
399
+ msg.num_fds = 2;
400
+ msg.fds[0] = event_notifier_get_fd(&dev->intr);
401
+ msg.fds[1] = event_notifier_get_fd(&dev->resample);
402
+ msg.size = 0;
403
+
404
+ if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) {
405
+ error_report_err(local_err);
406
+ }
407
+
408
+ dev->virq = -1;
409
+
410
+ proxy_intx_update(pci_dev);
411
+
412
+ pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
413
+}
414
415
static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
416
{
417
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
418
qio_channel_set_blocking(dev->ioc, true, NULL);
419
420
proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
421
+
422
+ setup_irqfd(dev);
423
}
424
425
static void pci_proxy_dev_exit(PCIDevice *pdev)
426
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_exit(PCIDevice *pdev)
427
error_free(dev->migration_blocker);
428
429
proxy_memory_listener_deconfigure(&dev->proxy_listener);
430
+
431
+ event_notifier_cleanup(&dev->intr);
432
+ event_notifier_cleanup(&dev->resample);
433
}
434
435
static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
436
diff --git a/hw/remote/meson.build b/hw/remote/meson.build
437
index XXXXXXX..XXXXXXX 100644
438
--- a/hw/remote/meson.build
439
+++ b/hw/remote/meson.build
440
@@ -XXX,XX +XXX,XX @@ remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
441
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
442
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
443
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
444
+remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
445
446
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
447
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
448
--
449
2.29.2
450
diff view generated by jsdifflib
New patch
1
From: Jagannathan Raman <jag.raman@oracle.com>
1
2
3
Retrieve PCI configuration info about the remote device and
4
configure the Proxy PCI object based on the returned information.
5
6
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
7
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
8
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
9
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
10
Message-id: 85ee367bbb993aa23699b44cfedd83b4ea6d5221.1611938319.git.jag.raman@oracle.com
11
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
12
---
13
hw/remote/proxy.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++
14
1 file changed, 84 insertions(+)
15
16
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
17
index XXXXXXX..XXXXXXX 100644
18
--- a/hw/remote/proxy.c
19
+++ b/hw/remote/proxy.c
20
@@ -XXX,XX +XXX,XX @@
21
#include "sysemu/kvm.h"
22
#include "util/event_notifier-posix.c"
23
24
+static void probe_pci_info(PCIDevice *dev, Error **errp);
25
+
26
static void proxy_intx_update(PCIDevice *pci_dev)
27
{
28
PCIProxyDev *dev = PCI_PROXY_DEV(pci_dev);
29
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
30
{
31
ERRP_GUARD();
32
PCIProxyDev *dev = PCI_PROXY_DEV(device);
33
+ uint8_t *pci_conf = device->config;
34
int fd;
35
36
if (!dev->fd) {
37
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
38
qemu_mutex_init(&dev->io_mutex);
39
qio_channel_set_blocking(dev->ioc, true, NULL);
40
41
+ pci_conf[PCI_LATENCY_TIMER] = 0xff;
42
+ pci_conf[PCI_INTERRUPT_PIN] = 0x01;
43
+
44
proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);
45
46
setup_irqfd(dev);
47
+
48
+ probe_pci_info(PCI_DEVICE(dev), errp);
49
}
50
51
static void pci_proxy_dev_exit(PCIDevice *pdev)
52
@@ -XXX,XX +XXX,XX @@ const MemoryRegionOps proxy_mr_ops = {
53
.max_access_size = 8,
54
},
55
};
56
+
57
+static void probe_pci_info(PCIDevice *dev, Error **errp)
58
+{
59
+ PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
60
+ uint32_t orig_val, new_val, base_class, val;
61
+ PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
62
+ DeviceClass *dc = DEVICE_CLASS(pc);
63
+ uint8_t type;
64
+ int i, size;
65
+
66
+ config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
67
+ pc->vendor_id = (uint16_t)val;
68
+
69
+ config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
70
+ pc->device_id = (uint16_t)val;
71
+
72
+ config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
73
+ pc->class_id = (uint16_t)val;
74
+
75
+ config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
76
+ pc->subsystem_id = (uint16_t)val;
77
+
78
+ base_class = pc->class_id >> 4;
79
+ switch (base_class) {
80
+ case PCI_BASE_CLASS_BRIDGE:
81
+ set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
82
+ break;
83
+ case PCI_BASE_CLASS_STORAGE:
84
+ set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
85
+ break;
86
+ case PCI_BASE_CLASS_NETWORK:
87
+ set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
88
+ break;
89
+ case PCI_BASE_CLASS_INPUT:
90
+ set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
91
+ break;
92
+ case PCI_BASE_CLASS_DISPLAY:
93
+ set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
94
+ break;
95
+ case PCI_BASE_CLASS_PROCESSOR:
96
+ set_bit(DEVICE_CATEGORY_CPU, dc->categories);
97
+ break;
98
+ default:
99
+ set_bit(DEVICE_CATEGORY_MISC, dc->categories);
100
+ break;
101
+ }
102
+
103
+ for (i = 0; i < PCI_NUM_REGIONS; i++) {
104
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
105
+ MPQEMU_CMD_PCI_CFGREAD);
106
+ new_val = 0xffffffff;
107
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
108
+ MPQEMU_CMD_PCI_CFGWRITE);
109
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
110
+ MPQEMU_CMD_PCI_CFGREAD);
111
+ size = (~(new_val & 0xFFFFFFF0)) + 1;
112
+ config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
113
+ MPQEMU_CMD_PCI_CFGWRITE);
114
+ type = (new_val & 0x1) ?
115
+ PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;
116
+
117
+ if (size) {
118
+ g_autofree char *name;
119
+ pdev->region[i].dev = pdev;
120
+ pdev->region[i].present = true;
121
+ if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
122
+ pdev->region[i].memory = true;
123
+ }
124
+ name = g_strdup_printf("bar-region-%d", i);
125
+ memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
126
+ &proxy_mr_ops, &pdev->region[i],
127
+ name, size);
128
+ pci_register_bar(dev, i, type, &pdev->region[i].mr);
129
+ }
130
+ }
131
+}
132
--
133
2.29.2
134
diff view generated by jsdifflib
New patch
1
From: Elena Ufimtseva <elena.ufimtseva@oracle.com>
1
2
3
Perform device reset in the remote process when QEMU performs
4
device reset. This is required to reset the internal state
5
(like registers, etc...) of emulated devices
6
7
Signed-off-by: Elena Ufimtseva <elena.ufimtseva@oracle.com>
8
Signed-off-by: John G Johnson <john.g.johnson@oracle.com>
9
Signed-off-by: Jagannathan Raman <jag.raman@oracle.com>
10
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
11
Message-id: 7cb220a51f565dc0817bd76e2f540e89c2d2b850.1611938319.git.jag.raman@oracle.com
12
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
13
---
14
include/hw/remote/mpqemu-link.h | 1 +
15
hw/remote/message.c | 22 ++++++++++++++++++++++
16
hw/remote/proxy.c | 19 +++++++++++++++++++
17
3 files changed, 42 insertions(+)
18
19
diff --git a/include/hw/remote/mpqemu-link.h b/include/hw/remote/mpqemu-link.h
20
index XXXXXXX..XXXXXXX 100644
21
--- a/include/hw/remote/mpqemu-link.h
22
+++ b/include/hw/remote/mpqemu-link.h
23
@@ -XXX,XX +XXX,XX @@ typedef enum {
24
MPQEMU_CMD_BAR_WRITE,
25
MPQEMU_CMD_BAR_READ,
26
MPQEMU_CMD_SET_IRQFD,
27
+ MPQEMU_CMD_DEVICE_RESET,
28
MPQEMU_CMD_MAX,
29
} MPQemuCmd;
30
31
diff --git a/hw/remote/message.c b/hw/remote/message.c
32
index XXXXXXX..XXXXXXX 100644
33
--- a/hw/remote/message.c
34
+++ b/hw/remote/message.c
35
@@ -XXX,XX +XXX,XX @@
36
#include "exec/memattrs.h"
37
#include "hw/remote/memory.h"
38
#include "hw/remote/iohub.h"
39
+#include "sysemu/reset.h"
40
41
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
42
MPQemuMsg *msg, Error **errp);
43
@@ -XXX,XX +XXX,XX @@ static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
44
MPQemuMsg *msg, Error **errp);
45
static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
46
static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
47
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
48
+ Error **errp);
49
50
void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
51
{
52
@@ -XXX,XX +XXX,XX @@ void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
53
case MPQEMU_CMD_SET_IRQFD:
54
process_set_irqfd_msg(pci_dev, &msg);
55
break;
56
+ case MPQEMU_CMD_DEVICE_RESET:
57
+ process_device_reset_msg(com->ioc, pci_dev, &local_err);
58
+ break;
59
default:
60
error_setg(&local_err,
61
"Unknown command (%d) received for device %s"
62
@@ -XXX,XX +XXX,XX @@ fail:
63
getpid());
64
}
65
}
66
+
67
+static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
68
+ Error **errp)
69
+{
70
+ DeviceClass *dc = DEVICE_GET_CLASS(dev);
71
+ DeviceState *s = DEVICE(dev);
72
+ MPQemuMsg ret = { 0 };
73
+
74
+ if (dc->reset) {
75
+ dc->reset(s);
76
+ }
77
+
78
+ ret.cmd = MPQEMU_CMD_RET;
79
+
80
+ mpqemu_msg_send(&ret, ioc, errp);
81
+}
82
diff --git a/hw/remote/proxy.c b/hw/remote/proxy.c
83
index XXXXXXX..XXXXXXX 100644
84
--- a/hw/remote/proxy.c
85
+++ b/hw/remote/proxy.c
86
@@ -XXX,XX +XXX,XX @@
87
#include "util/event_notifier-posix.c"
88
89
static void probe_pci_info(PCIDevice *dev, Error **errp);
90
+static void proxy_device_reset(DeviceState *dev);
91
92
static void proxy_intx_update(PCIDevice *pci_dev)
93
{
94
@@ -XXX,XX +XXX,XX @@ static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
95
k->config_read = pci_proxy_read_config;
96
k->config_write = pci_proxy_write_config;
97
98
+ dc->reset = proxy_device_reset;
99
+
100
device_class_set_props(dc, proxy_properties);
101
}
102
103
@@ -XXX,XX +XXX,XX @@ static void probe_pci_info(PCIDevice *dev, Error **errp)
104
}
105
}
106
}
107
+
108
+static void proxy_device_reset(DeviceState *dev)
109
+{
110
+ PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
111
+ MPQemuMsg msg = { 0 };
112
+ Error *local_err = NULL;
113
+
114
+ msg.cmd = MPQEMU_CMD_DEVICE_RESET;
115
+ msg.size = 0;
116
+
117
+ mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
118
+ if (local_err) {
119
+ error_report_err(local_err);
120
+ }
121
+
122
+}
123
--
124
2.29.2
125
diff view generated by jsdifflib
New patch
1
From: "Denis V. Lunev" <den@openvz.org>
1
2
3
The original specification says that the l1 table size is 64 * l1_size, which
4
is obviously wrong. The size of the l1 entry is 64 _bits_, not bytes.
5
Thus 64 is to be replaced with 8, as the specification is expressed in bytes.
6
7
There is also minor tweak, field name is renamed from l1 to l1_table,
8
which matches with the later text.
9
10
Signed-off-by: Denis V. Lunev <den@openvz.org>
11
Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
12
Message-id: 20210128171313.2210947-1-den@openvz.org
13
CC: Stefan Hajnoczi <stefanha@redhat.com>
14
CC: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
15
16
[Replace the original commit message "docs: fix mistake in dirty bitmap
17
feature description" as suggested by Eric Blake.
18
--Stefan]
19
20
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
21
---
22
docs/interop/parallels.txt | 2 +-
23
1 file changed, 1 insertion(+), 1 deletion(-)
24
25
diff --git a/docs/interop/parallels.txt b/docs/interop/parallels.txt
26
index XXXXXXX..XXXXXXX 100644
27
--- a/docs/interop/parallels.txt
28
+++ b/docs/interop/parallels.txt
29
@@ -XXX,XX +XXX,XX @@ of its data area are:
30
28 - 31: l1_size
31
The number of entries in the L1 table of the bitmap.
32
33
- variable: l1 (64 * l1_size bytes)
34
+ variable: l1_table (8 * l1_size bytes)
35
L1 offset table (in bytes)
36
37
A dirty bitmap is stored using a one-level structure for the mapping to host
38
--
39
2.29.2
40
diff view generated by jsdifflib