Add high-level test relying on the colo resource-agent to test
all failover cases while checking guest network connectivity.
Signed-off-by: Lukas Straub <lukasstraub2@web.de>
---
scripts/colo-resource-agent/crm_master | 44 ++
scripts/colo-resource-agent/crm_resource | 12 +
tests/acceptance/colo.py | 689 +++++++++++++++++++++++
3 files changed, 745 insertions(+)
create mode 100755 scripts/colo-resource-agent/crm_master
create mode 100755 scripts/colo-resource-agent/crm_resource
create mode 100644 tests/acceptance/colo.py
diff --git a/scripts/colo-resource-agent/crm_master b/scripts/colo-resource-agent/crm_master
new file mode 100755
index 0000000000..886f523bda
--- /dev/null
+++ b/scripts/colo-resource-agent/crm_master
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# Fake crm_master for COLO testing
+#
+# Copyright (c) Lukas Straub <lukasstraub2@web.de>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+TMPDIR="$HA_RSCTMP"
+score=0
+query=0
+
+OPTIND=1
+while getopts 'Qql:Dv:N:G' opt; do
+ case "$opt" in
+ Q|q)
+ # Noop
+ ;;
+ "l")
+ # Noop
+ ;;
+ "D")
+ score=0
+ ;;
+ "v")
+ score=$OPTARG
+ ;;
+ "N")
+ TMPDIR="$COLO_TEST_REMOTE_TMP"
+ ;;
+ "G")
+ query=1
+ ;;
+ esac
+done
+
+if (( query )); then
+ cat "${TMPDIR}/master_score" || exit 1
+else
+ echo $score > "${TMPDIR}/master_score" || exit 1
+fi
+
+exit 0
diff --git a/scripts/colo-resource-agent/crm_resource b/scripts/colo-resource-agent/crm_resource
new file mode 100755
index 0000000000..ad69ff3c6b
--- /dev/null
+++ b/scripts/colo-resource-agent/crm_resource
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+# Fake crm_resource for COLO testing
+#
+# Copyright (c) Lukas Straub <lukasstraub2@web.de>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+# Noop
+
+exit 0
diff --git a/tests/acceptance/colo.py b/tests/acceptance/colo.py
new file mode 100644
index 0000000000..465513fb6c
--- /dev/null
+++ b/tests/acceptance/colo.py
@@ -0,0 +1,689 @@
+# High-level test suite for QEMU COLO, testing all failover cases while
+# checking guest network connectivity
+#
+# Copyright (c) Lukas Straub <lukasstraub2@web.de>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or
+# later. See the COPYING file in the top-level directory.
+
+# HOWTO:
+#
+# This test has the following parameters:
+# bridge_name: name of the bridge interface to connect qemu to
+# host_address: IP address of the bridge interface
+# guest_address: IP address that the guest gets from the DHCP server
+# bridge_helper: path to the bridge helper
+# (default: /usr/lib/qemu/qemu-bridge-helper)
+# install_cmd: command to run to install iperf3 and memtester in the guest
+# (default: "sudo -n dnf -q -y install iperf3 memtester")
+#
+# To run the network tests, you have to specify the parameters.
+#
+# Example for running the colo tests:
+# make check-acceptance FEDORA_31_ARCHES="x86_64" AVOCADO_TAGS="-t colo \
+# -p bridge_name=br0 -p host_address=192.168.220.1 \
+# -p guest_address=192.168.220.222"
+#
+# The colo tests currently only use x86_64 test vm images. With the
+# FEDORA_31_ARCHES make variable as in the example, only the x86_64 images will
+# be downloaded.
+#
+# If you're running the network tests as an unprivileged user, you need to set
+# the suid bit on the bridge helper (chmod +s <bridge-helper>).
+#
+# The DHCP server should assign a static IP to the guest, otherwise the test
+# may be unreliable. The MAC address of the guest is always 52:54:00:12:34:56.
+
+
+import select
+import sys
+import subprocess
+import shutil
+import os
+import signal
+import os.path
+import time
+import tempfile
+
+from avocado import Test
+from avocado import skipUnless
+from avocado.utils import network
+from avocado.utils import vmimage
+from avocado.utils import cloudinit
+from avocado.utils import ssh
+from avocado.utils.path import find_command, CmdNotFoundError
+
+from avocado_qemu import pick_default_qemu_bin, BUILD_DIR, SOURCE_DIR
+from qemu.qmp import QEMUMonitorProtocol
+
+def iperf3_available():
+ try:
+ find_command("iperf3")
+ except CmdNotFoundError:
+ return False
+ return True
+
+class ColoTest(Test):
+
+ # Constants
+ OCF_SUCCESS = 0
+ OCF_ERR_GENERIC = 1
+ OCF_ERR_ARGS = 2
+ OCF_ERR_UNIMPLEMENTED = 3
+ OCF_ERR_PERM = 4
+ OCF_ERR_INSTALLED = 5
+ OCF_ERR_CONFIGURED = 6
+ OCF_NOT_RUNNING = 7
+ OCF_RUNNING_MASTER = 8
+ OCF_FAILED_MASTER = 9
+
+ HOSTA = 10
+ HOSTB = 11
+
+ QEMU_OPTIONS = (" -display none -vga none -enable-kvm"
+ " -smp 2 -cpu host -m 768"
+ " -device e1000,mac=52:54:00:12:34:56,netdev=hn0"
+ " -device virtio-blk,drive=colo-disk0")
+
+ FEDORA_VERSION = "31"
+ IMAGE_CHECKSUM = "e3c1b309d9203604922d6e255c2c5d098a309c2d46215d8fc026954f3c5c27a0"
+ IMAGE_SIZE = "4294967296b"
+
+ hang_qemu = False
+ checkpoint_failover = False
+ traffic_procs = []
+
+ def get_image(self, temp_dir):
+ try:
+ return vmimage.get(
+ "fedora", arch="x86_64", version=self.FEDORA_VERSION,
+ checksum=self.IMAGE_CHECKSUM, algorithm="sha256",
+ cache_dir=self.cache_dirs[0],
+ snapshot_dir=temp_dir)
+ except Exception:
+ self.cancel("Failed to download/prepare image")
+
+ @skipUnless(ssh.SSH_CLIENT_BINARY, "No SSH client available")
+ def setUp(self):
+ # Qemu and qemu-img binary
+ default_qemu_bin = pick_default_qemu_bin()
+ self.QEMU_BINARY = self.params.get("qemu_bin", default=default_qemu_bin)
+
+ # If qemu-img has been built, use it, otherwise the system wide one
+ # will be used. If none is available, the test will cancel.
+ qemu_img = os.path.join(BUILD_DIR, "qemu-img")
+ if not os.path.exists(qemu_img):
+ qemu_img = find_command("qemu-img", False)
+ if qemu_img is False:
+ self.cancel("Could not find \"qemu-img\", which is required to "
+ "create the bootable image")
+ self.QEMU_IMG_BINARY = qemu_img
+ vmimage.QEMU_IMG = qemu_img
+
+ self.RESOURCE_AGENT = os.path.join(SOURCE_DIR,
+ "scripts/colo-resource-agent/colo")
+ self.ADD_PATH = os.path.join(SOURCE_DIR, "scripts/colo-resource-agent")
+
+ # Logs
+ self.RA_LOG = os.path.join(self.outputdir, "resource-agent.log")
+ self.HOSTA_LOGDIR = os.path.join(self.outputdir, "hosta")
+ self.HOSTB_LOGDIR = os.path.join(self.outputdir, "hostb")
+ os.makedirs(self.HOSTA_LOGDIR)
+ os.makedirs(self.HOSTB_LOGDIR)
+
+ # Temporary directories
+ # We don't use self.workdir because of unix socket path length
+ # limitations
+ self.TMPDIR = tempfile.mkdtemp()
+ self.TMPA = os.path.join(self.TMPDIR, "hosta")
+ self.TMPB = os.path.join(self.TMPDIR, "hostb")
+ os.makedirs(self.TMPA)
+ os.makedirs(self.TMPB)
+
+ # Network
+ self.BRIDGE_NAME = self.params.get("bridge_name")
+ if self.BRIDGE_NAME:
+ self.HOST_ADDRESS = self.params.get("host_address")
+ self.GUEST_ADDRESS = self.params.get("guest_address")
+ self.BRIDGE_HELPER = self.params.get("bridge_helper",
+ default="/usr/lib/qemu/qemu-bridge-helper")
+ self.SSH_PORT = 22
+ else:
+ # QEMU's hard coded usermode router address
+ self.HOST_ADDRESS = "10.0.2.2"
+ self.GUEST_ADDRESS = "127.0.0.1"
+ self.BRIDGE_HOSTA_PORT = network.find_free_port(address="127.0.0.1")
+ self.BRIDGE_HOSTB_PORT = network.find_free_port(address="127.0.0.1")
+ self.SSH_PORT = network.find_free_port(address="127.0.0.1")
+
+ self.CLOUDINIT_HOME_PORT = network.find_free_port()
+
+ # Find free port range
+ base_port = 1024
+ while True:
+ base_port = network.find_free_port(start_port=base_port,
+ address="127.0.0.1")
+ if base_port is None:
+ self.cancel("Failed to find a free port")
+ for n in range(base_port, base_port + 4):
+ if n > 65535:
+ break
+ if not network.is_port_free(n, "127.0.0.1"):
+ break
+ else:
+ # for loop above didn't break
+ break
+
+ self.BASE_PORT = base_port
+
+ # Disk images
+ self.log.info("Downloading/preparing boot image")
+ self.HOSTA_IMAGE = self.get_image(self.TMPA).path
+ self.HOSTB_IMAGE = self.get_image(self.TMPB).path
+ self.CLOUDINIT_ISO = os.path.join(self.TMPDIR, "cloudinit.iso")
+
+ self.log.info("Preparing cloudinit image")
+ try:
+ cloudinit.iso(self.CLOUDINIT_ISO, self.name,
+ username="test", password="password",
+ phone_home_host=self.HOST_ADDRESS,
+ phone_home_port=self.CLOUDINIT_HOME_PORT)
+ except Exception as e:
+ self.cancel("Failed to prepare cloudinit image")
+
+ self.QEMU_OPTIONS += " -cdrom %s" % self.CLOUDINIT_ISO
+
+ # Network bridge
+ if not self.BRIDGE_NAME:
+ self.BRIDGE_PIDFILE = os.path.join(self.TMPDIR, "bridge.pid")
+ self.run_command(("'%s' -pidfile '%s'"
+ " -M none -display none -daemonize"
+ " -netdev user,id=host,hostfwd=tcp:127.0.0.1:%s-:22"
+ " -netdev socket,id=hosta,listen=127.0.0.1:%s"
+ " -netdev socket,id=hostb,listen=127.0.0.1:%s"
+ " -netdev hubport,id=hostport,hubid=0,netdev=host"
+ " -netdev hubport,id=porta,hubid=0,netdev=hosta"
+ " -netdev hubport,id=portb,hubid=0,netdev=hostb")
+ % (self.QEMU_BINARY, self.BRIDGE_PIDFILE, self.SSH_PORT,
+ self.BRIDGE_HOSTA_PORT, self.BRIDGE_HOSTB_PORT), 0)
+
+ def tearDown(self):
+ try:
+ pid = self.read_pidfile(self.BRIDGE_PIDFILE)
+ if pid and self.check_pid(pid):
+ os.kill(pid, signal.SIGKILL)
+ except Exception as e:
+ pass
+
+ try:
+ self.ra_stop(self.HOSTA)
+ except Exception as e:
+ pass
+
+ try:
+ self.ra_stop(self.HOSTB)
+ except Exception as e:
+ pass
+
+ try:
+ self.ssh_close()
+ except Exception as e:
+ pass
+
+ for proc in self.traffic_procs:
+ try:
+ os.killpg(proc.pid, signal.SIGTERM)
+ except Exception as e:
+ pass
+
+ shutil.rmtree(self.TMPDIR)
+
+ def run_command(self, cmdline, expected_status, env=None):
+ proc = subprocess.Popen(cmdline, shell=True, stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ universal_newlines=True, env=env)
+ stdout, stderr = proc.communicate()
+ if proc.returncode != expected_status:
+ self.fail("command \"%s\" failed with code %s:\n%s"
+ % (cmdline, proc.returncode, stdout))
+
+ return proc.returncode
+
+ def cat_line(self, path):
+ line=""
+ try:
+ fd = open(path, "r")
+ line = str.strip(fd.readline())
+ fd.close()
+ except:
+ pass
+ return line
+
+ def read_pidfile(self, pidfile):
+ try:
+ pid = int(self.cat_line(pidfile))
+ except ValueError:
+ return None
+ else:
+ return pid
+
+ def check_pid(self, pid):
+ try:
+ os.kill(pid, 0)
+ except OSError:
+ return False
+ else:
+ return True
+
+ def ssh_open(self):
+ self.ssh_conn = ssh.Session(self.GUEST_ADDRESS, self.SSH_PORT,
+ user="test", password="password")
+ self.ssh_conn.DEFAULT_OPTIONS += (("UserKnownHostsFile", "/dev/null"),)
+ for i in range(10):
+ try:
+ if self.ssh_conn.connect():
+ return
+ except Exception as e:
+ pass
+ time.sleep(4)
+ self.fail("sshd timeout")
+
+ def ssh_ping(self):
+ if self.ssh_conn.cmd("echo ping").stdout != b"ping\n":
+ self.fail("ssh ping failed")
+
+ def ssh_close(self):
+ self.ssh_conn.quit()
+
+ def setup_base_env(self, host):
+ PATH = os.getenv("PATH", "")
+ env = { "PATH": "%s:%s" % (self.ADD_PATH, PATH),
+ "HA_LOGFILE": self.RA_LOG,
+ "OCF_RESOURCE_INSTANCE": "colo-test",
+ "OCF_RESKEY_CRM_meta_clone_max": "2",
+ "OCF_RESKEY_CRM_meta_notify": "true",
+ "OCF_RESKEY_CRM_meta_timeout": "30000",
+ "OCF_RESKEY_qemu_binary": self.QEMU_BINARY,
+ "OCF_RESKEY_qemu_img_binary": self.QEMU_IMG_BINARY,
+ "OCF_RESKEY_disk_size": str(self.IMAGE_SIZE),
+ "OCF_RESKEY_checkpoint_interval": "10000",
+ "OCF_RESKEY_base_port": str(self.BASE_PORT),
+ "OCF_RESKEY_debug": "2"}
+
+ if host == self.HOSTA:
+ env.update({"OCF_RESKEY_options":
+ ("%s -qmp unix:%s,server,nowait"
+ " -drive if=none,id=parent0,file='%s'")
+ % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
+ self.HOSTA_IMAGE),
+ "OCF_RESKEY_active_hidden_dir": self.TMPA,
+ "OCF_RESKEY_listen_address": "127.0.0.1",
+ "OCF_RESKEY_log_dir": self.HOSTA_LOGDIR,
+ "OCF_RESKEY_CRM_meta_on_node": "127.0.0.1",
+ "HA_RSCTMP": self.TMPA,
+ "COLO_TEST_REMOTE_TMP": self.TMPB})
+
+ else:
+ env.update({"OCF_RESKEY_options":
+ ("%s -qmp unix:%s,server,nowait"
+ " -drive if=none,id=parent0,file='%s'")
+ % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
+ self.HOSTB_IMAGE),
+ "OCF_RESKEY_active_hidden_dir": self.TMPB,
+ "OCF_RESKEY_listen_address": "127.0.0.2",
+ "OCF_RESKEY_log_dir": self.HOSTB_LOGDIR,
+ "OCF_RESKEY_CRM_meta_on_node": "127.0.0.2",
+ "HA_RSCTMP": self.TMPB,
+ "COLO_TEST_REMOTE_TMP": self.TMPA})
+
+ if self.BRIDGE_NAME:
+ env["OCF_RESKEY_options"] += \
+ " -netdev bridge,id=hn0,br=%s,helper='%s'" \
+ % (self.BRIDGE_NAME, self.BRIDGE_HELPER)
+ else:
+ if host == self.HOSTA:
+ env["OCF_RESKEY_options"] += \
+ " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
+ % self.BRIDGE_HOSTA_PORT
+ else:
+ env["OCF_RESKEY_options"] += \
+ " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
+ % self.BRIDGE_HOSTB_PORT
+ return env
+
+ def ra_start(self, host):
+ env = self.setup_base_env(host)
+ self.run_command(self.RESOURCE_AGENT + " start", self.OCF_SUCCESS, env)
+
+ def ra_stop(self, host):
+ env = self.setup_base_env(host)
+ self.run_command(self.RESOURCE_AGENT + " stop", self.OCF_SUCCESS, env)
+
+ def ra_monitor(self, host, expected_status):
+ env = self.setup_base_env(host)
+ self.run_command(self.RESOURCE_AGENT + " monitor", expected_status, env)
+
+ def ra_promote(self, host):
+ env = self.setup_base_env(host)
+ self.run_command(self.RESOURCE_AGENT + " promote", self.OCF_SUCCESS,env)
+
+ def ra_notify_start(self, host):
+ env = self.setup_base_env(host)
+
+ env.update({"OCF_RESKEY_CRM_meta_notify_type": "post",
+ "OCF_RESKEY_CRM_meta_notify_operation": "start"})
+
+ if host == self.HOSTA:
+ env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
+ "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.2"})
+ else:
+ env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
+ "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.1"})
+
+ self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
+
+ def ra_notify_stop(self, host):
+ env = self.setup_base_env(host)
+
+ env.update({"OCF_RESKEY_CRM_meta_notify_type": "pre",
+ "OCF_RESKEY_CRM_meta_notify_operation": "stop"})
+
+ if host == self.HOSTA:
+ env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
+ "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.2"})
+ else:
+ env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
+ "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.1"})
+
+ self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
+
+ def get_pid(self, host):
+ if host == self.HOSTA:
+ return self.read_pidfile(os.path.join(self.TMPA,
+ "colo-test-qemu.pid"))
+ else:
+ return self.read_pidfile(os.path.join(self.TMPB,
+ "colo-test-qemu.pid"))
+
+ def get_master_score(self, host):
+ if host == self.HOSTA:
+ return int(self.cat_line(os.path.join(self.TMPA, "master_score")))
+ else:
+ return int(self.cat_line(os.path.join(self.TMPB, "master_score")))
+
+ def get_qmp_sock(self, host):
+ if host == self.HOSTA:
+ return os.path.join(self.TMPA, "my-qmp.sock")
+ else:
+ return os.path.join(self.TMPB, "my-qmp.sock")
+
+
+ def kill_qemu_pre(self, host):
+ pid = self.get_pid(host)
+
+ qmp_sock = self.get_qmp_sock(host)
+
+ if self.checkpoint_failover:
+ qmp_conn = QEMUMonitorProtocol(qmp_sock)
+ qmp_conn.settimeout(10)
+ qmp_conn.connect()
+ while True:
+ event = qmp_conn.pull_event(wait=True)
+ if event["event"] == "STOP":
+ break
+ qmp_conn.close()
+
+
+ if pid and self.check_pid(pid):
+ if self.hang_qemu:
+ os.kill(pid, signal.SIGSTOP)
+ else:
+ os.kill(pid, signal.SIGKILL)
+ while self.check_pid(pid):
+ time.sleep(1)
+
+ def kill_qemu_post(self, host):
+ pid = self.get_pid(host)
+
+ if self.hang_qemu and pid and self.check_pid(pid):
+ os.kill(pid, signal.SIGKILL)
+ while self.check_pid(pid):
+ time.sleep(1)
+
+ def prepare_guest(self):
+ pass
+
+ def cycle_start(self, cycle):
+ pass
+
+ def active_section(self):
+ return False
+
+ def cycle_end(self, cycle):
+ pass
+
+ def check_connection(self):
+ self.ssh_ping()
+ for proc in self.traffic_procs:
+ if proc.poll() is not None:
+ self.fail("Traffic process died")
+
+ def _test_colo(self, loop=1):
+ loop = max(loop, 1)
+ self.log.info("Will put logs in %s" % self.outputdir)
+
+ self.ra_stop(self.HOSTA)
+ self.ra_stop(self.HOSTB)
+
+ self.log.info("*** Startup ***")
+ self.ra_start(self.HOSTA)
+ self.ra_start(self.HOSTB)
+
+ self.ra_monitor(self.HOSTA, self.OCF_SUCCESS)
+ self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
+
+ self.log.info("*** Promoting ***")
+ self.ra_promote(self.HOSTA)
+ cloudinit.wait_for_phone_home(("0.0.0.0", self.CLOUDINIT_HOME_PORT),
+ self.name)
+ self.ssh_open()
+ self.prepare_guest()
+
+ self.ra_notify_start(self.HOSTA)
+
+ while self.get_master_score(self.HOSTB) != 100:
+ self.ra_monitor(self.HOSTA, self.OCF_RUNNING_MASTER)
+ self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
+ time.sleep(1)
+
+ self.log.info("*** Replication started ***")
+
+ self.check_connection()
+
+ primary = self.HOSTA
+ secondary = self.HOSTB
+
+ for n in range(loop):
+ self.cycle_start(n)
+ self.log.info("*** Secondary failover ***")
+ self.kill_qemu_pre(primary)
+ self.ra_notify_stop(secondary)
+ self.ra_monitor(secondary, self.OCF_SUCCESS)
+ self.ra_promote(secondary)
+ self.ra_monitor(secondary, self.OCF_RUNNING_MASTER)
+ self.kill_qemu_post(primary)
+
+ self.check_connection()
+
+ primary, secondary = secondary, primary
+
+ self.log.info("*** Secondary continue replication ***")
+ self.ra_start(secondary)
+ self.ra_notify_start(primary)
+
+ self.check_connection()
+
+ # Wait for resync
+ while self.get_master_score(secondary) != 100:
+ self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
+ self.ra_monitor(secondary, self.OCF_SUCCESS)
+ time.sleep(1)
+
+ self.log.info("*** Replication started ***")
+
+ self.check_connection()
+
+ if self.active_section():
+ break
+
+ self.log.info("*** Primary failover ***")
+ self.kill_qemu_pre(secondary)
+ self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
+ self.ra_notify_stop(primary)
+ self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
+ self.kill_qemu_post(secondary)
+
+ self.check_connection()
+
+ self.log.info("*** Primary continue replication ***")
+ self.ra_start(secondary)
+ self.ra_notify_start(primary)
+
+ self.check_connection()
+
+ # Wait for resync
+ while self.get_master_score(secondary) != 100:
+ self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
+ self.ra_monitor(secondary, self.OCF_SUCCESS)
+ time.sleep(1)
+
+ self.log.info("*** Replication started ***")
+
+ self.check_connection()
+
+ self.cycle_end(n)
+
+ self.ssh_close()
+
+ self.ra_stop(self.HOSTA)
+ self.ra_stop(self.HOSTB)
+
+ self.ra_monitor(self.HOSTA, self.OCF_NOT_RUNNING)
+ self.ra_monitor(self.HOSTB, self.OCF_NOT_RUNNING)
+ self.log.info("*** all ok ***")
+
+
+class ColoQuickTest(ColoTest):
+ """
+ :avocado: tags=colo
+ :avocado: tags=quick
+ :avocado: tags=arch:x86_64
+ """
+
+ timeout = 300
+
+ def cycle_end(self, cycle):
+ self.log.info("Testing with peer qemu hanging"
+ " and failover during checkpoint")
+ self.hang_qemu = True
+
+ def test_quick(self):
+ self.checkpoint_failover = True
+ self.log.info("Testing with peer qemu crashing"
+ " and failover during checkpoint")
+ self._test_colo(loop=2)
+
+
+class ColoNetworkTest(ColoTest):
+
+ def prepare_guest(self):
+ install_cmd = self.params.get("install_cmd", default=
+ "sudo -n dnf -q -y install iperf3 memtester")
+ self.ssh_conn.cmd(install_cmd)
+ # Use two instances to work around a bug in iperf3
+ self.ssh_conn.cmd("iperf3 -sD; iperf3 -sD -p 5202")
+
+ def _cycle_start(self, cycle):
+ pass
+
+ def cycle_start(self, cycle):
+ self._cycle_start(cycle)
+ tests = [("", "client-to-server tcp"), ("-R", "server-to-client tcp")]
+
+ self.log.info("Testing iperf %s" % tests[cycle % 2][1])
+ iperf_cmd = "iperf3 %s -t 60 -i 60 --connect-timeout 30000 -c %s" \
+ % (tests[cycle % 2][0], self.GUEST_ADDRESS)
+ proc = subprocess.Popen("while %s && %s; do sleep 1; done >>'%s' 2>&1"
+ % (iperf_cmd, iperf_cmd + " -p 5202",
+ os.path.join(self.outputdir, "iperf.log")),
+ shell=True, preexec_fn=os.setsid, stdin=subprocess.DEVNULL,
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+ self.traffic_procs.append(proc)
+ time.sleep(5) # Wait for iperf to get up to speed
+
+ def cycle_end(self, cycle):
+ for proc in self.traffic_procs:
+ try:
+ os.killpg(proc.pid, signal.SIGTERM)
+ proc.wait()
+ except Exception as e:
+ pass
+ self.traffic_procs.clear()
+ time.sleep(20)
+
+class ColoRealNetworkTest(ColoNetworkTest):
+ """
+ :avocado: tags=colo
+ :avocado: tags=slow
+ :avocado: tags=network_test
+ :avocado: tags=arch:x86_64
+ """
+
+ timeout = 900
+
+ def active_section(self):
+ time.sleep(300)
+ return False
+
+ @skipUnless(iperf3_available(), "iperf3 not available")
+ def test_network(self):
+ if not self.BRIDGE_NAME:
+ self.cancel("bridge options not given, will skip network test")
+ self.log.info("Testing with peer qemu crashing and network load")
+ self._test_colo(loop=2)
+
+class ColoStressTest(ColoNetworkTest):
+ """
+ :avocado: tags=colo
+ :avocado: tags=slow
+ :avocado: tags=stress_test
+ :avocado: tags=arch:x86_64
+ """
+
+ timeout = 1800
+
+ def _cycle_start(self, cycle):
+ if cycle == 4:
+ self.log.info("Stresstest with peer qemu hanging, network load"
+ " and failover during checkpoint")
+ self.checkpoint_failover = True
+ self.hang_qemu = True
+ elif cycle == 8:
+ self.log.info("Stresstest with peer qemu crashing and network load")
+ self.checkpoint_failover = False
+ self.hang_qemu = False
+ elif cycle == 12:
+ self.log.info("Stresstest with peer qemu hanging and network load")
+ self.checkpoint_failover = False
+ self.hang_qemu = True
+
+ @skipUnless(iperf3_available(), "iperf3 not available")
+ def test_stress(self):
+ if not self.BRIDGE_NAME:
+ self.cancel("bridge options not given, will skip network test")
+ self.log.info("Stresstest with peer qemu crashing, network load"
+ " and failover during checkpoint")
+ self.checkpoint_failover = True
+ self._test_colo(loop=16)
--
2.20.1
+Cleber/Wainer
On 5/11/20 2:27 PM, Lukas Straub wrote:
> Add high-level test relying on the colo resource-agent to test
> all failover cases while checking guest network connectivity.
>
> Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> ---
> scripts/colo-resource-agent/crm_master | 44 ++
> scripts/colo-resource-agent/crm_resource | 12 +
> tests/acceptance/colo.py | 689 +++++++++++++++++++++++
> 3 files changed, 745 insertions(+)
> create mode 100755 scripts/colo-resource-agent/crm_master
> create mode 100755 scripts/colo-resource-agent/crm_resource
> create mode 100644 tests/acceptance/colo.py
>
> diff --git a/scripts/colo-resource-agent/crm_master b/scripts/colo-resource-agent/crm_master
> new file mode 100755
> index 0000000000..886f523bda
> --- /dev/null
> +++ b/scripts/colo-resource-agent/crm_master
> @@ -0,0 +1,44 @@
> +#!/bin/bash
> +
> +# Fake crm_master for COLO testing
> +#
> +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later. See the COPYING file in the top-level directory.
> +
> +TMPDIR="$HA_RSCTMP"
> +score=0
> +query=0
> +
> +OPTIND=1
> +while getopts 'Qql:Dv:N:G' opt; do
> + case "$opt" in
> + Q|q)
> + # Noop
> + ;;
> + "l")
> + # Noop
> + ;;
> + "D")
> + score=0
> + ;;
> + "v")
> + score=$OPTARG
> + ;;
> + "N")
> + TMPDIR="$COLO_TEST_REMOTE_TMP"
> + ;;
> + "G")
> + query=1
> + ;;
> + esac
> +done
> +
> +if (( query )); then
> + cat "${TMPDIR}/master_score" || exit 1
> +else
> + echo $score > "${TMPDIR}/master_score" || exit 1
> +fi
> +
> +exit 0
> diff --git a/scripts/colo-resource-agent/crm_resource b/scripts/colo-resource-agent/crm_resource
> new file mode 100755
> index 0000000000..ad69ff3c6b
> --- /dev/null
> +++ b/scripts/colo-resource-agent/crm_resource
> @@ -0,0 +1,12 @@
> +#!/bin/sh
> +
> +# Fake crm_resource for COLO testing
> +#
> +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later. See the COPYING file in the top-level directory.
> +
> +# Noop
> +
> +exit 0
> diff --git a/tests/acceptance/colo.py b/tests/acceptance/colo.py
> new file mode 100644
> index 0000000000..465513fb6c
> --- /dev/null
> +++ b/tests/acceptance/colo.py
> @@ -0,0 +1,689 @@
> +# High-level test suite for qemu COLO testing all failover cases while checking
> +# guest network connectivity
> +#
> +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later. See the COPYING file in the top-level directory.
> +
> +# HOWTO:
> +#
> +# This test has the following parameters:
> +# bridge_name: name of the bridge interface to connect qemu to
> +# host_address: ip address of the bridge interface
> +# guest_address: ip address that the guest gets from the dhcp server
> +# bridge_helper: path to the brige helper
> +# (default: /usr/lib/qemu/qemu-bridge-helper)
> +# install_cmd: command to run to install iperf3 and memtester in the guest
> +# (default: "sudo -n dnf -q -y install iperf3 memtester")
> +#
> +# To run the network tests, you have to specify the parameters.
> +#
> +# Example for running the colo tests:
> +# make check-acceptance FEDORA_31_ARCHES="x86_64" AVOCADO_TAGS="-t colo \
> +# -p bridge_name=br0 -p host_address=192.168.220.1 \
> +# -p guest_address=192.168.220.222"
> +#
> +# The colo tests currently only use x86_64 test vm images. With the
> +# FEDORA_31_ARCHES make variable as in the example, only the x86_64 images will
> +# be downloaded.
> +#
> +# If you're running the network tests as an unprivileged user, you need to set
> +# the suid bit on the bridge helper (chmod +s <bridge-helper>).
> +#
> +# The dhcp server should assign a static ip to the guest, else the test may be
> +# unreliable. The Mac address for the guest is always 52:54:00:12:34:56.
> +
> +
> +import select
> +import sys
> +import subprocess
> +import shutil
> +import os
> +import signal
> +import os.path
> +import time
> +import tempfile
> +
> +from avocado import Test
> +from avocado import skipUnless
> +from avocado.utils import network
> +from avocado.utils import vmimage
> +from avocado.utils import cloudinit
> +from avocado.utils import ssh
> +from avocado.utils.path import find_command
> +
> +from avocado_qemu import pick_default_qemu_bin, BUILD_DIR, SOURCE_DIR
> +from qemu.qmp import QEMUMonitorProtocol
> +
> +def iperf3_available():
> + try:
> + find_command("iperf3")
> + except CmdNotFoundError:
> + return False
> + return True
> +
> +class ColoTest(Test):
> +
> + # Constants
> + OCF_SUCCESS = 0
> + OCF_ERR_GENERIC = 1
> + OCF_ERR_ARGS = 2
> + OCF_ERR_UNIMPLEMENTED = 3
> + OCF_ERR_PERM = 4
> + OCF_ERR_INSTALLED = 5
> + OCF_ERR_CONFIGURED = 6
> + OCF_NOT_RUNNING = 7
> + OCF_RUNNING_MASTER = 8
> + OCF_FAILED_MASTER = 9
> +
> + HOSTA = 10
> + HOSTB = 11
> +
> + QEMU_OPTIONS = (" -display none -vga none -enable-kvm"
> + " -smp 2 -cpu host -m 768"
> + " -device e1000,mac=52:54:00:12:34:56,netdev=hn0"
> + " -device virtio-blk,drive=colo-disk0")
> +
> + FEDORA_VERSION = "31"
> + IMAGE_CHECKSUM = "e3c1b309d9203604922d6e255c2c5d098a309c2d46215d8fc026954f3c5c27a0"
> + IMAGE_SIZE = "4294967296b"
> +
> + hang_qemu = False
> + checkpoint_failover = False
> + traffic_procs = []
> +
> + def get_image(self, temp_dir):
> + try:
> + return vmimage.get(
> + "fedora", arch="x86_64", version=self.FEDORA_VERSION,
> + checksum=self.IMAGE_CHECKSUM, algorithm="sha256",
> + cache_dir=self.cache_dirs[0],
> + snapshot_dir=temp_dir)
> + except:
> + self.cancel("Failed to download/prepare image")
> +
> + @skipUnless(ssh.SSH_CLIENT_BINARY, "No SSH client available")
> + def setUp(self):
> + # Qemu and qemu-img binary
> + default_qemu_bin = pick_default_qemu_bin()
> + self.QEMU_BINARY = self.params.get("qemu_bin", default=default_qemu_bin)
> +
> + # If qemu-img has been built, use it, otherwise the system wide one
> + # will be used. If none is available, the test will cancel.
> + qemu_img = os.path.join(BUILD_DIR, "qemu-img")
> + if not os.path.exists(qemu_img):
> + qemu_img = find_command("qemu-img", False)
> + if qemu_img is False:
> + self.cancel("Could not find \"qemu-img\", which is required to "
> + "create the bootable image")
> + self.QEMU_IMG_BINARY = qemu_img
Probably worth refactoring that as a re-usable pick_qemuimg_bin() (or a
better name)?
Similarly with BRIDGE_HELPER... We need a generic class to get binaries
from the environment or the build dir.
> + vmimage.QEMU_IMG = qemu_img
> +
> + self.RESOURCE_AGENT = os.path.join(SOURCE_DIR,
> + "scripts/colo-resource-agent/colo")
> + self.ADD_PATH = os.path.join(SOURCE_DIR, "scripts/colo-resource-agent")
> +
> + # Logs
> + self.RA_LOG = os.path.join(self.outputdir, "resource-agent.log")
> + self.HOSTA_LOGDIR = os.path.join(self.outputdir, "hosta")
> + self.HOSTB_LOGDIR = os.path.join(self.outputdir, "hostb")
> + os.makedirs(self.HOSTA_LOGDIR)
> + os.makedirs(self.HOSTB_LOGDIR)
> +
> + # Temporary directories
> + # We don't use self.workdir because of unix socket path length
> + # limitations
> + self.TMPDIR = tempfile.mkdtemp()
> + self.TMPA = os.path.join(self.TMPDIR, "hosta")
> + self.TMPB = os.path.join(self.TMPDIR, "hostb")
> + os.makedirs(self.TMPA)
> + os.makedirs(self.TMPB)
> +
> + # Network
> + self.BRIDGE_NAME = self.params.get("bridge_name")
> + if self.BRIDGE_NAME:
> + self.HOST_ADDRESS = self.params.get("host_address")
> + self.GUEST_ADDRESS = self.params.get("guest_address")
> + self.BRIDGE_HELPER = self.params.get("bridge_helper",
> + default="/usr/lib/qemu/qemu-bridge-helper")
> + self.SSH_PORT = 22
> + else:
> + # QEMU's hard coded usermode router address
> + self.HOST_ADDRESS = "10.0.2.2"
> + self.GUEST_ADDRESS = "127.0.0.1"
> + self.BRIDGE_HOSTA_PORT = network.find_free_port(address="127.0.0.1")
> + self.BRIDGE_HOSTB_PORT = network.find_free_port(address="127.0.0.1")
> + self.SSH_PORT = network.find_free_port(address="127.0.0.1")
> +
> + self.CLOUDINIT_HOME_PORT = network.find_free_port()
> +
> + # Find free port range
> + base_port = 1024
> + while True:
> + base_port = network.find_free_port(start_port=base_port,
> + address="127.0.0.1")
> + if base_port == None:
> + self.cancel("Failed to find a free port")
> + for n in range(base_port, base_port +4):
> + if n > 65535:
> + break
> + if not network.is_port_free(n, "127.0.0.1"):
> + break
> + else:
> + # for loop above didn't break
> + break
> +
> + self.BASE_PORT = base_port
> +
> + # Disk images
> + self.log.info("Downloading/preparing boot image")
> + self.HOSTA_IMAGE = self.get_image(self.TMPA).path
> + self.HOSTB_IMAGE = self.get_image(self.TMPB).path
> + self.CLOUDINIT_ISO = os.path.join(self.TMPDIR, "cloudinit.iso")
> +
> + self.log.info("Preparing cloudinit image")
> + try:
> + cloudinit.iso(self.CLOUDINIT_ISO, self.name,
> + username="test", password="password",
> + phone_home_host=self.HOST_ADDRESS,
> + phone_home_port=self.CLOUDINIT_HOME_PORT)
> + except Exception as e:
> + self.cancel("Failed to prepare cloudinit image")
> +
> + self.QEMU_OPTIONS += " -cdrom %s" % self.CLOUDINIT_ISO
> +
> + # Network bridge
> + if not self.BRIDGE_NAME:
> + self.BRIDGE_PIDFILE = os.path.join(self.TMPDIR, "bridge.pid")
> + self.run_command(("'%s' -pidfile '%s'"
> + " -M none -display none -daemonize"
> + " -netdev user,id=host,hostfwd=tcp:127.0.0.1:%s-:22"
> + " -netdev socket,id=hosta,listen=127.0.0.1:%s"
> + " -netdev socket,id=hostb,listen=127.0.0.1:%s"
> + " -netdev hubport,id=hostport,hubid=0,netdev=host"
> + " -netdev hubport,id=porta,hubid=0,netdev=hosta"
> + " -netdev hubport,id=portb,hubid=0,netdev=hostb")
> + % (self.QEMU_BINARY, self.BRIDGE_PIDFILE, self.SSH_PORT,
> + self.BRIDGE_HOSTA_PORT, self.BRIDGE_HOSTB_PORT), 0)
> +
> + def tearDown(self):
> + try:
> + pid = self.read_pidfile(self.BRIDGE_PIDFILE)
> + if pid and self.check_pid(pid):
> + os.kill(pid, signal.SIGKILL)
> + except Exception as e:
> + pass
> +
> + try:
> + self.ra_stop(self.HOSTA)
> + except Exception as e:
> + pass
> +
> + try:
> + self.ra_stop(self.HOSTB)
> + except Exception as e:
> + pass
> +
> + try:
> + self.ssh_close()
> + except Exception as e:
> + pass
> +
> + for proc in self.traffic_procs:
> + try:
> + os.killpg(proc.pid, signal.SIGTERM)
> + except Exception as e:
> + pass
> +
> + shutil.rmtree(self.TMPDIR)
> +
> + def run_command(self, cmdline, expected_status, env=None):
> + proc = subprocess.Popen(cmdline, shell=True, stdout=subprocess.PIPE,
> + stderr=subprocess.STDOUT,
> + universal_newlines=True, env=env)
> + stdout, stderr = proc.communicate()
> + if proc.returncode != expected_status:
> + self.fail("command \"%s\" failed with code %s:\n%s"
> + % (cmdline, proc.returncode, stdout))
> +
> + return proc.returncode
> +
> + def cat_line(self, path):
> + line=""
> + try:
> + fd = open(path, "r")
> + line = str.strip(fd.readline())
> + fd.close()
> + except:
> + pass
> + return line
> +
> + def read_pidfile(self, pidfile):
> + try:
> + pid = int(self.cat_line(pidfile))
> + except ValueError:
> + return None
> + else:
> + return pid
> +
> + def check_pid(self, pid):
> + try:
> + os.kill(pid, 0)
> + except OSError:
> + return False
> + else:
> + return True
> +
> + def ssh_open(self):
> + self.ssh_conn = ssh.Session(self.GUEST_ADDRESS, self.SSH_PORT,
> + user="test", password="password")
> + self.ssh_conn.DEFAULT_OPTIONS += (("UserKnownHostsFile", "/dev/null"),)
> + for i in range(10):
> + try:
> + if self.ssh_conn.connect():
> + return
> + except Exception as e:
> + pass
> + time.sleep(4)
> + self.fail("sshd timeout")
> +
> + def ssh_ping(self):
> + if self.ssh_conn.cmd("echo ping").stdout != b"ping\n":
> + self.fail("ssh ping failed")
> +
> + def ssh_close(self):
> + self.ssh_conn.quit()
> +
> + def setup_base_env(self, host):
> + PATH = os.getenv("PATH", "")
> + env = { "PATH": "%s:%s" % (self.ADD_PATH, PATH),
> + "HA_LOGFILE": self.RA_LOG,
> + "OCF_RESOURCE_INSTANCE": "colo-test",
> + "OCF_RESKEY_CRM_meta_clone_max": "2",
> + "OCF_RESKEY_CRM_meta_notify": "true",
> + "OCF_RESKEY_CRM_meta_timeout": "30000",
> + "OCF_RESKEY_qemu_binary": self.QEMU_BINARY,
> + "OCF_RESKEY_qemu_img_binary": self.QEMU_IMG_BINARY,
> + "OCF_RESKEY_disk_size": str(self.IMAGE_SIZE),
We can remove IMAGE_SIZE and use a runtime filesize(HOSTA_IMAGE) instead.
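For example, a rough untested sketch (the helper name is made up, and it
assumes the resource agent wants the guest-visible size rather than the
size of the image file on the host):

    import json
    import subprocess

    def image_virtual_size(qemu_img, path):
        """Return the guest-visible size of a disk image, in bytes."""
        out = subprocess.check_output(
            [qemu_img, "info", "--output=json", path],
            universal_newlines=True)
        return json.loads(out)["virtual-size"]

    # In setup_base_env(), instead of the hard-coded IMAGE_SIZE:
    #     "OCF_RESKEY_disk_size":
    #         "%db" % image_virtual_size(self.QEMU_IMG_BINARY,
    #                                    self.HOSTA_IMAGE),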
> + "OCF_RESKEY_checkpoint_interval": "10000",
> + "OCF_RESKEY_base_port": str(self.BASE_PORT),
> + "OCF_RESKEY_debug": "2"}
> +
> + if host == self.HOSTA:
> + env.update({"OCF_RESKEY_options":
> + ("%s -qmp unix:%s,server,nowait"
> + " -drive if=none,id=parent0,file='%s'")
> + % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
> + self.HOSTA_IMAGE),
> + "OCF_RESKEY_active_hidden_dir": self.TMPA,
> + "OCF_RESKEY_listen_address": "127.0.0.1",
> + "OCF_RESKEY_log_dir": self.HOSTA_LOGDIR,
> + "OCF_RESKEY_CRM_meta_on_node": "127.0.0.1",
> + "HA_RSCTMP": self.TMPA,
> + "COLO_TEST_REMOTE_TMP": self.TMPB})
> +
> + else:
> + env.update({"OCF_RESKEY_options":
> + ("%s -qmp unix:%s,server,nowait"
> + " -drive if=none,id=parent0,file='%s'")
> + % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
> + self.HOSTB_IMAGE),
> + "OCF_RESKEY_active_hidden_dir": self.TMPB,
> + "OCF_RESKEY_listen_address": "127.0.0.2",
> + "OCF_RESKEY_log_dir": self.HOSTB_LOGDIR,
> + "OCF_RESKEY_CRM_meta_on_node": "127.0.0.2",
> + "HA_RSCTMP": self.TMPB,
> + "COLO_TEST_REMOTE_TMP": self.TMPA})
> +
> + if self.BRIDGE_NAME:
> + env["OCF_RESKEY_options"] += \
> + " -netdev bridge,id=hn0,br=%s,helper='%s'" \
> + % (self.BRIDGE_NAME, self.BRIDGE_HELPER)
> + else:
> + if host == self.HOSTA:
> + env["OCF_RESKEY_options"] += \
> + " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
> + % self.BRIDGE_HOSTA_PORT
> + else:
> + env["OCF_RESKEY_options"] += \
> + " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
> + % self.BRIDGE_HOSTB_PORT
> + return env
> +
> + def ra_start(self, host):
> + env = self.setup_base_env(host)
> + self.run_command(self.RESOURCE_AGENT + " start", self.OCF_SUCCESS, env)
> +
> + def ra_stop(self, host):
> + env = self.setup_base_env(host)
> + self.run_command(self.RESOURCE_AGENT + " stop", self.OCF_SUCCESS, env)
> +
> + def ra_monitor(self, host, expected_status):
> + env = self.setup_base_env(host)
> + self.run_command(self.RESOURCE_AGENT + " monitor", expected_status, env)
> +
> + def ra_promote(self, host):
> + env = self.setup_base_env(host)
> + self.run_command(self.RESOURCE_AGENT + " promote", self.OCF_SUCCESS,env)
> +
> + def ra_notify_start(self, host):
> + env = self.setup_base_env(host)
> +
> + env.update({"OCF_RESKEY_CRM_meta_notify_type": "post",
> + "OCF_RESKEY_CRM_meta_notify_operation": "start"})
> +
> + if host == self.HOSTA:
> + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
> + "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.2"})
> + else:
> + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
> + "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.1"})
> +
> + self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
> +
> + def ra_notify_stop(self, host):
> + env = self.setup_base_env(host)
> +
> + env.update({"OCF_RESKEY_CRM_meta_notify_type": "pre",
> + "OCF_RESKEY_CRM_meta_notify_operation": "stop"})
> +
> + if host == self.HOSTA:
> + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
> + "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.2"})
> + else:
> + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
> + "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.1"})
> +
> + self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
> +
> + def get_pid(self, host):
> + if host == self.HOSTA:
> + return self.read_pidfile(os.path.join(self.TMPA,
> + "colo-test-qemu.pid"))
> + else:
> + return self.read_pidfile(os.path.join(self.TMPB,
> + "colo-test-qemu.pid"))
> +
> + def get_master_score(self, host):
> + if host == self.HOSTA:
> + return int(self.cat_line(os.path.join(self.TMPA, "master_score")))
> + else:
> + return int(self.cat_line(os.path.join(self.TMPB, "master_score")))
> +
> + def get_qmp_sock(self, host):
> + if host == self.HOSTA:
> + return os.path.join(self.TMPA, "my-qmp.sock")
> + else:
> + return os.path.join(self.TMPB, "my-qmp.sock")
> +
> +
> + def kill_qemu_pre(self, host):
> + pid = self.get_pid(host)
> +
> + qmp_sock = self.get_qmp_sock(host)
> +
> + if self.checkpoint_failover:
> + qmp_conn = QEMUMonitorProtocol(qmp_sock)
> + qmp_conn.settimeout(10)
> + qmp_conn.connect()
> + while True:
> + event = qmp_conn.pull_event(wait=True)
> + if event["event"] == "STOP":
> + break
> + qmp_conn.close()
> +
> +
> + if pid and self.check_pid(pid):
> + if self.hang_qemu:
> + os.kill(pid, signal.SIGSTOP)
> + else:
> + os.kill(pid, signal.SIGKILL)
> + while self.check_pid(pid):
> + time.sleep(1)
> +
> + def kill_qemu_post(self, host):
> + pid = self.get_pid(host)
> +
> + if self.hang_qemu and pid and self.check_pid(pid):
> + os.kill(pid, signal.SIGKILL)
> + while self.check_pid(pid):
> + time.sleep(1)
> +
> + def prepare_guest(self):
> + pass
> +
> + def cycle_start(self, cycle):
> + pass
> +
> + def active_section(self):
> + return False
> +
> + def cycle_end(self, cycle):
> + pass
> +
> + def check_connection(self):
> + self.ssh_ping()
> + for proc in self.traffic_procs:
> + if proc.poll() != None:
> + self.fail("Traffic process died")
> +
> + def _test_colo(self, loop=1):
> + loop = max(loop, 1)
> + self.log.info("Will put logs in %s" % self.outputdir)
> +
> + self.ra_stop(self.HOSTA)
> + self.ra_stop(self.HOSTB)
> +
> + self.log.info("*** Startup ***")
> + self.ra_start(self.HOSTA)
> + self.ra_start(self.HOSTB)
> +
> + self.ra_monitor(self.HOSTA, self.OCF_SUCCESS)
> + self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
> +
> + self.log.info("*** Promoting ***")
> + self.ra_promote(self.HOSTA)
> + cloudinit.wait_for_phone_home(("0.0.0.0", self.CLOUDINIT_HOME_PORT),
> + self.name)
> + self.ssh_open()
> + self.prepare_guest()
> +
> + self.ra_notify_start(self.HOSTA)
> +
> + while self.get_master_score(self.HOSTB) != 100:
> + self.ra_monitor(self.HOSTA, self.OCF_RUNNING_MASTER)
> + self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
> + time.sleep(1)
> +
> + self.log.info("*** Replication started ***")
> +
> + self.check_connection()
> +
> + primary = self.HOSTA
> + secondary = self.HOSTB
> +
> + for n in range(loop):
> + self.cycle_start(n)
> + self.log.info("*** Secondary failover ***")
> + self.kill_qemu_pre(primary)
> + self.ra_notify_stop(secondary)
> + self.ra_monitor(secondary, self.OCF_SUCCESS)
> + self.ra_promote(secondary)
> + self.ra_monitor(secondary, self.OCF_RUNNING_MASTER)
> + self.kill_qemu_post(primary)
> +
> + self.check_connection()
> +
> + tmp = primary
> + primary = secondary
> + secondary = tmp
> +
> + self.log.info("*** Secondary continue replication ***")
> + self.ra_start(secondary)
> + self.ra_notify_start(primary)
> +
> + self.check_connection()
> +
> + # Wait for resync
> + while self.get_master_score(secondary) != 100:
> + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> + self.ra_monitor(secondary, self.OCF_SUCCESS)
> + time.sleep(1)
> +
> + self.log.info("*** Replication started ***")
> +
> + self.check_connection()
> +
> + if self.active_section():
> + break
> +
> + self.log.info("*** Primary failover ***")
> + self.kill_qemu_pre(secondary)
> + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> + self.ra_notify_stop(primary)
> + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> + self.kill_qemu_post(secondary)
> +
> + self.check_connection()
> +
> + self.log.info("*** Primary continue replication ***")
> + self.ra_start(secondary)
> + self.ra_notify_start(primary)
> +
> + self.check_connection()
> +
> + # Wait for resync
> + while self.get_master_score(secondary) != 100:
> + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> + self.ra_monitor(secondary, self.OCF_SUCCESS)
> + time.sleep(1)
> +
> + self.log.info("*** Replication started ***")
> +
> + self.check_connection()
> +
> + self.cycle_end(n)
Interesting test :)
> +
> + self.ssh_close()
> +
> + self.ra_stop(self.HOSTA)
> + self.ra_stop(self.HOSTB)
> +
> + self.ra_monitor(self.HOSTA, self.OCF_NOT_RUNNING)
> + self.ra_monitor(self.HOSTB, self.OCF_NOT_RUNNING)
> + self.log.info("*** all ok ***")
> +
> +
> +class ColoQuickTest(ColoTest):
> + """
> + :avocado: tags=colo
> + :avocado: tags=quick
> + :avocado: tags=arch:x86_64
> + """
> +
> + timeout = 300
> +
> + def cycle_end(self, cycle):
> + self.log.info("Testing with peer qemu hanging"
> + " and failover during checkpoint")
> + self.hang_qemu = True
> +
> + def test_quick(self):
> + self.checkpoint_failover = True
> + self.log.info("Testing with peer qemu crashing"
> + " and failover during checkpoint")
> + self._test_colo(loop=2)
> +
> +
> +class ColoNetworkTest(ColoTest):
> +
> + def prepare_guest(self):
> + install_cmd = self.params.get("install_cmd", default=
> + "sudo -n dnf -q -y install iperf3 memtester")
> + self.ssh_conn.cmd(install_cmd)
> + # Use two instances to work around a bug in iperf3
> + self.ssh_conn.cmd("iperf3 -sD; iperf3 -sD -p 5202")
> +
> + def _cycle_start(self, cycle):
> + pass
> +
> + def cycle_start(self, cycle):
> + self._cycle_start(cycle)
> + tests = [("", "client-to-server tcp"), ("-R", "server-to-client tcp")]
> +
> + self.log.info("Testing iperf %s" % tests[cycle % 2][1])
> + iperf_cmd = "iperf3 %s -t 60 -i 60 --connect-timeout 30000 -c %s" \
> + % (tests[cycle % 2][0], self.GUEST_ADDRESS)
> + proc = subprocess.Popen("while %s && %s; do sleep 1; done >>'%s' 2>&1"
> + % (iperf_cmd, iperf_cmd + " -p 5202",
> + os.path.join(self.outputdir, "iperf.log")),
> + shell=True, preexec_fn=os.setsid, stdin=subprocess.DEVNULL,
> + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
So this runs on the host and requires the host to be Linux with iperf3
installed. Don't we need to be privileged to run it?
> + self.traffic_procs.append(proc)
> + time.sleep(5) # Wait for iperf to get up to speed
> +
> + def cycle_end(self, cycle):
> + for proc in self.traffic_procs:
> + try:
> + os.killpg(proc.pid, signal.SIGTERM)
> + proc.wait()
> + except Exception as e:
> + pass
> + self.traffic_procs.clear()
> + time.sleep(20)
> +
> +class ColoRealNetworkTest(ColoNetworkTest):
> + """
> + :avocado: tags=colo
> + :avocado: tags=slow
> + :avocado: tags=network_test
> + :avocado: tags=arch:x86_64
> + """
> +
> + timeout = 900
> +
> + def active_section(self):
> + time.sleep(300)
> + return False
> +
> + @skipUnless(iperf3_available(), "iperf3 not available")
> + def test_network(self):
> + if not self.BRIDGE_NAME:
> + self.cancel("bridge options not given, will skip network test")
> + self.log.info("Testing with peer qemu crashing and network load")
> + self._test_colo(loop=2)
> +
> +class ColoStressTest(ColoNetworkTest):
> + """
> + :avocado: tags=colo
> + :avocado: tags=slow
> + :avocado: tags=stress_test
> + :avocado: tags=arch:x86_64
> + """
> +
> + timeout = 1800
How long does this test take on your hardware (which CPU, for comparison)?
> +
> + def _cycle_start(self, cycle):
> + if cycle == 4:
> + self.log.info("Stresstest with peer qemu hanging, network load"
> + " and failover during checkpoint")
> + self.checkpoint_failover = True
> + self.hang_qemu = True
> + elif cycle == 8:
> + self.log.info("Stresstest with peer qemu crashing and network load")
> + self.checkpoint_failover = False
> + self.hang_qemu = False
> + elif cycle == 12:
> + self.log.info("Stresstest with peer qemu hanging and network load")
> + self.checkpoint_failover = False
> + self.hang_qemu = True
> +
> + @skipUnless(iperf3_available(), "iperf3 not available")
> + def test_stress(self):
> + if not self.BRIDGE_NAME:
> + self.cancel("bridge options not given, will skip network test")
> + self.log.info("Stresstest with peer qemu crashing, network load"
> + " and failover during checkpoint")
> + self.checkpoint_failover = True
> + self._test_colo(loop=16)
>
On Tue, 2 Jun 2020 14:19:08 +0200
Philippe Mathieu-Daudé <philmd@redhat.com> wrote:
> +Cleber/Wainer
>
> On 5/11/20 2:27 PM, Lukas Straub wrote:
> > Add high-level test relying on the colo resource-agent to test
> > all failover cases while checking guest network connectivity.
> >
> > Signed-off-by: Lukas Straub <lukasstraub2@web.de>
> > ---
> > scripts/colo-resource-agent/crm_master | 44 ++
> > scripts/colo-resource-agent/crm_resource | 12 +
> > tests/acceptance/colo.py | 689 +++++++++++++++++++++++
> > 3 files changed, 745 insertions(+)
> > create mode 100755 scripts/colo-resource-agent/crm_master
> > create mode 100755 scripts/colo-resource-agent/crm_resource
> > create mode 100644 tests/acceptance/colo.py
> >
> > diff --git a/scripts/colo-resource-agent/crm_master b/scripts/colo-resource-agent/crm_master
> > new file mode 100755
> > index 0000000000..886f523bda
> > --- /dev/null
> > +++ b/scripts/colo-resource-agent/crm_master
> > @@ -0,0 +1,44 @@
> > +#!/bin/bash
> > +
> > +# Fake crm_master for COLO testing
> > +#
> > +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > +#
> > +# This work is licensed under the terms of the GNU GPL, version 2 or
> > +# later. See the COPYING file in the top-level directory.
> > +
> > +TMPDIR="$HA_RSCTMP"
> > +score=0
> > +query=0
> > +
> > +OPTIND=1
> > +while getopts 'Qql:Dv:N:G' opt; do
> > + case "$opt" in
> > + Q|q)
> > + # Noop
> > + ;;
> > + "l")
> > + # Noop
> > + ;;
> > + "D")
> > + score=0
> > + ;;
> > + "v")
> > + score=$OPTARG
> > + ;;
> > + "N")
> > + TMPDIR="$COLO_TEST_REMOTE_TMP"
> > + ;;
> > + "G")
> > + query=1
> > + ;;
> > + esac
> > +done
> > +
> > +if (( query )); then
> > + cat "${TMPDIR}/master_score" || exit 1
> > +else
> > + echo $score > "${TMPDIR}/master_score" || exit 1
> > +fi
> > +
> > +exit 0
> > diff --git a/scripts/colo-resource-agent/crm_resource b/scripts/colo-resource-agent/crm_resource
> > new file mode 100755
> > index 0000000000..ad69ff3c6b
> > --- /dev/null
> > +++ b/scripts/colo-resource-agent/crm_resource
> > @@ -0,0 +1,12 @@
> > +#!/bin/sh
> > +
> > +# Fake crm_resource for COLO testing
> > +#
> > +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > +#
> > +# This work is licensed under the terms of the GNU GPL, version 2 or
> > +# later. See the COPYING file in the top-level directory.
> > +
> > +# Noop
> > +
> > +exit 0
> > diff --git a/tests/acceptance/colo.py b/tests/acceptance/colo.py
> > new file mode 100644
> > index 0000000000..465513fb6c
> > --- /dev/null
> > +++ b/tests/acceptance/colo.py
> > @@ -0,0 +1,689 @@
> > +# High-level test suite for qemu COLO testing all failover cases while checking
> > +# guest network connectivity
> > +#
> > +# Copyright (c) Lukas Straub <lukasstraub2@web.de>
> > +#
> > +# This work is licensed under the terms of the GNU GPL, version 2 or
> > +# later. See the COPYING file in the top-level directory.
> > +
> > +# HOWTO:
> > +#
> > +# This test has the following parameters:
> > +# bridge_name: name of the bridge interface to connect qemu to
> > +# host_address: ip address of the bridge interface
> > +# guest_address: ip address that the guest gets from the dhcp server
> > +# bridge_helper: path to the brige helper
> > +# (default: /usr/lib/qemu/qemu-bridge-helper)
> > +# install_cmd: command to run to install iperf3 and memtester in the guest
> > +# (default: "sudo -n dnf -q -y install iperf3 memtester")
> > +#
> > +# To run the network tests, you have to specify the parameters.
> > +#
> > +# Example for running the colo tests:
> > +# make check-acceptance FEDORA_31_ARCHES="x86_64" AVOCADO_TAGS="-t colo \
> > +# -p bridge_name=br0 -p host_address=192.168.220.1 \
> > +# -p guest_address=192.168.220.222"
> > +#
> > +# The colo tests currently only use x86_64 test vm images. With the
> > +# FEDORA_31_ARCHES make variable as in the example, only the x86_64 images will
> > +# be downloaded.
> > +#
> > +# If you're running the network tests as an unprivileged user, you need to set
> > +# the suid bit on the bridge helper (chmod +s <bridge-helper>).
> > +#
> > +# The dhcp server should assign a static ip to the guest, else the test may be
> > +# unreliable. The Mac address for the guest is always 52:54:00:12:34:56.
> > +
> > +
> > +import select
> > +import sys
> > +import subprocess
> > +import shutil
> > +import os
> > +import signal
> > +import os.path
> > +import time
> > +import tempfile
> > +
> > +from avocado import Test
> > +from avocado import skipUnless
> > +from avocado.utils import network
> > +from avocado.utils import vmimage
> > +from avocado.utils import cloudinit
> > +from avocado.utils import ssh
> > +from avocado.utils.path import find_command
> > +
> > +from avocado_qemu import pick_default_qemu_bin, BUILD_DIR, SOURCE_DIR
> > +from qemu.qmp import QEMUMonitorProtocol
> > +
> > +def iperf3_available():
> > + try:
> > + find_command("iperf3")
> > + except CmdNotFoundError:
> > + return False
> > + return True
> > +
> > +class ColoTest(Test):
> > +
> > + # Constants
> > + OCF_SUCCESS = 0
> > + OCF_ERR_GENERIC = 1
> > + OCF_ERR_ARGS = 2
> > + OCF_ERR_UNIMPLEMENTED = 3
> > + OCF_ERR_PERM = 4
> > + OCF_ERR_INSTALLED = 5
> > + OCF_ERR_CONFIGURED = 6
> > + OCF_NOT_RUNNING = 7
> > + OCF_RUNNING_MASTER = 8
> > + OCF_FAILED_MASTER = 9
> > +
> > + HOSTA = 10
> > + HOSTB = 11
> > +
> > + QEMU_OPTIONS = (" -display none -vga none -enable-kvm"
> > + " -smp 2 -cpu host -m 768"
> > + " -device e1000,mac=52:54:00:12:34:56,netdev=hn0"
> > + " -device virtio-blk,drive=colo-disk0")
> > +
> > + FEDORA_VERSION = "31"
> > + IMAGE_CHECKSUM = "e3c1b309d9203604922d6e255c2c5d098a309c2d46215d8fc026954f3c5c27a0"
> > + IMAGE_SIZE = "4294967296b"
> > +
> > + hang_qemu = False
> > + checkpoint_failover = False
> > + traffic_procs = []
> > +
> > + def get_image(self, temp_dir):
> > + try:
> > + return vmimage.get(
> > + "fedora", arch="x86_64", version=self.FEDORA_VERSION,
> > + checksum=self.IMAGE_CHECKSUM, algorithm="sha256",
> > + cache_dir=self.cache_dirs[0],
> > + snapshot_dir=temp_dir)
> > + except:
> > + self.cancel("Failed to download/prepare image")
> > +
> > + @skipUnless(ssh.SSH_CLIENT_BINARY, "No SSH client available")
> > + def setUp(self):
> > + # Qemu and qemu-img binary
> > + default_qemu_bin = pick_default_qemu_bin()
> > + self.QEMU_BINARY = self.params.get("qemu_bin", default=default_qemu_bin)
> > +
> > + # If qemu-img has been built, use it, otherwise the system wide one
> > + # will be used. If none is available, the test will cancel.
> > + qemu_img = os.path.join(BUILD_DIR, "qemu-img")
> > + if not os.path.exists(qemu_img):
> > + qemu_img = find_command("qemu-img", False)
> > + if qemu_img is False:
> > + self.cancel("Could not find \"qemu-img\", which is required to "
> > + "create the bootable image")
> > + self.QEMU_IMG_BINARY = qemu_img
>
> Probably worth refactoring that as re-usable pick_qemuimg_bin() or
> better named?
>
> Similarly with BRIDGE_HELPER... We need a generic class to get binaries
> from environment or build dir.
Agreed, I think a new function pick_qemu_util(), similar to this code in tests/acceptance/avocado_qemu/__init__.py, should suffice.
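Roughly something like this (untested sketch, living in
avocado_qemu/__init__.py next to the existing BUILD_DIR constant; only
the function name comes from the discussion above):

    import os

    from avocado.utils.path import find_command, CmdNotFoundError

    def pick_qemu_util(name):
        """
        Return the path of a QEMU utility such as "qemu-img".

        Prefer the freshly built binary in BUILD_DIR and fall back to
        the system-wide one; return None if neither is available so
        the caller can decide to cancel the test.
        """
        util = os.path.join(BUILD_DIR, name)
        if os.path.exists(util):
            return util
        try:
            return find_command(name)
        except CmdNotFoundError:
            return None

The lookup in setUp() would then shrink to:

    qemu_img = pick_qemu_util("qemu-img")
    if qemu_img is None:
        self.cancel("Could not find \"qemu-img\", which is required to "
                    "create the bootable image")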
> > + vmimage.QEMU_IMG = qemu_img
> > +
> > + self.RESOURCE_AGENT = os.path.join(SOURCE_DIR,
> > + "scripts/colo-resource-agent/colo")
> > + self.ADD_PATH = os.path.join(SOURCE_DIR, "scripts/colo-resource-agent")
> > +
> > + # Logs
> > + self.RA_LOG = os.path.join(self.outputdir, "resource-agent.log")
> > + self.HOSTA_LOGDIR = os.path.join(self.outputdir, "hosta")
> > + self.HOSTB_LOGDIR = os.path.join(self.outputdir, "hostb")
> > + os.makedirs(self.HOSTA_LOGDIR)
> > + os.makedirs(self.HOSTB_LOGDIR)
> > +
> > + # Temporary directories
> > + # We don't use self.workdir because of unix socket path length
> > + # limitations
> > + self.TMPDIR = tempfile.mkdtemp()
> > + self.TMPA = os.path.join(self.TMPDIR, "hosta")
> > + self.TMPB = os.path.join(self.TMPDIR, "hostb")
> > + os.makedirs(self.TMPA)
> > + os.makedirs(self.TMPB)
> > +
> > + # Network
> > + self.BRIDGE_NAME = self.params.get("bridge_name")
> > + if self.BRIDGE_NAME:
> > + self.HOST_ADDRESS = self.params.get("host_address")
> > + self.GUEST_ADDRESS = self.params.get("guest_address")
> > + self.BRIDGE_HELPER = self.params.get("bridge_helper",
> > + default="/usr/lib/qemu/qemu-bridge-helper")
> > + self.SSH_PORT = 22
> > + else:
> > + # QEMU's hard coded usermode router address
> > + self.HOST_ADDRESS = "10.0.2.2"
> > + self.GUEST_ADDRESS = "127.0.0.1"
> > + self.BRIDGE_HOSTA_PORT = network.find_free_port(address="127.0.0.1")
> > + self.BRIDGE_HOSTB_PORT = network.find_free_port(address="127.0.0.1")
> > + self.SSH_PORT = network.find_free_port(address="127.0.0.1")
> > +
> > + self.CLOUDINIT_HOME_PORT = network.find_free_port()
> > +
> > + # Find free port range
> > + base_port = 1024
> > + while True:
> > + base_port = network.find_free_port(start_port=base_port,
> > + address="127.0.0.1")
> > + if base_port == None:
> > + self.cancel("Failed to find a free port")
> > + for n in range(base_port, base_port +4):
> > + if n > 65535:
> > + break
> > + if not network.is_port_free(n, "127.0.0.1"):
> > + break
> > + else:
> > + # for loop above didn't break
> > + break
> > +
> > + self.BASE_PORT = base_port
> > +
> > + # Disk images
> > + self.log.info("Downloading/preparing boot image")
> > + self.HOSTA_IMAGE = self.get_image(self.TMPA).path
> > + self.HOSTB_IMAGE = self.get_image(self.TMPB).path
> > + self.CLOUDINIT_ISO = os.path.join(self.TMPDIR, "cloudinit.iso")
> > +
> > + self.log.info("Preparing cloudinit image")
> > + try:
> > + cloudinit.iso(self.CLOUDINIT_ISO, self.name,
> > + username="test", password="password",
> > + phone_home_host=self.HOST_ADDRESS,
> > + phone_home_port=self.CLOUDINIT_HOME_PORT)
> > + except Exception as e:
> > + self.cancel("Failed to prepare cloudinit image")
> > +
> > + self.QEMU_OPTIONS += " -cdrom %s" % self.CLOUDINIT_ISO
> > +
> > + # Network bridge
> > + if not self.BRIDGE_NAME:
> > + self.BRIDGE_PIDFILE = os.path.join(self.TMPDIR, "bridge.pid")
> > + self.run_command(("'%s' -pidfile '%s'"
> > + " -M none -display none -daemonize"
> > + " -netdev user,id=host,hostfwd=tcp:127.0.0.1:%s-:22"
> > + " -netdev socket,id=hosta,listen=127.0.0.1:%s"
> > + " -netdev socket,id=hostb,listen=127.0.0.1:%s"
> > + " -netdev hubport,id=hostport,hubid=0,netdev=host"
> > + " -netdev hubport,id=porta,hubid=0,netdev=hosta"
> > + " -netdev hubport,id=portb,hubid=0,netdev=hostb")
> > + % (self.QEMU_BINARY, self.BRIDGE_PIDFILE, self.SSH_PORT,
> > + self.BRIDGE_HOSTA_PORT, self.BRIDGE_HOSTB_PORT), 0)
> > +
> > + def tearDown(self):
> > + try:
> > + pid = self.read_pidfile(self.BRIDGE_PIDFILE)
> > + if pid and self.check_pid(pid):
> > + os.kill(pid, signal.SIGKILL)
> > + except Exception as e:
> > + pass
> > +
> > + try:
> > + self.ra_stop(self.HOSTA)
> > + except Exception as e:
> > + pass
> > +
> > + try:
> > + self.ra_stop(self.HOSTB)
> > + except Exception as e:
> > + pass
> > +
> > + try:
> > + self.ssh_close()
> > + except Exception as e:
> > + pass
> > +
> > + for proc in self.traffic_procs:
> > + try:
> > + os.killpg(proc.pid, signal.SIGTERM)
> > + except Exception as e:
> > + pass
> > +
> > + shutil.rmtree(self.TMPDIR)
> > +
> > + def run_command(self, cmdline, expected_status, env=None):
> > + proc = subprocess.Popen(cmdline, shell=True, stdout=subprocess.PIPE,
> > + stderr=subprocess.STDOUT,
> > + universal_newlines=True, env=env)
> > + stdout, stderr = proc.communicate()
> > + if proc.returncode != expected_status:
> > + self.fail("command \"%s\" failed with code %s:\n%s"
> > + % (cmdline, proc.returncode, stdout))
> > +
> > + return proc.returncode
> > +
> > + def cat_line(self, path):
> > + line = ""
> > + try:
> > + with open(path, "r") as fd:
> > + line = fd.readline().strip()
> > + except OSError:
> > + pass
> > + return line
> > +
> > + def read_pidfile(self, pidfile):
> > + try:
> > + pid = int(self.cat_line(pidfile))
> > + except ValueError:
> > + return None
> > + else:
> > + return pid
> > +
> > + def check_pid(self, pid):
> > + try:
> > + os.kill(pid, 0)
> > + except OSError:
> > + return False
> > + else:
> > + return True
> > +
> > + def ssh_open(self):
> > + self.ssh_conn = ssh.Session(self.GUEST_ADDRESS, self.SSH_PORT,
> > + user="test", password="password")
> > + self.ssh_conn.DEFAULT_OPTIONS += (("UserKnownHostsFile", "/dev/null"),)
> > + for i in range(10):
> > + try:
> > + if self.ssh_conn.connect():
> > + return
> > + except Exception as e:
> > + pass
> > + time.sleep(4)
> > + self.fail("sshd timeout")
> > +
> > + def ssh_ping(self):
> > + if self.ssh_conn.cmd("echo ping").stdout != b"ping\n":
> > + self.fail("ssh ping failed")
> > +
> > + def ssh_close(self):
> > + self.ssh_conn.quit()
> > +
> > + def setup_base_env(self, host):
> > + PATH = os.getenv("PATH", "")
> > + env = { "PATH": "%s:%s" % (self.ADD_PATH, PATH),
> > + "HA_LOGFILE": self.RA_LOG,
> > + "OCF_RESOURCE_INSTANCE": "colo-test",
> > + "OCF_RESKEY_CRM_meta_clone_max": "2",
> > + "OCF_RESKEY_CRM_meta_notify": "true",
> > + "OCF_RESKEY_CRM_meta_timeout": "30000",
> > + "OCF_RESKEY_qemu_binary": self.QEMU_BINARY,
> > + "OCF_RESKEY_qemu_img_binary": self.QEMU_IMG_BINARY,
> > + "OCF_RESKEY_disk_size": str(self.IMAGE_SIZE),
>
> We can remove IMAGE_SIZE and use a runtime filesize(HOSTA_IMAGE) instead.
I will change the resource-agent so it doesn't need OCF_RESKEY_disk_size.
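If a size is still needed somewhere, a runtime lookup on the test side could
look roughly like this (sketch only; it assumes qemu-img is available and uses
an illustrative helper name, query_image_size):

    import json
    import subprocess

    def query_image_size(qemu_img_binary, image_path):
        # Ask qemu-img for the virtual size of the image in bytes.
        out = subprocess.check_output(
            [qemu_img_binary, "info", "--output=json", image_path],
            universal_newlines=True)
        return json.loads(out)["virtual-size"]

Plain os.path.getsize() would also do for raw images, but qemu-img info
handles qcow2 as well.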
> > + "OCF_RESKEY_checkpoint_interval": "10000",
> > + "OCF_RESKEY_base_port": str(self.BASE_PORT),
> > + "OCF_RESKEY_debug": "2"}
> > +
> > + if host == self.HOSTA:
> > + env.update({"OCF_RESKEY_options":
> > + ("%s -qmp unix:%s,server,nowait"
> > + " -drive if=none,id=parent0,file='%s'")
> > + % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
> > + self.HOSTA_IMAGE),
> > + "OCF_RESKEY_active_hidden_dir": self.TMPA,
> > + "OCF_RESKEY_listen_address": "127.0.0.1",
> > + "OCF_RESKEY_log_dir": self.HOSTA_LOGDIR,
> > + "OCF_RESKEY_CRM_meta_on_node": "127.0.0.1",
> > + "HA_RSCTMP": self.TMPA,
> > + "COLO_TEST_REMOTE_TMP": self.TMPB})
> > +
> > + else:
> > + env.update({"OCF_RESKEY_options":
> > + ("%s -qmp unix:%s,server,nowait"
> > + " -drive if=none,id=parent0,file='%s'")
> > + % (self.QEMU_OPTIONS, self.get_qmp_sock(host),
> > + self.HOSTB_IMAGE),
> > + "OCF_RESKEY_active_hidden_dir": self.TMPB,
> > + "OCF_RESKEY_listen_address": "127.0.0.2",
> > + "OCF_RESKEY_log_dir": self.HOSTB_LOGDIR,
> > + "OCF_RESKEY_CRM_meta_on_node": "127.0.0.2",
> > + "HA_RSCTMP": self.TMPB,
> > + "COLO_TEST_REMOTE_TMP": self.TMPA})
> > +
> > + if self.BRIDGE_NAME:
> > + env["OCF_RESKEY_options"] += \
> > + " -netdev bridge,id=hn0,br=%s,helper='%s'" \
> > + % (self.BRIDGE_NAME, self.BRIDGE_HELPER)
> > + else:
> > + if host == self.HOSTA:
> > + env["OCF_RESKEY_options"] += \
> > + " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
> > + % self.BRIDGE_HOSTA_PORT
> > + else:
> > + env["OCF_RESKEY_options"] += \
> > + " -netdev socket,id=hn0,connect=127.0.0.1:%s" \
> > + % self.BRIDGE_HOSTB_PORT
> > + return env
> > +
> > + def ra_start(self, host):
> > + env = self.setup_base_env(host)
> > + self.run_command(self.RESOURCE_AGENT + " start", self.OCF_SUCCESS, env)
> > +
> > + def ra_stop(self, host):
> > + env = self.setup_base_env(host)
> > + self.run_command(self.RESOURCE_AGENT + " stop", self.OCF_SUCCESS, env)
> > +
> > + def ra_monitor(self, host, expected_status):
> > + env = self.setup_base_env(host)
> > + self.run_command(self.RESOURCE_AGENT + " monitor", expected_status, env)
> > +
> > + def ra_promote(self, host):
> > + env = self.setup_base_env(host)
> > + self.run_command(self.RESOURCE_AGENT + " promote", self.OCF_SUCCESS, env)
> > +
> > + def ra_notify_start(self, host):
> > + env = self.setup_base_env(host)
> > +
> > + env.update({"OCF_RESKEY_CRM_meta_notify_type": "post",
> > + "OCF_RESKEY_CRM_meta_notify_operation": "start"})
> > +
> > + if host == self.HOSTA:
> > + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
> > + "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.2"})
> > + else:
> > + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
> > + "OCF_RESKEY_CRM_meta_notify_start_uname": "127.0.0.1"})
> > +
> > + self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
> > +
> > + def ra_notify_stop(self, host):
> > + env = self.setup_base_env(host)
> > +
> > + env.update({"OCF_RESKEY_CRM_meta_notify_type": "pre",
> > + "OCF_RESKEY_CRM_meta_notify_operation": "stop"})
> > +
> > + if host == self.HOSTA:
> > + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.1",
> > + "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.2"})
> > + else:
> > + env.update({"OCF_RESKEY_CRM_meta_notify_master_uname": "127.0.0.2",
> > + "OCF_RESKEY_CRM_meta_notify_stop_uname": "127.0.0.1"})
> > +
> > + self.run_command(self.RESOURCE_AGENT + " notify", self.OCF_SUCCESS, env)
> > +
> > + def get_pid(self, host):
> > + if host == self.HOSTA:
> > + return self.read_pidfile(os.path.join(self.TMPA,
> > + "colo-test-qemu.pid"))
> > + else:
> > + return self.read_pidfile(os.path.join(self.TMPB,
> > + "colo-test-qemu.pid"))
> > +
> > + def get_master_score(self, host):
> > + if host == self.HOSTA:
> > + return int(self.cat_line(os.path.join(self.TMPA, "master_score")))
> > + else:
> > + return int(self.cat_line(os.path.join(self.TMPB, "master_score")))
> > +
> > + def get_qmp_sock(self, host):
> > + if host == self.HOSTA:
> > + return os.path.join(self.TMPA, "my-qmp.sock")
> > + else:
> > + return os.path.join(self.TMPB, "my-qmp.sock")
> > +
> > +
> > + def kill_qemu_pre(self, host):
> > + pid = self.get_pid(host)
> > +
> > + qmp_sock = self.get_qmp_sock(host)
> > +
> > + if self.checkpoint_failover:
> > + qmp_conn = QEMUMonitorProtocol(qmp_sock)
> > + qmp_conn.settimeout(10)
> > + qmp_conn.connect()
> > + while True:
> > + event = qmp_conn.pull_event(wait=True)
> > + if event["event"] == "STOP":
> > + break
> > + qmp_conn.close()
> > +
> > +
> > + if pid and self.check_pid(pid):
> > + if self.hang_qemu:
> > + os.kill(pid, signal.SIGSTOP)
> > + else:
> > + os.kill(pid, signal.SIGKILL)
> > + while self.check_pid(pid):
> > + time.sleep(1)
> > +
> > + def kill_qemu_post(self, host):
> > + pid = self.get_pid(host)
> > +
> > + if self.hang_qemu and pid and self.check_pid(pid):
> > + os.kill(pid, signal.SIGKILL)
> > + while self.check_pid(pid):
> > + time.sleep(1)
> > +
> > + def prepare_guest(self):
> > + pass
> > +
> > + def cycle_start(self, cycle):
> > + pass
> > +
> > + def active_section(self):
> > + return False
> > +
> > + def cycle_end(self, cycle):
> > + pass
> > +
> > + def check_connection(self):
> > + self.ssh_ping()
> > + for proc in self.traffic_procs:
> > + if proc.poll() is not None:
> > + self.fail("Traffic process died")
> > +
> > + def _test_colo(self, loop=1):
> > + loop = max(loop, 1)
> > + self.log.info("Will put logs in %s" % self.outputdir)
> > +
> > + self.ra_stop(self.HOSTA)
> > + self.ra_stop(self.HOSTB)
> > +
> > + self.log.info("*** Startup ***")
> > + self.ra_start(self.HOSTA)
> > + self.ra_start(self.HOSTB)
> > +
> > + self.ra_monitor(self.HOSTA, self.OCF_SUCCESS)
> > + self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
> > +
> > + self.log.info("*** Promoting ***")
> > + self.ra_promote(self.HOSTA)
> > + cloudinit.wait_for_phone_home(("0.0.0.0", self.CLOUDINIT_HOME_PORT),
> > + self.name)
> > + self.ssh_open()
> > + self.prepare_guest()
> > +
> > + self.ra_notify_start(self.HOSTA)
> > +
> > + while self.get_master_score(self.HOSTB) != 100:
> > + self.ra_monitor(self.HOSTA, self.OCF_RUNNING_MASTER)
> > + self.ra_monitor(self.HOSTB, self.OCF_SUCCESS)
> > + time.sleep(1)
> > +
> > + self.log.info("*** Replication started ***")
> > +
> > + self.check_connection()
> > +
> > + primary = self.HOSTA
> > + secondary = self.HOSTB
> > +
> > + for n in range(loop):
> > + self.cycle_start(n)
> > + self.log.info("*** Secondary failover ***")
> > + self.kill_qemu_pre(primary)
> > + self.ra_notify_stop(secondary)
> > + self.ra_monitor(secondary, self.OCF_SUCCESS)
> > + self.ra_promote(secondary)
> > + self.ra_monitor(secondary, self.OCF_RUNNING_MASTER)
> > + self.kill_qemu_post(primary)
> > +
> > + self.check_connection()
> > +
> > + primary, secondary = secondary, primary
> > +
> > + self.log.info("*** Secondary continue replication ***")
> > + self.ra_start(secondary)
> > + self.ra_notify_start(primary)
> > +
> > + self.check_connection()
> > +
> > + # Wait for resync
> > + while self.get_master_score(secondary) != 100:
> > + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> > + self.ra_monitor(secondary, self.OCF_SUCCESS)
> > + time.sleep(1)
> > +
> > + self.log.info("*** Replication started ***")
> > +
> > + self.check_connection()
> > +
> > + if self.active_section():
> > + break
> > +
> > + self.log.info("*** Primary failover ***")
> > + self.kill_qemu_pre(secondary)
> > + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> > + self.ra_notify_stop(primary)
> > + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> > + self.kill_qemu_post(secondary)
> > +
> > + self.check_connection()
> > +
> > + self.log.info("*** Primary continue replication ***")
> > + self.ra_start(secondary)
> > + self.ra_notify_start(primary)
> > +
> > + self.check_connection()
> > +
> > + # Wait for resync
> > + while self.get_master_score(secondary) != 100:
> > + self.ra_monitor(primary, self.OCF_RUNNING_MASTER)
> > + self.ra_monitor(secondary, self.OCF_SUCCESS)
> > + time.sleep(1)
> > +
> > + self.log.info("*** Replication started ***")
> > +
> > + self.check_connection()
> > +
> > + self.cycle_end(n)
>
> Interesting test :)
>
> > +
> > + self.ssh_close()
> > +
> > + self.ra_stop(self.HOSTA)
> > + self.ra_stop(self.HOSTB)
> > +
> > + self.ra_monitor(self.HOSTA, self.OCF_NOT_RUNNING)
> > + self.ra_monitor(self.HOSTB, self.OCF_NOT_RUNNING)
> > + self.log.info("*** all ok ***")
> > +
> > +
> > +class ColoQuickTest(ColoTest):
> > + """
> > + :avocado: tags=colo
> > + :avocado: tags=quick
> > + :avocado: tags=arch:x86_64
> > + """
> > +
> > + timeout = 300
> > +
> > + def cycle_end(self, cycle):
> > + self.log.info("Testing with peer qemu hanging"
> > + " and failover during checkpoint")
> > + self.hang_qemu = True
> > +
> > + def test_quick(self):
> > + self.checkpoint_failover = True
> > + self.log.info("Testing with peer qemu crashing"
> > + " and failover during checkpoint")
> > + self._test_colo(loop=2)
> > +
> > +
> > +class ColoNetworkTest(ColoTest):
> > +
> > + def prepare_guest(self):
> > + install_cmd = self.params.get("install_cmd",
> > + default="sudo -n dnf -q -y install iperf3 memtester")
> > + self.ssh_conn.cmd(install_cmd)
> > + # Use two instances to work around a bug in iperf3
> > + self.ssh_conn.cmd("iperf3 -sD; iperf3 -sD -p 5202")
> > +
> > + def _cycle_start(self, cycle):
> > + pass
> > +
> > + def cycle_start(self, cycle):
> > + self._cycle_start(cycle)
> > + tests = [("", "client-to-server tcp"), ("-R", "server-to-client tcp")]
> > +
> > + self.log.info("Testing iperf %s" % tests[cycle % 2][1])
> > + iperf_cmd = "iperf3 %s -t 60 -i 60 --connect-timeout 30000 -c %s" \
> > + % (tests[cycle % 2][0], self.GUEST_ADDRESS)
> > + proc = subprocess.Popen("while %s && %s; do sleep 1; done >>'%s' 2>&1"
> > + % (iperf_cmd, iperf_cmd + " -p 5202",
> > + os.path.join(self.outputdir, "iperf.log")),
> > + shell=True, preexec_fn=os.setsid, stdin=subprocess.DEVNULL,
> > + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
>
> So this runs on the host and requires the host to be Linux with iperf3
> installed. Don't we need to be privileged to run it?
Not for iperf3, but the bridge helper needs the setuid bit to be set if the test runs unprivileged.
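If it helps, the test could check for that up front and cancel early. A rough
sketch (the default helper path and the function name are just illustrative):

    import os
    import stat

    def bridge_helper_usable(helper="/usr/lib/qemu/qemu-bridge-helper"):
        # Root can always use the helper; otherwise it needs the setuid
        # bit so it can open the tap device on our behalf.
        if os.geteuid() == 0:
            return True
        try:
            mode = os.stat(helper).st_mode
        except OSError:
            return False
        return bool(mode & stat.S_ISUID)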
> > + self.traffic_procs.append(proc)
> > + time.sleep(5) # Wait for iperf to get up to speed
> > +
> > + def cycle_end(self, cycle):
> > + for proc in self.traffic_procs:
> > + try:
> > + os.killpg(proc.pid, signal.SIGTERM)
> > + proc.wait()
> > + except Exception as e:
> > + pass
> > + self.traffic_procs.clear()
> > + time.sleep(20)
> > +
> > +class ColoRealNetworkTest(ColoNetworkTest):
> > + """
> > + :avocado: tags=colo
> > + :avocado: tags=slow
> > + :avocado: tags=network_test
> > + :avocado: tags=arch:x86_64
> > + """
> > +
> > + timeout = 900
> > +
> > + def active_section(self):
> > + time.sleep(300)
> > + return False
> > +
> > + @skipUnless(iperf3_available(), "iperf3 not available")
> > + def test_network(self):
> > + if not self.BRIDGE_NAME:
> > + self.cancel("bridge options not given, will skip network test")
> > + self.log.info("Testing with peer qemu crashing and network load")
> > + self._test_colo(loop=2)
> > +
> > +class ColoStressTest(ColoNetworkTest):
> > + """
> > + :avocado: tags=colo
> > + :avocado: tags=slow
> > + :avocado: tags=stress_test
> > + :avocado: tags=arch:x86_64
> > + """
> > +
> > + timeout = 1800
>
> How long does this test take on your hardware (which CPU, for comparison)?
The CPU is an Intel i7-5600U with an M.2 SATA SSD (during resync the whole
image is copied). Here are the runtimes:
Quick test: ~200s
Network test: ~800s
Stress test: ~1300s
> > +
> > + def _cycle_start(self, cycle):
> > + if cycle == 4:
> > + self.log.info("Stresstest with peer qemu hanging, network load"
> > + " and failover during checkpoint")
> > + self.checkpoint_failover = True
> > + self.hang_qemu = True
> > + elif cycle == 8:
> > + self.log.info("Stresstest with peer qemu crashing and network load")
> > + self.checkpoint_failover = False
> > + self.hang_qemu = False
> > + elif cycle == 12:
> > + self.log.info("Stresstest with peer qemu hanging and network load")
> > + self.checkpoint_failover = False
> > + self.hang_qemu = True
> > +
> > + @skipUnless(iperf3_available(), "iperf3 not available")
> > + def test_stress(self):
> > + if not self.BRIDGE_NAME:
> > + self.cancel("bridge options not given, will skip network test")
> > + self.log.info("Stresstest with peer qemu crashing, network load"
> > + " and failover during checkpoint")
> > + self.checkpoint_failover = True
> > + self._test_colo(loop=16)
> >
>