[PATCH 13/20] drbd: rewrite state machine for DRBD 9 multi-peer clusters

Christoph Böhmwalder posted 20 patches 5 days, 15 hours ago
[PATCH 13/20] drbd: rewrite state machine for DRBD 9 multi-peer clusters
Posted by Christoph Böhmwalder 5 days, 15 hours ago
Replace the monolithic DRBD 8.4 state machine with an architecture
suited for clusters with more than one peer.
The central concept is a transactional model: state is held in
per-object arrays indexed by [NOW] and [NEW], and every change is
bracketed by begin/end calls that validate the proposed transition
resource-wide before atomically committing it or rolling it back.

Replace the single spinlock that serialized everything with
finer-grained locking: a read-write lock for state access, separate
locks for peer requests and interval trees.

Cluster-wide state changes (role changes, connect/disconnect, resize)
use a two-phase commit protocol.
The initiating node sends a prepare message to all reachable peers,
collects replies with timeout and exponential backoff, then commits
or aborts.
Not-fully-connected topologies are handled by forwarding nested 2PC
rounds through intermediate nodes.

Add a quorum mechanism with tiebreaker support for even-sized clusters.
This can suspend or fail I/O when the cluster loses more than half of
its peers.

Unify post-state-change processing into a single resource-wide work
item that handles UUID propagation, resync startup, I/O suspension,
metadata persistence, and netlink notifications for all objects in one
pass, replacing the separate per-device and per-connection callbacks
from 8.4.

Co-developed-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Co-developed-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Co-developed-by: Joel Colledge <joel.colledge@linbit.com>
Signed-off-by: Joel Colledge <joel.colledge@linbit.com>
Co-developed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
---
 drivers/block/drbd/drbd_state.c | 7724 +++++++++++++++++++++++--------
 include/linux/drbd_genl.h       |    2 +
 include/linux/drbd_limits.h     |    7 +
 3 files changed, 5898 insertions(+), 1835 deletions(-)

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index adcba7f1d8ea..ab1ff6f85fb2 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -13,199 +13,414 @@
 
  */
 
-#include <linux/drbd_limits.h>
+#include <linux/random.h>
+#include <linux/jiffies.h>
 #include "drbd_int.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
 #include "drbd_state_change.h"
 
-struct after_state_chg_work {
+
+struct after_state_change_work {
 	struct drbd_work w;
-	struct drbd_device *device;
-	union drbd_state os;
-	union drbd_state ns;
-	enum chg_state_flags flags;
-	struct completion *done;
 	struct drbd_state_change *state_change;
+	struct completion *done;
+};
+
+struct quorum_info {
+	int up_to_date;
+	int present;
+	int voters;
+	int quorum_at;
+	int min_redundancy_at;
+};
+
+struct quorum_detail {
+	int up_to_date;
+	int present;
+	int outdated;
+	int diskless;
+	int missing_diskless;
+	int quorumless;
+	int unknown;
+	int quorate_peers;
+};
+
+struct change_context {
+	struct drbd_resource *resource;
+	int vnr;
+	union drbd_state mask;
+	union drbd_state val;
+	int target_node_id;
+	enum chg_state_flags flags;
+	bool change_local_state_last;
+	const char **err_str;
+};
+
+enum change_phase {
+	PH_LOCAL_COMMIT,
+	PH_PREPARE,
+	PH_84_COMMIT,
+	PH_COMMIT,
 };
 
-enum sanitize_state_warnings {
-	NO_WARNING,
-	ABORTED_ONLINE_VERIFY,
-	ABORTED_RESYNC,
-	CONNECTION_LOST_NEGOTIATING,
-	IMPLICITLY_UPGRADED_DISK,
-	IMPLICITLY_UPGRADED_PDSK,
+struct change_disk_state_context {
+	struct change_context context;
+	struct drbd_device *device;
 };
 
+static bool lost_contact_to_peer_data(enum drbd_disk_state *peer_disk_state);
+static bool peer_returns_diskless(struct drbd_peer_device *peer_device,
+				  enum drbd_disk_state os, enum drbd_disk_state ns);
+static void print_state_change(struct drbd_resource *resource, const char *prefix, const char *tag);
+static void finish_state_change(struct drbd_resource *, const char *tag);
+static int w_after_state_change(struct drbd_work *w, int unused);
+static enum drbd_state_rv is_valid_soft_transition(struct drbd_resource *);
+static enum drbd_state_rv is_valid_transition(struct drbd_resource *resource);
+static void sanitize_state(struct drbd_resource *resource);
+static void ensure_exposed_data_uuid(struct drbd_device *device);
+static enum drbd_state_rv change_peer_state(struct drbd_connection *, int, union drbd_state,
+					    union drbd_state, unsigned long *);
+static void check_wrongly_set_mdf_exists(struct drbd_device *);
+static void update_members(struct drbd_resource *resource);
+static bool calc_data_accessible(struct drbd_state_change *state_change, int n_device,
+				 enum which_state which);
+
+/* We need to stay consistent if we are neighbor of a diskless primary with
+   different UUID. This function should be used if the device was D_UP_TO_DATE
+   before.
+ */
+static bool may_return_to_up_to_date(struct drbd_device *device, enum which_state which)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = true;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		if (peer_device->disk_state[which] == D_DISKLESS &&
+		    peer_device->connection->peer_role[which] == R_PRIMARY &&
+		    peer_device->current_uuid != drbd_current_uuid(device)) {
+			rv = false;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/**
+ * may_be_up_to_date()  -  check if transition from D_CONSISTENT to D_UP_TO_DATE is allowed
+ * @device: DRBD device.
+ * @which: OLD or NEW
+ *
+ * When fencing is enabled, it may only transition from D_CONSISTENT to D_UP_TO_DATE
+ * when either all peers are connected, or outdated.
+ */
+static bool may_be_up_to_date(struct drbd_device *device, enum which_state which)
+{
+	bool all_peers_outdated = true;
+	int node_id;
+
+	if (!may_return_to_up_to_date(device, which))
+		return false;
+
+	rcu_read_lock();
+	for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) {
+		struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id];
+		struct drbd_peer_device *peer_device;
+		enum drbd_disk_state peer_disk_state;
+		bool want_bitmap = true;
+
+		if (node_id == device->ldev->md.node_id)
+			continue;
+
+		if (!(peer_md->flags & MDF_HAVE_BITMAP) && !(peer_md->flags & MDF_NODE_EXISTS))
+			continue;
+
+		if (!(peer_md->flags & MDF_PEER_FENCING))
+			continue;
+		peer_device = peer_device_by_node_id(device, node_id);
+		if (peer_device) {
+			struct peer_device_conf *pdc = rcu_dereference(peer_device->conf);
+			want_bitmap = pdc->bitmap;
+			peer_disk_state = peer_device->disk_state[NEW];
+		} else {
+			peer_disk_state = D_UNKNOWN;
+		}
+
+		switch (peer_disk_state) {
+		case D_DISKLESS:
+			if (!(peer_md->flags & MDF_PEER_DEVICE_SEEN))
+				continue;
+			fallthrough;
+		case D_ATTACHING:
+		case D_DETACHING:
+		case D_FAILED:
+		case D_NEGOTIATING:
+		case D_UNKNOWN:
+			if (!want_bitmap)
+				continue;
+			if ((peer_md->flags & MDF_PEER_OUTDATED))
+				continue;
+			break;
+		case D_INCONSISTENT:
+		case D_OUTDATED:
+			continue;
+		case D_CONSISTENT:
+		case D_UP_TO_DATE:
+			/* These states imply that there is a connection. If there is
+			   a connection we do not need to insist that the peer was
+			   outdated. */
+			continue;
+		case D_MASK:
+			break;
+		}
+
+		all_peers_outdated = false;
+	}
+	rcu_read_unlock();
+	return all_peers_outdated;
+}
+
+static bool stable_up_to_date_neighbor(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = false;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		if (peer_device->disk_state[NEW] == D_UP_TO_DATE &&
+		    peer_device->uuid_flags & UUID_FLAG_STABLE && /* primary is also stable */
+		    peer_device->current_uuid == drbd_current_uuid(device)) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/**
+ * disk_state_from_md()  -  determine initial disk state
+ * @device: DRBD device.
+ *
+ * When a disk is attached to a device, we set the disk state to D_NEGOTIATING.
+ * We then wait for all connected peers to send the peer disk state.  Once that
+ * has happened, we can determine the actual disk state based on the peer disk
+ * states and the state of the disk itself.
+ *
+ * The initial disk state becomes D_UP_TO_DATE without fencing or when we know
+ * that all peers have been outdated, and D_CONSISTENT otherwise.
+ *
+ * The caller either needs to have a get_ldev() reference, or need to call
+ * this function only if disk_state[NOW] >= D_NEGOTIATING and holding the
+ * state_rwlock.
+ */
+enum drbd_disk_state disk_state_from_md(struct drbd_device *device)
+{
+	enum drbd_disk_state disk_state;
+
+	if (!drbd_md_test_flag(device->ldev, MDF_CONSISTENT))
+		disk_state = D_INCONSISTENT;
+	else if (!drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
+		disk_state = D_OUTDATED;
+	else
+		disk_state = may_be_up_to_date(device, NOW) ? D_UP_TO_DATE : D_CONSISTENT;
+
+	return disk_state;
+}
+
+bool is_suspended_fen(struct drbd_resource *resource, enum which_state which)
+{
+	struct drbd_connection *connection;
+	bool rv = false;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (connection->susp_fen[which]) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+bool resource_is_suspended(struct drbd_resource *resource, enum which_state which)
+{
+	bool rv = resource->susp_user[which] || resource->susp_nod[which] ||
+		resource->susp_quorum[which] || resource->susp_uuid[which];
+
+	if (rv)
+		return rv;
+
+	return is_suspended_fen(resource, which);
+}
+
 static void count_objects(struct drbd_resource *resource,
-			  unsigned int *n_devices,
-			  unsigned int *n_connections)
+			  struct drbd_state_change_object_count *ocnt)
 {
+	struct drbd_path *path;
 	struct drbd_device *device;
 	struct drbd_connection *connection;
 	int vnr;
 
-	*n_devices = 0;
-	*n_connections = 0;
+	lockdep_assert_held(&resource->state_rwlock);
+
+	ocnt->n_devices = 0;
+	ocnt->n_connections = 0;
+	ocnt->n_paths = 0;
 
 	idr_for_each_entry(&resource->devices, device, vnr)
-		(*n_devices)++;
-	for_each_connection(connection, resource)
-		(*n_connections)++;
+		ocnt->n_devices++;
+	for_each_connection(connection, resource) {
+		ocnt->n_connections++;
+		list_for_each_entry(path, &connection->transport.paths, list) {
+			ocnt->n_paths++;
+		}
+	}
 }
 
-static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
+static struct drbd_state_change *alloc_state_change(struct drbd_state_change_object_count *ocnt, gfp_t flags)
 {
 	struct drbd_state_change *state_change;
-	unsigned int size, n;
+	unsigned int size;
 
 	size = sizeof(struct drbd_state_change) +
-	       n_devices * sizeof(struct drbd_device_state_change) +
-	       n_connections * sizeof(struct drbd_connection_state_change) +
-	       n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
-	state_change = kmalloc(size, gfp);
+	       ocnt->n_devices * sizeof(struct drbd_device_state_change) +
+	       ocnt->n_connections * sizeof(struct drbd_connection_state_change) +
+	       ocnt->n_devices * ocnt->n_connections * sizeof(struct drbd_peer_device_state_change) +
+	       ocnt->n_paths * sizeof(struct drbd_path_state);
+	state_change = kzalloc(size, flags);
 	if (!state_change)
 		return NULL;
-	state_change->n_devices = n_devices;
-	state_change->n_connections = n_connections;
+	state_change->n_connections = ocnt->n_connections;
+	state_change->n_devices = ocnt->n_devices;
+	state_change->n_paths = ocnt->n_paths;
 	state_change->devices = (void *)(state_change + 1);
-	state_change->connections = (void *)&state_change->devices[n_devices];
-	state_change->peer_devices = (void *)&state_change->connections[n_connections];
-	state_change->resource->resource = NULL;
-	for (n = 0; n < n_devices; n++)
-		state_change->devices[n].device = NULL;
-	for (n = 0; n < n_connections; n++)
-		state_change->connections[n].connection = NULL;
+	state_change->connections = (void *)&state_change->devices[ocnt->n_devices];
+	state_change->peer_devices = (void *)&state_change->connections[ocnt->n_connections];
+	state_change->paths = (void *)&state_change->peer_devices[ocnt->n_devices*ocnt->n_connections];
 	return state_change;
 }
 
-struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
+struct drbd_state_change *remember_state_change(struct drbd_resource *resource, gfp_t gfp)
 {
 	struct drbd_state_change *state_change;
 	struct drbd_device *device;
-	unsigned int n_devices;
 	struct drbd_connection *connection;
-	unsigned int n_connections;
+	struct drbd_state_change_object_count ocnt;
 	int vnr;
 
 	struct drbd_device_state_change *device_state_change;
 	struct drbd_peer_device_state_change *peer_device_state_change;
 	struct drbd_connection_state_change *connection_state_change;
+	struct drbd_path_state *path_state; /* yes, not a _change :-( */
+
+	lockdep_assert_held(&resource->state_rwlock);
 
-	/* Caller holds req_lock spinlock.
-	 * No state, no device IDR, no connections lists can change. */
-	count_objects(resource, &n_devices, &n_connections);
-	state_change = alloc_state_change(n_devices, n_connections, gfp);
+	count_objects(resource, &ocnt);
+	state_change = alloc_state_change(&ocnt, gfp);
 	if (!state_change)
 		return NULL;
 
 	kref_get(&resource->kref);
 	state_change->resource->resource = resource;
-	state_change->resource->role[OLD] =
-		conn_highest_role(first_connection(resource));
-	state_change->resource->susp[OLD] = resource->susp;
-	state_change->resource->susp_nod[OLD] = resource->susp_nod;
-	state_change->resource->susp_fen[OLD] = resource->susp_fen;
-
-	connection_state_change = state_change->connections;
-	for_each_connection(connection, resource) {
-		kref_get(&connection->kref);
-		connection_state_change->connection = connection;
-		connection_state_change->cstate[OLD] =
-			connection->cstate;
-		connection_state_change->peer_role[OLD] =
-			conn_highest_peer(connection);
-		connection_state_change++;
-	}
+	memcpy(state_change->resource->role,
+	       resource->role, sizeof(resource->role));
+	memcpy(state_change->resource->susp,
+	       resource->susp_user, sizeof(resource->susp_user));
+	memcpy(state_change->resource->susp_nod,
+	       resource->susp_nod, sizeof(resource->susp_nod));
+	memcpy(state_change->resource->susp_uuid,
+	       resource->susp_uuid, sizeof(resource->susp_uuid));
+	memcpy(state_change->resource->fail_io,
+	       resource->fail_io, sizeof(resource->fail_io));
 
 	device_state_change = state_change->devices;
 	peer_device_state_change = state_change->peer_devices;
 	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+
 		kref_get(&device->kref);
 		device_state_change->device = device;
-		device_state_change->disk_state[OLD] = device->state.disk;
+		memcpy(device_state_change->disk_state,
+		       device->disk_state, sizeof(device->disk_state));
+		memcpy(device_state_change->have_quorum,
+		       device->have_quorum, sizeof(device->have_quorum));
 
 		/* The peer_devices for each device have to be enumerated in
 		   the order of the connections. We may not use for_each_peer_device() here. */
 		for_each_connection(connection, resource) {
-			struct drbd_peer_device *peer_device;
-
 			peer_device = conn_peer_device(connection, device->vnr);
+
 			peer_device_state_change->peer_device = peer_device;
-			peer_device_state_change->disk_state[OLD] =
-				device->state.pdsk;
-			peer_device_state_change->repl_state[OLD] =
-				max_t(enum drbd_conns,
-				      C_WF_REPORT_PARAMS, device->state.conn);
-			peer_device_state_change->resync_susp_user[OLD] =
-				device->state.user_isp;
-			peer_device_state_change->resync_susp_peer[OLD] =
-				device->state.peer_isp;
-			peer_device_state_change->resync_susp_dependency[OLD] =
-				device->state.aftr_isp;
+			memcpy(peer_device_state_change->disk_state,
+			       peer_device->disk_state, sizeof(peer_device->disk_state));
+			memcpy(peer_device_state_change->repl_state,
+			       peer_device->repl_state, sizeof(peer_device->repl_state));
+			memcpy(peer_device_state_change->resync_susp_user,
+			       peer_device->resync_susp_user,
+			       sizeof(peer_device->resync_susp_user));
+			memcpy(peer_device_state_change->resync_susp_peer,
+			       peer_device->resync_susp_peer,
+			       sizeof(peer_device->resync_susp_peer));
+			memcpy(peer_device_state_change->resync_susp_dependency,
+			       peer_device->resync_susp_dependency,
+			       sizeof(peer_device->resync_susp_dependency));
+			memcpy(peer_device_state_change->resync_susp_other_c,
+			       peer_device->resync_susp_other_c,
+			       sizeof(peer_device->resync_susp_other_c));
+			memcpy(peer_device_state_change->resync_active,
+			       peer_device->resync_active,
+			       sizeof(peer_device->resync_active));
+			memcpy(peer_device_state_change->replication,
+			       peer_device->replication,
+			       sizeof(peer_device->replication));
+			memcpy(peer_device_state_change->peer_replication,
+			       peer_device->peer_replication,
+			       sizeof(peer_device->peer_replication));
 			peer_device_state_change++;
 		}
 		device_state_change++;
 	}
 
-	return state_change;
-}
-
-static void remember_new_state(struct drbd_state_change *state_change)
-{
-	struct drbd_resource_state_change *resource_state_change;
-	struct drbd_resource *resource;
-	unsigned int n;
-
-	if (!state_change)
-		return;
-
-	resource_state_change = &state_change->resource[0];
-	resource = resource_state_change->resource;
-
-	resource_state_change->role[NEW] =
-		conn_highest_role(first_connection(resource));
-	resource_state_change->susp[NEW] = resource->susp;
-	resource_state_change->susp_nod[NEW] = resource->susp_nod;
-	resource_state_change->susp_fen[NEW] = resource->susp_fen;
-
-	for (n = 0; n < state_change->n_devices; n++) {
-		struct drbd_device_state_change *device_state_change =
-			&state_change->devices[n];
-		struct drbd_device *device = device_state_change->device;
-
-		device_state_change->disk_state[NEW] = device->state.disk;
-	}
+	connection_state_change = state_change->connections;
+	path_state = state_change->paths;
+	for_each_connection(connection, resource) {
+		struct drbd_path *path;
 
-	for (n = 0; n < state_change->n_connections; n++) {
-		struct drbd_connection_state_change *connection_state_change =
-			&state_change->connections[n];
-		struct drbd_connection *connection =
-			connection_state_change->connection;
+		kref_get(&connection->kref);
+		connection_state_change->connection = connection;
+		memcpy(connection_state_change->cstate,
+		       connection->cstate, sizeof(connection->cstate));
+		memcpy(connection_state_change->peer_role,
+		       connection->peer_role, sizeof(connection->peer_role));
+		memcpy(connection_state_change->susp_fen,
+		       connection->susp_fen, sizeof(connection->susp_fen));
+
+		list_for_each_entry(path, &connection->transport.paths, list) {
+			/* Share the connection kref with above.
+			 * Could also share the pointer, but would then need to
+			 * remember an additional n_paths per connection
+			 * count/offset (connection_state_change->n_paths++)
+			 * to be able to associate the paths with its connection.
+			 * So why not directly store the pointer here again. */
+			path_state->connection = connection;
+			kref_get(&path->kref);
+			path_state->path = path;
+			path_state->path_established = test_bit(TR_ESTABLISHED, &path->flags);
+
+			path_state++;
+		}
 
-		connection_state_change->cstate[NEW] = connection->cstate;
-		connection_state_change->peer_role[NEW] =
-			conn_highest_peer(connection);
+		connection_state_change++;
 	}
 
-	for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
-		struct drbd_peer_device_state_change *peer_device_state_change =
-			&state_change->peer_devices[n];
-		struct drbd_device *device =
-			peer_device_state_change->peer_device->device;
-		union drbd_dev_state state = device->state;
-
-		peer_device_state_change->disk_state[NEW] = state.pdsk;
-		peer_device_state_change->repl_state[NEW] =
-			max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
-		peer_device_state_change->resync_susp_user[NEW] =
-			state.user_isp;
-		peer_device_state_change->resync_susp_peer[NEW] =
-			state.peer_isp;
-		peer_device_state_change->resync_susp_dependency[NEW] =
-			state.aftr_isp;
-	}
+	return state_change;
 }
 
 void copy_old_to_new_state_change(struct drbd_state_change *state_change)
@@ -219,7 +434,8 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change)
 	OLD_TO_NEW(resource_state_change->role);
 	OLD_TO_NEW(resource_state_change->susp);
 	OLD_TO_NEW(resource_state_change->susp_nod);
-	OLD_TO_NEW(resource_state_change->susp_fen);
+	OLD_TO_NEW(resource_state_change->susp_uuid);
+	OLD_TO_NEW(resource_state_change->fail_io);
 
 	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
 		struct drbd_connection_state_change *connection_state_change =
@@ -227,6 +443,7 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change)
 
 		OLD_TO_NEW(connection_state_change->peer_role);
 		OLD_TO_NEW(connection_state_change->cstate);
+		OLD_TO_NEW(connection_state_change->susp_fen);
 	}
 
 	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
@@ -234,6 +451,7 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change)
 			&state_change->devices[n_device];
 
 		OLD_TO_NEW(device_state_change->disk_state);
+		OLD_TO_NEW(device_state_change->have_quorum);
 	}
 
 	n_peer_devices = state_change->n_devices * state_change->n_connections;
@@ -246,6 +464,10 @@ void copy_old_to_new_state_change(struct drbd_state_change *state_change)
 		OLD_TO_NEW(p->resync_susp_user);
 		OLD_TO_NEW(p->resync_susp_peer);
 		OLD_TO_NEW(p->resync_susp_dependency);
+		OLD_TO_NEW(p->resync_susp_other_c);
+		OLD_TO_NEW(p->resync_active);
+		OLD_TO_NEW(p->replication);
+		OLD_TO_NEW(p->peer_replication);
 	}
 
 #undef OLD_TO_NEW
@@ -258,2140 +480,5972 @@ void forget_state_change(struct drbd_state_change *state_change)
 	if (!state_change)
 		return;
 
-	if (state_change->resource->resource)
+	if (state_change->resource->resource) {
 		kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
+	}
 	for (n = 0; n < state_change->n_devices; n++) {
 		struct drbd_device *device = state_change->devices[n].device;
 
-		if (device)
+		if (device) {
 			kref_put(&device->kref, drbd_destroy_device);
+		}
 	}
 	for (n = 0; n < state_change->n_connections; n++) {
 		struct drbd_connection *connection =
 			state_change->connections[n].connection;
 
-		if (connection)
+		if (connection) {
 			kref_put(&connection->kref, drbd_destroy_connection);
+		}
+	}
+	for (n = 0; n < state_change->n_paths; n++) {
+		struct drbd_path *path = state_change->paths[n].path;
+		if (path) {
+			kref_put(&path->kref, drbd_destroy_path);
+		}
 	}
 	kfree(state_change);
 }
 
-static int w_after_state_ch(struct drbd_work *w, int unused);
-static void after_state_ch(struct drbd_device *device, union drbd_state os,
-			   union drbd_state ns, enum chg_state_flags flags,
-			   struct drbd_state_change *);
-static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
-static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
-static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
-				       union drbd_state ns, enum sanitize_state_warnings *warn);
-
-static inline bool is_susp(union drbd_state s)
+static bool state_has_changed(struct drbd_resource *resource)
 {
-        return s.susp || s.susp_nod || s.susp_fen;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	int vnr;
+
+	if (resource->state_change_flags & CS_FORCE_RECALC)
+		return true;
+
+	if (resource->role[OLD] != resource->role[NEW] ||
+	    resource->susp_user[OLD] != resource->susp_user[NEW] ||
+	    resource->susp_nod[OLD] != resource->susp_nod[NEW] ||
+	    resource->susp_quorum[OLD] != resource->susp_quorum[NEW] ||
+	    resource->susp_uuid[OLD] != resource->susp_uuid[NEW] ||
+	    resource->fail_io[OLD] != resource->fail_io[NEW])
+		return true;
+
+	for_each_connection(connection, resource) {
+		if (connection->cstate[OLD] != connection->cstate[NEW] ||
+		    connection->peer_role[OLD] != connection->peer_role[NEW] ||
+		    connection->susp_fen[OLD] != connection->susp_fen[NEW])
+			return true;
+	}
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+
+		if (device->disk_state[OLD] != device->disk_state[NEW] ||
+		    device->have_quorum[OLD] != device->have_quorum[NEW])
+			return true;
+
+		for_each_peer_device(peer_device, device) {
+			if (peer_device->disk_state[OLD] != peer_device->disk_state[NEW] ||
+			    peer_device->repl_state[OLD] != peer_device->repl_state[NEW] ||
+			    peer_device->resync_susp_user[OLD] !=
+				peer_device->resync_susp_user[NEW] ||
+			    peer_device->resync_susp_peer[OLD] !=
+				peer_device->resync_susp_peer[NEW] ||
+			    peer_device->resync_susp_dependency[OLD] !=
+				peer_device->resync_susp_dependency[NEW] ||
+			    peer_device->resync_susp_other_c[OLD] !=
+				peer_device->resync_susp_other_c[NEW] ||
+			    peer_device->resync_active[OLD] !=
+				peer_device->resync_active[NEW] ||
+			    peer_device->replication[OLD] !=
+				peer_device->replication[NEW] ||
+			    peer_device->peer_replication[OLD] !=
+				peer_device->peer_replication[NEW] ||
+			    peer_device->uuid_flags & UUID_FLAG_GOT_STABLE)
+				return true;
+		}
+	}
+	return false;
 }
 
-bool conn_all_vols_unconf(struct drbd_connection *connection)
+static void ___begin_state_change(struct drbd_resource *resource)
 {
-	struct drbd_peer_device *peer_device;
-	bool rv = true;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
 	int vnr;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		if (device->state.disk != D_DISKLESS ||
-		    device->state.conn != C_STANDALONE ||
-		    device->state.role != R_SECONDARY) {
-			rv = false;
-			break;
-		}
+	resource->role[NEW] = resource->role[NOW];
+	resource->susp_user[NEW] = resource->susp_user[NOW];
+	resource->susp_nod[NEW] = resource->susp_nod[NOW];
+	resource->susp_quorum[NEW] = resource->susp_quorum[NOW];
+	resource->susp_uuid[NEW] = resource->susp_uuid[NOW];
+	resource->fail_io[NEW] = resource->fail_io[NOW];
+
+	for_each_connection_rcu(connection, resource) {
+		connection->cstate[NEW] = connection->cstate[NOW];
+		connection->peer_role[NEW] = connection->peer_role[NOW];
+		connection->susp_fen[NEW] = connection->susp_fen[NOW];
 	}
-	rcu_read_unlock();
 
-	return rv;
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+
+		device->disk_state[NEW] = device->disk_state[NOW];
+		device->have_quorum[NEW] = device->have_quorum[NOW];
+
+		for_each_peer_device_rcu(peer_device, device) {
+			peer_device->disk_state[NEW] = peer_device->disk_state[NOW];
+			peer_device->repl_state[NEW] = peer_device->repl_state[NOW];
+			peer_device->resync_susp_user[NEW] =
+				peer_device->resync_susp_user[NOW];
+			peer_device->resync_susp_peer[NEW] =
+				peer_device->resync_susp_peer[NOW];
+			peer_device->resync_susp_dependency[NEW] =
+				peer_device->resync_susp_dependency[NOW];
+			peer_device->resync_susp_other_c[NEW] =
+				peer_device->resync_susp_other_c[NOW];
+			peer_device->resync_active[NEW] =
+				peer_device->resync_active[NOW];
+			peer_device->replication[NEW] =
+				peer_device->replication[NOW];
+			peer_device->peer_replication[NEW] =
+				peer_device->peer_replication[NOW];
+		}
+	}
 }
 
-/* Unfortunately the states where not correctly ordered, when
-   they where defined. therefore can not use max_t() here. */
-static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+static void __begin_state_change(struct drbd_resource *resource)
 {
-	if (role1 == R_PRIMARY || role2 == R_PRIMARY)
-		return R_PRIMARY;
-	if (role1 == R_SECONDARY || role2 == R_SECONDARY)
-		return R_SECONDARY;
-	return R_UNKNOWN;
+	rcu_read_lock();
+	___begin_state_change(resource);
 }
 
-static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+static enum drbd_state_rv try_state_change(struct drbd_resource *resource)
 {
-	if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
-		return R_UNKNOWN;
-	if (role1 == R_SECONDARY || role2 == R_SECONDARY)
-		return R_SECONDARY;
-	return R_PRIMARY;
+	enum drbd_state_rv rv;
+
+	if (!state_has_changed(resource))
+		return SS_NOTHING_TO_DO;
+	sanitize_state(resource);
+	rv = is_valid_transition(resource);
+	if (rv >= SS_SUCCESS && !(resource->state_change_flags & CS_HARD))
+		rv = is_valid_soft_transition(resource);
+	return rv;
 }
 
-enum drbd_role conn_highest_role(struct drbd_connection *connection)
+static void apply_update_to_exposed_data_uuid(struct drbd_resource *resource)
 {
-	enum drbd_role role = R_SECONDARY;
-	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
 	int vnr;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		role = max_role(role, device->state.role);
-	}
-	rcu_read_unlock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		u64 nedu = device->next_exposed_data_uuid;
+		int changed = 0;
 
-	return role;
+		if (!nedu)
+			continue;
+		if (device->disk_state[NOW] < D_INCONSISTENT)
+			changed = drbd_uuid_set_exposed(device, nedu, false);
+
+		device->next_exposed_data_uuid = 0;
+		if (changed)
+			drbd_info(device, "Executing delayed exposed data uuid update: %016llX\n",
+				  (unsigned long long)device->exposed_data_uuid);
+		else
+			drbd_info(device, "Canceling delayed exposed data uuid update\n");
+	}
 }
 
-enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+void __clear_remote_state_change(struct drbd_resource *resource)
 {
-	enum drbd_role peer = R_UNKNOWN;
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	bool is_connect = resource->twopc_reply.is_connect;
+	int initiator_node_id = resource->twopc_reply.initiator_node_id;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		peer = max_role(peer, device->state.peer);
+	resource->remote_state_change = false;
+	resource->twopc_reply.initiator_node_id = -1;
+	resource->twopc_reply.tid = 0;
+
+	if (is_connect && resource->twopc_prepare_reply_cmd == 0) {
+		struct drbd_connection *connection;
+
+		rcu_read_lock();
+		connection = drbd_connection_by_node_id(resource, initiator_node_id);
+		if (connection)
+			abort_connect(connection);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
 
-	return peer;
+	wake_up_all(&resource->twopc_wait);
+
+	/* Do things that were postponed to after two-phase commits finished */
+	apply_update_to_exposed_data_uuid(resource);
 }
 
-enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+/* Return true when neither the replication state towards any peer nor
+ * the local disk state of @device is in a transition that forbids
+ * accepting new I/O.  The result is cached as
+ * device->cached_state_unstable when a state change commits.
+ */
+static bool state_is_stable(struct drbd_device *device)
 {
-	enum drbd_disk_state disk_state = D_DISKLESS;
 	struct drbd_peer_device *peer_device;
-	int vnr;
+	bool stable = true;
+
+	/* DO NOT add a default clause to the switch statements below, we
+	 * want the compiler to warn us for any newly introduced state we
+	 * may have forgotten to add here */
 
 	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk);
+	for_each_peer_device_rcu(peer_device, device) {
+		switch (peer_device->repl_state[NOW]) {
+		/* New io is only accepted when the peer device is unknown or there is
+		 * a well-established connection. */
+		case L_OFF:
+		case L_ESTABLISHED:
+		case L_SYNC_SOURCE:
+		case L_SYNC_TARGET:
+		case L_VERIFY_S:
+		case L_VERIFY_T:
+		case L_PAUSED_SYNC_S:
+		case L_PAUSED_SYNC_T:
+		case L_AHEAD:
+		case L_BEHIND:
+		case L_STARTING_SYNC_S:
+		case L_STARTING_SYNC_T:
+			break;
+
+			/* Allow IO in BM exchange states with new protocols */
+		case L_WF_BITMAP_S:
+			if (peer_device->connection->agreed_pro_version < 96)
+				stable = false;
+			break;
+
+			/* no new io accepted in these states */
+		case L_WF_BITMAP_T:
+		case L_WF_SYNC_UUID:
+			stable = false;
+			break;
+		}
+		if (!stable)
+			break;
 	}
 	rcu_read_unlock();
 
-	return disk_state;
+	switch (device->disk_state[NOW]) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+	case D_FAILED:
+	case D_DETACHING:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during transitional states */
+	case D_ATTACHING:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		stable = false;
+	}
+
+	return stable;
 }
 
-enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+/* Does the pending transition establish a connection, i.e. does any
+ * connection go C_CONNECTING[NOW] -> C_CONNECTED[NEW]?
+ */
+static bool drbd_state_change_is_connect(struct drbd_resource *resource)
 {
-	enum drbd_disk_state disk_state = D_MASK;
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	struct drbd_connection *connection;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+	for_each_connection(connection, resource) {
+		if (connection->cstate[NOW] == C_CONNECTING &&
+				connection->cstate[NEW] == C_CONNECTED)
+			return true;
 	}
-	rcu_read_unlock();
 
-	return disk_state;
+	return false;
 }
 
-enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+/* Allocate the work item that will carry a snapshot of this state
+ * change to the worker.  Returns NULL silently when the resource is
+ * already unregistered, or NULL with an error logged when allocation
+ * fails.
+ */
+static struct after_state_change_work *alloc_after_state_change_work(struct drbd_resource *resource)
 {
-	enum drbd_disk_state disk_state = D_DISKLESS;
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	struct after_state_change_work *work;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk);
+	lockdep_assert_held(&resource->state_rwlock);
+
+	/* If the resource is already "unregistered", the worker thread
+	 * is gone, there is no-one to consume the work item and release
+	 * the associated refcounts. Just don't even create it.
+	 */
+	if (test_bit(R_UNREGISTERED, &resource->flags))
+		return NULL;
+
+	work = kmalloc_obj(*work, GFP_ATOMIC);
+	if (work) {
+		work->state_change = remember_state_change(resource, GFP_ATOMIC);
+		if (!work->state_change) {
+			kfree(work);
+			work = NULL;
+		}
 	}
-	rcu_read_unlock();
+	if (!work)
+		drbd_err(resource, "Could not allocate after state change work\n");
 
-	return disk_state;
+	return work;
 }
 
-enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+/* Queue @work on the resource worker.  When no work item could be
+ * allocated, complete @done directly so waiters do not block forever.
+ */
+static void queue_after_state_change_work(struct drbd_resource *resource,
+					  struct completion *done,
+					  struct after_state_change_work *work)
 {
-	enum drbd_conns conn = C_MASK;
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	if (work) {
+		work->w.cb = w_after_state_change;
+		work->done = done;
+		drbd_queue_work(&resource->work, &work->w);
+	} else if (done) {
+		complete(done);
+	}
+}
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		conn = min_t(enum drbd_conns, conn, device->state.conn);
+/* Validate and, unless this is only a CS_PREPARE step or an abort,
+ * atomically commit the proposed [NEW] state of the resource, its
+ * connections, devices and peer devices into [NOW], refresh the
+ * cached_* shortcuts, wake waiters, and queue the after-state-change
+ * work.  Releases the RCU read lock taken when the change was begun.
+ */
+static enum drbd_state_rv ___end_state_change(struct drbd_resource *resource, struct completion *done,
+					      enum drbd_state_rv rv, const char *tag)
+{
+	enum chg_state_flags flags = resource->state_change_flags;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	bool is_connect;
+	unsigned int pro_ver;
+	int vnr;
+	bool all_devs_have_quorum = true;
+	struct after_state_change_work *work;
+
+	/* CS_ABORT: roll back by simply not committing [NEW] */
+	if (flags & CS_ABORT)
+		goto out;
+	if (rv >= SS_SUCCESS)
+		rv = try_state_change(resource);
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE) {
+			drbd_err(resource, "State change failed: %s (%d)\n",
+					drbd_set_st_err_str(rv), rv);
+			print_state_change(resource, "Failed: ", tag);
+		}
+		goto out;
 	}
-	rcu_read_unlock();
+	/* CS_PREPARE: the change was only validated; committing it is a
+	 * later, separate step */
+	if (flags & CS_PREPARE)
+		goto out;
 
-	return conn;
-}
+	update_members(resource);
+	finish_state_change(resource, tag);
 
-static bool no_peer_wf_report_params(struct drbd_connection *connection)
-{
-	struct drbd_peer_device *peer_device;
-	int vnr;
-	bool rv = true;
+	/* Check whether we are establishing a connection before applying the change. */
+	is_connect = drbd_state_change_is_connect(resource);
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-		if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) {
-			rv = false;
-			break;
+	/* This remembers the state change, so call before applying the change. */
+	work = alloc_after_state_change_work(resource);
+
+	/* changes to local_cnt and device flags should be visible before
+	 * changes to state, which again should be visible before anything else
+	 * depending on that change happens. */
+	smp_wmb();
+	resource->role[NOW] = resource->role[NEW];
+	resource->susp_user[NOW] = resource->susp_user[NEW];
+	resource->susp_nod[NOW] = resource->susp_nod[NEW];
+	resource->susp_quorum[NOW] = resource->susp_quorum[NEW];
+	resource->susp_uuid[NOW] = resource->susp_uuid[NEW];
+	resource->fail_io[NOW] = resource->fail_io[NEW];
+	resource->cached_susp = resource_is_suspended(resource, NEW);
+
+	pro_ver = PRO_VERSION_MAX;
+	for_each_connection(connection, resource) {
+		connection->cstate[NOW] = connection->cstate[NEW];
+		connection->peer_role[NOW] = connection->peer_role[NEW];
+		connection->susp_fen[NOW] = connection->susp_fen[NEW];
+
+		pro_ver = min_t(unsigned int, pro_ver,
+			connection->agreed_pro_version);
+
+		wake_up(&connection->ee_wait);
+	}
+	resource->cached_min_aggreed_protocol_version = pro_ver;
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct res_opts *o = &resource->res_opts;
+		struct drbd_peer_device *peer_device;
+
+		device->disk_state[NOW] = device->disk_state[NEW];
+		device->have_quorum[NOW] = device->have_quorum[NEW];
+
+		if (!device->have_quorum[NOW])
+			all_devs_have_quorum = false;
+
+		for_each_peer_device(peer_device, device) {
+			peer_device->disk_state[NOW] = peer_device->disk_state[NEW];
+			peer_device->repl_state[NOW] = peer_device->repl_state[NEW];
+			peer_device->resync_susp_user[NOW] =
+				peer_device->resync_susp_user[NEW];
+			peer_device->resync_susp_peer[NOW] =
+				peer_device->resync_susp_peer[NEW];
+			peer_device->resync_susp_dependency[NOW] =
+				peer_device->resync_susp_dependency[NEW];
+			peer_device->resync_susp_other_c[NOW] =
+				peer_device->resync_susp_other_c[NEW];
+			peer_device->resync_active[NOW] =
+				peer_device->resync_active[NEW];
+			peer_device->replication[NOW] =
+				peer_device->replication[NEW];
+			peer_device->peer_replication[NOW] =
+				peer_device->peer_replication[NEW];
 		}
-	rcu_read_unlock();
+		device->cached_state_unstable = !state_is_stable(device);
+		device->cached_err_io =
+			(o->on_no_quorum == ONQ_IO_ERROR && !device->have_quorum[NOW]) ||
+			(o->on_no_data == OND_IO_ERROR && !drbd_data_accessible(device, NOW)) ||
+			resource->fail_io[NEW];
+	}
+	resource->cached_all_devices_have_quorum = all_devs_have_quorum;
+	smp_wmb(); /* Make the NEW_CUR_UUID bit visible after the state change! */
 
-	return rv;
-}
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+		if (test_bit(__NEW_CUR_UUID, &device->flags)) {
+			clear_bit(__NEW_CUR_UUID, &device->flags);
+			set_bit(NEW_CUR_UUID, &device->flags);
+		}
+		ensure_exposed_data_uuid(device);
+
+		wake_up(&device->al_wait);
+		wake_up(&device->misc_wait);
+
+		/* Due to the exclusivity of two-phase commits, there can only
+		 * be one connection being established at once. Hence it is OK
+		 * to release uuid_sem for all connections if the state change
+		 * is establishing any connection. */
+		if (is_connect) {
+			for_each_peer_device(peer_device, device) {
+				if (test_and_clear_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags))
+					up_read_non_owner(&device->uuid_sem);
+			}
+		}
+	}
 
-static void wake_up_all_devices(struct drbd_connection *connection)
-{
-	struct drbd_peer_device *peer_device;
-	int vnr;
+	wake_up_all(&resource->state_wait);
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-		wake_up(&peer_device->device->state_wait);
+	/* Call this after applying the state change from NEW to NOW. */
+	queue_after_state_change_work(resource, done, work);
+out:
 	rcu_read_unlock();
 
-}
+	/* A finished (non-prepare) two-phase commit releases ownership of
+	 * the remote state change */
+	if ((flags & CS_TWOPC) && !(flags & CS_PREPARE))
+		__clear_remote_state_change(resource);
 
+	resource->state_change_err_str = NULL;
+	return rv;
+}
 
-/**
- * cl_wide_st_chg() - true if the state change is a cluster wide one
- * @device:	DRBD device.
- * @os:		old (current) state.
- * @ns:		new (wanted) state.
- */
-static int cl_wide_st_chg(struct drbd_device *device,
-			  union drbd_state os, union drbd_state ns)
+/* Take the resource-wide state write lock and record @flags for this
+ * change.  CS_SERIALIZE additionally takes state_sem -- unless the
+ * caller already holds it (CS_ALREADY_SERIALIZED) or this commits a
+ * previously prepared change (CS_PREPARED).  The worker thread must
+ * never initiate a CS_SERIALIZE change (see the WARN_ONCE).
+ */
+void state_change_lock(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags)
 {
-	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
-		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
-		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
-		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
-		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
-		(os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+	if ((flags & CS_SERIALIZE) && !(flags & (CS_ALREADY_SERIALIZED | CS_PREPARED))) {
+		WARN_ONCE(current == resource->worker.task,
+			"worker should not initiate state changes with CS_SERIALIZE\n");
+		down(&resource->state_sem);
+	}
+	write_lock_irqsave(&resource->state_rwlock, *irq_flags);
+	resource->state_change_flags = flags;
 }
 
-static union drbd_state
-apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+/* Drop the state write lock, optionally waiting for the queued
+ * after-state-change work (@done); the worker must not wait on work it
+ * executes itself (guarded by expect()).
+ */
+static void __state_change_unlock(struct drbd_resource *resource, unsigned long *irq_flags, struct completion *done)
 {
-	union drbd_state ns;
-	ns.i = (os.i & ~mask.i) | val.i;
-	return ns;
+	enum chg_state_flags flags = resource->state_change_flags;
+
+	resource->state_change_flags = 0;
+	write_unlock_irqrestore(&resource->state_rwlock, *irq_flags);
+	if (done && expect(resource, current != resource->worker.task))
+		wait_for_completion(done);
+	/* Note the asymmetry with state_change_lock(): a CS_PREPARE change
+	 * keeps state_sem held beyond this unlock; the corresponding
+	 * CS_PREPARED commit releases it. */
+	if ((flags & CS_SERIALIZE) && !(flags & (CS_ALREADY_SERIALIZED | CS_PREPARE)))
+		up(&resource->state_sem);
 }
 
-enum drbd_state_rv
-drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
-		  union drbd_state mask, union drbd_state val)
+/* Unlock without waiting for the after-state-change work. */
+void state_change_unlock(struct drbd_resource *resource, unsigned long *irq_flags)
 {
-	unsigned long flags;
-	union drbd_state ns;
-	enum drbd_state_rv rv;
-
-	spin_lock_irqsave(&device->resource->req_lock, flags);
-	ns = apply_mask_val(drbd_read_state(device), mask, val);
-	rv = _drbd_set_state(device, ns, f, NULL);
-	spin_unlock_irqrestore(&device->resource->req_lock, flags);
-
-	return rv;
+	__state_change_unlock(resource, irq_flags, NULL);
 }
 
-/**
- * drbd_force_state() - Impose a change which happens outside our control on our state
- * @device:	DRBD device.
- * @mask:	mask of state bits to change.
- * @val:	value of new state bits.
- */
-void drbd_force_state(struct drbd_device *device,
-	union drbd_state mask, union drbd_state val)
+/* Begin a state change with state_rwlock already held; only flag
+ * combinations that need no sleeping or serialization are allowed.
+ */
+void begin_state_change_locked(struct drbd_resource *resource, enum chg_state_flags flags)
 {
-	drbd_change_state(device, CS_HARD, mask, val);
+	BUG_ON(flags & (CS_SERIALIZE | CS_WAIT_COMPLETE | CS_PREPARE | CS_ABORT));
+	resource->state_change_flags = flags;
+	__begin_state_change(resource);
 }
 
-static enum drbd_state_rv
-_req_st_cond(struct drbd_device *device, union drbd_state mask,
-	     union drbd_state val)
+/* Commit a change begun with begin_state_change_locked(). */
+enum drbd_state_rv end_state_change_locked(struct drbd_resource *resource, const char *tag)
 {
-	union drbd_state os, ns;
-	unsigned long flags;
-	enum drbd_state_rv rv;
-
-	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
-		return SS_CW_SUCCESS;
+	return ___end_state_change(resource, NULL, SS_SUCCESS, tag);
+}
 
-	if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
-		return SS_CW_FAILED_BY_PEER;
+/* Lock the resource state and start a transition. */
+void begin_state_change(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags)
+{
+	state_change_lock(resource, irq_flags, flags);
+	__begin_state_change(resource);
+}
 
-	spin_lock_irqsave(&device->resource->req_lock, flags);
-	os = drbd_read_state(device);
-	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
-	rv = is_valid_transition(os, ns);
-	if (rv >= SS_SUCCESS)
-		rv = SS_UNKNOWN_ERROR;  /* cont waiting, otherwise fail. */
+/* Finish the change and unlock.  With CS_WAIT_COMPLETE (and neither
+ * prepare nor abort), wait until the queued after-state-change work
+ * ran.  On failure nothing was queued, so @done must not be waited on.
+ */
+static enum drbd_state_rv __end_state_change(struct drbd_resource *resource,
+					     unsigned long *irq_flags,
+					     enum drbd_state_rv rv,
+					     const char *tag)
+{
+	enum chg_state_flags flags = resource->state_change_flags;
+	struct completion __done, *done = NULL;
 
-	if (!cl_wide_st_chg(device, os, ns))
-		rv = SS_CW_NO_NEED;
-	if (rv == SS_UNKNOWN_ERROR) {
-		rv = is_valid_state(device, ns);
-		if (rv >= SS_SUCCESS) {
-			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
-			if (rv >= SS_SUCCESS)
-				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
-		}
+	if ((flags & CS_WAIT_COMPLETE) && !(flags & (CS_PREPARE | CS_ABORT))) {
+		done = &__done;
+		init_completion(done);
 	}
-	spin_unlock_irqrestore(&device->resource->req_lock, flags);
-
+	rv = ___end_state_change(resource, done, rv, tag);
+	__state_change_unlock(resource, irq_flags, rv >= SS_SUCCESS ? done : NULL);
 	return rv;
 }
 
-/**
- * drbd_req_state() - Perform an eventually cluster wide state change
- * @device:	DRBD device.
- * @mask:	mask of state bits to change.
- * @val:	value of new state bits.
- * @f:		flags
- *
- * Should not be called directly, use drbd_request_state() or
- * _drbd_request_state().
- */
-static enum drbd_state_rv
-drbd_req_state(struct drbd_device *device, union drbd_state mask,
-	       union drbd_state val, enum chg_state_flags f)
+/* Commit a state change begun with begin_state_change(). */
+enum drbd_state_rv end_state_change(struct drbd_resource *resource, unsigned long *irq_flags,
+		const char *tag)
 {
-	struct completion done;
-	unsigned long flags;
-	union drbd_state os, ns;
-	enum drbd_state_rv rv;
-	void *buffer = NULL;
-
-	init_completion(&done);
-
-	if (f & CS_SERIALIZE)
-		mutex_lock(device->state_mutex);
-	if (f & CS_INHIBIT_MD_IO)
-		buffer = drbd_md_get_buffer(device, __func__);
-
-	spin_lock_irqsave(&device->resource->req_lock, flags);
-	os = drbd_read_state(device);
-	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
-	rv = is_valid_transition(os, ns);
-	if (rv < SS_SUCCESS) {
-		spin_unlock_irqrestore(&device->resource->req_lock, flags);
-		goto abort;
-	}
+	return __end_state_change(resource, irq_flags, SS_SUCCESS, tag);
+}
 
-	if (cl_wide_st_chg(device, os, ns)) {
-		rv = is_valid_state(device, ns);
-		if (rv == SS_SUCCESS)
-			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
-		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+/* Roll back a begun state change; CS_VERBOSE is cleared because the
+ * failure is deliberate and not worth logging. */
+void abort_state_change(struct drbd_resource *resource, unsigned long *irq_flags)
+{
+	resource->state_change_flags &= ~CS_VERBOSE;
+	__end_state_change(resource, irq_flags, SS_UNKNOWN_ERROR, NULL);
+}
 
-		if (rv < SS_SUCCESS) {
-			if (f & CS_VERBOSE)
-				print_st_err(device, os, ns, rv);
-			goto abort;
-		}
+/* Like abort_state_change(), for callers already holding the lock. */
+void abort_state_change_locked(struct drbd_resource *resource)
+{
+	resource->state_change_flags &= ~CS_VERBOSE;
+	___end_state_change(resource, NULL, SS_UNKNOWN_ERROR, NULL);
+}
 
-		if (drbd_send_state_req(first_peer_device(device), mask, val)) {
-			rv = SS_CW_FAILED_BY_PEER;
-			if (f & CS_VERBOSE)
-				print_st_err(device, os, ns, rv);
-			goto abort;
-		}
+/* Temporarily drop the RCU read section and the state write lock while
+ * the proposed change is negotiated with remote nodes. */
+static void begin_remote_state_change(struct drbd_resource *resource, unsigned long *irq_flags)
+{
+	rcu_read_unlock();
+	write_unlock_irqrestore(&resource->state_rwlock, *irq_flags);
+}
 
-		wait_event(device->state_wait,
-			(rv = _req_st_cond(device, mask, val)));
+/* Resume the interrupted state change with (possibly different)
+ * @flags; counterpart of begin_remote_state_change(). */
+static void __end_remote_state_change(struct drbd_resource *resource, enum chg_state_flags flags)
+{
+	rcu_read_lock();
+	resource->state_change_flags = flags;
+	___begin_state_change(resource);
+}
 
-		if (rv < SS_SUCCESS) {
-			if (f & CS_VERBOSE)
-				print_st_err(device, os, ns, rv);
-			goto abort;
-		}
-		spin_lock_irqsave(&device->resource->req_lock, flags);
-		ns = apply_mask_val(drbd_read_state(device), mask, val);
-		rv = _drbd_set_state(device, ns, f, &done);
-	} else {
-		rv = _drbd_set_state(device, ns, f, &done);
-	}
+/* Re-take state_rwlock and resume the interrupted state change. */
+static void end_remote_state_change(struct drbd_resource *resource, unsigned long *irq_flags, enum chg_state_flags flags)
+{
+	write_lock_irqsave(&resource->state_rwlock, *irq_flags);
+	__end_remote_state_change(resource, flags);
+}
 
-	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+/* Locked wrapper around __clear_remote_state_change(). */
+void clear_remote_state_change(struct drbd_resource *resource)
+{
+	unsigned long irq_flags;
 
-	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-	wait_event_interruptible(device->state_wait,
-		(rv = request_detach(device)) != SS_IN_TRANSIENT_STATE);
-	drbd_resume_io(device);
+	write_lock_irqsave(&resource->state_rwlock, irq_flags);
+	__clear_remote_state_change(resource);
+	write_unlock_irqrestore(&resource->state_rwlock, irq_flags);
+}
 
-abort:
-	if (buffer)
-		drbd_md_put_buffer(device);
-	if (f & CS_SERIALIZE)
-		mutex_unlock(device->state_mutex);
+/* Build the resource-level part of a drbd_state union for @which;
+ * per-device and per-peer fields are filled with "undefined"
+ * placeholders. */
+static union drbd_state drbd_get_resource_state(struct drbd_resource *resource, enum which_state which)
+{
+	union drbd_state rv = { {
+		.conn = C_STANDALONE,  /* really: undefined */
+		/* (user_isp, peer_isp, and aftr_isp are undefined as well.) */
+		.disk = D_UNKNOWN,  /* really: undefined */
+		.role = resource->role[which],
+		.peer = R_UNKNOWN,  /* really: undefined */
+		.susp = resource->susp_user[which] || resource->susp_quorum[which] || resource->susp_uuid[which],
+		.susp_nod = resource->susp_nod[which],
+		.susp_fen = is_suspended_fen(resource, which),
+		.pdsk = D_UNKNOWN,  /* really: undefined */
+	} };
 
 	return rv;
 }
 
-/**
- * _drbd_request_state() - Request a state change (with flags)
- * @device:	DRBD device.
- * @mask:	mask of state bits to change.
- * @val:	value of new state bits.
- * @f:		flags
- *
- * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
- * flag, or when logging of failed state change requests is not desired.
- */
-enum drbd_state_rv
-_drbd_request_state(struct drbd_device *device, union drbd_state mask,
-		    union drbd_state val, enum chg_state_flags f)
+/* Resource state plus this device's disk state and quorum flag. */
+union drbd_state drbd_get_device_state(struct drbd_device *device, enum which_state which)
 {
-	enum drbd_state_rv rv;
+	union drbd_state rv = drbd_get_resource_state(device->resource, which);
 
-	wait_event(device->state_wait,
-		   (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+	rv.disk = device->disk_state[which];
+	rv.quorum = device->have_quorum[which];
 
 	return rv;
 }
 
-/*
- * We grab drbd_md_get_buffer(), because we don't want to "fail" the disk while
- * there is IO in-flight: the transition into D_FAILED for detach purposes
- * may get misinterpreted as actual IO error in a confused endio function.
- *
- * We wrap it all into wait_event(), to retry in case the drbd_req_state()
- * returns SS_IN_TRANSIENT_STATE.
- *
- * To avoid potential deadlock with e.g. the receiver thread trying to grab
- * drbd_md_get_buffer() while trying to get out of the "transient state", we
- * need to grab and release the meta data buffer inside of that wait_event loop.
- */
-static enum drbd_state_rv
-request_detach(struct drbd_device *device)
-{
-	return drbd_req_state(device, NS(disk, D_FAILED),
-			CS_VERBOSE | CS_ORDERED | CS_INHIBIT_MD_IO);
-}
-
-int drbd_request_detach_interruptible(struct drbd_device *device)
+/* Device state plus the peer's role, disk state and resync-suspend
+ * flags as seen via @peer_device's connection. */
+union drbd_state drbd_get_peer_device_state(struct drbd_peer_device *peer_device, enum which_state which)
 {
-	int ret, rv;
+	struct drbd_connection *connection = peer_device->connection;
+	union drbd_state rv;
 
-	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
-	wait_event_interruptible(device->state_wait,
-		(rv = request_detach(device)) != SS_IN_TRANSIENT_STATE);
-	drbd_resume_io(device);
-
-	ret = wait_event_interruptible(device->misc_wait,
-			device->state.disk != D_FAILED);
-
-	if (rv == SS_IS_DISKLESS)
-		rv = SS_NOTHING_TO_DO;
-	if (ret)
-		rv = ERR_INTR;
+	rv = drbd_get_device_state(peer_device->device, which);
+	rv.user_isp = peer_device->resync_susp_user[which];
+	rv.peer_isp = peer_device->resync_susp_peer[which];
+	rv.aftr_isp = resync_susp_comb_dep(peer_device, which);
+	rv.conn = combined_conn_state(peer_device, which);
+	rv.peer = connection->peer_role[which];
+	rv.pdsk = peer_device->disk_state[which];
 
 	return rv;
 }
 
-enum drbd_state_rv
-_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
-		    union drbd_state val, enum chg_state_flags f)
+/* Maximal local disk state over all volumes of @connection. */
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
 {
-	enum drbd_state_rv rv;
-
-	BUG_ON(f & CS_SERIALIZE);
+	enum drbd_disk_state disk_state = D_DISKLESS;
+	struct drbd_peer_device *peer_device;
+	int vnr;
 
-	wait_event_cmd(device->state_wait,
-		       (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
-		       mutex_unlock(device->state_mutex),
-		       mutex_lock(device->state_mutex));
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		disk_state = max_t(enum drbd_disk_state, disk_state, device->disk_state[NOW]);
+	}
+	rcu_read_unlock();
 
-	return rv;
+	return disk_state;
 }
 
-static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
+/* Maximal peer disk state over all peer devices of @connection. */
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
 {
-	drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
-	    name,
-	    drbd_conn_str(ns.conn),
-	    drbd_role_str(ns.role),
-	    drbd_role_str(ns.peer),
-	    drbd_disk_str(ns.disk),
-	    drbd_disk_str(ns.pdsk),
-	    is_susp(ns) ? 's' : 'r',
-	    ns.aftr_isp ? 'a' : '-',
-	    ns.peer_isp ? 'p' : '-',
-	    ns.user_isp ? 'u' : '-',
-	    ns.susp_fen ? 'F' : '-',
-	    ns.susp_nod ? 'N' : '-'
-	    );
+	enum drbd_disk_state disk_state = D_DISKLESS;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		disk_state = max_t(enum drbd_disk_state, disk_state, peer_device->disk_state[NOW]);
+	rcu_read_unlock();
+
+	return disk_state;
 }
 
-void print_st_err(struct drbd_device *device, union drbd_state os,
-	          union drbd_state ns, enum drbd_state_rv err)
+/* Did any of the reasons for suspending I/O change from OLD to NEW? */
+static bool suspend_reason_changed(struct drbd_resource *resource)
 {
-	if (err == SS_IN_TRANSIENT_STATE)
-		return;
-	drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
-	print_st(device, " state", os);
-	print_st(device, "wanted", ns);
+	return resource->susp_user[OLD] != resource->susp_user[NEW] ||
+		resource->susp_nod[OLD] != resource->susp_nod[NEW] ||
+		resource->susp_quorum[OLD] != resource->susp_quorum[NEW] ||
+		resource->susp_uuid[OLD] != resource->susp_uuid[NEW] ||
+		is_suspended_fen(resource, OLD) != is_suspended_fen(resource, NEW);
 }
 
-static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
-			       enum chg_state_flags flags)
+/* Is resync with this peer suspended for any reason in @which? */
+static bool resync_suspended(struct drbd_peer_device *peer_device, enum which_state which)
 {
-	char *pbp;
-	pbp = pb;
-	*pbp = 0;
-
-	if (ns.role != os.role && flags & CS_DC_ROLE)
-		pbp += sprintf(pbp, "role( %s -> %s ) ",
-			       drbd_role_str(os.role),
-			       drbd_role_str(ns.role));
-	if (ns.peer != os.peer && flags & CS_DC_PEER)
-		pbp += sprintf(pbp, "peer( %s -> %s ) ",
-			       drbd_role_str(os.peer),
-			       drbd_role_str(ns.peer));
-	if (ns.conn != os.conn && flags & CS_DC_CONN)
-		pbp += sprintf(pbp, "conn( %s -> %s ) ",
-			       drbd_conn_str(os.conn),
-			       drbd_conn_str(ns.conn));
-	if (ns.disk != os.disk && flags & CS_DC_DISK)
-		pbp += sprintf(pbp, "disk( %s -> %s ) ",
-			       drbd_disk_str(os.disk),
-			       drbd_disk_str(ns.disk));
-	if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
-		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
-			       drbd_disk_str(os.pdsk),
-			       drbd_disk_str(ns.pdsk));
-
-	return pbp - pb;
+	return peer_device->resync_susp_user[which] ||
+	       peer_device->resync_susp_peer[which] ||
+	       resync_susp_comb_dep(peer_device, which);
 }
 
-static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
-				 enum chg_state_flags flags)
+/* Format the reasons why resync with this peer is suspended into
+ * @buffer, comma separated ("no" when it is not suspended).  Returns
+ * the number of characters written, excluding the terminating NUL.
+ */
+static int scnprintf_resync_suspend_flags(char *buffer, size_t size,
+					  struct drbd_peer_device *peer_device,
+					  enum which_state which)
 {
-	char pb[300];
-	char *pbp = pb;
+	struct drbd_device *device = peer_device->device;
+	char *b = buffer, *end = buffer + size;
+
+	if (!resync_suspended(peer_device, which))
+		return scnprintf(buffer, size, "no");
 
-	pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+	if (peer_device->resync_susp_user[which])
+		b += scnprintf(b, end - b, "user,");
+	if (peer_device->resync_susp_peer[which])
+		b += scnprintf(b, end - b, "peer,");
+	if (peer_device->resync_susp_dependency[which])
+		b += scnprintf(b, end - b, "after dependency,");
+	if (peer_device->resync_susp_other_c[which])
+		b += scnprintf(b, end - b, "connection dependency,");
+	if (is_sync_source_state(peer_device, which) && device->disk_state[which] <= D_INCONSISTENT)
+		b += scnprintf(b, end - b, "disk inconsistent,");
 
-	if (ns.aftr_isp != os.aftr_isp)
-		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
-			       os.aftr_isp,
-			       ns.aftr_isp);
-	if (ns.peer_isp != os.peer_isp)
-		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
-			       os.peer_isp,
-			       ns.peer_isp);
-	if (ns.user_isp != os.user_isp)
-		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
-			       os.user_isp,
-			       ns.user_isp);
+	/* overwrite the trailing comma with the terminating NUL */
+	*(--b) = 0;
 
-	if (pbp != pb)
-		drbd_info(device, "%s\n", pb);
+	return b - buffer;
 }
 
-static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
-				 enum chg_state_flags flags)
+/* Format the reasons why I/O on @resource is suspended into @buffer,
+ * comma separated ("no" when it is not suspended).  Returns the number
+ * of characters written, excluding the terminating NUL.
+ */
+static int scnprintf_io_suspend_flags(char *buffer, size_t size,
+				      struct drbd_resource *resource,
+				      enum which_state which)
 {
-	char pb[300];
-	char *pbp = pb;
-
-	pbp += print_state_change(pbp, os, ns, flags);
-
-	if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
-		pbp += sprintf(pbp, "susp( %d -> %d ) ",
-			       is_susp(os),
-			       is_susp(ns));
-
-	if (pbp != pb)
-		drbd_info(connection, "%s\n", pb);
+	char *b = buffer, *end = buffer + size;
+
+	if (!resource_is_suspended(resource, which))
+		return scnprintf(buffer, size, "no");
+
+	if (resource->susp_user[which])
+		b += scnprintf(b, end - b, "user,");
+	if (resource->susp_nod[which])
+		b += scnprintf(b, end - b, "no-disk,");
+	if (is_suspended_fen(resource, which))
+		b += scnprintf(b, end - b, "fencing,");
+	if (resource->susp_quorum[which])
+		b += scnprintf(b, end - b, "quorum,");
+	if (resource->susp_uuid[which])
+		b += scnprintf(b, end - b, "uuid,");
+	/* overwrite the trailing comma with the terminating NUL */
+	*(--b) = 0;
+
+	return b - buffer;
 }
 
-
-/**
- * is_valid_state() - Returns an SS_ error code if ns is not valid
- * @device:	DRBD device.
- * @ns:		State to consider.
- */
-static enum drbd_state_rv
-is_valid_state(struct drbd_device *device, union drbd_state ns)
+/* Log, one line per object, every state field that differs between
+ * [OLD] and [NEW] for the resource, its connections, devices and peer
+ * devices.  @prefix is prepended to each line; @tag, if non-NULL, is
+ * appended in brackets.
+ */
+static void print_state_change(struct drbd_resource *resource, const char *prefix, const char *tag)
 {
-	/* See drbd_state_sw_errors in drbd_strings.c */
-
-	enum drbd_fencing_p fp;
-	enum drbd_state_rv rv = SS_SUCCESS;
-	struct net_conf *nc;
+	char buffer[150], *b, *end = buffer + sizeof(buffer);
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	enum drbd_role *role = resource->role;
+	bool *fail_io = resource->fail_io;
+	int vnr;
 
-	rcu_read_lock();
-	fp = FP_DONT_CARE;
-	if (get_ldev(device)) {
-		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
-		put_ldev(device);
+	b = buffer;
+	if (role[OLD] != role[NEW])
+		b += scnprintf(b, end - b, "role( %s -> %s ) ",
+			       drbd_role_str(role[OLD]),
+			       drbd_role_str(role[NEW]));
+	if (suspend_reason_changed(resource)) {
+		b += scnprintf(b, end - b, "susp-io( ");
+		b += scnprintf_io_suspend_flags(b, end - b, resource, OLD);
+		b += scnprintf(b, end - b, " -> ");
+		b += scnprintf_io_suspend_flags(b, end - b, resource, NEW);
+		b += scnprintf(b, end - b, " ) ");
+	}
+	if (fail_io[OLD] != fail_io[NEW])
+		b += scnprintf(b, end - b, "force-io-failures( %s -> %s ) ",
+			       fail_io[OLD] ? "yes" : "no",
+			       fail_io[NEW] ? "yes" : "no");
+	if (b != buffer) {
+		/* strip the trailing space before logging */
+		*(b-1) = 0;
+		drbd_info(resource, "%s%s%s%s%s\n", prefix, buffer,
+			tag ? " [" : "", tag ?: "", tag ? "]" : "");
 	}
 
-	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
-	if (nc) {
-		if (!nc->two_primaries && ns.role == R_PRIMARY) {
-			if (ns.peer == R_PRIMARY)
-				rv = SS_TWO_PRIMARIES;
-			else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
-				rv = SS_O_VOL_PEER_PRI;
+	for_each_connection(connection, resource) {
+		enum drbd_conn_state *cstate = connection->cstate;
+		enum drbd_role *peer_role = connection->peer_role;
+
+		b = buffer;
+		if (cstate[OLD] != cstate[NEW])
+			b += scnprintf(b, end - b, "conn( %s -> %s ) ",
+				       drbd_conn_str(cstate[OLD]),
+				       drbd_conn_str(cstate[NEW]));
+		if (peer_role[OLD] != peer_role[NEW])
+			b += scnprintf(b, end - b, "peer( %s -> %s ) ",
+				       drbd_role_str(peer_role[OLD]),
+				       drbd_role_str(peer_role[NEW]));
+
+		if (b != buffer) {
+			*(b-1) = 0;
+			drbd_info(connection, "%s%s%s%s%s\n", prefix, buffer,
+				tag ? " [" : "", tag ?: "", tag ? "]" : "");
 		}
 	}
 
-	if (rv <= 0)
-		goto out; /* already found a reason to abort */
-	else if (ns.role == R_SECONDARY && device->open_cnt)
-		rv = SS_DEVICE_IN_USE;
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+		enum drbd_disk_state *disk_state = device->disk_state;
+		bool *have_quorum = device->have_quorum;
+
+		b = buffer;
+		if (disk_state[OLD] != disk_state[NEW])
+			b += scnprintf(b, end - b, "disk( %s -> %s ) ",
+				       drbd_disk_str(disk_state[OLD]),
+				       drbd_disk_str(disk_state[NEW]));
+		if (have_quorum[OLD] != have_quorum[NEW])
+			b += scnprintf(b, end - b, "quorum( %s -> %s ) ",
+				       have_quorum[OLD] ? "yes" : "no",
+				       have_quorum[NEW] ? "yes" : "no");
+		if (b != buffer) {
+			*(b-1) = 0;
+			drbd_info(device, "%s%s%s%s%s\n", prefix, buffer,
+				tag ? " [" : "", tag ?: "", tag ? "]" : "");
+		}
 
-	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
-		rv = SS_NO_UP_TO_DATE_DISK;
+		for_each_peer_device(peer_device, device) {
+			enum drbd_disk_state *peer_disk_state = peer_device->disk_state;
+			enum drbd_repl_state *repl_state = peer_device->repl_state;
+			bool *replication = peer_device->replication;
+			bool *peer_replication = peer_device->peer_replication;
+
+			b = buffer;
+			if (peer_disk_state[OLD] != peer_disk_state[NEW])
+				b += scnprintf(b, end - b, "pdsk( %s -> %s ) ",
+					       drbd_disk_str(peer_disk_state[OLD]),
+					       drbd_disk_str(peer_disk_state[NEW]));
+			if (repl_state[OLD] != repl_state[NEW])
+				b += scnprintf(b, end - b, "repl( %s -> %s ) ",
+					       drbd_repl_str(repl_state[OLD]),
+					       drbd_repl_str(repl_state[NEW]));
+
+			if (resync_suspended(peer_device, OLD) !=
+			    resync_suspended(peer_device, NEW)) {
+				b += scnprintf(b, end - b, "resync-susp( ");
+				b += scnprintf_resync_suspend_flags(b, end - b, peer_device, OLD);
+				b += scnprintf(b, end - b, " -> ");
+				b += scnprintf_resync_suspend_flags(b, end - b, peer_device, NEW);
+				b += scnprintf(b, end - b, " ) ");
+			}
 
-	else if (fp >= FP_RESOURCE &&
-		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
-		rv = SS_PRIMARY_NOP;
+			if (replication[OLD] != replication[NEW])
+				b += scnprintf(b, end - b, "replication( %s -> %s ) ",
+					       replication[OLD] ? "yes" : "no",
+					       replication[NEW] ? "yes" : "no");
 
-	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
-		rv = SS_NO_UP_TO_DATE_DISK;
+			if (peer_replication[OLD] != peer_replication[NEW])
+				b += scnprintf(b, end - b, "peer_replication( %s -> %s ) ",
+					       peer_replication[OLD] ? "yes" : "no",
+					       peer_replication[NEW] ? "yes" : "no");
 
-	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
-		rv = SS_NO_LOCAL_DISK;
+			if (b != buffer) {
+				*(b-1) = 0;
+				drbd_info(peer_device, "%s%s%s%s%s\n", prefix, buffer,
+					tag ? " [" : "", tag ?: "", tag ? "]" : "");
+			}
+		}
+	}
+}
 
-	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
-		rv = SS_NO_REMOTE_DISK;
+static bool local_disk_may_be_outdated(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
 
-	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
-		rv = SS_NO_UP_TO_DATE_DISK;
+	if (device->resource->role[NEW] == R_PRIMARY) {
+		for_each_peer_device(peer_device, device) {
+			if (peer_device->disk_state[NEW] == D_UP_TO_DATE &&
+			    peer_device->repl_state[NEW] == L_WF_BITMAP_T)
+				return true;
+		}
+		return false;
+	}
 
-	else if ((ns.conn == C_CONNECTED ||
-		  ns.conn == C_WF_BITMAP_S ||
-		  ns.conn == C_SYNC_SOURCE ||
-		  ns.conn == C_PAUSED_SYNC_S) &&
-		  ns.disk == D_OUTDATED)
-		rv = SS_CONNECTED_OUTDATES;
+	for_each_peer_device(peer_device, device) {
+		if (peer_device->connection->peer_role[NEW] == R_PRIMARY &&
+		    peer_device->repl_state[NEW] > L_OFF)
+			goto have_primary_neighbor;
+	}
 
-	else if (nc && (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-		 (nc->verify_alg[0] == 0))
-		rv = SS_NO_VERIFY_ALG;
+	return true;	/* No neighbor primary, I might be outdated */
 
-	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-		  first_peer_device(device)->connection->agreed_pro_version < 88)
-		rv = SS_NOT_SUPPORTED;
+have_primary_neighbor:
+	/* Allow self outdating while connecting to a diskless primary. */
+	if (peer_device->disk_state[NEW] == D_DISKLESS &&
+	    peer_device->repl_state[OLD] == L_OFF && peer_device->repl_state[NEW] == L_ESTABLISHED)
+		return true;
 
-	else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
-		rv = SS_NO_UP_TO_DATE_DISK;
+	for_each_peer_device(peer_device, device) {
+		enum drbd_repl_state repl_state = peer_device->repl_state[NEW];
+		switch (repl_state) {
+		case L_WF_BITMAP_S:
+		case L_STARTING_SYNC_S:
+		case L_SYNC_SOURCE:
+		case L_PAUSED_SYNC_S:
+		case L_AHEAD:
+		case L_ESTABLISHED:
+		case L_VERIFY_S:
+		case L_VERIFY_T:
+		case L_OFF:
+			continue;
+		case L_WF_SYNC_UUID:
+		case L_WF_BITMAP_T:
+		case L_STARTING_SYNC_T:
+		case L_SYNC_TARGET:
+		case L_PAUSED_SYNC_T:
+		case L_BEHIND:
+			return true;
+		}
+	}
 
-	else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
-                 ns.pdsk == D_UNKNOWN)
-		rv = SS_NEED_CONNECTION;
+	return false;
+}
 
-	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
-		rv = SS_CONNECTED_OUTDATES;
+static int calc_quorum_at(s32 setting, int voters)
+{
+	int quorum_at;
 
-out:
-	rcu_read_unlock();
+	switch (setting) {
+	case QOU_MAJORITY:
+		quorum_at = voters / 2 + 1;
+		break;
+	case QOU_ALL:
+		quorum_at = voters;
+		break;
+	default:
+		quorum_at = setting;
+	}
 
-	return rv;
+	return quorum_at;
 }
 
-/**
- * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
- * This function limits state transitions that may be declined by DRBD. I.e.
- * user requests (aka soft transitions).
- * @os:		old state.
- * @ns:		new state.
- * @connection:  DRBD connection.
- */
-static enum drbd_state_rv
-is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+static void __calc_quorum_with_disk(struct drbd_device *device, struct quorum_detail *qd)
 {
-	enum drbd_state_rv rv = SS_SUCCESS;
-
-	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
-	    os.conn > C_CONNECTED)
-		rv = SS_RESYNC_RUNNING;
+	struct drbd_resource *resource = device->resource;
+	const u64 quorumless_nodes = device->have_quorum[NOW] ? ~resource->members : 0;
+	const int my_node_id = resource->res_opts.node_id;
+	int node_id;
 
-	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
-		rv = SS_ALREADY_STANDALONE;
+	check_wrongly_set_mdf_exists(device);
 
-	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
-		rv = SS_IS_DISKLESS;
+	rcu_read_lock();
+	for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) {
+		struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id];
+		struct drbd_peer_device *peer_device;
+		enum drbd_disk_state disk_state;
+		enum drbd_repl_state repl_state;
+		bool is_intentional_diskless, is_tiebreaker;
+		struct net_conf *nc;
+
+		if (node_id == my_node_id) {
+			disk_state = device->disk_state[NEW];
+			if (disk_state > D_DISKLESS) {
+				if (disk_state == D_UP_TO_DATE)
+					qd->up_to_date++;
+				else
+					qd->present++;
+			}
+			continue;
+		}
 
-	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
-		rv = SS_NO_NET_CONFIG;
+		/* Ignore non-existent nodes.
+		   Note: a fresh (before connected once), intentional diskless peer
+		   gets ignored as well by this.
+		   A fresh diskful peer counts! (since it has MDF_HAVE_BITMAP) */
+		if (!(peer_md->flags & (MDF_HAVE_BITMAP | MDF_NODE_EXISTS | MDF_PEER_DEVICE_SEEN)))
+			continue;
 
-	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
-		rv = SS_LOWER_THAN_OUTDATED;
+		peer_device = peer_device_by_node_id(device, node_id);
 
-	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
-		rv = SS_IN_TRANSIENT_STATE;
+		if (peer_device) {
+			is_intentional_diskless = !want_bitmap(peer_device);
+			nc = rcu_dereference(peer_device->connection->transport.net_conf);
+			is_tiebreaker = rcu_dereference(peer_device->conf)->peer_tiebreaker;
+			if (nc && !nc->allow_remote_read) {
+				dynamic_drbd_dbg(peer_device,
+						 "Excluding from quorum calculation because allow-remote-read = no\n");
+				continue;
+			}
+		} else {
+			is_intentional_diskless = !(peer_md->flags & MDF_PEER_DEVICE_SEEN);
+			is_tiebreaker = true;
+		}
 
-	/* While establishing a connection only allow cstate to change.
-	   Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
-	if (test_bit(STATE_SENT, &connection->flags) &&
-	    !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
-	      (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
-		rv = SS_IN_TRANSIENT_STATE;
+		if (is_intentional_diskless && !is_tiebreaker)
+			continue;
 
-	/* Do not promote during resync handshake triggered by "force primary".
-	 * This is a hack. It should really be rejected by the peer during the
-	 * cluster wide state change request. */
-	if (os.role != R_PRIMARY && ns.role == R_PRIMARY
-		&& ns.pdsk == D_UP_TO_DATE
-		&& ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
-		&& (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
-			rv = SS_IN_TRANSIENT_STATE;
+		repl_state = peer_device ? peer_device->repl_state[NEW] : L_OFF;
+		disk_state = peer_device ? peer_device->disk_state[NEW] : D_UNKNOWN;
+
+		if (repl_state == L_OFF) {
+			if (is_intentional_diskless)
+				/* device should be diskless but is absent */
+				qd->missing_diskless++;
+			else if (disk_state <= D_OUTDATED || peer_md->flags & MDF_PEER_OUTDATED)
+				qd->outdated++;
+			else if (NODE_MASK(node_id) & quorumless_nodes)
+				qd->quorumless++;
+			else
+				qd->unknown++;
+		} else {
+			if (disk_state == D_DISKLESS && is_intentional_diskless)
+				qd->diskless++;
+			else if (disk_state == D_UP_TO_DATE)
+				qd->up_to_date++;
+			else
+				qd->present++;
+		}
+	}
+	rcu_read_unlock();
+}
 
-	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
-		rv = SS_NEED_CONNECTION;
+static void __calc_quorum_no_disk(struct drbd_device *device, struct quorum_detail *qd)
+{
+	struct drbd_resource *resource = device->resource;
+	const u64 quorumless_nodes = device->have_quorum[NOW] ? ~resource->members : 0;
+	struct drbd_peer_device *peer_device;
+	bool is_intentional_diskless;
+
+	if (device->disk_state[NEW] == D_DISKLESS) {
+		/* We only want to consider ourselves as a diskless node when
+		 * we actually intended to be diskless in the config. Otherwise,
+		 * we shouldn't get a vote in the quorum process, so count
+		 * ourselves as unknown. */
+		if (device->device_conf.intentional_diskless)
+			qd->diskless++;
+		else
+			qd->unknown++;
+	}
 
-	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-	    ns.conn != os.conn && os.conn > C_CONNECTED)
-		rv = SS_RESYNC_RUNNING;
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		enum drbd_disk_state disk_state;
+		enum drbd_repl_state repl_state;
+		struct net_conf *nc;
+		bool is_tiebreaker;
+
+		repl_state = peer_device->repl_state[NEW];
+		disk_state = peer_device->disk_state[NEW];
+
+		is_intentional_diskless = !want_bitmap(peer_device);
+		nc = rcu_dereference(peer_device->connection->transport.net_conf);
+		is_tiebreaker = rcu_dereference(peer_device->conf)->peer_tiebreaker;
+		if (nc && !nc->allow_remote_read) {
+			dynamic_drbd_dbg(peer_device,
+					 "Excluding from quorum calculation because allow-remote-read = no\n");
+			continue;
+		}
+		if (is_intentional_diskless && !is_tiebreaker)
+			continue;
 
-	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
-	    os.conn < C_CONNECTED)
-		rv = SS_NEED_CONNECTION;
+		if (repl_state == L_OFF) {
+			if (is_intentional_diskless)
+				/* device should be diskless but is absent */
+				qd->missing_diskless++;
+			else if (disk_state <= D_OUTDATED)
+				qd->outdated++;
+			else if (NODE_MASK(peer_device->node_id) & quorumless_nodes)
+				qd->quorumless++;
+			else
+				qd->unknown++;
+		} else {
+			if (disk_state == D_DISKLESS && is_intentional_diskless)
+				qd->diskless++;
+			else if (disk_state == D_UP_TO_DATE)
+				qd->up_to_date++;
+			else
+				qd->present++;
+		}
 
-	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
-	    && os.conn < C_WF_REPORT_PARAMS)
-		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+		if (disk_state == D_UP_TO_DATE && test_bit(PEER_QUORATE, &peer_device->flags))
+			qd->quorate_peers++;
 
-	if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
-	    os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
-		rv = SS_OUTDATE_WO_CONN;
 
-	return rv;
+	}
+	rcu_read_unlock();
 }
 
-static enum drbd_state_rv
-is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+static bool calc_quorum(struct drbd_device *device, struct quorum_info *qi)
 {
-	/* no change -> nothing to do, at least for the connection part */
-	if (oc == nc)
-		return SS_NOTHING_TO_DO;
-
-	/* disconnect of an unconfigured connection does not make sense */
-	if (oc == C_STANDALONE && nc == C_DISCONNECTING)
-		return SS_ALREADY_STANDALONE;
+	struct drbd_resource *resource = device->resource;
+	int voters, quorum_at, diskless_majority_at, min_redundancy_at;
+	struct quorum_detail qd = {};
+	bool have_quorum;
 
-	/* from C_STANDALONE, we start with C_UNCONNECTED */
-	if (oc == C_STANDALONE && nc != C_UNCONNECTED)
-		return SS_NEED_CONNECTION;
+	if (device->disk_state[NEW] > D_ATTACHING && get_ldev_if_state(device, D_ATTACHING)) {
+		__calc_quorum_with_disk(device, &qd);
+		put_ldev(device);
+	} else {
+		__calc_quorum_no_disk(device, &qd);
+	}
 
-	/* When establishing a connection we need to go through WF_REPORT_PARAMS!
-	   Necessary to do the right thing upon invalidate-remote on a disconnected resource */
-	if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
-		return SS_NEED_CONNECTION;
+	/* Check if a partition containing all missing nodes might have quorum */
+	voters = qd.outdated + qd.quorumless + qd.unknown + qd.up_to_date + qd.present;
+	quorum_at = calc_quorum_at(resource->res_opts.quorum, voters);
+	if (qd.outdated + qd.quorumless + qd.unknown >= quorum_at) {
+		/* when the missing nodes have the quorum, give up the quorumless */
+		qd.unknown += qd.quorumless;
+		qd.quorumless = 0;
+	}
 
-	/* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
-	if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
-		return SS_IN_TRANSIENT_STATE;
+	/* When all the absent nodes are D_OUTDATED (none D_UNKNOWN), we can be
+	   sure that the other partition is not able to promote. ->
+	   We remove them from the voters. -> We have quorum */
+	if (qd.unknown)
+		voters = qd.outdated + qd.quorumless + qd.unknown + qd.up_to_date + qd.present;
+	else
+		voters = qd.up_to_date + qd.present;
+
+	quorum_at = calc_quorum_at(resource->res_opts.quorum, voters);
+	diskless_majority_at = calc_quorum_at(QOU_MAJORITY, qd.diskless + qd.missing_diskless);
+	min_redundancy_at = calc_quorum_at(resource->res_opts.quorum_min_redundancy, voters);
+
+	if (qi) {
+		qi->voters = voters;
+		qi->up_to_date = qd.up_to_date;
+		qi->present = qd.present;
+		qi->quorum_at = quorum_at;
+		qi->min_redundancy_at = min_redundancy_at;
+	}
 
-	/* After C_DISCONNECTING only C_STANDALONE may follow */
-	if (oc == C_DISCONNECTING && nc != C_STANDALONE)
-		return SS_IN_TRANSIENT_STATE;
+	have_quorum = qd.quorate_peers ||
+		((qd.up_to_date + qd.present) >= quorum_at && qd.up_to_date >= min_redundancy_at);
+
+	if (!have_quorum && voters != 0 && voters % 2 == 0 && qd.up_to_date + qd.present == quorum_at - 1 &&
+		/* It is an even number of nodes (think 2) and we failed by one vote.
+		   Check if we have majority of the diskless nodes connected.
+		   Using the diskless nodes as a tie-breaker! */
+	    qd.diskless >= diskless_majority_at && device->have_quorum[NOW]) {
+		have_quorum = true;
+		if (!test_bit(TIEBREAKER_QUORUM, &device->flags)) {
+			set_bit(TIEBREAKER_QUORUM, &device->flags);
+			drbd_info(device, "Would lose quorum, but using tiebreaker logic to keep\n");
+		}
+	} else {
+		clear_bit(TIEBREAKER_QUORUM, &device->flags);
+	}
 
-	return SS_SUCCESS;
+	return have_quorum;
 }
 
+static __printf(2, 3) void _drbd_state_err(struct change_context *context, const char *fmt, ...)
+{
+	struct drbd_resource *resource = context->resource;
+	const char *err_str;
+	va_list args;
+
+	va_start(args, fmt);
+	err_str = kvasprintf(GFP_ATOMIC, fmt, args);
+	va_end(args);
+	if (!err_str)
+		return;
+	if (context->flags & CS_VERBOSE)
+		drbd_err(resource, "%s\n", err_str);
+
+	if (context->err_str)
+		*context->err_str = err_str;
+	else
+		kfree(err_str);
+}
+
+static __printf(2, 3) void drbd_state_err(struct drbd_resource *resource, const char *fmt, ...)
+{
+	const char *err_str;
+	va_list args;
+
+	va_start(args, fmt);
+	err_str = kvasprintf(GFP_ATOMIC, fmt, args);
+	va_end(args);
+	if (!err_str)
+		return;
+	if (resource->state_change_flags & CS_VERBOSE)
+		drbd_err(resource, "%s\n", err_str);
+
+	if (resource->state_change_err_str)
+		*resource->state_change_err_str = err_str;
+	else
+		kfree(err_str);
+}
+
+static enum drbd_state_rv __is_valid_soft_transition(struct drbd_resource *resource)
+{
+	enum drbd_role *role = resource->role;
+	bool *fail_io = resource->fail_io;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	bool in_handshake = false;
+	int vnr;
+
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	if (role[OLD] != R_PRIMARY && role[NEW] == R_PRIMARY) {
+		for_each_connection_rcu(connection, resource) {
+			struct net_conf *nc;
+
+			nc = rcu_dereference(connection->transport.net_conf);
+			if (!nc || nc->two_primaries)
+				continue;
+			if (connection->peer_role[NEW] == R_PRIMARY)
+				return SS_TWO_PRIMARIES;
+		}
+	}
+
+	for_each_connection_rcu(connection, resource) {
+		struct drbd_peer_device *peer_device;
+
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			if (test_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags) &&
+			    peer_device->repl_state[NOW] == L_OFF) {
+				in_handshake = true;
+				goto handshake_found;
+			}
+		}
+	}
+handshake_found:
+
+	if (in_handshake && role[OLD] != role[NEW])
+		return SS_IN_TRANSIENT_STATE;
+
+	if (role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY && fail_io[NEW])
+		return SS_DEVICE_IN_USE;
+
+	for_each_connection_rcu(connection, resource) {
+		enum drbd_conn_state *cstate = connection->cstate;
+		enum drbd_role *peer_role = connection->peer_role;
+		struct net_conf *nc;
+		bool two_primaries;
+
+		if (cstate[NEW] == C_DISCONNECTING && cstate[OLD] == C_STANDALONE)
+			return SS_ALREADY_STANDALONE;
+
+		if (cstate[NEW] == C_CONNECTING && cstate[OLD] < C_UNCONNECTED)
+			return SS_NO_NET_CONFIG;
+
+		if (cstate[NEW] == C_DISCONNECTING && cstate[OLD] == C_UNCONNECTED)
+			return SS_IN_TRANSIENT_STATE;
+
+		nc = rcu_dereference(connection->transport.net_conf);
+		two_primaries = nc ? nc->two_primaries : false;
+		if (peer_role[NEW] == R_PRIMARY && peer_role[OLD] != R_PRIMARY && !two_primaries) {
+			if (role[NOW] == R_PRIMARY)
+				return SS_TWO_PRIMARIES;
+			if (!fail_io[NEW]) {
+				idr_for_each_entry(&resource->devices, device, vnr) {
+					if (!device->writable && device->open_cnt)
+						return SS_PRIMARY_READER;
+					/*
+					 * One might be tempted to add "|| open_rw_cont" here.
+					 * That is wrong. The promotion of a rw opener will be
+					 * handled in its own two-phase commit.
+					 * Returning SS_PRIMARY_READER for a rw_opener might
+					 * causes confusion for the caller, if that then waits
+					 * for the read-only openers to go away.
+					 */
+				}
+			}
+		}
+	}
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		enum drbd_disk_state *disk_state = device->disk_state;
+		struct drbd_peer_device *peer_device;
+		bool any_disk_up_to_date[2];
+		enum which_state which;
+		int nr_negotiating = 0;
+
+		if (in_handshake &&
+		    ((disk_state[OLD] < D_ATTACHING && disk_state[NEW] == D_ATTACHING) ||
+		     (disk_state[OLD] > D_DETACHING && disk_state[NEW] == D_DETACHING)))
+			return SS_IN_TRANSIENT_STATE;
+
+		if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY && device->writable &&
+		    !(resource->state_change_flags & CS_FS_IGN_OPENERS))
+			return SS_DEVICE_IN_USE;
+
+		if (disk_state[NEW] > D_ATTACHING && disk_state[OLD] == D_DISKLESS)
+			return SS_IS_DISKLESS;
+
+		if (disk_state[NEW] == D_OUTDATED && disk_state[OLD] < D_OUTDATED &&
+		    disk_state[OLD] != D_ATTACHING && disk_state[OLD] != D_NEGOTIATING) {
+			/* Do not allow outdate of inconsistent or diskless.
+			   But we have to allow Inconsistent -> Outdated if a resync
+			   finishes over one connection, and is paused on other connections */
+
+			for_each_peer_device_rcu(peer_device, device) {
+				enum drbd_repl_state *repl_state = peer_device->repl_state;
+				if (repl_state[OLD] == L_SYNC_TARGET && repl_state[NEW] == L_ESTABLISHED)
+					goto allow;
+			}
+			return SS_LOWER_THAN_OUTDATED;
+		}
+		allow:
+
+		for (which = OLD; which <= NEW; which++)
+			any_disk_up_to_date[which] = drbd_data_accessible(device, which);
+
+		/* Prevent becoming primary while there is no data accessible
+		   and prevent detach or disconnect while primary */
+		if (!(role[OLD] == R_PRIMARY && !any_disk_up_to_date[OLD]) &&
+		     (role[NEW] == R_PRIMARY && !any_disk_up_to_date[NEW]))
+			return SS_NO_UP_TO_DATE_DISK;
+
+		/* Prevent detach or disconnect while held open read only */
+		if (!device->writable && device->open_cnt &&
+		    any_disk_up_to_date[OLD] && !any_disk_up_to_date[NEW])
+			return SS_NO_UP_TO_DATE_DISK;
+
+		if (disk_state[NEW] == D_NEGOTIATING)
+			nr_negotiating++;
+
+		/* Prevent promote when there is no quorum and
+		 * prevent graceful disconnect/detach that would kill quorum
+		 */
+		if ((role[OLD] == R_SECONDARY || device->have_quorum[OLD]) &&
+		    role[NEW] == R_PRIMARY && !device->have_quorum[NEW]) {
+			struct quorum_info qi;
+
+			calc_quorum(device, &qi);
+
+			if (disk_state[NEW] <= D_ATTACHING)
+				drbd_state_err(resource, "no UpToDate peer with quorum");
+			else if (qi.up_to_date + qi.present < qi.quorum_at)
+				drbd_state_err(resource, "%d of %d nodes visible, need %d for quorum",
+					       qi.up_to_date + qi.present, qi.voters, qi.quorum_at);
+			else if (qi.up_to_date < qi.min_redundancy_at)
+				drbd_state_err(resource, "%d of %d nodes up_to_date, need %d for "
+					       "quorum-minimum-redundancy",
+					       qi.up_to_date, qi.voters, qi.min_redundancy_at);
+			return SS_NO_QUORUM;
+		}
+
+		for_each_peer_device_rcu(peer_device, device) {
+			enum drbd_disk_state *peer_disk_state = peer_device->disk_state;
+			enum drbd_repl_state *repl_state = peer_device->repl_state;
+
+			if (peer_disk_state[NEW] == D_NEGOTIATING)
+				nr_negotiating++;
+
+			if (nr_negotiating > 1)
+				return SS_IN_TRANSIENT_STATE;
+
+			if (peer_device->connection->fencing_policy >= FP_RESOURCE &&
+			    !(role[OLD] == R_PRIMARY && repl_state[OLD] < L_ESTABLISHED && !(peer_disk_state[OLD] <= D_OUTDATED)) &&
+			     (role[NEW] == R_PRIMARY && repl_state[NEW] < L_ESTABLISHED && !(peer_disk_state[NEW] <= D_OUTDATED)))
+				return SS_PRIMARY_NOP;
+
+			if (!(repl_state[OLD] > L_ESTABLISHED && disk_state[OLD] < D_INCONSISTENT) &&
+			     (repl_state[NEW] > L_ESTABLISHED && disk_state[NEW] < D_INCONSISTENT))
+				return SS_NO_LOCAL_DISK;
+
+			if (!(repl_state[OLD] > L_ESTABLISHED && peer_disk_state[OLD] < D_INCONSISTENT) &&
+			     (repl_state[NEW] > L_ESTABLISHED && peer_disk_state[NEW] < D_INCONSISTENT))
+				return SS_NO_REMOTE_DISK;
+
+			if (disk_state[OLD] > D_OUTDATED && disk_state[NEW] == D_OUTDATED &&
+			    !local_disk_may_be_outdated(device))
+				return SS_CONNECTED_OUTDATES;
+
+			if (!(repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) &&
+			     (repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T)) {
+				struct net_conf *nc = rcu_dereference(peer_device->connection->transport.net_conf);
+
+				if (!nc || nc->verify_alg[0] == 0)
+					return SS_NO_VERIFY_ALG;
+			}
+
+			if (!(repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) &&
+			     (repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) &&
+				  peer_device->connection->agreed_pro_version < 88)
+				return SS_NOT_SUPPORTED;
+
+			if (repl_is_sync_source(repl_state[OLD]) &&
+					repl_state[NEW] == L_WF_BITMAP_S)
+				return SS_RESYNC_RUNNING;
+
+			if (repl_is_sync_target(repl_state[OLD]) &&
+					repl_state[NEW] == L_WF_BITMAP_T)
+				return SS_RESYNC_RUNNING;
+
+			if (repl_state[NEW] != repl_state[OLD] &&
+			    (repl_state[NEW] == L_STARTING_SYNC_T || repl_state[NEW] == L_STARTING_SYNC_S) &&
+			    repl_state[OLD] > L_ESTABLISHED)
+				return SS_RESYNC_RUNNING;
+
+			if ((repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) && repl_state[OLD] < L_ESTABLISHED)
+				return SS_NEED_CONNECTION;
+
+			if ((repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T) &&
+			    repl_state[NEW] != repl_state[OLD] && repl_state[OLD] > L_ESTABLISHED)
+				return SS_RESYNC_RUNNING;
+
+			if ((repl_state[NEW] == L_STARTING_SYNC_S || repl_state[NEW] == L_STARTING_SYNC_T) &&
+			    repl_state[OLD] < L_ESTABLISHED)
+				return SS_NEED_CONNECTION;
+
+			if ((repl_state[NEW] == L_SYNC_TARGET || repl_state[NEW] == L_SYNC_SOURCE)
+			    && repl_state[OLD] < L_OFF)
+				return SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+			if ((peer_disk_state[NEW] > D_DISKLESS && peer_disk_state[NEW] != D_UNKNOWN) &&
+			    peer_disk_state[OLD] == D_DISKLESS && !want_bitmap(peer_device))
+				return SS_ATTACH_NO_BITMAP;  /* peer with --bitmap=no wants to attach ??? */
+		}
+	}
+
+	return SS_SUCCESS;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if state[NEW] is not valid
+ *
+ * "Soft" transitions are voluntary state changes which drbd may decline, such
+ * as a user request to promote a resource to primary.  Opposed to that are
+ * involuntary or "hard" transitions like a network connection loss.
+ *
+ * When deciding if a "soft" transition should be allowed, "hard" transitions
+ * may already have forced the resource into a critical state.  It may take
+ * several "soft" transitions to get the resource back to normal.  To allow
+ * those, rather than checking if the desired new state is valid, we can only
+ * check if the desired new state is "at least as good" as the current state.
+ *
+ * @resource:	DRBD resource
+ */
+static enum drbd_state_rv is_valid_soft_transition(struct drbd_resource *resource)
+{
+	enum drbd_state_rv rv;
+
+	rcu_read_lock();
+	rv = __is_valid_soft_transition(resource);
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conn_state oc, enum drbd_conn_state nc)
+{
+	/* no change -> nothing to do, at least for the connection part */
+	if (oc == nc)
+		return SS_NOTHING_TO_DO;
+
+	/* disconnect of an unconfigured connection does not make sense */
+	if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+		return SS_ALREADY_STANDALONE;
+
+	/* from C_STANDALONE, we start with C_UNCONNECTED */
+	if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+		return SS_NEED_CONNECTION;
+
+	/* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+	if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+		return SS_IN_TRANSIENT_STATE;
+
+	/* After C_DISCONNECTING only C_STANDALONE may follow */
+	if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+		return SS_IN_TRANSIENT_STATE;
+
+	return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts that are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @resource: DRBD resource.
+ */
+static enum drbd_state_rv is_valid_transition(struct drbd_resource *resource)
+{
+	enum drbd_state_rv rv;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	int vnr;
+
+	for_each_connection(connection, resource) {
+		rv = is_valid_conn_transition(connection->cstate[OLD], connection->cstate[NEW]);
+		if (rv < SS_SUCCESS)
+			return rv;
+	}
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		/* we cannot fail (again) if we already detached */
+		if ((device->disk_state[NEW] == D_FAILED || device->disk_state[NEW] == D_DETACHING) &&
+		    device->disk_state[OLD] == D_DISKLESS) {
+			return SS_IS_DISKLESS;
+		}
+	}
+
+	return SS_SUCCESS;
+}
+
+static bool is_sync_target_other_c(struct drbd_peer_device *ign_peer_device)
+{
+	struct drbd_device *device = ign_peer_device->device;
+	struct drbd_peer_device *peer_device;
+
+	for_each_peer_device(peer_device, device) {
+		enum drbd_repl_state r;
+
+		if (peer_device == ign_peer_device)
+			continue;
+
+		r = peer_device->repl_state[NEW];
+		if (r == L_SYNC_TARGET || r == L_PAUSED_SYNC_T)
+			return true;
+	}
+
+	return false;
+}
+
+static void drbd_start_other_targets_paused(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_device *p;
+
+	for_each_peer_device(p, device) {
+		if (p == peer_device)
+			continue;
+
+		if (p->disk_state[NEW] >= D_INCONSISTENT && p->repl_state[NEW] == L_ESTABLISHED)
+			p->repl_state[NEW] = L_PAUSED_SYNC_T;
+	}
+}
+
+static bool drbd_is_sync_target_candidate(struct drbd_peer_device *peer_device)
+{
+	if (!repl_is_sync_target(peer_device->repl_state[NEW]))
+		return false;
+
+	if (peer_device->resync_susp_dependency[NEW] ||
+			peer_device->resync_susp_peer[NEW] ||
+			peer_device->resync_susp_user[NEW])
+		return false;
+
+	if (peer_device->disk_state[NEW] < D_OUTDATED)
+		return false;
+
+	return true;
+
+}
+
+static void drbd_select_sync_target(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_peer_device *target_current = NULL;
+	struct drbd_peer_device *target_active = NULL;
+	struct drbd_peer_device *target_desired = NULL;
+
+	/* Find current and active resync peers. */
+	for_each_peer_device_rcu(peer_device, device) {
+		if (peer_device->repl_state[OLD] == L_SYNC_TARGET && drbd_is_sync_target_candidate(peer_device))
+			target_current = peer_device;
+
+		if (peer_device->resync_active[NEW])
+			target_active = peer_device;
+	}
+
+	/* Choose desired resync peer. */
+	for_each_peer_device_rcu(peer_device, device) {
+		if (!drbd_is_sync_target_candidate(peer_device))
+			continue;
+
+		if (target_desired && drbd_bm_total_weight(peer_device) > drbd_bm_total_weight(target_desired))
+			continue;
+
+		target_desired = peer_device;
+	}
+
+	/* Keep the current resync target if the alternative has less than 1MiB of
+	 * storage (256 bitmap bits) less to resync. */
+	if (target_current && target_desired &&
+			drbd_bm_total_weight(target_current) < drbd_bm_total_weight(target_desired) + 256UL)
+		target_desired = target_current;
+
+	/* Do not activate/unpause a resync if some other is still active. */
+	if (target_desired && target_active && target_desired != target_active)
+		target_desired = NULL;
+
+	/* Activate resync (if not already active). */
+	if (target_desired)
+		target_desired->resync_active[NEW] = true;
+
+	/* Make sure that the targets are correctly paused/unpaused. */
+	for_each_peer_device_rcu(peer_device, device) {
+		enum drbd_repl_state *repl_state = peer_device->repl_state;
+
+		peer_device->resync_susp_other_c[NEW] = target_desired && peer_device != target_desired;
+
+		if (!repl_is_sync_target(repl_state[NEW]))
+			continue;
+
+		peer_device->repl_state[NEW] = peer_device == target_desired ? L_SYNC_TARGET : L_PAUSED_SYNC_T;
+	}
+}
+
+static bool drbd_change_to_inconsistent(enum drbd_disk_state *disk_state,
+		enum drbd_conn_state *cstate)
+{
+	return !(disk_state[OLD] == D_INCONSISTENT && cstate[OLD] == C_CONNECTED) &&
+		(disk_state[NEW] == D_INCONSISTENT && cstate[NEW] == C_CONNECTED);
+}
+
+/**
+ * sanitize_state() - enforce cross-object invariants on the proposed [NEW] state
+ * @resource: resource whose [NEW] state entries are adjusted in place.
+ *
+ * Called with a proposed state transition recorded in the [NEW] slots of
+ * the per-object state arrays.  Walks all connections, devices and peer
+ * devices under rcu_read_lock() and rewrites the [NEW] values so that the
+ * resulting resource-wide state is self-consistent: it resolves disk-state
+ * negotiation, picks/pauses resync targets, clamps disk states to what the
+ * replication state permits, evaluates quorum and decides about suspending
+ * or failing I/O at the resource level.
+ */
+static void sanitize_state(struct drbd_resource *resource)
+{
+	enum drbd_role *role = resource->role;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	bool maybe_crashed_primary = false;
+	bool volume_lost_data_access = false;
+	bool volumes_have_data_access = true;
+	bool resource_has_quorum = true;
+	int connected_primaries = 0;
+	int vnr;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		enum drbd_conn_state *cstate = connection->cstate;
+
+		/* We cannot know the role of a peer we are not connected to. */
+		if (cstate[NEW] < C_CONNECTED)
+			connection->peer_role[NEW] = R_UNKNOWN;
+
+		/* A Primary peer that goes away through a failure path (or a
+		 * forced disconnect) may have crashed; remember that so the
+		 * local disk can be demoted from D_UP_TO_DATE further down. */
+		if (connection->peer_role[OLD] == R_PRIMARY && cstate[OLD] == C_CONNECTED &&
+		    ((cstate[NEW] >= C_TIMEOUT && cstate[NEW] <= C_PROTOCOL_ERROR) ||
+		     (cstate[NEW] == C_DISCONNECTING && resource->state_change_flags & CS_HARD)))
+			/* implies also C_BROKEN_PIPE and C_NETWORK_FAILURE */
+			maybe_crashed_primary = true;
+
+		if (connection->peer_role[NEW] == R_PRIMARY)
+			connected_primaries++;
+	}
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+		enum drbd_disk_state *disk_state = device->disk_state;
+		bool lost_connection = false;
+		bool have_good_peer = false;
+
+		/* There is nothing to detach without a disk; stay D_DISKLESS. */
+		if (disk_state[OLD] == D_DISKLESS && disk_state[NEW] == D_DETACHING)
+			disk_state[NEW] = D_DISKLESS;
+
+		if ((resource->state_change_flags & CS_IGN_OUTD_FAIL) &&
+		    disk_state[OLD] < D_OUTDATED && disk_state[NEW] == D_OUTDATED)
+			disk_state[NEW] = disk_state[OLD];
+
+		/* Resolve a pending disk-state negotiation once every
+		 * relevant peer has delivered its negotiation result. */
+		if (disk_state[NEW] == D_NEGOTIATING) {
+			int all = 0, target = 0, no_result = 0;
+			bool up_to_date_neighbor = false;
+
+			/* Entering negotiation: reset all previous results. */
+			if (disk_state[OLD] != D_NEGOTIATING) {
+				for_each_peer_device_rcu(peer_device, device)
+					peer_device->negotiation_result = L_NEGOTIATING;
+			}
+
+			for_each_peer_device_rcu(peer_device, device) {
+				enum drbd_repl_state repl_state = peer_device->repl_state[NEW];
+				enum drbd_repl_state nr = peer_device->negotiation_result;
+				enum drbd_disk_state pdsk = peer_device->disk_state[NEW];
+
+				if (pdsk < D_NEGOTIATING || repl_state == L_OFF)
+					continue;
+
+				if (pdsk == D_UP_TO_DATE)
+					up_to_date_neighbor = true;
+
+				all++;
+				if (nr == L_NEG_NO_RESULT)
+					no_result++;
+				else if (nr == L_NEGOTIATING)
+					goto stay_negotiating;
+				else if (nr == L_WF_BITMAP_T)
+					target++;
+				else if (nr != L_ESTABLISHED && nr != L_WF_BITMAP_S)
+					drbd_err(peer_device, "Unexpected nr = %s\n", drbd_repl_str(nr));
+			}
+
+			/* negotiation finished */
+			if (no_result > 0 && no_result == all)
+				disk_state[NEW] = D_DETACHING;
+			else if (target)
+				disk_state[NEW] = D_INCONSISTENT;
+			else
+				disk_state[NEW] = up_to_date_neighbor ? D_UP_TO_DATE :
+					/* ldev_safe: dstate */ disk_state_from_md(device);
+
+			/* Propagate the negotiation results into the per-peer
+			 * replication states. */
+			for_each_peer_device_rcu(peer_device, device) {
+				enum drbd_repl_state nr = peer_device->negotiation_result;
+
+				if (peer_device->connection->cstate[NEW] < C_CONNECTED ||
+				    nr == L_NEGOTIATING)
+					continue;
+
+				if (nr == L_NEG_NO_RESULT)
+					nr = L_ESTABLISHED;
+
+				if (nr == L_WF_BITMAP_S && disk_state[NEW] == D_INCONSISTENT) {
+					/* Should be sync source for one peer and sync
+					   target for an other peer. Delay the sync source
+					   role */
+					nr = L_PAUSED_SYNC_S;
+					peer_device->resync_susp_other_c[NEW] = true;
+					drbd_warn(peer_device, "Finish me\n");
+				}
+				peer_device->repl_state[NEW] = nr;
+			}
+		}
+	stay_negotiating:
+
+		for_each_peer_device_rcu(peer_device, device) {
+			enum drbd_repl_state *repl_state = peer_device->repl_state;
+			enum drbd_disk_state *peer_disk_state = peer_device->disk_state;
+			struct drbd_connection *connection = peer_device->connection;
+			enum drbd_conn_state *cstate = connection->cstate;
+
+			/* A connected D_UP_TO_DATE peer on our exposed data
+			 * generation; used for the UUID-mismatch check below. */
+			if (peer_disk_state[NEW] == D_UP_TO_DATE &&
+			    (device->exposed_data_uuid & ~UUID_PRIMARY) ==
+			    (peer_device->current_uuid & ~UUID_PRIMARY))
+				have_good_peer = true;
+
+			/* Without an established connection the peer's disk
+			 * state is unknowable. */
+			if (repl_state[NEW] < L_ESTABLISHED) {
+				peer_device->resync_susp_peer[NEW] = false;
+				if (peer_disk_state[NEW] > D_UNKNOWN ||
+				    peer_disk_state[NEW] < D_INCONSISTENT)
+					peer_disk_state[NEW] = D_UNKNOWN;
+			}
+			if (repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] < L_ESTABLISHED) {
+				lost_connection = true;
+				peer_device->resync_active[NEW] = false;
+			}
+
+			/* Clear the aftr_isp when becoming unconfigured */
+			if (cstate[NEW] == C_STANDALONE &&
+			    disk_state[NEW] == D_DISKLESS &&
+			    role[NEW] == R_SECONDARY)
+				peer_device->resync_susp_dependency[NEW] = false;
+
+			/* Abort resync if a disk fails/detaches */
+			if (repl_state[NEW] > L_ESTABLISHED &&
+			    (disk_state[NEW] <= D_FAILED ||
+			     peer_disk_state[NEW] <= D_FAILED)) {
+				repl_state[NEW] = L_ESTABLISHED;
+				clear_bit(RECONCILIATION_RESYNC, &peer_device->flags);
+				peer_device->resync_active[NEW] = false;
+			}
+
+			/* Suspend IO while fence-peer handler runs (peer lost) */
+			if (connection->fencing_policy == FP_STONITH &&
+			    (role[NEW] == R_PRIMARY &&
+			     repl_state[NEW] < L_ESTABLISHED &&
+			     peer_disk_state[NEW] == D_UNKNOWN) &&
+			    (role[OLD] != R_PRIMARY ||
+			     peer_disk_state[OLD] != D_UNKNOWN))
+				connection->susp_fen[NEW] = true;
+		}
+
+		drbd_select_sync_target(device);
+
+		for_each_peer_device_rcu(peer_device, device) {
+			enum drbd_repl_state *repl_state = peer_device->repl_state;
+			enum drbd_disk_state *peer_disk_state = peer_device->disk_state;
+			struct drbd_connection *connection = peer_device->connection;
+			enum drbd_conn_state *cstate = connection->cstate;
+			enum drbd_disk_state min_disk_state, max_disk_state;
+			enum drbd_disk_state min_peer_disk_state, max_peer_disk_state;
+			enum drbd_role *peer_role = connection->peer_role;
+			bool uuids_match, cond;
+
+			/* Pause a SyncSource until it finishes resync as target on other connections */
+			if (repl_state[OLD] != L_SYNC_SOURCE && repl_state[NEW] == L_SYNC_SOURCE &&
+			    is_sync_target_other_c(peer_device))
+				peer_device->resync_susp_other_c[NEW] = true;
+
+			/* Map the computed suspend decision onto the repl state. */
+			if (resync_suspended(peer_device, NEW)) {
+				if (repl_state[NEW] == L_SYNC_SOURCE)
+					repl_state[NEW] = L_PAUSED_SYNC_S;
+			} else {
+				if (repl_state[NEW] == L_PAUSED_SYNC_S)
+					repl_state[NEW] = L_SYNC_SOURCE;
+			}
+
+			/* Implication of the repl state on other peer's repl state */
+			if (repl_state[OLD] != L_STARTING_SYNC_T && repl_state[NEW] == L_STARTING_SYNC_T)
+				drbd_start_other_targets_paused(peer_device);
+
+			/* D_CONSISTENT vanish when we get connected (pre 9.0) */
+			if (connection->agreed_pro_version < 110 &&
+			    repl_state[NEW] >= L_ESTABLISHED && repl_state[NEW] < L_AHEAD) {
+				if (disk_state[NEW] == D_CONSISTENT)
+					disk_state[NEW] = D_UP_TO_DATE;
+				if (peer_disk_state[NEW] == D_CONSISTENT)
+					peer_disk_state[NEW] = D_UP_TO_DATE;
+			}
+
+			/* Implications of the repl state on the disk states */
+			min_disk_state = D_DISKLESS;
+			max_disk_state = D_UP_TO_DATE;
+			min_peer_disk_state = D_INCONSISTENT;
+			max_peer_disk_state = D_UNKNOWN;
+			switch (repl_state[NEW]) {
+			case L_OFF:
+				/* values from above */
+				break;
+			case L_WF_BITMAP_T:
+			case L_STARTING_SYNC_T:
+			case L_WF_SYNC_UUID:
+			case L_BEHIND:
+				min_disk_state = D_INCONSISTENT;
+				max_disk_state = D_OUTDATED;
+				min_peer_disk_state = D_INCONSISTENT;
+				max_peer_disk_state = D_UP_TO_DATE;
+				break;
+			case L_VERIFY_S:
+			case L_VERIFY_T:
+				min_disk_state = D_INCONSISTENT;
+				max_disk_state = D_UP_TO_DATE;
+				min_peer_disk_state = D_INCONSISTENT;
+				max_peer_disk_state = D_UP_TO_DATE;
+				break;
+			case L_ESTABLISHED:
+				min_disk_state = D_DISKLESS;
+				max_disk_state = D_UP_TO_DATE;
+				min_peer_disk_state = D_DISKLESS;
+				max_peer_disk_state = D_UP_TO_DATE;
+				break;
+			case L_WF_BITMAP_S:
+			case L_PAUSED_SYNC_S:
+			case L_STARTING_SYNC_S:
+			case L_AHEAD:
+				min_disk_state = D_INCONSISTENT;
+				max_disk_state = D_UP_TO_DATE;
+				min_peer_disk_state = D_INCONSISTENT;
+				max_peer_disk_state = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+				break;
+			case L_PAUSED_SYNC_T:
+			case L_SYNC_TARGET:
+				min_disk_state = D_INCONSISTENT;
+				max_disk_state = D_INCONSISTENT;
+				min_peer_disk_state = D_INCONSISTENT;
+				max_peer_disk_state = D_UP_TO_DATE;
+				break;
+			case L_SYNC_SOURCE:
+				min_disk_state = D_INCONSISTENT;
+				max_disk_state = D_UP_TO_DATE;
+				min_peer_disk_state = D_INCONSISTENT;
+				max_peer_disk_state = D_INCONSISTENT;
+				break;
+			}
+
+			/* Implications of the repl state on the disk states */
+			if (disk_state[NEW] > max_disk_state)
+				disk_state[NEW] = max_disk_state;
+
+			if (disk_state[NEW] < min_disk_state)
+				disk_state[NEW] = min_disk_state;
+
+			if (peer_disk_state[NEW] > max_peer_disk_state)
+				peer_disk_state[NEW] = max_peer_disk_state;
+
+			if (peer_disk_state[NEW] < min_peer_disk_state)
+				peer_disk_state[NEW] = min_peer_disk_state;
+
+			/* A detach is a cluster wide transaction. The peer_disk_state updates
+			   are coming in while we have it prepared. When the cluster wide
+			   state change gets committed prevent D_DISKLESS -> D_FAILED */
+			if (peer_disk_state[OLD] == D_DISKLESS &&
+			    (peer_disk_state[NEW] == D_FAILED || peer_disk_state[NEW] == D_DETACHING))
+				peer_disk_state[NEW] = D_DISKLESS;
+
+			/* Upgrade myself from D_OUTDATED if..
+			   1) We connect to stable D_UP_TO_DATE(or D_CONSISTENT) peer without resync
+			   2) The peer just became stable
+			   3) the peer was stable and just became D_UP_TO_DATE */
+			if (repl_state[NEW] == L_ESTABLISHED && disk_state[NEW] == D_OUTDATED &&
+			    peer_disk_state[NEW] >= D_CONSISTENT && test_bit(UUIDS_RECEIVED, &peer_device->flags) &&
+			    peer_device->uuid_flags & UUID_FLAG_STABLE &&
+			    (repl_state[OLD] < L_ESTABLISHED ||
+			     peer_device->uuid_flags & UUID_FLAG_GOT_STABLE ||
+			     peer_disk_state[OLD] == D_OUTDATED))
+				disk_state[NEW] = peer_disk_state[NEW];
+
+			/* The attempted resync made us D_OUTDATED, roll that back in case */
+			if (repl_state[OLD] == L_WF_BITMAP_T && repl_state[NEW] == L_OFF &&
+			    disk_state[NEW] == D_OUTDATED && stable_up_to_date_neighbor(device) &&
+			    /* ldev_safe: repl_state[OLD] */ may_be_up_to_date(device, NEW))
+				disk_state[NEW] = D_UP_TO_DATE;
+
+			/* Separate 'if' (not 'else if') is intentional here: the
+			   D_CONSISTENT conversion above might make this true */
+			if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED &&
+			    disk_state[NEW] == D_CONSISTENT &&
+			    /* ldev_safe: repl_state[NEW] */ may_be_up_to_date(device, NEW))
+				disk_state[NEW] = D_UP_TO_DATE;
+
+			/* Follow a neighbor that goes from D_CONSISTENT TO D_UP_TO_DATE */
+			if (disk_state[NEW] == D_CONSISTENT &&
+			    peer_disk_state[OLD] == D_CONSISTENT &&
+			    peer_disk_state[NEW] == D_UP_TO_DATE &&
+			    peer_device->uuid_flags & UUID_FLAG_STABLE)
+				disk_state[NEW] = D_UP_TO_DATE;
+
+			peer_device->uuid_flags &= ~UUID_FLAG_GOT_STABLE;
+
+			uuids_match =
+				(peer_device->current_uuid & ~UUID_PRIMARY) ==
+				(drbd_current_uuid(device) & ~UUID_PRIMARY);
+
+			if (peer_role[OLD] == R_UNKNOWN && peer_role[NEW] == R_PRIMARY &&
+			    peer_disk_state[NEW] == D_DISKLESS && disk_state[NEW] >= D_NEGOTIATING) {
+				/* Got connected to a diskless primary */
+				if (uuids_match && !is_sync_target_other_c(peer_device)) {
+					if (device->disk_state[NOW] < D_UP_TO_DATE) {
+						drbd_info(peer_device, "Upgrading local disk to D_UP_TO_DATE since current UUID matches.\n");
+						disk_state[NEW] = D_UP_TO_DATE;
+					}
+				} else {
+					set_bit(TRY_TO_GET_RESYNC, &device->flags);
+					if (disk_state[NEW] == D_UP_TO_DATE) {
+						drbd_info(peer_device, "Downgrading local disk to D_CONSISTENT since current UUID differs.\n");
+						disk_state[NEW] = D_CONSISTENT;
+						/* This is a "safety net"; it can only happen if fencing and quorum
+						   are both disabled. This alone would be racy, look for
+						   "Do not trust this guy!" (see also may_return_to_up_to_date()) */
+					}
+				}
+			}
+
+			if (connection->agreed_features & DRBD_FF_RS_SKIP_UUID)
+				cond = have_good_peer &&
+					(device->exposed_data_uuid & ~UUID_PRIMARY) !=
+					(peer_device->current_uuid & ~UUID_PRIMARY);
+			else
+				cond = peer_disk_state[OLD] == D_UNKNOWN &&
+					role[NEW] == R_PRIMARY && !uuids_match;
+
+			if (disk_state[NEW] == D_DISKLESS && peer_disk_state[NEW] == D_UP_TO_DATE &&
+			    cond) {
+				/* Do not trust this guy!
+				   He wants to be D_UP_TO_DATE, but has a different current
+				   UUID. Do not accept him as D_UP_TO_DATE but downgrade that to
+				   D_CONSISTENT here.
+				*/
+				peer_disk_state[NEW] = D_CONSISTENT;
+			}
+
+			/*
+			 * Determine whether peer will disable replication due to this transition.
+			 *
+			 * This matches the condition on the peer below.
+			 */
+			if (drbd_change_to_inconsistent(disk_state, cstate) ||
+					(!repl_is_sync_target(repl_state[OLD]) &&
+					 repl_is_sync_target(repl_state[NEW])))
+				peer_device->peer_replication[NEW] =
+					test_bit(PEER_REPLICATION_NEXT, &peer_device->flags);
+
+			/*
+			 * Decide whether to disable replication when the peer
+			 * transitions to Inconsistent. Only consider the disk
+			 * state when we are Connected because we want to wait
+			 * until we know whether replication should be enabled
+			 * on the next transition to Inconsistent. This is
+			 * communicated with the P_ENABLE_REPLICATION_NEXT
+			 * packet.
+			 *
+			 * Also re-evaluate whether to disable replication when
+			 * we become SyncSource, even when the peer's disk was
+			 * already Inconsistent. This is relevant when
+			 * switching between Ahead-Behind+Inconsistent and
+			 * SyncSource-SyncTarget.
+			 *
+			 * This matches the condition on the peer above.
+			 */
+			if (drbd_change_to_inconsistent(peer_disk_state, cstate) ||
+					(!repl_is_sync_source(repl_state[OLD]) &&
+					 repl_is_sync_source(repl_state[NEW])))
+				peer_device->replication[NEW] =
+					test_bit(REPLICATION_NEXT, &peer_device->flags);
+
+			/*
+			 * Not strictly necessary, since "replication" is only
+			 * considered when the peer disk is Inconsistent, but
+			 * it makes the logs clearer.
+			 */
+			if (peer_disk_state[OLD] == D_INCONSISTENT &&
+					peer_disk_state[NEW] != D_INCONSISTENT)
+				peer_device->replication[NEW] = true;
+		}
+
+		/* Evaluate quorum for this device (always granted when off). */
+		if (resource->res_opts.quorum != QOU_OFF)
+			device->have_quorum[NEW] = calc_quorum(device, NULL);
+		else
+			device->have_quorum[NEW] = true;
+
+		/* An UpToDate disk may restore quorum from before a reboot. */
+		if (!device->have_quorum[NEW] && disk_state[NEW] == D_UP_TO_DATE &&
+		    test_bit(RESTORE_QUORUM, &device->flags)) {
+			device->have_quorum[NEW] = true;
+			set_bit(RESTORING_QUORUM, &device->flags);
+		}
+
+		if (!device->have_quorum[NEW])
+			resource_has_quorum = false;
+
+		/* Suspend IO if we have no accessible data available.
+		 * Policy may be extended later to be able to suspend
+		 * if redundancy falls below a certain level. */
+		if (role[NEW] == R_PRIMARY && !drbd_data_accessible(device, NEW)) {
+			volumes_have_data_access = false;
+			if (role[OLD] != R_PRIMARY || drbd_data_accessible(device, OLD))
+				volume_lost_data_access = true;
+		}
+
+		/* A lost connection aborts negotiation; fall back to the
+		 * disk state recorded in our meta data. */
+		if (lost_connection && disk_state[NEW] == D_NEGOTIATING)
+			disk_state[NEW] = /* ldev_safe: disk_state */ disk_state_from_md(device);
+
+		/* A crashed Primary may have had more recent data; do not
+		 * claim D_UP_TO_DATE while we are Secondary without one. */
+		if (maybe_crashed_primary && !connected_primaries &&
+		    disk_state[NEW] == D_UP_TO_DATE && role[NOW] == R_SECONDARY)
+			disk_state[NEW] = D_CONSISTENT;
+	}
+	rcu_read_unlock();
+
+	/* Resource-level suspend decisions derived from the per-device pass. */
+	if (volumes_have_data_access)
+		resource->susp_nod[NEW] = false;
+	if (volume_lost_data_access && resource->res_opts.on_no_data == OND_SUSPEND_IO)
+		resource->susp_nod[NEW] = true;
+
+	resource->susp_quorum[NEW] =
+		resource->res_opts.on_no_quorum == ONQ_SUSPEND_IO ? !resource_has_quorum : false;
+
+	/* Keep IO frozen (susp_uuid) while a new current UUID is still
+	 * pending on any device, even though the suspend reason went away. */
+	if (!resource->susp_uuid[OLD] &&
+	    resource_is_suspended(resource, OLD) && !resource_is_suspended(resource, NEW)) {
+		idr_for_each_entry(&resource->devices, device, vnr) {
+			if (test_bit(NEW_CUR_UUID, &device->flags)) {
+				resource->susp_uuid[NEW] = true;
+				break;
+			}
+		}
+	}
+
+	/* Forced demotion with writers still holding the device open:
+	 * fail their IO instead of blocking the demotion. */
+	if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY &&
+	    (resource->state_change_flags & CS_FS_IGN_OPENERS)) {
+		int rw_count, ro_count;
+		drbd_open_counts(resource, &rw_count, &ro_count);
+		if (rw_count)
+			resource->fail_io[NEW] = true;
+	}
+}
+
+/* Re-enable activity log updates if they were suspended; log only on change. */
+void drbd_resume_al(struct drbd_device *device)
+{
+	if (!test_and_clear_bit(AL_SUSPENDED, &device->flags))
+		return;
+
+	drbd_info(device, "Resumed AL updates\n");
+}
+
+/* Whether losing this peer requires us to initiate a two-phase commit. */
+static bool drbd_need_twopc_after_lost_peer(struct drbd_connection *connection)
+{
+	enum drbd_conn_state *cstate = connection->cstate;
+	bool is_disconnect = cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED;
+
+	/* Only disconnect transitions are of interest here. */
+	if (!is_disconnect)
+		return false;
+
+	/*
+	 * Peers without DRBD_FF_2PC_V2 did not provide reachable_nodes when
+	 * disconnecting, so we must always trigger a twopc ourselves.
+	 */
+	if (!(connection->agreed_features & DRBD_FF_2PC_V2))
+		return true;
+
+	/* Only a non-graceful disconnect (anything but C_TEAR_DOWN) needs one. */
+	return cstate[NEW] != C_TEAR_DOWN;
+}
+
+static void drbd_schedule_empty_twopc(struct drbd_resource *resource)
+{
+	/* The queued work item owns a resource reference; if the work was
+	 * already queued, drop the reference we just took. */
+	kref_get(&resource->kref);
+	if (!schedule_work(&resource->empty_twopc))
+		kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/*
+ * Maintain the cached node mask of online cluster members.  The cache may
+ * briefly be too permissive: a node that just crashed can still carry an
+ * online mark.  It is never too restrictive — an online node is guaranteed
+ * to have its bit set.
+ */
+static void update_members(struct drbd_resource *resource)
+{
+	struct twopc_reply *reply = &resource->twopc_reply;
+	enum chg_state_flags flags = resource->state_change_flags;
+	const int my_node_id = resource->res_opts.node_id;
+	struct drbd_connection *connection;
+
+	/* If we initiated the 2PC ourselves, the reply tells us exactly
+	 * which nodes are reachable; just adopt that mask. */
+	if (flags & CS_TWOPC && reply->initiator_node_id == my_node_id) {
+		resource->members = reply->reachable_nodes;
+		return;
+	}
+
+	/* Otherwise we are 2PC target of a connect or a non-graceful
+	 * disconnect; update the mask connection by connection. */
+	for_each_connection(connection, resource) {
+		enum drbd_conn_state *cstate = connection->cstate;
+		const int peer_node_mask = NODE_MASK(connection->peer_node_id);
+
+		/* A freshly established connection joins the members. */
+		if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED)
+			resource->members |= peer_node_mask;
+
+		/* A lost member may need removal; an empty 2PC round will
+		 * recompute the reachable set. */
+		if (drbd_need_twopc_after_lost_peer(connection) &&
+				resource->members & peer_node_mask)
+			drbd_schedule_empty_twopc(resource);
+	}
+}
+
+/* Whether any peer device of this connection is D_UP_TO_DATE in [NEW]. */
+static bool drbd_any_peer_device_up_to_date(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool any_up_to_date = false;
+	int vnr;
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (peer_device->disk_state[NEW] != D_UP_TO_DATE)
+			continue;
+		any_up_to_date = true;
+		break;
+	}
+
+	return any_up_to_date;
+}
+
+/* Whether replication is enabled on all peers for this device */
+bool drbd_all_peer_replication(struct drbd_device *device, enum which_state which)
+{
+	struct drbd_peer_device *peer_device;
+	bool all_peer_replication = true;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		if (!peer_device->peer_replication[which])
+			all_peer_replication = false;
+	}
+	rcu_read_unlock();
+
+	return all_peer_replication;
+}
+
+/* As drbd_all_peer_replication() but takes a state change object */
+static bool drbd_all_peer_replication_change(struct drbd_state_change *state_change, int n_device,
+		enum which_state which)
+{
+	/* peer_devices is laid out as [n_device][n_connection]; point at the
+	 * row for this device and scan it. */
+	int n_connections = state_change->n_connections;
+	struct drbd_peer_device_state_change *row =
+		&state_change->peer_devices[n_device * n_connections];
+	int n_connection;
+
+	for (n_connection = 0; n_connection < n_connections; n_connection++) {
+		if (!row[n_connection].peer_replication[which])
+			return false;
+	}
+
+	return true;
+}
+
+/**
+ * drbd_determine_flush_pending() - recompute which flush acks we wait for
+ * @resource: resource whose state change is being committed.
+ *
+ * First pass: drop pending-flush bits that became obsolete — either the
+ * Primary connection they belong to is no longer connected, or the
+ * UpToDate peer whose ack we were waiting for is no longer UpToDate.
+ *
+ * Second pass: if any peer device newly becomes a sync target while
+ * replication is enabled towards all peers, start a new flush sequence
+ * and record, per Primary peer, the mask of UpToDate peers whose flush
+ * acks must arrive.  Both passes take initiator_flush_lock.
+ */
+static void drbd_determine_flush_pending(struct drbd_resource *resource)
+{
+	struct drbd_device *device;
+	struct drbd_connection *primary_connection;
+	struct drbd_connection *up_to_date_connection;
+	int vnr;
+	bool send_flush_requests = false;
+
+	/* Clear any bits if we no longer expect or require a flush ack */
+	spin_lock(&resource->initiator_flush_lock);
+	for_each_connection(primary_connection, resource) {
+		u64 *pending_flush_mask = &primary_connection->pending_flush_mask;
+
+		/*
+		 * Clear bits if we no longer expect or require a flush ack due
+		 * to loss of connection to the Primary peer.
+		 */
+		if (primary_connection->cstate[NEW] != C_CONNECTED) {
+			if (*pending_flush_mask)
+				*pending_flush_mask = 0;
+			continue;
+		}
+
+		/*
+		 * Clear bits if we no longer expect or require a flush ack
+		 * because the peer that was UpToDate is no longer UpToDate.
+		 * For instance, if we lose the connection to that peer.
+		 */
+		for_each_connection(up_to_date_connection, resource) {
+			u64 up_to_date_mask = NODE_MASK(up_to_date_connection->peer_node_id);
+
+			if (drbd_any_peer_device_up_to_date(up_to_date_connection))
+				continue;
+
+			if (*pending_flush_mask & up_to_date_mask)
+				*pending_flush_mask &= ~up_to_date_mask;
+		}
+	}
+	spin_unlock(&resource->initiator_flush_lock);
+
+	/* Check if we need a new flush */
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+
+		/* Trigger on the transition into "sync target with all-peer
+		 * replication", not while it persists. */
+		for_each_peer_device(peer_device, device) {
+			if (!(is_sync_target_state(peer_device, NOW) &&
+					drbd_all_peer_replication(device, NOW)) &&
+					is_sync_target_state(peer_device, NEW) &&
+					drbd_all_peer_replication(device, NEW))
+				send_flush_requests = true;
+		}
+	}
+
+	if (!send_flush_requests)
+		return;
+
+	/* We need a new flush. Mark which acks we are waiting for. */
+	spin_lock(&resource->initiator_flush_lock);
+	resource->current_flush_sequence++;
+
+	for_each_connection(primary_connection, resource) {
+		primary_connection->pending_flush_mask = 0;
+
+		if (primary_connection->peer_role[NEW] != R_PRIMARY)
+			continue;
+
+		/* Flush ack protocol requires protocol version 123 on both
+		 * the Primary and the UpToDate peer. */
+		if (primary_connection->agreed_pro_version < 123)
+			continue;
+
+		for_each_connection(up_to_date_connection, resource) {
+			u64 up_to_date_mask = NODE_MASK(up_to_date_connection->peer_node_id);
+
+			if (!drbd_any_peer_device_up_to_date(up_to_date_connection))
+				continue;
+
+			if (up_to_date_connection->agreed_pro_version < 123)
+				continue;
+
+			primary_connection->pending_flush_mask |= up_to_date_mask;
+		}
+	}
+	spin_unlock(&resource->initiator_flush_lock);
+}
+
+/**
+ * set_ov_position() - initialize start position and counters for online verify
+ * @peer_device: peer device the verify run is for.
+ * @repl_state: L_VERIFY_S or L_VERIFY_T.
+ *
+ * Sets ov_start_sector, ov_position, rs_total, ov_left and ov_skipped for
+ * a fresh verify run.  As verify target the actual start position is
+ * learned from the source's first request, so a sentinel is stored.
+ */
+static void set_ov_position(struct drbd_peer_device *peer_device,
+			    enum drbd_repl_state repl_state)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_bitmap *bm = device->bitmap;
+
+	/* NOTE(review): pre-90 protocols apparently cannot negotiate a start
+	 * position, so the run always starts at sector 0 — confirm. */
+	if (peer_device->connection->agreed_pro_version < 90)
+		peer_device->ov_start_sector = 0;
+	peer_device->rs_total = drbd_bm_bits(device);
+	peer_device->ov_position = 0;
+	if (repl_state == L_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on L_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_common_data_request once the
+		 * first P_OV_REQUEST is received */
+		peer_device->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = bm_sect_to_bit(bm, peer_device->ov_start_sector);
+		/* Clamp a start position beyond the device to the last bit. */
+		if (bit >= peer_device->rs_total) {
+			peer_device->ov_start_sector =
+				bm_bit_to_sect(bm, peer_device->rs_total - 1);
+			peer_device->rs_total = 1;
+		} else
+			peer_device->rs_total -= bit;
+		peer_device->ov_position = peer_device->ov_start_sector;
+	}
+	atomic64_set(&peer_device->ov_left, peer_device->rs_total);
+	peer_device->ov_skipped = 0;
+}
+
+/* Reset all resync progress marks to "now" with the current total weight. */
+static void initialize_resync_progress_marks(struct drbd_peer_device *peer_device)
+{
+	unsigned long remaining = drbd_bm_total_weight(peer_device);
+	unsigned long timestamp = jiffies;
+	int mark;
+
+	peer_device->rs_last_progress_report_ts = timestamp;
+	for (mark = 0; mark < DRBD_SYNC_MARKS; mark++) {
+		peer_device->rs_mark_left[mark] = remaining;
+		peer_device->rs_mark_time[mark] = timestamp;
+	}
+}
+
+/* Reset all per-peer resync bookkeeping for a freshly starting resync. */
+static void initialize_resync(struct drbd_peer_device *peer_device)
+{
+	unsigned long weight = drbd_bm_total_weight(peer_device);
+	unsigned long start = jiffies;
+
+	/* pass/progress bookkeeping */
+	peer_device->last_in_sync_end = 0;
+	peer_device->resync_next_bit = 0;
+	peer_device->last_resync_pass_bits = weight;
+
+	/* statistics counters */
+	peer_device->rs_failed = 0;
+	peer_device->rs_paused = 0;
+	peer_device->rs_same_csum = 0;
+
+	peer_device->rs_total = weight;
+	peer_device->rs_start = start;
+	peer_device->rs_last_writeout = start;
+
+	initialize_resync_progress_marks(peer_device);
+	drbd_rs_controller_reset(peer_device);
+}
+
+/* Is there a primary with access to up to date data known */
+static bool primary_and_data_present(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device;
+	bool primary = resource->role[NEW] == R_PRIMARY;
+	bool up_to_date_data = device->disk_state[NEW] == D_UP_TO_DATE;
+
+	for_each_peer_device(peer_device, device) {
+		struct drbd_connection *connection = peer_device->connection;
+		bool disconnecting_from_peer =
+			resource->remote_state_change &&
+			drbd_twopc_between_peer_and_me(connection) &&
+			resource->twopc_reply.is_disconnect;
+
+		/* A peer we are about to disconnect from does not count. */
+		if (disconnecting_from_peer)
+			continue;
+
+		primary |= connection->peer_role[NEW] == R_PRIMARY;
+		up_to_date_data |= peer_device->disk_state[NEW] == D_UP_TO_DATE;
+	}
+
+	return primary && up_to_date_data;
+}
+
+/* True when the disk newly enters D_FAILED, D_DETACHING or D_DISKLESS. */
+static bool extra_ldev_ref_for_after_state_chg(enum drbd_disk_state *disk_state)
+{
+	switch (disk_state[NEW]) {
+	case D_FAILED:
+	case D_DETACHING:
+	case D_DISKLESS:
+		return disk_state[OLD] != disk_state[NEW];
+	default:
+		return false;
+	}
+}
+
+/* Whether any peer device of this connection enters a resync state ([NEW]). */
+static bool has_starting_resyncs(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool found = false;
+	int vnr;
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (peer_device->repl_state[NEW] <= L_ESTABLISHED)
+			continue;
+		found = true;
+		break;
+	}
+
+	return found;
+}
+
+/* A D_UP_TO_DATE -> D_CONSISTENT downgrade that may be reverted should be. */
+static bool should_try_become_up_to_date(struct drbd_device *device, enum drbd_disk_state *disk_state,
+		enum which_state which)
+{
+	if (disk_state[OLD] != D_UP_TO_DATE || disk_state[NEW] != D_CONSISTENT)
+		return false;
+
+	return may_return_to_up_to_date(device, which);
+}
+
+/**
+ * finish_state_change  -  carry out actions triggered by a state change
+ * @resource: DBRD resource.
+ * @tag: State change tag to print in status messages.
+ */
+static void finish_state_change(struct drbd_resource *resource, const char *tag)
+{
+	enum drbd_role *role = resource->role;
+	bool *susp_uuid = resource->susp_uuid;
+	struct drbd_device *device;
+	struct drbd_connection *connection;
+	bool starting_resync = false;
+	bool start_new_epoch = false;
+	bool lost_a_primary_peer = false;
+	bool some_peer_is_primary = false;
+	bool some_peer_request_in_flight = false;
+	bool resource_suspended[2];
+	bool unfreeze_io = false;
+	int vnr;
+
+	print_state_change(resource, "", tag);
+
+	resource_suspended[OLD] = resource_is_suspended(resource, OLD);
+	resource_suspended[NEW] = resource_is_suspended(resource, NEW);
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		bool *have_quorum = device->have_quorum;
+		struct drbd_peer_device *peer_device;
+
+		for_each_peer_device(peer_device, device) {
+			struct drbd_connection *connection = peer_device->connection;
+			bool did, should;
+
+			did = drbd_should_do_remote(peer_device, NOW);
+			should = drbd_should_do_remote(peer_device, NEW);
+
+			if (!did && should) {
+				/* Since "did" is false, the request with this
+				 * dagtag and prior requests were not be marked
+				 * to be sent to this peer. Hence this will not
+				 * send a dagtag packet before the
+				 * corresponding data packet.
+				 *
+				 * It is possible that this peer does not
+				 * actually have the data corresponding to this
+				 * dagtag. However in that case, the disk state
+				 * of that peer will not be D_UP_TO_DATE, so it
+				 * not be relevant what dagtag we have sent it. */
+				connection->send_dagtag = resource->dagtag_sector;
+				drbd_queue_work_if_unqueued(
+						&connection->sender_work,
+						&connection->send_dagtag_work);
+			}
+
+			if (did != should)
+				start_new_epoch = true;
+
+			if (peer_device->repl_state[OLD] != L_WF_BITMAP_S &&
+					peer_device->repl_state[NEW] == L_WF_BITMAP_S)
+				clear_bit(B_RS_H_DONE, &peer_device->flags);
+
+			if (peer_device->repl_state[OLD] != L_WF_BITMAP_T &&
+					peer_device->repl_state[NEW] == L_WF_BITMAP_T)
+				clear_bit(B_RS_H_DONE, &peer_device->flags);
+
+			if (!is_sync_state(peer_device, NOW) &&
+			    is_sync_state(peer_device, NEW)) {
+				clear_bit(RS_DONE, &peer_device->flags);
+				clear_bit(B_RS_H_DONE, &peer_device->flags);
+				clear_bit(SYNC_TARGET_TO_BEHIND, &peer_device->flags);
+			}
+		}
+
+		if (role[NEW] == R_PRIMARY && !have_quorum[NEW])
+			set_bit(PRIMARY_LOST_QUORUM, &device->flags);
+	}
+	if (start_new_epoch)
+		start_new_tl_epoch(resource);
+
+	spin_lock(&resource->peer_ack_lock);
+	if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY && resource->peer_ack_req) {
+		resource->last_peer_acked_dagtag = resource->peer_ack_req->dagtag_sector;
+		drbd_queue_peer_ack(resource, resource->peer_ack_req);
+		resource->peer_ack_req = NULL;
+	}
+	spin_unlock(&resource->peer_ack_lock);
+
+	drbd_determine_flush_pending(resource);
+
+	if (!resource->fail_io[OLD] && resource->fail_io[NEW])
+		drbd_warn(resource, "Failing IOs\n");
+
+	for_each_connection(connection, resource) {
+		enum drbd_role *peer_role = connection->peer_role;
+		enum drbd_conn_state *cstate = connection->cstate;
+
+		if (peer_role[NEW] == R_PRIMARY)
+			some_peer_is_primary = true;
+
+		switch (cstate[NEW]) {
+		case C_CONNECTED:
+			if (atomic_read(&connection->active_ee_cnt)
+					|| atomic_read(&connection->done_ee_cnt))
+				some_peer_request_in_flight = true;
+			break;
+		case C_STANDALONE:
+		case C_UNCONNECTED:
+		case C_CONNECTING:
+			/* maybe others are safe as well? which ones? */
+			break;
+		default:
+			/* if we just disconnected, there may still be some request in flight. */
+			some_peer_request_in_flight = true;
+		}
+
+		if (some_peer_is_primary && some_peer_request_in_flight)
+			break;
+	}
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct drbd_peer_device *peer_device;
+		enum drbd_disk_state *disk_state = device->disk_state;
+		bool create_new_uuid = false;
+
+		if (test_bit(RESTORING_QUORUM, &device->flags) &&
+		    !device->have_quorum[OLD] && device->have_quorum[NEW]) {
+			clear_bit(RESTORING_QUORUM, &device->flags);
+			drbd_info(resource, "Restored quorum from before reboot\n");
+		}
+
+		if (test_bit(RESTORE_QUORUM, &device->flags) &&
+		    (device->have_quorum[NEW] || disk_state[NEW] < D_UP_TO_DATE))
+			clear_bit(RESTORE_QUORUM, &device->flags);
+
+		/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+		 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+		 * drbd_ldev_destroy() won't happen before our corresponding
+		 * w_after_state_change works run, where we put_ldev again. */
+		if (extra_ldev_ref_for_after_state_chg(disk_state))
+			atomic_inc(&device->local_cnt);
+
+		if (disk_state[OLD] != D_DISKLESS && disk_state[NEW] == D_DISKLESS) {
+			/* who knows if we are ever going to be attached again,
+			 * and whether that will be the same device, or a newly
+			 * initialized one. */
+			for_each_peer_device(peer_device, device)
+				peer_device->bitmap_index = -1;
+		}
+
+		/* ldev_safe: transitioning from D_ATTACHING, ldev just established */
+		if (disk_state[OLD] == D_ATTACHING && disk_state[NEW] >= D_NEGOTIATING)
+			drbd_info(device, "attached to current UUID: %016llX\n", device->ldev->md.current_uuid);
+
+		for_each_peer_device(peer_device, device) {
+			enum drbd_repl_state *repl_state = peer_device->repl_state;
+			enum drbd_disk_state *peer_disk_state = peer_device->disk_state;
+			struct drbd_connection *connection = peer_device->connection;
+			enum drbd_role *peer_role = connection->peer_role;
+
+			if (repl_state[OLD] <= L_ESTABLISHED && repl_state[NEW] == L_WF_BITMAP_S)
+				starting_resync = true;
+
+			if ((disk_state[OLD] != D_UP_TO_DATE || peer_disk_state[OLD] != D_UP_TO_DATE) &&
+			    (disk_state[NEW] == D_UP_TO_DATE && peer_disk_state[NEW] == D_UP_TO_DATE)) {
+				clear_bit(CRASHED_PRIMARY, &device->flags);
+				if (test_bit(UUIDS_RECEIVED, &peer_device->flags))
+					peer_device->uuid_flags &= ~((u64)UUID_FLAG_CRASHED_PRIMARY);
+			}
+
+			/* Aborted verify run, or we reached the stop sector.
+			 * Log the last position, unless end-of-device. */
+			if ((repl_state[OLD] == L_VERIFY_S || repl_state[OLD] == L_VERIFY_T) &&
+			    repl_state[NEW] <= L_ESTABLISHED) {
+				/* ldev_safe: repl_state[OLD] */
+				struct drbd_bitmap *bm = device->bitmap;
+				unsigned long ov_left = atomic64_read(&peer_device->ov_left);
+
+				/* ldev_safe: repl_state[OLD] */
+				peer_device->ov_start_sector =
+					bm_bit_to_sect(bm, drbd_bm_bits(device) - ov_left);
+				if (ov_left)
+					drbd_info(peer_device, "Online Verify reached sector %llu\n",
+						  (unsigned long long)peer_device->ov_start_sector);
+			}
+
+			if ((repl_state[OLD] == L_PAUSED_SYNC_T || repl_state[OLD] == L_PAUSED_SYNC_S) &&
+			    (repl_state[NEW] == L_SYNC_TARGET  || repl_state[NEW] == L_SYNC_SOURCE)) {
+				drbd_info(peer_device, "Syncer continues.\n");
+				peer_device->rs_paused += (long)jiffies
+						  -(long)peer_device->rs_mark_time[peer_device->rs_last_mark];
+				initialize_resync_progress_marks(peer_device);
+				peer_device->resync_next_bit = 0;
+			}
+
+			if ((repl_state[OLD] == L_SYNC_TARGET  || repl_state[OLD] == L_SYNC_SOURCE) &&
+			    (repl_state[NEW] == L_PAUSED_SYNC_T || repl_state[NEW] == L_PAUSED_SYNC_S)) {
+				drbd_info(peer_device, "Resync suspended\n");
+				peer_device->rs_mark_time[peer_device->rs_last_mark] = jiffies;
+			}
+
+
+			if (repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED)
+				clear_bit(RECONCILIATION_RESYNC, &peer_device->flags);
+
+			if (repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] < L_ESTABLISHED)
+				clear_bit(AHEAD_TO_SYNC_SOURCE, &peer_device->flags);
+
+			if (repl_state[OLD] == L_ESTABLISHED &&
+			    (repl_state[NEW] == L_VERIFY_S || repl_state[NEW] == L_VERIFY_T)) {
+				unsigned long now = jiffies;
+				int i;
+
+				/* ldev_safe: repl_state[NEW] */
+				set_ov_position(peer_device, repl_state[NEW]);
+				peer_device->rs_start = now;
+				peer_device->ov_last_oos_size = 0;
+				peer_device->ov_last_oos_start = 0;
+				peer_device->ov_last_skipped_size = 0;
+				peer_device->ov_last_skipped_start = 0;
+				peer_device->rs_last_writeout = now;
+				peer_device->rs_last_progress_report_ts = now;
+				for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+					peer_device->rs_mark_left[i] = peer_device->rs_total;
+					peer_device->rs_mark_time[i] = now;
+				}
+
+				drbd_rs_controller_reset(peer_device);
+			} else if (!(repl_state[OLD] >= L_SYNC_SOURCE && repl_state[OLD] <= L_PAUSED_SYNC_T) &&
+				   (repl_state[NEW] >= L_SYNC_SOURCE && repl_state[NEW] <= L_PAUSED_SYNC_T)) {
+				initialize_resync(peer_device);
+			}
+
+			if (disk_state[NEW] != D_NEGOTIATING && get_ldev(device)) {
+				if (peer_device->bitmap_index != -1) {
+					enum drbd_disk_state pdsk = peer_device->disk_state[NEW];
+					u32 mdf = device->ldev->md.peers[peer_device->node_id].flags;
+					/* Do NOT clear MDF_PEER_DEVICE_SEEN here.
+					 * We want to be able to refuse a resize beyond "last agreed" size,
+					 * even if the peer is currently detached.
+					 */
+					mdf &= ~(MDF_PEER_CONNECTED | MDF_PEER_OUTDATED | MDF_PEER_FENCING);
+					if (repl_state[NEW] > L_OFF)
+						mdf |= MDF_PEER_CONNECTED;
+					if (pdsk >= D_INCONSISTENT) {
+						if (pdsk <= D_OUTDATED)
+							mdf |= MDF_PEER_OUTDATED;
+						if (pdsk != D_UNKNOWN)
+							mdf |= MDF_PEER_DEVICE_SEEN;
+					}
+					if (pdsk == D_DISKLESS && !want_bitmap(peer_device))
+						mdf &= ~MDF_PEER_DEVICE_SEEN;
+					if (peer_device->connection->fencing_policy != FP_DONT_CARE)
+						mdf |= MDF_PEER_FENCING;
+					if (mdf != device->ldev->md.peers[peer_device->node_id].flags) {
+						device->ldev->md.peers[peer_device->node_id].flags = mdf;
+						drbd_md_mark_dirty(device);
+					}
+				}
+
+				/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+				if (disk_state[OLD] == D_INCONSISTENT &&
+				    peer_disk_state[OLD] == D_INCONSISTENT && peer_disk_state[NEW] == D_UP_TO_DATE &&
+				    peer_role[OLD] == R_SECONDARY && peer_role[NEW] == R_PRIMARY)
+					set_bit(CONSIDER_RESYNC, &peer_device->flags);
+
+				/* Resume AL writing if we get a connection */
+				if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED)
+					drbd_resume_al(device);
+				put_ldev(device);
+			}
+
+			if (repl_state[OLD] == L_AHEAD && repl_state[NEW] == L_SYNC_SOURCE) {
+				set_bit(SEND_STATE_AFTER_AHEAD, &peer_device->flags);
+				set_bit(SEND_STATE_AFTER_AHEAD_C, &connection->flags);
+
+				clear_bit(CONN_CONGESTED, &connection->flags);
+				wake_up(&connection->sender_work.q_wait);
+			}
+
+			/* We start writing locally without replicating the changes,
+			 * better start a new data generation */
+			if (repl_state[OLD] != L_AHEAD && repl_state[NEW] == L_AHEAD)
+				create_new_uuid = true;
+
+			if (lost_contact_to_peer_data(peer_disk_state)) {
+				if (role[NEW] == R_PRIMARY && !test_bit(UNREGISTERED, &device->flags) &&
+				    drbd_data_accessible(device, NEW))
+					create_new_uuid = true;
+
+				if (connection->agreed_pro_version < 110 &&
+				    peer_role[NEW] == R_PRIMARY &&
+				    disk_state[NEW] >= D_UP_TO_DATE)
+					create_new_uuid = true;
+			}
+			if (peer_returns_diskless(peer_device, peer_disk_state[OLD], peer_disk_state[NEW])) {
+				if (role[NEW] == R_PRIMARY && !test_bit(UNREGISTERED, &device->flags) &&
+				    disk_state[NEW] == D_UP_TO_DATE)
+					create_new_uuid = true;
+			}
+
+			if (disk_state[OLD] > D_FAILED && disk_state[NEW] == D_FAILED &&
+			    role[NEW] == R_PRIMARY && drbd_data_accessible(device, NEW))
+				create_new_uuid = true;
+
+			if (peer_disk_state[NEW] < D_UP_TO_DATE && test_bit(GOT_NEG_ACK, &peer_device->flags))
+				clear_bit(GOT_NEG_ACK, &peer_device->flags);
+
+			if (repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED)
+				clear_bit(SYNC_SRC_CRASHED_PRI, &peer_device->flags);
+
+			if (peer_role[OLD] != peer_role[NEW] || role[OLD] != role[NEW] ||
+			    peer_disk_state[OLD] != peer_disk_state[NEW])
+				drbd_update_mdf_al_disabled(device, NEW);
+		}
+
+		if (disk_state[OLD] >= D_INCONSISTENT && disk_state[NEW] < D_INCONSISTENT &&
+		    role[NEW] == R_PRIMARY && drbd_data_accessible(device, NEW))
+			create_new_uuid = true;
+
+		if (role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY)
+			create_new_uuid = true;
+
+		/* Only a single new current uuid when susp_uuid becomes true */
+		if (create_new_uuid && !susp_uuid[OLD])
+			set_bit(__NEW_CUR_UUID, &device->flags);
+
+		if (disk_state[NEW] != D_NEGOTIATING && get_ldev_if_state(device, D_DETACHING)) {
+			u32 mdf = device->ldev->md.flags;
+			bool graceful_detach = disk_state[NEW] == D_DETACHING && !test_bit(FORCE_DETACH, &device->flags);
+
+			/* For now, always require a drbdmeta apply-al run,
+			 * even if that ends up only re-initializing the AL */
+			mdf &= ~MDF_AL_CLEAN;
+			/* reset some flags to what we know now */
+			mdf &= ~MDF_CRASHED_PRIMARY;
+			if (test_bit(CRASHED_PRIMARY, &device->flags) ||
+			    (role[NEW] == R_PRIMARY && !graceful_detach))
+				mdf |= MDF_CRASHED_PRIMARY;
+			mdf &= ~MDF_PRIMARY_LOST_QUORUM;
+			if (test_bit(PRIMARY_LOST_QUORUM, &device->flags))
+				mdf |= MDF_PRIMARY_LOST_QUORUM;
+			/* Do not touch MDF_CONSISTENT if we are D_FAILED */
+			if (disk_state[NEW] >= D_INCONSISTENT) {
+				mdf &= ~(MDF_CONSISTENT | MDF_WAS_UP_TO_DATE);
+
+				if (disk_state[NEW] > D_INCONSISTENT)
+					mdf |= MDF_CONSISTENT;
+				if (disk_state[NEW] > D_OUTDATED)
+					mdf |= MDF_WAS_UP_TO_DATE;
+			} else if ((disk_state[NEW] == D_FAILED || disk_state[NEW] == D_DETACHING) &&
+				   mdf & MDF_WAS_UP_TO_DATE &&
+				   primary_and_data_present(device)) {
+				/* There are cases when we still can update meta-data even if disk
+				   state is failed.... Clear MDF_WAS_UP_TO_DATE if appropriate */
+				mdf &= ~MDF_WAS_UP_TO_DATE;
+			}
+
+/*
+ * MDF_PRIMARY_IND  IS set: apply activity log after crash
+ * MDF_PRIMARY_IND NOT set: do not apply, forget and re-initialize activity log after crash.
+ * We want the MDF_PRIMARY_IND set *always* before our backend could possibly
+ * be target of write requests, whether we are Secondary or Primary ourselves.
+ *
+ * We want to avoid clearing that flag just because we lost the connection to
+ * a detached Primary before all in-flight IO was drained, because we may
+ * still have some dirty bits that are not yet persisted.
+ *
+ * We want it cleared only once we are *certain* that we no longer see any Primary,
+ * are not Primary ourselves, AND all previously received WRITE (peer-) requests
+ * have been processed, NOTHING is in flight against our backend anymore,
+ * AND we have successfully written out any dirty bitmap pages.
+ *
+ *
+ * MDF_PEER_DEVICE_SEEN ... The peer had a backing device at some point
+ * MDF_NODE_EXISTS ... We have seen evidence that this node exists in the cluster.
+ *   Note: This bit does **not** get set when a new peer/connection is created with
+ *   `drbdsetup new-peer ...`.  The bit gets set when we establish a connection
+ *   successfully for the first time or we learn via other nodes about the
+ *   existence.
+ */
+
+			/* set, if someone is/becomes primary */
+			if (role[NEW] == R_PRIMARY || some_peer_is_primary)
+				mdf |= MDF_PRIMARY_IND;
+			/* clear, if */
+			else if (/* NO peer requests in flight, AND */
+				 !some_peer_request_in_flight &&
+				 (graceful_detach ||
+				  /* or everyone secondary ... */
+				  (role[NEW] == R_SECONDARY && !some_peer_is_primary &&
+				   /* ... and not detaching because of IO error. */
+				   disk_state[NEW] >= D_INCONSISTENT)))
+				mdf &= ~MDF_PRIMARY_IND;
+
+			if (device->have_quorum[NEW])
+				mdf |= MDF_HAVE_QUORUM;
+			else
+				mdf &= ~MDF_HAVE_QUORUM;
+			/* apply changed flags to md.flags,
+			 * and "schedule" for write-out */
+			if (mdf != device->ldev->md.flags ||
+			    device->ldev->md.members != resource->members) {
+				device->ldev->md.flags = mdf;
+				device->ldev->md.members = resource->members;
+				drbd_md_mark_dirty(device);
+			}
+			if (disk_state[OLD] < D_CONSISTENT && disk_state[NEW] >= D_CONSISTENT)
+				drbd_uuid_set_exposed(device, device->ldev->md.current_uuid, true);
+			put_ldev(device);
+		}
+
+		/* remember last attach time so request_timer_fn() won't
+		 * kill newly established sessions while we are still trying to thaw
+		 * previously frozen IO */
+		if ((disk_state[OLD] == D_ATTACHING || disk_state[OLD] == D_NEGOTIATING) &&
+		    disk_state[NEW] > D_NEGOTIATING)
+			device->last_reattach_jif = jiffies;
+
+		if (!device->have_quorum[OLD] && device->have_quorum[NEW])
+			clear_bit(PRIMARY_LOST_QUORUM, &device->flags);
+
+		if (resource_suspended[NEW] &&
+		    !(role[OLD] == R_PRIMARY && !drbd_data_accessible(device, OLD)) &&
+		     (role[NEW] == R_PRIMARY && !drbd_data_accessible(device, NEW)) &&
+		    resource->res_opts.on_no_data == OND_IO_ERROR)
+			unfreeze_io = true;
+
+		if (!resource->fail_io[OLD] && resource->fail_io[NEW])
+			unfreeze_io = true;
+
+		if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY)
+			clear_bit(NEW_CUR_UUID, &device->flags);
+
+		if (should_try_become_up_to_date(device, disk_state, NEW))
+			set_bit(TRY_BECOME_UP_TO_DATE_PENDING, &resource->flags);
+	}
+
+	for_each_connection(connection, resource) {
+		enum drbd_conn_state *cstate = connection->cstate;
+		enum drbd_role *peer_role = connection->peer_role;
+
+		/*
+		 * If we lose connection to a Primary node then we need to
+		 * inform our peers so that we can potentially do a
+		 * reconciliation resync. The function conn_disconnect()
+		 * informs the peers. So we must set the flag before stopping
+		 * the receiver.
+		 */
+		if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED &&
+				peer_role[OLD] == R_PRIMARY)
+			set_bit(NOTIFY_PEERS_LOST_PRIMARY, &connection->flags);
+
+		/* Receiver should clean up itself */
+		if (cstate[OLD] != C_DISCONNECTING && cstate[NEW] == C_DISCONNECTING)
+			drbd_thread_stop_nowait(&connection->receiver);
+
+		/* Now the receiver finished cleaning up itself, it should die */
+		if (cstate[OLD] != C_STANDALONE && cstate[NEW] == C_STANDALONE)
+			drbd_thread_stop_nowait(&connection->receiver);
+
+		/* Upon network failure, we need to restart the receiver. */
+		if (cstate[OLD] >= C_CONNECTING &&
+		    cstate[NEW] <= C_TEAR_DOWN && cstate[NEW] >= C_TIMEOUT)
+			drbd_thread_restart_nowait(&connection->receiver);
+
+		if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED)
+			twopc_connection_down(connection);
+
+		/* remember last connect time so request_timer_fn() won't
+		 * kill newly established sessions while we are still trying to thaw
+		 * previously frozen IO */
+		if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED)
+			connection->last_reconnect_jif = jiffies;
+
+		if (resource_suspended[OLD]) {
+			enum drbd_req_event walk_event = -1;
+
+			/* If we resume IO without this connection, then we
+			 * need to cancel suspended requests. */
+			if ((!resource_suspended[NEW] || unfreeze_io) && cstate[NEW] < C_CONNECTED)
+				walk_event = CANCEL_SUSPENDED_IO;
+			/* On reconnection when we have been suspended we need
+			 * to process suspended requests. If there are resyncs,
+			 * that means that it was not a simple disconnect and
+			 * reconnect, so we cannot resend. We must cancel
+			 * instead. */
+			else if (cstate[OLD] < C_CONNECTED && cstate[NEW] == C_CONNECTED)
+				walk_event = has_starting_resyncs(connection) ? CANCEL_SUSPENDED_IO : RESEND;
+
+			if (walk_event != -1)
+				__tl_walk(resource, connection, &connection->req_not_net_done, walk_event);
+
+			/* Since we are in finish_state_change(), and the state
+			 * was previously not C_CONNECTED, the sender cannot
+			 * have received any requests yet. So it will find any
+			 * requests to resend when it rescans the transfer log. */
+			if (walk_event == RESEND)
+				wake_up(&connection->sender_work.q_wait);
+		}
+
+		if (cstate[OLD] == C_CONNECTED && cstate[NEW] < C_CONNECTED)
+			set_bit(RECONNECT, &connection->flags);
+
+		if (starting_resync && peer_role[NEW] == R_PRIMARY)
+			apply_unacked_peer_requests(connection);
+
+		if (peer_role[OLD] == R_PRIMARY && peer_role[NEW] == R_UNKNOWN)
+			lost_a_primary_peer = true;
+	}
+
+	if (lost_a_primary_peer) {
+		idr_for_each_entry(&resource->devices, device, vnr) {
+			struct drbd_peer_device *peer_device;
+
+			for_each_peer_device(peer_device, device) {
+				enum drbd_repl_state repl_state = peer_device->repl_state[NEW];
+
+				if (!test_bit(UNSTABLE_RESYNC, &peer_device->flags) &&
+				    (repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) &&
+				    !(peer_device->uuid_flags & UUID_FLAG_STABLE) &&
+				    !drbd_stable_sync_source_present(peer_device, NEW))
+					set_bit(UNSTABLE_RESYNC, &peer_device->flags);
+			}
+		}
+	}
+
+	if (resource_suspended[OLD] && !resource_suspended[NEW])
+		drbd_restart_suspended_reqs(resource);
+
+	if ((resource_suspended[OLD] && !resource_suspended[NEW]) || unfreeze_io)
+		__tl_walk(resource, NULL, NULL, COMPLETION_RESUMED);
+}
+
+/* "After bitmap write" hook: start the actual resync once the preparatory
+ * bitmap write-out has finished.
+ *
+ * @rv: result of the bitmap IO; non-zero means the write-out failed, in
+ *      which case we fall back to L_ESTABLISHED instead of starting a resync.
+ */
+static void abw_start_sync(struct drbd_device *device,
+			   struct drbd_peer_device *peer_device, int rv)
+{
+	struct drbd_peer_device *pd;
+
+	if (rv) {
+		drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+		stable_change_repl_state(peer_device, L_ESTABLISHED, CS_VERBOSE, "start-sync");
+		return;
+	}
+
+	switch (peer_device->repl_state[NOW]) {
+	case L_STARTING_SYNC_T:
+		/* Since the number of set bits changed and the other peer_devices are
+		   already in L_PAUSED_SYNC_T state, we need to set rs_total here */
+		rcu_read_lock();
+		for_each_peer_device_rcu(pd, device)
+			initialize_resync(pd);
+		rcu_read_unlock();
+
+		/* Peers older than protocol 110 (pre DRBD 9) expect the
+		 * explicit sync-UUID handshake before the resync starts. */
+		if (peer_device->connection->agreed_pro_version < 110)
+			stable_change_repl_state(peer_device, L_WF_SYNC_UUID, CS_VERBOSE,
+					"start-sync");
+		else
+			drbd_start_resync(peer_device, L_SYNC_TARGET, "start-sync");
+		break;
+	case L_STARTING_SYNC_S:
+		drbd_start_resync(peer_device, L_SYNC_SOURCE, "start-sync");
+		break;
+	default:
+		break;
+	}
+}
+
+/* Run the bitmap IO operation @io_fn synchronously from the worker thread.
+ *
+ * Must only be called from the resource worker (asserted below).  Because of
+ * that, IO is suspended via a bare, non-blocking increment of suspend_cnt
+ * rather than drbd_suspend_io(), and either the single bitmap slot of
+ * @peer_device (BM_LOCK_SINGLE_SLOT in @flags) or the whole bitmap is locked
+ * around the call.
+ *
+ * Returns the result of @io_fn, or 0 if the device has no bitmap.
+ */
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+		char *why, enum bm_flag flags,
+		struct drbd_peer_device *peer_device)
+{
+	int rv;
+
+	D_ASSERT(device, current == device->resource->worker.task);
+
+	if (!device->bitmap)
+		return 0;
+
+	/* open coded non-blocking drbd_suspend_io(device); */
+	atomic_inc(&device->suspend_cnt);
+
+	if (flags & BM_LOCK_SINGLE_SLOT)
+		drbd_bm_slot_lock(peer_device, why, flags);
+	else
+		drbd_bm_lock(device, why, flags);
+	rv = io_fn(device, peer_device);
+	if (flags & BM_LOCK_SINGLE_SLOT)
+		drbd_bm_slot_unlock(peer_device);
+	else
+		drbd_bm_unlock(device);
+
+	drbd_resume_io(device);
+
+	return rv;
+}
+
+/* Return true if any connection in @state_change has fencing-related IO
+ * suspension (susp_fen) set in the requested @which state. */
+static bool state_change_is_susp_fen(struct drbd_state_change *state_change,
+					    enum which_state which)
+{
+	unsigned int i;
+
+	for (i = 0; i < state_change->n_connections; i++)
+		if (state_change->connections[i].susp_fen[which])
+			return true;
+
+	return false;
+}
+
+/* Return true if the resource is configured to suspend IO on quorum loss
+ * and at least one device in @state_change lacks quorum in the requested
+ * @which state. */
+static bool state_change_is_susp_quorum(struct drbd_state_change *state_change,
+					       enum which_state which)
+{
+	struct drbd_resource *resource = state_change->resource[0].resource;
+	unsigned int i;
+
+	if (resource->res_opts.on_no_quorum != ONQ_SUSPEND_IO)
+		return false;
+
+	for (i = 0; i < state_change->n_devices; i++)
+		if (!state_change->devices[i].have_quorum[which])
+			return true;
+
+	return false;
+}
+
+/* Combined "resync suspended because of dependencies" predicate, used to
+ * fill the aftr_isp bit of the compat (DRBD 8 style) state word.
+ * True if the resync towards this peer is suspended because of a
+ * dependency or because of another connection, or if we look like a sync
+ * source while our own disk is not usable (<= D_INCONSISTENT). */
+static bool resync_susp_comb_dep_sc(struct drbd_state_change *state_change,
+				    unsigned int n_device, int n_connection,
+				    enum which_state which)
+{
+	/* peer_devices is a flat array indexed [device][connection]. */
+	struct drbd_peer_device_state_change *peer_device_state_change =
+		&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+	struct drbd_device_state_change *device_state_change = &state_change->devices[n_device];
+	bool resync_susp_dependency = peer_device_state_change->resync_susp_dependency[which];
+	bool resync_susp_other_c = peer_device_state_change->resync_susp_other_c[which];
+	enum drbd_repl_state repl_state = peer_device_state_change->repl_state[which];
+	enum drbd_disk_state disk_state = device_state_change->disk_state[which];
+
+	return resync_susp_dependency || resync_susp_other_c ||
+		((repl_state == L_SYNC_SOURCE || repl_state == L_PAUSED_SYNC_S)
+		 && disk_state <= D_INCONSISTENT);
+}
+
+/* Translate the multi-peer [OLD]/[NEW] state-change arrays into a single
+ * DRBD-8-style packed state word, for peers speaking the old protocol and
+ * for compat reporting.
+ *
+ * @n_connection may be -1; then the peer-related fields keep their
+ * "unknown/standalone" defaults and only the local part is filled in.
+ */
+static union drbd_state state_change_word(struct drbd_state_change *state_change,
+					  unsigned int n_device, int n_connection,
+					  enum which_state which)
+{
+	struct drbd_resource_state_change *resource_state_change =
+		&state_change->resource[0];
+	struct drbd_device_state_change *device_state_change =
+		&state_change->devices[n_device];
+	union drbd_state state = { {
+		.role = R_UNKNOWN,
+		.peer = R_UNKNOWN,
+		.conn = C_STANDALONE,
+		.disk = D_UNKNOWN,
+		.pdsk = D_UNKNOWN,
+	} };
+
+	/* quorum-loss and UUID suspension are folded into the plain susp bit,
+	 * since the old state word has no separate flags for them */
+	state.role = resource_state_change->role[which];
+	state.susp = resource_state_change->susp[which] || state_change_is_susp_quorum(state_change, which) ||
+		resource_state_change->susp_uuid[which];
+	state.susp_nod = resource_state_change->susp_nod[which];
+	state.susp_fen = state_change_is_susp_fen(state_change, which);
+	state.quorum = device_state_change->have_quorum[which];
+	state.disk = device_state_change->disk_state[which];
+	if (n_connection != -1) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n_connection];
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+
+		state.peer = connection_state_change->peer_role[which];
+		state.conn = peer_device_state_change->repl_state[which];
+		/* no replication beyond "off": report the connection state
+		 * in the conn field instead */
+		if (state.conn <= L_OFF)
+			state.conn = connection_state_change->cstate[which];
+		state.pdsk = peer_device_state_change->disk_state[which];
+		state.aftr_isp = resync_susp_comb_dep_sc(state_change, n_device, n_connection, which);
+		state.peer_isp = peer_device_state_change->resync_susp_peer[which];
+		state.user_isp = peer_device_state_change->resync_susp_user[which];
+	}
+	return state;
+}
+
+/* Netlink notification callback for a resource-level state change.
+ * @state_change is the full struct drbd_state_change, passed as void * to
+ * match the common notification callback signature used by
+ * notify_state_change(); builds a resource_info from the [NEW] state. */
+int notify_resource_state_change(struct sk_buff *skb,
+				  unsigned int seq,
+				  void *state_change,
+				  enum drbd_notification_type type)
+{
+	struct drbd_resource_state_change *resource_state_change =
+		((struct drbd_state_change *)state_change)->resource;
+	struct drbd_resource *resource = resource_state_change->resource;
+	struct resource_info resource_info = {
+		.res_role = resource_state_change->role[NEW],
+		.res_susp = resource_state_change->susp[NEW],
+		.res_susp_nod = resource_state_change->susp_nod[NEW],
+		.res_susp_fen = state_change_is_susp_fen(state_change, NEW),
+		.res_susp_quorum = state_change_is_susp_quorum(state_change, NEW) ||
+			resource_state_change->susp_uuid[NEW],
+		.res_fail_io = resource_state_change->fail_io[NEW],
+	};
+
+	return notify_resource_state(skb, seq, resource, &resource_info, NULL, type);
+}
+
+/* Netlink notification callback for a connection state change; @state_change
+ * points at the drbd_connection_state_change entry (void * to match the
+ * common callback signature). */
+int notify_connection_state_change(struct sk_buff *skb,
+				    unsigned int seq,
+				    void *state_change,
+				    enum drbd_notification_type type)
+{
+	struct drbd_connection_state_change *csc = state_change;
+	struct connection_info connection_info = {
+		.conn_connection_state = csc->cstate[NEW],
+		.conn_role = csc->peer_role[NEW],
+	};
+
+	return notify_connection_state(skb, seq, csc->connection,
+				       &connection_info, type);
+}
+
+/* Netlink notification callback for a device state change; @state_change
+ * points at the drbd_device_state_change entry (void * to match the common
+ * callback signature). */
+int notify_device_state_change(struct sk_buff *skb,
+				unsigned int seq,
+				void *state_change,
+				enum drbd_notification_type type)
+{
+	struct drbd_device_state_change *dsc = state_change;
+	struct device_info device_info;
+
+	device_state_change_to_info(&device_info, dsc);
+
+	return notify_device_state(skb, seq, dsc->device, &device_info, type);
+}
+
+/* Netlink notification callback for a peer-device state change;
+ * @state_change points at the drbd_peer_device_state_change entry (void *
+ * to match the common callback signature). */
+int notify_peer_device_state_change(struct sk_buff *skb,
+				     unsigned int seq,
+				     void *state_change,
+				     enum drbd_notification_type type)
+{
+	struct drbd_peer_device_state_change *pdsc = state_change;
+	struct peer_device_info peer_device_info;
+
+	peer_device_state_change_to_info(&peer_device_info, pdsc);
+
+	return notify_peer_device_state(skb, seq, pdsc->peer_device,
+					&peer_device_info, type);
+}
+
+/* Send netlink notifications for every object (resource, connections,
+ * devices, peer devices) whose state actually changed in this transition.
+ * The individual events are chained: each intermediate one is sent with
+ * NOTIFY_CONTINUES set and only the very last one as a plain NOTIFY_CHANGE,
+ * so userspace can tell where one state-change broadcast ends. */
+static void notify_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	bool resource_state_has_changed;
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+	int (*last_func)(struct sk_buff *, unsigned int, void *,
+			  enum drbd_notification_type) = NULL;
+	void *last_arg = NULL;
+
+/* True if a field differs between its [OLD] and [NEW] values. */
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
+/* Flush the deferred notification, if any, with the given type. */
+#define FINAL_STATE_CHANGE(type) \
+	({ if (last_func) \
+		last_func(NULL, 0, last_arg, type); \
+	})
+/* Emit the previously remembered notification as "continues", then defer
+ * this one; it is only sent once we know whether it is the last. */
+#define REMEMBER_STATE_CHANGE(func, arg, type) \
+	({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
+	   last_func = (typeof(last_func))func; \
+	   last_arg = arg; \
+	 })
+
+	mutex_lock(&notification_mutex);
+
+	resource_state_has_changed =
+		HAS_CHANGED(resource_state_change->role) ||
+		HAS_CHANGED(resource_state_change->susp) ||
+		HAS_CHANGED(resource_state_change->susp_nod) ||
+		HAS_CHANGED(resource_state_change->susp_uuid) ||
+		state_change_is_susp_fen(state_change, OLD) !=
+		state_change_is_susp_fen(state_change, NEW) ||
+		state_change_is_susp_quorum(state_change, OLD) !=
+		state_change_is_susp_quorum(state_change, NEW) ||
+		HAS_CHANGED(resource_state_change->fail_io);
+
+	if (resource_state_has_changed)
+		REMEMBER_STATE_CHANGE(notify_resource_state_change,
+				      state_change, NOTIFY_CHANGE);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		if (HAS_CHANGED(connection_state_change->peer_role) ||
+		    HAS_CHANGED(connection_state_change->cstate))
+			REMEMBER_STATE_CHANGE(notify_connection_state_change,
+					      connection_state_change, NOTIFY_CHANGE);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		if (HAS_CHANGED(device_state_change->disk_state) ||
+		    HAS_CHANGED(device_state_change->have_quorum))
+			REMEMBER_STATE_CHANGE(notify_device_state_change,
+					      device_state_change, NOTIFY_CHANGE);
+	}
+
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		if (HAS_CHANGED(p->disk_state) ||
+		    HAS_CHANGED(p->repl_state) ||
+		    HAS_CHANGED(p->resync_susp_user) ||
+		    HAS_CHANGED(p->resync_susp_peer) ||
+		    HAS_CHANGED(p->resync_susp_dependency) ||
+		    HAS_CHANGED(p->resync_susp_other_c))
+			REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
+					      p, NOTIFY_CHANGE);
+	}
+
+	/* the last remembered event goes out without NOTIFY_CONTINUES */
+	FINAL_STATE_CHANGE(NOTIFY_CHANGE);
+	mutex_unlock(&notification_mutex);
+
+#undef HAS_CHANGED
+#undef FINAL_STATE_CHANGE
+#undef REMEMBER_STATE_CHANGE
+}
+
+/* Tell all connected peers about our (new) resource role.
+ *
+ * Peers speaking protocol >= 110 (DRBD 9) get a single resource-wide state
+ * packet; older peers get one per-volume compat state word, since before
+ * DRBD 9 the role was a device attribute instead of a resource attribute.
+ */
+static void send_role_to_all_peers(struct drbd_state_change *state_change)
+{
+	unsigned int n_connection;
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n_connection];
+		struct drbd_connection *connection = connection_state_change->connection;
+		enum drbd_conn_state new_cstate = connection_state_change->cstate[NEW];
+
+		if (new_cstate < C_CONNECTED)
+			continue;
+
+		if (connection->agreed_pro_version < 110) {
+			unsigned int n_device;
+
+			/* Before DRBD 9, the role is a device attribute
+			 * instead of a resource attribute. */
+			for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+				/* peer_devices is a flat array indexed
+				 * [device][connection]; indexing by
+				 * n_connection alone would pick the wrong
+				 * peer device for every volume but the first. */
+				struct drbd_peer_device *peer_device =
+					state_change->peer_devices[n_device * state_change->n_connections +
+								   n_connection].peer_device;
+				union drbd_state state =
+					state_change_word(state_change, n_device, n_connection, NEW);
+
+				drbd_send_state(peer_device, state);
+			}
+		} else {
+			union drbd_state state = { {
+				.role = state_change->resource[0].role[NEW],
+			} };
+
+			conn_send_state(connection, state);
+		}
+	}
+}
+
+/* Send the compat (DRBD-8-style) [NEW] state word of device @n_device to
+ * every peer whose replication link is established. */
+static void send_new_state_to_all_peer_devices(struct drbd_state_change *state_change, int n_device)
+{
+	unsigned int n_connection;
+
+	BUG_ON(state_change->n_devices <= n_device);
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		/* peer_devices is a flat array indexed [device][connection] */
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+		struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+		union drbd_state new_state = state_change_word(state_change, n_device, n_connection, NEW);
+
+		if (new_state.conn >= C_CONNECTED)
+			drbd_send_state(peer_device, new_state);
+	}
+}
+
+/* This function is supposed to have the same semantics as drbd_device_stable() in drbd_main.c
+   A primary is stable since it is authoritative.
+   Unstable are neighbors of a primary and resync target nodes.
+   Nodes further away from a primary are stable! Do not confuse with "weak".
+   NOTE(review): @n_device is unused here; presumably kept so the signature
+   matches calc_resync_target() -- TODO confirm. */
+static bool calc_device_stable(struct drbd_state_change *state_change, int n_device, enum which_state which)
+{
+	int n_connection;
+
+	/* we are Primary ourselves: authoritative, hence stable */
+	if (state_change->resource->role[which] == R_PRIMARY)
+		return true;
+
+	/* a directly connected Primary makes us its unstable neighbor */
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n_connection];
+		enum drbd_role *peer_role = connection_state_change->peer_role;
+
+		if (peer_role[which] == R_PRIMARY)
+			return false;
+	}
+
+	return true;
+}
+
+/* Return true if device @n_device is a resync target (or about to become
+ * one) towards any of its peers in the requested @which state. */
+static bool calc_resync_target(struct drbd_state_change *state_change, int n_device, enum which_state which)
+{
+	int i;
+
+	for (i = 0; i < state_change->n_connections; i++) {
+		struct drbd_peer_device_state_change *pdsc =
+			&state_change->peer_devices[n_device * state_change->n_connections + i];
+		enum drbd_repl_state repl = pdsc->repl_state[which];
+
+		if (repl == L_WF_BITMAP_T ||
+		    repl == L_SYNC_TARGET ||
+		    repl == L_PAUSED_SYNC_T)
+			return true;
+	}
+
+	return false;
+}
+
+/* Takes the peer's [OLD]/[NEW] disk states; true if we just lost access to
+ * the peer's (usable) data. */
+static bool lost_contact_to_peer_data(enum drbd_disk_state *peer_disk_state)
+{
+	enum drbd_disk_state os = peer_disk_state[OLD];
+	enum drbd_disk_state ns = peer_disk_state[NEW];
+	bool had_data, has_data;
+
+	/* "has data" means a usable disk: at least Inconsistent, but
+	 * neither Unknown nor Outdated. */
+	had_data = os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED;
+	has_data = ns >= D_INCONSISTENT && ns != D_UNKNOWN && ns != D_OUTDATED;
+
+	return had_data && !has_data;
+}
+
+/* Detect a peer that reappears without a usable disk after a period in
+ * which we lost track of it (@os == D_UNKNOWN), while we hold no bitmap
+ * UUID against that peer (presumably meaning no out-of-sync tracking had
+ * been started towards it -- TODO confirm).  The caller bumps the current
+ * UUID in that case. */
+static bool peer_returns_diskless(struct drbd_peer_device *peer_device,
+				  enum drbd_disk_state os, enum drbd_disk_state ns)
+{
+	struct drbd_device *device = peer_device->device;
+	bool rv = false;
+
+	/* Scenario, starting with normal operation
+	 * Connected Primary/Secondary UpToDate/UpToDate
+	 * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
+	 * ...
+	 * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
+	 */
+
+	/* get_ldev() guards the drbd_bitmap_uuid() access; balanced by put_ldev() */
+	if (get_ldev(device)) {
+		if (os == D_UNKNOWN && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED) &&
+		    drbd_bitmap_uuid(peer_device) == 0)
+			rv = true;
+		put_ldev(device);
+	}
+	return rv;
+}
+
+/* After a fencing-related state change, decide whether IO that was frozen
+ * by fencing on connection @n_connection may resume: either because every
+ * peer disk is now at most Outdated (the fence-peer handler succeeded), or
+ * because the replication links are fully established again. */
+static void check_may_resume_io_after_fencing(struct drbd_state_change *state_change, int n_connection)
+{
+	struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection];
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	struct drbd_connection *connection = connection_state_change->connection;
+	struct drbd_resource *resource = resource_state_change->resource;
+	bool all_peer_disks_outdated = true;
+	bool all_peer_disks_connected = true;
+	struct drbd_peer_device *peer_device;
+	unsigned long irq_flags;
+	int vnr, n_device;
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n_device * state_change->n_connections + n_connection];
+		enum drbd_repl_state *repl_state = peer_device_state_change->repl_state;
+		enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state;
+
+		if (peer_disk_state[NEW] > D_OUTDATED)
+			all_peer_disks_outdated = false;
+		if (repl_state[NEW] < L_ESTABLISHED)
+			all_peer_disks_connected = false;
+	}
+
+	/* case1: The outdate peer handler is successful: */
+	if (all_peer_disks_outdated) {
+		rcu_read_lock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+			if (test_and_clear_bit(NEW_CUR_UUID, &device->flags)) {
+				/* drbd_uuid_new_current() may block; pin the
+				 * device with a kref and drop the RCU read
+				 * lock around the call */
+				kref_get(&device->kref);
+				rcu_read_unlock();
+				drbd_uuid_new_current(device, false);
+				kref_put(&device->kref, drbd_destroy_device);
+				rcu_read_lock();
+			}
+		}
+		rcu_read_unlock();
+		begin_state_change(resource, &irq_flags, CS_VERBOSE);
+		__change_io_susp_fencing(connection, false);
+		end_state_change(resource, &irq_flags, "after-fencing");
+	}
+	/* case2: The connection was established again: */
+	if (all_peer_disks_connected) {
+		rcu_read_lock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+			clear_bit(NEW_CUR_UUID, &device->flags);
+		}
+		rcu_read_unlock();
+		begin_state_change(resource, &irq_flags, CS_VERBOSE);
+		__change_io_susp_fencing(connection, false);
+		end_state_change(resource, &irq_flags, "after-fencing");
+	}
+}
+
+/*
+ * Return true when the "unfence-peer" handler should run for this
+ * connection: every volume is D_UP_TO_DATE on both sides in the NEW
+ * state, and at least one peer disk just transitioned to D_UP_TO_DATE
+ * (so we unfence exactly once, on the final transition).
+ */
+static bool drbd_should_unfence(struct drbd_state_change *state_change, int n_connection)
+{
+	bool peer_disk_just_became_uptodate = false;
+	int i;
+
+	for (i = 0; i < state_change->n_devices; i++) {
+		struct drbd_device_state_change *dev_sc = &state_change->devices[i];
+		struct drbd_peer_device_state_change *pd_sc =
+			&state_change->peer_devices[
+				i * state_change->n_connections + n_connection];
+
+		/* Do not unfence if some volume is not yet up-to-date. */
+		if (dev_sc->disk_state[NEW] != D_UP_TO_DATE)
+			return false;
+		if (pd_sc->disk_state[NEW] != D_UP_TO_DATE)
+			return false;
+
+		/* Only unfence when the final volume becomes up-to-date. */
+		if (pd_sc->disk_state[OLD] != D_UP_TO_DATE)
+			peer_disk_just_became_uptodate = true;
+	}
+
+	return peer_disk_just_became_uptodate;
+}
+
+/*
+ * Decide whether the resync towards this peer should be checksum based.
+ *
+ * Requires protocol >= 89 and a configured csums algorithm; then it is
+ * used either unconditionally, or (with csums-after-crash-only) only
+ * after a primary crash.
+ */
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+	bool crash_only;
+
+	if (connection->agreed_pro_version < 89)	/* not supported */
+		return false;
+	if (!connection->csums_tfm)			/* not configured */
+		return false;
+
+	rcu_read_lock();
+	crash_only = rcu_dereference(connection->transport.net_conf)->csums_after_crash_only;
+	rcu_read_unlock();
+
+	return !crash_only || test_bit(CRASHED_PRIMARY, &device->flags);
+}
+
+/*
+ * Bookkeeping to perform when a resync starts, with @repl_state the
+ * sync-source or sync-target replication state just committed.
+ *
+ * On the sync-target side this exposes the peer's current UUID, decides
+ * about checksum based resync, and flags the resync as unstable when no
+ * stable sync source is present.  It also contains the pre-protocol-96
+ * sync-UUID handling and the empty-resync workaround for protocol < 95.
+ */
+static void drbd_run_resync(struct drbd_peer_device *peer_device, enum drbd_repl_state repl_state)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_bitmap *bm = device->bitmap;
+	struct drbd_connection *connection = peer_device->connection;
+	enum drbd_repl_state side = repl_is_sync_target(repl_state) ? L_SYNC_TARGET : L_SYNC_SOURCE;
+
+	drbd_info(peer_device, "Began resync as %s (will sync %llu KB [%lu bits set]).\n",
+			drbd_repl_str(repl_state),
+			bm_bit_to_kb(bm, peer_device->rs_total),
+			(unsigned long) peer_device->rs_total);
+
+	if (side == L_SYNC_TARGET)
+		drbd_uuid_set_exposed(device, peer_device->current_uuid, false);
+
+	/* Checksum based resync is only meaningful on the receiving side. */
+	peer_device->use_csums = side == L_SYNC_TARGET ?
+		use_checksum_based_resync(connection, device) : false;
+
+	if (side == L_SYNC_TARGET &&
+			!(peer_device->uuid_flags & UUID_FLAG_STABLE) &&
+			!drbd_stable_sync_source_present(peer_device, NOW))
+		set_bit(UNSTABLE_RESYNC, &peer_device->flags);
+
+	/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+	 * with w_send_oos, or the sync target will get confused as to
+	 * how much bits to resync.  We cannot do that always, because for an
+	 * empty resync and protocol < 95, we need to do it here, as we call
+	 * drbd_resync_finished from here in that case.
+	 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+	 * and from after_state_ch otherwise. */
+	if (side == L_SYNC_SOURCE && connection->agreed_pro_version < 96)
+		drbd_gen_and_send_sync_uuid(peer_device);
+
+	if (connection->agreed_pro_version < 95 && peer_device->rs_total == 0) {
+		/* This still has a race (about when exactly the peers
+		 * detect connection loss) that can lead to a full sync
+		 * on next handshake. In 8.3.9 we fixed this with explicit
+		 * resync-finished notifications, but the fix
+		 * introduces a protocol change.  Sleeping for some
+		 * time longer than the ping interval + timeout on the
+		 * SyncSource, to give the SyncTarget the chance to
+		 * detect connection loss, then waiting for a ping
+		 * response (implicit in drbd_resync_finished) reduces
+		 * the race considerably, but does not solve it. */
+		if (side == L_SYNC_SOURCE) {
+			struct net_conf *nc;
+			int timeo;
+
+			rcu_read_lock();
+			nc = rcu_dereference(connection->transport.net_conf);
+			timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+			rcu_read_unlock();
+			schedule_timeout_interruptible(timeo);
+		}
+		drbd_resync_finished(peer_device, D_MASK);
+	}
+
+	/* ns.conn may already be != peer_device->repl_state[NOW],
+	 * we may have been paused in between, or become paused until
+	 * the timer triggers.
+	 * No matter, that is handled in resync_timer_fn() */
+	if (repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T)
+		drbd_uuid_resync_starting(peer_device);
+
+	drbd_md_sync_if_dirty(device);
+}
+
+
+/*
+ * Perform after state change actions that may sleep.
+ *
+ * Runs in worker context once per committed state transaction.  It walks
+ * the recorded OLD/NEW snapshot of the resource, its devices, connections
+ * and peer devices, and performs all follow-up work in one pass:
+ * notifications, UUID handling, bitmap I/O, resync start/stop, and
+ * fencing/quorum related I/O resumption.
+ */
+static int w_after_state_change(struct drbd_work *w, int unused)
+{
+	struct after_state_change_work *work =
+		container_of(w, struct after_state_change_work, w);
+	struct drbd_state_change *state_change = work->state_change;
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	struct drbd_resource *resource = resource_state_change->resource;
+	enum drbd_role *role = resource_state_change->role;
+	bool *susp_uuid = resource_state_change->susp_uuid;
+	struct drbd_peer_device *send_state_others = NULL;
+	int n_device, n_connection;
+	bool still_connected = false;
+	bool try_become_up_to_date = false;
+	bool healed_primary = false;
+	bool send_flush_requests = false;
+
+	/* Tell userspace (netlink) about the state change before acting on it. */
+	notify_state_change(state_change);
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change = &state_change->devices[n_device];
+		struct drbd_device *device = device_state_change->device;
+		enum drbd_disk_state *disk_state = device_state_change->disk_state;
+		bool have_ldev = extra_ldev_ref_for_after_state_chg(disk_state);
+		bool *have_quorum = device_state_change->have_quorum;
+		bool effective_disk_size_determined = false;
+		bool device_stable[2], resync_target[2];
+		bool data_accessible[2];
+		bool all_peer_replication[2];
+		bool resync_finished = false;
+		bool some_peer_demoted = false;
+		bool new_current_uuid = false;
+		enum which_state which;
+
+		/* Aggregate per-device properties over all peers, for OLD and NEW. */
+		for (which = OLD; which <= NEW; which++) {
+			device_stable[which] = calc_device_stable(state_change, n_device, which);
+			resync_target[which] = calc_resync_target(state_change, n_device, which);
+			data_accessible[which] =
+				calc_data_accessible(state_change, n_device, which);
+			all_peer_replication[which] =
+				drbd_all_peer_replication_change(state_change, n_device, which);
+
+		}
+
+		if (disk_state[NEW] == D_UP_TO_DATE)
+			effective_disk_size_determined = true;
+
+		/* First peer pass: gather facts that influence the second pass. */
+		for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+			struct drbd_peer_device_state_change *peer_device_state_change =
+				&state_change->peer_devices[
+					n_device * state_change->n_connections + n_connection];
+			struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+			enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state;
+			enum drbd_repl_state *repl_state = peer_device_state_change->repl_state;
+
+			if ((repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) &&
+			    repl_state[NEW] == L_ESTABLISHED)
+				resync_finished = true;
+
+			if (disk_state[OLD] == D_INCONSISTENT && disk_state[NEW] == D_UP_TO_DATE &&
+			    peer_disk_state[OLD] == D_INCONSISTENT && peer_disk_state[NEW] == D_UP_TO_DATE)
+				send_state_others = peer_device;
+
+			/* connect without resync or remote attach without resync */
+			if (disk_state[NOW] >= D_OUTDATED && repl_state[NEW] == L_ESTABLISHED &&
+			    ((repl_state[OLD] == L_OFF && peer_disk_state[NEW] >= D_OUTDATED) ||
+			     (peer_disk_state[OLD] == D_DISKLESS && peer_disk_state[NEW] >= D_OUTDATED))) {
+				u64 peer_current_uuid = peer_device->current_uuid & ~UUID_PRIMARY;
+				u64 my_current_uuid = drbd_current_uuid(device) & ~UUID_PRIMARY;
+
+				if (peer_current_uuid == my_current_uuid && get_ldev(device)) {
+					down_write(&device->uuid_sem);
+					drbd_uuid_set_bitmap(peer_device, 0);
+					up_write(&device->uuid_sem);
+					drbd_print_uuids(peer_device, "cleared bm UUID and bitmap");
+					drbd_bitmap_io_from_worker(device, &drbd_bmio_clear_one_peer,
+								   "clearing bm one peer", BM_LOCK_CLEAR | BM_LOCK_BULK,
+								   peer_device);
+					put_ldev(device);
+				}
+			}
+		}
+
+		if (role[NEW] == R_PRIMARY && !data_accessible[OLD] && data_accessible[NEW])
+			healed_primary = true;
+
+		/* Second peer pass: act on the state change towards each peer. */
+		for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+			struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection];
+			struct drbd_connection *connection = connection_state_change->connection;
+			enum drbd_conn_state *cstate = connection_state_change->cstate;
+			enum drbd_role *peer_role = connection_state_change->peer_role;
+			struct drbd_peer_device_state_change *peer_device_state_change =
+				&state_change->peer_devices[
+					n_device * state_change->n_connections + n_connection];
+			struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+			enum drbd_repl_state *repl_state = peer_device_state_change->repl_state;
+			enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state;
+			bool *resync_susp_user = peer_device_state_change->resync_susp_user;
+			bool *resync_susp_peer = peer_device_state_change->resync_susp_peer;
+			bool *resync_susp_dependency = peer_device_state_change->resync_susp_dependency;
+			union drbd_state new_state =
+				state_change_word(state_change, n_device, n_connection, NEW);
+			bool send_uuids, send_state = false;
+
+			/* In case we finished a resync as resync-target update all neighbors
+			 * about having a bitmap_uuid of 0 towards the previous sync-source.
+			 * That needs to go out before sending the new disk state
+			 * to avoid a race where the other node might downgrade our disk
+			 * state due to old UUID values.
+			 *
+			 * Also check the replication state to ensure that we
+			 * do not send these extra UUIDs before the initial
+			 * handshake. */
+			send_uuids = resync_finished &&
+				peer_disk_state[NEW] != D_UNKNOWN &&
+				repl_state[NEW] > L_OFF;
+
+			/* Send UUIDs again if they changed while establishing the connection */
+			if (repl_state[OLD] == L_OFF && repl_state[NEW] > L_OFF &&
+			    peer_device->comm_current_uuid != drbd_resolved_uuid(peer_device, NULL))
+				send_uuids = true;
+
+			if (repl_state[NEW] > L_OFF && device_stable[OLD] != device_stable[NEW])
+				send_uuids = true;
+
+			if (send_uuids)
+				drbd_send_uuids(peer_device, 0, 0);
+
+			if (peer_disk_state[NEW] == D_UP_TO_DATE)
+				effective_disk_size_determined = true;
+
+			if (!(role[OLD] == R_PRIMARY && !data_accessible[OLD]) &&
+			     (role[NEW] == R_PRIMARY && !data_accessible[NEW]) &&
+			    !test_bit(UNREGISTERED, &device->flags))
+				drbd_maybe_khelper(device, connection, "pri-on-incon-degr");
+
+			/* Became sync source.  With protocol >= 96, we still need to send out
+			 * the sync uuid now. Need to do that before any drbd_send_state, or
+			 * the other side may go "paused sync" before receiving the sync uuids,
+			 * which is unexpected. */
+			if (!(repl_state[OLD] == L_SYNC_SOURCE || repl_state[OLD] == L_PAUSED_SYNC_S) &&
+			     (repl_state[NEW] == L_SYNC_SOURCE || repl_state[NEW] == L_PAUSED_SYNC_S) &&
+			    connection->agreed_pro_version >= 96 && connection->agreed_pro_version < 110 &&
+			    get_ldev(device)) {
+				drbd_gen_and_send_sync_uuid(peer_device);
+				put_ldev(device);
+			}
+
+			/* Do not change the order of the if above and the two below... */
+			if (peer_disk_state[OLD] < D_NEGOTIATING &&
+			    peer_disk_state[NEW] == D_NEGOTIATING) { /* attach on the peer */
+				/* we probably will start a resync soon.
+				 * make sure those things are properly reset. */
+				peer_device->rs_total = 0;
+				peer_device->rs_failed = 0;
+
+				drbd_send_uuids(peer_device, 0, 0);
+				drbd_send_state(peer_device, new_state);
+			}
+			/* No point in queuing send_bitmap if we don't have a connection
+			 * anymore, so check also the _current_ state, not only the new state
+			 * at the time this work was queued. */
+			if (repl_state[OLD] != L_WF_BITMAP_S && repl_state[NEW] == L_WF_BITMAP_S &&
+			    peer_device->repl_state[NOW] == L_WF_BITMAP_S) {
+				/* Now that the connection is L_WF_BITMAP_S,
+				 * new requests will be sent to the peer as
+				 * P_OUT_OF_SYNC packets. However, active
+				 * requests may not have been communicated to
+				 * the peer and may not yet be marked in the
+				 * local bitmap. Mark these requests in the
+				 * bitmap before reading and sending that
+				 * bitmap. This may set bits unnecessarily, but
+				 * it does no harm to resync a small amount of
+				 * additional data. */
+				drbd_set_pending_out_of_sync(peer_device);
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+						"send_bitmap (WFBitMapS)",
+						BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK | BM_LOCK_SINGLE_SLOT,
+						peer_device);
+			}
+
+			if (peer_role[OLD] == R_PRIMARY && peer_role[NEW] == R_SECONDARY)
+				some_peer_demoted = true;
+
+			/* Last part of the attaching process ... */
+			if (cstate[NEW] == C_CONNECTED && /* repl_state[NEW] might still be L_OFF */
+			    disk_state[OLD] == D_ATTACHING && disk_state[NEW] >= D_NEGOTIATING) {
+				drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+				drbd_send_uuids(peer_device, 0, 0);
+				drbd_send_state(peer_device, new_state);
+			}
+
+			/* Started resync, tell peer if drbd9 */
+			if (repl_state[NEW] >= L_SYNC_SOURCE && repl_state[NEW] <= L_PAUSED_SYNC_T &&
+			    (repl_state[OLD] < L_SYNC_SOURCE || repl_state[OLD] > L_PAUSED_SYNC_T))
+				send_state = true;
+
+			/* We want to pause/continue resync, tell peer. */
+			if (repl_state[NEW] >= L_ESTABLISHED &&
+			    ((resync_susp_comb_dep_sc(state_change, n_device, n_connection, OLD) !=
+			      resync_susp_comb_dep_sc(state_change, n_device, n_connection, NEW)) ||
+			     (resync_susp_user[OLD] != resync_susp_user[NEW])))
+				send_state = true;
+
+			/* finished resync, tell sync source */
+			if ((repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) &&
+			    repl_state[NEW] == L_ESTABLISHED)
+				send_state = true;
+
+			/* In case one of the isp bits got set, suspend other devices. */
+			if (!(resync_susp_dependency[OLD] || resync_susp_peer[OLD] || resync_susp_user[OLD]) &&
+			     (resync_susp_dependency[NEW] || resync_susp_peer[NEW] || resync_susp_user[NEW]))
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				suspend_other_sg(device);
+
+			/* Make sure the peer gets informed about eventual state
+			   changes (ISP bits) while we were in L_OFF. */
+			if (repl_state[OLD] == L_OFF && repl_state[NEW] >= L_ESTABLISHED)
+				send_state = true;
+
+			if (repl_state[OLD] != L_AHEAD && repl_state[NEW] == L_AHEAD)
+				send_state = true;
+
+			/* We are in the progress to start a full sync. SyncTarget sets all slots. */
+			if (repl_state[OLD] != L_STARTING_SYNC_T && repl_state[NEW] == L_STARTING_SYNC_T)
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				drbd_queue_bitmap_io(device,
+					&drbd_bmio_set_all_n_write, &abw_start_sync,
+					"set_n_write from StartingSync",
+					BM_LOCK_CLEAR | BM_LOCK_BULK,
+					peer_device);
+
+			/* We are in the progress to start a full sync. SyncSource one slot. */
+			if (repl_state[OLD] != L_STARTING_SYNC_S && repl_state[NEW] == L_STARTING_SYNC_S)
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				drbd_queue_bitmap_io(device,
+					&drbd_bmio_set_n_write, &abw_start_sync,
+					"set_n_write from StartingSync",
+					BM_LOCK_CLEAR | BM_LOCK_BULK,
+					peer_device);
+
+			/* Disks got bigger while they were detached */
+			if (disk_state[NEW] > D_NEGOTIATING && peer_disk_state[NEW] > D_NEGOTIATING &&
+			    test_and_clear_bit(RESYNC_AFTER_NEG, &peer_device->flags)) {
+				if (repl_state[NEW] == L_ESTABLISHED)
+					resync_after_online_grow(peer_device);
+			}
+
+			/* A resync finished or aborted, wake paused devices... */
+			if ((repl_state[OLD] > L_ESTABLISHED && repl_state[NEW] <= L_ESTABLISHED) ||
+			    (resync_susp_peer[OLD] && !resync_susp_peer[NEW]) ||
+			    (resync_susp_user[OLD] && !resync_susp_user[NEW]))
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				resume_next_sg(device);
+
+			/* sync target done with resync. Explicitly notify all peers. Our sync
+			   source should even know by himself, but the others need that info. */
+			if (disk_state[OLD] < D_UP_TO_DATE && repl_state[OLD] >= L_SYNC_SOURCE && repl_state[NEW] == L_ESTABLISHED)
+				send_new_state_to_all_peer_devices(state_change, n_device);
+
+			/* Outdated myself, or became D_UP_TO_DATE tell peers
+			 * Do not do it, when the local node was forced from R_SECONDARY to R_PRIMARY,
+			 * because that is part of the 2-phase-commit and that is necessary to trigger
+			 * the initial resync. */
+			if ((disk_state[NEW] >= D_INCONSISTENT && disk_state[NEW] != disk_state[OLD] &&
+			     repl_state[OLD] >= L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED) &&
+			    !(role[OLD] == R_SECONDARY && role[NEW] == R_PRIMARY))
+				send_state = true;
+
+			/* diskless peers need to be informed about quorum changes, since they consider
+			   the quorum state of the diskfull nodes. */
+			if (have_quorum[OLD] != have_quorum[NEW] && disk_state[NEW] >= D_INCONSISTENT)
+				send_state = true;
+
+			/* Skipped resync with peer_device, tell others... */
+			if (send_state_others && send_state_others != peer_device)
+				send_state = true;
+
+			/* This triggers bitmap writeout of potentially still unwritten pages
+			 * if the resync finished cleanly, or aborted because of peer disk
+			 * failure, or on transition from resync back to AHEAD/BEHIND.
+			 *
+			 * Connection loss is handled in conn_disconnect() by the receiver.
+			 *
+			 * For resync aborted because of local disk failure, we cannot do
+			 * any bitmap writeout anymore.
+			 *
+			 * No harm done if some bits change during this phase.
+			 */
+			if ((repl_state[OLD] > L_ESTABLISHED && repl_state[OLD] < L_AHEAD) &&
+			    (repl_state[NEW] == L_ESTABLISHED || repl_state[NEW] >= L_AHEAD) &&
+			    get_ldev(device)) {
+				drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+					"write from resync_finished", BM_LOCK_BULK,
+					NULL);
+				put_ldev(device);
+			}
+
+			/* Verify finished, or reached stop sector.  Peer did not know about
+			 * the stop sector, and we may even have changed the stop sector during
+			 * verify to interrupt/stop early.  Send the new state. */
+			if (repl_state[OLD] == L_VERIFY_S && repl_state[NEW] == L_ESTABLISHED
+			    && verify_can_do_stop_sector(peer_device))
+				send_new_state_to_all_peer_devices(state_change, n_device);
+
+			if (disk_state[NEW] == D_DISKLESS &&
+			    cstate[NEW] == C_STANDALONE &&
+			    role[NEW] == R_SECONDARY) {
+				if (resync_susp_dependency[OLD] != resync_susp_dependency[NEW])
+					/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg */
+					resume_next_sg(device);
+			}
+
+			if (device_stable[OLD] && !device_stable[NEW] &&
+			    repl_state[NEW] >= L_ESTABLISHED && get_ldev(device)) {
+				/* Inform peers about being unstable...
+				   Maybe it would be a better idea to have the stable bit as
+				   part of the state (and being sent with the state) */
+				drbd_send_uuids(peer_device, 0, 0);
+				put_ldev(device);
+			}
+
+			if (send_state && cstate[NEW] == C_CONNECTED)
+				drbd_send_state(peer_device, new_state);
+
+			if (((!device_stable[OLD] && device_stable[NEW]) ||
+			     (resync_target[OLD] && !resync_target[NEW] && device_stable[NEW])) &&
+			    !(repl_state[OLD] == L_SYNC_TARGET || repl_state[OLD] == L_PAUSED_SYNC_T) &&
+			    !(peer_role[OLD] == R_PRIMARY) && disk_state[NEW] >= D_OUTDATED &&
+			    repl_state[NEW] >= L_ESTABLISHED &&
+			    get_ldev(device)) {
+				/* Offer all peers a resync, with the exception of ...
+				   ... the node that made me up-to-date (with a resync)
+				   ... I was primary
+				   ... the peer that transitioned from primary to secondary
+				*/
+				drbd_send_uuids(peer_device, UUID_FLAG_GOT_STABLE, 0);
+				put_ldev(device);
+			}
+
+			if (peer_disk_state[OLD] == D_UP_TO_DATE &&
+			    (peer_disk_state[NEW] == D_FAILED || peer_disk_state[NEW] == D_INCONSISTENT) &&
+			    test_and_clear_bit(NEW_CUR_UUID, &device->flags))
+				/* When a peer disk goes from D_UP_TO_DATE to D_FAILED or D_INCONSISTENT
+				   we know that a write failed on that node. Therefore we need to create
+				   the new UUID right now (not wait for the next write to come in) */
+				new_current_uuid = true;
+
+			if (disk_state[OLD] > D_FAILED && disk_state[NEW] == D_FAILED &&
+			    role[NEW] == R_PRIMARY && test_and_clear_bit(NEW_CUR_UUID, &device->flags))
+				new_current_uuid = true;
+
+			if (repl_state[OLD] != L_VERIFY_S && repl_state[NEW] == L_VERIFY_S) {
+				drbd_info(peer_device, "Starting Online Verify from sector %llu\n",
+						(unsigned long long)peer_device->ov_position);
+				drbd_queue_work_if_unqueued(
+						&peer_device->connection->sender_work,
+						&peer_device->resync_work);
+			}
+
+			if (!repl_is_sync(repl_state[OLD]) && repl_is_sync(repl_state[NEW]))
+				/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+				drbd_run_resync(peer_device, repl_state[NEW]);
+
+			if (repl_is_sync(repl_state[OLD]) && !repl_is_sync(repl_state[NEW]))
+				drbd_last_resync_request(peer_device, false);
+
+			if (peer_device_state_change->repl_state[OLD] != L_SYNC_TARGET &&
+					peer_device_state_change->repl_state[NEW] == L_SYNC_TARGET)
+				drbd_queue_work_if_unqueued(
+						&peer_device->connection->sender_work,
+						&peer_device->resync_work);
+
+			if (!(repl_is_sync_target(repl_state[OLD]) &&
+					all_peer_replication[OLD]) &&
+					repl_is_sync_target(repl_state[NEW]) &&
+					all_peer_replication[NEW])
+				send_flush_requests = true;
+
+			if (!peer_device_state_change->peer_replication[OLD] &&
+					peer_device_state_change->peer_replication[NEW])
+				drbd_send_enable_replication(peer_device, true);
+		}
+
+		if (((role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY) || some_peer_demoted) &&
+		    get_ldev(device)) {
+			/* The some_peer_demoted case is superseded by
+			 * handle_neighbor_demotion(). We keep this call for
+			 * compatibility until support for protocol version 121
+			 * is removed.
+			 *
+			 * No changes to the bitmap expected after this point, so write out any
+			 * changes up to now to ensure that the metadata disk has the full
+			 * bitmap content. Even if the bitmap changes (e.g. it was dual primary)
+			 * no harm was done if it did change. */
+			drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+						   "demote", BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK,
+						   NULL);
+			put_ldev(device);
+		}
+
+		/* Make sure the effective disk size is stored in the metadata
+		 * if a local disk is attached and either the local disk state
+		 * or a peer disk state is D_UP_TO_DATE.  */
+		if (effective_disk_size_determined && get_ldev(device)) {
+			sector_t size = get_capacity(device->vdisk);
+			if (device->ldev->md.effective_size != size) {
+				char ppb[10];
+
+				drbd_info(device, "persisting effective size = %s (%llu KB)\n",
+				     ppsize(ppb, size >> 1),
+				     (unsigned long long)size >> 1);
+				device->ldev->md.effective_size = size;
+				drbd_md_mark_dirty(device);
+			}
+			put_ldev(device);
+		}
+
+		/* first half of local IO error, failure to attach,
+		 * or administrative detach */
+		if ((disk_state[OLD] != D_FAILED && disk_state[NEW] == D_FAILED) ||
+		    (disk_state[OLD] != D_DETACHING && disk_state[NEW] == D_DETACHING)) {
+			enum drbd_io_error_p eh = EP_PASS_ON;
+			int was_io_error = 0;
+
+			/* Our cleanup here with the transition to D_DISKLESS.
+			 * It is still not safe to dereference ldev here, since
+			 * we might come from an failed Attach before ldev was set. */
+			/* ldev_safe: ref from extra_ldev_ref_for_after_state_chg() */
+			if (have_ldev && device->ldev) {
+				rcu_read_lock();
+				eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+				rcu_read_unlock();
+
+				was_io_error = disk_state[NEW] == D_FAILED;
+
+				/* Intentionally call this handler first, before drbd_send_state().
+				 * See: 2932204 drbd: call local-io-error handler early
+				 * People may chose to hard-reset the box from this handler.
+				 * It is useful if this looks like a "regular node crash". */
+				if (was_io_error && eh == EP_CALL_HELPER)
+					drbd_maybe_khelper(device, NULL, "local-io-error");
+
+				/* Immediately allow completion of all application IO,
+				 * that waits for completion from the local disk,
+				 * if this was a force-detach due to disk_timeout
+				 * or administrator request (drbdsetup detach --force).
+				 * Do NOT abort otherwise.
+				 * Aborting local requests may cause serious problems,
+				 * if requests are completed to upper layers already,
+				 * and then later the already submitted local bio completes.
+				 * This can cause DMA into former bio pages that meanwhile
+				 * have been re-used for other things.
+				 * So aborting local requests may cause crashes,
+				 * or even worse, silent data corruption.
+				 */
+				if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+					tl_abort_disk_io(device);
+
+				send_new_state_to_all_peer_devices(state_change, n_device);
+
+				/* In case we want to get something to stable storage still,
+				 * this may be the last chance.
+				 * Following put_ldev may transition to D_DISKLESS. */
+				drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+						"detach", BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK,
+						NULL);
+				drbd_md_sync_if_dirty(device);
+			}
+		}
+
+		/* second half of local IO error, failure to attach,
+		 * or administrative detach,
+		 * after local_cnt references have reached zero again */
+		if (disk_state[OLD] != D_DISKLESS && disk_state[NEW] == D_DISKLESS) {
+			/* We must still be diskless,
+			 * re-attach has to be serialized with this! */
+			if (device->disk_state[NOW] != D_DISKLESS)
+				drbd_err(device,
+					"ASSERT FAILED: disk is %s while going diskless\n",
+					drbd_disk_str(device->disk_state[NOW]));
+
+			/* we may need to cancel the md_sync timer */
+			timer_delete_sync(&device->md_sync_timer);
+
+			if (have_ldev)
+				send_new_state_to_all_peer_devices(state_change, n_device);
+		}
+
+		if (have_ldev)
+			put_ldev(device);
+
+		/* Notify peers that I had a local IO error and did not detach. */
+		if (disk_state[OLD] == D_UP_TO_DATE && disk_state[NEW] == D_INCONSISTENT)
+			send_new_state_to_all_peer_devices(state_change, n_device);
+
+		/* Testing EMPTY_TWOPC_PENDING would cause more queuing than necessary */
+		if (should_try_become_up_to_date(device, disk_state, NOW))
+			try_become_up_to_date = true;
+
+		if (test_bit(TRY_TO_GET_RESYNC, &device->flags)) {
+			/* Got connected to a diskless primary */
+			clear_bit(TRY_TO_GET_RESYNC, &device->flags);
+			drbd_try_to_get_resynced(device);
+		}
+
+		drbd_md_sync_if_dirty(device);
+
+		if (role[NEW] == R_PRIMARY && have_quorum[OLD] && !have_quorum[NEW])
+			drbd_maybe_khelper(device, NULL, "quorum-lost");
+
+		if (!susp_uuid[OLD] && susp_uuid[NEW] &&
+		    test_and_clear_bit(NEW_CUR_UUID, &device->flags))
+			new_current_uuid = true;
+
+		if (new_current_uuid)
+			drbd_uuid_new_current(device, false);
+
+		if (disk_state[OLD] > D_DISKLESS && disk_state[NEW] == D_DISKLESS)
+			drbd_reconsider_queue_parameters(device, NULL);
+	}
+
+	if (role[OLD] == R_PRIMARY && role[NEW] == R_SECONDARY)
+		send_role_to_all_peers(state_change);
+
+	/* Per-connection actions: flush requests, receiver startup, fencing. */
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection];
+		struct drbd_connection *connection = connection_state_change->connection;
+		enum drbd_conn_state *cstate = connection_state_change->cstate;
+		bool *susp_fen = connection_state_change->susp_fen;
+		enum drbd_fencing_policy fencing_policy;
+
+		if (connection_state_change->peer_role[NEW] == R_PRIMARY && send_flush_requests &&
+				connection->agreed_pro_version >= 123) {
+			u64 current_flush_sequence;
+
+			spin_lock_irq(&resource->initiator_flush_lock);
+			/* Requirement: At least the value from the corresponding state change */
+			current_flush_sequence = resource->current_flush_sequence;
+			spin_unlock_irq(&resource->initiator_flush_lock);
+
+			drbd_send_flush_requests(connection, current_flush_sequence);
+		}
+
+		/* Upon network configuration, we need to start the receiver */
+		if (cstate[OLD] == C_STANDALONE && cstate[NEW] == C_UNCONNECTED)
+			drbd_thread_start(&connection->receiver);
+
+		if (susp_fen[NEW])
+			check_may_resume_io_after_fencing(state_change, n_connection);
+
+		rcu_read_lock();
+		fencing_policy = connection->fencing_policy;
+		rcu_read_unlock();
+		if (fencing_policy != FP_DONT_CARE &&
+				drbd_should_unfence(state_change, n_connection))
+			drbd_maybe_khelper(NULL, connection, "unfence-peer");
+	}
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change = &state_change->connections[n_connection];
+		enum drbd_conn_state *cstate = connection_state_change->cstate;
+
+		if (cstate[NEW] == C_CONNECTED || cstate[NEW] == C_CONNECTING)
+			still_connected = true;
+	}
+
+	/* The uuid suspension only lasted until the new current UUID was
+	 * generated above; release it again via a state transaction. */
+	if (susp_uuid[NEW]) {
+		unsigned long irq_flags;
+
+		begin_state_change(resource, &irq_flags, CS_VERBOSE);
+		resource->susp_uuid[NEW] = false;
+		end_state_change(resource, &irq_flags, "susp-uuid");
+	}
+
+	if (try_become_up_to_date || healed_primary)
+		drbd_schedule_empty_twopc(resource);
+
+	/* No connection left: let a pending two-phase commit time out now. */
+	if (!still_connected)
+		mod_timer_pending(&resource->twopc_timer, jiffies);
+
+	if (work->done)
+		complete(work->done);
+	forget_state_change(state_change);
+	kfree(work);
+
+	return 0;
+}
+
+/*
+ * A state change is purely local (no cluster-wide two-phase commit)
+ * when it is forced (CS_HARD) or explicitly marked CS_LOCAL_ONLY.
+ */
+static bool local_state_change(enum chg_state_flags flags)
+{
+	if (flags & CS_HARD)
+		return true;
+	return flags & CS_LOCAL_ONLY;
+}
+
+/*
+ * Send a state change request to @connection (vnr == -1 requests a
+ * connection-wide change, otherwise a per-volume change).
+ *
+ * Returns SS_CW_SUCCESS when the request went out (TWOPC_PREPARED is
+ * set and a reply must be collected), SS_SUCCESS otherwise: both when
+ * sending failed and when the peer is not connected at all, the state
+ * change simply proceeds without that peer.
+ */
+static enum drbd_state_rv
+__peer_request(struct drbd_connection *connection, int vnr,
+	       union drbd_state mask, union drbd_state val)
+{
+	enum drbd_packet cmd;
+
+	if (connection->cstate[NOW] != C_CONNECTED)
+		return SS_SUCCESS;
+
+	cmd = (vnr == -1) ? P_CONN_ST_CHG_REQ : P_STATE_CHG_REQ;
+	if (conn_send_state_req(connection, vnr, cmd, mask, val))
+		return SS_SUCCESS;
+
+	set_bit(TWOPC_PREPARED, &connection->flags);
+	return SS_CW_SUCCESS;
+}
+
+/*
+ * Evaluate the peer's reply to a state change request sent with
+ * __peer_request().
+ *
+ * Returns SS_CW_FAILED_BY_PEER when the peer voted "no",
+ * SS_CW_SUCCESS when it voted "yes" (or no request was ever prepared),
+ * SS_SUCCESS when the connection is meanwhile gone (fake success, see
+ * below), and SS_UNKNOWN_ERROR while the answer is still outstanding.
+ */
+static enum drbd_state_rv __peer_reply(struct drbd_connection *connection)
+{
+	if (test_and_clear_bit(TWOPC_NO, &connection->flags))
+		return SS_CW_FAILED_BY_PEER;
+	/* Order matters: the TWOPC_YES bit is consumed (cleared) even when
+	 * TWOPC_PREPARED would already decide the outcome. */
+	if (test_and_clear_bit(TWOPC_YES, &connection->flags) ||
+	    !test_bit(TWOPC_PREPARED, &connection->flags))
+		return SS_CW_SUCCESS;
+
+	/* This is DRBD 9.x <-> 8.4 compat code.
+	 * Consistent with __peer_request() above:
+	 * No more connection: fake success. */
+	if (connection->cstate[NOW] != C_CONNECTED)
+		return SS_SUCCESS;
+	return SS_UNKNOWN_ERROR;
+}
+
+/*
+ * wait_event() condition helper: try to take the state rwlock once the
+ * remote state change has completed.
+ *
+ * Returns true with resource->state_rwlock write-locked (and IRQs saved
+ * in *irq_flags) when no remote state change and no two-phase-commit
+ * work is pending; otherwise releases the lock again and returns false.
+ * On a true return the caller therefore holds the lock.
+ */
+static bool when_done_lock(struct drbd_resource *resource,
+			   unsigned long *irq_flags)
+{
+	write_lock_irqsave(&resource->state_rwlock, *irq_flags);
+	if (!resource->remote_state_change && !test_bit(TWOPC_WORK_PENDING, &resource->flags))
+		return true;
+	write_unlock_irqrestore(&resource->state_rwlock, *irq_flags);
+	return false;
+}
 
 /**
- * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
- * This limits hard state transitions. Hard state transitions are facts there are
- * imposed on DRBD by the environment. E.g. disk broke or network broke down.
- * But those hard state transitions are still not allowed to do everything.
- * @ns:		new state.
- * @os:		old state.
+ * complete_remote_state_change  -  Wait for other remote state changes to complete
+ * @resource: DRBD resource.
+ * @irq_flags: IRQ flags from begin_state_change.
  */
+static void complete_remote_state_change(struct drbd_resource *resource,
+					 unsigned long *irq_flags)
+{
+	if (resource->remote_state_change) {
+		enum chg_state_flags flags = resource->state_change_flags;
+
+		/* Temporarily step out of our own (not yet prepared) state
+		 * change while the remote transaction finishes. */
+		begin_remote_state_change(resource, irq_flags);
+		for (;;) {
+			long t = twopc_timeout(resource);
+
+			/* when_done_lock() returns true with state_rwlock
+			 * held; __end_remote_state_change() below continues
+			 * under that lock. */
+			t = wait_event_timeout(resource->twopc_wait,
+				   when_done_lock(resource, irq_flags), t);
+			if (t)
+				break;
+			if (when_done_lock(resource, irq_flags)) {
+				drbd_info(resource, "Two-phase commit: "
+					  "not woken up in time\n");
+				break;
+			}
+		}
+		__end_remote_state_change(resource, flags);
+	}
+}
+
+/* Legacy single-peer path (peer does not speak two-phase commit, protocol
+ * < 110): ask one peer to approve a state change and wait for its reply.
+ * Requires CS_SERIALIZE; blocks until __peer_reply() has a verdict. */
 static enum drbd_state_rv
-is_valid_transition(union drbd_state os, union drbd_state ns)
+change_peer_state(struct drbd_connection *connection, int vnr,
+		  union drbd_state mask, union drbd_state val, unsigned long *irq_flags)
 {
+	struct drbd_resource *resource = connection->resource;
+	enum chg_state_flags flags = resource->state_change_flags | CS_TWOPC;
 	enum drbd_state_rv rv;
 
-	rv = is_valid_conn_transition(os.conn, ns.conn);
+	if (!expect(resource, flags & CS_SERIALIZE))
+		return SS_CW_FAILED_BY_PEER;
+
+	complete_remote_state_change(resource, irq_flags);
+
+	resource->remote_state_change = true;
+	resource->twopc_reply.initiator_node_id = resource->res_opts.node_id;
+	resource->twopc_reply.tid = 0;
+	begin_remote_state_change(resource, irq_flags);
+	rv = __peer_request(connection, vnr, mask, val);
+	if (rv == SS_CW_SUCCESS) {
+		wait_event(resource->state_wait,
+			((rv = __peer_reply(connection)) != SS_UNKNOWN_ERROR));
+		clear_bit(TWOPC_PREPARED, &connection->flags);
+	}
+	end_remote_state_change(resource, irq_flags, flags);
+	return rv;
+}
+
+/* Broadcast a two-phase-commit request to all peers in @reach_immediately.
+ * Marks each contacted peer TWOPC_PREPARED and clears its previous reply
+ * bits.  Peers with protocol < 110 are skipped silently.
+ * Returns SS_CW_SUCCESS if at least one request went out, SS_SUCCESS if no
+ * peer was contacted. */
+static enum drbd_state_rv
+__cluster_wide_request(struct drbd_resource *resource, struct twopc_request *request,
+		       u64 reach_immediately)
+{
+	enum drbd_packet cmd = request->cmd;
+	struct drbd_connection *connection;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	u64 im;
+
+	for_each_connection_ref(connection, im, resource) {
+		u64 mask;
+		int err;
+
+		clear_bit(TWOPC_PREPARED, &connection->flags);
+
+		if (connection->agreed_pro_version < 110)
+			continue;
+		mask = NODE_MASK(connection->peer_node_id);
+		if (reach_immediately & mask)
+			set_bit(TWOPC_PREPARED, &connection->flags);
+		else
+			continue;
 
-	/* we cannot fail (again) if we already detached */
-	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
-		rv = SS_IS_DISKLESS;
+		clear_bit(TWOPC_YES, &connection->flags);
+		clear_bit(TWOPC_NO, &connection->flags);
+		clear_bit(TWOPC_RETRY, &connection->flags);
 
+		/* Send failure: un-prepare this peer and keep going; the
+		 * transaction proceeds with the peers we could reach. */
+		err = conn_send_twopc_request(connection, request);
+		if (err) {
+			clear_bit(TWOPC_PREPARED, &connection->flags);
+			wake_up(&resource->work.q_wait);
+			continue;
+		}
+		if (cmd == P_TWOPC_PREPARE || cmd == P_TWOPC_PREP_RSZ)
+			schedule_work(&connection->send_ping_work);
+		rv = SS_CW_SUCCESS;
+	}
 	return rv;
 }
 
-static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+/* True when the currently tracked two-phase-commit transaction runs between
+ * this node and @connection's peer, in either direction (target -1 means
+ * "all nodes" and matches both sides). */
+bool drbd_twopc_between_peer_and_me(struct drbd_connection *connection)
 {
-	static const char *msg_table[] = {
-		[NO_WARNING] = "",
-		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
-		[ABORTED_RESYNC] = "Resync aborted.",
-		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
-		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
-		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
-	};
+	const int my_node_id = connection->resource->res_opts.node_id;
+	struct twopc_reply *o = &connection->resource->twopc_reply;
+
+	return ((o->target_node_id == my_node_id || o->target_node_id == -1) &&
+		o->initiator_node_id == connection->peer_node_id) ||
+		((o->target_node_id == connection->peer_node_id || o->target_node_id == -1) &&
+		 o->initiator_node_id == my_node_id);
+}
+
+/* Wait condition for the coordinator: have all prepared peers answered?
+ * Ready when a retry was requested, or when the connect handshake (if any)
+ * has settled and either someone said no or everyone said yes.
+ * A local abort (TWOPC_ABORT_LOCAL) is always "ready". */
+bool cluster_wide_reply_ready(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection;
+	bool connect_ready = true;
+	bool have_no = resource->twopc_reply.state_change_failed;
+	bool have_retry = false;
+	bool all_yes = true;
+
+	if (test_bit(TWOPC_ABORT_LOCAL, &resource->flags))
+		return true;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		/* For connect transactions (protocol >= 118, with volumes),
+		 * also wait for the per-connection handshake to finish. */
+		if (connection->agreed_pro_version >= 118 &&
+				!idr_is_empty(&resource->devices) &&
+				resource->twopc_reply.is_connect &&
+				drbd_twopc_between_peer_and_me(connection) &&
+				!test_bit(CONN_HANDSHAKE_READY, &connection->flags))
+			connect_ready = false;
+
+		if (!test_bit(TWOPC_PREPARED, &connection->flags))
+			continue;
+		if (test_bit(TWOPC_NO, &connection->flags))
+			have_no = true;
+		if (test_bit(TWOPC_RETRY, &connection->flags))
+			have_retry = true;
+		if (!test_bit(TWOPC_YES, &connection->flags))
+			all_yes = false;
+	}
+	rcu_read_unlock();
+
+	return have_retry || (connect_ready && (have_no || all_yes));
+}
+
+/* Translate the collected peer replies into a single state_rv.
+ * Priority: retry (concurrent change) > handshake retry > handshake
+ * disconnect > explicit NO from a peer > success.  When a peer declined and
+ * @context is given, record which peer for the error message. */
+static enum drbd_state_rv get_cluster_wide_reply(struct drbd_resource *resource,
+						 struct change_context *context)
+{
+	struct drbd_connection *connection, *failed_by = NULL;
+	bool handshake_disconnect = false;
+	bool handshake_retry = false;
+	bool have_no = resource->twopc_reply.state_change_failed;
+	bool have_retry = false;
+	enum drbd_state_rv rv = SS_CW_SUCCESS;
+
+	if (test_bit(TWOPC_ABORT_LOCAL, &resource->flags))
+		return SS_CONCURRENT_ST_CHG;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (resource->twopc_reply.is_connect &&
+				drbd_twopc_between_peer_and_me(connection)) {
+			if (test_bit(CONN_HANDSHAKE_DISCONNECT, &connection->flags))
+				handshake_disconnect = true;
+			if (test_bit(CONN_HANDSHAKE_RETRY, &connection->flags))
+				handshake_retry = true;
+		}
+
+		if (!test_bit(TWOPC_PREPARED, &connection->flags))
+			continue;
+		if (test_bit(TWOPC_NO, &connection->flags)) {
+			failed_by = connection;
+			have_no = true;
+		}
+		if (test_bit(TWOPC_RETRY, &connection->flags))
+			have_retry = true;
+	}
+
+	if (have_retry)
+		rv = SS_CONCURRENT_ST_CHG;
+	else if (handshake_retry)
+		rv = SS_HANDSHAKE_RETRY;
+	else if (handshake_disconnect)
+		rv = SS_HANDSHAKE_DISCONNECT;
+	else if (have_no) {
+		/* net_conf access must stay inside the RCU section. */
+		if (context && failed_by)
+			_drbd_state_err(context, "Declined by peer %s (id: %d), see the kernel log there",
+					rcu_dereference(failed_by->transport.net_conf)->name,
+					failed_by->peer_node_id);
+		rv = SS_CW_FAILED_BY_PEER;
+	}
+	rcu_read_unlock();
+
+	if (rv == SS_CW_SUCCESS && test_bit(TWOPC_RECV_SIZES_ERR, &resource->flags))
+		rv = SS_HANDSHAKE_DISCONNECT;
+
+	return rv;
+}
+
+/* Two-phase commit requires protocol version >= 110 on every currently
+ * established connection; any older connected peer forces the legacy
+ * single-peer fallback path. */
+static bool supports_two_phase_commit(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection;
+	bool rv = true;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (connection->cstate[NOW] == C_CONNECTED &&
+		    connection->agreed_pro_version < 110) {
+			rv = false;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/* Return the first connection of @resource with a kref held, or NULL if
+ * there is none.  The caller owns the reference and must kref_put() it. */
+static struct drbd_connection *get_first_connection(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection = NULL;
+
+	rcu_read_lock();
+	if (!list_empty(&resource->connections)) {
+		connection = first_connection(resource);
+		kref_get(&connection->kref);
+	}
+	rcu_read_unlock();
+	return connection;
+}
+
+/* That two_primaries is a connection option is one of those things of
+   the past, that should be cleaned up!! it should be a resource config!
+   Here is an inaccurate heuristic: treat multiple primaries as allowed if
+   any connection of the resource has two_primaries enabled. */
+static bool multiple_primaries_allowed(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection;
+	bool allowed = false;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		nc = rcu_dereference(connection->transport.net_conf);
+		if (nc && nc->two_primaries) {
+			allowed = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return allowed;
+}
+
+/* Validate the placement of Primary nodes after the proposed change.
+ * Rejects multiple primaries unless allowed, and rejects primaries that are
+ * only weakly connected — except for the diskless-hypervisor live-migration
+ * case, where all common storage servers must allow two primaries.
+ * Returns SS_SUCCESS, SS_TWO_PRIMARIES or SS_WEAKLY_CONNECTED. */
+static enum drbd_state_rv
+check_primaries_distances(struct drbd_resource *resource)
+{
+	struct twopc_reply *reply = &resource->twopc_reply;
+	int nr_primaries = hweight64(reply->primary_nodes);
+	u64 common_server;
+
+	if (nr_primaries <= 1)
+		return SS_SUCCESS;
+	if (nr_primaries > 1 && !multiple_primaries_allowed(resource))
+		return SS_TWO_PRIMARIES;
+	/* All primaries directly connected. Good */
+	if (!(reply->primary_nodes & reply->weak_nodes))
+		return SS_SUCCESS;
+
+	/* For virtualization setups with diskless hypervisors (R_PRIMARY) and one
+	   or multiple storage servers (R_SECONDARY) allow live-migration between the
+	   hypervisors. */
+	common_server = ~reply->weak_nodes;
+	if (common_server) {
+		int node_id;
+		/* Only allow if the new primary is diskless. See also far_away_change()
+		   in drbd_receiver.c for the diskless check on the other primary */
+		if ((reply->primary_nodes & NODE_MASK(resource->res_opts.node_id)) &&
+		    drbd_have_local_disk(resource))
+			return SS_WEAKLY_CONNECTED;
+
+		for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) {
+			struct drbd_connection *connection;
+			struct net_conf *nc;
+			bool two_primaries;
+
+			if (!(common_server & NODE_MASK(node_id)))
+				continue;
+			connection = drbd_connection_by_node_id(resource, node_id);
+			if (!connection)
+				continue;
+
+			rcu_read_lock();
+			nc = rcu_dereference(connection->transport.net_conf);
+			two_primaries = nc ? nc->two_primaries : false;
+			rcu_read_unlock();
+
+			if (!two_primaries)
+				return SS_TWO_PRIMARIES;
+		}
+
+		return SS_SUCCESS;
+	}
+	return SS_WEAKLY_CONNECTED;
+}
+
+/* While local read-only openers exist, a peer must not become (or be)
+ * Primary on a connection without two_primaries: a read-only opener counts
+ * as a reader on the local node.
+ * Returns SS_PRIMARY_READER on conflict, SS_SUCCESS otherwise. */
+static enum drbd_state_rv
+check_ro_cnt_and_primary(struct drbd_resource *resource)
+{
+	struct twopc_reply *reply = &resource->twopc_reply;
+	struct drbd_connection *connection;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct net_conf *nc;
+
+	if (drbd_open_ro_count(resource) == 0)
+		return rv;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		nc = rcu_dereference(connection->transport.net_conf);
+		/* net_conf may be NULL while a connection is being torn
+		 * down; skip it then, consistent with
+		 * multiple_primaries_allowed(). */
+		if (nc && !nc->two_primaries &&
+		    NODE_MASK(connection->peer_node_id) & reply->primary_nodes) {
+			rv = SS_PRIMARY_READER;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/* Compute a randomized backoff (in jiffies) before retrying a two-phase
+ * commit.  The upper bound grows with the number of live connections and
+ * exponentially with @retries (capped at 2^5); randomization avoids
+ * repeated collisions between concurrent initiators.
+ * Returns 0 when there is no connection worth retrying for. */
+long twopc_retry_timeout(struct drbd_resource *resource, int retries)
+{
+	struct drbd_connection *connection;
+	int connections = 0;
+	long timeout = 0;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (connection->cstate[NOW] < C_CONNECTING)
+			continue;
+		connections++;
+	}
+	rcu_read_unlock();
+
+	if (connections > 0) {
+		if (retries > 5)
+			retries = 5;
+		/* NOTE(review): assumes res_opts.twopc_retry_timeout > 0;
+		 * otherwise the computed bound is 0 and
+		 * get_random_u32_below() is called with a zero ceiling —
+		 * confirm the option's minimum value. */
+		timeout = resource->res_opts.twopc_retry_timeout *
+			  HZ / 10 * connections * (1 << retries);
+		timeout = get_random_u32_below(timeout);
+	}
+	return timeout;
+}
+
+/* Roll back per-peer-device connect handshake state after a failed connect
+ * transaction: release the uuid_sem read lock if this path took it, and
+ * clear the initial-state/UUID reception flags so a later connect attempt
+ * starts from scratch. */
+void abort_connect(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (test_and_clear_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags))
+			up_read_non_owner(&peer_device->device->uuid_sem);
+		clear_bit(INITIAL_STATE_SENT, &peer_device->flags);
+		clear_bit(INITIAL_STATE_RECEIVED, &peer_device->flags);
+		clear_bit(UUIDS_RECEIVED, &peer_device->flags);
+		clear_bit(CURRENT_UUID_RECEIVED, &peer_device->flags);
+	}
+	rcu_read_unlock();
+}
+
+/* Phase two of the commit protocol: send the P_TWOPC_COMMIT or
+ * P_TWOPC_ABORT packet in @request to every directly reachable peer. */
+static void twopc_phase2(struct drbd_resource *resource,
+			 struct twopc_request *request,
+			 u64 reach_immediately)
+{
+	struct drbd_connection *connection;
+	u64 im;
+
+	for_each_connection_ref(connection, im, resource) {
+		if (reach_immediately & NODE_MASK(connection->peer_node_id))
+			conn_send_twopc_request(connection, request);
+	}
+}
+
+/* Log a one-line, human-readable summary of a cluster-wide state change:
+ * "<message> <tid>: <initiator>-><target> field( value ) ...".  Only fields
+ * present in @mask are printed; an empty mask is printed as "empty". */
+void drbd_print_cluster_wide_state_change(struct drbd_resource *resource, const char *message,
+		unsigned int tid, unsigned int initiator_node_id, int target_node_id,
+		union drbd_state mask, union drbd_state val)
+{
+	char buffer[150], *b, *end = buffer + sizeof(buffer);
+
+	b = buffer;
+	b += scnprintf(b, end - b, "%u->", initiator_node_id);
+	if (target_node_id == -1)
+		b += scnprintf(b, end - b, "all");
+	else
+		b += scnprintf(b, end - b, "%d", target_node_id);
+
+	if (mask.role)
+		b += scnprintf(b, end - b, " role( %s )", drbd_role_str(val.role));
+
+	if (mask.peer)
+		b += scnprintf(b, end - b, " peer( %s )", drbd_role_str(val.peer));
+
+	if (mask.conn) {
+		if (val.conn > C_CONNECTED)
+			b += scnprintf(b, end - b, " repl( %s )", drbd_repl_str(val.conn));
+		else
+			b += scnprintf(b, end - b, " conn( %s )", drbd_conn_str(val.conn));
+	}
+
+	if (mask.disk)
+		b += scnprintf(b, end - b, " disk( %s )", drbd_disk_str(val.disk));
+
+	if (mask.pdsk)
+		b += scnprintf(b, end - b, " pdsk( %s )", drbd_disk_str(val.pdsk));
+
+	// Any of "susp-io( user )", "susp-io( quorum )" or "susp-io( uuid )"
+	if (mask.susp)
+		b += scnprintf(b, end - b, " %ssusp-io", val.susp ? "+" : "-");
+
+	if (mask.susp_nod)
+		b += scnprintf(b, end - b, " susp-io( %sno-disk )", val.susp_nod ? "+" : "-");
+
+	if (mask.susp_fen)
+		b += scnprintf(b, end - b, " susp-io( %sfencing )", val.susp_fen ? "+" : "-");
+
+	if (mask.user_isp)
+		b += scnprintf(b, end - b, " resync-susp( %suser )", val.user_isp ? "+" : "-");
+
+	if (mask.peer_isp)
+		b += scnprintf(b, end - b, " resync-susp( %speer )", val.peer_isp ? "+" : "-");
+
+	if (mask.aftr_isp)
+		b += scnprintf(b, end - b, " resync-susp( %safter dependency )",
+				val.aftr_isp ? "+" : "-");
 
-	if (warn != NO_WARNING)
-		drbd_warn(device, "%s\n", msg_table[warn]);
+	if (!mask.i)
+		b += scnprintf(b, end - b, " empty");
+
+	drbd_info(resource, "%s %u: %s\n", message, tid, buffer);
 }
 
 /**
- * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
- * @device:	DRBD device.
- * @os:		old state.
- * @ns:		new state.
- * @warn:	placeholder for returned state warning.
+ * change_cluster_wide_state  -  Cluster-wide two-phase commit
+ * @change: The callback function that does the actual state change.
+ * @context: State change context.
+ * @tag: State change tag to print in status messages.
+ *
+ * Perform a two-phase commit transaction among all (reachable) nodes in the
+ * cluster.  In our transaction model, the initiator of a transaction is also
+ * the coordinator.
+ *
+ * In phase one of the transaction, the coordinator sends all nodes in the
+ * cluster a P_TWOPC_PREPARE packet.  Each node replies with either P_TWOPC_YES
+ * if it consents or with P_TWOPC_NO if it denies the transaction.  Once all
+ * replies have been received, the coordinator sends all nodes in the cluster a
+ * P_TWOPC_COMMIT or P_TWOPC_ABORT packet to finish the transaction.
+ *
+ * When a node in the cluster is busy with another transaction, it replies with
+ * P_TWOPC_NO.  The coordinator is then responsible for retrying the
+ * transaction.
  *
- * When we loose connection, we have to set the state of the peers disk (pdsk)
- * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ * Since a cluster is not guaranteed to always be fully connected, some nodes
+ * will not be directly reachable from other nodes.  In order to still reach
+ * all nodes in the cluster, participants will forward requests to nodes which
+ * haven't received the request yet:
+ *
+ * The nodes_to_reach field in requests indicates which nodes have received the
+ * request already.  Before forwarding a request to a peer, a node removes
+ * itself from nodes_to_reach; it then sends the request to all directly
+ * connected nodes in nodes_to_reach.
+ *
+ * If there are redundant paths in the cluster, requests will reach some nodes
+ * more than once.  Nodes remember when they are taking part in a transaction;
+ * they detect duplicate requests and reply to them with P_TWOPC_YES packets.
+ * (Transactions are identified by the node id of the initiator and a random,
+ * unique-enough transaction identifier.)
+ *
+ * A configurable timeout determines how long a coordinator or participant will
+ * wait for a transaction to finish.  A transaction that times out is assumed
+ * to have aborted.
  */
-static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
-				       union drbd_state ns, enum sanitize_state_warnings *warn)
+static enum drbd_state_rv
+change_cluster_wide_state(bool (*change)(struct change_context *, enum change_phase),
+			  struct change_context *context, const char *tag)
 {
-	enum drbd_fencing_p fp;
-	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+	struct drbd_resource *resource = context->resource;
+	unsigned long irq_flags;
+	struct twopc_request request;
+	struct twopc_reply *reply = &resource->twopc_reply;
+	struct drbd_connection *connection, *target_connection = NULL;
+	enum drbd_state_rv rv;
+	u64 reach_immediately;
+	int retries = 1;
+	unsigned long start_time;
+	bool have_peers;
+
+	begin_state_change(resource, &irq_flags, context->flags | CS_LOCAL_ONLY);
+	resource->state_change_err_str = context->err_str;
+
+	if (local_state_change(context->flags)) {
+		/* Not a cluster-wide state change. */
+		change(context, PH_LOCAL_COMMIT);
+		return end_state_change(resource, &irq_flags, tag);
+	} else {
+		if (!change(context, PH_PREPARE)) {
+			/* Not a cluster-wide state change. */
+			return end_state_change(resource, &irq_flags, tag);
+		}
+		rv = try_state_change(resource);
+		if (rv != SS_SUCCESS) {
+			/* Failure or nothing to do. */
+			/* abort_state_change(resource, &irq_flags); */
+			if (rv == SS_NOTHING_TO_DO)
+				resource->state_change_flags &= ~CS_VERBOSE;
+			return __end_state_change(resource, &irq_flags, rv, tag);
+		}
+		/* Really a cluster-wide state change. */
+	}
+
+	if (!supports_two_phase_commit(resource)) {
+		connection = get_first_connection(resource);
+		rv = SS_SUCCESS;
+		if (connection) {
+			rv = change_peer_state(connection, context->vnr, context->mask, context->val, &irq_flags);
+			kref_put(&connection->kref, drbd_destroy_connection);
+		}
+		if (rv >= SS_SUCCESS)
+			change(context, PH_84_COMMIT);
+		return __end_state_change(resource, &irq_flags, rv, tag);
+	}
+
+	if (!expect(resource, context->flags & CS_SERIALIZE || context->mask.i == 0)) {
+		rv = SS_CW_FAILED_BY_PEER;
+		return __end_state_change(resource, &irq_flags, rv, tag);
+	}
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (!expect(connection, current != connection->receiver.task)) {
+			rcu_read_unlock();
+			BUG();
+		}
+	}
+	rcu_read_unlock();
+
+    retry:
+	if (current == resource->worker.task && resource->remote_state_change)
+		return __end_state_change(resource, &irq_flags, SS_CONCURRENT_ST_CHG, tag);
+
+	complete_remote_state_change(resource, &irq_flags);
+	start_time = jiffies;
+	resource->state_change_err_str = context->err_str;
+
+	*reply = (struct twopc_reply) { 0 };
+
+	reach_immediately = directly_connected_nodes(resource, NOW);
+	if (context->target_node_id != -1) {
+		struct drbd_connection *connection;
+
+		/* Fail if the target node is no longer directly reachable. */
+		connection = drbd_get_connection_by_node_id(resource, context->target_node_id);
+		if (!connection) {
+			rv = SS_NEED_CONNECTION;
+			return __end_state_change(resource, &irq_flags, rv, tag);
+		}
 
-	if (warn)
-		*warn = NO_WARNING;
+		if (!(connection->cstate[NOW] == C_CONNECTED ||
+		      (connection->cstate[NOW] == C_CONNECTING &&
+		       context->mask.conn == conn_MASK &&
+		       context->val.conn == C_CONNECTED))) {
+			rv = SS_NEED_CONNECTION;
 
-	fp = FP_DONT_CARE;
-	if (get_ldev(device)) {
-		rcu_read_lock();
-		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
-		rcu_read_unlock();
-		put_ldev(device);
+			kref_put(&connection->kref, drbd_destroy_connection);
+			return __end_state_change(resource, &irq_flags, rv, tag);
+		}
+		target_connection = connection;
+
+		/* For connect transactions, add the target node id. */
+		reach_immediately |= NODE_MASK(context->target_node_id);
 	}
 
-	/* Implications from connection to peer and peer_isp */
-	if (ns.conn < C_CONNECTED) {
-		ns.peer_isp = 0;
-		ns.peer = R_UNKNOWN;
-		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
-			ns.pdsk = D_UNKNOWN;
+	do
+		reply->tid = get_random_u32();
+	while (!reply->tid);
+
+	clear_bit(TWOPC_RECV_SIZES_ERR, &resource->flags);
+	request.tid = reply->tid;
+	request.initiator_node_id = resource->res_opts.node_id;
+	request.target_node_id = context->target_node_id;
+	request.nodes_to_reach = ~(reach_immediately | NODE_MASK(resource->res_opts.node_id));
+	request.vnr = context->vnr;
+	request.cmd = P_TWOPC_PREPARE;
+	request.flags = TWOPC_HAS_REACHABLE;
+
+	resource->twopc.type = TWOPC_STATE_CHANGE;
+	resource->twopc.state_change.mask = context->mask;
+	resource->twopc.state_change.val = context->val;
+	resource->twopc.state_change.primary_nodes = 0;
+	resource->twopc.state_change.reachable_nodes = 0;
+	resource->twopc_parent_nodes = 0;
+	resource->remote_state_change = true;
+
+	drbd_print_cluster_wide_state_change(resource, "Preparing cluster-wide state change",
+			request.tid, resource->res_opts.node_id, context->target_node_id,
+			context->mask, context->val);
+
+	reply->initiator_node_id = resource->res_opts.node_id;
+	reply->target_node_id = context->target_node_id;
+
+	reply->reachable_nodes = directly_connected_nodes(resource, NOW) |
+				       NODE_MASK(resource->res_opts.node_id);
+	if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED) {
+		reply->reachable_nodes |= NODE_MASK(context->target_node_id);
+		reply->target_reachable_nodes = reply->reachable_nodes;
+		reply->is_connect = 1;
+		drbd_init_connect_state(target_connection);
+	} else if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) {
+		reply->target_reachable_nodes = NODE_MASK(context->target_node_id);
+		reply->reachable_nodes &= ~reply->target_reachable_nodes;
+		reply->is_disconnect = 1;
+	} else {
+		reply->target_reachable_nodes = reply->reachable_nodes;
 	}
 
-	/* Clear the aftr_isp when becoming unconfigured */
-	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
-		ns.aftr_isp = 0;
+	D_ASSERT(resource, !test_bit(TWOPC_WORK_PENDING, &resource->flags));
+	begin_remote_state_change(resource, &irq_flags);
+	rv = __cluster_wide_request(resource, &request, reach_immediately);
 
-	/* An implication of the disk states onto the connection state */
-	/* Abort resync if a disk fails/detaches */
-	if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-		if (warn)
-			*warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
-				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
-		ns.conn = C_CONNECTED;
-	}
+	/* If we are changing state attached to a particular connection then we
+	 * expect that connection to remain connected. A failure to send
+	 * P_TWOPC_PREPARE on that connection is a failure for the whole
+	 * cluster-wide state change. */
+	if (target_connection && !test_bit(TWOPC_PREPARED, &target_connection->flags))
+		rv = SS_NEED_CONNECTION;
 
-	/* Connection breaks down before we finished "Negotiating" */
-	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
-	    get_ldev_if_state(device, D_NEGOTIATING)) {
-		if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
-			ns.disk = device->new_state_tmp.disk;
-			ns.pdsk = device->new_state_tmp.pdsk;
-		} else {
-			if (warn)
-				*warn = CONNECTION_LOST_NEGOTIATING;
-			ns.disk = D_DISKLESS;
-			ns.pdsk = D_UNKNOWN;
+	have_peers = rv == SS_CW_SUCCESS;
+	if (have_peers) {
+		long t;
+
+		if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED &&
+		    target_connection->agreed_pro_version >= 118)
+			conn_connect2(target_connection);
+
+		t = wait_event_interruptible_timeout(resource->state_wait,
+						     cluster_wide_reply_ready(resource),
+						     twopc_timeout(resource));
+		if (t > 0)
+			rv = get_cluster_wide_reply(resource, context);
+		else
+			rv = t == 0 ? SS_TIMEOUT : SS_INTERRUPTED;
+
+		/* while waiting for the replies, reach_immediately might have changed. */
+		reach_immediately = directly_connected_nodes(resource, NOW);
+		if (target_connection && target_connection->cstate[NOW] == C_CONNECTING)
+			reach_immediately |= NODE_MASK(context->target_node_id);
+
+		request.nodes_to_reach =
+			~(reach_immediately | NODE_MASK(resource->res_opts.node_id));
+
+		if (rv == SS_CW_SUCCESS) {
+			u64 directly_reachable = reach_immediately |
+				NODE_MASK(resource->res_opts.node_id);
+
+			if (context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING)
+				directly_reachable &= ~NODE_MASK(context->target_node_id);
+
+			if ((context->mask.role == role_MASK && context->val.role == R_PRIMARY) ||
+			    (context->mask.role != role_MASK && resource->role[NOW] == R_PRIMARY)) {
+				reply->primary_nodes |= NODE_MASK(resource->res_opts.node_id);
+				if (drbd_res_data_accessible(resource))
+					reply->weak_nodes |= ~directly_reachable;
+			}
+
+			/*
+			 * When a node is Primary and has access to UpToDate data, it sets
+			 * weak_nodes to the mask of those it is not connected to. This includes the
+			 * bits for nodes which are not configured, so will always have some set
+			 * bits. Thus if there is a Primary node and no bits are set in weak_nodes,
+			 * the Primary cannot have access to UpToDate data.
+			 */
+			if (reply->primary_nodes && !reply->weak_nodes)
+				request.flags |= TWOPC_PRI_INCAPABLE;
+
+			drbd_info(resource, "State change %u: primary_nodes=%lX, weak_nodes=%lX\n",
+				  reply->tid, (unsigned long)reply->primary_nodes,
+				  (unsigned long)reply->weak_nodes);
+
+			if ((context->mask.role == role_MASK && context->val.role == R_PRIMARY) ||
+			    (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED))
+				rv = check_primaries_distances(resource);
+
+			if (rv >= SS_SUCCESS &&
+			    context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED)
+				rv = check_ro_cnt_and_primary(resource);
+
+			if (!(context->mask.conn == conn_MASK && context->val.conn == C_DISCONNECTING) ||
+			    (reply->reachable_nodes & reply->target_reachable_nodes)) {
+				/* The cluster is still connected after this
+				 * transaction: either this transaction does
+				 * not disconnect a connection, or there are
+				 * redundant connections.  */
+
+				u64 m;
+
+				m = reply->reachable_nodes | reply->target_reachable_nodes;
+				reply->reachable_nodes = m;
+				reply->target_reachable_nodes = m;
+			} else {
+				rcu_read_lock();
+				for_each_connection_rcu(connection, resource) {
+					int node_id = connection->peer_node_id;
+
+					if (node_id == context->target_node_id) {
+						drbd_info(connection, "Cluster is now split\n");
+						break;
+					}
+				}
+				rcu_read_unlock();
+			}
+
+			resource->twopc.state_change.primary_nodes = reply->primary_nodes;
+			resource->twopc.state_change.reachable_nodes =
+				reply->target_reachable_nodes;
 		}
-		put_ldev(device);
-	}
 
-	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
-	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
-		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
-			ns.disk = D_UP_TO_DATE;
-		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
-			ns.pdsk = D_UP_TO_DATE;
-	}
-
-	/* Implications of the connection state on the disk states */
-	disk_min = D_DISKLESS;
-	disk_max = D_UP_TO_DATE;
-	pdsk_min = D_INCONSISTENT;
-	pdsk_max = D_UNKNOWN;
-	switch ((enum drbd_conns)ns.conn) {
-	case C_WF_BITMAP_T:
-	case C_PAUSED_SYNC_T:
-	case C_STARTING_SYNC_T:
-	case C_WF_SYNC_UUID:
-	case C_BEHIND:
-		disk_min = D_INCONSISTENT;
-		disk_max = D_OUTDATED;
-		pdsk_min = D_UP_TO_DATE;
-		pdsk_max = D_UP_TO_DATE;
-		break;
-	case C_VERIFY_S:
-	case C_VERIFY_T:
-		disk_min = D_UP_TO_DATE;
-		disk_max = D_UP_TO_DATE;
-		pdsk_min = D_UP_TO_DATE;
-		pdsk_max = D_UP_TO_DATE;
-		break;
-	case C_CONNECTED:
-		disk_min = D_DISKLESS;
-		disk_max = D_UP_TO_DATE;
-		pdsk_min = D_DISKLESS;
-		pdsk_max = D_UP_TO_DATE;
-		break;
-	case C_WF_BITMAP_S:
-	case C_PAUSED_SYNC_S:
-	case C_STARTING_SYNC_S:
-	case C_AHEAD:
-		disk_min = D_UP_TO_DATE;
-		disk_max = D_UP_TO_DATE;
-		pdsk_min = D_INCONSISTENT;
-		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
-		break;
-	case C_SYNC_TARGET:
-		disk_min = D_INCONSISTENT;
-		disk_max = D_INCONSISTENT;
-		pdsk_min = D_UP_TO_DATE;
-		pdsk_max = D_UP_TO_DATE;
-		break;
-	case C_SYNC_SOURCE:
-		disk_min = D_UP_TO_DATE;
-		disk_max = D_UP_TO_DATE;
-		pdsk_min = D_INCONSISTENT;
-		pdsk_max = D_INCONSISTENT;
-		break;
-	case C_STANDALONE:
-	case C_DISCONNECTING:
-	case C_UNCONNECTED:
-	case C_TIMEOUT:
-	case C_BROKEN_PIPE:
-	case C_NETWORK_FAILURE:
-	case C_PROTOCOL_ERROR:
-	case C_TEAR_DOWN:
-	case C_WF_CONNECTION:
-	case C_WF_REPORT_PARAMS:
-	case C_MASK:
-		break;
-	}
-	if (ns.disk > disk_max)
-		ns.disk = disk_max;
+		if (context->mask.conn == conn_MASK && context->val.conn == C_CONNECTED &&
+		    target_connection->agreed_pro_version >= 118) {
+			wait_initial_states_received(target_connection);
 
-	if (ns.disk < disk_min) {
-		if (warn)
-			*warn = IMPLICITLY_UPGRADED_DISK;
-		ns.disk = disk_min;
+			if (rv >= SS_SUCCESS && test_bit(TWOPC_RECV_SIZES_ERR, &resource->flags))
+				rv = SS_HANDSHAKE_DISCONNECT;
+		}
 	}
-	if (ns.pdsk > pdsk_max)
-		ns.pdsk = pdsk_max;
 
-	if (ns.pdsk < pdsk_min) {
-		if (warn)
-			*warn = IMPLICITLY_UPGRADED_PDSK;
-		ns.pdsk = pdsk_min;
+	request.cmd = rv >= SS_SUCCESS ? P_TWOPC_COMMIT : P_TWOPC_ABORT;
+	if (rv < SS_SUCCESS && target_connection)
+		abort_connect(target_connection);
+
+	if ((rv == SS_TIMEOUT || rv == SS_CONCURRENT_ST_CHG) &&
+	    !(context->flags & CS_DONT_RETRY)) {
+		long timeout = twopc_retry_timeout(resource, retries++);
+		drbd_info(resource, "Retrying cluster-wide state change after %ums\n",
+			  jiffies_to_msecs(timeout));
+		if (have_peers)
+			twopc_phase2(resource, &request, reach_immediately);
+		if (target_connection) {
+			kref_put(&target_connection->kref, drbd_destroy_connection);
+			target_connection = NULL;
+		}
+		clear_remote_state_change(resource);
+		schedule_timeout_interruptible(timeout);
+		end_remote_state_change(resource, &irq_flags, context->flags | CS_TWOPC);
+		goto retry;
 	}
 
-	if (fp == FP_STONITH &&
-	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
-	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
-		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-
-	if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
-	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
-	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
-		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+	if (rv >= SS_SUCCESS)
+		drbd_info(resource, "Committing cluster-wide state change %u (%ums)\n",
+			  request.tid,
+			  jiffies_to_msecs(jiffies - start_time));
+	else
+		drbd_info(resource, "Aborting cluster-wide state change %u (%ums) rv = %d\n",
+			  request.tid,
+			  jiffies_to_msecs(jiffies - start_time),
+			  rv);
+
+	if (have_peers && context->change_local_state_last) {
+		set_bit(TWOPC_STATE_CHANGE_PENDING, &resource->flags);
+		twopc_phase2(resource, &request, reach_immediately);
+	}
 
-	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
-		if (ns.conn == C_SYNC_SOURCE)
-			ns.conn = C_PAUSED_SYNC_S;
-		if (ns.conn == C_SYNC_TARGET)
-			ns.conn = C_PAUSED_SYNC_T;
+	end_remote_state_change(resource, &irq_flags, context->flags | CS_TWOPC);
+	clear_bit(TWOPC_STATE_CHANGE_PENDING, &resource->flags);
+	if (rv >= SS_SUCCESS) {
+		change(context, PH_COMMIT);
+		rv = end_state_change(resource, &irq_flags, tag);
+		if (rv < SS_SUCCESS)
+			drbd_err(resource, "FATAL: Local commit of already committed %u failed!\n",
+				 request.tid);
 	} else {
-		if (ns.conn == C_PAUSED_SYNC_S)
-			ns.conn = C_SYNC_SOURCE;
-		if (ns.conn == C_PAUSED_SYNC_T)
-			ns.conn = C_SYNC_TARGET;
+		abort_state_change(resource, &irq_flags);
 	}
 
-	return ns;
-}
+	if (have_peers && !context->change_local_state_last)
+		twopc_phase2(resource, &request, reach_immediately);
 
-void drbd_resume_al(struct drbd_device *device)
-{
-	if (test_and_clear_bit(AL_SUSPENDED, &device->flags))
-		drbd_info(device, "Resumed AL updates\n");
+	if (target_connection) {
+		kref_put(&target_connection->kref, drbd_destroy_connection);
+	}
+	return rv;
 }
 
-/* helper for _drbd_set_state */
-static void set_ov_position(struct drbd_peer_device *peer_device, enum drbd_conns cs)
+enum determine_dev_size
+change_cluster_wide_device_size(struct drbd_device *device,
+				sector_t local_max_size,
+				uint64_t new_user_size,
+				enum dds_flags dds_flags,
+				struct resize_parms *rs)
 {
-	struct drbd_device *device = peer_device->device;
+	struct drbd_resource *resource = device->resource;
+	struct twopc_reply *reply = &resource->twopc_reply;
+	struct twopc_request request;
+	unsigned long start_time;
+	unsigned long irq_flags;
+	enum drbd_state_rv rv;
+	enum determine_dev_size dd;
+	u64 reach_immediately;
+	bool have_peers, commit_it;
+	sector_t new_size = 0;
+	int retries = 1;
+
+retry:
+	rv = drbd_support_2pc_resize(resource);
+	if (rv < SS_SUCCESS)
+		return DS_2PC_NOT_SUPPORTED;
 
-	if (peer_device->connection->agreed_pro_version < 90)
-		device->ov_start_sector = 0;
-	device->rs_total = drbd_bm_bits(device);
-	device->ov_position = 0;
-	if (cs == C_VERIFY_T) {
-		/* starting online verify from an arbitrary position
-		 * does not fit well into the existing protocol.
-		 * on C_VERIFY_T, we initialize ov_left and friends
-		 * implicitly in receive_DataRequest once the
-		 * first P_OV_REQUEST is received */
-		device->ov_start_sector = ~(sector_t)0;
-	} else {
-		unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
-		if (bit >= device->rs_total) {
-			device->ov_start_sector =
-				BM_BIT_TO_SECT(device->rs_total - 1);
-			device->rs_total = 1;
-		} else
-			device->rs_total -= bit;
-		device->ov_position = device->ov_start_sector;
-	}
-	device->ov_left = device->rs_total;
-}
+	state_change_lock(resource, &irq_flags, CS_VERBOSE | CS_LOCAL_ONLY);
+	rcu_read_lock();
+	complete_remote_state_change(resource, &irq_flags);
+	start_time = jiffies;
+	reach_immediately = directly_connected_nodes(resource, NOW);
+
+	*reply = (struct twopc_reply) { 0 };
+
+	do
+		reply->tid = get_random_u32();
+	while (!reply->tid);
+
+	request.tid = reply->tid;
+	request.initiator_node_id = resource->res_opts.node_id;
+	request.target_node_id = -1;
+	request.nodes_to_reach = ~(reach_immediately | NODE_MASK(resource->res_opts.node_id));
+	request.vnr = device->vnr;
+	request.cmd = P_TWOPC_PREP_RSZ;
+	request.flags = 0;
+	resource->twopc.type = TWOPC_RESIZE;
+	resource->twopc.resize.dds_flags = dds_flags;
+	resource->twopc.resize.user_size = new_user_size;
+	resource->twopc.resize.diskful_primary_nodes = 0;
+	resource->twopc.resize.new_size = 0;
+	resource->twopc_parent_nodes = 0;
+	resource->remote_state_change = true;
+
+	reply->initiator_node_id = resource->res_opts.node_id;
+	reply->target_node_id = -1;
+	reply->max_possible_size = local_max_size;
+	reply->reachable_nodes = reach_immediately | NODE_MASK(resource->res_opts.node_id);
+	reply->target_reachable_nodes = reply->reachable_nodes;
+	if (resource->role[NOW] == R_PRIMARY)
+		reply->diskful_primary_nodes = NODE_MASK(resource->res_opts.node_id);
+	rcu_read_unlock();
+	state_change_unlock(resource, &irq_flags);
 
-/**
- * _drbd_set_state() - Set a new DRBD state
- * @device:	DRBD device.
- * @ns:		new state.
- * @flags:	Flags
- * @done:	Optional completion, that will get completed after the after_state_ch() finished
- *
- * Caller needs to hold req_lock. Do not call directly.
- */
-enum drbd_state_rv
-_drbd_set_state(struct drbd_device *device, union drbd_state ns,
-	        enum chg_state_flags flags, struct completion *done)
-{
-	struct drbd_peer_device *peer_device = first_peer_device(device);
-	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
-	union drbd_state os;
-	enum drbd_state_rv rv = SS_SUCCESS;
-	enum sanitize_state_warnings ssw;
-	struct after_state_chg_work *ascw;
-	struct drbd_state_change *state_change;
+	drbd_info(device, "Preparing cluster-wide size change %u "
+		  "(local_max_size = %llu KB, user_cap = %llu KB)\n",
+		  request.tid,
+		  (unsigned long long)local_max_size >> 1,
+		  (unsigned long long)new_user_size >> 1);
 
-	os = drbd_read_state(device);
+	rv = __cluster_wide_request(resource, &request, reach_immediately);
 
-	ns = sanitize_state(device, os, ns, &ssw);
-	if (ns.i == os.i)
-		return SS_NOTHING_TO_DO;
+	have_peers = rv == SS_CW_SUCCESS;
+	if (have_peers) {
+		if (wait_event_timeout(resource->state_wait,
+				       cluster_wide_reply_ready(resource),
+				       twopc_timeout(resource)))
+			rv = get_cluster_wide_reply(resource, NULL);
+		else
+			rv = SS_TIMEOUT;
 
-	rv = is_valid_transition(os, ns);
-	if (rv < SS_SUCCESS)
-		return rv;
+		if (rv == SS_TIMEOUT || rv == SS_CONCURRENT_ST_CHG) {
+			long timeout = twopc_retry_timeout(resource, retries++);
 
-	if (!(flags & CS_HARD)) {
-		/*  pre-state-change checks ; only look at ns  */
-		/* See drbd_state_sw_errors in drbd_strings.c */
+			drbd_info(device, "Retrying cluster-wide size change after %ums\n",
+				  jiffies_to_msecs(timeout));
 
-		rv = is_valid_state(device, ns);
-		if (rv < SS_SUCCESS) {
-			/* If the old state was illegal as well, then let
-			   this happen...*/
+			request.cmd = P_TWOPC_ABORT;
+			twopc_phase2(resource, &request, reach_immediately);
 
-			if (is_valid_state(device, os) == rv)
-				rv = is_valid_soft_transition(os, ns, connection);
-		} else
-			rv = is_valid_soft_transition(os, ns, connection);
+			clear_remote_state_change(resource);
+			schedule_timeout_interruptible(timeout);
+			goto retry;
+		}
 	}
 
-	if (rv < SS_SUCCESS) {
-		if (flags & CS_VERBOSE)
-			print_st_err(device, os, ns, rv);
-		return rv;
+	if (rv >= SS_SUCCESS) {
+		new_size = drbd_new_dev_size(device, reply->max_possible_size,
+						new_user_size, dds_flags | DDSF_2PC);
+		commit_it = new_size != get_capacity(device->vdisk);
+
+		if (commit_it) {
+			resource->twopc.resize.new_size = new_size;
+			resource->twopc.resize.diskful_primary_nodes = reply->diskful_primary_nodes;
+			drbd_info(device, "Committing cluster-wide size change %u (%ums)\n",
+				  request.tid,
+				  jiffies_to_msecs(jiffies - start_time));
+		} else {
+			drbd_info(device, "Aborting cluster-wide size change %u (%ums) size unchanged\n",
+				  request.tid,
+				  jiffies_to_msecs(jiffies - start_time));
+		}
+	} else {
+		commit_it = false;
+		drbd_info(device, "Aborting cluster-wide size change %u (%ums) rv = %d\n",
+			  request.tid,
+			  jiffies_to_msecs(jiffies - start_time),
+			  rv);
 	}
 
-	print_sanitize_warnings(device, ssw);
+	request.cmd = commit_it ? P_TWOPC_COMMIT : P_TWOPC_ABORT;
+	if (have_peers)
+		twopc_phase2(resource, &request, reach_immediately);
 
-	drbd_pr_state_change(device, os, ns, flags);
+	if (commit_it) {
+		struct twopc_resize *tr = &resource->twopc.resize;
 
-	/* Display changes to the susp* flags that where caused by the call to
-	   sanitize_state(). Only display it here if we where not called from
-	   _conn_request_state() */
-	if (!(flags & CS_DC_SUSP))
-		conn_pr_state_change(connection, os, ns,
-				     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+		tr->diskful_primary_nodes = reply->diskful_primary_nodes;
+		tr->new_size = new_size;
+		tr->dds_flags = dds_flags;
+		tr->user_size = new_user_size;
 
-	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
-	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
-	 * drbd_ldev_destroy() won't happen before our corresponding
-	 * after_state_ch works run, where we put_ldev again. */
-	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
-	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
-		atomic_inc(&device->local_cnt);
+		dd = drbd_commit_size_change(device, rs, reach_immediately);
+	} else {
+		if (rv == SS_CW_FAILED_BY_PEER)
+			dd = DS_2PC_NOT_SUPPORTED;
+		else if (rv >= SS_SUCCESS)
+			dd = DS_UNCHANGED;
+		else
+			dd = DS_2PC_ERR;
+	}
 
-	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
-		clear_bit(RS_DONE, &device->flags);
+	clear_remote_state_change(resource);
+	return dd;
+}
 
-	/* FIXME: Have any flags been set earlier in this function already? */
-	state_change = remember_old_state(device->resource, GFP_ATOMIC);
+static void twopc_end_nested(struct drbd_resource *resource, enum drbd_packet cmd)
+{
+	struct drbd_connection *twopc_parent;
+	u64 im;
+	struct twopc_reply twopc_reply;
+	u64 twopc_parent_nodes = 0;
+
+	write_lock_irq(&resource->state_rwlock);
+	twopc_reply = resource->twopc_reply;
+	/* Only send replies if we are in a twopc and have not yet sent replies. */
+	if (twopc_reply.tid && resource->twopc_prepare_reply_cmd == 0) {
+		resource->twopc_prepare_reply_cmd = cmd;
+		twopc_parent_nodes = resource->twopc_parent_nodes;
+	}
+	clear_bit(TWOPC_WORK_PENDING, &resource->flags);
+	write_unlock_irq(&resource->state_rwlock);
 
-	/* changes to local_cnt and device flags should be visible before
-	 * changes to state, which again should be visible before anything else
-	 * depending on that change happens. */
-	smp_wmb();
-	device->state.i = ns.i;
-	device->resource->susp = ns.susp;
-	device->resource->susp_nod = ns.susp_nod;
-	device->resource->susp_fen = ns.susp_fen;
-	smp_wmb();
+	if (!twopc_reply.tid)
+		return;
 
-	remember_new_state(state_change);
+	for_each_connection_ref(twopc_parent, im, resource) {
+		if (!(twopc_parent_nodes & NODE_MASK(twopc_parent->peer_node_id)))
+			continue;
 
-	/* put replicated vs not-replicated requests in seperate epochs */
-	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
-	    drbd_should_do_remote((union drbd_dev_state)ns.i))
-		start_new_tl_epoch(connection);
+		if (twopc_reply.is_disconnect)
+			set_bit(DISCONNECT_EXPECTED, &twopc_parent->flags);
 
-	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
-		drbd_print_uuids(device, "attached to UUIDs");
+		dynamic_drbd_dbg(twopc_parent, "Nested state change %u result: %s\n",
+			   twopc_reply.tid, drbd_packet_name(cmd));
 
-	/* Wake up role changes, that were delayed because of connection establishing */
-	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
-	    no_peer_wf_report_params(connection)) {
-		clear_bit(STATE_SENT, &connection->flags);
-		wake_up_all_devices(connection);
+		drbd_send_twopc_reply(twopc_parent, cmd, &twopc_reply);
 	}
+	wake_up_all(&resource->twopc_wait);
+}
 
-	wake_up(&device->misc_wait);
-	wake_up(&device->state_wait);
-	wake_up(&connection->ping_wait);
-
-	/* Aborted verify run, or we reached the stop sector.
-	 * Log the last position, unless end-of-device. */
-	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
-	    ns.conn <= C_CONNECTED) {
-		device->ov_start_sector =
-			BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
-		if (device->ov_left)
-			drbd_info(device, "Online Verify reached sector %llu\n",
-				(unsigned long long)device->ov_start_sector);
-	}
+static void __nested_twopc_work(struct drbd_resource *resource)
+{
+	enum drbd_state_rv rv;
+	enum drbd_packet cmd;
 
-	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
-	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
-		drbd_info(device, "Syncer continues.\n");
-		device->rs_paused += (long)jiffies
-				  -(long)device->rs_mark_time[device->rs_last_mark];
-		if (ns.conn == C_SYNC_TARGET)
-			mod_timer(&device->resync_timer, jiffies);
-	}
+	rv = get_cluster_wide_reply(resource, NULL);
+	if (rv >= SS_SUCCESS)
+		cmd = P_TWOPC_YES;
+	else if (rv == SS_CONCURRENT_ST_CHG || rv == SS_HANDSHAKE_RETRY)
+		cmd = P_TWOPC_RETRY;
+	else
+		cmd = P_TWOPC_NO;
+	twopc_end_nested(resource, cmd);
+}
 
-	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
-	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
-		drbd_info(device, "Resync suspended\n");
-		device->rs_mark_time[device->rs_last_mark] = jiffies;
-	}
+void nested_twopc_work(struct work_struct *work)
+{
+	struct drbd_resource *resource =
+		container_of(work, struct drbd_resource, twopc_work);
 
-	if (os.conn == C_CONNECTED &&
-	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
-		unsigned long now = jiffies;
-		int i;
+	__nested_twopc_work(resource);
 
-		set_ov_position(peer_device, ns.conn);
-		device->rs_start = now;
-		device->rs_last_sect_ev = 0;
-		device->ov_last_oos_size = 0;
-		device->ov_last_oos_start = 0;
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
 
-		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
-			device->rs_mark_left[i] = device->ov_left;
-			device->rs_mark_time[i] = now;
-		}
+void drbd_maybe_cluster_wide_reply(struct drbd_resource *resource)
+{
+	lockdep_assert_held(&resource->state_rwlock);
 
-		drbd_rs_controller_reset(peer_device);
+	if (!resource->remote_state_change || !cluster_wide_reply_ready(resource))
+		return;
 
-		if (ns.conn == C_VERIFY_S) {
-			drbd_info(device, "Starting Online Verify from sector %llu\n",
-					(unsigned long long)device->ov_position);
-			mod_timer(&device->resync_timer, jiffies);
-		}
+	if (resource->twopc_reply.initiator_node_id == resource->res_opts.node_id) {
+		wake_up_all(&resource->state_wait);
+		return;
 	}
 
-	if (get_ldev(device)) {
-		u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
-						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
-						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-
-		mdf &= ~MDF_AL_CLEAN;
-		if (test_bit(CRASHED_PRIMARY, &device->flags))
-			mdf |= MDF_CRASHED_PRIMARY;
-		if (device->state.role == R_PRIMARY ||
-		    (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
-			mdf |= MDF_PRIMARY_IND;
-		if (device->state.conn > C_WF_REPORT_PARAMS)
-			mdf |= MDF_CONNECTED_IND;
-		if (device->state.disk > D_INCONSISTENT)
-			mdf |= MDF_CONSISTENT;
-		if (device->state.disk > D_OUTDATED)
-			mdf |= MDF_WAS_UP_TO_DATE;
-		if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
-			mdf |= MDF_PEER_OUT_DATED;
-		if (mdf != device->ldev->md.flags) {
-			device->ldev->md.flags = mdf;
-			drbd_md_mark_dirty(device);
-		}
-		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
-			drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
-		put_ldev(device);
-	}
+	if (test_and_set_bit(TWOPC_WORK_PENDING, &resource->flags))
+		return;
 
-	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
-	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
-	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
-		set_bit(CONSIDER_RESYNC, &device->flags);
-
-	/* Receiver should clean up itself */
-	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-		drbd_thread_stop_nowait(&connection->receiver);
-
-	/* Now the receiver finished cleaning up itself, it should die */
-	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-		drbd_thread_stop_nowait(&connection->receiver);
-
-	/* Upon network failure, we need to restart the receiver. */
-	if (os.conn > C_WF_CONNECTION &&
-	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-		drbd_thread_restart_nowait(&connection->receiver);
-
-	/* Resume AL writing if we get a connection */
-	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
-		drbd_resume_al(device);
-		connection->connect_cnt++;
-	}
-
-	/* remember last attach time so request_timer_fn() won't
-	 * kill newly established sessions while we are still trying to thaw
-	 * previously frozen IO */
-	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-	    ns.disk > D_NEGOTIATING)
-		device->last_reattach_jif = jiffies;
-
-	ascw = kmalloc_obj(*ascw, GFP_ATOMIC);
-	if (ascw) {
-		ascw->os = os;
-		ascw->ns = ns;
-		ascw->flags = flags;
-		ascw->w.cb = w_after_state_ch;
-		ascw->device = device;
-		ascw->done = done;
-		ascw->state_change = state_change;
-		drbd_queue_work(&connection->sender_work,
-				&ascw->w);
-	} else {
-		drbd_err(device, "Could not kmalloc an ascw\n");
-	}
+	kref_get(&resource->kref);
+	schedule_work(&resource->twopc_work);
+}
 
+enum drbd_state_rv
+nested_twopc_request(struct drbd_resource *resource, struct twopc_request *request)
+{
+	u64 nodes_to_reach, reach_immediately;
+	enum drbd_packet cmd = request->cmd;
+	enum drbd_state_rv rv;
+	bool have_peers;
+
+	write_lock_irq(&resource->state_rwlock);
+	nodes_to_reach = request->nodes_to_reach;
+	reach_immediately = directly_connected_nodes(resource, NOW) & nodes_to_reach;
+	nodes_to_reach &= ~(reach_immediately | NODE_MASK(resource->res_opts.node_id));
+	request->nodes_to_reach = nodes_to_reach;
+	write_unlock_irq(&resource->state_rwlock);
+
+	rv = __cluster_wide_request(resource, request, reach_immediately);
+	have_peers = rv == SS_CW_SUCCESS;
+	if (cmd == P_TWOPC_PREPARE || cmd == P_TWOPC_PREP_RSZ) {
+		if (rv < SS_SUCCESS)
+			twopc_end_nested(resource, P_TWOPC_NO);
+		else if (!have_peers && cluster_wide_reply_ready(resource)) /* no nested nodes */
+			__nested_twopc_work(resource);
+	}
 	return rv;
 }
 
-static int w_after_state_ch(struct drbd_work *w, int unused)
+static bool has_up_to_date_peer_disks(struct drbd_device *device)
 {
-	struct after_state_chg_work *ascw =
-		container_of(w, struct after_state_chg_work, w);
-	struct drbd_device *device = ascw->device;
+	struct drbd_peer_device *peer_device;
 
-	after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
-	forget_state_change(ascw->state_change);
-	if (ascw->flags & CS_WAIT_COMPLETE)
-		complete(ascw->done);
-	kfree(ascw);
+	for_each_peer_device(peer_device, device)
+		if (peer_device->disk_state[NEW] == D_UP_TO_DATE)
+			return true;
+	return false;
+}
 
-	return 0;
+static void disconnect_where_resync_target(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+
+	for_each_peer_device(peer_device, device)
+		if (is_sync_target_state(peer_device, NEW))
+			__change_cstate(peer_device->connection, C_TEAR_DOWN);
 }
 
-static void abw_start_sync(struct drbd_device *device, int rv)
+static bool do_change_role(struct change_context *context, enum change_phase phase)
 {
-	if (rv) {
-		drbd_err(device, "Writing the bitmap failed not starting resync.\n");
-		_drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
-		return;
+	struct drbd_resource *resource = context->resource;
+	enum drbd_role role = context->val.role;
+	int flags = context->flags;
+	struct drbd_device *device;
+	int vnr;
+
+	resource->role[NEW] = role;
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (role == R_PRIMARY && (flags & CS_FP_LOCAL_UP_TO_DATE)) {
+			if (device->disk_state[NEW] < D_UP_TO_DATE &&
+			    device->disk_state[NEW] >= D_INCONSISTENT &&
+			    !has_up_to_date_peer_disks(device)) {
+				device->disk_state[NEW] = D_UP_TO_DATE;
+				/* adding it to the context so that it gets sent to the peers */
+				context->mask.disk |= disk_MASK;
+				context->val.disk |= D_UP_TO_DATE;
+				disconnect_where_resync_target(device);
+			}
+		}
+
+		if (role == R_PRIMARY && (flags & CS_FP_OUTDATE_PEERS)) {
+			struct drbd_peer_device *peer_device;
+			for_each_peer_device_rcu(peer_device, device) {
+				if (peer_device->disk_state[NEW] == D_UNKNOWN)
+					__change_peer_disk_state(peer_device, D_OUTDATED);
+			}
+		}
+
+		if (role == R_PRIMARY && phase == PH_COMMIT) {
+			u64 reachable_nodes = resource->twopc_reply.reachable_nodes;
+			struct drbd_peer_device *peer_device;
+
+			for_each_peer_device_rcu(peer_device, device) {
+				if (NODE_MASK(peer_device->node_id) & reachable_nodes &&
+				    peer_device->disk_state[NEW] == D_UNKNOWN &&
+				    want_bitmap(peer_device))
+					__change_peer_disk_state(peer_device, D_OUTDATED);
+			}
+		}
 	}
+	rcu_read_unlock();
 
-	switch (device->state.conn) {
-	case C_STARTING_SYNC_T:
-		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
-		break;
-	case C_STARTING_SYNC_S:
-		drbd_start_resync(device, C_SYNC_SOURCE);
-		break;
+	return phase != PH_PREPARE ||
+		context->resource->role[NOW] != context->val.role;
+}
+
+enum drbd_state_rv change_role(struct drbd_resource *resource,
+			       enum drbd_role role,
+			       enum chg_state_flags flags,
+			       const char *tag,
+			       const char **err_str)
+{
+	struct change_context role_context = {
+		.resource = resource,
+		.vnr = -1,
+		.mask = { { .role = role_MASK } },
+		.val = { { .role = role } },
+		.target_node_id = -1,
+		.flags = flags | CS_SERIALIZE,
+		.err_str = err_str,
+	};
+	enum drbd_state_rv rv;
+	bool got_state_sem = false;
+
+	if (role == R_SECONDARY) {
+		if (!(flags & CS_ALREADY_SERIALIZED)) {
+			down(&resource->state_sem);
+			got_state_sem = true;
+			role_context.flags |= CS_ALREADY_SERIALIZED;
+		}
+		role_context.change_local_state_last = true;
 	}
+	rv = change_cluster_wide_state(do_change_role, &role_context, tag);
+	if (got_state_sem)
+		up(&resource->state_sem);
+	return rv;
 }
 
-int drbd_bitmap_io_from_worker(struct drbd_device *device,
-		int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
-		char *why, enum bm_flag flags,
-		struct drbd_peer_device *peer_device)
+void __change_io_susp_user(struct drbd_resource *resource, bool value)
 {
-	int rv;
+	resource->susp_user[NEW] = value;
+}
 
-	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+enum drbd_state_rv change_io_susp_user(struct drbd_resource *resource,
+				       bool value,
+				       enum chg_state_flags flags)
+{
+	unsigned long irq_flags;
 
-	/* open coded non-blocking drbd_suspend_io(device); */
-	atomic_inc(&device->suspend_cnt);
+	begin_state_change(resource, &irq_flags, flags);
+	__change_io_susp_user(resource, value);
+	return end_state_change(resource, &irq_flags, value ? "suspend-io" : "resume-io");
+}
 
-	drbd_bm_lock(device, why, flags);
-	rv = io_fn(device, peer_device);
-	drbd_bm_unlock(device);
+void __change_io_susp_no_data(struct drbd_resource *resource, bool value)
+{
+	resource->susp_nod[NEW] = value;
+}
 
-	drbd_resume_io(device);
+void __change_io_susp_fencing(struct drbd_connection *connection, bool value)
+{
+	connection->susp_fen[NEW] = value;
+}
 
-	return rv;
+void __change_io_susp_quorum(struct drbd_resource *resource, bool value)
+{
+	resource->susp_quorum[NEW] = value;
 }
 
-int notify_resource_state_change(struct sk_buff *skb,
-				  unsigned int seq,
-				  void *state_change,
-				  enum drbd_notification_type type)
+void __change_disk_state(struct drbd_device *device, enum drbd_disk_state disk_state)
 {
-	struct drbd_resource_state_change *resource_state_change = state_change;
-	struct drbd_resource *resource = resource_state_change->resource;
-	struct resource_info resource_info = {
-		.res_role = resource_state_change->role[NEW],
-		.res_susp = resource_state_change->susp[NEW],
-		.res_susp_nod = resource_state_change->susp_nod[NEW],
-		.res_susp_fen = resource_state_change->susp_fen[NEW],
-	};
+	device->disk_state[NEW] = disk_state;
+}
+
+void __downgrade_disk_states(struct drbd_resource *resource, enum drbd_disk_state disk_state)
+{
+	struct drbd_device *device;
+	int vnr;
 
-	return notify_resource_state(skb, seq, resource, &resource_info, type);
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (device->disk_state[NEW] > disk_state)
+			__change_disk_state(device, disk_state);
+	}
+	rcu_read_unlock();
 }
 
-int notify_connection_state_change(struct sk_buff *skb,
-				    unsigned int seq,
-				    void *state_change,
-				    enum drbd_notification_type type)
+void __outdate_myself(struct drbd_resource *resource)
 {
-	struct drbd_connection_state_change *p = state_change;
-	struct drbd_connection *connection = p->connection;
-	struct connection_info connection_info = {
-		.conn_connection_state = p->cstate[NEW],
-		.conn_role = p->peer_role[NEW],
-	};
+	struct drbd_device *device;
+	int vnr;
 
-	return notify_connection_state(skb, seq, connection, &connection_info, type);
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (device->disk_state[NOW] > D_OUTDATED)
+			__change_disk_state(device, D_OUTDATED);
+	}
 }
 
-int notify_device_state_change(struct sk_buff *skb,
-				unsigned int seq,
-				void *state_change,
-				enum drbd_notification_type type)
+static bool device_has_connected_peer_devices(struct drbd_device *device)
 {
-	struct drbd_device_state_change *device_state_change = state_change;
-	struct drbd_device *device = device_state_change->device;
-	struct device_info device_info = {
-		.dev_disk_state = device_state_change->disk_state[NEW],
-	};
+	struct drbd_peer_device *peer_device;
 
-	return notify_device_state(skb, seq, device, &device_info, type);
+	for_each_peer_device(peer_device, device)
+		if (peer_device->repl_state[NOW] >= L_ESTABLISHED)
+			return true;
+	return false;
 }
 
-int notify_peer_device_state_change(struct sk_buff *skb,
-				     unsigned int seq,
-				     void *state_change,
-				     enum drbd_notification_type type)
+static bool device_has_peer_devices_with_disk(struct drbd_device *device)
 {
-	struct drbd_peer_device_state_change *p = state_change;
-	struct drbd_peer_device *peer_device = p->peer_device;
-	struct peer_device_info peer_device_info = {
-		.peer_repl_state = p->repl_state[NEW],
-		.peer_disk_state = p->disk_state[NEW],
-		.peer_resync_susp_user = p->resync_susp_user[NEW],
-		.peer_resync_susp_peer = p->resync_susp_peer[NEW],
-		.peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
-	};
+	struct drbd_peer_device *peer_device;
+	bool rv = false;
+
+	for_each_peer_device(peer_device, device) {
+		if (peer_device->connection->cstate[NOW] == C_CONNECTED) {
+			/* We expect to receive up-to-date UUIDs soon.
+			   To avoid a race in receive_state, "clear" uuids while
+			   holding state_rwlock. I.e. atomic with the state change */
+			clear_bit(UUIDS_RECEIVED, &peer_device->flags);
+			if (peer_device->disk_state[NOW] > D_DISKLESS)
+				rv = true;
+		}
+	}
 
-	return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+	return rv;
 }
 
-static void broadcast_state_change(struct drbd_state_change *state_change)
+static void restore_outdated_in_pdsk(struct drbd_device *device)
 {
-	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
-	bool resource_state_has_changed;
-	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
-	int (*last_func)(struct sk_buff *, unsigned int,
-		void *, enum drbd_notification_type) = NULL;
-	void *last_arg = NULL;
+	struct drbd_peer_device *peer_device;
 
-#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
-#define FINAL_STATE_CHANGE(type) \
-	({ if (last_func) \
-		last_func(NULL, 0, last_arg, type); \
-	})
-#define REMEMBER_STATE_CHANGE(func, arg, type) \
-	({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
-	   last_func = func; \
-	   last_arg = arg; \
-	 })
+	if (!get_ldev_if_state(device, D_ATTACHING))
+		return;
 
-	mutex_lock(&notification_mutex);
+	for_each_peer_device(peer_device, device) {
+		int node_id = peer_device->connection->peer_node_id;
+		struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id];
 
-	resource_state_has_changed =
-	    HAS_CHANGED(resource_state_change->role) ||
-	    HAS_CHANGED(resource_state_change->susp) ||
-	    HAS_CHANGED(resource_state_change->susp_nod) ||
-	    HAS_CHANGED(resource_state_change->susp_fen);
+		if ((peer_md->flags & MDF_PEER_OUTDATED) &&
+		    peer_device->disk_state[NEW] == D_UNKNOWN)
+			__change_peer_disk_state(peer_device, D_OUTDATED);
+	}
 
-	if (resource_state_has_changed)
-		REMEMBER_STATE_CHANGE(notify_resource_state_change,
-				      resource_state_change, NOTIFY_CHANGE);
+	put_ldev(device);
+}
 
-	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
-		struct drbd_connection_state_change *connection_state_change =
-				&state_change->connections[n_connection];
+static bool do_twopc_after_lost_peer(struct change_context *context, enum change_phase phase)
+{
+	struct drbd_resource *resource = context->resource;
+	struct twopc_reply *reply = &resource->twopc_reply;
+	u64 directly_reachable = directly_connected_nodes(resource, NEW) |
+		NODE_MASK(resource->res_opts.node_id);
+	bool pri_incapable = reply->primary_nodes && !reply->weak_nodes; /* TWOPC_PRI_INCAPABLE */
+
+	if (phase == PH_COMMIT && (reply->primary_nodes & ~directly_reachable && !pri_incapable)) {
+		__outdate_myself(resource);
+	} else {
+		struct drbd_device *device;
+		int vnr;
 
-		if (HAS_CHANGED(connection_state_change->peer_role) ||
-		    HAS_CHANGED(connection_state_change->cstate))
-			REMEMBER_STATE_CHANGE(notify_connection_state_change,
-					      connection_state_change, NOTIFY_CHANGE);
+		idr_for_each_entry(&resource->devices, device, vnr) {
+			if (device->disk_state[NOW] == D_CONSISTENT &&
+			    may_return_to_up_to_date(device, NOW))
+				__change_disk_state(device, D_UP_TO_DATE);
+		}
 	}
 
-	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
-		struct drbd_device_state_change *device_state_change =
-			&state_change->devices[n_device];
+	return phase != PH_PREPARE || reply->reachable_nodes != NODE_MASK(resource->res_opts.node_id);
+}
+
+static enum drbd_state_rv twopc_after_lost_peer(struct drbd_resource *resource,
+						enum chg_state_flags flags)
+{
+	struct change_context context = {
+		.resource = resource,
+		.vnr = -1,
+		.mask = { },
+		.val = { },
+		.target_node_id = -1,
+		.flags = flags | (resource->res_opts.quorum != QOU_OFF ? CS_FORCE_RECALC : 0),
+		.change_local_state_last = false,
+	};
+
+	/* The other nodes get the request for an empty state change. I.e. they
+	   will agree to this change request. At commit time we know where to
+	   go from the D_CONSISTENT, since we got the primary mask. */
+	return change_cluster_wide_state(do_twopc_after_lost_peer, &context, "lost-peer");
+}
+
+void drbd_empty_twopc_work_fn(struct work_struct *work)
+{
+	struct drbd_resource *resource = container_of(work, struct drbd_resource, empty_twopc);
 
-		if (HAS_CHANGED(device_state_change->disk_state))
-			REMEMBER_STATE_CHANGE(notify_device_state_change,
-					      device_state_change, NOTIFY_CHANGE);
-	}
+	twopc_after_lost_peer(resource, CS_VERBOSE);
 
-	n_peer_devices = state_change->n_devices * state_change->n_connections;
-	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
-		struct drbd_peer_device_state_change *p =
-			&state_change->peer_devices[n_peer_device];
+	clear_bit(TRY_BECOME_UP_TO_DATE_PENDING, &resource->flags);
+	wake_up_all(&resource->state_wait);
 
-		if (HAS_CHANGED(p->disk_state) ||
-		    HAS_CHANGED(p->repl_state) ||
-		    HAS_CHANGED(p->resync_susp_user) ||
-		    HAS_CHANGED(p->resync_susp_peer) ||
-		    HAS_CHANGED(p->resync_susp_dependency))
-			REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
-					      p, NOTIFY_CHANGE);
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static bool do_change_disk_state(struct change_context *context, enum change_phase phase)
+{
+	struct drbd_device *device =
+		container_of(context, struct change_disk_state_context, context)->device;
+	bool cluster_wide_state_change = false;
+
+	if (device->disk_state[NOW] == D_ATTACHING &&
+	    context->val.disk == D_NEGOTIATING) {
+		if (device_has_peer_devices_with_disk(device)) {
+			cluster_wide_state_change =
+				supports_two_phase_commit(device->resource);
+		} else {
+			/* very last part of attach */
+			/* ldev_safe: D_ATTACHING->D_NEGOTIATING, state_rwlock held, ldev exists */
+			context->val.disk = disk_state_from_md(device);
+			restore_outdated_in_pdsk(device);
+		}
+	} else if (device->disk_state[NOW] != D_DETACHING &&
+		   context->val.disk == D_DETACHING &&
+		   device_has_connected_peer_devices(device)) {
+		cluster_wide_state_change = true;
 	}
+	__change_disk_state(device, context->val.disk);
+	return phase != PH_PREPARE || cluster_wide_state_change;
+}
 
-	FINAL_STATE_CHANGE(NOTIFY_CHANGE);
-	mutex_unlock(&notification_mutex);
+enum drbd_state_rv change_disk_state(struct drbd_device *device,
+				     enum drbd_disk_state disk_state,
+				     enum chg_state_flags flags,
+				     const char *tag,
+				     const char **err_str)
+{
+	struct change_disk_state_context disk_state_context = {
+		.context = {
+			.resource = device->resource,
+			.vnr = device->vnr,
+			.mask = { { .disk = disk_MASK } },
+			.val = { { .disk = disk_state } },
+			.target_node_id = -1,
+			.flags = flags,
+			.change_local_state_last = true,
+			.err_str = err_str,
+		},
+		.device = device,
+	};
 
-#undef HAS_CHANGED
-#undef FINAL_STATE_CHANGE
-#undef REMEMBER_STATE_CHANGE
+	return change_cluster_wide_state(do_change_disk_state,
+					 &disk_state_context.context, tag);
 }
 
-/* takes old and new peer disk state */
-static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
+void __change_cstate(struct drbd_connection *connection, enum drbd_conn_state cstate)
 {
-	if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
-	&&  (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
-		return true;
+	if (cstate == C_DISCONNECTING)
+		set_bit(DISCONNECT_EXPECTED, &connection->flags);
 
-	/* Scenario, starting with normal operation
-	 * Connected Primary/Secondary UpToDate/UpToDate
-	 * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
-	 * ...
-	 * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
-	 */
-	if (os == D_UNKNOWN
-	&&  (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
-		return true;
+	connection->cstate[NEW] = cstate;
+	if (cstate < C_CONNECTED) {
+		struct drbd_peer_device *peer_device;
+		int vnr;
 
-	return false;
+		rcu_read_lock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+			__change_repl_state(peer_device, L_OFF);
+		rcu_read_unlock();
+	}
 }
 
-/**
- * after_state_ch() - Perform after state change actions that may sleep
- * @device:	DRBD device.
- * @os:		old state.
- * @ns:		new state.
- * @flags:	Flags
- * @state_change: state change to broadcast
- */
-static void after_state_ch(struct drbd_device *device, union drbd_state os,
-			   union drbd_state ns, enum chg_state_flags flags,
-			   struct drbd_state_change *state_change)
+static bool connection_has_connected_peer_devices(struct drbd_connection *connection)
 {
-	struct drbd_resource *resource = device->resource;
-	struct drbd_peer_device *peer_device = first_peer_device(device);
-	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
-	struct sib_info sib;
+	struct drbd_peer_device *peer_device;
+	int vnr;
 
-	broadcast_state_change(state_change);
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (peer_device->repl_state[NOW] >= L_ESTABLISHED)
+			return true;
+	}
+	return false;
+}
 
-	sib.sib_reason = SIB_STATE_CHANGE;
-	sib.os = os;
-	sib.ns = ns;
+enum outdate_what { OUTDATE_NOTHING, OUTDATE_DISKS, OUTDATE_PEER_DISKS };
 
-	if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
-	&&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
-		clear_bit(CRASHED_PRIMARY, &device->flags);
-		if (device->p_uuid)
-			device->p_uuid[UI_FLAGS] &= ~((u64)2);
+static enum outdate_what outdate_on_disconnect(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+
+	/* Fencing or quorum is in effect, and exactly one side is Primary. */
+	if (connection->cstate[NOW] == C_CONNECTED &&
+	    (connection->fencing_policy >= FP_RESOURCE ||
+	     resource->res_opts.quorum != QOU_OFF) &&
+	    resource->role[NOW] != connection->peer_role[NOW]) {
+		/* primary politely disconnects from secondary,
+		 * tells peer to please outdate itself */
+		if (resource->role[NOW] == R_PRIMARY)
+			return OUTDATE_PEER_DISKS;
+
+		/* secondary politely disconnect from primary,
+		 * proposes to outdate itself. */
+		if (connection->peer_role[NOW] == R_PRIMARY)
+			return OUTDATE_DISKS;
 	}
+	return OUTDATE_NOTHING;
+}
 
-	/* Inform userspace about the change... */
-	drbd_bcast_event(device, &sib);
+static void __change_cstate_and_outdate(struct drbd_connection *connection,
+					enum drbd_conn_state cstate,
+					enum outdate_what outdate_what)
+{
+	__change_cstate(connection, cstate);
+	switch (outdate_what) {
+	case OUTDATE_DISKS:
+		__downgrade_disk_states(connection->resource, D_OUTDATED);
+		break;
+	case OUTDATE_PEER_DISKS:
+		__downgrade_peer_disk_states(connection, D_OUTDATED);
+		break;
+	case OUTDATE_NOTHING:
+		break;
+	}
+}
 
-	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
-	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
-		drbd_khelper(device, "pri-on-incon-degr");
+void apply_connect(struct drbd_connection *connection, bool commit)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
 
-	/* Here we have the actions that are performed after a
-	   state change. This function might sleep */
+	if (!commit || connection->cstate[NEW] != C_CONNECTED)
+		return;
 
-	if (ns.susp_nod) {
-		enum drbd_req_event what = NOTHING;
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		union drbd_state s = peer_device->connect_state;
 
-		spin_lock_irq(&device->resource->req_lock);
-		if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
-			what = RESEND;
+		if (s.disk != D_MASK)
+			__change_disk_state(device, s.disk);
+		if (device->disk_state[NOW] != D_NEGOTIATING)
+			__change_repl_state(peer_device, s.conn);
+		__change_peer_disk_state(peer_device, s.pdsk);
+		__change_resync_susp_peer(peer_device, s.peer_isp);
 
-		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-		    conn_lowest_disk(connection) == D_UP_TO_DATE)
-			what = RESTART_FROZEN_DISK_IO;
+		if (s.conn == L_OFF)
+			__change_cstate(connection, C_DISCONNECTING);
 
-		if (resource->susp_nod && what != NOTHING) {
-			_tl_restart(connection, what);
-			_conn_request_state(connection,
-					    (union drbd_state) { { .susp_nod = 1 } },
-					    (union drbd_state) { { .susp_nod = 0 } },
-					    CS_VERBOSE);
-		}
-		spin_unlock_irq(&device->resource->req_lock);
+		/* The guard at function entry already returned unless
+		 * commit is true, so no need to re-check it here. */
+		clear_bit(DISCARD_MY_DATA, &peer_device->flags);
 	}
+}
 
-	if (ns.susp_fen) {
-		spin_lock_irq(&device->resource->req_lock);
-		if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
-			/* case2: The connection was established again: */
-			struct drbd_peer_device *peer_device;
-			int vnr;
-
-			rcu_read_lock();
-			idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-				clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
-			rcu_read_unlock();
-
-			/* We should actively create a new uuid, _before_
-			 * we resume/resent, if the peer is diskless
-			 * (recovery from a multiple error scenario).
-			 * Currently, this happens with a slight delay
-			 * below when checking lost_contact_to_peer_data() ...
-			 */
-			_tl_restart(connection, RESEND);
-			_conn_request_state(connection,
-					    (union drbd_state) { { .susp_fen = 1 } },
-					    (union drbd_state) { { .susp_fen = 0 } },
-					    CS_VERBOSE);
-		}
-		spin_unlock_irq(&device->resource->req_lock);
-	}
-
-	/* Became sync source.  With protocol >= 96, we still need to send out
-	 * the sync uuid now. Need to do that before any drbd_send_state, or
-	 * the other side may go "paused sync" before receiving the sync uuids,
-	 * which is unexpected. */
-	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
-	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-	    connection->agreed_pro_version >= 96 && get_ldev(device)) {
-		drbd_gen_and_send_sync_uuid(peer_device);
-		put_ldev(device);
-	}
+struct change_cstate_context {
+	struct change_context context;
+	struct drbd_connection *connection;
+	enum outdate_what outdate_what;
+};
 
-	/* Do not change the order of the if above and the two below... */
-	if (os.pdsk == D_DISKLESS &&
-	    ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) {      /* attach on the peer */
-		/* we probably will start a resync soon.
-		 * make sure those things are properly reset. */
-		device->rs_total = 0;
-		device->rs_failed = 0;
-		atomic_set(&device->rs_pending_cnt, 0);
-		drbd_rs_cancel_all(device);
-
-		drbd_send_uuids(peer_device);
-		drbd_send_state(peer_device, ns);
-	}
-	/* No point in queuing send_bitmap if we don't have a connection
-	 * anymore, so check also the _current_ state, not only the new state
-	 * at the time this work was queued. */
-	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
-	    device->state.conn == C_WF_BITMAP_S)
-		drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
-				"send_bitmap (WFBitMapS)",
-				BM_LOCKED_TEST_ALLOWED, peer_device);
-
-	/* Lost contact to peer's copy of the data */
-	if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
-		if (get_ldev(device)) {
-			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-			    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-				if (drbd_suspended(device)) {
-					set_bit(NEW_CUR_UUID, &device->flags);
-				} else {
-					drbd_uuid_new_current(device);
-					drbd_send_uuids(peer_device);
-				}
+static bool do_change_cstate(struct change_context *context, enum change_phase phase)
+{
+	struct change_cstate_context *cstate_context =
+		container_of(context, struct change_cstate_context, context);
+	struct drbd_connection *connection = cstate_context->connection;
+	struct drbd_resource *resource = context->resource;
+	struct twopc_reply *reply = &resource->twopc_reply;
+
+	if (phase == PH_PREPARE) {
+		cstate_context->outdate_what = OUTDATE_NOTHING;
+		if (context->val.conn == C_DISCONNECTING && !(context->flags & CS_HARD)) {
+			cstate_context->outdate_what =
+				outdate_on_disconnect(connection);
+			switch (cstate_context->outdate_what) {
+			case OUTDATE_DISKS:
+				context->mask.disk = disk_MASK;
+				context->val.disk = D_OUTDATED;
+				break;
+			case OUTDATE_PEER_DISKS:
+				context->mask.pdsk = pdsk_MASK;
+				context->val.pdsk = D_OUTDATED;
+				break;
+			case OUTDATE_NOTHING:
+				break;
 			}
-			put_ldev(device);
 		}
 	}
+	if ((context->val.conn == C_CONNECTED && connection->cstate[NEW] == C_CONNECTING) ||
+	    context->val.conn != C_CONNECTED)
+		__change_cstate_and_outdate(connection,
+					    context->val.conn,
+					    cstate_context->outdate_what);
+
+	if (context->val.conn == C_CONNECTED &&
+	    connection->agreed_pro_version >= 117)
+		apply_connect(connection, phase == PH_COMMIT);
+
+	if (phase == PH_COMMIT) {
+		u64 directly_reachable = directly_connected_nodes(resource, NEW) |
+			NODE_MASK(resource->res_opts.node_id);
+
+		if (reply->primary_nodes & ~directly_reachable)
+			__outdate_myself(resource);
+	}
 
-	if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
-		if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
-		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-			drbd_uuid_new_current(device);
-			drbd_send_uuids(peer_device);
-		}
-		/* D_DISKLESS Peer becomes secondary */
-		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-			/* We may still be Primary ourselves.
-			 * No harm done if the bitmap still changes,
-			 * redirtied pages will follow later. */
-			drbd_bitmap_io_from_worker(device, &drbd_bm_write,
-				"demote diskless peer", BM_LOCKED_SET_ALLOWED, peer_device);
-		put_ldev(device);
+	if (context->val.conn == C_CONNECTED && connection->peer_role[NOW] == R_UNKNOWN) {
+		enum drbd_role target_role =
+			(reply->primary_nodes & NODE_MASK(context->target_node_id)) ?
+			R_PRIMARY : R_SECONDARY;
+
+		__change_peer_role(connection, target_role);
 	}
 
-	/* Write out all changed bits on demote.
-	 * Though, no need to da that just yet
-	 * if there is a resync going on still */
-	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
-		device->state.conn <= C_CONNECTED && get_ldev(device)) {
-		/* No changes to the bitmap expected this time, so assert that,
-		 * even though no harm was done if it did change. */
-		drbd_bitmap_io_from_worker(device, &drbd_bm_write,
-				"demote", BM_LOCKED_TEST_ALLOWED, peer_device);
-		put_ldev(device);
+	return phase != PH_PREPARE ||
+	       context->val.conn == C_CONNECTED ||
+	       (context->val.conn == C_DISCONNECTING &&
+		connection_has_connected_peer_devices(connection));
+}
+
+/**
+ * change_cstate_tag()  -  change the connection state of a connection
+ * @connection: DRBD connection.
+ * @cstate: The connection state to change to.
+ * @flags: State change flags.
+ * @tag: State change tag to print in status messages.
+ * @err_str: Pointer to save the error string to.
+ *
+ * When disconnecting from a peer, we may also need to outdate the local or
+ * peer disks depending on the fencing policy.  This cannot easily be split
+ * into two state changes.
+ */
+enum drbd_state_rv change_cstate_tag(struct drbd_connection *connection,
+				    enum drbd_conn_state cstate,
+				    enum chg_state_flags flags,
+				    const char *tag,
+				    const char **err_str)
+{
+	struct change_cstate_context cstate_context = {
+		.context = {
+			.resource = connection->resource,
+			.vnr = -1,
+			.mask = { { .conn = conn_MASK } },
+			.val = { { .conn = cstate } },
+			.target_node_id = connection->peer_node_id,
+			.flags = flags,
+			.change_local_state_last = true,
+			.err_str = err_str,
+		},
+		.connection = connection,
+	};
+
+	if (cstate == C_CONNECTED) {
+		cstate_context.context.mask.role = role_MASK;
+		cstate_context.context.val.role = connection->resource->role[NOW];
 	}
 
-	/* Last part of the attaching process ... */
-	if (ns.conn >= C_CONNECTED &&
-	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-		drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
-		drbd_send_uuids(peer_device);
-		drbd_send_state(peer_device, ns);
-	}
-
-	/* We want to pause/continue resync, tell peer. */
-	if (ns.conn >= C_CONNECTED &&
-	     ((os.aftr_isp != ns.aftr_isp) ||
-	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(peer_device, ns);
-
-	/* In case one of the isp bits got set, suspend other devices. */
-	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
-	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
-		suspend_other_sg(device);
-
-	/* Make sure the peer gets informed about eventual state
-	   changes (ISP bits) while we were in WFReportParams. */
-	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(peer_device, ns);
-
-	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(peer_device, ns);
-
-	/* We are in the progress to start a full sync... */
-	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
-		/* no other bitmap changes expected during this phase */
-		drbd_queue_bitmap_io(device,
-			&drbd_bmio_set_n_write, &abw_start_sync,
-			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED,
-			peer_device);
-
-	/* first half of local IO error, failure to attach,
-	 * or administrative detach */
-	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh = EP_PASS_ON;
-		int was_io_error = 0;
-		/* corresponding get_ldev was in _drbd_set_state, to serialize
-		 * our cleanup here with the transition to D_DISKLESS.
-		 * But is is still not save to dreference ldev here, since
-		 * we might come from an failed Attach before ldev was set. */
-		if (device->ldev) {
-			rcu_read_lock();
-			eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
-			rcu_read_unlock();
+	/*
+	 * Hard connection state changes like a protocol error or forced
+	 * disconnect may occur while we are holding resource->state_sem.  In
+	 * that case, omit CS_SERIALIZE so that we don't deadlock trying to
+	 * acquire state_sem again.
+	 */
+	if (!(flags & CS_HARD))
+		cstate_context.context.flags |= CS_SERIALIZE;
 
-			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
-
-			/* Intentionally call this handler first, before drbd_send_state().
-			 * See: 2932204 drbd: call local-io-error handler early
-			 * People may chose to hard-reset the box from this handler.
-			 * It is useful if this looks like a "regular node crash". */
-			if (was_io_error && eh == EP_CALL_HELPER)
-				drbd_khelper(device, "local-io-error");
-
-			/* Immediately allow completion of all application IO,
-			 * that waits for completion from the local disk,
-			 * if this was a force-detach due to disk_timeout
-			 * or administrator request (drbdsetup detach --force).
-			 * Do NOT abort otherwise.
-			 * Aborting local requests may cause serious problems,
-			 * if requests are completed to upper layers already,
-			 * and then later the already submitted local bio completes.
-			 * This can cause DMA into former bio pages that meanwhile
-			 * have been re-used for other things.
-			 * So aborting local requests may cause crashes,
-			 * or even worse, silent data corruption.
-			 */
-			if (test_and_clear_bit(FORCE_DETACH, &device->flags))
-				tl_abort_disk_io(device);
+	return change_cluster_wide_state(do_change_cstate, &cstate_context.context, tag);
+}
 
-			/* current state still has to be D_FAILED,
-			 * there is only one way out: to D_DISKLESS,
-			 * and that may only happen after our put_ldev below. */
-			if (device->state.disk != D_FAILED)
-				drbd_err(device,
-					"ASSERT FAILED: disk is %s during detach\n",
-					drbd_disk_str(device->state.disk));
+/* Set the proposed ([NEW]) role of the peer on this connection. */
+void __change_peer_role(struct drbd_connection *connection, enum drbd_role peer_role)
+{
+	connection->peer_role[NEW] = peer_role;
+}
 
-			if (ns.conn >= C_CONNECTED)
-				drbd_send_state(peer_device, ns);
+/* Set the proposed ([NEW]) replication state of @peer_device.
+ * Any active replication state (> L_OFF) implies an established link,
+ * so the connection's [NEW] cstate is raised to C_CONNECTED as well. */
+void __change_repl_state(struct drbd_peer_device *peer_device, enum drbd_repl_state repl_state)
+{
+	peer_device->repl_state[NEW] = repl_state;
+	if (repl_state > L_OFF)
+		peer_device->connection->cstate[NEW] = C_CONNECTED;
+}
 
-			drbd_rs_cancel_all(device);
+struct change_repl_context {
+	struct change_context context;
+	struct drbd_peer_device *peer_device;
+};
 
-			/* In case we want to get something to stable storage still,
-			 * this may be the last chance.
-			 * Following put_ldev may transition to D_DISKLESS. */
-			drbd_md_sync(device);
-		}
-		put_ldev(device);
-	}
+/* Prepare/commit callback for change_repl_state().
+ * Always returns true in the commit/abort phases.  In PH_PREPARE the
+ * return value selects whether the change must go cluster-wide (compare
+ * do_change_disk_state()): true for starting a sync from a live link,
+ * for leaving an established link via verify or L_OFF, or -- with
+ * CS_CLUSTER_WIDE requested -- for entering a bitmap exchange. */
+static bool do_change_repl_state(struct change_context *context, enum change_phase phase)
+{
+	struct change_repl_context *repl_context =
+		container_of(context, struct change_repl_context, context);
+	struct drbd_peer_device *peer_device = repl_context->peer_device;
+	enum drbd_repl_state *repl_state = peer_device->repl_state;
+	enum drbd_repl_state new_repl_state = context->val.conn;
+	bool cluster_wide = context->flags & CS_CLUSTER_WIDE;
+
+	__change_repl_state(peer_device, new_repl_state);
+
+	/* NOW still holds the pre-transaction state here; NEW was just set. */
+	return phase != PH_PREPARE ||
+		((repl_state[NOW] >= L_ESTABLISHED &&
+		  (new_repl_state == L_STARTING_SYNC_S || new_repl_state == L_STARTING_SYNC_T)) ||
+		 (repl_state[NOW] == L_ESTABLISHED &&
+		  (new_repl_state == L_VERIFY_S || new_repl_state == L_OFF)) ||
+		 (repl_state[NOW] == L_ESTABLISHED && cluster_wide &&
+		  (new_repl_state == L_WF_BITMAP_S || new_repl_state == L_WF_BITMAP_T)));
+}
 
-	/* second half of local IO error, failure to attach,
-	 * or administrative detach,
-	 * after local_cnt references have reached zero again */
-	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
-		/* We must still be diskless,
-		 * re-attach has to be serialized with this! */
-		if (device->state.disk != D_DISKLESS)
-			drbd_err(device,
-				 "ASSERT FAILED: disk is %s while going diskless\n",
-				 drbd_disk_str(device->state.disk));
-
-		if (ns.conn >= C_CONNECTED)
-			drbd_send_state(peer_device, ns);
-		/* corresponding get_ldev in __drbd_set_state
-		 * this may finally trigger drbd_ldev_destroy. */
-		put_ldev(device);
-	}
+enum drbd_state_rv change_repl_state(struct drbd_peer_device *peer_device,
+				     enum drbd_repl_state new_repl_state,
+				     enum chg_state_flags flags,
+				     const char *tag)
+{
+	struct change_repl_context repl_context = {
+		.context = {
+			.resource = peer_device->device->resource,
+			.vnr = peer_device->device->vnr,
+			.mask = { { .conn = conn_MASK } },
+			.val = { { .conn = new_repl_state } },
+			.target_node_id = peer_device->node_id,
+			.flags = flags
+		},
+		.peer_device = peer_device
+	};
 
-	/* Notify peer that I had a local IO error, and did not detached.. */
-	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-		drbd_send_state(peer_device, ns);
-
-	/* Disks got bigger while they were detached */
-	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
-	    test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
-		if (ns.conn == C_CONNECTED)
-			resync_after_online_grow(device);
-	}
-
-	/* A resync finished or aborted, wake paused devices... */
-	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
-	    (os.peer_isp && !ns.peer_isp) ||
-	    (os.user_isp && !ns.user_isp))
-		resume_next_sg(device);
-
-	/* sync target done with resync.  Explicitly notify peer, even though
-	 * it should (at least for non-empty resyncs) already know itself. */
-	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(peer_device, ns);
-
-	/* Verify finished, or reached stop sector.  Peer did not know about
-	 * the stop sector, and we may even have changed the stop sector during
-	 * verify to interrupt/stop early.  Send the new state. */
-	if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
-	&& verify_can_do_stop_sector(device))
-		drbd_send_state(peer_device, ns);
-
-	/* This triggers bitmap writeout of potentially still unwritten pages
-	 * if the resync finished cleanly, or aborted because of peer disk
-	 * failure, or on transition from resync back to AHEAD/BEHIND.
-	 *
-	 * Connection loss is handled in drbd_disconnected() by the receiver.
-	 *
-	 * For resync aborted because of local disk failure, we cannot do
-	 * any bitmap writeout anymore.
-	 *
-	 * No harm done if some bits change during this phase.
-	 */
-	if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
-	    (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
-		drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
-			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED,
-			peer_device);
-		put_ldev(device);
-	}
+	if (new_repl_state == L_WF_BITMAP_S || new_repl_state == L_VERIFY_S)
+		repl_context.context.change_local_state_last = true;
 
-	if (ns.disk == D_DISKLESS &&
-	    ns.conn == C_STANDALONE &&
-	    ns.role == R_SECONDARY) {
-		if (os.aftr_isp != ns.aftr_isp)
-			resume_next_sg(device);
-	}
+	return change_cluster_wide_state(do_change_repl_state, &repl_context.context, tag);
+}
 
-	drbd_md_sync(device);
+enum drbd_state_rv stable_change_repl_state(struct drbd_peer_device *peer_device,
+					    enum drbd_repl_state repl_state,
+					    enum chg_state_flags flags,
+					    const char *tag)
+{
+	return stable_state_change(peer_device->device->resource,
+		change_repl_state(peer_device, repl_state, flags, tag));
 }
 
-struct after_conn_state_chg_work {
-	struct drbd_work w;
-	enum drbd_conns oc;
-	union drbd_state ns_min;
-	union drbd_state ns_max; /* new, max state, over all devices */
-	enum chg_state_flags flags;
-	struct drbd_connection *connection;
-	struct drbd_state_change *state_change;
-};
+void __change_peer_disk_state(struct drbd_peer_device *peer_device, enum drbd_disk_state disk_state)
+{
+	peer_device->disk_state[NEW] = disk_state;
+}
 
-static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+void __downgrade_peer_disk_states(struct drbd_connection *connection, enum drbd_disk_state disk_state)
 {
-	struct after_conn_state_chg_work *acscw =
-		container_of(w, struct after_conn_state_chg_work, w);
-	struct drbd_connection *connection = acscw->connection;
-	enum drbd_conns oc = acscw->oc;
-	union drbd_state ns_max = acscw->ns_max;
 	struct drbd_peer_device *peer_device;
 	int vnr;
 
-	broadcast_state_change(acscw->state_change);
-	forget_state_change(acscw->state_change);
-	kfree(acscw);
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (peer_device->disk_state[NEW] > disk_state)
+			__change_peer_disk_state(peer_device, disk_state);
+	}
+	rcu_read_unlock();
+}
 
-	/* Upon network configuration, we need to start the receiver */
-	if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
-		drbd_thread_start(&connection->receiver);
+enum drbd_state_rv change_peer_disk_state(struct drbd_peer_device *peer_device,
+					  enum drbd_disk_state disk_state,
+					  enum chg_state_flags flags,
+					  const char *tag)
+{
+	struct drbd_resource *resource = peer_device->device->resource;
+	unsigned long irq_flags;
 
-	if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
-		struct net_conf *old_conf;
+	begin_state_change(resource, &irq_flags, flags);
+	__change_peer_disk_state(peer_device, disk_state);
+	return end_state_change(resource, &irq_flags, tag);
+}
 
-		mutex_lock(&notification_mutex);
-		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-			notify_peer_device_state(NULL, 0, peer_device, NULL,
-						 NOTIFY_DESTROY | NOTIFY_CONTINUES);
-		notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
-		mutex_unlock(&notification_mutex);
+void __change_resync_susp_user(struct drbd_peer_device *peer_device,
+				       bool value)
+{
+	peer_device->resync_susp_user[NEW] = value;
+}
 
-		mutex_lock(&connection->resource->conf_update);
-		old_conf = connection->net_conf;
-		connection->my_addr_len = 0;
-		connection->peer_addr_len = 0;
-		RCU_INIT_POINTER(connection->net_conf, NULL);
-		conn_free_crypto(connection);
-		mutex_unlock(&connection->resource->conf_update);
+enum drbd_state_rv change_resync_susp_user(struct drbd_peer_device *peer_device,
+						   bool value,
+						   enum chg_state_flags flags)
+{
+	struct drbd_resource *resource = peer_device->device->resource;
+	unsigned long irq_flags;
 
-		kvfree_rcu_mightsleep(old_conf);
-	}
+	begin_state_change(resource, &irq_flags, flags);
+	__change_resync_susp_user(peer_device, value);
+	return end_state_change(resource, &irq_flags, value ? "pause-sync" : "resume-sync");
+}
 
-	if (ns_max.susp_fen) {
-		/* case1: The outdate peer handler is successful: */
-		if (ns_max.pdsk <= D_OUTDATED) {
-			rcu_read_lock();
-			idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-				struct drbd_device *device = peer_device->device;
-				if (test_bit(NEW_CUR_UUID, &device->flags)) {
-					drbd_uuid_new_current(device);
-					clear_bit(NEW_CUR_UUID, &device->flags);
-				}
-			}
-			rcu_read_unlock();
-			spin_lock_irq(&connection->resource->req_lock);
-			_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
-			_conn_request_state(connection,
-					    (union drbd_state) { { .susp_fen = 1 } },
-					    (union drbd_state) { { .susp_fen = 0 } },
-					    CS_VERBOSE);
-			spin_unlock_irq(&connection->resource->req_lock);
-		}
-	}
-	conn_md_sync(connection);
-	kref_put(&connection->kref, drbd_destroy_connection);
+void __change_resync_susp_peer(struct drbd_peer_device *peer_device,
+				       bool value)
+{
+	peer_device->resync_susp_peer[NEW] = value;
+}
 
-	return 0;
+void __change_resync_susp_dependency(struct drbd_peer_device *peer_device,
+					     bool value)
+{
+	peer_device->resync_susp_dependency[NEW] = value;
 }
 
-static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+static void log_current_uuids(struct drbd_device *device)
 {
-	enum chg_state_flags flags = ~0;
 	struct drbd_peer_device *peer_device;
-	int vnr, first_vol = 1;
-	union drbd_dev_state os, cs = {
-		{ .role = R_SECONDARY,
-		  .peer = R_UNKNOWN,
-		  .conn = connection->cstate,
-		  .disk = D_DISKLESS,
-		  .pdsk = D_UNKNOWN,
-		} };
+	struct drbd_connection *connection;
+	/* Must be initialized: if no peer disk is D_UP_TO_DATE the loop
+	 * below never writes to msg, yet it is unconditionally handed to
+	 * drbd_warn("%s", msg) -- that would read uninitialized stack. */
+	char msg[120] = "";
+	int ret, pos = 0;
 
 	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		os = device->state;
-
-		if (first_vol) {
-			cs = os;
-			first_vol = 0;
+	for_each_peer_device_rcu(peer_device, device) {
+		if (peer_device->disk_state[NOW] != D_UP_TO_DATE)
 			continue;
+		connection = peer_device->connection;
+		ret = snprintf(msg + pos, 120 - pos, "%s: %016llX ",
+			       rcu_dereference(connection->transport.net_conf)->name,
+			       peer_device->current_uuid);
+		if (ret > 0)
+			pos += ret;
+		if (pos >= 120)
+			break;
+	}
+	rcu_read_unlock();
+	drbd_warn(device, "%s", msg);
+}
+
+bool drbd_res_data_accessible(struct drbd_resource *resource)
+{
+	bool data_accessible = false;
+	struct drbd_device *device;
+	int vnr;
+
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (drbd_data_accessible(device, NOW)) {
+			data_accessible = true;
+			break;
 		}
+	}
 
-		if (cs.role != os.role)
-			flags &= ~CS_DC_ROLE;
+	return data_accessible;
+}
 
-		if (cs.peer != os.peer)
-			flags &= ~CS_DC_PEER;
+/**
+ * calc_data_accessible() - returns if up-to-data data is reachable
+ *
+ * @state_change: where to get the state information from
+ * @n_device:     index into the devices array
+ * @which:        OLD or NEW
+ *
+ * calc_data_accessible() returns true if either the local disk is up-to-date
+ * or of the peers. The related drbd_data_accessible() computes the same
+ * result from different inputs.
+ */
+static bool calc_data_accessible(struct drbd_state_change *state_change, int n_device,
+				 enum which_state which)
+{
+	struct drbd_device_state_change *device_state_change = &state_change->devices[n_device];
+	enum drbd_disk_state *disk_state = device_state_change->disk_state;
+	int n_connection;
 
-		if (cs.conn != os.conn)
-			flags &= ~CS_DC_CONN;
+	if (disk_state[which] == D_UP_TO_DATE)
+		return true;
 
-		if (cs.disk != os.disk)
-			flags &= ~CS_DC_DISK;
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[
+				n_device * state_change->n_connections + n_connection];
+		struct drbd_peer_device *peer_device = peer_device_state_change->peer_device;
+		enum drbd_disk_state *peer_disk_state = peer_device_state_change->disk_state;
+		struct net_conf *nc;
+		bool allow_remote_read;
 
-		if (cs.pdsk != os.pdsk)
-			flags &= ~CS_DC_PDSK;
+		rcu_read_lock();
+		nc = rcu_dereference(peer_device->connection->transport.net_conf);
+		/* Only dereference nc while it is known non-NULL (and while
+		 * still inside the RCU read section); the posted version read
+		 * nc->allow_remote_read before the NULL check.  A NULL
+		 * net_conf counts as readable, matching drbd_data_accessible(). */
+		allow_remote_read = !nc || nc->allow_remote_read;
+		rcu_read_unlock();
+		if (!allow_remote_read)
+			continue;
+		if (peer_disk_state[which] == D_UP_TO_DATE)
+			return true;
 	}
-	rcu_read_unlock();
 
-	*pf |= CS_DC_MASK;
-	*pf &= flags;
-	(*pcs).i = cs.i;
+	return false;
 }
 
-static enum drbd_state_rv
-conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
-			 enum chg_state_flags flags)
+/**
+ * drbd_data_accessible() - report whether up-to-date data is reachable
+ *
+ * @device: the device the question is about
+ * @which:  OLD, NEW, or NOW (only use OLD within a state change!)
+ *
+ * drbd_data_accessible() returns true if either the local disk or the disk
+ * of at least one peer that permits remote reads is up-to-date. The related
+ * calc_data_accessible() computes the same result from different inputs.
+ */
+bool drbd_data_accessible(struct drbd_device *device, enum which_state which)
 {
-	enum drbd_state_rv rv = SS_SUCCESS;
-	union drbd_state ns, os;
 	struct drbd_peer_device *peer_device;
-	int vnr;
+	bool data_accessible = false;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		os = drbd_read_state(device);
-		ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
-
-		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
-			ns.disk = os.disk;
+	if (device->disk_state[which] == D_UP_TO_DATE)
+		return true;
 
-		if (ns.i == os.i)
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		struct net_conf *nc;
+		nc = rcu_dereference(peer_device->connection->transport.net_conf);
+		if (nc && !nc->allow_remote_read)
 			continue;
-
-		rv = is_valid_transition(os, ns);
-
-		if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
-			rv = is_valid_state(device, ns);
-			if (rv < SS_SUCCESS) {
-				if (is_valid_state(device, os) == rv)
-					rv = is_valid_soft_transition(os, ns, connection);
-			} else
-				rv = is_valid_soft_transition(os, ns, connection);
-		}
-
-		if (rv < SS_SUCCESS) {
-			if (flags & CS_VERBOSE)
-				print_st_err(device, os, ns, rv);
+		if (peer_device->disk_state[which] == D_UP_TO_DATE) {
+			data_accessible = true;
 			break;
 		}
 	}
 	rcu_read_unlock();
 
-	return rv;
+	return data_accessible;
 }
 
-static void
-conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
-	       union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+/* drbd_data_accessible() and exposable_data_uuid() have the same structure. By intention. */
+static u64 exposable_data_uuid(struct drbd_device *device)
 {
-	union drbd_state ns, os, ns_max = { };
-	union drbd_state ns_min = {
-		{ .role = R_MASK,
-		  .peer = R_MASK,
-		  .conn = val.conn,
-		  .disk = D_MASK,
-		  .pdsk = D_MASK
-		} };
 	struct drbd_peer_device *peer_device;
-	enum drbd_state_rv rv;
-	int vnr, number_of_volumes = 0;
-
-	if (mask.conn == C_MASK) {
-		/* remember last connect time so request_timer_fn() won't
-		 * kill newly established sessions while we are still trying to thaw
-		 * previously frozen IO */
-		if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
-			connection->last_reconnect_jif = jiffies;
+	u64 uuid = 0;
 
-		connection->cstate = val.conn;
+	if (get_ldev_if_state(device, D_UP_TO_DATE)) {
+		uuid = device->ldev->md.current_uuid;
+		put_ldev(device);
+		return uuid;
 	}
 
 	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		number_of_volumes++;
-		os = drbd_read_state(device);
-		ns = apply_mask_val(os, mask, val);
-		ns = sanitize_state(device, os, ns, NULL);
-
-		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
-			ns.disk = os.disk;
-
-		rv = _drbd_set_state(device, ns, flags, NULL);
-		BUG_ON(rv < SS_SUCCESS);
-		ns.i = device->state.i;
-		ns_max.role = max_role(ns.role, ns_max.role);
-		ns_max.peer = max_role(ns.peer, ns_max.peer);
-		ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
-		ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
-		ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
-
-		ns_min.role = min_role(ns.role, ns_min.role);
-		ns_min.peer = min_role(ns.peer, ns_min.peer);
-		ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
-		ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
-		ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+	for_each_peer_device_rcu(peer_device, device) {
+		struct net_conf *nc;
+		nc = rcu_dereference(peer_device->connection->transport.net_conf);
+		if (nc && !nc->allow_remote_read)
+			continue;
+		/* Adopt the first UpToDate peer's current UUID; warn if further
+		 * UpToDate peers disagree (ignoring the UUID_PRIMARY bit). */
+		if (peer_device->disk_state[NOW] == D_UP_TO_DATE &&
+		    (uuid & ~UUID_PRIMARY) != (peer_device->current_uuid & ~UUID_PRIMARY)) {
+			if (!uuid) {
+				uuid = peer_device->current_uuid;
+				continue;
+			}
+			drbd_err(device, "Multiple UpToDate peers have different current UUIDs\n");
+			log_current_uuids(device);
+		}
 	}
 	rcu_read_unlock();
 
-	if (number_of_volumes == 0) {
-		ns_min = ns_max = (union drbd_state) { {
-				.role = R_SECONDARY,
-				.peer = R_UNKNOWN,
-				.conn = val.conn,
-				.disk = D_DISKLESS,
-				.pdsk = D_UNKNOWN
-			} };
-	}
-
-	ns_min.susp = ns_max.susp = connection->resource->susp;
-	ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
-	ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
-
-	*pns_min = ns_min;
-	*pns_max = ns_max;
+	return uuid;
 }
 
-static enum drbd_state_rv
-_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+/* Expose the current UUID of the up-to-date data this node can reach, if any. */
+static void ensure_exposed_data_uuid(struct drbd_device *device)
 {
-	enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
-
-	if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
-		rv = SS_CW_SUCCESS;
-
-	if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
-		rv = SS_CW_FAILED_BY_PEER;
+	u64 uuid = exposable_data_uuid(device);
 
-	err = conn_is_valid_transition(connection, mask, val, 0);
-	if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
-		return rv;
+	if (uuid)
+		drbd_uuid_set_exposed(device, uuid, true);
-
-	return err;
 }
 
-enum drbd_state_rv
-_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
-		    enum chg_state_flags flags)
+/* Between 9.1.7 and 9.1.12 drbd was setting MDF_NODE_EXISTS for all peers.
+ * With that the flag got useless. It is a meta-data flag that persists.
+ * Clear it for all not configured nodes if we find it in every peer slot.
+ */
+static void check_wrongly_set_mdf_exists(struct drbd_device *device)
 {
-	enum drbd_state_rv rv = SS_SUCCESS;
-	struct after_conn_state_chg_work *acscw;
-	enum drbd_conns oc = connection->cstate;
-	union drbd_state ns_max, ns_min, os;
-	bool have_mutex = false;
-	struct drbd_state_change *state_change;
+	struct drbd_resource *resource = device->resource;
+	const int my_node_id = resource->res_opts.node_id;
+	bool wrong = true;
+	int node_id;
 
-	if (mask.conn) {
-		rv = is_valid_conn_transition(oc, val.conn);
-		if (rv < SS_SUCCESS)
-			goto abort;
-	}
+	if (!get_ldev(device))
+		return;
 
-	rv = conn_is_valid_transition(connection, mask, val, flags);
-	if (rv < SS_SUCCESS)
-		goto abort;
-
-	if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
-	    !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
-
-		/* This will be a cluster-wide state change.
-		 * Need to give up the spinlock, grab the mutex,
-		 * then send the state change request, ... */
-		spin_unlock_irq(&connection->resource->req_lock);
-		mutex_lock(&connection->cstate_mutex);
-		have_mutex = true;
-
-		set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
-		if (conn_send_state_req(connection, mask, val)) {
-			/* sending failed. */
-			clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
-			rv = SS_CW_FAILED_BY_PEER;
-			/* need to re-aquire the spin lock, though */
-			goto abort_unlocked;
-		}
-
-		if (val.conn == C_DISCONNECTING)
-			set_bit(DISCONNECT_SENT, &connection->flags);
-
-		/* ... and re-aquire the spinlock.
-		 * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
-		 * conn_set_state() within the same spinlock. */
-		spin_lock_irq(&connection->resource->req_lock);
-		wait_event_lock_irq(connection->ping_wait,
-				(rv = _conn_rq_cond(connection, mask, val)),
-				connection->resource->req_lock);
-		clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
-		if (rv < SS_SUCCESS)
-			goto abort;
-	}
-
-	state_change = remember_old_state(connection->resource, GFP_ATOMIC);
-	conn_old_common_state(connection, &os, &flags);
-	flags |= CS_DC_SUSP;
-	conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
-	conn_pr_state_change(connection, os, ns_max, flags);
-	remember_new_state(state_change);
-
-	acscw = kmalloc_obj(*acscw, GFP_ATOMIC);
-	if (acscw) {
-		acscw->oc = os.conn;
-		acscw->ns_min = ns_min;
-		acscw->ns_max = ns_max;
-		acscw->flags = flags;
-		acscw->w.cb = w_after_conn_state_ch;
-		kref_get(&connection->kref);
-		acscw->connection = connection;
-		acscw->state_change = state_change;
-		drbd_queue_work(&connection->sender_work, &acscw->w);
-	} else {
-		drbd_err(connection, "Could not kmalloc an acscw\n");
-	}
+	rcu_read_lock();
 
- abort:
-	if (have_mutex) {
-		/* mutex_unlock() "... must not be used in interrupt context.",
-		 * so give up the spinlock, then re-aquire it */
-		spin_unlock_irq(&connection->resource->req_lock);
- abort_unlocked:
-		mutex_unlock(&connection->cstate_mutex);
-		spin_lock_irq(&connection->resource->req_lock);
-	}
-	if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
-		drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
-		drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
-		drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
-	}
-	return rv;
-}
+	for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) {
+		struct drbd_peer_device *peer_device = peer_device_by_node_id(device, node_id);
+		struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id];
 
-enum drbd_state_rv
-conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
-		   enum chg_state_flags flags)
-{
-	enum drbd_state_rv rv;
+		if (!(peer_md->flags & MDF_NODE_EXISTS || peer_device || node_id == my_node_id)) {
+			wrong = false;
+			break;
+		}
+	}
 
-	spin_lock_irq(&connection->resource->req_lock);
-	rv = _conn_request_state(connection, mask, val, flags);
-	spin_unlock_irq(&connection->resource->req_lock);
+	if (wrong) {
+		for (node_id = 0; node_id < DRBD_NODE_ID_MAX; node_id++) {
+			struct drbd_peer_device *peer_device = peer_device_by_node_id(device, node_id);
+			struct drbd_peer_md *peer_md = &device->ldev->md.peers[node_id];
 
-	return rv;
+			if (!peer_device)
+				peer_md->flags &= ~MDF_NODE_EXISTS;
+		}
+		/* warn only once per resource; test_and_set_bit keeps it race-free */
+		if (!test_and_set_bit(WRONG_MDF_EXISTS, &resource->flags))
+			drbd_warn(resource, "Clearing excess MDF_NODE_EXISTS flags\n");
+	}
+	rcu_read_unlock();
+	put_ldev(device);
 }
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 75e671a3c5d1..eaaf1a9c641f 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -236,6 +236,7 @@ GENL_struct(DRBD_NLA_DEVICE_CONF, 14, device_conf,
 	__u32_field_def(1, DRBD_F_INVARIANT,	max_bio_size, DRBD_MAX_BIO_SIZE_DEF)
 	__flg_field_def(2, 0 /* OPTIONAL */, intentional_diskless, DRBD_DISK_DISKLESS_DEF)
 	__u32_field_def(3, 0 /* OPTIONAL */, block_size, DRBD_BLOCK_SIZE_DEF)
+	__u32_field_def(4, 0 /* OPTIONAL */, discard_granularity, DRBD_DISCARD_GRANULARITY_DEF)
 )
 
 GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
@@ -357,6 +358,7 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_OPTS, 27, peer_device_conf,
 #if (PRO_FEATURES & DRBD_FF_RESYNC_WITHOUT_REPLICATION) || !defined(__KERNEL__)
 	__flg_field_def(8, 0 /* OPTIONAL */, resync_without_replication, DRBD_RESYNC_WITHOUT_REPLICATION_DEF)
 #endif
+	__flg_field_def(9, 0 /* OPTIONAL */, peer_tiebreaker, DRBD_PEER_TIEBREAKER_DEF)
 )
 
 GENL_struct(DRBD_NLA_PATH_PARMS, 28, path_parms,
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index ed38f94d43c6..bbcb5b0dc3be 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -313,6 +313,11 @@
 #define DRBD_BLOCK_SIZE_DEF 512
 #define DRBD_BLOCK_SIZE_SCALE '1' /* Bytes */
 
+#define DRBD_DISCARD_GRANULARITY_SCALE '1'             /* Bytes */
+#define DRBD_DISCARD_GRANULARITY_MIN 0U                /* 0 = disable discards */
+#define DRBD_DISCARD_GRANULARITY_MAX (128U<<20)        /* 128 MiB, current DRBD_MAX_BATCH_BIO_SIZE */
+#define DRBD_DISCARD_GRANULARITY_DEF 0xFFFFFFFFU       /* sentinel: not configured; use legacy behavior */
+
 /* By default freeze IO, if set error all IOs as quick as possible */
 #define DRBD_ON_NO_QUORUM_DEF ONQ_SUSPEND_IO
 
@@ -326,6 +331,8 @@
 
 #define DRBD_LOAD_BALANCE_PATHS_DEF 0U
 
+#define DRBD_PEER_TIEBREAKER_DEF 1U
+
 #define DRBD_RDMA_CTRL_RCVBUF_SIZE_MIN  0U
 #define DRBD_RDMA_CTRL_RCVBUF_SIZE_MAX  (10U<<20)
 #define DRBD_RDMA_CTRL_RCVBUF_SIZE_DEF 0
-- 
2.53.0