[PATCH 17/20] drbd: rework receiver for DRBD 9 transport and multi-peer protocol

Christoph Böhmwalder posted 20 patches 5 days, 15 hours ago
[PATCH 17/20] drbd: rework receiver for DRBD 9 transport and multi-peer protocol
Posted by Christoph Böhmwalder 5 days, 15 hours ago
Adapt the receiver to the DRBD 9 multi-peer architecture.
Replace all direct socket I/O with calls through the transport abstraction
layer, with the transport managing buffer allocation.
Move peer request tracking from per-device lists to connection-level
structures, enabling a single receiver thread to serve all volumes on
a connection.

UUID-based resync decisions replace the old integer heuristic with enums
for strategies and rules, so each sync handshake outcome and its reason
are self-describing and logged by name.
Update UUID comparison for multi-peer: each peer now carries per-node
bitmap UUIDs and a history array, replacing the fixed four-slot layout.

Introduce DAG-tag ordering as a causal consistency mechanism, letting
peer requests declare dependencies on writes seen at another node and
waiting until those dependencies are resolved.

Add two-phase commit handling so that coordinated state changes (role
transitions, resync initiation, resize) can be propagated to all nodes
atomically.

Write conflict detection moves from a flag-based approach to an
interval-tree with typed intervals, using asynchronous deferred
submission for conflicting requests.

The disconnect path is restructured at the connection level: it
cancels DAG-tag-dependent requests, drains resync activity, flushes
workqueues, and performs per-peer-device teardown before returning
the connection to the unconnected state.

Co-developed-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Co-developed-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Co-developed-by: Joel Colledge <joel.colledge@linbit.com>
Signed-off-by: Joel Colledge <joel.colledge@linbit.com>
Co-developed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
---
 drivers/block/drbd/drbd_receiver.c         | 12258 ++++++++++++++-----
 drivers/block/drbd/drbd_transport.h        |   127 +-
 drivers/block/drbd/drbd_transport_lb-tcp.c |    50 +-
 drivers/block/drbd/drbd_transport_rdma.c   |    74 +-
 drivers/block/drbd/drbd_transport_tcp.c    |    49 +-
 5 files changed, 9029 insertions(+), 3529 deletions(-)

diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 58b95bf4bdca..e8c4cd1cda14 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -10,42 +10,54 @@
 
  */
 
-
-#include <linux/module.h>
-
-#include <linux/uaccess.h>
 #include <net/sock.h>
 
+#include <linux/bio.h>
 #include <linux/drbd.h>
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/in.h>
 #include <linux/mm.h>
-#include <linux/memcontrol.h>
+#include <linux/memcontrol.h> /* needed on kernels <4.3 */
 #include <linux/mm_inline.h>
 #include <linux/slab.h>
-#include <uapi/linux/sched/types.h>
-#include <linux/sched/signal.h>
 #include <linux/pkt_sched.h>
-#include <linux/unistd.h>
+#include <uapi/linux/sched/types.h>
 #include <linux/vmalloc.h>
 #include <linux/random.h>
-#include <linux/string.h>
-#include <linux/scatterlist.h>
+#include <net/ipv6.h>
 #include <linux/part_stat.h>
-#include <linux/mempool.h>
+
 #include "drbd_int.h"
+#include "drbd_meta_data.h"
 #include "drbd_protocol.h"
 #include "drbd_req.h"
 #include "drbd_vli.h"
 
-#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
 
-struct packet_info {
-	enum drbd_packet cmd;
-	unsigned int size;
-	unsigned int vnr;
-	void *data;
+enum ao_op {
+	OUTDATE_DISKS,
+	OUTDATE_DISKS_AND_DISCONNECT,
+};
+
+struct flush_work {
+	struct drbd_work w;
+	struct drbd_epoch *epoch;
+};
+
+struct update_peers_work {
+       struct drbd_work w;
+       struct drbd_peer_device *peer_device;
+       sector_t sector_start;
+       sector_t sector_end;
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BARRIER_DONE,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
 };
 
 enum finish_epoch {
@@ -54,201 +66,508 @@ enum finish_epoch {
 	FE_RECYCLED,
 };
 
-static int drbd_do_features(struct drbd_connection *connection);
-static int drbd_do_auth(struct drbd_connection *connection);
-static int drbd_disconnected(struct drbd_peer_device *);
-static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+enum resync_reason {
+	AFTER_UNSTABLE,
+	DISKLESS_PRIMARY,
+};
+
+enum sync_rule {
+	RULE_SYNC_SOURCE_MISSED_FINISH,
+	RULE_SYNC_SOURCE_PEER_MISSED_FINISH,
+	RULE_SYNC_TARGET_MISSED_FINISH,
+	RULE_SYNC_TARGET_PEER_MISSED_FINISH,
+	RULE_SYNC_TARGET_MISSED_START,
+	RULE_SYNC_SOURCE_MISSED_START,
+	RULE_INITIAL_HANDSHAKE_CHANGED,
+	RULE_JUST_CREATED_PEER,
+	RULE_JUST_CREATED_SELF,
+	RULE_JUST_CREATED_BOTH,
+	RULE_CRASHED_PRIMARY,
+	RULE_LOST_QUORUM,
+	RULE_RECONNECTED,
+	RULE_BOTH_OFF,
+	RULE_BITMAP_PEER,
+	RULE_BITMAP_PEER_OTHER,
+	RULE_BITMAP_SELF,
+	RULE_BITMAP_SELF_OTHER,
+	RULE_BITMAP_BOTH,
+	RULE_HISTORY_PEER,
+	RULE_HISTORY_SELF,
+	RULE_HISTORY_BOTH,
+};
+
+static const char * const sync_rule_names[] = {
+	[RULE_SYNC_SOURCE_MISSED_FINISH] = "sync-source-missed-finish",
+	[RULE_SYNC_SOURCE_PEER_MISSED_FINISH] = "sync-source-peer-missed-finish",
+	[RULE_SYNC_TARGET_MISSED_FINISH] = "sync-target-missed-finish",
+	[RULE_SYNC_TARGET_PEER_MISSED_FINISH] = "sync-target-peer-missed-finish",
+	[RULE_SYNC_TARGET_MISSED_START] = "sync-target-missed-start",
+	[RULE_SYNC_SOURCE_MISSED_START] = "sync-source-missed-start",
+	[RULE_INITIAL_HANDSHAKE_CHANGED] = "initial-handshake-changed",
+	[RULE_JUST_CREATED_PEER] = "just-created-peer",
+	[RULE_JUST_CREATED_SELF] = "just-created-self",
+	[RULE_JUST_CREATED_BOTH] = "just-created-both",
+	[RULE_CRASHED_PRIMARY] = "crashed-primary",
+	[RULE_LOST_QUORUM] = "lost-quorum",
+	[RULE_RECONNECTED] = "reconnected",
+	[RULE_BOTH_OFF] = "both-off",
+	[RULE_BITMAP_PEER] = "bitmap-peer",
+	[RULE_BITMAP_PEER_OTHER] = "bitmap-peer-other",
+	[RULE_BITMAP_SELF] = "bitmap-self",
+	[RULE_BITMAP_SELF_OTHER] = "bitmap-self-other",
+	[RULE_BITMAP_BOTH] = "bitmap-both",
+	[RULE_HISTORY_PEER] = "history-peer",
+	[RULE_HISTORY_SELF] = "history-self",
+	[RULE_HISTORY_BOTH] = "history-both",
+};
+
+enum sync_strategy {
+	UNDETERMINED = 0,
+	NO_SYNC,
+	SYNC_SOURCE_IF_BOTH_FAILED,
+	SYNC_SOURCE_USE_BITMAP,
+	SYNC_SOURCE_SET_BITMAP,
+	SYNC_SOURCE_COPY_BITMAP,
+	SYNC_TARGET_IF_BOTH_FAILED,
+	SYNC_TARGET_USE_BITMAP,
+	SYNC_TARGET_SET_BITMAP,
+	SYNC_TARGET_CLEAR_BITMAP,
+	SPLIT_BRAIN_AUTO_RECOVER,
+	SPLIT_BRAIN_DISCONNECT,
+	UNRELATED_DATA,
+	RETRY_CONNECT,
+	REQUIRES_PROTO_91,
+	REQUIRES_PROTO_96,
+	REQUIRES_PROTO_124,
+	SYNC_TARGET_PRIMARY_RECONNECT,
+	SYNC_TARGET_PRIMARY_DISCONNECT,
+};
+
+struct sync_descriptor {
+	char * const name;
+	int required_protocol;
+	bool is_split_brain;
+	bool is_sync_source;
+	bool is_sync_target;
+	bool reconnect;
+	bool disconnect;
+	int resync_peer_preference;
+	enum sync_strategy full_sync_equivalent;
+	enum sync_strategy reverse;
+};
+
+static const struct sync_descriptor sync_descriptors[] = {
+	[UNDETERMINED] = {
+		.name = "?",
+	},
+	[NO_SYNC] = {
+		.name = "no-sync",
+		.resync_peer_preference = 5,
+	},
+	[SYNC_SOURCE_IF_BOTH_FAILED] = {
+		.name = "source-if-both-failed",
+		.is_sync_source = true,
+		.reverse = SYNC_TARGET_IF_BOTH_FAILED,
+	},
+	[SYNC_SOURCE_USE_BITMAP] = {
+		.name = "source-use-bitmap",
+		.is_sync_source = true,
+		.full_sync_equivalent = SYNC_SOURCE_SET_BITMAP,
+		.reverse = SYNC_TARGET_USE_BITMAP,
+	},
+	[SYNC_SOURCE_SET_BITMAP] = {
+		.name = "source-set-bitmap",
+		.is_sync_source = true,
+		.reverse = SYNC_TARGET_SET_BITMAP,
+	},
+	[SYNC_SOURCE_COPY_BITMAP] = {
+		.name = "source-copy-other-bitmap",
+		.is_sync_source = true,
+	},
+	[SYNC_TARGET_IF_BOTH_FAILED] = {
+		.name = "target-if-both-failed",
+		.is_sync_target = true,
+		.resync_peer_preference = 4,
+		.reverse = SYNC_SOURCE_IF_BOTH_FAILED,
+	},
+	[SYNC_TARGET_USE_BITMAP] = {
+		.name = "target-use-bitmap",
+		.is_sync_target = true,
+		.full_sync_equivalent = SYNC_TARGET_SET_BITMAP,
+		.resync_peer_preference = 3,
+		.reverse = SYNC_SOURCE_USE_BITMAP,
+	},
+	[SYNC_TARGET_SET_BITMAP] = {
+		.name = "target-set-bitmap",
+		.is_sync_target = true,
+		.resync_peer_preference = 2,
+		.reverse = SYNC_SOURCE_SET_BITMAP,
+	},
+	[SYNC_TARGET_CLEAR_BITMAP] = {
+		.name = "target-clear-bitmap",
+		.is_sync_target = true,
+		.resync_peer_preference = 1,
+	},
+	[SPLIT_BRAIN_AUTO_RECOVER] = {
+		.name = "split-brain-auto-recover",
+		.is_split_brain = true,
+		.disconnect = true,
+	},
+	[SPLIT_BRAIN_DISCONNECT] = {
+		.name = "split-brain-disconnect",
+		.is_split_brain = true,
+		.disconnect = true,
+	},
+	[UNRELATED_DATA] = {
+		.name = "unrelated-data",
+		.disconnect = true,
+	},
+	[RETRY_CONNECT] = {
+		.name = "retry-connect",
+		.reconnect = true,
+	},
+	[REQUIRES_PROTO_91] = {
+		.name = "requires-proto-91",
+		.required_protocol = 91,
+		.disconnect = true,
+	},
+	[REQUIRES_PROTO_96] = {
+		.name = "requires-proto-96",
+		.required_protocol = 96,
+		.disconnect = true,
+	},
+	[REQUIRES_PROTO_124] = {
+		.name = "requires-proto-124",
+		.required_protocol = 124,
+		.disconnect = true,
+	},
+	[SYNC_TARGET_PRIMARY_RECONNECT] = {
+		.name = "sync-target-primary-reconnect",
+		.is_sync_target = true,
+		.reconnect = true,
+	},
+	[SYNC_TARGET_PRIMARY_DISCONNECT] = {
+		.name = "sync-target-primary-disconnect",
+		.is_sync_target = true,
+		.disconnect = true,
+	},
+};
+
+enum rcv_timeou_kind {
+	PING_TIMEOUT,
+	REGULAR_TIMEOUT,
+};
+
+int drbd_do_features(struct drbd_connection *connection);
+int drbd_do_auth(struct drbd_connection *connection);
+static void conn_disconnect(struct drbd_connection *connection);
+
 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
 static int e_end_block(struct drbd_work *, int);
+static void cleanup_unacked_peer_requests(struct drbd_connection *connection);
+static void cleanup_peer_ack_list(struct drbd_connection *connection);
+static u64 node_ids_to_bitmap(struct drbd_device *device, u64 node_ids);
+static void process_twopc(struct drbd_connection *, struct twopc_reply *, struct packet_info *, unsigned long);
+static void drbd_resync(struct drbd_peer_device *, enum resync_reason);
+static void drbd_unplug_all_devices(struct drbd_connection *connection);
+static int decode_header(struct drbd_connection *, const void *, struct packet_info *);
+static void check_resync_source(struct drbd_device *device, u64 weak_nodes);
+static void set_rcvtimeo(struct drbd_connection *connection, enum rcv_timeou_kind kind);
+static bool disconnect_expected(struct drbd_connection *connection);
+static bool uuid_in_peer_history(struct drbd_peer_device *peer_device, u64 uuid);
+static bool uuid_in_my_history(struct drbd_device *device, u64 uuid);
+static void drbd_cancel_conflicting_resync_requests(struct drbd_peer_device *peer_device);
+
+static const char *drbd_sync_rule_str(enum sync_rule rule)
+{
+	if (rule < 0 || rule >= ARRAY_SIZE(sync_rule_names)) {
+		WARN_ON(true);
+		return "?";
+	}
+	return sync_rule_names[rule];
+}
 
+static struct sync_descriptor strategy_descriptor(enum sync_strategy strategy)
+{
+	if (strategy < 0 || strategy > ARRAY_SIZE(sync_descriptors)) {
+		WARN_ON(true);
+		return sync_descriptors[UNDETERMINED];
+	}
+	return sync_descriptors[strategy];
+}
+
+static bool is_strategy_determined(enum sync_strategy strategy)
+{
+	return strategy == NO_SYNC ||
+			strategy_descriptor(strategy).is_sync_source ||
+			strategy_descriptor(strategy).is_sync_target;
+}
 
-#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+static struct drbd_epoch *previous_epoch(struct drbd_connection *connection, struct drbd_epoch *epoch)
+{
+	struct drbd_epoch *prev;
+	spin_lock(&connection->epoch_lock);
+	prev = list_entry(epoch->list.prev, struct drbd_epoch, list);
+	if (prev == epoch || prev == connection->current_epoch)
+		prev = NULL;
+	spin_unlock(&connection->epoch_lock);
+	return prev;
+}
 
-static struct page *__drbd_alloc_pages(unsigned int number)
+static void rs_sectors_came_in(struct drbd_peer_device *peer_device, int size)
 {
-	struct page *page = NULL;
-	struct page *tmp = NULL;
-	unsigned int i = 0;
+	int rs_sect_in = atomic_add_return(size >> 9, &peer_device->rs_sect_in);
 
-	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
-	 * "criss-cross" setup, that might cause write-out on some other DRBD,
-	 * which in turn might block on the other node at this very place.  */
-	for (i = 0; i < number; i++) {
-		tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY);
-		if (!tmp)
-			goto fail;
-		set_page_private(tmp, (unsigned long)page);
-		page = tmp;
+	/* When resync runs faster than anticipated, consider running the
+	 * resync_work early. */
+	if (rs_sect_in >= peer_device->rs_in_flight)
+		drbd_rs_all_in_flight_came_back(peer_device, rs_sect_in);
+}
+
+void drbd_peer_req_strip_bio(struct drbd_peer_request *peer_req)
+{
+	struct drbd_transport *transport = &peer_req->peer_device->connection->transport;
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	struct bio *bio;
+
+	while ((bio = bio_list_pop(&peer_req->bios))) {
+		bio_for_each_bvec(bvec, bio, iter) {
+			struct page *page = bvec.bv_page;
+			unsigned int len = bvec.bv_len;
+
+			/* bio_add_page() may have merged contiguous pages from
+			 * separate allocations into a single bvec. Step through
+			 * by compound_order to free each allocation unit.
+			 */
+			while (len) {
+				unsigned int order = compound_order(page);
+
+				drbd_free_page(transport, page);
+				page += 1 << order;
+				len -= min_t(unsigned int, PAGE_SIZE << order, len);
+			}
+		}
+		bio_put(bio);
 	}
+}
+
+static struct page *
+__drbd_alloc_pages(struct drbd_connection *connection, gfp_t gfp_mask, int order)
+{
+	struct page *page;
+	unsigned int mxb;
+
+	rcu_read_lock();
+	mxb = rcu_dereference(connection->transport.net_conf)->max_buffers;
+	rcu_read_unlock();
+
+	if (atomic_read(&connection->pp_in_use) >= mxb)
+		schedule_timeout_interruptible(HZ / 10);
+
+	if (order == 0)
+		page = mempool_alloc(&drbd_buffer_page_pool, gfp_mask);
+	else
+		page = alloc_pages(gfp_mask | __GFP_COMP | __GFP_NORETRY, order);
+
+	if (page)
+		atomic_add(1 << order, &connection->pp_in_use);
+
 	return page;
-fail:
-	page_chain_for_each_safe(page, tmp) {
-		set_page_private(page, 0);
-		mempool_free(page, &drbd_buffer_page_pool);
-	}
-	return NULL;
 }
 
 /**
- * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
- * @peer_device:	DRBD device.
- * @number:		number of pages requested
- * @retry:		whether to retry, if not enough pages are available right now
- *
- * Tries to allocate number pages, first from our own page pool, then from
- * the kernel.
- * Possibly retry until DRBD frees sufficient pages somewhere else.
+ * drbd_alloc_pages() - Returns a page, which might be a single or compound page
+ * @transport:	DRBD transport
+ * @gfp_mask:	how to allocate and whether to loop until we succeed
+ * @size:	Desired size, gets rounded down to the closest power of two
  *
- * If this allocation would exceed the max_buffers setting, we throttle
- * allocation (schedule_timeout) to give the system some room to breathe.
+ * Allocates a page from the kernel or from the private mempool. When this
+ * allocation exceeds the max_buffers setting, throttle the allocation via
+ * schedule_timeout.
  *
- * We do not use max-buffers as hard limit, because it could lead to
- * congestion and further to a distributed deadlock during online-verify or
- * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * We do not use max-buffers as a hard limit, because it could lead to
+ * congestion and, further, to a distributed deadlock during online-verify or
+ * (checksum-based) resync, if the max-buffers, socket buffer sizes, and
  * resync-rate settings are mis-configured.
- *
- * Returns a page chain linked via page->private.
  */
-struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
-			      bool retry)
+struct page *drbd_alloc_pages(struct drbd_transport *transport, gfp_t gfp_mask, unsigned int size)
 {
-	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection =
+		container_of(transport, struct drbd_connection, transport);
+	int order = max(ilog2(size) - PAGE_SHIFT, 0);
 	struct page *page;
-	struct net_conf *nc;
-	unsigned int mxb;
 
-	rcu_read_lock();
-	nc = rcu_dereference(peer_device->connection->net_conf);
-	mxb = nc ? nc->max_buffers : 1000000;
-	rcu_read_unlock();
+	if (order && drbd_insert_fault_conn(connection, DRBD_FAULT_BIO_TOO_SMALL))
+		order = 0;
 
-	if (atomic_read(&device->pp_in_use) >= mxb)
-		schedule_timeout_interruptible(HZ / 10);
-	page = __drbd_alloc_pages(number);
+	page = __drbd_alloc_pages(connection, gfp_mask | __GFP_NOWARN, order);
+	if (!page && order)
+		page = __drbd_alloc_pages(connection, gfp_mask, 0);
 
-	if (page)
-		atomic_add(number, &device->pp_in_use);
 	return page;
 }
+EXPORT_SYMBOL(drbd_alloc_pages); /* for transports */
 
-/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
- * Is also used from inside an other spin_lock_irq(&resource->req_lock);
- * Either links the page chain back to the global pool,
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages().
+ * Either links the page chain back to the pool of free pages,
  * or returns all pages to the system. */
-static void drbd_free_pages(struct drbd_device *device, struct page *page)
+void drbd_free_page(struct drbd_transport *transport, struct page *page)
 {
-	struct page *tmp;
-	int i = 0;
+	struct drbd_connection *connection =
+		container_of(transport, struct drbd_connection, transport);
+	int order = compound_order(page), i = 0;
 
 	if (page == NULL)
 		return;
 
-	page_chain_for_each_safe(page, tmp) {
-		set_page_private(page, 0);
-		if (page_count(page) == 1)
-			mempool_free(page, &drbd_buffer_page_pool);
-		else
-			put_page(page);
-		i++;
-	}
-	i = atomic_sub_return(i, &device->pp_in_use);
+	if (page_count(page) == 1 && order == 0)
+		mempool_free(page, &drbd_buffer_page_pool);
+	else
+		put_page(page);
+
+	i = atomic_sub_return(1 << order, &connection->pp_in_use);
 	if (i < 0)
-		drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+		drbd_warn(connection, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
 }
+EXPORT_SYMBOL(drbd_free_page);
 
-/*
-You need to hold the req_lock:
- _drbd_wait_ee_list_empty()
-
-You must not have the req_lock:
- drbd_free_peer_req()
- drbd_alloc_peer_req()
- drbd_free_peer_reqs()
- drbd_ee_fix_bhs()
- drbd_finish_peer_reqs()
- drbd_clear_done_ee()
- drbd_wait_ee_list_empty()
-*/
+static int
+peer_req_alloc_bio(struct drbd_peer_request *peer_req, size_t size, gfp_t gfp_mask, blk_opf_t opf)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_transport *transport = &peer_device->connection->transport;
+	struct drbd_device *device = peer_device->device;
+	enum req_op op = opf & REQ_OP_MASK;
+	unsigned short nr_vecs;
+	struct page *page;
+	struct bio *bio;
+
+	nr_vecs = DIV_ROUND_UP(size, PAGE_SIZE);
+	if (nr_vecs > BIO_MAX_VECS)
+		nr_vecs = BIO_MAX_VECS;
+
+	if (drbd_insert_fault(device, DRBD_FAULT_BIO_TOO_SMALL))
+		nr_vecs = DIV_ROUND_UP(nr_vecs, 4);
+
+	bio = bio_alloc(device->ldev->backing_bdev, nr_vecs, opf, gfp_mask);
+	if (!bio)
+		return -ENOMEM;
+
+	bio_list_add(&peer_req->bios, bio);
+
+	if (op == REQ_OP_READ) {
+		while (size) {
+			int len;
+
+			page = drbd_alloc_pages(transport, gfp_mask, size);
+			if (!page)
+				goto out_free_pages;
+			len = min(PAGE_SIZE << compound_order(page), size);
+
+			len = drbd_bio_add_page(transport, &peer_req->bios, page, len, 0);
+			if (len < 0)
+				goto out_free_pages;
+			size -= len;
+		}
+		if (!mempool_is_saturated(&drbd_buffer_page_pool))
+			peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
+	}
+	return 0;
+
+out_free_pages:
+	drbd_peer_req_strip_bio(peer_req);
+	return -ENOMEM;
+}
 
-/* normal: payload_size == request size (bi_size)
- * w_same: payload_size == logical_block_size
- * trim: payload_size == 0 */
+/**
+ * drbd_alloc_peer_req() - Allocate a drbd_peer_request
+ * @drbd_peer_device: peer device object
+ * @gfp_mask:	      how to allocate and whether to loop until we succeed
+ * @size:	      size (normal I/O), logical_block_size (w_same), 0 (trim)
+ * @opf:              REQ_OP_READ or REQ_OP_WRITE
+ *
+ * For REQ_OP_READ, it allocates the peer_req with a BIO and populates it
+ * entirely with buffer pages. Otherwise it allocates the peer_req with
+ * an empty BIO.
+ */
 struct drbd_peer_request *
-drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-		    unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, gfp_t gfp_mask,
+		    size_t size, blk_opf_t opf)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_peer_request *peer_req;
-	struct page *page = NULL;
-	unsigned int nr_pages = PFN_UP(payload_size);
+	int err;
 
+	gfp_mask &= ~__GFP_HIGHMEM;
 	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 		return NULL;
 
-	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask);
 	if (!peer_req) {
 		if (!(gfp_mask & __GFP_NOWARN))
 			drbd_err(device, "%s: allocation failed\n", __func__);
 		return NULL;
 	}
-
-	if (nr_pages) {
-		page = drbd_alloc_pages(peer_device, nr_pages,
-					gfpflags_allow_blocking(gfp_mask));
-		if (!page)
-			goto fail;
-		if (!mempool_is_saturated(&drbd_buffer_page_pool))
-			peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
-	}
-
 	memset(peer_req, 0, sizeof(*peer_req));
+
 	INIT_LIST_HEAD(&peer_req->w.list);
 	drbd_clear_interval(&peer_req->i);
-	peer_req->i.size = request_size;
-	peer_req->i.sector = sector;
+	INIT_LIST_HEAD(&peer_req->recv_order);
 	peer_req->submit_jif = jiffies;
+	kref_get(&device->kref); /* this kref holds the peer_req->peer_device object alive */
 	peer_req->peer_device = peer_device;
-	peer_req->pages = page;
-	/*
-	 * The block_id is opaque to the receiver.  It is not endianness
-	 * converted, and sent back to the sender unchanged.
-	 */
-	peer_req->block_id = id;
+	peer_req->block_id = (unsigned long) peer_req;
+
+	if (opf == REQ_NO_BIO)
+		return peer_req;
+
+	err = peer_req_alloc_bio(peer_req, size, gfp_mask, opf);
+	if (err)
+		goto out_free_peer_req;
 
 	return peer_req;
 
- fail:
+out_free_peer_req:
 	mempool_free(peer_req, &drbd_ee_mempool);
 	return NULL;
 }
 
-void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req)
+void drbd_free_peer_req(struct drbd_peer_request *peer_req)
 {
-	might_sleep();
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+
+	if (peer_req->flags & EE_ON_RECV_ORDER) {
+		spin_lock_irq(&connection->peer_reqs_lock);
+		if (peer_req->i.type == INTERVAL_RESYNC_WRITE)
+			drbd_list_del_resync_request(peer_req);
+		else
+			list_del(&peer_req->recv_order);
+		spin_unlock_irq(&connection->peer_reqs_lock);
+	}
+
 	if (peer_req->flags & EE_HAS_DIGEST)
 		kfree(peer_req->digest);
-	drbd_free_pages(device, peer_req->pages);
-	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
-	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
-	if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
-		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
-		drbd_al_complete_io(device, &peer_req->i);
-	}
+	D_ASSERT(peer_device, atomic_read(&peer_req->pending_bios) == 0);
+	D_ASSERT(peer_device, drbd_interval_empty(&peer_req->i));
+	drbd_peer_req_strip_bio(peer_req);
+	kref_put(&peer_device->device->kref, drbd_destroy_device);
 	mempool_free(peer_req, &drbd_ee_mempool);
 }
 
-int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
+int drbd_free_peer_reqs(struct drbd_connection *connection, struct list_head *list)
 {
 	LIST_HEAD(work_list);
 	struct drbd_peer_request *peer_req, *t;
 	int count = 0;
 
-	spin_lock_irq(&device->resource->req_lock);
+	spin_lock_irq(&connection->peer_reqs_lock);
 	list_splice_init(list, &work_list);
-	spin_unlock_irq(&device->resource->req_lock);
+	spin_unlock_irq(&connection->peer_reqs_lock);
 
 	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
-		drbd_free_peer_req(device, peer_req);
+		drbd_free_peer_req(peer_req);
 		count++;
 	}
 	return count;
@@ -257,90 +576,58 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
 /*
  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
  */
-static int drbd_finish_peer_reqs(struct drbd_device *device)
+static int drbd_finish_peer_reqs(struct drbd_connection *connection)
 {
 	LIST_HEAD(work_list);
 	struct drbd_peer_request *peer_req, *t;
 	int err = 0;
+	int n = 0;
 
-	spin_lock_irq(&device->resource->req_lock);
-	list_splice_init(&device->done_ee, &work_list);
-	spin_unlock_irq(&device->resource->req_lock);
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_splice_init(&connection->done_ee, &work_list);
+	spin_unlock_irq(&connection->peer_reqs_lock);
 
 	/* possible callbacks here:
-	 * e_end_block, and e_end_resync_block, e_send_superseded.
+	 * e_end_block, and e_end_resync_block.
 	 * all ignore the last argument.
 	 */
 	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 		int err2;
 
+		++n;
 		/* list_del not necessary, next/prev members not touched */
+		/* The callback may free peer_req. */
 		err2 = peer_req->w.cb(&peer_req->w, !!err);
 		if (!err)
 			err = err2;
-		drbd_free_peer_req(device, peer_req);
 	}
-	wake_up(&device->ee_wait);
+	if (atomic_sub_and_test(n, &connection->done_ee_cnt))
+		wake_up(&connection->ee_wait);
 
 	return err;
 }
 
-static void _drbd_wait_ee_list_empty(struct drbd_device *device,
-				     struct list_head *head)
-{
-	DEFINE_WAIT(wait);
-
-	/* avoids spin_lock/unlock
-	 * and calling prepare_to_wait in the fast path */
-	while (!list_empty(head)) {
-		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
-		spin_unlock_irq(&device->resource->req_lock);
-		io_schedule();
-		finish_wait(&device->ee_wait, &wait);
-		spin_lock_irq(&device->resource->req_lock);
-	}
-}
-
-static void drbd_wait_ee_list_empty(struct drbd_device *device,
-				    struct list_head *head)
-{
-	spin_lock_irq(&device->resource->req_lock);
-	_drbd_wait_ee_list_empty(device, head);
-	spin_unlock_irq(&device->resource->req_lock);
-}
-
-static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
-{
-	struct kvec iov = {
-		.iov_base = buf,
-		.iov_len = size,
-	};
-	struct msghdr msg = {
-		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
-	};
-	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size);
-	return sock_recvmsg(sock, &msg, msg.msg_flags);
-}
-
-static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
+static int drbd_recv(struct drbd_connection *connection, void **buf, size_t size, int flags)
 {
+	struct drbd_transport_ops *tr_ops = &connection->transport.class->ops;
 	int rv;
 
-	rv = drbd_recv_short(connection->data.socket, buf, size, 0);
+	rv = tr_ops->recv(&connection->transport, DATA_STREAM, buf, size, flags);
 
 	if (rv < 0) {
 		if (rv == -ECONNRESET)
 			drbd_info(connection, "sock was reset by peer\n");
 		else if (rv != -ERESTARTSYS)
-			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+			drbd_info(connection, "sock_recvmsg returned %d\n", rv);
 	} else if (rv == 0) {
-		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+		if (test_bit(DISCONNECT_EXPECTED, &connection->flags)) {
 			long t;
 			rcu_read_lock();
-			t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+			t = rcu_dereference(connection->transport.net_conf)->ping_timeo * HZ/10;
 			rcu_read_unlock();
 
-			t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
+			t = wait_event_timeout(connection->resource->state_wait,
+					       connection->cstate[NOW] < C_CONNECTED, t);
 
 			if (t)
 				goto out;
@@ -349,17 +636,32 @@ static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
 	}
 
 	if (rv != size)
-		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+		change_cstate(connection, C_BROKEN_PIPE, CS_HARD);
 
 out:
 	return rv;
 }
 
-static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+static int drbd_recv_into(struct drbd_connection *connection, void *buf, size_t size)
+{
+	int err;
+
+	err = drbd_recv(connection, &buf, size, CALLER_BUFFER);
+
+	if (err != size) {
+		if (err >= 0)
+			err = -EIO;
+	} else
+		err = 0;
+	return err;
+}
+
+static int drbd_recv_all(struct drbd_connection *connection, void **buf, size_t size)
 {
 	int err;
 
-	err = drbd_recv(connection, buf, size);
+	err = drbd_recv(connection, buf, size, 0);
+
 	if (err != size) {
 		if (err >= 0)
 			err = -EIO;
@@ -368,7 +670,7 @@ static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t s
 	return err;
 }
 
-static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+static int drbd_recv_all_warn(struct drbd_connection *connection, void **buf, size_t size)
 {
 	int err;
 
@@ -378,628 +680,545 @@ static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, siz
 	return err;
 }
 
-/* quoting tcp(7):
- *   On individual connections, the socket buffer size must be set prior to the
- *   listen(2) or connect(2) calls in order to have it take effect.
- * This is our wrapper to do so.
- */
-static void drbd_setbufsize(struct socket *sock, unsigned int snd,
-		unsigned int rcv)
+static int drbd_send_disconnect(struct drbd_connection *connection)
 {
-	/* open coded SO_SNDBUF, SO_RCVBUF */
-	if (snd) {
-		sock->sk->sk_sndbuf = snd;
-		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-	}
-	if (rcv) {
-		sock->sk->sk_rcvbuf = rcv;
-		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-	}
+	if (connection->agreed_pro_version < 118)
+		return 0;
+
+	if (!conn_prepare_command(connection, 0, DATA_STREAM))
+		return -EIO;
+	return send_command(connection, -1, P_DISCONNECT, DATA_STREAM);
 }
 
-static struct socket *drbd_try_connect(struct drbd_connection *connection)
+static void initialize_send_buffer(struct drbd_connection *connection, enum drbd_stream drbd_stream)
 {
-	const char *what;
-	struct socket *sock;
-	struct sockaddr_in6 src_in6;
-	struct sockaddr_in6 peer_in6;
-	struct net_conf *nc;
-	int err, peer_addr_len, my_addr_len;
-	int sndbuf_size, rcvbuf_size, connect_int;
-	int disconnect_on_error = 1;
-
-	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-	if (!nc) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	sndbuf_size = nc->sndbuf_size;
-	rcvbuf_size = nc->rcvbuf_size;
-	connect_int = nc->connect_int;
-	rcu_read_unlock();
-
-	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
-	memcpy(&src_in6, &connection->my_addr, my_addr_len);
+	struct drbd_send_buffer *sbuf = &connection->send_buffer[drbd_stream];
 
-	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
-		src_in6.sin6_port = 0;
-	else
-		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+	sbuf->unsent =
+	sbuf->pos = page_address(sbuf->page);
+	sbuf->allocated_size = 0;
+	sbuf->additional_size = 0;
+}
 
-	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
-	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
+/* Gets called if a connection is established, or if a new minor gets created
+   in a connection */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	u64 weak_nodes = 0;
+	int err;
 
-	what = "sock_create_kern";
-	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
-			       SOCK_STREAM, IPPROTO_TCP, &sock);
-	if (err < 0) {
-		sock = NULL;
-		goto out;
-	}
+	atomic_set(&peer_device->packet_seq, 0);
+	peer_device->peer_seq = 0;
 
-	sock->sk->sk_rcvtimeo =
-	sock->sk->sk_sndtimeo = connect_int * HZ;
-	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
-
-       /* explicitly bind to the configured IP as source IP
-	*  for the outgoing connections.
-	*  This is needed for multihomed hosts and to be
-	*  able to use lo: interfaces for drbd.
-	* Make sure to use 0 as port number, so linux selects
-	*  a free one dynamically.
-	*/
-	what = "bind before connect";
-	err = sock->ops->bind(sock, (struct sockaddr_unsized *) &src_in6, my_addr_len);
-	if (err < 0)
-		goto out;
+	if (device->resource->role[NOW] == R_PRIMARY)
+		weak_nodes = drbd_weak_nodes_device(device);
 
-	/* connect may fail, peer not yet available.
-	 * stay C_WF_CONNECTION, don't go Disconnecting! */
-	disconnect_on_error = 0;
-	what = "connect";
-	err = sock->ops->connect(sock, (struct sockaddr_unsized *) &peer_in6, peer_addr_len, 0);
+	err = drbd_send_sync_param(peer_device);
 
-out:
-	if (err < 0) {
-		if (sock) {
-			sock_release(sock);
-			sock = NULL;
-		}
-		switch (-err) {
-			/* timeout, busy, signal pending */
-		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
-		case EINTR: case ERESTARTSYS:
-			/* peer not (yet) available, network problem */
-		case ECONNREFUSED: case ENETUNREACH:
-		case EHOSTDOWN:    case EHOSTUNREACH:
-			disconnect_on_error = 0;
-			break;
-		default:
-			drbd_err(connection, "%s failed, err = %d\n", what, err);
-		}
-		if (disconnect_on_error)
-			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	if (!err)
+		err = drbd_send_enable_replication_next(peer_device);
+	if (!err)
+		err = drbd_send_sizes(peer_device, 0, 0);
+	if (!err)
+		err = drbd_send_uuids(peer_device, 0, weak_nodes);
+	if (!err) {
+		set_bit(INITIAL_STATE_SENT, &peer_device->flags);
+		err = drbd_send_current_state(peer_device);
 	}
 
-	return sock;
+	clear_bit(USE_DEGR_WFC_T, &peer_device->flags);
+	clear_bit(RESIZE_PENDING, &peer_device->flags);
+	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+	return err;
 }
 
-struct accept_wait_data {
-	struct drbd_connection *connection;
-	struct socket *s_listen;
-	struct completion door_bell;
-	void (*original_sk_state_change)(struct sock *sk);
-
-};
-
-static void drbd_incoming_connection(struct sock *sk)
+void conn_connect2(struct drbd_connection *connection)
 {
-	struct accept_wait_data *ad = sk->sk_user_data;
-	void (*state_change)(struct sock *sk);
+	struct drbd_peer_device *peer_device;
+	int vnr;
 
-	state_change = ad->original_sk_state_change;
-	if (sk->sk_state == TCP_ESTABLISHED)
-		complete(&ad->door_bell);
-	state_change(sk);
-}
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
 
-static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
-{
-	int err, sndbuf_size, rcvbuf_size, my_addr_len;
-	struct sockaddr_in6 my_addr;
-	struct socket *s_listen;
-	struct net_conf *nc;
-	const char *what;
+		kref_get(&device->kref);
 
-	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-	if (!nc) {
+		/* connection cannot go away: caller holds a reference. */
 		rcu_read_unlock();
-		return -EIO;
+
+		/* In the compatibility case with protocol version < 110, that
+		 * is DRBD 8.4, we do not hold uuid_sem while exchanging the
+		 * initial UUID and state packets. There is no need because
+		 * there are no other peers which could interfere. */
+		if (connection->agreed_pro_version >= 110) {
+			down_read_non_owner(&device->uuid_sem);
+			set_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags);
+			/* since drbd_connected() is also called from drbd_create_device()
+			   acquire the lock here before calling drbd_connected(). */
+		}
+		drbd_connected(peer_device);
+
+		rcu_read_lock();
+		kref_put(&device->kref, drbd_destroy_device);
 	}
-	sndbuf_size = nc->sndbuf_size;
-	rcvbuf_size = nc->rcvbuf_size;
 	rcu_read_unlock();
+	drbd_uncork(connection, DATA_STREAM);
+}
 
-	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
-	memcpy(&my_addr, &connection->my_addr, my_addr_len);
+static bool initial_states_received(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+	bool rv = true;
 
-	what = "sock_create_kern";
-	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
-			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
-	if (err) {
-		s_listen = NULL;
-		goto out;
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		if (!test_bit(INITIAL_STATE_RECEIVED, &peer_device->flags)) {
+			rv = false;
+			break;
+		}
 	}
+	rcu_read_unlock();
 
-	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
-	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
-
-	what = "bind before listen";
-	err = s_listen->ops->bind(s_listen, (struct sockaddr_unsized *)&my_addr, my_addr_len);
-	if (err < 0)
-		goto out;
+	return rv;
+}
 
-	ad->s_listen = s_listen;
-	write_lock_bh(&s_listen->sk->sk_callback_lock);
-	ad->original_sk_state_change = s_listen->sk->sk_state_change;
-	s_listen->sk->sk_state_change = drbd_incoming_connection;
-	s_listen->sk->sk_user_data = ad;
-	write_unlock_bh(&s_listen->sk->sk_callback_lock);
+void wait_initial_states_received(struct drbd_connection *connection)
+{
+	struct net_conf *nc;
+	long timeout;
 
-	what = "listen";
-	err = s_listen->ops->listen(s_listen, 5);
-	if (err < 0)
-		goto out;
+	rcu_read_lock();
+	nc = rcu_dereference(connection->transport.net_conf);
+	timeout = nc->ping_timeo * HZ/10;
+	rcu_read_unlock();
+	wait_event_interruptible_timeout(connection->ee_wait,
+					 initial_states_received(connection),
+					 timeout);
+}
 
-	return 0;
-out:
-	if (s_listen)
-		sock_release(s_listen);
-	if (err < 0) {
-		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
-			drbd_err(connection, "%s failed, err = %d\n", what, err);
-			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
-		}
-	}
+void connect_timer_fn(struct timer_list *t)
+{
+	struct drbd_connection *connection = timer_container_of(connection, t, connect_timer);
 
-	return -EIO;
+	drbd_queue_work(&connection->sender_work, &connection->connect_timer_work);
 }
 
-static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+static void arm_connect_timer(struct drbd_connection *connection, unsigned long expires)
 {
-	write_lock_bh(&sk->sk_callback_lock);
-	sk->sk_state_change = ad->original_sk_state_change;
-	sk->sk_user_data = NULL;
-	write_unlock_bh(&sk->sk_callback_lock);
+	bool was_pending = mod_timer(&connection->connect_timer, expires);
+
+	if (was_pending) {
+		kref_put(&connection->kref, drbd_destroy_connection);
+	}
 }
 
-static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
+static bool retry_by_rr_conflict(struct drbd_connection *connection)
 {
-	int timeo, connect_int, err = 0;
-	struct socket *s_estab = NULL;
+	enum drbd_after_sb_p rr_conflict;
 	struct net_conf *nc;
 
 	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-	if (!nc) {
-		rcu_read_unlock();
-		return NULL;
-	}
-	connect_int = nc->connect_int;
+	nc = rcu_dereference(connection->transport.net_conf);
+	rr_conflict = nc->rr_conflict;
 	rcu_read_unlock();
 
-	timeo = connect_int * HZ;
-	/* 28.5% random jitter */
-	timeo += get_random_u32_below(2) ? timeo / 7 : -timeo / 7;
-
-	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
-	if (err <= 0)
-		return NULL;
-
-	err = kernel_accept(ad->s_listen, &s_estab, 0);
-	if (err < 0) {
-		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
-			drbd_err(connection, "accept failed, err = %d\n", err);
-			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
-		}
-	}
-
-	if (s_estab)
-		unregister_state_change(s_estab->sk, ad);
-
-	return s_estab;
+	return rr_conflict == ASB_RETRY_CONNECT;
 }
 
-static int decode_header(struct drbd_connection *, void *, struct packet_info *);
-
-static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
-			     enum drbd_packet cmd)
+static void apply_local_state_change(struct drbd_connection *connection, enum ao_op ao_op, bool force_demote)
 {
-	if (!conn_prepare_command(connection, sock))
-		return -EIO;
-	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
-}
+	/* Although the connect failed, outdate local disks if we learn from the
+	 * handshake that the peer has more recent data */
+	struct drbd_resource *resource = connection->resource;
+	unsigned long irq_flags;
+	int vnr;
 
-static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
-{
-	unsigned int header_size = drbd_header_size(connection);
-	struct packet_info pi;
-	struct net_conf *nc;
-	int err;
+	mutex_lock(&resource->open_release);
+	begin_state_change(resource, &irq_flags, CS_HARD | (force_demote ? CS_FS_IGN_OPENERS : 0));
+	if (ao_op == OUTDATE_DISKS_AND_DISCONNECT)
+		__change_cstate(connection, C_DISCONNECTING);
+	if (resource->role[NOW] == R_SECONDARY ||
+	    (resource->cached_susp && (
+		    resource->res_opts.on_no_data == OND_IO_ERROR ||
+		    resource->res_opts.on_susp_primary_outdated == SPO_FORCE_SECONDARY))) {
+		/* One day we might relax the above condition to
+		 * resource->role[NOW] == R_SECONDARY || resource->cached_susp
+		 * Right now it is that way, because we do not offer a way to gracefully
+		 * get out of a Primary/Outdated state */
+		struct drbd_peer_device *peer_device;
+		bool set_fail_io = false;
 
-	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-	if (!nc) {
-		rcu_read_unlock();
-		return -EIO;
-	}
-	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
-	rcu_read_unlock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			enum drbd_repl_state r = peer_device->connect_state.conn;
+			struct drbd_device *device = peer_device->device;
 
-	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
-	if (err != header_size) {
-		if (err >= 0)
-			err = -EIO;
-		return err;
+			if (r == L_WF_BITMAP_T || r == L_SYNC_TARGET || r == L_PAUSED_SYNC_T)
+				__change_disk_state(device, D_OUTDATED);
+
+			if (device->open_cnt)
+				set_fail_io = true;
+		}
+		if (resource->role[NOW] == R_PRIMARY && force_demote) {
+			drbd_warn(connection, "Remote node has more recent data;"
+				  " force secondary!\n");
+			resource->role[NEW] = R_SECONDARY;
+			if (set_fail_io)
+				resource->fail_io[NEW] = true;
+		}
 	}
-	err = decode_header(connection, connection->data.rbuf, &pi);
-	if (err)
-		return err;
-	return pi.cmd;
+	end_state_change(resource, &irq_flags, "connect-failed");
+	mutex_unlock(&resource->open_release);
 }
 
-/**
- * drbd_socket_okay() - Free the socket if its connection is not okay
- * @sock:	pointer to the pointer to the socket.
- */
-static bool drbd_socket_okay(struct socket **sock)
+static int connect_work(struct drbd_work *work, int cancel)
 {
-	int rr;
-	char tb[4];
+	struct drbd_connection *connection =
+		container_of(work, struct drbd_connection, connect_timer_work);
+	struct drbd_resource *resource = connection->resource;
+	enum drbd_state_rv rv;
+	long t = resource->res_opts.auto_promote_timeout * HZ / 10;
+	bool retry = retry_by_rr_conflict(connection);
+	bool incompat_states, force_demote;
 
-	if (!*sock)
-		return false;
+	if (connection->cstate[NOW] != C_CONNECTING)
+		goto out_put;
 
-	rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+	if (connection->agreed_pro_version == 117)
+		wait_initial_states_received(connection);
 
-	if (rr > 0 || rr == -EAGAIN) {
-		return true;
+	do {
+		/* Carefully check if it is okay to do a two_phase_commit from sender context */
+		if (down_trylock(&resource->state_sem)) {
+			rv = SS_CONCURRENT_ST_CHG;
+			break;
+		}
+		rv = change_cstate_tag(connection, C_CONNECTED, CS_SERIALIZE |
+				   CS_ALREADY_SERIALIZED | CS_VERBOSE | CS_DONT_RETRY,
+				   "connected", NULL);
+		up(&resource->state_sem);
+		if (rv != SS_PRIMARY_READER)
+			break;
+
+		/* We have a connection established, peer is primary. On my side is a
+	   read-only opener, probably udev or some other scan after device creation.
+	   This short-lived read-only open currently prevents us from continuing.
+		   Better retry after the read-only opener goes away. */
+
+		t = wait_event_interruptible_timeout(resource->state_wait,
+						     !drbd_open_ro_count(resource),
+						     t);
+	} while (t > 0);
+
+	incompat_states = (rv == SS_CW_FAILED_BY_PEER || rv == SS_TWO_PRIMARIES);
+	force_demote = resource->role[NOW] == R_PRIMARY &&
+		resource->res_opts.on_susp_primary_outdated == SPO_FORCE_SECONDARY;
+	retry = retry || force_demote;
+
+	if (rv >= SS_SUCCESS) {
+		if (connection->agreed_pro_version < 117)
+			conn_connect2(connection);
+	} else if (rv == SS_TIMEOUT || rv == SS_CONCURRENT_ST_CHG) {
+		if (connection->cstate[NOW] != C_CONNECTING)
+			goto out_put;
+		arm_connect_timer(connection, jiffies + HZ/20);
+		return 0; /* Return early. Keep the reference on the connection! */
+	} else if (rv == SS_HANDSHAKE_RETRY || (incompat_states && retry)) {
+		arm_connect_timer(connection, jiffies + HZ);
+		apply_local_state_change(connection, OUTDATE_DISKS, force_demote);
+		return 0; /* Keep reference */
+	} else if (rv == SS_HANDSHAKE_DISCONNECT || (incompat_states && !retry)) {
+		drbd_send_disconnect(connection);
+		apply_local_state_change(connection, OUTDATE_DISKS_AND_DISCONNECT, force_demote);
 	} else {
-		sock_release(*sock);
-		*sock = NULL;
-		return false;
+		drbd_info(connection, "Failure to connect: %s (%d); retrying\n",
+			  drbd_set_st_err_str(rv), rv);
+		change_cstate(connection, C_NETWORK_FAILURE, CS_HARD);
 	}
-}
-
-static bool connection_established(struct drbd_connection *connection,
-				   struct socket **sock1,
-				   struct socket **sock2)
-{
-	struct net_conf *nc;
-	int timeout;
-	bool ok;
 
-	if (!*sock1 || !*sock2)
-		return false;
-
-	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
-	rcu_read_unlock();
-	schedule_timeout_interruptible(timeout);
-
-	ok = drbd_socket_okay(sock1);
-	ok = drbd_socket_okay(sock2) && ok;
-
-	return ok;
+ out_put:
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return 0;
 }
 
-/* Gets called if a connection is established, or if a new minor gets created
-   in a connection */
-int drbd_connected(struct drbd_peer_device *peer_device)
+static int drbd_transport_connect(struct drbd_connection *connection)
 {
-	struct drbd_device *device = peer_device->device;
-	int err;
-
-	atomic_set(&device->packet_seq, 0);
-	device->peer_seq = 0;
+	struct drbd_transport *transport = &connection->transport;
+	struct drbd_resource *resource = connection->resource;
+	int err = 0;
 
-	device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
-		&peer_device->connection->cstate_mutex :
-		&device->own_state_mutex;
+	mutex_lock(&resource->conf_update);
+	err = transport->class->ops.prepare_connect(transport);
+	mutex_unlock(&resource->conf_update);
 
-	err = drbd_send_sync_param(peer_device);
-	if (!err)
-		err = drbd_send_sizes(peer_device, 0, 0);
-	if (!err)
-		err = drbd_send_uuids(peer_device);
 	if (!err)
-		err = drbd_send_current_state(peer_device);
-	clear_bit(USE_DEGR_WFC_T, &device->flags);
-	clear_bit(RESIZE_PENDING, &device->flags);
-	atomic_set(&device->ap_in_flight, 0);
-	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+		err = transport->class->ops.connect(transport);
+
+	mutex_lock(&resource->conf_update);
+	transport->class->ops.finish_connect(transport);
+	mutex_unlock(&resource->conf_update);
+
 	return err;
 }
 
 /*
- * return values:
- *   1 yes, we have a valid connection
- *   0 oops, did not work out, please try again
- *  -1 peer talks different language,
- *     no point in trying again, please go standalone.
- *  -2 We do not have a network config...
+ * Returns true if we have a valid connection.
  */
-static int conn_connect(struct drbd_connection *connection)
+static bool conn_connect(struct drbd_connection *connection)
 {
-	struct drbd_socket sock, msock;
+	struct drbd_transport *transport = &connection->transport;
+	struct drbd_resource *resource = connection->resource;
+	int ping_timeo, ping_int, h, err, vnr;
 	struct drbd_peer_device *peer_device;
+	enum drbd_stream stream;
 	struct net_conf *nc;
-	int vnr, timeout, h;
-	bool discard_my_data, ok;
-	enum drbd_state_rv rv;
-	struct accept_wait_data ad = {
-		.connection = connection,
-		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
-	};
-
-	clear_bit(DISCONNECT_SENT, &connection->flags);
-	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
-		return -2;
-
-	mutex_init(&sock.mutex);
-	sock.sbuf = connection->data.sbuf;
-	sock.rbuf = connection->data.rbuf;
-	sock.socket = NULL;
-	mutex_init(&msock.mutex);
-	msock.sbuf = connection->meta.sbuf;
-	msock.rbuf = connection->meta.rbuf;
-	msock.socket = NULL;
-
-	/* Assume that the peer only understands protocol 80 until we know better.  */
-	connection->agreed_pro_version = 80;
-
-	if (prepare_listen_socket(connection, &ad))
-		return 0;
+	bool discard_my_data;
+	bool have_mutex;
+	bool no_addr = false;
+
+start:
+	have_mutex = false;
+	clear_bit(PING_PENDING, &connection->flags);
+	clear_bit(DISCONNECT_EXPECTED, &connection->flags);
+	if (change_cstate_tag(connection, C_CONNECTING, CS_VERBOSE, "connecting", NULL)
+			< SS_SUCCESS) {
+		/* We do not have a network config. */
+		return false;
+	}
 
-	do {
-		struct socket *s;
-
-		s = drbd_try_connect(connection);
-		if (s) {
-			if (!sock.socket) {
-				sock.socket = s;
-				send_first_packet(connection, &sock, P_INITIAL_DATA);
-			} else if (!msock.socket) {
-				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
-				msock.socket = s;
-				send_first_packet(connection, &msock, P_INITIAL_META);
-			} else {
-				drbd_err(connection, "Logic error in conn_connect()\n");
-				goto out_release_sockets;
-			}
-		}
+	/* Assume that the peer only understands our minimum supported
+	 * protocol version, until we know better. */
+	connection->agreed_pro_version = drbd_protocol_version_min;
 
-		if (connection_established(connection, &sock.socket, &msock.socket))
-			break;
+	err = drbd_transport_connect(connection);
+	if (err == -EAGAIN) {
+		enum drbd_conn_state cstate;
+		read_lock_irq(&resource->state_rwlock); /* See commit message */
+		cstate = connection->cstate[NOW];
+		read_unlock_irq(&resource->state_rwlock);
+		if (cstate == C_DISCONNECTING)
+			return false;
+		goto retry;
+	} else if (err == -EADDRNOTAVAIL) {
+		struct net_conf *nc;
+		int connect_int;
+		long t;
 
-retry:
-		s = drbd_wait_for_connect(connection, &ad);
-		if (s) {
-			int fp = receive_first_packet(connection, s);
-			drbd_socket_okay(&sock.socket);
-			drbd_socket_okay(&msock.socket);
-			switch (fp) {
-			case P_INITIAL_DATA:
-				if (sock.socket) {
-					drbd_warn(connection, "initial packet S crossed\n");
-					sock_release(sock.socket);
-					sock.socket = s;
-					goto randomize;
-				}
-				sock.socket = s;
-				break;
-			case P_INITIAL_META:
-				set_bit(RESOLVE_CONFLICTS, &connection->flags);
-				if (msock.socket) {
-					drbd_warn(connection, "initial packet M crossed\n");
-					sock_release(msock.socket);
-					msock.socket = s;
-					goto randomize;
-				}
-				msock.socket = s;
-				break;
-			default:
-				drbd_warn(connection, "Error receiving initial packet\n");
-				sock_release(s);
-randomize:
-				if (get_random_u32_below(2))
-					goto retry;
-			}
-		}
+		rcu_read_lock();
+		nc = rcu_dereference(transport->net_conf);
+		connect_int = nc ? nc->connect_int : 10;
+		rcu_read_unlock();
 
-		if (connection->cstate <= C_DISCONNECTING)
-			goto out_release_sockets;
-		if (signal_pending(current)) {
-			flush_signals(current);
-			smp_rmb();
-			if (get_t_state(&connection->receiver) == EXITING)
-				goto out_release_sockets;
+		if (!no_addr) {
+			drbd_warn(connection,
+				  "Configured local address not found, retrying every %d sec, "
+				  "err=%d\n", connect_int, err);
+			no_addr = true;
 		}
 
-		ok = connection_established(connection, &sock.socket, &msock.socket);
-	} while (!ok);
-
-	if (ad.s_listen)
-		sock_release(ad.s_listen);
-
-	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
-	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
-
-	sock.socket->sk->sk_allocation = GFP_NOIO;
-	msock.socket->sk->sk_allocation = GFP_NOIO;
-
-	sock.socket->sk->sk_use_task_frag = false;
-	msock.socket->sk->sk_use_task_frag = false;
+		t = schedule_timeout_interruptible(connect_int * HZ);
+		if (t || connection->cstate[NOW] == C_DISCONNECTING)
+			return false;
+		goto start;
+	} else if (err == -EDESTADDRREQ) {
+		/*
+		 * No destination address, we cannot possibly make a connection.
+		 * Maybe a resource was partially left over due to some other bug?
+		 * Either way, abort here and go StandAlone to prevent reconnection.
+		 */
+		drbd_err(connection, "No destination address, err=%d\n", err);
+		change_cstate_tag(connection, C_STANDALONE, CS_HARD, "no-dest-addr", NULL);
+		return false;
+	} else if (err < 0) {
+		drbd_warn(connection, "Failed to initiate connection, err=%d\n", err);
+		goto abort;
+	}
 
-	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
-	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
+	connection->reassemble_buffer.avail = 0;
 
-	/* NOT YET ...
-	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
-	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-	 * first set it to the P_CONNECTION_FEATURES timeout,
-	 * which we set to 4x the configured ping_timeout. */
 	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
-
-	sock.socket->sk->sk_sndtimeo =
-	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
-
-	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
-	timeout = nc->timeout * HZ / 10;
-	discard_my_data = nc->discard_my_data;
+	nc = rcu_dereference(connection->transport.net_conf);
+	ping_timeo = nc->ping_timeo;
+	ping_int = nc->ping_int;
 	rcu_read_unlock();
 
-	msock.socket->sk->sk_sndtimeo = timeout;
-
-	/* we don't want delays.
-	 * we use TCP_CORK where appropriate, though */
-	tcp_sock_set_nodelay(sock.socket->sk);
-	tcp_sock_set_nodelay(msock.socket->sk);
+	/* Make sure we are "uncorked", otherwise we risk timeouts,
+	 * in case this is a reconnect and we had been corked before. */
+	for (stream = DATA_STREAM; stream <= CONTROL_STREAM; stream++) {
+		initialize_send_buffer(connection, stream);
+		drbd_uncork(connection, stream);
+	}
 
-	connection->data.socket = sock.socket;
-	connection->meta.socket = msock.socket;
-	connection->last_received = jiffies;
+	/* Make sure the handshake happens without interference from other threads,
+	 * or the challenge response authentication could be garbled. */
+	mutex_lock(&connection->mutex[DATA_STREAM]);
+	have_mutex = true;
+	transport->class->ops.set_rcvtimeo(transport, DATA_STREAM, ping_timeo * 4 * HZ/10);
+	transport->class->ops.set_rcvtimeo(transport, CONTROL_STREAM, ping_int * HZ);
 
 	h = drbd_do_features(connection);
-	if (h <= 0)
-		return h;
+	if (h < 0)
+		goto abort;
+	if (h == 0)
+		goto retry;
 
 	if (connection->cram_hmac_tfm) {
-		/* drbd_request_state(device, NS(conn, WFAuth)); */
 		switch (drbd_do_auth(connection)) {
 		case -1:
 			drbd_err(connection, "Authentication of peer failed\n");
-			return -1;
+			goto abort;
 		case 0:
 			drbd_err(connection, "Authentication of peer failed, trying again.\n");
-			return 0;
+			goto retry;
 		}
 	}
 
-	connection->data.socket->sk->sk_sndtimeo = timeout;
-	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-
-	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
-		return -1;
-
-	/* Prevent a race between resync-handshake and
-	 * being promoted to Primary.
-	 *
-	 * Grab and release the state mutex, so we know that any current
-	 * drbd_set_role() is finished, and any incoming drbd_set_role
-	 * will see the STATE_SENT flag, and wait for it to be cleared.
-	 */
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-		mutex_lock(peer_device->device->state_mutex);
-
-	/* avoid a race with conn_request_state( C_DISCONNECTING ) */
-	spin_lock_irq(&connection->resource->req_lock);
-	set_bit(STATE_SENT, &connection->flags);
-	spin_unlock_irq(&connection->resource->req_lock);
+	discard_my_data = test_bit(CONN_DISCARD_MY_DATA, &connection->flags);
 
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
-		mutex_unlock(peer_device->device->state_mutex);
+	if (__drbd_send_protocol(connection, P_PROTOCOL) == -EOPNOTSUPP)
+		goto abort;
 
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
-		kref_get(&device->kref);
-		rcu_read_unlock();
-
+		set_bit(REPLICATION_NEXT, &peer_device->flags);
 		if (discard_my_data)
-			set_bit(DISCARD_MY_DATA, &device->flags);
+			set_bit(DISCARD_MY_DATA, &peer_device->flags);
 		else
-			clear_bit(DISCARD_MY_DATA, &device->flags);
-
-		drbd_connected(peer_device);
-		kref_put(&device->kref, drbd_destroy_device);
-		rcu_read_lock();
+			clear_bit(DISCARD_MY_DATA, &peer_device->flags);
 	}
 	rcu_read_unlock();
+	mutex_unlock(&connection->mutex[DATA_STREAM]);
+	have_mutex = false;
 
-	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
-	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
-		clear_bit(STATE_SENT, &connection->flags);
-		return 0;
-	}
-
-	drbd_thread_start(&connection->ack_receiver);
-	/* opencoded create_singlethread_workqueue(),
-	 * to be able to use format string arguments */
 	connection->ack_sender =
-		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
+		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, resource->name);
 	if (!connection->ack_sender) {
 		drbd_err(connection, "Failed to create workqueue ack_sender\n");
-		return 0;
+		schedule_timeout_uninterruptible(HZ);
+		goto retry;
 	}
 
-	mutex_lock(&connection->resource->conf_update);
-	/* The discard_my_data flag is a single-shot modifier to the next
-	 * connection attempt, the handshake of which is now well underway.
-	 * No need for rcu style copying of the whole struct
-	 * just to clear a single value. */
-	connection->net_conf->discard_my_data = 0;
-	mutex_unlock(&connection->resource->conf_update);
+	atomic_set(&connection->ap_in_flight, 0);
+	atomic_set(&connection->rs_in_flight, 0);
 
-	return h;
+	if (connection->agreed_pro_version >= 110) {
+		/* Allow 10 times the ping_timeo for two-phase commits. That is
+		 * 5 seconds by default. The unit of ping_timeo is tenths of a
+		 * second. */
+		transport->class->ops.set_rcvtimeo(transport, DATA_STREAM, ping_timeo * HZ);
 
-out_release_sockets:
-	if (ad.s_listen)
-		sock_release(ad.s_listen);
-	if (sock.socket)
-		sock_release(sock.socket);
-	if (msock.socket)
-		sock_release(msock.socket);
-	return -1;
+		if (connection->agreed_pro_version == 117)
+			conn_connect2(connection);
+
+		if (resource->res_opts.node_id < connection->peer_node_id) {
+			kref_get(&connection->kref);
+			connection->connect_timer_work.cb = connect_work;
+			arm_connect_timer(connection, jiffies);
+		}
+	} else {
+		enum drbd_state_rv rv;
+		rv = change_cstate(connection, C_CONNECTED,
+				   CS_VERBOSE | CS_WAIT_COMPLETE | CS_SERIALIZE | CS_LOCAL_ONLY);
+		if (rv < SS_SUCCESS || connection->cstate[NOW] != C_CONNECTED)
+			goto retry;
+		conn_connect2(connection);
+	}
+
+	clear_bit(PING_TIMEOUT_ACTIVE, &connection->flags);
+	return true;
+
+retry:
+	if (have_mutex)
+		mutex_unlock(&connection->mutex[DATA_STREAM]);
+	conn_disconnect(connection);
+	schedule_timeout_interruptible(HZ);
+	goto start;
+
+abort:
+	if (have_mutex)
+		mutex_unlock(&connection->mutex[DATA_STREAM]);
+	change_cstate(connection, C_DISCONNECTING, CS_HARD);
+	return false;
 }
 
-static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
+static unsigned int decode_header_size(const void *header)
 {
-	unsigned int header_size = drbd_header_size(connection);
+	const u32 first_dword = *(u32 *)header;
+	const u16 first_word = *(u16 *)header;
 
-	if (header_size == sizeof(struct p_header100) &&
-	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
-		struct p_header100 *h = header;
-		if (h->pad != 0) {
-			drbd_err(connection, "Header padding is not zero\n");
-			return -EINVAL;
-		}
-		pi->vnr = be16_to_cpu(h->volume);
+	return first_dword == cpu_to_be32(DRBD_MAGIC_100) ? sizeof(struct p_header100) :
+		first_word == cpu_to_be16(DRBD_MAGIC_BIG) ? sizeof(struct p_header95) :
+		sizeof(struct p_header80);
+}
+
+static int __decode_header(const void *header, struct packet_info *pi)
+{
+	const u32 first_dword = *(u32 *)header;
+	const u16 first_word = *(u16 *)header;
+	unsigned int header_size;
+	int header_version;
+
+	if (first_dword == cpu_to_be32(DRBD_MAGIC_100)) {
+		const struct p_header100 *h = header;
+		u16 vnr = be16_to_cpu(h->volume);
+
+		if (h->pad != 0)
+			return -ENOENT;
+
+		pi->vnr = vnr == ((u16) 0xFFFF) ? -1 : vnr;
 		pi->cmd = be16_to_cpu(h->command);
 		pi->size = be32_to_cpu(h->length);
-	} else if (header_size == sizeof(struct p_header95) &&
-		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
-		struct p_header95 *h = header;
+		header_size = sizeof(*h);
+		header_version = 100;
+	} else if (first_word == cpu_to_be16(DRBD_MAGIC_BIG)) {
+		const struct p_header95 *h = header;
+
 		pi->cmd = be16_to_cpu(h->command);
 		pi->size = be32_to_cpu(h->length);
 		pi->vnr = 0;
-	} else if (header_size == sizeof(struct p_header80) &&
-		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
-		struct p_header80 *h = header;
+		header_size = sizeof(*h);
+		header_version = 95;
+	} else if (first_dword == cpu_to_be32(DRBD_MAGIC)) {
+		const struct p_header80 *h = header;
+
 		pi->cmd = be16_to_cpu(h->command);
 		pi->size = be16_to_cpu(h->length);
 		pi->vnr = 0;
+		header_size = sizeof(*h);
+		header_version = 80;
 	} else {
-		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
-			 be32_to_cpu(*(__be32 *)header),
-			 connection->agreed_pro_version);
 		return -EINVAL;
 	}
-	pi->data = header + header_size;
+
+	pi->data = (void *)(header + header_size); /* casting away 'const'! */
+	return header_version;
+}
+
+static bool header_version_good(int header_version, int protocol_version)
+{
+	switch (header_version) {
+	case 100: return protocol_version >= 100;
+	case 95: return protocol_version < 100;
+	case 80: return protocol_version < 95;
+	default: return false;
+	}
+}
+
+static int decode_header(struct drbd_connection *connection, const void *header,
+			 struct packet_info *pi)
+{
+	const int agreed_pro_version = connection->agreed_pro_version;
+	int header_version = __decode_header(header, pi);
+
+	if (header_version == -ENOENT) {
+		drbd_err(connection, "Header padding is not zero\n");
+		return -EINVAL;
+	} else if (header_version < 0 || !header_version_good(header_version, agreed_pro_version)) {
+		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d, %d [data]\n",
+			 be32_to_cpu(*(__be32 *)header), agreed_pro_version, header_version);
+		return -EINVAL;
+	}
 	return 0;
 }
 
@@ -1013,49 +1232,58 @@ static void drbd_unplug_all_devices(struct drbd_connection *connection)
 
 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
 {
-	void *buffer = connection->data.rbuf;
+	void *buffer;
 	int err;
 
-	err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
+	err = drbd_recv_all_warn(connection, &buffer, drbd_header_size(connection));
 	if (err)
 		return err;
 
 	err = decode_header(connection, buffer, pi);
-	connection->last_received = jiffies;
 
 	return err;
 }
 
 static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
 {
-	void *buffer = connection->data.rbuf;
+	struct drbd_transport_ops *tr_ops = &connection->transport.class->ops;
 	unsigned int size = drbd_header_size(connection);
+	void *buffer;
 	int err;
 
-	err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
+	err = tr_ops->recv(&connection->transport, DATA_STREAM, &buffer,
+			   size, MSG_NOSIGNAL | MSG_DONTWAIT);
 	if (err != size) {
+		int rflags = 0;
+
 		/* If we have nothing in the receive buffer now, to reduce
 		 * application latency, try to drain the backend queues as
 		 * quickly as possible, and let remote TCP know what we have
 		 * received so far. */
 		if (err == -EAGAIN) {
-			tcp_sock_set_quickack(connection->data.socket->sk, 2);
+			tr_ops->hint(&connection->transport, DATA_STREAM, QUICKACK);
 			drbd_unplug_all_devices(connection);
-		}
-		if (err > 0) {
-			buffer += err;
+		} else if (err > 0) {
 			size -= err;
+			rflags |= GROW_BUFFER;
 		}
-		err = drbd_recv_all_warn(connection, buffer, size);
+
+		err = drbd_recv(connection, &buffer, size, rflags);
+		if (err != size) {
+			if (err >= 0)
+				err = -EIO;
+		} else
+			err = 0;
+
 		if (err)
 			return err;
 	}
 
-	err = decode_header(connection, connection->data.rbuf, pi);
-	connection->last_received = jiffies;
+	err = decode_header(connection, buffer, pi);
 
 	return err;
 }
+
 /* This is blkdev_issue_flush, but asynchronous.
  * We want to submit to all component volumes in parallel,
  * then wait for all completions.
@@ -1076,9 +1304,11 @@ static void one_flush_endio(struct bio *bio)
 	struct drbd_device *device = octx->device;
 	struct issue_flush_context *ctx = octx->ctx;
 
-	if (bio->bi_status) {
-		ctx->error = blk_status_to_errno(bio->bi_status);
-		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
+	blk_status_t status = bio->bi_status;
+
+	if (status) {
+		ctx->error = blk_status_to_errno(status);
+		drbd_info(device, "local disk FLUSH FAILED with status %d\n", status);
 	}
 	kfree(octx);
 	bio_put(bio);
@@ -1094,7 +1324,7 @@ static void one_flush_endio(struct bio *bio)
 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
 {
 	struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
-				    REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);
+			REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);
 	struct one_flush_context *octx = kmalloc_obj(*octx, GFP_NOIO);
 
 	if (!octx) {
@@ -1121,10 +1351,12 @@ static void submit_one_flush(struct drbd_device *device, struct issue_flush_cont
 	submit_bio(bio);
 }
 
-static void drbd_flush(struct drbd_connection *connection)
+static enum finish_epoch drbd_flush_after_epoch(struct drbd_connection *connection, struct drbd_epoch *epoch)
 {
-	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
-		struct drbd_peer_device *peer_device;
+	struct drbd_resource *resource = connection->resource;
+
+	if (resource->write_ordering >= WO_BDEV_FLUSH) {
+		struct drbd_device *device;
 		struct issue_flush_context ctx;
 		int vnr;
 
@@ -1133,9 +1365,7 @@ static void drbd_flush(struct drbd_connection *connection)
 		init_completion(&ctx.done);
 
 		rcu_read_lock();
-		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-			struct drbd_device *device = peer_device->device;
-
+		idr_for_each_entry(&resource->devices, device, vnr) {
 			if (!get_ldev(device))
 				continue;
 			kref_get(&device->kref);
@@ -1160,6 +1390,88 @@ static void drbd_flush(struct drbd_connection *connection)
 			drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
 		}
 	}
+
+	/* If called before sending P_CONFIRM_STABLE, we don't have the epoch
+	 * (and must not finish it yet, anyways) */
+	if (epoch == NULL)
+		return FE_STILL_LIVE;
+	return drbd_may_finish_epoch(connection, epoch, EV_BARRIER_DONE);
+}
+
+static int w_flush(struct drbd_work *w, int cancel)
+{
+	struct flush_work *fw = container_of(w, struct flush_work, w);
+	struct drbd_epoch *epoch = fw->epoch;
+	struct drbd_connection *connection = epoch->connection;
+
+	kfree(fw);
+
+	if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags))
+		drbd_flush_after_epoch(connection, epoch);
+
+	drbd_may_finish_epoch(connection, epoch, EV_PUT |
+			      (connection->cstate[NOW] < C_CONNECTED ? EV_CLEANUP : 0));
+
+	return 0;
+}
+
+static void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
+{
+	struct p_barrier_ack *p;
+
+	if (connection->cstate[NOW] < C_CONNECTED)
+		return;
+
+	p = conn_prepare_command(connection, sizeof(*p), CONTROL_STREAM);
+	if (!p)
+		return;
+	p->barrier = barrier_nr;
+	p->set_size = cpu_to_be32(set_size);
+	send_command(connection, -1, P_BARRIER_ACK, CONTROL_STREAM);
+}
+
+static void drbd_send_confirm_stable(struct drbd_peer_request *peer_req)
+{
+	struct drbd_connection *connection = peer_req->peer_device->connection;
+	struct drbd_epoch *epoch = peer_req->epoch;
+	struct drbd_peer_request *oldest, *youngest;
+	struct p_confirm_stable *p;
+	int count;
+
+	if (connection->cstate[NOW] < C_CONNECTED)
+		return;
+
+	/* peer_req is not on stable storage yet, but the only one in this epoch.
+	 * Nothing to confirm, just wait for the normal barrier_ack and peer_ack
+	 * to do their work. */
+	oldest = epoch->oldest_unconfirmed_peer_req;
+	if (oldest == peer_req)
+		return;
+
+	p = conn_prepare_command(connection, sizeof(*p), CONTROL_STREAM);
+	if (!p)
+		return;
+
+	/* peer_req has not been added to connection->peer_requests yet, so
+	 * connection->peer_requests.prev is the youngest request that should
+	 * now be on stable storage. */
+	spin_lock_irq(&connection->peer_reqs_lock);
+	youngest = list_entry(connection->peer_requests.prev, struct drbd_peer_request, recv_order);
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	count = atomic_read(&epoch->epoch_size) - atomic_read(&epoch->confirmed) - 1;
+	atomic_add(count, &epoch->confirmed);
+	epoch->oldest_unconfirmed_peer_req = peer_req;
+
+	D_ASSERT(connection, oldest->epoch == youngest->epoch);
+	D_ASSERT(connection, count > 0);
+
+	p->oldest_block_id = oldest->block_id;
+	p->youngest_block_id = youngest->block_id;
+	p->set_size = cpu_to_be32(count);
+	p->pad = 0;
+
+	send_command(connection, -1, P_CONFIRM_STABLE, CONTROL_STREAM);
 }
 
 /**
@@ -1172,13 +1484,16 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 					       struct drbd_epoch *epoch,
 					       enum epoch_event ev)
 {
-	int epoch_size;
+	int finish, epoch_size;
 	struct drbd_epoch *next_epoch;
+	int schedule_flush = 0;
 	enum finish_epoch rv = FE_STILL_LIVE;
+	struct drbd_resource *resource = connection->resource;
 
 	spin_lock(&connection->epoch_lock);
 	do {
 		next_epoch = NULL;
+		finish = 0;
 
 		epoch_size = atomic_read(&epoch->epoch_size);
 
@@ -1188,6 +1503,16 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 			break;
 		case EV_GOT_BARRIER_NR:
 			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+
+			/* Special case: If we just switched from WO_BIO_BARRIER to
+			   WO_BDEV_FLUSH we should not finish the current epoch */
+			if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 &&
+			    resource->write_ordering != WO_BIO_BARRIER &&
+			    epoch == connection->current_epoch)
+				clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags);
+			break;
+		case EV_BARRIER_DONE:
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags);
 			break;
 		case EV_BECAME_LAST:
 			/* nothing to do*/
@@ -1196,18 +1521,30 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP) &&
+		    epoch->list.prev == &connection->current_epoch->list &&
+		    !test_bit(DE_IS_FINISHING, &epoch->flags)) {
+			/* Nearly all conditions are met to finish that epoch... */
+			if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ||
+			    resource->write_ordering == WO_NONE ||
+			    (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) ||
+			    ev & EV_CLEANUP) {
+				finish = 1;
+				set_bit(DE_IS_FINISHING, &epoch->flags);
+			} else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) &&
+				 resource->write_ordering == WO_BIO_BARRIER) {
+				atomic_inc(&epoch->active);
+				schedule_flush = 1;
+			}
+		}
+		if (finish) {
 			if (!(ev & EV_CLEANUP)) {
+				/* adjust for nr requests already confirmed via P_CONFIRM_STABLE, if any. */
+				epoch_size -= atomic_read(&epoch->confirmed);
 				spin_unlock(&connection->epoch_lock);
 				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
 				spin_lock(&connection->epoch_lock);
 			}
-#if 0
-			/* FIXME: dec unacked on connection, once we have
-			 * something to count pending connection packets in. */
-			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
-				dec_unacked(epoch->connection);
-#endif
 
 			if (connection->current_epoch != epoch) {
 				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1219,9 +1556,11 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 				if (rv == FE_STILL_LIVE)
 					rv = FE_DESTROYED;
 			} else {
+				epoch->oldest_unconfirmed_peer_req = NULL;
 				epoch->flags = 0;
 				atomic_set(&epoch->epoch_size, 0);
-				/* atomic_set(&epoch->active, 0); is already zero */
+				atomic_set(&epoch->confirmed, 0);
+				/* atomic_set(&epoch->active, 0); is already zero */
 				if (rv == FE_STILL_LIVE)
 					rv = FE_RECYCLED;
 			}
@@ -1235,6 +1574,22 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connectio
 
 	spin_unlock(&connection->epoch_lock);
 
+	if (schedule_flush) {
+		struct flush_work *fw;
+		fw = kmalloc_obj(*fw, GFP_ATOMIC);
+		if (fw) {
+			fw->w.cb = w_flush;
+			fw->epoch = epoch;
+			drbd_queue_work(&resource->work, &fw->w);
+		} else {
+			drbd_warn(resource, "Could not kmalloc a flush_work obj\n");
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+			/* That is not a recursion, only one level */
+			drbd_may_finish_epoch(connection, epoch, EV_BARRIER_DONE);
+			drbd_may_finish_epoch(connection, epoch, EV_PUT);
+		}
+	}
+
 	return rv;
 }
 
@@ -1245,6 +1600,8 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
 
 	dc = rcu_dereference(bdev->disk_conf);
 
+	if (wo == WO_BIO_BARRIER && !dc->disk_barrier)
+		wo = WO_BDEV_FLUSH;
 	if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
 		wo = WO_DRAIN_IO;
 	if (wo == WO_DRAIN_IO && !dc->disk_drain)
@@ -1262,18 +1619,22 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 {
 	struct drbd_device *device;
 	enum write_ordering_e pwo;
-	int vnr;
+	int vnr, i = 0;
 	static char *write_ordering_str[] = {
 		[WO_NONE] = "none",
 		[WO_DRAIN_IO] = "drain",
 		[WO_BDEV_FLUSH] = "flush",
+		[WO_BIO_BARRIER] = "barrier",
 	};
 
 	pwo = resource->write_ordering;
-	if (wo != WO_BDEV_FLUSH)
+	if (wo != WO_BIO_BARRIER)
 		wo = min(pwo, wo);
 	rcu_read_lock();
 	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (i++ == 1 && wo == WO_BIO_BARRIER)
+			wo = WO_BDEV_FLUSH; /* WO = barrier does not handle multiple volumes */
+
 		if (get_ldev(device)) {
 			wo = max_allowed_wo(device->ldev, wo);
 			if (device->ldev == bdev)
@@ -1288,21 +1649,11 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 	rcu_read_unlock();
 
 	resource->write_ordering = wo;
-	if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
+	if (pwo != resource->write_ordering || wo == WO_BIO_BARRIER)
 		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
 }
 
 /*
- * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
- * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
- * will directly go to fallback mode, submitting normal writes, and
- * never even try to UNMAP.
- *
- * And dm-thin does not do this (yet), mostly because in general it has
- * to assume that "skip_block_zeroing" is set.  See also:
- * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
- * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
- *
  * We *may* ignore the discard-zeroes-data setting, if so configured.
  *
  * Assumption is that this "discard_zeroes_data=0" is only because the backend
@@ -1325,6 +1676,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin
 int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
 {
 	struct block_device *bdev = device->ldev->backing_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
 	sector_t tmp, nr;
 	unsigned int max_discard_sectors, granularity;
 	int alignment;
@@ -1334,7 +1686,7 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u
 		goto zero_out;
 
 	/* Zero-sector (unknown) and one-sector granularities are the same.  */
-	granularity = max(bdev_discard_granularity(bdev) >> 9, 1U);
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
 	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
 
 	max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22));
@@ -1361,8 +1713,7 @@ int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, u
 		start = tmp;
 	}
 	while (nr_sectors >= max_discard_sectors) {
-		err |= blkdev_issue_discard(bdev, start, max_discard_sectors,
-					    GFP_NOIO);
+		err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO);
 		nr_sectors -= max_discard_sectors;
 		start += max_discard_sectors;
 	}
@@ -1419,11 +1770,11 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
 
 static int peer_request_fault_type(struct drbd_peer_request *peer_req)
 {
-	if (peer_req_op(peer_req) == REQ_OP_READ) {
-		return peer_req->flags & EE_APPLICATION ?
+	if (bio_op(peer_req->bios.head) == REQ_OP_READ) {
+		return drbd_interval_is_application(&peer_req->i) ?
 			DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
 	} else {
-		return peer_req->flags & EE_APPLICATION ?
+		return drbd_interval_is_application(&peer_req->i) ?
 			DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
 	}
 }
@@ -1441,18 +1792,23 @@ static int peer_request_fault_type(struct drbd_peer_request *peer_req)
  *  single page to an empty bio (which should never happen and likely indicates
  *  that the lower level IO stack is in some way broken). This has been observed
  *  on certain Xen deployments.
+ *
+ *  When this function returns 0, it "consumes" an ldev reference; the
+ *  reference is released when the request completes.
  */
 /* TODO allocate from our own bio_set. */
 int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
 {
 	struct drbd_device *device = peer_req->peer_device->device;
-	struct bio *bios = NULL;
-	struct bio *bio;
-	struct page *page = peer_req->pages;
+	struct bio *bio, *next_bio;
 	sector_t sector = peer_req->i.sector;
-	unsigned int data_size = peer_req->i.size;
-	unsigned int n_bios = 0;
-	unsigned int nr_pages = PFN_UP(data_size);
+	struct bio_list bios;
+	struct page *page;
+	int fault_type, err, nr_bios = 0;
+
+	if (peer_req->flags & EE_SET_OUT_OF_SYNC)
+		drbd_set_out_of_sync(peer_req->peer_device,
+				sector, peer_req->i.size);
 
 	/* TRIM/DISCARD: for now, always use the helper function
 	 * blkdev_issue_zeroout(..., discard=true).
@@ -1460,27 +1816,18 @@ int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
 	 * Correctness first, performance later.  Next step is to code an
 	 * asynchronous variant of the same.
 	 */
-	if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) {
-		/* wait for all pending IO completions, before we start
-		 * zeroing things out. */
-		conn_wait_active_ee_empty(peer_req->peer_device->connection);
-		/* add it to the active list now,
-		 * so we can find it to present it in debugfs */
+	if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) {
 		peer_req->submit_jif = jiffies;
-		peer_req->flags |= EE_SUBMITTED;
-
-		/* If this was a resync request from receive_rs_deallocated(),
-		 * it is already on the sync_ee list */
-		if (list_empty(&peer_req->w.list)) {
-			spin_lock_irq(&device->resource->req_lock);
-			list_add_tail(&peer_req->w.list, &device->active_ee);
-			spin_unlock_irq(&device->resource->req_lock);
-		}
 
+		/* ldev_safe: a peer_req has a ldev reference */
 		drbd_issue_peer_discard_or_zero_out(device, peer_req);
 		return 0;
 	}
 
+	fault_type = peer_request_fault_type(peer_req);
+	bios = peer_req->bios;
+	bio_list_init(&peer_req->bios);
+
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
@@ -1489,90 +1836,167 @@ int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
 	 * Plain bio_alloc is good enough here, this is no DRBD internally
 	 * generated bio, but a bio allocated on behalf of the peer.
 	 */
-next_bio:
 	/* _DISCARD, _WRITE_ZEROES handled above.
 	 * REQ_OP_FLUSH (empty flush) not expected,
 	 * should have been mapped to a "drbd protocol barrier".
 	 * REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
 	 */
-	if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
-				peer_req_op(peer_req) == REQ_OP_READ)) {
-		drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
-		return -EINVAL;
+	bio = bio_list_peek(&bios);
+	if (!(bio_op(bio) == REQ_OP_WRITE || bio_op(bio) == REQ_OP_READ)) {
+		drbd_err(device, "Invalid bio op received: 0x%x\n", bio->bi_opf);
+		err = -EINVAL;
+		goto fail;
 	}
 
-	bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
-	/* > peer_req->i.sector, unless this is the first bio */
-	bio->bi_iter.bi_sector = sector;
-	bio->bi_private = peer_req;
-	bio->bi_end_io = drbd_peer_request_endio;
+	/* we special case some flags in the multi-bio case, see below
+	 * (REQ_PREFLUSH, or BIO_RW_BARRIER in older kernels) */
 
-	bio->bi_next = bios;
-	bios = bio;
-	++n_bios;
+	/* Get reference for the first bio */
+	atomic_inc(&peer_req->pending_bios);
 
-	page_chain_for_each(page) {
-		unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
-		if (!bio_add_page(bio, page, len, 0))
-			goto next_bio;
-		data_size -= len;
-		sector += len >> 9;
-		--nr_pages;
-	}
-	D_ASSERT(device, data_size == 0);
-	D_ASSERT(device, page == NULL);
-
-	atomic_set(&peer_req->pending_bios, n_bios);
 	/* for debugfs: update timestamp, mark as submitted */
 	peer_req->submit_jif = jiffies;
-	peer_req->flags |= EE_SUBMITTED;
-	do {
-		bio = bios;
-		bios = bios->bi_next;
-		bio->bi_next = NULL;
+	while ((bio = bio_list_pop(&bios))) {
+		/* bio_list_pop() clears bio->bi_next; it is a kernel-private
+		 * field used during I/O; used temporarily by DRBD pre submit
+		 * and post completion
+		 */
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_private = peer_req;
+		bio->bi_end_io = drbd_peer_request_endio;
 
-		drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
-	} while (bios);
-	return 0;
-}
+		/* Store sector and size in first struct page for restoration after I/O. */
+		page = bio->bi_io_vec[0].bv_page;
+		page->private = sector - peer_req->i.sector;
+		page->lru.next = (void *)(unsigned long)bio->bi_iter.bi_size;
 
-static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
-					     struct drbd_peer_request *peer_req)
-{
-	struct drbd_interval *i = &peer_req->i;
+		sector += bio_sectors(bio);
 
-	drbd_remove_interval(&device->write_requests, i);
-	drbd_clear_interval(i);
+		nr_bios++;
 
-	/* Wake up any processes waiting for this peer request to complete.  */
-	if (i->waiting)
-		wake_up(&device->misc_wait);
-}
+		/* Get reference for the next bio (if any) now to prevent premature completion */
+		next_bio = bio_list_peek(&bios);
+		if (next_bio)
+			atomic_inc(&peer_req->pending_bios);
+		drbd_submit_bio_noacct(device, fault_type, bio);
 
-static void conn_wait_active_ee_empty(struct drbd_connection *connection)
-{
-	struct drbd_peer_device *peer_device;
-	int vnr;
+		/* strip off REQ_PREFLUSH,
+		 * unless it is the first or last bio */
+		if (next_bio && next_bio->bi_next)
+			next_bio->bi_opf &= ~REQ_PREFLUSH;
+	}
+	if (nr_bios > 1)
+		device->multi_bio_cnt++;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
-		struct drbd_device *device = peer_device->device;
+	return 0;
 
-		kref_get(&device->kref);
-		rcu_read_unlock();
-		drbd_wait_ee_list_empty(device, &device->active_ee);
-		kref_put(&device->kref, drbd_destroy_device);
-		rcu_read_lock();
+fail:
+	while ((bio = bio_list_pop(&bios)))
+		bio_put(bio);
+	return err;
+}
+
+void drbd_remove_peer_req_interval(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_interval *i = &peer_req->i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->interval_lock, flags);
+	D_ASSERT(device, !drbd_interval_empty(i));
+	drbd_remove_interval(&device->requests, i);
+	drbd_clear_interval(i);
+	if (!drbd_interval_is_verify(&peer_req->i))
+		drbd_release_conflicts(device, i);
+	spin_unlock_irqrestore(&device->interval_lock, flags);
+}
+
+/**
+ * w_e_reissue() - Worker callback; Resubmit a bio
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways (unused in this callback)
+ */
+int w_e_reissue(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req =
+		container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	int err;
+	/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
+	   (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
+	   so that we can finish that epoch in drbd_may_finish_epoch().
+	   That is necessary if we already have a long chain of Epochs, before
+	   we realize that BARRIER is actually not supported */
+
+	/* As long as the -ENOTSUPP on the barrier is reported immediately
+	   that will never trigger. If it is reported late, we will just
+	   print that warning and continue correctly for all future requests
+	   with WO_BDEV_FLUSH */
+	if (previous_epoch(connection, peer_req->epoch))
+		drbd_warn(device, "Write ordering was not enforced (one time event)\n");
+
+	/* we still have a local reference,
+	 * get_ldev was done in receive_Data. */
+
+	peer_req->w.cb = e_end_block;
+	err = drbd_submit_peer_request(peer_req);
+	switch (err) {
+	case -ENOMEM:
+		peer_req->w.cb = w_e_reissue;
+		drbd_queue_work(&connection->sender_work,
+				&peer_req->w);
+		/* retry later */
+		fallthrough;
+	case 0:
+		/* keep worker happy and connection up */
+		return 0;
+
+	case -ENOSPC:
+		/* no other error expected, but anyways: */
+	default:
+		/* forget the object,
+		 * and cause a "Network failure" */
+		drbd_remove_peer_req_interval(peer_req);
+		drbd_al_complete_io(device, &peer_req->i);
+		drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
+		drbd_free_peer_req(peer_req);
+		drbd_err(device, "submit failed, triggering re-connect\n");
+		return err;
 	}
-	rcu_read_unlock();
+}
+
+static void conn_wait_done_ee_empty_or_disconnect(struct drbd_connection *connection)
+{
+	wait_event(connection->ee_wait,
+		atomic_read(&connection->done_ee_cnt) == 0
+		|| connection->cstate[NOW] < C_CONNECTED);
+}
+
+static void conn_wait_active_ee_empty_or_disconnect(struct drbd_connection *connection)
+{
+	if (atomic_read(&connection->active_ee_cnt) == 0)
+		return;
+
+	drbd_unplug_all_devices(connection);
+
+	wait_event(connection->ee_wait,
+		atomic_read(&connection->active_ee_cnt) == 0
+		|| connection->cstate[NOW] < C_CONNECTED);
 }
 
 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
 {
-	int rv;
+	struct drbd_transport_ops *tr_ops = &connection->transport.class->ops;
+	int rv, issue_flush;
 	struct p_barrier *p = pi->data;
 	struct drbd_epoch *epoch;
 
+	tr_ops->hint(&connection->transport, DATA_STREAM, QUICKACK);
+	drbd_unplug_all_devices(connection);
+
 	/* FIXME these are unacked on connection,
 	 * not a specific (peer)device.
 	 */
@@ -1586,41 +2010,48 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	 * Therefore we must send the barrier_ack after the barrier request was
 	 * completed. */
 	switch (connection->resource->write_ordering) {
+	case WO_BIO_BARRIER:
 	case WO_NONE:
 		if (rv == FE_RECYCLED)
 			return 0;
-
-		/* receiver context, in the writeout path of the other node.
-		 * avoid potential distributed deadlock */
-		epoch = kmalloc_obj(struct drbd_epoch, GFP_NOIO);
-		if (epoch)
-			break;
-		else
-			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
-		fallthrough;
+		break;
 
 	case WO_BDEV_FLUSH:
 	case WO_DRAIN_IO:
-		conn_wait_active_ee_empty(connection);
-		drbd_flush(connection);
+		if (rv == FE_STILL_LIVE) {
+			set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &connection->current_epoch->flags);
+			conn_wait_active_ee_empty_or_disconnect(connection);
+			rv = drbd_flush_after_epoch(connection, connection->current_epoch);
+		}
+		if (rv == FE_RECYCLED)
+			return 0;
 
-		if (atomic_read(&connection->current_epoch->epoch_size)) {
-			epoch = kmalloc_obj(struct drbd_epoch, GFP_NOIO);
-			if (epoch)
-				break;
+		/*
+		 * The ack_sender will send all the ACKs and barrier ACKs out,
+		 * since all EEs added to done_ee. We need to provide a new
+		 * epoch object for the EEs that come in soon.
+		 */
+		break;
+	}
+
+	/* receiver context, in the writeout path of the other node.
+	 * avoid potential distributed deadlock */
+	epoch = kzalloc_obj(struct drbd_epoch, GFP_NOIO);
+	if (!epoch) {
+		drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
+		issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &connection->current_epoch->flags);
+		conn_wait_active_ee_empty_or_disconnect(connection);
+		if (issue_flush) {
+			rv = drbd_flush_after_epoch(connection, connection->current_epoch);
+			if (rv == FE_RECYCLED)
+				return 0;
 		}
 
+		conn_wait_done_ee_empty_or_disconnect(connection);
+
 		return 0;
-	default:
-		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
-			 connection->resource->write_ordering);
-		return -EIO;
 	}
 
-	epoch->flags = 0;
-	atomic_set(&epoch->epoch_size, 0);
-	atomic_set(&epoch->active, 0);
-
 	spin_lock(&connection->epoch_lock);
 	if (atomic_read(&connection->current_epoch->epoch_size)) {
 		list_add(&epoch->list, &connection->current_epoch->list);
@@ -1635,15 +2066,25 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf
 	return 0;
 }
 
-/* quick wrapper in case payload size != request_size (write same) */
-static void drbd_csum_ee_size(struct crypto_shash *h,
-			      struct drbd_peer_request *r, void *d,
-			      unsigned int payload_size)
+/* pi->data points into some recv buffer, which may be
+ * re-used/recycled/overwritten by the next receive operation.
+ * (read_in_block via recv_resync_read) */
+static void p_req_detail_from_pi(struct drbd_connection *connection,
+		struct drbd_peer_request_details *d, struct packet_info *pi)
 {
-	unsigned int tmp = r->i.size;
-	r->i.size = payload_size;
-	drbd_csum_ee(h, r, d);
-	r->i.size = tmp;
+	struct p_trim *p = pi->data;
+	bool is_trim_or_zeroes = pi->cmd == P_TRIM || pi->cmd == P_ZEROES;
+	unsigned int digest_size =
+		pi->cmd != P_TRIM && connection->peer_integrity_tfm ?
+		crypto_shash_digestsize(connection->peer_integrity_tfm) : 0;
+
+	d->sector = be64_to_cpu(p->p_data.sector);
+	d->block_id = p->p_data.block_id;
+	d->peer_seq = be32_to_cpu(p->p_data.seq_num);
+	d->dp_flags = be32_to_cpu(p->p_data.dp_flags);
+	d->length = pi->size;
+	d->bi_size = is_trim_or_zeroes ? be32_to_cpu(p->size) : pi->size - digest_size;
+	d->digest_size = digest_size;
 }
 
 /* used from receive_RSDataReply (recv_resync_read)
@@ -1655,140 +2096,103 @@ static void drbd_csum_ee_size(struct crypto_shash *h,
  * both trim and write same have the bi_size ("data len to be affected")
  * as extra argument in the packet header.
  */
-static struct drbd_peer_request *
-read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-	      struct packet_info *pi) __must_hold(local)
+static int
+read_in_block(struct drbd_peer_request *peer_req, struct drbd_peer_request_details *d)
 {
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
 	struct drbd_device *device = peer_device->device;
-	const sector_t capacity = get_capacity(device->vdisk);
-	struct drbd_peer_request *peer_req;
-	struct page *page;
-	int digest_size, err;
-	unsigned int data_size = pi->size, ds;
-	void *dig_in = peer_device->connection->int_dig_in;
-	void *dig_vv = peer_device->connection->int_dig_vv;
-	unsigned long *data;
-	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
-	struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
-
-	digest_size = 0;
-	if (!trim && peer_device->connection->peer_integrity_tfm) {
-		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
-		/*
-		 * FIXME: Receive the incoming digest into the receive buffer
-		 *	  here, together with its struct p_data?
-		 */
-		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+	struct drbd_connection *connection = peer_device->connection;
+	const uint64_t capacity = get_capacity(device->vdisk);
+	void *dig_in = connection->int_dig_in;
+	void *dig_vv = connection->int_dig_vv;
+	struct drbd_transport *transport = &connection->transport;
+	struct drbd_transport_ops *tr_ops = &transport->class->ops;
+	int size, err;
+
+	if (d->digest_size) {
+		err = drbd_recv_into(connection, dig_in, d->digest_size);
 		if (err)
-			return NULL;
-		data_size -= digest_size;
-	}
-
-	/* assume request_size == data_size, but special case trim. */
-	ds = data_size;
-	if (trim) {
-		if (!expect(peer_device, data_size == 0))
-			return NULL;
-		ds = be32_to_cpu(trim->size);
-	} else if (zeroes) {
-		if (!expect(peer_device, data_size == 0))
-			return NULL;
-		ds = be32_to_cpu(zeroes->size);
+			return err;
 	}
 
-	if (!expect(peer_device, IS_ALIGNED(ds, 512)))
-		return NULL;
-	if (trim || zeroes) {
-		if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
-			return NULL;
-	} else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
-		return NULL;
+	if (!expect(peer_device, IS_ALIGNED(d->bi_size, 512)))
+		return -EINVAL;
+	/* The WSAME mechanism was removed in Linux 5.18,
+	 * and subsequently from drbd.
+	 * In theory, a "modern" drbd will never advertise support for
+	 * WRITE_SAME, so a compliant peer should never send a DP_WSAME
+	 * packet. If we receive one anyway, that's a protocol error.
+	 */
+	if (!expect(peer_device, (d->dp_flags & DP_WSAME) == 0))
+		return -EINVAL;
+	if (d->dp_flags & (DP_DISCARD|DP_ZEROES)) {
+		if (!expect(peer_device, d->bi_size <= (DRBD_MAX_BBIO_SECTORS << 9)))
+			return -EINVAL;
+	} else if (!expect(peer_device, d->bi_size <= DRBD_MAX_BIO_SIZE))
+		return -EINVAL;
 
-	/* even though we trust out peer,
+	/* even though we trust our peer,
 	 * we sometimes have to double check. */
-	if (sector + (ds>>9) > capacity) {
+	if (d->sector + (d->bi_size>>9) > capacity) {
 		drbd_err(device, "request from peer beyond end of local disk: "
 			"capacity: %llus < sector: %llus + size: %u\n",
-			(unsigned long long)capacity,
-			(unsigned long long)sector, ds);
-		return NULL;
+			capacity, d->sector, d->bi_size);
+		return -EINVAL;
 	}
 
-	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
-	 * "criss-cross" setup, that might cause write-out on some other DRBD,
-	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
-	if (!peer_req)
-		return NULL;
+	peer_req->block_id = d->block_id;
 
-	peer_req->flags |= EE_WRITE;
-	if (trim) {
-		peer_req->flags |= EE_TRIM;
-		return peer_req;
-	}
-	if (zeroes) {
-		peer_req->flags |= EE_ZEROOUT;
-		return peer_req;
+	if (d->length == 0)
+		return 0;
+
+	size = d->length - d->digest_size;
+	if (bio_list_empty(&peer_req->bios)) {
+		/* For a checksum resync, the bio was consumed for reading. */
+		err = peer_req_alloc_bio(peer_req, size, GFP_NOIO, REQ_OP_WRITE);
+		if (err)
+			return err;
 	}
+	err = tr_ops->recv_bio(transport, &peer_req->bios, size);
+	if (err)
+		return err;
 
-	/* receive payload size bytes into page chain */
-	ds = data_size;
-	page = peer_req->pages;
-	page_chain_for_each(page) {
-		unsigned len = min_t(int, ds, PAGE_SIZE);
-		data = kmap_local_page(page);
-		err = drbd_recv_all_warn(peer_device->connection, data, len);
-		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
-			drbd_err(device, "Fault injection: Corrupting data on receive\n");
-			data[0] = data[0] ^ (unsigned long)-1;
-		}
-		kunmap_local(data);
-		if (err) {
-			drbd_free_peer_req(device, peer_req);
-			return NULL;
-		}
-		ds -= len;
+	if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+		struct bio *bio = bio_list_peek(&peer_req->bios);
+		unsigned long *data;
+
+		drbd_err(device, "Fault injection: Corrupting data on receive, sector %llu\n",
+				d->sector);
+
+		data = bvec_virt(&bio->bi_io_vec[0]);
+		data[0] = ~data[0];
 	}
 
-	if (digest_size) {
-		drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
-		if (memcmp(dig_in, dig_vv, digest_size)) {
+	if (d->digest_size) {
+		drbd_csum_bios(connection->peer_integrity_tfm, &peer_req->bios, dig_vv);
+		if (memcmp(dig_in, dig_vv, d->digest_size)) {
 			drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
-				(unsigned long long)sector, data_size);
-			drbd_free_peer_req(device, peer_req);
-			return NULL;
+				d->sector, d->bi_size);
+			return -EINVAL;
 		}
 	}
-	device->recv_cnt += data_size >> 9;
-	return peer_req;
+	peer_device->recv_cnt += d->bi_size >> 9;
+	return 0;
 }
 
-/* drbd_drain_block() just takes a data block
- * out of the socket input buffer, and discards it.
- */
-static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
+static int ignore_remaining_packet(struct drbd_connection *connection, int size)
 {
-	struct page *page;
-	int err = 0;
-	void *data;
-
-	if (!data_size)
-		return 0;
-
-	page = drbd_alloc_pages(peer_device, 1, 1);
+	void *data_to_ignore;
 
-	data = kmap_local_page(page);
-	while (data_size) {
-		unsigned int len = min_t(int, data_size, PAGE_SIZE);
+	while (size) {
+		int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+		int rv = drbd_recv(connection, &data_to_ignore, s, 0);
+		if (rv < 0)
+			return rv;
 
-		err = drbd_recv_all_warn(peer_device->connection, data, len);
-		if (err)
-			break;
-		data_size -= len;
+		size -= rv;
 	}
-	kunmap_local(data);
-	drbd_free_pages(peer_device->device, page);
-	return err;
+
+	return 0;
 }
 
 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
@@ -1804,7 +2208,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 	digest_size = 0;
 	if (peer_device->connection->peer_integrity_tfm) {
 		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
-		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+		err = drbd_recv_into(peer_device->connection, dig_in, digest_size);
 		if (err)
 			return err;
 		data_size -= digest_size;
@@ -1812,7 +2216,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 
 	/* optimistically update recv_cnt.  if receiving fails below,
 	 * we disconnect anyways, and counters will be reset. */
-	peer_device->device->recv_cnt += data_size>>9;
+	peer_device->recv_cnt += data_size >> 9;
 
 	bio = req->master_bio;
 	D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
@@ -1820,7 +2224,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 	bio_for_each_segment(bvec, bio, iter) {
 		void *mapped = bvec_kmap_local(&bvec);
 		expect = min_t(int, data_size, bvec.bv_len);
-		err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
+		err = drbd_recv_into(peer_device->connection, mapped, expect);
 		kunmap_local(mapped);
 		if (err)
 			return err;
@@ -1839,250 +2243,662 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req
 	return 0;
 }
 
-/*
- * e_end_resync_block() is called in ack_sender context via
- * drbd_finish_peer_reqs().
- */
-static int e_end_resync_block(struct drbd_work *w, int unused)
+static bool bits_in_sync(struct drbd_peer_device *peer_device, sector_t sector_start, sector_t sector_end)
 {
-	struct drbd_peer_request *peer_req =
-		container_of(w, struct drbd_peer_request, w);
-	struct drbd_peer_device *peer_device = peer_req->peer_device;
 	struct drbd_device *device = peer_device->device;
-	sector_t sector = peer_req->i.sector;
-	int err;
-
-	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
-
-	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
-		drbd_set_in_sync(peer_device, sector, peer_req->i.size);
-		err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
-	} else {
-		/* Record failure to sync */
-		drbd_rs_failed_io(peer_device, sector, peer_req->i.size);
-
-		err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+	struct drbd_bitmap *bm = device->bitmap;
+
+	if (peer_device->repl_state[NOW] == L_ESTABLISHED ||
+			peer_device->repl_state[NOW] == L_SYNC_SOURCE ||
+			peer_device->repl_state[NOW] == L_SYNC_TARGET ||
+			peer_device->repl_state[NOW] == L_PAUSED_SYNC_S ||
+			peer_device->repl_state[NOW] == L_PAUSED_SYNC_T) {
+		if (drbd_bm_total_weight(peer_device) == 0)
+			return true;
+		if (drbd_bm_count_bits(device, peer_device->bitmap_index,
+					bm_sect_to_bit(bm, sector_start),
+					bm_sect_to_bit(bm, sector_end - 1)) == 0)
+			return true;
 	}
-	dec_unacked(device);
-
-	return err;
+	return false;
 }
 
-static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
-			    struct packet_info *pi) __releases(local)
+static void update_peers_for_interval(struct drbd_peer_device *peer_device,
+		struct drbd_interval *interval)
 {
 	struct drbd_device *device = peer_device->device;
-	struct drbd_peer_request *peer_req;
+	struct drbd_bitmap *bm = device->bitmap;
+	u64 mask = NODE_MASK(peer_device->node_id), im;
+	struct drbd_peer_device *p;
+	sector_t sector_end = interval->sector + (interval->size >> SECTOR_SHIFT);
+
+	/* Only send P_PEERS_IN_SYNC if we are actually in sync with this peer. */
+	if (drbd_bm_count_bits(device, peer_device->bitmap_index,
+				bm_sect_to_bit(bm, interval->sector),
+				bm_sect_to_bit(bm, sector_end - 1)))
+		return;
 
-	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
-	if (!peer_req)
-		goto fail;
+	for_each_peer_device_ref(p, im, device) {
+		if (p == peer_device)
+			continue;
 
-	dec_rs_pending(peer_device);
+		if (bits_in_sync(p, interval->sector, sector_end))
+			mask |= NODE_MASK(p->node_id);
+	}
 
-	inc_unacked(device);
-	/* corresponding dec_unacked() in e_end_resync_block()
-	 * respective _drbd_clear_done_ee */
+	for_each_peer_device_ref(p, im, device) {
+		/* Only send to the peer whose bitmap bits have been cleared if
+		 * we are connected to that peer. The bits may have been
+		 * cleared by a P_PEERS_IN_SYNC from another peer while we are
+		 * connecting to this one. We mustn't send P_PEERS_IN_SYNC
+		 * during the initial connection handshake. */
+		if (p == peer_device && p->connection->cstate[NOW] != C_CONNECTED)
+			continue;
 
-	peer_req->w.cb = e_end_resync_block;
-	peer_req->opf = REQ_OP_WRITE;
-	peer_req->submit_jif = jiffies;
+		if (mask & NODE_MASK(p->node_id))
+			drbd_send_peers_in_sync(p, mask, interval->sector, interval->size);
+	}
+}
 
-	spin_lock_irq(&device->resource->req_lock);
-	list_add_tail(&peer_req->w.list, &device->sync_ee);
-	spin_unlock_irq(&device->resource->req_lock);
+/* Potentially send P_PEERS_IN_SYNC for a range with size that fits in an int. */
+static void update_peers_for_small_range(struct drbd_peer_device *peer_device,
+		sector_t sector, int size)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_interval interval;
 
-	atomic_add(pi->size >> 9, &device->rs_sect_ev);
-	if (drbd_submit_peer_request(peer_req) == 0)
-		return 0;
+	memset(&interval, 0, sizeof(interval));
+	drbd_clear_interval(&interval);
+	interval.sector = sector;
+	interval.size = size;
+	interval.type = INTERVAL_PEERS_IN_SYNC_LOCK;
 
-	/* don't care for the reason here */
-	drbd_err(device, "submit failed, triggering re-connect\n");
-	spin_lock_irq(&device->resource->req_lock);
-	list_del(&peer_req->w.list);
-	spin_unlock_irq(&device->resource->req_lock);
+	spin_lock_irq(&device->interval_lock);
+	if (drbd_find_conflict(device, &interval, 0)) {
+		spin_unlock_irq(&device->interval_lock);
+		return;
+	}
+	drbd_insert_interval(&device->requests, &interval);
+	/* Interval is not waiting for conflicts to resolve so mark as "submitted". */
+	set_bit(INTERVAL_SUBMITTED, &interval.flags);
+	spin_unlock_irq(&device->interval_lock);
 
-	drbd_free_peer_req(device, peer_req);
-fail:
-	put_ldev(device);
-	return -EIO;
+	/* Check for activity in the activity log extent _after_ locking the
+	 * interval. Otherwise a write might occur between checking and
+	 * locking. */
+	if (!drbd_al_active(device, sector, size))
+		update_peers_for_interval(peer_device, &interval);
+
+	spin_lock_irq(&device->interval_lock);
+	drbd_remove_interval(&device->requests, &interval);
+	drbd_release_conflicts(device, &interval);
+	spin_unlock_irq(&device->interval_lock);
 }
 
-static struct drbd_request *
-find_request(struct drbd_device *device, struct rb_root *root, u64 id,
-	     sector_t sector, bool missing_ok, const char *func)
+static void update_peers_for_range(struct drbd_peer_device *peer_device,
+		sector_t sector_start, sector_t sector_end)
 {
-	struct drbd_request *req;
+	struct drbd_device *device = peer_device->device;
+	unsigned int enr_start = sector_start >> (AL_EXTENT_SHIFT - SECTOR_SHIFT);
+	unsigned int enr_end = ((sector_end - 1) >> (AL_EXTENT_SHIFT - SECTOR_SHIFT)) + 1;
+	unsigned int enr;
 
-	/* Request object according to our peer */
-	req = (struct drbd_request *)(unsigned long)id;
-	if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
-		return req;
-	if (!missing_ok) {
-		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
-			(unsigned long)id, (unsigned long long)sector);
+	if (!get_ldev(device))
+		return;
+
+	for (enr = enr_start; enr < enr_end; enr++) {
+		sector_t enr_start_sector = max(sector_start,
+				((sector_t) enr) << (AL_EXTENT_SHIFT - SECTOR_SHIFT));
+		sector_t enr_end_sector = min(sector_end,
+				((sector_t) (enr + 1)) << (AL_EXTENT_SHIFT - SECTOR_SHIFT));
+
+		update_peers_for_small_range(peer_device,
+				enr_start_sector, (enr_end_sector - enr_start_sector) << SECTOR_SHIFT);
 	}
-	return NULL;
+
+	put_ldev(device);
 }
 
-static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+static int w_update_peers(struct drbd_work *w, int unused)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct drbd_request *req;
-	sector_t sector;
-	int err;
-	struct p_data *p = pi->data;
+	struct update_peers_work *upw = container_of(w, struct update_peers_work, w);
+	struct drbd_peer_device *peer_device = upw->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	if (connection->agreed_pro_version >= 110)
+		update_peers_for_range(peer_device, upw->sector_start, upw->sector_end);
 
-	sector = be64_to_cpu(p->sector);
+	kfree(upw);
 
-	spin_lock_irq(&device->resource->req_lock);
-	req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
-	spin_unlock_irq(&device->resource->req_lock);
-	if (unlikely(!req))
-		return -EIO;
+	kref_put(&device->kref, drbd_destroy_device);
 
-	err = recv_dless_read(peer_device, req, sector, pi->size);
-	if (!err)
-		req_mod(req, DATA_RECEIVED, peer_device);
-	/* else: nothing. handled from drbd_disconnect...
-	 * I don't think we may complete this just yet
-	 * in case we are "on-disconnect: freeze" */
+	kref_put(&connection->kref, drbd_destroy_connection);
 
-	return err;
+	return 0;
 }
 
-static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
+void drbd_queue_update_peers(struct drbd_peer_device *peer_device,
+		sector_t sector_start, sector_t sector_end)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	sector_t sector;
-	int err;
-	struct p_data *p = pi->data;
+	struct drbd_device *device = peer_device->device;
+	struct update_peers_work *upw;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	upw = kmalloc_obj(*upw, GFP_ATOMIC | __GFP_NOWARN);
+	if (upw) {
+		upw->sector_start = sector_start;
+		upw->sector_end = sector_end;
+		upw->w.cb = w_update_peers;
 
-	sector = be64_to_cpu(p->sector);
-	D_ASSERT(device, p->block_id == ID_SYNCER);
+		kref_get(&peer_device->device->kref);
 
-	if (get_ldev(device)) {
-		/* data is submitted to disk within recv_resync_read.
-		 * corresponding put_ldev done below on error,
-		 * or in drbd_peer_request_endio. */
-		err = recv_resync_read(peer_device, sector, pi);
+		kref_get(&peer_device->connection->kref);
+
+		upw->peer_device = peer_device;
+		drbd_queue_work(&device->resource->work, &upw->w);
 	} else {
 		if (drbd_ratelimit())
-			drbd_err(device, "Can not write resync data to local disk.\n");
+			drbd_warn(peer_device, "kmalloc(upw) failed.\n");
+	}
+}
 
-		err = drbd_drain_block(peer_device, pi->size);
+static void drbd_peers_in_sync_progress(struct drbd_peer_device *peer_device,
+		sector_t sector_start, sector_t sector_end)
+{
+	/* P_PEERS_IN_SYNC "steps" are represented by their start sector */
+	sector_t step = sector_start & ~PEERS_IN_SYNC_STEP_SECT_MASK;
+	sector_t end_step = sector_end & ~PEERS_IN_SYNC_STEP_SECT_MASK;
+	sector_t last_end = peer_device->last_in_sync_end;
+	sector_t last_step = last_end & ~PEERS_IN_SYNC_STEP_SECT_MASK;
+	sector_t last_step_end = min(get_capacity(peer_device->device->vdisk),
+			last_step + PEERS_IN_SYNC_STEP_SECT);
 
-		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
-	}
+	/* Send for last request if it was part way through a different step */
+	if (last_end > last_step && step != last_step)
+		drbd_queue_update_peers(peer_device, last_step, last_step_end);
 
-	atomic_add(pi->size >> 9, &device->rs_sect_in);
+	/* Send if the request reaches or passes a step boundary */
+	if (end_step != step)
+		drbd_queue_update_peers(peer_device, step, end_step);
 
-	return err;
+	peer_device->last_in_sync_end = sector_end;
+
+	/*
+	 * Consider scheduling a bitmap update to reduce the size of the next
+	 * resync if this one is disrupted.
+	 */
+	if (drbd_lazy_bitmap_update_due(peer_device))
+		drbd_peer_device_post_work(peer_device, RS_LAZY_BM_WRITE);
 }
 
-static void restart_conflicting_writes(struct drbd_device *device,
-				       sector_t sector, int size)
+static void drbd_check_peers_in_sync_progress(struct drbd_peer_device *peer_device)
 {
-	struct drbd_interval *i;
-	struct drbd_request *req;
+	struct drbd_connection *connection = peer_device->connection;
+	LIST_HEAD(completed);
+	struct drbd_peer_request *peer_req, *tmp;
 
-	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
-		if (!i->local)
-			continue;
-		req = container_of(i, struct drbd_request, i);
-		if (req->rq_state & RQ_LOCAL_PENDING ||
-		    !(req->rq_state & RQ_POSTPONED))
-			continue;
-		/* as it is RQ_POSTPONED, this will cause it to
-		 * be queued on the retry workqueue. */
-		__req_mod(req, CONFLICT_RESOLVED, NULL, NULL);
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_for_each_entry_safe(peer_req, tmp, &peer_device->resync_requests, recv_order) {
+		if (!test_bit(INTERVAL_COMPLETED, &peer_req->i.flags))
+			break;
+
+		drbd_peers_in_sync_progress(peer_device, peer_req->i.sector,
+			peer_req->i.sector + (peer_req->i.size >> SECTOR_SHIFT));
+
+		drbd_list_del_resync_request(peer_req);
+		list_add_tail(&peer_req->recv_order, &completed);
 	}
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	list_for_each_entry_safe(peer_req, tmp, &completed, recv_order)
+		drbd_free_peer_req(peer_req);
 }
 
-/*
- * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
- */
-static int e_end_block(struct drbd_work *w, int cancel)
+static void drbd_resync_request_complete(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+
+	/*
+	 * Free the pages now but leave the peer request until the
+	 * corresponding peers-in-sync has been scheduled.
+	 */
+	drbd_peer_req_strip_bio(peer_req);
+
+	/*
+	 * The interval is no longer in the tree, but use this flag anyway,
+	 * since it has an appropriate meaning. After setting the flag,
+	 * peer_req may be freed by another thread.
+	 */
+	set_bit(INTERVAL_COMPLETED, &peer_req->i.flags);
+	peer_req = NULL;
+
+	drbd_check_peers_in_sync_progress(peer_device);
+}
+
+/*
+ * e_end_resync_block() is called in ack_sender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
 {
 	struct drbd_peer_request *peer_req =
 		container_of(w, struct drbd_peer_request, w);
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
-	struct drbd_device *device = peer_device->device;
 	sector_t sector = peer_req->i.sector;
-	int err = 0, pcmd;
+	unsigned int size = peer_req->requested_size;
+	u64 block_id = peer_req->block_id;
+	int err;
 
-	if (peer_req->flags & EE_SEND_WRITE_ACK) {
-		if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
-			pcmd = (device->state.conn >= C_SYNC_SOURCE &&
-				device->state.conn <= C_PAUSED_SYNC_T &&
-				peer_req->flags & EE_MAY_SET_IN_SYNC) ?
-				P_RS_WRITE_ACK : P_WRITE_ACK;
-			err = drbd_send_ack(peer_device, pcmd, peer_req);
-			if (pcmd == P_RS_WRITE_ACK)
-				drbd_set_in_sync(peer_device, sector, peer_req->i.size);
-		} else {
-			err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
-			/* we expect it to be marked out of sync anyways...
-			 * maybe assert this?  */
-		}
-		dec_unacked(device);
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		drbd_set_in_sync(peer_device, sector, size);
+		err = drbd_send_ack_be(peer_device, P_RS_WRITE_ACK, sector, size, block_id);
+	} else {
+		/* Record failure to sync */
+		drbd_rs_failed_io(peer_device, sector, size);
+
+		err = drbd_send_ack_be(peer_device, P_RS_NEG_ACK, sector, size, block_id);
 	}
 
-	/* we delete from the conflict detection hash _after_ we sent out the
-	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
-	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
-		spin_lock_irq(&device->resource->req_lock);
-		D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
-		drbd_remove_epoch_entry_interval(device, peer_req);
-		if (peer_req->flags & EE_RESTART_REQUESTS)
-			restart_conflicting_writes(device, sector, peer_req->i.size);
-		spin_unlock_irq(&device->resource->req_lock);
-	} else
-		D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+	dec_unacked(peer_device);
 
-	drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+	/*
+	 * If INTERVAL_SUBMITTED is not set, this request was merged into
+	 * another discard. It has already been removed from the interval tree.
+	 */
+	if (test_bit(INTERVAL_SUBMITTED, &peer_req->i.flags))
+		drbd_remove_peer_req_interval(peer_req);
 
+	drbd_resync_request_complete(peer_req);
 	return err;
 }
 
-static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+static struct drbd_peer_request *find_resync_request(struct drbd_peer_device *peer_device,
+		unsigned long type_mask, sector_t sector, unsigned int size, u64 block_id)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_interval *i;
+	struct drbd_peer_request *peer_req = NULL;
+
+	spin_lock_irq(&device->interval_lock);
+	drbd_for_each_overlap(i, &device->requests, sector, size) {
+		struct drbd_peer_request *pr;
+
+		if (!test_bit(INTERVAL_READY_TO_SEND, &i->flags))
+			continue;
+
+		if (!(INTERVAL_TYPE_MASK(i->type) & type_mask))
+			continue;
+
+		if (i->sector != sector || i->size != size)
+			continue;
+
+		pr = container_of(i, struct drbd_peer_request, i);
+		/* With agreed_pro_version < 122, block_id is always ID_SYNCER. */
+		if (pr->peer_device == peer_device &&
+				(block_id == ID_SYNCER || pr->block_id == block_id)) {
+			peer_req = pr;
+			break;
+		}
+	}
+	spin_unlock_irq(&device->interval_lock);
+
+	if (peer_req)
+		D_ASSERT(peer_device, peer_req->i.size == size);
+	else if (drbd_ratelimit())
+		drbd_err(peer_device, "Unexpected resync reply at %llus+%u\n",
+				(unsigned long long) sector, size);
+
+	return peer_req;
+}
+
+static void drbd_cleanup_received_resync_write(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+
+	drbd_remove_peer_req_interval(peer_req);
+
+	atomic_sub(peer_req->i.size >> SECTOR_SHIFT, &device->rs_sect_ev);
+	dec_unacked(peer_device);
+
+	drbd_free_peer_req(peer_req);
+	put_ldev(device);
+
+	if (atomic_dec_and_test(&connection->backing_ee_cnt))
+		wake_up(&connection->ee_wait);
+}
+
+void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req)
 {
-	struct drbd_peer_request *peer_req =
-		container_of(w, struct drbd_peer_request, w);
 	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	bool conflict;
+	bool canceled;
+
+	spin_lock_irq(&device->interval_lock);
+	clear_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &peer_req->i.flags);
+	canceled = test_bit(INTERVAL_CANCELED, &peer_req->i.flags);
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+	conflict = drbd_find_conflict(device, &peer_req->i, 0);
+	if (!conflict)
+		set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
+	spin_unlock_irq(&device->interval_lock);
+
+	if (!conflict) {
+		int err = drbd_submit_peer_request(peer_req);
+		if (err) {
+			if (drbd_ratelimit())
+				drbd_err(device, "submit failed, triggering re-connect\n");
+
+			drbd_cleanup_received_resync_write(peer_req);
+			change_cstate(peer_device->connection, C_PROTOCOL_ERROR, CS_HARD);
+		}
+	} else if (canceled) {
+		drbd_cleanup_received_resync_write(peer_req);
+	}
+}
+
+static int recv_resync_read(struct drbd_peer_device *peer_device,
+			    struct drbd_peer_request *peer_req,
+			    struct drbd_peer_request_details *d)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	unsigned int size;
+	sector_t sector;
+	int err;
+	u64 im;
+
+	err = read_in_block(peer_req, d);
+	if (err)
+		return err;
+
+	if (test_bit(UNSTABLE_RESYNC, &peer_device->flags))
+		clear_bit(STABLE_RESYNC, &device->flags);
+
+	dec_rs_pending(peer_device);
+
+	inc_unacked(peer_device);
+	/* corresponding dec_unacked() in e_end_resync_block()
+	 * respective _drbd_clear_done_ee */
+
+	peer_req->w.cb = e_end_resync_block;
+	peer_req->submit_jif = jiffies;
+
+	atomic_add(d->bi_size >> 9, &device->rs_sect_ev);
+
+	sector = peer_req->i.sector;
+	size = peer_req->i.size;
+
+	/* Setting all peers out of sync here. The sync source peer will be
+	 * set in sync when the write completes. The sync source will soon
+	 * set other peers in sync with a P_PEERS_IN_SYNC packet.
+	 */
+	drbd_set_all_out_of_sync(device, sector, size);
+
+	atomic_inc(&connection->backing_ee_cnt);
+	drbd_conflict_submit_resync_request(peer_req);
+	peer_req = NULL; /* since submitted, might be destroyed already */
+
+	drbd_process_rs_discards(peer_device, false);
+
+	for_each_peer_device_ref(peer_device, im, device) {
+		enum drbd_repl_state repl_state = peer_device->repl_state[NOW];
+
+		if (repl_is_sync_source(repl_state) || repl_state == L_WF_BITMAP_S)
+			drbd_send_out_of_sync(peer_device, sector, size);
+	}
+	return 0;
+}
+
+/* caller must hold interval_lock */
+static struct drbd_request *
+find_request(struct drbd_device *device, enum drbd_interval_type type, u64 id,
+	     sector_t sector, bool missing_ok, const char *func)
+{
+	struct rb_root *root = type == INTERVAL_LOCAL_READ ? &device->read_requests : &device->requests;
+	struct drbd_request *req;
+
+	/* Request object according to our peer */
+	req = (struct drbd_request *)(unsigned long)id;
+	if (drbd_contains_interval(root, sector, &req->i) && req->i.type == type)
+		return req;
+	if (!missing_ok) {
+		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+			(unsigned long)id, (unsigned long long)sector);
+	}
+	return NULL;
+}
+
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct drbd_request *req;
+	sector_t sector;
 	int err;
+	struct p_data *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&device->interval_lock);
+	req = find_request(device, INTERVAL_LOCAL_READ, p->block_id, sector, false, __func__);
+	spin_unlock_irq(&device->interval_lock);
+	if (unlikely(!req))
+		return -EIO;
 
-	err = drbd_send_ack(peer_device, ack, peer_req);
-	dec_unacked(peer_device->device);
+	err = recv_dless_read(peer_device, req, sector, pi->size);
+	if (!err)
+		req_mod(req, DATA_RECEIVED, peer_device);
+	/* else: nothing. handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
 
 	return err;
 }
 
-static int e_send_superseded(struct drbd_work *w, int unused)
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @peer_device: DRBD peer device.
+ * @cmd:	Packet command code.
+ * @sector:	sector, needs to be in big endian byte order
+ * @blksize:	size in byte, needs to be in big endian byte order
+ * @block_id:	Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+			  u64 sector, u32 blksize, u64 block_id)
+{
+	struct p_block_ack *p;
+
+	if (peer_device->repl_state[NOW] < L_ESTABLISHED)
+		return -EIO;
+
+	p = drbd_prepare_command(peer_device, sizeof(*p), CONTROL_STREAM);
+	if (!p)
+		return -EIO;
+	p->sector = sector;
+	p->block_id = block_id;
+	p->blksize = blksize;
+	p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->packet_seq));
+
+	if (peer_device->connection->agreed_pro_version < 122) {
+		switch (cmd) {
+		case P_RS_NEG_ACK:
+			cmd = P_NEG_ACK;
+			p->block_id = ID_SYNCER;
+			break;
+		case P_WRITE_ACK_IN_SYNC:
+			cmd = P_RS_WRITE_ACK;
+			break;
+		case P_RS_WRITE_ACK:
+			p->block_id = ID_SYNCER;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return drbd_send_command(peer_device, cmd, CONTROL_STREAM);
+}
+
+static int drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		  struct drbd_peer_request_details *d)
+{
+	return _drbd_send_ack(peer_device, cmd,
+			      cpu_to_be64(d->sector),
+			      cpu_to_be32(d->bi_size),
+			      d->block_id);
+}
+
+/* Send an ack packet with a block ID that is already in big endian byte order. */
+int drbd_send_ack_be(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		      sector_t sector, int size, u64 block_id)
+{
+	return _drbd_send_ack(peer_device, cmd, cpu_to_be64(sector), cpu_to_be32(size), block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @peer_device: DRBD peer device
+ * @cmd:	packet command code
+ * @peer_req:	peer request
+ */
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		  struct drbd_peer_request *peer_req)
+{
+	return _drbd_send_ack(peer_device, cmd,
+			      cpu_to_be64(peer_req->i.sector),
+			      cpu_to_be32(peer_req->i.size),
+			      peer_req->block_id);
+}
+
+int drbd_send_ov_result(struct drbd_peer_device *peer_device, sector_t sector, int blksize,
+		u64 block_id, enum ov_result result)
+{
+	struct p_ov_result *p;
+
+	if (peer_device->connection->agreed_pro_version < 122)
+		/* Misuse the block_id field to signal if the blocks are in sync or not. */
+		return _drbd_send_ack(peer_device, P_OV_RESULT,
+				cpu_to_be64(sector),
+				cpu_to_be32(blksize),
+				cpu_to_be64(drbd_ov_result_to_block_id(result)));
+
+	if (peer_device->repl_state[NOW] < L_ESTABLISHED)
+		return -EIO;
+
+	p = drbd_prepare_command(peer_device, sizeof(*p), CONTROL_STREAM);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(sector);
+	p->block_id = block_id;
+	p->blksize = cpu_to_be32(blksize);
+	p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->packet_seq));
+	p->result = cpu_to_be32(result);
+	p->pad = 0;
+
+	return drbd_send_command(peer_device, P_OV_RESULT_ID, CONTROL_STREAM);
+}
+
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
 {
-	return e_send_ack(w, P_SUPERSEDED);
+	struct drbd_peer_request_details d;
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct drbd_peer_request *peer_req;
+	int err;
+
+	p_req_detail_from_pi(connection, &d, pi);
+	pi->data = NULL;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_RESYNC_WRITE),
+			d.sector, d.bi_size, d.block_id);
+	if (!peer_req)
+		return -EIO;
+
+	if (get_ldev(device)) {
+		err = recv_resync_read(peer_device, peer_req, &d);
+		if (err)
+			put_ldev(device);
+	} else {
+		drbd_err_ratelimit(device, "Cannot write resync data to local disk.\n");
+
+		err = ignore_remaining_packet(connection, pi->size);
+
+		drbd_send_ack_dp(peer_device, P_RS_NEG_ACK, &d);
+
+		dec_rs_pending(peer_device);
+		drbd_remove_peer_req_interval(peer_req);
+		drbd_free_peer_req(peer_req);
+	}
+
+	rs_sectors_came_in(peer_device, d.bi_size);
+
+	return err;
 }
 
-static int e_send_retry_write(struct drbd_work *w, int unused)
+/*
+ * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
 {
 	struct drbd_peer_request *peer_req =
 		container_of(w, struct drbd_peer_request, w);
-	struct drbd_connection *connection = peer_req->peer_device->connection;
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+	sector_t sector = peer_req->i.sector;
+	struct drbd_epoch *epoch;
+	int err = 0, pcmd;
+
+	if (peer_req->flags & EE_IS_BARRIER) {
+		epoch = previous_epoch(connection, peer_req->epoch);
+		if (epoch)
+			drbd_may_finish_epoch(connection, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0));
+	}
+
+	if (peer_req->flags & EE_SEND_WRITE_ACK) {
+		if (unlikely(peer_req->flags & EE_WAS_ERROR)) {
+			pcmd = P_NEG_ACK;
+			/* we expect it to be marked out of sync anyways...
+			 * maybe assert this?  */
+		} else if (peer_device->repl_state[NOW] >= L_SYNC_SOURCE &&
+			   peer_device->repl_state[NOW] <= L_PAUSED_SYNC_T &&
+			   peer_req->flags & EE_MAY_SET_IN_SYNC) {
+			pcmd = P_WRITE_ACK_IN_SYNC;
+			drbd_set_in_sync(peer_device, sector, peer_req->i.size);
+		} else
+			pcmd = P_WRITE_ACK;
+		err = drbd_send_ack(peer_device, pcmd, peer_req);
+		dec_unacked(peer_device);
+	}
+
+	drbd_remove_peer_req_interval(peer_req);
+
+	if (connection->agreed_pro_version < 110) {
+		drbd_al_complete_io(device, &peer_req->i);
+		drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+		drbd_free_peer_req(peer_req);
+	} else {
+		drbd_peer_req_strip_bio(peer_req);
+		drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+		/* Do not use peer_req after this point. We may have sent the
+		 * corresponding barrier and received the corresponding peer ack. As a
+		 * result, peer_req may have been freed. */
+	}
 
-	return e_send_ack(w, connection->agreed_pro_version >= 100 ?
-			     P_RETRY_WRITE : P_SUPERSEDED);
+	return err;
 }
 
 static bool seq_greater(u32 a, u32 b)
@@ -2102,42 +2918,17 @@ static u32 seq_max(u32 a, u32 b)
 
 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
 {
-	struct drbd_device *device = peer_device->device;
 	unsigned int newest_peer_seq;
 
-	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
-		spin_lock(&device->peer_seq_lock);
-		newest_peer_seq = seq_max(device->peer_seq, peer_seq);
-		device->peer_seq = newest_peer_seq;
-		spin_unlock(&device->peer_seq_lock);
-		/* wake up only if we actually changed device->peer_seq */
+	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->transport.flags)) {
+		spin_lock_bh(&peer_device->peer_seq_lock);
+		newest_peer_seq = seq_max(peer_device->peer_seq, peer_seq);
+		peer_device->peer_seq = newest_peer_seq;
+		spin_unlock_bh(&peer_device->peer_seq_lock);
+		/* wake up only if we actually changed peer_device->peer_seq */
 		if (peer_seq == newest_peer_seq)
-			wake_up(&device->seq_wait);
-	}
-}
-
-static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
-{
-	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
-}
-
-/* maybe change sync_ee into interval trees as well? */
-static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
-{
-	struct drbd_peer_request *rs_req;
-	bool rv = false;
-
-	spin_lock_irq(&device->resource->req_lock);
-	list_for_each_entry(rs_req, &device->sync_ee, w.list) {
-		if (overlaps(peer_req->i.sector, peer_req->i.size,
-			     rs_req->i.sector, rs_req->i.size)) {
-			rv = true;
-			break;
-		}
+			wake_up(&peer_device->device->seq_wait);
 	}
-	spin_unlock_irq(&device->resource->req_lock);
-
-	return rv;
 }
 
 /* Called from receive_Data.
@@ -2149,9 +2940,9 @@ static bool overlapping_resync_write(struct drbd_device *device, struct drbd_pee
  *
  * Note: we don't care for Ack packets overtaking P_DATA packets.
  *
- * In case packet_seq is larger than device->peer_seq number, there are
+ * In case packet_seq is larger than peer_device->peer_seq number, there are
  * outstanding packets on the msock. We wait for them to arrive.
- * In case we are the logically next packet, we update device->peer_seq
+ * In case we are the logically next packet, we update peer_device->peer_seq
  * ourselves. Correctly handles 32bit wrap around.
  *
  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
@@ -2163,18 +2954,18 @@ static bool overlapping_resync_write(struct drbd_device *device, struct drbd_pee
  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
 {
-	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
 	DEFINE_WAIT(wait);
 	long timeout;
 	int ret = 0, tp;
 
-	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+	if (!test_bit(RESOLVE_CONFLICTS, &connection->transport.flags))
 		return 0;
 
-	spin_lock(&device->peer_seq_lock);
+	spin_lock_bh(&peer_device->peer_seq_lock);
 	for (;;) {
-		if (!seq_greater(peer_seq - 1, device->peer_seq)) {
-			device->peer_seq = seq_max(device->peer_seq, peer_seq);
+		if (!seq_greater(peer_seq - 1, peer_device->peer_seq)) {
+			peer_device->peer_seq = seq_max(peer_device->peer_seq, peer_seq);
 			break;
 		}
 
@@ -2184,28 +2975,28 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
 		}
 
 		rcu_read_lock();
-		tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+		tp = rcu_dereference(connection->transport.net_conf)->two_primaries;
 		rcu_read_unlock();
 
 		if (!tp)
 			break;
 
 		/* Only need to wait if two_primaries is enabled */
-		prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
-		spin_unlock(&device->peer_seq_lock);
+		prepare_to_wait(&peer_device->device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock_bh(&peer_device->peer_seq_lock);
 		rcu_read_lock();
-		timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+		timeout = rcu_dereference(connection->transport.net_conf)->ping_timeo*HZ/10;
 		rcu_read_unlock();
 		timeout = schedule_timeout(timeout);
-		spin_lock(&device->peer_seq_lock);
+		spin_lock_bh(&peer_device->peer_seq_lock);
 		if (!timeout) {
 			ret = -ETIMEDOUT;
-			drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
+			drbd_err(peer_device, "Timed out waiting for missing ack packets; disconnecting\n");
 			break;
 		}
 	}
-	spin_unlock(&device->peer_seq_lock);
-	finish_wait(&device->seq_wait, &wait);
+	spin_unlock_bh(&peer_device->peer_seq_lock);
+	finish_wait(&peer_device->device->seq_wait, &wait);
 	return ret;
 }
 
@@ -2215,182 +3006,268 @@ static enum req_op wire_flags_to_bio_op(u32 dpf)
 		return REQ_OP_WRITE_ZEROES;
 	if (dpf & DP_DISCARD)
 		return REQ_OP_DISCARD;
-	else
-		return REQ_OP_WRITE;
+	return REQ_OP_WRITE;
 }
 
 /* see also bio_flags_to_wire() */
 static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
 {
-	return wire_flags_to_bio_op(dpf) |
-		(dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
-		(dpf & DP_FUA ? REQ_FUA : 0) |
-		(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
-}
+	blk_opf_t opf = wire_flags_to_bio_op(dpf) |
+		(dpf & DP_RW_SYNC ? REQ_SYNC : 0);
 
-static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
-				    unsigned int size)
-{
-	struct drbd_peer_device *peer_device = first_peer_device(device);
-	struct drbd_interval *i;
+	/* we used to communicate one bit only in older DRBD */
+	if (connection->agreed_pro_version >= 95)
+		opf |= (dpf & DP_FUA ? REQ_FUA : 0) |
+			    (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
 
-    repeat:
-	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
-		struct drbd_request *req;
-		struct bio_and_error m;
+	return opf;
+}
 
-		if (!i->local)
-			continue;
-		req = container_of(i, struct drbd_request, i);
-		if (!(req->rq_state & RQ_POSTPONED))
-			continue;
-		req->rq_state &= ~RQ_POSTPONED;
-		__req_mod(req, NEG_ACKED, peer_device, &m);
-		spin_unlock_irq(&device->resource->req_lock);
-		if (m.bio)
-			complete_master_bio(device, &m);
-		spin_lock_irq(&device->resource->req_lock);
-		goto repeat;
+static void drbd_wait_for_activity_log_extents(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	struct lru_cache *al;
+	int nr_al_extents;
+	int nr, used, ecnt;
+
+	/* Let the activity log know we are about to use it.
+	 * See also drbd_request_prepare() for the "request" entry point. */
+	nr_al_extents = interval_to_al_extents(&peer_req->i);
+	ecnt = atomic_add_return(nr_al_extents, &device->wait_for_actlog_ecnt);
+
+	spin_lock_irq(&device->al_lock);
+	al = device->act_log;
+	nr = al->nr_elements;
+	used = al->used;
+	spin_unlock_irq(&device->al_lock);
+
+	/* note: due to the slight delay between being accounted in "used" after
+	 * being committed to the activity log with drbd_al_begin_io_commit(),
+	 * and being subtracted from "wait_for_actlog_ecnt" in __drbd_submit_peer_request(),
+	 * this can err, but only on the conservative side (overestimating ecnt).
+	 * ecnt also includes any requests which are held due to conflicts,
+	 * conservatively overestimating the number of activity log extents
+	 * required. */
+	if (ecnt > nr - used) {
+		conn_wait_active_ee_empty_or_disconnect(connection);
+		drbd_flush_after_epoch(connection, NULL);
+		conn_wait_done_ee_empty_or_disconnect(connection);
+
+		/* would this peer even understand me? */
+		if (connection->agreed_pro_version >= 114)
+			drbd_send_confirm_stable(peer_req);
 	}
 }
 
-static int handle_write_conflicts(struct drbd_device *device,
-				  struct drbd_peer_request *peer_req)
+static int drbd_peer_write_conflicts(struct drbd_peer_request *peer_req)
 {
-	struct drbd_connection *connection = peer_req->peer_device->connection;
-	bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
 	sector_t sector = peer_req->i.sector;
 	const unsigned int size = peer_req->i.size;
 	struct drbd_interval *i;
-	bool equal;
-	int err;
 
-	/*
-	 * Inserting the peer request into the write_requests tree will prevent
-	 * new conflicting local requests from being added.
-	 */
-	drbd_insert_interval(&device->write_requests, &peer_req->i);
+	i = drbd_find_conflict(device, &peer_req->i, CONFLICT_FLAG_APPLICATION_ONLY);
 
-    repeat:
-	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
-		if (i == &peer_req->i)
-			continue;
-		if (i->completed)
+	if (i) {
+		drbd_alert(device, "Concurrent writes detected: "
+				"local=%llus +%u, remote=%llus +%u\n",
+				(unsigned long long) i->sector, i->size,
+				(unsigned long long) sector, size);
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+static void drbd_queue_peer_request(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	atomic_inc(&device->wait_for_actlog);
+	spin_lock(&device->submit.lock);
+	list_add_tail(&peer_req->w.list, &device->submit.peer_writes);
+	spin_unlock(&device->submit.lock);
+	queue_work(device->submit.wq, &device->submit.worker);
+	/* do_submit() may sleep internally on al_wait, too */
+	wake_up(&device->al_wait);
+}
+
+static struct drbd_peer_request *find_released_peer_request(struct drbd_resource *resource, unsigned int node_id, u64 dagtag)
+{
+	struct drbd_connection *connection;
+	struct drbd_peer_request *released_peer_req = NULL;
+
+	read_lock_irq(&resource->state_rwlock);
+	for_each_connection(connection, resource) {
+		struct drbd_peer_request *peer_req;
+
+		/* Skip if we are not connected. If we are in the process of
+		 * disconnecting, the requests on dagtag_wait_ee will be
+		 * cleared up. Do not interfere with that process. */
+		if (connection->cstate[NOW] < C_CONNECTED)
 			continue;
 
-		if (!i->local) {
-			/*
-			 * Our peer has sent a conflicting remote request; this
-			 * should not happen in a two-node setup.  Wait for the
-			 * earlier peer request to complete.
-			 */
-			err = drbd_wait_misc(device, i);
-			if (err)
-				goto out;
-			goto repeat;
+		spin_lock(&connection->peer_reqs_lock);
+		list_for_each_entry(peer_req, &connection->dagtag_wait_ee, w.list) {
+			if (!peer_req->depend_dagtag ||
+					peer_req->depend_dagtag_node_id != node_id ||
+					peer_req->depend_dagtag > dagtag)
+				continue;
+
+			dynamic_drbd_dbg(peer_req->peer_device, "%s at %llus+%u: Wait for dagtag %llus from peer %u complete\n",
+					drbd_interval_type_str(&peer_req->i),
+					(unsigned long long) peer_req->i.sector, peer_req->i.size,
+					(unsigned long long) peer_req->depend_dagtag,
+					peer_req->depend_dagtag_node_id);
+
+			list_del(&peer_req->w.list);
+			released_peer_req = peer_req;
+			break;
 		}
+		spin_unlock(&connection->peer_reqs_lock);
 
-		equal = i->sector == sector && i->size == size;
-		if (resolve_conflicts) {
-			/*
-			 * If the peer request is fully contained within the
-			 * overlapping request, it can be considered overwritten
-			 * and thus superseded; otherwise, it will be retried
-			 * once all overlapping requests have completed.
-			 */
-			bool superseded = i->sector <= sector && i->sector +
-				       (i->size >> 9) >= sector + (size >> 9);
-
-			if (!equal)
-				drbd_alert(device, "Concurrent writes detected: "
-					       "local=%llus +%u, remote=%llus +%u, "
-					       "assuming %s came first\n",
-					  (unsigned long long)i->sector, i->size,
-					  (unsigned long long)sector, size,
-					  superseded ? "local" : "remote");
-
-			peer_req->w.cb = superseded ? e_send_superseded :
-						   e_send_retry_write;
-			list_add_tail(&peer_req->w.list, &device->done_ee);
-			/* put is in drbd_send_acks_wf() */
-			kref_get(&device->kref);
-			if (!queue_work(connection->ack_sender,
-					&peer_req->peer_device->send_acks_work))
-				kref_put(&device->kref, drbd_destroy_device);
+		if (released_peer_req)
+			break;
+	}
+	read_unlock_irq(&resource->state_rwlock);
 
-			err = -ENOENT;
-			goto out;
-		} else {
-			struct drbd_request *req =
-				container_of(i, struct drbd_request, i);
+	return released_peer_req;
+}
 
-			if (!equal)
-				drbd_alert(device, "Concurrent writes detected: "
-					       "local=%llus +%u, remote=%llus +%u\n",
-					  (unsigned long long)i->sector, i->size,
-					  (unsigned long long)sector, size);
+static void release_dagtag_wait(struct drbd_resource *resource, unsigned int node_id, u64 dagtag)
+{
+	struct drbd_peer_request *peer_req;
 
-			if (req->rq_state & RQ_LOCAL_PENDING ||
-			    !(req->rq_state & RQ_POSTPONED)) {
-				/*
-				 * Wait for the node with the discard flag to
-				 * decide if this request has been superseded
-				 * or needs to be retried.
-				 * Requests that have been superseded will
-				 * disappear from the write_requests tree.
-				 *
-				 * In addition, wait for the conflicting
-				 * request to finish locally before submitting
-				 * the conflicting peer request.
-				 */
-				err = drbd_wait_misc(device, &req->i);
-				if (err) {
-					_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
-					fail_postponed_requests(device, sector, size);
-					goto out;
-				}
-				goto repeat;
-			}
-			/*
-			 * Remember to restart the conflicting requests after
-			 * the new peer request has completed.
-			 */
-			peer_req->flags |= EE_RESTART_REQUESTS;
-		}
+	while ((peer_req = find_released_peer_request(resource, node_id, dagtag))) {
+		atomic_inc(&peer_req->peer_device->connection->backing_ee_cnt);
+		drbd_conflict_submit_peer_read(peer_req);
 	}
-	err = 0;
+}
+
+static void set_connection_dagtag(struct drbd_connection *connection, u64 dagtag)
+{
+	atomic64_set(&connection->last_dagtag_sector, dagtag);
+	set_bit(RECEIVED_DAGTAG, &connection->flags);
+
+	release_dagtag_wait(connection->resource, connection->peer_node_id, dagtag);
+}
+
+static void submit_peer_request_activity_log(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int err;
+	int nr_al_extents = interval_to_al_extents(&peer_req->i);
+
+	if (nr_al_extents != 1 || !drbd_al_begin_io_fastpath(device, &peer_req->i)) {
+		drbd_queue_peer_request(device, peer_req);
+		return;
+	}
+
+	peer_req->flags |= EE_IN_ACTLOG;
+	atomic_sub(nr_al_extents, &device->wait_for_actlog_ecnt);
 
-    out:
+	err = drbd_submit_peer_request(peer_req);
 	if (err)
-		drbd_remove_epoch_entry_interval(device, peer_req);
-	return err;
+		drbd_cleanup_after_failed_submit_peer_write(peer_req);
+}
+
+void drbd_conflict_submit_peer_write(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	bool conflict = false;
+
+	spin_lock_irq(&device->interval_lock);
+	clear_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &peer_req->i.flags);
+	conflict = drbd_find_conflict(device, &peer_req->i, 0);
+	if (!conflict)
+		set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
+	spin_unlock_irq(&device->interval_lock);
+
+	if (!conflict)
+		submit_peer_request_activity_log(peer_req);
 }
 
-/* mirrored write */
+/* mirrored write
+ *
+ * Request handling flow:
+ *
+ *                      conflict
+ * receive_Data -----------------------+
+ *       |                             |
+ *       |                            ...
+ *       |                             |
+ *       |                             v
+ *       |                   drbd_do_submit_conflict
+ *       |                             |
+ *       |                             v
+ *       +------------------ drbd_conflict_submit_peer_write
+ *       |
+ *       v                         wait for AL
+ * submit_peer_request_activity_log --------> drbd_queue_peer_request
+ *       |                                         |
+ *       |                                        ...
+ *       |                                         |
+ *       |                                         v    AL extent active
+ *       |                                    do_submit ----------------+
+ *       |                                         |                    |
+ *       |                                         v                    v
+ *       |                              send_and_submit_pending   submit_fast_path
+ *       |                                         |                    |
+ *       v                                         v                    |
+ * drbd_submit_peer_request <-------- __drbd_submit_peer_request <------+
+ *       |
+ *      ... backing device
+ *       |
+ *       v
+ * drbd_peer_request_endio
+ *       |
+ *       v
+ * drbd_endio_write_sec_final
+ *       |
+ *      ... done_ee
+ *       |
+ *       v
+ * drbd_finish_peer_reqs
+ *       |
+ *       v
+ * e_end_block
+ *       |
+ *      ... via peer
+ *       |
+ *       v
+ * got_peer_ack
+ */
 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
 	struct net_conf *nc;
-	sector_t sector;
 	struct drbd_peer_request *peer_req;
-	struct p_data *p = pi->data;
-	u32 peer_seq = be32_to_cpu(p->seq_num);
-	u32 dp_flags;
+	struct drbd_peer_request_details d;
 	int err, tp;
+	bool conflict = false;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
 		return -EIO;
 	device = peer_device->device;
 
+	if (pi->cmd == P_TRIM)
+		D_ASSERT(peer_device, pi->size == 0);
+
+	p_req_detail_from_pi(connection, &d, pi);
+	pi->data = NULL;
+
 	if (!get_ldev(device)) {
 		int err2;
 
-		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
-		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+		err = wait_for_and_update_peer_seq(peer_device, d.peer_seq);
+		drbd_send_ack_dp(peer_device, P_NEG_ACK, &d);
 		atomic_inc(&connection->current_epoch->epoch_size);
-		err2 = drbd_drain_block(peer_device, pi->size);
+		err2 = ignore_remaining_packet(connection, pi->size);
 		if (!err)
 			err = err2;
 		return err;
@@ -2402,71 +3279,107 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 	 * end of this function.
 	 */
 
-	sector = be64_to_cpu(p->sector);
-	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+	peer_req = drbd_alloc_peer_req(peer_device, GFP_TRY, d.bi_size,
+				       wire_flags_to_bio(connection, d.dp_flags));
 	if (!peer_req) {
 		put_ldev(device);
 		return -EIO;
 	}
+	peer_req->i.size = d.bi_size; /* storage size */
+	peer_req->i.sector = d.sector;
+	peer_req->i.type = INTERVAL_PEER_WRITE;
+
+	err = read_in_block(peer_req, &d);
+	if (err) {
+		drbd_free_peer_req(peer_req);
+		put_ldev(device);
+		return err;
+	}
+
+	if (pi->cmd == P_TRIM)
+		peer_req->flags |= EE_TRIM;
+	else if (pi->cmd == P_ZEROES)
+		peer_req->flags |= EE_ZEROOUT;
 
 	peer_req->w.cb = e_end_block;
 	peer_req->submit_jif = jiffies;
-	peer_req->flags |= EE_APPLICATION;
 
-	dp_flags = be32_to_cpu(p->dp_flags);
-	peer_req->opf = wire_flags_to_bio(connection, dp_flags);
 	if (pi->cmd == P_TRIM) {
 		D_ASSERT(peer_device, peer_req->i.size > 0);
-		D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
-		D_ASSERT(peer_device, peer_req->pages == NULL);
+		D_ASSERT(peer_device, d.dp_flags & DP_DISCARD);
+		D_ASSERT(peer_device, bio_op(peer_req->bios.head) == REQ_OP_DISCARD);
 		/* need to play safe: an older DRBD sender
 		 * may mean zero-out while sending P_TRIM. */
 		if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
 			peer_req->flags |= EE_ZEROOUT;
 	} else if (pi->cmd == P_ZEROES) {
 		D_ASSERT(peer_device, peer_req->i.size > 0);
-		D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
-		D_ASSERT(peer_device, peer_req->pages == NULL);
+		D_ASSERT(peer_device, d.dp_flags & DP_ZEROES);
+		D_ASSERT(peer_device, bio_op(peer_req->bios.head) == REQ_OP_WRITE_ZEROES);
 		/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
-		if (dp_flags & DP_DISCARD)
+		if (d.dp_flags & DP_DISCARD)
 			peer_req->flags |= EE_TRIM;
-	} else if (peer_req->pages == NULL) {
-		D_ASSERT(device, peer_req->i.size == 0);
-		D_ASSERT(device, dp_flags & DP_FLUSH);
+	} else {
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, bio_op(peer_req->bios.head) == REQ_OP_WRITE);
 	}
 
-	if (dp_flags & DP_MAY_SET_IN_SYNC)
+	if (d.dp_flags & DP_MAY_SET_IN_SYNC)
 		peer_req->flags |= EE_MAY_SET_IN_SYNC;
 
 	spin_lock(&connection->epoch_lock);
 	peer_req->epoch = connection->current_epoch;
 	atomic_inc(&peer_req->epoch->epoch_size);
 	atomic_inc(&peer_req->epoch->active);
+	if (peer_req->epoch->oldest_unconfirmed_peer_req == NULL)
+		peer_req->epoch->oldest_unconfirmed_peer_req = peer_req;
+
+	if (connection->resource->write_ordering == WO_BIO_BARRIER &&
+	    atomic_read(&peer_req->epoch->epoch_size) == 1) {
+		struct drbd_epoch *epoch;
+		/* Issue a barrier if we start a new epoch, and the previous epoch
+		   was not an epoch containing a single request which already was
+		   a Barrier. */
+		epoch = list_entry(peer_req->epoch->list.prev, struct drbd_epoch, list);
+		if (epoch == peer_req->epoch) {
+			set_bit(DE_CONTAINS_A_BARRIER, &peer_req->epoch->flags);
+			peer_req->bios.head->bi_opf |= REQ_PREFLUSH | REQ_FUA;
+			peer_req->flags |= EE_IS_BARRIER;
+		} else {
+			if (atomic_read(&epoch->epoch_size) > 1 ||
+			    !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) {
+				set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags);
+				set_bit(DE_CONTAINS_A_BARRIER, &peer_req->epoch->flags);
+				peer_req->bios.head->bi_opf |= REQ_PREFLUSH | REQ_FUA;
+				peer_req->flags |= EE_IS_BARRIER;
+			}
+		}
+	}
 	spin_unlock(&connection->epoch_lock);
 
 	rcu_read_lock();
-	nc = rcu_dereference(peer_device->connection->net_conf);
+	nc = rcu_dereference(connection->transport.net_conf);
 	tp = nc->two_primaries;
-	if (peer_device->connection->agreed_pro_version < 100) {
+	if (connection->agreed_pro_version < 100) {
 		switch (nc->wire_protocol) {
 		case DRBD_PROT_C:
-			dp_flags |= DP_SEND_WRITE_ACK;
+			d.dp_flags |= DP_SEND_WRITE_ACK;
 			break;
 		case DRBD_PROT_B:
-			dp_flags |= DP_SEND_RECEIVE_ACK;
+			d.dp_flags |= DP_SEND_RECEIVE_ACK;
 			break;
 		}
 	}
 	rcu_read_unlock();
 
-	if (dp_flags & DP_SEND_WRITE_ACK) {
+	if (d.dp_flags & DP_SEND_WRITE_ACK) {
 		peer_req->flags |= EE_SEND_WRITE_ACK;
-		inc_unacked(device);
+		inc_unacked(peer_device);
 		/* corresponding dec_unacked() in e_end_block()
 		 * respective _drbd_clear_done_ee */
 	}
 
-	if (dp_flags & DP_SEND_RECEIVE_ACK) {
+	if (d.dp_flags & DP_SEND_RECEIVE_ACK) {
 		/* I really don't like it that the receiver thread
 		 * sends on the msock, but anyways */
 		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
@@ -2474,66 +3387,137 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
 
 	if (tp) {
 		/* two primaries implies protocol C */
-		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
-		peer_req->flags |= EE_IN_INTERVAL_TREE;
-		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+		D_ASSERT(device, d.dp_flags & DP_SEND_WRITE_ACK);
+		err = wait_for_and_update_peer_seq(peer_device, d.peer_seq);
 		if (err)
-			goto out_interrupted;
-		spin_lock_irq(&device->resource->req_lock);
-		err = handle_write_conflicts(device, peer_req);
-		if (err) {
-			spin_unlock_irq(&device->resource->req_lock);
-			if (err == -ENOENT) {
-				put_ldev(device);
-				return 0;
-			}
-			goto out_interrupted;
-		}
+			goto out;
 	} else {
-		update_peer_seq(peer_device, peer_seq);
-		spin_lock_irq(&device->resource->req_lock);
-	}
-	/* TRIM and is processed synchronously,
-	 * we wait for all pending requests, respectively wait for
-	 * active_ee to become empty in drbd_submit_peer_request();
-	 * better not add ourselves here. */
-	if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0)
-		list_add_tail(&peer_req->w.list, &device->active_ee);
-	spin_unlock_irq(&device->resource->req_lock);
-
-	if (device->state.conn == C_SYNC_TARGET)
-		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
-
-	if (device->state.pdsk < D_INCONSISTENT) {
-		/* In case we have the only disk of the cluster, */
-		drbd_set_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size);
-		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
-		drbd_al_begin_io(device, &peer_req->i);
-		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+		update_peer_seq(peer_device, d.peer_seq);
 	}
 
-	err = drbd_submit_peer_request(peer_req);
-	if (!err)
-		return 0;
+	peer_req->dagtag_sector = atomic64_read(&connection->last_dagtag_sector) + (peer_req->i.size >> 9);
 
-	/* don't care for the reason here */
-	drbd_err(device, "submit failed, triggering re-connect\n");
-	spin_lock_irq(&device->resource->req_lock);
-	list_del(&peer_req->w.list);
-	drbd_remove_epoch_entry_interval(device, peer_req);
-	spin_unlock_irq(&device->resource->req_lock);
-	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
-		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
-		drbd_al_complete_io(device, &peer_req->i);
+	drbd_wait_for_activity_log_extents(peer_req);
+
+	atomic_inc(&connection->active_ee_cnt);
+
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_add_tail(&peer_req->recv_order, &connection->peer_requests);
+	peer_req->flags |= EE_ON_RECV_ORDER;
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	/* Note: this now may or may not be "hot" in the activity log.
+	 * Still, it is the best time to record that we need to set the
+	 * out-of-sync bit, if we delay that until drbd_submit_peer_request(),
+	 * we may introduce a race with some re-attach on the peer.
+	 * Unless we want to guarantee that we drain all in-flight IO
+	 * whenever we receive a state change. Which I'm not sure about.
+	 * Use the EE_SET_OUT_OF_SYNC flag, to be acted on just before
+	 * the actual submit, when we can be sure it is "hot".
+	 */
+	if (peer_device->disk_state[NOW] < D_INCONSISTENT) {
+		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+		peer_req->flags |= EE_SET_OUT_OF_SYNC;
 	}
 
-out_interrupted:
-	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
+	spin_lock_irq(&device->interval_lock);
+	if (tp) {
+		err = drbd_peer_write_conflicts(peer_req);
+		if (err) {
+			spin_unlock_irq(&device->interval_lock);
+			goto out_del_list;
+		}
+	}
+	conflict = drbd_find_conflict(device, &peer_req->i, 0);
+	drbd_insert_interval(&device->requests, &peer_req->i);
+	if (!conflict)
+		set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
+	spin_unlock_irq(&device->interval_lock);
+
+	/* The connection dagtag may only be set after inserting the interval
+	 * into the tree, so that requests that were waiting for the dagtag
+	 * enter the interval tree after the request with the dagtag itself. */
+	set_connection_dagtag(connection, peer_req->dagtag_sector);
+
+	if (!conflict)
+		submit_peer_request_activity_log(peer_req);
+	/* ldev_ref_transfer: put_ldev in peer_req endio */
+	return 0;
+
+out_del_list:
+	spin_lock_irq(&connection->peer_reqs_lock);
+	peer_req->flags &= ~EE_ON_RECV_ORDER;
+	list_del(&peer_req->recv_order);
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	atomic_dec(&connection->active_ee_cnt);
+	atomic_sub(interval_to_al_extents(&peer_req->i), &device->wait_for_actlog_ecnt);
+
+out:
+	if (peer_req->flags & EE_SEND_WRITE_ACK)
+		dec_unacked(peer_device);
+	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
 	put_ldev(device);
-	drbd_free_peer_req(device, peer_req);
+	drbd_free_peer_req(peer_req);
 	return err;
 }
 
+/*
+ * To be called when drbd_submit_peer_request() fails for a peer write request.
+ */
+void drbd_cleanup_after_failed_submit_peer_write(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+
+	drbd_err_ratelimit(peer_device, "submit failed, triggering re-connect\n");
+
+	if (peer_req->flags & EE_IN_ACTLOG)
+		drbd_al_complete_io(device, &peer_req->i);
+
+	drbd_remove_peer_req_interval(peer_req);
+
+	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+	put_ldev(device);
+	drbd_free_peer_req(peer_req);
+	change_cstate(connection, C_PROTOCOL_ERROR, CS_HARD);
+}
+
+/* Possibly "cancel" and forget about all peer_requests that had still been
+ * waiting for the activity log (wfa) when the connection to their peer failed,
+ * and pretend we never received them.
+ */
+void drbd_cleanup_peer_requests_wfa(struct drbd_device *device, struct list_head *cleanup)
+{
+	struct drbd_connection *connection;
+	struct drbd_peer_request *peer_req, *pr_tmp;
+
+	write_lock_irq(&device->resource->state_rwlock);
+	list_for_each_entry(peer_req, cleanup, w.list) {
+		atomic_dec(&peer_req->peer_device->connection->active_ee_cnt);
+		drbd_remove_peer_req_interval(peer_req);
+	}
+	write_unlock_irq(&device->resource->state_rwlock);
+
+	list_for_each_entry_safe(peer_req, pr_tmp, cleanup, w.list) {
+		atomic_sub(interval_to_al_extents(&peer_req->i), &device->wait_for_actlog_ecnt);
+		atomic_dec(&device->wait_for_actlog);
+		if (peer_req->flags & EE_SEND_WRITE_ACK)
+			dec_unacked(peer_req->peer_device);
+		list_del_init(&peer_req->w.list);
+		drbd_may_finish_epoch(peer_req->peer_device->connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
+		drbd_free_peer_req(peer_req);
+		put_ldev(device);
+	}
+	/*
+	 * We changed (likely: cleared) active_ee_cnt for "at least one" connection.
+	 * We should wake potential waiters, just in case.
+	 */
+	for_each_connection(connection, device->resource)
+		wake_up(&connection->ee_wait);
+}
+
 /* We may throttle resync, if the lower device seems to be busy,
  * and current sync rate is above c_min_rate.
  *
@@ -2545,69 +3529,45 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
-		bool throttle_if_app_is_waiting)
+bool drbd_rs_c_min_rate_throttle(struct drbd_peer_device *peer_device)
 {
 	struct drbd_device *device = peer_device->device;
-	struct lc_element *tmp;
-	bool throttle = drbd_rs_c_min_rate_throttle(device);
-
-	if (!throttle || throttle_if_app_is_waiting)
-		return throttle;
-
-	spin_lock_irq(&device->al_lock);
-	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
-	if (tmp) {
-		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
-		if (test_bit(BME_PRIORITY, &bm_ext->flags))
-			throttle = false;
-		/* Do not slow down if app IO is already waiting for this extent,
-		 * and our progress is necessary for application IO to complete. */
-	}
-	spin_unlock_irq(&device->al_lock);
-
-	return throttle;
-}
-
-bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
-{
 	struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
 	unsigned long db, dt, dbdt;
 	unsigned int c_min_rate;
 	int curr_events;
 
 	rcu_read_lock();
-	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+	c_min_rate = rcu_dereference(peer_device->conf)->c_min_rate;
 	rcu_read_unlock();
 
 	/* feature disabled? */
 	if (c_min_rate == 0)
 		return false;
 
-	curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
-			atomic_read(&device->rs_sect_ev);
+	curr_events = (int)part_stat_read_accum(disk->part0, sectors)
+		- atomic_read(&device->rs_sect_ev);
 
-	if (atomic_read(&device->ap_actlog_cnt)
-	    || curr_events - device->rs_last_events > 64) {
+	if (atomic_read(&device->ap_actlog_cnt) || curr_events - peer_device->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
 
-		device->rs_last_events = curr_events;
+		peer_device->rs_last_events = curr_events;
 
 		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
 		 * approx. */
-		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+		i = (peer_device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
 
-		if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
-			rs_left = device->ov_left;
+		if (peer_device->repl_state[NOW] == L_VERIFY_S || peer_device->repl_state[NOW] == L_VERIFY_T)
+			rs_left = atomic64_read(&peer_device->ov_left);
 		else
-			rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+			rs_left = drbd_bm_total_weight(peer_device) - peer_device->rs_failed;
 
-		dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
+		dt = ((long)jiffies - (long)peer_device->rs_mark_time[i]) / HZ;
 		if (!dt)
 			dt++;
-		db = device->rs_mark_left[i] - rs_left;
-		dbdt = Bit2KB(db/dt);
+		db = peer_device->rs_mark_left[i] - rs_left;
+		dbdt = device_bit_to_kb(device, db/dt);
 
 		if (dbdt > c_min_rate)
 			return true;
@@ -2615,16 +3575,199 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
 	return false;
 }
 
-static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
+/* Account one skipped online-verify block, coalescing runs of adjacent
+ * skipped blocks into a single reported range (flushed by
+ * ov_skipped_print() when a non-adjacent skip starts a new run). */
+void drbd_verify_skipped_block(struct drbd_peer_device *peer_device,
+		const sector_t sector, const unsigned int size)
+{
+	const unsigned int nr_sectors = size >> 9;
+
+	peer_device->ov_skipped++;
+	if (sector == peer_device->ov_last_skipped_start + peer_device->ov_last_skipped_size) {
+		/* extends the current run of skipped blocks */
+		peer_device->ov_last_skipped_size += nr_sectors;
+		return;
+	}
+	/* not adjacent: report the previous run, then start a new one here */
+	ov_skipped_print(peer_device);
+	peer_device->ov_last_skipped_start = sector;
+	peer_device->ov_last_skipped_size = nr_sectors;
+}
+
+/* Common teardown for a peer read that failed to submit or was canceled:
+ * undo interval-tree membership and resync event accounting, drop the
+ * unacked count, free the request, and release the ldev reference taken
+ * by the receive path. */
+static void drbd_cleanup_peer_read(
+		struct drbd_peer_request *peer_req, bool in_interval_tree)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+
+	if (in_interval_tree)
+		drbd_remove_peer_req_interval(peer_req);
+
+	/* resync/verify reads were added to rs_sect_ev on submission;
+	 * application reads (INTERVAL_PEER_READ) were not */
+	if (peer_req->i.type != INTERVAL_PEER_READ)
+		atomic_sub(peer_req->i.size >> SECTOR_SHIFT, &device->rs_sect_ev);
+	dec_unacked(peer_device);
+
+	drbd_free_peer_req(peer_req);
+	put_ldev(device);
+
+	/* last in-flight backing read gone: disconnect/drain may be waiting */
+	if (atomic_dec_and_test(&connection->backing_ee_cnt))
+		wake_up(&connection->ee_wait);
+}
+
+/* Submit a peer read, first registering resync and verify-target reads in
+ * the conflict interval tree.  On conflict, resync reads stay parked in the
+ * tree (resubmitted later via the conflict resolution path); verify reads
+ * are submitted regardless, with INTERVAL_CONFLICT marking the block so it
+ * can be skipped in the result. */
+void drbd_conflict_submit_peer_read(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	bool submit = true;
+	bool interval_tree = false;
+	bool canceled = false;
+
+	/* Hold resync reads until conflicts have cleared so that we know which
+	 * bitmap bits we can safely clear. Also add verify requests on the
+	 * target to the interval tree so that conflicts can be detected.
+	 * Verify requests on the source have already been added. */
+	if (drbd_interval_is_resync(&peer_req->i) || peer_req->i.type == INTERVAL_OV_READ_TARGET) {
+		bool conflict = false;
+		interval_tree = true;
+		spin_lock_irq(&device->interval_lock);
+		clear_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &peer_req->i.flags);
+		canceled = test_bit(INTERVAL_CANCELED, &peer_req->i.flags);
+		conflict = drbd_find_conflict(device, &peer_req->i, 0);
+		/* insert only once; on re-submission after a conflict the
+		 * interval is already in the tree */
+		if (drbd_interval_empty(&peer_req->i)) {
+			if (conflict)
+				set_bit(INTERVAL_CONFLICT, &peer_req->i.flags);
+			drbd_insert_interval(&device->requests, &peer_req->i);
+		}
+		if (!conflict || drbd_interval_is_verify(&peer_req->i))
+			set_bit(INTERVAL_SUBMITTED, &peer_req->i.flags);
+		else
+			submit = false;
+		spin_unlock_irq(&device->interval_lock);
+	}
+
+	/* Wait if there are conflicts unless this is a verify request, in
+	 * which case we submit it anyway but skip the block if it conflicted. */
+	if (submit) {
+		int err = drbd_submit_peer_request(peer_req);
+		if (err) {
+			if (drbd_ratelimit())
+				drbd_err(peer_device, "submit failed, triggering re-connect\n");
+
+			drbd_cleanup_peer_read(peer_req, interval_tree);
+			change_cstate(peer_device->connection, C_PROTOCOL_ERROR, CS_HARD);
+		}
+	} else if (canceled) {
+		/* request was canceled while waiting for conflicts to clear */
+		drbd_cleanup_peer_read(peer_req, interval_tree);
+	}
+}
+
+/* Return true if this peer request must wait until the dagtag it depends
+ * on (a write seen at another node) has been reached locally. */
+static bool need_to_wait_for_dagtag_of_peer_request(struct drbd_peer_request *peer_req)
+{
+	struct drbd_resource *resource = peer_req->peer_device->device->resource;
+	struct drbd_connection *dep_connection;
+	bool must_wait;
+
+	rcu_read_lock();
+	dep_connection = drbd_connection_by_node_id(resource, peer_req->depend_dagtag_node_id);
+	/*
+	 * I am a weak node if the resync source (myself) is not connected to the
+	 * depend_dagtag_node_id. The resync target will abort this resync soon.
+	 * See check_resync_source().
+	 */
+	must_wait = dep_connection &&
+		dep_connection->cstate[NOW] == C_CONNECTED &&
+		atomic64_read(&dep_connection->last_dagtag_sector) < peer_req->depend_dagtag;
+	rcu_read_unlock();
+
+	return must_wait;
+}
+
+/* Cancel a resync/online-verify read towards the peer, sending the reply
+ * that matches the request type so the peer's accounting stays balanced. */
+static void drbd_peer_resync_read_cancel(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	sector_t sector = peer_req->i.sector;
+	int size = peer_req->i.size;
+	u64 block_id = peer_req->block_id;
+
+	switch (peer_req->i.type) {
+	case INTERVAL_OV_READ_SOURCE:
+		/* P_OV_REPLY */
+		dec_rs_pending(peer_device);
+		drbd_send_ov_result(peer_device, sector, size, block_id, OV_RESULT_SKIP);
+		break;
+	case INTERVAL_OV_READ_TARGET:
+		/* P_OV_REQUEST */
+		drbd_verify_skipped_block(peer_device, sector, size);
+		verify_progress(peer_device, sector, size);
+		drbd_send_ack_be(peer_device, P_RS_CANCEL, sector, size, block_id);
+		break;
+	default:
+		/* P_RS_DATA_REQUEST etc */
+		drbd_send_ack_be(peer_device, P_RS_CANCEL, sector, size, block_id);
+		break;
+	}
+}
+
+/* Handle a resync/online-verify read request from the peer: optionally
+ * throttle, account it in rs_sect_ev, then either park it on dagtag_wait_ee
+ * until its dagtag dependency is fulfilled or submit it through the
+ * conflict-checking path. */
+static void drbd_peer_resync_read(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+	unsigned int size = peer_req->i.size;
+
+	/* do not throttle when the peer is Primary: that would also
+	 * throttle its application reads */
+	if (connection->peer_role[NOW] != R_PRIMARY &&
+			drbd_rs_c_min_rate_throttle(peer_device))
+		schedule_timeout_uninterruptible(HZ/10);
+
+	/* account resync-related backing device activity */
+	atomic_add(size >> 9, &device->rs_sect_ev);
+
+	/* dagtag 0 means that there is no dependency to be fulfilled,
+	 * so we can ignore it.
+	 * If we are the dependent node, we can also ignore the dagtag
+	 * dependency, because the request with this dagtag must already be in
+	 * the interval tree, so the read will wait until the interval tree
+	 * conflict is resolved before being submitted. */
+	if (peer_req->depend_dagtag &&
+	    peer_req->depend_dagtag_node_id != device->resource->res_opts.node_id &&
+	    need_to_wait_for_dagtag_of_peer_request(peer_req)) {
+		dynamic_drbd_dbg(peer_device,
+				 "%s at %llus+%u: Waiting for dagtag %llus from peer %u\n",
+				 drbd_interval_type_str(&peer_req->i),
+				 (unsigned long long)peer_req->i.sector, size,
+				 (unsigned long long)peer_req->depend_dagtag,
+				 peer_req->depend_dagtag_node_id);
+		/* parked here; released by the dagtag-wait machinery once the
+		 * dependency connection reaches depend_dagtag */
+		spin_lock_irq(&connection->peer_reqs_lock);
+		list_add_tail(&peer_req->w.list, &connection->dagtag_wait_ee);
+		spin_unlock_irq(&connection->peer_reqs_lock);
+		return;
+	}
+
+	atomic_inc(&connection->backing_ee_cnt);
+	drbd_conflict_submit_peer_read(peer_req);
+}
+
+/* Allocate a digest_info with inline payload and receive digest_size
+ * bytes from the peer into it.
+ *
+ * On success the digest is attached to @peer_req (EE_HAS_DIGEST set) and
+ * is freed together with the peer request.  Returns 0, -ENOMEM, or the
+ * error from drbd_recv_into().
+ */
+static int receive_digest(struct drbd_peer_request *peer_req, int digest_size)
+{
+	struct digest_info *di;
+
+	/* single allocation: struct immediately followed by the payload */
+	di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+	if (!di)
+		return -ENOMEM;
+
+	di->digest_size = digest_size;
+	di->digest = (char *)di + sizeof(*di);
+
+	peer_req->digest = di;
+	peer_req->flags |= EE_HAS_DIGEST;
+
+	return drbd_recv_into(peer_req->peer_device->connection, di->digest, digest_size);
+}
+
+static int receive_common_data_request(struct drbd_connection *connection, struct packet_info *pi,
+		struct p_block_req_common *p,
+		unsigned int depend_dagtag_node_id, u64 depend_dagtag)
 {
 	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
-	sector_t sector;
+	sector_t sector = be64_to_cpu(p->sector);
 	sector_t capacity;
 	struct drbd_peer_request *peer_req;
-	struct digest_info *di = NULL;
-	int size, verb;
-	struct p_block_req *p =	pi->data;
+	int size = be32_to_cpu(p->blksize);
+	enum drbd_disk_state min_d_state;
+	int err;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
@@ -2632,67 +3775,135 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 	device = peer_device->device;
 	capacity = get_capacity(device->vdisk);
 
-	sector = be64_to_cpu(p->sector);
-	size   = be32_to_cpu(p->blksize);
-
 	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
-		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+		drbd_err(peer_device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
 				(unsigned long long)sector, size);
 		return -EINVAL;
 	}
 	if (sector + (size>>9) > capacity) {
-		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+		drbd_err(peer_device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
 				(unsigned long long)sector, size);
 		return -EINVAL;
 	}
 
-	if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
-		verb = 1;
+	/* Tell target to have a retry, waiting for the rescheduled
+	 * drbd_start_resync to complete. Otherwise the concurrency
+	 * of send oos and resync may lead to data loss.
+	 */
+	if (peer_device->repl_state[NOW] == L_WF_BITMAP_S ||
+			peer_device->repl_state[NOW] == L_STARTING_SYNC_S) {
 		switch (pi->cmd) {
-		case P_DATA_REQUEST:
-			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
-			break;
-		case P_RS_THIN_REQ:
 		case P_RS_DATA_REQUEST:
+		case P_RS_DAGTAG_REQ:
 		case P_CSUM_RS_REQUEST:
-		case P_OV_REQUEST:
-			drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
-			break;
-		case P_OV_REPLY:
-			verb = 0;
-			dec_rs_pending(peer_device);
-			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
-			break;
+		case P_RS_CSUM_DAGTAG_REQ:
+		case P_RS_THIN_REQ:
+		case P_RS_THIN_DAGTAG_REQ:
+			drbd_send_ack_be(peer_device, P_RS_CANCEL, sector, size, p->block_id);
+			return ignore_remaining_packet(connection, pi->size);
 		default:
-			BUG();
+			break;
 		}
-		if (verb && drbd_ratelimit())
-			drbd_err(device, "Can not satisfy peer's read request, "
-			    "no local data.\n");
-
-		/* drain possibly payload */
-		return drbd_drain_block(peer_device, pi->size);
 	}
 
-	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
-	 * "criss-cross" setup, that might cause write-out on some other DRBD,
-	 * which in turn might block on the other node at this very place.  */
-	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
-			size, GFP_NOIO);
-	if (!peer_req) {
-		put_ldev(device);
-		return -ENOMEM;
+	min_d_state = pi->cmd == P_DATA_REQUEST ? D_UP_TO_DATE : D_OUTDATED;
+	if (!get_ldev_if_state(device, min_d_state)) {
+		switch (pi->cmd) {
+		case P_DATA_REQUEST:
+			drbd_send_ack_be(peer_device, P_NEG_DREPLY, sector, size, p->block_id);
+			break;
+		case P_OV_REQUEST:
+		case P_OV_DAGTAG_REQ:
+			drbd_verify_skipped_block(peer_device, sector, size);
+			verify_progress(peer_device, sector, size);
+			drbd_send_ack_be(peer_device, P_RS_CANCEL, sector, size, p->block_id);
+			break;
+		case P_RS_DATA_REQUEST:
+		case P_RS_DAGTAG_REQ:
+		case P_CSUM_RS_REQUEST:
+		case P_RS_CSUM_DAGTAG_REQ:
+		case P_RS_THIN_REQ:
+		case P_RS_THIN_DAGTAG_REQ:
+			if (peer_device->repl_state[NOW] == L_PAUSED_SYNC_S)
+				drbd_send_ack_be(peer_device, P_RS_CANCEL, sector, size, p->block_id);
+			else
+				drbd_send_ack_be(peer_device, P_NEG_RS_DREPLY, sector, size, p->block_id);
+			break;
+		default:
+			BUG();
+		}
+
+		if (peer_device->repl_state[NOW] != L_PAUSED_SYNC_S)
+			drbd_err_ratelimit(device,
+				"Can not satisfy peer's read request, no local data.\n");
+
+		/* drain possible payload */
+		return ignore_remaining_packet(connection, pi->size);
+	}
+
+	if (pi->cmd != P_DATA_REQUEST
+	&& !IS_ALIGNED(size, bm_block_size(device->bitmap))
+	&& (sector + (size >> 9) != device->bitmap->bm_dev_capacity)) {
+		drbd_warn_ratelimit(peer_device,
+			"Unaligned %s request (%u vs %u) at %llu; may lead to hung or repeating resync.\n",
+			drbd_packet_name(pi->cmd), size, bm_block_size(device->bitmap), sector);
+		/* For now, try to continue anyways */
+	}
+
+	inc_unacked(peer_device);
+
+	peer_req = drbd_alloc_peer_req(peer_device, GFP_TRY, size, REQ_OP_READ);
+	err = -ENOMEM;
+	if (!peer_req)
+		goto fail;
+	peer_req->i.size = size;
+	peer_req->i.sector = sector;
+	peer_req->block_id = p->block_id;
+	peer_req->depend_dagtag_node_id = depend_dagtag_node_id;
+	peer_req->depend_dagtag = depend_dagtag;
+	/* no longer valid, about to call drbd_recv again for the digest... */
+	p = NULL;
+	pi->data = NULL;
+
+	if (peer_device->repl_state[NOW] == L_AHEAD) {
+		if (pi->cmd == P_DATA_REQUEST) {
+			/* P_DATA_REQUEST originates from a Primary,
+			 * so if I am "Ahead", the Primary would be "Behind":
+			 * Can not happen. */
+			drbd_err_ratelimit(peer_device, "received P_DATA_REQUEST while L_AHEAD\n");
+			err = -EINVAL;
+			goto fail2;
+		}
+		if (connection->agreed_pro_version >= 115) {
+			switch (pi->cmd) {
+			/* case P_DATA_REQUEST: see above, not based on protocol version */
+			case P_OV_REQUEST:
+			case P_OV_DAGTAG_REQ:
+				drbd_verify_skipped_block(peer_device, sector, size);
+				verify_progress(peer_device, sector, size);
+				fallthrough;
+			case P_RS_DATA_REQUEST:
+			case P_RS_DAGTAG_REQ:
+			case P_CSUM_RS_REQUEST:
+			case P_RS_CSUM_DAGTAG_REQ:
+			case P_RS_THIN_REQ:
+			case P_RS_THIN_DAGTAG_REQ:
+				err = drbd_send_ack(peer_device, P_RS_CANCEL_AHEAD, peer_req);
+				goto fail2;
+			default:
+				BUG();
+			}
+		}
 	}
-	peer_req->opf = REQ_OP_READ;
 
 	switch (pi->cmd) {
 	case P_DATA_REQUEST:
 		peer_req->w.cb = w_e_end_data_req;
-		/* application IO, don't drbd_rs_begin_io */
-		peer_req->flags |= EE_APPLICATION;
+		peer_req->i.type = INTERVAL_PEER_READ;
 		goto submit;
 
 	case P_RS_THIN_REQ:
+	case P_RS_THIN_DAGTAG_REQ:
 		/* If at some point in the future we have a smart way to
 		   find out if this data block is completely deallocated,
 		   then we would do something smarter here than reading
@@ -2700,56 +3911,44 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		peer_req->flags |= EE_RS_THIN_REQ;
 		fallthrough;
 	case P_RS_DATA_REQUEST:
+	case P_RS_DAGTAG_REQ:
+		peer_req->i.type = INTERVAL_RESYNC_READ;
 		peer_req->w.cb = w_e_end_rsdata_req;
-		/* used in the sector offset progress display */
-		device->bm_resync_fo = BM_SECT_TO_BIT(sector);
 		break;
 
-	case P_OV_REPLY:
 	case P_CSUM_RS_REQUEST:
-		di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
-		if (!di)
-			goto out_free_e;
-
-		di->digest_size = pi->size;
-		di->digest = (((char *)di)+sizeof(struct digest_info));
-
-		peer_req->digest = di;
-		peer_req->flags |= EE_HAS_DIGEST;
-
-		if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
-			goto out_free_e;
-
-		if (pi->cmd == P_CSUM_RS_REQUEST) {
-			D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
-			peer_req->w.cb = w_e_end_csum_rs_req;
-			/* used in the sector offset progress display */
-			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
-			/* remember to report stats in drbd_resync_finished */
-			device->use_csums = true;
-		} else if (pi->cmd == P_OV_REPLY) {
-			/* track progress, we may need to throttle */
-			atomic_add(size >> 9, &device->rs_sect_in);
-			peer_req->w.cb = w_e_end_ov_reply;
-			dec_rs_pending(peer_device);
-			/* drbd_rs_begin_io done when we sent this request,
-			 * but accounting still needs to be done. */
-			goto submit_for_resync;
-		}
+	case P_RS_CSUM_DAGTAG_REQ:
+		D_ASSERT(device, connection->agreed_pro_version >= 89);
+		peer_req->i.type = INTERVAL_RESYNC_READ;
+
+		err = receive_digest(peer_req, pi->size);
+		if (err)
+			goto fail2;
+
+		peer_req->w.cb = w_e_end_rsdata_req;
+		/* remember to report stats in drbd_resync_finished */
+		peer_device->use_csums = true;
 		break;
 
 	case P_OV_REQUEST:
-		if (device->ov_start_sector == ~(sector_t)0 &&
-		    peer_device->connection->agreed_pro_version >= 90) {
+	case P_OV_DAGTAG_REQ:
+		peer_req->i.type = INTERVAL_OV_READ_TARGET;
+		peer_device->ov_position = sector;
+		if (peer_device->ov_start_sector == ~(sector_t)0 &&
+		    connection->agreed_pro_version >= 90) {
 			unsigned long now = jiffies;
 			int i;
-			device->ov_start_sector = sector;
-			device->ov_position = sector;
-			device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
-			device->rs_total = device->ov_left;
+			unsigned long ov_left = drbd_bm_bits(device)
+					- bm_sect_to_bit(device->bitmap, sector);
+			atomic64_set(&peer_device->ov_left, ov_left);
+			peer_device->ov_start_sector = sector;
+			peer_device->ov_skipped = 0;
+			peer_device->rs_total = ov_left;
+			peer_device->rs_last_writeout = now;
+			peer_device->rs_last_progress_report_ts = now;
 			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
-				device->rs_mark_left[i] = device->ov_left;
-				device->rs_mark_time[i] = now;
+				peer_device->rs_mark_left[i] = ov_left;
+				peer_device->rs_mark_time[i] = now;
 			}
 			drbd_info(device, "Online Verify start sector: %llu\n",
 					(unsigned long long)sector);
@@ -2761,146 +3960,372 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
 		BUG();
 	}
 
-	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
-	 * wrt the receiver, but it is not as straightforward as it may seem.
-	 * Various places in the resync start and stop logic assume resync
-	 * requests are processed in order, requeuing this on the worker thread
-	 * introduces a bunch of new code for synchronization between threads.
-	 *
-	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
-	 * "forever", throttling after drbd_rs_begin_io will lock that extent
-	 * for application writes for the same time.  For now, just throttle
-	 * here, where the rest of the code expects the receiver to sleep for
-	 * a while, anyways.
-	 */
+submit:
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_add_tail(&peer_req->recv_order, &connection->peer_reads);
+	peer_req->flags |= EE_ON_RECV_ORDER;
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	if (pi->cmd == P_DATA_REQUEST) {
+		atomic_inc(&connection->backing_ee_cnt);
+		drbd_conflict_submit_peer_read(peer_req);
+	} else {
+		drbd_peer_resync_read(peer_req);
+	}
+	/* ldev_ref_transfer: put_ldev in peer_req endio */
+	return 0;
+fail2:
+	drbd_free_peer_req(peer_req);
+fail:
+	dec_unacked(peer_device);
+	put_ldev(device);
+	return err;
+}
 
-	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
-	 * this defers syncer requests for some time, before letting at least
-	 * on request through.  The resync controller on the receiving side
-	 * will adapt to the incoming rate accordingly.
-	 *
-	 * We cannot throttle here if remote is Primary/SyncTarget:
-	 * we would also throttle its application reads.
-	 * In that case, throttling is done on the SyncTarget only.
+/* P_DATA_REQUEST and friends without a dagtag dependency:
+ * node id 0 / dagtag 0 mean "no dependency". */
+static int receive_data_request(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_req *p = pi->data;
+
+	return receive_common_data_request(connection, pi, &p->req_common, 0, 0);
+}
+
+/* receive_dagtag_data_request() - handle a request for data with dagtag
+ * dependency initiated by the peer
+ *
+ * Request handling flow:
+ *
+ * receive_dagtag_data_request
+ *        |
+ *        V
+ * receive_common_data_request
+ *        |
+ *        v              dagtag waiting
+ * drbd_peer_resync_read --------------+
+ *        |                            |
+ *        |                           ... dagtag_wait_ee
+ *        |                            |
+ *        |                            v
+ *        +--------------- release_dagtag_wait
+ *        |
+ *        v                       conflict (resync only)
+ * drbd_conflict_submit_peer_read -----+
+ *        |          ^                 |
+ *        |          |                ...
+ *        |          |                 |
+ *        |          |                 v
+ *        |          +---- drbd_do_submit_conflict
+ *        v
+ * drbd_submit_peer_request
+ *        |
+ *       ... backing device
+ *        |
+ *        v
+ * drbd_peer_request_endio
+ *        |
+ *        v                  online verify request
+ * drbd_endio_read_sec_final ------------------+
+ *        |                                    |
+ *       ... sender_work                      ... sender_work
+ *        |                                    |
+ *        v                                    v
+ * w_e_end_rsdata_req                    w_e_end_ov_req
+ *        |                                    |
+ *       ... via peer                         ... via peer
+ *        |                                    |
+ *        v                                    v
+ * got_RSWriteAck                         got_OVResult
+ */
+static int receive_dagtag_data_request(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_rs_req *p = pi->data;
+	unsigned int dagtag_node_id = be32_to_cpu(p->dagtag_node_id);
+	u64 dagtag = be64_to_cpu(p->dagtag);
+
+	return receive_common_data_request(connection, pi, &p->req_common,
+			dagtag_node_id, dagtag);
+}
+
+/* Handle P_OV_REPLY / P_OV_DAGTAG_REPLY on the verify source: match the
+ * reply to our outstanding INTERVAL_OV_READ_SOURCE request, receive the
+ * peer's digest, and submit the local read for comparison. */
+static int receive_common_ov_reply(struct drbd_connection *connection, struct packet_info *pi,
+		struct p_block_req_common *p,
+		unsigned int depend_dagtag_node_id, u64 depend_dagtag)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector = be64_to_cpu(p->sector);
+	struct drbd_peer_request *peer_req;
+	int size = be32_to_cpu(p->blksize);
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	/* reply must match a request we sent earlier; protocol error otherwise */
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_OV_READ_SOURCE),
+			sector, size, p->block_id);
+	if (!peer_req)
+		return -EIO;
+
+	dec_rs_pending(peer_device);
+
+	if (!get_ldev_if_state(device, D_OUTDATED)) {
+		/* NOTE(review): for INTERVAL_OV_READ_SOURCE this cancel path
+		 * calls dec_rs_pending() again, after the unconditional
+		 * dec_rs_pending() just above — looks like a possible double
+		 * decrement; confirm against the rs_pending accounting. */
+		drbd_peer_resync_read_cancel(peer_req);
+		drbd_remove_peer_req_interval(peer_req);
+		drbd_free_peer_req(peer_req);
+
+		/* drain payload */
+		return ignore_remaining_packet(connection, pi->size);
+	}
+
+	err = receive_digest(peer_req, pi->size);
+	if (err)
+		goto fail;
+
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+
+	err = peer_req_alloc_bio(peer_req, size, GFP_NOIO, REQ_OP_READ);
+	if (err)
+		goto fail;
+
+	/* only after the last failure point; the fail path does not dec_unacked */
+	inc_unacked(peer_device);
+
+	peer_req->depend_dagtag_node_id = depend_dagtag_node_id;
+	peer_req->depend_dagtag = depend_dagtag;
+	peer_req->w.cb = w_e_end_ov_reply;
+
+	/* track progress, we may need to throttle */
+	rs_sectors_came_in(peer_device, size);
+
+	drbd_peer_resync_read(peer_req);
+	/* ldev_ref_transfer: put_ldev in peer_req endio */
+	return 0;
+fail:
+	drbd_remove_peer_req_interval(peer_req);
+	drbd_free_peer_req(peer_req);
+	put_ldev(device);
+	return err;
+}
+
+/* P_OV_REPLY: online-verify reply without a dagtag dependency. */
+static int receive_ov_reply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_block_req *p = pi->data;
+
+	return receive_common_ov_reply(connection, pi, &p->req_common, 0, 0);
+}
+
+/* P_OV_DAGTAG_REPLY: online-verify reply carrying a dagtag dependency. */
+static int receive_dagtag_ov_reply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_rs_req *p = pi->data;
+	unsigned int dagtag_node_id = be32_to_cpu(p->dagtag_node_id);
+	u64 dagtag = be64_to_cpu(p->dagtag);
+
+	return receive_common_ov_reply(connection, pi, &p->req_common,
+			dagtag_node_id, dagtag);
+}
+
+static int receive_flush_requests(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_connection *other_connection;
+	struct p_flush_requests *p_flush_requests = pi->data;
+	u64 flush_requests_dagtag;
+
+	spin_lock_irq(&resource->tl_update_lock);
+	/*
+	 * If the current dagtag was read from the metadata then there is no
+	 * associated request. Hence there is nothing to flush. Flush up to the
+	 * preceding dagtag instead.
 	 */
+	if (resource->dagtag_sector == resource->dagtag_from_backing_dev)
+		flush_requests_dagtag = resource->dagtag_before_attach;
+	else
+		flush_requests_dagtag = resource->dagtag_sector;
+	spin_unlock_irq(&resource->tl_update_lock);
 
-	/* Even though this may be a resync request, we do add to "read_ee";
-	 * "sync_ee" is only used for resync WRITEs.
-	 * Add to list early, so debugfs can find this request
-	 * even if we have to sleep below. */
-	spin_lock_irq(&device->resource->req_lock);
-	list_add_tail(&peer_req->w.list, &device->read_ee);
-	spin_unlock_irq(&device->resource->req_lock);
-
-	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
-	if (device->state.peer != R_PRIMARY
-	&& drbd_rs_should_slow_down(peer_device, sector, false))
-		schedule_timeout_uninterruptible(HZ/10);
-	update_receiver_timing_details(connection, drbd_rs_begin_io);
-	if (drbd_rs_begin_io(device, sector))
-		goto out_free_e;
+	spin_lock_irq(&connection->primary_flush_lock);
+	connection->flush_requests_dagtag = flush_requests_dagtag;
+	connection->flush_sequence = be64_to_cpu(p_flush_requests->flush_sequence);
+	connection->flush_forward_sent_mask = 0;
+	spin_unlock_irq(&connection->primary_flush_lock);
 
-submit_for_resync:
-	atomic_add(size >> 9, &device->rs_sect_ev);
+	/* Queue any request waiting for peer ack to be sent */
+	drbd_flush_peer_acks(resource);
 
-submit:
-	update_receiver_timing_details(connection, drbd_submit_peer_request);
-	inc_unacked(device);
-	if (drbd_submit_peer_request(peer_req) == 0)
+	/* For each peer, check if peer ack for this dagtag has already been sent */
+	rcu_read_lock();
+	for_each_connection_rcu(other_connection, resource) {
+		if (other_connection->cstate[NOW] == C_CONNECTED)
+			queue_work(other_connection->ack_sender, &other_connection->peer_ack_work);
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int receive_flush_requests_ack(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_connection *primary_connection;
+	struct p_flush_ack *p_flush_ack = pi->data;
+	u64 flush_sequence = be64_to_cpu(p_flush_ack->flush_sequence);
+	int primary_node_id = be32_to_cpu(p_flush_ack->primary_node_id);
+
+	spin_lock_irq(&resource->initiator_flush_lock);
+	if (flush_sequence < resource->current_flush_sequence) {
+		spin_unlock_irq(&resource->initiator_flush_lock);
 		return 0;
+	}
 
-	/* don't care for the reason here */
-	drbd_err(device, "submit failed, triggering re-connect\n");
+	rcu_read_lock();
+	primary_connection = drbd_connection_by_node_id(resource, primary_node_id);
+	if (primary_connection)
+		primary_connection->pending_flush_mask &= ~NODE_MASK(connection->peer_node_id);
+	rcu_read_unlock();
+	spin_unlock_irq(&resource->initiator_flush_lock);
+	return 0;
+}
 
-out_free_e:
-	spin_lock_irq(&device->resource->req_lock);
-	list_del(&peer_req->w.list);
-	spin_unlock_irq(&device->resource->req_lock);
-	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
+/*
+ * config_unknown_volume  -  device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet.  It will warn and ignore these
+ * commands.  Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ *
+ * Returns the result of draining the remaining payload, so the receiver
+ * stays in sync with the data stream.
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+	drbd_warn(connection, "%s packet received for volume %d, which is not configured locally\n",
+		  drbd_packet_name(pi->cmd), pi->vnr);
+	return ignore_remaining_packet(connection, pi->size);
+}
 
-	put_ldev(device);
-	drbd_free_peer_req(device, peer_req);
-	return -EIO;
+/* P_ENABLE_REPLICATION_NEXT: latch the peer's requested replication
+ * setting in the REPLICATION_NEXT flag; it is not applied immediately. */
+static int receive_enable_replication_next(struct drbd_connection *connection,
+		struct packet_info *pi)
+{
+	struct p_enable_replication *p = pi->data;
+	struct drbd_peer_device *peer_device = conn_peer_device(connection, pi->vnr);
+
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+
+	if (p->enable)
+		set_bit(REPLICATION_NEXT, &peer_device->flags);
+	else
+		clear_bit(REPLICATION_NEXT, &peer_device->flags);
+
+	return 0;
+}
+
+/* P_ENABLE_REPLICATION: apply the peer's replication on/off setting for
+ * one volume immediately, via a verbose state change. */
+static int receive_enable_replication(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device;
+	struct p_enable_replication *p_enable_replication = pi->data;
+	unsigned long irq_flags;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	begin_state_change(resource, &irq_flags, CS_VERBOSE);
+	peer_device->replication[NEW] = p_enable_replication->enable;
+	end_state_change(resource, &irq_flags, "enable-replication");
+	return 0;
 }
 
 /*
  * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
  */
-static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
+static enum sync_strategy drbd_asb_recover_0p(struct drbd_peer_device *peer_device)
 {
-	struct drbd_device *device = peer_device->device;
-	int self, peer, rv = -100;
+	const int node_id = peer_device->device->resource->res_opts.node_id;
+	int self, peer;
+	enum sync_strategy rv = SPLIT_BRAIN_DISCONNECT;
 	unsigned long ch_self, ch_peer;
 	enum drbd_after_sb_p after_sb_0p;
 
-	self = device->ldev->md.uuid[UI_BITMAP] & 1;
-	peer = device->p_uuid[UI_BITMAP] & 1;
+	self = drbd_bitmap_uuid(peer_device) & UUID_PRIMARY;
+	peer = peer_device->bitmap_uuids[node_id] & UUID_PRIMARY;
 
-	ch_peer = device->p_uuid[UI_SIZE];
-	ch_self = device->comm_bm_set;
+	ch_peer = peer_device->dirty_bits;
+	ch_self = peer_device->comm_bm_set;
 
 	rcu_read_lock();
-	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+	after_sb_0p = rcu_dereference(peer_device->connection->transport.net_conf)->after_sb_0p;
 	rcu_read_unlock();
 	switch (after_sb_0p) {
 	case ASB_CONSENSUS:
 	case ASB_DISCARD_SECONDARY:
 	case ASB_CALL_HELPER:
 	case ASB_VIOLENTLY:
-		drbd_err(device, "Configuration error.\n");
+	case ASB_RETRY_CONNECT:
+	case ASB_AUTO_DISCARD:
+		drbd_err(peer_device, "Configuration error.\n");
 		break;
 	case ASB_DISCONNECT:
 		break;
 	case ASB_DISCARD_YOUNGER_PRI:
 		if (self == 0 && peer == 1) {
-			rv = -1;
+			rv = SYNC_TARGET_USE_BITMAP;
 			break;
 		}
 		if (self == 1 && peer == 0) {
-			rv =  1;
+			rv = SYNC_SOURCE_USE_BITMAP;
 			break;
 		}
 		fallthrough;	/* to one of the other strategies */
 	case ASB_DISCARD_OLDER_PRI:
 		if (self == 0 && peer == 1) {
-			rv = 1;
+			rv = SYNC_SOURCE_USE_BITMAP;
 			break;
 		}
 		if (self == 1 && peer == 0) {
-			rv = -1;
+			rv = SYNC_TARGET_USE_BITMAP;
 			break;
 		}
-		/* Else fall through to one of the other strategies... */
-		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
-		     "Using discard-least-changes instead\n");
+		drbd_warn(peer_device, "Discard younger/older primary did not find a decision\n"
+			  "Using discard-least-changes instead\n");
 		fallthrough;
 	case ASB_DISCARD_ZERO_CHG:
 		if (ch_peer == 0 && ch_self == 0) {
-			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
-				? -1 : 1;
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->transport.flags)
+				? SYNC_TARGET_USE_BITMAP : SYNC_SOURCE_USE_BITMAP;
 			break;
 		} else {
-			if (ch_peer == 0) { rv =  1; break; }
-			if (ch_self == 0) { rv = -1; break; }
+			if (ch_peer == 0) {
+				rv = SYNC_SOURCE_USE_BITMAP;
+				break;
+			}
+			if (ch_self == 0) {
+				rv = SYNC_TARGET_USE_BITMAP;
+				break;
+			}
 		}
 		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
 			break;
 		fallthrough;
 	case ASB_DISCARD_LEAST_CHG:
 		if	(ch_self < ch_peer)
-			rv = -1;
+			rv = SYNC_TARGET_USE_BITMAP;
 		else if (ch_self > ch_peer)
-			rv =  1;
+			rv = SYNC_SOURCE_USE_BITMAP;
 		else /* ( ch_self == ch_peer ) */
 		     /* Well, then use something else. */
-			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
-				? -1 : 1;
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->transport.flags)
+				? SYNC_TARGET_USE_BITMAP : SYNC_SOURCE_USE_BITMAP;
 		break;
 	case ASB_DISCARD_LOCAL:
-		rv = -1;
+		rv = SYNC_TARGET_USE_BITMAP;
 		break;
 	case ASB_DISCARD_REMOTE:
-		rv =  1;
+		rv = SYNC_SOURCE_USE_BITMAP;
 	}
 
 	return rv;
@@ -2909,14 +4334,16 @@ static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold
 /*
  * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
  */
-static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
+static enum sync_strategy drbd_asb_recover_1p(struct drbd_peer_device *peer_device)
 {
 	struct drbd_device *device = peer_device->device;
-	int hg, rv = -100;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_resource *resource = device->resource;
+	enum sync_strategy strategy, rv = SPLIT_BRAIN_DISCONNECT;
 	enum drbd_after_sb_p after_sb_1p;
 
 	rcu_read_lock();
-	after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+	after_sb_1p = rcu_dereference(connection->transport.net_conf)->after_sb_1p;
 	rcu_read_unlock();
 	switch (after_sb_1p) {
 	case ASB_DISCARD_YOUNGER_PRI:
@@ -2925,39 +4352,42 @@ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold
 	case ASB_DISCARD_LOCAL:
 	case ASB_DISCARD_REMOTE:
 	case ASB_DISCARD_ZERO_CHG:
+	case ASB_RETRY_CONNECT:
+	case ASB_AUTO_DISCARD:
 		drbd_err(device, "Configuration error.\n");
 		break;
 	case ASB_DISCONNECT:
 		break;
 	case ASB_CONSENSUS:
-		hg = drbd_asb_recover_0p(peer_device);
-		if (hg == -1 && device->state.role == R_SECONDARY)
-			rv = hg;
-		if (hg == 1  && device->state.role == R_PRIMARY)
-			rv = hg;
+		strategy = drbd_asb_recover_0p(peer_device);
+		if (strategy == SYNC_TARGET_USE_BITMAP && resource->role[NOW] == R_SECONDARY)
+			rv = strategy;
+		if (strategy == SYNC_SOURCE_USE_BITMAP && resource->role[NOW] == R_PRIMARY)
+			rv = strategy;
 		break;
 	case ASB_VIOLENTLY:
 		rv = drbd_asb_recover_0p(peer_device);
 		break;
 	case ASB_DISCARD_SECONDARY:
-		return device->state.role == R_PRIMARY ? 1 : -1;
+		return resource->role[NOW] == R_PRIMARY ? SYNC_SOURCE_USE_BITMAP : SYNC_TARGET_USE_BITMAP;
 	case ASB_CALL_HELPER:
-		hg = drbd_asb_recover_0p(peer_device);
-		if (hg == -1 && device->state.role == R_PRIMARY) {
+		strategy = drbd_asb_recover_0p(peer_device);
+		if (strategy == SYNC_TARGET_USE_BITMAP && resource->role[NOW] == R_PRIMARY) {
 			enum drbd_state_rv rv2;
 
 			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
-			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we might be here in L_OFF which is transient.
 			  * we do not need to wait for the after state change work either. */
-			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			rv2 = change_role(resource, R_SECONDARY, CS_VERBOSE,
+					"after-sb-1pri", NULL);
 			if (rv2 != SS_SUCCESS) {
-				drbd_khelper(device, "pri-lost-after-sb");
+				drbd_maybe_khelper(device, connection, "pri-lost-after-sb");
 			} else {
 				drbd_warn(device, "Successfully gave up primary role.\n");
-				rv = hg;
+				rv = strategy;
 			}
 		} else
-			rv = hg;
+			rv = strategy;
 	}
 
 	return rv;
@@ -2966,14 +4396,15 @@ static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold
 /*
  * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
  */
-static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
+static enum sync_strategy drbd_asb_recover_2p(struct drbd_peer_device *peer_device)
 {
 	struct drbd_device *device = peer_device->device;
-	int hg, rv = -100;
+	struct drbd_connection *connection = peer_device->connection;
+	enum sync_strategy strategy, rv = SPLIT_BRAIN_DISCONNECT;
 	enum drbd_after_sb_p after_sb_2p;
 
 	rcu_read_lock();
-	after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+	after_sb_2p = rcu_dereference(connection->transport.net_conf)->after_sb_2p;
 	rcu_read_unlock();
 	switch (after_sb_2p) {
 	case ASB_DISCARD_YOUNGER_PRI:
@@ -2984,6 +4415,8 @@ static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold
 	case ASB_CONSENSUS:
 	case ASB_DISCARD_SECONDARY:
 	case ASB_DISCARD_ZERO_CHG:
+	case ASB_RETRY_CONNECT:
+	case ASB_AUTO_DISCARD:
 		drbd_err(device, "Configuration error.\n");
 		break;
 	case ASB_VIOLENTLY:
@@ -2992,440 +4425,1021 @@ static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold
 	case ASB_DISCONNECT:
 		break;
 	case ASB_CALL_HELPER:
-		hg = drbd_asb_recover_0p(peer_device);
-		if (hg == -1) {
+		strategy = drbd_asb_recover_0p(peer_device);
+		if (strategy == SYNC_TARGET_USE_BITMAP) {
 			enum drbd_state_rv rv2;
 
 			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
-			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we might be here in L_OFF which is transient.
 			  * we do not need to wait for the after state change work either. */
-			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			rv2 = change_role(device->resource, R_SECONDARY, CS_VERBOSE,
+					"after-sb-2pri", NULL);
 			if (rv2 != SS_SUCCESS) {
-				drbd_khelper(device, "pri-lost-after-sb");
+				drbd_maybe_khelper(device, connection, "pri-lost-after-sb");
 			} else {
 				drbd_warn(device, "Successfully gave up primary role.\n");
-				rv = hg;
+				rv = strategy;
 			}
 		} else
-			rv = hg;
+			rv = strategy;
 	}
 
 	return rv;
 }
 
-static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
-			   u64 bits, u64 flags)
+static void drbd_uuid_dump_self(struct drbd_peer_device *peer_device, u64 bits, u64 flags)
 {
-	if (!uuid) {
-		drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
-		return;
-	}
-	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
-	     text,
-	     (unsigned long long)uuid[UI_CURRENT],
-	     (unsigned long long)uuid[UI_BITMAP],
-	     (unsigned long long)uuid[UI_HISTORY_START],
-	     (unsigned long long)uuid[UI_HISTORY_END],
-	     (unsigned long long)bits,
-	     (unsigned long long)flags);
+	struct drbd_device *device = peer_device->device;
+
+	drbd_info(peer_device, "self %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+		  (unsigned long long)drbd_resolved_uuid(peer_device, NULL),
+		  (unsigned long long)drbd_bitmap_uuid(peer_device),
+		  (unsigned long long)drbd_history_uuid(device, 0),
+		  (unsigned long long)drbd_history_uuid(device, 1),
+		  (unsigned long long)bits,
+		  (unsigned long long)flags);
 }
 
-/*
-  100	after split brain try auto recover
-    2	C_SYNC_SOURCE set BitMap
-    1	C_SYNC_SOURCE use BitMap
-    0	no Sync
-   -1	C_SYNC_TARGET use BitMap
-   -2	C_SYNC_TARGET set BitMap
- -100	after split brain, disconnect
--1000	unrelated data
--1091   requires proto 91
--1096   requires proto 96
- */
 
-static int drbd_uuid_compare(struct drbd_peer_device *const peer_device,
-		enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
+static void drbd_uuid_dump_peer(struct drbd_peer_device *peer_device, u64 bits, u64 flags)
 {
-	struct drbd_connection *const connection = peer_device->connection;
-	struct drbd_device *device = peer_device->device;
-	u64 self, peer;
-	int i, j;
+	const int node_id = peer_device->device->resource->res_opts.node_id;
 
-	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
-	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+	drbd_info(peer_device, "peer %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+	     (unsigned long long)peer_device->current_uuid,
+	     (unsigned long long)peer_device->bitmap_uuids[node_id],
+	     (unsigned long long)peer_device->history_uuids[0],
+	     (unsigned long long)peer_device->history_uuids[1],
+	     (unsigned long long)bits,
+	     (unsigned long long)flags);
+}
 
-	*rule_nr = 10;
-	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
-		return 0;
+/* find the peer's bitmap slot for the given UUID, if they have one */
+static int drbd_find_peer_bitmap_by_uuid(struct drbd_peer_device *peer_device, u64 uuid)
+{
+	u64 peer;
+	int i;
 
-	*rule_nr = 20;
-	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
-	     peer != UUID_JUST_CREATED)
-		return -2;
+	for (i = 0; i < DRBD_PEERS_MAX; i++) {
+		peer = peer_device->bitmap_uuids[i] & ~UUID_PRIMARY;
+		if (uuid == peer)
+			return i;
+	}
 
-	*rule_nr = 30;
-	if (self != UUID_JUST_CREATED &&
-	    (peer == UUID_JUST_CREATED || peer == (u64)0))
-		return 2;
+	return -1;
+}
 
-	if (self == peer) {
-		int rct, dc; /* roles at crash time */
+/* find our bitmap slot for the given UUID, if we have one */
+static int drbd_find_bitmap_by_uuid(struct drbd_peer_device *peer_device, u64 uuid)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	u64 self;
+	int i;
 
-		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+	for (i = 0; i < DRBD_NODE_ID_MAX; i++) {
+		if (i == device->ldev->md.node_id)
+			continue;
+		if (connection->agreed_pro_version < 116 &&
+		    device->ldev->md.peers[i].bitmap_index == -1)
+			continue;
+		self = device->ldev->md.peers[i].bitmap_uuid & ~UUID_PRIMARY;
+		if (self == uuid)
+			return i;
+	}
 
-			if (connection->agreed_pro_version < 91)
-				return -1091;
+	return -1;
+}
 
-			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
-			    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
-				drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
-				drbd_uuid_move_history(device);
-				device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
-				device->ldev->md.uuid[UI_BITMAP] = 0;
+static enum sync_strategy
+uuid_fixup_resync_end(struct drbd_peer_device *peer_device, enum sync_rule *rule)
+{
+	struct drbd_device *device = peer_device->device;
+	const int node_id = device->resource->res_opts.node_id;
 
-				drbd_uuid_dump(device, "self", device->ldev->md.uuid,
-					       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
-				*rule_nr = 34;
-			} else {
-				drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
-				*rule_nr = 36;
-			}
+	if (peer_device->bitmap_uuids[node_id] == (u64)0 && drbd_bitmap_uuid(peer_device) != (u64)0) {
 
-			return 1;
-		}
+		if (peer_device->connection->agreed_pro_version < 91)
+			return REQUIRES_PROTO_91;
 
-		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
+		if ((drbd_bitmap_uuid(peer_device) & ~UUID_PRIMARY) ==
+		    (peer_device->history_uuids[0] & ~UUID_PRIMARY) &&
+		    (drbd_history_uuid(device, 0) & ~UUID_PRIMARY) ==
+		    (peer_device->history_uuids[1] & ~UUID_PRIMARY)) {
+			struct drbd_peer_md *peer_md = &device->ldev->md.peers[peer_device->node_id];
+			u64 previous_bitmap_uuid = peer_md->bitmap_uuid;
 
-			if (connection->agreed_pro_version < 91)
-				return -1091;
+			drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+			peer_md->bitmap_uuid = 0;
+			_drbd_uuid_push_history(device, previous_bitmap_uuid);
 
-			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
-			    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
-				drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+			drbd_uuid_dump_self(peer_device,
+					    device->disk_state[NOW] >= D_NEGOTIATING ? drbd_bm_total_weight(peer_device) : 0, 0);
+			*rule = RULE_SYNC_SOURCE_MISSED_FINISH;
+		} else {
+			drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
+			*rule = RULE_SYNC_SOURCE_PEER_MISSED_FINISH;
+		}
 
-				device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
-				device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
-				device->p_uuid[UI_BITMAP] = 0UL;
+		return SYNC_SOURCE_USE_BITMAP;
+	}
 
-				drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
-				*rule_nr = 35;
-			} else {
-				drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
-				*rule_nr = 37;
-			}
+	if (drbd_bitmap_uuid(peer_device) == (u64)0 && peer_device->bitmap_uuids[node_id] != (u64)0) {
 
-			return -1;
-		}
+		if (peer_device->connection->agreed_pro_version < 91)
+			return REQUIRES_PROTO_91;
 
-		/* Common power [off|failure] */
-		rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
-			(device->p_uuid[UI_FLAGS] & 2);
-		/* lowest bit is set when we were primary,
-		 * next bit (weight 2) is set when peer was primary */
-		*rule_nr = 40;
+		if ((drbd_history_uuid(device, 0) & ~UUID_PRIMARY) ==
+		    (peer_device->bitmap_uuids[node_id] & ~UUID_PRIMARY) &&
+		    (drbd_history_uuid(device, 1) & ~UUID_PRIMARY) ==
+		    (peer_device->history_uuids[0] & ~UUID_PRIMARY)) {
+			int i;
 
-		/* Neither has the "crashed primary" flag set,
-		 * only a replication link hickup. */
-		if (rct == 0)
-			return 0;
+			drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
 
-		/* Current UUID equal and no bitmap uuid; does not necessarily
-		 * mean this was a "simultaneous hard crash", maybe IO was
-		 * frozen, so no UUID-bump happened.
-		 * This is a protocol change, overload DRBD_FF_WSAME as flag
-		 * for "new-enough" peer DRBD version. */
-		if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
-			*rule_nr = 41;
-			if (!(connection->agreed_features & DRBD_FF_WSAME)) {
-				drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
-				return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
-			}
-			if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
-				/* At least one has the "crashed primary" bit set,
-				 * both are primary now, but neither has rotated its UUIDs?
-				 * "Can not happen." */
-				drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
-				return -100;
-			}
-			if (device->state.role == R_PRIMARY)
-				return 1;
-			return -1;
-		}
+			for (i = ARRAY_SIZE(peer_device->history_uuids) - 1; i > 0; i--)
+				peer_device->history_uuids[i] = peer_device->history_uuids[i - 1];
+			peer_device->history_uuids[i] = peer_device->bitmap_uuids[node_id];
+			peer_device->bitmap_uuids[node_id] = 0;
 
-		/* Both are secondary.
-		 * Really looks like recovery from simultaneous hard crash.
-		 * Check which had been primary before, and arbitrate. */
-		switch (rct) {
-		case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
-		case 1: /*  self_pri && !peer_pri */ return 1;
-		case 2: /* !self_pri &&  peer_pri */ return -1;
-		case 3: /*  self_pri &&  peer_pri */
-			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
-			return dc ? -1 : 1;
+			drbd_uuid_dump_peer(peer_device, peer_device->dirty_bits, peer_device->uuid_flags);
+			*rule = RULE_SYNC_TARGET_PEER_MISSED_FINISH;
+		} else {
+			drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
+			*rule = RULE_SYNC_TARGET_MISSED_FINISH;
 		}
+
+		return SYNC_TARGET_USE_BITMAP;
 	}
 
-	*rule_nr = 50;
-	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
-	if (self == peer)
-		return -1;
+	return UNDETERMINED;
+}
+
+static enum sync_strategy
+uuid_fixup_resync_start1(struct drbd_peer_device *peer_device, enum sync_rule *rule)
+{
+	struct drbd_device *device = peer_device->device;
+	const int node_id = peer_device->device->resource->res_opts.node_id;
+	u64 self, peer;
+
+	self = drbd_current_uuid(device) & ~UUID_PRIMARY;
+	peer = peer_device->history_uuids[0] & ~UUID_PRIMARY;
 
-	*rule_nr = 51;
-	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (connection->agreed_pro_version < 96 ?
-		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
-		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
-		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
+		if (peer_device->connection->agreed_pro_version < 96 ?
+		    (drbd_history_uuid(device, 0) & ~UUID_PRIMARY) ==
+		    (peer_device->history_uuids[1] & ~UUID_PRIMARY) :
+		    peer + UUID_NEW_BM_OFFSET == (peer_device->bitmap_uuids[node_id] & ~UUID_PRIMARY)) {
+			int i;
+
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of the peer's UUIDs. */
+			*rule = RULE_SYNC_TARGET_MISSED_START;
 
-			if (connection->agreed_pro_version < 91)
-				return -1091;
+			if (peer_device->connection->agreed_pro_version < 91)
+				return REQUIRES_PROTO_91;
 
-			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
-			device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+			peer_device->bitmap_uuids[node_id] = peer_device->history_uuids[0];
+			for (i = 0; i < ARRAY_SIZE(peer_device->history_uuids) - 1; i++)
+				peer_device->history_uuids[i] = peer_device->history_uuids[i + 1];
+			peer_device->history_uuids[i] = 0;
 
 			drbd_info(device, "Lost last syncUUID packet, corrected:\n");
-			drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+			drbd_uuid_dump_peer(peer_device, peer_device->dirty_bits, peer_device->uuid_flags);
 
-			return -1;
+			return SYNC_TARGET_USE_BITMAP;
 		}
 	}
 
-	*rule_nr = 60;
-	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
-	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
-		peer = device->p_uuid[i] & ~((u64)1);
-		if (self == peer)
-			return -2;
-	}
+	return UNDETERMINED;
+}
 
-	*rule_nr = 70;
-	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
-	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
-	if (self == peer)
-		return 1;
+static enum sync_strategy
+uuid_fixup_resync_start2(struct drbd_peer_device *peer_device, enum sync_rule *rule)
+{
+	struct drbd_device *device = peer_device->device;
+	u64 self, peer;
+
+	self = drbd_history_uuid(device, 0) & ~UUID_PRIMARY;
+	peer = peer_device->current_uuid & ~UUID_PRIMARY;
 
-	*rule_nr = 71;
-	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
 	if (self == peer) {
-		if (connection->agreed_pro_version < 96 ?
-		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
-		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
-		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+		if (peer_device->connection->agreed_pro_version < 96 ?
+		    (drbd_history_uuid(device, 1) & ~UUID_PRIMARY) ==
+		    (peer_device->history_uuids[0] & ~UUID_PRIMARY) :
+		    self + UUID_NEW_BM_OFFSET == (drbd_bitmap_uuid(peer_device) & ~UUID_PRIMARY)) {
+			u64 bitmap_uuid;
+
 			/* The last P_SYNC_UUID did not get though. Undo the last start of
 			   resync as sync source modifications of our UUIDs. */
+			*rule = RULE_SYNC_SOURCE_MISSED_START;
 
-			if (connection->agreed_pro_version < 91)
-				return -1091;
+			if (peer_device->connection->agreed_pro_version < 91)
+				return REQUIRES_PROTO_91;
 
-			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
-			__drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
+			bitmap_uuid = _drbd_uuid_pull_history(peer_device);
+			_drbd_uuid_set_bitmap(peer_device, bitmap_uuid);
 
 			drbd_info(device, "Last syncUUID did not get through, corrected:\n");
-			drbd_uuid_dump(device, "self", device->ldev->md.uuid,
-				       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+			drbd_uuid_dump_self(peer_device,
+					    device->disk_state[NOW] >= D_NEGOTIATING ? drbd_bm_total_weight(peer_device) : 0, 0);
 
-			return 1;
+			return SYNC_SOURCE_USE_BITMAP;
 		}
 	}
 
+	return UNDETERMINED;
+}
 
-	*rule_nr = 80;
-	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
-	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
-		self = device->ldev->md.uuid[i] & ~((u64)1);
-		if (self == peer)
-			return 2;
-	}
-
-	*rule_nr = 90;
-	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
-	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
-	if (self == peer && self != ((u64)0))
-		return 100;
+static enum sync_strategy drbd_uuid_compare(struct drbd_peer_device *peer_device,
+			     enum sync_rule *rule, int *peer_node_id)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	const int node_id = device->resource->res_opts.node_id;
+	bool my_current_in_peers_history, peers_current_in_my_history;
+	bool bitmap_matches, flags_matches, uuid_matches;
+	u64 resolved_uuid, bitmap_uuid;
+	u64 local_uuid_flags = 0;
+	u64 self, peer;
+	int i, j;
 
-	*rule_nr = 100;
-	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
-		self = device->ldev->md.uuid[i] & ~((u64)1);
-		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
-			peer = device->p_uuid[j] & ~((u64)1);
-			if (self == peer)
-				return -100;
+	resolved_uuid = drbd_resolved_uuid(peer_device, &local_uuid_flags) & ~UUID_PRIMARY;
+	bitmap_uuid = drbd_bitmap_uuid(peer_device);
+	local_uuid_flags |= drbd_collect_local_uuid_flags(peer_device, NULL);
+
+	uuid_matches = resolved_uuid == (peer_device->comm_current_uuid & ~UUID_PRIMARY);
+	bitmap_matches = bitmap_uuid == peer_device->comm_bitmap_uuid;
+	/* UUID_FLAG_INCONSISTENT is not relevant for the handshake, allow it to change */
+	flags_matches = !((local_uuid_flags ^ peer_device->comm_uuid_flags) & ~UUID_FLAG_INCONSISTENT);
+	if (!test_bit(INITIAL_STATE_SENT, &peer_device->flags)) {
+		drbd_warn(peer_device, "Initial UUIDs and state not sent yet. Not verifying\n");
+	} else if (!uuid_matches || !flags_matches || !bitmap_matches) {
+		if (!uuid_matches)
+			drbd_warn(peer_device, "My current UUID changed during handshake.\n");
+		if (!bitmap_matches)
+			drbd_warn(peer_device, "My bitmap UUID changed during "
+				  "handshake. 0x%llX to 0x%llX\n",
+				  (unsigned long long)peer_device->comm_bitmap_uuid,
+				  (unsigned long long)bitmap_uuid);
+		if (!flags_matches)
+			drbd_warn(peer_device,
+				  "My uuid_flags changed from 0x%llX to 0x%llX during handshake.\n",
+				  (unsigned long long)peer_device->comm_uuid_flags,
+				  (unsigned long long)local_uuid_flags);
+		if (connection->cstate[NOW] == C_CONNECTING) {
+			*rule = RULE_INITIAL_HANDSHAKE_CHANGED;
+			return RETRY_CONNECT;
 		}
 	}
 
-	return -1000;
-}
-
-/* drbd_sync_handshake() returns the new conn state on success, or
-   CONN_MASK (-1) on failure.
- */
-static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
-					   enum drbd_role peer_role,
-					   enum drbd_disk_state peer_disk) __must_hold(local)
-{
-	struct drbd_device *device = peer_device->device;
-	enum drbd_conns rv = C_MASK;
-	enum drbd_disk_state mydisk;
-	struct net_conf *nc;
-	int hg, rule_nr, rr_conflict, tentative, always_asbp;
+	self = resolved_uuid;
+	peer = peer_device->current_uuid & ~UUID_PRIMARY;
 
-	mydisk = device->state.disk;
-	if (mydisk == D_NEGOTIATING)
-		mydisk = device->new_state_tmp.disk;
+	/* Before DRBD 8.0.2 (from 2007), the uuid on sync targets was set to
+	 * zero during resyncs for no good reason. */
+	if (self == 0)
+		self = UUID_JUST_CREATED;
+	if (peer == 0)
+		peer = UUID_JUST_CREATED;
 
-	drbd_info(device, "drbd_sync_handshake:\n");
+	*rule = RULE_JUST_CREATED_BOTH;
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return NO_SYNC;
 
-	spin_lock_irq(&device->ldev->md.uuid_lock);
-	drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
-	drbd_uuid_dump(device, "peer", device->p_uuid,
-		       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+	*rule = RULE_JUST_CREATED_SELF;
+	if (self == UUID_JUST_CREATED)
+		return SYNC_TARGET_SET_BITMAP;
 
-	hg = drbd_uuid_compare(peer_device, peer_role, &rule_nr);
-	spin_unlock_irq(&device->ldev->md.uuid_lock);
+	*rule = RULE_JUST_CREATED_PEER;
+	if (peer == UUID_JUST_CREATED)
+		return SYNC_SOURCE_SET_BITMAP;
 
-	drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+	if (self == peer) {
+		struct net_conf *nc;
+		int wire_protocol;
 
-	if (hg == -1000) {
-		drbd_alert(device, "Unrelated data, aborting!\n");
-		return C_MASK;
+		rcu_read_lock();
+		nc = rcu_dereference(connection->transport.net_conf);
+		wire_protocol = nc->wire_protocol;
+		rcu_read_unlock();
+
+		if (connection->agreed_pro_version < 110) {
+			enum sync_strategy rv = uuid_fixup_resync_end(peer_device, rule);
+			if (rv != UNDETERMINED)
+				return rv;
+		}
+
+		if (test_bit(RS_SOURCE_MISSED_END, &peer_device->flags)) {
+			*rule = RULE_SYNC_SOURCE_MISSED_FINISH;
+			return SYNC_SOURCE_USE_BITMAP;
+		}
+		if (test_bit(RS_PEER_MISSED_END, &peer_device->flags)) {
+			*rule = RULE_SYNC_TARGET_PEER_MISSED_FINISH;
+			return SYNC_TARGET_USE_BITMAP;
+		}
+
+		if (connection->agreed_pro_version >= 120) {
+			*rule = RULE_RECONNECTED;
+			if (peer_device->uuid_flags & UUID_FLAG_RECONNECT &&
+			    local_uuid_flags & UUID_FLAG_RECONNECT)
+				return NO_SYNC;
+		}
+
+		if (connection->agreed_pro_version >= 121 &&
+		    (wire_protocol == DRBD_PROT_A || wire_protocol == DRBD_PROT_B)) {
+			*rule = RULE_CRASHED_PRIMARY;
+			if (local_uuid_flags & UUID_FLAG_CRASHED_PRIMARY &&
+			    !(peer_device->uuid_flags & UUID_FLAG_CRASHED_PRIMARY))
+				return SYNC_SOURCE_USE_BITMAP;
+
+			if (peer_device->uuid_flags & UUID_FLAG_CRASHED_PRIMARY &&
+			    !(local_uuid_flags & UUID_FLAG_CRASHED_PRIMARY))
+				return SYNC_TARGET_USE_BITMAP;
+		}
+
+		*rule = RULE_LOST_QUORUM;
+		if (peer_device->uuid_flags & UUID_FLAG_PRIMARY_LOST_QUORUM &&
+		    !test_bit(PRIMARY_LOST_QUORUM, &device->flags))
+			return SYNC_TARGET_IF_BOTH_FAILED;
+
+		if (!(peer_device->uuid_flags & UUID_FLAG_PRIMARY_LOST_QUORUM) &&
+		    test_bit(PRIMARY_LOST_QUORUM, &device->flags))
+			return SYNC_SOURCE_IF_BOTH_FAILED;
+
+		if (peer_device->uuid_flags & UUID_FLAG_PRIMARY_LOST_QUORUM &&
+		    test_bit(PRIMARY_LOST_QUORUM, &device->flags))
+			return test_bit(RESOLVE_CONFLICTS, &connection->transport.flags) ?
+				SYNC_SOURCE_IF_BOTH_FAILED :
+				SYNC_TARGET_IF_BOTH_FAILED;
+
+		if (connection->agreed_pro_version < 120) {
+			*rule = RULE_RECONNECTED;
+			if (peer_device->uuid_flags & UUID_FLAG_RECONNECT &&
+			    local_uuid_flags & UUID_FLAG_RECONNECT)
+				return NO_SYNC;
+		}
+
+		/* Peer crashed as primary, I survived, resync from me */
+		if (peer_device->uuid_flags & UUID_FLAG_CRASHED_PRIMARY &&
+		    local_uuid_flags & UUID_FLAG_RECONNECT)
+			return SYNC_SOURCE_IF_BOTH_FAILED;
+
+		/* I am a crashed primary, peer survived, resync to me */
+		if (local_uuid_flags & UUID_FLAG_CRASHED_PRIMARY &&
+		    peer_device->uuid_flags & UUID_FLAG_RECONNECT)
+			return SYNC_TARGET_IF_BOTH_FAILED;
+
+		/* One of us had a connection to the other node before.
+		   i.e. this is not a common power failure. */
+		if (peer_device->uuid_flags & UUID_FLAG_RECONNECT ||
+		    local_uuid_flags & UUID_FLAG_RECONNECT)
+			return NO_SYNC;
+
+		/* Common power [off|failure]? */
+		*rule = RULE_BOTH_OFF;
+		if (local_uuid_flags & UUID_FLAG_CRASHED_PRIMARY) {
+			if ((peer_device->uuid_flags & UUID_FLAG_CRASHED_PRIMARY) &&
+			    test_bit(RESOLVE_CONFLICTS, &connection->transport.flags))
+				return SYNC_TARGET_IF_BOTH_FAILED;
+			return SYNC_SOURCE_IF_BOTH_FAILED;
+		} else if (peer_device->uuid_flags & UUID_FLAG_CRASHED_PRIMARY)
+			return SYNC_TARGET_IF_BOTH_FAILED;
+		else
+			return NO_SYNC;
+	}
+
+	*rule = RULE_BITMAP_PEER;
+	peer = peer_device->bitmap_uuids[node_id] & ~UUID_PRIMARY;
+	if (self == peer)
+		return SYNC_TARGET_USE_BITMAP;
+
+	*rule = RULE_BITMAP_PEER_OTHER;
+	i = drbd_find_peer_bitmap_by_uuid(peer_device, self);
+	if (i != -1) {
+		*peer_node_id = i;
+		return SYNC_TARGET_CLEAR_BITMAP;
+	}
+
+	if (connection->agreed_pro_version < 110) {
+		enum sync_strategy rv = uuid_fixup_resync_start1(peer_device, rule);
+		if (rv != UNDETERMINED)
+			return rv;
+	}
+
+	*rule = RULE_BITMAP_SELF;
+	self = bitmap_uuid & ~UUID_PRIMARY;
+	peer = peer_device->current_uuid & ~UUID_PRIMARY;
+	if (self == peer)
+		return SYNC_SOURCE_USE_BITMAP;
+
+	*rule = RULE_BITMAP_SELF_OTHER;
+	i = drbd_find_bitmap_by_uuid(peer_device, peer);
+	if (i != -1) {
+		*peer_node_id = i;
+		return SYNC_SOURCE_COPY_BITMAP;
+	}
+
+	self = resolved_uuid;
+	my_current_in_peers_history = uuid_in_peer_history(peer_device, self);
+
+	if (connection->agreed_pro_version < 110) {
+		enum sync_strategy rv = uuid_fixup_resync_start2(peer_device, rule);
+		if (rv != UNDETERMINED)
+			return rv;
+	}
+
+	peer = peer_device->current_uuid & ~UUID_PRIMARY;
+	peers_current_in_my_history = uuid_in_my_history(device, peer);
+
+	if (my_current_in_peers_history && !peers_current_in_my_history) {
+		*rule = RULE_HISTORY_PEER;
+		return SYNC_TARGET_SET_BITMAP;
+	}
+	if (!my_current_in_peers_history && peers_current_in_my_history) {
+		*rule = RULE_HISTORY_SELF;
+		return SYNC_SOURCE_SET_BITMAP;
+	}
+
+	*rule = RULE_BITMAP_BOTH;
+	self = bitmap_uuid & ~UUID_PRIMARY;
+	peer = peer_device->bitmap_uuids[node_id] & ~UUID_PRIMARY;
+	if (self == peer && self != ((u64)0))
+		return SPLIT_BRAIN_AUTO_RECOVER;
+
+	*rule = RULE_HISTORY_BOTH;
+	for (i = 0; i < HISTORY_UUIDS; i++) {
+		self = drbd_history_uuid(device, i) & ~UUID_PRIMARY;
+		/* Don't conclude to have "data divergence" from a "common ancestor"
+		 * if that common ancestor is just a not used yet slot in the history,
+		 * which is still initialized to zero on both peers. */
+		if (self == 0)
+			break;
+		for (j = 0; j < ARRAY_SIZE(peer_device->history_uuids); j++) {
+			peer = peer_device->history_uuids[j] & ~UUID_PRIMARY;
+			if (peer == 0)
+				break;
+			if (self == peer)
+				return SPLIT_BRAIN_DISCONNECT;
+		}
+	}
+
+	return UNRELATED_DATA;
+}
+
+static void log_handshake(struct drbd_peer_device *peer_device)
+{
+	u64 uuid_flags = drbd_collect_local_uuid_flags(peer_device, NULL);
+
+	drbd_info(peer_device, "drbd_sync_handshake:\n");
+	drbd_uuid_dump_self(peer_device, peer_device->comm_bm_set, uuid_flags);
+	drbd_uuid_dump_peer(peer_device, peer_device->dirty_bits, peer_device->uuid_flags);
+}
+
+static enum sync_strategy drbd_handshake(struct drbd_peer_device *peer_device,
+			  enum sync_rule *rule,
+			  int *peer_node_id,
+			  bool always_verbose)
+{
+	struct drbd_device *device = peer_device->device;
+	enum sync_strategy strategy;
+
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	if (always_verbose)
+		log_handshake(peer_device);
+
+	strategy = drbd_uuid_compare(peer_device, rule, peer_node_id);
+	if (strategy != NO_SYNC && !always_verbose)
+		log_handshake(peer_device);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	if (strategy != NO_SYNC || always_verbose)
+		drbd_info(peer_device, "uuid_compare()=%s by rule=%s\n",
+				strategy_descriptor(strategy).name,
+				drbd_sync_rule_str(*rule));
+
+	return strategy;
+}
+
+static bool is_resync_running(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = false;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		enum drbd_repl_state repl_state = peer_device->repl_state[NOW];
+		if (repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/*
+ * Apply the bitmap modifications implied by the handshake result:
+ *
+ * SYNC_SOURCE_COPY_BITMAP: fill our bitmap slot towards this peer from
+ * the slot kept for @peer_node_id, or set all bits when no day0 bitmap
+ * is available.
+ * SYNC_TARGET_CLEAR_BITMAP: the sync source provides the bitmap, so
+ * clear ours.
+ * SYNC_SOURCE_SET_BITMAP / SYNC_TARGET_SET_BITMAP: set and write out
+ * either the whole bitmap (freshly created node) or one slot; as sync
+ * source additionally record the peer's current UUID as bitmap UUID so
+ * a later reconnect does not restart the resync from the beginning.
+ *
+ * Returns 0 on success, or the error from drbd_bitmap_io().
+ */
+static int bitmap_mod_after_handshake(struct drbd_peer_device *peer_device, enum sync_strategy strategy, int peer_node_id)
+{
+	struct drbd_device *device = peer_device->device;
+
+	if (strategy == SYNC_SOURCE_COPY_BITMAP) {
+		int from = device->ldev->md.peers[peer_node_id].bitmap_index;
+
+		if (from == -1)
+			from = drbd_unallocated_index(device->ldev);
+
+		/* No bitmap slot allocated towards this peer: nothing to
+		 * copy into. */
+		if (peer_device->bitmap_index == -1)
+			return 0;
+
+		if (from == -1)
+			drbd_info(peer_device,
+				  "Setting all bitmap bits, day0 bm not available node_id=%d\n",
+				  peer_node_id);
+		else
+			drbd_info(peer_device,
+				  "Copying bitmap of peer node_id=%d (bitmap_index=%d)\n",
+				  peer_node_id, from);
+
+		/* Suspend writes while bulk-modifying and writing out the
+		 * bitmap. */
+		drbd_suspend_io(device, WRITE_ONLY);
+		drbd_bm_slot_lock(peer_device, "copy_slot/set_many sync_handshake", BM_LOCK_BULK);
+		if (from == -1)
+			drbd_bm_set_many_bits(peer_device, 0, -1UL);
+		else
+			drbd_bm_copy_slot(device, from, peer_device->bitmap_index);
+		drbd_bm_write(device, NULL);
+		drbd_bm_slot_unlock(peer_device);
+		drbd_resume_io(device);
+	} else if (strategy == SYNC_TARGET_CLEAR_BITMAP) {
+		drbd_info(peer_device, "Resync source provides bitmap (node_id=%d)\n", peer_node_id);
+		drbd_suspend_io(device, WRITE_ONLY);
+		drbd_bm_slot_lock(peer_device, "bm_clear_many_bits sync_handshake", BM_LOCK_BULK);
+		drbd_bm_clear_many_bits(peer_device, 0, -1UL);
+		drbd_bm_write(device, NULL);
+		drbd_bm_slot_unlock(peer_device);
+		drbd_resume_io(device);
+	} else if (strategy == SYNC_SOURCE_SET_BITMAP || strategy == SYNC_TARGET_SET_BITMAP) {
+		int (*io_func)(struct drbd_device *, struct drbd_peer_device *);
+		int err;
+
+		/* Fresh node that is already resync target elsewhere:
+		 * leave the bitmap alone. */
+		if (strategy == SYNC_TARGET_SET_BITMAP &&
+		    drbd_current_uuid(device) == UUID_JUST_CREATED &&
+		    is_resync_running(device))
+			return 0;
+
+		if (drbd_current_uuid(device) == UUID_JUST_CREATED) {
+			drbd_info(peer_device, "Setting and writing the whole bitmap, fresh node\n");
+			io_func = &drbd_bmio_set_allocated_n_write;
+		} else {
+			drbd_info(peer_device, "Setting and writing one bitmap slot, after drbd_sync_handshake\n");
+			io_func = &drbd_bmio_set_n_write;
+		}
+		err = drbd_bitmap_io(device, io_func, "set_n_write sync_handshake",
+				     BM_LOCK_CLEAR | BM_LOCK_BULK, peer_device);
+		if (err)
+			return err;
+
+		if (drbd_current_uuid(device) != UUID_JUST_CREATED &&
+		    peer_device->current_uuid != UUID_JUST_CREATED &&
+		    strategy == SYNC_SOURCE_SET_BITMAP) {
+			/*
+			 * We have just written the bitmap slot. Update the
+			 * bitmap UUID so that the resync does not start from
+			 * the beginning again if we disconnect and reconnect.
+			 *
+			 * Initial resync continuation is handled in
+			 * drbd_start_resync() at comment:
+			 * prepare to continue an interrupted initial resync later
+			 */
+			drbd_uuid_set_bitmap(peer_device, peer_device->current_uuid);
+			drbd_print_uuids(peer_device, "updated bitmap UUID");
+			drbd_md_sync(device);
+		}
+	}
+	return 0;
+}
+
<test></test>
+/*
+ * Map the handshake strategy to the replication state to enter:
+ * sync source -> L_WF_BITMAP_S, sync target -> L_WF_BITMAP_T,
+ * otherwise L_ESTABLISHED.
+ *
+ * For the *_IF_BOTH_FAILED strategies, a currently active Primary on
+ * either side overrides the crash-time decision: the resync runs from
+ * the Primary towards the Secondary.
+ */
+static enum drbd_repl_state strategy_to_repl_state(struct drbd_peer_device *peer_device,
+						   enum drbd_role peer_role,
+						   enum sync_strategy strategy)
+{
+	enum drbd_role role = peer_device->device->resource->role[NOW];
+	enum drbd_repl_state rv;
+
+	if (strategy == SYNC_SOURCE_IF_BOTH_FAILED || strategy == SYNC_TARGET_IF_BOTH_FAILED) {
+		if (role == R_PRIMARY || peer_role == R_PRIMARY) {
+			/* We have at least one primary, follow that with the resync decision */
+			rv = peer_role == R_SECONDARY ? L_WF_BITMAP_S :
+				role == R_SECONDARY ? L_WF_BITMAP_T :
+				L_ESTABLISHED;
+			return rv;
+		}
+		/* No current primary. Handle it as a common power failure, consider the
+		   roles at crash time */
+	}
+
+	if (strategy_descriptor(strategy).is_sync_source) {
+		rv = L_WF_BITMAP_S;
+	} else if (strategy_descriptor(strategy).is_sync_target) {
+		rv = L_WF_BITMAP_T;
+	} else {
+		rv = L_ESTABLISHED;
+	}
+
+	return rv;
+}
+
+/*
+ * Refine the sync-source strategy when the resync direction was chosen
+ * from disk states. If the peer is already some node's sync target, its
+ * current UUID appears in its bitmap UUID slot towards us, so we may
+ * have to use a different bitmap slot: either use ours directly, copy
+ * from the slot of node *peer_node_id, or set a fresh one.
+ */
+static enum sync_strategy drbd_disk_states_source_strategy(
+		struct drbd_peer_device *peer_device,
+		int *peer_node_id)
+{
+	const int node_id = peer_device->device->resource->res_opts.node_id;
+	u64 bitmap_uuid;
+	int i = -1;
+
+	if (!(peer_device->uuid_flags & UUID_FLAG_SYNC_TARGET))
+		return SYNC_SOURCE_USE_BITMAP;
+
+	/* A resync with identical current-UUIDs -> USE_BITMAP */
+	bitmap_uuid = peer_device->bitmap_uuids[node_id];
+	if (bitmap_uuid == peer_device->current_uuid &&
+	    bitmap_uuid == drbd_current_uuid(peer_device->device))
+		return SYNC_SOURCE_USE_BITMAP;
+
+	/* When the peer is already a sync target, we actually see its
+	 * current UUID in the bitmap UUID slot towards us. We may need
+	 * to pick a different bitmap as a result. */
+	if (bitmap_uuid)
+		i = drbd_find_bitmap_by_uuid(peer_device, bitmap_uuid);
+
+	/* No matching bitmap slot found: set one up from scratch. */
+	if (i == -1)
+		return SYNC_SOURCE_SET_BITMAP;
+
+	/* The matching slot is the one for this peer anyway. */
+	if (i == peer_device->node_id)
+		return SYNC_SOURCE_USE_BITMAP;
+
+	*peer_node_id = i;
+	return SYNC_SOURCE_COPY_BITMAP;
+}
+
+/*
+ * Refine the sync-target strategy when the resync direction was chosen
+ * from disk states; mirrors the peer's choice in
+ * drbd_disk_states_source_strategy(): either keep our bitmap, clear it
+ * (the source provides the bitmap of node *peer_node_id), or set a
+ * fresh one.
+ */
+static enum sync_strategy drbd_disk_states_target_strategy(
+		struct drbd_peer_device *peer_device,
+		int *peer_node_id)
+{
+	const int node_id = peer_device->device->resource->res_opts.node_id;
+	u64 bitmap_uuid;
+	int i;
+
+	if (!(peer_device->comm_uuid_flags & UUID_FLAG_SYNC_TARGET))
+		return SYNC_TARGET_USE_BITMAP;
+
+	/* A resync with identical current-UUIDs -> USE_BITMAP */
+	bitmap_uuid = drbd_bitmap_uuid(peer_device);
+	if (bitmap_uuid == peer_device->current_uuid &&
+	    bitmap_uuid == drbd_current_uuid(peer_device->device))
+		return SYNC_TARGET_USE_BITMAP;
+
+	/* When we are already a sync target, we need to choose our
+	 * strategy to mirror the peer's choice (see
+	 * drbd_disk_states_source_strategy). */
+	i = drbd_find_peer_bitmap_by_uuid(peer_device, bitmap_uuid);
+
+	if (i == -1)
+		return SYNC_TARGET_SET_BITMAP;
+
+	if (i == node_id)
+		return SYNC_TARGET_USE_BITMAP;
+
+	*peer_node_id = i;
+	return SYNC_TARGET_CLEAR_BITMAP;
+}
+
+/*
+ * Possibly override *strategy based on the current disk states.
+ * This applies when the UUID comparison left the resync direction
+ * arbitrary (or found no sync) but the disk states differ: the node
+ * with the better disk state becomes the sync source.
+ * With protocol >= 119 this also covers RULE_RECONNECTED and
+ * RULE_LOST_QUORUM, and compares the actual disk states; older peers
+ * only act on the D_INCONSISTENT-vs-better case.
+ */
+static void disk_states_to_strategy(struct drbd_peer_device *peer_device,
+				    enum drbd_disk_state peer_disk_state,
+				    enum sync_strategy *strategy, enum sync_rule rule,
+				    int *peer_node_id)
+{
+	enum drbd_disk_state disk_state = peer_device->comm_state.disk;
+	struct drbd_device *device = peer_device->device;
+	bool decide_based_on_dstates = false;
+	bool prefer_local, either_inconsistent;
+
+	/* While negotiating, use what the metadata says our disk is. */
+	if (disk_state == D_NEGOTIATING)
+		disk_state = disk_state_from_md(device);
+
+	either_inconsistent =
+		(disk_state == D_INCONSISTENT && peer_disk_state > D_INCONSISTENT) ||
+		(peer_disk_state == D_INCONSISTENT && disk_state > D_INCONSISTENT);
+
+	if (peer_device->connection->agreed_pro_version >= 119) {
+		bool dstates_want_resync =
+			disk_state != peer_disk_state && disk_state >= D_INCONSISTENT &&
+			peer_disk_state >= D_INCONSISTENT && peer_disk_state != D_UNKNOWN;
+		bool resync_direction_arbitrary =
+			*strategy == SYNC_TARGET_IF_BOTH_FAILED ||
+			*strategy == SYNC_SOURCE_IF_BOTH_FAILED;
+
+		decide_based_on_dstates =
+			dstates_want_resync &&
+			(((rule == RULE_RECONNECTED || rule == RULE_LOST_QUORUM || rule == RULE_BOTH_OFF) &&
+			  resync_direction_arbitrary) ||
+			 (*strategy == NO_SYNC && either_inconsistent));
+
+		prefer_local = disk_state > peer_disk_state;
+		/* RULE_BOTH_OFF means that the current UUIDs are equal. The decision
+		   was found by looking at the crashed_primary bits.
+		   The current disk states might give a better basis for decision-making! */
+
+		/* RULE_LOST_QUORUM means that the current UUIDs are equal. The resync direction
+		   was found by looking if a node lost quorum while being primary */
+	} else {
+		decide_based_on_dstates =
+			(rule == RULE_BOTH_OFF || *strategy == NO_SYNC) && either_inconsistent;
+
+		prefer_local = disk_state > D_INCONSISTENT;
+	}
+
+	if (decide_based_on_dstates) {
+		*strategy = prefer_local ?
+			drbd_disk_states_source_strategy(peer_device, peer_node_id) :
+			drbd_disk_states_target_strategy(peer_device, peer_node_id);
+		drbd_info(peer_device, "strategy = %s due to disk states. (%s/%s)\n",
+			  strategy_descriptor(*strategy).name,
+			  drbd_disk_str(disk_state), drbd_disk_str(peer_disk_state));
+	}
+}
+
+/*
+ * Handshake variant used when attaching a disk: run the UUID handshake
+ * verbosely and, if it yields a definite strategy, apply the disk-state
+ * override and the required bitmap modifications.
+ * Returns the strategy, or RETRY_CONNECT if the bitmap I/O failed.
+ */
+static enum sync_strategy drbd_attach_handshake(struct drbd_peer_device *peer_device,
+						  enum drbd_disk_state peer_disk_state)
+{
+	enum sync_strategy strategy;
+	enum sync_rule rule;
+	int peer_node_id, err;
+
+	strategy = drbd_handshake(peer_device, &rule, &peer_node_id, true);
+
+	if (!is_strategy_determined(strategy))
+		return strategy;
+
+	disk_states_to_strategy(peer_device, peer_disk_state, &strategy, rule, &peer_node_id);
+	err = bitmap_mod_after_handshake(peer_device, strategy, peer_node_id);
+	if (err)
+		return RETRY_CONNECT;
+
+	return strategy;
+}
+
+/*
+ * Derive a strategy from the "discard my data" setting: if exactly one
+ * side has it set, that side becomes the sync target. Returns
+ * UNDETERMINED when the flags decide nothing (both or neither set).
+ */
+static enum sync_strategy discard_my_data_to_strategy(struct drbd_peer_device *peer_device)
+{
+	enum sync_strategy strategy = UNDETERMINED;
+
+	if (test_bit(DISCARD_MY_DATA, &peer_device->flags) &&
+	    !(peer_device->uuid_flags & UUID_FLAG_DISCARD_MY_DATA))
+		strategy = SYNC_TARGET_USE_BITMAP;
+
+	if (!test_bit(DISCARD_MY_DATA, &peer_device->flags) &&
+	    (peer_device->uuid_flags & UUID_FLAG_DISCARD_MY_DATA))
+		strategy = SYNC_SOURCE_USE_BITMAP;
+
+	return strategy;
+}
+
+/* drbd_sync_handshake() - decide whether and in which direction to
+ * resync with this peer, based on UUIDs, disk states, split-brain
+ * recovery policy and user overrides.
+ * Returns the determined sync strategy; non-"determined" strategies
+ * (split brain, unrelated data, retry, protocol requirements) tell the
+ * caller to drop or retry the connection.
+ */
+static enum sync_strategy drbd_sync_handshake(struct drbd_peer_device *peer_device,
+					      union drbd_state peer_state)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct net_conf *nc;
+	enum sync_strategy strategy;
+	enum sync_rule rule;
+	int rr_conflict, always_asbp, peer_node_id = 0, err;
+	enum drbd_role peer_role = peer_state.role;
+	enum drbd_disk_state peer_disk_state = peer_state.disk;
+	int required_protocol;
+	enum sync_strategy strategy_from_user = discard_my_data_to_strategy(peer_device);
+	bool need_full_sync_after_split_brain;
+
+	strategy = drbd_handshake(peer_device, &rule, &peer_node_id, true);
+
+	if (strategy == RETRY_CONNECT)
+		return strategy;
+
+	if (strategy == UNRELATED_DATA) {
+		drbd_alert(peer_device, "Unrelated data, aborting!\n");
+		return strategy;
 	}
-	if (hg < -0x10000) {
-		int proto, fflags;
-		hg = -hg;
-		proto = hg & 0xff;
-		fflags = (hg >> 8) & 0xff;
-		drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
-					proto, fflags);
-		return C_MASK;
+	/* Some strategies can only be executed with newer peers. */
+	required_protocol = strategy_descriptor(strategy).required_protocol;
+	if (required_protocol) {
+		drbd_alert(peer_device, "To resolve this both sides have to support at least protocol %d\n", required_protocol);
+		return strategy;
 	}
-	if (hg < -1000) {
-		drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
-		return C_MASK;
+
+	/* Protocol < 124 peers don't handle 0-bit missed-end-of-resync correctly.
+	 * Retry the connection to let UUID cleanup resolve it, or keep retrying
+	 * if the peer needs to be upgraded.
+	 */
+	if (connection->agreed_pro_version < 124 &&
+	    peer_device->comm_bm_set == 0 && peer_device->dirty_bits == 0) {
+		if (strategy == SYNC_SOURCE_USE_BITMAP &&
+		    rule == RULE_SYNC_SOURCE_MISSED_FINISH) {
+			drbd_info(peer_device, "Missed end of resync as sync-source with 0 bits;"
+				  " retrying to let UUID cleanup resolve it\n");
+			return RETRY_CONNECT;
+		}
+		if (strategy == SYNC_TARGET_USE_BITMAP &&
+		    rule == RULE_SYNC_TARGET_PEER_MISSED_FINISH &&
+		    device->resource->role[NOW] == R_PRIMARY) {
+			drbd_info(peer_device, "Missed end of resync as sync-target with 0 bits on Primary;"
+				  " peer needs protocol 124+ to resolve, retrying\n");
+			return REQUIRES_PROTO_124;
+		}
 	}
 
-	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
-	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
-		int f = (hg == -100) || abs(hg) == 2;
-		hg = mydisk > D_INCONSISTENT ? 1 : -1;
-		if (f)
-			hg = hg*2;
-		drbd_info(device, "Becoming sync %s due to disk states.\n",
-		     hg > 0 ? "source" : "target");
+	disk_states_to_strategy(peer_device, peer_disk_state, &strategy, rule, &peer_node_id);
+
+	/* Automatic split-brain recovery needs both sides stable. */
+	if (strategy == SPLIT_BRAIN_AUTO_RECOVER && (!drbd_device_stable(device, NULL) || !(peer_device->uuid_flags & UUID_FLAG_STABLE))) {
+		drbd_warn(peer_device, "Ignore Split-Brain, for now, at least one side unstable\n");
+		strategy = NO_SYNC;
 	}
 
-	if (abs(hg) == 100)
-		drbd_khelper(device, "initial-split-brain");
+	if (strategy_descriptor(strategy).is_split_brain)
+		drbd_maybe_khelper(device, connection, "initial-split-brain");
 
 	rcu_read_lock();
-	nc = rcu_dereference(peer_device->connection->net_conf);
+	nc = rcu_dereference(connection->transport.net_conf);
 	always_asbp = nc->always_asbp;
 	rr_conflict = nc->rr_conflict;
-	tentative = nc->tentative;
 	rcu_read_unlock();
 
-	if (hg == 100 || (hg == -100 && always_asbp)) {
-		int pcount = (device->state.role == R_PRIMARY)
+	/* Evaluate the original strategy,
+	 * before it is re-mapped by additional configuration below.
+	 */
+	need_full_sync_after_split_brain = (strategy == SPLIT_BRAIN_DISCONNECT);
+
+	if (strategy == SPLIT_BRAIN_AUTO_RECOVER || (strategy == SPLIT_BRAIN_DISCONNECT && always_asbp)) {
+		int pcount = (device->resource->role[NOW] == R_PRIMARY)
 			   + (peer_role == R_PRIMARY);
-		int forced = (hg == -100);
 
+		/* With quorum enabled (and protocol >= 113), the side that
+		 * kept quorum becomes the sync source. */
+		if (device->resource->res_opts.quorum != QOU_OFF &&
+		    connection->agreed_pro_version >= 113) {
+			if (device->have_quorum[NOW] && !peer_state.quorum)
+				strategy = SYNC_SOURCE_USE_BITMAP;
+			else if (!device->have_quorum[NOW] && peer_state.quorum)
+				strategy = SYNC_TARGET_USE_BITMAP;
+		}
+		/* Still split brain: apply the after-sb-{0,1,2}pri policy
+		 * matching the number of primaries involved. */
+		if (strategy_descriptor(strategy).is_split_brain) {
+			switch (pcount) {
+			case 0:
+				strategy = drbd_asb_recover_0p(peer_device);
+				break;
+			case 1:
+				strategy = drbd_asb_recover_1p(peer_device);
+				break;
+			case 2:
+				strategy = drbd_asb_recover_2p(peer_device);
+				break;
+			}
 		}
-		if (abs(hg) < 100) {
-			drbd_warn(device, "Split-Brain detected, %d primaries, "
+		if (!strategy_descriptor(strategy).is_split_brain) {
+			drbd_warn(peer_device, "Split-Brain detected, %d primaries, "
 			     "automatically solved. Sync from %s node\n",
-			     pcount, (hg < 0) ? "peer" : "this");
-			if (forced) {
-				drbd_warn(device, "Doing a full sync, since"
+			     pcount, strategy_descriptor(strategy).is_sync_target ? "peer" : "this");
+			if (need_full_sync_after_split_brain) {
+				if (!strategy_descriptor(strategy).full_sync_equivalent) {
+					drbd_alert(peer_device, "Want full sync but cannot decide direction, dropping connection!\n");
+					return SPLIT_BRAIN_DISCONNECT;
+				}
+				drbd_warn(peer_device, "Doing a full sync, since"
 				     " UUIDs where ambiguous.\n");
-				hg = hg*2;
+				strategy = strategy_descriptor(strategy).full_sync_equivalent;
 			}
 		}
 	}
 
-	if (hg == -100) {
-		if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
-			hg = -1;
-		if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
-			hg = 1;
+	if (strategy == SPLIT_BRAIN_DISCONNECT && strategy_from_user != UNDETERMINED) {
+		/* strategy_from_user via "--discard-my-data" is either
+		 * SYNC_TARGET_USE_BITMAP or SYNC_SOURCE_USE_BITMAP.
+		 * But here we do no longer have a relevant bitmap anymore.
+		 * Map to their "full sync equivalent".
+		 */
+		if (need_full_sync_after_split_brain)
+			strategy = strategy_descriptor(strategy_from_user).full_sync_equivalent;
+		else
+			strategy = strategy_from_user;
+		drbd_warn(peer_device, "Split-Brain detected, manually solved. %s from %s node\n",
+			  need_full_sync_after_split_brain ? "Full sync" : "Sync",
+			  strategy_descriptor(strategy).is_sync_target ? "peer" : "this");
+	}
 
-		if (abs(hg) < 100)
-			drbd_warn(device, "Split-Brain detected, manually solved. "
-			     "Sync from %s node\n",
-			     (hg < 0) ? "peer" : "this");
+	if (strategy_descriptor(strategy).is_split_brain) {
+		drbd_alert(peer_device, "Split-Brain detected but unresolved, dropping connection!\n");
+		drbd_maybe_khelper(device, connection, "split-brain");
+		return strategy;
 	}
 
-	if (hg == -100) {
-		/* FIXME this log message is not correct if we end up here
-		 * after an attempted attach on a diskless node.
-		 * We just refuse to attach -- well, we drop the "connection"
-		 * to that disk, in a way... */
-		drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
-		drbd_khelper(device, "split-brain");
-		return C_MASK;
+	if (!is_strategy_determined(strategy)) {
+		drbd_alert(peer_device, "Failed to fully determine sync strategy, dropping connection!\n");
+		return strategy;
 	}
 
-	if (hg > 0 && mydisk <= D_INCONSISTENT) {
-		drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
-		return C_MASK;
+	/* "--discard-my-data" asked for the opposite direction: reverse
+	 * the resync if the reversed source would still be consistent. */
+	if (connection->agreed_pro_version >= 121 && strategy != NO_SYNC &&
+	    strategy_from_user != UNDETERMINED &&
+	    strategy_descriptor(strategy).is_sync_source != strategy_descriptor(strategy_from_user).is_sync_source) {
+		if (strategy_descriptor(strategy).reverse != UNDETERMINED) {
+			enum sync_strategy reversed = strategy_descriptor(strategy).reverse;
+			enum drbd_disk_state resync_source_disk_state =
+				strategy_descriptor(reversed).is_sync_source ? device->disk_state[NOW] : peer_disk_state;
+			if (resync_source_disk_state > D_INCONSISTENT) {
+				strategy = reversed;
+				drbd_warn(peer_device, "Resync direction reversed by --discard-my-data. Reverting to older data!\n");
+			} else {
+				drbd_warn(peer_device, "Ignoring --discard-my-data\n");
+			}
+		} else {
+			drbd_warn(peer_device, "Can not reverse resync direction (requested via --discard-my-data)\n");
+		}
 	}
 
-	if (hg < 0 && /* by intention we do not use mydisk here. */
-	    device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+	/* rr-conflict policy: we would become SyncTarget while Primary
+	 * with consistent data. */
+	if (strategy_descriptor(strategy).is_sync_target &&
+	    strategy != SYNC_TARGET_IF_BOTH_FAILED &&
+	    device->resource->role[NOW] == R_PRIMARY && device->disk_state[NOW] >= D_CONSISTENT &&
+	    (peer_device->comm_bm_set > 0 || peer_device->dirty_bits > 0)) {
 		switch (rr_conflict) {
 		case ASB_CALL_HELPER:
-			drbd_khelper(device, "pri-lost");
+			drbd_maybe_khelper(device, connection, "pri-lost");
 			fallthrough;
 		case ASB_DISCONNECT:
-			drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
-			return C_MASK;
+		case ASB_RETRY_CONNECT:
+			drbd_err(peer_device, "I shall become SyncTarget, but I am primary!\n");
+			strategy = rr_conflict == ASB_RETRY_CONNECT ?
+				SYNC_TARGET_PRIMARY_RECONNECT : SYNC_TARGET_PRIMARY_DISCONNECT;
+			break;
 		case ASB_VIOLENTLY:
-			drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
+			drbd_warn(peer_device, "Becoming SyncTarget, violating the stable-data"
 			     "assumption\n");
+			break;
+		case ASB_AUTO_DISCARD:
+			if (strategy == SYNC_TARGET_USE_BITMAP && rule == RULE_CRASHED_PRIMARY) {
+				drbd_warn(peer_device, "reversing resync by auto-discard\n");
+				strategy = SYNC_SOURCE_USE_BITMAP;
+			}
 		}
 	}
-
-	if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
-		if (hg == 0)
-			drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
-		else
-			drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
-				 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
-				 abs(hg) >= 2 ? "full" : "bit-map based");
-		return C_MASK;
+	/* Mirror image of the ASB_AUTO_DISCARD case above, seen from the
+	 * sync-source side with a Primary peer. */
+	if (strategy == SYNC_SOURCE_USE_BITMAP && rule == RULE_CRASHED_PRIMARY &&
+	    peer_role == R_PRIMARY && peer_disk_state >= D_CONSISTENT &&
+	    rr_conflict == ASB_AUTO_DISCARD) {
+		drbd_warn(peer_device, "reversing resync by auto-discard\n");
+		strategy = SYNC_TARGET_USE_BITMAP;
+	}
+
+	/* Missed end of resync: skip the resync when the (local or peer)
+	 * sync source is no longer D_UP_TO_DATE; for a peer source also
+	 * retire our bitmap UUID towards it into the history. */
+	if (rule == RULE_SYNC_SOURCE_MISSED_FINISH || rule == RULE_SYNC_SOURCE_PEER_MISSED_FINISH ||
+	    rule == RULE_SYNC_TARGET_MISSED_FINISH || rule == RULE_SYNC_TARGET_PEER_MISSED_FINISH) {
+		if (strategy == SYNC_SOURCE_USE_BITMAP) {
+			enum drbd_disk_state disk_state = peer_device->comm_state.disk;
+
+			if (disk_state == D_NEGOTIATING)
+				disk_state = disk_state_from_md(device);
+			if (disk_state != D_UP_TO_DATE) {
+				drbd_info(peer_device,
+					  "Resync (rule=%s) skipped: sync-source (%s)\n",
+					  drbd_sync_rule_str(rule), drbd_disk_str(disk_state));
+				strategy = NO_SYNC;
+			}
+		} else if (strategy == SYNC_TARGET_USE_BITMAP) {
+			if (peer_disk_state != D_UP_TO_DATE) {
+				int peer_node_id = peer_device->node_id;
+				u64 previous = device->ldev->md.peers[peer_node_id].bitmap_uuid;
+
+				if (previous) {
+					device->ldev->md.peers[peer_node_id].bitmap_uuid = 0;
+					_drbd_uuid_push_history(device, previous);
+					drbd_md_mark_dirty(device);
+				}
+				drbd_info(peer_device,
+					  "Resync (rule=%s) skipped: peer sync-source (%s)\n",
+					  drbd_sync_rule_str(rule), drbd_disk_str(peer_disk_state));
+				strategy = NO_SYNC;
+			}
+		}
 	}
 
-	if (abs(hg) >= 2) {
-		drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
-		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
-					BM_LOCKED_SET_ALLOWED, NULL))
-			return C_MASK;
+	if (test_bit(CONN_DRY_RUN, &connection->flags)) {
+		if (strategy == NO_SYNC)
+			drbd_info(peer_device, "dry-run connect: No resync, would become Connected immediately.\n");
+		else
+			drbd_info(peer_device, "dry-run connect: Would become %s, doing a %s resync.",
+				 drbd_repl_str(strategy_descriptor(strategy).is_sync_target ? L_SYNC_TARGET : L_SYNC_SOURCE),
+				 strategy_descriptor(strategy).name);
+		/* NOTE(review): -2 is not a named enum sync_strategy value;
+		 * presumably a sentinel for "dry run, do not proceed" --
+		 * confirm against the sync_strategy enum and all callers. */
+		return -2;
 	}
 
-	if (hg > 0) { /* become sync source. */
-		rv = C_WF_BITMAP_S;
-	} else if (hg < 0) { /* become sync target */
-		rv = C_WF_BITMAP_T;
-	} else {
-		rv = C_CONNECTED;
-		if (drbd_bm_total_weight(device)) {
-			drbd_info(device, "No resync, but %lu bits in bitmap!\n",
-			     drbd_bm_total_weight(device));
-		}
-	}
 
+	err = bitmap_mod_after_handshake(peer_device, strategy, peer_node_id);
+	if (err)
+		return RETRY_CONNECT;
 
-	return rv;
+	return strategy;
 
 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
@@ -3465,20 +5479,18 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 
 		if (pi->size > sizeof(integrity_alg))
 			return -EIO;
-		err = drbd_recv_all(connection, integrity_alg, pi->size);
+		err = drbd_recv_into(connection, integrity_alg, pi->size);
 		if (err)
 			return err;
 		integrity_alg[SHARED_SECRET_MAX - 1] = 0;
 	}
 
 	if (pi->cmd != P_PROTOCOL_UPDATE) {
-		clear_bit(CONN_DRY_RUN, &connection->flags);
-
 		if (cf & CF_DRY_RUN)
 			set_bit(CONN_DRY_RUN, &connection->flags);
 
 		rcu_read_lock();
-		nc = rcu_dereference(connection->net_conf);
+		nc = rcu_dereference(connection->transport.net_conf);
 
 		if (p_proto != nc->wire_protocol) {
 			drbd_err(connection, "incompatible %s settings\n", "protocol");
@@ -3500,7 +5512,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 			goto disconnect_rcu_unlock;
 		}
 
-		if (p_discard_my_data && nc->discard_my_data) {
+		if (p_discard_my_data && test_bit(CONN_DISCARD_MY_DATA, &connection->flags)) {
 			drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
 			goto disconnect_rcu_unlock;
 		}
@@ -3551,9 +5563,13 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 	if (!new_net_conf)
 		goto disconnect;
 
-	mutex_lock(&connection->data.mutex);
-	mutex_lock(&connection->resource->conf_update);
-	old_net_conf = connection->net_conf;
+	if (mutex_lock_interruptible(&connection->resource->conf_update)) {
+		drbd_err(connection, "Interrupted while waiting for conf_update\n");
+		goto disconnect;
+	}
+
+	mutex_lock(&connection->mutex[DATA_STREAM]);
+	old_net_conf = connection->transport.net_conf;
 	*new_net_conf = *old_net_conf;
 
 	new_net_conf->wire_protocol = p_proto;
@@ -3562,9 +5578,9 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 	new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
 	new_net_conf->two_primaries = p_two_primaries;
 
-	rcu_assign_pointer(connection->net_conf, new_net_conf);
+	rcu_assign_pointer(connection->transport.net_conf, new_net_conf);
+	mutex_unlock(&connection->mutex[DATA_STREAM]);
 	mutex_unlock(&connection->resource->conf_update);
-	mutex_unlock(&connection->data.mutex);
 
 	crypto_free_shash(connection->peer_integrity_tfm);
 	kfree(connection->int_dig_in);
@@ -3583,10 +5599,11 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
 disconnect_rcu_unlock:
 	rcu_read_unlock();
 disconnect:
+	kfree(new_net_conf);
 	crypto_free_shash(peer_integrity_tfm);
 	kfree(int_dig_in);
 	kfree(int_dig_vv);
-	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	change_cstate(connection, C_DISCONNECTING, CS_HARD);
 	return -EIO;
 }
 
@@ -3595,8 +5612,7 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in
  * return: NULL (alg name was "")
  *         ERR_PTR(error) if something goes wrong
  *         or the crypto hash ptr, if it worked out ok. */
-static struct crypto_shash *drbd_crypto_alloc_digest_safe(
-		const struct drbd_device *device,
+static struct crypto_shash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
 		const char *alg, const char *name)
 {
 	struct crypto_shash *tfm;
@@ -3613,44 +5629,11 @@ static struct crypto_shash *drbd_crypto_alloc_digest_safe(
 	return tfm;
 }
 
-static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
-{
-	void *buffer = connection->data.rbuf;
-	int size = pi->size;
-
-	while (size) {
-		int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
-		s = drbd_recv(connection, buffer, s);
-		if (s <= 0) {
-			if (s < 0)
-				return s;
-			break;
-		}
-		size -= s;
-	}
-	if (size)
-		return -EIO;
-	return 0;
-}
-
-/*
- * config_unknown_volume  -  device configuration command for unknown volume
- *
- * When a device is added to an existing connection, the node on which the
- * device is added first will send configuration commands to its peer but the
- * peer will not know about the device yet.  It will warn and ignore these
- * commands.  Once the device is added on the second node, the second node will
- * send the same device configuration commands, but in the other direction.
- *
- * (We can also end up here if drbd is misconfigured.)
- */
-static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
-{
-	drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
-		  cmdname(pi->cmd), pi->vnr);
-	return ignore_remaining_packet(connection, pi);
-}
-
+/* Receive P_SYNC_PARAM89 and the older P_SYNC_PARAM. The peer_device fields
+ * related to resync configuration are ignored. These include resync_rate,
+ * c_max_rate and the like. We ignore them because applying them to our own
+ * configuration would be confusing. It would cause us to swap configuration
+ * with our peer each time we connected. */
 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;
@@ -3660,10 +5643,10 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 	struct crypto_shash *verify_tfm = NULL;
 	struct crypto_shash *csums_tfm = NULL;
 	struct net_conf *old_net_conf, *new_net_conf = NULL;
-	struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+	struct peer_device_conf *old_peer_device_conf = NULL;
 	const int apv = connection->agreed_pro_version;
 	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
-	unsigned int fifo_size = 0;
+	struct drbd_resource *resource = connection->resource;
 	int err;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
@@ -3696,48 +5679,26 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 		D_ASSERT(device, data_size == 0);
 	}
 
-	/* initialize verify_alg and csums_alg */
-	p = pi->data;
-	BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
-	memset(&p->algs, 0, sizeof(p->algs));
-
-	err = drbd_recv_all(peer_device->connection, p, header_size);
+	err = drbd_recv_all(connection, (void **)&p, header_size + data_size);
 	if (err)
 		return err;
 
-	mutex_lock(&connection->resource->conf_update);
-	old_net_conf = peer_device->connection->net_conf;
-	if (get_ldev(device)) {
-		new_disk_conf = kzalloc_obj(struct disk_conf);
-		if (!new_disk_conf) {
-			put_ldev(device);
-			mutex_unlock(&connection->resource->conf_update);
-			drbd_err(device, "Allocation of new disk_conf failed\n");
-			return -ENOMEM;
-		}
-
-		old_disk_conf = device->ldev->disk_conf;
-		*new_disk_conf = *old_disk_conf;
-
-		new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+	err = mutex_lock_interruptible(&resource->conf_update);
+	if (err) {
+		drbd_err(connection, "Interrupted while waiting for conf_update\n");
+		return err;
 	}
+	old_net_conf = connection->transport.net_conf;
 
 	if (apv >= 88) {
 		if (apv == 88) {
 			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
-				drbd_err(device, "verify-alg of wrong size, "
-					"peer wants %u, accepting only up to %u byte\n",
-					data_size, SHARED_SECRET_MAX);
+				drbd_err(device, "verify-alg too long, "
+					 "peer wants %u, accepting only %u byte\n",
+					 data_size, SHARED_SECRET_MAX);
 				goto reconnect;
 			}
-
-			err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
-			if (err)
-				goto reconnect;
-			/* we expect NUL terminated string */
-			/* but just in case someone tries to be evil */
-			D_ASSERT(device, p->verify_alg[data_size-1] == 0);
-			p->verify_alg[data_size-1] = 0;
+			p->verify_alg[data_size] = 0;
 
 		} else /* apv >= 89 */ {
 			/* we still expect NUL terminated strings */
@@ -3749,7 +5710,7 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 		}
 
 		if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
-			if (device->state.conn == C_WF_REPORT_PARAMS) {
+			if (peer_device->repl_state[NOW] == L_OFF) {
 				drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
 				    old_net_conf->verify_alg, p->verify_alg);
 				goto disconnect;
@@ -3763,7 +5724,7 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 		}
 
 		if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
-			if (device->state.conn == C_WF_REPORT_PARAMS) {
+			if (peer_device->repl_state[NOW] == L_OFF) {
 				drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
 				    old_net_conf->csums_alg, p->csums_alg);
 				goto disconnect;
@@ -3776,23 +5737,6 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 			}
 		}
 
-		if (apv > 94 && new_disk_conf) {
-			new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
-			new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
-			new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
-			new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
-
-			fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
-			if (fifo_size != device->rs_plan_s->size) {
-				new_plan = fifo_alloc(fifo_size);
-				if (!new_plan) {
-					drbd_err(device, "kmalloc of fifo_buffer failed");
-					put_ldev(device);
-					goto disconnect;
-				}
-			}
-		}
-
 		if (verify_tfm || csums_tfm) {
 			new_net_conf = kzalloc_obj(struct net_conf);
 			if (!new_net_conf)
@@ -3803,66 +5747,58 @@ static int receive_SyncParam(struct drbd_connection *connection, struct packet_i
 			if (verify_tfm) {
 				strscpy(new_net_conf->verify_alg, p->verify_alg);
 				new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
-				crypto_free_shash(peer_device->connection->verify_tfm);
-				peer_device->connection->verify_tfm = verify_tfm;
+				crypto_free_shash(connection->verify_tfm);
+				connection->verify_tfm = verify_tfm;
 				drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
 			}
 			if (csums_tfm) {
 				strscpy(new_net_conf->csums_alg, p->csums_alg);
 				new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
-				crypto_free_shash(peer_device->connection->csums_tfm);
-				peer_device->connection->csums_tfm = csums_tfm;
+				crypto_free_shash(connection->csums_tfm);
+				connection->csums_tfm = csums_tfm;
 				drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
 			}
-			rcu_assign_pointer(connection->net_conf, new_net_conf);
+			rcu_assign_pointer(connection->transport.net_conf, new_net_conf);
 		}
 	}
 
-	if (new_disk_conf) {
-		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
-		put_ldev(device);
-	}
-
-	if (new_plan) {
-		old_plan = device->rs_plan_s;
-		rcu_assign_pointer(device->rs_plan_s, new_plan);
-	}
+	if (new_plan)
+		rcu_assign_pointer(peer_device->rs_plan_s, new_plan);
 
-	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&resource->conf_update);
 	synchronize_rcu();
 	if (new_net_conf)
 		kfree(old_net_conf);
-	kfree(old_disk_conf);
-	kfree(old_plan);
+	kfree(old_peer_device_conf);
+	if (new_plan)
+		kfree(old_plan);
 
 	return 0;
 
 reconnect:
-	if (new_disk_conf) {
-		put_ldev(device);
-		kfree(new_disk_conf);
-	}
-	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&resource->conf_update);
 	return -EIO;
 
 disconnect:
 	kfree(new_plan);
-	if (new_disk_conf) {
-		put_ldev(device);
-		kfree(new_disk_conf);
-	}
-	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&resource->conf_update);
 	/* just for completeness: actually not needed,
 	 * as this is not reached if csums_tfm was ok. */
 	crypto_free_shash(csums_tfm);
 	/* but free the verify_tfm again, if csums_tfm did not work out */
 	crypto_free_shash(verify_tfm);
-	conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	change_cstate(connection, C_DISCONNECTING, CS_HARD);
 	return -EIO;
 }
 
+static void drbd_setup_order_type(struct drbd_device *device, int peer)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ */
+}
+
 /* warn if the arguments differ by more than 12.5% */
-static void warn_if_differ_considerably(struct drbd_device *device,
+static void warn_if_differ_considerably(struct drbd_peer_device *peer_device,
 	const char *s, sector_t a, sector_t b)
 {
 	sector_t d;
@@ -3870,135 +5806,325 @@ static void warn_if_differ_considerably(struct drbd_device *device,
 		return;
 	d = (a > b) ? (a - b) : (b - a);
 	if (d > (a>>3) || d > (b>>3))
-		drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
+		drbd_warn(peer_device, "Considerable difference in %s: %llus vs. %llus\n", s,
 		     (unsigned long long)a, (unsigned long long)b);
 }
 
-static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+static bool drbd_other_peer_smaller(struct drbd_peer_device *reference_peer_device, uint64_t new_size)
 {
+	struct drbd_device *device = reference_peer_device->device;
 	struct drbd_peer_device *peer_device;
+	bool smaller = false;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		if (peer_device == reference_peer_device)
+			continue;
+
+		/* Ignore peers without an attached disk. */
+		if (peer_device->disk_state[NOW] < D_INCONSISTENT)
+			continue;
+
+		if (peer_device->d_size != 0 && peer_device->d_size < new_size)
+			smaller = true;
+	}
+	rcu_read_unlock();
+
+	return smaller;
+}
+
+/* Maximum bio size that a protocol version supports. */
+static unsigned int conn_max_bio_size(struct drbd_connection *connection)
+{
+	if (connection->agreed_pro_version >= 100)
+		return DRBD_MAX_BIO_SIZE;
+	else if (connection->agreed_pro_version >= 95)
+		return DRBD_MAX_BIO_SIZE_P95;
+	else
+		return DRBD_MAX_SIZE_H80_PACKET;
+}
+
+static struct drbd_peer_device *get_neighbor_device(struct drbd_device *device,
+		enum drbd_neighbor neighbor)
+{
+	s32 self_id, peer_id, pivot;
+	struct drbd_peer_device *peer_device, *peer_device_ret = NULL;
+
+	if (!get_ldev(device))
+		return NULL;
+	self_id = device->ldev->md.node_id;
+	put_ldev(device);
+
+	pivot = neighbor == NEXT_LOWER ? 0 : neighbor == NEXT_HIGHER ? S32_MAX : -1;
+	if (pivot == -1)
+		return NULL;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		bool found_new = false;
+		peer_id = peer_device->node_id;
+
+		if (neighbor == NEXT_LOWER && peer_id < self_id && peer_id >= pivot)
+			found_new = true;
+		else if (neighbor == NEXT_HIGHER && peer_id > self_id && peer_id <= pivot)
+			found_new = true;
+
+		if (found_new && peer_device->disk_state[NOW] >= D_INCONSISTENT) {
+			pivot = peer_id;
+			peer_device_ret = peer_device;
+		}
+	}
+	rcu_read_unlock();
+
+	return peer_device_ret;
+}
+
+static void maybe_trigger_resync(struct drbd_device *device, struct drbd_peer_device *peer_device, bool grew, bool skip)
+{
+	if (!peer_device)
+		return;
+	if (peer_device->repl_state[NOW] <= L_OFF)
+		return;
+	if (test_and_clear_bit(RESIZE_PENDING, &peer_device->flags) ||
+	    (grew && peer_device->repl_state[NOW] == L_ESTABLISHED)) {
+		if (peer_device->disk_state[NOW] >= D_INCONSISTENT &&
+		    device->disk_state[NOW] >= D_INCONSISTENT) {
+			if (skip)
+				drbd_info(peer_device, "Resync of new storage suppressed with --assume-clean\n");
+			else
+				resync_after_online_grow(peer_device);
+		} else
+			set_bit(RESYNC_AFTER_NEG, &peer_device->flags);
+	}
+}
+
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device, *peer_device_it = NULL;
 	struct drbd_device *device;
 	struct p_sizes *p = pi->data;
-	struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
+	uint64_t p_size, p_usize, p_csize;
+	uint64_t my_usize, my_max_size, cur_size;
 	enum determine_dev_size dd = DS_UNCHANGED;
-	sector_t p_size, p_usize, p_csize, my_usize;
-	sector_t new_size, cur_size;
-	int ldsc = 0; /* local disk size changed */
+	bool should_send_sizes = false;
 	enum dds_flags ddsf;
+	unsigned int protocol_max_bio_size;
+	bool have_ldev = false;
+	bool have_mutex = false;
+	bool is_handshake;
+	int err;
+	u64 im;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
 		return config_unknown_volume(connection, pi);
 	device = peer_device->device;
-	cur_size = get_capacity(device->vdisk);
 
+	err = mutex_lock_interruptible(&connection->resource->conf_update);
+	if (err) {
+		drbd_err(connection, "Interrupted while waiting for conf_update\n");
+		goto out;
+	}
+	have_mutex = true;
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
 	p_size = be64_to_cpu(p->d_size);
 	p_usize = be64_to_cpu(p->u_size);
 	p_csize = be64_to_cpu(p->c_size);
 
-	/* just store the peer's disk size for now.
-	 * we still need to figure out whether we accept that. */
-	device->p_size = p_size;
+	peer_device->d_size = p_size;
+	peer_device->u_size = p_usize;
+	peer_device->c_size = p_csize;
+
+	/* Ignore "current" size for calculating "max" size. */
+	/* If it used to have a disk, but now is detached, don't revert back to zero. */
+	if (p_size)
+		peer_device->max_size = p_size;
+
+	cur_size = get_capacity(device->vdisk);
+	dynamic_drbd_dbg(device, "current_size: %llu\n", (unsigned long long)cur_size);
+	dynamic_drbd_dbg(peer_device, "c_size: %llu u_size: %llu d_size: %llu max_size: %llu\n",
+			(unsigned long long)p_csize,
+			(unsigned long long)p_usize,
+			(unsigned long long)p_size,
+			(unsigned long long)peer_device->max_size);
+
+	if ((p_size && p_csize > p_size) || (p_usize && p_csize > p_usize)) {
+		drbd_warn(peer_device, "Peer sent bogus sizes, disconnecting\n");
+		goto disconnect;
+	}
+
+	/* The protocol version limits how big requests can be.  In addition,
+	 * peers before protocol version 94 cannot split large requests into
+	 * multiple bios; their reported max_bio_size is a hard limit.
+	 */
+	protocol_max_bio_size = conn_max_bio_size(connection);
+	peer_device->q_limits.max_bio_size = min(be32_to_cpu(p->max_bio_size),
+						 protocol_max_bio_size);
+	ddsf = be16_to_cpu(p->dds_flags);
+	is_handshake = (peer_device->repl_state[NOW] == L_OFF);
+	set_bit(HAVE_SIZES, &peer_device->flags);
 
 	if (get_ldev(device)) {
+		sector_t new_size;
+
+		have_ldev = true;
+
 		rcu_read_lock();
 		my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
 		rcu_read_unlock();
 
-		warn_if_differ_considerably(device, "lower level device sizes",
-			   p_size, drbd_get_max_capacity(device->ldev));
-		warn_if_differ_considerably(device, "user requested size",
+		my_max_size = drbd_get_max_capacity(device, device->ldev, false);
+		dynamic_drbd_dbg(peer_device, "la_size: %llu my_usize: %llu my_max_size: %llu\n",
+			(unsigned long long)device->ldev->md.effective_size,
+			(unsigned long long)my_usize,
+			(unsigned long long)my_max_size);
+
+		if (peer_device->disk_state[NOW] > D_DISKLESS)
+			warn_if_differ_considerably(peer_device, "lower level device sizes",
+				   p_size, my_max_size);
+		warn_if_differ_considerably(peer_device, "user requested size",
 					    p_usize, my_usize);
 
-		/* if this is the first connect, or an otherwise expected
-		 * param exchange, choose the minimum */
-		if (device->state.conn == C_WF_REPORT_PARAMS)
+		if (is_handshake)
 			p_usize = min_not_zero(my_usize, p_usize);
 
+		if (p_usize == 0) {
+			/* Peer may reset usize to zero only if it has a backend.
+			 * Because a diskless node has no disk config,
+			 * and always sends zero. */
+			if (p_size == 0)
+				p_usize = my_usize;
+		}
+
+		new_size = drbd_new_dev_size(device, p_csize, p_usize, ddsf);
+
 		/* Never shrink a device with usable data during connect,
 		 * or "attach" on the peer.
 		 * But allow online shrinking if we are connected. */
-		new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
 		if (new_size < cur_size &&
-		    device->state.disk >= D_OUTDATED &&
-		    (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
-			drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
+		    device->disk_state[NOW] >= D_OUTDATED &&
+		    (peer_device->repl_state[NOW] < L_ESTABLISHED || peer_device->disk_state[NOW] == D_DISKLESS)) {
+			drbd_err(peer_device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
 					(unsigned long long)new_size, (unsigned long long)cur_size);
-			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
-			put_ldev(device);
-			return -EIO;
+			goto disconnect;
+		}
+
+		/* Disconnect, if we cannot grow to the peer's current size */
+		if (my_max_size < p_csize && !is_handshake) {
+			drbd_err(peer_device, "Peer's size larger than my maximum capacity (%llu < %llu sectors)\n",
+					(unsigned long long)my_max_size, (unsigned long long)p_csize);
+			goto disconnect;
 		}
 
 		if (my_usize != p_usize) {
-			struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+			struct disk_conf *old_disk_conf, *new_disk_conf;
 
 			new_disk_conf = kzalloc_obj(struct disk_conf);
 			if (!new_disk_conf) {
-				put_ldev(device);
-				return -ENOMEM;
+				err = -ENOMEM;
+				goto out;
 			}
 
-			mutex_lock(&connection->resource->conf_update);
 			old_disk_conf = device->ldev->disk_conf;
 			*new_disk_conf = *old_disk_conf;
 			new_disk_conf->disk_size = p_usize;
 
 			rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
-			mutex_unlock(&connection->resource->conf_update);
 			kvfree_rcu_mightsleep(old_disk_conf);
 
-			drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
-				 (unsigned long)p_usize, (unsigned long)my_usize);
+			drbd_info(peer_device, "Peer sets u_size to %llu sectors (old: %llu)\n",
+				 (unsigned long long)p_usize, (unsigned long long)my_usize);
+			/* Do not set should_send_sizes here. That might cause packet storms */
 		}
+	}
 
-		put_ldev(device);
+	if (connection->agreed_features & DRBD_FF_WSAME) {
+		struct o_qlim *qlim = p->qlim;
+
+		peer_device->q_limits.physical_block_size = be32_to_cpu(qlim->physical_block_size);
+		peer_device->q_limits.logical_block_size = be32_to_cpu(qlim->logical_block_size);
+		peer_device->q_limits.alignment_offset = be32_to_cpu(qlim->alignment_offset);
+		peer_device->q_limits.io_min = be32_to_cpu(qlim->io_min);
+		peer_device->q_limits.io_opt = be32_to_cpu(qlim->io_opt);
+	}
+
+	if (connection->agreed_features & DRBD_FF_BM_BLOCK_SHIFT) {
+		peer_device->bm_block_shift =
+			p->qlim->bm_block_shift_minus_12 + BM_BLOCK_SHIFT_4k;
+	} else {
+		int bbs = have_ldev ? bm_block_size(device->bitmap) : BM_BLOCK_SIZE_4k;
+		/* May work as long as this one is SyncTarget. May result in
+		 * funny never ending / repeating resyncs if the other guy is
+		 * SyncTarget, but unaware of bitmap granularity issues.
+		 */
+		if (bbs != BM_BLOCK_SIZE_4k)
+			drbd_warn(peer_device,
+				"My bitmap granularity is %u. Upgrade this peer to make it aware.\n",
+				bbs);
+		peer_device->bm_block_shift = BM_BLOCK_SHIFT_4k;
 	}
 
-	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
 	/* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
 	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
 	   drbd_reconsider_queue_parameters(), we can be sure that after
-	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+	   drbd_determine_dev_size() no REQ_OP_DISCARDs are in the queue. */
+	if (have_ldev) {
+		enum dds_flags local_ddsf = ddsf;
+		drbd_reconsider_queue_parameters(device, device->ldev);
 
-	ddsf = be16_to_cpu(p->dds_flags);
-	if (get_ldev(device)) {
-		drbd_reconsider_queue_parameters(device, device->ldev, o);
-		dd = drbd_determine_dev_size(device, ddsf, NULL);
-		put_ldev(device);
-		if (dd == DS_ERROR)
-			return -EIO;
-		drbd_md_sync(device);
+		/* To support thinly provisioned nodes (partial resync) joining later,
+		   clear all bitmap slots, including the unused ones. */
+		if (device->ldev->md.effective_size == 0)
+			local_ddsf |= DDSF_NO_RESYNC;
+
+		dd = drbd_determine_dev_size(device, p_csize, local_ddsf, NULL);
+
+		if (dd == DS_GREW || dd == DS_SHRUNK)
+			should_send_sizes = true;
+
+		if (dd == DS_ERROR) {
+			err = -EIO;
+			goto out;
+		}
+		drbd_md_sync_if_dirty(device);
 	} else {
-		/*
-		 * I am diskless, need to accept the peer's *current* size.
-		 * I must NOT accept the peers backing disk size,
-		 * it may have been larger than mine all along...
+		uint64_t new_size = 0;
+
+		drbd_reconsider_queue_parameters(device, NULL);
+		/* In case I am diskless, need to accept the peer's *current* size.
 		 *
 		 * At this point, the peer knows more about my disk, or at
 		 * least about what we last agreed upon, than myself.
 		 * So if his c_size is less than his d_size, the most likely
-		 * reason is that *my* d_size was smaller last time we checked.
-		 *
-		 * However, if he sends a zero current size,
-		 * take his (user-capped or) backing disk size anyways.
+		 * reason is that *my* d_size was smaller last time we checked,
+		 * or some other peer does not (yet) have enough room.
 		 *
 		 * Unless of course he does not have a disk himself.
 		 * In which case we ignore this completely.
 		 */
-		sector_t new_size = p_csize ?: p_usize ?: p_size;
-		drbd_reconsider_queue_parameters(device, NULL, o);
+		new_size = p_csize;
+		new_size = min_not_zero(new_size, p_usize);
+		new_size = min_not_zero(new_size, p_size);
+
 		if (new_size == 0) {
 			/* Ignore, peer does not know nothing. */
 		} else if (new_size == cur_size) {
 			/* nothing to do */
 		} else if (cur_size != 0 && p_size == 0) {
-			drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
+			dynamic_drbd_dbg(peer_device,
+					"Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
 					(unsigned long long)new_size, (unsigned long long)cur_size);
-		} else if (new_size < cur_size && device->state.role == R_PRIMARY) {
-			drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
-					(unsigned long long)new_size, (unsigned long long)cur_size);
-			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
-			return -EIO;
+		} else if (new_size < cur_size && device->resource->role[NOW] == R_PRIMARY) {
+			drbd_err(peer_device,
+				"The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
+				(unsigned long long)new_size, (unsigned long long)cur_size);
+			goto disconnect;
+		} else if (drbd_other_peer_smaller(peer_device, new_size)) {
+			dynamic_drbd_dbg(peer_device,
+					"Ignored peer device size (peer:%llu sectors); other peer smaller!\n",
+					(unsigned long long)new_size);
 		} else {
 			/* I believe the peer, if
 			 *  - I don't have a current size myself
@@ -4009,1071 +6135,3893 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info
 			 *    and he has the only disk,
 			 *    which is larger than my current size
 			 */
+			should_send_sizes = true;
 			drbd_set_my_capacity(device, new_size);
 		}
 	}
 
-	if (get_ldev(device)) {
+	if (device->device_conf.max_bio_size > protocol_max_bio_size ||
+	    (connection->agreed_pro_version < 94 &&
+	     device->device_conf.max_bio_size > peer_device->q_limits.max_bio_size)) {
+		drbd_err(device, "Peer cannot deal with requests bigger than %u. "
+			 "Please reduce max_bio_size in the configuration.\n",
+			 peer_device->q_limits.max_bio_size);
+		goto disconnect;
+	}
+
+	if (have_ldev) {
 		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
 			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
-			ldsc = 1;
+			should_send_sizes = true;
 		}
 
+		drbd_setup_order_type(device, be16_to_cpu(p->queue_order_type));
+	}
+
+	cur_size = get_capacity(device->vdisk);
+
+	for_each_peer_device_ref(peer_device_it, im, device) {
+		struct drbd_connection *con_it = peer_device_it->connection;
+
+		/* drop cached max_size, if we already grew beyond it */
+		if (peer_device_it->max_size < cur_size)
+			peer_device_it->max_size = 0;
+
+		if (con_it->cstate[NOW] < C_CONNECTED)
+			continue;
+
+		/* Send size updates only if something relevant has changed.
+		 * TODO: only tell the sender thread to do so,
+		 * or we may end up in a distributed deadlock on congestion. */
+
+		if (should_send_sizes)
+			drbd_send_sizes(peer_device_it, p_usize, ddsf);
+	}
+
+	maybe_trigger_resync(device, get_neighbor_device(device, NEXT_HIGHER),
+					dd == DS_GREW, ddsf & DDSF_NO_RESYNC);
+	maybe_trigger_resync(device, get_neighbor_device(device, NEXT_LOWER),
+					dd == DS_GREW, ddsf & DDSF_NO_RESYNC);
+	err = 0;
+
+out:
+	if (have_ldev)
 		put_ldev(device);
+	if (have_mutex)
+		mutex_unlock(&connection->resource->conf_update);
+	return err;
+
+disconnect:
+	/* don't let a rejected peer confuse future handshakes with different peers. */
+	peer_device->max_size = 0;
+
+	if (connection->resource->remote_state_change)
+		set_bit(TWOPC_RECV_SIZES_ERR, &connection->resource->flags);
+	else
+		err = -EIO;
+	goto out;
+}
+
+static enum sync_strategy resolve_splitbrain_from_disk_states(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	enum drbd_disk_state peer_disk_state = peer_device->disk_state[NOW];
+	enum drbd_disk_state disk_state = device->disk_state[NOW];
+
+	return  disk_state <= D_UP_TO_DATE && peer_disk_state == D_UP_TO_DATE ? SYNC_TARGET_USE_BITMAP :
+		disk_state == D_UP_TO_DATE && peer_disk_state <= D_UP_TO_DATE ? SYNC_SOURCE_USE_BITMAP :
+		SPLIT_BRAIN_AUTO_RECOVER;
+}
+
+static void drbd_resync(struct drbd_peer_device *peer_device,
+			enum resync_reason reason)
+{
+	enum drbd_role peer_role = peer_device->connection->peer_role[NOW];
+	enum drbd_repl_state new_repl_state;
+	enum drbd_disk_state peer_disk_state;
+	enum sync_strategy strategy;
+	enum sync_rule rule;
+	int peer_node_id;
+	enum drbd_state_rv rv;
+	const char *tag = reason == AFTER_UNSTABLE ? "after-unstable" : "diskless-primary";
+
+	strategy = drbd_handshake(peer_device, &rule, &peer_node_id, reason == DISKLESS_PRIMARY);
+	if (strategy == SPLIT_BRAIN_AUTO_RECOVER && reason == AFTER_UNSTABLE)
+		strategy = resolve_splitbrain_from_disk_states(peer_device);
+
+	if (!is_strategy_determined(strategy)) {
+		drbd_info(peer_device, "Unexpected result of handshake() %s!\n", strategy_descriptor(strategy).name);
+		return;
 	}
 
-	if (device->state.conn > C_WF_REPORT_PARAMS) {
-		if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
-		    ldsc) {
-			/* we have different sizes, probably peer
-			 * needs to know my new size... */
-			drbd_send_sizes(peer_device, 0, ddsf);
-		}
-		if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
-		    (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
-			if (device->state.pdsk >= D_INCONSISTENT &&
-			    device->state.disk >= D_INCONSISTENT) {
-				if (ddsf & DDSF_NO_RESYNC)
-					drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
-				else
-					resync_after_online_grow(device);
-			} else
-				set_bit(RESYNC_AFTER_NEG, &device->flags);
+	peer_disk_state = peer_device->disk_state[NOW];
+	if (reason == DISKLESS_PRIMARY)
+		disk_states_to_strategy(peer_device, peer_disk_state, &strategy, rule, &peer_node_id);
+
+	new_repl_state = strategy_to_repl_state(peer_device, peer_role, strategy);
+	if (new_repl_state != L_ESTABLISHED) {
+		bitmap_mod_after_handshake(peer_device, strategy, peer_node_id);
+		drbd_info(peer_device, "Becoming %s %s\n", drbd_repl_str(new_repl_state),
+			  reason == AFTER_UNSTABLE ? "after unstable" : "because primary is diskless");
+	}
+
+	if (new_repl_state == L_ESTABLISHED && peer_disk_state >= D_CONSISTENT &&
+	    peer_device->device->disk_state[NOW] == D_OUTDATED) {
+		/* No resync with up-to-date peer -> I should be consistent or up-to-date as well.
+		   Note: Former unstable (but up-to-date) nodes become consistent for a short
+		   time after losing their primary peer. Therefore consider consistent here
+		   as well. */
+		drbd_info(peer_device, "Upgrading local disk to %s after unstable/weak (and no resync).\n",
+			  drbd_disk_str(peer_disk_state));
+		change_disk_state(peer_device->device, peer_disk_state, CS_VERBOSE, tag, NULL);
+		return;
+	}
+
+	rv = change_repl_state(peer_device, new_repl_state, CS_VERBOSE, tag);
+	if ((rv == SS_NOTHING_TO_DO || rv == SS_RESYNC_RUNNING) &&
+	    (new_repl_state == L_WF_BITMAP_S || new_repl_state == L_WF_BITMAP_T)) {
+		/* Those events might happen very quickly. In case we are still processing
+		   the previous resync we need to re-enter that state. Schedule sending of
+		   the bitmap here explicitly */
+		peer_device->resync_again++;
+		drbd_info(peer_device, "...postponing this until current resync finished\n");
+	}
+}
+
+static void update_bitmap_slot_of_peer(struct drbd_peer_device *peer_device, int node_id, u64 bitmap_uuid)
+{
+	struct drbd_device *device = peer_device->device;
+
+	if (peer_device->bitmap_uuids[node_id] && bitmap_uuid == 0) {
+		/* If we learn from a neighbor that it no longer has a bitmap
+		   against a third node, we need to deduce from that knowledge
+		   that in the other direction the bitmap was cleared as well.
+		 */
+		struct drbd_peer_device *peer_device2;
+
+		rcu_read_lock();
+		peer_device2 = peer_device_by_node_id(peer_device->device, node_id);
+		if (peer_device2) {
+			int node_id2 = peer_device->connection->peer_node_id;
+			peer_device2->bitmap_uuids[node_id2] = 0;
 		}
+		rcu_read_unlock();
 	}
 
-	return 0;
+	if (node_id != device->resource->res_opts.node_id && bitmap_uuid != -1 && get_ldev(device)) {
+		_drbd_uuid_push_history(device, bitmap_uuid);
+		put_ldev(device);
+	}
+	peer_device->bitmap_uuids[node_id] = bitmap_uuid;
 }
 
-static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
+static void propagate_skip_initial_to_diskless(struct drbd_device *device)
 {
 	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_uuids *p = pi->data;
-	u64 *p_uuid;
-	int i, updated_uuids = 0;
-
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return config_unknown_volume(connection, pi);
-	device = peer_device->device;
+	u64 im;
 
-	p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
-	if (!p_uuid)
-		return false;
+	for_each_peer_device_ref(peer_device, im, device) {
+		if (peer_device->disk_state[NOW] == D_DISKLESS)
+			drbd_send_uuids(peer_device, UUID_FLAG_SKIP_INITIAL_SYNC, 0);
+	}
+}
 
-	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
-		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+static int __receive_uuids(struct drbd_peer_device *peer_device, u64 node_mask)
+{
+	enum drbd_repl_state repl_state = peer_device->repl_state[NOW];
+	struct drbd_device *device = peer_device->device;
+	struct drbd_resource *resource = device->resource;
+	int updated_uuids = 0, err = 0;
+	bool bad_server, uuid_match;
+	struct net_conf *nc;
+	bool two_primaries_allowed;
 
-	kfree(device->p_uuid);
-	device->p_uuid = p_uuid;
+	uuid_match =
+		(device->exposed_data_uuid & ~UUID_PRIMARY) ==
+		(peer_device->current_uuid & ~UUID_PRIMARY);
+	bad_server =
+		repl_state < L_ESTABLISHED &&
+		device->disk_state[NOW] < D_INCONSISTENT &&
+		device->resource->role[NOW] == R_PRIMARY && !uuid_match;
 
-	if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
-	    device->state.disk < D_INCONSISTENT &&
-	    device->state.role == R_PRIMARY &&
-	    (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+	if (peer_device->connection->agreed_pro_version < 110 && bad_server) {
 		drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
-		    (unsigned long long)device->ed_uuid);
-		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		    (unsigned long long)device->exposed_data_uuid);
+		change_cstate(peer_device->connection, C_DISCONNECTING, CS_HARD);
 		return -EIO;
 	}
 
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->transport.net_conf);
+	two_primaries_allowed = nc && nc->two_primaries;
+	rcu_read_unlock();
+
 	if (get_ldev(device)) {
-		int skip_initial_sync =
-			device->state.conn == C_CONNECTED &&
+		bool skip_initial_sync =
+			repl_state == L_ESTABLISHED &&
 			peer_device->connection->agreed_pro_version >= 90 &&
-			device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
-			(p_uuid[UI_FLAGS] & 8);
+			drbd_current_uuid(device) == UUID_JUST_CREATED &&
+			(peer_device->uuid_flags & UUID_FLAG_SKIP_INITIAL_SYNC);
 		if (skip_initial_sync) {
+			unsigned long irq_flags;
+
 			drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
-			drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+			drbd_bitmap_io(device, &drbd_bmio_clear_all_n_write,
 					"clear_n_write from receive_uuids",
-					BM_LOCKED_TEST_ALLOWED, NULL);
-			_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
-			_drbd_uuid_set(device, UI_BITMAP, 0);
-			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
-					CS_VERBOSE, NULL);
-			drbd_md_sync(device);
+					BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK, NULL);
+			_drbd_uuid_set_current(device, peer_device->current_uuid);
+			peer_device->comm_current_uuid = peer_device->current_uuid;
+			peer_device->comm_uuid_flags = peer_device->uuid_flags;
+			peer_device->comm_bitmap_uuid = 0;
+			_drbd_uuid_set_bitmap(peer_device, 0);
+			begin_state_change(device->resource, &irq_flags, CS_VERBOSE);
+			__change_disk_state(device, D_UP_TO_DATE);
+			__change_peer_disk_state(peer_device, D_UP_TO_DATE);
+			end_state_change(device->resource, &irq_flags, "skip-initial-sync");
 			updated_uuids = 1;
+			propagate_skip_initial_to_diskless(device);
 		}
+
+		if (peer_device->uuid_flags & UUID_FLAG_NEW_DATAGEN) {
+			drbd_warn(peer_device, "received new current UUID: %016llX "
+				  "weak_nodes=%016llX\n", peer_device->current_uuid, node_mask);
+			drbd_uuid_received_new_current(peer_device, peer_device->current_uuid, node_mask);
+		}
+
+		drbd_uuid_detect_finished_resyncs(peer_device);
+
+		drbd_md_sync_if_dirty(device);
 		put_ldev(device);
-	} else if (device->state.disk < D_INCONSISTENT &&
-		   device->state.role == R_PRIMARY) {
-		/* I am a diskless primary, the peer just created a new current UUID
-		   for me. */
-		updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
-	}
-
-	/* Before we test for the disk state, we should wait until an eventually
-	   ongoing cluster wide state change is finished. That is important if
-	   we are primary and are detaching from our disk. We need to see the
-	   new disk state... */
-	mutex_lock(device->state_mutex);
-	mutex_unlock(device->state_mutex);
-	if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
-		updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+	} else if (device->disk_state[NOW] < D_INCONSISTENT && repl_state >= L_ESTABLISHED &&
+		   peer_device->disk_state[NOW] == D_UP_TO_DATE && !uuid_match &&
+		   (resource->role[NOW] == R_SECONDARY ||
+		    (two_primaries_allowed && test_and_clear_bit(NEW_CUR_UUID, &device->flags)))) {
+
+		write_lock_irq(&resource->state_rwlock);
+		if (resource->remote_state_change) {
+			drbd_info(peer_device, "Delaying update of exposed data uuid\n");
+			device->next_exposed_data_uuid = peer_device->current_uuid;
+		} else {
+			updated_uuids =
+				drbd_uuid_set_exposed(device, peer_device->current_uuid, false);
+		}
+		write_unlock_irq(&resource->state_rwlock);
+
+	}
+
+	if (device->disk_state[NOW] == D_DISKLESS && uuid_match &&
+	    peer_device->disk_state[NOW] == D_CONSISTENT) {
+		drbd_info(peer_device, "Peer is on same UUID now\n");
+		change_peer_disk_state(peer_device, D_UP_TO_DATE, CS_VERBOSE, "receive-uuids");
+	}
 
 	if (updated_uuids)
-		drbd_print_uuids(device, "receiver updated UUIDs to");
+		drbd_print_uuids(peer_device, "receiver updated UUIDs to");
 
-	return 0;
+	peer_device->uuid_node_mask = node_mask;
+
+	if ((repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) &&
+	    !(peer_device->uuid_flags & UUID_FLAG_STABLE) &&
+	    !drbd_stable_sync_source_present(peer_device, NOW))
+		set_bit(UNSTABLE_RESYNC, &peer_device->flags);
+
+	/* send notification in case UUID flags have changed */
+	drbd_broadcast_peer_device_state(peer_device);
+
+	return err;
 }
 
-/**
- * convert_state() - Converts the peer's view of the cluster state to our point of view
- * @ps:		The state as seen by the peer.
- */
-static union drbd_state convert_state(union drbd_state ps)
+/* drbd 8.4 compat: decode the fixed-layout P_UUIDS packet.
+ * Protocol 8.x only knows a single peer, so its one bitmap UUID is
+ * stored in the slot indexed by our own node id, and an 8.x peer is
+ * always treated as stable (UUID_FLAG_STABLE is forced on below).
+ */
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
 {
-	union drbd_state ms;
-
-	static enum drbd_conns c_tab[] = {
-		[C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
-		[C_CONNECTED] = C_CONNECTED,
+	const int node_id = connection->resource->res_opts.node_id;
+	struct drbd_peer_device *peer_device;
+	struct p_uuids *p = pi->data;
+	int history_uuids, i;
 
-		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
-		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
-		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
-		[C_VERIFY_S]       = C_VERIFY_T,
-		[C_MASK]   = C_MASK,
-	};
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
 
-	ms.i = ps.i;
+	history_uuids = min_t(int, HISTORY_UUIDS_V08,
+			      ARRAY_SIZE(peer_device->history_uuids));
 
-	ms.conn = c_tab[ps.conn];
-	ms.peer = ps.role;
-	ms.role = ps.peer;
-	ms.pdsk = ps.disk;
-	ms.disk = ps.pdsk;
-	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+	peer_device->current_uuid = be64_to_cpu(p->current_uuid);
+	peer_device->bitmap_uuids[node_id] = be64_to_cpu(p->bitmap_uuid);
+	for (i = 0; i < history_uuids; i++)
+		peer_device->history_uuids[i] = be64_to_cpu(p->history_uuids[i]);
+	/* clear the history slots the 8.4 packet does not carry */
+	for (; i < ARRAY_SIZE(peer_device->history_uuids); i++)
+		peer_device->history_uuids[i] = 0;
+	peer_device->dirty_bits = be64_to_cpu(p->dirty_bits);
+	peer_device->uuid_flags = be64_to_cpu(p->uuid_flags) | UUID_FLAG_STABLE;
+	set_bit(UUIDS_RECEIVED, &peer_device->flags);
 
-	return ms;
+	return __receive_uuids(peer_device, 0);
 }
 
-static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+/* Decode the variable-length P_UUIDS110 packet (multi-peer protocol):
+ * a current UUID plus a mask that says which per-node bitmap-UUID
+ * slots follow in other_uuids[]; the remaining entries are history
+ * UUIDs.  Ends in the common __receive_uuids() handshake.
+ */
+static int receive_uuids110(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;
+	struct p_uuids110 *p = pi->data;
+	int bitmap_uuids, history_uuids, rest, i, pos, err;
+	u64 bitmap_uuids_mask, node_mask;
+	struct drbd_peer_md *peer_md = NULL;
 	struct drbd_device *device;
-	struct p_req_state *p = pi->data;
-	union drbd_state mask, val;
-	enum drbd_state_rv rv;
+	int not_allocated = -1;
+
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
-		return -EIO;
+		return config_unknown_volume(connection, pi);
+
 	device = peer_device->device;
+	/* reject bitmap slots for node ids beyond DRBD_PEERS_MAX */
+	bitmap_uuids_mask = be64_to_cpu(p->bitmap_uuids_mask);
+	if (bitmap_uuids_mask & ~(NODE_MASK(DRBD_PEERS_MAX) - 1))
+		return -EIO;
+	bitmap_uuids = hweight64(bitmap_uuids_mask);
+
+	/* packet must at least carry the advertised bitmap UUIDs */
+	if (pi->size / sizeof(p->other_uuids[0]) < bitmap_uuids)
+		return -EIO;
+	history_uuids = pi->size / sizeof(p->other_uuids[0]) - bitmap_uuids;
+	if (history_uuids > ARRAY_SIZE(peer_device->history_uuids))
+		history_uuids = ARRAY_SIZE(peer_device->history_uuids);
 
-	mask.i = be32_to_cpu(p->mask);
-	val.i = be32_to_cpu(p->val);
+	err = drbd_recv_into(connection, p->other_uuids,
+			     (bitmap_uuids + history_uuids) *
+			     sizeof(p->other_uuids[0]));
+	if (err)
+		return err;
 
-	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
-	    mutex_is_locked(device->state_mutex)) {
-		drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
-		return 0;
+	/* drain history entries we could not store locally */
+	rest = pi->size - (bitmap_uuids + history_uuids) * sizeof(p->other_uuids[0]);
+	if (rest) {
+		err = ignore_remaining_packet(connection, rest);
+		if (err)
+			return err;
 	}
 
-	mask = convert_state(mask);
-	val = convert_state(val);
+	if (get_ldev(device)) {
+		peer_md = device->ldev->md.peers;
+		spin_lock_irq(&device->ldev->md.uuid_lock);
+	}
 
-	rv = drbd_change_state(device, CS_VERBOSE, mask, val);
-	drbd_send_sr_reply(peer_device, rv);
+	if (device->resource->role[NOW] != R_PRIMARY ||
+	    device->disk_state[NOW] != D_DISKLESS ||
+	    (peer_device->current_uuid & ~UUID_PRIMARY) !=
+						(device->exposed_data_uuid & ~UUID_PRIMARY) ||
+	    (peer_device->comm_current_uuid & ~UUID_PRIMARY) !=
+						(device->exposed_data_uuid & ~UUID_PRIMARY))
+		peer_device->current_uuid = be64_to_cpu(p->current_uuid);
 
-	drbd_md_sync(device);
+	peer_device->dirty_bits = be64_to_cpu(p->dirty_bits);
+	peer_device->uuid_flags = be64_to_cpu(p->uuid_flags);
+	if (peer_device->uuid_flags & UUID_FLAG_HAS_UNALLOC) {
+		not_allocated = peer_device->uuid_flags >> UUID_FLAG_UNALLOC_SHIFT;
+		peer_device->uuid_flags &= ~UUID_FLAG_UNALLOC_MASK;
+	}
 
-	return 0;
-}
+	pos = 0;
+	for (i = 0; i < ARRAY_SIZE(peer_device->bitmap_uuids); i++) {
+		u64 bitmap_uuid;
 
-static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
-{
-	struct p_req_state *p = pi->data;
-	union drbd_state mask, val;
-	enum drbd_state_rv rv;
+		if (bitmap_uuids_mask & NODE_MASK(i)) {
+			bitmap_uuid = be64_to_cpu(p->other_uuids[pos++]);
 
-	mask.i = be32_to_cpu(p->mask);
-	val.i = be32_to_cpu(p->val);
+			if (peer_md && !(peer_md[i].flags & MDF_HAVE_BITMAP) &&
+			    i != not_allocated)
+				peer_md[i].flags |= MDF_NODE_EXISTS;
+		} else {
+			bitmap_uuid = -1;
+		}
 
-	if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
-	    mutex_is_locked(&connection->cstate_mutex)) {
-		conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
-		return 0;
+		update_bitmap_slot_of_peer(peer_device, i, bitmap_uuid);
 	}
 
-	mask = convert_state(mask);
-	val = convert_state(val);
-
-	rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
-	conn_send_sr_reply(connection, rv);
+	for (i = 0; i < history_uuids; i++)
+		peer_device->history_uuids[i] = be64_to_cpu(p->other_uuids[pos++]);
+	while (i < ARRAY_SIZE(peer_device->history_uuids))
+		peer_device->history_uuids[i++] = 0;
+	set_bit(UUIDS_RECEIVED, &peer_device->flags);
+	if (peer_md) {
+		spin_unlock_irq(&device->ldev->md.uuid_lock);
+		put_ldev(device);
+	}
 
-	return 0;
-}
+	node_mask = be64_to_cpu(p->node_mask);
 
-static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
-{
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_state *p = pi->data;
-	union drbd_state os, ns, peer_state;
-	enum drbd_disk_state real_peer_disk;
-	enum chg_state_flags cs_flags;
-	int rv;
+	if (peer_device->connection->peer_role[NOW] == R_PRIMARY &&
+	    peer_device->uuid_flags & UUID_FLAG_STABLE)
+		check_resync_source(device, node_mask);
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return config_unknown_volume(connection, pi);
-	device = peer_device->device;
+	err = __receive_uuids(peer_device, node_mask);
 
-	peer_state.i = be32_to_cpu(p->state);
+	if (!test_bit(RECONCILIATION_RESYNC, &peer_device->flags)) {
+		if (peer_device->uuid_flags & UUID_FLAG_GOT_STABLE) {
+			/* NOTE(review): shadows the outer "device"; both refer
+			 * to peer_device->device. */
+			struct drbd_device *device = peer_device->device;
 
-	real_peer_disk = peer_state.disk;
-	if (peer_state.disk == D_NEGOTIATING) {
-		real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
-		drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
-	}
+			if (peer_device->repl_state[NOW] == L_ESTABLISHED &&
+			    drbd_device_stable(device, NULL) && get_ldev(device)) {
+				drbd_send_uuids(peer_device, UUID_FLAG_RESYNC, 0);
+				drbd_resync(peer_device, AFTER_UNSTABLE);
+				put_ldev(device);
+			}
+		}
 
-	spin_lock_irq(&device->resource->req_lock);
- retry:
-	os = ns = drbd_read_state(device);
-	spin_unlock_irq(&device->resource->req_lock);
+		if (peer_device->uuid_flags & UUID_FLAG_RESYNC) {
+			if (get_ldev(device)) {
+				bool dp = peer_device->uuid_flags & UUID_FLAG_DISKLESS_PRIMARY;
+				drbd_resync(peer_device, dp ? DISKLESS_PRIMARY : AFTER_UNSTABLE);
+				put_ldev(device);
+			}
+		}
+	}
 
-	/* If some other part of the code (ack_receiver thread, timeout)
-	 * already decided to close the connection again,
-	 * we must not "re-establish" it here. */
-	if (os.conn <= C_TEAR_DOWN)
-		return -ECONNRESET;
+	return err;
+}
 
-	/* If this is the "end of sync" confirmation, usually the peer disk
-	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
-	 * set) resync started in PausedSyncT, or if the timing of pause-/
-	 * unpause-sync events has been "just right", the peer disk may
-	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
-	 */
-	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
-	    real_peer_disk == D_UP_TO_DATE &&
-	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
-		/* If we are (becoming) SyncSource, but peer is still in sync
-		 * preparation, ignore its uptodate-ness to avoid flapping, it
-		 * will change to inconsistent once the peer reaches active
-		 * syncing states.
-		 * It may have changed syncer-paused flags, however, so we
-		 * cannot ignore this completely. */
-		if (peer_state.conn > C_CONNECTED &&
-		    peer_state.conn < C_SYNC_SOURCE)
-			real_peer_disk = D_INCONSISTENT;
+/**
+ * check_resync_source() - Abort resync if the source is weak
+ * @device: The device to check
+ * @weak_nodes: Mask of currently weak nodes in the cluster
+ *
+ * If a primary loses connection to a SYNC_SOURCE node from us, then we
+ * need to abort that resync. Why?
+ *
+ * When the primary sends a write, we get that and write that as well. With
+ * the peer_ack packet, we will set that as out-of-sync towards the sync
+ * source node.
+ * When the resync process finds such bits, we request outdated
+ * data from the sync source!
+ * We are stopping the resync from such an outdated source here and waiting
+ * until all the resync activity has drained (P_RS_DATA_REPLY packets).
+ */
+static void check_resync_source(struct drbd_device *device, u64 weak_nodes)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_connection *connection;
 
-		/* if peer_state changes to connected at the same time,
-		 * it explicitly notifies us that it finished resync.
-		 * Maybe we should finish it up, too? */
-		else if (os.conn >= C_SYNC_SOURCE &&
-			 peer_state.conn == C_CONNECTED) {
-			if (drbd_bm_total_weight(device) <= device->rs_failed)
-				drbd_resync_finished(peer_device);
-			return 0;
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		enum drbd_repl_state repl_state = peer_device->repl_state[NOW];
+		if ((repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T) &&
+		    NODE_MASK(peer_device->node_id) & weak_nodes) {
+			rcu_read_unlock();
+			goto abort;
 		}
 	}
+	rcu_read_unlock();
+	return;
+abort:
+	/* NOTE(review): peer_device is dereferenced after rcu_read_unlock();
+	 * this assumes peer devices cannot go away while the receiver is
+	 * running - TODO confirm. */
+	connection = peer_device->connection;
+	drbd_info(peer_device, "My sync source became a weak node, aborting resync!\n");
+	change_repl_state(peer_device, L_ESTABLISHED, CS_VERBOSE, "abort-resync");
+	drbd_flush_workqueue(&connection->sender_work);
+	drbd_cancel_conflicting_resync_requests(peer_device);
 
-	/* explicit verify finished notification, stop sector reached. */
-	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
-	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
-		ov_out_of_sync_print(peer_device);
-		drbd_resync_finished(peer_device);
-		return 0;
-	}
+	/* wait for in-flight resync requests/replies to drain (or for the
+	 * repl state to fall back to at most L_ESTABLISHED) */
+	wait_event_interruptible(connection->ee_wait,
+				 peer_device->repl_state[NOW] <= L_ESTABLISHED ||
+				 atomic_read(&connection->backing_ee_cnt) == 0);
+	wait_event_interruptible(device->misc_wait,
+				 peer_device->repl_state[NOW] <= L_ESTABLISHED ||
+				 atomic_read(&peer_device->rs_pending_cnt) == 0);
 
-	/* peer says his disk is inconsistent, while we think it is uptodate,
-	 * and this happens while the peer still thinks we have a sync going on,
-	 * but we think we are already done with the sync.
-	 * We ignore this to avoid flapping pdsk.
-	 * This should not happen, if the peer is a recent version of drbd. */
-	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
-	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
-		real_peer_disk = D_UP_TO_DATE;
+	peer_device->rs_total  = 0;
+	peer_device->rs_failed = 0;
+	peer_device->rs_paused = 0;
+}
 
-	if (ns.conn == C_WF_REPORT_PARAMS)
-		ns.conn = C_CONNECTED;
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @peer_state:	The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state peer_state)
+{
+	union drbd_state state;
 
-	if (peer_state.conn == C_AHEAD)
-		ns.conn = C_BEHIND;
+	/* Directional states are mirrored: the peer's source side is our
+	 * target side and vice versa. */
+	static unsigned int c_tab[] = {
+		[L_OFF] = L_OFF,
+		[L_ESTABLISHED] = L_ESTABLISHED,
 
-	/* TODO:
-	 * if (primary and diskless and peer uuid != effective uuid)
-	 *     abort attach on peer;
-	 *
-	 * If this node does not have good data, was already connected, but
-	 * the peer did a late attach only now, trying to "negotiate" with me,
-	 * AND I am currently Primary, possibly frozen, with some specific
-	 * "effective" uuid, this should never be reached, really, because
-	 * we first send the uuids, then the current state.
-	 *
-	 * In this scenario, we already dropped the connection hard
-	 * when we received the unsuitable uuids (receive_uuids().
-	 *
-	 * Should we want to change this, that is: not drop the connection in
-	 * receive_uuids() already, then we would need to add a branch here
-	 * that aborts the attach of "unsuitable uuids" on the peer in case
-	 * this node is currently Diskless Primary.
-	 */
+		[L_STARTING_SYNC_S] = L_STARTING_SYNC_T,
+		[L_STARTING_SYNC_T] = L_STARTING_SYNC_S,
+		[L_WF_BITMAP_S] = L_WF_BITMAP_T,
+		[L_WF_BITMAP_T] = L_WF_BITMAP_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_CONNECTING] = C_CONNECTING,
+		[L_VERIFY_S] = L_VERIFY_T,
+		[C_MASK] = C_MASK,
+	};
 
-	if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
-	    get_ldev_if_state(device, D_NEGOTIATING)) {
-		int cr; /* consider resync */
+	state.i = peer_state.i;
 
-		/* if we established a new connection */
-		cr  = (os.conn < C_CONNECTED);
-		/* if we had an established connection
-		 * and one of the nodes newly attaches a disk */
-		cr |= (os.conn == C_CONNECTED &&
-		       (peer_state.disk == D_NEGOTIATING ||
-			os.disk == D_NEGOTIATING));
-		/* if we have both been inconsistent, and the peer has been
-		 * forced to be UpToDate with --force */
-		cr |= test_bit(CONSIDER_RESYNC, &device->flags);
-		/* if we had been plain connected, and the admin requested to
-		 * start a sync by "invalidate" or "invalidate-remote" */
-		cr |= (os.conn == C_CONNECTED &&
-				(peer_state.conn >= C_STARTING_SYNC_S &&
-				 peer_state.conn <= C_WF_BITMAP_T));
+	/* role/peer and disk/pdsk swap: the peer's "me" is our "peer" */
+	state.conn = c_tab[peer_state.conn];
+	state.peer = peer_state.role;
+	state.role = peer_state.peer;
+	state.pdsk = peer_state.disk;
+	state.disk = peer_state.pdsk;
+	state.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
 
-		if (cr)
-			ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
+	return state;
+}
 
-		put_ldev(device);
-		if (ns.conn == C_MASK) {
-			ns.conn = C_CONNECTED;
-			if (device->state.disk == D_NEGOTIATING) {
-				drbd_force_state(device, NS(disk, D_FAILED));
-			} else if (peer_state.disk == D_NEGOTIATING) {
-				drbd_err(device, "Disk attach process on the peer node was aborted.\n");
-				peer_state.disk = D_DISKLESS;
-				real_peer_disk = D_DISKLESS;
-			} else {
-				if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
-					return -EIO;
-				D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
-				conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
-				return -EIO;
-			}
-		}
-	}
+/* Apply the connection/resource-level parts of a remote state change
+ * request.  Each handled field is consumed by clearing it from the
+ * mask (the "^= -1" flips all bits of that mask field); any bits left
+ * in mask.i afterwards were not understood and are rejected. */
+static enum drbd_state_rv
+__change_connection_state(struct drbd_connection *connection,
+			  union drbd_state mask, union drbd_state val,
+			  enum chg_state_flags flags)
+{
+	struct drbd_resource *resource = connection->resource;
 
-	spin_lock_irq(&device->resource->req_lock);
-	if (os.i != drbd_read_state(device).i)
-		goto retry;
-	clear_bit(CONSIDER_RESYNC, &device->flags);
-	ns.peer = peer_state.role;
-	ns.pdsk = real_peer_disk;
-	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
-	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
-		ns.disk = device->new_state_tmp.disk;
-	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
-	if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
-	    test_bit(NEW_CUR_UUID, &device->flags)) {
-		/* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
-		   for temporal network outages! */
-		spin_unlock_irq(&device->resource->req_lock);
-		drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
-		tl_clear(peer_device->connection);
-		drbd_uuid_new_current(device);
-		clear_bit(NEW_CUR_UUID, &device->flags);
-		conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
-		return -EIO;
+	if (mask.role) {
+		/* not allowed */
+	}
+	if (mask.susp) {
+		mask.susp ^= -1;
+		__change_io_susp_user(resource, val.susp);
+	}
+	if (mask.susp_nod) {
+		mask.susp_nod ^= -1;
+		__change_io_susp_no_data(resource, val.susp_nod);
+	}
+	if (mask.susp_fen) {
+		mask.susp_fen ^= -1;
+		__change_io_susp_fencing(connection, val.susp_fen);
+	}
+	if (mask.disk) {
+		/* Handled in __change_peer_device_state(). */
+		mask.disk ^= -1;
 	}
-	rv = _drbd_set_state(device, ns, cs_flags, NULL);
-	ns = drbd_read_state(device);
-	spin_unlock_irq(&device->resource->req_lock);
+	if (mask.conn) {
+		mask.conn ^= -1;
+		__change_cstate(connection,
+				min_t(enum drbd_conn_state, val.conn, C_CONNECTED));
+	}
+	if (mask.pdsk) {
+		/* Handled in __change_peer_device_state(). */
+		mask.pdsk ^= -1;
+	}
+	if (mask.peer) {
+		mask.peer ^= -1;
+		__change_peer_role(connection, val.peer);
+	}
+	if (mask.i) {
+		drbd_info(connection, "Remote state change: request %u/%u not "
+		"understood\n", mask.i, val.i & mask.i);
+		return SS_NOT_SUPPORTED;
+	}
+	return SS_SUCCESS;
+}
 
-	if (rv < SS_SUCCESS) {
-		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
-		return -EIO;
+/* Apply the per-peer-device parts of a remote state change request.
+ * Same mask-consumption protocol as __change_connection_state():
+ * handled fields are cleared, leftovers are rejected. */
+static enum drbd_state_rv
+__change_peer_device_state(struct drbd_peer_device *peer_device,
+			   union drbd_state mask, union drbd_state val)
+{
+	struct drbd_device *device = peer_device->device;
+
+	if (mask.peer) {
+		/* Handled in __change_connection_state(). */
+		mask.peer ^= -1;
+	}
+	if (mask.disk) {
+		mask.disk ^= -1;
+		__change_disk_state(device, val.disk);
 	}
 
-	if (os.conn > C_WF_REPORT_PARAMS) {
-		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
-		    peer_state.disk != D_NEGOTIATING ) {
-			/* we want resync, peer has not yet decided to sync... */
-			/* Nowadays only used when forcing a node into primary role and
-			   setting its disk to UpToDate with that */
-			drbd_send_uuids(peer_device);
-			drbd_send_current_state(peer_device);
-		}
+	if (mask.conn) {
+		mask.conn ^= -1;
+		__change_repl_state(peer_device,
+				max_t(enum drbd_repl_state, val.conn, L_OFF));
 	}
+	if (mask.pdsk) {
+		mask.pdsk ^= -1;
+		__change_peer_disk_state(peer_device, val.pdsk);
+	}
+	if (mask.user_isp) {
+		mask.user_isp ^= -1;
+		__change_resync_susp_user(peer_device, val.user_isp);
+	}
+	if (mask.peer_isp) {
+		mask.peer_isp ^= -1;
+		__change_resync_susp_peer(peer_device, val.peer_isp);
+	}
+	if (mask.aftr_isp) {
+		mask.aftr_isp ^= -1;
+		__change_resync_susp_dependency(peer_device, val.aftr_isp);
+	}
+	if (mask.i) {
+		drbd_info(peer_device, "Remote state change: request %u/%u not "
+		"understood\n", mask.i, val.i & mask.i);
+		return SS_NOT_SUPPORTED;
+	}
+	return SS_SUCCESS;
+}
 
-	clear_bit(DISCARD_MY_DATA, &device->flags);
+/* For disconnects: drop a requested transition to D_OUTDATED from the
+ * mask when the affected disk state is already below D_OUTDATED -
+ * "outdating" it would actually upgrade the state. */
+static union drbd_state
+sanitize_outdate(struct drbd_peer_device *peer_device,
+		 union drbd_state mask,
+		 union drbd_state val)
+{
+	struct drbd_device *device = peer_device->device;
+	union drbd_state result_mask = mask;
 
-	drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
+	if (val.pdsk == D_OUTDATED && peer_device->disk_state[NEW] < D_OUTDATED)
+		result_mask.pdsk = 0;
+	if (val.disk == D_OUTDATED && device->disk_state[NEW] < D_OUTDATED)
+		result_mask.disk = 0;
 
-	return 0;
+	return result_mask;
 }
 
-static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+/* Warn about one holder (comm/pid) per device that is currently kept
+ * open; used to explain failed state changes (SS_PRIMARY_READER). */
+static void log_openers(struct drbd_resource *resource)
 {
-	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
-	struct p_rs_uuid *p = pi->data;
+	int vnr;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		struct opener *opener;
 
-	wait_event(device->misc_wait,
-		   device->state.conn == C_WF_SYNC_UUID ||
-		   device->state.conn == C_BEHIND ||
-		   device->state.conn < C_CONNECTED ||
-		   device->state.disk < D_NEGOTIATING);
+		spin_lock(&device->openers_lock);
+		opener = list_first_entry_or_null(&device->openers, struct opener, list);
+		if (opener)
+			drbd_warn(device, "Held open by %s(%d)\n", opener->comm, opener->pid);
+		spin_unlock(&device->openers_lock);
+	}
+	rcu_read_unlock();
+}
 
-	/* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
+/**
+ * change_connection_state()  -  change state of a connection and all its peer devices
+ * @connection: DRBD connection to operate on
+ * @state_change: Prepared state change
+ * @reply: Two phase commit reply
+ * @flags: State change flags
+ *
+ * Also changes the state of the peer devices' devices and of the resource.
+ * Cluster-wide state changes are not supported.
+ */
+static enum drbd_state_rv
+change_connection_state(struct drbd_connection *connection,
+			struct twopc_state_change *state_change,
+			struct twopc_reply *reply,
+			enum chg_state_flags flags)
+{
+	struct drbd_resource *resource = connection->resource;
+	long t = resource->res_opts.auto_promote_timeout * HZ / 10;
+	union drbd_state mask = state_change->mask;
+	union drbd_state val = state_change->val;
+	bool is_disconnect = false;
+	bool is_connect = false;
+	bool abort = flags & CS_ABORT;
+	struct drbd_peer_device *peer_device;
+	unsigned long irq_flags;
+	enum drbd_state_rv rv;
+	int vnr;
 
-	/* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
+	if (reply) {
+		is_disconnect = reply->is_disconnect;
+		is_connect = reply->is_connect;
+	} else if (mask.conn == conn_MASK) {
+		is_connect = val.conn == C_CONNECTED;
+		is_disconnect = val.conn == C_DISCONNECTING;
+	}
 
-	/* Here the _drbd_uuid_ functions are right, current should
-	   _not_ be rotated into the history */
-	if (get_ldev_if_state(device, D_NEGOTIATING)) {
-		_drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
-		_drbd_uuid_set(device, UI_BITMAP, 0UL);
+	mask = convert_state(mask);
+	val = convert_state(val);
 
-		drbd_print_uuids(device, "updated sync uuid");
-		drbd_start_resync(device, C_SYNC_TARGET);
+	if (is_connect && connection->agreed_pro_version >= 118) {
+		if (flags & CS_PREPARE)
+			conn_connect2(connection);
+		if (abort)
+			abort_connect(connection);
+	}
+retry:
+	begin_state_change(resource, &irq_flags, flags & ~CS_VERBOSE);
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		union drbd_state l_mask;
+		l_mask = is_disconnect ? sanitize_outdate(peer_device, mask, val) : mask;
+		rv = __change_peer_device_state(peer_device, l_mask, val);
+		if (rv < SS_SUCCESS)
+			goto fail;
+	}
+	rv = __change_connection_state(connection, mask, val, flags);
+	if (rv < SS_SUCCESS)
+		goto fail;
 
-		put_ldev(device);
-	} else
-		drbd_err(device, "Ignoring SyncUUID packet!\n");
+	/* If some primary node is no longer directly reachable, outdate
+	 * our own data. */
+	if (reply && !abort) {
+		u64 directly_reachable = directly_connected_nodes(resource, NEW) |
+			NODE_MASK(resource->res_opts.node_id);
+
+		if (reply->primary_nodes & ~directly_reachable)
+			__outdate_myself(resource);
+	}
+
+	if (is_connect && connection->agreed_pro_version >= 117)
+		apply_connect(connection, (flags & CS_PREPARED) && !abort);
+	rv = end_state_change(resource, &irq_flags, "remote");
+out:
+
+	if ((rv == SS_NO_UP_TO_DATE_DISK && resource->role[NOW] != R_PRIMARY) ||
+	    rv == SS_PRIMARY_READER) {
+		/* Most probably udev opened it read-only. That might happen
+		   if it was demoted very recently. Wait up to one second. */
+		t = wait_event_interruptible_timeout(resource->state_wait,
+						     drbd_open_ro_count(resource) == 0,
+						     t);
+		if (t > 0)
+			goto retry;
+	}
+
+	if (rv < SS_SUCCESS) {
+		drbd_err(resource, "State change failed: %s (%d)\n", drbd_set_st_err_str(rv), rv);
+		if (rv == SS_PRIMARY_READER)
+			log_openers(resource);
+	}
+
+	return rv;
+fail:
+	abort_state_change(resource, &irq_flags);
+	goto out;
 }
 
-/*
- * receive_bitmap_plain
+/**
+ * change_peer_device_state()  -  change state of a peer and its connection
+ * @peer_device: DRBD peer device
+ * @state_change: Prepared state change
+ * @flags: State change flags
  *
- * Return 0 when done, 1 when another iteration is needed, and a negative error
- * code upon failure.
+ * Also changes the state of the peer device's device and of the resource.
+ * Cluster-wide state changes are not supported.
  */
-static int
-receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
-		     unsigned long *p, struct bm_xfer_ctx *c)
+static enum drbd_state_rv
+change_peer_device_state(struct drbd_peer_device *peer_device,
+			 struct twopc_state_change *state_change,
+			 enum chg_state_flags flags)
 {
-	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
-				 drbd_header_size(peer_device->connection);
-	unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
-				       c->bm_words - c->word_offset);
-	unsigned int want = num_words * sizeof(*p);
-	int err;
+	struct drbd_connection *connection = peer_device->connection;
+	union drbd_state mask = state_change->mask;
+	union drbd_state val = state_change->val;
+	unsigned long irq_flags;
+	enum drbd_state_rv rv;
 
-	if (want != size) {
-		drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	/* peer-device and connection parts go into one state change */
+	begin_state_change(connection->resource, &irq_flags, flags);
+	rv = __change_peer_device_state(peer_device, mask, val);
+	if (rv < SS_SUCCESS)
+		goto fail;
+	rv = __change_connection_state(connection, mask, val, flags);
+	if (rv < SS_SUCCESS)
+		goto fail;
+	rv = end_state_change(connection->resource, &irq_flags, "remote");
+out:
+	return rv;
+fail:
+	abort_state_change(connection->resource, &irq_flags);
+	goto out;
+}
+
+/* Handle an old-style (protocol < 110, drbd 8.x compat) state change
+ * request; newer peers use the two-phase commit machinery instead. */
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct twopc_state_change *state_change = &resource->twopc.state_change;
+	struct drbd_peer_device *peer_device = NULL;
+	struct p_req_state *p = pi->data;
+	enum chg_state_flags flags = CS_VERBOSE | CS_LOCAL_ONLY | CS_TWOPC;
+	enum drbd_state_rv rv;
+	int vnr = -1;
+
+	if (!expect(connection, connection->agreed_pro_version < 110)) {
+		drbd_err(connection, "Packet %s not allowed in protocol version %d\n",
+			 drbd_packet_name(pi->cmd),
+			 connection->agreed_pro_version);
 		return -EIO;
 	}
-	if (want == 0)
+
+	state_change->mask.i = be32_to_cpu(p->mask);
+	state_change->val.i = be32_to_cpu(p->val);
+
+	/* P_STATE_CHG_REQ packets must have a valid vnr.  P_CONN_ST_CHG_REQ
+	 * packets have an undefined vnr. */
+	if (pi->cmd == P_STATE_CHG_REQ) {
+		peer_device = conn_peer_device(connection, pi->vnr);
+		if (!peer_device) {
+			const union drbd_state conn_mask = { .conn = conn_MASK };
+			const union drbd_state val_off = { .conn = L_OFF };
+
+			if (state_change->mask.i == conn_mask.i &&
+			    state_change->val.i == val_off.i) {
+				/* The peer removed this volume, we do not have it... */
+				drbd_send_sr_reply(connection, vnr, SS_NOTHING_TO_DO);
+				return 0;
+			}
+
+			return -EIO;
+		}
+		vnr = peer_device->device->vnr;
+	}
+
+	/* Only one remote state change may be in flight at a time. */
+	rv = SS_SUCCESS;
+	write_lock_irq(&resource->state_rwlock);
+	if (resource->remote_state_change)
+		rv = SS_CONCURRENT_ST_CHG;
+	else
+		resource->remote_state_change = true;
+	write_unlock_irq(&resource->state_rwlock);
+
+	if (rv != SS_SUCCESS) {
+		drbd_info(connection, "Rejecting concurrent remote state change\n");
+		drbd_send_sr_reply(connection, vnr, rv);
 		return 0;
-	err = drbd_recv_all(peer_device->connection, p, want);
-	if (err)
-		return err;
+	}
 
-	drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
+	/* Send the reply before carrying out the state change: this is needed
+	 * for connection state changes which close the network connection.  */
+	if (peer_device) {
+		rv = change_peer_device_state(peer_device, state_change, flags | CS_PREPARE);
+		drbd_send_sr_reply(connection, vnr, rv);
+		rv = change_peer_device_state(peer_device, state_change, flags | CS_PREPARED);
+		if (rv >= SS_SUCCESS)
+			drbd_md_sync_if_dirty(peer_device->device);
+	} else {
+		flags |= CS_IGN_OUTD_FAIL;
+		rv = change_connection_state(connection, state_change, NULL, flags | CS_PREPARE);
+		drbd_send_sr_reply(connection, vnr, rv);
+		change_connection_state(connection, state_change, NULL, flags | CS_PREPARED);
+	}
 
-	c->word_offset += num_words;
-	c->bit_offset = c->word_offset * BITS_PER_LONG;
-	if (c->bit_offset > c->bm_bits)
-		c->bit_offset = c->bm_bits;
+	write_lock_irq(&resource->state_rwlock);
+	resource->remote_state_change = false;
+	write_unlock_irq(&resource->state_rwlock);
+	wake_up_all(&resource->twopc_wait);
 
-	return 1;
+	return 0;
 }
 
-static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+/* Abort a prepared, not yet committed remote two-phase commit.
+ * NOTE(review): the caller seen here (twopc_timer_fn) holds
+ * state_rwlock when calling this - confirm for other call sites. */
+static void drbd_abort_twopc(struct drbd_resource *resource)
 {
-	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+	struct drbd_connection *connection;
+	int initiator_node_id;
+	bool is_connect;
+
+	initiator_node_id = resource->twopc_reply.initiator_node_id;
+	if (initiator_node_id != -1) {
+		connection = drbd_get_connection_by_node_id(resource, initiator_node_id);
+		is_connect = resource->twopc_reply.is_connect &&
+			resource->twopc_reply.target_node_id == resource->res_opts.node_id;
+		resource->remote_state_change = false;
+		resource->twopc_reply.initiator_node_id = -1;
+		resource->twopc_parent_nodes = 0;
+
+		if (connection) {
+			if (is_connect)
+				abort_connect(connection);
+			kref_put(&connection->kref, drbd_destroy_connection);
+			connection = NULL;
+		}
+
+		/* Aborting a prepared state change. Give up the state mutex! */
+		up(&resource->state_sem);
+	}
+
+	wake_up_all(&resource->twopc_wait);
 }
 
-static int dcbp_get_start(struct p_compressed_bm *p)
+/* Timer callback: a prepared two-phase commit was neither committed
+ * nor aborted in time.  While twopc work is still pending, re-arm the
+ * timer briefly; otherwise abort the prepared state change. */
+void twopc_timer_fn(struct timer_list *t)
 {
-	return (p->encoding & 0x80) != 0;
+	struct drbd_resource *resource = timer_container_of(resource, t, twopc_timer);
+	unsigned long irq_flags;
+
+	write_lock_irqsave(&resource->state_rwlock, irq_flags);
+	if (!test_bit(TWOPC_WORK_PENDING, &resource->flags)) {
+		drbd_err(resource, "Two-phase commit %u timeout\n",
+			   resource->twopc_reply.tid);
+		drbd_abort_twopc(resource);
+	} else {
+		mod_timer(&resource->twopc_timer, jiffies + HZ/10);
+	}
+	write_unlock_irqrestore(&resource->state_rwlock, irq_flags);
 }
 
-static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+/* Return true if any volume of @resource currently has a backing disk
+ * (disk state above D_DISKLESS). */
+bool drbd_have_local_disk(struct drbd_resource *resource)
 {
-	return (p->encoding >> 4) & 0x7;
+	struct drbd_device *device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (device->disk_state[NOW] > D_DISKLESS) {
+			rcu_read_unlock();
+			return true;
+		}
+	}
+	rcu_read_unlock();
+	return false;
+}
 
-/*
- * recv_bm_rle_bits
- *
- * Return 0 when done, 1 when another iteration is needed, and a negative error
- * code upon failure.
- */
-static int
-recv_bm_rle_bits(struct drbd_peer_device *peer_device,
-		struct p_compressed_bm *p,
-		 struct bm_xfer_ctx *c,
-		 unsigned int len)
+static enum drbd_state_rv
+far_away_change(struct drbd_connection *connection,
+		struct twopc_request *request,
+		struct twopc_reply *reply,
+		enum chg_state_flags flags)
 {
-	struct bitstream bs;
-	u64 look_ahead;
-	u64 rl;
-	u64 tmp;
-	unsigned long s = c->bit_offset;
-	unsigned long e;
-	int toggle = dcbp_get_start(p);
-	int have;
-	int bits;
+	struct drbd_resource *resource = connection->resource;
+	struct twopc_state_change *state_change = &resource->twopc.state_change;
+	u64 directly_reachable = directly_connected_nodes(resource, NOW) |
+		NODE_MASK(resource->res_opts.node_id);
+	union drbd_state mask = state_change->mask;
+	union drbd_state val = state_change->val;
+	int vnr = resource->twopc_reply.vnr;
+	struct drbd_device *device;
+	unsigned long irq_flags;
+	int iterate_vnr;
 
-	bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
 
-	bits = bitstream_get_bits(&bs, &look_ahead, 64);
-	if (bits < 0)
-		return -EIO;
+	if (flags & CS_PREPARE && mask.role == role_MASK && val.role == R_PRIMARY &&
+	    resource->role[NOW] == R_PRIMARY) {
+		struct net_conf *nc;
+		bool two_primaries_allowed = false;
 
-	for (have = bits; have > 0; s += rl, toggle = !toggle) {
-		bits = vli_decode_bits(&rl, look_ahead);
-		if (bits <= 0)
-			return -EIO;
+		rcu_read_lock();
+		nc = rcu_dereference(connection->transport.net_conf);
+		if (nc)
+			two_primaries_allowed = nc->two_primaries;
+		rcu_read_unlock();
+		if (!two_primaries_allowed)
+			return SS_TWO_PRIMARIES;
 
-		if (toggle) {
-			e = s + rl -1;
-			if (e >= c->bm_bits) {
-				drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
-				return -EIO;
+		/* A node further away wants to become primary. In case I am primary allow it only
+		 * when I am diskless. See also check_primaries_distances() in drbd_state.c
+		 */
+		if (drbd_have_local_disk(resource))
+			return SS_WEAKLY_CONNECTED;
+	}
+
+	begin_state_change(resource, &irq_flags, flags);
+	if (mask.i == 0 && val.i == 0 &&
+	    resource->role[NOW] == R_PRIMARY && vnr == -1) {
+		/* A node far away tests if there are primaries. I am the guy he is concerned
+		 * about... He learned about me in the CS_PREPARE phase. Since he is committing it
+		 * I know that he is outdated now...
+		 */
+		struct drbd_connection *affected_connection;
+		int initiator_node_id = resource->twopc_reply.initiator_node_id;
+
+		affected_connection = drbd_get_connection_by_node_id(resource, initiator_node_id);
+		if (affected_connection) {
+			__downgrade_peer_disk_states(affected_connection, D_OUTDATED);
+			kref_put(&affected_connection->kref, drbd_destroy_connection);
+		} else if (flags & CS_PREPARED) {
+			idr_for_each_entry(&resource->devices, device, iterate_vnr) {
+				struct drbd_peer_md *peer_md;
+
+				if (!get_ldev(device))
+					continue;
+
+				peer_md = &device->ldev->md.peers[initiator_node_id];
+				peer_md->flags |= MDF_PEER_OUTDATED;
+				put_ldev(device);
+				drbd_md_mark_dirty(device);
 			}
-			_drbd_bm_set_bits(peer_device->device, s, e);
 		}
+	}
 
-		if (have < bits) {
-			drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
-				have, bits, look_ahead,
-				(unsigned int)(bs.cur.b - p->code),
-				(unsigned int)bs.buf_len);
-			return -EIO;
-		}
-		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
-		if (likely(bits < 64))
-			look_ahead >>= bits;
-		else
-			look_ahead = 0;
-		have -= bits;
+	if (state_change->primary_nodes & ~directly_reachable &&
+	    !(request->flags & TWOPC_PRI_INCAPABLE))
+		__outdate_myself(resource);
 
-		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
-		if (bits < 0)
-			return -EIO;
-		look_ahead |= tmp << have;
-		have += bits;
+	idr_for_each_entry(&resource->devices, device, iterate_vnr) {
+		if (test_bit(OUTDATE_ON_2PC_COMMIT, &device->flags) &&
+		    device->disk_state[NEW] > D_OUTDATED)
+			__change_disk_state(device, D_OUTDATED);
 	}
 
-	c->bit_offset = s;
-	bm_xfer_ctx_bit_to_word_offset(c);
+	/* even if no outdate happens, CS_FORCE_RECALC might be set here */
+	return end_state_change(resource, &irq_flags, "far-away");
+}
+
+static void handle_neighbor_demotion(struct drbd_connection *connection,
+				     struct twopc_state_change *state_change,
+				     struct twopc_reply *reply)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_device *device;
+	int vnr;
+
+	if (reply->initiator_node_id != connection->peer_node_id ||
+	    connection->peer_role[NOW] != R_PRIMARY ||
+	    state_change->mask.role != role_MASK || state_change->val.role != R_SECONDARY)
+		return;
+
+	/* A directly connected neighbor that was primary demotes to secondary */
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		if (get_ldev(device)) {
+			drbd_bitmap_io(device, &drbd_bm_write, "peer demote",
+				       BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK, NULL);
+			put_ldev(device);
+		}
+		rcu_read_lock();
+		kref_put(&device->kref, drbd_destroy_device);
+	}
+	rcu_read_unlock();
+}
 
-	return (s != c->bm_bits);
+static void peer_device_init_connect_state(struct drbd_peer_device *peer_device)
+{
+	clear_bit(INITIAL_STATE_SENT, &peer_device->flags);
+	clear_bit(INITIAL_STATE_RECEIVED, &peer_device->flags);
+	clear_bit(HAVE_SIZES, &peer_device->flags);
+	clear_bit(UUIDS_RECEIVED, &peer_device->flags);
+	clear_bit(CURRENT_UUID_RECEIVED, &peer_device->flags);
+	clear_bit(PEER_QUORATE, &peer_device->flags);
+	peer_device->connect_state = (union drbd_state) {{ .disk = D_MASK }};
 }
 
-/*
- * decode_bitmap_c
+
+/**
+ * drbd_init_connect_state() - Prepare twopc that establishes the connection
+ * @connection:	The connection this is about
  *
- * Return 0 when done, 1 when another iteration is needed, and a negative error
- * code upon failure.
+ * After a transport implementation has established the lower-level aspects
+ * of a connection, DRBD executes a two-phase commit so that the membership
+ * information changes in a cluster-wide, consistent way. During that
+ * two-phase commit, DRBD exchanges the UUIDs, size information, and the
+ * initial state. A two-phase commit might be aborted, which needs to be
+ * retried. This function re-initializes the struct members for this. The
+ * callsites are at the beginning of a two-phase connect commit, active and
+ * passive side.
  */
-static int
-decode_bitmap_c(struct drbd_peer_device *peer_device,
-		struct p_compressed_bm *p,
-		struct bm_xfer_ctx *c,
-		unsigned int len)
+void drbd_init_connect_state(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		peer_device_init_connect_state(peer_device);
+	rcu_read_unlock();
+	clear_bit(CONN_HANDSHAKE_DISCONNECT, &connection->flags);
+	clear_bit(CONN_HANDSHAKE_RETRY, &connection->flags);
+	clear_bit(CONN_HANDSHAKE_READY, &connection->flags);
+}
+
+enum csc_rv {
+	CSC_CLEAR,
+	CSC_REJECT,
+	CSC_ABORT_LOCAL,
+	CSC_TID_MISS,
+	CSC_MATCH,
+};
+
+static enum csc_rv
+check_concurrent_transactions(struct drbd_resource *resource, struct twopc_reply *new_r)
 {
-	if (dcbp_get_code(p) == RLE_VLI_Bits)
-		return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+	struct twopc_reply *ongoing = &resource->twopc_reply;
 
-	/* other variants had been implemented for evaluation,
-	 * but have been dropped as this one turned out to be "best"
-	 * during all our tests. */
+	if (!resource->remote_state_change)
+		return CSC_CLEAR;
 
-	drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
-	conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
-	return -EIO;
+	if (new_r->initiator_node_id < ongoing->initiator_node_id) {
+		if (ongoing->initiator_node_id == resource->res_opts.node_id)
+			return CSC_ABORT_LOCAL;
+		else
+			return CSC_REJECT;
+	} else if (new_r->initiator_node_id > ongoing->initiator_node_id) {
+		return CSC_REJECT;
+	}
+	if (new_r->tid != ongoing->tid)
+		return CSC_TID_MISS;
+
+	return CSC_MATCH;
 }
 
-void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
-		const char *direction, struct bm_xfer_ctx *c)
+
+enum alt_rv {
+	ALT_LOCKED,
+	ALT_MATCH,
+	ALT_TIMEOUT,
+};
+
+static enum alt_rv when_done_lock(struct drbd_resource *resource, unsigned int for_tid)
 {
-	/* what would it take to transfer it "plaintext" */
-	unsigned int header_size = drbd_header_size(peer_device->connection);
-	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
-	unsigned int plain =
-		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
-		c->bm_words * sizeof(unsigned long);
-	unsigned int total = c->bytes[0] + c->bytes[1];
-	unsigned int r;
+	write_lock_irq(&resource->state_rwlock);
+	if (!resource->remote_state_change)
+		return ALT_LOCKED;
+	write_unlock_irq(&resource->state_rwlock);
+	if (resource->twopc_reply.tid == for_tid)
+		return ALT_MATCH;
 
-	/* total can not be zero. but just in case: */
-	if (total == 0)
-		return;
+	return ALT_TIMEOUT;
+}
+static enum alt_rv abort_local_transaction(struct drbd_connection *connection, unsigned int for_tid)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct net_conf *nc;
+	enum alt_rv rv;
+	long t;
 
-	/* don't report if not compressed */
-	if (total >= plain)
-		return;
+	rcu_read_lock();
+	nc = rcu_dereference(connection->transport.net_conf);
+	t = nc->ping_timeo * HZ/10 * 3 / 2;
+	rcu_read_unlock();
 
-	/* total < plain. check for overflow, still */
-	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
-		                    : (1000 * total / plain);
+	set_bit(TWOPC_ABORT_LOCAL, &resource->flags);
+	write_unlock_irq(&resource->state_rwlock);
+	wake_up_all(&resource->state_wait);
+	wait_event_timeout(resource->twopc_wait,
+			   (rv = when_done_lock(resource, for_tid)) != ALT_TIMEOUT, t);
+	clear_bit(TWOPC_ABORT_LOCAL, &resource->flags);
+	return rv;
+}
 
-	if (r > 1000)
-		r = 1000;
+static int receive_twopc(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct p_twopc_request *p = pi->data;
+	struct twopc_reply reply = {0};
 
-	r = 1000 - r;
-	drbd_info(peer_device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
-	     "total %u; compression: %u.%u%%\n",
-			direction,
-			c->bytes[1], c->packets[1],
-			c->bytes[0], c->packets[0],
-			total, r/10, r % 10);
+	reply.vnr = pi->vnr;
+	reply.tid = be32_to_cpu(p->tid);
+	if (connection->agreed_features & DRBD_FF_2PC_V2) {
+		reply.initiator_node_id = p->s8_initiator_node_id;
+		reply.target_node_id = p->s8_target_node_id;
+	} else {
+		reply.initiator_node_id = be32_to_cpu(p->u32_initiator_node_id);
+		reply.target_node_id = be32_to_cpu(p->u32_target_node_id);
+	}
+	reply.reachable_nodes = directly_connected_nodes(resource, NOW) |
+				NODE_MASK(resource->res_opts.node_id);
+
+	if (pi->cmd == P_TWOPC_PREPARE)
+		clear_bit(TWOPC_RECV_SIZES_ERR, &resource->flags);
+
+	process_twopc(connection, &reply, pi, jiffies);
+
+	return 0;
 }
 
-/* Since we are processing the bitfield from lower addresses to higher,
-   it does not matter if the process it in 32 bit chunks or 64 bit
-   chunks as long as it is little endian. (Understand it as byte stream,
-   beginning with the lowest byte...) If we would use big endian
-   we would need to process it from the highest address to the lowest,
-   in order to be agnostic to the 32 vs 64 bits issue.
+static void nested_twopc_abort(struct drbd_resource *resource, struct twopc_request *request)
+{
+	struct drbd_connection *connection;
+	u64 nodes_to_reach, reach_immediately, im;
 
-   returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+	read_lock_irq(&resource->state_rwlock);
+	nodes_to_reach = request->nodes_to_reach;
+	reach_immediately = directly_connected_nodes(resource, NOW) & nodes_to_reach;
+	nodes_to_reach &= ~(reach_immediately | NODE_MASK(resource->res_opts.node_id));
+	request->nodes_to_reach = nodes_to_reach;
+	read_unlock_irq(&resource->state_rwlock);
+
+	for_each_connection_ref(connection, im, resource) {
+		u64 mask = NODE_MASK(connection->peer_node_id);
+		if (reach_immediately & mask)
+			conn_send_twopc_request(connection, request);
+	}
+}
+
+static bool is_prepare(enum drbd_packet cmd)
+{
+	return cmd == P_TWOPC_PREP_RSZ || cmd == P_TWOPC_PREPARE;
+}
+
+
+enum determine_dev_size
+drbd_commit_size_change(struct drbd_device *device, struct resize_parms *rs, u64 nodes_to_reach)
 {
+	struct twopc_resize *tr = &device->resource->twopc.resize;
 	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct bm_xfer_ctx c;
-	int err;
+	enum determine_dev_size dd;
+	uint64_t my_usize;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		/* update cached sizes, relevant for the next handshake */
+		peer_device->c_size = tr->new_size;
+		peer_device->u_size = tr->user_size;
 
-	drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
-	/* you are supposed to send additional out-of-sync information
-	 * if you actually set bits during this phase */
+		if (peer_device->d_size)
+			peer_device->d_size = tr->new_size;
+		peer_device->max_size = tr->new_size;
+	}
+	rcu_read_unlock();
 
-	c = (struct bm_xfer_ctx) {
-		.bm_bits = drbd_bm_bits(device),
-		.bm_words = drbd_bm_words(device),
-	};
+	if (!get_ldev(device)) {
+		drbd_set_my_capacity(device, tr->new_size);
+		return DS_UNCHANGED; /* Not entirely true, but we are diskless... */
+	}
 
-	for(;;) {
-		if (pi->cmd == P_BITMAP)
-			err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
-		else if (pi->cmd == P_COMPRESSED_BITMAP) {
-			/* MAYBE: sanity check that we speak proto >= 90,
-			 * and the feature is enabled! */
-			struct p_compressed_bm *p = pi->data;
+	rcu_read_lock();
+	my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
 
-			if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
-				drbd_err(device, "ReportCBitmap packet too large\n");
-				err = -EIO;
-				goto out;
-			}
-			if (pi->size <= sizeof(*p)) {
-				drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
-				err = -EIO;
-				goto out;
-			}
-			err = drbd_recv_all(peer_device->connection, p, pi->size);
-			if (err)
-			       goto out;
-			err = decode_bitmap_c(peer_device, p, &c, pi->size);
-		} else {
-			drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
-			err = -EIO;
-			goto out;
-		}
+	if (my_usize != tr->user_size) {
+		struct disk_conf *old_disk_conf, *new_disk_conf;
 
-		c.packets[pi->cmd == P_BITMAP]++;
-		c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+		drbd_info(device, "New u_size %llu sectors\n",
+			  (unsigned long long)tr->user_size);
 
-		if (err <= 0) {
-			if (err < 0)
-				goto out;
-			break;
+		new_disk_conf = kzalloc_obj(struct disk_conf);
+		if (!new_disk_conf) {
+			device->ldev->disk_conf->disk_size = tr->user_size;
+			goto cont;
 		}
-		err = drbd_recv_header(peer_device->connection, pi);
-		if (err)
-			goto out;
+
+		old_disk_conf = device->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+		new_disk_conf->disk_size = tr->user_size;
+
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		kvfree_rcu_mightsleep(old_disk_conf);
 	}
+cont:
+	dd = drbd_determine_dev_size(device, tr->new_size, tr->dds_flags | DDSF_2PC, rs);
 
-	INFO_bm_xfer_stats(peer_device, "receive", &c);
+	if (dd == DS_GREW && !(tr->dds_flags & DDSF_NO_RESYNC)) {
+		struct drbd_resource *resource = device->resource;
+		const int my_node_id = resource->res_opts.node_id;
+		struct drbd_peer_device *peer_device;
+		u64 im;
 
-	if (device->state.conn == C_WF_BITMAP_T) {
-		enum drbd_state_rv rv;
+		for_each_peer_device_ref(peer_device, im, device) {
+			if (peer_device->repl_state[NOW] != L_ESTABLISHED ||
+			    peer_device->disk_state[NOW] < D_INCONSISTENT)
+				continue;
 
-		err = drbd_send_bitmap(device, peer_device);
-		if (err)
-			goto out;
-		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
-		rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
-		D_ASSERT(device, rv == SS_SUCCESS);
-	} else if (device->state.conn != C_WF_BITMAP_S) {
-		/* admin may have requested C_DISCONNECTING,
-		 * other threads may have noticed network errors */
-		drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
-		    drbd_conn_str(device->state.conn));
+			if (tr->diskful_primary_nodes) {
+				if (tr->diskful_primary_nodes & NODE_MASK(my_node_id)) {
+					enum drbd_repl_state resync;
+					if (tr->diskful_primary_nodes & NODE_MASK(peer_device->node_id)) {
+						/* peer is also primary */
+						resync = peer_device->node_id < my_node_id ?
+							L_SYNC_TARGET : L_SYNC_SOURCE;
+					} else {
+						/* peer is secondary */
+						resync = L_SYNC_SOURCE;
+					}
+					drbd_start_resync(peer_device, resync, "resize");
+				} else {
+					if (tr->diskful_primary_nodes & NODE_MASK(peer_device->node_id))
+						drbd_start_resync(peer_device, L_SYNC_TARGET,
+								"resize");
+					/* else  no resync */
+				}
+			} else {
+				if (resource->twopc_parent_nodes & NODE_MASK(peer_device->node_id))
+					drbd_start_resync(peer_device, L_SYNC_TARGET, "resize");
+				else if (nodes_to_reach & NODE_MASK(peer_device->node_id))
+					drbd_start_resync(peer_device, L_SYNC_SOURCE, "resize");
+				/* else  no resync */
+			}
+		}
 	}
-	err = 0;
 
- out:
-	drbd_bm_unlock(device);
-	if (!err && device->state.conn == C_WF_BITMAP_S)
-		drbd_start_resync(device, C_SYNC_SOURCE);
-	return err;
+	put_ldev(device);
+	return dd;
 }
 
-static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+enum drbd_state_rv drbd_support_2pc_resize(struct drbd_resource *resource)
 {
-	drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
-		 pi->cmd, pi->size);
+	struct drbd_connection *connection;
+	enum drbd_state_rv rv = SS_SUCCESS;
 
-	return ignore_remaining_packet(connection, pi);
-}
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		if (connection->cstate[NOW] == C_CONNECTED &&
+		    connection->agreed_pro_version < 112) {
+			rv = SS_NOT_SUPPORTED;
+			break;
+		}
+	}
+	rcu_read_unlock();
 
-static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
-{
-	/* Make sure we've acked all the TCP data associated
-	 * with the data requests being unplugged */
-	tcp_sock_set_quickack(connection->data.socket->sk, 2);
-	return 0;
+	return rv;
 }
 
-static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+static bool any_neighbor_quorate(struct drbd_resource *resource)
 {
 	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_block_desc *p = pi->data;
+	struct drbd_connection *connection;
+	bool peer_with_quorum = false;
+	int vnr;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		peer_with_quorum = true;
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			if (test_bit(PEER_QUORATE, &peer_device->flags))
+				continue;
+			peer_with_quorum = false;
+			break;
+		}
 
-	switch (device->state.conn) {
-	case C_WF_SYNC_UUID:
-	case C_WF_BITMAP_T:
-	case C_BEHIND:
+		if (peer_with_quorum)
 			break;
-	default:
-		drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
-				drbd_conn_str(device->state.conn));
 	}
+	rcu_read_unlock();
 
-	drbd_set_out_of_sync(peer_device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+	return peer_with_quorum;
+}
+
+static void process_twopc(struct drbd_connection *connection,
+			 struct twopc_reply *reply,
+			 struct packet_info *pi,
+			 unsigned long receive_jif)
+{
+	struct drbd_connection *affected_connection = connection;
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device = NULL;
+	struct p_twopc_request *p = pi->data;
+	struct twopc_state_change *state_change = &resource->twopc.state_change;
+	enum chg_state_flags flags = CS_VERBOSE | CS_LOCAL_ONLY;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct twopc_request request;
+	bool waiting_allowed = true;
+	enum csc_rv csc_rv;
+
+	request.tid = be32_to_cpu(p->tid);
+	if (connection->agreed_features & DRBD_FF_2PC_V2) {
+		request.flags = be32_to_cpu(p->flags);
+		request.initiator_node_id = p->s8_initiator_node_id;
+		request.target_node_id = p->s8_target_node_id;
+	} else {
+		request.flags = 0;
+		request.initiator_node_id = be32_to_cpu(p->u32_initiator_node_id);
+		request.target_node_id = be32_to_cpu(p->u32_target_node_id);
+	}
+	request.nodes_to_reach = be64_to_cpu(p->nodes_to_reach);
+	request.cmd = pi->cmd;
+	request.vnr = pi->vnr;
 
-	return 0;
-}
+	/* Check for concurrent transactions and duplicate packets. */
+retry:
+	write_lock_irq(&resource->state_rwlock);
 
-static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
-{
-	struct drbd_peer_device *peer_device;
+	csc_rv = check_concurrent_transactions(resource, reply);
+
+	if (csc_rv == CSC_CLEAR && pi->cmd != P_TWOPC_ABORT) {
+		struct drbd_device *device;
+		int iterate_vnr;
+
+		if (!is_prepare(pi->cmd)) {
+			/* We have committed or aborted this transaction already. */
+			write_unlock_irq(&resource->state_rwlock);
+			dynamic_drbd_dbg(connection, "Ignoring %s packet %u\n",
+				   drbd_packet_name(pi->cmd),
+				   reply->tid);
+			return;
+		}
+		if (reply->is_aborted) {
+			write_unlock_irq(&resource->state_rwlock);
+			return;
+		}
+		resource->remote_state_change = true;
+		resource->twopc.type =
+			pi->cmd == P_TWOPC_PREPARE ? TWOPC_STATE_CHANGE : TWOPC_RESIZE;
+		resource->twopc_prepare_reply_cmd = 0;
+		resource->twopc_parent_nodes = NODE_MASK(connection->peer_node_id);
+		clear_bit(TWOPC_EXECUTED, &resource->flags);
+		idr_for_each_entry(&resource->devices, device, iterate_vnr)
+			clear_bit(OUTDATE_ON_2PC_COMMIT, &device->flags);
+	} else if (csc_rv == CSC_MATCH && !is_prepare(pi->cmd)) {
+		flags |= CS_PREPARED;
+
+		if (test_and_set_bit(TWOPC_EXECUTED, &resource->flags)) {
+			write_unlock_irq(&resource->state_rwlock);
+			drbd_info(connection, "Ignoring redundant %s packet %u.\n",
+				  drbd_packet_name(pi->cmd),
+				  reply->tid);
+			return;
+		}
+	} else if (csc_rv == CSC_ABORT_LOCAL && is_prepare(pi->cmd)) {
+		enum alt_rv alt_rv;
+
+		drbd_info(connection, "Aborting local state change %u to yield to remote "
+			  "state change %u.\n",
+			  resource->twopc_reply.tid,
+			  reply->tid);
+		alt_rv = abort_local_transaction(connection, reply->tid);
+		if (alt_rv == ALT_MATCH) {
+			/* abort_local_transaction() comes back unlocked in this case... */
+			goto match;
+		} else if (alt_rv == ALT_TIMEOUT) {
+			/* abort_local_transaction() comes back unlocked in this case... */
+			drbd_info(connection, "Aborting local state change %u "
+				  "failed. Rejecting remote state change %u.\n",
+				  resource->twopc_reply.tid,
+				  reply->tid);
+			drbd_send_twopc_reply(connection, P_TWOPC_RETRY, reply);
+			return;
+		}
+		/* abort_local_transaction() returned with the state_rwlock write lock */
+		if (reply->is_aborted) {
+			write_unlock_irq(&resource->state_rwlock);
+			return;
+		}
+		resource->remote_state_change = true;
+		resource->twopc.type =
+			pi->cmd == P_TWOPC_PREPARE ? TWOPC_STATE_CHANGE : TWOPC_RESIZE;
+		resource->twopc_parent_nodes = NODE_MASK(connection->peer_node_id);
+		resource->twopc_prepare_reply_cmd = 0;
+		clear_bit(TWOPC_EXECUTED, &resource->flags);
+	} else if (pi->cmd == P_TWOPC_ABORT) {
+		/* csc_rv != CSC_MATCH */
+		write_unlock_irq(&resource->state_rwlock);
+		nested_twopc_abort(resource, &request);
+		return;
+	} else {
+		write_unlock_irq(&resource->state_rwlock);
+
+		if (csc_rv == CSC_TID_MISS && is_prepare(pi->cmd) && waiting_allowed) {
+			/* CSC_TID_MISS implies the two transactions are from the same initiator */
+			if (!(resource->twopc_parent_nodes & NODE_MASK(connection->peer_node_id))) {
+				long timeout = twopc_timeout(resource) / 20; /* usually 1.5 sec */
+				/*
+				 * We are expecting the P_TWOPC_COMMIT or P_TWOPC_ABORT through
+				 * another connection. So we can wait without deadlocking.
+				 */
+				wait_event_interruptible_timeout(resource->twopc_wait,
+						!resource->remote_state_change, timeout);
+				waiting_allowed = false; /* retry only once */
+				goto retry;
+			}
+		}
+
+		if (csc_rv == CSC_REJECT ||
+		    (csc_rv == CSC_TID_MISS && is_prepare(pi->cmd))) {
+			drbd_info(connection, "Rejecting concurrent "
+				  "remote state change %u because of "
+				  "state change %u\n",
+				  reply->tid,
+				  resource->twopc_reply.tid);
+			drbd_send_twopc_reply(connection, P_TWOPC_RETRY, reply);
+			return;
+		}
+
+		if (is_prepare(pi->cmd)) {
+			if (csc_rv == CSC_MATCH) {
+				/* We have prepared this transaction already. */
+				enum drbd_packet reply_cmd;
+
+			match:
+				drbd_info(connection,
+						"Duplicate prepare for remote state change %u\n",
+						reply->tid);
+				write_lock_irq(&resource->state_rwlock);
+				resource->twopc_parent_nodes |= NODE_MASK(connection->peer_node_id);
+				reply_cmd = resource->twopc_prepare_reply_cmd;
+				write_unlock_irq(&resource->state_rwlock);
+
+				if (reply_cmd) {
+					drbd_send_twopc_reply(connection, reply_cmd,
+							      &resource->twopc_reply);
+				} else {
+					/* if a node sends us a prepare, that means he has
+					   prepared this himself successfully. */
+					write_lock_irq(&resource->state_rwlock);
+					set_bit(TWOPC_YES, &connection->flags);
+					drbd_maybe_cluster_wide_reply(resource);
+					write_unlock_irq(&resource->state_rwlock);
+				}
+			}
+		} else {
+			drbd_info(connection, "Ignoring %s packet %u "
+				  "current processing state change %u\n",
+				  drbd_packet_name(pi->cmd),
+				  reply->tid,
+				  resource->twopc_reply.tid);
+		}
+		return;
+	}
+
+	if (reply->initiator_node_id != connection->peer_node_id) {
+		/*
+		 * This is an indirect request.  Unless we are directly
+		 * connected to the initiator as well as indirectly, we don't
+		 * have connection or peer device objects for this peer.
+		 */
+		affected_connection = drbd_connection_by_node_id(resource, reply->initiator_node_id);
+	}
+
+	if (reply->target_node_id != -1 &&
+	    reply->target_node_id != resource->res_opts.node_id) {
+		affected_connection = NULL;
+	}
+
+	switch (resource->twopc.type) {
+	case TWOPC_STATE_CHANGE:
+		if (pi->cmd == P_TWOPC_PREPARE) {
+			state_change->mask.i = be32_to_cpu(p->mask);
+			state_change->val.i = be32_to_cpu(p->val);
+		} else { /* P_TWOPC_COMMIT */
+			state_change->primary_nodes = be64_to_cpu(p->primary_nodes);
+			state_change->reachable_nodes = be64_to_cpu(p->reachable_nodes);
+		}
+		break;
+	case TWOPC_RESIZE:
+		if (request.cmd == P_TWOPC_PREP_RSZ) {
+			resource->twopc.resize.user_size = be64_to_cpu(p->user_size);
+			resource->twopc.resize.dds_flags = be16_to_cpu(p->dds_flags);
+		} else { /* P_TWOPC_COMMIT */
+			resource->twopc.resize.diskful_primary_nodes =
+				be64_to_cpu(p->diskful_primary_nodes);
+			resource->twopc.resize.new_size = be64_to_cpu(p->exposed_size);
+		}
+	}
+
+	if (affected_connection && affected_connection->cstate[NOW] < C_CONNECTED &&
+	    state_change->mask.conn == 0)
+		affected_connection = NULL;
+
+	if (pi->vnr != -1 && affected_connection) {
+		peer_device = conn_peer_device(affected_connection, pi->vnr);
+		/* If we do not know the peer_device, then we are fine with
+		   whatever is going on in the cluster. E.g. detach and del-minor
+		   on each node, one after the other */
+
+		affected_connection = NULL; /* It is intended for a peer_device! */
+	}
+
+	if (state_change->mask.conn == conn_MASK) {
+		u64 m = NODE_MASK(reply->initiator_node_id);
+
+		if (state_change->val.conn == C_CONNECTED) {
+			reply->reachable_nodes |= m;
+			if (affected_connection) {
+				reply->is_connect = 1;
+
+				if (pi->cmd == P_TWOPC_PREPARE)
+					drbd_init_connect_state(connection);
+			}
+		}
+		if (state_change->val.conn == C_DISCONNECTING) {
+			reply->reachable_nodes &= ~m;
+			reply->is_disconnect = 1;
+		}
+	}
+
+	if (pi->cmd == P_TWOPC_PREPARE) {
+		reply->primary_nodes = be64_to_cpu(p->primary_nodes);
+		if (resource->role[NOW] == R_PRIMARY) {
+			reply->primary_nodes |= NODE_MASK(resource->res_opts.node_id);
+
+			if (drbd_res_data_accessible(resource))
+				reply->weak_nodes = ~reply->reachable_nodes;
+		}
+	}
+	if (pi->cmd == P_TWOPC_PREP_RSZ) {
+		struct drbd_device *device;
+
+		device = (peer_device ?: conn_peer_device(connection, pi->vnr))->device;
+		if (get_ldev(device)) {
+			if (resource->role[NOW] == R_PRIMARY)
+				reply->diskful_primary_nodes = NODE_MASK(resource->res_opts.node_id);
+			reply->max_possible_size = drbd_local_max_size(device);
+			put_ldev(device);
+		} else {
+			reply->max_possible_size = DRBD_MAX_SECTORS;
+			reply->diskful_primary_nodes = 0;
+		}
+	}
+
+	resource->twopc_reply = *reply;
+	write_unlock_irq(&resource->state_rwlock);
+
+	if (affected_connection && affected_connection != connection &&
+	    affected_connection->cstate[NOW] == C_CONNECTED) {
+		drbd_ping_peer(affected_connection);
+		if (affected_connection->cstate[NOW] < C_CONNECTED)
+			affected_connection = NULL;
+	}
+
+	switch (pi->cmd) {
+	case P_TWOPC_PREPARE:
+		drbd_print_cluster_wide_state_change(resource, "Preparing remote state change",
+				reply->tid, reply->initiator_node_id, reply->target_node_id,
+				state_change->mask, state_change->val);
+		flags |= CS_PREPARE;
+		break;
+	case P_TWOPC_PREP_RSZ:
+		drbd_info(connection, "Preparing remote state change %u "
+			  "(local_max_size = %llu KiB)\n",
+			  reply->tid, (unsigned long long)reply->max_possible_size >> 1);
+		flags |= CS_PREPARE;
+		break;
+	case P_TWOPC_ABORT:
+		drbd_info(connection, "Aborting remote state change %u\n",
+			  reply->tid);
+		flags |= CS_ABORT;
+		break;
+	case P_TWOPC_COMMIT:
+		drbd_info(connection, "Committing remote state change %u (primary_nodes=%llX)\n",
+			  reply->tid, be64_to_cpu(p->primary_nodes));
+		break;
+	default:
+		BUG();
+	}
+
+	switch (resource->twopc.type) {
+	case TWOPC_STATE_CHANGE:
+		if (flags & CS_PREPARED && !(flags & CS_ABORT)) {
+			reply->primary_nodes = state_change->primary_nodes;
+			handle_neighbor_demotion(connection, state_change, reply);
+
+			if ((resource->cached_all_devices_have_quorum ||
+			     any_neighbor_quorate(resource)) &&
+			    request.flags & TWOPC_HAS_REACHABLE) {
+				resource->members = state_change->reachable_nodes;
+				if (!resource->cached_all_devices_have_quorum)
+					flags |= CS_FORCE_RECALC;
+			}
+			if (state_change->mask.conn == conn_MASK &&
+			    state_change->val.conn == C_CONNECTED) {
+				/* Add nodes connecting "far away" to members */
+				u64 add_mask = NODE_MASK(reply->initiator_node_id) |
+					NODE_MASK(reply->target_node_id);
+
+				resource->members |= add_mask;
+			}
+		}
+
+		if (peer_device)
+			rv = change_peer_device_state(peer_device, state_change, flags);
+		else if (affected_connection)
+			rv = change_connection_state(affected_connection, state_change, reply,
+						     flags | CS_IGN_OUTD_FAIL);
+		else
+			rv = far_away_change(connection, &request, reply, flags);
+		break;
+	case TWOPC_RESIZE:
+		if (flags & CS_PREPARE)
+			rv = drbd_support_2pc_resize(resource);
+		break;
+	}
+
+	if (flags & CS_PREPARE) {
+		mod_timer(&resource->twopc_timer, receive_jif + twopc_timeout(resource));
+
+		/* Retry replies can be sent immediately. Otherwise use the
+		 * nested twopc path. This waits for the state handshake to
+		 * complete in the case of a twopc for transitioning to
+		 * C_CONNECTED. */
+		if (rv == SS_IN_TRANSIENT_STATE) {
+			resource->twopc_prepare_reply_cmd = P_TWOPC_RETRY;
+			drbd_send_twopc_reply(connection, P_TWOPC_RETRY, reply);
+		} else {
+			resource->twopc_reply.state_change_failed = rv < SS_SUCCESS;
+			nested_twopc_request(resource, &request);
+		}
+	} else {
+		if (flags & CS_PREPARED) {
+			if (rv < SS_SUCCESS)
+				drbd_err(resource, "FATAL: Local commit of prepared %u failed! \n",
+					 reply->tid);
+
+			timer_delete(&resource->twopc_timer);
+		}
+
+		nested_twopc_request(resource, &request);
+
+		if (resource->twopc.type == TWOPC_RESIZE && flags & CS_PREPARED &&
+		    !(flags & CS_ABORT)) {
+			struct drbd_device *device;
+
+			device = (peer_device ?: conn_peer_device(connection, pi->vnr))->device;
+
+			drbd_commit_size_change(device, NULL, request.nodes_to_reach);
+			rv = SS_SUCCESS;
+		}
+
+		clear_remote_state_change(resource);
+
+		if (peer_device && rv >= SS_SUCCESS && !(flags & CS_ABORT))
+			drbd_md_sync_if_dirty(peer_device->device);
+
+		if (connection->agreed_pro_version < 117 &&
+		    rv >= SS_SUCCESS && !(flags & CS_ABORT) &&
+		    affected_connection &&
+		    state_change->mask.conn == conn_MASK && state_change->val.conn == C_CONNECTED)
+			conn_connect2(connection);
+	}
+}
+
+/* Pick the most suitable D_UP_TO_DATE peer as a resync source, or mark
+ * the local disk D_UP_TO_DATE directly if no sync is needed at all.
+ * Preference between candidates comes from the strategy descriptor's
+ * resync_peer_preference. */
+void drbd_try_to_get_resynced(struct drbd_device *device)
+{
+	struct drbd_peer_device *candidate, *chosen = NULL;
+	enum sync_strategy chosen_strategy = UNDETERMINED;
+	int chosen_preference = 0;
+
+	if (!get_ldev(device))
+		return;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(candidate, device) {
+		enum sync_strategy strategy;
+		enum sync_rule rule;
+		int peer_node_id;
+
+		/* Only a peer with current data can serve as sync source. */
+		if (candidate->disk_state[NOW] != D_UP_TO_DATE)
+			continue;
+
+		strategy = drbd_uuid_compare(candidate, &rule, &peer_node_id);
+		disk_states_to_strategy(candidate, candidate->disk_state[NOW], &strategy, rule,
+					&peer_node_id);
+		drbd_info(candidate, "strategy = %s\n", strategy_descriptor(strategy).name);
+		if (strategy_descriptor(strategy).resync_peer_preference > chosen_preference) {
+			chosen_preference = strategy_descriptor(strategy).resync_peer_preference;
+			chosen = candidate;
+			chosen_strategy = strategy;
+		}
+	}
+	rcu_read_unlock();
+
+	if (chosen_strategy == NO_SYNC) {
+		change_disk_state(device, D_UP_TO_DATE, CS_VERBOSE, "get-resync", NULL);
+	} else if (chosen &&
+		   (!repl_is_sync_target(chosen->repl_state[NOW]) ||
+		    test_bit(UNSTABLE_RESYNC, &chosen->flags))) {
+		drbd_resync(chosen, DISKLESS_PRIMARY);
+		drbd_send_uuids(chosen, UUID_FLAG_RESYNC | UUID_FLAG_DISKLESS_PRIMARY, 0);
+	}
+	put_ldev(device);
+}
+
+/* When every volume on this connection has received its initial state
+ * packet, mark the connection handshake as ready, wake waiters, and give
+ * a possibly pending cluster-wide two-phase-commit reply a chance to
+ * complete. */
+static void finish_nested_twopc(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device;
+	int vnr = 0;
+
+	/* Still waiting for at least one volume? Then nothing to do yet. */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		if (!test_bit(INITIAL_STATE_RECEIVED, &peer_device->flags))
+			return;
+
+	set_bit(CONN_HANDSHAKE_READY, &connection->flags);
+	wake_up_all(&resource->state_wait);
+
+	write_lock_irq(&resource->state_rwlock);
+	drbd_maybe_cluster_wide_reply(resource);
+	write_unlock_irq(&resource->state_rwlock);
+}
+
+/* Return true if @uuid (expected with UUID_PRIMARY already masked out)
+ * occurs among the history UUIDs we track for this peer. */
+static bool uuid_in_peer_history(struct drbd_peer_device *peer_device, u64 uuid)
+{
+	int i = ARRAY_SIZE(peer_device->history_uuids);
+
+	while (i--) {
+		if ((peer_device->history_uuids[i] & ~UUID_PRIMARY) == uuid)
+			return true;
+	}
+	return false;
+}
+
+/* Return true if @uuid (expected with UUID_PRIMARY already masked out)
+ * occurs in this device's own UUID history. */
+static bool uuid_in_my_history(struct drbd_device *device, u64 uuid)
+{
+	int i;
+
+	for (i = HISTORY_UUIDS - 1; i >= 0; i--)
+		if ((drbd_history_uuid(device, i) & ~UUID_PRIMARY) == uuid)
+			return true;
+
+	return false;
+}
+
+/* Does the peer know my currently exposed data generation, either as one
+ * of its bitmap UUIDs or in its history?  If so, the peer's data
+ * descends from what I am exposing. */
+static bool peer_data_is_successor_of_mine(struct drbd_peer_device *peer_device)
+{
+	u64 exposed = peer_device->device->exposed_data_uuid & ~UUID_PRIMARY;
+
+	if (drbd_find_peer_bitmap_by_uuid(peer_device, exposed) != -1)
+		return true;
+
+	return uuid_in_peer_history(peer_device, exposed);
+}
+
+/* Check whether the peer's current data generation is an ancestor of
+ * mine: i.e. some *other* peer device still tracks the joining peer's
+ * current UUID as a bitmap UUID, or it shows up in recorded history. */
+static bool peer_data_is_ancestor_of_mine(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	/* Strip the primary flag before comparing: uuid_in_peer_history()
+	 * masks UUID_PRIMARY only on its stored side, and the sibling
+	 * peer_data_is_successor_of_mine() also compares masked values.
+	 * Without this, a (once-)primary peer's UUID would never match. */
+	u64 peer_uuid = peer_device->current_uuid & ~UUID_PRIMARY;
+	struct drbd_peer_device *p2;
+	bool rv = false;
+	int i;
+
+	rcu_read_lock();
+	for_each_peer_device_rcu(p2, device) {
+		if (peer_device == p2)
+			continue;
+		i = drbd_find_peer_bitmap_by_uuid(p2, peer_uuid);
+		/* NOTE(review): the history lookup scans peer_device's own
+		 * history, not p2's, making it loop-invariant as written --
+		 * confirm this is intended and not meant to be p2. */
+		if (i != -1 || uuid_in_peer_history(peer_device, peer_uuid)) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/* Inform every peer that has already completed the initial state
+ * exchange about the device's currently exposed data UUID. */
+static void propagate_exposed_uuid(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+	u64 im;
+
+	for_each_peer_device_ref(peer_device, im, device) {
+		if (test_bit(INITIAL_STATE_SENT, &peer_device->flags))
+			drbd_send_current_uuid(peer_device, device->exposed_data_uuid, 0);
+	}
+}
+
+/* Demote the resource to secondary if it is suspended and configured
+ * with on-susp-primary-outdated=force-secondary. */
+static void maybe_force_secondary(struct drbd_peer_device *peer_device)
+{
+	struct drbd_resource *resource = peer_device->device->resource;
+	unsigned long irq_flags;
+
+	if (resource->fail_io[NOW] || !resource->cached_susp ||
+	    resource->res_opts.on_susp_primary_outdated != SPO_FORCE_SECONDARY)
+		return;
+
+	drbd_warn(peer_device, "force secondary!\n");
+	begin_state_change(resource, &irq_flags,
+			   CS_VERBOSE | CS_HARD | CS_FS_IGN_OPENERS);
+	resource->role[NEW] = R_SECONDARY;
+	/* resource->fail_io[NEW] gets set via CS_FS_IGN_OPENERS */
+	end_state_change(resource, &irq_flags, "peer-state");
+}
+
+/* I am diskless, the peer claims D_UP_TO_DATE data, and its current UUID
+ * differs from my exposed one.  Decide whether the peer's data is newer
+ * (retry handshake or adopt it), older (outdate the peer's disk), or
+ * unrelated (disconnect), adjusting *peer_disk_state as needed. */
+static void diskless_with_peers_different_current_uuids(struct drbd_peer_device *peer_device,
+							enum drbd_disk_state *peer_disk_state)
+{
+	bool data_successor = peer_data_is_successor_of_mine(peer_device);
+	bool data_ancestor = peer_data_is_ancestor_of_mine(peer_device);
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_device *device = peer_device->device;
+
+	if (data_successor && resource->role[NOW] == R_PRIMARY) {
+		drbd_warn(peer_device, "Remote node has more recent data\n");
+		maybe_force_secondary(peer_device);
+		set_bit(CONN_HANDSHAKE_RETRY, &connection->flags);
+	} else if (data_successor && resource->role[NOW] == R_SECONDARY) {
+		/* As secondary we can simply adopt the newer generation. */
+		drbd_uuid_set_exposed(device, peer_device->current_uuid, true);
+		propagate_exposed_uuid(device);
+	} else if (data_ancestor) {
+		drbd_warn(peer_device, "Downgrading joining peer's disk as its data is older\n");
+		if (*peer_disk_state > D_OUTDATED)
+			*peer_disk_state = D_OUTDATED;
+			/* See "Do not trust this guy!" in sanitize_state() */
+	} else {
+		/* Add the missing trailing newline (printk convention);
+		 * otherwise this message merges with the next log line. */
+		drbd_warn(peer_device, "Current UUID of peer does not match my exposed UUID.\n");
+		set_bit(CONN_HANDSHAKE_DISCONNECT, &connection->flags);
+	}
+}
+
+/* Handle a P_STATE packet: the peer reports its resource role and, for
+ * pi->vnr != -1, the replication/disk state of one volume.  During the
+ * connect phase (cstate C_CONNECTING) the reported values only feed the
+ * connect handshake and the actual state change is committed later by
+ * the two-phase commit; on an established connection the change is
+ * applied directly.  Returns 0 on success, a negative error code to
+ * tear the connection down. */
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device = NULL;
+	enum drbd_repl_state *repl_state;
+	struct drbd_device *device = NULL;
+	struct p_state *p = pi->data;
+	union drbd_state old_peer_state, peer_state;
+	enum drbd_disk_state peer_disk_state;
+	enum drbd_repl_state new_repl_state;
+	bool peer_was_resync_target, do_handshake = false;
+	enum chg_state_flags begin_state_chg_flags = CS_VERBOSE | CS_WAIT_COMPLETE;
+	unsigned long irq_flags;
+	int rv;
+
+	if (pi->vnr != -1) {
+		peer_device = conn_peer_device(connection, pi->vnr);
+		if (!peer_device)
+			return config_unknown_volume(connection, pi);
+		device = peer_device->device;
+	}
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	if (connection->agreed_pro_version < 110) {
+		/* Before drbd-9.0 there was no D_DETACHING it was D_FAILED... */
+		if (peer_state.disk >= D_DETACHING)
+			peer_state.disk++;
+		if (peer_state.pdsk >= D_DETACHING)
+			peer_state.pdsk++;
+	}
+
+	/* A volume-less state packet carries only the peer's resource role. */
+	if (pi->vnr == -1) {
+		if (peer_state.role == R_SECONDARY) {
+			begin_state_change(resource, &irq_flags, CS_HARD | CS_VERBOSE);
+			__change_peer_role(connection, R_SECONDARY);
+			rv = end_state_change(resource, &irq_flags, "peer-state");
+			if (rv < SS_SUCCESS)
+				goto fail;
+		}
+		return 0;
+	}
+
+	peer_disk_state = peer_state.disk;
+
+	if (peer_disk_state > D_DISKLESS && !want_bitmap(peer_device)) {
+		drbd_warn(peer_device, "The peer is configured to be diskless but presents %s\n",
+			  drbd_disk_str(peer_disk_state));
+		goto fail;
+	}
+
+	/* A peer still in D_NEGOTIATING has no final disk state yet; derive
+	 * the effective one from the UUID flags it sent earlier. */
+	if (peer_state.disk == D_NEGOTIATING) {
+		peer_disk_state = peer_device->uuid_flags & UUID_FLAG_INCONSISTENT ?
+			D_INCONSISTENT : D_CONSISTENT;
+		drbd_info(peer_device, "real peer disk state = %s\n", drbd_disk_str(peer_disk_state));
+	}
+
+	read_lock_irq(&resource->state_rwlock);
+	old_peer_state = drbd_get_peer_device_state(peer_device, NOW);
+	read_unlock_irq(&resource->state_rwlock);
+	/* Jumped back to from below when a concurrent local state change is
+	 * detected while trying to commit the new state. */
+ retry:
+	new_repl_state = max_t(enum drbd_repl_state, old_peer_state.conn, L_OFF);
+
+	/* If some other part of the code (ack_receiver thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (old_peer_state.conn <= C_TEAR_DOWN)
+		return -ECONNRESET;
+
+	if (!test_bit(INITIAL_STATE_RECEIVED, &peer_device->flags) &&
+	    peer_state.role == R_PRIMARY && peer_device->uuid_flags & UUID_FLAG_STABLE)
+		check_resync_source(device, peer_device->uuid_node_mask);
+
+	/* last_repl_state is the replication state the peer reported last
+	 * time (updated at the bottom of this function).  Pre-110 peers did
+	 * not report it, so assume they might have been a sync target. */
+	peer_was_resync_target =
+		connection->agreed_pro_version >= 110 ?
+		peer_device->last_repl_state == L_SYNC_TARGET ||
+		peer_device->last_repl_state == L_PAUSED_SYNC_T
+		:
+		true;
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * was D_INCONSISTENT or D_CONSISTENT. (Since the peer might be
+	 * weak we do not know anything about its new disk state)
+	 */
+	if (peer_was_resync_target &&
+	    (old_peer_state.pdsk == D_INCONSISTENT || old_peer_state.pdsk == D_CONSISTENT) &&
+	    old_peer_state.conn > L_ESTABLISHED && old_peer_state.disk >= D_INCONSISTENT) {
+		/* If we are (becoming) SyncSource, but peer is still in sync
+		 * preparation, ignore its uptodate-ness to avoid flapping, it
+		 * will change to inconsistent once the peer reaches active
+		 * syncing states.
+		 * It may have changed syncer-paused flags, however, so we
+		 * cannot ignore this completely. */
+		if (peer_state.conn > L_ESTABLISHED &&
+		    peer_state.conn < L_SYNC_SOURCE)
+			peer_disk_state = D_INCONSISTENT;
+
+		/* if peer_state changes to connected at the same time,
+		 * it explicitly notifies us that it finished resync.
+		 * Maybe we should finish it up, too? */
+		else if (peer_state.conn == L_ESTABLISHED) {
+			bool finish_now = false;
+
+			if (old_peer_state.conn == L_WF_BITMAP_S) {
+				read_lock_irq(&resource->state_rwlock);
+				if (peer_device->repl_state[NOW] == L_WF_BITMAP_S)
+					peer_device->resync_finished_pdsk = peer_state.disk;
+				else if (peer_device->repl_state[NOW] == L_SYNC_SOURCE)
+					finish_now = true;
+				read_unlock_irq(&resource->state_rwlock);
+			}
+
+			if (finish_now || old_peer_state.conn == L_SYNC_SOURCE ||
+			    old_peer_state.conn == L_PAUSED_SYNC_S) {
+				drbd_resync_finished(peer_device, peer_state.disk);
+				peer_device->last_repl_state = peer_state.conn;
+			}
+			return 0;
+		}
+	}
+
+	/* explicit verify finished notification, stop sector reached. */
+	if (old_peer_state.conn == L_VERIFY_T && old_peer_state.disk == D_UP_TO_DATE &&
+	    peer_state.conn == L_ESTABLISHED && peer_disk_state == D_UP_TO_DATE) {
+		ov_out_of_sync_print(peer_device);
+		drbd_resync_finished(peer_device, D_MASK);
+		peer_device->last_repl_state = peer_state.conn;
+		return 0;
+	}
+
+	/* Start resync after AHEAD/BEHIND */
+	if (connection->agreed_pro_version >= 110 &&
+	    peer_state.conn == L_SYNC_SOURCE && old_peer_state.conn == L_BEHIND) {
+		/*
+		 * Become Inconsistent immediately because we may now receive
+		 * data. Delay the start of the resync itself until any
+		 * previous resync is no longer active.
+		 */
+		rv = change_disk_state(device, D_INCONSISTENT, CS_VERBOSE,
+				"resync-after-behind", NULL);
+		if (rv < SS_SUCCESS)
+			goto fail;
+
+		peer_device->start_resync_side = L_SYNC_TARGET;
+		drbd_peer_device_post_work(peer_device, RS_START);
+		return 0;
+	}
+
+	/* peer says his disk is inconsistent, while we think it is uptodate,
+	 * and this happens while the peer still thinks we have a sync going on,
+	 * but we think we are already done with the sync.
+	 * We ignore this to avoid flapping pdsk.
+	 * This should not happen, if the peer is a recent version of drbd. */
+	if (old_peer_state.pdsk == D_UP_TO_DATE && peer_disk_state == D_INCONSISTENT &&
+	    old_peer_state.conn == L_ESTABLISHED && peer_state.conn > L_SYNC_SOURCE)
+		peer_disk_state = D_UP_TO_DATE;
+
+	if (new_repl_state == L_OFF)
+		new_repl_state = L_ESTABLISHED;
+
+	if (peer_state.conn == L_AHEAD)
+		new_repl_state = L_BEHIND;
+
+	/* with protocol >= 118 uuid & state packets come after the 2PC prepare packet */
+	do_handshake =
+		(test_bit(UUIDS_RECEIVED, &peer_device->flags) ||
+			test_bit(CURRENT_UUID_RECEIVED, &peer_device->flags)) &&
+		(connection->agreed_pro_version < 118 ||
+			drbd_twopc_between_peer_and_me(connection)) &&
+		old_peer_state.conn < L_ESTABLISHED;
+
+	if (test_bit(UUIDS_RECEIVED, &peer_device->flags) &&
+	    peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(device, D_NEGOTIATING)) {
+		enum sync_strategy strategy = UNDETERMINED;
+		bool consider_resync;
+
+		/* clear CONN_DISCARD_MY_DATA so late, to not lose it if peer
+		   gets aborted before we are able to do the resync handshake. */
+		clear_bit(CONN_DISCARD_MY_DATA, &connection->flags);
+
+		/* if we established a new connection */
+		consider_resync = do_handshake &&
+					!test_bit(INITIAL_STATE_RECEIVED, &peer_device->flags);
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --force */
+		consider_resync |= test_bit(CONSIDER_RESYNC, &peer_device->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		consider_resync |= (old_peer_state.conn == L_ESTABLISHED &&
+				    (peer_state.conn == L_STARTING_SYNC_S ||
+				     peer_state.conn == L_STARTING_SYNC_T));
+
+		/* NOTE(review): this tests peer_device->flags against the
+		 * UUID_FLAG_* constant; presumably uuid_flags was meant --
+		 * confirm against the flag definitions. */
+		consider_resync |= peer_state.conn == L_WF_BITMAP_T &&
+				   peer_device->flags & UUID_FLAG_CRASHED_PRIMARY;
+
+		if (consider_resync) {
+			strategy = drbd_sync_handshake(peer_device, peer_state);
+			new_repl_state = strategy_to_repl_state(peer_device, peer_state.role, strategy);
+		} else if (old_peer_state.conn == L_ESTABLISHED &&
+			   (peer_state.disk == D_NEGOTIATING ||
+			    old_peer_state.disk == D_NEGOTIATING)) {
+			strategy = drbd_attach_handshake(peer_device, peer_disk_state);
+			new_repl_state = strategy_to_repl_state(peer_device, peer_state.role, strategy);
+			if (new_repl_state == L_ESTABLISHED && device->disk_state[NOW] == D_UP_TO_DATE)
+				peer_disk_state = D_UP_TO_DATE;
+		}
+
+		put_ldev(device);
+		if (strategy_descriptor(strategy).reconnect) { /* retry connect */
+			maybe_force_secondary(peer_device);
+			if (connection->agreed_pro_version >= 118)
+				set_bit(CONN_HANDSHAKE_RETRY, &connection->flags);
+			else
+				return -EIO; /* retry connect */
+		} else if (strategy_descriptor(strategy).disconnect) {
+			if (device->disk_state[NOW] == D_NEGOTIATING) {
+				new_repl_state = L_NEG_NO_RESULT;
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				if (connection->agreed_pro_version < 110) {
+					drbd_err(device, "Disk attach process on the peer node was aborted.\n");
+					peer_state.disk = D_DISKLESS;
+					peer_disk_state = D_DISKLESS;
+				} else {
+					/* The peer will decide later and let us know... */
+					peer_disk_state = D_NEGOTIATING;
+				}
+			} else {
+				if (test_and_clear_bit(CONN_DRY_RUN, &connection->flags))
+					return -EIO;
+				if (connection->agreed_pro_version >= 118)
+					set_bit(CONN_HANDSHAKE_DISCONNECT, &connection->flags);
+				else
+					goto fail;
+			}
+		}
+
+		if (device->disk_state[NOW] == D_NEGOTIATING) {
+			begin_state_chg_flags |= CS_FORCE_RECALC;
+			peer_device->negotiation_result = new_repl_state;
+		}
+	}
+
+	/* Diskless on my side: verify that the peer's current UUID matches
+	 * what I currently expose, or adopt/handle a mismatch. */
+	if (test_bit(UUIDS_RECEIVED, &peer_device->flags) &&
+	    peer_device->repl_state[NOW] == L_OFF && device->disk_state[NOW] == D_DISKLESS) {
+		u64 exposed_data_uuid = device->exposed_data_uuid;
+		u64 peer_current_uuid = peer_device->current_uuid;
+
+		drbd_info(peer_device, "my exposed UUID: %016llX\n", exposed_data_uuid);
+		drbd_uuid_dump_peer(peer_device, peer_device->dirty_bits, peer_device->uuid_flags);
+
+		/* I am diskless connecting to a peer with disk, check that UUID match
+		   We only check if the peer claims to have D_UP_TO_DATE data. Only then is the
+		   peer a source for my data anyways. */
+		if (exposed_data_uuid && peer_state.disk == D_UP_TO_DATE &&
+		    (exposed_data_uuid & ~UUID_PRIMARY) != (peer_current_uuid & ~UUID_PRIMARY))
+			diskless_with_peers_different_current_uuids(peer_device, &peer_disk_state);
+		if (!exposed_data_uuid && peer_state.disk == D_UP_TO_DATE) {
+			drbd_uuid_set_exposed(device, peer_current_uuid, true);
+			propagate_exposed_uuid(device);
+		}
+	}
+	/* Diskless on the peer's side: log our UUIDs and, for a diskless
+	 * primary peer with a matching current UUID, fix up the size. */
+	if (peer_device->repl_state[NOW] == L_OFF && peer_state.disk == D_DISKLESS && get_ldev(device)) {
+		u64 uuid_flags = 0;
+
+		drbd_collect_local_uuid_flags(peer_device, NULL);
+		drbd_uuid_dump_self(peer_device, peer_device->comm_bm_set, uuid_flags);
+		drbd_info(peer_device, "peer's exposed UUID: %016llX\n", peer_device->current_uuid);
+
+		if (peer_state.role == R_PRIMARY &&
+		    (peer_device->current_uuid & ~UUID_PRIMARY) ==
+		    (drbd_current_uuid(device) & ~UUID_PRIMARY)) {
+			/* Connecting to diskless primary peer. When the state change is committed,
+			 * sanitize_state might set me D_UP_TO_DATE. Make sure the
+			 * effective_size is set. */
+			peer_device->max_size = peer_device->c_size;
+			drbd_determine_dev_size(device, peer_device->max_size, 0, NULL);
+		}
+
+		put_ldev(device);
+	}
+
+	if (test_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags) ||
+			connection->agreed_pro_version < 110) {
+		struct drbd_transport *transport = &connection->transport;
+		/* Last packet of handshake received, disarm receive timeout */
+		transport->class->ops.set_rcvtimeo(transport, DATA_STREAM, MAX_SCHEDULE_TIMEOUT);
+	}
+
+	if (new_repl_state == L_ESTABLISHED && peer_disk_state == D_CONSISTENT &&
+	    drbd_suspended(device) && peer_device->repl_state[NOW] < L_ESTABLISHED &&
+	    test_and_clear_bit(NEW_CUR_UUID, &device->flags)) {
+		/* Do not allow RESEND for a rebooted peer. We can only allow this
+		 * for temporary network outages! */
+		drbd_err(peer_device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+		drbd_uuid_new_current(device, false);
+		begin_state_change(resource, &irq_flags, CS_HARD);
+		__change_cstate(connection, C_PROTOCOL_ERROR);
+		__change_io_susp_user(resource, false);
+		end_state_change(resource, &irq_flags, "abort-connect");
+		return -EIO;
+	}
+
+	clear_bit(RS_SOURCE_MISSED_END, &peer_device->flags);
+	clear_bit(RS_PEER_MISSED_END, &peer_device->flags);
+
+	if (peer_state.quorum)
+		set_bit(PEER_QUORATE, &peer_device->flags);
+	else
+		clear_bit(PEER_QUORATE, &peer_device->flags);
+
+	if (do_handshake) {
+		/* Ignoring state packets before the 2PC; they are from aborted 2PCs */
+		bool done = test_bit(INITIAL_STATE_RECEIVED, &peer_device->flags);
+
+		set_bit(INITIAL_STATE_RECEIVED, &peer_device->flags);
+		if (connection->cstate[NOW] == C_CONNECTING) {
+			peer_device->connect_state.peer_isp =
+				peer_state.aftr_isp | peer_state.user_isp;
+
+			if (!done) {
+				peer_device->connect_state.conn = new_repl_state;
+				peer_device->connect_state.peer = peer_state.role;
+				peer_device->connect_state.pdsk = peer_disk_state;
+			}
+			wake_up(&connection->ee_wait);
+			finish_nested_twopc(connection);
+		}
+	}
+
+	/* State change will be performed when the two-phase commit is committed. */
+	if (connection->cstate[NOW] == C_CONNECTING)
+		return 0;
+
+	if (peer_state.conn == L_OFF) {
+		/* device/minor hot add on the peer of a minor already locally known */
+		if (peer_device->repl_state[NOW] == L_NEGOTIATING) {
+			drbd_send_enable_replication_next(peer_device);
+			drbd_send_sizes(peer_device, 0, 0);
+			drbd_send_uuids(peer_device, 0, 0);
+		}
+		drbd_send_current_state(peer_device);
+	}
+
+	/* Commit the new state; if the peer device state changed underneath
+	 * us since we sampled old_peer_state, abort and re-evaluate. */
+	begin_state_change(resource, &irq_flags, begin_state_chg_flags);
+	if (old_peer_state.i != drbd_get_peer_device_state(peer_device, NOW).i) {
+		old_peer_state = drbd_get_peer_device_state(peer_device, NOW);
+		abort_state_change_locked(resource);
+		write_unlock_irq(&resource->state_rwlock);
+		goto retry;
+	}
+	clear_bit(CONSIDER_RESYNC, &peer_device->flags);
+	if (device->disk_state[NOW] != D_NEGOTIATING)
+		__change_repl_state(peer_device, new_repl_state);
+	__change_peer_role(connection, peer_state.role);
+	if (peer_state.disk != D_NEGOTIATING)
+		__change_peer_disk_state(peer_device, peer_disk_state);
+	__change_resync_susp_peer(peer_device, peer_state.aftr_isp | peer_state.user_isp);
+	repl_state = peer_device->repl_state;
+	if (repl_state[OLD] < L_ESTABLISHED && repl_state[NEW] >= L_ESTABLISHED)
+		resource->state_change_flags |= CS_HARD;
+
+	rv = end_state_change(resource, &irq_flags, "peer-state");
+	new_repl_state = peer_device->repl_state[NOW];
+
+	if (rv < SS_SUCCESS)
+		goto fail;
+
+	if (old_peer_state.conn > L_OFF) {
+		if (new_repl_state > L_ESTABLISHED && peer_state.conn <= L_ESTABLISHED &&
+		    peer_state.disk != D_NEGOTIATING) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(peer_device, 0, 0);
+			drbd_send_current_state(peer_device);
+		}
+	}
+
+	clear_bit(DISCARD_MY_DATA, &peer_device->flags); /* Only relevant for agreed_pro_version < 117 */
+
+	drbd_md_sync(device); /* update connected indicator, effective_size, ... */
+
+	peer_device->last_repl_state = peer_state.conn;
+	return 0;
+fail:
+	change_cstate(connection, C_DISCONNECTING, CS_HARD);
+	return -EIO;
+}
+
+/* P_SYNC_UUID (protocol < 110): the peer announces the sync UUID right
+ * before sending resync data; adopt it and start as sync target. */
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_uuid *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	/* Wait until we are in a state where acting on the packet is safe,
+	 * or in one where it has become pointless. */
+	wait_event(device->misc_wait,
+		   peer_device->repl_state[NOW] == L_WF_SYNC_UUID ||
+		   peer_device->repl_state[NOW] == L_BEHIND ||
+		   peer_device->repl_state[NOW] < L_ESTABLISHED ||
+		   device->disk_state[NOW] < D_NEGOTIATING);
+
+	/* D_ASSERT(device,  peer_device->repl_state[NOW] == L_WF_SYNC_UUID ); */
+
+	if (!get_ldev_if_state(device, D_NEGOTIATING)) {
+		drbd_err(device, "Ignoring SyncUUID packet!\n");
+		return 0;
+	}
+
+	/* Here the _drbd_uuid_ functions are right, current should
+	   _not_ be rotated into the history */
+	_drbd_uuid_set_current(device, be64_to_cpu(p->uuid));
+	_drbd_uuid_set_bitmap(peer_device, 0UL);
+
+	drbd_print_uuids(peer_device, "updated sync uuid");
+	drbd_start_resync(peer_device, L_SYNC_TARGET, "peer-sync-uuid");
+
+	put_ldev(device);
+
+	return 0;
+}
+
+/* Aggregate a received 4k-per-bit bitmap chunk in place: every set
+ * source bit also sets the coarser destination bit (sbit >> scale).
+ * The destination index never exceeds the source index, so scanning
+ * upwards never re-reads a freshly written bit. */
+static void scale_bits(unsigned long *base, unsigned int num_4k, unsigned int scale)
+{
+	unsigned int nbits = num_4k * BITS_PER_LONG;
+	unsigned int sbit;
+
+	if (!scale)
+		return;
+
+	for (sbit = 0; sbit < nbits; sbit++) {
+		if (!test_bit(sbit, base))
+			continue;
+		__set_bit(sbit >> scale, base);
+	}
+}
+
+/*
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ *
+ * Received bitmap is 4k per bit, need to aggregate by c->scale.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+		     struct bm_xfer_ctx *c)
+{
+	unsigned long *p;
+	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+				 drbd_header_size(peer_device->connection);
+	/* Words still expected, expressed in the peer's 4k granularity;
+	 * capped at what fits in one packet payload. */
+	unsigned int num_words_4k = min_t(size_t, data_size / sizeof(*p),
+				       (c->bm_words - c->word_offset) << c->scale);
+	unsigned int want = num_words_4k * sizeof(*p);
+	int err;
+
+	/* The peer must send exactly the chunk size we compute ourselves. */
+	if (want != size) {
+		drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+		return -EIO;
+	}
+	if (want == 0)
+		return 0;
+	/* NOTE: the buffer behind p is provided and owned by the transport
+	 * layer (no kfree here) -- presumably valid until the next receive. */
+	err = drbd_recv_all(peer_device->connection, (void **)&p, want);
+	if (err)
+		return err;
+
+	/* A partial aggregation group cannot be folded; reject it. */
+	if ((num_words_4k & ((1 << c->scale)-1)) != 0) {
+		drbd_err(peer_device,
+			"number of words %u not aligned to scale %u while receiving bitmap\n",
+			num_words_4k, c->scale);
+		return -ERANGE;
+	}
+
+	if (get_ldev(peer_device->device)) {
+		/* Fold 4k-granular bits down to our granularity, then merge. */
+		scale_bits(p, num_words_4k, c->scale);
+		drbd_bm_merge_lel(peer_device, c->word_offset, num_words_4k >> c->scale, p);
+		put_ldev(peer_device->device);
+	} else {
+		drbd_err(peer_device, "lost backend device while receiving bitmap\n");
+		return -EIO;
+	}
+
+	/* Advance the transfer context in both granularities. */
+	c->word_offset += num_words_4k >> c->scale;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	c->bit_offset_4k = (c->word_offset << c->scale) * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+
+	return 1;
+}
+
+/* Low nibble of the encoding byte selects the bitmap compression code. */
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+	unsigned int encoding = p->encoding;
+
+	return (enum drbd_bitmap_code)(encoding & 0x0f);
+}
+
+/* Bit 7 of the encoding byte: value (set/unset) of the first run. */
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+	return !!(p->encoding & 0x80);
+}
+
+/* Bits 4..6 of the encoding byte: number of pad bits in the bitstream. */
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+	return (p->encoding & 0x70) >> 4;
+}
+
+/*
+ * recv_bm_rle_bits
+ *
+ * Decode one P_COMPRESSED_BITMAP payload: VLI-encoded run lengths of
+ * alternating set/unset runs, in the peer's 4k-per-bit granularity.
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		 struct bm_xfer_ctx *c,
+		 unsigned int len)
+{
+	struct bitstream bs;
+	u64 look_ahead;		/* sliding 64-bit decode window */
+	u64 rl_4k;		/* current run length, in 4k bits */
+	u64 tmp;
+	unsigned long s_4k = c->bit_offset_4k;	/* start of current run */
+	int toggle = dcbp_get_start(p);		/* run value: 1 = set bits */
+	int have;		/* valid bits remaining in look_ahead */
+	int bits;
+
+	bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
+
+	/* Prime the look-ahead window with the first (up to) 64 bits. */
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return -EIO;
+
+	for (have = bits; have > 0; s_4k += rl_4k, toggle = !toggle) {
+		bits = vli_decode_bits(&rl_4k, look_ahead);
+		if (bits <= 0)
+			return -EIO;
+
+		if (toggle) {
+			/* If peers bm_block_size is smaller than ours,
+			 * this may be a "partially" set bit ;-)
+			 * there is no such thing. Round down s, round up e.
+			 */
+			unsigned long s = s_4k >> c->scale;
+			unsigned long e = ((s_4k + rl_4k + (1UL << c->scale)-1) >> c->scale) - 1;
+
+			if (e >= c->bm_bits) {
+				drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return -EIO;
+			}
+			drbd_bm_set_many_bits(peer_device, s, e);
+		}
+
+		/* A run code longer than the remaining window means the
+		 * stream is corrupt. */
+		if (have < bits) {
+			drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return -EIO;
+		}
+		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+		if (likely(bits < 64))
+			look_ahead >>= bits;
+		else
+			look_ahead = 0;
+		have -= bits;
+
+		/* Refill the window from the stream. */
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+		if (bits < 0)
+			return -EIO;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	/* Record progress in both granularities for the next packet. */
+	c->bit_offset_4k = s_4k;
+	c->bit_offset = s_4k >> c->scale;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (c->bit_offset_4k != c->bm_bits_4k);
+}
+
+/*
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c,
+		unsigned int len)
+{
+	struct drbd_device *device = peer_device->device;
+	int res;
+
+	if (dcbp_get_code(p) != RLE_VLI_Bits) {
+		/* other variants had been implemented for evaluation,
+		 * but have been dropped as this one turned out to be "best"
+		 * during all our tests. */
+		drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+		change_cstate(peer_device->connection, C_PROTOCOL_ERROR, CS_HARD);
+		return -EIO;
+	}
+
+	if (!get_ldev(device)) {
+		drbd_err(peer_device, "lost backend device while receiving bitmap\n");
+		return -EIO;
+	}
+
+	res = recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+	put_ldev(device);
+	return res;
+}
+
+/* Log compression statistics of a bitmap exchange -- but only when the
+ * compressed transfer actually beat the plain-text equivalent. */
+void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	unsigned int header_size = drbd_header_size(peer_device->connection);
+	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+	/* what transferring the same bitmap "plaintext" would have cost */
+	unsigned int plain =
+		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+		c->bm_words * sizeof(unsigned long);
+	unsigned int total = c->bytes[0] + c->bytes[1];
+	unsigned int permille;
+
+	/* total can not be zero. but just in case:
+	 * also don't report when nothing was saved. */
+	if (total == 0 || total >= plain)
+		return;
+
+	/* total < plain. check for overflow, still */
+	permille = (total > UINT_MAX/1000) ? (total / (plain/1000))
+					   : (1000 * total / plain);
+	if (permille > 1000)
+		permille = 1000;
+	permille = 1000 - permille;
+
+	drbd_info(peer_device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, permille/10, permille % 10);
+}
+
+/* Bitmap exchange may only proceed once the local disk has left the
+ * NEGOTIATING state and no two-phase-commit state change is pending. */
+static bool ready_for_bitmap(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	bool ready;
+
+	read_lock_irq(&resource->state_rwlock);
+	ready = device->disk_state[NOW] != D_NEGOTIATING &&
+		!test_bit(TWOPC_STATE_CHANGE_PENDING, &resource->flags);
+	read_unlock_irq(&resource->state_rwlock);
+
+	return ready;
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter if the process it in 32 bit chunks or 64 bit
+   chunks as long as it is little endian. (Understand it as byte stream,
+   beginning with the lowest byte...) If we would use big endian
+   we would need to process it from the highest address to the lowest,
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	enum drbd_repl_state repl_state;
+	struct drbd_device *device;
+	struct bm_xfer_ctx c;
+	int err = -EIO;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	if (peer_device->bitmap_index == -1) {
+		drbd_err(peer_device, "No bitmap allocated in receive_bitmap()!\n");
+		return -EIO;
+	}
+	device = peer_device->device;
+
+	/* Final repl_states become visible when the disk leaves NEGOTIATING state */
+	wait_event_interruptible(device->resource->state_wait,
+				 ready_for_bitmap(device));
+
+	/* First ldev reference: held across the whole packet loop,
+	 * released at the bottom or via the "out" path. */
+	if (!get_ldev(device)) {
+		drbd_err(device, "Cannot receive bitmap, local disk gone\n");
+		return -EIO;
+	}
+
+	drbd_bm_slot_lock(peer_device, "receive bitmap", BM_LOCK_CLEAR | BM_LOCK_BULK);
+	/* you are supposed to send additional out-of-sync information
+	 * if you actually set bits during this phase */
+
+	/* Second, short-lived ldev reference: only guards sampling the
+	 * bitmap geometry into the transfer context below. */
+	if (!get_ldev(device))
+		goto out;
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits_4k = drbd_bm_bits_4k(device),
+		.bm_bits = drbd_bm_bits(device),
+		.bm_words = drbd_bm_words(device),
+		.scale = device->bitmap->bm_block_shift - BM_BLOCK_SHIFT_4k,
+	};
+	put_ldev(device);
+
+	/* Receive packets until the helpers report "done" (0) or error. */
+	for (;;) {
+		if (pi->cmd == P_BITMAP)
+			err = receive_bitmap_plain(peer_device, pi->size, &c);
+		else if (pi->cmd == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p;
+
+			if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+				drbd_err(device, "ReportCBitmap packet too large\n");
+				err = -EIO;
+				goto out;
+			}
+			if (pi->size <= sizeof(*p)) {
+				drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+				err = -EIO;
+				goto out;
+			}
+			err = drbd_recv_all(connection, (void **)&p, pi->size);
+			if (err)
+			       goto out;
+			err = decode_bitmap_c(peer_device, p, &c, pi->size);
+		} else {
+			drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+			err = -EIO;
+			goto out;
+		}
+
+		/* Index 1 counts plain packets, index 0 compressed ones. */
+		c.packets[pi->cmd == P_BITMAP]++;
+		c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+
+		if (err <= 0) {
+			if (err < 0)
+				goto out;
+			break;
+		}
+		err = drbd_recv_header(connection, pi);
+		if (err)
+			goto out;
+	}
+
+	INFO_bm_xfer_stats(peer_device, "receive", &c);
+
+	/* As bitmap target, answer with our own bitmap. */
+	repl_state = peer_device->repl_state[NOW];
+	if (repl_state == L_WF_BITMAP_T) {
+		err = drbd_send_bitmap(device, peer_device);
+		if (err)
+			goto out;
+	}
+
+	drbd_bm_slot_unlock(peer_device);
+	put_ldev(device);
+
+	if (test_bit(B_RS_H_DONE, &peer_device->flags)) {
+		/* We have entered drbd_start_resync() since starting the bitmap exchange. */
+		drbd_warn(peer_device, "Received bitmap more than once; ignoring\n");
+	} else if (repl_state == L_WF_BITMAP_S) {
+		drbd_start_resync(peer_device, L_SYNC_SOURCE, "receive-bitmap");
+	} else if (repl_state == L_WF_BITMAP_T) {
+		if (connection->agreed_pro_version < 110) {
+			enum drbd_state_rv rv;
+
+			/* Omit CS_WAIT_COMPLETE and CS_SERIALIZE with this state
+			 * transition to avoid deadlocks. */
+			rv = stable_change_repl_state(peer_device, L_WF_SYNC_UUID, CS_VERBOSE,
+					"receive-bitmap");
+			D_ASSERT(device, rv == SS_SUCCESS);
+		} else {
+			drbd_start_resync(peer_device, L_SYNC_TARGET, "receive-bitmap");
+		}
+	} else {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		drbd_info(peer_device, "unexpected repl_state (%s) in receive_bitmap\n",
+			  drbd_repl_str(repl_state));
+	}
+
+	return 0;
+ out:
+	/* Error path: drop the slot lock and the first ldev reference. */
+	drbd_bm_slot_unlock(peer_device);
+	put_ldev(device);
+	return err;
+}
+
+/* Handler for unknown packets in the optional number space: warn, then read
+ * and discard the remaining payload so the data stream stays in sync. */
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
+		 pi->cmd, pi->size);
+
+	return ignore_remaining_packet(connection, pi->size);
+}
+
+/* P_UNPLUG_REMOTE: the peer unplugged its request queue towards us. */
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_transport *peer_transport = &connection->transport;
+
+	/* Make sure we've acked all the data associated
+	 * with the data requests being unplugged */
+	peer_transport->class->ops.hint(peer_transport, DATA_STREAM, QUICKACK);
+
+	/* just unplug all devices always, regardless which volume number */
+	drbd_unplug_all_devices(connection);
+
+	return 0;
+}
+
+/*
+ * P_OUT_OF_SYNC: the peer marked this block out of sync on its side;
+ * record the same in our bitmap towards that peer.
+ */
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
 	struct p_block_desc *p = pi->data;
+	sector_t sector;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	sector = be64_to_cpu(p->sector);
+
+	/* see also process_one_request(), before drbd_send_out_of_sync().
+	 * Make sure any pending write requests that potentially may
+	 * set in-sync have drained, before setting it out-of-sync.
+	 * That should be implicit, because of the "epoch" and P_BARRIER logic,
+	 * But let's just double-check.
+	 */
+	conn_wait_active_ee_empty_or_disconnect(connection);
+	conn_wait_done_ee_empty_or_disconnect(connection);
+
+	drbd_set_out_of_sync(peer_device, sector, be32_to_cpu(p->blksize));
+
+	return 0;
+}
+
+/* P_DAGTAG: record the peer's current DAG tag for causal ordering. */
+static int receive_dagtag(struct drbd_connection *connection, struct packet_info *pi)
+{
+	const struct p_dagtag *dagtag_pkt = pi->data;
+
+	set_connection_dagtag(connection, be64_to_cpu(dagtag_pkt->dagtag));
+	return 0;
+}
+
+/*
+ * Find the connection to the peer with the given node id.
+ *
+ * The caller must hold rcu_read_lock() or conf_update; no reference is
+ * acquired, so the result is only stable while that protection lasts.
+ * Returns NULL if no connection to that node id exists.
+ */
+struct drbd_connection *drbd_connection_by_node_id(struct drbd_resource *resource, int node_id)
+{
+	/* Caller needs to hold rcu_read_lock(), conf_update */
+	struct drbd_connection *connection;
+
+	for_each_connection_rcu(connection, resource) {
+		if (connection->peer_node_id == node_id)
+			return connection;
+	}
+
+	return NULL;
+}
+
+/*
+ * Like drbd_connection_by_node_id(), but takes its own reference on the
+ * returned connection so the caller may use it outside the RCU read
+ * section.  The caller is responsible for the matching
+ * kref_put(&connection->kref, drbd_destroy_connection).
+ * Returns NULL if no connection to that node id exists.
+ */
+struct drbd_connection *drbd_get_connection_by_node_id(struct drbd_resource *resource, int node_id)
+{
+	struct drbd_connection *connection;
+
+	rcu_read_lock();
+	connection = drbd_connection_by_node_id(resource, node_id);
+	if (connection)
+		kref_get(&connection->kref);
+	rcu_read_unlock();
+
+	return connection;
+}
+
+/*
+ * P_PEER_DAGTAG: the peer tells us the last DAG tag it saw from a node that
+ * both of us lost.  Based on the UUID comparison of each volume and on the
+ * dagtag offset between the two survivors, decide whether a reconciliation
+ * resync is needed and in which direction, then trigger it for all volumes
+ * of this connection at once.
+ */
+static int receive_peer_dagtag(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device;
+	enum drbd_repl_state new_repl_state;
+	struct p_peer_dagtag *p = pi->data;
+	struct drbd_connection *lost_peer;
+	enum sync_strategy strategy = NO_SYNC;
+	s64 dagtag_offset;
+	int vnr = 0;
+
+	lost_peer = drbd_get_connection_by_node_id(resource, be32_to_cpu(p->node_id));
+	if (!lost_peer)
+		return 0;
+
+	/* If we still see that peer as connected, ping it so that our view is
+	 * current before concluding anything from its supposed loss. */
+	if (lost_peer->cstate[NOW] == C_CONNECTED) {
+		drbd_ping_peer(lost_peer);
+		if (lost_peer->cstate[NOW] == C_CONNECTED)
+			goto out;
+	}
+
+	/* All volumes must agree on a single resync direction; bail out on
+	 * any state that makes a reconciliation resync unnecessary or unsafe. */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		enum sync_strategy ps;
+		enum sync_rule rule;
+		int unused;
+
+		if (peer_device->repl_state[NOW] > L_ESTABLISHED)
+			goto out;
+		if (peer_device->device->disk_state[NOW] != D_CONSISTENT &&
+		    peer_device->device->disk_state[NOW] != D_UP_TO_DATE)
+			goto out;
+		if (!get_ldev(peer_device->device))
+			continue;
+		ps = drbd_uuid_compare(peer_device, &rule, &unused);
+		put_ldev(peer_device->device);
+
+		if (strategy == NO_SYNC) {
+			strategy = ps;
+			if (strategy != NO_SYNC &&
+			    strategy != SYNC_SOURCE_USE_BITMAP &&
+			    strategy != SYNC_TARGET_USE_BITMAP) {
+				drbd_info(peer_device,
+					  "%s(): %s by rule=%s\n",
+					  __func__,
+					  strategy_descriptor(strategy).name,
+					  drbd_sync_rule_str(rule));
+				goto out;
+			}
+		} else if (ps != strategy) {
+			drbd_err(peer_device,
+				 "%s(): Inconsistent resync directions %s %s\n",
+				 __func__,
+				 strategy_descriptor(strategy).name, strategy_descriptor(ps).name);
+			goto out;
+		}
+	}
+
+	/* We must wait until the other receiver thread has called the
+	 * cleanup_unacked_peer_requests() and drbd_notify_peers_lost_primary() functions. If we
+	 * become a resync target, the peer would complain about being in the wrong state when he
+	 * gets the bitmap before the P_PEER_DAGTAG packet.
+	 */
+	wait_event(resource->state_wait,
+		   !test_bit(NOTIFY_PEERS_LOST_PRIMARY, &lost_peer->flags));
+
+	/* If the UUIDs did not decide a direction, the node that saw the
+	 * higher dagtag from the lost peer has the newer data. */
+	dagtag_offset = atomic64_read(&lost_peer->last_dagtag_sector) - (s64)be64_to_cpu(p->dagtag);
+	if (strategy == SYNC_SOURCE_USE_BITMAP) {
+		new_repl_state = L_WF_BITMAP_S;
+	} else if (strategy == SYNC_TARGET_USE_BITMAP) {
+		new_repl_state = L_WF_BITMAP_T;
+	} else {
+		if (dagtag_offset > 0)
+			new_repl_state = L_WF_BITMAP_S;
+		else if (dagtag_offset < 0)
+			new_repl_state = L_WF_BITMAP_T;
+		else
+			new_repl_state = L_ESTABLISHED;
+	}
+
+	if (new_repl_state != L_ESTABLISHED) {
+		unsigned long irq_flags;
+		enum drbd_state_rv rv;
+
+		if (new_repl_state == L_WF_BITMAP_T) {
+			connection->after_reconciliation.dagtag_sector = be64_to_cpu(p->dagtag);
+			connection->after_reconciliation.lost_node_id = be32_to_cpu(p->node_id);
+		}
+
+		/* One state change covering every volume of this connection. */
+		begin_state_change(resource, &irq_flags, CS_VERBOSE);
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			__change_repl_state(peer_device, new_repl_state);
+			set_bit(RECONCILIATION_RESYNC, &peer_device->flags);
+		}
+		rv = end_state_change(resource, &irq_flags, "receive-peer-dagtag");
+		if (rv == SS_SUCCESS) {
+			drbd_info(connection, "Reconciliation resync because \'%s\' disappeared. (o=%d)\n",
+				  lost_peer->transport.net_conf->name, (int)dagtag_offset);
+		} else if (rv == SS_NOTHING_TO_DO) {
+			/* sanitize_state() silently removes the resync and the RECONCILIATION_RESYNC bit */
+			drbd_info(connection, "\'%s\' disappeared (o=%d), no reconciliation since one diskless\n",
+				  lost_peer->transport.net_conf->name, (int)dagtag_offset);
+		} else {
+			drbd_info(connection, "rv = %d\n", rv);
+		}
+	} else {
+		drbd_info(connection, "No reconciliation resync even though \'%s\' disappeared. (o=%d)\n",
+			  lost_peer->transport.net_conf->name, (int)dagtag_offset);
+
+		/* No resync needed towards this peer; clear the out-of-sync bits. */
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			if (get_ldev(peer_device->device)) {
+				drbd_bm_clear_many_bits(peer_device, 0, -1UL);
+				put_ldev(peer_device->device);
+			}
+		}
+	}
+
+out:
+	kref_put(&lost_peer->kref, drbd_destroy_connection);
+	return 0;
+}
+
+/*
+ * Decide whether a diskless peer has moved on to a current UUID that is
+ * genuinely new to us: it exposes a UUID, that UUID differs from the one it
+ * reported last time, we were on the same UUID as the peer before, and the
+ * new UUID is neither one we keep a bitmap for nor one from our history.
+ */
+static bool drbd_diskless_moved_on(struct drbd_peer_device *peer_device, u64 current_uuid)
+{
+	struct drbd_device *device = peer_device->device;
+	const u64 prev_masked = peer_device->current_uuid & ~UUID_PRIMARY;
+	const u64 new_masked = current_uuid & ~UUID_PRIMARY;
+	bool known_old_uuid = false;
+
+	/* No exposed UUID => did not move on. */
+	if (!current_uuid)
+		return false;
+
+	/* Same as last time => did not move on. */
+	if (prev_masked == new_masked)
+		return false;
+
+	/* Only consider the peer to have moved on if we were on the same UUID. */
+	if (prev_masked != (drbd_current_uuid(device) & ~UUID_PRIMARY))
+		return false;
+
+	/* A UUID we keep a bitmap for, or remember in our history, is old. */
+	if (get_ldev(device)) {
+		known_old_uuid =
+			drbd_find_bitmap_by_uuid(peer_device, new_masked) != -1 ||
+			uuid_in_my_history(device, new_masked);
+		put_ldev(device);
+	}
+
+	/* It is an old UUID => did not move on. */
+	return !known_old_uuid;
+}
+
+/* Accept a new current UUID generated on a diskless node, that just became primary
+   (or during handshake) */
+static int receive_current_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_current_uuid *p = pi->data;
+	u64 current_uuid, weak_nodes;
+	bool moved_on;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	current_uuid = be64_to_cpu(p->uuid);
+	weak_nodes = be64_to_cpu(p->weak_nodes);
+	/* Count the sender itself among the weak nodes. */
+	weak_nodes |= NODE_MASK(peer_device->node_id);
+	/* Must be evaluated before peer_device->current_uuid is updated below,
+	 * since it compares the new UUID against the previously reported one. */
+	moved_on = drbd_diskless_moved_on(peer_device, current_uuid);
+
+	peer_device->current_uuid = current_uuid;
+
+	/* Remember in our meta-data that this peer exists. */
+	if (get_ldev(device)) {
+		struct drbd_peer_md *peer_md = &device->ldev->md.peers[peer_device->node_id];
+		peer_md->flags |= MDF_NODE_EXISTS;
+		put_ldev(device);
+	}
+	if (connection->peer_role[NOW] == R_PRIMARY)
+		check_resync_source(device, weak_nodes);
+
+	/* Still in the connect handshake: only record what we learned. */
+	if (connection->peer_role[NOW] == R_UNKNOWN) {
+		set_bit(CURRENT_UUID_RECEIVED, &peer_device->flags);
+		if (moved_on && device->disk_state[NOW] > D_OUTDATED)
+			peer_device->connect_state.disk = D_OUTDATED;
+		return 0;
+	}
+
+	if (current_uuid == drbd_current_uuid(device))
+		return 0;
+
+	if (peer_device->repl_state[NOW] >= L_ESTABLISHED &&
+	    get_ldev_if_state(device, D_UP_TO_DATE)) {
+		if (connection->peer_role[NOW] == R_PRIMARY) {
+			drbd_warn(peer_device, "received new current UUID: %016llX "
+				  "weak_nodes=%016llX\n", current_uuid, weak_nodes);
+			drbd_uuid_received_new_current(peer_device, current_uuid, weak_nodes);
+			drbd_md_sync_if_dirty(device);
+		} else if (moved_on) {
+			/* The peer moved on without us: our data is outdated.
+			 * During a remote state change, defer the disk state
+			 * transition to the two-phase-commit commit phase. */
+			if (resource->remote_state_change)
+				set_bit(OUTDATE_ON_2PC_COMMIT, &device->flags);
+			else
+				change_disk_state(device, D_OUTDATED, CS_VERBOSE,
+						"receive-current-uuid", NULL);
+		}
+		put_ldev(device);
+	} else if (device->disk_state[NOW] == D_DISKLESS && resource->role[NOW] == R_PRIMARY) {
+		/* Diskless primary: adopt the peer's current UUID as the exposed one. */
+		drbd_uuid_set_exposed(device, peer_device->current_uuid, true);
+	}
+
+	return 0;
+}
+
+/* True if i2 begins exactly where i1 ends (size in bytes, sector in 512B units). */
+static bool interval_is_adjacent(const struct drbd_interval *i1, const struct drbd_interval *i2)
+{
+	sector_t end_of_first = i1->sector + (i1->size >> SECTOR_SHIFT);
+
+	return end_of_first == i2->sector;
+}
+
+/* Advance caching pointers received_last and discard_last. Return next discard to be submitted. */
+/*
+ * Called with connection->peer_reqs_lock held and IRQs disabled (see
+ * drbd_process_rs_discards()).  "align" is the merge limit in sectors.
+ * With "submit_all" set, requests are considered even before their data has
+ * been received (used when draining on disconnect).
+ */
+static struct drbd_peer_request *drbd_advance_to_next_rs_discard(
+		struct drbd_peer_device *peer_device, unsigned int align, bool submit_all)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+	struct drbd_peer_request *discard_last = peer_device->discard_last;
+	bool discard_range_end = false;
+
+	/* Advance received_last. */
+	peer_req = list_prepare_entry(peer_device->received_last,
+			&peer_device->resync_requests, recv_order);
+	list_for_each_entry_continue(peer_req, &peer_device->resync_requests, recv_order) {
+		if (!test_bit(INTERVAL_RECEIVED, &peer_req->i.flags) && !submit_all)
+			break;
+
+		/* Trim requests are handled by the discard scan below. */
+		if (peer_req->flags & EE_TRIM)
+			break;
+
+		peer_device->received_last = peer_req;
+	}
+
+	/* Advance discard_last. */
+	peer_req = discard_last ? discard_last :
+		list_prepare_entry(peer_device->received_last,
+				&peer_device->resync_requests, recv_order);
+	list_for_each_entry_continue(peer_req, &peer_device->resync_requests, recv_order) {
+		/* Consider submitting previous discards. */
+		if (discard_last && !interval_is_adjacent(&discard_last->i, &peer_req->i)) {
+			discard_range_end = true;
+			break;
+		}
+
+		/* A non-trim request ends the discard run once its data has
+		 * been received (or unconditionally with submit_all). */
+		if (!(peer_req->flags & EE_TRIM)) {
+			discard_range_end =
+				test_bit(INTERVAL_RECEIVED, &peer_req->i.flags) || submit_all;
+			break;
+		}
+
+		discard_last = peer_req;
+
+		/* Stop merging at an alignment boundary. */
+		if (IS_ALIGNED(peer_req->i.sector + (peer_req->i.size >> SECTOR_SHIFT), align)) {
+			discard_range_end = true;
+			break;
+		}
+	}
+
+	/*
+	 * If we haven't found a discard range, or that range is not
+	 * finished, then there is nothing to submit.
+	 */
+	if (!discard_last || !(discard_range_end || discard_last->flags & EE_LAST_RESYNC_REQUEST)) {
+		peer_device->discard_last = discard_last;
+		return NULL;
+	}
+
+	/* Find start of discard range. */
+	peer_req = list_next_entry(list_prepare_entry(peer_device->received_last,
+				&peer_device->resync_requests, recv_order), recv_order);
+
+	/* Merge the run [peer_req..discard_last] into peer_req's interval;
+	 * the merged-away requests leave the interval tree. */
+	spin_lock(&device->interval_lock); /* irqs already disabled */
+	if (peer_req != discard_last) {
+		struct drbd_peer_request *peer_req_merged = peer_req;
+
+		list_for_each_entry_continue(peer_req_merged,
+				&peer_device->resync_requests, recv_order) {
+			drbd_remove_interval(&device->requests, &peer_req_merged->i);
+			drbd_clear_interval(&peer_req_merged->i);
+			peer_req_merged->w.cb = e_end_resync_block;
+			if (peer_req_merged == discard_last)
+				break;
+		}
+	}
+	drbd_update_interval_size(&peer_req->i,
+			discard_last->i.size +
+			((discard_last->i.sector - peer_req->i.sector) << SECTOR_SHIFT));
+	spin_unlock(&device->interval_lock);
+
+	peer_device->received_last = discard_last;
+	peer_device->discard_last = NULL;
+
+	return peer_req;
+}
+
+/*
+ * Submit one (possibly merged) resync discard.  Without a local disk, fail
+ * it cleanly: negatively ack it, take it out of the interval tree, unmerge
+ * the run again and free each request.
+ */
+static void drbd_submit_rs_discard(struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+
+	if (get_ldev(device)) {
+		list_del(&peer_req->w.list);
+
+		peer_req->w.cb = e_end_resync_block;
+		peer_req->bios.head->bi_opf = REQ_OP_DISCARD;
+
+		atomic_inc(&connection->backing_ee_cnt);
+		drbd_conflict_submit_resync_request(peer_req);
+
+		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(). */
+	} else {
+		LIST_HEAD(free_list);
+		struct drbd_peer_request *t;
+
+		if (drbd_ratelimit())
+			drbd_err(device, "Cannot discard on local disk.\n");
+
+		drbd_send_ack(peer_device, P_RS_NEG_ACK, peer_req);
+
+		drbd_remove_peer_req_interval(peer_req);
+		list_move_tail(&peer_req->w.list, &free_list);
+
+		/* Split the merged run up again so each request can be freed. */
+		spin_lock_irq(&connection->peer_reqs_lock);
+		drbd_unmerge_discard(peer_req, &free_list);
+		spin_unlock_irq(&connection->peer_reqs_lock);
+
+		list_for_each_entry_safe(peer_req, t, &free_list, w.list)
+			drbd_free_peer_req(peer_req);
+	}
+}
+
+/* Find and submit discards in resync_requests which are ready. */
+void drbd_process_rs_discards(struct drbd_peer_device *peer_device, bool submit_all)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+	struct drbd_peer_request *pr_tmp;
+	/* The merge alignment is measured in sectors; without access to the
+	 * backing device, fall back to the DRBD maximum resync discard size. */
+	unsigned int align = DRBD_MAX_RS_DISCARD_SIZE >> SECTOR_SHIFT;
+	LIST_HEAD(work_list);
+
+	if (get_ldev(device)) {
+		/*
+		 * Limit the size of the merged requests. We want to allow the size to
+		 * increase up to the backing discard granularity.  If that is smaller
+		 * than DRBD_MAX_RS_DISCARD_SIZE, then allow merging up to a size of
+		 * DRBD_MAX_RS_DISCARD_SIZE.
+		 */
+		align = max(DRBD_MAX_RS_DISCARD_SIZE, bdev_discard_granularity(
+					device->ldev->backing_bdev)) >> SECTOR_SHIFT;
+		put_ldev(device);
+	}
+
+	/* Collect all ready discard runs under the lock, submit them after. */
+	spin_lock_irq(&connection->peer_reqs_lock);
+	while (true) {
+		peer_req = drbd_advance_to_next_rs_discard(peer_device, align, submit_all);
+		if (!peer_req)
+			break;
+
+		list_add_tail(&peer_req->w.list, &work_list);
+	}
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	list_for_each_entry_safe(peer_req, pr_tmp, &work_list, w.list)
+		drbd_submit_rs_discard(peer_req); /* removes it from the work_list */
+}
+
+/*
+ * P_RS_DEALLOCATED / P_RS_DEALLOCATED_ID: the sync source tells us this
+ * range is deallocated; turn the matching resync request into a discard
+ * instead of writing zeroed data.
+ */
+static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
+	struct drbd_peer_request *peer_req;
 	sector_t sector;
 	int size, err = 0;
+	u64 block_id;
+	u64 im;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
 		return -EIO;
 	device = peer_device->device;
 
-	sector = be64_to_cpu(p->sector);
-	size = be32_to_cpu(p->blksize);
+	/* P_RS_DEALLOCATED_ID additionally carries the request's block_id. */
+	if (pi->cmd == P_RS_DEALLOCATED) {
+		struct p_block_desc *p = pi->data;
+
+		sector = be64_to_cpu(p->sector);
+		size = be32_to_cpu(p->blksize);
+		block_id = ID_SYNCER;
+	} else { /* P_RS_DEALLOCATED_ID */
+		struct p_block_ack *p = pi->data;
+
+		sector = be64_to_cpu(p->sector);
+		size = be32_to_cpu(p->blksize);
+		block_id = p->block_id;
+	}
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_RESYNC_WRITE),
+			sector, size, block_id);
+	if (!peer_req)
+		return -EIO;
+
+	dec_rs_pending(peer_device);
+	inc_unacked(peer_device);
+	atomic_add(size >> 9, &device->rs_sect_ev);
+	/* Mark as discard; drbd_process_rs_discards() may merge adjacent ones. */
+	peer_req->flags |= EE_TRIM;
+
+	/* Setting all peers out of sync here. The sync source peer will be
+	 * set in sync when the discard completes. The sync source will soon
+	 * set other peers in sync with a P_PEERS_IN_SYNC packet.
+	 */
+	drbd_set_all_out_of_sync(device, sector, size);
+	drbd_process_rs_discards(peer_device, false);
+	rs_sectors_came_in(peer_device, size);
+
+	/* Propagate the out-of-sync information to peers we are (or are about
+	 * to become) sync source for. */
+	for_each_peer_device_ref(peer_device, im, device) {
+		enum drbd_repl_state repl_state = peer_device->repl_state[NOW];
+
+		if (repl_is_sync_source(repl_state) || repl_state == L_WF_BITMAP_S)
+			drbd_send_out_of_sync(peer_device, sector, size);
+	}
+
+	return err;
+}
+
+/*
+ * Flag the most recently received resync request so that the discard
+ * merging logic knows no further request will follow it, then submit any
+ * discard ranges that are now complete.
+ */
+void drbd_last_resync_request(struct drbd_peer_device *peer_device, bool submit_all)
+{
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_peer_request *last_req = NULL;
+
+	spin_lock_irq(&connection->peer_reqs_lock);
+	if (!list_empty(&peer_device->resync_requests))
+		last_req = list_last_entry(&peer_device->resync_requests,
+				struct drbd_peer_request, recv_order);
+	if (last_req)
+		last_req->flags |= EE_LAST_RESYNC_REQUEST;
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	drbd_process_rs_discards(peer_device, submit_all);
+}
+
+/* P_DISCONNECT: the peer announced it is disconnecting; tear down our side. */
+static int receive_disconnect(struct drbd_connection *connection, struct packet_info *pi)
+{
+	change_cstate_tag(connection, C_DISCONNECTING, CS_HARD, "receive-disconnect", NULL);
+	return 0;
+}
+
+/*
+ * Per-packet dispatch info for the data stream.  Entries without a handler
+ * (fn == NULL, the gaps in the table below) are rejected by drbdd().
+ */
+struct data_cmd {
+	int expect_payload;	/* 1 if more than pkt_size bytes may follow */
+	unsigned int pkt_size;	/* size of the fixed sub-header */
+	int (*fn)(struct drbd_connection *, struct packet_info *);
+};
+
+/* Handler table for the data stream, indexed by packet type. */
+static struct data_cmd drbd_cmd_handler[] = {
+	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
+	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
+	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply },
+	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier },
+	[P_BITMAP]	    = { 1, 0, receive_bitmap },
+	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap },
+	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
+	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_data_request },
+	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_data_request },
+	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
+	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
+	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
+	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
+	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
+	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_SYNC_UUID]       = { 0, sizeof(struct p_uuid), receive_sync_uuid },
+	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_data_request },
+	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_ov_reply },
+	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_data_request },
+	[P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_data_request },
+	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
+	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_TWOPC_PREPARE] = { 0, sizeof(struct p_twopc_request), receive_twopc },
+	[P_TWOPC_PREP_RSZ]  = { 0, sizeof(struct p_twopc_request), receive_twopc },
+	[P_TWOPC_ABORT] = { 0, sizeof(struct p_twopc_request), receive_twopc },
+	[P_DAGTAG]	    = { 0, sizeof(struct p_dagtag), receive_dagtag },
+	[P_UUIDS110]	    = { 1, sizeof(struct p_uuids110), receive_uuids110 },
+	[P_PEER_DAGTAG]     = { 0, sizeof(struct p_peer_dagtag), receive_peer_dagtag },
+	[P_CURRENT_UUID]    = { 0, sizeof(struct p_current_uuid), receive_current_uuid },
+	[P_TWOPC_COMMIT]    = { 0, sizeof(struct p_twopc_request), receive_twopc },
+	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
+	[P_ZEROES]	    = { 0, sizeof(struct p_trim), receive_Data },
+	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
+	[P_RS_DEALLOCATED_ID] = { 0, sizeof(struct p_block_ack), receive_rs_deallocated },
+	[P_DISCONNECT]      = { 0, 0, receive_disconnect },
+	[P_RS_DAGTAG_REQ]   = { 0, sizeof(struct p_rs_req), receive_dagtag_data_request },
+	[P_RS_CSUM_DAGTAG_REQ]   = { 1, sizeof(struct p_rs_req), receive_dagtag_data_request },
+	[P_RS_THIN_DAGTAG_REQ]   = { 0, sizeof(struct p_rs_req), receive_dagtag_data_request },
+	[P_OV_DAGTAG_REQ]   = { 0, sizeof(struct p_rs_req), receive_dagtag_data_request },
+	[P_OV_DAGTAG_REPLY]   = { 1, sizeof(struct p_rs_req), receive_dagtag_ov_reply },
+	[P_FLUSH_REQUESTS]  = { 0, sizeof(struct p_flush_requests), receive_flush_requests },
+	[P_FLUSH_REQUESTS_ACK] = { 0, sizeof(struct p_flush_ack), receive_flush_requests_ack },
+	[P_ENABLE_REPLICATION_NEXT] = { 0, sizeof(struct p_enable_replication),
+		receive_enable_replication_next },
+	[P_ENABLE_REPLICATION] = { 0, sizeof(struct p_enable_replication),
+		receive_enable_replication },
+};
+
+/*
+ * Main loop of the receiver thread for the data stream: read a packet
+ * header, validate it against the dispatch table, receive the fixed
+ * sub-header, and hand off to the packet's handler.  Any protocol violation
+ * or handler failure forces the connection into C_PROTOCOL_ERROR.
+ */
+static void drbdd(struct drbd_connection *connection)
+{
+	struct packet_info pi;
+	size_t shs; /* sub header size */
+	int err;
+
+	while (get_t_state(&connection->receiver) == RUNNING) {
+		struct data_cmd const *cmd;
+
+		drbd_thread_current_set_cpu(&connection->receiver);
+		update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
+		if (drbd_recv_header_maybe_unplug(connection, &pi))
+			goto err_out;
+
+		/* Check the bounds before forming a pointer into the table. */
+		if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) ||
+			     !drbd_cmd_handler[pi.cmd].fn)) {
+			drbd_err(connection, "Unexpected data packet %s (0x%04x)\n",
+				 drbd_packet_name(pi.cmd), pi.cmd);
+			goto err_out;
+		}
+		cmd = &drbd_cmd_handler[pi.cmd];
+
+		shs = cmd->pkt_size;
+		/* P_SIZES grew by a queue limits block with the WSAME feature. */
+		if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
+			shs += sizeof(struct o_qlim);
+		if (pi.size > shs && !cmd->expect_payload) {
+			drbd_err(connection, "No payload expected %s l:%d\n",
+				 drbd_packet_name(pi.cmd), pi.size);
+			goto err_out;
+		}
+		if (pi.size < shs) {
+			drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
+				 drbd_packet_name(pi.cmd), (int)shs, pi.size);
+			goto err_out;
+		}
+
+		if (shs) {
+			update_receiver_timing_details(connection, drbd_recv_all_warn);
+			err = drbd_recv_all_warn(connection, &pi.data, shs);
+			if (err)
+				goto err_out;
+			/* Handlers only see the remaining payload size. */
+			pi.size -= shs;
+		}
+
+		update_receiver_timing_details(connection, cmd->fn);
+		err = cmd->fn(connection, &pi);
+		if (err) {
+			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+				 drbd_packet_name(pi.cmd), err, pi.size);
+			goto err_out;
+		}
+	}
+	return;
+
+    err_out:
+	change_cstate(connection, C_PROTOCOL_ERROR, CS_HARD);
+}
+
+/*
+ * Cancel this peer device's resync requests that sit in the interval tree
+ * waiting for a conflict to resolve.  Requests that are already submitted,
+ * already canceled, or sent-but-not-yet-answered are left alone.  Canceled
+ * requests are queued to the device's conflict worker for asynchronous
+ * teardown.
+ */
+static void drbd_cancel_conflicting_resync_requests(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	struct conflict_worker *submit_conflict = &device->submit_conflict;
+	struct rb_node *node;
+	bool any_queued = false;
+
+	spin_lock_irq(&device->interval_lock);
+	for (node = rb_first(&device->requests); node; node = rb_next(node)) {
+		struct drbd_interval *i = rb_entry(node, struct drbd_interval, rb);
+		struct drbd_peer_request *peer_req;
+
+		if (!drbd_interval_is_resync(i))
+			continue;
+
+		peer_req = container_of(i, struct drbd_peer_request, i);
+
+		if (peer_req->peer_device != peer_device)
+			continue;
+
+		/* Only cancel requests which are waiting for conflicts to resolve. */
+		if (test_bit(INTERVAL_SUBMITTED, &i->flags) ||
+				(test_bit(INTERVAL_READY_TO_SEND, &i->flags) &&
+				 !test_bit(INTERVAL_RECEIVED, &i->flags)) ||
+				test_bit(INTERVAL_CANCELED, &i->flags))
+			continue;
+
+		set_bit(INTERVAL_CANCELED, &i->flags);
+
+		dynamic_drbd_dbg(device,
+				"Cancel %s %s request at %llus+%u (sent=%d)\n",
+				test_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &i->flags) ?
+					"already queued" : "unqueued",
+				drbd_interval_type_str(i),
+				(unsigned long long) i->sector, i->size,
+				test_bit(INTERVAL_READY_TO_SEND, &i->flags));
+
+		/* Already on the conflict worker's lists; the worker will see
+		 * INTERVAL_CANCELED. */
+		if (test_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &i->flags))
+			continue;
+
+		set_bit(INTERVAL_SUBMIT_CONFLICT_QUEUED, &i->flags);
+
+		spin_lock(&submit_conflict->lock);
+		switch (i->type) {
+		case INTERVAL_RESYNC_WRITE:
+			list_add_tail(&peer_req->w.list, &submit_conflict->resync_writes);
+			break;
+		case INTERVAL_RESYNC_READ:
+			list_add_tail(&peer_req->w.list, &submit_conflict->resync_reads);
+			break;
+		default:
+			drbd_err(peer_device, "Unexpected interval type in %s\n", __func__);
+		}
+		spin_unlock(&submit_conflict->lock);
+
+		any_queued = true;
+	}
+	spin_unlock_irq(&device->interval_lock);
+
+	if (any_queued)
+		queue_work(submit_conflict->wq, &submit_conflict->worker);
+}
+
+/*
+ * Cancel peer requests that wait for a dagtag from the given node; once that
+ * node is lost, the dependency can never be fulfilled.
+ *
+ * NOTE(review): the inner loop moves at most one matching request per
+ * connection (the "break" keeps the non-_safe iteration valid).  Confirm
+ * that at most one dagtag-dependent request per connection can be queued
+ * here, or that callers re-invoke until the lists are clean.
+ */
+static void cancel_dagtag_dependent_requests(struct drbd_resource *resource, unsigned int node_id)
+{
+	struct drbd_connection *connection;
+	LIST_HEAD(work_list);
+	struct drbd_peer_request *peer_req, *t;
+
+	rcu_read_lock();
+	for_each_connection_rcu(connection, resource) {
+		spin_lock_irq(&connection->peer_reqs_lock);
+		list_for_each_entry(peer_req, &connection->dagtag_wait_ee, w.list) {
+			if (peer_req->depend_dagtag_node_id != node_id)
+				continue;
+
+			dynamic_drbd_dbg(peer_req->peer_device, "%s at %llus+%u: Wait for dagtag %llus from peer %u cancelled\n",
+					drbd_interval_type_str(&peer_req->i),
+					(unsigned long long) peer_req->i.sector, peer_req->i.size,
+					(unsigned long long) peer_req->depend_dagtag,
+					node_id);
+
+			list_move_tail(&peer_req->w.list, &work_list);
+			break;
+		}
+		spin_unlock_irq(&connection->peer_reqs_lock);
+	}
+	rcu_read_unlock();
+
+	/* Free outside of the RCU read section and the per-connection locks. */
+	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+		drbd_peer_resync_read_cancel(peer_req);
+		drbd_free_peer_req(peer_req);
+	}
+}
+
+/* Reset resync bookkeeping and stop the resync timers for this peer. */
+static void cleanup_resync_leftovers(struct drbd_peer_device *peer_device)
+{
+	peer_device->rs_total = 0;
+	peer_device->rs_failed = 0;
+	D_ASSERT(peer_device, atomic_read(&peer_device->rs_pending_cnt) == 0);
+
+	/* Stop the timer, then run its function once synchronously so that a
+	 * pending expiry is not lost.  NOTE(review): assumes resync_timer_fn
+	 * is safe to call directly in this context — confirm. */
+	timer_delete_sync(&peer_device->resync_timer);
+	resync_timer_fn(&peer_device->resync_timer);
+	timer_delete_sync(&peer_device->start_resync_timer);
+}
+
+/*
+ * Free resync requests that are still waiting for a reply, and drop
+ * resync-related entries from the connection's receive-order list.  Used
+ * while tearing a connection down; the replies will never arrive.
+ */
+static void free_waiting_resync_requests(struct drbd_connection *connection)
+{
+	LIST_HEAD(free_list);
+	struct drbd_peer_device *peer_device;
+	struct drbd_peer_request *peer_req, *t;
+	int vnr;
+
+	spin_lock_irq(&connection->peer_reqs_lock);
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		list_for_each_entry_safe(peer_req, t, &peer_device->resync_requests, recv_order) {
+			drbd_list_del_resync_request(peer_req);
+			list_add_tail(&peer_req->w.list, &free_list);
+		}
+	}
+	rcu_read_unlock();
+
+	/* Genuine peer reads stay; only resync entries are taken off. */
+	list_for_each_entry_safe(peer_req, t, &connection->peer_reads, recv_order) {
+		if (peer_req->i.type == INTERVAL_PEER_READ)
+			continue;
+
+		peer_req->flags &= ~EE_ON_RECV_ORDER;
+		list_del(&peer_req->recv_order);
+
+		list_add_tail(&peer_req->w.list, &free_list);
+	}
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	list_for_each_entry_safe(peer_req, t, &free_list, w.list) {
+		/*
+		 * Resync write requests waiting for peers-in-sync to be sent
+		 * just need to be freed.
+		 */
+		if (test_bit(INTERVAL_COMPLETED, &peer_req->i.flags)) {
+			drbd_free_peer_req(peer_req);
+			continue;
+		}
+
+		D_ASSERT(connection, test_bit(INTERVAL_READY_TO_SEND, &peer_req->i.flags));
+		D_ASSERT(connection, !test_bit(INTERVAL_RECEIVED, &peer_req->i.flags));
+		D_ASSERT(connection, !(peer_req->flags & EE_TRIM));
+
+		/* Undo the in-flight accounting done when the read was sent. */
+		if (peer_req->i.type == INTERVAL_RESYNC_READ)
+			atomic_sub(peer_req->i.size >> 9, &connection->rs_in_flight);
+
+		dec_rs_pending(peer_req->peer_device);
+		drbd_remove_peer_req_interval(peer_req);
+		drbd_free_peer_req(peer_req);
+	}
+}
+
+/*
+ * Free peer requests parked on dagtag_wait_ee.  Drops the unacked count and
+ * the ldev reference each waiting request holds.
+ */
+static void free_dagtag_wait_requests(struct drbd_connection *connection)
+{
+	LIST_HEAD(dagtag_wait_work_list);
+	struct drbd_peer_request *peer_req, *t;
+
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_splice_init(&connection->dagtag_wait_ee, &dagtag_wait_work_list);
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
+	list_for_each_entry_safe(peer_req, t, &dagtag_wait_work_list, w.list) {
+		struct drbd_peer_device *peer_device = peer_req->peer_device;
+
+		/* Verify requests are placed in the interval tree when
+		 * the request is made, so they need to be removed if
+		 * the reply was waiting for a dagtag to be reached. */
+		if (peer_req->i.type == INTERVAL_OV_READ_SOURCE)
+			drbd_remove_peer_req_interval(peer_req);
+
+		drbd_free_peer_req(peer_req);
+		dec_unacked(peer_device);
+		put_ldev(peer_device->device);
+	}
+}
+
+/*
+ * Bring all resync and online-verify activity on this connection to a halt
+ * and free every request that can no longer make progress.  Part of the
+ * connection-level disconnect path.
+ */
+static void drain_resync_activity(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	/*
+	 * In order to understand this function, refer to the flow diagrams in
+	 * the comments for make_resync_request(), make_ov_request() and
+	 * receive_dagtag_data_request().
+	 */
+
+	/*
+	 * We could receive data from a peer at any point. This might release a
+	 * request that is waiting for a dagtag. That would cause it to
+	 * progress to waiting for conflicts or the backing disk. So we need to
+	 * remove these requests before flushing the other stages.
+	 */
+	free_dagtag_wait_requests(connection);
+
+	/* Wait for w_resync_timer/w_e_send_csum to finish, if running. */
+	drbd_flush_workqueue(&connection->sender_work);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		/* Hold a device reference while working outside the RCU section. */
+		kref_get(&device->kref);
+		rcu_read_unlock();
+
+		/* Cause remaining discards to be submitted. */
+		drbd_last_resync_request(peer_device, true);
+		/* Cause requests waiting due to conflicts to be canceled. */
+		drbd_cancel_conflicting_resync_requests(peer_device);
+
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	/* Drain conflicting and backing requests. */
+	wait_event(connection->ee_wait, atomic_read(&connection->backing_ee_cnt) == 0);
+
+	/* Wait for work queued when backing requests finished. */
+	drbd_flush_workqueue(&connection->sender_work);
+
+	/* Clear up and remove requests that have progressed to done_ee. */
+	drbd_finish_peer_reqs(connection);
+
+	/*
+	 * Requests that are waiting for a resync reply must be removed from
+	 * the interval tree and then freed.
+	 */
+	free_waiting_resync_requests(connection);
+
+	/* Requests that are waiting for a dagtag on this connection must be
+	 * cancelled, because the dependency will never be fulfilled. */
+	cancel_dagtag_dependent_requests(connection->resource, connection->peer_node_id);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+
+		cleanup_resync_leftovers(peer_device);
+
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
 
-	dec_rs_pending(peer_device);
+static void peer_device_disconnected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
 
-	if (get_ldev(device)) {
-		struct drbd_peer_request *peer_req;
+	if (test_and_clear_bit(HOLDING_UUID_READ_LOCK, &peer_device->flags))
+		up_read_non_owner(&device->uuid_sem);
 
-		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
-					       size, 0, GFP_NOIO);
-		if (!peer_req) {
-			put_ldev(device);
-			return -ENOMEM;
-		}
+	peer_device_init_connect_state(peer_device);
 
-		peer_req->w.cb = e_end_resync_block;
-		peer_req->opf = REQ_OP_DISCARD;
-		peer_req->submit_jif = jiffies;
-		peer_req->flags |= EE_TRIM;
+	/* No need to start additional resyncs after reconnection. */
+	peer_device->resync_again = 0;
 
-		spin_lock_irq(&device->resource->req_lock);
-		list_add_tail(&peer_req->w.list, &device->sync_ee);
-		spin_unlock_irq(&device->resource->req_lock);
+	if (!drbd_suspended(device)) {
+		struct drbd_resource *resource = device->resource;
 
-		atomic_add(pi->size >> 9, &device->rs_sect_ev);
-		err = drbd_submit_peer_request(peer_req);
+		/* We need to create the new UUID immediately when we finish
+		   requests that did not reach the lost peer.
+		   But when we lost quorum we are going to finish those
+		   requests with error, therefore do not create the new UUID
+		   immediately! */
+		if (!list_empty(&resource->transfer_log) &&
+		    drbd_data_accessible(device, NOW) &&
+		    !test_bit(PRIMARY_LOST_QUORUM, &device->flags) &&
+		    test_and_clear_bit(NEW_CUR_UUID, &device->flags))
+			drbd_check_peers_new_current_uuid(device);
+	}
 
-		if (err) {
-			spin_lock_irq(&device->resource->req_lock);
-			list_del(&peer_req->w.list);
-			spin_unlock_irq(&device->resource->req_lock);
+	drbd_md_sync(device);
 
-			drbd_free_peer_req(device, peer_req);
-			put_ldev(device);
-			err = 0;
-			goto fail;
-		}
+	if (get_ldev(device)) {
+		drbd_bitmap_io(device, &drbd_bm_write_copy_pages, "write from disconnected",
+				BM_LOCK_BULK | BM_LOCK_SINGLE_SLOT, peer_device);
+		put_ldev(device);
+	}
+}
 
-		inc_unacked(device);
+static bool initiator_can_commit_or_abort(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	bool remote = resource->twopc_reply.initiator_node_id != resource->res_opts.node_id;
 
-		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
-		   as well as drbd_rs_complete_io() */
-	} else {
-	fail:
-		drbd_rs_complete_io(device, sector);
-		drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
+	if (remote) {
+		u64 parents = resource->twopc_parent_nodes & ~NODE_MASK(connection->peer_node_id);
+
+		if (!parents)
+			return false;
+		resource->twopc_parent_nodes = parents;
 	}
 
-	atomic_add(size >> 9, &device->rs_sect_in);
+	if (test_bit(TWOPC_PREPARED, &connection->flags) &&
+	    !(test_bit(TWOPC_YES, &connection->flags) ||
+	      test_bit(TWOPC_NO, &connection->flags) ||
+	      test_bit(TWOPC_RETRY, &connection->flags)))
+		return false;
 
-	return err;
+	return true;
 }
 
-struct data_cmd {
-	int expect_payload;
-	unsigned int pkt_size;
-	int (*fn)(struct drbd_connection *, struct packet_info *);
-};
+static void cleanup_remote_state_change(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct twopc_reply *reply = &resource->twopc_reply;
+	struct twopc_request request;
+	bool remote = false;
 
-static struct data_cmd drbd_cmd_handler[] = {
-	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
-	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
-	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
-	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
-	[P_BITMAP]	    = { 1, 0, receive_bitmap } ,
-	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
-	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
-	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
-	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
-	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
-	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
-	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
-	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
-	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
-	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
-	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
-	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
-	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
-	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
-	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
-	[P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
-	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
-	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
-	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
-	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
-	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
-	[P_ZEROES]	    = { 0, sizeof(struct p_trim), receive_Data },
-	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
-};
+	write_lock_irq(&resource->state_rwlock);
+	if (resource->remote_state_change && !initiator_can_commit_or_abort(connection)) {
+		remote = reply->initiator_node_id != resource->res_opts.node_id;
 
-static void drbdd(struct drbd_connection *connection)
+		if (remote)
+			request = (struct twopc_request) {
+				.nodes_to_reach = ~0,
+				.cmd = P_TWOPC_ABORT,
+				.tid = reply->tid,
+				.initiator_node_id = reply->initiator_node_id,
+				.target_node_id = reply->target_node_id,
+				.vnr = reply->vnr,
+			};
+
+		drbd_info(connection, "Aborting %s state change %u commit not possible\n",
+			  remote ? "remote" : "local", reply->tid);
+		if (remote) {
+			timer_delete(&resource->twopc_timer);
+			__clear_remote_state_change(resource);
+		} else {
+			enum alt_rv alt_rv = abort_local_transaction(connection, 0);
+			if (alt_rv != ALT_LOCKED)
+				return;
+		}
+	}
+	write_unlock_irq(&resource->state_rwlock);
+
+	/* for a local transaction, change_cluster_wide_state() sends the P_TWOPC_ABORTs */
+	if (remote)
+		nested_twopc_abort(resource, &request);
+}
+
+static void drbd_notify_peers_lost_primary(struct drbd_connection *lost_peer)
 {
-	struct packet_info pi;
-	size_t shs; /* sub header size */
-	int err;
+	struct drbd_resource *resource = lost_peer->resource;
+	struct drbd_connection *connection;
+	u64 im;
 
-	while (get_t_state(&connection->receiver) == RUNNING) {
-		struct data_cmd const *cmd;
+	for_each_connection_ref(connection, im, resource) {
+		struct drbd_peer_device *peer_device;
+		bool send_dagtag = false;
+		int vnr;
 
-		drbd_thread_current_set_cpu(&connection->receiver);
-		update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
-		if (drbd_recv_header_maybe_unplug(connection, &pi))
-			goto err_out;
+		if (connection == lost_peer)
+			continue;
+		if (connection->cstate[NOW] != C_CONNECTED)
+			continue;
 
-		cmd = &drbd_cmd_handler[pi.cmd];
-		if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
-			drbd_err(connection, "Unexpected data packet %s (0x%04x)",
-				 cmdname(pi.cmd), pi.cmd);
-			goto err_out;
-		}
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+			u64 current_uuid = drbd_current_uuid(device);
+			u64 weak_nodes = drbd_weak_nodes_device(device);
 
-		shs = cmd->pkt_size;
-		if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
-			shs += sizeof(struct o_qlim);
-		if (pi.size > shs && !cmd->expect_payload) {
-			drbd_err(connection, "No payload expected %s l:%d\n",
-				 cmdname(pi.cmd), pi.size);
-			goto err_out;
-		}
-		if (pi.size < shs) {
-			drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
-				 cmdname(pi.cmd), (int)shs, pi.size);
-			goto err_out;
-		}
+			if (device->disk_state[NOW] < D_INCONSISTENT ||
+			    peer_device->disk_state[NOW] < D_INCONSISTENT)
+				continue; /* Ignore if one side is diskless */
 
-		if (shs) {
-			update_receiver_timing_details(connection, drbd_recv_all_warn);
-			err = drbd_recv_all_warn(connection, pi.data, shs);
-			if (err)
-				goto err_out;
-			pi.size -= shs;
+			drbd_send_current_uuid(peer_device, current_uuid, weak_nodes);
+			send_dagtag = true;
 		}
 
-		update_receiver_timing_details(connection, cmd->fn);
-		err = cmd->fn(connection, &pi);
-		if (err) {
-			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
-				 cmdname(pi.cmd), err, pi.size);
-			goto err_out;
-		}
+		if (send_dagtag)
+			drbd_send_peer_dagtag(connection, lost_peer);
 	}
-	return;
-
-    err_out:
-	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
 }
 
 static void conn_disconnect(struct drbd_connection *connection)
 {
+	struct drbd_resource *resource = connection->resource;
 	struct drbd_peer_device *peer_device;
-	enum drbd_conns oc;
-	int vnr;
+	enum drbd_conn_state oc;
+	unsigned long irq_flags;
+	int vnr, i;
 
-	if (connection->cstate == C_STANDALONE)
+	clear_bit(CONN_DRY_RUN, &connection->flags);
+	clear_bit(CONN_CONGESTED, &connection->flags);
+
+	if (connection->cstate[NOW] == C_STANDALONE)
 		return;
 
 	/* We are about to start the cleanup after connection loss.
-	 * Make sure drbd_make_request knows about that.
+	 * Make sure drbd_submit_bio knows about that.
 	 * Usually we should be in some network failure state already,
 	 * but just in case we are not, we fix it up here.
 	 */
-	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+	change_cstate_tag(connection, C_NETWORK_FAILURE, CS_HARD, "disconnected", NULL);
+
+	del_connect_timer(connection);
 
 	/* ack_receiver does not clean up anything. it must not interfere, either */
-	drbd_thread_stop(&connection->ack_receiver);
 	if (connection->ack_sender) {
 		destroy_workqueue(connection->ack_sender);
 		connection->ack_sender = NULL;
 	}
-	drbd_free_sock(connection);
+
+	/* restart sender thread,
+	 * potentially get it out of blocking network operations */
+	drbd_thread_stop(&connection->sender);
+	drbd_thread_start(&connection->sender);
+
+	mutex_lock(&resource->conf_update);
+	drbd_transport_shutdown(connection, CLOSE_CONNECTION);
+	mutex_unlock(&resource->conf_update);
+
+	cleanup_remote_state_change(connection);
+
+	drain_resync_activity(connection);
+
+	connection->after_reconciliation.lost_node_id = -1;
+
+	/* Wait for current activity to cease.  This includes waiting for
+	 * peer_request queued to the submitter workqueue. */
+	wait_event(connection->ee_wait,
+		atomic_read(&connection->active_ee_cnt) == 0);
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * etc. which may still be on the worker queue to be "canceled" */
+	drbd_flush_workqueue(&connection->sender_work);
+
+	drbd_finish_peer_reqs(connection);
+
+	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+	   might have issued a work again. The one before drbd_finish_peer_reqs() is
+	   necessary to reclaim net_ee in drbd_finish_peer_reqs(). */
+	drbd_flush_workqueue(&connection->sender_work);
 
 	rcu_read_lock();
 	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 		struct drbd_device *device = peer_device->device;
+
 		kref_get(&device->kref);
 		rcu_read_unlock();
-		drbd_disconnected(peer_device);
+
+		peer_device_disconnected(peer_device);
+		if (get_ldev(device)) {
+			drbd_reconsider_queue_parameters(device, device->ldev);
+			put_ldev(device);
+		} else {
+			drbd_reconsider_queue_parameters(device, NULL);
+		}
+
 		kref_put(&device->kref, drbd_destroy_device);
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
 
+	/* Apply these changes after peer_device_disconnected() because that
+	 * may cause the loss of other connections to be detected, which can
+	 * change the suspended state. */
+	tl_walk(connection, &connection->req_not_net_done,
+			resource->cached_susp ? CONNECTION_LOST_WHILE_SUSPENDED : CONNECTION_LOST);
+
+	i = drbd_free_peer_reqs(connection, &connection->done_ee);
+	if (i)
+		drbd_info(connection, "done_ee not empty, killed %u entries\n", i);
+	i = drbd_free_peer_reqs(connection, &connection->dagtag_wait_ee);
+	if (i)
+		drbd_info(connection, "dagtag_wait_ee not empty, killed %u entries\n", i);
+
+	cleanup_unacked_peer_requests(connection);
+	cleanup_peer_ack_list(connection);
+
+	i = atomic_read(&connection->pp_in_use);
+	if (i)
+		drbd_info(connection, "pp_in_use = %d, expected 0\n", i);
+	i = atomic_read(&connection->pp_in_use_by_net);
+	if (i)
+		drbd_info(connection, "pp_in_use_by_net = %d, expected 0\n", i);
+
 	if (!list_empty(&connection->current_epoch->list))
 		drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
 	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
 	atomic_set(&connection->current_epoch->epoch_size, 0);
 	connection->send.seen_any_write_yet = false;
+	connection->send.current_dagtag_sector =
+		resource->dagtag_sector - ((BIO_MAX_VECS << PAGE_SHIFT) >> SECTOR_SHIFT) - 1;
+	connection->current_epoch->oldest_unconfirmed_peer_req = NULL;
+
+	/* Indicate that last_dagtag_sector may no longer be up-to-date. We
+	 * need to keep last_dagtag_sector because we may still need it to
+	 * resolve a reconciliation resync. However, we need to avoid issuing a
+	 * resync request dependent on that dagtag because the resync source
+	 * may not be aware of the dagtag, even though it has newer data. This
+	 * can occur if the peer has been re-started since the request with the
+	 * dagtag.
+	 */
+	clear_bit(RECEIVED_DAGTAG, &connection->flags);
+
+	/* Release any threads waiting for a barrier to be acked. */
+	clear_bit(BARRIER_ACK_PENDING, &connection->flags);
+	wake_up(&resource->barrier_wait);
 
 	drbd_info(connection, "Connection closed\n");
 
-	if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+	if (resource->role[NOW] == R_PRIMARY &&
+	    connection->fencing_policy != FP_DONT_CARE &&
+	    conn_highest_pdsk(connection) >= D_UNKNOWN)
 		conn_try_outdate_peer_async(connection);
 
-	spin_lock_irq(&connection->resource->req_lock);
-	oc = connection->cstate;
-	if (oc >= C_UNCONNECTED)
-		_conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
-
-	spin_unlock_irq(&connection->resource->req_lock);
-
-	if (oc == C_DISCONNECTING)
-		conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
-}
-
-static int drbd_disconnected(struct drbd_peer_device *peer_device)
-{
-	struct drbd_device *device = peer_device->device;
-	unsigned int i;
-
-	/* wait for current activity to cease. */
-	spin_lock_irq(&device->resource->req_lock);
-	_drbd_wait_ee_list_empty(device, &device->active_ee);
-	_drbd_wait_ee_list_empty(device, &device->sync_ee);
-	_drbd_wait_ee_list_empty(device, &device->read_ee);
-	spin_unlock_irq(&device->resource->req_lock);
-
-	/* We do not have data structures that would allow us to
-	 * get the rs_pending_cnt down to 0 again.
-	 *  * On C_SYNC_TARGET we do not have any data structures describing
-	 *    the pending RSDataRequest's we have sent.
-	 *  * On C_SYNC_SOURCE there is no data structure that tracks
-	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
-	 *  And no, it is not the sum of the reference counts in the
-	 *  resync_LRU. The resync_LRU tracks the whole operation including
-	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
-	 *  on the fly. */
-	drbd_rs_cancel_all(device);
-	device->rs_total = 0;
-	device->rs_failed = 0;
-	atomic_set(&device->rs_pending_cnt, 0);
-	wake_up(&device->misc_wait);
-
-	timer_delete_sync(&device->resync_timer);
-	resync_timer_fn(&device->resync_timer);
-
-	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
-	 * w_make_resync_request etc. which may still be on the worker queue
-	 * to be "canceled" */
-	drbd_flush_workqueue(&peer_device->connection->sender_work);
-
-	drbd_finish_peer_reqs(device);
-
-	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
-	   might have issued a work again. The one before drbd_finish_peer_reqs() is
-	   necessary to reclain net_ee in drbd_finish_peer_reqs(). */
-	drbd_flush_workqueue(&peer_device->connection->sender_work);
-
-	/* need to do it again, drbd_finish_peer_reqs() may have populated it
-	 * again via drbd_try_clear_on_disk_bm(). */
-	drbd_rs_cancel_all(device);
-
-	kfree(device->p_uuid);
-	device->p_uuid = NULL;
-
-	if (!drbd_suspended(device))
-		tl_clear(peer_device->connection);
-
-	drbd_md_sync(device);
+	drbd_maybe_khelper(NULL, connection, "disconnected");
 
-	if (get_ldev(device)) {
-		drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
-				"write from disconnected", BM_LOCKED_CHANGE_ALLOWED, NULL);
-		put_ldev(device);
+	begin_state_change(resource, &irq_flags, CS_VERBOSE | CS_LOCAL_ONLY);
+	oc = connection->cstate[NOW];
+	if (oc >= C_UNCONNECTED) {
+		__change_cstate(connection, C_UNCONNECTED);
+		/* drbd_receiver() has to be restarted after it returns */
+		drbd_thread_restart_nowait(&connection->receiver);
 	}
+	end_state_change(resource, &irq_flags, "disconnected");
 
-	i = atomic_read(&device->pp_in_use_by_net);
-	if (i)
-		drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
-	i = atomic_read(&device->pp_in_use);
-	if (i)
-		drbd_info(device, "pp_in_use = %d, expected 0\n", i);
-
-	D_ASSERT(device, list_empty(&device->read_ee));
-	D_ASSERT(device, list_empty(&device->active_ee));
-	D_ASSERT(device, list_empty(&device->sync_ee));
-	D_ASSERT(device, list_empty(&device->done_ee));
+	if (test_bit(NOTIFY_PEERS_LOST_PRIMARY, &connection->flags)) {
+		drbd_notify_peers_lost_primary(connection);
+		clear_bit(NOTIFY_PEERS_LOST_PRIMARY, &connection->flags);
+	}
 
-	return 0;
+	if (oc == C_DISCONNECTING)
+		change_cstate_tag(connection, C_STANDALONE, CS_VERBOSE | CS_HARD | CS_LOCAL_ONLY,
+				"disconnected", NULL);
 }
 
 /*
- * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
- * we can agree on is stored in agreed_pro_version.
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX.
+ * But see also drbd_protocol_version_acceptable() and module parameter
+ * drbd_protocol_version_min.
+ * The protocol version we can agree on is stored in agreed_pro_version.
  *
  * feature flags and the reserved array should be enough room for future
  * enhancements of the handshake protocol, and possible plugins...
+ * See also PRO_FEATURES.
  *
- * for now, they are expected to be zero, but ignored.
  */
 static int drbd_send_features(struct drbd_connection *connection)
 {
-	struct drbd_socket *sock;
 	struct p_connection_features *p;
 
-	sock = &connection->data;
-	p = conn_prepare_command(connection, sock);
+	p = __conn_prepare_command(connection, sizeof(*p), DATA_STREAM);
 	if (!p)
 		return -EIO;
 	memset(p, 0, sizeof(*p));
-	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	p->protocol_min = cpu_to_be32(drbd_protocol_version_min);
 	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	p->sender_node_id = cpu_to_be32(connection->resource->res_opts.node_id);
+	p->receiver_node_id = cpu_to_be32(connection->peer_node_id);
 	p->feature_flags = cpu_to_be32(PRO_FEATURES);
-	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
+	return __send_command(connection, -1, P_CONNECTION_FEATURES, DATA_STREAM);
 }
 
 /*
@@ -5083,9 +10031,10 @@ static int drbd_send_features(struct drbd_connection *connection)
  *  -1 peer talks different language,
  *     no point in trying again, please go standalone.
  */
-static int drbd_do_features(struct drbd_connection *connection)
+int drbd_do_features(struct drbd_connection *connection)
 {
 	/* ASSERT current == connection->receiver ... */
+	struct drbd_resource *resource = connection->resource;
 	struct p_connection_features *p;
 	const int expect = sizeof(struct p_connection_features);
 	struct packet_info pi;
@@ -5096,12 +10045,15 @@ static int drbd_do_features(struct drbd_connection *connection)
 		return 0;
 
 	err = drbd_recv_header(connection, &pi);
-	if (err)
+	if (err) {
+		if (err == -EAGAIN)
+			drbd_err(connection, "timeout while waiting for feature packet\n");
 		return 0;
+	}
 
 	if (pi.cmd != P_CONNECTION_FEATURES) {
 		drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
-			 cmdname(pi.cmd), pi.cmd);
+			 drbd_packet_name(pi.cmd), pi.cmd);
 		return -1;
 	}
 
@@ -5111,8 +10063,7 @@ static int drbd_do_features(struct drbd_connection *connection)
 		return -1;
 	}
 
-	p = pi.data;
-	err = drbd_recv_all_warn(connection, p, expect);
+	err = drbd_recv_all_warn(connection, (void **)&p, expect);
 	if (err)
 		return 0;
 
@@ -5122,42 +10073,102 @@ static int drbd_do_features(struct drbd_connection *connection)
 		p->protocol_max = p->protocol_min;
 
 	if (PRO_VERSION_MAX < p->protocol_min ||
-	    PRO_VERSION_MIN > p->protocol_max)
-		goto incompat;
+	    drbd_protocol_version_min > p->protocol_max) {
+		drbd_err(connection, "incompatible DRBD dialects: "
+		    "I support %d-%d, peer supports %d-%d\n",
+		    drbd_protocol_version_min, PRO_VERSION_MAX,
+		    p->protocol_min, p->protocol_max);
+		return -1;
+	}
+	/* Older DRBD will always expect us to agree to their max,
+	 * if it falls within our [min, max] range.
+	 * But we have a gap in there that we do not support.
+	 */
+	if (p->protocol_max > PRO_VERSION_8_MAX &&
+	    p->protocol_max < PRO_VERSION_MIN) {
+		drbd_err(connection, "incompatible DRBD 9 dialects: I support %u-%u, peer supports %u-%u\n",
+		    PRO_VERSION_MIN, PRO_VERSION_MAX,
+		    p->protocol_min, p->protocol_max);
+		return -1;
+	}
 
 	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
 	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
 
-	drbd_info(connection, "Handshake successful: "
-	     "Agreed network protocol version %d\n", connection->agreed_pro_version);
+	if (connection->agreed_pro_version == 121 &&
+			(connection->agreed_features & DRBD_FF_RESYNC_DAGTAG)) {
+		/*
+		 * Releases drbd-9.2.0, drbd-9.2.1 and drbd-9.2.2 used an
+		 * implementation of discard merging which caused one
+		 * P_RS_WRITE_ACK to be sent for the whole merged interval.
+		 * These are precisely the releases with PRO_VERSION_MAX == 121
+		 * and feature DRBD_FF_RESYNC_DAGTAG.
+		 *
+		 * We do not support this case, so reject the connection.
+		 */
+		drbd_err(connection, "incompatible DRBD 9 dialects: protocol 121 with feature RESYNC_DAGTAG; upgrade via DRBD 9.2.16\n");
+		return -1;
+	}
+
+	if (connection->agreed_pro_version < 110) {
+		struct drbd_connection *connection2;
+		bool multiple = false;
+
+		rcu_read_lock();
+		for_each_connection_rcu(connection2, resource) {
+			if (connection == connection2)
+				continue;
+			multiple = true;
+		}
+		rcu_read_unlock();
+
+		if (multiple) {
+			drbd_err(connection, "Peer supports protocols %d-%d, but "
+				 "multiple connections are only supported in protocol "
+				 "110 and above\n", p->protocol_min, p->protocol_max);
+			return -1;
+		}
+	}
 
-	drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
+	if (connection->agreed_pro_version >= 110) {
+		if (be32_to_cpu(p->sender_node_id) != connection->peer_node_id) {
+			drbd_err(connection, "Peer presented a node_id of %d instead of %d\n",
+				 be32_to_cpu(p->sender_node_id), connection->peer_node_id);
+			return 0;
+		}
+		if (be32_to_cpu(p->receiver_node_id) != resource->res_opts.node_id) {
+			drbd_err(connection, "Peer expects me to have a node_id of %d instead of %d\n",
+				 be32_to_cpu(p->receiver_node_id), resource->res_opts.node_id);
+			return 0;
+		}
+	}
+
+	drbd_info(connection, "Handshake to peer %d successful: "
+			"Agreed network protocol version %d\n",
+			connection->peer_node_id,
+			connection->agreed_pro_version);
+
+	drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s%s\n",
 		  connection->agreed_features,
 		  connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
 		  connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
 		  connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
-		  connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
+		  connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : "",
+		  connection->agreed_features & DRBD_FF_RESYNC_DAGTAG ? " RESYNC_DAGTAG" :
 		  connection->agreed_features ? "" : " none");
 
 	return 1;
-
- incompat:
-	drbd_err(connection, "incompatible DRBD dialects: "
-	    "I support %d-%d, peer supports %d-%d\n",
-	    PRO_VERSION_MIN, PRO_VERSION_MAX,
-	    p->protocol_min, p->protocol_max);
-	return -1;
 }
 
 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
-static int drbd_do_auth(struct drbd_connection *connection)
+int drbd_do_auth(struct drbd_connection *connection)
 {
 	drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
 	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
 	return -1;
 }
 #else
-#define CHALLENGE_LEN 64
+#define CHALLENGE_LEN 64 /* must be multiple of 4 */
 
 /* Return value:
 	1 - auth succeeded,
@@ -5165,25 +10176,28 @@ static int drbd_do_auth(struct drbd_connection *connection)
 	-1 - auth failed, don't try again.
 */
 
-static int drbd_do_auth(struct drbd_connection *connection)
+struct auth_challenge {
+	char d[CHALLENGE_LEN];
+	u32 i;
+} __attribute__((packed));
+
+int drbd_do_auth(struct drbd_connection *connection)
 {
-	struct drbd_socket *sock;
-	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
-	char *response = NULL;
+	struct auth_challenge my_challenge, *peers_ch = NULL;
+	void *response;
 	char *right_response = NULL;
-	char *peers_ch = NULL;
 	unsigned int key_len;
 	char secret[SHARED_SECRET_MAX]; /* 64 byte */
 	unsigned int resp_size;
 	struct shash_desc *desc;
 	struct packet_info pi;
 	struct net_conf *nc;
-	int err, rv;
-
-	/* FIXME: Put the challenge/response into the preallocated socket buffer.  */
+	int err, rv, dig_size;
+	bool peer_is_drbd_9 = connection->agreed_pro_version >= 110;
+	void *packet_body;
 
 	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
+	nc = rcu_dereference(connection->transport.net_conf);
 	key_len = strlen(nc->shared_secret);
 	memcpy(secret, nc->shared_secret, key_len);
 	rcu_read_unlock();
@@ -5204,15 +10218,16 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}
 
-	get_random_bytes(my_challenge, CHALLENGE_LEN);
+	get_random_bytes(my_challenge.d, sizeof(my_challenge.d));
 
-	sock = &connection->data;
-	if (!conn_prepare_command(connection, sock)) {
+	packet_body = __conn_prepare_command(connection, sizeof(my_challenge.d), DATA_STREAM);
+	if (!packet_body) {
 		rv = 0;
 		goto fail;
 	}
-	rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
-				my_challenge, CHALLENGE_LEN);
+	memcpy(packet_body, my_challenge.d, sizeof(my_challenge.d));
+
+	rv = !__send_command(connection, -1, P_AUTH_CHALLENGE, DATA_STREAM);
 	if (!rv)
 		goto fail;
 
@@ -5224,61 +10239,56 @@ static int drbd_do_auth(struct drbd_connection *connection)
 
 	if (pi.cmd != P_AUTH_CHALLENGE) {
 		drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
-			 cmdname(pi.cmd), pi.cmd);
-		rv = -1;
-		goto fail;
-	}
-
-	if (pi.size > CHALLENGE_LEN * 2) {
-		drbd_err(connection, "expected AuthChallenge payload too big.\n");
+			 drbd_packet_name(pi.cmd), pi.cmd);
 		rv = -1;
 		goto fail;
 	}
 
-	if (pi.size < CHALLENGE_LEN) {
-		drbd_err(connection, "AuthChallenge payload too small.\n");
+	if (pi.size != sizeof(peers_ch->d)) {
+		drbd_err(connection, "unexpected AuthChallenge payload.\n");
 		rv = -1;
 		goto fail;
 	}
 
-	peers_ch = kmalloc(pi.size, GFP_NOIO);
+	peers_ch = kmalloc_obj(*peers_ch, GFP_NOIO);
 	if (!peers_ch) {
 		rv = -1;
 		goto fail;
 	}
 
-	err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+	err = drbd_recv_into(connection, peers_ch->d, sizeof(peers_ch->d));
 	if (err) {
 		rv = 0;
 		goto fail;
 	}
 
-	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+	if (!memcmp(my_challenge.d, peers_ch->d, sizeof(my_challenge.d))) {
 		drbd_err(connection, "Peer presented the same challenge!\n");
 		rv = -1;
 		goto fail;
 	}
 
 	resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
-	response = kmalloc(resp_size, GFP_NOIO);
+	response = __conn_prepare_command(connection, resp_size, DATA_STREAM);
 	if (!response) {
-		rv = -1;
+		rv = 0;
 		goto fail;
 	}
 
-	rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
+	dig_size = pi.size;
+	if (peer_is_drbd_9) {
+		peers_ch->i = cpu_to_be32(connection->resource->res_opts.node_id);
+		dig_size += sizeof(peers_ch->i);
+	}
+
+	rv = crypto_shash_digest(desc, peers_ch->d, dig_size, response);
 	if (rv) {
-		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		drbd_err(connection, "crypto_shash_digest() failed with %d\n", rv);
 		rv = -1;
 		goto fail;
 	}
 
-	if (!conn_prepare_command(connection, sock)) {
-		rv = 0;
-		goto fail;
-	}
-	rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
-				response, resp_size);
+	rv = !__send_command(connection, -1, P_AUTH_RESPONSE, DATA_STREAM);
 	if (!rv)
 		goto fail;
 
@@ -5290,18 +10300,19 @@ static int drbd_do_auth(struct drbd_connection *connection)
 
 	if (pi.cmd != P_AUTH_RESPONSE) {
 		drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
-			 cmdname(pi.cmd), pi.cmd);
+			 drbd_packet_name(pi.cmd), pi.cmd);
 		rv = 0;
 		goto fail;
 	}
 
 	if (pi.size != resp_size) {
-		drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+		drbd_err(connection, "expected AuthResponse payload of %u bytes, received %u\n",
+				resp_size, pi.size);
 		rv = 0;
 		goto fail;
 	}
 
-	err = drbd_recv_all_warn(connection, response , resp_size);
+	err = drbd_recv_all(connection, &response, resp_size);
 	if (err) {
 		rv = 0;
 		goto fail;
@@ -5313,10 +10324,15 @@ static int drbd_do_auth(struct drbd_connection *connection)
 		goto fail;
 	}
 
-	rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
-				 right_response);
+	dig_size = sizeof(my_challenge.d);
+	if (peer_is_drbd_9) {
+		my_challenge.i = cpu_to_be32(connection->peer_node_id);
+		dig_size += sizeof(my_challenge.i);
+	}
+
+	rv = crypto_shash_digest(desc, my_challenge.d, dig_size, right_response);
 	if (rv) {
-		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		drbd_err(connection, "crypto_shash_digest() failed with %d\n", rv);
 		rv = -1;
 		goto fail;
 	}
@@ -5331,7 +10347,6 @@ static int drbd_do_auth(struct drbd_connection *connection)
 
  fail:
 	kfree(peers_ch);
-	kfree(response);
 	kfree(right_response);
 	if (desc) {
 		shash_desc_zero(desc);
@@ -5345,94 +10360,260 @@ static int drbd_do_auth(struct drbd_connection *connection)
 int drbd_receiver(struct drbd_thread *thi)
 {
 	struct drbd_connection *connection = thi->connection;
-	int h;
 
-	drbd_info(connection, "receiver (re)started\n");
+	if (conn_connect(connection)) {
+		blk_start_plug(&connection->receiver_plug);
+		drbdd(connection);
+		blk_finish_plug(&connection->receiver_plug);
+	}
+
+	conn_disconnect(connection);
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static void drbd_check_flush_dagtag_reached(struct drbd_connection *peer_ack_connection)
+{
+	struct drbd_resource *resource = peer_ack_connection->resource;
+	struct drbd_connection *flush_requests_connection;
+	u64 peer_ack_node_mask = NODE_MASK(peer_ack_connection->peer_node_id);
+	u64 last_peer_ack_dagtag_seen = peer_ack_connection->last_peer_ack_dagtag_seen;
+	u64 im;
+
+	for_each_connection_ref(flush_requests_connection, im, resource) {
+		u64 flush_sequence;
+		u64 *sent_mask;
+		u64 flush_requests_dagtag;
+
+		spin_lock_irq(&flush_requests_connection->primary_flush_lock);
+		flush_requests_dagtag = flush_requests_connection->flush_requests_dagtag;
+		flush_sequence = flush_requests_connection->flush_sequence;
+		sent_mask = &flush_requests_connection->flush_forward_sent_mask;
+
+		if (!flush_sequence || /* Active flushes use non-zero sequence numbers */
+				*sent_mask & peer_ack_node_mask ||
+				last_peer_ack_dagtag_seen < flush_requests_dagtag) {
+			spin_unlock_irq(&flush_requests_connection->primary_flush_lock);
+			continue;
+		}
+
+		*sent_mask |= peer_ack_node_mask;
+		spin_unlock_irq(&flush_requests_connection->primary_flush_lock);
+
+		if (peer_ack_connection == flush_requests_connection)
+			drbd_send_flush_requests_ack(peer_ack_connection,
+					flush_sequence,
+					resource->res_opts.node_id);
+		else
+			drbd_send_flush_forward(peer_ack_connection,
+					flush_sequence,
+					flush_requests_connection->peer_node_id);
+	}
+}
+
+static int process_peer_ack_list(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_ack *peer_ack, *tmp;
+	u64 node_id_mask;
+	int err = 0;
+
+	node_id_mask = NODE_MASK(connection->peer_node_id);
+
+	spin_lock_irq(&resource->peer_ack_lock);
+	peer_ack = list_first_entry(&resource->peer_ack_list, struct drbd_peer_ack, list);
+	while (&peer_ack->list != &resource->peer_ack_list) {
+		u64 pending_mask = peer_ack->pending_mask;
+		u64 mask = peer_ack->mask;
+		u64 dagtag_sector = peer_ack->dagtag_sector;
+
+		tmp = list_next_entry(peer_ack, list);
+
+		if (!(peer_ack->queued_mask & node_id_mask)) {
+			peer_ack = tmp;
+			continue;
+		}
+
+		/*
+		 * After disconnecting, queue_peer_ack_send() sets
+		 * last_peer_ack_dagtag_seen directly. Do not jump back if we
+		 * process a peer ack with a lower dagtag here shortly after.
+		 */
+		connection->last_peer_ack_dagtag_seen =
+			max(connection->last_peer_ack_dagtag_seen, dagtag_sector);
+
+		peer_ack->queued_mask &= ~node_id_mask;
+		drbd_destroy_peer_ack_if_done(peer_ack);
+		peer_ack = tmp;
+
+		if (!(pending_mask & node_id_mask))
+			continue;
+		spin_unlock_irq(&resource->peer_ack_lock);
+
+		err = drbd_send_peer_ack(connection, mask, dagtag_sector);
+
+		spin_lock_irq(&resource->peer_ack_lock);
+		if (err)
+			break;
+	}
+	spin_unlock_irq(&resource->peer_ack_lock);
+
+	if (!err && connection->agreed_pro_version >= 123)
+		drbd_check_flush_dagtag_reached(connection);
+
+	return err;
+}
+
+static int got_peers_in_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_peer_block_desc *p = pi->data;
+	sector_t sector;
+	u64 in_sync_b;
+	int size;
 
-	do {
-		h = conn_connect(connection);
-		if (h == 0) {
-			conn_disconnect(connection);
-			schedule_timeout_interruptible(HZ);
-		}
-		if (h == -1) {
-			drbd_warn(connection, "Discarding network configuration.\n");
-			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
-		}
-	} while (h == 0);
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
 
-	if (h > 0) {
-		blk_start_plug(&connection->receiver_plug);
-		drbdd(connection);
-		blk_finish_plug(&connection->receiver_plug);
-	}
+	device = peer_device->device;
 
-	conn_disconnect(connection);
+	if (get_ldev(device)) {
+		unsigned long modified;
 
-	drbd_info(connection, "receiver terminated\n");
-	return 0;
-}
+		sector = be64_to_cpu(p->sector);
+		size = be32_to_cpu(p->size);
+		in_sync_b = node_ids_to_bitmap(device, be64_to_cpu(p->mask));
 
-/* ********* acknowledge sender ******** */
+		modified = drbd_set_sync(device, sector, size, 0, in_sync_b);
 
-static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
-{
-	struct p_req_state_reply *p = pi->data;
-	int retcode = be32_to_cpu(p->retcode);
+		/* If we are SyncSource then we rely on P_PEERS_IN_SYNC from
+		 * the peer to inform us of sync progress. Otherwise only send
+		 * peers-in-sync when we have actually cleared some bits.
+		 * This prevents an infinite loop with the peer. */
+		if (modified || peer_device->repl_state[NOW] == L_SYNC_SOURCE)
+			drbd_queue_update_peers(peer_device, sector, sector + (size >> SECTOR_SHIFT));
 
-	if (retcode >= SS_SUCCESS) {
-		set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
-	} else {
-		set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
-		drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
-			 drbd_set_st_err_str(retcode), retcode);
+		put_ldev(device);
 	}
-	wake_up(&connection->ping_wait);
 
 	return 0;
 }
 
 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
 	struct p_req_state_reply *p = pi->data;
 	int retcode = be32_to_cpu(p->retcode);
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
-
-	if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
-		D_ASSERT(device, connection->agreed_pro_version < 100);
-		return got_conn_RqSReply(connection, pi);
+	if (retcode >= SS_SUCCESS) {
+		set_bit(TWOPC_YES, &connection->flags);
+	else {
+		set_bit(TWOPC_NO, &connection->flags);
+		dynamic_drbd_dbg(connection, "Requested state change failed by peer: %s (%d)\n",
+			   drbd_set_st_err_str(retcode), retcode);
 	}
 
-	if (retcode >= SS_SUCCESS) {
-		set_bit(CL_ST_CHG_SUCCESS, &device->flags);
+	wake_up_all(&connection->resource->state_wait);
+
+	return 0;
+}
+
+static int got_twopc_reply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct p_twopc_reply *p = pi->data;
+
+	write_lock_irq(&resource->state_rwlock);
+	if (resource->twopc_reply.initiator_node_id == be32_to_cpu(p->initiator_node_id) &&
+	    resource->twopc_reply.tid == be32_to_cpu(p->tid)) {
+		dynamic_drbd_dbg(connection, "Got a %s reply for state change %u\n",
+			   drbd_packet_name(pi->cmd),
+			   resource->twopc_reply.tid);
+
+		if (pi->cmd == P_TWOPC_YES) {
+			struct drbd_peer_device *peer_device;
+			u64 reachable_nodes;
+			u64 max_size;
+
+			reachable_nodes = be64_to_cpu(p->reachable_nodes);
+
+			switch (resource->twopc.type) {
+			case TWOPC_STATE_CHANGE:
+				if (resource->res_opts.node_id ==
+				    resource->twopc_reply.initiator_node_id &&
+				    connection->peer_node_id ==
+				    resource->twopc_reply.target_node_id) {
+					resource->twopc_reply.target_reachable_nodes |=
+						reachable_nodes;
+				} else {
+					resource->twopc_reply.reachable_nodes |=
+						reachable_nodes;
+				}
+				resource->twopc_reply.primary_nodes |=
+					be64_to_cpu(p->primary_nodes);
+				resource->twopc_reply.weak_nodes |=
+					be64_to_cpu(p->weak_nodes);
+				break;
+			case TWOPC_RESIZE:
+				resource->twopc_reply.reachable_nodes |= reachable_nodes;
+				resource->twopc_reply.diskful_primary_nodes |=
+					be64_to_cpu(p->diskful_primary_nodes);
+				max_size = be64_to_cpu(p->max_possible_size);
+				resource->twopc_reply.max_possible_size =
+					min_t(sector_t, resource->twopc_reply.max_possible_size,
+					      max_size);
+				peer_device = conn_peer_device(connection, resource->twopc_reply.vnr);
+				if (peer_device)
+					peer_device->max_size = max_size;
+				break;
+			}
+		}
+
+		if (pi->cmd == P_TWOPC_YES)
+			set_bit(TWOPC_YES, &connection->flags);
+		else if (pi->cmd == P_TWOPC_NO)
+			set_bit(TWOPC_NO, &connection->flags);
+		else if (pi->cmd == P_TWOPC_RETRY)
+			set_bit(TWOPC_RETRY, &connection->flags);
+		drbd_maybe_cluster_wide_reply(resource);
 	} else {
-		set_bit(CL_ST_CHG_FAIL, &device->flags);
-		drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
-			drbd_set_st_err_str(retcode), retcode);
+		dynamic_drbd_dbg(connection, "Ignoring %s reply for state change %u\n",
+			   drbd_packet_name(pi->cmd),
+			   be32_to_cpu(p->tid));
 	}
-	wake_up(&device->state_wait);
+	write_unlock_irq(&resource->state_rwlock);
 
 	return 0;
 }
 
-static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+void twopc_connection_down(struct drbd_connection *connection)
 {
-	return drbd_send_ping_ack(connection);
+	struct drbd_resource *resource = connection->resource;
 
+	if (resource->twopc_reply.initiator_node_id != -1 &&
+	    test_bit(TWOPC_PREPARED, &connection->flags)) {
+		set_bit(TWOPC_RETRY, &connection->flags);
+		drbd_maybe_cluster_wide_reply(resource);
+	}
+}
+
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+{
+	queue_work(ping_ack_sender, &connection->send_ping_ack_work);
+	return 0;
 }
 
 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
 {
-	/* restore idle timeout */
-	connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
-	if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
-		wake_up(&connection->ping_wait);
+	clear_bit(PING_TIMEOUT_ACTIVE, &connection->flags);
+	set_rcvtimeo(connection, REGULAR_TIMEOUT);
+
+	if (test_bit(PING_PENDING, &connection->flags)) {
+		clear_bit(PING_PENDING, &connection->flags);
+		wake_up_all(&connection->resource->state_wait);
+	}
 
 	return 0;
 }
@@ -5441,6 +10622,7 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info *
 {
 	struct drbd_peer_device *peer_device;
 	struct drbd_device *device;
+	struct drbd_peer_request *peer_req;
 	struct p_block_ack *p = pi->data;
 	sector_t sector = be64_to_cpu(p->sector);
 	int blksize = be32_to_cpu(p->blksize);
@@ -5450,69 +10632,74 @@ static int got_IsInSync(struct drbd_connection *connection, struct packet_info *
 		return -EIO;
 	device = peer_device->device;
 
-	D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+	D_ASSERT(device, connection->agreed_pro_version >= 89);
 
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
+	/* Do not rely on the block_id from older peers. */
+	if (connection->agreed_pro_version < 122)
+		p->block_id = ID_SYNCER;
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_RESYNC_WRITE),
+			sector, blksize, p->block_id);
+	if (!peer_req)
+		return -EIO;
+
+	dec_rs_pending(peer_device);
+
+	set_bit(INTERVAL_RECEIVED, &peer_req->i.flags);
+
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&connection->peer_reqs_lock);
+
 	if (get_ldev(device)) {
-		drbd_rs_complete_io(device, sector);
 		drbd_set_in_sync(peer_device, sector, blksize);
 		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
-		device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+		peer_device->rs_same_csum += (blksize >> device->ldev->md.bm_block_shift);
 		put_ldev(device);
 	}
-	dec_rs_pending(peer_device);
-	atomic_add(blksize >> 9, &device->rs_sect_in);
+	rs_sectors_came_in(peer_device, blksize);
 
+	drbd_remove_peer_req_interval(peer_req);
+	drbd_resync_request_complete(peer_req);
 	return 0;
 }
 
 static int
 validate_req_change_req_state(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
-			      struct rb_root *root, const char *func,
+			      enum drbd_interval_type type, const char *func,
 			      enum drbd_req_event what, bool missing_ok)
 {
 	struct drbd_device *device = peer_device->device;
 	struct drbd_request *req;
-	struct bio_and_error m;
 
-	spin_lock_irq(&device->resource->req_lock);
-	req = find_request(device, root, id, sector, missing_ok, func);
-	if (unlikely(!req)) {
-		spin_unlock_irq(&device->resource->req_lock);
+	spin_lock_irq(&device->interval_lock);
+	req = find_request(device, type, id, sector, missing_ok, func);
+	spin_unlock_irq(&device->interval_lock);
+	if (unlikely(!req))
 		return -EIO;
-	}
-	__req_mod(req, what, peer_device, &m);
-	spin_unlock_irq(&device->resource->req_lock);
+	req_mod(req, what, peer_device);
 
-	if (m.bio)
-		complete_master_bio(device, &m);
 	return 0;
 }
 
 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
 {
 	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
 	struct p_block_ack *p = pi->data;
 	sector_t sector = be64_to_cpu(p->sector);
-	int blksize = be32_to_cpu(p->blksize);
 	enum drbd_req_event what;
 
 	peer_device = conn_peer_device(connection, pi->vnr);
 	if (!peer_device)
 		return -EIO;
-	device = peer_device->device;
 
 	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
 
-	if (p->block_id == ID_SYNCER) {
-		drbd_set_in_sync(peer_device, sector, blksize);
-		dec_rs_pending(peer_device);
-		return 0;
-	}
 	switch (pi->cmd) {
-	case P_RS_WRITE_ACK:
+	case P_RS_WRITE_ACK: /* agreed_pro_version < 122 */
+	case P_WRITE_ACK_IN_SYNC:
 		what = WRITE_ACKED_BY_PEER_AND_SIS;
 		break;
 	case P_WRITE_ACK:
@@ -5521,209 +10708,591 @@ static int got_BlockAck(struct drbd_connection *connection, struct packet_info *
 	case P_RECV_ACK:
 		what = RECV_ACKED_BY_PEER;
 		break;
-	case P_SUPERSEDED:
-		what = CONFLICT_RESOLVED;
-		break;
-	case P_RETRY_WRITE:
-		what = POSTPONE_WRITE;
-		break;
 	default:
 		BUG();
 	}
 
-	return validate_req_change_req_state(peer_device, p->block_id, sector,
-					     &device->write_requests, __func__,
-					     what, false);
+	return validate_req_change_req_state(peer_device, p->block_id, sector,
+					     INTERVAL_LOCAL_WRITE, __func__,
+					     what, false);
+}
+
+/* Process acks for resync writes. */
+static int got_RSWriteAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct p_block_ack *p = pi->data;
+	bool is_neg_ack = pi->cmd == P_NEG_ACK || pi->cmd == P_RS_NEG_ACK;
+	sector_t sector = be64_to_cpu(p->sector);
+	int size = be32_to_cpu(p->blksize);
+	struct drbd_peer_request *peer_req;
+
+	/* P_RS_WRITE_ACK used to be used instead of P_WRITE_ACK_IN_SYNC. */
+	if (connection->agreed_pro_version < 122 && p->block_id != ID_SYNCER)
+		return got_BlockAck(connection, pi);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (is_neg_ack && peer_device->disk_state[NOW] == D_UP_TO_DATE)
+		set_bit(GOT_NEG_ACK, &peer_device->flags);
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_RESYNC_READ),
+			sector, size, p->block_id);
+	if (!peer_req)
+		return -EIO;
+
+	if (is_neg_ack)
+		drbd_rs_failed_io(peer_device, sector, size);
+	else
+		drbd_set_in_sync(peer_device, sector, size);
+
+	atomic_sub(size >> 9, &connection->rs_in_flight);
+
+	dec_rs_pending(peer_device);
+
+	/*
+	 * Remove from the interval tree now so that
+	 * find_resync_request() cannot find this request again
+	 * if we get another ack for this interval.
+	 */
+	drbd_remove_peer_req_interval(peer_req);
+
+	drbd_resync_read_req_mod(peer_req, INTERVAL_RECEIVED);
+	return 0;
+}
+
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+	int size = be32_to_cpu(p->blksize);
+	int err;
+
+	/* P_NEG_ACK used to be used instead of P_RS_NEG_ACK. */
+	if (p->block_id == ID_SYNCER)
+		return got_RSWriteAck(connection, pi);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (peer_device->disk_state[NOW] == D_UP_TO_DATE)
+		set_bit(GOT_NEG_ACK, &peer_device->flags);
+
+	err = validate_req_change_req_state(peer_device, p->block_id, sector,
+			INTERVAL_LOCAL_WRITE, __func__, NEG_ACKED, true);
+	if (err) {
+		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+		   The master bio might already be completed, therefore the
+		   request is no longer in the collision hash. */
+		/* In Protocol B we might already have got a P_RECV_ACK
+		   but then get a P_NEG_ACK afterwards. */
+		drbd_set_out_of_sync(peer_device, sector, size);
+	}
+	return 0;
+}
+
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	drbd_warn_ratelimit(peer_device, "Got NegDReply; Sector %llus, len %u.\n",
+			(unsigned long long)sector, be32_to_cpu(p->blksize));
+
+	return validate_req_change_req_state(peer_device, p->block_id, sector,
+					     INTERVAL_LOCAL_READ, __func__,
+					     NEG_ACKED, false);
+}
+
+void drbd_unsuccessful_resync_request(struct drbd_peer_request *peer_req, bool failed)
+{
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+
+	if (get_ldev_if_state(device, D_DETACHING)) {
+		if (failed) {
+			drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
+		} else {
+			if (drbd_interval_is_verify(&peer_req->i)) {
+				drbd_verify_skipped_block(peer_device, peer_req->i.sector, peer_req->i.size);
+				verify_progress(peer_device, peer_req->i.sector, peer_req->i.size);
+			} else {
+				set_bit(RS_REQUEST_UNSUCCESSFUL, &peer_device->flags);
+			}
+		}
+
+		rs_sectors_came_in(peer_device, peer_req->i.size);
+		mod_timer(&peer_device->resync_timer, jiffies + RS_MAKE_REQS_INTV);
+		put_ldev(device);
+	}
+
+	drbd_remove_peer_req_interval(peer_req);
+	drbd_free_peer_req(peer_req);
+}
+
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_peer_request *peer_req;
+	sector_t sector;
+	int size;
+	u64 block_id;
+	struct p_block_ack *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	/* Prior to protocol version 122, block_id may be meaningless. */
+	block_id = peer_device->connection->agreed_pro_version >= 122 ? p->block_id : ID_SYNCER;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_RESYNC_WRITE) |
+			INTERVAL_TYPE_MASK(INTERVAL_OV_READ_SOURCE),
+			sector, size, block_id);
+	if (!peer_req)
+		return -EIO;
+
+	dec_rs_pending(peer_device);
+
+	if (pi->cmd == P_RS_CANCEL_AHEAD)
+		set_bit(SYNC_TARGET_TO_BEHIND, &peer_device->flags);
+
+	drbd_unsuccessful_resync_request(peer_req, pi->cmd == P_NEG_RS_DREPLY);
+	return 0;
+}
+
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_barrier_ack *p = pi->data;
+
+	return tl_release(connection, 0, 0, p->barrier, be32_to_cpu(p->set_size));
+}
+
+static int got_confirm_stable(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_confirm_stable *p = pi->data;
+
+	return tl_release(connection, p->oldest_block_id, p->youngest_block_id, 0,
+			  be32_to_cpu(p->set_size));
+}
+
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct drbd_peer_request *peer_req;
+	sector_t sector;
+	int size;
+	u64 block_id;
+	u32 seq_num;
+	enum ov_result result;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	if (pi->cmd == P_OV_RESULT) {
+		struct p_block_ack *p = pi->data;
+
+		sector = be64_to_cpu(p->sector);
+		size = be32_to_cpu(p->blksize);
+		block_id = ID_SYNCER;
+		seq_num = be32_to_cpu(p->seq_num);
+		result = drbd_block_id_to_ov_result(be64_to_cpu(p->block_id));
+	} else { /* P_OV_RESULT_ID */
+		struct p_ov_result *p = pi->data;
+
+		sector = be64_to_cpu(p->sector);
+		size = be32_to_cpu(p->blksize);
+		block_id = p->block_id;
+		seq_num = be32_to_cpu(p->seq_num);
+		result = be32_to_cpu(p->result);
+	}
+
+	update_peer_seq(peer_device, seq_num);
+
+	peer_req = find_resync_request(peer_device, INTERVAL_TYPE_MASK(INTERVAL_OV_READ_TARGET),
+			sector, size, block_id);
+	if (!peer_req)
+		return -EIO;
+
+	drbd_remove_peer_req_interval(peer_req);
+
+	/* This may be a request that we could not cancel because the peer does
+	 * not understand P_RS_CANCEL. Treat it as a skipped block. */
+	if (connection->agreed_pro_version < 110 && test_bit(INTERVAL_CONFLICT, &peer_req->i.flags))
+		result = OV_RESULT_SKIP;
+
+	drbd_free_peer_req(peer_req);
+	peer_req = NULL;
+
+	if (result == OV_RESULT_SKIP)
+		drbd_verify_skipped_block(peer_device, sector, size);
+	if (result == OV_RESULT_OUT_OF_SYNC)
+		drbd_ov_out_of_sync_found(peer_device, sector, size);
+	else
+		ov_out_of_sync_print(peer_device);
+
+	if (!get_ldev(device))
+		return 0;
+
+	dec_rs_pending(peer_device);
+
+	verify_progress(peer_device, sector, size);
+
+	put_ldev(device);
+	return 0;
+}
+
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	return 0;
+}
+
+static u64 node_id_to_mask(struct drbd_peer_md *peer_md, int node_id)
+{
+	int bitmap_bit = peer_md[node_id].bitmap_index;
+	return (bitmap_bit >= 0) ? NODE_MASK(bitmap_bit) : 0;
+}
+
+static u64 node_ids_to_bitmap(struct drbd_device *device, u64 node_ids)
+{
+	struct drbd_peer_md *peer_md = device->ldev->md.peers;
+	u64 bitmap_bits = 0;
+	int node_id;
+
+	for_each_set_bit(node_id, (unsigned long *)&node_ids, DRBD_NODE_ID_MAX)
+		bitmap_bits |= node_id_to_mask(peer_md, node_id);
+	return bitmap_bits;
+}
+
+static struct drbd_peer_request *drbd_send_oos_next_req(struct drbd_connection *peer_ack_connection,
+		int oos_node_id, struct drbd_peer_request *peer_req)
+{
+	lockdep_assert_held(&peer_ack_connection->send_oos_lock);
+
+	if (peer_req == NULL)
+		peer_req = list_entry(&peer_ack_connection->send_oos,
+				struct drbd_peer_request, recv_order);
+
+	list_for_each_entry_continue(peer_req, &peer_ack_connection->send_oos, recv_order) {
+		if (NODE_MASK(oos_node_id) & peer_req->send_oos_pending)
+			return peer_req;
+	}
+
+	return NULL;
 }
 
-static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+static void drbd_send_oos_from(struct drbd_connection *oos_connection, int peer_ack_node_id)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_block_ack *p = pi->data;
-	sector_t sector = be64_to_cpu(p->sector);
-	int size = be32_to_cpu(p->blksize);
-	int err;
+	int oos_node_id = oos_connection->peer_node_id;
+	struct drbd_resource *resource = oos_connection->resource;
+	struct drbd_connection *peer_ack_connection;
+	struct drbd_peer_request *peer_req;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	rcu_read_lock();
+	peer_ack_connection = drbd_connection_by_node_id(resource, peer_ack_node_id);
+	/* Valid to use peer_ack_connection after unlock because we have kref */
+	rcu_read_unlock();
 
-	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+	spin_lock_irq(&peer_ack_connection->send_oos_lock);
+	peer_req = drbd_send_oos_next_req(peer_ack_connection, oos_node_id, NULL);
+	spin_unlock_irq(&peer_ack_connection->send_oos_lock);
 
-	if (p->block_id == ID_SYNCER) {
-		dec_rs_pending(peer_device);
-		drbd_rs_failed_io(peer_device, sector, size);
-		return 0;
+	while (peer_req) {
+		struct drbd_peer_device *peer_device =
+			conn_peer_device(oos_connection, peer_req->peer_device->device->vnr);
+		struct drbd_peer_request *free_peer_req = NULL;
+
+		/* Ignore errors and keep iterating to clear up list */
+		drbd_send_out_of_sync(peer_device, peer_req->i.sector, peer_req->i.size);
+
+		spin_lock_irq(&peer_ack_connection->send_oos_lock);
+		peer_req->send_oos_pending &= ~NODE_MASK(oos_node_id);
+		if (!peer_req->send_oos_pending)
+			free_peer_req = peer_req;
+
+		peer_req = drbd_send_oos_next_req(peer_ack_connection, oos_node_id, peer_req);
+		spin_unlock_irq(&peer_ack_connection->send_oos_lock);
+
+		if (free_peer_req)
+			drbd_free_peer_req(free_peer_req);
 	}
 
-	err = validate_req_change_req_state(peer_device, p->block_id, sector,
-					    &device->write_requests, __func__,
-					    NEG_ACKED, true);
-	if (err) {
-		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
-		   The master bio might already be completed, therefore the
-		   request is no longer in the collision hash. */
-		/* In Protocol B we might already have got a P_RECV_ACK
-		   but then get a P_NEG_ACK afterwards. */
-		drbd_set_out_of_sync(peer_device, sector, size);
+	kref_put(&peer_ack_connection->kref, drbd_destroy_connection);
+}
+
+int drbd_send_out_of_sync_wf(struct drbd_work *w, int cancel)
+{
+	struct drbd_connection *oos_connection = container_of(w, struct drbd_connection,
+			send_oos_work);
+	unsigned long send_oos_from_mask = READ_ONCE(oos_connection->send_oos_from_mask);
+	int peer_ack_node_id;
+
+	for_each_set_bit(peer_ack_node_id, &send_oos_from_mask, BITS_PER_LONG) {
+		clear_bit(peer_ack_node_id, &oos_connection->send_oos_from_mask);
+		drbd_send_oos_from(oos_connection, peer_ack_node_id);
 	}
+
 	return 0;
 }
 
-static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+static bool is_sync_source(struct drbd_peer_device *peer_device)
+{
+	return is_sync_source_state(peer_device, NOW) ||
+		peer_device->repl_state[NOW] == L_WF_BITMAP_S;
+}
+
+static u64 drbd_calculate_send_oos_pending(struct drbd_device *device, u64 in_sync)
 {
 	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_block_ack *p = pi->data;
-	sector_t sector = be64_to_cpu(p->sector);
+	u64 send_oos_pending = 0;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	rcu_read_lock();
+	for_each_peer_device_rcu(peer_device, device) {
+		if (!(NODE_MASK(peer_device->node_id) & in_sync) &&
+				is_sync_source(peer_device))
+			send_oos_pending |= NODE_MASK(peer_device->node_id);
+	}
+	rcu_read_unlock();
 
-	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+	return send_oos_pending;
+}
 
-	drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
-	    (unsigned long long)sector, be32_to_cpu(p->blksize));
+static void drbd_queue_send_out_of_sync(struct drbd_connection *peer_ack_connection,
+		struct list_head *send_oos_peer_req_list, u64 any_send_oos_pending)
+{
+	struct drbd_resource *resource = peer_ack_connection->resource;
+	int peer_ack_node_id = peer_ack_connection->peer_node_id;
+	struct drbd_connection *oos_connection;
 
-	return validate_req_change_req_state(peer_device, p->block_id, sector,
-					     &device->read_requests, __func__,
-					     NEG_ACKED, false);
+	if (!any_send_oos_pending)
+		return;
+
+	spin_lock_irq(&peer_ack_connection->send_oos_lock);
+	list_splice_tail(send_oos_peer_req_list, &peer_ack_connection->send_oos);
+	spin_unlock_irq(&peer_ack_connection->send_oos_lock);
+
+	/* Take state_rwlock to ensure work is queued on sender that is still running */
+	read_lock_irq(&resource->state_rwlock);
+	for_each_connection(oos_connection, resource) {
+		if (!(NODE_MASK(oos_connection->peer_node_id) & any_send_oos_pending) ||
+				oos_connection->cstate[NOW] < C_CONNECTED)
+			continue;
+
+		if (test_and_set_bit(peer_ack_node_id, &oos_connection->send_oos_from_mask))
+			continue; /* Only get kref if we set the bit here */
+
+		kref_get(&peer_ack_connection->kref);
+		drbd_queue_work_if_unqueued(&oos_connection->sender_work,
+				&oos_connection->send_oos_work);
+	}
+	read_unlock_irq(&resource->state_rwlock);
 }
 
-static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+static int got_peer_ack(struct drbd_connection *connection, struct packet_info *pi)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	sector_t sector;
-	int size;
-	struct p_block_ack *p = pi->data;
+	struct p_peer_ack *p = pi->data;
+	u64 dagtag, in_sync;
+	struct drbd_peer_request *peer_req, *tmp;
+	struct list_head work_list;
+	u64 any_send_oos_pending = 0;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	dagtag = be64_to_cpu(p->dagtag);
+	in_sync = be64_to_cpu(p->mask);
 
-	sector = be64_to_cpu(p->sector);
-	size = be32_to_cpu(p->blksize);
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_for_each_entry(peer_req, &connection->peer_requests, recv_order) {
+		if (dagtag == peer_req->dagtag_sector)
+			goto found;
+	}
+	spin_unlock_irq(&connection->peer_reqs_lock);
 
-	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+	drbd_err(connection, "peer request with dagtag %llu not found\n", dagtag);
+	return -EIO;
 
-	dec_rs_pending(peer_device);
+found:
+	list_cut_position(&work_list, &connection->peer_requests, &peer_req->recv_order);
+	spin_unlock_irq(&connection->peer_reqs_lock);
 
-	if (get_ldev_if_state(device, D_FAILED)) {
-		drbd_rs_complete_io(device, sector);
-		switch (pi->cmd) {
-		case P_NEG_RS_DREPLY:
-			drbd_rs_failed_io(peer_device, sector, size);
-			break;
-		case P_RS_CANCEL:
-			break;
-		default:
-			BUG();
+	list_for_each_entry_safe(peer_req, tmp, &work_list, recv_order) {
+		struct drbd_peer_device *peer_device = peer_req->peer_device;
+		struct drbd_device *device = peer_device->device;
+		u64 in_sync_b, mask;
+
+		D_ASSERT(peer_device, peer_req->flags & EE_IN_ACTLOG);
+
+		if (get_ldev(device)) {
+			if ((peer_req->flags & EE_WAS_ERROR) == 0)
+				in_sync_b = node_ids_to_bitmap(device, in_sync);
+			else
+				in_sync_b = 0;
+			mask = ~node_id_to_mask(device->ldev->md.peers,
+						connection->peer_node_id);
+
+			drbd_set_sync(device, peer_req->i.sector,
+				      peer_req->i.size, ~in_sync_b, mask);
+			drbd_al_complete_io(device, &peer_req->i);
+			put_ldev(device);
 		}
-		put_ldev(device);
+
+		peer_req->send_oos_pending = drbd_calculate_send_oos_pending(device, in_sync);
+		any_send_oos_pending |= peer_req->send_oos_pending;
+		if (!peer_req->send_oos_pending)
+			drbd_free_peer_req(peer_req);
 	}
 
+	drbd_queue_send_out_of_sync(connection, &work_list, any_send_oos_pending);
 	return 0;
 }
 
-static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+void apply_unacked_peer_requests(struct drbd_connection *connection)
 {
-	struct p_barrier_ack *p = pi->data;
-	struct drbd_peer_device *peer_device;
-	int vnr;
-
-	tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
+	struct drbd_peer_request *peer_req;
+	unsigned long flags;
 
-	rcu_read_lock();
-	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+	spin_lock_irqsave(&connection->peer_reqs_lock, flags);
+	list_for_each_entry(peer_req, &connection->peer_requests, recv_order) {
+		struct drbd_peer_device *peer_device = peer_req->peer_device;
 		struct drbd_device *device = peer_device->device;
+		int bitmap_index = peer_device->bitmap_index;
+		u64 mask = ~(bitmap_index != -1 ? NODE_MASK(bitmap_index) : 0);
 
-		if (device->state.conn == C_AHEAD &&
-		    atomic_read(&device->ap_in_flight) == 0 &&
-		    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
-			device->start_resync_timer.expires = jiffies + HZ;
-			add_timer(&device->start_resync_timer);
-		}
+		drbd_set_sync(device, peer_req->i.sector, peer_req->i.size,
+			      mask, mask);
 	}
-	rcu_read_unlock();
-
-	return 0;
+	spin_unlock_irqrestore(&connection->peer_reqs_lock, flags);
 }
 
-static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+/*
+ * During connection teardown, take over all still-unacked peer requests:
+ * mark them out-of-sync in every bitmap slot except this peer's own,
+ * release their activity-log references, and either free them or hand them
+ * to drbd_queue_send_out_of_sync() if out-of-sync information still has to
+ * be sent to other peers.
+ */
+static void cleanup_unacked_peer_requests(struct drbd_connection *connection)
 {
-	struct drbd_peer_device *peer_device;
-	struct drbd_device *device;
-	struct p_block_ack *p = pi->data;
-	struct drbd_device_work *dw;
-	sector_t sector;
-	int size;
+	struct drbd_peer_request *peer_req, *tmp;
+	LIST_HEAD(work_list);
+	u64 any_send_oos_pending = 0;
 
-	peer_device = conn_peer_device(connection, pi->vnr);
-	if (!peer_device)
-		return -EIO;
-	device = peer_device->device;
+	spin_lock_irq(&connection->peer_reqs_lock);
+	list_splice_init(&connection->peer_requests, &work_list);
+	spin_unlock_irq(&connection->peer_reqs_lock);
 
-	sector = be64_to_cpu(p->sector);
-	size = be32_to_cpu(p->blksize);
+	list_for_each_entry_safe(peer_req, tmp, &work_list, recv_order) {
+		struct drbd_peer_device *peer_device = peer_req->peer_device;
+		struct drbd_device *device = peer_device->device;
+		int bitmap_index = peer_device->bitmap_index;
+		/* 1ULL/0ULL: keep the shift and the complement 64 bit wide;
+		 * with 1UL this would truncate on 32-bit architectures. */
+		u64 mask = ~(bitmap_index != -1 ? 1ULL << bitmap_index : 0ULL);
 
-	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+		if (get_ldev(device)) {
+			drbd_set_sync(device, peer_req->i.sector, peer_req->i.size,
+				      mask, mask);
+			drbd_al_complete_io(device, &peer_req->i);
+			put_ldev(device);
+		}
 
-	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
-		drbd_ov_out_of_sync_found(peer_device, sector, size);
-	else
-		ov_out_of_sync_print(peer_device);
+		peer_req->send_oos_pending = drbd_calculate_send_oos_pending(device, 0);
+		any_send_oos_pending |= peer_req->send_oos_pending;
+		if (!peer_req->send_oos_pending)
+			drbd_free_peer_req(peer_req);
+	}
 
-	if (!get_ldev(device))
-		return 0;
+	drbd_queue_send_out_of_sync(connection, &work_list, any_send_oos_pending);
+}
 
-	drbd_rs_complete_io(device, sector);
-	dec_rs_pending(peer_device);
+/*
+ * Remove this connection's share from the resource-wide peer-ack
+ * bookkeeping: clear our node's bit in every queued peer_ack (destroying
+ * those that are done, see drbd_destroy_peer_ack_if_done()), and clear
+ * RQ_NET_SENT towards this node on the not-yet-queued peer_ack_req.
+ * Runs under peer_ack_lock.
+ */
+static void cleanup_peer_ack_list(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_peer_ack *peer_ack, *tmp;
+	struct drbd_request *req;
+	int idx = connection->peer_node_id;
+	u64 node_id_mask = NODE_MASK(idx);
+
+	spin_lock_irq(&resource->peer_ack_lock);
+	list_for_each_entry_safe(peer_ack, tmp, &resource->peer_ack_list, list) {
+		if (!(peer_ack->queued_mask & node_id_mask))
+			continue;
+		peer_ack->queued_mask &= ~node_id_mask;
+		drbd_destroy_peer_ack_if_done(peer_ack);
+	}
+	req = resource->peer_ack_req;
+	if (req)
+		req->net_rq_state[idx] &= ~RQ_NET_SENT;
+	spin_unlock_irq(&resource->peer_ack_lock);
+}
+
+/*
+ * Sender work callback: for each possible primary node, atomically fetch
+ * and reset the recorded flush sequence number and, if one was pending,
+ * acknowledge the forwarded flush requests (see got_flush_forward()).
+ */
+int drbd_flush_ack_wf(struct drbd_work *w, int unused)
+{
+	struct drbd_connection *connection =
+		container_of(w, struct drbd_connection, flush_ack_work);
+	int primary_node_id;
 
-	--device->ov_left;
+	for (primary_node_id = 0; primary_node_id < DRBD_PEERS_MAX; primary_node_id++) {
+		u64 flush_sequence;
 
-	/* let's advance progress step marks only for every other megabyte */
-	if ((device->ov_left & 0x200) == 0x200)
-		drbd_advance_rs_marks(peer_device, device->ov_left);
+		spin_lock_irq(&connection->flush_ack_lock);
+		flush_sequence = connection->flush_ack_sequence[primary_node_id];
+		connection->flush_ack_sequence[primary_node_id] = 0;
+		spin_unlock_irq(&connection->flush_ack_lock);
 
-	if (device->ov_left == 0) {
-		dw = kmalloc_obj(*dw, GFP_NOIO);
-		if (dw) {
-			dw->w.cb = w_ov_finished;
-			dw->device = device;
-			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
-		} else {
-			drbd_err(device, "kmalloc(dw) failed.");
-			ov_out_of_sync_print(peer_device);
-			drbd_resync_finished(peer_device);
-		}
+		if (flush_sequence) /* Active flushes use non-zero sequence numbers */
+			drbd_send_flush_requests_ack(connection, flush_sequence, primary_node_id);
 	}
-	put_ldev(device);
+
 	return 0;
 }
 
-static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+/*
+ * A peer forwards a flush request on behalf of the node that initiated it:
+ * record the sequence number on the initiator's connection and queue its
+ * flush_ack_work (handled by drbd_flush_ack_wf()) so the ack goes back via
+ * the initiator's sender.  An unknown initiator node id is silently ignored.
+ */
+static int got_flush_forward(struct drbd_connection *connection, struct packet_info *pi)
 {
+	struct drbd_resource *resource = connection->resource;
+	struct drbd_connection *initiator_connection;
+	struct p_flush_forward *p = pi->data;
+	u64 flush_sequence = be64_to_cpu(p->flush_sequence);
+	int initiator_node_id = be32_to_cpu(p->initiator_node_id);
+
+	rcu_read_lock();
+	initiator_connection = drbd_connection_by_node_id(resource, initiator_node_id);
+	if (!initiator_connection) {
+		rcu_read_unlock();
+		return 0;
+	}
+
+	spin_lock_irq(&initiator_connection->flush_ack_lock);
+	initiator_connection->flush_ack_sequence[connection->peer_node_id] = flush_sequence;
+	drbd_queue_work_if_unqueued(&initiator_connection->sender_work,
+			&initiator_connection->flush_ack_work);
+	spin_unlock_irq(&initiator_connection->flush_ack_lock);
+	rcu_read_unlock();
 	return 0;
 }
 
-struct meta_sock_cmd {
-	size_t pkt_size;
-	int (*fn)(struct drbd_connection *connection, struct packet_info *);
-};
-
-static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
+/*
+ * Set the receive timeout on the control stream: the short ping timeout
+ * while a PingAck is outstanding, the regular ping interval otherwise.
+ * NOTE(review): the enum tag "rcv_timeou_kind" looks like a typo of
+ * "rcv_timeout_kind"; renaming it would also require changing the header
+ * that defines it.
+ */
+static void set_rcvtimeo(struct drbd_connection *connection, enum rcv_timeou_kind kind)
 {
-	long t;
+	struct drbd_transport *transport = &connection->transport;
+	struct drbd_transport_ops *tr_ops = &transport->class->ops;
+	bool ping_timeout = kind == PING_TIMEOUT;
 	struct net_conf *nc;
+	long t;
 
 	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
+	nc = rcu_dereference(transport->net_conf);
 	t = ping_timeout ? nc->ping_timeo : nc->ping_int;
 	rcu_read_unlock();
 
@@ -5731,202 +11300,263 @@ static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
 	if (ping_timeout)
 		t /= 10;
 
-	connection->meta.socket->sk->sk_rcvtimeo = t;
+	tr_ops->set_rcvtimeo(transport, CONTROL_STREAM, t);
 }
 
-static void set_ping_timeout(struct drbd_connection *connection)
+/*
+ * Work callback: arm the short ping timeout on the control stream, flag the
+ * PingAck as outstanding, and send the ping.  A send failure is treated as
+ * a network failure.
+ */
+void drbd_send_ping_wf(struct work_struct *ws)
 {
-	set_rcvtimeo(connection, 1);
-}
+	struct drbd_connection *connection =
+		container_of(ws, struct drbd_connection, send_ping_work);
+	int err;
 
-static void set_idle_timeout(struct drbd_connection *connection)
-{
-	set_rcvtimeo(connection, 0);
+	set_rcvtimeo(connection, PING_TIMEOUT);
+	set_bit(PING_TIMEOUT_ACTIVE, &connection->flags);
+	err = drbd_send_ping(connection);
+	if (err)
+		change_cstate(connection, C_NETWORK_FAILURE, CS_HARD);
 }
 
+/*
+ * Dispatch table for control-stream packets, indexed by command number.
+ * pkt_size is the fixed payload size expected for the command.  The table
+ * is sparse: commands not listed here are zero-initialized (fn == NULL).
+ */
+struct meta_sock_cmd {
+	size_t pkt_size;
+	int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
 
 static struct meta_sock_cmd ack_receiver_tbl[] = {
-	[P_PING]	    = { 0, got_Ping },
-	[P_PING_ACK]	    = { 0, got_PingAck },
-	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
-	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
-	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
-	[P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
-	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
-	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
-	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
-	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
-	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
-	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
-	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
-	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
-	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
-	[P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
-	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_PING]	      = { 0, got_Ping },
+	[P_PING_ACK]	      = { 0, got_PingAck },
+	[P_RECV_ACK]	      = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	      = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK_IN_SYNC] = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	      = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	      = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]     = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_RS_WRITE_ACK]      = { sizeof(struct p_block_ack), got_RSWriteAck },
+	[P_RS_NEG_ACK]	      = { sizeof(struct p_block_ack), got_RSWriteAck },
+	[P_OV_RESULT]	      = { sizeof(struct p_block_ack), got_OVResult },
+	[P_OV_RESULT_ID]      = { sizeof(struct p_ov_result), got_OVResult },
+	[P_BARRIER_ACK]	      = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_CONFIRM_STABLE]    = { sizeof(struct p_confirm_stable), got_confirm_stable },
+	[P_STATE_CHG_REPLY]   = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]     = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_DELAY_PROBE]	      = { sizeof(struct p_delay_probe93), got_skip },
+	[P_RS_CANCEL]	      = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_RS_CANCEL_AHEAD]   = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_CONN_ST_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_PEER_ACK]	      = { sizeof(struct p_peer_ack), got_peer_ack },
+	[P_PEERS_IN_SYNC]     = { sizeof(struct p_peer_block_desc), got_peers_in_sync },
+	[P_TWOPC_YES]	      = { sizeof(struct p_twopc_reply), got_twopc_reply },
+	[P_TWOPC_NO]	      = { sizeof(struct p_twopc_reply), got_twopc_reply },
+	[P_TWOPC_RETRY]	      = { sizeof(struct p_twopc_reply), got_twopc_reply },
+	[P_FLUSH_FORWARD]     = { sizeof(struct p_flush_forward), got_flush_forward },
 };
 
-int drbd_ack_receiver(struct drbd_thread *thi)
+/*
+ * Copy bytes from @pool into @to_fill until @to_fill holds @need bytes in
+ * total, or @pool runs dry — whichever comes first.  Advances the pool's
+ * buffer/avail by the amount consumed; the caller must re-check
+ * to_fill->avail afterwards.
+ */
+static void fillup_buffer_from(struct drbd_mutable_buffer *to_fill, unsigned int need, struct drbd_const_buffer *pool)
 {
-	struct drbd_connection *connection = thi->connection;
-	struct meta_sock_cmd *cmd = NULL;
+	if (to_fill->avail < need) {
+		unsigned int missing = min(need - to_fill->avail, pool->avail);
+
+		memcpy(to_fill->buffer + to_fill->avail, pool->buffer, missing);
+		pool->buffer += missing;
+		pool->avail -= missing;
+		to_fill->avail += missing;
+	}
+}
+
+/*
+ * Validate the control-packet header at @pos and fill in @pi.
+ * Returns the command's expected payload size on success, or a negative
+ * error when the magic is wrong, the command is unknown, or the advertised
+ * payload size does not match the command's fixed size.
+ */
+static int decode_meta_cmd(struct drbd_connection *connection, const u8 *pos, struct packet_info *pi)
+{
+	int header_version, payload_size;
+	struct meta_sock_cmd *cmd;
+
+	/*
+	 * A ping packet (via the control stream) can overtake the
+	 * feature packet. We might get it with a different header version
+	 * than expected since we will agree on the protocol version
+	 * by receiving the feature packet.
+	 */
+	header_version = __decode_header(pos, pi);
+	if (header_version < 0) {
+		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d [control]\n",
+			 be32_to_cpu(*(__be32 *)pos), header_version);
+		return -EINVAL;
+	}
+
+	/* ack_receiver_tbl is sparse: also reject in-range commands without
+	 * a handler instead of letting a NULL fn pointer through. */
+	if (pi->cmd >= ARRAY_SIZE(ack_receiver_tbl) || !ack_receiver_tbl[pi->cmd].fn) {
+		drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+			 drbd_packet_name(pi->cmd), pi->cmd);
+		return -ENOENT;
+	}
+
+	cmd = &ack_receiver_tbl[pi->cmd];
+	payload_size = cmd->pkt_size;
+	if (pi->size != payload_size) {
+		drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+			 pi->cmd, pi->size);
+		return -EINVAL;
+	}
+
+	return payload_size;
+}
+
+/*
+ * Finish decoding a control packet whose leading bytes arrived in a prior
+ * drbd_control_data_ready() invocation and were saved in
+ * connection->reassemble_buffer.  Refills the reassembly buffer from @pool
+ * in three steps (magic word, complete header, complete packet) and
+ * dispatches the handler once the packet is complete.  Returns 0 when more
+ * data is needed (pool drained) or the packet was handled successfully,
+ * negative error on decode or handler failure.
+ */
+static int process_previous_part(struct drbd_connection *connection, struct drbd_const_buffer *pool)
+{
+	struct drbd_mutable_buffer *buffer = &connection->reassemble_buffer;
+	int payload_size, packet_size;
+	unsigned int header_size;
 	struct packet_info pi;
-	unsigned long pre_recv_jif;
-	int rv;
-	void *buf    = connection->meta.rbuf;
-	int received = 0;
-	unsigned int header_size = drbd_header_size(connection);
-	int expect   = header_size;
-	bool ping_timeout_active = false;
+	int err;
+
+	fillup_buffer_from(buffer, sizeof(u32), pool);
+	if (buffer->avail < sizeof(u32))
+		return 0;
 
-	sched_set_fifo_low(current);
+	header_size = decode_header_size(buffer->buffer);
+	fillup_buffer_from(buffer, header_size, pool);
+	if (buffer->avail < header_size)
+		return 0;
 
-	while (get_t_state(thi) == RUNNING) {
-		drbd_thread_current_set_cpu(thi);
+	payload_size = decode_meta_cmd(connection, buffer->buffer, &pi);
+	if (payload_size < 0)
+		return payload_size;
 
-		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
-			if (drbd_send_ping(connection)) {
-				drbd_err(connection, "drbd_send_ping has failed\n");
-				goto reconnect;
-			}
-			set_ping_timeout(connection);
-			ping_timeout_active = true;
-		}
-
-		pre_recv_jif = jiffies;
-		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
-
-		/* Note:
-		 * -EINTR	 (on meta) we got a signal
-		 * -EAGAIN	 (on meta) rcvtimeo expired
-		 * -ECONNRESET	 other side closed the connection
-		 * -ERESTARTSYS  (on data) we got a signal
-		 * rv <  0	 other than above: unexpected error!
-		 * rv == expected: full header or command
-		 * rv <  expected: "woken" by signal during receive
-		 * rv == 0	 : "connection shut down by peer"
-		 */
-		if (likely(rv > 0)) {
-			received += rv;
-			buf	 += rv;
-		} else if (rv == 0) {
-			if (test_bit(DISCONNECT_SENT, &connection->flags)) {
-				long t;
-				rcu_read_lock();
-				t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
-				rcu_read_unlock();
-
-				t = wait_event_timeout(connection->ping_wait,
-						       connection->cstate < C_WF_REPORT_PARAMS,
-						       t);
-				if (t)
-					break;
-			}
-			drbd_err(connection, "meta connection shut down by peer.\n");
+	packet_size = header_size + payload_size;
+	fillup_buffer_from(buffer, packet_size, pool);
+	if (buffer->avail < packet_size)
+		return 0;
+
+	err = ack_receiver_tbl[pi.cmd].fn(connection, &pi);
+	/* packet fully consumed: reset the reassembly buffer */
+	connection->reassemble_buffer.avail = 0;
+	return err;
+}
+
+/*
+ * Transport callback: decode and dispatch control-stream packets directly
+ * from the transport's receive buffer.  A packet may span two invocations:
+ * the trailing fragment is stashed in connection->reassemble_buffer and
+ * completed by process_previous_part() on the next call.  Any decode or
+ * handler error tears the connection down.
+ */
+void drbd_control_data_ready(struct drbd_transport *transport, struct drbd_const_buffer *pool)
+{
+	struct drbd_connection *connection =
+		container_of(transport, struct drbd_connection, transport);
+	unsigned int header_size;
+	int err;
+
+	if (connection->cstate[NOW] < C_TEAR_DOWN)
+		return;
+
+	if (connection->reassemble_buffer.avail) {
+		err = process_previous_part(connection, pool);
+		if (err < 0)
 			goto reconnect;
-		} else if (rv == -EAGAIN) {
-			/* If the data socket received something meanwhile,
-			 * that is good enough: peer is still alive. */
-			if (time_after(connection->last_received, pre_recv_jif))
-				continue;
-			if (ping_timeout_active) {
-				drbd_err(connection, "PingAck did not arrive in time.\n");
-				goto reconnect;
-			}
-			set_bit(SEND_PING, &connection->flags);
-			continue;
-		} else if (rv == -EINTR) {
-			/* maybe drbd_thread_stop(): the while condition will notice.
-			 * maybe woken for send_ping: we'll send a ping above,
-			 * and change the rcvtimeo */
-			flush_signals(current);
-			continue;
-		} else {
-			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+	}
+
+	while (pool->avail >= sizeof(u32)) {
+		int payload_size, packet_size;
+		struct packet_info pi;
+
+		header_size = decode_header_size(pool->buffer);
+		if (header_size > pool->avail)
+			goto keep_part;
+
+		payload_size = decode_meta_cmd(connection, pool->buffer, &pi);
+		if (payload_size < 0) {
+			err = payload_size;
 			goto reconnect;
 		}
 
-		if (received == expect && cmd == NULL) {
-			if (decode_header(connection, connection->meta.rbuf, &pi))
-				goto reconnect;
-			cmd = &ack_receiver_tbl[pi.cmd];
-			if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
-				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
-					 cmdname(pi.cmd), pi.cmd);
-				goto disconnect;
-			}
-			expect = header_size + cmd->pkt_size;
-			if (pi.size != expect - header_size) {
-				drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
-					pi.cmd, pi.size);
-				goto reconnect;
-			}
-		}
-		if (received == expect) {
-			bool err;
+		packet_size = header_size + payload_size;
+		if (packet_size > pool->avail)
+			goto keep_part;
 
-			err = cmd->fn(connection, &pi);
-			if (err) {
-				drbd_err(connection, "%ps failed\n", cmd->fn);
-				goto reconnect;
-			}
+		err = ack_receiver_tbl[pi.cmd].fn(connection, &pi);
+		if (err)
+			goto reconnect;
 
-			connection->last_received = jiffies;
+		pool->buffer += packet_size;
+		pool->avail -= packet_size;
+	}
+	if (pool->avail > 0) {
+keep_part:
+		/* Save the partial packet for the next invocation.
+		 * NOTE(review): assumes the reassemble buffer can hold any
+		 * partial packet the transport hands us -- confirm sizing. */
+		memcpy(connection->reassemble_buffer.buffer, pool->buffer, pool->avail);
+		connection->reassemble_buffer.avail = pool->avail;
+		pool->avail = 0;
+	}
+	return;
 
-			if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
-				set_idle_timeout(connection);
-				ping_timeout_active = false;
-			}
+reconnect:
+	change_cstate(connection, err == -EPROTO ? C_PROTOCOL_ERROR : C_NETWORK_FAILURE, CS_HARD);
+}
+EXPORT_SYMBOL(drbd_control_data_ready);
+
+/* Defined below; forward declaration because drbd_control_event() uses it. */
+static bool disconnect_expected(struct drbd_connection *connection);
+
+/*
+ * Transport callback for exceptional control-stream events.
+ * TIMEOUT: if no PingAck is outstanding, schedule sending a ping (idle
+ * timeout expired); otherwise the PingAck is overdue and the network has
+ * failed.  CLOSED_BY_PEER: ignore if a graceful disconnect is in progress
+ * (see disconnect_expected()), otherwise declare a network failure.
+ */
+void drbd_control_event(struct drbd_transport *transport, enum drbd_tr_event event)
+{
+	struct drbd_connection *connection =
+		container_of(transport, struct drbd_connection, transport);
 
-	buf	 = connection->meta.rbuf;
-	received = 0;
-	expect	 = header_size;
-	cmd	 = NULL;
+	if (event == TIMEOUT) {
+		if (!test_bit(PING_TIMEOUT_ACTIVE, &connection->flags)) {
+			schedule_work(&connection->send_ping_work);
+			return;
+		} else {
+			if (connection->cstate[NOW] == C_CONNECTED)
+				drbd_warn(connection, "PingAck did not arrive in time.\n");
 		}
+	} else /* event == CLOSED_BY_PEER */ {
+		if (connection->cstate[NOW] == C_CONNECTED && disconnect_expected(connection))
+			return;
+		drbd_warn(connection, "meta connection shut down by peer.\n");
 	}
 
-	if (0) {
-reconnect:
-		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
-		conn_md_sync(connection);
-	}
-	if (0) {
-disconnect:
-		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
-	}
+	change_cstate(connection, C_NETWORK_FAILURE, CS_HARD);
+}
+EXPORT_SYMBOL(drbd_control_event);
 
-	drbd_info(connection, "ack_receiver terminated\n");
+/* Is the control-stream close explained by a graceful-disconnect two-phase
+ * commit currently running between this peer and us? */
+static bool disconnect_expected(struct drbd_connection *connection)
+{
+	struct drbd_resource *resource = connection->resource;
+	bool expect_disconnect;
 
-	return 0;
+	/* We are reacting to a not-committed state change! The disconnect might
+	   get aborted. This is not a problem worth much more complex code.
+	   In the unlikely case, it happens that a two-phase-commit of a graceful
+	   disconnect gets aborted and the control connection breaks in exactly
+	   this time window, we will notice it as soon as sending something on the
+	   control stream. */
+	read_lock_irq(&resource->state_rwlock);
+	expect_disconnect = resource->remote_state_change &&
+		drbd_twopc_between_peer_and_me(connection) &&
+		resource->twopc_reply.is_disconnect;
+	read_unlock_irq(&resource->state_rwlock);
+	return expect_disconnect;
 }
 
+/*
+ * Work callback: flush the acks for completed peer requests over the
+ * control stream, optionally corking around the burst to coalesce packets.
+ */
 void drbd_send_acks_wf(struct work_struct *ws)
 {
-	struct drbd_peer_device *peer_device =
-		container_of(ws, struct drbd_peer_device, send_acks_work);
-	struct drbd_connection *connection = peer_device->connection;
-	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection =
+		container_of(ws, struct drbd_connection, send_acks_work);
+	struct drbd_transport *transport = &connection->transport;
 	struct net_conf *nc;
 	int tcp_cork, err;
 
 	rcu_read_lock();
-	nc = rcu_dereference(connection->net_conf);
+	nc = rcu_dereference(transport->net_conf);
 	tcp_cork = nc->tcp_cork;
 	rcu_read_unlock();
 
+	/* TODO: conditionally cork; it may hurt latency if we cork without
+	   much to send */
 	if (tcp_cork)
-		tcp_sock_set_cork(connection->meta.socket->sk, true);
+		drbd_cork(connection, CONTROL_STREAM);
+	err = drbd_finish_peer_reqs(connection);
 
-	err = drbd_finish_peer_reqs(device);
-	kref_put(&device->kref, drbd_destroy_device);
-	/* get is in drbd_endio_write_sec_final(). That is necessary to keep the
-	   struct work_struct send_acks_work alive, which is in the peer_device object */
+	/* on success, uncork again (if we corked); on error the connection
+	   is torn down instead */
+	if (err)
+		change_cstate(connection, C_NETWORK_FAILURE, CS_HARD);
+	else if (tcp_cork)
+		drbd_uncork(connection, CONTROL_STREAM);
 
-	if (err) {
-		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
-		return;
-	}
+}
 
-	if (tcp_cork)
-		tcp_sock_set_cork(connection->meta.socket->sk, false);
+/*
+ * Work callback: send the queued peer acks; a send failure is treated as a
+ * network failure.
+ */
+void drbd_send_peer_ack_wf(struct work_struct *ws)
+{
+	struct drbd_connection *connection =
+		container_of(ws, struct drbd_connection, peer_ack_work);
 
-	return;
+	if (process_peer_ack_list(connection))
+		change_cstate(connection, C_NETWORK_FAILURE, CS_HARD);
 }
diff --git a/drivers/block/drbd/drbd_transport.h b/drivers/block/drbd/drbd_transport.h
index ff393e8d12dc..b65950796f52 100644
--- a/drivers/block/drbd/drbd_transport.h
+++ b/drivers/block/drbd/drbd_transport.h
@@ -57,6 +57,7 @@
 struct drbd_resource;
 struct drbd_listener;
 struct drbd_transport;
+struct bio;
 
 enum drbd_stream {
 	DATA_STREAM,
@@ -136,12 +137,6 @@ struct drbd_transport_stats {
 	int send_buffer_used;
 };
 
-/* argument to ->recv_pages() */
-struct drbd_page_chain_head {
-	struct page *head;
-	unsigned int nr_pages;
-};
-
 struct drbd_const_buffer {
 	const u8 *buffer;
 	unsigned int avail;
@@ -208,18 +203,19 @@ struct drbd_transport_ops {
 	int (*recv)(struct drbd_transport *, enum drbd_stream, void **buf, size_t size, int flags);
 
 /**
- * recv_pages() - Receive bulk data via the transport's DATA_STREAM
+ * recv_bio() - Receive bulk data via the transport's DATA_STREAM into bios
  * @transport:	the transport to receive from
- * @page_chain:	Here recv_pages() will place the page chain head and length
+ * @bios:	the bio_list to add received data to
  * @size:	Number of bytes to receive
  *
- * recv_pages() will return the requested amount of data from DATA_STREAM,
- * and place it into pages allocated with drbd_alloc_pages().
+ * recv_bio() receives the requested amount of data from DATA_STREAM. It
+ * allocates pages by using drbd_alloc_pages() and adds them to bios in the
+ * bio_list.
  *
  * Upon success the function returns 0. Upon error the function returns a
  * negative value
  */
-	int (*recv_pages)(struct drbd_transport *, struct drbd_page_chain_head *, size_t size);
+	int (*recv_bio)(struct drbd_transport *, struct bio_list *bios, size_t size);
 
 	void (*stats)(struct drbd_transport *, struct drbd_transport_stats *stats);
 /**
@@ -240,7 +236,7 @@ struct drbd_transport_ops {
 	long (*get_rcvtimeo)(struct drbd_transport *, enum drbd_stream);
 	int (*send_page)(struct drbd_transport *, enum drbd_stream, struct page *,
 			 int offset, size_t size, unsigned msg_flags);
-	int (*send_zc_bio)(struct drbd_transport *, struct bio *bio);
+	int (*send_bio)(struct drbd_transport *, struct bio *bio, unsigned int msg_flags);
 	bool (*stream_ok)(struct drbd_transport *, enum drbd_stream);
 	bool (*hint)(struct drbd_transport *, enum drbd_stream, enum drbd_tr_hints hint);
 	void (*debugfs_show)(struct drbd_transport *, struct seq_file *m);
@@ -324,6 +320,8 @@ void drbd_path_event(struct drbd_transport *transport, struct drbd_path *path);
 void drbd_listener_destroy(struct kref *kref);
 struct drbd_path *__drbd_next_path_ref(struct drbd_path *drbd_path,
 				       struct drbd_transport *transport);
+int drbd_bio_add_page(struct drbd_transport *transport, struct bio_list *bios,
+		      struct page *page, unsigned int len, unsigned int offset);
 
 /* Might restart iteration, if current element is removed from list!! */
 #define for_each_path_ref(path, transport)			\
@@ -332,112 +330,11 @@ struct drbd_path *__drbd_next_path_ref(struct drbd_path *drbd_path,
 	     path = __drbd_next_path_ref(path, transport))
 
 /* drbd_receiver.c*/
-struct page *drbd_alloc_pages(struct drbd_transport *transport,
-			      unsigned int number, gfp_t gfp_mask);
-void drbd_free_pages(struct drbd_transport *transport, struct page *page);
+struct page *drbd_alloc_pages(struct drbd_transport *transport, gfp_t gfp_mask, unsigned int size);
+void drbd_free_page(struct drbd_transport *transport, struct page *page);
 void drbd_control_data_ready(struct drbd_transport *transport,
 			     struct drbd_const_buffer *pool);
 void drbd_control_event(struct drbd_transport *transport,
 			enum drbd_tr_event event);
 
-static inline void drbd_alloc_page_chain(struct drbd_transport *t,
-	struct drbd_page_chain_head *chain, unsigned int nr, gfp_t gfp_flags)
-{
-	chain->head = drbd_alloc_pages(t, nr, gfp_flags);
-	chain->nr_pages = chain->head ? nr : 0;
-}
-
-static inline void drbd_free_page_chain(struct drbd_transport *transport,
-					struct drbd_page_chain_head *chain)
-{
-	drbd_free_pages(transport, chain->head);
-	chain->head = NULL;
-	chain->nr_pages = 0;
-}
-
-/*
- * Some helper functions to deal with our page chains.
- */
-/* Our transports may sometimes need to only partially use a page.
- * We need to express that somehow.  Use this struct, and "graft" it into
- * struct page at page->lru.
- *
- * According to include/linux/mm.h:
- *  | A page may be used by anyone else who does a __get_free_page().
- *  | In this case, page_count still tracks the references, and should only
- *  | be used through the normal accessor functions. The top bits of page->flags
- *  | and page->virtual store page management information, but all other fields
- *  | are unused and could be used privately, carefully. The management of this
- *  | page is the responsibility of the one who allocated it, and those who have
- *  | subsequently been given references to it.
- * (we do alloc_page(), that is equivalent).
- *
- * Red Hat struct page is different from upstream (layout and members) :(
- * So I am not too sure about the "all other fields", and it is not as easy to
- * find a place where sizeof(struct drbd_page_chain) would fit on all archs and
- * distribution-changed layouts.
- *
- * But (upstream) struct page also says:
- *  | struct list_head lru;   * ...
- *  |       * Can be used as a generic list
- *  |       * by the page owner.
- *
- * On 32bit, use unsigned short for offset and size,
- * to still fit in sizeof(page->lru).
- */
-
-/* grafted over struct page.lru */
-struct drbd_page_chain {
-	struct page *next;	/* next page in chain, if any */
-#ifdef CONFIG_64BIT
-	unsigned int offset;	/* start offset of data within this page */
-	unsigned int size;	/* number of data bytes within this page */
-#else
-#if PAGE_SIZE > (1U<<16)
-#error "won't work."
-#endif
-	unsigned short offset;	/* start offset of data within this page */
-	unsigned short size;	/* number of data bytes within this page */
-#endif
-};
-
-static inline void dummy_for_buildbug(void)
-{
-	struct page *dummy;
-	BUILD_BUG_ON(sizeof(struct drbd_page_chain) > sizeof(dummy->lru));
-}
-
-#define page_chain_next(page) \
-	(((struct drbd_page_chain *)&(page)->lru)->next)
-#define page_chain_size(page) \
-	(((struct drbd_page_chain *)&(page)->lru)->size)
-#define page_chain_offset(page) \
-	(((struct drbd_page_chain *)&(page)->lru)->offset)
-#define set_page_chain_next(page, v) \
-	(((struct drbd_page_chain *)&(page)->lru)->next = (v))
-#define set_page_chain_size(page, v) \
-	(((struct drbd_page_chain *)&(page)->lru)->size = (v))
-#define set_page_chain_offset(page, v) \
-	(((struct drbd_page_chain *)&(page)->lru)->offset = (v))
-#define set_page_chain_next_offset_size(page, n, o, s)		\
-	(*((struct drbd_page_chain *)&(page)->lru) =		\
-	((struct drbd_page_chain) {				\
-		.next = (n),					\
-		.offset = (o),					\
-		.size = (s),					\
-	 }))
-
-#define page_chain_for_each(page) \
-	for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
-			page = page_chain_next(page))
-#define page_chain_for_each_safe(page, n) \
-	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-
-#ifndef SK_CAN_REUSE
-/* This constant was introduced by Pavel Emelyanov <xemul@parallels.com> on
-   Thu Apr 19 03:39:36 2012 +0000. Before the release of linux-3.5
-   commit 4a17fd52 sock: Introduce named constants for sk_reuse */
-#define SK_CAN_REUSE   1
-#endif
-
 #endif
diff --git a/drivers/block/drbd/drbd_transport_lb-tcp.c b/drivers/block/drbd/drbd_transport_lb-tcp.c
index 29f18df2be88..03ea93e7352f 100644
--- a/drivers/block/drbd/drbd_transport_lb-tcp.c
+++ b/drivers/block/drbd/drbd_transport_lb-tcp.c
@@ -121,7 +121,6 @@ struct dtl_path {
 	struct dtl_flow flow[2];
 };
 
-
 static int dtl_init(struct drbd_transport *transport);
 static void dtl_free(struct drbd_transport *transport, enum drbd_tr_free_op free_op);
 static void dtl_socket_free(struct drbd_transport *transport, struct socket **sock);
@@ -130,8 +129,7 @@ static int dtl_connect(struct drbd_transport *transport);
 static void dtl_finish_connect(struct drbd_transport *transport);
 static int dtl_recv(struct drbd_transport *transport, enum drbd_stream stream, void **buf,
 		    size_t size, int flags);
-static int dtl_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain,
-			  size_t size);
+static int dtl_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size);
 static void dtl_stats(struct drbd_transport *transport, struct drbd_transport_stats *stats);
 static int dtl_net_conf_change(struct drbd_transport *transport, struct net_conf *new_net_conf);
 static void dtl_set_rcvtimeo(struct drbd_transport *transport, enum drbd_stream stream,
@@ -139,7 +137,7 @@ static void dtl_set_rcvtimeo(struct drbd_transport *transport, enum drbd_stream
 static long dtl_get_rcvtimeo(struct drbd_transport *transport, enum drbd_stream stream);
 static int dtl_send_page(struct drbd_transport *transport, enum drbd_stream, struct page *page,
 		int offset, size_t size, unsigned int msg_flags);
-static int dtl_send_zc_bio(struct drbd_transport *, struct bio *bio);
+static int dtl_send_bio(struct drbd_transport *, struct bio *bio, unsigned int msg_flags);
 static bool dtl_stream_ok(struct drbd_transport *transport, enum drbd_stream stream);
 static bool dtl_hint(struct drbd_transport *transport, enum drbd_stream stream,
 		     enum drbd_tr_hints hint);
@@ -173,13 +171,13 @@ static struct drbd_transport_class dtl_transport_class = {
 		.connect = dtl_connect,
 		.finish_connect = dtl_finish_connect,
 		.recv = dtl_recv,
-		.recv_pages = dtl_recv_pages,
+		.recv_bio = dtl_recv_bio,
 		.stats = dtl_stats,
 		.net_conf_change = dtl_net_conf_change,
 		.set_rcvtimeo = dtl_set_rcvtimeo,
 		.get_rcvtimeo = dtl_get_rcvtimeo,
 		.send_page = dtl_send_page,
-		.send_zc_bio = dtl_send_zc_bio,
+		.send_bio = dtl_send_bio,
 		.stream_ok = dtl_stream_ok,
 		.hint = dtl_hint,
 		.debugfs_show = dtl_debugfs_show,
@@ -470,7 +468,7 @@ _dtl_recv_page(struct dtl_transport *dtl_transport, struct page *page, int size)
 		if (err)
 			goto out;
 
-		err = dtl_recv_short(flow->sock, data, min(size, flow->recv_bytes), 0);
+		err = dtl_recv_short(flow->sock, pos, min(size, flow->recv_bytes), 0);
 		if (err < 0)
 			goto out;
 		size -= err;
@@ -484,36 +482,37 @@ _dtl_recv_page(struct dtl_transport *dtl_transport, struct page *page, int size)
 }
 
+/*
+ * Receive @size bytes from the data stream into freshly allocated pages and
+ * append them to @bios.  On failure only the page currently in flight is
+ * freed; pages already added to @bios are presumably cleaned up by the
+ * caller through the bio_list -- verify.
+ */
 static int
-dtl_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain, size_t size)
+dtl_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size)
 {
 	struct dtl_transport *dtl_transport =
 		container_of(transport, struct dtl_transport, transport);
 	struct page *page;
 	int err;
 
-	drbd_alloc_page_chain(transport, chain, DIV_ROUND_UP(size, PAGE_SIZE), GFP_TRY);
-	page = chain->head;
-	if (!page)
-		return -ENOMEM;
+	do {
+		size_t len;
 
-	page_chain_for_each(page) {
-		size_t len = min_t(int, size, PAGE_SIZE);
+		/* NOTE(review): the replaced page-chain code allocated with
+		 * GFP_TRY; GFP_KERNEL may sleep in the receive path --
+		 * confirm this is intended. */
+		page = drbd_alloc_pages(transport, GFP_KERNEL, size);
+		if (!page)
+			return -ENOMEM;
+		len = min(PAGE_SIZE << compound_order(page), size);
 
 		err = _dtl_recv_page(dtl_transport, page, len);
 		if (err < 0)
 			goto fail;
-		set_page_chain_offset(page, 0);
-		set_page_chain_size(page, len);
 		size -= err;
-	}
+		err = drbd_bio_add_page(transport, bios, page, len, 0);
+		if (err < 0)
+			goto fail;
+	} while (size > 0);
+
+	/* NOTE(review): the loop above only exits once size reached 0
+	 * (size_t), so this check appears unreachable -- confirm or drop. */
 	if (unlikely(size)) {
 		tr_warn(transport, "Not enough data received; missing %zu bytes\n", size);
-		err = -ENODATA;
-		goto fail;
+		return -ENODATA;
 	}
 	return 0;
 fail:
-	drbd_free_page_chain(transport, chain);
+	drbd_free_page(transport, page);
 	return err;
 }
 
@@ -1631,7 +1630,7 @@ static int dtl_select_send_flow(struct dtl_transport *dtl_transport,
 static int _dtl_send_page(struct dtl_transport *dtl_transport, struct dtl_flow *flow,
 			  struct page *page, int offset, size_t size, unsigned int msg_flags)
 {
-	struct msghdr msg = { .msg_flags = msg_flags | MSG_NOSIGNAL | MSG_SPLICE_PAGES };
+	struct msghdr msg = { .msg_flags = msg_flags | MSG_NOSIGNAL };
 	struct drbd_transport *transport = &dtl_transport->transport;
 	struct socket *sock = flow->sock;
 	struct bio_vec bvec;
@@ -1716,7 +1715,7 @@ static int dtl_bio_chunk_size_available(struct bio *bio, int wmem_available,
 }
 
 static int dtl_send_bio_pages(struct dtl_transport *dtl_transport, struct dtl_flow *flow,
-		struct bio *bio, struct bvec_iter *iter, int chunk)
+			struct bio *bio, struct bvec_iter *iter, int chunk, unsigned int msg_flags)
 {
 	struct bio_vec bvec;
 
@@ -1726,7 +1725,7 @@ static int dtl_send_bio_pages(struct dtl_transport *dtl_transport, struct dtl_fl
 		bvec = bio_iter_iovec(bio, *iter);
 		err = _dtl_send_page(dtl_transport, flow, bvec.bv_page,
 				bvec.bv_offset, bvec.bv_len,
-				bio_iter_last(bvec, *iter) ? 0 : MSG_MORE);
+				msg_flags | (bio_iter_last(bvec, *iter) ? 0 : MSG_MORE));
 		if (err)
 			return err;
 		chunk -= bvec.bv_len;
@@ -1736,7 +1735,8 @@ static int dtl_send_bio_pages(struct dtl_transport *dtl_transport, struct dtl_fl
 	return 0;
 }
 
-static int dtl_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
+static int dtl_send_bio(struct drbd_transport *transport, struct bio *bio,
+			   unsigned int msg_flags)
 {
 	struct dtl_transport *dtl_transport =
 		container_of(transport, struct dtl_transport, transport);
@@ -1777,7 +1777,7 @@ static int dtl_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
 				goto out;
 		}
 
-		err = dtl_send_bio_pages(dtl_transport, flow, bio, &iter, chunk);
+		err = dtl_send_bio_pages(dtl_transport, flow, bio, &iter, chunk, msg_flags);
 		if (err)
 			goto out;
 	} while (iter.bi_size);
diff --git a/drivers/block/drbd/drbd_transport_rdma.c b/drivers/block/drbd/drbd_transport_rdma.c
index fbdf6a4bcda9..69850bef34f8 100644
--- a/drivers/block/drbd/drbd_transport_rdma.c
+++ b/drivers/block/drbd/drbd_transport_rdma.c
@@ -322,8 +322,8 @@ static void dtr_set_rcvtimeo(struct drbd_transport *transport, enum drbd_stream
 static long dtr_get_rcvtimeo(struct drbd_transport *transport, enum drbd_stream stream);
 static int dtr_send_page(struct drbd_transport *transport, enum drbd_stream stream, struct page *page,
 		int offset, size_t size, unsigned msg_flags);
-static int dtr_send_zc_bio(struct drbd_transport *, struct bio *bio);
-static int dtr_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain, size_t size);
+static int dtr_send_bio(struct drbd_transport *, struct bio *bio, unsigned int msg_flags);
+static int dtr_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size);
 static bool dtr_stream_ok(struct drbd_transport *transport, enum drbd_stream stream);
 static bool dtr_hint(struct drbd_transport *transport, enum drbd_stream stream, enum drbd_tr_hints hint);
 static void dtr_debugfs_show(struct drbd_transport *, struct seq_file *m);
@@ -392,8 +392,8 @@ static struct drbd_transport_class rdma_transport_class = {
 		.set_rcvtimeo = dtr_set_rcvtimeo,
 		.get_rcvtimeo = dtr_get_rcvtimeo,
 		.send_page = dtr_send_page,
-		.send_zc_bio = dtr_send_zc_bio,
-		.recv_pages = dtr_recv_pages,
+		.send_bio = dtr_send_bio,
+		.recv_bio = dtr_recv_bio,
 		.stream_ok = dtr_stream_ok,
 		.hint = dtr_hint,
 		.debugfs_show = dtr_debugfs_show,
@@ -609,13 +609,13 @@ static int dtr_send(struct dtr_path *path, void *buf, size_t size, gfp_t gfp_mas
 }
 
 
-static int dtr_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain, size_t size)
+static int dtr_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size)
 {
 	struct dtr_transport *rdma_transport =
 		container_of(transport, struct dtr_transport, transport);
 	struct dtr_stream *rdma_stream = &rdma_transport->stream[DATA_STREAM];
-	struct page *page, *head = NULL, *tail = NULL;
-	int i = 0;
+	struct page *page;
+	int err, i = 0;
 
 	if (!dtr_transport_ok(transport))
 		return -ECONNRESET;
@@ -633,15 +633,8 @@ static int dtr_recv_pages(struct drbd_transport *transport, struct drbd_page_cha
 					dtr_receive_rx_desc(rdma_transport, DATA_STREAM, &rx_desc),
 					rdma_stream->recv_timeout);
 
-		if (t <= 0) {
-			/*
-			 * Cannot give back pages that may still be in use!
-			 * (More reason why we only have one rx_desc per page,
-			 * and don't get_page() in dtr_create_rx_desc).
-			 */
-			drbd_free_pages(transport, head);
+		if (t <= 0)
 			return t == 0 ? -EAGAIN : -EINTR;
-		}
 
 		page = rx_desc->page;
 		/* put_page() if we would get_page() in
@@ -655,24 +648,10 @@ static int dtr_recv_pages(struct drbd_transport *transport, struct drbd_page_cha
 		 * unaligned bvecs (as xfs often creates), rx_desc->size and
 		 * offset may well be not the PAGE_SIZE and 0 we hope for.
 		 */
-		if (tail) {
-			/* See also dtr_create_rx_desc().
-			 * For PAGE_SIZE > 4k, we may create several RR per page.
-			 * We cannot link a page to itself, though.
-			 *
-			 * Adding to size would be easy enough.
-			 * But what do we do about possible holes?
-			 * FIXME
-			 */
-			BUG_ON(page == tail);
 
-			set_page_chain_next(tail, page);
-			tail = page;
-		} else
-			head = tail = page;
-
-		set_page_chain_offset(page, 0);
-		set_page_chain_size(page, rx_desc->size);
+		err = drbd_bio_add_page(transport, bios, page, rx_desc->size, 0);
+		if (err < 0)
+			return err;
 
 		atomic_dec(&rx_desc->cm->path->flow[DATA_STREAM].rx_descs_allocated);
 		dtr_free_rx_desc(rx_desc);
@@ -682,8 +661,6 @@ static int dtr_recv_pages(struct drbd_transport *transport, struct drbd_page_cha
 	}
 
 	// pr_info("%s: rcvd %d pages\n", rdma_stream->name, i);
-	chain->head = head;
-	chain->nr_pages = i;
 	return 0;
 }
 
@@ -2023,7 +2000,7 @@ static void dtr_free_rx_desc(struct dtr_rx_desc *rx_desc)
 
 		/* put_page(), if we had more than one rx_desc per page,
 		 * but see comments in dtr_create_rx_desc */
-		drbd_free_pages(transport, rx_desc->page);
+		drbd_free_page(transport, rx_desc->page);
 	}
 	kfree(rx_desc);
 }
@@ -2032,23 +2009,17 @@ static int dtr_create_rx_desc(struct dtr_flow *flow, gfp_t gfp_mask, bool connec
 {
 	struct dtr_path *path = flow->path;
 	struct drbd_transport *transport = path->path.transport;
-	struct dtr_transport *rdma_transport =
-		container_of(transport, struct dtr_transport, transport);
 	struct dtr_rx_desc *rx_desc;
 	struct page *page;
-	int err, alloc_size = rdma_transport->rx_allocation_size;
-	int nr_pages = alloc_size / PAGE_SIZE;
+	int err;
 	struct dtr_cm *cm;
 
 	rx_desc = kzalloc_obj(*rx_desc, gfp_mask);
 	if (!rx_desc)
 		return -ENOMEM;
 
-	/* As of now, this MUST NEVER return a highmem page!
-	 * Which means no other user may ever have requested and then given
-	 * back a highmem page!
-	 */
-	page = drbd_alloc_pages(transport, nr_pages, gfp_mask);
+	/* Ignoring rdma_transport->rx_allocation_size for now! */
+	page = drbd_alloc_pages(transport, gfp_mask, PAGE_SIZE);
 	if (!page) {
 		kfree(rx_desc);
 		return -ENOMEM;
@@ -2066,14 +2037,14 @@ static int dtr_create_rx_desc(struct dtr_flow *flow, gfp_t gfp_mask, bool connec
 	rx_desc->page = page;
 	rx_desc->size = 0;
 	rx_desc->sge.lkey = dtr_cm_to_lkey(cm);
-	rx_desc->sge.addr = ib_dma_map_single(cm->id->device, page_address(page), alloc_size,
+	rx_desc->sge.addr = ib_dma_map_single(cm->id->device, page_address(page), PAGE_SIZE,
 					      DMA_FROM_DEVICE);
 	err = ib_dma_mapping_error(cm->id->device, rx_desc->sge.addr);
 	if (err) {
 		tr_err(transport, "ib_dma_map_single() failed %d\n", err);
 		goto out_put;
 	}
-	rx_desc->sge.length = alloc_size;
+	rx_desc->sge.length = PAGE_SIZE;
 
 	atomic_inc(&flow->rx_descs_allocated);
 	atomic_inc(&flow->rx_descs_posted);
@@ -2090,7 +2061,7 @@ static int dtr_create_rx_desc(struct dtr_flow *flow, gfp_t gfp_mask, bool connec
 	kref_put(&cm->kref, dtr_destroy_cm);
 out:
 	kfree(rx_desc);
-	drbd_free_pages(transport, page);
+	drbd_free_page(transport, page);
 	return err;
 }
 
@@ -3170,11 +3141,12 @@ static void dtr_update_congested(struct drbd_transport *transport)
 }
 
 static int dtr_send_page(struct drbd_transport *transport, enum drbd_stream stream,
-			 struct page *page, int offset, size_t size, unsigned msg_flags)
+			 struct page *caller_page, int offset, size_t size, unsigned int msg_flags)
 {
 	struct dtr_transport *rdma_transport =
 		container_of(transport, struct dtr_transport, transport);
 	struct dtr_tx_desc *tx_desc;
+	struct page *page;
 	int err;
 
 	// pr_info("%s: in send_page, size: %zu\n", rdma_stream->name, size);
@@ -3311,7 +3283,7 @@ static int dtr_send_bio_part(struct dtr_transport *rdma_transport,
 }
 #endif
 
-static int dtr_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
+static int dtr_send_bio(struct drbd_transport *transport, struct bio *bio, unsigned int msg_flags)
 {
 #if SENDER_COMPACTS_BVECS
 	struct dtr_transport *rdma_transport =
@@ -3329,6 +3301,7 @@ static int dtr_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
 		return -ECONNRESET;
 
 #if SENDER_COMPACTS_BVECS
+	/* TODO obey !MSG_SPLICE_PAGES in msg_flags */
 	bio_for_each_segment(bvec, bio, iter) {
 		size_tx_desc += bvec.bv_len;
 		//tr_info(transport, " bvec len = %d\n", bvec.bv_len);
@@ -3358,8 +3331,7 @@ static int dtr_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
 #else
 	bio_for_each_segment(bvec, bio, iter) {
 		err = dtr_send_page(transport, DATA_STREAM,
-			bvec.bv_page, bvec.bv_offset, bvec.bv_len,
-			0 /* flags currently unused by dtr_send_page */);
+			bvec.bv_page, bvec.bv_offset, bvec.bv_len, msg_flags);
 		if (err)
 			break;
 	}
diff --git a/drivers/block/drbd/drbd_transport_tcp.c b/drivers/block/drbd/drbd_transport_tcp.c
index 5faa6b82c358..51169d7a5902 100644
--- a/drivers/block/drbd/drbd_transport_tcp.c
+++ b/drivers/block/drbd/drbd_transport_tcp.c
@@ -115,14 +115,14 @@ static int dtt_prepare_connect(struct drbd_transport *transport);
 static int dtt_connect(struct drbd_transport *transport);
 static void dtt_finish_connect(struct drbd_transport *transport);
 static int dtt_recv(struct drbd_transport *transport, enum drbd_stream stream, void **buf, size_t size, int flags);
-static int dtt_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain, size_t size);
+static int dtt_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size);
 static void dtt_stats(struct drbd_transport *transport, struct drbd_transport_stats *stats);
 static int dtt_net_conf_change(struct drbd_transport *transport, struct net_conf *new_net_conf);
 static void dtt_set_rcvtimeo(struct drbd_transport *transport, enum drbd_stream stream, long timeout);
 static long dtt_get_rcvtimeo(struct drbd_transport *transport, enum drbd_stream stream);
 static int dtt_send_page(struct drbd_transport *transport, enum drbd_stream, struct page *page,
-		int offset, size_t size, unsigned msg_flags);
-static int dtt_send_zc_bio(struct drbd_transport *, struct bio *bio);
+		int offset, size_t size, unsigned int msg_flags);
+static int dtt_send_bio(struct drbd_transport *, struct bio *bio, unsigned int msg_flags);
 static bool dtt_stream_ok(struct drbd_transport *transport, enum drbd_stream stream);
 static bool dtt_hint(struct drbd_transport *transport, enum drbd_stream stream, enum drbd_tr_hints hint);
 static void dtt_debugfs_show(struct drbd_transport *transport, struct seq_file *m);
@@ -146,13 +146,13 @@ static struct drbd_transport_class tcp_transport_class = {
 		.connect = dtt_connect,
 		.finish_connect = dtt_finish_connect,
 		.recv = dtt_recv,
-		.recv_pages = dtt_recv_pages,
+		.recv_bio = dtt_recv_bio,
 		.stats = dtt_stats,
 		.net_conf_change = dtt_net_conf_change,
 		.set_rcvtimeo = dtt_set_rcvtimeo,
 		.get_rcvtimeo = dtt_get_rcvtimeo,
 		.send_page = dtt_send_page,
-		.send_zc_bio = dtt_send_zc_bio,
+		.send_bio = dtt_send_bio,
 		.stream_ok = dtt_stream_ok,
 		.hint = dtt_hint,
 		.debugfs_show = dtt_debugfs_show,
@@ -357,7 +357,8 @@ static int dtt_recv(struct drbd_transport *transport, enum drbd_stream stream, v
 	return rv;
 }
 
-static int dtt_recv_pages(struct drbd_transport *transport, struct drbd_page_chain_head *chain, size_t size)
+
+static int dtt_recv_bio(struct drbd_transport *transport, struct bio_list *bios, size_t size)
 {
 	struct drbd_tcp_transport *tcp_transport =
 		container_of(transport, struct drbd_tcp_transport, transport);
@@ -368,30 +369,30 @@ static int dtt_recv_pages(struct drbd_transport *transport, struct drbd_page_cha
 	if (!socket)
 		return -ENOTCONN;
 
-	drbd_alloc_page_chain(transport, chain, DIV_ROUND_UP(size, PAGE_SIZE), GFP_TRY);
-	page = chain->head;
-	if (!page)
-		return -ENOMEM;
+	do {
+		size_t len;
+
+		page = drbd_alloc_pages(transport, GFP_KERNEL, size);
+		if (!page)
+			return -ENOMEM;
+		len = min(PAGE_SIZE << compound_order(page), size);
 
-	page_chain_for_each(page) {
-		size_t len = min_t(int, size, PAGE_SIZE);
-		void *data = kmap(page);
-		err = dtt_recv_short(socket, data, len, 0);
-		kunmap(page);
-		set_page_chain_offset(page, 0);
-		set_page_chain_size(page, len);
+		err = dtt_recv_short(socket, page_address(page), len, 0);
 		if (err < 0)
 			goto fail;
 		size -= err;
-	}
+		err = drbd_bio_add_page(transport, bios, page, len, 0);
+		if (err < 0)
+			goto fail;
+	} while (size > 0);
+
 	if (unlikely(size)) {
 		tr_warn(transport, "Not enough data received; missing %zu bytes\n", size);
-		err = -ENODATA;
-		goto fail;
+		return -ENODATA;
 	}
 	return 0;
 fail:
-	drbd_free_page_chain(transport, chain);
+	drbd_free_page(transport, page);
 	return err;
 }
 
@@ -1492,7 +1493,7 @@ static int dtt_send_page(struct drbd_transport *transport, enum drbd_stream stre
 	struct drbd_tcp_transport *tcp_transport =
 		container_of(transport, struct drbd_tcp_transport, transport);
 	struct socket *socket = tcp_transport->stream[stream];
-	struct msghdr msg = { .msg_flags = msg_flags | MSG_NOSIGNAL | MSG_SPLICE_PAGES };
+	struct msghdr msg = { .msg_flags = msg_flags | MSG_NOSIGNAL };
 	struct bio_vec bvec;
 	int len = size;
 	int err = -EIO;
@@ -1537,7 +1538,7 @@ static int dtt_send_page(struct drbd_transport *transport, enum drbd_stream stre
 	return err;
 }
 
-static int dtt_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
+static int dtt_send_bio(struct drbd_transport *transport, struct bio *bio, unsigned int msg_flags)
 {
 	struct bio_vec bvec;
 	struct bvec_iter iter;
@@ -1547,7 +1548,7 @@ static int dtt_send_zc_bio(struct drbd_transport *transport, struct bio *bio)
 
 		err = dtt_send_page(transport, DATA_STREAM, bvec.bv_page,
 				      bvec.bv_offset, bvec.bv_len,
-				      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
+				      msg_flags | (bio_iter_last(bvec, iter) ? 0 : MSG_MORE));
 		if (err)
 			return err;
 	}
-- 
2.53.0