Overhaul the internal header definitions to support DRBD 9's
multi-peer replication model.
The fundamental shift is that per-peer state (replication progress,
UUIDs, resync bookkeeping) moves from per-device to per-peer-device
scope, and all mutable state is now tracked as a [NOW]/[NEW] pair
on each object to support atomic, cluster-visible state transitions.
Redesign the locking model to match: remove the coarse per-resource
spinlock in favor of a resource-level rwlock for state, a
per-connection lock for peer request lists, and a per-device lock
for interval tree operations.
Replace direct socket members on the connection with the transport
abstraction.
Move the transfer log with its peer-ack machinery up to the resource
level so that writes can be serialized and acknowledged across all
peers consistently.
Move the state change API to a two-phase commit model at the
resource level, enabling cluster-wide coordinated transitions for
connect, disconnect, role change, and resize operations.
Co-developed-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Co-developed-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Co-developed-by: Joel Colledge <joel.colledge@linbit.com>
Signed-off-by: Joel Colledge <joel.colledge@linbit.com>
Co-developed-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
---
drivers/block/drbd/drbd_buildtag.c | 2 +-
drivers/block/drbd/drbd_config.h | 38 +
drivers/block/drbd/drbd_debugfs.h | 2 +
.../block/drbd}/drbd_genl_api.h | 19 +-
drivers/block/drbd/drbd_int.h | 3278 +++++++++++------
drivers/block/drbd/drbd_interval.h | 156 +-
drivers/block/drbd/drbd_nl.c | 2 +-
drivers/block/drbd/drbd_nla.c | 2 +-
drivers/block/drbd/drbd_nla.h | 7 +-
drivers/block/drbd/drbd_polymorph_printk.h | 265 +-
drivers/block/drbd/drbd_req.h | 303 +-
drivers/block/drbd/drbd_state.h | 298 +-
drivers/block/drbd/drbd_state_change.h | 66 +-
drivers/block/drbd/drbd_strings.h | 25 +-
drivers/block/drbd/drbd_transport_lb-tcp.c | 4 +-
drivers/block/drbd/drbd_transport_rdma.c | 4 +-
drivers/block/drbd/drbd_transport_tcp.c | 4 +-
include/linux/drbd.h | 190 +-
include/linux/drbd_config.h | 16 -
include/linux/drbd_genl.h | 350 +-
include/linux/drbd_limits.h | 105 +-
include/linux/genl_magic_func.h | 50 +-
22 files changed, 3361 insertions(+), 1825 deletions(-)
create mode 100644 drivers/block/drbd/drbd_config.h
rename {include/linux => drivers/block/drbd}/drbd_genl_api.h (68%)
delete mode 100644 include/linux/drbd_config.h
diff --git a/drivers/block/drbd/drbd_buildtag.c b/drivers/block/drbd/drbd_buildtag.c
index cb1aa66d7d5d..812f78070a0b 100644
--- a/drivers/block/drbd/drbd_buildtag.c
+++ b/drivers/block/drbd/drbd_buildtag.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
-#include <linux/drbd_config.h>
#include <linux/module.h>
+#include "drbd_config.h"
const char *drbd_buildtag(void)
{
diff --git a/drivers/block/drbd/drbd_config.h b/drivers/block/drbd/drbd_config.h
new file mode 100644
index 000000000000..62fc91dc529a
--- /dev/null
+++ b/drivers/block/drbd/drbd_config.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ drbd_config.h
+ DRBD's compile time configuration.
+*/
+
+#ifndef DRBD_CONFIG_H
+#define DRBD_CONFIG_H
+
+#include "drbd_protocol.h"
+
+const char *drbd_buildtag(void);
+
+#define REL_VERSION "9.3.0"
+#define PRO_VERSION_MIN 118 /* 9.0.26 */
+#define PRO_VERSION_MAX 124
+#define PRO_FEATURES (DRBD_FF_TRIM | DRBD_FF_THIN_RESYNC | DRBD_FF_WSAME | DRBD_FF_WZEROES | \
+ DRBD_FF_RESYNC_DAGTAG | \
+ DRBD_FF_2PC_V2 | DRBD_FF_RS_SKIP_UUID | \
+ DRBD_FF_RESYNC_WITHOUT_REPLICATION)
+
+#define PRO_VERSION_8_MIN 86
+#define PRO_VERSION_8_MAX 101
+
+/* We support two ranges of DRBD protocol version:
+ * 86-101: accepted DRBD 8 protocol versions as "rolling upgrade" path
+ * 102-109: never defined
+ * 110-117: _rejected_ because of bugs in the backward compat path
+ * in more recent DRBD versions. That is 9.0.0 to 9.0.25 inclusive.
+ * "Rolling" upgrade path for those versions:
+ * first upgrade to 9.0.latest, then connect to 9.1/9.2 or later.
+ * 118-PRO_VERSION_MAX: accepted DRBD 9 protocol versions.
+ *
+ * Note that we also reject connections with protocol version 121 and feature
+ * DRBD_FF_RESYNC_DAGTAG.
+ */
+
+#endif
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
index ee3d66eb40c6..37037b196e4a 100644
--- a/drivers/block/drbd/drbd_debugfs.h
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -11,6 +11,7 @@ void drbd_debugfs_cleanup(void);
void drbd_debugfs_resource_add(struct drbd_resource *resource);
void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+void drbd_debugfs_resource_rename(struct drbd_resource *resource, const char *new_name);
void drbd_debugfs_connection_add(struct drbd_connection *connection);
void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
@@ -27,6 +28,7 @@ static inline void drbd_debugfs_cleanup(void) { }
static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_rename(struct drbd_resource *resource, const char *new_name) { }
static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
diff --git a/include/linux/drbd_genl_api.h b/drivers/block/drbd/drbd_genl_api.h
similarity index 68%
rename from include/linux/drbd_genl_api.h
rename to drivers/block/drbd/drbd_genl_api.h
index 70682c058027..7096b9c4f6dc 100644
--- a/include/linux/drbd_genl_api.h
+++ b/drivers/block/drbd/drbd_genl_api.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_GENL_STRUCT_H
#define DRBD_GENL_STRUCT_H
@@ -13,12 +13,6 @@
* is used instead.
* @flags: possible operation modifiers (relevant only for user->kernel):
* DRBD_GENL_F_SET_DEFAULTS
- * @volume:
- * When creating a new minor (adding it to a resource), the resource needs
- * to know which volume number within the resource this is supposed to be.
- * The volume number corresponds to the same volume number on the remote side,
- * whereas the minor number on the remote side may be different
- * (union with flags).
* @ret_code: kernel->userland unicast cfg reply return code (union with flags);
*/
struct drbd_genlmsghdr {
@@ -34,20 +28,13 @@ enum {
DRBD_GENL_F_SET_DEFAULTS = 1,
};
-enum drbd_state_info_bcast_reason {
- SIB_GET_STATUS_REPLY = 1,
- SIB_STATE_CHANGE = 2,
- SIB_HELPER_PRE = 3,
- SIB_HELPER_POST = 4,
- SIB_SYNC_PROGRESS = 5,
-};
-
/* hack around predefined gcc/cpp "linux=1",
* we cannot possibly include <1/drbd_genl.h> */
#undef linux
#include <linux/drbd.h>
-#define GENL_MAGIC_VERSION 1
+#include "drbd_config.h"
+#define GENL_MAGIC_VERSION 2
#define GENL_MAGIC_FAMILY drbd
#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr)
#define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h>
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f6d6276974ee..b7dc630cf784 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -18,55 +18,101 @@
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
-#include <linux/tcp.h>
#include <linux/mutex.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/idr.h>
-#include <linux/dynamic_debug.h>
-#include <net/tcp.h>
#include <linux/lru_cache.h>
#include <linux/prefetch.h>
-#include <linux/drbd_genl_api.h>
+#include "drbd_genl_api.h"
#include <linux/drbd.h>
-#include <linux/drbd_config.h>
+
+#include "drbd_config.h"
#include "drbd_strings.h"
#include "drbd_state.h"
+#include "drbd_state_change.h"
#include "drbd_protocol.h"
+#include "drbd_transport.h"
#include "drbd_polymorph_printk.h"
-/* shared module parameters, defined in drbd_main.c */
+/* module parameter, defined in drbd_main.c */
+extern unsigned int drbd_minor_count;
+extern unsigned int drbd_protocol_version_min;
+extern bool drbd_strict_names;
+
+static inline bool drbd_protocol_version_acceptable(unsigned int pv)
+{
+ return /* DRBD 9 */ (pv >= PRO_VERSION_MIN && pv <= PRO_VERSION_MAX) ||
+ /* DRBD 8 */ (pv >= PRO_VERSION_8_MIN && pv <= PRO_VERSION_8_MAX);
+}
+
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int drbd_enable_faults;
extern int drbd_fault_rate;
#endif
-extern unsigned int drbd_minor_count;
extern char drbd_usermode_helper[];
-extern int drbd_proc_details;
+enum {
+ /* drbd_khelper returns >= 0, we can use negative values as flags for drbd_maybe_khelper */
+ DRBD_UMH_DISABLED = INT_MIN,
+};
+#ifndef DRBD_MAJOR
+# define DRBD_MAJOR 147
+#endif
/* This is used to stop/restart our threads.
* Cannot use SIGTERM nor SIGKILL, since these
* are sent out by init on runlevel changes
* I choose SIGHUP for now.
+ *
+ * FIXME btw, we should register some reboot notifier.
*/
#define DRBD_SIGKILL SIGHUP
+/* For compatibility with protocol < 122 */
+#define ID_SKIP (4710ULL)
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
#define ID_SYNCER (-1ULL)
+static inline enum ov_result drbd_block_id_to_ov_result(u64 block_id)
+{
+ switch (block_id) {
+ case ID_IN_SYNC:
+ return OV_RESULT_IN_SYNC;
+ case ID_OUT_OF_SYNC:
+ return OV_RESULT_OUT_OF_SYNC;
+ default:
+ return OV_RESULT_SKIP;
+ }
+}
+
+static inline u64 drbd_ov_result_to_block_id(enum ov_result result)
+{
+ switch (result) {
+ case OV_RESULT_IN_SYNC:
+ return ID_IN_SYNC;
+ case OV_RESULT_OUT_OF_SYNC:
+ return ID_OUT_OF_SYNC;
+ default:
+ return ID_SKIP;
+ }
+}
+
#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
struct drbd_device;
struct drbd_connection;
-struct drbd_peer_device;
+
+/* I want to be able to grep for "drbd $resource_name"
+ * and get all relevant log lines. */
/* Defines to control fault insertion */
enum {
@@ -80,11 +126,12 @@ enum {
DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
DRBD_FAULT_AL_EE = 8, /* alloc ee */
DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+ DRBD_FAULT_BIO_TOO_SMALL = 10, /* Allocate smaller bios to trigger bio chaining */
DRBD_FAULT_MAX,
};
-extern unsigned int
+unsigned int
_drbd_insert_fault(struct drbd_device *device, unsigned int type);
static inline int
@@ -98,28 +145,31 @@ drbd_insert_fault(struct drbd_device *device, unsigned int type) {
#endif
}
-/* integer division, round _UP_ to the next integer */
-#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
-/* usual integer division */
-#define div_floor(A, B) ((A)/(B))
-
-extern struct ratelimit_state drbd_ratelimit_state;
-extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
-extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
+/*
+ * our structs
+ *************************/
-extern const char *cmdname(enum drbd_packet cmd);
+extern struct idr drbd_devices; /* RCU, updates: drbd_devices_lock */
+extern struct list_head drbd_resources; /* RCU, updates: resources_mutex */
+extern struct mutex resources_mutex;
/* for sending/receiving the bitmap,
- * possibly in some encoding scheme */
+ * possibly in some encoding scheme.
+ * For compatibility, we transfer as if bm_block_size was 4k.
+ */
struct bm_xfer_ctx {
/* "const"
* stores total bits and long words
* of the bitmap, so we don't need to
* call the accessor functions over and again. */
+ unsigned long bm_bits_4k; /* unused on sending side */
unsigned long bm_bits;
unsigned long bm_words;
+ unsigned int scale; /* against BM_BLOCK_SHIFT_4k */
/* during xfer, current position within the bitmap */
unsigned long bit_offset;
+ /* receiving "partial" bits; unused on sending side. */
+ unsigned long bit_offset_4k;
unsigned long word_offset;
/* statistics; index: (h->command == P_BITMAP) */
@@ -127,8 +177,8 @@ struct bm_xfer_ctx {
unsigned bytes[2];
};
-extern void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
- const char *direction, struct bm_xfer_ctx *c);
+void INFO_bm_xfer_stats(struct drbd_peer_device *peer_device,
+ const char *direction, struct bm_xfer_ctx *c);
static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
{
@@ -149,7 +199,7 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
#endif
}
-extern unsigned int drbd_header_size(struct drbd_connection *connection);
+unsigned int drbd_header_size(struct drbd_connection *connection);
/**********************************************************************/
enum drbd_thread_state {
@@ -164,7 +214,7 @@ struct drbd_thread {
struct task_struct *task;
struct completion stop;
enum drbd_thread_state t_state;
- int (*function) (struct drbd_thread *);
+ int (*function)(struct drbd_thread *thi);
struct drbd_resource *resource;
struct drbd_connection *connection;
int reset_cpu_mask;
@@ -183,31 +233,61 @@ static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
struct drbd_work {
struct list_head list;
- int (*cb)(struct drbd_work *, int cancel);
+ int (*cb)(struct drbd_work *w, int cancel);
};
-struct drbd_device_work {
+struct drbd_peer_device_work {
struct drbd_work w;
- struct drbd_device *device;
+ struct drbd_peer_device *peer_device;
};
-#include "drbd_interval.h"
-
-extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+enum drbd_stream;
-extern void lock_all_resources(void);
-extern void unlock_all_resources(void);
+#include "drbd_interval.h"
+void lock_all_resources(void);
+void unlock_all_resources(void);
+
+enum drbd_disk_state disk_state_from_md(struct drbd_device *device);
+bool want_bitmap(struct drbd_peer_device *peer_device);
+long twopc_timeout(struct drbd_resource *resource);
+long twopc_retry_timeout(struct drbd_resource *resource, int retries);
+void twopc_connection_down(struct drbd_connection *connection);
+u64 directly_connected_nodes(struct drbd_resource *resource,
+ enum which_state which);
+
+/* sequence arithmetic for dagtag (data generation tag) sector numbers.
+ * dagtag_newer_eq: true, if a is newer than b */
+#define dagtag_newer_eq(a, b) \
+ (typecheck(u64, a) && \
+ typecheck(u64, b) && \
+ ((s64)(a) - (s64)(b) >= 0))
+
+#define dagtag_newer(a, b) \
+ (typecheck(u64, a) && \
+ typecheck(u64, b) && \
+ ((s64)(a) - (s64)(b) > 0))
+
+/* An application I/O request.
+ *
+ * Fields marked as "immutable" may only be modified when the request is
+ * exclusively owned, e.g. when the request is created or is being retried.
+ */
struct drbd_request {
- struct drbd_work w;
+ /* "immutable" */
struct drbd_device *device;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
- * see drbd_request_endio(). */
+ * see drbd_request_endio().
+ *
+ * Only accessed by app/submitter/endio - strictly sequential,
+ * no serialization required. */
struct bio *private_bio;
+ /* Fields sector and size are "immutable". Other fields protected
+ * by interval_lock. */
struct drbd_interval i;
/* epoch: used to check on "completion" whether this req was in
@@ -217,96 +297,152 @@ struct drbd_request {
* This corresponds to "barrier" in struct p_barrier[_ack],
* and to "barrier_nr" in struct drbd_epoch (and various
* comments/function parameters/local variable names).
+ *
+ * "immutable"
*/
unsigned int epoch;
- struct list_head tl_requests; /* ring list in the transfer log */
- struct bio *master_bio; /* master bio pointer */
+ /* Position of this request in the serialized per-resource change
+ * stream. Can be used to serialize with other events when
+ * communicating the change stream via multiple connections.
+ * Assigned from device->resource->dagtag_sector.
+ *
+ * Given that some IO backends write several GB per second meanwhile,
+ * lets just use a 64bit sequence space.
+ *
+ * "immutable"
+ */
+ u64 dagtag_sector;
+
+ /* list entry in transfer log (protected by RCU) */
+ struct list_head tl_requests;
+
+ /* list entry in submitter lists, peer ack list, or retry lists;
+ * protected by the locks for those lists */
+ struct list_head list;
+
+ /* master bio pointer; "immutable" */
+ struct bio *master_bio;
/* see struct drbd_device */
struct list_head req_pending_master_completion;
struct list_head req_pending_local;
- /* for generic IO accounting */
+ /* for generic IO accounting; "immutable" */
unsigned long start_jif;
- /* for DRBD internal statistics */
+ /* for request_timer_fn() */
+ unsigned long pre_submit_jif;
+ unsigned long pre_send_jif[DRBD_PEERS_MAX];
- /* Minimal set of time stamps to determine if we wait for activity log
- * transactions, local disk or peer. 32 bit "jiffies" are good enough,
- * we don't expect a DRBD request to be stalled for several month.
- */
+#ifdef CONFIG_DRBD_TIMING_STATS
+ /* for DRBD internal statistics */
+ ktime_t start_kt;
/* before actual request processing */
- unsigned long in_actlog_jif;
+ ktime_t in_actlog_kt;
/* local disk */
- unsigned long pre_submit_jif;
+ ktime_t pre_submit_kt;
/* per connection */
- unsigned long pre_send_jif;
- unsigned long acked_jif;
- unsigned long net_done_jif;
-
+ ktime_t pre_send_kt[DRBD_PEERS_MAX];
+ ktime_t acked_kt[DRBD_PEERS_MAX];
+ ktime_t net_done_kt[DRBD_PEERS_MAX];
+#endif
/* Possibly even more detail to track each phase:
- * master_completion_jif
+ * master_completion_kt
* how long did it take to complete the master bio
* (application visible latency)
- * allocated_jif
+ * allocated_kt
* how long the master bio was blocked until we finally allocated
* a tracking struct
- * in_actlog_jif
+ * in_actlog_kt
* how long did we wait for activity log transactions
*
- * net_queued_jif
+ * net_queued_kt
* when did we finally queue it for sending
- * pre_send_jif
+ * pre_send_kt
* when did we start sending it
- * post_send_jif
+ * post_send_kt
* how long did we block in the network stack trying to send it
- * acked_jif
+ * acked_kt
* when did we receive (or fake, in protocol A) a remote ACK
- * net_done_jif
+ * net_done_kt
* when did we receive final acknowledgement (P_BARRIER_ACK),
* or decide, e.g. on connection loss, that we do no longer expect
* anything from this peer for this request.
*
- * pre_submit_jif
- * post_sub_jif
+ * pre_submit_kt
+ * post_sub_kt
* when did we start submiting to the lower level device,
* and how long did we block in that submit function
- * local_completion_jif
+ * local_completion_kt
* how long did it take the lower level device to complete this request
*/
/* once it hits 0, we may complete the master_bio */
atomic_t completion_ref;
+ /* once it hits 0, we may remove the request from the interval tree and activity log */
+ refcount_t done_ref;
+ /* once it hits 0, we may remove from transfer log and send a corresponding peer ack */
+ refcount_t oos_send_ref;
/* once it hits 0, we may destroy this drbd_request object */
struct kref kref;
- unsigned rq_state; /* see comments above _req_mod() */
+ /* Creates a dependency chain between writes so that we know that a
+ * peer ack can be sent when done_ref reaches zero.
+ *
+ * If not NULL, when this drbd_request is done, one done_ref reference
+ * of ->done_next will be put.
+ *
+ * "immutable" */
+ struct drbd_request *next_write;
+
+ /* lock to protect state flags */
+ spinlock_t rq_lock;
+ unsigned int local_rq_state;
+ u16 net_rq_state[DRBD_NODE_ID_MAX];
+
+ /* for reclaim from transfer log */
+ struct rcu_head rcu;
+};
+
+/* Used to multicast peer acks. */
+struct drbd_peer_ack {
+ struct drbd_resource *resource;
+ struct list_head list;
+ /*
+ * Keeps track of which connections have not yet processed this peer
+ * ack. Peer acks are queued for connections on which they are not sent
+ * so that last_peer_ack_dagtag_seen is updated at the correct moment.
+ */
+ u64 queued_mask;
+ u64 pending_mask; /* Peer ack is sent to these nodes */
+ u64 mask; /* Nodes which successfully wrote the requests covered by this peer ack */
+ u64 dagtag_sector;
};
+/* Tracks received writes grouped in epochs. Protected by epoch_lock. */
struct drbd_epoch {
struct drbd_connection *connection;
+ struct drbd_peer_request *oldest_unconfirmed_peer_req;
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
atomic_t active; /* increased on every req. added, and dec on every finished. */
+ atomic_t confirmed; /* adjusted for every P_CONFIRM_STABLE */
unsigned long flags;
};
/* drbd_epoch flag bits */
enum {
+ DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
+ DE_BARRIER_IN_NEXT_EPOCH_DONE,
+ DE_CONTAINS_A_BARRIER,
DE_HAVE_BARRIER_NUMBER,
-};
-
-enum epoch_event {
- EV_PUT,
- EV_GOT_BARRIER_NR,
- EV_BECAME_LAST,
- EV_CLEANUP = 32, /* used as flag */
+ DE_IS_FINISHING,
};
struct digest_info {
@@ -317,23 +453,36 @@ struct digest_info {
struct drbd_peer_request {
struct drbd_work w;
struct drbd_peer_device *peer_device;
- struct drbd_epoch *epoch; /* for writes */
- struct page *pages;
- blk_opf_t opf;
+ struct list_head recv_order; /* see peer_requests, peer_reads, resync_requests */
+
+ union {
+ struct { /* read requests */
+ unsigned int depend_dagtag_node_id;
+ u64 depend_dagtag;
+ };
+ struct { /* resync target requests */
+ unsigned int requested_size;
+ };
+ };
+
+ struct bio_list bios;
atomic_t pending_bios;
struct drbd_interval i;
- /* see comments on ee flag bits below */
- unsigned long flags;
- unsigned long submit_jif;
+ unsigned long flags; /* see comments on ee flag bits below */
union {
- u64 block_id;
- struct digest_info *digest;
+ struct { /* regular peer_request */
+ struct drbd_epoch *epoch; /* for writes */
+ unsigned long submit_jif;
+ u64 block_id;
+ struct digest_info *digest;
+ u64 dagtag_sector;
+ };
+ struct { /* reused object for sending OOS to other nodes */
+ u64 send_oos_pending;
+ };
};
};
-/* Equivalent to bio_op and req_op. */
-#define peer_req_op(peer_req) \
- ((peer_req)->opf & REQ_OP_MASK)
/* ee flag bits.
* While corresponding bios are in flight, the only modification will be
@@ -342,9 +491,19 @@ struct drbd_peer_request {
* non-atomic modification to ee->flags is ok.
*/
enum {
- __EE_CALL_AL_COMPLETE_IO,
+ /* If successfully written,
+ * we may clear the corresponding out-of-sync bits */
__EE_MAY_SET_IN_SYNC,
+ /* Peer did not write this one, we must set-out-of-sync
+ * before actually submitting ourselves */
+ __EE_SET_OUT_OF_SYNC,
+
+ /* This peer request closes an epoch using a barrier.
+ * On successful completion, the epoch is released,
+	 * and the P_BARRIER_ACK sent. */
+ __EE_IS_BARRIER,
+
/* is this a TRIM aka REQ_OP_DISCARD? */
__EE_TRIM,
/* explicit zero-out requested, or
@@ -364,125 +523,201 @@ enum {
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
- /* Conflicting local requests need to be restarted after this request */
- __EE_RESTART_REQUESTS,
-
/* The peer wants a write ACK for this (wire proto C) */
__EE_SEND_WRITE_ACK,
- /* Is set when net_conf had two_primaries set while creating this peer_req */
- __EE_IN_INTERVAL_TREE,
-
- /* for debugfs: */
- /* has this been submitted, or does it still wait for something else? */
- __EE_SUBMITTED,
-
- /* this is/was a write request */
- __EE_WRITE,
-
/* hand back using mempool_free(e, drbd_buffer_page_pool) */
__EE_RELEASE_TO_MEMPOOL,
/* this is/was a write same request */
__EE_WRITE_SAME,
- /* this originates from application on peer
- * (not some resync or verify or other DRBD internal request) */
- __EE_APPLICATION,
-
- /* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
+ /* On target: Send P_RS_THIN_REQ.
+ * On source: If it contains only 0 bytes, send back P_RS_DEALLOCATED. */
__EE_RS_THIN_REQ,
+
+ /* Hold reference in activity log */
+ __EE_IN_ACTLOG,
+
+ /* SyncTarget: This is the last resync request. */
+ __EE_LAST_RESYNC_REQUEST,
+
+ /* This peer_req->recv_order is on some list */
+ __EE_ON_RECV_ORDER,
};
-#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_SET_OUT_OF_SYNC (1<<__EE_SET_OUT_OF_SYNC)
+#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
#define EE_TRIM (1<<__EE_TRIM)
#define EE_ZEROOUT (1<<__EE_ZEROOUT)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
-#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
-#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
-#define EE_SUBMITTED (1<<__EE_SUBMITTED)
-#define EE_WRITE (1<<__EE_WRITE)
#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
-#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
+#define EE_IN_ACTLOG (1<<__EE_IN_ACTLOG)
+#define EE_LAST_RESYNC_REQUEST (1<<__EE_LAST_RESYNC_REQUEST)
+#define EE_ON_RECV_ORDER (1<<__EE_ON_RECV_ORDER)
+
+#define REQ_NO_BIO (REQ_OP_DRV_OUT) /* exception for drbd_alloc_peer_request(), DRBD private */
/* flag bits per device */
-enum {
- UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
+enum device_flag {
MD_DIRTY, /* current uuids and flags not yet on disk */
- USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
- CL_ST_CHG_SUCCESS,
- CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
* Gets cleared when the state.conn
- * goes into C_CONNECTED state. */
- CONSIDER_RESYNC,
-
- MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
-
- BITMAP_IO, /* suspend application io;
- once no more io in flight, start bitmap io */
- BITMAP_IO_QUEUED, /* Started bitmap IO */
- WAS_IO_ERROR, /* Local disk failed, returned IO error */
- WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
+ * goes into L_ESTABLISHED state. */
+ MD_NO_FUA, /* meta data device does not support barriers,
+ so don't even try */
FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
- RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
- RESIZE_PENDING, /* Size change detected locally, waiting for the response from
- * the peer, if it changed there as well. */
- NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ ABORT_MDIO, /* Interrupt ongoing meta-data I/O */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO or issuing local IO */
+ __NEW_CUR_UUID, /* Set NEW_CUR_UUID as soon as state change visible */
+ WRITING_NEW_CUR_UUID, /* Set while the new current ID gets generated. */
AL_SUSPENDED, /* Activity logging is currently suspended. */
- AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
- B_RS_H_DONE, /* Before resync handler done (already executed) */
- DISCARD_MY_DATA, /* discard_my_data flag per volume */
- READ_BALANCE_RR,
-
+ UNREGISTERED,
FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush
* from drbd_flush_after_epoch() */
/* cleared only after backing device related structures have been destroyed. */
- GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */
+ GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */
/* to be used in drbd_device_post_work() */
- GO_DISKLESS, /* tell worker to schedule cleanup before detach */
- DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */
+ GO_DISKLESS, /* tell worker to schedule cleanup before detach */
MD_SYNC, /* tell worker to call drbd_md_sync() */
+ MAKE_NEW_CUR_UUID, /* tell worker to ping peers and eventually write new current uuid */
+
+ STABLE_RESYNC, /* One peer_device finished the resync stable! */
+ READ_BALANCE_RR,
+ PRIMARY_LOST_QUORUM,
+ TIEBREAKER_QUORUM, /* Tiebreaker keeps quorum; used to avoid too verbose logging */
+ DESTROYING_DEV,
+ TRY_TO_GET_RESYNC,
+ OUTDATE_ON_2PC_COMMIT,
+ RESTORE_QUORUM, /* Restore quorum when we have the same members as before */
+ RESTORING_QUORUM, /* sanitize_state() -> finish_state_change() */
+ LEGACY_84_MD,
+ BDEV_FROZEN, /* called bdev_freeze(), needs bdev_thaw() on resume-io */
+};
+
+/* flag bits per peer device */
+enum peer_device_flag {
+ CONSIDER_RESYNC,
+ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
+ RESIZE_PENDING, /* Size change detected locally, waiting for the response from
+ * the peer, if it changed there as well. */
RS_START, /* tell worker to start resync/OV */
RS_PROGRESS, /* tell worker that resync made significant progress */
+ RS_LAZY_BM_WRITE, /* -"- and bitmap writeout should be efficient now */
RS_DONE, /* tell worker that resync is done */
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
+ INITIAL_STATE_SENT,
+ INITIAL_STATE_RECEIVED,
+ RECONCILIATION_RESYNC,
+ UNSTABLE_RESYNC, /* Sync source went unstable during resync. */
+ SEND_STATE_AFTER_AHEAD,
+ GOT_NEG_ACK, /* got a neg_ack while primary, wait until peer_disk is lower than
+ D_UP_TO_DATE before becoming secondary! */
+ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
+ SYNC_TARGET_TO_BEHIND, /* SyncTarget, wait for Behind */
+ HANDLING_CONGESTION, /* Set while testing for congestion and handling it */
+ HANDLE_CONGESTION, /* tell worker to change state due to congestion */
+ HOLDING_UUID_READ_LOCK, /* did a down_read(&device->uuid_sem) */
+	RS_SOURCE_MISSED_END, /* SyncSource did not get P_UUIDS110 */
+	RS_PEER_MISSED_END, /* Peer (which was SyncSource) did not get P_UUIDS110 after resync */
+ SYNC_SRC_CRASHED_PRI, /* Source of this resync was a crashed primary */
+ HAVE_SIZES, /* Cleared when connection gets lost; set when sizes received */
+ UUIDS_RECEIVED, /* Have recent UUIDs from the peer */
+ CURRENT_UUID_RECEIVED, /* Got a p_current_uuid packet */
+ PEER_QUORATE, /* Peer has quorum */
+ RS_REQUEST_UNSUCCESSFUL, /* Some resync request was unsuccessful in current cycle */
+ REPLICATION_NEXT, /* If unset, do not replicate writes when next Inconsistent */
+ PEER_REPLICATION_NEXT, /* We have instructed peer not to replicate writes */
};
-struct drbd_bitmap; /* opaque for drbd_device */
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
/* definition of bits in bm_flags to be used in drbd_bm_lock
* and drbd_bitmap_io and friends. */
enum bm_flag {
- /* currently locked for bulk operation */
- BM_LOCKED_MASK = 0xf,
-
- /* in detail, that is: */
- BM_DONT_CLEAR = 0x1,
- BM_DONT_SET = 0x2,
- BM_DONT_TEST = 0x4,
+ /*
+ * The bitmap can be locked to prevent others from clearing, setting,
+ * and/or testing bits. The following combinations of lock flags make
+ * sense:
+ *
+ * BM_LOCK_CLEAR,
+ * BM_LOCK_SET, | BM_LOCK_CLEAR,
+ * BM_LOCK_TEST | BM_LOCK_SET | BM_LOCK_CLEAR.
+ */
- /* so we can mark it locked for bulk operation,
- * and still allow all non-bulk operations */
- BM_IS_LOCKED = 0x8,
+ BM_LOCK_TEST = 0x1,
+ BM_LOCK_SET = 0x2,
+ BM_LOCK_CLEAR = 0x4,
+ BM_LOCK_BULK = 0x8, /* locked for bulk operation, allow all non-bulk operations */
- /* (test bit, count bit) allowed (common case) */
- BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+ BM_LOCK_ALL = BM_LOCK_TEST | BM_LOCK_SET | BM_LOCK_CLEAR | BM_LOCK_BULK,
- /* testing bits, as well as setting new bits allowed, but clearing bits
- * would be unexpected. Used during bitmap receive. Setting new bits
- * requires sending of "out-of-sync" information, though. */
- BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+ BM_LOCK_SINGLE_SLOT = 0x10,
+ BM_ON_DAX_PMEM = 0x10000,
+};
- /* for drbd_bm_write_copy_pages, everything is allowed,
- * only concurrent bulk operations are locked out. */
- BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+struct drbd_bitmap {
+ union {
+ struct page **bm_pages;
+ void *bm_on_pmem;
+ };
+ spinlock_t bm_lock; /* fine-grain lock (TODO: per slot) */
+ spinlock_t bm_all_slots_lock; /* all bitmap slots lock */
+
+ unsigned long bm_set[DRBD_PEERS_MAX]; /* number of bits set */
+ unsigned long bm_bits; /* bits per peer */
+ unsigned long bm_bits_4k; /* bits per peer, if we had bm_block_size of 4k */
+	size_t bm_words; /* platform specific word size; not 32bit!! */
+ size_t bm_number_of_pages;
+ sector_t bm_dev_capacity;
+ struct mutex bm_change; /* serializes resize operations */
+
+ wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+ enum bm_flag bm_flags;
+ unsigned int bm_max_peers;
+ unsigned int bm_block_shift; /* ln2 of bytes per bit for this bitmap */
+
+ /* exclusively to be used by __al_write_transaction(),
+ * and drbd_bm_write_hinted() -> bm_rw() called from there.
+ * One activity log extent represents 4MB of storage, which are 1024
+ * bits (at 4k per bit), times at most DRBD_PEERS_MAX (currently 32).
+ * The bitmap is created interleaved, with a potentially odd number
+ * of peer slots determined at create-md time. Which means that one
+ * AL-extent may be associated with one or two bitmap pages.
+ */
+ unsigned int n_bitmap_hints;
+ unsigned int al_bitmap_hints[2*AL_UPDATES_PER_TRANSACTION];
+
+ /* debugging aid, in case we are still racy somewhere */
+ const char *bm_why;
+ char bm_task_comm[TASK_COMM_LEN];
+ pid_t bm_task_pid;
+ struct drbd_peer_device *bm_locked_peer;
};
struct drbd_work_queue {
@@ -491,29 +726,37 @@ struct drbd_work_queue {
wait_queue_head_t q_wait;
};
-struct drbd_socket {
- struct mutex mutex;
- struct socket *socket;
- /* this way we get our
- * send/receive buffers off the stack */
- void *sbuf;
- void *rbuf;
+struct drbd_peer_md {
+ u64 bitmap_uuid;
+ u64 bitmap_dagtag;
+ u32 flags;
+ s32 bitmap_index;
};
struct drbd_md {
u64 md_offset; /* sector offset to 'super' block */
- u64 la_size_sect; /* last agreed size, unit sectors */
+ u64 effective_size; /* last agreed size (sectors) */
+ u64 prev_members; /* read from the meta-data */
+ u64 members; /* current member mask for writing meta-data */
spinlock_t uuid_lock;
- u64 uuid[UI_SIZE];
+ u64 current_uuid;
u64 device_uuid;
u32 flags;
+ s32 node_id;
u32 md_size_sect;
s32 al_offset; /* signed relative sector offset to activity log */
s32 bm_offset; /* signed relative sector offset to bitmap */
- /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ u32 max_peers;
+ u32 bm_block_size;
+ u32 bm_block_shift; /* ilog2(bm_block_size) */
+
+ struct drbd_peer_md peers[DRBD_NODE_ID_MAX];
+ u64 history_uuids[HISTORY_UUIDS];
+
+ /* cached value of bdev->disk_conf->meta_dev_idx */
s32 meta_dev_idx;
/* see al_tr_number_to_on_disk_sector() */
@@ -528,8 +771,13 @@ struct drbd_backing_dev {
struct block_device *md_bdev;
struct file *f_md_bdev;
struct drbd_md md;
- struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
+ struct disk_conf __rcu *disk_conf; /* RCU, for updates: resource->conf_update */
sector_t known_size; /* last known size of that backing device */
+#if IS_ENABLED(CONFIG_DEV_DAX_PMEM)
+ struct dax_device *dax_dev;
+ struct meta_data_on_disk_9 *md_on_pmem; /* address of md_offset */
+ struct al_on_pmem *al_on_pmem;
+#endif
};
struct drbd_md_io {
@@ -544,43 +792,151 @@ struct drbd_md_io {
struct bm_io_work {
struct drbd_work w;
+ struct drbd_device *device;
struct drbd_peer_device *peer_device;
char *why;
enum bm_flag flags;
- int (*io_fn)(struct drbd_device *device, struct drbd_peer_device *peer_device);
- void (*done)(struct drbd_device *device, int rv);
+ int (*io_fn)(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+ void (*done)(struct drbd_device *device,
+ struct drbd_peer_device *peer_device,
+ int rv);
};
struct fifo_buffer {
+ /* singly linked list to accumulate multiple such struct fifo_buffers,
+	 * to be freed after a single synchronize_rcu(),
+ * outside a critical section. */
+ struct fifo_buffer *next;
unsigned int head_index;
unsigned int size;
int total; /* sum of all values */
int values[] __counted_by(size);
};
-extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size);
+struct fifo_buffer *fifo_alloc(unsigned int fifo_size);
/* flag bits per connection */
-enum {
- NET_CONGESTED, /* The data socket is congested */
- RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
- SEND_PING,
- GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
- CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
- CONN_WD_ST_CHG_OKAY,
- CONN_WD_ST_CHG_FAIL,
+enum connection_flag {
+	PING_PENDING, /* cleared upon receiving a ping_ack packet, wakes state_wait */
+ TWOPC_PREPARED,
+ TWOPC_YES,
+ TWOPC_NO,
+ TWOPC_RETRY,
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
- CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
- STATE_SENT, /* Do not change state/UUIDs while this is set */
+ DISCONNECT_EXPECTED,
+ BARRIER_ACK_PENDING,
+ CORKED,
+ DATA_CORKED = CORKED, /* used as computed value CORKED + DATA_STREAM */
+ CONTROL_CORKED, /* used as computed value CORKED + CONTROL_STREAM */
+ C_UNREGISTERED,
+ RECONNECT,
+ CONN_DISCARD_MY_DATA,
+ SEND_STATE_AFTER_AHEAD_C,
+ NOTIFY_PEERS_LOST_PRIMARY,
+	CHECKING_PEER, /* used by make_new_current_uuid() to check liveliness */
+ CONN_CONGESTED,
+ CONN_HANDSHAKE_DISCONNECT,
+ CONN_HANDSHAKE_RETRY,
+ CONN_HANDSHAKE_READY,
+ RECEIVED_DAGTAG, /* Whether we received any write or dagtag since connecting. */
+ PING_TIMEOUT_ACTIVE,
+};
+
+/* flag bits per resource */
+enum resource_flag {
+ EXPLICIT_PRIMARY,
CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
* pending, from drbd worker context.
*/
- DISCONNECT_SENT,
+ TWOPC_ABORT_LOCAL,
+ TWOPC_WORK_PENDING, /* Set while work for sending reply is scheduled */
+	TWOPC_EXECUTED, /* Committed or aborted */
+ TWOPC_STATE_CHANGE_PENDING, /* set between sending commit and changing local state */
+
+ TRY_BECOME_UP_TO_DATE_PENDING,
DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
+ PEER_DEVICE_WORK_PENDING,/* tell worker that some peer_device has pending work */
+
+ /* to be used in drbd_post_work() */
+ R_UNREGISTERED,
+ DOWN_IN_PROGRESS,
+ CHECKING_PEERS,
+ WRONG_MDF_EXISTS, /* Warned about MDF_EXISTS flag on all peer slots */
+ TWOPC_RECV_SIZES_ERR, /* Error processing sizes packet during 2PC connect */
};
enum which_state { NOW, OLD = NOW, NEW };
+enum twopc_type {
+ TWOPC_STATE_CHANGE,
+ TWOPC_RESIZE,
+};
+
+struct twopc_reply {
+ int vnr;
+ unsigned int tid; /* transaction identifier */
+ int initiator_node_id; /* initiator of the transaction */
+ int target_node_id; /* target of the transaction (or -1) */
+ u64 target_reachable_nodes; /* behind the target node */
+ u64 reachable_nodes; /* behind other nodes */
+ union {
+ struct { /* type == TWOPC_STATE_CHANGE */
+ u64 primary_nodes;
+ u64 weak_nodes;
+ };
+ struct { /* type == TWOPC_RESIZE */
+ u64 diskful_primary_nodes;
+ u64 max_possible_size;
+ };
+ };
+ unsigned int is_disconnect:1;
+ unsigned int is_connect:1;
+ unsigned int is_aborted:1;
+ /* Whether the state change on receiving the twopc failed. When this is
+ * a twopc for transitioning to C_CONNECTED, we cannot immediately
+ * reply with P_TWOPC_NO. The state handshake must complete first to
+ * decide the appropriate reply. */
+ unsigned int state_change_failed:1;
+};
+
+struct twopc_request {
+ u64 nodes_to_reach;
+ enum drbd_packet cmd;
+ unsigned int tid;
+ int initiator_node_id;
+ int target_node_id;
+ int vnr;
+ u32 flags;
+};
+
+struct drbd_thread_timing_details {
+ unsigned long start_jif;
+ void *cb_addr;
+ const char *caller_fn;
+ unsigned int line;
+ unsigned int cb_nr;
+};
+#define DRBD_THREAD_DETAILS_HIST 16
+
+struct drbd_send_buffer {
+ struct page *page; /* current buffer page for sending data */
+ char *unsent; /* start of unsent area != pos if corked... */
+ char *pos; /* position within that page */
+ int allocated_size; /* currently allocated space */
+ int additional_size; /* additional space to be added to next packet's size */
+};
+
+struct drbd_mutable_buffer {
+ u8 *buffer;
+ unsigned int avail;
+};
+
+enum drbd_per_resource_ratelimit {
+ D_RL_R_NOLIMIT = -1,
+ D_RL_R_GENERIC,
+};
+
struct drbd_resource {
char *name;
#ifdef CONFIG_DEBUG_FS
@@ -588,32 +944,141 @@ struct drbd_resource {
struct dentry *debugfs_res_volumes;
struct dentry *debugfs_res_connections;
struct dentry *debugfs_res_in_flight_summary;
+ struct dentry *debugfs_res_state_twopc;
+ struct dentry *debugfs_res_worker_pid;
+ struct dentry *debugfs_res_members;
#endif
struct kref kref;
- struct idr devices; /* volume number to device mapping */
+
+ /* Volume number to device mapping. Updates protected by conf_update. */
+ struct idr devices;
+
+ struct ratelimit_state ratelimit[1];
+
+ /* RCU list. Updates protected by adm_mutex, conf_update and state_rwlock. */
struct list_head connections;
- struct list_head resources;
+
+ struct list_head resources; /* list entry in global resources list */
struct res_opts res_opts;
- struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ int max_node_id;
+ /*
+ * For read-copy-update of net_conf and disk_conf and devices,
+ * connection, peer_devices and paths lists.
+ */
+ struct mutex conf_update;
struct mutex adm_mutex; /* mutex to serialize administrative requests */
- spinlock_t req_lock;
+ struct mutex open_release; /* serialize open/release */
+ struct {
+ char comm[TASK_COMM_LEN];
+ unsigned int minor;
+ pid_t pid;
+ ktime_t opened;
+ } auto_promoted_by;
+
+ rwlock_t state_rwlock; /* serialize state changes */
+ u64 dagtag_sector; /* Protected by tl_update_lock.
+ * See also dagtag_sector in
+ * &drbd_request */
+ u64 dagtag_from_backing_dev;
+ u64 dagtag_before_attach;
+ u64 members; /* mask of online nodes */
+ unsigned long flags;
+
+ /* Protects updates to the transfer log and related counters. */
+ spinlock_t tl_update_lock;
+ struct list_head transfer_log; /* all requests not yet fully processed */
+ struct drbd_request *tl_previous_write;
+
+ spinlock_t peer_ack_lock;
+ struct list_head peer_ack_req_list; /* requests to send peer acks for */
+ struct list_head peer_ack_list; /* peer acks to send */
+ struct drbd_work peer_ack_work;
+ u64 last_peer_acked_dagtag; /* dagtag of last PEER_ACK'ed request */
+ struct drbd_request *peer_ack_req; /* last request not yet PEER_ACK'ed */
- unsigned susp:1; /* IO suspended by user */
- unsigned susp_nod:1; /* IO suspended because no data */
- unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+ /* Protects current_flush_sequence and pending_flush_mask (connection) */
+ spinlock_t initiator_flush_lock;
+ u64 current_flush_sequence;
+
+ struct semaphore state_sem;
+ wait_queue_head_t state_wait; /* upon each state change. */
+ enum chg_state_flags state_change_flags;
+ const char **state_change_err_str;
+ bool remote_state_change; /* remote state change in progress */
+ enum drbd_packet twopc_prepare_reply_cmd; /* this node's answer to the prepare phase or 0 */
+ u64 twopc_parent_nodes;
+ struct twopc_reply twopc_reply;
+ struct timer_list twopc_timer;
+ struct work_struct twopc_work;
+ wait_queue_head_t twopc_wait;
+ struct {
+ enum twopc_type type;
+ union {
+ struct twopc_resize {
+ int dds_flags; /* from prepare phase */
+ sector_t user_size; /* from prepare phase */
+ u64 diskful_primary_nodes; /* added in commit phase */
+ u64 new_size; /* added in commit phase */
+ } resize;
+ struct twopc_state_change {
+ union drbd_state mask; /* from prepare phase */
+ union drbd_state val; /* from prepare phase */
+ u64 primary_nodes; /* added in commit phase */
+ u64 reachable_nodes; /* added in commit phase */
+ } state_change;
+ };
+ } twopc;
+ enum drbd_role role[2];
+ bool susp_user[2]; /* IO suspended by user */
+ bool susp_nod[2]; /* IO suspended because no data */
+ bool susp_quorum[2]; /* IO suspended because no quorum */
+ bool susp_uuid[2]; /* IO suspended because waiting new current UUID */
+ bool fail_io[2]; /* Fail all IO requests because forced a demote */
+ bool cached_susp; /* cached result of looking at all different suspend bits */
+ bool cached_all_devices_have_quorum;
enum write_ordering_e write_ordering;
+ /* Protects the current transfer log (tle) fields. */
+ spinlock_t current_tle_lock;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned cached_min_aggreed_protocol_version;
+
cpumask_var_t cpu_mask;
+
+ struct drbd_work_queue work;
+ struct drbd_thread worker;
+
+ struct list_head listeners;
+ spinlock_t listeners_lock;
+
+ struct timer_list peer_ack_timer; /* send a P_PEER_ACK after last completion */
+
+ unsigned int w_cb_nr; /* keeps counting up */
+ struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+ wait_queue_head_t barrier_wait; /* upon each state change. */
+ struct rcu_head rcu;
+
+ struct list_head suspended_reqs;
+ /*
+ * The side effects of an empty state change two-phase commit are:
+ *
+ * * A local consistent disk can upgrade to up-to-date when no primary is reachable
+ * (or become outdated if the prepare packets reach a primary).
+ *
+	 * * resource->members is updated
+ *
+ * * Faraway nodes might outdate themselves if they learn about the existence of a primary
+ * (with access to data) node.
+ */
+ struct work_struct empty_twopc;
};
-struct drbd_thread_timing_details
-{
- unsigned long start_jif;
- void *cb_addr;
- const char *caller_fn;
- unsigned int line;
- unsigned int cb_nr;
+enum drbd_per_connection_ratelimit {
+ D_RL_C_NOLIMIT = -1,
+ D_RL_C_GENERIC,
};
struct drbd_connection {
@@ -623,36 +1088,49 @@ struct drbd_connection {
struct dentry *debugfs_conn;
struct dentry *debugfs_conn_callback_history;
struct dentry *debugfs_conn_oldest_requests;
+ struct dentry *debugfs_conn_transport;
+ struct dentry *debugfs_conn_debug;
+ struct dentry *debugfs_conn_receiver_pid;
+ struct dentry *debugfs_conn_sender_pid;
#endif
struct kref kref;
struct idr peer_devices; /* volume number to peer device mapping */
- enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
- struct mutex cstate_mutex; /* Protects graceful disconnects */
- unsigned int connect_cnt; /* Inc each time a connection is established */
+ enum drbd_conn_state cstate[2];
+ enum drbd_role peer_role[2];
+ bool susp_fen[2]; /* IO suspended because fence peer handler runs */
+
+ struct ratelimit_state ratelimit[1];
unsigned long flags;
- struct net_conf *net_conf; /* content protected by rcu */
- wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+ enum drbd_fencing_policy fencing_policy;
- struct sockaddr_storage my_addr;
- int my_addr_len;
- struct sockaddr_storage peer_addr;
- int peer_addr_len;
+ struct drbd_send_buffer send_buffer[2];
+ struct mutex mutex[2]; /* Protect assembling of new packet until sending it (in send_buffer) */
+ /* scratch buffers for use while "owning" the DATA_STREAM send_buffer,
+ * to avoid larger on-stack temporary variables,
+ * introduced for holding digests in drbd_send_dblock() */
+ union {
+		/* MAX_DIGEST_SIZE in the linux kernel at this point is 64 bytes, afaik */
+ struct {
+ char before[64];
+ char after[64];
+ } d;
+ } scratch_buffer;
- struct drbd_socket data; /* data/barrier/cstate/parameter packets */
- struct drbd_socket meta; /* ping/ack (metadata) packets */
int agreed_pro_version; /* actually used protocol version */
u32 agreed_features;
- unsigned long last_received; /* in jiffies, either socket */
- unsigned int ko_count;
+ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+ atomic_t rs_in_flight; /* Resync sectors in flight */
- struct list_head transfer_log; /* all requests not yet fully processed */
+ struct drbd_work connect_timer_work;
+ struct timer_list connect_timer;
struct crypto_shash *cram_hmac_tfm;
- struct crypto_shash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */
+ struct crypto_shash *integrity_tfm; /* checksums we compute, updates protected by connection->mutex[DATA_STREAM] */
struct crypto_shash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
struct crypto_shash *csums_tfm;
struct crypto_shash *verify_tfm;
+
void *int_dig_in;
void *int_dig_vv;
@@ -660,35 +1138,137 @@ struct drbd_connection {
struct drbd_epoch *current_epoch;
spinlock_t epoch_lock;
unsigned int epochs;
- atomic_t current_tle_nr; /* transfer log epoch number */
- unsigned current_tle_writes; /* writes seen within this tl epoch */
unsigned long last_reconnect_jif;
/* empty member on older kernels without blk_start_plug() */
struct blk_plug receiver_plug;
struct drbd_thread receiver;
- struct drbd_thread worker;
- struct drbd_thread ack_receiver;
+ struct drbd_thread sender;
struct workqueue_struct *ack_sender;
+ struct work_struct peer_ack_work;
+
+ /* Work for sending P_OUT_OF_SYNC due to P_PEER_ACK */
+ struct drbd_work send_oos_work;
+ /*
+ * These peers have sent us a P_PEER_ACK for which we need to send
+ * P_OUT_OF_SYNC on this connection.
+ */
+ unsigned long send_oos_from_mask;
+
+ atomic64_t last_dagtag_sector;
+ /* Record of last peer ack to determine whether we can ack flush */
+ u64 last_peer_ack_dagtag_seen;
+
+ /* Mask of nodes from which we are waiting for a flush ack corresponding to this Primary */
+ u64 pending_flush_mask;
+
+ /* Protects the flush members below for this connection */
+ spinlock_t primary_flush_lock;
+ /* For handling P_FLUSH_REQUESTS from this peer */
+ u64 flush_requests_dagtag;
+ u64 flush_sequence;
+ u64 flush_forward_sent_mask;
+
+ /* For handling forwarded flushes. On connection to initiator node. */
+ spinlock_t flush_ack_lock;
+ struct drbd_work flush_ack_work;
+ /* For forwarded flushes. On connection to initiator node. Indexed by primary node ID */
+ u64 flush_ack_sequence[DRBD_PEERS_MAX];
+
+ atomic_t active_ee_cnt; /* Peer write requests waiting for activity log or backing disk. */
+ atomic_t backing_ee_cnt; /* Other peer requests waiting for conflicts or backing disk. */
+ atomic_t done_ee_cnt;
+ spinlock_t peer_reqs_lock;
+ spinlock_t send_oos_lock; /* Protects send_oos list */
+
+ /* Lists using drbd_peer_request.recv_order (see also drbd_peer_device.resync_requests) */
+ struct list_head peer_requests; /* All peer writes in the order we received them */
+ struct list_head peer_reads; /* All reads in the order we received them */
+ /*
+ * Peer writes for which we need to send some P_OUT_OF_SYNC. These peer
+ * writes continue to be stored on the connection over which the writes
+ * and the P_PEER_ACK are received. They are accessed by the sender for
+ * each relevant peer. Protected by send_oos_lock on this connection.
+ */
+ struct list_head send_oos;
+
+ /* Lists using drbd_peer_request.w.list */
+ struct list_head done_ee; /* Need to send P_WRITE_ACK/P_RS_WRITE_ACK */
+ struct list_head dagtag_wait_ee; /* Resync read waiting for dagtag to be reached */
+
+ struct work_struct send_acks_work;
+ struct work_struct send_ping_ack_work;
+ struct work_struct send_ping_work;
+ wait_queue_head_t ee_wait;
+
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by transport */
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+ struct drbd_work send_dagtag_work;
+ u64 send_dagtag;
+
+ struct sender_todo {
+ struct list_head work_list;
+
+ /* If upper layers trigger an unplug on this side, we want to
+ * send and unplug hint over to the peer. Sending it too
+ * early, or missing it completely, causes a potential latency
+ * penalty (requests idling too long in the remote queue).
+ * There is no harm done if we occasionally send one too many
+ * such unplug hints.
+ *
+ * We have two slots, which are used in an alternating fashion:
+ * If a new unplug event happens while the current pending one
+ * has not even been processed yet, we overwrite the next
+ * pending slot: there is not much point in unplugging on the
+ * remote side, if we have a full request queue to be send on
+ * this side still, and not even reached the position in the
+ * change stream when the previous local unplug happened.
+ */
+ u64 unplug_dagtag_sector[2];
+ unsigned int unplug_slot; /* 0 or 1 */
+
+ /* the currently (or last) processed request,
+ * see process_sender_todo() */
+ struct drbd_request *req;
+
+ /* Points to the next request on the resource->transfer_log,
+ * which is RQ_NET_QUEUED for this connection, and so can
+ * safely be used as next starting point for the list walk
+ * in tl_next_request_for_connection().
+ *
+ * If it is NULL (we walked off the tail last time), it will be
+ * set by __req_mod( QUEUE_FOR.* ), so fast connections don't
+ * need to walk the full transfer_log list every time, even if
+ * the list is kept long by some slow connections.
+ *
+ * req_next is only accessed by drbd_sender thread, in
+ * case of a resend from some worker, but then regular IO
+ * is suspended.
+ */
+ struct drbd_request *req_next;
+ } todo;
/* cached pointers,
* so we can look up the oldest pending requests more quickly.
- * protected by resource->req_lock */
- struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+ * TODO: RCU */
struct drbd_request *req_ack_pending;
+ /* The oldest request that is or was queued for this peer, but is not
+ * done towards it. */
struct drbd_request *req_not_net_done;
+ /* Protects the caching pointers from being advanced concurrently. */
+ spinlock_t advance_cache_ptr_lock;
- /* sender side */
- struct drbd_work_queue sender_work;
-
-#define DRBD_THREAD_DETAILS_HIST 16
- unsigned int w_cb_nr; /* keeps counting up */
+ unsigned int s_cb_nr; /* keeps counting up */
unsigned int r_cb_nr; /* keeps counting up */
- struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+ struct drbd_thread_timing_details s_timing_details[DRBD_THREAD_DETAILS_HIST];
struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
struct {
unsigned long last_sent_barrier_jif;
+ int last_sent_epoch_nr;
/* whether this sender thread
* has processed a single write yet. */
@@ -701,52 +1281,245 @@ struct drbd_connection {
* with req->epoch == current_epoch_nr.
* If none, no P_BARRIER will be sent. */
unsigned current_epoch_writes;
- } send;
-};
-static inline bool has_net_conf(struct drbd_connection *connection)
-{
- bool has_net_conf;
+ /* Position in change stream of last write sent. */
+ u64 current_dagtag_sector;
- rcu_read_lock();
- has_net_conf = rcu_dereference(connection->net_conf);
- rcu_read_unlock();
+ /* Position in change stream of last ready request seen. */
+ u64 seen_dagtag_sector;
+ } send;
- return has_net_conf;
-}
+ struct {
+ u64 dagtag_sector;
+ int lost_node_id;
+ } after_reconciliation;
-void __update_timing_details(
- struct drbd_thread_timing_details *tdp,
- unsigned int *cb_nr,
- void *cb,
- const char *fn, const unsigned int line);
+ unsigned int peer_node_id;
-#define update_worker_timing_details(c, cb) \
- __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
-#define update_receiver_timing_details(c, cb) \
- __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+ struct drbd_mutable_buffer reassemble_buffer;
+ union {
+ u8 bytes[8];
+ struct p_block_ack block_ack;
+ struct p_barrier_ack barrier_ack;
+ struct p_confirm_stable confirm_stable;
+ struct p_peer_ack peer_ack;
+ struct p_peer_block_desc peer_block_desc;
+ struct p_twopc_reply twopc_reply;
+ } reassemble_buffer_bytes;
+
+ /* Used when a network namespace is removed to track all connections
+ * that need disconnecting. */
+ struct list_head remove_net_list;
+
+ struct rcu_head rcu;
+
+ unsigned int ctl_packets;
+ unsigned int ctl_bytes;
+
+	struct drbd_transport transport; /* The transport needs to be the last member. The actual
+ implementation might have more members than the
+ abstract one. */
+};
-struct submit_worker {
- struct workqueue_struct *wq;
- struct work_struct worker;
+/* used to get the next lower or next higher peer_device depending on device node-id */
+enum drbd_neighbor {
+ NEXT_LOWER,
+ NEXT_HIGHER
+};
- /* protected by ..->resource->req_lock */
- struct list_head writes;
+enum drbd_per_peer_device_ratelimit {
+ D_RL_PD_NOLIMIT = -1,
+ D_RL_PD_GENERIC,
};
struct drbd_peer_device {
struct list_head peer_devices;
struct drbd_device *device;
struct drbd_connection *connection;
- struct work_struct send_acks_work;
+ struct peer_device_conf __rcu *conf; /* RCU, for updates: resource->conf_update */
+ enum drbd_disk_state disk_state[2];
+ enum drbd_repl_state repl_state[2];
+ bool resync_susp_user[2];
+ bool resync_susp_peer[2];
+ bool resync_susp_dependency[2];
+ bool resync_susp_other_c[2];
+ bool resync_active[2];
+ bool replication[2]; /* Only while peer is Inconsistent: Is replication enabled? */
+ bool peer_replication[2]; /* Whether we have instructed peer to replicate to us */
+ enum drbd_repl_state negotiation_result; /* To find disk state after attach */
+ unsigned int send_cnt;
+ unsigned int recv_cnt;
+ atomic_t packet_seq;
+ unsigned int peer_seq;
+ spinlock_t peer_seq_lock;
+ uint64_t d_size; /* size of disk */
+ uint64_t u_size; /* user requested size */
+ uint64_t c_size; /* current exported size */
+ uint64_t max_size;
+ int bitmap_index;
+ int node_id;
+
+ struct ratelimit_state ratelimit[1];
+
+ unsigned long flags;
+
+ enum drbd_repl_state start_resync_side;
+ enum drbd_repl_state last_repl_state; /* What we received from the peer */
+ struct timer_list start_resync_timer;
+ struct drbd_work resync_work;
+ struct timer_list resync_timer;
+ struct drbd_work propagate_uuids_work;
+
+ enum drbd_disk_state resync_finished_pdsk; /* Finished while starting resync */
+ int resync_again; /* decided to resync again while resync running */
+ sector_t last_in_sync_end; /* sector after end of last completed resync request */
+ unsigned long resync_next_bit; /* bitmap bit to search from for next resync request */
+ unsigned long last_resync_pass_bits; /* bitmap weight at end of previous pass */
+
+ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected (RQ_NET_PENDING set) */
+ atomic_t unacked_cnt; /* Need to send replies for */
+ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+
+ /* Protected by connection->peer_reqs_lock */
+ struct list_head resync_requests; /* Resync requests in the order we sent them */
+ /*
+ * If not NULL, all requests in resync_requests until this one have
+ * been received. Discards are only counted as "received" once merging
+ * is complete.
+ */
+ struct drbd_peer_request *received_last;
+ /*
+ * If not NULL, all requests in resync_requests after received_last
+ * until this one are discards.
+ */
+ struct drbd_peer_request *discard_last;
+
+ /* use checksums for *this* resync */
+ bool use_csums;
+ /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+ unsigned long rs_total;
+ /* number of resync blocks that failed in this run */
+ unsigned long rs_failed;
+ /* Syncer's start time [unit jiffies] */
+ unsigned long rs_start;
+ /* cumulated time in PausedSyncX state [unit jiffies] */
+ unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+ unsigned long rs_last_progress_report_ts;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+ /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+ /* marks's time [unit jiffies] */
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
+ unsigned long rs_last_writeout;
+
+ /* where does the admin want us to start? (sector) */
+ sector_t ov_start_sector;
+ sector_t ov_stop_sector;
+ /* where are we now? (sector) */
+ sector_t ov_position;
+ /* Start sector of out of sync range (to merge printk reporting). */
+ sector_t ov_last_oos_start;
+ /* size of out-of-sync range in sectors. */
+ sector_t ov_last_oos_size;
+ /* Start sector of skipped range (to merge printk reporting). */
+ sector_t ov_last_skipped_start;
+ /* size of skipped range in sectors. */
+ sector_t ov_last_skipped_size;
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer __rcu *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ ktime_t rs_last_mk_req_kt;
+ atomic64_t ov_left; /* in bits */
+ unsigned long ov_skipped; /* in bits */
+ u64 rs_start_uuid;
+
+ u64 current_uuid;
+ u64 bitmap_uuids[DRBD_PEERS_MAX];
+ u64 history_uuids[HISTORY_UUIDS];
+ u64 dirty_bits;
+ u64 uuid_flags;
+ u64 uuid_node_mask; /* might be authoritative_nodes or weak_nodes */
+
+ unsigned long comm_bm_set; /* communicated number of set bits. */
+ u64 comm_current_uuid; /* communicated current UUID */
+ u64 comm_uuid_flags; /* communicated UUID flags */
+ u64 comm_bitmap_uuid;
+ union drbd_state comm_state;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_peer_dev;
+ struct dentry *debugfs_peer_dev_proc_drbd;
#endif
+ ktime_t pre_send_kt;
+ ktime_t acked_kt;
+ ktime_t net_done_kt;
+
+ struct {/* sender todo per peer_device */
+ bool was_sending_out_of_sync;
+ } todo;
+ union drbd_state connect_state;
+ struct {
+ unsigned int physical_block_size;
+ unsigned int logical_block_size;
+ unsigned int alignment_offset;
+ unsigned int io_min;
+ unsigned int io_opt;
+ unsigned int max_bio_size;
+ } q_limits;
+ /* communicated as part of o_qlim, if agreed on DRBD_FF_BM_BLOCK_SHIFT */
+ unsigned int bm_block_shift;
+};
+
+struct conflict_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head resync_writes;
+ struct list_head resync_reads;
+ struct list_head writes;
+ struct list_head peer_writes;
+};
+
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+ struct list_head peer_writes;
+};
+
+struct opener {
+ struct list_head list;
+ char comm[TASK_COMM_LEN];
+ pid_t pid;
+ ktime_t opened;
+};
+
+enum drbd_per_device_ratelimit {
+ D_RL_D_NOLIMIT = -1,
+ D_RL_D_GENERIC,
+ D_RL_D_METADATA,
+ D_RL_D_BACKEND,
+ __D_RL_D_N
};
struct drbd_device {
struct drbd_resource *resource;
+
+ /* RCU list. Updates protected by adm_mutex, conf_update and state_rwlock. */
struct list_head peer_devices;
+
+ spinlock_t pending_bmio_lock;
struct list_head pending_bitmap_io;
unsigned long flush_jif;
@@ -755,12 +1528,22 @@ struct drbd_device {
struct dentry *debugfs_vol;
struct dentry *debugfs_vol_oldest_requests;
struct dentry *debugfs_vol_act_log_extents;
- struct dentry *debugfs_vol_resync_extents;
+ struct dentry *debugfs_vol_act_log_histogram;
struct dentry *debugfs_vol_data_gen_id;
+ struct dentry *debugfs_vol_io_frozen;
struct dentry *debugfs_vol_ed_gen_id;
+ struct dentry *debugfs_vol_openers;
+ struct dentry *debugfs_vol_md_io;
+ struct dentry *debugfs_vol_interval_tree;
+ struct dentry *debugfs_vol_al_updates;
+ struct dentry *debugfs_vol_multi_bio_cnt;
+#ifdef CONFIG_DRBD_TIMING_STATS
+ struct dentry *debugfs_vol_req_timing;
+#endif
#endif
+ struct ratelimit_state ratelimit[__D_RL_D_N];
- unsigned int vnr; /* volume number within the connection */
+ unsigned int vnr; /* volume number within the resource */
unsigned int minor; /* device minor number */
struct kref kref;
@@ -769,148 +1552,126 @@ struct drbd_device {
unsigned long flags;
/* configured by drbdsetup */
- struct drbd_backing_dev *ldev;
+ struct drbd_backing_dev *ldev; /* enclose accessing code in get_ldev() / put_ldev() */
+
+ /* Used to close backing devices and destroy related structures. */
+ struct work_struct ldev_destroy_work;
- sector_t p_size; /* partner's disk size */
struct request_queue *rq_queue;
struct gendisk *vdisk;
unsigned long last_reattach_jif;
- struct drbd_work resync_work;
- struct drbd_work unplug_work;
- struct timer_list resync_timer;
struct timer_list md_sync_timer;
- struct timer_list start_resync_timer;
struct timer_list request_timer;
- /* Used after attach while negotiating new disk state. */
- union drbd_state new_state_tmp;
-
- union drbd_dev_state state;
+ enum drbd_disk_state disk_state[2];
wait_queue_head_t misc_wait;
- wait_queue_head_t state_wait; /* upon each state change. */
- unsigned int send_cnt;
- unsigned int recv_cnt;
unsigned int read_cnt;
unsigned int writ_cnt;
unsigned int al_writ_cnt;
unsigned int bm_writ_cnt;
- atomic_t ap_bio_cnt; /* Requests we need to complete */
- atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
- atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
- atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
- atomic_t unacked_cnt; /* Need to send replies for */
+ unsigned int multi_bio_cnt; /* peer_reqs that needed multiple bios */
+ atomic_t ap_bio_cnt[2]; /* Requests we need to complete. [READ] and [WRITE] */
atomic_t local_cnt; /* Waiting for local completion */
- atomic_t suspend_cnt;
+ atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
+ atomic_t wait_for_actlog; /* Peer requests waiting for activity log */
+ /* worst case extent count needed to satisfy both requests and peer requests
+ * currently waiting for the activity log */
+ atomic_t wait_for_actlog_ecnt;
+
+ atomic_t suspend_cnt; /* recursive suspend counter, if non-zero, IO will be blocked. */
- /* Interval tree of pending local requests */
- struct rb_root read_requests;
- struct rb_root write_requests;
+ /* Interval trees of pending requests */
+ spinlock_t interval_lock;
+ struct rb_root read_requests; /* Local reads */
+ struct rb_root requests; /* Local and peer writes, resync operations etc. */
/* for statistics and timeouts */
/* [0] read, [1] write */
+ spinlock_t pending_completion_lock;
struct list_head pending_master_completion[2];
struct list_head pending_completion[2];
- /* use checksums for *this* resync */
- bool use_csums;
- /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
- unsigned long rs_total;
- /* number of resync blocks that failed in this run */
- unsigned long rs_failed;
- /* Syncer's start time [unit jiffies] */
- unsigned long rs_start;
- /* cumulated time in PausedSyncX state [unit jiffies] */
- unsigned long rs_paused;
- /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
- unsigned long rs_same_csum;
-#define DRBD_SYNC_MARKS 8
-#define DRBD_SYNC_MARK_STEP (3*HZ)
- /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
- unsigned long rs_mark_left[DRBD_SYNC_MARKS];
- /* marks's time [unit jiffies] */
- unsigned long rs_mark_time[DRBD_SYNC_MARKS];
- /* current index into rs_mark_{left,time} */
- int rs_last_mark;
- unsigned long rs_last_bcast; /* [unit jiffies] */
-
- /* where does the admin want us to start? (sector) */
- sector_t ov_start_sector;
- sector_t ov_stop_sector;
- /* where are we now? (sector) */
- sector_t ov_position;
- /* Start sector of out of sync range (to merge printk reporting). */
- sector_t ov_last_oos_start;
- /* size of out-of-sync range in sectors. */
- sector_t ov_last_oos_size;
- unsigned long ov_left; /* in bits */
-
- struct drbd_bitmap *bitmap;
- unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
-
- /* Used to track operations of resync... */
- struct lru_cache *resync;
- /* Number of locked elements in resync LRU */
- unsigned int resync_locked;
- /* resync extent number waiting for application requests */
- unsigned int resync_wenr;
+ struct drbd_bitmap *bitmap; /* enclose accessing code in get_ldev() / put_ldev() */
+ /* We may want to report on resync progress
+ * even after we detached again (bitmap == NULL).
+ * Cache the last bitmap block size here.
+ */
+ unsigned int last_bm_block_shift;
int open_cnt;
- u64 *p_uuid;
+ bool writable;
+ /* FIXME clean comments, restructure so it is more obvious which
+ * members are protected by what */
- struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
- struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
- struct list_head done_ee; /* need to send P_WRITE_ACK */
- struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
-
- struct list_head resync_reads;
- atomic_t pp_in_use; /* allocated from page pool */
- atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
- wait_queue_head_t ee_wait;
struct drbd_md_io md_io;
spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log; /* activity log */
+ unsigned al_histogram[AL_UPDATES_PER_TRANSACTION+1];
unsigned int al_tr_number;
int al_tr_cycle;
wait_queue_head_t seq_wait;
- atomic_t packet_seq;
- unsigned int peer_seq;
- spinlock_t peer_seq_lock;
- unsigned long comm_bm_set; /* communicated number of set bits. */
- struct bm_io_work bm_io_work;
- u64 ed_uuid; /* UUID of the exposed data */
- struct mutex own_state_mutex;
- struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
- char congestion_reason; /* Why we where congested... */
- atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ u64 exposed_data_uuid; /* UUID of the exposed data */
+ u64 next_exposed_data_uuid;
+ struct rw_semaphore uuid_sem;
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
- int rs_last_sect_ev; /* counter to compare with */
- int rs_last_events; /* counter of read or write "events" (unit sectors)
- * on the lower level device when we last looked. */
- int c_sync_rate; /* current resync rate after syncer throttle magic */
- struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
- int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
- atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
- unsigned int peer_max_bio_size;
- unsigned int local_max_bio_size;
-
- /* any requests that would block in drbd_make_request()
- * are deferred to this single-threaded work queue */
+ struct pending_bitmap_work_s {
+ atomic_t n; /* inc when queued here, */
+ spinlock_t q_lock; /* dec only once finished. */
+ struct list_head q; /* n > 0 even if q already empty */
+ } pending_bitmap_work;
+ struct device_conf device_conf;
+
+ /* any requests that were blocked due to conflicts with other requests
+ * or resync are submitted on this ordered work queue */
+ struct conflict_worker submit_conflict;
+ /* any requests that would block due to the activity log
+ * are deferred to this ordered work queue */
struct submit_worker submit;
+ u64 read_nodes; /* used for balancing read requests among peers */
+ bool have_quorum[2]; /* no quorum -> suspend IO or error IO */
+ bool cached_state_unstable; /* updates with each state change */
+ bool cached_err_io; /* complete all IOs with error */
+
+#ifdef CONFIG_DRBD_TIMING_STATS
+ spinlock_t timing_lock;
+ unsigned long reqs;
+ ktime_t in_actlog_kt;
+	ktime_t pre_submit_kt; /* sum over all reqs */
+
+ ktime_t before_queue_kt; /* sum over all al_misses */
+ ktime_t before_al_begin_io_kt;
+
+ ktime_t al_before_bm_write_hinted_kt; /* sum over all al_writ_cnt */
+ ktime_t al_mid_kt;
+ ktime_t al_after_sync_page_kt;
+#endif
+ struct list_head openers;
+ spinlock_t openers_lock;
+ spinlock_t peer_req_bio_completion_lock;
+
+ struct rcu_head rcu;
+ struct work_struct finalize_work;
};
struct drbd_bm_aio_ctx {
struct drbd_device *device;
- struct list_head list; /* on device->pending_bitmap_io */;
+ struct list_head list; /* on device->pending_bitmap_io */
unsigned long start_jif;
+ struct blk_plug bm_aio_plug;
atomic_t in_flight;
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
#define BM_AIO_WRITE_HINTED 2
#define BM_AIO_WRITE_ALL_PAGES 4
-#define BM_AIO_READ 8
+#define BM_AIO_READ 8
+#define BM_AIO_WRITE_LAZY 16
+ /* only report stats for global read, write, write all */
+#define BM_AIO_NO_STATS (BM_AIO_COPY_PAGES\
+ |BM_AIO_WRITE_HINTED\
+ |BM_AIO_WRITE_LAZY)
int error;
struct kref kref;
};
@@ -921,12 +1682,14 @@ struct drbd_config_context {
/* assigned from request attributes, if present */
unsigned int volume;
#define VOLUME_UNSPECIFIED (-1U)
+ unsigned int peer_node_id;
+#define PEER_NODE_ID_UNSPECIFIED (-1U)
/* pointer into the request skb,
* limited lifetime! */
char *resource_name;
- struct nlattr *my_addr;
- struct nlattr *peer_addr;
+ /* network namespace of the sending socket */
+ struct net *net;
/* reply buffer */
struct sk_buff *reply_skb;
/* pointer into reply buffer */
@@ -935,6 +1698,7 @@ struct drbd_config_context {
struct drbd_device *device;
struct drbd_resource *resource;
struct drbd_connection *connection;
+ struct drbd_peer_device *peer_device;
};
static inline struct drbd_device *minor_to_device(unsigned int minor)
@@ -942,10 +1706,6 @@ static inline struct drbd_device *minor_to_device(unsigned int minor)
return (struct drbd_device *)idr_find(&drbd_devices, minor);
}
-static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
-{
- return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
-}
static inline struct drbd_peer_device *
conn_peer_device(struct drbd_connection *connection, int volume_number)
@@ -959,18 +1719,19 @@ conn_peer_device(struct drbd_connection *connection, int volume_number)
#define for_each_resource_rcu(resource, _resources) \
list_for_each_entry_rcu(resource, _resources, resources)
-#define for_each_resource_safe(resource, tmp, _resources) \
- list_for_each_entry_safe(resource, tmp, _resources, resources)
-
+/* see drbd_resource.connections for locking requirements */
#define for_each_connection(connection, resource) \
list_for_each_entry(connection, &resource->connections, connections)
#define for_each_connection_rcu(connection, resource) \
list_for_each_entry_rcu(connection, &resource->connections, connections)
-#define for_each_connection_safe(connection, tmp, resource) \
- list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+#define for_each_connection_ref(connection, m, resource) \
+ for (connection = __drbd_next_connection_ref(&m, NULL, resource); \
+ connection; \
+ connection = __drbd_next_connection_ref(&m, connection, resource))
+/* see drbd_device.peer_devices for locking requirements */
#define for_each_peer_device(peer_device, device) \
list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
@@ -980,10 +1741,10 @@ conn_peer_device(struct drbd_connection *connection, int volume_number)
#define for_each_peer_device_safe(peer_device, tmp, device) \
list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
-static inline unsigned int device_to_minor(struct drbd_device *device)
-{
- return device->minor;
-}
+#define for_each_peer_device_ref(peer_device, m, device) \
+ for (peer_device = __drbd_next_peer_device_ref(&m, NULL, device); \
+ peer_device; \
+ peer_device = __drbd_next_peer_device_ref(&m, peer_device, device))
/*
* function declarations
@@ -992,97 +1753,163 @@ static inline unsigned int device_to_minor(struct drbd_device *device)
/* drbd_main.c */
enum dds_flags {
- DDSF_FORCED = 1,
+ /* This enum is part of the wire protocol!
+ * See P_SIZES, struct p_sizes; */
+ DDSF_ASSUME_UNCONNECTED_PEER_HAS_SPACE = 1,
DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+ DDSF_IGNORE_PEER_CONSTRAINTS = 4, /* no longer used */
+ DDSF_2PC = 8, /* local only, not on the wire */
};
+struct meta_data_on_disk_9;
-extern void drbd_init_set_defaults(struct drbd_device *device);
-extern int drbd_thread_start(struct drbd_thread *thi);
-extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+int drbd_thread_start(struct drbd_thread *thi);
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
#ifdef CONFIG_SMP
-extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+void drbd_thread_current_set_cpu(struct drbd_thread *thi);
#else
#define drbd_thread_current_set_cpu(A) ({})
#endif
-extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
- unsigned int set_size);
-extern void tl_clear(struct drbd_connection *);
-extern void drbd_free_sock(struct drbd_connection *connection);
-extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
- void *buf, size_t size, unsigned msg_flags);
-extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
- unsigned);
-
-extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
-extern int drbd_send_protocol(struct drbd_connection *connection);
-extern int drbd_send_uuids(struct drbd_peer_device *);
-extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
-extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
-extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
-extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
-extern int drbd_send_current_state(struct drbd_peer_device *);
-extern int drbd_send_sync_param(struct drbd_peer_device *);
-extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
- u32 set_size);
-extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
- struct drbd_peer_request *);
-extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
- struct p_block_req *rp);
-extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
- struct p_data *dp, int data_size);
-extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
- sector_t sector, int blksize, u64 block_id);
-extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
-extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
- struct drbd_peer_request *);
-extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
-extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
- sector_t sector, int size, u64 block_id);
-extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
- int size, void *digest, int digest_size,
- enum drbd_packet cmd);
-extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
-
-extern int drbd_send_bitmap(struct drbd_device *device, struct drbd_peer_device *peer_device);
-extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
-extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
-extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
-extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
-extern void drbd_device_cleanup(struct drbd_device *device);
-extern void drbd_print_uuids(struct drbd_device *device, const char *text);
-extern void drbd_queue_unplug(struct drbd_device *device);
-
-extern void conn_md_sync(struct drbd_connection *connection);
-extern void drbd_md_write(struct drbd_device *device, void *buffer);
-extern void drbd_md_sync(struct drbd_device *device);
-extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
-extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
-extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
-extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
-extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
-extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
-extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
-extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
-extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
-extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
-extern void drbd_md_mark_dirty(struct drbd_device *device);
-extern void drbd_queue_bitmap_io(struct drbd_device *device,
- int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
- void (*done)(struct drbd_device *, int),
- char *why, enum bm_flag flags,
- struct drbd_peer_device *peer_device);
-extern int drbd_bitmap_io(struct drbd_device *device,
- int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
- char *why, enum bm_flag flags,
- struct drbd_peer_device *peer_device);
-extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
- int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
- char *why, enum bm_flag flags,
- struct drbd_peer_device *peer_device);
-extern int drbd_bmio_set_n_write(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
-extern int drbd_bmio_clear_n_write(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
+int tl_release(struct drbd_connection *connection, uint64_t o_block_id,
+ uint64_t y_block_id, unsigned int barrier_nr,
+ unsigned int set_size);
+
+int __drbd_send_protocol(struct drbd_connection *connection,
+ enum drbd_packet cmd);
+u64 drbd_collect_local_uuid_flags(struct drbd_peer_device *peer_device,
+ u64 *authoritative_mask);
+u64 drbd_resolved_uuid(struct drbd_peer_device *peer_device_base,
+ u64 *uuid_flags);
+int drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags,
+ u64 node_mask);
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device);
+int drbd_send_sizes(struct drbd_peer_device *peer_device,
+ uint64_t u_size_diskless, enum dds_flags flags);
+int conn_send_state(struct drbd_connection *connection,
+ union drbd_state state);
+int drbd_send_state(struct drbd_peer_device *peer_device,
+ union drbd_state state);
+int drbd_send_current_state(struct drbd_peer_device *peer_device);
+int drbd_send_sync_param(struct drbd_peer_device *peer_device);
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device,
+ sector_t sector, unsigned int size);
+int drbd_send_block(struct drbd_peer_device *peer_device,
+ enum drbd_packet cmd, struct drbd_peer_request *peer_req);
+int drbd_send_dblock(struct drbd_peer_device *peer_device,
+ struct drbd_request *req);
+int drbd_send_drequest(struct drbd_peer_device *peer_device, sector_t sector,
+ int size, u64 block_id);
+int drbd_send_rs_request(struct drbd_peer_device *peer_device,
+ enum drbd_packet cmd, sector_t sector, int size,
+ u64 block_id, unsigned int dagtag_node_id,
+ u64 dagtag);
+void *drbd_prepare_drequest_csum(struct drbd_peer_request *peer_req,
+ enum drbd_packet cmd, int digest_size,
+ unsigned int dagtag_node_id, u64 dagtag);
+
+int drbd_send_bitmap(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_send_dagtag(struct drbd_connection *connection, u64 dagtag);
+void drbd_send_sr_reply(struct drbd_connection *connection, int vnr,
+ enum drbd_state_rv retcode);
+int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
+ struct drbd_peer_request *peer_req);
+void drbd_send_twopc_reply(struct drbd_connection *connection,
+ enum drbd_packet cmd, struct twopc_reply *reply);
+void drbd_send_peers_in_sync(struct drbd_peer_device *peer_device, u64 mask,
+ sector_t sector, int size);
+int drbd_send_peer_dagtag(struct drbd_connection *connection,
+ struct drbd_connection *lost_peer);
+int drbd_send_flush_requests(struct drbd_connection *connection,
+ u64 flush_sequence);
+int drbd_send_flush_forward(struct drbd_connection *connection,
+ u64 flush_sequence, int initiator_node_id);
+int drbd_send_flush_requests_ack(struct drbd_connection *connection,
+ u64 flush_sequence, int primary_node_id);
+int drbd_send_enable_replication_next(struct drbd_peer_device *peer_device);
+int drbd_send_enable_replication(struct drbd_peer_device *peer_device, bool enable);
+int drbd_send_current_uuid(struct drbd_peer_device *peer_device,
+ u64 current_uuid, u64 weak_nodes);
+void drbd_backing_dev_free(struct drbd_device *device,
+ struct drbd_backing_dev *ldev);
+void drbd_print_uuids(struct drbd_peer_device *peer_device, const char *text);
+void drbd_queue_unplug(struct drbd_device *device);
+
+u64 drbd_capacity_to_on_disk_bm_sect(u64 capacity_sect, const struct drbd_md *md);
+void drbd_md_set_sector_offsets(struct drbd_backing_dev *bdev);
+int drbd_md_write(struct drbd_device *device,
+ struct meta_data_on_disk_9 *buffer);
+int drbd_md_sync(struct drbd_device *device);
+int drbd_md_sync_if_dirty(struct drbd_device *device);
+void drbd_uuid_received_new_current(struct drbd_peer_device *from_pd, u64 val,
+ u64 weak_nodes);
+void drbd_uuid_set_bitmap(struct drbd_peer_device *peer_device, u64 uuid);
+void _drbd_uuid_set_bitmap(struct drbd_peer_device *peer_device, u64 val);
+void _drbd_uuid_set_current(struct drbd_device *device, u64 val);
+void drbd_uuid_new_current(struct drbd_device *device, bool forced);
+void drbd_uuid_new_current_by_user(struct drbd_device *device);
+void _drbd_uuid_push_history(struct drbd_device *device, u64 val);
+u64 _drbd_uuid_pull_history(struct drbd_peer_device *peer_device);
+void drbd_uuid_resync_starting(struct drbd_peer_device *peer_device);
+u64 drbd_uuid_resync_finished(struct drbd_peer_device *peer_device);
+void drbd_uuid_detect_finished_resyncs(struct drbd_peer_device *peer_device);
+bool drbd_uuid_set_exposed(struct drbd_device *device, u64 val, bool log);
+u64 drbd_weak_nodes_device(struct drbd_device *device);
+bool drbd_uuid_is_day0(struct drbd_device *device);
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, enum mdf_flag flag);
+void drbd_md_set_peer_flag(struct drbd_peer_device *peer_device,
+ enum mdf_peer_flag flag);
+void drbd_md_clear_peer_flag(struct drbd_peer_device *peer_device,
+ enum mdf_peer_flag flag);
+bool drbd_md_test_peer_flag(struct drbd_peer_device *peer_device,
+ enum mdf_peer_flag flag);
+void drbd_md_mark_dirty(struct drbd_device *device);
+void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *device,
+ struct drbd_peer_device *peer_device),
+ void (*done)(struct drbd_device *device,
+ struct drbd_peer_device *peer_device,
+ int rv),
+ char *why, enum bm_flag flags,
+ struct drbd_peer_device *peer_device);
+int drbd_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+ char *why, enum bm_flag flags,
+ struct drbd_peer_device *peer_device);
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *, struct drbd_peer_device *),
+ char *why, enum bm_flag flags,
+ struct drbd_peer_device *peer_device);
+int drbd_bmio_set_n_write(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bmio_clear_all_n_write(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bmio_set_all_n_write(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bmio_set_allocated_n_write(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bmio_clear_one_peer(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+bool drbd_device_stable(struct drbd_device *device, u64 *authoritative_ptr);
+void drbd_flush_peer_acks(struct drbd_resource *resource);
+void drbd_cork(struct drbd_connection *connection, enum drbd_stream stream);
+int drbd_uncork(struct drbd_connection *connection, enum drbd_stream stream);
+void drbd_open_counts(struct drbd_resource *resource, int *rw_count_ptr,
+ int *ro_count_ptr);
+
+struct drbd_connection *
+__drbd_next_connection_ref(u64 *visited, struct drbd_connection *connection,
+ struct drbd_resource *resource);
+
+struct drbd_peer_device *
+__drbd_next_peer_device_ref(u64 *visited,
+ struct drbd_peer_device *peer_device,
+ struct drbd_device *device);
+
+void tl_abort_disk_io(struct drbd_device *device);
+
+sector_t drbd_get_max_capacity(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, bool warn);
+sector_t drbd_partition_data_capacity(struct drbd_device *device);
/* Meta data layout
*
@@ -1114,59 +1941,10 @@ extern int drbd_bmio_clear_n_write(struct drbd_device *device,
* but is about to become configurable.
*/
-/* Our old fixed size meta data layout
- * allows up to about 3.8TB, so if you want more,
- * you need to use the "flexible" meta data format. */
-#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
-#define MD_4kB_SECT 8
-#define MD_32kB_SECT 64
-
/* One activity log extent represents 4M of storage */
#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
-/* We could make these currently hardcoded constants configurable
- * variables at create-md time (or even re-configurable at runtime?).
- * Which will require some more changes to the DRBD "super block"
- * and attach code.
- *
- * updates per transaction:
- * This many changes to the active set can be logged with one transaction.
- * This number is arbitrary.
- * context per transaction:
- * This many context extent numbers are logged with each transaction.
- * This number is resulting from the transaction block size (4k), the layout
- * of the transaction header, and the number of updates per transaction.
- * See drbd_actlog.c:struct al_transaction_on_disk
- * */
-#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
-#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
-
-#if BITS_PER_LONG == 32
-#define LN2_BPL 5
-#define cpu_to_lel(A) cpu_to_le32(A)
-#define lel_to_cpu(A) le32_to_cpu(A)
-#elif BITS_PER_LONG == 64
-#define LN2_BPL 6
-#define cpu_to_lel(A) cpu_to_le64(A)
-#define lel_to_cpu(A) le64_to_cpu(A)
-#else
-#error "LN2 of BITS_PER_LONG unknown!"
-#endif
-
-/* resync bitmap */
-/* 16MB sized 'bitmap extent' to track syncer usage */
-struct bm_extent {
- int rs_left; /* number of bits set (out of sync) in this extent. */
- int rs_failed; /* number of failed resync requests in this extent. */
- unsigned long flags;
- struct lc_element lce;
-};
-
-#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
-#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
-#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
-
/* drbd_bitmap.c */
/*
* We need to store one bit for a block.
@@ -1175,94 +1953,87 @@ struct bm_extent {
* Bit 1 ==> local node thinks this block needs to be synced.
*/
-#define SLEEP_TIME (HZ/10)
+#define RS_MAKE_REQS_INTV (HZ/10)
+#define RS_MAKE_REQS_INTV_NS (NSEC_PER_SEC/10)
-/* We do bitmap IO in units of 4k blocks.
- * We also still have a hardcoded 4k per bit relation. */
-#define BM_BLOCK_SHIFT 12 /* 4k per bit */
-#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
-/* mostly arbitrarily set the represented size of one bitmap extent,
- * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
- * at 4k per bit resolution) */
-#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
-#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
-
-#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
-#error "HAVE YOU FIXED drbdmeta AS WELL??"
-#endif
-
-/* thus many _storage_ sectors are described by one bit */
-#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
-#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
-#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
-
-/* bit to represented kilo byte conversion */
-#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
-
-/* in which _bitmap_ extent (resp. sector) the bit for a certain
- * _storage_ sector is located in */
-#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
-#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
-
-/* first storage sector a bitmap extent corresponds to */
-#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
-/* how much _storage_ sectors we have per bitmap extent */
-#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
-/* how many bits are covered by one bitmap extent (resync extent) */
-#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
-
-#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1)
+#define LEGACY_BM_EXT_SHIFT 27 /* 128 MiB per resync extent */
+#define LEGACY_BM_EXT_SECT_MASK ((1UL << (LEGACY_BM_EXT_SHIFT - SECTOR_SHIFT)) - 1)
+static inline unsigned int bm_block_size(const struct drbd_bitmap *bm)
+{
+	return 1 << bm->bm_block_shift;	/* bitmap block size in bytes */
+}
+static inline sector_t bm_bit_to_kb(const struct drbd_bitmap *bm, unsigned long bit)
+{
+	return (sector_t)bit << (bm->bm_block_shift - 10);	/* bit count -> KiB covered */
+}
+static inline unsigned long bm_sect_to_bit(const struct drbd_bitmap *bm, sector_t s)
+{
+	return s >> (bm->bm_block_shift - 9);	/* 512-byte sector -> bit number */
+}
+static inline sector_t bm_bit_to_sect(const struct drbd_bitmap *bm, unsigned long bit)
+{
+	return (sector_t)bit << (bm->bm_block_shift - 9);	/* bit number -> first sector it covers */
+}
+static inline sector_t bm_sect_per_bit(const struct drbd_bitmap *bm)
+{
+	return (sector_t)1 << (bm->bm_block_shift - 9);	/* sectors covered by one bitmap bit */
+}
-/* in one sector of the bitmap, we have this many activity_log extents. */
-#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+static inline sector_t bit_to_kb(unsigned long bit, unsigned int bm_block_shift)
+{
+	return (sector_t)bit << (bm_block_shift - 10);	/* like bm_bit_to_kb(), but with an explicit shift */
+}
+static inline unsigned long sect_to_bit(sector_t s, unsigned int bm_block_shift)
+{
+	return s >> (bm_block_shift - 9);	/* like bm_sect_to_bit(), but with an explicit shift */
+}
+static inline sector_t bit_to_sect(unsigned long bit, unsigned int bm_block_shift)
+{
+	return (sector_t)bit << (bm_block_shift - 9);	/* like bm_bit_to_sect(), but with an explicit shift */
+}
+static inline sector_t sect_per_bit(unsigned int bm_block_shift)
+{
+	return (sector_t)1 << (bm_block_shift - 9);	/* like bm_sect_per_bit(), but with an explicit shift */
+}
-/* the extent in "PER_EXTENT" below is an activity log extent
- * we need that many (long words/bytes) to store the bitmap
- * of one AL_EXTENT_SIZE chunk of storage.
- * we can store the bitmap for that many AL_EXTENTS within
- * one sector of the _on_disk_ bitmap:
- * bit 0 bit 37 bit 38 bit (512*8)-1
- * ...|........|........|.. // ..|........|
- * sect. 0 `296 `304 ^(512*8*8)-1
- *
-#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
-#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
-#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
+/* We may have just lost our backing device, and with it ->ldev and ->bitmap.
+ * But we can still report sync progress and similar based on our last known
+ * bitmap block size.
*/
+static inline sector_t device_bit_to_kb(struct drbd_device *device, unsigned long bit)
+{
+	return bit_to_kb(bit, device->last_bm_block_shift);	/* cached shift; usable after detach (bitmap == NULL) */
+}
-#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
-/* we have a certain meta data variant that has a fixed on-disk size of 128
- * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+/* Send P_PEERS_IN_SYNC in steps defined by this shift. Set to the activity log
+ * extent shift since the P_PEERS_IN_SYNC intervals are broken up based on
+ * activity log extents anyway. */
+#define PEERS_IN_SYNC_STEP_SHIFT AL_EXTENT_SHIFT
+#define PEERS_IN_SYNC_STEP_SECT (1UL << (PEERS_IN_SYNC_STEP_SHIFT - SECTOR_SHIFT))
+#define PEERS_IN_SYNC_STEP_SECT_MASK (PEERS_IN_SYNC_STEP_SECT - 1)
+
+/* Indexed external meta data has a fixed on-disk size of 128MiB, of which
+ * 4KiB are our "superblock", and 32KiB are the fixed size activity
* log, leaving this many sectors for the bitmap.
*/
+#define DRBD_BM_SECTORS_INDEXED \
+ (((128 << 20) - (32 << 10) - (4 << 10)) >> SECTOR_SHIFT)
-#define DRBD_MAX_SECTORS_FIXED_BM \
- ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
-#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
-/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
-/* adjust by one page worth of bitmap,
- * so we won't wrap around in drbd_bm_find_next_bit.
- * you should use 64bit OS for that much storage, anyways. */
-#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#if !defined(CONFIG_LBDAF) && !defined(CONFIG_LBD)
+#define DRBD_MAX_SECTORS (0xffffffffLU)
#else
-/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
-#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
-/* corresponds to (1UL << 38) bits right now. */
+/* With large block device support, the size is limited by the fact that we
+ * want to be able to address bitmap bits with a long. Additionally adjust by
+ * one page worth of bitmap, so we don't wrap around when iterating. */
+#define DRBD_MAX_SECTORS BM_BIT_TO_SECT(0xffff7fff)
#endif
-
-/* Estimate max bio size as 256 * PAGE_SIZE,
- * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte.
- * Since we may live in a mixed-platform cluster,
- * we limit us to a platform agnostic constant here for now.
- * A followup commit may allow even bigger BIO sizes,
- * once we thought that through. */
-#define DRBD_MAX_BIO_SIZE (1U << 20)
-#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT)
-#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#else
+/* We allow up to 1 PiB on 64 bit architectures as long as our meta data
+ * is large enough. */
+#define DRBD_MAX_SECTORS (1UL << (50 - SECTOR_SHIFT))
#endif
-#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
@@ -1273,61 +2044,91 @@ struct bm_extent {
#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9)
-extern int drbd_bm_init(struct drbd_device *device);
-extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
-extern void drbd_bm_cleanup(struct drbd_device *device);
-extern void drbd_bm_set_all(struct drbd_device *device);
-extern void drbd_bm_clear_all(struct drbd_device *device);
+/* This gets ignored if the backing device has a larger discard granularity */
+#define DRBD_MAX_RS_DISCARD_SIZE (1U << 27) /* 128MiB; arbitrary */
+
+/* how many activity log extents are touched by this interval? */
+static inline int interval_to_al_extents(struct drbd_interval *i)
+{
+ unsigned int first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned int last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	return 1 + last - first; /* worst case: all touched extents are cold. */
+}
+
+struct drbd_bitmap *drbd_bm_alloc(unsigned int max_peers, unsigned int bm_block_shift);
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity,
+ bool set_new_bits);
+void drbd_bm_free(struct drbd_device *device);
+void drbd_bm_set_all(struct drbd_device *device);
+void drbd_bm_clear_all(struct drbd_device *device);
/* set/clear/test only a few bits at a time */
-extern int drbd_bm_set_bits(
- struct drbd_device *device, unsigned long s, unsigned long e);
-extern int drbd_bm_clear_bits(
- struct drbd_device *device, unsigned long s, unsigned long e);
-extern int drbd_bm_count_bits(
- struct drbd_device *device, const unsigned long s, const unsigned long e);
+unsigned int drbd_bm_set_bits(struct drbd_device *device,
+ unsigned int bitmap_index, unsigned long start,
+ unsigned long end);
+unsigned int drbd_bm_clear_bits(struct drbd_device *device,
+ unsigned int bitmap_index,
+ unsigned long start, unsigned long end);
+int drbd_bm_count_bits(struct drbd_device *device, unsigned int bitmap_index,
+ unsigned long s, unsigned long e);
/* bm_set_bits variant for use while holding drbd_bm_lock,
* may process the whole bitmap in one go */
-extern void _drbd_bm_set_bits(struct drbd_device *device,
- const unsigned long s, const unsigned long e);
-extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
-extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
-extern int drbd_bm_read(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
-extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
-extern int drbd_bm_write(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
-extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
-extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
-extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
-extern int drbd_bm_write_all(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
-extern int drbd_bm_write_copy_pages(struct drbd_device *device,
- struct drbd_peer_device *peer_device) __must_hold(local);
-extern size_t drbd_bm_words(struct drbd_device *device);
-extern unsigned long drbd_bm_bits(struct drbd_device *device);
-extern sector_t drbd_bm_capacity(struct drbd_device *device);
+void drbd_bm_set_many_bits(struct drbd_peer_device *peer_device,
+ unsigned long start, unsigned long end);
+void drbd_bm_clear_many_bits(struct drbd_peer_device *peer_device,
+ unsigned long start, unsigned long end);
+void _drbd_bm_clear_many_bits(struct drbd_device *device, int bitmap_index,
+ unsigned long start, unsigned long end);
+void _drbd_bm_set_many_bits(struct drbd_device *device, int bitmap_index,
+ unsigned long start, unsigned long end);
+int drbd_bm_read(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+void drbd_bm_reset_al_hints(struct drbd_device *device);
+void drbd_bm_mark_range_for_writeout(struct drbd_device *device,
+ unsigned long start, unsigned long end);
+int drbd_bm_write(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bm_write_hinted(struct drbd_device *device);
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned int upper_idx);
+int drbd_bm_write_all(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+int drbd_bm_write_copy_pages(struct drbd_device *device,
+ struct drbd_peer_device *peer_device);
+size_t drbd_bm_words(struct drbd_device *device);
+unsigned long drbd_bm_bits(struct drbd_device *device);
+unsigned long drbd_bm_bits_4k(struct drbd_device *device);
+sector_t drbd_bm_capacity(struct drbd_device *device);
#define DRBD_END_OF_BITMAP (~(unsigned long)0)
-extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+unsigned long drbd_bm_find_next(struct drbd_peer_device *peer_device,
+ unsigned long start);
/* bm_find_next variants for use while you hold drbd_bm_lock() */
-extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
-extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
-extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
-extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+unsigned long _drbd_bm_find_next(struct drbd_peer_device *peer_device,
+ unsigned long start);
+unsigned long _drbd_bm_find_next_zero(struct drbd_peer_device *peer_device,
+ unsigned long start);
+unsigned long _drbd_bm_total_weight(struct drbd_device *device,
+ int bitmap_index);
+unsigned long drbd_bm_total_weight(struct drbd_peer_device *peer_device);
/* for receive_bitmap */
-extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
- size_t number, unsigned long *buffer);
+void drbd_bm_merge_lel(struct drbd_peer_device *peer_device, size_t offset,
+ size_t number, unsigned long *buffer);
/* for _drbd_send_bitmap */
-extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
- size_t number, unsigned long *buffer);
-
-extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
-extern void drbd_bm_unlock(struct drbd_device *device);
+void drbd_bm_get_lel(struct drbd_peer_device *peer_device, size_t offset,
+ size_t number, unsigned long *buffer);
+
+void drbd_bm_lock(struct drbd_device *device, const char *why,
+ enum bm_flag flags);
+void drbd_bm_unlock(struct drbd_device *device);
+void drbd_bm_slot_lock(struct drbd_peer_device *peer_device, char *why,
+ enum bm_flag flags);
+void drbd_bm_slot_unlock(struct drbd_peer_device *peer_device);
+void drbd_bm_copy_slot(struct drbd_device *device, unsigned int from_index,
+ unsigned int to_index);
/* drbd_main.c */
+extern struct workqueue_struct *ping_ack_sender;
extern struct kmem_cache *drbd_request_cache;
extern struct kmem_cache *drbd_ee_cache; /* peer requests */
-extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t drbd_request_mempool;
extern mempool_t drbd_ee_mempool;
@@ -1348,38 +2149,69 @@ extern struct bio_set drbd_md_io_bio_set;
/* And a bio_set for cloning */
extern struct bio_set drbd_io_bio_set;
-extern struct mutex resources_mutex;
-
-extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
-extern void drbd_destroy_device(struct kref *kref);
-extern void drbd_delete_device(struct drbd_device *device);
-
-extern struct drbd_resource *drbd_create_resource(const char *name);
-extern void drbd_free_resource(struct drbd_resource *resource);
-
-extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
-extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
-extern void drbd_destroy_connection(struct kref *kref);
-extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
- void *peer_addr, int peer_addr_len);
-extern struct drbd_resource *drbd_find_resource(const char *name);
-extern void drbd_destroy_resource(struct kref *kref);
-extern void conn_free_crypto(struct drbd_connection *connection);
+struct drbd_peer_device *create_peer_device(struct drbd_device *device,
+ struct drbd_connection *connection);
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx,
+ unsigned int minor,
+ struct device_conf *device_conf,
+ struct drbd_device **p_device);
+void drbd_unregister_device(struct drbd_device *device);
+void drbd_reclaim_device(struct rcu_head *rp);
+void drbd_unregister_connection(struct drbd_connection *connection);
+void drbd_reclaim_connection(struct rcu_head *rp);
+void drbd_reclaim_path(struct rcu_head *rp);
+void del_connect_timer(struct drbd_connection *connection);
+
+struct drbd_resource *drbd_create_resource(const char *name,
+ struct res_opts *res_opts);
+void drbd_reclaim_resource(struct rcu_head *rp);
+struct drbd_resource *drbd_find_resource(const char *name);
+void drbd_destroy_resource(struct kref *kref);
+
+void drbd_destroy_device(struct kref *kref);
+
+int set_resource_options(struct drbd_resource *resource,
+ struct res_opts *res_opts, const char *tag);
+struct drbd_connection *drbd_create_connection(struct drbd_resource *resource,
+ struct drbd_transport_class *tc);
+void drbd_transport_shutdown(struct drbd_connection *connection,
+ enum drbd_tr_free_op op);
+void drbd_destroy_connection(struct kref *kref);
+void conn_free_crypto(struct drbd_connection *connection);
/* drbd_req */
-extern void do_submit(struct work_struct *ws);
-extern void __drbd_make_request(struct drbd_device *, struct bio *);
+void drbd_do_submit_conflict(struct work_struct *ws);
+void do_submit(struct work_struct *ws);
+#ifndef CONFIG_DRBD_TIMING_STATS
+#define __drbd_make_request(d, b, k, j) __drbd_make_request(d, b, j)
+#endif
+void __drbd_make_request(struct drbd_device *device, struct bio *bio,
+ ktime_t start_kt, unsigned long start_jif);
void drbd_submit_bio(struct bio *bio);
-/* drbd_nl.c */
-
-extern struct mutex notification_mutex;
+enum drbd_force_detach_flags {
+ DRBD_READ_ERROR,
+ DRBD_WRITE_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+#define drbd_handle_io_error(m, f) drbd_handle_io_error_(m, f, __func__)
+void drbd_handle_io_error_(struct drbd_device *device,
+ enum drbd_force_detach_flags df, const char *where);
-extern void drbd_suspend_io(struct drbd_device *device);
-extern void drbd_resume_io(struct drbd_device *device);
-extern char *ppsize(char *buf, unsigned long long size);
-extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+/* drbd_nl.c */
+enum suspend_scope {
+ READ_AND_WRITE,
+ WRITE_ONLY
+};
+void drbd_suspend_io(struct drbd_device *device, enum suspend_scope ss);
+void drbd_resume_io(struct drbd_device *device);
+char *ppsize(char *buf, unsigned long long size);
+sector_t drbd_new_dev_size(struct drbd_device *device, sector_t current_size,
+ sector_t user_capped_size, enum dds_flags flags);
enum determine_dev_size {
+ DS_2PC_ERR = -5,
+ DS_2PC_NOT_SUPPORTED = -4,
DS_ERROR_SHRINK = -3,
DS_ERROR_SPACE_MD = -2,
DS_ERROR = -1,
@@ -1388,96 +2220,225 @@ enum determine_dev_size {
DS_GREW = 2,
DS_GREW_FROM_ZERO = 3,
};
-extern enum determine_dev_size
-drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
-extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
- struct drbd_backing_dev *bdev, struct o_qlim *o);
-extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
- enum drbd_role new_role,
- int force);
-extern bool conn_try_outdate_peer(struct drbd_connection *connection);
-extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
-extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
-extern int drbd_khelper(struct drbd_device *device, char *cmd);
-
-/* drbd_worker.c */
-/* bi_end_io handlers */
-extern void drbd_md_endio(struct bio *bio);
-extern void drbd_peer_request_endio(struct bio *bio);
-extern void drbd_request_endio(struct bio *bio);
-extern int drbd_worker(struct drbd_thread *thi);
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device,
+ sector_t peer_current_size, enum dds_flags flags,
+ struct resize_parms *rs);
+void resync_after_online_grow(struct drbd_peer_device *peer_device);
+void drbd_reconsider_queue_parameters(struct drbd_device *device,
+ struct drbd_backing_dev *bdev);
+bool barrier_pending(struct drbd_resource *resource);
+enum drbd_state_rv
+drbd_set_role(struct drbd_resource *resource, enum drbd_role role, bool force,
+ const char *tag, struct sk_buff *reply_skb);
+void conn_try_outdate_peer_async(struct drbd_connection *connection);
+int drbd_maybe_khelper(struct drbd_device *device,
+ struct drbd_connection *connection, char *cmd);
+int drbd_create_peer_device_default_config(struct drbd_peer_device *peer_device);
+int drbd_unallocated_index(struct drbd_backing_dev *bdev);
+void youngest_and_oldest_opener_to_str(struct drbd_device *device, char *buf,
+ size_t len);
+int param_set_drbd_strict_names(const char *val,
+ const struct kernel_param *kp);
+void drbd_enable_netns(void);
+
+/* drbd_sender.c */
+int drbd_sender(struct drbd_thread *thi);
+int drbd_worker(struct drbd_thread *thi);
enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
void drbd_resync_after_changed(struct drbd_device *device);
-extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
-extern void resume_next_sg(struct drbd_device *device);
-extern void suspend_other_sg(struct drbd_device *device);
-extern int drbd_resync_finished(struct drbd_peer_device *peer_device);
+bool drbd_stable_sync_source_present(struct drbd_peer_device *except_peer_device,
+ enum which_state which);
+void drbd_start_resync(struct drbd_peer_device *peer_device,
+ enum drbd_repl_state side, const char *tag);
+void resume_next_sg(struct drbd_device *device);
+void suspend_other_sg(struct drbd_device *device);
+void drbd_resync_finished(struct drbd_peer_device *peer_device,
+ enum drbd_disk_state new_peer_disk_state);
+void verify_progress(struct drbd_peer_device *peer_device,
+ const sector_t sector, const unsigned int size);
/* maybe rather drbd_main.c ? */
-extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
-extern void drbd_md_put_buffer(struct drbd_device *device);
-extern int drbd_md_sync_page_io(struct drbd_device *device,
- struct drbd_backing_dev *bdev, sector_t sector, enum req_op op);
-extern void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device,
- sector_t sector, int size);
-extern void wait_until_done_or_force_detached(struct drbd_device *device,
- struct drbd_backing_dev *bdev, unsigned int *done);
-extern void drbd_rs_controller_reset(struct drbd_peer_device *peer_device);
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
+void drbd_md_put_buffer(struct drbd_device *device);
+int drbd_md_sync_page_io(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, sector_t sector,
+ enum req_op op);
+bool drbd_al_active(struct drbd_device *device, sector_t sector,
+ unsigned int size);
+void drbd_ov_out_of_sync_found(struct drbd_peer_device *peer_device,
+ sector_t sector, int size);
+void wait_until_done_or_force_detached(struct drbd_device *device,
+ struct drbd_backing_dev *bdev,
+ unsigned int *done);
+void drbd_rs_controller_reset(struct drbd_peer_device *peer_device);
+void drbd_rs_all_in_flight_came_back(struct drbd_peer_device *peer_device,
+ int rs_sect_in);
+void drbd_check_peers(struct drbd_resource *resource);
+void drbd_check_peers_new_current_uuid(struct drbd_device *device);
+void drbd_conflict_send_resync_request(struct drbd_peer_request *peer_req);
+void drbd_ping_peer(struct drbd_connection *connection);
+struct drbd_peer_device *peer_device_by_node_id(struct drbd_device *device,
+ int node_id);
+void drbd_update_mdf_al_disabled(struct drbd_device *device,
+ enum which_state which);
static inline void ov_out_of_sync_print(struct drbd_peer_device *peer_device)
{
- struct drbd_device *device = peer_device->device;
-
- if (device->ov_last_oos_size) {
+ if (peer_device->ov_last_oos_size) {
drbd_err(peer_device, "Out of sync: start=%llu, size=%lu (sectors)\n",
- (unsigned long long)device->ov_last_oos_start,
- (unsigned long)device->ov_last_oos_size);
+ (unsigned long long)peer_device->ov_last_oos_start,
+ (unsigned long)peer_device->ov_last_oos_size);
}
- device->ov_last_oos_size = 0;
+ peer_device->ov_last_oos_size = 0;
}
+static inline void ov_skipped_print(struct drbd_peer_device *peer_device)
+{
+ if (peer_device->ov_last_skipped_size) {
+ drbd_info(peer_device, "Skipped verify, too busy: start=%llu, size=%lu (sectors)\n",
+ (unsigned long long)peer_device->ov_last_skipped_start,
+ (unsigned long)peer_device->ov_last_skipped_size);
+ }
+ peer_device->ov_last_skipped_size = 0;
+}
+
+void drbd_csum_bios(struct crypto_shash *tfm, struct bio_list *bios, void *digest);
+void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest);
+void drbd_resync_read_req_mod(struct drbd_peer_request *peer_req,
+ enum drbd_interval_flags bit_to_set);
-extern void drbd_csum_bio(struct crypto_shash *, struct bio *, void *);
-extern void drbd_csum_ee(struct crypto_shash *, struct drbd_peer_request *,
- void *);
/* worker callbacks */
-extern int w_e_end_data_req(struct drbd_work *, int);
-extern int w_e_end_rsdata_req(struct drbd_work *, int);
-extern int w_e_end_csum_rs_req(struct drbd_work *, int);
-extern int w_e_end_ov_reply(struct drbd_work *, int);
-extern int w_e_end_ov_req(struct drbd_work *, int);
-extern int w_ov_finished(struct drbd_work *, int);
-extern int w_resync_timer(struct drbd_work *, int);
-extern int w_send_write_hint(struct drbd_work *, int);
-extern int w_send_dblock(struct drbd_work *, int);
-extern int w_send_read_req(struct drbd_work *, int);
-extern int w_restart_disk_io(struct drbd_work *, int);
-extern int w_send_out_of_sync(struct drbd_work *, int);
-
-extern void resync_timer_fn(struct timer_list *t);
-extern void start_resync_timer_fn(struct timer_list *t);
-
-extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+int w_e_end_data_req(struct drbd_work *w, int cancel);
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel);
+int w_e_end_ov_reply(struct drbd_work *w, int cancel);
+int w_e_end_ov_req(struct drbd_work *w, int cancel);
+int w_resync_timer(struct drbd_work *w, int cancel);
+int w_e_reissue(struct drbd_work *w, int cancel);
+int w_send_dagtag(struct drbd_work *w, int cancel);
+int w_send_uuids(struct drbd_work *w, int cancel);
+
+bool drbd_any_flush_pending(struct drbd_resource *resource);
+void resync_timer_fn(struct timer_list *t);
+void start_resync_timer_fn(struct timer_list *t);
+
+int drbd_unmerge_discard(struct drbd_peer_request *peer_req_main,
+ struct list_head *list);
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
+/* bi_end_io handlers */
+void drbd_md_endio(struct bio *bio);
+void drbd_peer_request_endio(struct bio *bio);
+void drbd_request_endio(struct bio *bio);
+
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line);
+
+#define update_sender_timing_details(c, cb) \
+ __update_timing_details(c->s_timing_details, &c->s_cb_nr, cb, __func__, __LINE__)
+#define update_receiver_timing_details(c, cb) \
+ __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__, __LINE__)
+#define update_worker_timing_details(r, cb) \
+ __update_timing_details(r->w_timing_details, &r->w_cb_nr, cb, __func__, __LINE__)
/* drbd_receiver.c */
-extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
- sector_t start, unsigned int nr_sectors, int flags);
-extern int drbd_receiver(struct drbd_thread *thi);
-extern int drbd_ack_receiver(struct drbd_thread *thi);
-extern void drbd_send_acks_wf(struct work_struct *ws);
-extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
-extern bool drbd_rs_should_slow_down(struct drbd_peer_device *peer_device, sector_t sector,
- bool throttle_if_app_is_waiting);
-extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
-extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
-extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
- sector_t, unsigned int,
- unsigned int,
- gfp_t) __must_hold(local);
-extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req);
-extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
-extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
-extern int drbd_connected(struct drbd_peer_device *);
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ int vnr;
+ void *data;
+};
+
+/* packet_info->data is just a pointer into some temporary buffer
+ * owned by the transport. As soon as we call into the transport for
+ * any further receive operation, the data it points to is undefined.
+ * The buffer may be freed/recycled/re-used already.
+ * Convert and store the relevant information for any incoming data
+ * in drbd_peer_request_details.
+ */
+
+struct drbd_peer_request_details {
+ uint64_t sector; /* be64_to_cpu(p_data.sector) */
+ uint64_t block_id; /* unmodified p_data.block_id */
+ uint32_t peer_seq; /* be32_to_cpu(p_data.seq_num) */
+ uint32_t dp_flags; /* be32_to_cpu(p_data.dp_flags) */
+ uint32_t length; /* endian converted p_head*.length */
+ uint32_t bi_size; /* resulting bio size */
+ /* for non-discards: bi_size = length - digest_size */
+ uint32_t digest_size;
+};
+
+
+void drbd_queue_update_peers(struct drbd_peer_device *peer_device,
+ sector_t sector_start, sector_t sector_end);
+int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start,
+ unsigned int nr_sectors, int flags);
+int drbd_send_ack_be(struct drbd_peer_device *peer_device,
+ enum drbd_packet cmd, sector_t sector, int size,
+ u64 block_id);
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req);
+int drbd_send_ov_result(struct drbd_peer_device *peer_device, sector_t sector,
+ int blksize, u64 block_id, enum ov_result result);
+int drbd_receiver(struct drbd_thread *thi);
+void drbd_unsuccessful_resync_request(struct drbd_peer_request *peer_req,
+ bool failed);
+int drbd_send_out_of_sync_wf(struct drbd_work *w, int cancel);
+int drbd_flush_ack_wf(struct drbd_work *w, int unused);
+void drbd_send_ping_wf(struct work_struct *ws);
+void drbd_send_acks_wf(struct work_struct *ws);
+void drbd_send_peer_ack_wf(struct work_struct *ws);
+bool drbd_rs_c_min_rate_throttle(struct drbd_peer_device *peer_device);
+void drbd_verify_skipped_block(struct drbd_peer_device *peer_device,
+ const sector_t sector, const unsigned int size);
+void drbd_conflict_submit_resync_request(struct drbd_peer_request *peer_req);
+void drbd_conflict_submit_peer_read(struct drbd_peer_request *peer_req);
+void drbd_conflict_submit_peer_write(struct drbd_peer_request *peer_req);
+int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
+void drbd_cleanup_after_failed_submit_peer_write(struct drbd_peer_request *peer_req);
+void drbd_cleanup_peer_requests_wfa(struct drbd_device *device,
+ struct list_head *cleanup);
+void drbd_remove_peer_req_interval(struct drbd_peer_request *peer_req);
+int drbd_free_peer_reqs(struct drbd_connection *connection,
+ struct list_head *list);
+struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *peer_device, gfp_t gfp_mask,
+ size_t size, blk_opf_t opf);
+void drbd_free_peer_req(struct drbd_peer_request *peer_req);
+void drbd_peer_req_strip_bio(struct drbd_peer_request *peer_req);
+int drbd_connected(struct drbd_peer_device *peer_device);
+void conn_connect2(struct drbd_connection *connection);
+void wait_initial_states_received(struct drbd_connection *connection);
+void abort_connect(struct drbd_connection *connection);
+void drbd_print_cluster_wide_state_change(struct drbd_resource *resource,
+ const char *message,
+ unsigned int tid,
+ unsigned int initiator_node_id,
+ int target_node_id,
+ union drbd_state mask,
+ union drbd_state val);
+void apply_unacked_peer_requests(struct drbd_connection *connection);
+struct drbd_connection *drbd_connection_by_node_id(struct drbd_resource *resource,
+ int node_id);
+struct drbd_connection *drbd_get_connection_by_node_id(struct drbd_resource *resource,
+ int node_id);
+bool drbd_have_local_disk(struct drbd_resource *resource);
+enum drbd_state_rv drbd_support_2pc_resize(struct drbd_resource *resource);
+enum determine_dev_size
+drbd_commit_size_change(struct drbd_device *device, struct resize_parms *rs,
+ u64 nodes_to_reach);
+void drbd_try_to_get_resynced(struct drbd_device *device);
+void drbd_process_rs_discards(struct drbd_peer_device *peer_device,
+ bool submit_all);
+void drbd_last_resync_request(struct drbd_peer_device *peer_device,
+ bool submit_all);
+void drbd_init_connect_state(struct drbd_connection *connection);
+
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+ return bdev ? bdev_nr_sectors(bdev) : 0;
+}
/* sets the number of 512 byte sectors of our virtual device */
void drbd_set_my_capacity(struct drbd_device *device, sector_t size);
@@ -1488,207 +2449,108 @@ void drbd_set_my_capacity(struct drbd_device *device, sector_t size);
static inline void drbd_submit_bio_noacct(struct drbd_device *device,
int fault_type, struct bio *bio)
{
- __release(local);
- if (!bio->bi_bdev) {
- drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n");
+ if (drbd_insert_fault(device, fault_type)) {
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
- return;
- }
-
- if (drbd_insert_fault(device, fault_type))
- bio_io_error(bio);
- else
+ } else {
submit_bio_noacct(bio);
+ }
}
void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
enum write_ordering_e wo);
+void twopc_timer_fn(struct timer_list *t);
+void connect_timer_fn(struct timer_list *t);
+
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
int drbd_seq_show(struct seq_file *seq, void *v);
/* drbd_actlog.c */
-extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
-extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io_commit(struct drbd_device *device);
-extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
-extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
-extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
-extern int drbd_try_rs_begin_io(struct drbd_peer_device *peer_device, sector_t sector);
-extern void drbd_rs_cancel_all(struct drbd_device *device);
-extern int drbd_rs_del_all(struct drbd_device *device);
-extern void drbd_rs_failed_io(struct drbd_peer_device *peer_device,
- sector_t sector, int size);
-extern void drbd_advance_rs_marks(struct drbd_peer_device *peer_device, unsigned long still_to_go);
-
+bool drbd_al_try_lock(struct drbd_device *device);
+bool drbd_al_try_lock_for_transaction(struct drbd_device *device);
+int drbd_al_begin_io_nonblock(struct drbd_device *device,
+ struct drbd_interval *i);
+void drbd_al_begin_io_commit(struct drbd_device *device);
+bool drbd_al_begin_io_fastpath(struct drbd_device *device,
+ struct drbd_interval *i);
+bool drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+void drbd_advance_rs_marks(struct drbd_peer_device *peer_device,
+ unsigned long still_to_go);
+bool drbd_lazy_bitmap_update_due(struct drbd_peer_device *peer_device);
+unsigned long drbd_set_all_out_of_sync(struct drbd_device *device, sector_t sector,
+ int size);
+unsigned long drbd_set_sync(struct drbd_device *device, sector_t sector, int size,
+ unsigned long bits, unsigned long mask);
enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
-extern int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector, int size,
- enum update_sync_bits_mode mode);
+int __drbd_change_sync(struct drbd_peer_device *peer_device, sector_t sector,
+ int size, enum update_sync_bits_mode mode);
#define drbd_set_in_sync(peer_device, sector, size) \
__drbd_change_sync(peer_device, sector, size, SET_IN_SYNC)
#define drbd_set_out_of_sync(peer_device, sector, size) \
__drbd_change_sync(peer_device, sector, size, SET_OUT_OF_SYNC)
#define drbd_rs_failed_io(peer_device, sector, size) \
__drbd_change_sync(peer_device, sector, size, RECORD_RS_FAILED)
-extern void drbd_al_shrink(struct drbd_device *device);
-extern int drbd_al_initialize(struct drbd_device *, void *);
+void drbd_al_shrink(struct drbd_device *device);
+int drbd_al_initialize(struct drbd_device *device, void *buffer);
/* drbd_nl.c */
-/* state info broadcast */
-struct sib_info {
- enum drbd_state_info_bcast_reason sib_reason;
- union {
- struct {
- char *helper_name;
- unsigned helper_exit_code;
- };
- struct {
- union drbd_state os;
- union drbd_state ns;
- };
- };
-};
-void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
-
-extern int notify_resource_state(struct sk_buff *,
- unsigned int,
- struct drbd_resource *,
- struct resource_info *,
- enum drbd_notification_type);
-extern int notify_device_state(struct sk_buff *,
- unsigned int,
- struct drbd_device *,
- struct device_info *,
- enum drbd_notification_type);
-extern int notify_connection_state(struct sk_buff *,
- unsigned int,
- struct drbd_connection *,
- struct connection_info *,
- enum drbd_notification_type);
-extern int notify_peer_device_state(struct sk_buff *,
- unsigned int,
- struct drbd_peer_device *,
- struct peer_device_info *,
- enum drbd_notification_type);
-extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
- struct drbd_connection *, const char *, int);
+extern struct mutex notification_mutex;
+extern atomic_t drbd_genl_seq;
+
+int notify_resource_state(struct sk_buff *skb, unsigned int seq,
+ struct drbd_resource *resource,
+ struct resource_info *resource_info,
+ struct rename_resource_info *rename_resource_info,
+ enum drbd_notification_type type);
+int notify_device_state(struct sk_buff *skb, unsigned int seq,
+ struct drbd_device *device,
+ struct device_info *device_info,
+ enum drbd_notification_type type);
+int notify_connection_state(struct sk_buff *skb, unsigned int seq,
+ struct drbd_connection *connection,
+ struct connection_info *connection_info,
+ enum drbd_notification_type type);
+int notify_peer_device_state(struct sk_buff *skb, unsigned int seq,
+ struct drbd_peer_device *peer_device,
+ struct peer_device_info *peer_device_info,
+ enum drbd_notification_type type);
+void notify_helper(enum drbd_notification_type type,
+ struct drbd_device *device,
+ struct drbd_connection *connection, const char *name,
+ int status);
+int notify_path(struct drbd_connection *connection, struct drbd_path *path,
+ enum drbd_notification_type type);
+void drbd_broadcast_peer_device_state(struct drbd_peer_device *peer_device);
+
+sector_t drbd_local_max_size(struct drbd_device *device);
+int drbd_open_ro_count(struct drbd_resource *resource);
+
+void device_to_info(struct device_info *info, struct drbd_device *device);
+void device_state_change_to_info(struct device_info *info,
+ struct drbd_device_state_change *state_change);
+void peer_device_state_change_to_info(struct peer_device_info *info,
+ struct drbd_peer_device_state_change *state_change);
/*
* inline helper functions
*************************/
-/* see also page_chain_add and friends in drbd_receiver.c */
-static inline struct page *page_chain_next(struct page *page)
-{
- return (struct page *)page_private(page);
-}
-#define page_chain_for_each(page) \
- for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
- page = page_chain_next(page))
-#define page_chain_for_each_safe(page, n) \
- for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-
-
-static inline union drbd_state drbd_read_state(struct drbd_device *device)
-{
- struct drbd_resource *resource = device->resource;
- union drbd_state rv;
-
- rv.i = device->state.i;
- rv.susp = resource->susp;
- rv.susp_nod = resource->susp_nod;
- rv.susp_fen = resource->susp_fen;
-
- return rv;
-}
-
-enum drbd_force_detach_flags {
- DRBD_READ_ERROR,
- DRBD_WRITE_ERROR,
- DRBD_META_IO_ERROR,
- DRBD_FORCE_DETACH,
-};
-
-#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
-static inline void __drbd_chk_io_error_(struct drbd_device *device,
- enum drbd_force_detach_flags df,
- const char *where)
-{
- enum drbd_io_error_p ep;
-
- rcu_read_lock();
- ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
- rcu_read_unlock();
- switch (ep) {
- case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
- if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
- if (drbd_ratelimit())
- drbd_err(device, "Local IO failed in %s.\n", where);
- if (device->state.disk > D_INCONSISTENT)
- _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
- break;
- }
- fallthrough; /* for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
- case EP_DETACH:
- case EP_CALL_HELPER:
- /* Remember whether we saw a READ or WRITE error.
- *
- * Recovery of the affected area for WRITE failure is covered
- * by the activity log.
- * READ errors may fall outside that area though. Certain READ
- * errors can be "healed" by writing good data to the affected
- * blocks, which triggers block re-allocation in lower layers.
- *
- * If we can not write the bitmap after a READ error,
- * we may need to trigger a full sync (see w_go_diskless()).
- *
- * Force-detach is not really an IO error, but rather a
- * desperate measure to try to deal with a completely
- * unresponsive lower level IO stack.
- * Still it should be treated as a WRITE error.
- *
- * Meta IO error is always WRITE error:
- * we read meta data only once during attach,
- * which will fail in case of errors.
- */
- set_bit(WAS_IO_ERROR, &device->flags);
- if (df == DRBD_READ_ERROR)
- set_bit(WAS_READ_ERROR, &device->flags);
- if (df == DRBD_FORCE_DETACH)
- set_bit(FORCE_DETACH, &device->flags);
- if (device->state.disk > D_FAILED) {
- _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
- drbd_err(device,
- "Local IO failed in %s. Detaching...\n", where);
- }
- break;
- }
-}
-
-/**
- * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
- * @device: DRBD device.
- * @error: Error code passed to the IO completion callback
- * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
- *
- * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+/*
+ * When a device has a replication state above L_OFF, it must be
+ * connected. Otherwise, we report the connection state, which has values up
+ * to C_CONNECTED == L_OFF.
*/
-#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
-static inline void drbd_chk_io_error_(struct drbd_device *device,
- int error, enum drbd_force_detach_flags forcedetach, const char *where)
+static inline int combined_conn_state(struct drbd_peer_device *peer_device, enum which_state which)
{
-	if (error) {
-		unsigned long flags;
-		spin_lock_irqsave(&device->resource->req_lock, flags);
-		__drbd_chk_io_error_(device, forcedetach, where);
-		spin_unlock_irqrestore(&device->resource->req_lock, flags);
-	}
-}
+	enum drbd_repl_state repl_state = peer_device->repl_state[which];
+
+	/* A replication state above L_OFF implies an established connection;
+	 * otherwise the connection state (values up to C_CONNECTED == L_OFF)
+	 * is the more meaningful answer. */
+	if (repl_state > L_OFF)
+		return repl_state;
+	return peer_device->connection->cstate[which];
+}
/**
* drbd_md_first_sector() - Returns the first sector number of the meta data area
@@ -1718,54 +2580,13 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
switch (bdev->md.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
- return bdev->md.md_offset + MD_4kB_SECT -1;
+ return bdev->md.md_offset + (4096 >> 9) - 1;
case DRBD_MD_INDEX_FLEX_EXT:
default:
- return bdev->md.md_offset + bdev->md.md_size_sect -1;
+ return bdev->md.md_offset + bdev->md.md_size_sect - 1;
}
}
-/* Returns the number of 512 byte sectors of the device */
-static inline sector_t drbd_get_capacity(struct block_device *bdev)
-{
- return bdev ? bdev_nr_sectors(bdev) : 0;
-}
-
-/**
- * drbd_get_max_capacity() - Returns the capacity we announce to out peer
- * @bdev: Meta data block device.
- *
- * returns the capacity we announce to out peer. we clip ourselves at the
- * various MAX_SECTORS, because if we don't, current implementation will
- * oops sooner or later
- */
-static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
-{
- sector_t s;
-
- switch (bdev->md.meta_dev_idx) {
- case DRBD_MD_INDEX_INTERNAL:
- case DRBD_MD_INDEX_FLEX_INT:
- s = drbd_get_capacity(bdev->backing_bdev)
- ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- drbd_md_first_sector(bdev))
- : 0;
- break;
- case DRBD_MD_INDEX_FLEX_EXT:
- s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- drbd_get_capacity(bdev->backing_bdev));
- /* clip at maximum size the meta device can support */
- s = min_t(sector_t, s,
- BM_EXT_TO_SECT(bdev->md.md_size_sect
- - bdev->md.bm_offset));
- break;
- default:
- s = min_t(sector_t, DRBD_MAX_SECTORS,
- drbd_get_capacity(bdev->backing_bdev));
- }
- return s;
-}
-
/**
* drbd_md_ss() - Return the sector number of our meta data super block
* @bdev: Meta data block device.
@@ -1784,18 +2605,10 @@ static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
/* external, some index; this is the old fixed size layout */
- return MD_128MB_SECT * bdev->md.meta_dev_idx;
+ return (128 << 20 >> 9) * bdev->md.meta_dev_idx;
}
-static inline void
-drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
-{
- unsigned long flags;
- spin_lock_irqsave(&q->q_lock, flags);
- list_add_tail(&w->list, &q->q);
- spin_unlock_irqrestore(&q->q_lock, flags);
- wake_up(&q->q_wait);
-}
+void drbd_queue_work(struct drbd_work_queue *, struct drbd_work *);
static inline void
drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
@@ -1812,46 +2625,48 @@ static inline void
drbd_device_post_work(struct drbd_device *device, int work_bit)
{
if (!test_and_set_bit(work_bit, &device->flags)) {
- struct drbd_connection *connection =
- first_peer_device(device)->connection;
- struct drbd_work_queue *q = &connection->sender_work;
- if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+ struct drbd_resource *resource = device->resource;
+ struct drbd_work_queue *q = &resource->work;
+ if (!test_and_set_bit(DEVICE_WORK_PENDING, &resource->flags))
wake_up(&q->q_wait);
}
}
-extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
-
-/* To get the ack_receiver out of the blocking network stack,
- * so it can change its sk_rcvtimeo from idle- to ping-timeout,
- * and send a ping, we need to send a signal.
- * Which signal we send is irrelevant. */
-static inline void wake_ack_receiver(struct drbd_connection *connection)
-{
- struct task_struct *task = connection->ack_receiver.task;
- if (task && get_t_state(&connection->ack_receiver) == RUNNING)
- send_sig(SIGXCPU, task, 1);
-}
-
-static inline void request_ping(struct drbd_connection *connection)
+static inline void
+drbd_peer_device_post_work(struct drbd_peer_device *peer_device, int work_bit)
{
- set_bit(SEND_PING, &connection->flags);
- wake_ack_receiver(connection);
+ if (!test_and_set_bit(work_bit, &peer_device->flags)) {
+ struct drbd_resource *resource = peer_device->device->resource;
+ struct drbd_work_queue *q = &resource->work;
+ if (!test_and_set_bit(PEER_DEVICE_WORK_PENDING, &resource->flags))
+ wake_up(&q->q_wait);
+ }
}
-extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
-extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
-extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
- enum drbd_packet, unsigned int, void *,
- unsigned int);
-extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
- enum drbd_packet, unsigned int, void *,
- unsigned int);
-
-extern int drbd_send_ping(struct drbd_connection *connection);
-extern int drbd_send_ping_ack(struct drbd_connection *connection);
-extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
-extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
+void drbd_flush_workqueue_interruptible(struct drbd_device *device);
+
+void *__conn_prepare_command(struct drbd_connection *connection, int size,
+ enum drbd_stream drbd_stream);
+void *conn_prepare_command(struct drbd_connection *connection, int size,
+ enum drbd_stream drbd_stream);
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, int size,
+ enum drbd_stream drbd_stream);
+int __send_command(struct drbd_connection *connection, int vnr,
+ enum drbd_packet cmd, int stream_and_flags);
+int send_command(struct drbd_connection *connection, int vnr,
+ enum drbd_packet cmd, int stream_and_flags);
+int drbd_send_command(struct drbd_peer_device *peer_device,
+ enum drbd_packet cmd, enum drbd_stream drbd_stream);
+
+int drbd_send_ping(struct drbd_connection *connection);
+int conn_send_state_req(struct drbd_connection *connection, int vnr,
+ enum drbd_packet cmd, union drbd_state mask,
+ union drbd_state val);
+int conn_send_twopc_request(struct drbd_connection *connection,
+ struct twopc_request *request);
+int drbd_send_peer_ack(struct drbd_connection *connection, u64 mask,
+ u64 dagtag_sector);
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
@@ -1868,59 +2683,37 @@ static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
_drbd_thread_stop(thi, true, false);
}
-/* counts how many answer packets packets we expect from our peer,
- * for either explicit application requests,
- * or implicit barrier packets as necessary.
- * increased:
- * w_send_barrier
- * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
- * it is much easier and equally valid to count what we queue for the
- * worker, even before it actually was queued or send.
- * (drbd_make_request_common; recovery path on read io-error)
- * decreased:
- * got_BarrierAck (respective tl_clear, tl_clear_barrier)
- * _req_mod(req, DATA_RECEIVED)
- * [from receive_DataReply]
- * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
- * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
- * for some reason it is NOT decreased in got_NegAck,
- * but in the resulting cleanup code from report_params.
- * we should try to remember the reason for that...
- * _req_mod(req, SEND_FAILED or SEND_CANCELED)
- * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
- * [from tl_clear_barrier]
- */
-static inline void inc_ap_pending(struct drbd_device *device)
+static inline void inc_ap_pending(struct drbd_peer_device *peer_device)
{
- atomic_inc(&device->ap_pending_cnt);
+ atomic_inc(&peer_device->ap_pending_cnt);
}
-#define dec_ap_pending(device) ((void)expect((device), __dec_ap_pending(device) >= 0))
-static inline int __dec_ap_pending(struct drbd_device *device)
+#define dec_ap_pending(peer_device) \
+ ((void)expect((peer_device), __dec_ap_pending(peer_device) >= 0))
+static inline int __dec_ap_pending(struct drbd_peer_device *peer_device)
{
- int ap_pending_cnt = atomic_dec_return(&device->ap_pending_cnt);
-
+ int ap_pending_cnt = atomic_dec_return(&peer_device->ap_pending_cnt);
if (ap_pending_cnt == 0)
- wake_up(&device->misc_wait);
+ wake_up(&peer_device->device->misc_wait);
return ap_pending_cnt;
}
/* counts how many resync-related answers we still expect from the peer
* increase decrease
- * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
- * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
+ * L_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * L_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
* (or P_NEG_ACK with ID_SYNCER)
*/
static inline void inc_rs_pending(struct drbd_peer_device *peer_device)
{
- atomic_inc(&peer_device->device->rs_pending_cnt);
+ atomic_inc(&peer_device->rs_pending_cnt);
}
#define dec_rs_pending(peer_device) \
((void)expect((peer_device), __dec_rs_pending(peer_device) >= 0))
static inline int __dec_rs_pending(struct drbd_peer_device *peer_device)
{
- return atomic_dec_return(&peer_device->device->rs_pending_cnt);
+ return atomic_dec_return(&peer_device->rs_pending_cnt);
}
/* counts how many answers we still need to send to the peer.
@@ -1929,42 +2722,82 @@ static inline int __dec_rs_pending(struct drbd_peer_device *peer_device)
* we need to send a P_RECV_ACK (proto B)
* or P_WRITE_ACK (proto C)
* receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
- * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ * receive_data_request etc., we need to send back P_DATA
* receive_Barrier_* we need to send a P_BARRIER_ACK
*/
-static inline void inc_unacked(struct drbd_device *device)
+/* Account one more answer (ack/reply packet) we still owe this peer. */
+static inline void inc_unacked(struct drbd_peer_device *peer_device)
+{
+	atomic_inc(&peer_device->unacked_cnt);
+}
+
+/* Drop one owed answer; the dec_unacked() wrapper additionally asserts
+ * that the counter did not go negative. */
+#define dec_unacked(peer_device) \
+	((void)expect(peer_device, __dec_unacked(peer_device) >= 0))
+static inline int __dec_unacked(struct drbd_peer_device *peer_device)
+{
+	return atomic_dec_return(&peer_device->unacked_cnt);
+}
+
+static inline bool repl_is_sync_target(enum drbd_repl_state repl_state)
{
- atomic_inc(&device->unacked_cnt);
+ return repl_state == L_SYNC_TARGET || repl_state == L_PAUSED_SYNC_T;
}
-#define dec_unacked(device) ((void)expect(device, __dec_unacked(device) >= 0))
-static inline int __dec_unacked(struct drbd_device *device)
+static inline bool repl_is_sync_source(enum drbd_repl_state repl_state)
{
- return atomic_dec_return(&device->unacked_cnt);
+ return repl_state == L_SYNC_SOURCE || repl_state == L_PAUSED_SYNC_S;
}
-#define sub_unacked(device, n) ((void)expect(device, __sub_unacked(device) >= 0))
-static inline int __sub_unacked(struct drbd_device *device, int n)
+static inline bool repl_is_sync(enum drbd_repl_state repl_state)
{
- return atomic_sub_return(n, &device->unacked_cnt);
+ return repl_is_sync_source(repl_state) ||
+ repl_is_sync_target(repl_state);
}
-static inline bool is_sync_target_state(enum drbd_conns connection_state)
+static inline bool is_sync_target_state(struct drbd_peer_device *peer_device,
+ enum which_state which)
{
- return connection_state == C_SYNC_TARGET ||
- connection_state == C_PAUSED_SYNC_T;
+ return repl_is_sync_target(peer_device->repl_state[which]);
}
-static inline bool is_sync_source_state(enum drbd_conns connection_state)
+static inline bool is_sync_source_state(struct drbd_peer_device *peer_device,
+ enum which_state which)
{
- return connection_state == C_SYNC_SOURCE ||
- connection_state == C_PAUSED_SYNC_S;
+ return repl_is_sync_source(peer_device->repl_state[which]);
}
-static inline bool is_sync_state(enum drbd_conns connection_state)
+static inline bool is_sync_state(struct drbd_peer_device *peer_device,
+ enum which_state which)
{
- return is_sync_source_state(connection_state) ||
- is_sync_target_state(connection_state);
+ return repl_is_sync(peer_device->repl_state[which]);
+}
+
+/* Is an online verify running towards this peer, in either direction? */
+static inline bool is_verify_state(struct drbd_peer_device *peer_device,
+				   enum which_state which)
+{
+	switch (peer_device->repl_state[which]) {
+	case L_VERIFY_S:
+	case L_VERIFY_T:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * resync_susp_comb_dep() - is resync towards this peer suspended by a dependency?
+ *
+ * True when resync is suspended because it depends on another resync
+ * (resync_susp_dependency), because of another connection
+ * (resync_susp_other_c), or because we are nominally sync source while our
+ * own disk (<= D_INCONSISTENT) cannot serve as one.
+ */
+static inline bool resync_susp_comb_dep(struct drbd_peer_device *peer_device, enum which_state which)
+{
+	struct drbd_device *device = peer_device->device;
+
+	return peer_device->resync_susp_dependency[which] || peer_device->resync_susp_other_c[which] ||
+		(is_sync_source_state(peer_device, which) && device->disk_state[which] <= D_INCONSISTENT);
+}
+
+/*
+ * drbd_insert_fault_conn() - fault-injection decision for a connection
+ *
+ * Uses the first device of the connection's resource as the context for
+ * _drbd_insert_fault().  Returns non-zero when a fault of @type should be
+ * injected; always 0 when CONFIG_DRBD_FAULT_INJECTION is disabled.
+ */
+static inline int
+drbd_insert_fault_conn(struct drbd_connection *connection, unsigned int type)
+{
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+	int id = 0;
+	struct drbd_device *device = idr_get_next(&connection->resource->devices, &id);
+
+	return device && drbd_fault_rate &&
+		(drbd_enable_faults & (1 << type)) &&
+		_drbd_insert_fault(device, type);
+#else
+	return 0;
+#endif
+}
/**
@@ -1974,14 +2807,11 @@ static inline bool is_sync_state(enum drbd_conns connection_state)
*
* You have to call put_ldev() when finished working with device->ldev.
*/
-#define get_ldev_if_state(_device, _min_state) \
- (_get_ldev_if_state((_device), (_min_state)) ? \
- ({ __acquire(x); true; }) : false)
#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT)
static inline void put_ldev(struct drbd_device *device)
{
- enum drbd_disk_state disk_state = device->state.disk;
+ enum drbd_disk_state disk_state = device->disk_state[NOW];
/* We must check the state *before* the atomic_dec becomes visible,
* or we have a theoretical race where someone hitting zero,
* while state still D_FAILED, will then see D_DISKLESS in the
@@ -1991,13 +2821,14 @@ static inline void put_ldev(struct drbd_device *device)
/* This may be called from some endio handler,
* so we must not sleep here. */
- __release(local);
D_ASSERT(device, i >= 0);
if (i == 0) {
- if (disk_state == D_DISKLESS)
+ if (disk_state == D_DISKLESS) {
/* even internal references gone, safe to destroy */
- drbd_device_post_work(device, DESTROY_DISK);
- if (disk_state == D_FAILED)
+ kref_get(&device->kref);
+ schedule_work(&device->ldev_destroy_work);
+ }
+ if (disk_state == D_FAILED || disk_state == D_DETACHING)
/* all application IO references gone. */
if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
drbd_device_post_work(device, GO_DISKLESS);
@@ -2005,122 +2836,53 @@ static inline void put_ldev(struct drbd_device *device)
}
}
-#ifndef __CHECKER__
-static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+static inline int get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
{
int io_allowed;
/* never get a reference while D_DISKLESS */
- if (device->state.disk == D_DISKLESS)
+ if (device->disk_state[NOW] == D_DISKLESS)
return 0;
atomic_inc(&device->local_cnt);
- io_allowed = (device->state.disk >= mins);
+ io_allowed = (device->disk_state[NOW] >= mins);
if (!io_allowed)
put_ldev(device);
return io_allowed;
}
-#else
-extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
-#endif
-/* this throttles on-the-fly application requests
- * according to max_buffers settings;
- * maybe re-implement using semaphores? */
-static inline int drbd_get_max_buffers(struct drbd_device *device)
-{
- struct net_conf *nc;
- int mxb;
+void drbd_queue_pending_bitmap_work(struct drbd_device *device);
- rcu_read_lock();
- nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
- mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
- rcu_read_unlock();
-
- return mxb;
-}
-
-static inline int drbd_state_is_stable(struct drbd_device *device)
+/* rw = READ or WRITE (0 or 1); nothing else. */
+static inline void dec_ap_bio(struct drbd_device *device, int rw)
{
- union drbd_dev_state s = device->state;
-
- /* DO NOT add a default clause, we want the compiler to warn us
- * for any newly introduced state we may have forgotten to add here */
-
- switch ((enum drbd_conns)s.conn) {
- /* new io only accepted when there is no connection, ... */
- case C_STANDALONE:
- case C_WF_CONNECTION:
- /* ... or there is a well established connection. */
- case C_CONNECTED:
- case C_SYNC_SOURCE:
- case C_SYNC_TARGET:
- case C_VERIFY_S:
- case C_VERIFY_T:
- case C_PAUSED_SYNC_S:
- case C_PAUSED_SYNC_T:
- case C_AHEAD:
- case C_BEHIND:
- /* transitional states, IO allowed */
- case C_DISCONNECTING:
- case C_UNCONNECTED:
- case C_TIMEOUT:
- case C_BROKEN_PIPE:
- case C_NETWORK_FAILURE:
- case C_PROTOCOL_ERROR:
- case C_TEAR_DOWN:
- case C_WF_REPORT_PARAMS:
- case C_STARTING_SYNC_S:
- case C_STARTING_SYNC_T:
- break;
-
- /* Allow IO in BM exchange states with new protocols */
- case C_WF_BITMAP_S:
- if (first_peer_device(device)->connection->agreed_pro_version < 96)
- return 0;
- break;
+ unsigned int nr_requests = device->resource->res_opts.nr_requests;
+ int ap_bio = atomic_dec_return(&device->ap_bio_cnt[rw]);
- /* no new io accepted in these states */
- case C_WF_BITMAP_T:
- case C_WF_SYNC_UUID:
- case C_MASK:
- /* not "stable" */
- return 0;
- }
-
- switch ((enum drbd_disk_state)s.disk) {
- case D_DISKLESS:
- case D_INCONSISTENT:
- case D_OUTDATED:
- case D_CONSISTENT:
- case D_UP_TO_DATE:
- case D_FAILED:
- /* disk state is stable as well. */
- break;
+ D_ASSERT(device, ap_bio >= 0);
- /* no new io accepted during transitional states */
- case D_ATTACHING:
- case D_NEGOTIATING:
- case D_UNKNOWN:
- case D_MASK:
- /* not "stable" */
- return 0;
- }
+	/* Checking list_empty outside the lock is ok. Worst case it queues
+ * nothing because someone else just now did. During list_add, a
+ * refcount on ap_bio_cnt[WRITE] is held, so the bitmap work will be
+ * queued when that is released if we miss it here.
+ * Checking pending_bitmap_work.n is not correct,
+ * it has a different lifetime. */
+ if (ap_bio == 0 && rw == WRITE && !list_empty(&device->pending_bitmap_work.q))
+ drbd_queue_pending_bitmap_work(device);
- return 1;
+ if (ap_bio == 0 || ap_bio == nr_requests-1)
+ wake_up(&device->misc_wait);
}
-static inline int drbd_suspended(struct drbd_device *device)
+static inline bool drbd_suspended(struct drbd_device *device)
{
- struct drbd_resource *resource = device->resource;
-
- return resource->susp || resource->susp_fen || resource->susp_nod;
+ return device->resource->cached_susp;
}
static inline bool may_inc_ap_bio(struct drbd_device *device)
{
- int mxb = drbd_get_max_buffers(device);
-
+ if (device->cached_err_io)
+ return true;
if (drbd_suspended(device))
return false;
if (atomic_read(&device->suspend_cnt))
@@ -2131,76 +2893,45 @@ static inline bool may_inc_ap_bio(struct drbd_device *device)
* to start during "stable" states. */
/* no new io accepted when attaching or detaching the disk */
- if (!drbd_state_is_stable(device))
+ if (device->cached_state_unstable)
return false;
- /* since some older kernels don't have atomic_add_unless,
- * and we are within the spinlock anyways, we have this workaround. */
- if (atomic_read(&device->ap_bio_cnt) > mxb)
- return false;
- if (test_bit(BITMAP_IO, &device->flags))
+ if (atomic_read(&device->pending_bitmap_work.n))
return false;
return true;
}
-static inline bool inc_ap_bio_cond(struct drbd_device *device)
+static inline u64 drbd_current_uuid(struct drbd_device *device)
{
- bool rv = false;
-
- spin_lock_irq(&device->resource->req_lock);
- rv = may_inc_ap_bio(device);
- if (rv)
- atomic_inc(&device->ap_bio_cnt);
- spin_unlock_irq(&device->resource->req_lock);
-
- return rv;
+ if (!device->ldev)
+ return 0;
+ return device->ldev->md.current_uuid;
}
-static inline void inc_ap_bio(struct drbd_device *device)
+static inline bool verify_can_do_stop_sector(struct drbd_peer_device *peer_device)
{
- /* we wait here
- * as long as the device is suspended
- * until the bitmap is no longer on the fly during connection
- * handshake as long as we would exceed the max_buffer limit.
- *
- * to avoid races with the reconnect code,
- * we need to atomic_inc within the spinlock. */
-
- wait_event(device->misc_wait, inc_ap_bio_cond(device));
+ return peer_device->connection->agreed_pro_version >= 97 &&
+ peer_device->connection->agreed_pro_version != 100;
}
-static inline void dec_ap_bio(struct drbd_device *device)
+static inline u64 drbd_bitmap_uuid(struct drbd_peer_device *peer_device)
{
- int mxb = drbd_get_max_buffers(device);
- int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
-
- D_ASSERT(device, ap_bio >= 0);
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_md *peer_md;
- if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
- if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
- drbd_queue_work(&first_peer_device(device)->
- connection->sender_work,
- &device->bm_io_work.w);
- }
+ if (!device->ldev)
+ return 0;
- /* this currently does wake_up for every dec_ap_bio!
- * maybe rather introduce some type of hysteresis?
- * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
- if (ap_bio < mxb)
- wake_up(&device->misc_wait);
+ peer_md = &device->ldev->md.peers[peer_device->node_id];
+ return peer_md->bitmap_uuid;
}
-static inline bool verify_can_do_stop_sector(struct drbd_device *device)
+static inline u64 drbd_history_uuid(struct drbd_device *device, int i)
{
- return first_peer_device(device)->connection->agreed_pro_version >= 97 &&
- first_peer_device(device)->connection->agreed_pro_version != 100;
-}
+ if (!device->ldev || i >= ARRAY_SIZE(device->ldev->md.history_uuids))
+ return 0;
-static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
-{
- int changed = device->ed_uuid != val;
- device->ed_uuid = val;
- return changed;
+ return device->ldev->md.history_uuids[i];
}
static inline int drbd_queue_order_type(struct drbd_device *device)
@@ -2219,4 +2950,215 @@ static inline struct drbd_connection *first_connection(struct drbd_resource *res
struct drbd_connection, connections);
}
+static inline struct net *drbd_net_assigned_to_connection(struct drbd_connection *connection)
+{
+ struct drbd_path *path;
+ struct net *net;
+
+ rcu_read_lock();
+ path = list_first_or_null_rcu(&connection->transport.paths, struct drbd_path, list);
+ net = path ? path->net : NULL;
+ rcu_read_unlock();
+
+ return net;
+}
+
+#define NODE_MASK(id) ((u64)1 << (id))
+
+static inline void drbd_list_del_resync_request(struct drbd_peer_request *peer_req)
+{
+ peer_req->flags &= ~EE_ON_RECV_ORDER;
+ list_del(&peer_req->recv_order);
+
+ if (peer_req == peer_req->peer_device->received_last)
+ peer_req->peer_device->received_last = NULL;
+
+ if (peer_req == peer_req->peer_device->discard_last)
+ peer_req->peer_device->discard_last = NULL;
+}
+
+/*
+ * drbd_interval_same_peer - determine whether "interval" is for the same peer as "i"
+ *
+ * "i" must be an interval corresponding to a drbd_peer_request.
+ */
+static inline bool drbd_interval_same_peer(struct drbd_interval *interval, struct drbd_interval *i)
+{
+ struct drbd_peer_request *interval_peer_req, *i_peer_req;
+
+ /* Ensure we only call "container_of" if it is actually a peer request. */
+ if (interval->type == INTERVAL_LOCAL_WRITE ||
+ interval->type == INTERVAL_LOCAL_READ ||
+ interval->type == INTERVAL_PEERS_IN_SYNC_LOCK)
+ return false;
+
+ interval_peer_req = container_of(interval, struct drbd_peer_request, i);
+ i_peer_req = container_of(i, struct drbd_peer_request, i);
+ return interval_peer_req->peer_device == i_peer_req->peer_device;
+}
+
+/*
+ * drbd_should_defer_to_resync - determine whether "interval" should defer to
+ * "i" in order to ensure that resync makes progress
+ */
+static inline bool drbd_should_defer_to_resync(struct drbd_interval *interval, struct drbd_interval *i)
+{
+ if (!drbd_interval_is_resync(i))
+ return false;
+
+ /* Always defer to resync requests once the reply has been received.
+ * These just need to wait for conflicting local I/O to complete. This
+ * is necessary to ensure that resync replies received before
+ * application writes are submitted first, so that the resync writes do
+ * not overwrite newer data. */
+ if (test_bit(INTERVAL_RECEIVED, &i->flags))
+ return true;
+
+ /* If we are still waiting for a reply from the peer, only defer to the
+ * request if it is towards a different peer. The exclusivity between
+ * resync requests and application writes from another peer is
+ * necessary to avoid overwriting newer data with older in the resync.
+ * When the data in both cases is coming from the same peer, this is
+ * not necessary. The peer ensures that the data stream is correctly
+ * ordered. */
+ return !drbd_interval_same_peer(interval, i);
+}
+
+/*
+ * drbd_should_defer_to_interval - determine whether "interval" should defer to "i"
+ */
+static inline bool drbd_should_defer_to_interval(struct drbd_interval *interval,
+ struct drbd_interval *i, bool defer_to_resync)
+{
+ if (test_bit(INTERVAL_SUBMITTED, &i->flags))
+ return true;
+
+ if (defer_to_resync && drbd_should_defer_to_resync(interval, i))
+ return true;
+
+ /*
+ * We do not send conflicting resync requests because that causes
+ * difficulties associating the replies to the requests.
+ */
+ if (interval->type == INTERVAL_RESYNC_WRITE &&
+ i->type == INTERVAL_RESYNC_WRITE &&
+ test_bit(INTERVAL_READY_TO_SEND, &i->flags))
+ return true;
+
+ return false;
+}
+
+/* Find conflicts at application level instead of at disk level. */
+#define CONFLICT_FLAG_APPLICATION_ONLY (1 << 0)
+
+/*
+ * Ignore peer writes from the peer that this request relates to. This is only
+ * used for determining whether to send a request. It must not be used for
+ * determining whether to submit a request, because that would allow concurrent
+ * writes to the backing disk.
+ */
+#define CONFLICT_FLAG_IGNORE_SAME_PEER (1 << 1)
+
+/*
+ * drbd_find_conflict - find conflicting interval, if any
+ */
+static inline struct drbd_interval *drbd_find_conflict(struct drbd_device *device,
+ struct drbd_interval *interval, unsigned long flags)
+{
+ struct drbd_interval *i;
+ sector_t sector = interval->sector;
+ int size = interval->size;
+ bool application_only = flags & CONFLICT_FLAG_APPLICATION_ONLY;
+ bool defer_to_resync =
+ (interval->type == INTERVAL_LOCAL_WRITE || interval->type == INTERVAL_PEER_WRITE) &&
+ !application_only;
+ bool exclusive_until_completed = interval->type == INTERVAL_LOCAL_WRITE || application_only;
+ bool ignore_same_peer = flags & CONFLICT_FLAG_IGNORE_SAME_PEER;
+
+ lockdep_assert_held(&device->interval_lock);
+
+ drbd_for_each_overlap(i, &device->requests, sector, size) {
+ /* Ignore the interval itself. */
+ if (i == interval)
+ continue;
+
+ if (exclusive_until_completed) {
+ /* Ignore, if already completed to upper layers. */
+ if (test_bit(INTERVAL_COMPLETED, &i->flags))
+ continue;
+ } else {
+ /* Ignore, if already completed by the backing disk. */
+ if (test_bit(INTERVAL_BACKING_COMPLETED, &i->flags))
+ continue;
+ }
+
+ /* Ignore, if there is no need to defer to it. */
+ if (!drbd_should_defer_to_interval(interval, i, defer_to_resync))
+ continue;
+
+ /*
+ * Ignore peer writes from the peer that this request relates
+ * to, if requested.
+ */
+ if (ignore_same_peer && i->type == INTERVAL_PEER_WRITE && drbd_interval_same_peer(interval, i))
+ continue;
+
+ if (unlikely(application_only)) {
+ /* Ignore, if not an application request. */
+ if (!drbd_interval_is_application(i))
+ continue;
+ }
+
+ if (drbd_interval_is_write(interval)) {
+ /*
+ * Mark verify requests as conflicting rather than
+ * treating them as conflicts for us.
+ */
+ if (drbd_interval_is_verify(i)) {
+ set_bit(INTERVAL_CONFLICT, &i->flags);
+ continue;
+ }
+ } else {
+ /* Ignore other resync reads. */
+ if (i->type == INTERVAL_RESYNC_READ)
+ continue;
+
+ /* Ignore verify requests, since they are always reads. */
+ if (drbd_interval_is_verify(i))
+ continue;
+
+ /* Ignore peers-in-sync intervals, since they are always reads. */
+ if (i->type == INTERVAL_PEERS_IN_SYNC_LOCK)
+ continue;
+ }
+
+ dynamic_drbd_dbg(device,
+ "%s at %llus+%u conflicts with %s at %llus+%u\n",
+ drbd_interval_type_str(interval),
+ (unsigned long long) sector, size,
+ drbd_interval_type_str(i),
+ (unsigned long long) i->sector, i->size);
+
+ break;
+ }
+
+ return i;
+}
+
+#ifdef CONFIG_DRBD_TIMING_STATS
+#define ktime_aggregate_delta(D, ST, M) (D->M = ktime_add(D->M, ktime_sub(ktime_get(), ST)))
+#define ktime_aggregate(D, R, M) (D->M = ktime_add(D->M, ktime_sub(R->M, R->start_kt)))
+#define ktime_aggregate_pd(P, N, R, M) (P->M = ktime_add(P->M, ktime_sub(R->M[N], R->start_kt)))
+#define ktime_get_accounting(V) (V = ktime_get())
+#define ktime_get_accounting_assign(V, T) (V = T)
+#define ktime_var_for_accounting(V) ktime_t V = ktime_get()
+#else
+#define ktime_aggregate_delta(D, ST, M)
+#define ktime_aggregate(D, R, M)
+#define ktime_aggregate_pd(P, N, R, M)
+#define ktime_get_accounting(V)
+#define ktime_get_accounting_assign(V, T)
+#define ktime_var_for_accounting(V)
+#endif
+
#endif
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index 5d3213b81eed..a6ef04f89885 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -5,20 +5,149 @@
#include <linux/types.h>
#include <linux/rbtree.h>
+/* Interval types stored directly in drbd_interval so that we can handle
+ * conflicts without having to inspect the containing object. The value 0 is
+ * reserved for uninitialized intervals. */
+enum drbd_interval_type {
+ INTERVAL_LOCAL_WRITE = 1,
+ INTERVAL_PEER_WRITE,
+ INTERVAL_LOCAL_READ,
+ INTERVAL_PEER_READ,
+ INTERVAL_RESYNC_WRITE, /* L_SYNC_TARGET */
+ INTERVAL_RESYNC_READ, /* L_SYNC_SOURCE */
+ INTERVAL_OV_READ_SOURCE, /* L_VERIFY_S */
+ INTERVAL_OV_READ_TARGET, /* L_VERIFY_T */
+ INTERVAL_PEERS_IN_SYNC_LOCK,
+};
+
+#define INTERVAL_TYPE_MASK(type) (1 << (type))
+
+enum drbd_interval_flags {
+ /* Whether this peer request may be sent. */
+ INTERVAL_READY_TO_SEND,
+
+ /*
+ * Used for resync reads. This flag is set after sending and is used to
+ * manage the lifetime of the request. When INTERVAL_SENT is not set,
+ * the sending path still has a reference to the request.
+ */
+ INTERVAL_SENT,
+
+ /*
+ * Whether this peer request has been received yet.
+ *
+ * For resync reads, this flag is set when the corresponding ack has
+ * been received and is used to manage the lifetime of the request.
+ * When INTERVAL_RECEIVED is not set, the receiving path has a
+ * reference to the request. This reference counting is protected by
+ * peer_reqs_lock.
+ */
+ INTERVAL_RECEIVED,
+
+ /* Whether this has been queued after conflict. */
+ INTERVAL_SUBMIT_CONFLICT_QUEUED,
+
+ /* Whether this has been submitted already. */
+ INTERVAL_SUBMITTED,
+
+ /* Whether the local backing device bio is complete. */
+ INTERVAL_BACKING_COMPLETED,
+
+ /* This has been completed already; ignore for conflict detection. */
+ INTERVAL_COMPLETED,
+
+ /* For verify requests: whether this has conflicts. */
+ INTERVAL_CONFLICT,
+
+ /* For resync requests: whether this was canceled while waiting for conflict resolution. */
+ INTERVAL_CANCELED,
+
+ /*
+ * For local requests: whether this is done.
+ *
+ * Included here instead of in local_rq_state to allow access with
+ * atomic bit operations instead of taking rq_lock.
+ */
+ INTERVAL_DONE,
+
+ /*
+ * For local requests: when we put the AL extent for this request, it
+ * was the last in that extent.
+ *
+ * Included here instead of in local_rq_state to allow access with
+ * atomic bit operations instead of taking rq_lock.
+ */
+ INTERVAL_AL_EXTENT_LAST,
+};
+
+/* Intervals used to manage conflicts between application requests and various
+ * internal requests, so that the disk content is deterministic.
+ *
+ * The requests progress through states indicated by successively setting the
+ * flags "INTERVAL_SUBMITTED", "INTERVAL_BACKING_COMPLETED" and
+ * "INTERVAL_COMPLETED".
+ *
+ * Application and resync requests wait to be submitted until any conflicts
+ * that are "INTERVAL_SUBMITTED" have reached "INTERVAL_BACKING_COMPLETED"
+ * state. Application requests also wait for conflicting application requests
+ * to ensure consistency between the replicated copies. In addition,
+ * application requests wait for resync requests that have not yet been
+ * submitted. Resync takes priority over application writes in this way because
+ * a resync locks each block at most once, so it will finish at some point,
+ * whereas the application may repeatedly write the same blocks, which would
+ * potentially lock out resync indefinitely.
+ *
+ * Resync read requests do not conflict with each other, but they are
+ * nevertheless mutually exclusive with writes, so that the bitmap can be
+ * updated reliably.
+ *
+ * Verify requests do not wait for other requests. If there are conflicts, they
+ * are simply canceled. Furthermore, they do not lock out other requests;
+ * instead they are simply marked as having conflicts and ignored.
+ *
+ * Application write request intervals are retained even when they are
+ * "INTERVAL_COMPLETED", so that they can be used to look up remote replies
+ * that are still pending.
+ */
struct drbd_interval {
struct rb_node rb;
sector_t sector; /* start sector of the interval */
sector_t end; /* highest interval end in subtree */
unsigned int size; /* size in bytes */
- unsigned int local:1 /* local or remote request? */;
- unsigned int waiting:1; /* someone is waiting for completion */
- unsigned int completed:1; /* this has been completed already;
- * ignore for conflict detection */
+ enum drbd_interval_type type; /* what type of interval this is */
+ unsigned long flags;
/* to resume a partially successful drbd_al_begin_io_nonblock(); */
unsigned int partially_in_al_next_enr;
};
+static inline bool drbd_interval_is_application(struct drbd_interval *i)
+{
+ return i->type == INTERVAL_LOCAL_WRITE || i->type == INTERVAL_PEER_WRITE ||
+ i->type == INTERVAL_LOCAL_READ || i->type == INTERVAL_PEER_READ;
+}
+
+static inline bool drbd_interval_is_write(struct drbd_interval *i)
+{
+ return i->type == INTERVAL_LOCAL_WRITE || i->type == INTERVAL_PEER_WRITE ||
+ i->type == INTERVAL_RESYNC_WRITE;
+}
+
+static inline bool drbd_interval_is_resync(struct drbd_interval *i)
+{
+ return i->type == INTERVAL_RESYNC_WRITE || i->type == INTERVAL_RESYNC_READ;
+}
+
+static inline bool drbd_interval_is_verify(struct drbd_interval *i)
+{
+ return i->type == INTERVAL_OV_READ_SOURCE || i->type == INTERVAL_OV_READ_TARGET;
+}
+
+static inline bool drbd_interval_is_local(struct drbd_interval *i)
+{
+ return i->type == INTERVAL_LOCAL_READ || i->type == INTERVAL_LOCAL_WRITE;
+}
+
static inline void drbd_clear_interval(struct drbd_interval *i)
{
RB_CLEAR_NODE(&i->rb);
@@ -29,14 +158,17 @@ static inline bool drbd_interval_empty(struct drbd_interval *i)
return RB_EMPTY_NODE(&i->rb);
}
-extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
-extern bool drbd_contains_interval(struct rb_root *, sector_t,
- struct drbd_interval *);
-extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
-extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
- unsigned int);
-extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
- unsigned int);
+const char *drbd_interval_type_str(struct drbd_interval *i);
+bool drbd_insert_interval(struct rb_root *root, struct drbd_interval *this);
+bool drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval);
+void drbd_remove_interval(struct rb_root *root, struct drbd_interval *this);
+struct drbd_interval *drbd_find_overlap(struct rb_root *root, sector_t sector,
+ unsigned int size);
+struct drbd_interval *drbd_next_overlap(struct drbd_interval *i,
+ sector_t sector, unsigned int size);
+void drbd_update_interval_size(struct drbd_interval *this,
+ unsigned int new_size);
#define drbd_for_each_overlap(i, root, sector, size) \
for (i = drbd_find_overlap(root, sector, size); \
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index e201f0087a0f..463f57d33204 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -73,7 +73,7 @@ int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
-#include <linux/drbd_genl_api.h>
+#include "drbd_genl_api.h"
#include "drbd_nla.h"
#include <linux/genl_magic_func.h>
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
index df0d241d3f6a..2dd6dc99823a 100644
--- a/drivers/block/drbd/drbd_nla.c
+++ b/drivers/block/drbd/drbd_nla.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <net/netlink.h>
-#include <linux/drbd_genl_api.h>
+#include "drbd_genl_api.h"
#include "drbd_nla.h"
static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
index d3555df0d353..4463657c020d 100644
--- a/drivers/block/drbd/drbd_nla.h
+++ b/drivers/block/drbd/drbd_nla.h
@@ -2,8 +2,9 @@
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
-extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
- const struct nla_policy *policy);
-extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype,
+ struct nlattr *nla, const struct nla_policy *policy);
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla,
+ int attrtype);
#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_polymorph_printk.h b/drivers/block/drbd/drbd_polymorph_printk.h
index 8e0082d139ba..7b0873d2980e 100644
--- a/drivers/block/drbd/drbd_polymorph_printk.h
+++ b/drivers/block/drbd/drbd_polymorph_printk.h
@@ -11,104 +11,188 @@
#define DYNAMIC_DEBUG_BRANCH(D) false
#endif
+#define __drbd_printk(level, fmt, args...) \
+ printk(level fmt, ## args)
+#define __drbd_dyn_dbg(descriptor, fmt, args...) \
+ __dynamic_pr_debug(descriptor, fmt, ## args)
+
+#define ___drbd_printk_device(prmacro, rlt, device, lvl_or_desc, fmt, args...)\
+({ \
+ const struct drbd_device *__d = \
+ (const struct drbd_device *)(device); \
+ const struct drbd_resource *__r = __d->resource; \
+ const char *__unregistered = ""; \
+ if (test_bit(UNREGISTERED, &__d->flags)) \
+ __unregistered = "/unregistered/"; \
+ if (drbd_device_ratelimit(__d, rlt)) \
+ prmacro(lvl_or_desc, "drbd %s%s/%u drbd%u: " fmt, \
+ __unregistered, __r->name, __d->vnr, __d->minor,\
+ ## args); \
+})
+
+#define ___drbd_printk_resource(prmacro, rlt, resource, lvl_or_desc, fmt, args...)\
+({ \
+ const struct drbd_resource *__r = \
+ (const struct drbd_resource *)(resource); \
+ const char *__unregistered = ""; \
+ if (test_bit(R_UNREGISTERED, &__r->flags)) \
+ __unregistered = "/unregistered/"; \
+ if (drbd_resource_ratelimit(__r, rlt)) \
+ prmacro(lvl_or_desc, "drbd %s%s: " fmt, \
+ __unregistered, __r->name, ## args); \
+})
+
+// As long as the connection is still "registered", the resource
+// can not yet be "unregistered", no need to test R_UNREGISTERED
+#define ___drbd_printk_peer_device(prmacro, rlt, peer_device, lvl_or_desc, fmt, args...)\
+({ \
+ const struct drbd_peer_device *__pd; \
+ const struct drbd_device *__d; \
+ const struct drbd_connection *__c; \
+ const struct drbd_resource *__r; \
+ const char *__cn; \
+ const char *__unregistered = ""; \
+ rcu_read_lock(); \
+ __pd = (const struct drbd_peer_device *)(peer_device); \
+ __d = __pd->device; \
+ __c = __pd->connection; \
+ __r = __d->resource; \
+ __cn = rcu_dereference(__c->transport.net_conf)->name; \
+ if (test_bit(C_UNREGISTERED, &__c->flags)) \
+ __unregistered = "/unregistered/"; \
+ if (drbd_peer_device_ratelimit(__pd, rlt)) \
+ prmacro(lvl_or_desc, "drbd %s%s/%u drbd%u %s: " fmt, \
+ __unregistered, __r->name, __d->vnr, __d->minor, __cn, \
+ ## args); \
+ rcu_read_unlock(); \
+})
+
+#define ___drbd_printk_connection(prmacro, rlt, connection, lvl_or_desc, fmt, args...) \
+({ \
+ const struct drbd_connection *__c = \
+ (const struct drbd_connection *)(connection); \
+ const struct drbd_resource *__r = __c->resource; \
+ const char *__cn; \
+ const char *__unregistered = ""; \
+ rcu_read_lock(); \
+ __cn = rcu_dereference(__c->transport.net_conf)->name; \
+ if (test_bit(C_UNREGISTERED, &__c->flags)) \
+ __unregistered = "/unregistered/"; \
+ if (drbd_connection_ratelimit(__c, rlt)) \
+ prmacro(lvl_or_desc, "drbd %s%s %s: " fmt, \
+ __unregistered, __r->name, __cn, ## args); \
+ rcu_read_unlock(); \
+})
-#define __drbd_printk_drbd_device_prep(device) \
- const struct drbd_device *__d = (device); \
- const struct drbd_resource *__r = __d->resource
-#define __drbd_printk_drbd_device_fmt(fmt) "drbd %s/%u drbd%u: " fmt
-#define __drbd_printk_drbd_device_args() __r->name, __d->vnr, __d->minor
-#define __drbd_printk_drbd_device_unprep()
-
-#define __drbd_printk_drbd_peer_device_prep(peer_device) \
- const struct drbd_device *__d; \
- const struct drbd_resource *__r; \
- __d = (peer_device)->device; \
- __r = __d->resource
-#define __drbd_printk_drbd_peer_device_fmt(fmt) \
- "drbd %s/%u drbd%u: " fmt
-#define __drbd_printk_drbd_peer_device_args() \
- __r->name, __d->vnr, __d->minor
-#define __drbd_printk_drbd_peer_device_unprep()
-
-#define __drbd_printk_drbd_resource_prep(resource) \
- const struct drbd_resource *__r = resource
-#define __drbd_printk_drbd_resource_fmt(fmt) "drbd %s: " fmt
-#define __drbd_printk_drbd_resource_args() __r->name
-#define __drbd_printk_drbd_resource_unprep(resource)
-
-#define __drbd_printk_drbd_connection_prep(connection) \
- const struct drbd_connection *__c = (connection); \
- const struct drbd_resource *__r = __c->resource
-#define __drbd_printk_drbd_connection_fmt(fmt) \
- "drbd %s: " fmt
-#define __drbd_printk_drbd_connection_args() \
- __r->name
-#define __drbd_printk_drbd_connection_unprep()
+#define __drbd_printk_device(rlt, device, level, fmt, args...)\
+ ___drbd_printk_device(__drbd_printk, rlt, device, level, fmt, ## args)
+#define __drbd_printk_resource(rlt, resource, level, fmt, args...)\
+ ___drbd_printk_resource(__drbd_printk, rlt, resource, level, fmt, ## args)
+#define __drbd_printk_peer_device(rlt, peer_device, level, fmt, args...)\
+ ___drbd_printk_peer_device(__drbd_printk, rlt, peer_device, level, fmt, ## args)
+#define __drbd_printk_connection(rlt, connection, level, fmt, args...)\
+ ___drbd_printk_connection(__drbd_printk, rlt, connection, level, fmt, ## args)
void drbd_printk_with_wrong_object_type(void);
void drbd_dyn_dbg_with_wrong_object_type(void);
#define __drbd_printk_choose_cond(obj, struct_name) \
- (__builtin_types_compatible_p(typeof(obj), struct struct_name *) || \
- __builtin_types_compatible_p(typeof(obj), const struct struct_name *))
-#define __drbd_printk_if_same_type(obj, struct_name, level, fmt, args...) \
- __drbd_printk_choose_cond(obj, struct_name), \
-({ \
- __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
- printk(level __drbd_printk_ ## struct_name ## _fmt(fmt), \
- __drbd_printk_ ## struct_name ## _args(), ## args); \
- __drbd_printk_ ## struct_name ## _unprep(); \
-})
-
-#define drbd_printk(level, obj, fmt, args...) \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, drbd_device, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, drbd_resource, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, drbd_connection, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, drbd_peer_device, level, fmt, ## args), \
- drbd_printk_with_wrong_object_type()))))
+ (__builtin_types_compatible_p(typeof(obj), struct drbd_ ## struct_name *) || \
+ __builtin_types_compatible_p(typeof(obj), const struct drbd_ ## struct_name *))
+
+#define __drbd_obj_ratelimit(struct_name, obj, rlt) \
+ ({ \
+ int __rlt = (rlt); \
+ BUILD_BUG_ON(!__drbd_printk_choose_cond(obj, struct_name)); \
+ BUILD_BUG_ON(__rlt < -1); \
+ BUILD_BUG_ON(__rlt >= (int)ARRAY_SIZE(obj->ratelimit)); \
+ __rlt == -1 ? 1 \
+ : __ratelimit(/* unconst cast ratelimit state */ \
+ (struct ratelimit_state *)(unsigned long) \
+ &obj->ratelimit[__rlt]); \
+ })
+
+#define drbd_device_ratelimit(obj, rlt) \
+ __drbd_obj_ratelimit(device, obj, D_RL_D_ ## rlt)
+#define drbd_resource_ratelimit(obj, rlt) \
+ __drbd_obj_ratelimit(resource, obj, D_RL_R_ ## rlt)
+#define drbd_connection_ratelimit(obj, rlt) \
+ __drbd_obj_ratelimit(connection, obj, D_RL_C_ ## rlt)
+#define drbd_peer_device_ratelimit(obj, rlt) \
+ __drbd_obj_ratelimit(peer_device, obj, D_RL_PD_ ## rlt)
+
+#define drbd_printk(ratelimit_type, level, obj, fmt, args...) \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, device), \
+ __drbd_printk_device(ratelimit_type, obj, level, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, resource), \
+ __drbd_printk_resource(ratelimit_type, obj, level, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, connection), \
+ __drbd_printk_connection(ratelimit_type, obj, level, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, peer_device), \
+ __drbd_printk_peer_device(ratelimit_type, obj, level, fmt, ## args), \
+ \
+ drbd_printk_with_wrong_object_type() \
+ ))))
#define __drbd_dyn_dbg_if_same_type(obj, struct_name, fmt, args...) \
- __drbd_printk_choose_cond(obj, struct_name), \
({ \
DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
if (DYNAMIC_DEBUG_BRANCH(descriptor)) { \
- __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
- __dynamic_pr_debug(&descriptor, __drbd_printk_ ## struct_name ## _fmt(fmt), \
- __drbd_printk_ ## struct_name ## _args(), ## args); \
- __drbd_printk_ ## struct_name ## _unprep(); \
+ ___drbd_printk_ ## struct_name( \
+ __drbd_dyn_dbg, \
+ NOLIMIT, obj, \
+ &descriptor, fmt, ## args); \
} \
})
#define dynamic_drbd_dbg(obj, fmt, args...) \
- __builtin_choose_expr( \
- __drbd_dyn_dbg_if_same_type(obj, drbd_device, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_dyn_dbg_if_same_type(obj, drbd_resource, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_dyn_dbg_if_same_type(obj, drbd_connection, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_dyn_dbg_if_same_type(obj, drbd_peer_device, fmt, ## args), \
- drbd_dyn_dbg_with_wrong_object_type()))))
-
-#define drbd_emerg(device, fmt, args...) \
- drbd_printk(KERN_EMERG, device, fmt, ## args)
-#define drbd_alert(device, fmt, args...) \
- drbd_printk(KERN_ALERT, device, fmt, ## args)
-#define drbd_crit(device, fmt, args...) \
- drbd_printk(KERN_CRIT, device, fmt, ## args)
-#define drbd_err(device, fmt, args...) \
- drbd_printk(KERN_ERR, device, fmt, ## args)
-#define drbd_warn(device, fmt, args...) \
- drbd_printk(KERN_WARNING, device, fmt, ## args)
-#define drbd_notice(device, fmt, args...) \
- drbd_printk(KERN_NOTICE, device, fmt, ## args)
-#define drbd_info(device, fmt, args...) \
- drbd_printk(KERN_INFO, device, fmt, ## args)
-
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, device), \
+ __drbd_dyn_dbg_if_same_type(obj, device, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, resource), \
+ __drbd_dyn_dbg_if_same_type(obj, resource, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, connection), \
+ __drbd_dyn_dbg_if_same_type(obj, connection, fmt, ## args), \
+ \
+ __builtin_choose_expr(__drbd_printk_choose_cond(obj, peer_device), \
+ __drbd_dyn_dbg_if_same_type(obj, peer_device, fmt, ## args), \
+ \
+ drbd_dyn_dbg_with_wrong_object_type() \
+ ))))
+
+#define drbd_emerg_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_EMERG, obj, fmt, ## args)
+#define drbd_alert_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_ALERT, obj, fmt, ## args)
+#define drbd_crit_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_CRIT, obj, fmt, ## args)
+#define drbd_err_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_ERR, obj, fmt, ## args)
+#define drbd_warn_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_WARNING, obj, fmt, ## args)
+#define drbd_notice_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_NOTICE, obj, fmt, ## args)
+#define drbd_info_ratelimit(obj, fmt, args...) \
+ drbd_printk(GENERIC, KERN_INFO, obj, fmt, ## args)
+
+#define drbd_emerg(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_EMERG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_ALERT, obj, fmt, ## args)
+#define drbd_crit(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_CRIT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_WARNING, obj, fmt, ## args)
+#define drbd_notice(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_NOTICE, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+ drbd_printk(NOLIMIT, KERN_INFO, obj, fmt, ## args)
#define drbd_ratelimit() \
({ \
@@ -122,7 +206,7 @@ void drbd_dyn_dbg_with_wrong_object_type(void);
do { \
if (!(exp)) \
drbd_err(x, "ASSERTION %s FAILED in %s\n", \
- #exp, __func__); \
+ #exp, __func__); \
} while (0)
/**
@@ -130,12 +214,13 @@ void drbd_dyn_dbg_with_wrong_object_type(void);
*
* Unlike the assert macro, this macro returns a boolean result.
*/
-#define expect(x, exp) ({ \
- bool _bool = (exp); \
- if (!_bool && drbd_ratelimit()) \
- drbd_err(x, "ASSERTION %s FAILED in %s\n", \
- #exp, __func__); \
- _bool; \
+#define expect(x, exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ drbd_err_ratelimit(x, \
+ "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
})
#endif
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 9ae860e7591b..e5770401cb7a 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -64,36 +64,31 @@
*/
enum drbd_req_event {
- CREATED,
- TO_BE_SENT,
TO_BE_SUBMITTED,
- /* XXX yes, now I am inconsistent...
- * these are not "events" but "actions"
- * oh, well... */
- QUEUE_FOR_NET_WRITE,
- QUEUE_FOR_NET_READ,
- QUEUE_FOR_SEND_OOS,
+ NEW_NET_READ,
+ NEW_NET_WRITE,
+ NEW_NET_OOS,
+ READY_FOR_NET,
+ SKIP_OOS,
- /* An empty flush is queued as P_BARRIER,
- * which will cause it to complete "successfully",
- * even if the local disk flush failed.
+ /* For an empty flush, mark that a corresponding barrier has been sent
+ * to this peer. This causes it to complete "successfully", even if the
+ * local disk flush failed.
*
* Just like "real" requests, empty flushes (blkdev_issue_flush()) will
* only see an error if neither local nor remote data is reachable. */
- QUEUE_AS_DRBD_BARRIER,
+ BARRIER_SENT,
SEND_CANCELED,
SEND_FAILED,
HANDED_OVER_TO_NETWORK,
OOS_HANDED_TO_NETWORK,
- CONNECTION_LOST_WHILE_PENDING,
- READ_RETRY_REMOTE_CANCELED,
+ CONNECTION_LOST,
+ CONNECTION_LOST_WHILE_SUSPENDED,
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
- CONFLICT_RESOLVED,
- POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
@@ -107,82 +102,93 @@ enum drbd_req_event {
ABORT_DISK_IO,
RESEND,
- FAIL_FROZEN_DISK_IO,
- RESTART_FROZEN_DISK_IO,
+ CANCEL_SUSPENDED_IO,
+ COMPLETION_RESUMED,
NOTHING,
};
-/* encoding of request states for now. we don't actually need that many bits.
- * we don't need to do atomic bit operations either, since most of the time we
- * need to look at the connection state and/or manipulate some lists at the
- * same time, so we should hold the request lock anyways.
+/*
+ * Encoding of request states. Modifications are protected by rq_lock. We don't
+ * do atomic bit operations.
*/
enum drbd_req_state_bits {
- /* 3210
- * 0000: no local possible
- * 0001: to be submitted
- * UNUSED, we could map: 011: submitted, completion still pending
- * 0110: completed ok
- * 0010: completed with error
- * 1001: Aborted (before completion)
- * 1x10: Aborted and completed -> free
- */
- __RQ_LOCAL_PENDING,
- __RQ_LOCAL_COMPLETED,
- __RQ_LOCAL_OK,
- __RQ_LOCAL_ABORTED,
-
- /* 87654
- * 00000: no network possible
- * 00001: to be send
- * 00011: to be send, on worker queue
- * 00101: sent, expecting recv_ack (B) or write_ack (C)
- * 11101: sent,
- * recv_ack (B) or implicit "ack" (A),
- * still waiting for the barrier ack.
- * master_bio may already be completed and invalidated.
- * 11100: write acked (C),
- * data received (for remote read, any protocol)
- * or finally the barrier ack has arrived (B,A)...
- * request can be freed
- * 01100: neg-acked (write, protocol C)
- * or neg-d-acked (read, any protocol)
- * or killed from the transfer log
- * during cleanup after connection loss
- * request can be freed
- * 01000: canceled or send failed...
- * request can be freed
+ /*
+ * Here are the possible combinations of the core net flags pending, pending-oos,
+ * queued, ready, sent, done, ok.
+ *
+ * <none>:
+ * No network required, or not yet processed.
+ * pending,queued:
+ * To be sent, must not be processed yet.
+ * pending,queued,ready:
+ * To be sent, processing allowed.
+ * pending,ready,sent:
+ * Sent, expecting P_RECV_ACK (B) or P_WRITE_ACK (C).
+ * queued,ready,ok:
+ * P_RECV_ACK (B) or P_WRITE_ACK (C) received before request marked
+ * as having been sent.
+ * ready,sent,ok:
+ * Sent, implicit "ack" (A), P_RECV_ACK (B) or P_WRITE_ACK (C) received.
+ * Still waiting for the barrier ack.
+ * master_bio may already be completed and invalidated.
+ * pending:
+ * Intended for this peer, but connection lost before processing
+ * allowed.
+ * pending,ready:
+ * Intended for this peer, but connection lost. If
+ * IO is suspended, it will stay in this state until the connection
+ * is restored or IO is resumed.
+ * ready,sent,done,ok:
+ * Data received (for remote read, any protocol),
+ * or finally the barrier ack has arrived.
+ * ready,sent,done:
+ * Received P_NEG_ACK for write (protocol C, or we are SyncSource),
+ * or P_NEG_DREPLY for read (any protocol).
+ * Or cleaned up after connection loss after send.
+ * pending-oos,queued,done:
+ * P_OUT_OF_SYNC to be sent, must not be processed yet.
+ * pending-oos,queued,ready,done:
+ * P_OUT_OF_SYNC to be sent, processing allowed.
+ * queued,ready,done:
+ * P_OUT_OF_SYNC was intended, but skipped.
+ * done:
+ * P_OUT_OF_SYNC was intended, but connection lost before processing
+ * allowed.
+ * ready,done:
+ * P_OUT_OF_SYNC sent.
+ * Or cleaned up after connection loss, either before send or when
+ * only P_OUT_OF_SYNC was intended.
*/
- /* if "SENT" is not set, yet, this can still fail or be canceled.
- * if "SENT" is set already, we still wait for an Ack packet.
- * when cleared, the master_bio may be completed.
- * in (B,A) the request object may still linger on the transaction log
- * until the corresponding barrier ack comes in */
+ /* Pending some network interaction towards the peer apart from
+ * barriers or P_OUT_OF_SYNC.
+ * If "sent" is not yet set, this can still fail or be canceled.
+ * While set, the master_bio may not be completed. */
__RQ_NET_PENDING,
- /* If it is QUEUED, and it is a WRITE, it is also registered in the
- * transfer log. Currently we need this flag to avoid conflicts between
- * worker canceling the request and tl_clear_barrier killing it from
- * transfer log. We should restructure the code so this conflict does
- * no longer occur. */
+ /* Pending send of P_OUT_OF_SYNC */
+ __RQ_NET_PENDING_OOS,
+
+ /* The sender might store pointers to it */
__RQ_NET_QUEUED,
- /* well, actually only "handed over to the network stack".
- *
- * TODO can potentially be dropped because of the similar meaning
- * of RQ_NET_SENT and ~RQ_NET_QUEUED.
- * however it is not exactly the same. before we drop it
- * we must ensure that we can tell a request with network part
- * from a request without, regardless of what happens to it. */
+ /* Ready for processing by the sender */
+ __RQ_NET_READY,
+
+ /* Well, actually only "handed over to the network stack". */
__RQ_NET_SENT,
- /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
- * basically this means the corresponding P_BARRIER_ACK was received */
+ /* When set, the data stage is done, as far as interaction with this
+ * peer is concerned. Basically this means the corresponding
+ * P_BARRIER_ACK was received. */
__RQ_NET_DONE,
- /* whether or not we know (C) or pretend (B,A) that the write
- * was successfully written on the peer.
+ /* Set when the request was successful. That is, the corresponding
+ * condition is fulfilled:
+ * - The write was sent (A)
+ * - Receipt of the write was acknowledged (B)
+ * - The write was successfully written on the peer (C)
+ * - Read data was received
*/
__RQ_NET_OK,
@@ -192,6 +198,29 @@ enum drbd_req_state_bits {
/* keep this last, its for the RQ_NET_MASK */
__RQ_NET_MAX,
+ /* We expect a receive ACK (wire proto B) */
+ __RQ_EXP_RECEIVE_ACK,
+
+ /* We expect a write ACK (write proto C) */
+ __RQ_EXP_WRITE_ACK,
+
+ /* waiting for a barrier ack, did an extra kref_get */
+ __RQ_EXP_BARR_ACK,
+
+ /* 4321
+ * 0000: no local possible
+ * 0001: to be submitted
+ * UNUSED, we could map: 011: submitted, completion still pending
+ * 0110: completed ok
+ * 0010: completed with error
+ * 1001: Aborted (before completion)
+ * 1x10: Aborted and completed -> free
+ */
+ __RQ_LOCAL_PENDING,
+ __RQ_LOCAL_COMPLETED,
+ __RQ_LOCAL_OK,
+ __RQ_LOCAL_ABORTED,
+
/* Set when this is a write, clear for a read */
__RQ_WRITE,
__RQ_WSAME,
@@ -212,26 +241,11 @@ enum drbd_req_state_bits {
/* would have been completed,
* but was not, because of drbd_suspended() */
__RQ_COMPLETION_SUSP,
-
- /* We expect a receive ACK (wire proto B) */
- __RQ_EXP_RECEIVE_ACK,
-
- /* We expect a write ACK (wite proto C) */
- __RQ_EXP_WRITE_ACK,
-
- /* waiting for a barrier ack, did an extra kref_get */
- __RQ_EXP_BARR_ACK,
};
-
-#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
-#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
-#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
-#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
-
-#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
-
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
+#define RQ_NET_PENDING_OOS (1UL << __RQ_NET_PENDING_OOS)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_READY (1UL << __RQ_NET_READY)
#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
#define RQ_NET_OK (1UL << __RQ_NET_OK)
@@ -239,6 +253,18 @@ enum drbd_req_state_bits {
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
+
+#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK \
+ (RQ_LOCAL_ABORTED | RQ_LOCAL_OK | RQ_LOCAL_COMPLETED | RQ_LOCAL_PENDING)
+
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_WSAME (1UL << __RQ_WSAME)
#define RQ_UNMAP (1UL << __RQ_UNMAP)
@@ -247,14 +273,25 @@ enum drbd_req_state_bits {
#define RQ_UNPLUG (1UL << __RQ_UNPLUG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
-#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
-#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
-#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
-/* For waking up the frozen transfer log mod_req() has to return if the request
- should be counted in the epoch object*/
-#define MR_WRITE 1
-#define MR_READ 2
+
+/* these flags go into local_rq_state,
+ * other flags go into their respective net_rq_state[idx] */
+#define RQ_STATE_0_MASK \
+ (RQ_LOCAL_MASK |\
+ RQ_WRITE |\
+ RQ_WSAME |\
+ RQ_UNMAP |\
+ RQ_ZEROES |\
+ RQ_IN_ACT_LOG |\
+ RQ_UNPLUG |\
+ RQ_POSTPONED |\
+ RQ_COMPLETION_SUSP)
+
+static inline bool drbd_req_is_write(struct drbd_request *req)
+{
+ return req->local_rq_state & RQ_WRITE;
+}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
@@ -264,61 +301,63 @@ struct bio_and_error {
int error;
};
-extern void start_new_tl_epoch(struct drbd_connection *connection);
-extern void drbd_req_destroy(struct kref *kref);
-extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
- struct drbd_peer_device *peer_device,
- struct bio_and_error *m);
-extern void complete_master_bio(struct drbd_device *device,
- struct bio_and_error *m);
-extern void request_timer_fn(struct timer_list *t);
-extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
-extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
-extern void tl_abort_disk_io(struct drbd_device *device);
+bool start_new_tl_epoch(struct drbd_resource *resource);
+void drbd_req_destroy(struct kref *kref);
+void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct drbd_peer_device *peer_device, struct bio_and_error *m);
+void complete_master_bio(struct drbd_device *device, struct bio_and_error *m);
+void drbd_release_conflicts(struct drbd_device *device,
+ struct drbd_interval *release_interval);
+void drbd_put_ref_tl_walk(struct drbd_request *req, int done_put, int oos_send_put);
+void drbd_set_pending_out_of_sync(struct drbd_peer_device *peer_device);
+void request_timer_fn(struct timer_list *t);
+void tl_walk(struct drbd_connection *connection,
+ struct drbd_request **from_req, enum drbd_req_event what);
+void __tl_walk(struct drbd_resource * const resource,
+ struct drbd_connection * const connection,
+ struct drbd_request **from_req, const enum drbd_req_event what);
+void drbd_destroy_peer_ack_if_done(struct drbd_peer_ack *peer_ack);
+int w_queue_peer_ack(struct drbd_work *w, int cancel);
+void drbd_queue_peer_ack(struct drbd_resource *resource,
+ struct drbd_request *req);
+bool drbd_should_do_remote(struct drbd_peer_device *peer_device,
+ enum which_state which);
+void drbd_reclaim_req(struct rcu_head *rp);
/* this is in drbd_main.c */
-extern void drbd_restart_request(struct drbd_request *req);
+void drbd_restart_request(struct drbd_request *req);
+void drbd_restart_suspended_reqs(struct drbd_resource *resource);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
-static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what,
+static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what,
struct drbd_peer_device *peer_device)
{
struct drbd_device *device = req->device;
struct bio_and_error m;
- int rv;
/* __req_mod possibly frees req, do not touch req after that! */
- rv = __req_mod(req, what, peer_device, &m);
+ __req_mod(req, what, peer_device, &m);
if (m.bio)
complete_master_bio(device, &m);
-
- return rv;
}
-/* completion of master bio is outside of our spinlock.
- * We still may or may not be inside some irqs disabled section
- * of the lower level driver completion callback, so we need to
- * spin_lock_irqsave here. */
-static inline int req_mod(struct drbd_request *req,
+/* completion of master bio is outside of spinlock.
+ * If you need it irqsave, do it your self!
+ * Which means: don't use from bio endio callback. */
+static inline void req_mod(struct drbd_request *req,
enum drbd_req_event what,
struct drbd_peer_device *peer_device)
{
- unsigned long flags;
struct drbd_device *device = req->device;
struct bio_and_error m;
- int rv;
- spin_lock_irqsave(&device->resource->req_lock, flags);
- rv = __req_mod(req, what, peer_device, &m);
- spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ read_lock_irq(&device->resource->state_rwlock);
+ __req_mod(req, what, peer_device, &m);
+ read_unlock_irq(&device->resource->state_rwlock);
if (m.bio)
complete_master_bio(device, &m);
-
- return rv;
}
-extern bool drbd_should_do_remote(union drbd_dev_state);
-
#endif
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index cbaeb8018dbf..2ae525c1760e 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -2,26 +2,19 @@
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
+#include "drbd_protocol.h"
+
+struct drbd_resource;
struct drbd_device;
struct drbd_connection;
+struct drbd_peer_device;
+struct drbd_work;
+struct twopc_request;
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
- *
- * The NS macros expand to a mask and a value, that can be bit ored onto the
- * current state as soon as the spinlock (req_lock) was taken.
- *
- * The _NS macros are used for state functions that get called with the
- * spinlock. These macros expand directly to the new state value.
- *
- * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
- * to express state changes that affect more than one aspect of the state.
- *
- * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
- * Means that the network connection was established and that the peer
- * is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
@@ -34,141 +27,168 @@ struct drbd_connection;
#define susp_nod_MASK 1
#define susp_fen_MASK 1
-#define NS(T, S) \
- ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T = (S); val; })
-#define NS2(T1, S1, T2, S2) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val; })
-#define NS3(T1, S1, T2, S2, T3, S3) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val.T3 = (S3); val; })
-
-#define _NS(D, T, S) \
- D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
-#define _NS2(D, T1, S1, T2, S2) \
- D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns; })
-#define _NS3(D, T1, S1, T2, S2, T3, S3) \
- D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
-
enum chg_state_flags {
- CS_HARD = 1 << 0,
+ CS_HARD = 1 << 0, /* Forced state change, such as a connection loss */
CS_VERBOSE = 1 << 1,
CS_WAIT_COMPLETE = 1 << 2,
CS_SERIALIZE = 1 << 3,
- CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
- CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
- CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
- CS_DC_PEER = 1 << 6,
- CS_DC_CONN = 1 << 7,
- CS_DC_DISK = 1 << 8,
- CS_DC_PDSK = 1 << 9,
- CS_DC_SUSP = 1 << 10,
- CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
- CS_IGN_OUTD_FAIL = 1 << 11,
-
- /* Make sure no meta data IO is in flight, by calling
- * drbd_md_get_buffer(). Used for graceful detach. */
- CS_INHIBIT_MD_IO = 1 << 12,
+ CS_ALREADY_SERIALIZED = 1 << 4, /* resource->state_sem already taken */
+ CS_LOCAL_ONLY = 1 << 5, /* Do not consider a cluster-wide state change */
+ CS_PREPARE = 1 << 6,
+ CS_PREPARED = 1 << 7,
+ CS_ABORT = 1 << 8,
+ CS_TWOPC = 1 << 9,
+ CS_IGN_OUTD_FAIL = 1 << 10,
+ CS_DONT_RETRY = 1 << 11, /* Disable internal retry. Caller has a retry loop */
+ CS_FORCE_RECALC = 1 << 13, /* Force re-evaluation of state logic */
+ CS_CLUSTER_WIDE = 1 << 14, /* Make this a cluster wide state change! */
+ CS_FP_LOCAL_UP_TO_DATE = 1 << 15, /* force promotion by making local disk state up_to_date */
+ CS_FP_OUTDATE_PEERS = 1 << 16, /* force promotion by marking unknown peers as outdated */
+ CS_FS_IGN_OPENERS = 1 << 17, /* force demote, ignore openers */
};
-/* drbd_dev_state and drbd_state are different types. This is to stress the
- small difference. There is no suspended flag (.susp), and no suspended
- while fence handler runs flas (susp_fen). */
-union drbd_dev_state {
- struct {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- unsigned role:2 ; /* 3/4 primary/secondary/unknown */
- unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
- unsigned conn:5 ; /* 17/32 cstates */
- unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
- unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
- unsigned _unused:1 ;
- unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
- unsigned peer_isp:1 ;
- unsigned user_isp:1 ;
- unsigned _pad:11; /* 0 unused */
-#elif defined(__BIG_ENDIAN_BITFIELD)
- unsigned _pad:11;
- unsigned user_isp:1 ;
- unsigned peer_isp:1 ;
- unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
- unsigned _unused:1 ;
- unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
- unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
- unsigned conn:5 ; /* 17/32 cstates */
- unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
- unsigned role:2 ; /* 3/4 primary/secondary/unknown */
-#else
-# error "this endianess is not supported"
-#endif
- };
- unsigned int i;
-};
+void drbd_resume_al(struct drbd_device *device);
-extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
- enum chg_state_flags f,
- union drbd_state mask,
- union drbd_state val);
-extern void drbd_force_state(struct drbd_device *, union drbd_state,
- union drbd_state);
-extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
- union drbd_state,
- union drbd_state,
- enum chg_state_flags);
-
-extern enum drbd_state_rv
-_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
- union drbd_state, enum chg_state_flags);
-
-extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
- enum chg_state_flags,
- struct completion *done);
-extern void print_st_err(struct drbd_device *, union drbd_state,
- union drbd_state, enum drbd_state_rv);
-
-enum drbd_state_rv
-_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
- enum chg_state_flags flags);
-
-enum drbd_state_rv
-conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
- enum chg_state_flags flags);
-
-extern void drbd_resume_al(struct drbd_device *device);
-extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
-/**
- * drbd_request_state() - Request a state change
- * @device: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- *
- * This is the most graceful way of requesting a state change. It is verbose
- * quite verbose in case the state change is not possible, and all those
- * state changes are globally serialized.
- */
-static inline int drbd_request_state(struct drbd_device *device,
- union drbd_state mask,
- union drbd_state val)
+void state_change_lock(struct drbd_resource *resource,
+ unsigned long *irq_flags, enum chg_state_flags flags);
+void state_change_unlock(struct drbd_resource *resource,
+ unsigned long *irq_flags);
+
+void begin_state_change(struct drbd_resource *resource,
+ unsigned long *irq_flags, enum chg_state_flags flags);
+enum drbd_state_rv end_state_change(struct drbd_resource *resource,
+ unsigned long *irq_flags, const char *tag);
+void abort_state_change(struct drbd_resource *resource,
+ unsigned long *irq_flags);
+void abort_state_change_locked(struct drbd_resource *resource);
+
+void begin_state_change_locked(struct drbd_resource *resource,
+ enum chg_state_flags flags);
+enum drbd_state_rv end_state_change_locked(struct drbd_resource *resource,
+ const char *tag);
+
+void clear_remote_state_change(struct drbd_resource *resource);
+void __clear_remote_state_change(struct drbd_resource *resource);
+
+
+enum which_state;
+bool drbd_all_peer_replication(struct drbd_device *device, enum which_state which);
+union drbd_state drbd_get_device_state(struct drbd_device *device,
+ enum which_state which);
+union drbd_state drbd_get_peer_device_state(struct drbd_peer_device *peer_device,
+ enum which_state which);
+
+#define stable_state_change(resource, change_state) ({ \
+ enum drbd_state_rv rv; \
+ int err; \
+ err = wait_event_interruptible((resource)->state_wait, \
+ (rv = (change_state)) != SS_IN_TRANSIENT_STATE); \
+ if (err) \
+ err = -SS_UNKNOWN_ERROR; \
+ else \
+ err = rv; \
+ err; \
+ })
+
+void nested_twopc_work(struct work_struct *work);
+void drbd_maybe_cluster_wide_reply(struct drbd_resource *resource);
+enum drbd_state_rv nested_twopc_request(struct drbd_resource *resource,
+ struct twopc_request *request);
+bool drbd_twopc_between_peer_and_me(struct drbd_connection *connection);
+bool cluster_wide_reply_ready(struct drbd_resource *resource);
+
+enum drbd_state_rv change_role(struct drbd_resource *resource,
+ enum drbd_role role,
+ enum chg_state_flags flags, const char *tag,
+ const char **err_str);
+
+void __change_io_susp_user(struct drbd_resource *resource, bool value);
+enum drbd_state_rv change_io_susp_user(struct drbd_resource *resource,
+ bool value, enum chg_state_flags flags);
+void __change_io_susp_no_data(struct drbd_resource *resource, bool value);
+void __change_io_susp_fencing(struct drbd_connection *connection, bool value);
+void __change_io_susp_quorum(struct drbd_resource *resource, bool value);
+
+void __change_disk_state(struct drbd_device *device,
+ enum drbd_disk_state disk_state);
+void __downgrade_disk_states(struct drbd_resource *resource,
+ enum drbd_disk_state disk_state);
+enum drbd_state_rv change_disk_state(struct drbd_device *device,
+ enum drbd_disk_state disk_state,
+ enum chg_state_flags flags,
+ const char *tag, const char **err_str);
+
+void __change_cstate(struct drbd_connection *connection,
+ enum drbd_conn_state cstate);
+enum drbd_state_rv change_cstate_tag(struct drbd_connection *connection,
+ enum drbd_conn_state cstate,
+ enum chg_state_flags flags,
+ const char *tag, const char **err_str);
+static inline enum drbd_state_rv change_cstate(struct drbd_connection *connection,
+ enum drbd_conn_state cstate,
+ enum chg_state_flags flags)
{
- return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
+ return change_cstate_tag(connection, cstate, flags, NULL, NULL);
}
-/* for use in adm_detach() (drbd_adm_detach(), drbd_adm_down()) */
-int drbd_request_detach_interruptible(struct drbd_device *device);
-
-enum drbd_role conn_highest_role(struct drbd_connection *connection);
-enum drbd_role conn_highest_peer(struct drbd_connection *connection);
-enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
-enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
-enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
-enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
-
+void __change_peer_role(struct drbd_connection *connection,
+ enum drbd_role peer_role);
+
+void __change_repl_state(struct drbd_peer_device *peer_device,
+ enum drbd_repl_state repl_state);
+enum drbd_state_rv change_repl_state(struct drbd_peer_device *peer_device,
+ enum drbd_repl_state new_repl_state,
+ enum chg_state_flags flags,
+ const char *tag);
+enum drbd_state_rv stable_change_repl_state(struct drbd_peer_device *peer_device,
+ enum drbd_repl_state repl_state,
+ enum chg_state_flags flags,
+ const char *tag);
+
+void __change_peer_disk_state(struct drbd_peer_device *peer_device,
+ enum drbd_disk_state disk_state);
+void __downgrade_peer_disk_states(struct drbd_connection *connection,
+ enum drbd_disk_state disk_state);
+void __outdate_myself(struct drbd_resource *resource);
+enum drbd_state_rv change_peer_disk_state(struct drbd_peer_device *peer_device,
+ enum drbd_disk_state disk_state,
+ enum chg_state_flags flags,
+ const char *tag);
+
+void __change_resync_susp_user(struct drbd_peer_device *peer_device,
+ bool value);
+enum drbd_state_rv change_resync_susp_user(struct drbd_peer_device *peer_device,
+ bool value,
+ enum chg_state_flags flags);
+void __change_resync_susp_peer(struct drbd_peer_device *peer_device,
+ bool value);
+void __change_resync_susp_dependency(struct drbd_peer_device *peer_device,
+ bool value);
+void apply_connect(struct drbd_connection *connection, bool commit);
+
+struct drbd_work;
+
+bool resource_is_suspended(struct drbd_resource *resource,
+ enum which_state which);
+bool is_suspended_fen(struct drbd_resource *resource, enum which_state which);
+
+enum dds_flags;
+enum determine_dev_size;
+struct resize_parms;
+
+enum determine_dev_size
+change_cluster_wide_device_size(struct drbd_device *device,
+ sector_t local_max_size,
+ uint64_t new_user_size,
+ enum dds_flags dds_flags,
+ struct resize_parms *rs);
+
+bool drbd_data_accessible(struct drbd_device *device, enum which_state which);
+bool drbd_res_data_accessible(struct drbd_resource *resource);
+
+
+void drbd_empty_twopc_work_fn(struct work_struct *work);
#endif
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
index a56a57d67686..bb68684a5fd3 100644
--- a/drivers/block/drbd/drbd_state_change.h
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -7,58 +7,80 @@ struct drbd_resource_state_change {
enum drbd_role role[2];
bool susp[2];
bool susp_nod[2];
- bool susp_fen[2];
+ bool susp_uuid[2];
+ bool fail_io[2];
};
struct drbd_device_state_change {
struct drbd_device *device;
enum drbd_disk_state disk_state[2];
+ bool have_quorum[2];
};
struct drbd_connection_state_change {
struct drbd_connection *connection;
- enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */
+ enum drbd_conn_state cstate[2];
enum drbd_role peer_role[2];
+ bool susp_fen[2];
+};
+
+/* exception: stores state, not change.
+ * for get_initial_state. */
+struct drbd_path_state {
+ struct drbd_connection *connection;
+ struct drbd_path *path;
+ /* not an array,
+ * because it's not an array in struct drbd_path either */
+ bool path_established;
};
struct drbd_peer_device_state_change {
struct drbd_peer_device *peer_device;
enum drbd_disk_state disk_state[2];
- enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */
+ enum drbd_repl_state repl_state[2];
bool resync_susp_user[2];
bool resync_susp_peer[2];
bool resync_susp_dependency[2];
+ bool resync_susp_other_c[2];
+ bool resync_active[2];
+ bool replication[2];
+ bool peer_replication[2];
+};
+
+struct drbd_state_change_object_count {
+ unsigned int n_devices;
+ unsigned int n_connections;
+ unsigned int n_paths;
};
struct drbd_state_change {
struct list_head list;
unsigned int n_devices;
unsigned int n_connections;
+ unsigned int n_paths;
struct drbd_resource_state_change resource[1];
struct drbd_device_state_change *devices;
struct drbd_connection_state_change *connections;
struct drbd_peer_device_state_change *peer_devices;
+ struct drbd_path_state *paths;
};
-extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
-extern void copy_old_to_new_state_change(struct drbd_state_change *);
-extern void forget_state_change(struct drbd_state_change *);
+struct drbd_state_change *remember_state_change(struct drbd_resource *resource,
+ gfp_t gfp);
+void copy_old_to_new_state_change(struct drbd_state_change *state_change);
+void forget_state_change(struct drbd_state_change *state_change);
-extern int notify_resource_state_change(struct sk_buff *,
- unsigned int,
- void *,
- enum drbd_notification_type type);
-extern int notify_connection_state_change(struct sk_buff *,
- unsigned int,
- void *,
- enum drbd_notification_type type);
-extern int notify_device_state_change(struct sk_buff *,
- unsigned int,
- void *,
- enum drbd_notification_type type);
-extern int notify_peer_device_state_change(struct sk_buff *,
- unsigned int,
- void *,
- enum drbd_notification_type type);
+int notify_resource_state_change(struct sk_buff *skb, unsigned int seq,
+ void *state_change,
+ enum drbd_notification_type type);
+int notify_connection_state_change(struct sk_buff *skb, unsigned int seq,
+ void *state_change,
+ enum drbd_notification_type type);
+int notify_device_state_change(struct sk_buff *skb, unsigned int seq,
+ void *state_change,
+ enum drbd_notification_type type);
+int notify_peer_device_state_change(struct sk_buff *skb, unsigned int seq,
+ void *state_change,
+ enum drbd_notification_type type);
#endif /* DRBD_STATE_CHANGE_H */
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
index 0201f6590f6a..f376ce28a815 100644
--- a/drivers/block/drbd/drbd_strings.h
+++ b/drivers/block/drbd/drbd_strings.h
@@ -2,9 +2,26 @@
#ifndef __DRBD_STRINGS_H
#define __DRBD_STRINGS_H
-extern const char *drbd_conn_str(enum drbd_conns);
-extern const char *drbd_role_str(enum drbd_role);
-extern const char *drbd_disk_str(enum drbd_disk_state);
-extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+struct state_names {
+ const char * const *names;
+ unsigned int size;
+};
+
+extern struct state_names drbd_conn_state_names;
+extern struct state_names drbd_repl_state_names;
+extern struct state_names drbd_role_state_names;
+extern struct state_names drbd_disk_state_names;
+extern struct state_names drbd_error_messages;
+extern struct state_names drbd_packet_names;
+
+enum drbd_packet;
+
+const char *drbd_repl_str(enum drbd_repl_state s);
+const char *drbd_conn_str(enum drbd_conn_state s);
+const char *drbd_role_str(enum drbd_role s);
+const char *drbd_disk_str(enum drbd_disk_state s);
+const char *drbd_set_st_err_str(enum drbd_state_rv err);
+const char *drbd_packet_name(enum drbd_packet cmd);
+
#endif /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_transport_lb-tcp.c b/drivers/block/drbd/drbd_transport_lb-tcp.c
index 497fca8c413c..29f18df2be88 100644
--- a/drivers/block/drbd/drbd_transport_lb-tcp.c
+++ b/drivers/block/drbd/drbd_transport_lb-tcp.c
@@ -15,10 +15,10 @@
#include <linux/tcp.h>
#include <linux/highmem.h>
#include <linux/bio.h>
-#include <linux/drbd_genl_api.h>
-#include <linux/drbd_config.h>
+#include "drbd_genl_api.h"
#include <net/tcp.h>
#include "drbd_protocol.h"
+#include "drbd_config.h"
#include "drbd_transport.h"
diff --git a/drivers/block/drbd/drbd_transport_rdma.c b/drivers/block/drbd/drbd_transport_rdma.c
index 21790a769d63..fbdf6a4bcda9 100644
--- a/drivers/block/drbd/drbd_transport_rdma.c
+++ b/drivers/block/drbd/drbd_transport_rdma.c
@@ -28,10 +28,10 @@
#include <rdma/rdma_cm.h>
#include <rdma/ib_cm.h>
#include <linux/interrupt.h>
-#include <linux/drbd_genl_api.h>
+#include "drbd_genl_api.h"
#include "drbd_protocol.h"
#include "drbd_transport.h"
-#include "linux/drbd_config.h" /* for REL_VERSION */
+#include "drbd_config.h" /* for REL_VERSION */
/* Nearly all data transfer uses the send/receive semantics. No need to
actually use RDMA WRITE / READ.
diff --git a/drivers/block/drbd/drbd_transport_tcp.c b/drivers/block/drbd/drbd_transport_tcp.c
index 31885ff9341f..5faa6b82c358 100644
--- a/drivers/block/drbd/drbd_transport_tcp.c
+++ b/drivers/block/drbd/drbd_transport_tcp.c
@@ -19,14 +19,14 @@
#include <linux/tcp.h>
#include <linux/highmem.h>
#include <linux/bio.h>
-#include <linux/drbd_genl_api.h>
-#include <linux/drbd_config.h>
+#include "drbd_genl_api.h"
#include <linux/tls.h>
#include <net/tcp.h>
#include <net/handshake.h>
#include <net/tls.h>
#include <net/tls_prot.h>
#include "drbd_protocol.h"
+#include "drbd_config.h"
#include "drbd_transport.h"
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 5468a2399d48..ed408088a282 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
drbd.h
Kernel module for 2.6.x Kernels
@@ -9,10 +9,10 @@
Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
-
*/
#ifndef DRBD_H
#define DRBD_H
+
#include <asm/types.h>
#ifdef __KERNEL__
@@ -44,8 +44,7 @@ enum drbd_io_error_p {
EP_DETACH
};
-enum drbd_fencing_p {
- FP_NOT_AVAIL = -1, /* Not a policy */
+enum drbd_fencing_policy {
FP_DONT_CARE = 0,
FP_RESOURCE,
FP_STONITH
@@ -68,7 +67,9 @@ enum drbd_after_sb_p {
ASB_CONSENSUS,
ASB_DISCARD_SECONDARY,
ASB_CALL_HELPER,
- ASB_VIOLENTLY
+ ASB_VIOLENTLY,
+ ASB_RETRY_CONNECT,
+ ASB_AUTO_DISCARD,
};
enum drbd_on_no_data {
@@ -76,6 +77,16 @@ enum drbd_on_no_data {
OND_SUSPEND_IO
};
+enum drbd_on_no_quorum {
+ ONQ_IO_ERROR = OND_IO_ERROR,
+ ONQ_SUSPEND_IO = OND_SUSPEND_IO
+};
+
+enum drbd_on_susp_primary_outdated {
+ SPO_DISCONNECT,
+ SPO_FORCE_SECONDARY,
+};
+
enum drbd_on_congestion {
OC_BLOCK,
OC_PULL_AHEAD,
@@ -96,6 +107,11 @@ enum drbd_read_balancing {
RB_1M_STRIPING,
};
+/* Windows km/dderror.h has that as 0L */
+#ifdef NO_ERROR
+#undef NO_ERROR
+#endif
+
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_code {
ERR_CODE_BASE = 100,
@@ -162,6 +178,12 @@ enum drbd_ret_code {
ERR_MD_LAYOUT_TOO_SMALL = 168,
ERR_MD_LAYOUT_NO_FIT = 169,
ERR_IMPLICIT_SHRINK = 170,
+ ERR_INVALID_PEER_NODE_ID = 171,
+ ERR_CREATE_TRANSPORT = 172,
+ ERR_LOCAL_AND_PEER_ADDR = 173,
+ ERR_ALREADY_EXISTS = 174,
+ ERR_APV_TOO_LOW = 175,
+
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
};
@@ -178,17 +200,17 @@ enum drbd_role {
};
/* The order of these constants is important.
- * The lower ones (<C_WF_REPORT_PARAMS) indicate
+ * The lower ones (< C_CONNECTED) indicate
* that there is no socket!
- * >=C_WF_REPORT_PARAMS ==> There is a socket
+ * >= C_CONNECTED ==> There is a socket
*/
-enum drbd_conns {
+enum drbd_conn_state {
C_STANDALONE,
- C_DISCONNECTING, /* Temporal state on the way to StandAlone. */
+ C_DISCONNECTING, /* Temporary state on the way to C_STANDALONE. */
C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
- /* These temporal states are all used on the way
- * from >= C_CONNECTED to Unconnected.
+ /* These temporary states are used on the way
+ * from C_CONNECTED to C_UNCONNECTED.
* The 'disconnect reason' states
* I do not allow to change between them. */
C_TIMEOUT,
@@ -197,35 +219,44 @@ enum drbd_conns {
C_PROTOCOL_ERROR,
C_TEAR_DOWN,
- C_WF_CONNECTION,
- C_WF_REPORT_PARAMS, /* we have a socket */
- C_CONNECTED, /* we have introduced each other */
- C_STARTING_SYNC_S, /* starting full sync by admin request. */
- C_STARTING_SYNC_T, /* starting full sync by admin request. */
- C_WF_BITMAP_S,
- C_WF_BITMAP_T,
- C_WF_SYNC_UUID,
+ C_CONNECTING,
+
+ C_CONNECTED, /* we have a socket */
+
+ C_MASK = 31,
+};
+
+enum drbd_repl_state {
+ L_NEGOTIATING = C_CONNECTED, /* used for peer_device->negotiation_result only */
+ L_OFF = C_CONNECTED,
+
+ L_ESTABLISHED, /* we have introduced each other */
+ L_STARTING_SYNC_S, /* starting full sync by admin request. */
+ L_STARTING_SYNC_T, /* starting full sync by admin request. */
+ L_WF_BITMAP_S,
+ L_WF_BITMAP_T,
+ L_WF_SYNC_UUID,
/* All SyncStates are tested with this comparison
- * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
- C_SYNC_SOURCE,
- C_SYNC_TARGET,
- C_VERIFY_S,
- C_VERIFY_T,
- C_PAUSED_SYNC_S,
- C_PAUSED_SYNC_T,
-
- C_AHEAD,
- C_BEHIND,
-
- C_MASK = 31
+ * xx >= L_SYNC_SOURCE && xx <= L_PAUSED_SYNC_T */
+ L_SYNC_SOURCE,
+ L_SYNC_TARGET,
+ L_VERIFY_S,
+ L_VERIFY_T,
+ L_PAUSED_SYNC_S,
+ L_PAUSED_SYNC_T,
+
+ L_AHEAD,
+ L_BEHIND,
+ L_NEG_NO_RESULT = L_BEHIND, /* used for peer_device->negotiation_result only */
};
enum drbd_disk_state {
D_DISKLESS,
D_ATTACHING, /* In the process of reading the meta-data */
+ D_DETACHING, /* Added in protocol version 110 */
D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */
- /* when >= D_FAILED it is legal to access mdev->ldev */
+ /* when >= D_FAILED it is legal to access device->ldev */
D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
D_INCONSISTENT,
D_OUTDATED,
@@ -257,9 +288,11 @@ union drbd_state {
unsigned user_isp:1 ;
unsigned susp_nod:1 ; /* IO suspended because no data */
unsigned susp_fen:1 ; /* IO suspended because fence peer handler runs*/
- unsigned _pad:9; /* 0 unused */
+ unsigned quorum:1;
+ unsigned _pad:8; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
- unsigned _pad:9;
+ unsigned _pad:8;
+ unsigned quorum:1;
unsigned susp_fen:1 ;
unsigned susp_nod:1 ;
unsigned user_isp:1 ;
@@ -297,29 +330,48 @@ enum drbd_state_rv {
SS_DEVICE_IN_USE = -12,
SS_NO_NET_CONFIG = -13,
SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
- SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
+ SS_NEED_CONNECTION = -15,
SS_LOWER_THAN_OUTDATED = -16,
- SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
+ SS_NOT_SUPPORTED = -17,
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
SS_O_VOL_PEER_PRI = -20,
- SS_OUTDATE_WO_CONN = -21,
- SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */
+ SS_INTERRUPTED = -21, /* interrupted in stable_state_change() */
+ SS_PRIMARY_READER = -22,
+ SS_TIMEOUT = -23,
+ SS_WEAKLY_CONNECTED = -24,
+ SS_NO_QUORUM = -25,
+ SS_ATTACH_NO_BITMAP = -26,
+ SS_HANDSHAKE_DISCONNECT = -27,
+ SS_HANDSHAKE_RETRY = -28,
+ SS_AFTER_LAST_ERROR = -29, /* Keep this at bottom */
};
#define SHARED_SECRET_MAX 64
-#define MDF_CONSISTENT (1 << 0)
-#define MDF_PRIMARY_IND (1 << 1)
-#define MDF_CONNECTED_IND (1 << 2)
-#define MDF_FULL_SYNC (1 << 3)
-#define MDF_WAS_UP_TO_DATE (1 << 4)
-#define MDF_PEER_OUT_DATED (1 << 5)
-#define MDF_CRASHED_PRIMARY (1 << 6)
-#define MDF_AL_CLEAN (1 << 7)
-#define MDF_AL_DISABLED (1 << 8)
+enum mdf_flag {
+ MDF_CONSISTENT = 1 << 0,
+ MDF_PRIMARY_IND = 1 << 1,
+ MDF_WAS_UP_TO_DATE = 1 << 4,
+ MDF_CRASHED_PRIMARY = 1 << 6,
+ MDF_AL_CLEAN = 1 << 7,
+ MDF_AL_DISABLED = 1 << 8,
+ MDF_PRIMARY_LOST_QUORUM = 1 << 9,
+ MDF_HAVE_QUORUM = 1 << 10,
+};
+
+enum mdf_peer_flag {
+ MDF_PEER_CONNECTED = 1 << 0,
+ MDF_PEER_OUTDATED = 1 << 1,
+ MDF_PEER_FENCING = 1 << 2,
+ MDF_PEER_FULL_SYNC = 1 << 3,
+ MDF_PEER_DEVICE_SEEN = 1 << 4,
+ MDF_NODE_EXISTS = 1 << 16,
+	MDF_HAVE_BITMAP = 1 << 31, /* For in core use; no meaning when persisted */
+};
-#define MAX_PEERS 32
+#define DRBD_PEERS_MAX 32
+#define DRBD_NODE_ID_MAX DRBD_PEERS_MAX
enum drbd_uuid_index {
UI_CURRENT,
@@ -331,7 +383,8 @@ enum drbd_uuid_index {
UI_EXTENDED_SIZE /* Everything. */
};
-#define HISTORY_UUIDS MAX_PEERS
+#define HISTORY_UUIDS_V08 (UI_HISTORY_END - UI_HISTORY_START + 1)
+#define HISTORY_UUIDS DRBD_PEERS_MAX
enum drbd_timeout_flag {
UT_DEFAULT = 0,
@@ -339,6 +392,16 @@ enum drbd_timeout_flag {
UT_PEER_OUTDATED = 2,
};
+#define UUID_JUST_CREATED ((__u64)4)
+#define UUID_PRIMARY ((__u64)1)
+
+enum write_ordering_e {
+ WO_NONE,
+ WO_DRAIN_IO,
+ WO_BDEV_FLUSH,
+ WO_BIO_BARRIER
+};
+
enum drbd_notification_type {
NOTIFY_EXISTS,
NOTIFY_CREATE,
@@ -346,11 +409,13 @@ enum drbd_notification_type {
NOTIFY_DESTROY,
NOTIFY_CALL,
NOTIFY_RESPONSE,
+ NOTIFY_RENAME,
NOTIFY_CONTINUES = 0x8000,
NOTIFY_FLAGS = NOTIFY_CONTINUES,
};
+/* These values are part of the ABI! */
enum drbd_peer_state {
P_INCONSISTENT = 3,
P_OUTDATED = 4,
@@ -359,15 +424,6 @@ enum drbd_peer_state {
P_FENCING = 7,
};
-#define UUID_JUST_CREATED ((__u64)4)
-
-enum write_ordering_e {
- WO_NONE,
- WO_DRAIN_IO,
- WO_BDEV_FLUSH,
- WO_BIO_BARRIER
-};
-
/* magic numbers used in meta data and network packets */
#define DRBD_MAGIC 0x83740267
#define DRBD_MAGIC_BIG 0x835a
@@ -376,17 +432,23 @@ enum write_ordering_e {
#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3)
#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4)
#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5)
-
-
-/* how I came up with this magic?
- * base64 decode "actlog==" ;) */
-#define DRBD_AL_MAGIC 0x69cb65a2
+#define DRBD_MD_MAGIC_09 (DRBD_MAGIC+6)
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
-#define DRBD_CPU_MASK_SIZE 32
+/*
+ * This is the maximum string length accepted by drbdadm.
+ * It allows a full mask for up to 908 CPUs.
+ */
+#define DRBD_CPU_MASK_SIZE 256
+
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+
+#define QOU_OFF 0
+#define QOU_MAJORITY 1024
+#define QOU_ALL 1025
#endif
diff --git a/include/linux/drbd_config.h b/include/linux/drbd_config.h
deleted file mode 100644
index d215365c6bb1..000000000000
--- a/include/linux/drbd_config.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * drbd_config.h
- * DRBD's compile time configuration.
- */
-
-#ifndef DRBD_CONFIG_H
-#define DRBD_CONFIG_H
-
-extern const char *drbd_buildtag(void);
-
-#define REL_VERSION "8.4.11"
-#define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 101
-
-#endif
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
index 53f44b8cd75f..75e671a3c5d1 100644
--- a/include/linux/drbd_genl.h
+++ b/include/linux/drbd_genl.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* General overview:
* full generic netlink message:
@@ -68,7 +68,7 @@
* genl_magic_func.h
* generates an entry in the static genl_ops array,
* and static register/unregister functions to
- * genl_register_family().
+ * genl_register_family_with_ops().
*
* flags and handler:
* GENL_op_init( .doit = x, .dumpit = y, .flags = something)
@@ -96,10 +96,12 @@ GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply,
* and/or the replication group (aka resource) name,
* and the volume id within the resource. */
GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context,
+ __u32_field(6, DRBD_GENLA_F_MANDATORY, ctx_peer_node_id)
__u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume)
__str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128)
__bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128)
__bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128)
+ __str_field_def(5, 0, ctx_conn_name, SHARED_SECRET_MAX)
)
GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
@@ -109,37 +111,45 @@ GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
/* use the resize command to try and change the disk_size */
__u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size)
- /* we could change the max_bio_bvecs,
- * but it won't propagate through the stack */
- __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs)
+ /*__u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs)*/
__u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF)
- __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF)
+ /*__u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF)*/
- __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF)
__s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF)
__u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF)
- __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF)
- __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF)
- __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
- __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
- __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
- __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
- __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
- __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF)
__flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
__flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
__flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
__flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
+ __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
+ __u32_field_def(21, DRBD_GENLA_F_MANDATORY, read_balancing, DRBD_READ_BALANCING_DEF)
+ __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF)
+ __u32_field_def(25, 0 /* OPTIONAL */, rs_discard_granularity, DRBD_RS_DISCARD_GRANULARITY_DEF)
__flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
- __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF)
+ __flg_field_def(24, 0 /* OPTIONAL */, discard_zeroes_if_aligned, DRBD_DISCARD_ZEROES_IF_ALIGNED_DEF)
__flg_field_def(26, 0 /* OPTIONAL */, disable_write_same, DRBD_DISABLE_WRITE_SAME_DEF)
+ __flg_field_def(27, 0 /* OPTIONAL */, d_bitmap, DRBD_BITMAP_DEF)
)
GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
__str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, DRBD_CPU_MASK_SIZE)
__u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF)
+ __flg_field_def(3, DRBD_GENLA_F_MANDATORY, auto_promote, DRBD_AUTO_PROMOTE_DEF)
+ __u32_field(4, DRBD_F_REQUIRED | DRBD_F_INVARIANT, node_id)
+ __u32_field_def(5, DRBD_GENLA_F_MANDATORY, peer_ack_window, DRBD_PEER_ACK_WINDOW_DEF)
+ __u32_field_def(6, DRBD_GENLA_F_MANDATORY, twopc_timeout, DRBD_TWOPC_TIMEOUT_DEF)
+ __u32_field_def(7, DRBD_GENLA_F_MANDATORY, twopc_retry_timeout, DRBD_TWOPC_RETRY_TIMEOUT_DEF)
+ __u32_field_def(8, 0 /* OPTIONAL */, peer_ack_delay, DRBD_PEER_ACK_DELAY_DEF)
+ __u32_field_def(9, 0 /* OPTIONAL */, auto_promote_timeout, DRBD_AUTO_PROMOTE_TIMEOUT_DEF)
+ __u32_field_def(10, 0 /* OPTIONAL */, nr_requests, DRBD_NR_REQUESTS_DEF)
+ __s32_field_def(11, 0 /* OPTIONAL */, quorum, DRBD_QUORUM_DEF)
+ __u32_field_def(12, 0 /* OPTIONAL */, on_no_quorum, DRBD_ON_NO_QUORUM_DEF)
+ __s32_field_def(13, 0 /* OPTIONAL */, quorum_min_redundancy, DRBD_QUORUM_DEF)
+ __u32_field_def(14, 0 /* OPTIONAL */, on_susp_primary_outdated, DRBD_ON_SUSP_PRI_OUTD_DEF)
+ __flg_field_def(15, 0, drbd8_compat_mode, DRBD_DRBD8_COMPAT_MODE_DEF) /* invisible by drbdsetup show */
+ __flg_field_def(16, 0 /* OPTIONAL */, explicit_drbd8_compat, DRBD_DRBD8_COMPAT_MODE_DEF)
)
GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
@@ -157,9 +167,7 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
__u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF)
__u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF)
__u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF)
- __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF)
__u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF)
- __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF)
__u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF)
__u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF)
__u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF)
@@ -168,20 +176,29 @@ GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
__u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF)
__u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF)
__flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF)
- __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data)
__flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF)
__flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF)
- __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative)
__flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
- /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
- /* 9: __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX) */
- /* 9: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */
+ __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF)
+ __str_field_def(31, DRBD_GENLA_F_MANDATORY, name, SHARED_SECRET_MAX)
+ /* moved into ctx_peer_node_id: __u32_field(32, DRBD_F_REQUIRED | DRBD_F_INVARIANT, peer_node_id) */
__flg_field_def(33, 0 /* OPTIONAL */, csums_after_crash_only, DRBD_CSUMS_AFTER_CRASH_ONLY_DEF)
__u32_field_def(34, 0 /* OPTIONAL */, sock_check_timeo, DRBD_SOCKET_CHECK_TIMEO_DEF)
+ __str_field_def(35, DRBD_F_INVARIANT, transport_name, SHARED_SECRET_MAX)
+ __u32_field_def(36, 0 /* OPTIONAL */, max_buffers, DRBD_MAX_BUFFERS_DEF)
+ __flg_field_def(37, 0 /* OPTIONAL */, allow_remote_read, DRBD_ALLOW_REMOTE_READ_DEF)
+ __flg_field_def(38, 0 /* OPTIONAL */, tls, DRBD_TLS_DEF)
+ __s32_field_def(39, 0 /* OPTIONAL */, tls_privkey, DRBD_TLS_PRIVKEY_DEF)
+ __s32_field_def(40, 0 /* OPTIONAL */, tls_certificate, DRBD_TLS_CERTIFICATE_DEF)
+ __s32_field_def(41, 0 /* OPTIONAL */, tls_keyring, DRBD_TLS_KEYRING_DEF)
+ __flg_field_def(42, DRBD_F_INVARIANT, load_balance_paths, DRBD_LOAD_BALANCE_PATHS_DEF)
+ __u32_field_def(43, 0 /* OPTIONAL */, rdma_ctrl_rcvbuf_size, DRBD_RDMA_CTRL_RCVBUF_SIZE_DEF)
+ __u32_field_def(44, 0 /* OPTIONAL */, rdma_ctrl_sndbuf_size, DRBD_RDMA_CTRL_SNDBUF_SIZE_DEF)
+
)
GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
- __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate)
+ __flg_field(1, DRBD_GENLA_F_MANDATORY, force)
)
GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
@@ -192,46 +209,6 @@ GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
__u32_field_def(5, 0 /* OPTIONAL */, al_stripe_size, DRBD_AL_STRIPE_SIZE_DEF)
)
-GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
- /* the reason of the broadcast,
- * if this is an event triggered broadcast. */
- __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason)
- __u32_field(2, DRBD_F_REQUIRED, current_state)
- __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity)
- __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid)
-
- /* These are for broadcast from after state change work.
- * prev_state and new_state are from the moment the state change took
- * place, new_state is not neccessarily the same as current_state,
- * there may have been more state changes since. Which will be
- * broadcasted soon, in their respective after state change work. */
- __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state)
- __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state)
-
- /* if we have a local disk: */
- __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64)))
- __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags)
- __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total)
- __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos)
- /* and in case resync or online verify is active */
- __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total)
- __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed)
-
- /* for pre and post notifications of helper execution */
- __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32)
- __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code)
-
- __u64_field(15, 0, send_cnt)
- __u64_field(16, 0, recv_cnt)
- __u64_field(17, 0, read_cnt)
- __u64_field(18, 0, writ_cnt)
- __u64_field(19, 0, al_writ_cnt)
- __u64_field(20, 0, bm_writ_cnt)
- __u32_field(21, 0, ap_bio_cnt)
- __u32_field(22, 0, ap_pending_cnt)
- __u32_field(23, 0, rs_pending_cnt)
-)
-
GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms,
__u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector)
__u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector)
@@ -239,6 +216,7 @@ GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms,
GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm)
+ __flg_field(2, DRBD_GENLA_F_MANDATORY, force_resync)
)
GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms,
@@ -251,6 +229,13 @@ GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms,
GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
__flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
+ __flg_field_def(2, 0 /* OPTIONAL */, intentional_diskless_detach, DRBD_DISK_DISKLESS_DEF)
+)
+
+GENL_struct(DRBD_NLA_DEVICE_CONF, 14, device_conf,
+ __u32_field_def(1, DRBD_F_INVARIANT, max_bio_size, DRBD_MAX_BIO_SIZE_DEF)
+ __flg_field_def(2, 0 /* OPTIONAL */, intentional_diskless, DRBD_DISK_DISKLESS_DEF)
+ __u32_field_def(3, 0 /* OPTIONAL */, block_size, DRBD_BLOCK_SIZE_DEF)
)
GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
@@ -258,11 +243,16 @@ GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info,
__flg_field(2, 0, res_susp)
__flg_field(3, 0, res_susp_nod)
__flg_field(4, 0, res_susp_fen)
- /* __flg_field(5, 0, res_weak) */
+ __flg_field(5, 0, res_susp_quorum)
+ __flg_field(6, 0, res_fail_io)
)
GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info,
__u32_field(1, 0, dev_disk_state)
+ __flg_field(2, 0, is_intentional_diskless)
+ __flg_field(3, 0, dev_has_quorum)
+ __flg_field(5, 0, dev_is_open)
+ __str_field(4, 0, backing_dev_path, 128)
)
GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info,
@@ -276,6 +266,7 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info,
__u32_field(3, 0, peer_resync_susp_user)
__u32_field(4, 0, peer_resync_susp_peer)
__u32_field(5, 0, peer_resync_susp_dependency)
+ __flg_field(6, 0, peer_is_intentional_diskless)
)
GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics,
@@ -301,6 +292,8 @@ GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics,
GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics,
__flg_field(1, 0, conn_congested)
+ __u64_field(2, 0, ap_in_flight) /* sectors */
+ __u64_field(3, 0, rs_in_flight) /* sectors */
)
GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
@@ -312,6 +305,27 @@ GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics,
__u64_field(6, 0, peer_dev_resync_failed) /* sectors */
__u64_field(7, 0, peer_dev_bitmap_uuid)
__u32_field(9, 0, peer_dev_flags)
+ /* you need the peer_repl_state from peer_device_info
+ * to properly interpret these stats for "progress"
+ * of syncer/verify */
+ __u64_field(10, 0, peer_dev_rs_total) /* sectors */
+ __u64_field(11, 0, peer_dev_ov_start_sector)
+ __u64_field(12, 0, peer_dev_ov_stop_sector)
+ __u64_field(13, 0, peer_dev_ov_position) /* sectors */
+ __u64_field(14, 0, peer_dev_ov_left) /* sectors */
+ __u64_field(15, 0, peer_dev_ov_skipped) /* sectors */
+ __u64_field(16, 0, peer_dev_rs_same_csum)
+ __u64_field(17, 0, peer_dev_rs_dt_start_ms)
+ __u64_field(18, 0, peer_dev_rs_paused_ms)
+	/* resync progress marks for "resync speed" guesstimation */
+ __u64_field(19, 0, peer_dev_rs_dt0_ms)
+ __u64_field(20, 0, peer_dev_rs_db0_sectors)
+ __u64_field(21, 0, peer_dev_rs_dt1_ms)
+ __u64_field(22, 0, peer_dev_rs_db1_sectors)
+ __u32_field(23, 0, peer_dev_rs_c_sync_rate)
+ /* events may not be sent for every change of the UUID flags, however
+ * UUID_FLAG_STABLE can be trusted */
+ __u64_field(24, 0, peer_dev_uuid_flags)
)
GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header,
@@ -323,38 +337,67 @@ GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info,
__u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status)
)
-/*
- * Notifications and commands (genlmsghdr->cmd)
- */
-GENL_mc_group(events)
+GENL_struct(DRBD_NLA_INVALIDATE_PARMS, 25, invalidate_parms,
+ __s32_field_def(1, DRBD_GENLA_F_MANDATORY, sync_from_peer_node_id, DRBD_SYNC_FROM_NID_DEF)
+ __flg_field_def(2, DRBD_GENLA_F_MANDATORY, reset_bitmap, DRBD_INVALIDATE_RESET_BITMAP_DEF)
+)
- /* kernel -> userspace announcement of changes */
-GENL_notification(
- DRBD_EVENT, 1, events,
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
- GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED)
- GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY)
+GENL_struct(DRBD_NLA_FORGET_PEER_PARMS, 26, forget_peer_parms,
+ __s32_field_def(1, DRBD_GENLA_F_MANDATORY, forget_peer_node_id, DRBD_SYNC_FROM_NID_DEF)
)
- /* query kernel for specific or all info */
-GENL_op(
- DRBD_ADM_GET_STATUS, 2,
- GENL_op_init(
- .doit = drbd_adm_get_status,
- .dumpit = drbd_adm_get_status_all,
- /* anyone may ask for the status,
- * it is broadcasted anyways */
- ),
- /* To select the object .doit.
- * Or a subset of objects in .dumpit. */
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+GENL_struct(DRBD_NLA_PEER_DEVICE_OPTS, 27, peer_device_conf,
+ __u32_field_def(1, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF)
+ __u32_field_def(2, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF)
+ __u32_field_def(3, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF)
+ __u32_field_def(4, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
+ __u32_field_def(5, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
+ __u32_field_def(6, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
+ __flg_field_def(7, 0 /* OPTIONAL */, bitmap, DRBD_BITMAP_DEF)
+#if (PRO_FEATURES & DRBD_FF_RESYNC_WITHOUT_REPLICATION) || !defined(__KERNEL__)
+ __flg_field_def(8, 0 /* OPTIONAL */, resync_without_replication, DRBD_RESYNC_WITHOUT_REPLICATION_DEF)
+#endif
+)
+
+GENL_struct(DRBD_NLA_PATH_PARMS, 28, path_parms,
+ __bin_field(1, DRBD_GENLA_F_MANDATORY, my_addr, 128)
+ __bin_field(2, DRBD_GENLA_F_MANDATORY, peer_addr, 128)
+)
+
+GENL_struct(DRBD_NLA_CONNECT_PARMS, 29, connect_parms,
+ __flg_field_def(1, DRBD_GENLA_F_MANDATORY, tentative, 0)
+ __flg_field_def(2, DRBD_GENLA_F_MANDATORY, discard_my_data, 0)
+)
+
+GENL_struct(DRBD_NLA_PATH_INFO, 30, drbd_path_info,
+ __flg_field(1, 0, path_established)
)
+GENL_struct(DRBD_NLA_RENAME_RESOURCE_PARMS, 31, rename_resource_parms,
+ __str_field(1, DRBD_GENLA_F_MANDATORY, new_resource_name, 128)
+)
+
+GENL_struct(DRBD_NLA_RENAME_RESOURCE_INFO, 32, rename_resource_info,
+ __str_field(1, DRBD_GENLA_F_MANDATORY, res_new_name, 128)
+)
+
+GENL_struct(DRBD_NLA_INVAL_PEER_PARAMS, 33, invalidate_peer_parms,
+ __flg_field_def(1, DRBD_GENLA_F_MANDATORY, p_reset_bitmap, DRBD_INVALIDATE_RESET_BITMAP_DEF)
+)
+
+GENL_struct(DRBD_NLA_SUSPEND_IO_PARAMS, 34, suspend_io_parms,
+ __flg_field_def(1, DRBD_GENLA_F_MANDATORY, bdev_freeze, DRBD_SUSPEND_IO_BDEV_FREEZE_DEF)
+)
+
+/*
+ * Notifications and commands (genlmsghdr->cmd)
+ */
+GENL_mc_group(events)
+
/* add DRBD minor devices as volumes to resources */
GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_new_minor),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DEVICE_CONF, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_del_minor),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
@@ -370,11 +413,29 @@ GENL_op(DRBD_ADM_RESOURCE_OPTS, 9,
GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY)
)
-GENL_op(
- DRBD_ADM_CONNECT, 10,
- GENL_doit(drbd_adm_connect),
+GENL_op(DRBD_ADM_NEW_PEER, 44, GENL_doit(drbd_adm_new_peer),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
- GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(DRBD_ADM_NEW_PATH, 45, GENL_doit(drbd_adm_new_path),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_PATH_PARMS, DRBD_F_REQUIRED)
+)
+
+GENL_op(DRBD_ADM_DEL_PEER, 46, GENL_doit(drbd_adm_del_peer),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DISCONNECT_PARMS, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(DRBD_ADM_DEL_PATH, 47, GENL_doit(drbd_adm_del_path),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_PATH_PARMS, DRBD_F_REQUIRED)
+)
+
+GENL_op(DRBD_ADM_CONNECT, 10, GENL_doit(drbd_adm_connect),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_CONNECT_PARMS, DRBD_GENLA_F_MANDATORY)
)
GENL_op(
@@ -385,7 +446,9 @@ GENL_op(
)
GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DISCONNECT_PARMS, DRBD_GENLA_F_MANDATORY)
+)
GENL_op(DRBD_ADM_ATTACH, 12,
GENL_doit(drbd_adm_attach),
@@ -438,15 +501,22 @@ GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach),
GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_INVALIDATE_PARMS, DRBD_F_REQUIRED))
+
GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_INVAL_PEER_PARAMS, 0 /* OPTIONAL */))
+
GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+
GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_SUSPEND_IO_PARAMS, 0 /* OPTIONAL */))
+
GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate),
@@ -457,39 +527,47 @@ GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
GENL_op(DRBD_ADM_GET_RESOURCES, 30,
- GENL_op_init(
- .dumpit = drbd_adm_dump_resources,
- ),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+ GENL_op_init(
+ .dumpit = drbd_adm_dump_resources,
+ ),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_DEVICES, 31,
- GENL_op_init(
- .dumpit = drbd_adm_dump_devices,
- .done = drbd_adm_dump_devices_done,
- ),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+ GENL_op_init(
+ .dumpit = drbd_adm_dump_devices,
+ .done = drbd_adm_dump_devices_done,
+ ),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_CONNECTIONS, 32,
- GENL_op_init(
- .dumpit = drbd_adm_dump_connections,
- .done = drbd_adm_dump_connections_done,
- ),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
+ GENL_op_init(
+ .dumpit = drbd_adm_dump_connections,
+ .done = drbd_adm_dump_connections_done,
+ ),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY))
GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33,
- GENL_op_init(
- .dumpit = drbd_adm_dump_peer_devices,
- .done = drbd_adm_dump_peer_devices_done,
- ),
- GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
- GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+ GENL_op_init(
+ .dumpit = drbd_adm_dump_peer_devices,
+ .done = drbd_adm_dump_peer_devices_done,
+ ),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY))
+
+GENL_op(DRBD_ADM_GET_PATHS, 50,
+ GENL_op_init(
+ .dumpit = drbd_adm_dump_paths,
+ .done = drbd_adm_dump_paths_done,
+ ),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_PATH_INFO, DRBD_GENLA_F_MANDATORY))
GENL_notification(
DRBD_RESOURCE_STATE, 34, events,
@@ -509,6 +587,7 @@ GENL_notification(
DRBD_CONNECTION_STATE, 36, events,
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_PATH_PARMS, DRBD_GENLA_F_MANDATORY)
GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED)
GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED))
@@ -522,7 +601,8 @@ GENL_notification(
GENL_op(
DRBD_ADM_GET_INITIAL_STATE, 38,
GENL_op_init(
- .dumpit = drbd_adm_get_initial_state,
+ .dumpit = drbd_adm_get_initial_state,
+ .done = drbd_adm_get_initial_state_done,
),
GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY))
@@ -534,3 +614,21 @@ GENL_notification(
GENL_notification(
DRBD_INITIAL_STATE_DONE, 41, events,
GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_FORGET_PEER, 42, GENL_doit(drbd_adm_forget_peer),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_FORGET_PEER_PARMS, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_CHG_PEER_DEVICE_OPTS, 43,
+ GENL_doit(drbd_adm_peer_device_opts),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_PEER_DEVICE_OPTS, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_RENAME_RESOURCE, 49, GENL_doit(drbd_adm_rename_resource),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_RENAME_RESOURCE_PARMS, DRBD_F_REQUIRED))
+
+GENL_notification(
+ DRBD_PATH_STATE, 48, events,
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_PATH_INFO, DRBD_F_REQUIRED))
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 5b042fb427e9..ed38f94d43c6 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -64,7 +64,7 @@
#define DRBD_DISK_TIMEOUT_DEF 0U /* disabled */
#define DRBD_DISK_TIMEOUT_SCALE '1'
- /* active connection retries when C_WF_CONNECTION */
+ /* active connection retries when C_CONNECTING */
#define DRBD_CONNECT_INT_MIN 1U
#define DRBD_CONNECT_INT_MAX 120U
#define DRBD_CONNECT_INT_DEF 10U /* seconds */
@@ -88,14 +88,13 @@
#define DRBD_MAX_EPOCH_SIZE_DEF 2048U
#define DRBD_MAX_EPOCH_SIZE_SCALE '1'
- /* I don't think that a tcp send buffer of more than 10M is useful */
#define DRBD_SNDBUF_SIZE_MIN 0U
-#define DRBD_SNDBUF_SIZE_MAX (10U<<20)
+#define DRBD_SNDBUF_SIZE_MAX (128U<<20)
#define DRBD_SNDBUF_SIZE_DEF 0U
#define DRBD_SNDBUF_SIZE_SCALE '1'
#define DRBD_RCVBUF_SIZE_MIN 0U
-#define DRBD_RCVBUF_SIZE_MAX (10U<<20)
+#define DRBD_RCVBUF_SIZE_MAX (128U<<20)
#define DRBD_RCVBUF_SIZE_DEF 0U
#define DRBD_RCVBUF_SIZE_SCALE '1'
@@ -117,16 +116,19 @@
#define DRBD_KO_COUNT_MAX 200U
#define DRBD_KO_COUNT_DEF 7U
#define DRBD_KO_COUNT_SCALE '1'
+
+#define DRBD_ALLOW_REMOTE_READ_DEF 1U
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
#define DRBD_RESYNC_RATE_MIN 1U
/* channel bonding 10 GbE, or other hardware */
-#define DRBD_RESYNC_RATE_MAX (4 << 20)
+#define DRBD_RESYNC_RATE_MAX (8U << 20)
#define DRBD_RESYNC_RATE_DEF 250U
#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
+	/* Fewer than 67 would hurt performance unnecessarily. */
#define DRBD_AL_EXTENTS_MIN 67U
/* we use u16 as "slot number", (u16)~0 is "FREE".
* If you use >= 292 kB on-disk ring buffer,
@@ -182,7 +184,7 @@
#define DRBD_C_FILL_TARGET_DEF 100U /* Try to place 50KiB in socket send buffer during resync */
#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */
-#define DRBD_C_MAX_RATE_MIN 250U
+#define DRBD_C_MAX_RATE_MIN 0U
#define DRBD_C_MAX_RATE_MAX (4U << 20)
#define DRBD_C_MAX_RATE_DEF 102400U
#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */
@@ -207,10 +209,11 @@
#define DRBD_DISK_BARRIER_DEF 0U
#define DRBD_DISK_FLUSHES_DEF 1U
#define DRBD_DISK_DRAIN_DEF 1U
+#define DRBD_DISK_DISKLESS_DEF 0U
#define DRBD_MD_FLUSHES_DEF 1U
#define DRBD_TCP_CORK_DEF 1U
#define DRBD_AL_UPDATES_DEF 1U
-
+#define DRBD_INVALIDATE_RESET_BITMAP_DEF 1U
/* We used to ignore the discard_zeroes_data setting.
* To not change established (and expected) behaviour,
* by default assume that, for discard_zeroes_data=0,
@@ -227,6 +230,52 @@
#define DRBD_ALWAYS_ASBP_DEF 0U
#define DRBD_USE_RLE_DEF 1U
#define DRBD_CSUMS_AFTER_CRASH_ONLY_DEF 0U
+#define DRBD_AUTO_PROMOTE_DEF 1U
+#define DRBD_BITMAP_DEF 1U
+#define DRBD_RESYNC_WITHOUT_REPLICATION_DEF 1U
+
+#define DRBD_NR_REQUESTS_MIN 4U
+#define DRBD_NR_REQUESTS_DEF 8000U
+#define DRBD_NR_REQUESTS_MAX -1U
+#define DRBD_NR_REQUESTS_SCALE '1'
+
+#define DRBD_MAX_BIO_SIZE_DEF DRBD_MAX_BIO_SIZE
+#define DRBD_MAX_BIO_SIZE_MIN (1U << 9)
+#define DRBD_MAX_BIO_SIZE_MAX DRBD_MAX_BIO_SIZE
+#define DRBD_MAX_BIO_SIZE_SCALE '1'
+
+#define DRBD_NODE_ID_DEF 0U
+#define DRBD_NODE_ID_MIN 0U
+#ifndef DRBD_NODE_ID_MAX /* Is also defined in drbd.h */
+#define DRBD_NODE_ID_MAX DRBD_PEERS_MAX
+#endif
+#define DRBD_NODE_ID_SCALE '1'
+
+#define DRBD_PEER_ACK_WINDOW_DEF 4096U /* 2 MiByte */
+#define DRBD_PEER_ACK_WINDOW_MIN 2048U /* 1 MiByte */
+#define DRBD_PEER_ACK_WINDOW_MAX 204800U /* 100 MiByte */
+#define DRBD_PEER_ACK_WINDOW_SCALE 's' /* sectors */
+
+#define DRBD_PEER_ACK_DELAY_DEF 100U /* 100ms */
+#define DRBD_PEER_ACK_DELAY_MIN 1U
+#define DRBD_PEER_ACK_DELAY_MAX 10000U /* 10 seconds */
+#define DRBD_PEER_ACK_DELAY_SCALE '1' /* milliseconds */
+
+/* Two-phase commit timeout (1/10 seconds). */
+#define DRBD_TWOPC_TIMEOUT_MIN 50U
+#define DRBD_TWOPC_TIMEOUT_MAX 600U
+#define DRBD_TWOPC_TIMEOUT_DEF 300U
+#define DRBD_TWOPC_TIMEOUT_SCALE '1'
+
+#define DRBD_TWOPC_RETRY_TIMEOUT_MIN 1U
+#define DRBD_TWOPC_RETRY_TIMEOUT_MAX 50U
+#define DRBD_TWOPC_RETRY_TIMEOUT_DEF 1U
+#define DRBD_TWOPC_RETRY_TIMEOUT_SCALE '1'
+
+#define DRBD_SYNC_FROM_NID_DEF -1
+#define DRBD_SYNC_FROM_NID_MIN -1
+#define DRBD_SYNC_FROM_NID_MAX DRBD_PEERS_MAX
+#define DRBD_SYNC_FROM_NID_SCALE '1'
#define DRBD_AL_STRIPES_MIN 1U
#define DRBD_AL_STRIPES_MAX 1024U
@@ -243,9 +292,51 @@
#define DRBD_SOCKET_CHECK_TIMEO_DEF 0U
#define DRBD_SOCKET_CHECK_TIMEO_SCALE '1'
+/* Auto promote timeout (1/10 seconds). */
+#define DRBD_AUTO_PROMOTE_TIMEOUT_MIN 0U
+#define DRBD_AUTO_PROMOTE_TIMEOUT_MAX 600U
+#define DRBD_AUTO_PROMOTE_TIMEOUT_DEF 20U
+#define DRBD_AUTO_PROMOTE_TIMEOUT_SCALE '1'
+
#define DRBD_RS_DISCARD_GRANULARITY_MIN 0U
#define DRBD_RS_DISCARD_GRANULARITY_MAX (1U<<20) /* 1MiByte */
#define DRBD_RS_DISCARD_GRANULARITY_DEF 0U /* disabled by default */
#define DRBD_RS_DISCARD_GRANULARITY_SCALE '1' /* bytes */
+#define DRBD_QUORUM_MIN 0U
+#define DRBD_QUORUM_MAX QOU_ALL /* Note: user visible min/max different */
+#define DRBD_QUORUM_DEF QOU_OFF /* kernel min/max includes symbolic values */
+#define DRBD_QUORUM_SCALE '1' /* nodes */
+
+#define DRBD_BLOCK_SIZE_MIN 512
+#define DRBD_BLOCK_SIZE_MAX 4096
+#define DRBD_BLOCK_SIZE_DEF 512
+#define DRBD_BLOCK_SIZE_SCALE '1' /* Bytes */
+
+/* By default freeze I/O; if set to error, fail all I/Os as quickly as possible */
+#define DRBD_ON_NO_QUORUM_DEF ONQ_SUSPEND_IO
+
+#define DRBD_ON_SUSP_PRI_OUTD_DEF SPO_DISCONNECT
+#define DRBD_DRBD8_COMPAT_MODE_DEF 0U
+
+#define DRBD_TLS_DEF 0U /* disabled by default */
+#define DRBD_TLS_PRIVKEY_DEF 0 /* disabled by default */
+#define DRBD_TLS_CERTIFICATE_DEF 0 /* disabled by default */
+#define DRBD_TLS_KEYRING_DEF 0 /* disabled by default */
+
+#define DRBD_LOAD_BALANCE_PATHS_DEF 0U
+
+#define DRBD_RDMA_CTRL_RCVBUF_SIZE_MIN 0U
+#define DRBD_RDMA_CTRL_RCVBUF_SIZE_MAX (10U<<20)
+#define DRBD_RDMA_CTRL_RCVBUF_SIZE_DEF 0
+#define DRBD_RDMA_CTRL_RCVBUF_SIZE_SCALE '1'
+
+#define DRBD_RDMA_CTRL_SNDBUF_SIZE_MIN 0U
+#define DRBD_RDMA_CTRL_SNDBUF_SIZE_MAX (10U<<20)
+#define DRBD_RDMA_CTRL_SNDBUF_SIZE_DEF 0
+#define DRBD_RDMA_CTRL_SNDBUF_SIZE_SCALE '1'
+
+/* Enable bdev_freeze/lockfs by default */
+#define DRBD_SUSPEND_IO_BDEV_FREEZE_DEF 1U
+
#endif
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
index d4da060b7532..db462b860d18 100644
--- a/include/linux/genl_magic_func.h
+++ b/include/linux/genl_magic_func.h
@@ -130,41 +130,53 @@ static void dprint_array(const char *dir, int nla_type,
* {{{2
*/
-/* processing of generic netlink messages is serialized.
- * use one static buffer for parsing of nested attributes */
-static struct nlattr *nested_attr_tb[128];
-
#undef GENL_struct
#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
-/* *_from_attrs functions are static, but potentially unused */ \
static int __ ## s_name ## _from_attrs(struct s_name *s, \
+ struct nlattr ***ret_nested_attribute_table, \
struct genl_info *info, bool exclude_invariants) \
{ \
const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \
struct nlattr *tla = info->attrs[tag_number]; \
- struct nlattr **ntb = nested_attr_tb; \
+ struct nlattr **ntb; \
struct nlattr *nla; \
- int err; \
- BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \
+ int err = 0; \
+ if (ret_nested_attribute_table) \
+ *ret_nested_attribute_table = NULL; \
if (!tla) \
return -ENOMSG; \
+ ntb = kcalloc(ARRAY_SIZE(s_name ## _nl_policy), sizeof(*ntb), GFP_KERNEL); \
+ if (!ntb) \
+ return -ENOMEM; \
DPRINT_TLA(#s_name, "<=-", #tag_name); \
err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \
if (err) \
- return err; \
+ goto out; \
\
s_fields \
- return 0; \
+ out: \
+ if (!err && ret_nested_attribute_table) \
+ *ret_nested_attribute_table = ntb; \
+ else \
+ kfree(ntb); \
+ return err; \
} __attribute__((unused)) \
static int s_name ## _from_attrs(struct s_name *s, \
struct genl_info *info) \
{ \
- return __ ## s_name ## _from_attrs(s, info, false); \
+ return __ ## s_name ## _from_attrs(s, NULL, info, false); \
+} __attribute__((unused)) \
+static int s_name ## _ntb_from_attrs( \
+ struct nlattr ***ret_nested_attribute_table, \
+ struct genl_info *info) \
+{ \
+ return __ ## s_name ## _from_attrs(NULL, \
+ ret_nested_attribute_table, info, false); \
} __attribute__((unused)) \
static int s_name ## _from_attrs_for_change(struct s_name *s, \
struct genl_info *info) \
{ \
- return __ ## s_name ## _from_attrs(s, info, true); \
+ return __ ## s_name ## _from_attrs(s, NULL, info, true); \
} __attribute__((unused)) \
#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \
@@ -172,7 +184,8 @@ static int s_name ## _from_attrs_for_change(struct s_name *s, \
if (nla) { \
if (exclude_invariants && !!((attr_flag) & DRBD_F_INVARIANT)) { \
pr_info("<< must not change invariant attr: %s\n", #name); \
- return -EEXIST; \
+ err = -EEXIST; \
+ goto out; \
} \
assignment; \
} else if (exclude_invariants && !!((attr_flag) & DRBD_F_INVARIANT)) { \
@@ -180,7 +193,8 @@ static int s_name ## _from_attrs_for_change(struct s_name *s, \
/* which was expected */ \
} else if ((attr_flag) & DRBD_F_REQUIRED) { \
pr_info("<< missing attr: %s\n", #name); \
- return -ENOMSG; \
+ err = -ENOMSG; \
+ goto out; \
}
#undef __field
@@ -271,12 +285,12 @@ enum CONCATENATE(GENL_MAGIC_FAMILY, group_ids) {
#undef GENL_mc_group
#define GENL_mc_group(group) \
static int CONCATENATE(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \
- struct sk_buff *skb, gfp_t flags) \
+ struct sk_buff *skb) \
{ \
unsigned int group_id = \
CONCATENATE(GENL_MAGIC_FAMILY, _group_ ## group); \
- return genlmsg_multicast(&ZZZ_genl_family, skb, 0, \
- group_id, flags); \
+ return genlmsg_multicast_allns(&ZZZ_genl_family, skb, 0, \
+ group_id); \
}
#include GENL_MAGIC_INCLUDE_FILE
@@ -298,6 +312,8 @@ static struct genl_family ZZZ_genl_family __ro_after_init = {
.resv_start_op = 42, /* drbd is currently the only user */
.n_mcgrps = ARRAY_SIZE(ZZZ_genl_mcgrps),
.module = THIS_MODULE,
+ .netnsok = false,
+ .parallel_ops = true,
};
int CONCATENATE(GENL_MAGIC_FAMILY, _genl_register)(void)
--
2.53.0
© 2016 - 2026 Red Hat, Inc.