When writing to a qcow2 file there are two functions that take a
virtual offset and return a host offset, possibly allocating new
clusters if necessary:
- handle_copied() looks for normal data clusters that are already
allocated and have a reference count of 1. In those clusters we
can simply write the data and there is no need to perform any
copy-on-write.
- handle_alloc() looks for clusters that do need copy-on-write,
either because they haven't been allocated yet, because their
reference count is != 1 or because they are ZERO_ALLOC clusters.
The ZERO_ALLOC case is a bit special because those are clusters that
are already allocated and they could perfectly be dealt with in
handle_copied() (as long as copy-on-write is performed when required).
In fact, there is extra code specifically for them in handle_alloc()
that tries to reuse the existing allocation if possible and frees them
otherwise.
This patch changes the handling of ZERO_ALLOC clusters so the
semantics of these two functions are now like this:
- handle_copied() looks for clusters that are already allocated and
which we can overwrite (NORMAL and ZERO_ALLOC clusters with a
reference count of 1).
- handle_alloc() looks for clusters for which we need a new
allocation (all other cases).
One important difference after this change is that clusters found
in handle_copied() may now require copy-on-write, but this will be
necessary anyway once we add support for subclusters.
Signed-off-by: Alberto Garcia <berto@igalia.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Reviewed-by: Max Reitz <mreitz@redhat.com>
---
block/qcow2-cluster.c | 230 ++++++++++++++++++++++++------------------
1 file changed, 130 insertions(+), 100 deletions(-)
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e251d00890..5c81046c34 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1041,13 +1041,18 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
/*
* For a given write request, create a new QCowL2Meta structure, add
- * it to @m and the BDRVQcow2State.cluster_allocs list.
+ * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
+ * request does not need copy-on-write or changes to the L2 metadata
+ * then this function does nothing.
*
* @host_cluster_offset points to the beginning of the first cluster.
*
* @guest_offset and @bytes indicate the offset and length of the
* request.
*
+ * @l2_slice contains the L2 entries of all clusters involved in this
+ * write request.
+ *
* If @keep_old is true it means that the clusters were already
* allocated and will be overwritten. If false then the clusters are
* new and we have to decrease the reference count of the old ones.
@@ -1055,15 +1060,53 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
static void calculate_l2_meta(BlockDriverState *bs,
uint64_t host_cluster_offset,
uint64_t guest_offset, unsigned bytes,
- QCowL2Meta **m, bool keep_old)
+ uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
{
BDRVQcow2State *s = bs->opaque;
- unsigned cow_start_from = 0;
+ int l2_index = offset_to_l2_slice_index(s, guest_offset);
+ uint64_t l2_entry;
+ unsigned cow_start_from, cow_end_to;
unsigned cow_start_to = offset_into_cluster(s, guest_offset);
unsigned cow_end_from = cow_start_to + bytes;
- unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
unsigned nb_clusters = size_to_clusters(s, cow_end_from);
QCowL2Meta *old_m = *m;
+ QCow2ClusterType type;
+
+ assert(nb_clusters <= s->l2_slice_size - l2_index);
+
+ /* Return if there's no COW (all clusters are normal and we keep them) */
+ if (keep_old) {
+ int i;
+ for (i = 0; i < nb_clusters; i++) {
+ l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
+ if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) {
+ break;
+ }
+ }
+ if (i == nb_clusters) {
+ return;
+ }
+ }
+
+ /* Get the L2 entry of the first cluster */
+ l2_entry = be64_to_cpu(l2_slice[l2_index]);
+ type = qcow2_get_cluster_type(bs, l2_entry);
+
+ if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
+ cow_start_from = cow_start_to;
+ } else {
+ cow_start_from = 0;
+ }
+
+ /* Get the L2 entry of the last cluster */
+ l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]);
+ type = qcow2_get_cluster_type(bs, l2_entry);
+
+ if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
+ cow_end_to = cow_end_from;
+ } else {
+ cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
+ }
*m = g_malloc0(sizeof(**m));
**m = (QCowL2Meta) {
@@ -1089,18 +1132,22 @@ static void calculate_l2_meta(BlockDriverState *bs,
QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
}
-/* Returns true if writing to a cluster requires COW */
-static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry)
+/*
+ * Returns true if writing to the cluster pointed to by @l2_entry
+ * requires a new allocation (that is, if the cluster is unallocated
+ * or has refcount > 1 and therefore cannot be written in-place).
+ */
+static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
{
switch (qcow2_get_cluster_type(bs, l2_entry)) {
case QCOW2_CLUSTER_NORMAL:
+ case QCOW2_CLUSTER_ZERO_ALLOC:
if (l2_entry & QCOW_OFLAG_COPIED) {
return false;
}
case QCOW2_CLUSTER_UNALLOCATED:
case QCOW2_CLUSTER_COMPRESSED:
case QCOW2_CLUSTER_ZERO_PLAIN:
- case QCOW2_CLUSTER_ZERO_ALLOC:
return true;
default:
abort();
@@ -1108,20 +1155,38 @@ static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry)
}
/*
- * Returns the number of contiguous clusters that can be used for an allocating
- * write, but require COW to be performed (this includes yet unallocated space,
- * which must copy from the backing file)
+ * Returns the number of contiguous clusters that can be written to
+ * using one single write request, starting from @l2_index.
+ * At most @nb_clusters are checked.
+ *
+ * If @new_alloc is true this counts clusters that are either
+ * unallocated, or allocated but with refcount > 1 (so they need to be
+ * newly allocated and COWed).
+ *
+ * If @new_alloc is false this counts clusters that are already
+ * allocated and can be overwritten in-place (this includes clusters
+ * of type QCOW2_CLUSTER_ZERO_ALLOC).
*/
-static int count_cow_clusters(BlockDriverState *bs, int nb_clusters,
- uint64_t *l2_slice, int l2_index)
+static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
+ uint64_t *l2_slice, int l2_index,
+ bool new_alloc)
{
+ BDRVQcow2State *s = bs->opaque;
+ uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index]);
+ uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
int i;
for (i = 0; i < nb_clusters; i++) {
- uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
- if (!cluster_needs_cow(bs, l2_entry)) {
+ l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
+ if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
break;
}
+ if (!new_alloc) {
+ if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
+ break;
+ }
+ expected_offset += s->cluster_size;
+ }
}
assert(i <= nb_clusters);
@@ -1192,10 +1257,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
}
/*
- * Checks how many already allocated clusters that don't require a copy on
- * write there are at the given guest_offset (up to *bytes). If *host_offset is
- * not INV_OFFSET, only physically contiguous clusters beginning at this host
- * offset are counted.
+ * Checks how many already allocated clusters that don't require a new
+ * allocation there are at the given guest_offset (up to *bytes).
+ * If *host_offset is not INV_OFFSET, only physically contiguous clusters
+ * beginning at this host offset are counted.
*
* Note that guest_offset may not be cluster aligned. In this case, the
* returned *host_offset points to exact byte referenced by guest_offset and
@@ -1204,12 +1269,12 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
* Returns:
* 0: if no allocated clusters are available at the given offset.
* *bytes is normally unchanged. It is set to 0 if the cluster
- * is allocated and doesn't need COW, but doesn't have the right
- * physical offset.
+ * is allocated and can be overwritten in-place but doesn't have
+ * the right physical offset.
*
- * 1: if allocated clusters that don't require a COW are available at
- * the requested offset. *bytes may have decreased and describes
- * the length of the area that can be written to.
+ * 1: if allocated clusters that can be overwritten in place are
+ * available at the requested offset. *bytes may have decreased
+ * and describes the length of the area that can be written to.
*
* -errno: in error cases
*/
@@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
l2_index = offset_to_l2_slice_index(s, guest_offset);
nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
- assert(nb_clusters <= INT_MAX);
+ /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
+ nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
/* Find L2 entry for the first involved cluster */
ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
@@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
cluster_offset = be64_to_cpu(l2_slice[l2_index]);
- /* Check how many clusters are already allocated and don't need COW */
- if (qcow2_get_cluster_type(bs, cluster_offset) == QCOW2_CLUSTER_NORMAL
- && (cluster_offset & QCOW_OFLAG_COPIED))
- {
+ if (!cluster_needs_new_alloc(bs, cluster_offset)) {
/* If a specific host_offset is required, check it */
bool offset_matches =
(cluster_offset & L2E_OFFSET_MASK) == *host_offset;
if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
- qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
+ qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
"%#llx unaligned (guest offset: %#" PRIx64
- ")", cluster_offset & L2E_OFFSET_MASK,
+ ")", cluster_offset & QCOW_OFLAG_ZERO ?
+ "Preallocated zero" : "Data",
+ cluster_offset & L2E_OFFSET_MASK,
guest_offset);
ret = -EIO;
goto out;
@@ -1273,15 +1338,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
}
/* We keep all QCOW_OFLAG_COPIED clusters */
- keep_clusters =
- count_contiguous_clusters(bs, nb_clusters, s->cluster_size,
- &l2_slice[l2_index],
- QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
+ keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
+ l2_index, false);
assert(keep_clusters <= nb_clusters);
*bytes = MIN(*bytes,
keep_clusters * s->cluster_size
- offset_into_cluster(s, guest_offset));
+ assert(*bytes != 0);
+
+ calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset,
+ *bytes, l2_slice, m, true);
ret = 1;
} else {
@@ -1357,9 +1424,10 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
}
/*
- * Allocates new clusters for an area that either is yet unallocated or needs a
- * copy on write. If *host_offset is not INV_OFFSET, clusters are only
- * allocated if the new allocation can match the specified host offset.
+ * Allocates new clusters for an area that is either still unallocated or
+ * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
+ * clusters are only allocated if the new allocation can match the specified
+ * host offset.
*
* Note that guest_offset may not be cluster aligned. In this case, the
* returned *host_offset points to exact byte referenced by guest_offset and
@@ -1382,12 +1450,10 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
BDRVQcow2State *s = bs->opaque;
int l2_index;
uint64_t *l2_slice;
- uint64_t entry;
uint64_t nb_clusters;
int ret;
- bool keep_old_clusters = false;
- uint64_t alloc_cluster_offset = INV_OFFSET;
+ uint64_t alloc_cluster_offset;
trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
*bytes);
@@ -1402,10 +1468,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
l2_index = offset_to_l2_slice_index(s, guest_offset);
nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
- assert(nb_clusters <= INT_MAX);
-
- /* Limit total allocation byte count to INT_MAX */
- nb_clusters = MIN(nb_clusters, INT_MAX >> s->cluster_bits);
+ /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
+ nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
/* Find L2 entry for the first involved cluster */
ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
@@ -1413,67 +1477,32 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
return ret;
}
- entry = be64_to_cpu(l2_slice[l2_index]);
- nb_clusters = count_cow_clusters(bs, nb_clusters, l2_slice, l2_index);
+ nb_clusters = count_single_write_clusters(bs, nb_clusters,
+ l2_slice, l2_index, true);
/* This function is only called when there were no non-COW clusters, so if
* we can't find any unallocated or COW clusters either, something is
* wrong with our code. */
assert(nb_clusters > 0);
- if (qcow2_get_cluster_type(bs, entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
- (entry & QCOW_OFLAG_COPIED) &&
- (*host_offset == INV_OFFSET ||
- start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
- {
- int preallocated_nb_clusters;
-
- if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) {
- qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero "
- "cluster offset %#llx unaligned (guest "
- "offset: %#" PRIx64 ")",
- entry & L2E_OFFSET_MASK, guest_offset);
- ret = -EIO;
- goto fail;
- }
-
- /* Try to reuse preallocated zero clusters; contiguous normal clusters
- * would be fine, too, but count_cow_clusters() above has limited
- * nb_clusters already to a range of COW clusters */
- preallocated_nb_clusters =
- count_contiguous_clusters(bs, nb_clusters, s->cluster_size,
- &l2_slice[l2_index], QCOW_OFLAG_COPIED);
- assert(preallocated_nb_clusters > 0);
-
- nb_clusters = preallocated_nb_clusters;
- alloc_cluster_offset = entry & L2E_OFFSET_MASK;
-
- /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
- * should not free them. */
- keep_old_clusters = true;
+ /* Allocate at a given offset in the image file */
+ alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
+ start_of_cluster(s, *host_offset);
+ ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
+ &nb_clusters);
+ if (ret < 0) {
+ goto out;
}
- qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
-
- if (alloc_cluster_offset == INV_OFFSET) {
- /* Allocate, if necessary at a given offset in the image file */
- alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
- start_of_cluster(s, *host_offset);
- ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
- &nb_clusters);
- if (ret < 0) {
- goto fail;
- }
-
- /* Can't extend contiguous allocation */
- if (nb_clusters == 0) {
- *bytes = 0;
- return 0;
- }
-
- assert(alloc_cluster_offset != INV_OFFSET);
+ /* Can't extend contiguous allocation */
+ if (nb_clusters == 0) {
+ *bytes = 0;
+ ret = 0;
+ goto out;
}
+ assert(alloc_cluster_offset != INV_OFFSET);
+
/*
* Save info needed for meta data update.
*
@@ -1496,13 +1525,14 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
*bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
assert(*bytes != 0);
- calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
- m, keep_old_clusters);
+ calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice,
+ m, false);
- return 1;
+ ret = 1;
-fail:
- if (*m && (*m)->nb_clusters > 0) {
+out:
+ qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
+ if (ret < 0 && *m && (*m)->nb_clusters > 0) {
QLIST_REMOVE(*m, next_in_flight);
}
return ret;
--
2.20.1
I'm sorry that I'm only joining now and may be asking questions that were already discussed for previous versions :(
17.03.2020 21:16, Alberto Garcia wrote:
> When writing to a qcow2 file there are two functions that take a
> virtual offset and return a host offset, possibly allocating new
> clusters if necessary:
>
> - handle_copied() looks for normal data clusters that are already
> allocated and have a reference count of 1. In those clusters we
> can simply write the data and there is no need to perform any
> copy-on-write.
>
> - handle_alloc() looks for clusters that do need copy-on-write,
> either because they haven't been allocated yet, because their
> reference count is != 1 or because they are ZERO_ALLOC clusters.
>
> The ZERO_ALLOC case is a bit special because those are clusters that
> are already allocated and they could perfectly be dealt with in
> handle_copied() (as long as copy-on-write is performed when required).
>
> In fact, there is extra code specifically for them in handle_alloc()
> that tries to reuse the existing allocation if possible and frees them
> otherwise.
>
> This patch changes the handling of ZERO_ALLOC clusters so the
> semantics of these two functions are now like this:
>
> - handle_copied() looks for clusters that are already allocated and
> which we can overwrite (NORMAL and ZERO_ALLOC clusters with a
> reference count of 1).
>
> - handle_alloc() looks for clusters for which we need a new
> allocation (all other cases).
>
> One important difference after this change is that clusters found
> in handle_copied() may now require copy-on-write, but this will be
> necessary anyway once we add support for subclusters.
>
> Signed-off-by: Alberto Garcia <berto@igalia.com>
> Reviewed-by: Eric Blake <eblake@redhat.com>
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> ---
> block/qcow2-cluster.c | 230 ++++++++++++++++++++++++------------------
> 1 file changed, 130 insertions(+), 100 deletions(-)
>
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index e251d00890..5c81046c34 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -1041,13 +1041,18 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
>
> /*
> * For a given write request, create a new QCowL2Meta structure, add
> - * it to @m and the BDRVQcow2State.cluster_allocs list.
> + * it to @m and the BDRVQcow2State.cluster_allocs list. If the write
> + * request does not need copy-on-write or changes to the L2 metadata
> + * then this function does nothing.
> *
> * @host_cluster_offset points to the beginning of the first cluster.
> *
> * @guest_offset and @bytes indicate the offset and length of the
> * request.
> *
> + * @l2_slice contains the L2 entries of all clusters involved in this
> + * write request.
> + *
> * If @keep_old is true it means that the clusters were already
> * allocated and will be overwritten. If false then the clusters are
> * new and we have to decrease the reference count of the old ones.
> @@ -1055,15 +1060,53 @@ void qcow2_alloc_cluster_abort(BlockDriverState *bs, QCowL2Meta *m)
> static void calculate_l2_meta(BlockDriverState *bs,
> uint64_t host_cluster_offset,
> uint64_t guest_offset, unsigned bytes,
> - QCowL2Meta **m, bool keep_old)
> + uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
> {
> BDRVQcow2State *s = bs->opaque;
> - unsigned cow_start_from = 0;
> + int l2_index = offset_to_l2_slice_index(s, guest_offset);
> + uint64_t l2_entry;
> + unsigned cow_start_from, cow_end_to;
> unsigned cow_start_to = offset_into_cluster(s, guest_offset);
> unsigned cow_end_from = cow_start_to + bytes;
> - unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
> unsigned nb_clusters = size_to_clusters(s, cow_end_from);
> QCowL2Meta *old_m = *m;
> + QCow2ClusterType type;
> +
> + assert(nb_clusters <= s->l2_slice_size - l2_index);
> +
> + /* Return if there's no COW (all clusters are normal and we keep them) */
> + if (keep_old) {
> + int i;
> + for (i = 0; i < nb_clusters; i++) {
> + l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
> + if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) {
Could we also allow full ZERO_ALLOC clusters here?
> + break;
> + }
> + }
> + if (i == nb_clusters) {
> + return;
> + }
> + }
> +
> + /* Get the L2 entry of the first cluster */
> + l2_entry = be64_to_cpu(l2_slice[l2_index]);
> + type = qcow2_get_cluster_type(bs, l2_entry);
> +
> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
> + cow_start_from = cow_start_to;
> + } else {
> + cow_start_from = 0;
> + }
> +
> + /* Get the L2 entry of the last cluster */
> + l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]);
> + type = qcow2_get_cluster_type(bs, l2_entry);
> +
> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
> + cow_end_to = cow_end_from;
> + } else {
> + cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
> + }
These two ifs could be moved inside the if (keep_old) block, dropping "&& keep_old" from their conditions.
That would also avoid the extra calculations, let the new variables be declared inside the if (keep_old) {} block, and make it possible to pass l2_slice=NULL together with keep_old=false.
>
> *m = g_malloc0(sizeof(**m));
> **m = (QCowL2Meta) {
> @@ -1089,18 +1132,22 @@ static void calculate_l2_meta(BlockDriverState *bs,
> QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
> }
>
> -/* Returns true if writing to a cluster requires COW */
> -static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry)
> +/*
> + * Returns true if writing to the cluster pointed to by @l2_entry
> + * requires a new allocation (that is, if the cluster is unallocated
> + * or has refcount > 1 and therefore cannot be written in-place).
> + */
> +static bool cluster_needs_new_alloc(BlockDriverState *bs, uint64_t l2_entry)
> {
> switch (qcow2_get_cluster_type(bs, l2_entry)) {
> case QCOW2_CLUSTER_NORMAL:
> + case QCOW2_CLUSTER_ZERO_ALLOC:
> if (l2_entry & QCOW_OFLAG_COPIED) {
> return false;
> }
> case QCOW2_CLUSTER_UNALLOCATED:
> case QCOW2_CLUSTER_COMPRESSED:
> case QCOW2_CLUSTER_ZERO_PLAIN:
> - case QCOW2_CLUSTER_ZERO_ALLOC:
> return true;
> default:
> abort();
> @@ -1108,20 +1155,38 @@ static bool cluster_needs_cow(BlockDriverState *bs, uint64_t l2_entry)
> }
>
> /*
> - * Returns the number of contiguous clusters that can be used for an allocating
> - * write, but require COW to be performed (this includes yet unallocated space,
> - * which must copy from the backing file)
> + * Returns the number of contiguous clusters that can be written to
> + * using one single write request, starting from @l2_index.
> + * At most @nb_clusters are checked.
> + *
> + * If @new_alloc is true this counts clusters that are either
> + * unallocated, or allocated but with refcount > 1 (so they need to be
> + * newly allocated and COWed).
> + *
> + * If @new_alloc is false this counts clusters that are already
> + * allocated and can be overwritten in-place (this includes clusters
> + * of type QCOW2_CLUSTER_ZERO_ALLOC).
> */
> -static int count_cow_clusters(BlockDriverState *bs, int nb_clusters,
> - uint64_t *l2_slice, int l2_index)
> +static int count_single_write_clusters(BlockDriverState *bs, int nb_clusters,
> + uint64_t *l2_slice, int l2_index,
> + bool new_alloc)
> {
> + BDRVQcow2State *s = bs->opaque;
> + uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index]);
> + uint64_t expected_offset = l2_entry & L2E_OFFSET_MASK;
> int i;
>
> for (i = 0; i < nb_clusters; i++) {
> - uint64_t l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
> - if (!cluster_needs_cow(bs, l2_entry)) {
> + l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
> + if (cluster_needs_new_alloc(bs, l2_entry) != new_alloc) {
> break;
> }
> + if (!new_alloc) {
> + if (expected_offset != (l2_entry & L2E_OFFSET_MASK)) {
> + break;
> + }
> + expected_offset += s->cluster_size;
> + }
> }
>
> assert(i <= nb_clusters);
> @@ -1192,10 +1257,10 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
> }
>
> /*
> - * Checks how many already allocated clusters that don't require a copy on
> - * write there are at the given guest_offset (up to *bytes). If *host_offset is
> - * not INV_OFFSET, only physically contiguous clusters beginning at this host
> - * offset are counted.
> + * Checks how many already allocated clusters that don't require a new
> + * allocation there are at the given guest_offset (up to *bytes).
> + * If *host_offset is not INV_OFFSET, only physically contiguous clusters
> + * beginning at this host offset are counted.
> *
> * Note that guest_offset may not be cluster aligned. In this case, the
> * returned *host_offset points to exact byte referenced by guest_offset and
> @@ -1204,12 +1269,12 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
> * Returns:
> * 0: if no allocated clusters are available at the given offset.
> * *bytes is normally unchanged. It is set to 0 if the cluster
> - * is allocated and doesn't need COW, but doesn't have the right
> - * physical offset.
> + * is allocated and can be overwritten in-place but doesn't have
> + * the right physical offset.
> *
> - * 1: if allocated clusters that don't require a COW are available at
> - * the requested offset. *bytes may have decreased and describes
> - * the length of the area that can be written to.
> + * 1: if allocated clusters that can be overwritten in place are
> + * available at the requested offset. *bytes may have decreased
> + * and describes the length of the area that can be written to.
> *
> * -errno: in error cases
> */
> @@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
>
> l2_index = offset_to_l2_slice_index(s, guest_offset);
> nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
> - assert(nb_clusters <= INT_MAX);
> + /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
> + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
>
> /* Find L2 entry for the first involved cluster */
> ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
> @@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
>
> cluster_offset = be64_to_cpu(l2_slice[l2_index]);
It would be good to rename this variable (s/cluster_offset/l2_entry/), since it holds the whole L2 entry.
However, "cluster_offset & L2E_OFFSET_MASK" is used so many times that I wouldn't just substitute the name; I'd rather
keep both variables: l2_entry and cluster_offset.
>
> - /* Check how many clusters are already allocated and don't need COW */
> - if (qcow2_get_cluster_type(bs, cluster_offset) == QCOW2_CLUSTER_NORMAL
> - && (cluster_offset & QCOW_OFLAG_COPIED))
> - {
> + if (!cluster_needs_new_alloc(bs, cluster_offset)) {
> /* If a specific host_offset is required, check it */
> bool offset_matches =
> (cluster_offset & L2E_OFFSET_MASK) == *host_offset;
>
> if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) {
> - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset "
> + qcow2_signal_corruption(bs, true, -1, -1, "%s cluster offset "
> "%#llx unaligned (guest offset: %#" PRIx64
> - ")", cluster_offset & L2E_OFFSET_MASK,
> + ")", cluster_offset & QCOW_OFLAG_ZERO ?
> + "Preallocated zero" : "Data",
> + cluster_offset & L2E_OFFSET_MASK,
> guest_offset);
> ret = -EIO;
> goto out;
> @@ -1273,15 +1338,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
> }
>
> /* We keep all QCOW_OFLAG_COPIED clusters */
> - keep_clusters =
> - count_contiguous_clusters(bs, nb_clusters, s->cluster_size,
> - &l2_slice[l2_index],
> - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO);
> + keep_clusters = count_single_write_clusters(bs, nb_clusters, l2_slice,
> + l2_index, false);
> assert(keep_clusters <= nb_clusters);
>
> *bytes = MIN(*bytes,
> keep_clusters * s->cluster_size
> - offset_into_cluster(s, guest_offset));
> + assert(*bytes != 0);
> +
> + calculate_l2_meta(bs, cluster_offset & L2E_OFFSET_MASK, guest_offset,
> + *bytes, l2_slice, m, true);
>
> ret = 1;
> } else {
> @@ -1357,9 +1424,10 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset,
> }
>
> /*
> - * Allocates new clusters for an area that either is yet unallocated or needs a
> - * copy on write. If *host_offset is not INV_OFFSET, clusters are only
> - * allocated if the new allocation can match the specified host offset.
> + * Allocates new clusters for an area that is either still unallocated or
> + * cannot be overwritten in-place. If *host_offset is not INV_OFFSET,
> + * clusters are only allocated if the new allocation can match the specified
> + * host offset.
> *
> * Note that guest_offset may not be cluster aligned. In this case, the
> * returned *host_offset points to exact byte referenced by guest_offset and
> @@ -1382,12 +1450,10 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
> BDRVQcow2State *s = bs->opaque;
> int l2_index;
> uint64_t *l2_slice;
> - uint64_t entry;
> uint64_t nb_clusters;
> int ret;
> - bool keep_old_clusters = false;
>
> - uint64_t alloc_cluster_offset = INV_OFFSET;
> + uint64_t alloc_cluster_offset;
>
> trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
> *bytes);
> @@ -1402,10 +1468,8 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
>
> l2_index = offset_to_l2_slice_index(s, guest_offset);
> nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
> - assert(nb_clusters <= INT_MAX);
> -
> - /* Limit total allocation byte count to INT_MAX */
> - nb_clusters = MIN(nb_clusters, INT_MAX >> s->cluster_bits);
> + /* Limit total allocation byte count to BDRV_REQUEST_MAX_BYTES */
> + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
>
> /* Find L2 entry for the first involved cluster */
> ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
> @@ -1413,67 +1477,32 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
> return ret;
> }
>
> - entry = be64_to_cpu(l2_slice[l2_index]);
> - nb_clusters = count_cow_clusters(bs, nb_clusters, l2_slice, l2_index);
> + nb_clusters = count_single_write_clusters(bs, nb_clusters,
> + l2_slice, l2_index, true);
>
> /* This function is only called when there were no non-COW clusters, so if
> * we can't find any unallocated or COW clusters either, something is
> * wrong with our code. */
> assert(nb_clusters > 0);
>
> - if (qcow2_get_cluster_type(bs, entry) == QCOW2_CLUSTER_ZERO_ALLOC &&
> - (entry & QCOW_OFLAG_COPIED) &&
> - (*host_offset == INV_OFFSET ||
> - start_of_cluster(s, *host_offset) == (entry & L2E_OFFSET_MASK)))
> - {
> - int preallocated_nb_clusters;
> -
> - if (offset_into_cluster(s, entry & L2E_OFFSET_MASK)) {
> - qcow2_signal_corruption(bs, true, -1, -1, "Preallocated zero "
> - "cluster offset %#llx unaligned (guest "
> - "offset: %#" PRIx64 ")",
> - entry & L2E_OFFSET_MASK, guest_offset);
> - ret = -EIO;
> - goto fail;
> - }
> -
> - /* Try to reuse preallocated zero clusters; contiguous normal clusters
> - * would be fine, too, but count_cow_clusters() above has limited
> - * nb_clusters already to a range of COW clusters */
> - preallocated_nb_clusters =
> - count_contiguous_clusters(bs, nb_clusters, s->cluster_size,
> - &l2_slice[l2_index], QCOW_OFLAG_COPIED);
> - assert(preallocated_nb_clusters > 0);
> -
> - nb_clusters = preallocated_nb_clusters;
> - alloc_cluster_offset = entry & L2E_OFFSET_MASK;
> -
> - /* We want to reuse these clusters, so qcow2_alloc_cluster_link_l2()
> - * should not free them. */
> - keep_old_clusters = true;
> + /* Allocate at a given offset in the image file */
> + alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
> + start_of_cluster(s, *host_offset);
> + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
> + &nb_clusters);
> + if (ret < 0) {
> + goto out;
> }
>
> - qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
Actually, we don't need l2_slice for keep_old=false in calculate_l2_meta(), so if
calculate_l2_meta() were modified a bit, this change to the function tail would not be needed.
Still, maybe l2_slice will be used in calculate_l2_meta() in later patches? We'll see.
> -
> - if (alloc_cluster_offset == INV_OFFSET) {
> - /* Allocate, if necessary at a given offset in the image file */
> - alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
> - start_of_cluster(s, *host_offset);
> - ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
> - &nb_clusters);
> - if (ret < 0) {
> - goto fail;
> - }
> -
> - /* Can't extend contiguous allocation */
> - if (nb_clusters == 0) {
> - *bytes = 0;
> - return 0;
> - }
> -
> - assert(alloc_cluster_offset != INV_OFFSET);
> + /* Can't extend contiguous allocation */
> + if (nb_clusters == 0) {
> + *bytes = 0;
> + ret = 0;
> + goto out;
> }
>
> + assert(alloc_cluster_offset != INV_OFFSET);
> +
> /*
> * Save info needed for meta data update.
> *
> @@ -1496,13 +1525,14 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset,
> *bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
> assert(*bytes != 0);
>
> - calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes,
> - m, keep_old_clusters);
> + calculate_l2_meta(bs, alloc_cluster_offset, guest_offset, *bytes, l2_slice,
> + m, false);
>
> - return 1;
> + ret = 1;
>
> -fail:
> - if (*m && (*m)->nb_clusters > 0) {
> +out:
> + qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
> + if (ret < 0 && *m && (*m)->nb_clusters > 0) {
> QLIST_REMOVE(*m, next_in_flight);
> }
Hmm, unrelated to this patch, but why do we remove a meta that we didn't create?
--
Best regards,
Vladimir
On Thu 09 Apr 2020 12:59:30 PM CEST, Vladimir Sementsov-Ogievskiy wrote:
>> static void calculate_l2_meta(BlockDriverState *bs,
>> uint64_t host_cluster_offset,
>> uint64_t guest_offset, unsigned bytes,
>> - QCowL2Meta **m, bool keep_old)
>> + uint64_t *l2_slice, QCowL2Meta **m, bool keep_old)
>> {
>> BDRVQcow2State *s = bs->opaque;
>> - unsigned cow_start_from = 0;
>> + int l2_index = offset_to_l2_slice_index(s, guest_offset);
>> + uint64_t l2_entry;
>> + unsigned cow_start_from, cow_end_to;
>> unsigned cow_start_to = offset_into_cluster(s, guest_offset);
>> unsigned cow_end_from = cow_start_to + bytes;
>> - unsigned cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
>> unsigned nb_clusters = size_to_clusters(s, cow_end_from);
>> QCowL2Meta *old_m = *m;
>> + QCow2ClusterType type;
>> +
>> + assert(nb_clusters <= s->l2_slice_size - l2_index);
>> +
>> + /* Return if there's no COW (all clusters are normal and we keep them) */
>> + if (keep_old) {
>> + int i;
>> + for (i = 0; i < nb_clusters; i++) {
>> + l2_entry = be64_to_cpu(l2_slice[l2_index + i]);
>> + if (qcow2_get_cluster_type(bs, l2_entry) != QCOW2_CLUSTER_NORMAL) {
>
> Could we also allow full ZERO_ALLOC clusters here?
No, because the L2 entry needs to be modified (in order to remove the
'all zeroes' bit) and we need to create a QCowL2Meta entry for that (see
qcow2_handle_l2meta()).
>> + /* Get the L2 entry of the first cluster */
>> + l2_entry = be64_to_cpu(l2_slice[l2_index]);
>> + type = qcow2_get_cluster_type(bs, l2_entry);
>> +
>> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
>> + cow_start_from = cow_start_to;
>> + } else {
>> + cow_start_from = 0;
>> + }
>> +
>> + /* Get the L2 entry of the last cluster */
>> + l2_entry = be64_to_cpu(l2_slice[l2_index + nb_clusters - 1]);
>> + type = qcow2_get_cluster_type(bs, l2_entry);
>> +
>> + if (type == QCOW2_CLUSTER_NORMAL && keep_old) {
>> + cow_end_to = cow_end_from;
>> + } else {
>> + cow_end_to = ROUND_UP(cow_end_from, s->cluster_size);
>> + }
>
> These two ifs could be moved into the if (keep_old) block, dropping "&& keep_old"
> from their conditions. That would also let us drop the extra calculations, move
> the new variables into the if (keep_old) {} block, and pass
> l2_slice=NULL together with keep_old=false.
In subsequent patches we're going to have more cases than just
QCOW2_CLUSTER_NORMAL so I don't think it makes sense to move the
keep_old check around.
>> @@ -1239,7 +1304,8 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
>>
>> l2_index = offset_to_l2_slice_index(s, guest_offset);
>> nb_clusters = MIN(nb_clusters, s->l2_slice_size - l2_index);
>> - assert(nb_clusters <= INT_MAX);
>> + /* Limit total byte count to BDRV_REQUEST_MAX_BYTES */
>> + nb_clusters = MIN(nb_clusters, BDRV_REQUEST_MAX_BYTES >> s->cluster_bits);
>>
>> /* Find L2 entry for the first involved cluster */
>> ret = get_cluster_table(bs, guest_offset, &l2_slice, &l2_index);
>> @@ -1249,18 +1315,17 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset,
>>
>> cluster_offset = be64_to_cpu(l2_slice[l2_index]);
>
> It would be good to s/cluster_offset/l2_entry/
>
> Also, "cluster_offset & L2E_OFFSET_MASK" is used so many times that I'd
> not just substitute, but keep both variables: l2_entry and cluster_offset.
Sounds good, I can change that.
>> + /* Allocate at a given offset in the image file */
>> + alloc_cluster_offset = *host_offset == INV_OFFSET ? INV_OFFSET :
>> + start_of_cluster(s, *host_offset);
>> + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
>> + &nb_clusters);
>> + if (ret < 0) {
>> + goto out;
>> }
>>
>> - qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
>
> Actually, we don't need l2_slice for keep_old=false in
> calculate_l2_meta(), so if calculate_l2_meta() were modified a bit, this
> change to the function tail would not be needed.
>
> Still, maybe l2_slice will be used in calculate_l2_meta() in later
> patches? We'll see.
We'll need it in a later patch.
>> -fail:
>> - if (*m && (*m)->nb_clusters > 0) {
>> +out:
>> + qcow2_cache_put(s->l2_table_cache, (void **) &l2_slice);
>> + if (ret < 0 && *m && (*m)->nb_clusters > 0) {
>> QLIST_REMOVE(*m, next_in_flight);
>> }
>
> Hmm, unrelated to this patch, but why do we remove a meta that we
> didn't create?
Not sure actually, I would need to check further...
Berto
© 2016 - 2026 Red Hat, Inc.