[v9] job: replace AioContext lock with job_mutex

[PATCH v9 11/21] jobs: group together API calls under the same job lock

Posted by Emanuele Giuseppe Esposito 3 years, 7 months ago

Now that the API offers also _locked() functions, take advantage
of it and give also the caller control to take the lock and call
_locked functions.

This makes sense especially when we have for loops, because it
makes no sense to have:

for(job = job_next(); ...)

where each job_next() takes the lock internally.
Instead we want

JOB_LOCK_GUARD();
for(job = job_next_locked(); ...)

Note: at this stage, job_{lock/unlock} and job lock guard macros
are *nop*.

Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
---
 block.c            | 20 +++++++++++-------
 blockdev.c         | 12 ++++++++---
 blockjob.c         | 52 +++++++++++++++++++++++++++++++---------------
 job-qmp.c          |  4 +++-
 job.c              |  5 ++++-
 monitor/qmp-cmds.c |  7 +++++--
 qemu-img.c         | 41 +++++++++++++++++++++---------------
 7 files changed, 93 insertions(+), 48 deletions(-)

diff --git a/block.c b/block.c
index 2c00dddd80..d0db104d71 100644
--- a/block.c
+++ b/block.c
@@ -4978,9 +4978,12 @@ static void bdrv_close(BlockDriverState *bs)
 
 void bdrv_close_all(void)
 {
-    assert(job_next(NULL) == NULL);
     GLOBAL_STATE_CODE();
 
+    WITH_JOB_LOCK_GUARD() {
+        assert(job_next_locked(NULL) == NULL);
+    }
+
     /* Drop references from requests still in flight, such as canceled block
      * jobs whose AIO context has not been polled yet */
     bdrv_drain_all();
@@ -6165,13 +6168,16 @@ XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp)
         }
     }
 
-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        GSList *el;
+    WITH_JOB_LOCK_GUARD() {
+        for (job = block_job_next_locked(NULL); job;
+             job = block_job_next_locked(job)) {
+            GSList *el;
 
-        xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
-                           job->job.id);
-        for (el = job->nodes; el; el = el->next) {
-            xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+            xdbg_graph_add_node(gr, job, X_DBG_BLOCK_GRAPH_NODE_TYPE_BLOCK_JOB,
+                                job->job.id);
+            for (el = job->nodes; el; el = el->next) {
+                xdbg_graph_add_edge(gr, job, (BdrvChild *)el->data);
+            }
         }
     }
 
diff --git a/blockdev.c b/blockdev.c
index 71f793c4ab..5b79093155 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -150,12 +150,15 @@ void blockdev_mark_auto_del(BlockBackend *blk)
         return;
     }
 
-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = block_job_next_locked(NULL); job;
+         job = block_job_next_locked(job)) {
         if (block_job_has_bdrv(job, blk_bs(blk))) {
             AioContext *aio_context = job->job.aio_context;
             aio_context_acquire(aio_context);
 
-            job_cancel(&job->job, false);
+            job_cancel_locked(&job->job, false);
 
             aio_context_release(aio_context);
         }
@@ -3745,7 +3748,10 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
     BlockJobInfoList *head = NULL, **tail = &head;
     BlockJob *job;
 
-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = block_job_next_locked(NULL); job;
+         job = block_job_next_locked(job)) {
         BlockJobInfo *value;
         AioContext *aio_context;
 
diff --git a/blockjob.c b/blockjob.c
index 0d59aba439..bce05a9096 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -99,7 +99,9 @@ static char *child_job_get_parent_desc(BdrvChild *c)
 static void child_job_drained_begin(BdrvChild *c)
 {
     BlockJob *job = c->opaque;
-    job_pause(&job->job);
+    WITH_JOB_LOCK_GUARD() {
+        job_pause_locked(&job->job);
+    }
 }
 
 static bool child_job_drained_poll(BdrvChild *c)
@@ -111,8 +113,10 @@ static bool child_job_drained_poll(BdrvChild *c)
     /* An inactive or completed job doesn't have any pending requests. Jobs
      * with !job->busy are either already paused or have a pause point after
      * being reentered, so no job driver code will run before they pause. */
-    if (!job->busy || job_is_completed(job)) {
-        return false;
+    WITH_JOB_LOCK_GUARD() {
+        if (!job->busy || job_is_completed_locked(job)) {
+            return false;
+        }
     }
 
     /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
@@ -127,7 +131,9 @@ static bool child_job_drained_poll(BdrvChild *c)
 static void child_job_drained_end(BdrvChild *c, int *drained_end_counter)
 {
     BlockJob *job = c->opaque;
-    job_resume(&job->job);
+    WITH_JOB_LOCK_GUARD() {
+        job_resume_locked(&job->job);
+    }
 }
 
 static bool child_job_can_set_aio_ctx(BdrvChild *c, AioContext *ctx,
@@ -475,13 +481,15 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
     job->ready_notifier.notify = block_job_event_ready;
     job->idle_notifier.notify = block_job_on_idle;
 
-    notifier_list_add(&job->job.on_finalize_cancelled,
-                      &job->finalize_cancelled_notifier);
-    notifier_list_add(&job->job.on_finalize_completed,
-                      &job->finalize_completed_notifier);
-    notifier_list_add(&job->job.on_pending, &job->pending_notifier);
-    notifier_list_add(&job->job.on_ready, &job->ready_notifier);
-    notifier_list_add(&job->job.on_idle, &job->idle_notifier);
+    WITH_JOB_LOCK_GUARD() {
+        notifier_list_add(&job->job.on_finalize_cancelled,
+                          &job->finalize_cancelled_notifier);
+        notifier_list_add(&job->job.on_finalize_completed,
+                          &job->finalize_completed_notifier);
+        notifier_list_add(&job->job.on_pending, &job->pending_notifier);
+        notifier_list_add(&job->job.on_ready, &job->ready_notifier);
+        notifier_list_add(&job->job.on_idle, &job->idle_notifier);
+    }
 
     error_setg(&job->blocker, "block device is in use by block job: %s",
                job_type_str(&job->job));
@@ -493,7 +501,10 @@ void *block_job_create(const char *job_id, const BlockJobDriver *driver,
 
     bdrv_op_unblock(bs, BLOCK_OP_TYPE_DATAPLANE, job->blocker);
 
-    if (!block_job_set_speed(job, speed, errp)) {
+    WITH_JOB_LOCK_GUARD() {
+        ret = block_job_set_speed_locked(job, speed, errp);
+    }
+    if (!ret) {
         goto fail;
     }
 
@@ -524,7 +535,9 @@ void block_job_user_resume(Job *job)
 {
     BlockJob *bjob = container_of(job, BlockJob, job);
     GLOBAL_STATE_CODE();
-    block_job_iostatus_reset(bjob);
+    WITH_JOB_LOCK_GUARD() {
+        block_job_iostatus_reset_locked(bjob);
+    }
 }
 
 BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
@@ -558,10 +571,15 @@ BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
                                         action);
     }
     if (action == BLOCK_ERROR_ACTION_STOP) {
-        if (!job->job.user_paused) {
-            job_pause(&job->job);
-            /* make the pause user visible, which will be resumed from QMP. */
-            job->job.user_paused = true;
+        WITH_JOB_LOCK_GUARD() {
+            if (!job->job.user_paused) {
+                job_pause_locked(&job->job);
+                /*
+                 * make the pause user visible, which will be
+                 * resumed from QMP.
+                 */
+                job->job.user_paused = true;
+            }
         }
         block_job_iostatus_set_err(job, error);
     }
diff --git a/job-qmp.c b/job-qmp.c
index ac11a6c23c..cfaf34ffb7 100644
--- a/job-qmp.c
+++ b/job-qmp.c
@@ -194,7 +194,9 @@ JobInfoList *qmp_query_jobs(Error **errp)
     JobInfoList *head = NULL, **tail = &head;
     Job *job;
 
-    for (job = job_next(NULL); job; job = job_next(job)) {
+    JOB_LOCK_GUARD();
+
+    for (job = job_next_locked(NULL); job; job = job_next_locked(job)) {
         JobInfo *value;
         AioContext *aio_context;
 
diff --git a/job.c b/job.c
index de3d368086..9c8792c9e8 100644
--- a/job.c
+++ b/job.c
@@ -1046,11 +1046,14 @@ static void job_completed_txn_abort_locked(Job *job)
 /* Called with job_mutex held, but releases it temporarily */
 static int job_prepare_locked(Job *job)
 {
+    int ret;
+
     GLOBAL_STATE_CODE();
     if (job->ret == 0 && job->driver->prepare) {
         job_unlock();
-        job->ret = job->driver->prepare(job);
+        ret = job->driver->prepare(job);
         job_lock();
+        job->ret = ret;
         job_update_rc_locked(job);
     }
     return job->ret;
diff --git a/monitor/qmp-cmds.c b/monitor/qmp-cmds.c
index 1ebb89f46c..1897ed7a13 100644
--- a/monitor/qmp-cmds.c
+++ b/monitor/qmp-cmds.c
@@ -133,8 +133,11 @@ void qmp_cont(Error **errp)
         blk_iostatus_reset(blk);
     }
 
-    for (job = block_job_next(NULL); job; job = block_job_next(job)) {
-        block_job_iostatus_reset(job);
+    WITH_JOB_LOCK_GUARD() {
+        for (job = block_job_next_locked(NULL); job;
+             job = block_job_next_locked(job)) {
+            block_job_iostatus_reset_locked(job);
+        }
     }
 
     /* Continuing after completed migration. Images have been inactivated to
diff --git a/qemu-img.c b/qemu-img.c
index 4cf4d2423d..289d88a156 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -912,25 +912,30 @@ static void run_block_job(BlockJob *job, Error **errp)
     int ret = 0;
 
     aio_context_acquire(aio_context);
-    job_ref(&job->job);
-    do {
-        float progress = 0.0f;
-        aio_poll(aio_context, true);
+    WITH_JOB_LOCK_GUARD() {
+        job_ref_locked(&job->job);
+        do {
+            float progress = 0.0f;
+            job_unlock();
+            aio_poll(aio_context, true);
+
+            progress_get_snapshot(&job->job.progress, &progress_current,
+                                &progress_total);
+            if (progress_total) {
+                progress = (float)progress_current / progress_total * 100.f;
+            }
+            qemu_progress_print(progress, 0);
+            job_lock();
+        } while (!job_is_ready_locked(&job->job) &&
+                 !job_is_completed_locked(&job->job));
 
-        progress_get_snapshot(&job->job.progress, &progress_current,
-                              &progress_total);
-        if (progress_total) {
-            progress = (float)progress_current / progress_total * 100.f;
+        if (!job_is_completed_locked(&job->job)) {
+            ret = job_complete_sync_locked(&job->job, errp);
+        } else {
+            ret = job->job.ret;
         }
-        qemu_progress_print(progress, 0);
-    } while (!job_is_ready(&job->job) && !job_is_completed(&job->job));
-
-    if (!job_is_completed(&job->job)) {
-        ret = job_complete_sync(&job->job, errp);
-    } else {
-        ret = job->job.ret;
+        job_unref_locked(&job->job);
     }
-    job_unref(&job->job);
     aio_context_release(aio_context);
 
     /* publish completion progress only when success */
@@ -1083,7 +1088,9 @@ static int img_commit(int argc, char **argv)
         bdrv_ref(bs);
     }
 
-    job = block_job_get("commit");
+    WITH_JOB_LOCK_GUARD() {
+        job = block_job_get_locked("commit");
+    }
     assert(job);
     run_block_job(job, &local_err);
     if (local_err) {
-- 
2.31.1

Re: [PATCH v9 11/21] jobs: group together API calls under the same job lock

Posted by Vladimir Sementsov-Ogievskiy 3 years, 7 months ago

On 7/6/22 23:15, Emanuele Giuseppe Esposito wrote:
> Now that the API offers also _locked() functions, take advantage
> of it and give also the caller control to take the lock and call
> _locked functions.
> 
> This makes sense especially when we have for loops, because it
> makes no sense to have:
> 
> for(job = job_next(); ...)
> 
> where each job_next() takes the lock internally.
> Instead we want
> 
> JOB_LOCK_GUARD();
> for(job = job_next_locked(); ...)
> 
> Note: at this stage, job_{lock/unlock} and job lock guard macros
> are *nop*.
> 
> Signed-off-by: Emanuele Giuseppe Esposito <eesposit@redhat.com>
> ---

[..]

>   
> diff --git a/blockjob.c b/blockjob.c
> index 0d59aba439..bce05a9096 100644
> --- a/blockjob.c
> +++ b/blockjob.c
> @@ -99,7 +99,9 @@ static char *child_job_get_parent_desc(BdrvChild *c)
>   static void child_job_drained_begin(BdrvChild *c)
>   {
>       BlockJob *job = c->opaque;
> -    job_pause(&job->job);
> +    WITH_JOB_LOCK_GUARD() {
> +        job_pause_locked(&job->job);
> +    }

What's the reason for it? I'd keep job_pause().

(and it doesn't correspond to what commit subject presents.)

>   }
>   
>   static bool child_job_drained_poll(BdrvChild *c)
> @@ -111,8 +113,10 @@ static bool child_job_drained_poll(BdrvChild *c)
>       /* An inactive or completed job doesn't have any pending requests. Jobs
>        * with !job->busy are either already paused or have a pause point after
>        * being reentered, so no job driver code will run before they pause. */
> -    if (!job->busy || job_is_completed(job)) {
> -        return false;
> +    WITH_JOB_LOCK_GUARD() {
> +        if (!job->busy || job_is_completed_locked(job)) {
> +            return false;
> +        }
>       }

This doesn't correspond to commit subject. I'd put such things to separate commit "correct use of job_mutex in blockjob.c".

>   
>       /* Otherwise, assume that it isn't fully stopped yet, but allow the job to
> @@ -127,7 +131,9 @@ static bool child_job_drained_poll(BdrvChild *c)
>   static void child_job_drained_end(BdrvChild *c, int *drained_end_counter)
>   {
>       BlockJob *job = c->opaque;
> -    job_resume(&job->job);
> +    WITH_JOB_LOCK_GUARD() {
> +        job_resume_locked(&job->job);
> +    }
>   }

Again, don't see a reason for such change.


[my comments relate to more similar cases in the patch]


-- 
Best regards,
Vladimir

Re: [PATCH v9 11/21] jobs: group together API calls under the same job lock

Posted by Emanuele Giuseppe Esposito 3 years, 6 months ago


Am 11/07/2022 um 15:26 schrieb Vladimir Sementsov-Ogievskiy:
>>   }
>>     static bool child_job_drained_poll(BdrvChild *c)
>> @@ -111,8 +113,10 @@ static bool child_job_drained_poll(BdrvChild *c)
>>       /* An inactive or completed job doesn't have any pending
>> requests. Jobs
>>        * with !job->busy are either already paused or have a pause
>> point after
>>        * being reentered, so no job driver code will run before they
>> pause. */
>> -    if (!job->busy || job_is_completed(job)) {
>> -        return false;
>> +    WITH_JOB_LOCK_GUARD() {
>> +        if (!job->busy || job_is_completed_locked(job)) {
>> +            return false;
>> +        }
>>       }
> 
> This doesn't correspond to commit subject. I'd put such things to
> separate commit "correct use of job_mutex in blockjob.c".
> 
>>         /* Otherwise, assume that it isn't fully stopped yet, but
>> allow the job to
>> @@ -127,7 +131,9 @@ static bool child_job_drained_poll(BdrvChild *c)
>>   static void child_job_drained_end(BdrvChild *c, int
>> *drained_end_counter)
>>   {
>>       BlockJob *job = c->opaque;
>> -    job_resume(&job->job);
>> +    WITH_JOB_LOCK_GUARD() {
>> +        job_resume_locked(&job->job);
>> +    }
>>   }
> 
> Again, don't see a reason for such change.
> 
> 
> [my comments relate to more similar cases in the patch]

I think you misunderstood: I aim to group API calls under the same lock.
One application of such grouping involves loops, but not only them.

Regarding the single-line WITH_JOB_LOCK_GUARD (job_next, job_pause,
job_resume and similar), I guess I will keep the not-locked function.

Emanuele

Re: [PATCH v9 11/21] jobs: group together API calls under the same job lock

Posted by Vladimir Sementsov-Ogievskiy 3 years, 6 months ago

On 7/19/22 15:40, Emanuele Giuseppe Esposito wrote:
> 
> 
> Am 11/07/2022 um 15:26 schrieb Vladimir Sementsov-Ogievskiy:
>>>    }
>>>      static bool child_job_drained_poll(BdrvChild *c)
>>> @@ -111,8 +113,10 @@ static bool child_job_drained_poll(BdrvChild *c)
>>>        /* An inactive or completed job doesn't have any pending
>>> requests. Jobs
>>>         * with !job->busy are either already paused or have a pause
>>> point after
>>>         * being reentered, so no job driver code will run before they
>>> pause. */
>>> -    if (!job->busy || job_is_completed(job)) {
>>> -        return false;
>>> +    WITH_JOB_LOCK_GUARD() {
>>> +        if (!job->busy || job_is_completed_locked(job)) {
>>> +            return false;
>>> +        }
>>>        }
>>
>> This doesn't correspond to commit subject. I'd put such things to
>> separate commit "correct use of job_mutex in blockjob.c".
>>
>>>          /* Otherwise, assume that it isn't fully stopped yet, but
>>> allow the job to
>>> @@ -127,7 +131,9 @@ static bool child_job_drained_poll(BdrvChild *c)
>>>    static void child_job_drained_end(BdrvChild *c, int
>>> *drained_end_counter)
>>>    {
>>>        BlockJob *job = c->opaque;
>>> -    job_resume(&job->job);
>>> +    WITH_JOB_LOCK_GUARD() {
>>> +        job_resume_locked(&job->job);
>>> +    }
>>>    }
>>
>> Again, don't see a reason for such change.
>>
>>
>> [my comments relate to more similar cases in the patch]
> 
> I think you misunderstood: I aim to group API calls under the same lock.
> One application of such grouping involves loops, but not only them.

I mean that pre-patch job->busy is accessed without any lock at all. So we not just group correctly locked calls into one critical section, we also fix direct field accesses to be under lock.


> 
> Regarding the single-line WITH_JOB_LOCK_GUARD (job_next, job_pause,
> job_resume and similar), I guess I will keep the not-locked function.
> 
> Emanuele
> 


-- 
Best regards,
Vladimir