[Xen-devel] [OSSTEST PATCH 19/21] starvation: Abandon jobs which are unreasonably delaying their flight

Ian Jackson posted 21 patches 6 years, 9 months ago
[Xen-devel] [OSSTEST PATCH 19/21] starvation: Abandon jobs which are unreasonably delaying their flight
Posted by Ian Jackson 6 years, 9 months ago
Sometimes, due to a shortage of available resources, a flight might be
delayed because a handful of jobs are waiting much longer than the
rest.  Add a heuristic which causes these jobs to be abandoned.

We consider ourselves starving if we are starving now, based on the
most optimistic start time seen in the last I (the period parameter
parsed from the hostalloc_maxwait_starvation runvar).

Signed-off-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
---
 ts-hosts-allocate-Executive | 105 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/ts-hosts-allocate-Executive b/ts-hosts-allocate-Executive
index 8c9ddaf7..7ea3c4af 100755
--- a/ts-hosts-allocate-Executive
+++ b/ts-hosts-allocate-Executive
@@ -62,6 +62,8 @@ our %magictaskid;
 our $fi;
 our $jobinfo;
 our $harness_rev;
+our $starvation_p;
+our @abs_start_estimates;
 
 #---------- general utilities, setup, etc. ----------
 
@@ -114,12 +116,16 @@ END
     }
 
     $alloc_start_time = time // die $!;
+
+    $starvation_p =
+	hostalloc_starvation_parse_runvar($r{hostalloc_maxwait_starvation});
 }
 
 #---------- prepared sql statements ----------
 # all users of these must ->finish them afterwards, to avoid db deadlock
 
 our ($flagscheckq, $equivflagscheckq, $duration_estimator, $resprop_q,
+     $starvation_q,
      $alloc_findres_q, $alloc_shared_q, $alloc_sharing_slot_q,
      $claim_share_reuse_q, $claim_maxshare_q, $claim_rmshares_q,
      $claim_noshares_q, $claim_rmshare_q, $claim_setres_q,
@@ -146,6 +152,15 @@ END
                       AND name = ?
 END
 
+    $starvation_q= $dbh_tests->prepare(<<END);
+        SELECT job, jobs.status, max(steps.finished)
+          FROM jobs
+     LEFT JOIN steps
+         USING (flight,job)
+         WHERE flight= ?
+      GROUP BY job, jobs.status
+END
+
     # for allocation
 
     $alloc_findres_q= $dbh_tests->prepare(<<END);
@@ -712,6 +727,88 @@ sub alloc_hosts () {
     logm("host allocation: all successful and recorded.");
 }
 
+sub most_optimistic ($$$) {
+    my ($best, $now, $period) = @_;
+    # Records that we have now (at $now) estimated $best->{Start},
+    # stored as an absolute time.  Returns the most optimistic absolute
+    # start time "in the last $period", or undef without enough data.
+
+    push @abs_start_estimates, { At => $now, Got => $best->{Start} + $now };
+
+    # Prune old estimates, but never below $need_estimates entries:
+    # while we hold more than that and the second-oldest entry is
+    # itself at least $period old, the oldest one is dropped.  So we
+    # keep the most recent estimate which is at least $period ago
+    # (the "oldest relevant") and all subsequent ones, plus any
+    # older entries still needed to make up $need_estimates.
+    # The answer is the most optimistic (earliest) Got among what
+    # is kept, or undef if fewer than $need_estimates are held.
+    my $is_old = sub { return $_[0]{At} <= $now - $period; };
+    my $need_estimates = 4;
+    while (@abs_start_estimates > $need_estimates &&
+	   $is_old->($abs_start_estimates[1])) {
+	# $abs_start_estimates[1] is at least $period ago and was
+	# recorded after $abs_start_estimates[0], so entry [0] is
+	# older than the oldest relevant estimate and can go
+	# (assumes callers pass a non-decreasing $now).
+	shift @abs_start_estimates;
+    }
+
+    my $pr = sub {
+	my ($e) = @_;
+	printf(DEBUG ' %s (@%s)',
+	       $e->{Got} - $now,
+	       $e->{At}  - $now);
+    };
+
+    print DEBUG "most_optimistic: all:";
+    my $optimist;
+    foreach (@abs_start_estimates) {
+	$pr->($_);
+	$optimist = $_ if !$optimist || $_->{Got} < $optimist->{Got};
+    }
+    print DEBUG "\n";
+    printf(DEBUG "most_optimistic: (period=%s):", $period);
+    $pr->($optimist);
+    print DEBUG "\n";
+
+    return undef unless @abs_start_estimates >= $need_estimates;
+
+    return $optimist->{Got};
+}
+
+sub starving ($) {  # returns (is-starving flag, explanatory message)
+    my ($best_start_abs) = @_;
+    return (0, 'runvar says never give up') unless %$starvation_p;
+    return (0, 'no estimate') unless defined $best_start_abs;
+    $starvation_q->execute($flight);  # not ->finish()ed in this sub
+    my $d=0;       # jobs in any other status (counted as done)
+    my $w=0;       # jobs still preparing, queued or running
+    my $maxfin=0;  # latest max(steps.finished) among the done jobs
+    while (my ($j,$st,$fin) = $starvation_q->fetchrow_array()) {
+	if ($st eq 'preparing' ||
+	    $st eq 'queued' ||
+	    $st eq 'running') {
+	    $w++;
+	} else {
+	    $d++;
+	    return (0, "job $j status $st but no step finished time!")
+		unless defined $fin;
+	    $maxfin = $fin if $fin > $maxfin;
+	}
+    }
+    # we quit if the time from the start of the flight to our
+    # projected start exceeds X times the time the completed jobs
+    # took (flight start to last step finish), plus the margin I
+    my $X = hostalloc_starvation_calculate_X($starvation_p, $w, $d);
+    return (0, 'X=inf') unless defined $X;
+    my $total_d = $maxfin - $fi->{started};
+    my $projected_me = $best_start_abs - $fi->{started};
+    my $m = "D=$d W=$w X=$X maxfin=$maxfin";
+    my $bad = $projected_me > $X * $total_d + $starvation_p->{I};
+    return ($bad, $m);
+}
+
 sub attempt_allocation {
     my $mayalloc;
     ($plan, $mayalloc) = @_;
@@ -772,6 +869,14 @@ sub attempt_allocation {
 	    if ($wait_sofar > $maxwait/2
 		&& $wait_sofar + $best->{Start} > $maxwait) {
 		logm "timed out: $wait_sofar, $best->{Start}, $maxwait";
+	} elsif (%$starvation_p) {
+	    my $est_abs = most_optimistic($best, $now, $starvation_p->{I});
+	    my ($starving, $m) = starving($est_abs);
+	    $starvation_q->finish();
+	    if (!$starving) {
+		print DEBUG "not starving: $m\n";
+	    } else {
+		logm "starving ($m)";
 		return 2;
 	    }
 	}
-- 
2.11.0


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel