Check for number of workers before soft failing the task. #104195
@ -187,17 +187,25 @@ func (f *Flamenco) onTaskFailed(
|
|||||||
Int("failedByWorkerCount", numFailed).
|
Int("failedByWorkerCount", numFailed).
|
||||||
Int("threshold", threshold).
|
Int("threshold", threshold).
|
||||||
Logger()
|
Logger()
|
||||||
if numFailed < threshold {
|
|
||||||
Nitin-Rawat-1 marked this conversation as resolved
Outdated
|
|||||||
numWorkers, err := f.numWorkersCapableOfRunningTask(ctx, task)
|
if numFailed > threshold {
|
||||||
if err != nil {
|
return f.hardFailTask(ctx, logger, worker, task, numFailed)
|
||||||
return err
|
|
||||||
}
|
|
||||||
if numWorkers == 1 {
|
|
||||||
return f.failJobAfterCatastroficTaskFailure(ctx, logger, worker, task)
|
|
||||||
}
|
|
||||||
return f.softFailTask(ctx, logger, worker, task, numFailed)
|
|
||||||
}
|
}
|
||||||
return f.hardFailTask(ctx, logger, worker, task, numFailed)
|
|
||||||
|
numWorkers, err := f.numWorkersCapableOfRunningTask(ctx, task)
|
||||||
Nitin-Rawat-1 marked this conversation as resolved
Outdated
Sybren A. Stüvel
commented
I'm assuming that Also it's better to use I'm assuming that `1` is because this worker hasn't been registered as failing this task yet, and thus is still counted. If this is indeed the case, it should be mentioned in a comment, as it's not entirely obvious from glancing at the code. Or maybe I'm wrong, and then there should definitely be a comment that explains where the `1` comes from ;-)
Also it's better to use `numWorkers <= 1` here, as this should also hard-fail if there are zero workers left to run this task. Given that it's a very asynchronous system, there could be other factors that we just don't see right now here in this part of the code, that could lead to unexpected results.
|
|||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
// If number of workers capable of running the failed task again is "1",
|
||||||
|
// that means we have no worker besides the one that actually failed the task.
|
||||||
|
// Because at this point in code the worker hasn't been registered as failing this task yet,
|
||||||
|
// and thus it is still counted.
|
||||||
|
// In such condition we should just fail the job itself.
|
||||||
|
if numWorkers <= 1 {
|
||||||
|
return f.failJobAfterCatastroficTaskFailure(ctx, logger, worker, task)
|
||||||
|
}
|
||||||
|
return f.softFailTask(ctx, logger, worker, task, numFailed)
|
||||||
}
|
}
|
||||||
|
|
||||||
// maybeBlocklistWorker potentially block-lists the Worker, and checks whether
|
// maybeBlocklistWorker potentially block-lists the Worker, and checks whether
|
||||||
|
Loading…
Reference in New Issue
Block a user
If you flip this condition, you can return early and reduce the nesting level of the following code.