Check for number of workers before soft failing the task. #104195

Merged
Sybren A. Stüvel merged 15 commits from Nitin-Rawat-1/flamenco:104190-job-stuck into main 2023-04-20 11:53:43 +02:00
Showing only changes of commit 6e24e0be3b - Show all commits

View File

@ -187,18 +187,26 @@ func (f *Flamenco) onTaskFailed(
Int("failedByWorkerCount", numFailed). Int("failedByWorkerCount", numFailed).
Int("threshold", threshold). Int("threshold", threshold).
Logger() Logger()
if numFailed < threshold {
if numFailed > threshold {
return f.hardFailTask(ctx, logger, worker, task, numFailed)
}
numWorkers, err := f.numWorkersCapableOfRunningTask(ctx, task) numWorkers, err := f.numWorkersCapableOfRunningTask(ctx, task)
if err != nil { if err != nil {
return err return err
} }
if numWorkers == 1 {
// If the number of workers capable of running the failed task again is 1,
// there is no worker left besides the one that just failed the task: at this
// point in the code that worker has not yet been registered as having failed
// this task, so it is still counted.
// In that case we should just fail the job itself.
if numWorkers <= 1 {
return f.failJobAfterCatastroficTaskFailure(ctx, logger, worker, task) return f.failJobAfterCatastroficTaskFailure(ctx, logger, worker, task)
} }
return f.softFailTask(ctx, logger, worker, task, numFailed) return f.softFailTask(ctx, logger, worker, task, numFailed)
} }
return f.hardFailTask(ctx, logger, worker, task, numFailed)
}
// maybeBlocklistWorker potentially block-lists the Worker, and checks whether // maybeBlocklistWorker potentially block-lists the Worker, and checks whether
// there are any workers left to run tasks of this type. // there are any workers left to run tasks of this type.