Support pausing jobs #104313
@ -119,7 +119,7 @@ func (sm *StateMachine) updateJobAfterTaskStatusChange(
|
|||||||
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusCompleted, api.JobStatusRequeueing, "task was queued")
|
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusCompleted, api.JobStatusRequeueing, "task was queued")
|
||||||
|
|
||||||
case api.TaskStatusPaused:
|
case api.TaskStatusPaused:
|
||||||
return sm.updateJobOnTaskStatusPaused(ctx, logger, job)
|
return nil
|
||||||
|
|
||||||
case api.TaskStatusCanceled:
|
case api.TaskStatusCanceled:
|
||||||
return sm.updateJobOnTaskStatusCanceled(ctx, logger, job)
|
return sm.updateJobOnTaskStatusCanceled(ctx, logger, job)
|
||||||
@ -182,38 +182,6 @@ func (sm *StateMachine) updateJobOnTaskStatusCanceled(ctx context.Context, logge
|
|||||||
return nil
|
return nil
|
||||||
David-Zhang-10 marked this conversation as resolved
Outdated
|
|||||||
}
|
}
|
||||||
|
|
||||||
// updateJobOnTaskStatusPaused conditionally escalates the pausing of a task to pause the job.
|
|
||||||
func (sm *StateMachine) updateJobOnTaskStatusPaused(ctx context.Context, logger zerolog.Logger, job *persistence.Job) error {
|
|
||||||
// If no more tasks can run, pause the job.
|
|
||||||
numRunnable, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job,
|
|
||||||
api.TaskStatusActive, api.TaskStatusQueued, api.TaskStatusSoftFailed)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if numRunnable == 0 {
|
|
||||||
logger.Info().Msg("paused task was last runnable task of job, pausing job")
|
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "paused task was last runnable task of job, pausing job")
|
|
||||||
}
|
|
||||||
|
|
||||||
if job.Status == api.JobStatusPauseRequested {
|
|
||||||
// if the job is in pause-requested state, and all other tasks are paused,
|
|
||||||
// then the job can be paused.
|
|
||||||
numPaused, numTotal, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusPaused)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if numPaused == numTotal {
|
|
||||||
logger.Info().Msg("all tasks of job are paused, job is paused")
|
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks paused")
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// if the job is not in pause-requested state, then some error occurred and the job should be failed.
|
|
||||||
logger.Info().Msg("task cannot be changed to paused when job is not in pause-requested state")
|
|
||||||
}
|
|
||||||
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// updateJobOnTaskStatusFailed conditionally escalates the failure of a task to fail the entire job.
|
// updateJobOnTaskStatusFailed conditionally escalates the failure of a task to fail the entire job.
|
||||||
func (sm *StateMachine) updateJobOnTaskStatusFailed(ctx context.Context, logger zerolog.Logger, job *persistence.Job) error {
|
func (sm *StateMachine) updateJobOnTaskStatusFailed(ctx context.Context, logger zerolog.Logger, job *persistence.Job) error {
|
||||||
// Count the number of failed tasks. If it is over the threshold, fail the job.
|
// Count the number of failed tasks. If it is over the threshold, fail the job.
|
||||||
|
Loading…
Reference in New Issue
Block a user
This code block feels a bit out of place. If I understand correctly, this is the flow that this code would handle:
{queued, soft-failed, paused}
).pause-requested
.To me this doesn't look like a status in which the pausing was complete.
This logic is also repeated quite a bit below. I think it's better to make a new function responsible for answering the question "is the job pausing, and is that process complete now?", and call that function from the places that need this.
Finally, some reordering can be done to optimise the code. Currently every check will count the tasks in the job, even when
job.Status != api.JobStatusPauseRequested
. It's good practice to do the cheapest check first, and the most expensive check last.The comment makes sense to me! The only thing I am confused about is what you meant by "the pausing was complete".
"Pausing is complete" for me means that the work that should be done when the job goes to
pause-requested
is done, and it can be moved to statepaused
.So for the pausing to be complete, we need all tasks to be in
paused
state, including tasks that are runnable but not inactive
state? In other words, instead of checking the number of active tasks here, we should check if the job has any tasks other thanpaused
?