Support pausing jobs #104313

Manually merged
Sybren A. Stüvel merged 28 commits from David-Zhang-10/flamenco:paused-job-status into main 2024-07-01 17:53:44 +02:00
Showing only changes of commit d510eb390c - Show all commits

View File

@ -181,9 +181,7 @@ func (sm *StateMachine) updateJobOnTaskStatusCanceled(ctx context.Context, logge
} }
David-Zhang-10 marked this conversation as resolved Outdated

This code block feels a bit out of place. If I understand correctly, this is the flow that this code would handle:

  1. User cancels a task.
  2. There are other tasks that are still runnable but not active (i.e. status in {queued, soft-failed, paused}).
  3. Job is in status pause-requested.

To me this doesn't look like a status in which the pausing was complete.

This logic is also repeated quite a bit below. I think it's better to make a new function responsible for answering the question "is the job pausing, and is that process complete now?", and call that function from the places that need this.

Finally, some reordering can be done to optimise the code. Currently every check will count the tasks in the job, even when job.Status != api.JobStatusPauseRequested. It's good practice to do the cheapest check first, and the most expensive check last.

This code block feels a bit out of place. If I understand correctly, this is the flow that this code would handle: 1. User cancels a task. 2. There are other tasks that are still runnable but not active (i.e. status in `{queued, soft-failed, paused}`). 3. Job is in status `pause-requested`. To me this doesn't look like a status in which the pausing was complete. This logic is also repeated quite a bit below. I think it's better to make a new function responsible for answering the question "_is the job pausing, and is that process complete now?_", and call that function from the places that need this. Finally, some reordering can be done to optimise the code. Currently every check will count the tasks in the job, even when `job.Status != api.JobStatusPauseRequested`. It's good practice to do the cheapest check first, and the most expensive check last.

The comment makes sense to me! The only thing I am confused about is what you meant by "the pausing was complete".

The comment makes sense to me! The only thing I am confused about is what you meant by "the pausing was complete".

The only thing I am confused about is what you meant by "the pausing was complete".

"Pausing is complete" for me means that the work that should be done when the job goes to pause-requested is done, and it can be moved to state paused.

> The only thing I am confused about is what you meant by "the pausing was complete". "Pausing is complete" for me means that the work that should be done when the job goes to `pause-requested` is done, and it can be moved to state `paused`.

So for the pausing to be complete, we need all tasks to be in paused state, including tasks that are runnable but not in active state? In other words, instead of checking the number of active tasks here, we should check if the job has any tasks other than paused?

So for the pausing to be complete, we need all tasks to be in `paused` state, including tasks that are runnable but not in `active` state? In other words, instead of checking the number of active tasks here, we should check if the job has any tasks other than `paused`?
// Deal with the special case when the job is in pause-requested status. // Deal with the special case when the job is in pause-requested status.
if job.Status != api.JobStatusPauseRequested { if job.Status == api.JobStatusPauseRequested {
return nil
} else {
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive) numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
if err != nil { if err != nil {
return err return err
@ -220,10 +218,7 @@ func (sm *StateMachine) updateJobOnTaskStatusFailed(ctx context.Context, logger
// If the job didn't fail, this failure indicates that at least the job is active. // If the job didn't fail, this failure indicates that at least the job is active.
failLogger.Info().Msg("task failed, but not enough to fail the job") failLogger.Info().Msg("task failed, but not enough to fail the job")
if job.Status != api.JobStatusPauseRequested { if job.Status == api.JobStatusPauseRequested {
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive,
"task failed, but not enough to fail the job")
} else {
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive) numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
if err != nil { if err != nil {
return err return err
@ -250,13 +245,7 @@ func (sm *StateMachine) updateJobOnTaskStatusCompleted(ctx context.Context, logg
return sm.JobStatusChange(ctx, job, api.JobStatusCompleted, "all tasks completed") return sm.JobStatusChange(ctx, job, api.JobStatusCompleted, "all tasks completed")
} }
if job.Status != api.JobStatusPauseRequested { if job.Status == api.JobStatusPauseRequested {
logger.Info().
Int("taskNumTotal", numTotal).
Int("taskNumComplete", numComplete).
Msg("task completed; there are more tasks to do")
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive, "no more tasks to do")
} else {
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive) numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
if err != nil { if err != nil {
return err return err