Support pausing jobs #104313
@ -189,13 +189,12 @@ func (sm *StateMachine) updateJobOnTaskStatusCanceled(ctx context.Context, logge
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if numActive == 0 {
|
if numActive == 0 {
|
||||||
// there is no active task, and the job is in pause-requested status, so we can pause the job
|
// There is no active task, and the job is in pause-requested status, so we can pause the job.
|
||||||
logger.Info().Msg("all tasks of job are completed, job is paused")
|
logger.Info().Msg("No more active tasks, job is paused")
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Execution should not reach here.
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -221,15 +220,21 @@ func (sm *StateMachine) updateJobOnTaskStatusFailed(ctx context.Context, logger
|
|||||||
// If the job didn't fail, this failure indicates that at least the job is active.
|
// If the job didn't fail, this failure indicates that at least the job is active.
|
||||||
failLogger.Info().Msg("task failed, but not enough to fail the job")
|
failLogger.Info().Msg("task failed, but not enough to fail the job")
|
||||||
|
|
||||||
|
if job.Status != api.JobStatusPauseRequested {
|
||||||
|
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive,
|
||||||
|
"task failed, but not enough to fail the job")
|
||||||
|
} else {
|
||||||
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
|
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if numActive == 0 && job.Status == api.JobStatusPauseRequested {
|
if numActive == 0 {
|
||||||
// there is no active task, and the job is in pause-requested status, so we can pause the job
|
// There is no active task, and the job is in pause-requested status, so we can pause the job.
|
||||||
logger.Info().Msg("all tasks of job are completed, job is paused")
|
failLogger.Info().Msg("No more active tasks, job is paused")
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive,
|
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive,
|
||||||
"task failed, but not enough to fail the job")
|
"task failed, but not enough to fail the job")
|
||||||
}
|
}
|
||||||
@ -240,19 +245,29 @@ func (sm *StateMachine) updateJobOnTaskStatusCompleted(ctx context.Context, logg
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
if numComplete == numTotal {
|
if numComplete == numTotal {
|
||||||
logger.Info().Msg("all tasks of job are completed, job is completed")
|
logger.Info().Msg("all tasks of job are completed, job is completed")
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusCompleted, "all tasks completed")
|
return sm.JobStatusChange(ctx, job, api.JobStatusCompleted, "all tasks completed")
|
||||||
}
|
}
|
||||||
if numActive == 0 && job.Status == api.JobStatusPauseRequested {
|
|
||||||
// there is no active task, and the job is in pause-requested status, so we can pause the job
|
if job.Status != api.JobStatusPauseRequested {
|
||||||
logger.Info().Msg("all tasks of job are completed, job is paused")
|
logger.Info().
|
||||||
|
Int("taskNumTotal", numTotal).
|
||||||
|
Int("taskNumComplete", numComplete).
|
||||||
|
Msg("task completed; there are more tasks to do")
|
||||||
|
return sm.jobStatusIfAThenB(ctx, logger, job, api.JobStatusQueued, api.JobStatusActive, "no more tasks to do")
|
||||||
|
} else {
|
||||||
|
numActive, _, err := sm.persist.CountTasksOfJobInStatus(ctx, job, api.TaskStatusActive)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if numActive == 0 {
|
||||||
|
// There is no active task, and the job is in pause-requested status, so we can pause the job.
|
||||||
|
logger.Info().Msg("No more active tasks, job is paused")
|
||||||
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
return sm.JobStatusChange(ctx, job, api.JobStatusPaused, "all tasks completed")
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
logger.Info().
|
logger.Info().
|
||||||
Int("taskNumTotal", numTotal).
|
Int("taskNumTotal", numTotal).
|
||||||
Int("taskNumComplete", numComplete).
|
Int("taskNumComplete", numComplete).
|
||||||
|
@ -76,7 +76,6 @@ func TestTaskStatusChangeActiveToCompleted(t *testing.T) {
|
|||||||
mocks.expectWriteTaskLogTimestamped(t, task, "task changed status active -> completed")
|
mocks.expectWriteTaskLogTimestamped(t, task, "task changed status active -> completed")
|
||||||
mocks.expectBroadcastTaskChange(task, api.TaskStatusActive, api.TaskStatusCompleted)
|
mocks.expectBroadcastTaskChange(task, api.TaskStatusActive, api.TaskStatusCompleted)
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(1, 3, nil) // 1 of 3 complete.
|
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(1, 3, nil) // 1 of 3 complete.
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusActive).Return(2, 3, nil) // 2 of 3 active.
|
|
||||||
require.NoError(t, sm.TaskStatusChange(ctx, task, api.TaskStatusCompleted))
|
require.NoError(t, sm.TaskStatusChange(ctx, task, api.TaskStatusCompleted))
|
||||||
|
|
||||||
// Second task hickup: T: active > soft-failed --> J: active > active
|
// Second task hickup: T: active > soft-failed --> J: active > active
|
||||||
@ -90,7 +89,6 @@ func TestTaskStatusChangeActiveToCompleted(t *testing.T) {
|
|||||||
mocks.expectWriteTaskLogTimestamped(t, task2, "task changed status soft-failed -> completed")
|
mocks.expectWriteTaskLogTimestamped(t, task2, "task changed status soft-failed -> completed")
|
||||||
mocks.expectBroadcastTaskChange(task2, api.TaskStatusSoftFailed, api.TaskStatusCompleted)
|
mocks.expectBroadcastTaskChange(task2, api.TaskStatusSoftFailed, api.TaskStatusCompleted)
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(2, 3, nil) // 2 of 3 complete.
|
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(2, 3, nil) // 2 of 3 complete.
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusActive).Return(1, 3, nil) // 1 of 3 active.
|
|
||||||
require.NoError(t, sm.TaskStatusChange(ctx, task2, api.TaskStatusCompleted))
|
require.NoError(t, sm.TaskStatusChange(ctx, task2, api.TaskStatusCompleted))
|
||||||
|
|
||||||
// Third task completing: T: active > completed --> J: active > completed
|
// Third task completing: T: active > completed --> J: active > completed
|
||||||
@ -98,7 +96,6 @@ func TestTaskStatusChangeActiveToCompleted(t *testing.T) {
|
|||||||
mocks.expectWriteTaskLogTimestamped(t, task3, "task changed status active -> completed")
|
mocks.expectWriteTaskLogTimestamped(t, task3, "task changed status active -> completed")
|
||||||
mocks.expectBroadcastTaskChange(task3, api.TaskStatusActive, api.TaskStatusCompleted)
|
mocks.expectBroadcastTaskChange(task3, api.TaskStatusActive, api.TaskStatusCompleted)
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(3, 3, nil) // 3 of 3 complete.
|
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusCompleted).Return(3, 3, nil) // 3 of 3 complete.
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusActive).Return(0, 3, nil) // 0 of 3 active.
|
|
||||||
mocks.expectSaveJobWithStatus(t, task.Job, api.JobStatusCompleted)
|
mocks.expectSaveJobWithStatus(t, task.Job, api.JobStatusCompleted)
|
||||||
mocks.expectBroadcastJobChange(task.Job, api.JobStatusActive, api.JobStatusCompleted)
|
mocks.expectBroadcastJobChange(task.Job, api.JobStatusActive, api.JobStatusCompleted)
|
||||||
|
|
||||||
@ -116,7 +113,6 @@ func TestTaskStatusChangeQueuedToFailed(t *testing.T) {
|
|||||||
mocks.expectBroadcastTaskChange(task, api.TaskStatusQueued, api.TaskStatusFailed)
|
mocks.expectBroadcastTaskChange(task, api.TaskStatusQueued, api.TaskStatusFailed)
|
||||||
mocks.expectSaveJobWithStatus(t, task.Job, api.JobStatusActive)
|
mocks.expectSaveJobWithStatus(t, task.Job, api.JobStatusActive)
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusFailed).Return(1, 100, nil) // 1 out of 100 failed.
|
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusFailed).Return(1, 100, nil) // 1 out of 100 failed.
|
||||||
mocks.persist.EXPECT().CountTasksOfJobInStatus(ctx, task.Job, api.TaskStatusActive).Return(0, 100, nil) // 0 out of 100 active.
|
|
||||||
mocks.expectBroadcastJobChange(task.Job, api.JobStatusQueued, api.JobStatusActive)
|
mocks.expectBroadcastJobChange(task.Job, api.JobStatusQueued, api.JobStatusActive)
|
||||||
|
|
||||||
require.NoError(t, sm.TaskStatusChange(ctx, task, api.TaskStatusFailed))
|
require.NoError(t, sm.TaskStatusChange(ctx, task, api.TaskStatusFailed))
|
||||||
|
Loading…
Reference in New Issue
Block a user