Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions shim/restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ var (
ErrAlreadyRestored = errors.New("container is already restored")
ErrRestoreRequestFailed = errors.New("restore request failed")
ErrRestoreDial = errors.New("failed to connect to node socket")
ErrInvalidCheckpoint = errors.New("checkpoint data invalid")
)

// Restore starts the container either from a checkpoint or from scratch
Expand Down Expand Up @@ -236,9 +237,11 @@ func MigrationRestore(ctx context.Context, r *task.CreateTaskRequest, cfg *Confi

log.G(ctx).Infof("restore response: %v", resp.MigrationInfo)

// TODO: validate that path is valid and contains image
if err := validateCheckpointData(nodev1.SnapshotPath(resp.MigrationInfo.ImageId)); err != nil {
return false, fmt.Errorf("%w: %w", ErrInvalidCheckpoint, err)
}
r.Checkpoint = nodev1.SnapshotPath(resp.MigrationInfo.ImageId)
log.G(ctx).Infof("setting checkpoint dir for restore: %s", nodev1.SnapshotPath(resp.MigrationInfo.ImageId))
log.G(ctx).Infof("setting checkpoint dir for restore: %s", r.Checkpoint)

// we set the criu work path for the live migration to work (the lazy pages
// socket needs to be there) and also so the restore stats are stored in the
Expand Down Expand Up @@ -323,3 +326,8 @@ func FinishRestore(ctx context.Context, id string, cfg *Config, startTime time.T

return nil
}

// validateCheckpointData performs a cheap sanity check on a checkpoint
// directory before it is used for a restore. It verifies that snapshotPath is
// set and that it contains a regular file named "descriptors.json"
// (presumably written alongside the checkpoint image - confirm against the
// checkpointing code).
//
// It returns nil if the file is present. A failing stat returns the
// underlying os.Stat error unchanged, so callers can still match it with
// errors.Is(err, os.ErrNotExist).
func validateCheckpointData(snapshotPath string) error {
	// filepath.Join drops empty elements, so an empty snapshotPath would make
	// the check run against "descriptors.json" in the current working
	// directory instead of the checkpoint dir.
	if snapshotPath == "" {
		return errors.New("empty snapshot path")
	}

	descriptors := filepath.Join(snapshotPath, "descriptors.json")
	info, err := os.Stat(descriptors)
	if err != nil {
		return err
	}
	// a directory (or socket, device, ...) with the expected name is not
	// usable checkpoint data.
	if !info.Mode().IsRegular() {
		return fmt.Errorf("%s is not a regular file", descriptors)
	}
	return nil
}
5 changes: 3 additions & 2 deletions shim/task/service_zeropod.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,13 +160,14 @@ func (w *wrapper) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *
skipStart, err := zshim.MigrationRestore(ctx, r, cfg)
if err != nil {
if errors.Is(err, zshim.ErrRestoreRequestFailed) ||
errors.Is(err, zshim.ErrRestoreDial) {
errors.Is(err, zshim.ErrRestoreDial) ||
errors.Is(err, zshim.ErrInvalidCheckpoint) {
// if the restore fails with ErrRestoreRequestFailed it's very
// likely it simply did not find a matching migration. Equally,
// if the shim can't manage to dial the node service
// (ErrRestoreDial) or the checkpoint data failed validation
// (ErrInvalidCheckpoint) there's no chance it can be restored.
// We log it and create the container from scratch.
log.G(ctx).Errorf("restore request failed: %s", err)
log.G(ctx).WithError(err).Error("restore request failed")
} else {
return nil, err
}
Expand Down
Loading