diff --git a/shim/restore.go b/shim/restore.go index 477d8cc..8ae0475 100644 --- a/shim/restore.go +++ b/shim/restore.go @@ -31,6 +31,7 @@ var ( ErrAlreadyRestored = errors.New("container is already restored") ErrRestoreRequestFailed = errors.New("restore request failed") ErrRestoreDial = errors.New("failed to connect to node socket") + ErrInvalidCheckpoint = errors.New("checkpoint data invalid") ) // Restore starts the container either from a checkpoint or from scratch @@ -236,9 +237,11 @@ func MigrationRestore(ctx context.Context, r *task.CreateTaskRequest, cfg *Confi log.G(ctx).Infof("restore response: %v", resp.MigrationInfo) - // TODO: validate that path is valid and contains image + if err := validateCheckpointData(nodev1.SnapshotPath(resp.MigrationInfo.ImageId)); err != nil { + return false, fmt.Errorf("%w: %w", ErrInvalidCheckpoint, err) + } r.Checkpoint = nodev1.SnapshotPath(resp.MigrationInfo.ImageId) - log.G(ctx).Infof("setting checkpoint dir for restore: %s", nodev1.SnapshotPath(resp.MigrationInfo.ImageId)) + log.G(ctx).Infof("setting checkpoint dir for restore: %s", r.Checkpoint) // we set the criu work path for the live migration to work (the lazy pages // socket needs to be there) and also so the restore stats are stored in the @@ -323,3 +326,8 @@ func FinishRestore(ctx context.Context, id string, cfg *Config, startTime time.T return nil } + +func validateCheckpointData(snapshotPath string) error { + _, err := os.Stat(filepath.Join(snapshotPath, "descriptors.json")) + return err +} diff --git a/shim/task/service_zeropod.go b/shim/task/service_zeropod.go index 20a420c..bbe5406 100644 --- a/shim/task/service_zeropod.go +++ b/shim/task/service_zeropod.go @@ -160,13 +160,14 @@ func (w *wrapper) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ * skipStart, err := zshim.MigrationRestore(ctx, r, cfg) if err != nil { if errors.Is(err, zshim.ErrRestoreRequestFailed) || - errors.Is(err, zshim.ErrRestoreDial) { + errors.Is(err, zshim.ErrRestoreDial) || + errors.Is(err, zshim.ErrInvalidCheckpoint) { // if the restore fails with ErrRestoreRequestFailed it's very // likely it simply did not find a matching migration. Equally, // if the shim can't manage to dial the node service there's no // chance it can be restored. We log it and create the container // from scratch. - log.G(ctx).Errorf("restore request failed: %s", err) + log.G(ctx).WithError(err).Error("restore request failed") } else { return nil, err }