Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
package terraformcloud

import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"time"

"workspace-engine/pkg/jobagents/types"
"workspace-engine/pkg/oapi"
)

// planTimeout is the longest a speculative run may stay unfinished, measured
// from the FirstPolled timestamp kept in the plan state, before polling stops
// waiting and reports a timeout.
const planTimeout = 5 * time.Minute

// Compile-time assertion that *TFCPlanner satisfies types.Plannable.
var _ types.Plannable = (*TFCPlanner)(nil)

// WorkspaceSetup is the provisioning step a planner runs before it creates a
// run. Setup is given the dispatch context and returns the ID of the workspace
// the plan should run in, or an error if provisioning failed.
type WorkspaceSetup interface {
	Setup(
		ctx context.Context,
		dispatchCtx *oapi.DispatchContext,
	) (workspaceID string, err error)
}

// SpeculativeRunner is the planner's access to speculative (plan-only) runs:
// it starts a run, reads a run's status back, and fetches a plan's JSON.
type SpeculativeRunner interface {
	// CreateSpeculativeRun starts a speculative run in the workspace
	// identified by workspaceID and returns the ID of the new run.
	CreateSpeculativeRun(ctx context.Context, cfg *tfeConfig, workspaceID string) (runID string, err error)

	// ReadRunStatus reads the run identified by runID and reports it as a
	// RunStatus.
	ReadRunStatus(
		ctx context.Context,
		cfg *tfeConfig,
		runID string,
	) (*RunStatus, error)

	// ReadPlanJSON returns the JSON output of the plan identified by planID.
	ReadPlanJSON(
		ctx context.Context,
		cfg *tfeConfig,
		planID string,
	) ([]byte, error)
}

// RunStatus is the planner's view of a TFC run as read back from the API.
type RunStatus struct {
	// Status is the run's status string as reported by the backend.
	Status string
	// PlanID identifies the plan attached to the run; it may be empty when
	// the reader found no plan on the run.
	PlanID string

	// Resource counts reported by the run's plan.
	ResourceAdditions, ResourceChanges, ResourceDestructions int

	// IsFinished marks a run whose plan completed; IsErrored marks a run that
	// ended in a failed terminal state. Both are false while the run is still
	// in progress.
	IsFinished, IsErrored bool
}

// TFCPlanner produces plans by driving speculative runs. Workspace
// provisioning and run access are delegated to its two collaborators.
type TFCPlanner struct {
	// workspace provisions the workspace before a run is created.
	workspace WorkspaceSetup
	// runner creates speculative runs and reads their status and plan output.
	runner SpeculativeRunner
}

// NewTFCPlanner returns a TFCPlanner that uses workspace for provisioning and
// runner for creating and reading speculative runs.
func NewTFCPlanner(workspace WorkspaceSetup, runner SpeculativeRunner) *TFCPlanner {
	planner := new(TFCPlanner)
	planner.workspace = workspace
	planner.runner = runner
	return planner
}

// Type reports the type string of this planner, which is always "tfe".
func (p *TFCPlanner) Type() string {
	const plannerType = "tfe"
	return plannerType
}

// tfePlanState is the JSON state carried between successive Plan calls for a
// single speculative run.
type tfePlanState struct {
	// RunID is the ID of the speculative run; empty until a run is created.
	RunID string `json:"runId"`

	// PollCount is the number of status polls performed so far.
	PollCount int `json:"pollCount"`

	// FirstPolled is the instant the timeout is measured from; it is omitted
	// from the JSON when nil.
	FirstPolled *time.Time `json:"firstPolled,omitempty"`
}

// Plan advances a speculative-run plan by one step.
//
// state is the JSON returned in PlanResult.State by the previous call, or nil
// on the first call. When the decoded state carries no run ID, Plan provisions
// the workspace and creates a new speculative run; otherwise it polls the run
// recorded in the state.
//
// It returns an error if the job agent config cannot be parsed, the state
// cannot be decoded, workspace setup fails, or the delegated step fails.
func (p *TFCPlanner) Plan(
	ctx context.Context,
	dispatchCtx *oapi.DispatchContext,
	state json.RawMessage,
) (*types.PlanResult, error) {
	cfg, err := parseJobAgentConfig(dispatchCtx.JobAgentConfig)
	if err != nil {
		return nil, err
	}

	var prior tfePlanState
	if state != nil {
		if decodeErr := json.Unmarshal(state, &prior); decodeErr != nil {
			return nil, fmt.Errorf("unmarshal plan state: %w", decodeErr)
		}
	}

	// A recorded run ID means a run already exists; only poll it.
	if prior.RunID != "" {
		return p.pollRun(ctx, cfg, prior)
	}

	workspaceID, setupErr := p.workspace.Setup(ctx, dispatchCtx)
	if setupErr != nil {
		return nil, fmt.Errorf("setup workspace: %w", setupErr)
	}
	return p.createRun(ctx, cfg, workspaceID)
}

// createRun starts a speculative run in workspaceID and returns a PlanResult
// whose State records the new run ID, a zero poll count, and the current time
// as FirstPolled, so that later calls can poll the run.
func (p *TFCPlanner) createRun(
	ctx context.Context,
	cfg *tfeConfig,
	workspaceID string,
) (*types.PlanResult, error) {
	runID, err := p.runner.CreateSpeculativeRun(ctx, cfg, workspaceID)
	if err != nil {
		return nil, fmt.Errorf("create speculative run: %w", err)
	}

	// PollCount is left at its zero value: no poll has happened yet.
	startedAt := time.Now()
	encoded, err := json.Marshal(tfePlanState{
		RunID:       runID,
		FirstPolled: &startedAt,
	})
	if err != nil {
		return nil, fmt.Errorf("marshal plan state: %w", err)
	}

	result := &types.PlanResult{
		State:   encoded,
		Message: fmt.Sprintf("Speculative run %s created, waiting for plan", runID),
	}
	return result, nil
}

func (p *TFCPlanner) pollRun(
ctx context.Context,
cfg *tfeConfig,
s tfePlanState,
) (*types.PlanResult, error) {
status, err := p.runner.ReadRunStatus(ctx, cfg, s.RunID)
if err != nil {
return nil, fmt.Errorf("read run %s: %w", s.RunID, err)
}

s.PollCount++

if status.IsFinished {
return p.completePlan(ctx, cfg, status)
}

if status.IsErrored {
now := time.Now()
return &types.PlanResult{
CompletedAt: &now,
Message: fmt.Sprintf("Run %s ended with status: %s", s.RunID, status.Status),
}, nil
Comment on lines +138 to +142
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the TFC run ends in an errored/canceled/discarded state, this branch returns a non-nil CompletedAt with a nil error. The deploymentplanresult controller treats any nil error + non-nil CompletedAt as a successful completion and will persist Status=completed, which misclassifies failed plans. Return a non-nil error here (including run ID/status) so the controller records Status=errored (or introduce an explicit failure status in PlanResult and handle it in the controller).

Suggested change
now := time.Now()
return &types.PlanResult{
CompletedAt: &now,
Message: fmt.Sprintf("Run %s ended with status: %s", s.RunID, status.Status),
}, nil
return nil, fmt.Errorf("run %s ended with status: %s", s.RunID, status.Status)

Copilot uses AI. Check for mistakes.
}

if s.FirstPolled != nil && time.Since(*s.FirstPolled) > planTimeout {
now := time.Now()
return &types.PlanResult{
CompletedAt: &now,
Message: fmt.Sprintf(
"Run %s timed out after %d polls (%s elapsed), last status: %s",
s.RunID, s.PollCount, time.Since(*s.FirstPolled).Round(time.Second), status.Status,
),
}, nil
Comment on lines +146 to +153
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The timeout path also returns CompletedAt with a nil error, so the controller will persist Status=completed even though this is effectively a plan failure/timeout. Consider returning an error (e.g., include elapsed time/poll count/last status) so timeouts are recorded as Status=errored and are visible as failures to callers.

Suggested change
now := time.Now()
return &types.PlanResult{
CompletedAt: &now,
Message: fmt.Sprintf(
"Run %s timed out after %d polls (%s elapsed), last status: %s",
s.RunID, s.PollCount, time.Since(*s.FirstPolled).Round(time.Second), status.Status,
),
}, nil
elapsed := time.Since(*s.FirstPolled).Round(time.Second)
return nil, fmt.Errorf(
"run %s timed out after %d polls (%s elapsed), last status: %s",
s.RunID, s.PollCount, elapsed, status.Status,
)

Copilot uses AI. Check for mistakes.
}

stateJSON, err := json.Marshal(s)
if err != nil {
return nil, fmt.Errorf("marshal plan state: %w", err)
}

return &types.PlanResult{
State: stateJSON,
Message: fmt.Sprintf(
"Waiting for plan (poll %d, status: %s)",
s.PollCount, status.Status,
),
}, nil
}

func (p *TFCPlanner) completePlan(
ctx context.Context,
cfg *tfeConfig,
status *RunStatus,
) (*types.PlanResult, error) {
planJSON, err := p.runner.ReadPlanJSON(ctx, cfg, status.PlanID)
if err != nil {
return nil, fmt.Errorf("read plan JSON: %w", err)
}
Comment on lines +170 to +178
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

completePlan assumes status.PlanID is populated whenever status.IsFinished is true. In ReadRunStatus, PlanID is only set when run.Plan != nil, so it's possible to reach here with an empty PlanID (e.g., unexpected API response), which will produce a confusing downstream error. Add an explicit check for empty PlanID and return a clear error before calling ReadPlanJSON.

Copilot uses AI. Check for mistakes.

hasChanges := status.ResourceAdditions+status.ResourceChanges+status.ResourceDestructions > 0
hash := sha256.Sum256(planJSON)

now := time.Now()
return &types.PlanResult{
CompletedAt: &now,
HasChanges: hasChanges,
ContentHash: hex.EncodeToString(hash[:]),
Current: "",
Proposed: string(planJSON),
Message: fmt.Sprintf(
"+%d ~%d -%d resources",
status.ResourceAdditions, status.ResourceChanges, status.ResourceDestructions,
),
}, nil
}
128 changes: 128 additions & 0 deletions apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package terraformcloud

import (
"context"
"fmt"

"github.com/hashicorp/go-tfe"
"workspace-engine/pkg/oapi"
)

// GoWorkspaceSetup is the production implementation of WorkspaceSetup.
// It holds no state of its own.
type GoWorkspaceSetup struct{}

// Setup provisions the TFC workspace for a dispatch and returns its ID.
//
// It parses the job agent config from dispatchCtx, builds a client, renders
// the workspace from the configured template, upserts it in the configured
// organization, and, when the rendered workspace declares variables, syncs
// them onto the upserted workspace.
func (g *GoWorkspaceSetup) Setup(
	ctx context.Context,
	dispatchCtx *oapi.DispatchContext,
) (string, error) {
	cfg, err := parseJobAgentConfig(dispatchCtx.JobAgentConfig)
	if err != nil {
		return "", err
	}

	client, err := getClient(cfg.address, cfg.token)
	if err != nil {
		return "", fmt.Errorf("create tfe client: %w", err)
	}

	desired, err := templateWorkspace(dispatchCtx, cfg.template)
	if err != nil {
		return "", fmt.Errorf("template workspace: %w", err)
	}

	remote, err := upsertWorkspace(ctx, client, cfg.organization, desired)
	if err != nil {
		return "", fmt.Errorf("upsert workspace: %w", err)
	}

	// Nothing to sync when the template declares no variables.
	if len(desired.Variables) == 0 {
		return remote.ID, nil
	}

	if syncErr := syncVariables(ctx, client, remote.ID, desired.Variables); syncErr != nil {
		return "", fmt.Errorf("sync variables: %w", syncErr)
	}
	return remote.ID, nil
}

// GoSpeculativeRunner is the production implementation of SpeculativeRunner.
// It holds no state of its own.
type GoSpeculativeRunner struct{}

// CreateSpeculativeRun starts a plan-only run on the workspace identified by
// workspaceID and returns the ID of the created run.
func (g *GoSpeculativeRunner) CreateSpeculativeRun(
	ctx context.Context,
	cfg *tfeConfig,
	workspaceID string,
) (string, error) {
	client, err := getClient(cfg.address, cfg.token)
	if err != nil {
		return "", fmt.Errorf("create tfe client: %w", err)
	}

	// The options struct takes pointers, so the values need addressable homes.
	speculative := true
	note := "Speculative plan by ctrlplane"
	opts := tfe.RunCreateOptions{
		Workspace: &tfe.Workspace{ID: workspaceID},
		PlanOnly:  &speculative,
		Message:   &note,
	}

	created, err := client.Runs.Create(ctx, opts)
	if err != nil {
		return "", fmt.Errorf("create speculative run: %w", err)
	}
	return created.ID, nil
}

// ReadRunStatus reads the current status of a TFC run and maps it to a RunStatus.
func (g *GoSpeculativeRunner) ReadRunStatus(
ctx context.Context,
cfg *tfeConfig,
runID string,
) (*RunStatus, error) {
client, err := getClient(cfg.address, cfg.token)
if err != nil {
return nil, fmt.Errorf("create tfe client: %w", err)
}

run, err := client.Runs.Read(ctx, runID)
if err != nil {
return nil, fmt.Errorf("read run: %w", err)
}

status := &RunStatus{
Status: string(run.Status),
}

if run.Plan != nil {
status.PlanID = run.Plan.ID
status.ResourceAdditions = run.Plan.ResourceAdditions
status.ResourceChanges = run.Plan.ResourceChanges
status.ResourceDestructions = run.Plan.ResourceDestructions
}

switch run.Status {
case tfe.RunPlannedAndFinished:
status.IsFinished = true
case tfe.RunErrored, tfe.RunCanceled, tfe.RunDiscarded, tfe.RunPolicySoftFailed:
status.IsErrored = true
}
Comment on lines +102 to +107
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

In Terraform Cloud Runs API, is "force_canceled" a possible run status, and should it be treated as a terminal non-success state for speculative (plan-only) runs?

💡 Result:

Yes. The Terraform Cloud (HCP Terraform) Runs API documents force_canceled as a valid data.attributes.status value (“A workspace admin forcefully canceled the run.”). It indicates the run was terminated due to an administrative force-cancel, not a successful completion. [1]

For speculative (plan-only) runs, the documented successful terminal status is planned_and_finished (“The run is completed… This is a final state.”). A force_canceled run should therefore be treated as a terminal, non-success outcome when evaluating plan-only run results. [1]

Source
[1] Terraform Cloud Runs API docs (run statuses; plan-only terminal status; cancel/force-cancel behavior). (developer.hashicorp.com)

Citations:


🏁 Script executed:

cat -n apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go | sed -n '95,115p'

Repository: ctrlplanedev/ctrlplane

Length of output: 798


🏁 Script executed:

rg "force_canceled|RunStatus" apps/workspace-engine/pkg/jobagents/terraformcloud/ -A 2 -B 2

Repository: ctrlplanedev/ctrlplane

Length of output: 8901


🏁 Script executed:

rg "import.*tfe\|from.*tfe" apps/workspace-engine/pkg/jobagents/terraformcloud/ -A 3

Repository: ctrlplanedev/ctrlplane

Length of output: 48


🏁 Script executed:

head -30 apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go

Repository: ctrlplanedev/ctrlplane

Length of output: 784


🏁 Script executed:

rg "tfe\." apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go | head -20

Repository: ctrlplanedev/ctrlplane

Length of output: 285


🏁 Script executed:

rg "RunCanceled|RunErrored|RunPlannedAndFinished|RunDiscarded|RunPolicySoftFailed" apps/workspace-engine/pkg/jobagents/terraformcloud/ -B 2 -A 2

Repository: ctrlplanedev/ctrlplane

Length of output: 1738


🏁 Script executed:

rg "RunForceCanceled|force_canceled" apps/workspace-engine/ -i

Repository: ctrlplanedev/ctrlplane

Length of output: 48


🏁 Script executed:

web_search
hashicorp go-tfe library RunForceCanceled constant or force_canceled status

Repository: ctrlplanedev/ctrlplane

Length of output: 164


🌐 Web query:

hashicorp go-tfe RunForceCanceled constant API documentation

💡 Result:

force_canceled is a valid run state in the HCP Terraform / Terraform Enterprise API (data.attributes.status) and means “a workspace admin forcefully canceled the run.” (developer.hashicorp.com)

However, in the current github.com/hashicorp/go-tfe package, there is no RunForceCanceled (or RunForceCanceled-like) constant in the RunStatus constants list; the exported RunStatus constants include RunCanceled, RunErrored, etc., but not force_canceled. (pkg.go.dev)

What to do in Go

  • Compare against the raw string: status == "force_canceled", or
  • Define your own constant in your codebase:
    const RunForceCanceled tfe.RunStatus = "force_canceled"

References (numbered)

  • [1] HCP Terraform Runs API docs (“Run States” includes force_canceled). (developer.hashicorp.com)
  • [2] go-tfe RunStatus constants list (does not include force_canceled). (pkg.go.dev)

Citations:


🏁 Script executed:

rg "const.*Run" apps/workspace-engine/pkg/jobagents/terraformcloud/ -B 2 -A 2

Repository: ctrlplanedev/ctrlplane

Length of output: 48


🏁 Script executed:

rg "type RunStatus\|Status.*tfe\.RunStatus" apps/workspace-engine/pkg/jobagents/terraformcloud/ -A 3

Repository: ctrlplanedev/ctrlplane

Length of output: 48


🏁 Script executed:

rg "\.Status\s*=" apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go -B 3 -A 1

Repository: ctrlplanedev/ctrlplane

Length of output: 48


🏁 Script executed:

python3 << 'EOF'
# Check the go-tfe package to understand RunStatus type
import subprocess
result = subprocess.run(['rg', 'github.com/hashicorp/go-tfe', 'go.mod', 'go.sum'], 
                       capture_output=True, text=True, cwd='/tmp')
if result.returncode == 0:
    print(result.stdout)
else:
    print("Not found in go files")
EOF

Repository: ctrlplanedev/ctrlplane

Length of output: 87


🏁 Script executed:

cat -n apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go | sed -n '80,110p'

Repository: ctrlplanedev/ctrlplane

Length of output: 1039


🏁 Script executed:

rg "run\.Status" apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go -B 5 -A 5

Repository: ctrlplanedev/ctrlplane

Length of output: 607


Add force_canceled to terminal error statuses in the run status switch.

The switch at line 102 does not handle Terraform Cloud's force_canceled run status. When a run is administratively force-canceled, neither IsFinished nor IsErrored will be set, causing polling to continue until timeout instead of terminating immediately as an error state.

Proposed fix
 switch run.Status {
 case tfe.RunPlannedAndFinished:
 	status.IsFinished = true
-case tfe.RunErrored, tfe.RunCanceled, tfe.RunDiscarded, tfe.RunPolicySoftFailed:
+case tfe.RunErrored, tfe.RunCanceled, tfe.RunDiscarded, tfe.RunPolicySoftFailed, tfe.RunStatus("force_canceled"):
 	status.IsErrored = true
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@apps/workspace-engine/pkg/jobagents/terraformcloud/tfe_plan_client.go` around
lines 102 - 107, The switch handling run.Status in tfe_plan_client.go currently
marks only RunPlannedAndFinished as terminal and treats RunErrored, RunCanceled,
RunDiscarded, and RunPolicySoftFailed as errored; add tfe.RunForceCanceled
(Terraform Cloud's force_canceled status) to the error branch so that when
run.Status == tfe.RunForceCanceled you set status.IsErrored = true (same branch
as RunErrored/RunCanceled/RunDiscarded/RunPolicySoftFailed), ensuring polling
stops immediately for force-canceled runs; update any related comments/tests
referencing terminal statuses if present.


return status, nil
}

// ReadPlanJSON returns the JSON output of the plan identified by planID, as
// raw bytes.
func (g *GoSpeculativeRunner) ReadPlanJSON(
	ctx context.Context,
	cfg *tfeConfig,
	planID string,
) ([]byte, error) {
	client, clientErr := getClient(cfg.address, cfg.token)
	if clientErr != nil {
		return nil, fmt.Errorf("create tfe client: %w", clientErr)
	}

	raw, readErr := client.Plans.ReadJSONOutput(ctx, planID)
	if readErr != nil {
		return nil, fmt.Errorf("read plan JSON output: %w", readErr)
	}
	return raw, nil
}
Loading
Loading