
Commit ff74550

fixed sf compute lifecycle status (#95)

* fixed sf compute lifecycle status
* fixed error logging
* lint fixes
* move to helper

1 parent 294265d

1 file changed: 89 additions & 14 deletions

v1/providers/sfcompute/instance.go
@@ -19,6 +19,7 @@ const (
 	maxPricePerNodeHour = 1600
 	defaultPort = 2222
 	defaultSSHUsername = "ubuntu"
+	vmStatusRunning = "running"
 )
 
 func (c *SFCClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) {
@@ -174,12 +175,22 @@ func (c *SFCClient) ListInstances(ctx context.Context, args v1.ListInstancesArgs
 
 		nodeInfo, err := c.sfcNodeInfoFromNodeListResponseData(ctx, &node, zone)
 		if err != nil {
-			return nil, errors.WrapAndTrace(err)
+			c.logger.Error(ctx, err,
+				v1.LogField("msg", "sfc: ListInstances skipping node due to error"),
+				v1.LogField("nodeID", node.ID),
+				v1.LogField("nodeName", node.Name),
+			)
+			continue
 		}
 
 		inst, err := c.sfcNodeToBrevInstance(*nodeInfo)
 		if err != nil {
-			return nil, errors.WrapAndTrace(err)
+			c.logger.Error(ctx, err,
+				v1.LogField("msg", "sfc: ListInstances skipping node due to conversion error"),
+				v1.LogField("nodeID", node.ID),
+				v1.LogField("nodeName", node.Name),
+			)
+			continue
 		}
 		instances = append(instances, *inst)
 	}
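
The hunk above swaps a hard failure for log-and-skip: an error on one node no longer aborts the whole listing call. Below is a minimal, self-contained sketch of that pattern with illustrative names and a plain standard-library logger; the real code uses c.logger and v1.LogField.

```go
package main

import (
	"errors"
	"fmt"
	"log"
)

type node struct{ id, name string }
type instance struct{ id string }

// convert stands in for the per-node conversion steps in ListInstances.
func convert(n node) (instance, error) {
	if n.id == "node-bad" {
		return instance{}, errors.New("conversion failed")
	}
	return instance{id: n.id}, nil
}

func listInstances(nodes []node) []instance {
	var out []instance
	for _, n := range nodes {
		inst, err := convert(n)
		if err != nil {
			// Before this commit the error was returned and the whole call failed;
			// now the bad node is logged and skipped.
			log.Printf("sfc: ListInstances skipping node due to error: id=%s name=%s err=%v", n.id, n.name, err)
			continue
		}
		out = append(out, inst)
	}
	return out
}

func main() {
	nodes := []node{{"node-1", "a"}, {"node-bad", "b"}, {"node-2", "c"}}
	// node-bad is skipped; node-1 and node-2 are still returned.
	fmt.Println(listInstances(nodes))
}
```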
@@ -265,24 +276,51 @@ func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error
 }
 
 func (c *SFCClient) sfcNodeInfoFromNode(ctx context.Context, node *sfcnodes.Node, zone *sfcnodes.ZoneListResponseData) (*sfcNodeInfo, error) {
-	var sshHostname string
-	if len(node.VMs.Data) == 1 { //nolint:gocritic // ok
-		hostname, err := c.getSSHHostnameFromVM(ctx, node.VMs.Data[0].ID, node.VMs.Data[0].Status)
-		if err != nil {
-			return nil, errors.WrapAndTrace(err)
+	nodeStatus := sfcStatusToLifecycleStatus(fmt.Sprint(node.Status))
+
+	// Check node-level status first before inspecting individual VMs.
+	// A node in a terminal state (e.g. "released") may still have VMs reporting as "Running"
+	// because SFC keeps the VM alive until the end of its allotted time window. The node status
+	// is the source of truth: if the node is released/destroyed/deleted, the instance is
+	// terminated; if the node is failed, the instance is failed. VM status should not be
+	// consulted in either case.
+	// Additionally, terminal nodes can accumulate multiple VM records (previous + current),
+	// which would otherwise cause a "multiple VMs found" error and break ListInstances entirely.
+	if isTerminalNodeStatus(fmt.Sprint(node.Status)) {
+		if nodeStatus == v1.LifecycleStatusFailed {
+			for _, vm := range node.VMs.Data {
+				if strings.ToLower(vm.Status) == vmStatusRunning {
+					c.logger.Warn(ctx, "sfc: node is failed but VM is still running",
+						v1.LogField("node_id", node.ID),
+						v1.LogField("node_status", fmt.Sprint(node.Status)),
+						v1.LogField("vm_id", vm.ID),
+						v1.LogField("vm_status", vm.Status),
+					)
+				}
+			}
 		}
-		sshHostname = hostname
-	} else if len(node.VMs.Data) == 0 {
-		sshHostname = ""
-	} else {
-		return nil, errors.WrapAndTrace(fmt.Errorf("multiple VMs found for node %s", node.ID))
+		return &sfcNodeInfo{
+			id:          node.ID,
+			name:        node.Name,
+			createdAt:   time.Unix(node.CreatedAt, 0),
+			status:      nodeStatus,
+			gpuType:     string(node.GPUType),
+			sshUsername: defaultSSHUsername,
+			sshHostname: "",
+			zone:        zone,
+		}, nil
+	}
+
+	sshHostname, err := c.sshHostnameFromVMs(ctx, node.VMs.Data)
+	if err != nil {
+		return nil, errors.WrapAndTrace(err)
 	}
 
 	return &sfcNodeInfo{
 		id:          node.ID,
 		name:        node.Name,
 		createdAt:   time.Unix(node.CreatedAt, 0),
-		status:      sfcStatusToLifecycleStatus(fmt.Sprint(node.Status)),
+		status:      nodeStatus,
 		gpuType:     string(node.GPUType),
 		sshUsername: defaultSSHUsername,
 		sshHostname: sshHostname,
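
The comment block in this hunk is the core of the fix: once the node itself reports a terminal status, the lifecycle comes from the node alone and no SSH hostname is resolved, even if a VM underneath still reports "Running". Here is a standalone sketch of that decision order, using illustrative local types in place of sfcnodes.Node and sfcNodeInfo; it is not the repository code.

```go
package main

import (
	"fmt"
	"strings"
)

// info stands in for sfcNodeInfo: just the fields the early return above fills in.
type info struct {
	status      string
	sshHostname string
}

// terminal mirrors isTerminalNodeStatus: the released/terminated/deleted/destroyed
// family and "failed" are all terminal node statuses.
func terminal(nodeStatus string) bool {
	switch strings.ToLower(nodeStatus) {
	case "released", "terminated", "deleted", "destroyed", "failed":
		return true
	}
	return false
}

// nodeInfo mirrors the post-fix flow of sfcNodeInfoFromNode: terminal nodes never
// consult VM state for SSH info; only non-terminal nodes do.
func nodeInfo(nodeStatus string, vmStatus string) info {
	if terminal(nodeStatus) {
		return info{status: strings.ToLower(nodeStatus), sshHostname: ""}
	}
	host := ""
	if strings.ToLower(vmStatus) == "running" {
		host = "host-from-vm.example" // placeholder for getSSHHostnameFromVM
	}
	return info{status: strings.ToLower(nodeStatus), sshHostname: host}
}

func main() {
	fmt.Println(nodeInfo("released", "Running")) // no hostname: node lease is over, VM status ignored
	fmt.Println(nodeInfo("running", "Running"))  // hostname resolved from the running VM
}
```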
@@ -339,6 +377,23 @@ func sfcListResponseNodeDataToNode(node *sfcnodes.ListResponseNodeData) *sfcnode
 	}
 }
 
+// sfcStatusToLifecycleStatus maps SFC node-level statuses to Brev lifecycle statuses.
+//
+// SFC node statuses (from the nodes-go SDK):
+// - "pending" → node is being provisioned
+// - "awaitingcapacity" → node is waiting for capacity (auto-reserved nodes)
+// - "running" → node is active with a VM provisioned
+// - "released" → node was released via Nodes.Release(). This is a TERMINAL state.
+//   VMs may still report "Running" underneath until their allotted time ends,
+//   but the node lease is over. "released" means terminated, NOT terminating.
+// - "terminated" → node has been terminated
+// - "deleted" → node has been deleted
+// - "destroyed" → node has been destroyed
+// - "failed" → node provisioning or operation failed
+// - "unknown" → unknown status
+//
+// Note: SFC does NOT have a transitional "releasing" or "terminating" status.
+// The Release API transitions a node directly from "running" to "released".
 func sfcStatusToLifecycleStatus(status string) v1.LifecycleStatus {
 	switch strings.ToLower(status) {
 	case "pending", "unspecified", "awaitingcapacity", "unknown":
@@ -358,9 +413,29 @@ func sfcStatusToLifecycleStatus(status string) v1.LifecycleStatus {
 	}
 }
 
+// isTerminalNodeStatus returns true if the SFC node status is a terminal state where
+// the node is no longer active. When a node is terminal, its VM statuses should not be
+// inspected; they may be stale (e.g. VM still "Running" after the node was released).
+func isTerminalNodeStatus(status string) bool {
+	lifecycleStatus := sfcStatusToLifecycleStatus(status)
+	return lifecycleStatus == v1.LifecycleStatusTerminated || lifecycleStatus == v1.LifecycleStatusFailed
+}
+
+// sshHostnameFromVMs finds the first running VM and returns its SSH hostname.
+// Nodes can accumulate multiple VM records (e.g. awaitingcapacity nodes with previous
+// destroyed VMs); only a running VM provides usable SSH info.
+func (c *SFCClient) sshHostnameFromVMs(ctx context.Context, vms []sfcnodes.NodeVMsData) (string, error) {
+	for _, vm := range vms {
+		if strings.ToLower(vm.Status) == vmStatusRunning {
+			return c.getSSHHostnameFromVM(ctx, vm.ID, vm.Status)
+		}
+	}
+	return "", nil
+}
+
 func (c *SFCClient) getSSHHostnameFromVM(ctx context.Context, vmID string, vmStatus string) (string, error) {
 	// If the VM is not running, set the SSH username and hostname to empty strings
-	if strings.ToLower(vmStatus) != "running" {
+	if strings.ToLower(vmStatus) != vmStatusRunning {
 		return "", nil
 	}
 
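The two helpers added at the end of the last hunk encode the other half of the fix: terminality is derived from the status mapping, and SSH info comes from the first running VM, with no error when a node carries stale or zero VM records. Below is a standalone sketch of the sshHostnameFromVMs selection rule with illustrative types; the real helper takes []sfcnodes.NodeVMsData and calls getSSHHostnameFromVM.

```go
package main

import (
	"fmt"
	"strings"
)

type vmRecord struct{ id, status string }

// hostnameFromVMs mirrors sshHostnameFromVMs: scan the VM records in order, resolve the
// hostname from the first one that is running, and return "" (not an error) when none
// are running, so nodes with only stale or destroyed VM records still list cleanly.
func hostnameFromVMs(vms []vmRecord, lookup func(id string) string) string {
	for _, vm := range vms {
		if strings.ToLower(vm.status) == "running" {
			return lookup(vm.id)
		}
	}
	return ""
}

func main() {
	lookup := func(id string) string { return id + ".ssh.example" }

	// Two VM records (a previous destroyed VM plus the current one). Before this commit
	// that shape hit the "multiple VMs found" error; now the first running record wins.
	vms := []vmRecord{
		{id: "vm-old", status: "Destroyed"},
		{id: "vm-new", status: "Running"},
	}
	fmt.Println(hostnameFromVMs(vms, lookup)) // vm-new.ssh.example

	// No running VM at all: empty hostname, not an error.
	fmt.Println(hostnameFromVMs([]vmRecord{{id: "vm-old", status: "Destroyed"}}, lookup) == "") // true
}
```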