@@ -19,6 +19,7 @@ const (
1919 maxPricePerNodeHour = 1600
2020 defaultPort = 2222
2121 defaultSSHUsername = "ubuntu"
22+ vmStatusRunning = "running"
2223)
2324
2425func (c * SFCClient ) CreateInstance (ctx context.Context , attrs v1.CreateInstanceAttrs ) (* v1.Instance , error ) {
@@ -174,12 +175,22 @@ func (c *SFCClient) ListInstances(ctx context.Context, args v1.ListInstancesArgs
174175
175176 nodeInfo , err := c .sfcNodeInfoFromNodeListResponseData (ctx , & node , zone )
176177 if err != nil {
177- return nil , errors .WrapAndTrace (err )
178+ c .logger .Error (ctx , err ,
179+ v1 .LogField ("msg" , "sfc: ListInstances skipping node due to error" ),
180+ v1 .LogField ("nodeID" , node .ID ),
181+ v1 .LogField ("nodeName" , node .Name ),
182+ )
183+ continue
178184 }
179185
180186 inst , err := c .sfcNodeToBrevInstance (* nodeInfo )
181187 if err != nil {
182- return nil , errors .WrapAndTrace (err )
188+ c .logger .Error (ctx , err ,
189+ v1 .LogField ("msg" , "sfc: ListInstances skipping node due to conversion error" ),
190+ v1 .LogField ("nodeID" , node .ID ),
191+ v1 .LogField ("nodeName" , node .Name ),
192+ )
193+ continue
183194 }
184195 instances = append (instances , * inst )
185196 }
@@ -265,24 +276,51 @@ func (c *SFCClient) sfcNodeToBrevInstance(node sfcNodeInfo) (*v1.Instance, error
265276}
266277
267278func (c * SFCClient ) sfcNodeInfoFromNode (ctx context.Context , node * sfcnodes.Node , zone * sfcnodes.ZoneListResponseData ) (* sfcNodeInfo , error ) {
268- var sshHostname string
269- if len (node .VMs .Data ) == 1 { //nolint:gocritic // ok
270- hostname , err := c .getSSHHostnameFromVM (ctx , node .VMs .Data [0 ].ID , node .VMs .Data [0 ].Status )
271- if err != nil {
272- return nil , errors .WrapAndTrace (err )
279+ nodeStatus := sfcStatusToLifecycleStatus (fmt .Sprint (node .Status ))
280+
281+ // Check node-level status first before inspecting individual VMs.
282+ // A node in a terminal state (e.g. "released") may still have VMs reporting as "Running"
283+ // because SFC keeps the VM alive until the end of its allotted time window. The node status
284+ // is the source of truth — if the node is released/destroyed/deleted, the instance is
285+ // terminated; if the node is failed, the instance is failed. VM status should not be
286+ // consulted in either case.
287+ // Additionally, terminal nodes can accumulate multiple VM records (previous + current),
288+ // which would otherwise cause a "multiple VMs found" error and break ListInstances entirely.
289+ if isTerminalNodeStatus (fmt .Sprint (node .Status )) {
290+ if nodeStatus == v1 .LifecycleStatusFailed {
291+ for _ , vm := range node .VMs .Data {
292+ if strings .ToLower (vm .Status ) == vmStatusRunning {
293+ c .logger .Warn (ctx , "sfc: node is failed but VM is still running" ,
294+ v1 .LogField ("node_id" , node .ID ),
295+ v1 .LogField ("node_status" , fmt .Sprint (node .Status )),
296+ v1 .LogField ("vm_id" , vm .ID ),
297+ v1 .LogField ("vm_status" , vm .Status ),
298+ )
299+ }
300+ }
273301 }
274- sshHostname = hostname
275- } else if len (node .VMs .Data ) == 0 {
276- sshHostname = ""
277- } else {
278- return nil , errors .WrapAndTrace (fmt .Errorf ("multiple VMs found for node %s" , node .ID ))
302+ return & sfcNodeInfo {
303+ id : node .ID ,
304+ name : node .Name ,
305+ createdAt : time .Unix (node .CreatedAt , 0 ),
306+ status : nodeStatus ,
307+ gpuType : string (node .GPUType ),
308+ sshUsername : defaultSSHUsername ,
309+ sshHostname : "" ,
310+ zone : zone ,
311+ }, nil
312+ }
313+
314+ sshHostname , err := c .sshHostnameFromVMs (ctx , node .VMs .Data )
315+ if err != nil {
316+ return nil , errors .WrapAndTrace (err )
279317 }
280318
281319 return & sfcNodeInfo {
282320 id : node .ID ,
283321 name : node .Name ,
284322 createdAt : time .Unix (node .CreatedAt , 0 ),
285- status : sfcStatusToLifecycleStatus ( fmt . Sprint ( node . Status )) ,
323+ status : nodeStatus ,
286324 gpuType : string (node .GPUType ),
287325 sshUsername : defaultSSHUsername ,
288326 sshHostname : sshHostname ,
@@ -339,6 +377,23 @@ func sfcListResponseNodeDataToNode(node *sfcnodes.ListResponseNodeData) *sfcnode
339377 }
340378}
341379
380+ // sfcStatusToLifecycleStatus maps SFC node-level statuses to Brev lifecycle statuses.
381+ //
382+ // SFC node statuses (from the nodes-go SDK):
383+ // - "pending" → node is being provisioned
384+ // - "awaitingcapacity" → node is waiting for capacity (auto-reserved nodes)
385+ // - "running" → node is active with a VM provisioned
386+ // - "released" → node was released via Nodes.Release(). This is a TERMINAL state.
387+ // VMs may still report "Running" underneath until their allotted time ends,
388+ // but the node lease is over. "released" means terminated, NOT terminating.
389+ // - "terminated" → node has been terminated
390+ // - "deleted" → node has been deleted
391+ // - "destroyed" → node has been destroyed
392+ // - "failed" → node provisioning or operation failed
393+ // - "unknown" → unknown status
394+ //
395+ // Note: SFC does NOT have a transitional "releasing" or "terminating" status.
396+ // The Release API transitions a node directly from "running" to "released".
342397func sfcStatusToLifecycleStatus (status string ) v1.LifecycleStatus {
343398 switch strings .ToLower (status ) {
344399 case "pending" , "unspecified" , "awaitingcapacity" , "unknown" :
@@ -358,9 +413,29 @@ func sfcStatusToLifecycleStatus(status string) v1.LifecycleStatus {
358413 }
359414}
360415
416+ // isTerminalNodeStatus returns true if the SFC node status is a terminal state where
417+ // the node is no longer active. When a node is terminal, its VM statuses should not be
418+ // inspected — they may be stale (e.g. VM still "Running" after the node was released).
419+ func isTerminalNodeStatus (status string ) bool {
420+ lifecycleStatus := sfcStatusToLifecycleStatus (status )
421+ return lifecycleStatus == v1 .LifecycleStatusTerminated || lifecycleStatus == v1 .LifecycleStatusFailed
422+ }
423+
424+ // sshHostnameFromVMs finds the first running VM and returns its SSH hostname.
425+ // Nodes can accumulate multiple VM records (e.g. awaitingcapacity nodes with previous
426+ // destroyed VMs); only a running VM provides usable SSH info.
427+ func (c * SFCClient ) sshHostnameFromVMs (ctx context.Context , vms []sfcnodes.NodeVMsData ) (string , error ) {
428+ for _ , vm := range vms {
429+ if strings .ToLower (vm .Status ) == vmStatusRunning {
430+ return c .getSSHHostnameFromVM (ctx , vm .ID , vm .Status )
431+ }
432+ }
433+ return "" , nil
434+ }
435+
361436func (c * SFCClient ) getSSHHostnameFromVM (ctx context.Context , vmID string , vmStatus string ) (string , error ) {
362437 // If the VM is not running, set the SSH username and hostname to empty strings
363- if strings .ToLower (vmStatus ) != "running" {
438+ if strings .ToLower (vmStatus ) != vmStatusRunning {
364439 return "" , nil
365440 }
366441
0 commit comments