-
Notifications
You must be signed in to change notification settings - Fork 1
Distributed ux polish #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
1f43762
1b3c951
9373de9
f0ab68e
92b9e22
ee34a52
7a9d89f
44e7d98
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -394,6 +394,23 @@ type SystemBackend struct { | |||||
| Metadata *BackendMetadata | ||||||
| UpgradeAvailable bool `json:"upgrade_available,omitempty"` | ||||||
| AvailableVersion string `json:"available_version,omitempty"` | ||||||
| // Nodes holds per-node attribution in distributed mode. Empty in single-node. | ||||||
| // Each entry describes a node that has this backend installed, with the | ||||||
| // version/digest it reports. Lets the UI surface drift and per-node status. | ||||||
| Nodes []NodeBackendRef `json:"nodes,omitempty"` | ||||||
| } | ||||||
|
|
||||||
| // NodeBackendRef describes one node's view of an installed backend. Used both | ||||||
| // for per-node attribution in the UI and for drift detection during upgrade | ||||||
| // checks (a cluster with mismatched versions/digests is flagged upgradeable). | ||||||
| type NodeBackendRef struct { | ||||||
| NodeID string `json:"node_id"` | ||||||
| NodeName string `json:"node_name"` | ||||||
| NodeStatus string `json:"node_status"` // healthy | unhealthy | offline | draining | pending | ||||||
| Version string `json:"version,omitempty"` | ||||||
| Digest string `json:"digest,omitempty"` | ||||||
| URI string `json:"uri,omitempty"` | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Exposing raw backend URI values in the JSON response can leak internal filesystem paths or registry locations, so omit this field from the API model or redact it before serialization.
Suggested change
Prompt for AI assistanceCopy the prompt below and paste it into ChatGPT, Claude, or any LLM: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. URI is added to per-node API output and metadata without any redaction, so registry credentials or internal endpoints embedded in backend URIs should be stripped before serialization. Suggested fix // Store only a sanitized URI here to avoid exposing credentials or internal-only endpoints.
URI string `json:"uri,omitempty"`Prompt for AI assistanceCopy the prompt below and paste it into ChatGPT, Claude, or any LLM: |
||||||
| InstalledAt string `json:"installed_at,omitempty"` | ||||||
| } | ||||||
|
|
||||||
| type SystemBackends map[string]SystemBackend | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,20 +23,43 @@ type UpgradeInfo struct { | |
| AvailableVersion string `json:"available_version"` | ||
| InstalledDigest string `json:"installed_digest,omitempty"` | ||
| AvailableDigest string `json:"available_digest,omitempty"` | ||
| // NodeDrift lists nodes whose installed version or digest differs from | ||
| // the cluster majority. Non-empty means the cluster has diverged and an | ||
| // upgrade will realign it. Empty in single-node mode. | ||
| NodeDrift []NodeDriftInfo `json:"node_drift,omitempty"` | ||
| } | ||
|
|
||
| // CheckBackendUpgrades compares installed backends against gallery entries | ||
| // and returns a map of backend names to UpgradeInfo for those that have | ||
| // newer versions or different OCI digests available. | ||
| // NodeDriftInfo describes one node that disagrees with the cluster majority | ||
| // on which version/digest of a backend is installed. | ||
| type NodeDriftInfo struct { | ||
| NodeID string `json:"node_id"` | ||
| NodeName string `json:"node_name"` | ||
| Version string `json:"version,omitempty"` | ||
| Digest string `json:"digest,omitempty"` | ||
| } | ||
|
Comment on lines
+26
to
+39
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NodeDrift exposes internal node IDs, names, versions, and digests to any caller of this API, so redact these fields or gate them behind an authorization check before serializing. Suggested fix // NodeDrift lists nodes whose installed version or digest differs from
// the cluster majority. Populate this only for privileged callers.
NodeDrift []NodeDriftInfo `json:"node_drift,omitempty"`
}
// NodeDriftInfo describes one node that disagrees with the cluster majority
// on which version/digest of a backend is installed.
type NodeDriftInfo struct {
NodeName string `json:"node_name,omitempty"`
}Prompt for AI assistanceCopy the prompt below and paste it into ChatGPT, Claude, or any LLM: |
||
|
|
||
| // CheckBackendUpgrades is the single-node entrypoint. Distributed callers use | ||
| // CheckUpgradesAgainst directly with their aggregated SystemBackends. | ||
| func CheckBackendUpgrades(ctx context.Context, galleries []config.Gallery, systemState *system.SystemState) (map[string]UpgradeInfo, error) { | ||
| galleryBackends, err := AvailableBackends(galleries, systemState) | ||
| installed, err := ListSystemBackends(systemState) | ||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to list available backends: %w", err) | ||
| return nil, fmt.Errorf("failed to list installed backends: %w", err) | ||
| } | ||
| return CheckUpgradesAgainst(ctx, galleries, systemState, installed) | ||
| } | ||
|
|
||
| installedBackends, err := ListSystemBackends(systemState) | ||
| // CheckUpgradesAgainst compares a caller-supplied SystemBackends set against | ||
| // the gallery. Fixes the distributed-mode bug where the old code passed the | ||
| // frontend's (empty) local filesystem through ListSystemBackends and so never | ||
| // surfaced any upgrades. | ||
| // | ||
| // Cluster drift policy: if a backend's per-node versions/digests disagree, the | ||
| // row is flagged upgradeable regardless of whether any node matches the gallery | ||
| // — next Upgrade All realigns the cluster. NodeDrift lists the outliers. | ||
| func CheckUpgradesAgainst(ctx context.Context, galleries []config.Gallery, systemState *system.SystemState, installedBackends SystemBackends) (map[string]UpgradeInfo, error) { | ||
| galleryBackends, err := AvailableBackends(galleries, systemState) | ||
|
Comment on lines
+59
to
+60
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The new ctx parameter is ignored, so upgrade checks can block indefinitely on gallery resolution or remote digest fetches; thread the context into the called functions or remove it until cancellation is supported. Suggested fixfunc CheckUpgradesAgainst(ctx context.Context, galleries []config.Gallery, systemState *system.SystemState, installedBackends SystemBackends) (map[string]UpgradeInfo, error) {
_ = ctx
galleryBackends, err := AvailableBackends(galleries, systemState)Prompt for AI assistanceCopy the prompt below and paste it into ChatGPT, Claude, or any LLM: |
||
| if err != nil { | ||
| return nil, fmt.Errorf("failed to list installed backends: %w", err) | ||
| return nil, fmt.Errorf("failed to list available backends: %w", err) | ||
| } | ||
|
|
||
| result := make(map[string]UpgradeInfo) | ||
|
|
@@ -57,56 +80,117 @@ func CheckBackendUpgrades(ctx context.Context, galleries []config.Gallery, syste | |
| } | ||
|
|
||
| installedVersion := installed.Metadata.Version | ||
| installedDigest := installed.Metadata.Digest | ||
| galleryVersion := galleryEntry.Version | ||
|
|
||
| // If both sides have versions, compare them | ||
| // Detect cluster drift: does every node report the same version+digest? | ||
| // In single-node mode this stays empty (Nodes is nil). | ||
| majority, drift := summarizeNodeDrift(installed.Nodes) | ||
| if majority.version != "" { | ||
| installedVersion = majority.version | ||
| } | ||
| if majority.digest != "" { | ||
| installedDigest = majority.digest | ||
| } | ||
|
|
||
| makeInfo := func(info UpgradeInfo) UpgradeInfo { | ||
| info.NodeDrift = drift | ||
| return info | ||
| } | ||
|
|
||
| // If versions are available on both sides, they're the source of truth. | ||
| if galleryVersion != "" && installedVersion != "" { | ||
| if galleryVersion != installedVersion { | ||
| result[installed.Metadata.Name] = UpgradeInfo{ | ||
| if galleryVersion != installedVersion || len(drift) > 0 { | ||
| result[installed.Metadata.Name] = makeInfo(UpgradeInfo{ | ||
| BackendName: installed.Metadata.Name, | ||
| InstalledVersion: installedVersion, | ||
| AvailableVersion: galleryVersion, | ||
| } | ||
| }) | ||
| } | ||
| // Versions match — no upgrade needed | ||
| continue | ||
| } | ||
|
|
||
| // Gallery has a version but installed doesn't — this happens for backends | ||
| // installed before version tracking was added. Flag as upgradeable so | ||
| // users can re-install to pick up version metadata. | ||
| // Gallery has a version but installed doesn't — backends installed before | ||
| // version tracking was added. Flag as upgradeable to pick up metadata. | ||
| if galleryVersion != "" && installedVersion == "" { | ||
| result[installed.Metadata.Name] = UpgradeInfo{ | ||
| result[installed.Metadata.Name] = makeInfo(UpgradeInfo{ | ||
| BackendName: installed.Metadata.Name, | ||
| InstalledVersion: "", | ||
| AvailableVersion: galleryVersion, | ||
| } | ||
| }) | ||
| continue | ||
| } | ||
|
|
||
| // Fall back to OCI digest comparison when versions are unavailable | ||
| // Fall back to OCI digest comparison when versions are unavailable. | ||
| if downloader.URI(galleryEntry.URI).LooksLikeOCI() { | ||
| remoteDigest, err := oci.GetImageDigest(galleryEntry.URI, "", nil, nil) | ||
|
Comment on lines
125
to
126
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The ctx parameter is ignored during remote digest checks, so pass it into the OCI lookup or check ctx.Done before each network call. Suggested fix if downloader.URI(galleryEntry.URI).LooksLikeOCI() {
select {
case <-ctx.Done():
return nil, ctx.Err()
default:
}
remoteDigest, err := oci.GetImageDigest(galleryEntry.URI, "", nil, nil)Prompt for AI assistanceCopy the prompt below and paste it into ChatGPT, Claude, or any LLM: |
||
| if err != nil { | ||
| xlog.Warn("Failed to get remote OCI digest for upgrade check", "backend", installed.Metadata.Name, "error", err) | ||
| continue | ||
| } | ||
| // If we have a stored digest, compare; otherwise any remote digest | ||
| // means we can't confirm we're up to date — flag as upgradeable | ||
| if installed.Metadata.Digest == "" || remoteDigest != installed.Metadata.Digest { | ||
| result[installed.Metadata.Name] = UpgradeInfo{ | ||
| // means we can't confirm we're up to date — flag as upgradeable. | ||
| if installedDigest == "" || remoteDigest != installedDigest || len(drift) > 0 { | ||
| result[installed.Metadata.Name] = makeInfo(UpgradeInfo{ | ||
| BackendName: installed.Metadata.Name, | ||
| InstalledDigest: installed.Metadata.Digest, | ||
| InstalledDigest: installedDigest, | ||
| AvailableDigest: remoteDigest, | ||
| } | ||
| }) | ||
| } | ||
| } else if len(drift) > 0 { | ||
| // No version/digest path but nodes disagree — still worth flagging. | ||
| result[installed.Metadata.Name] = makeInfo(UpgradeInfo{ | ||
| BackendName: installed.Metadata.Name, | ||
| InstalledVersion: installedVersion, | ||
| InstalledDigest: installedDigest, | ||
| }) | ||
| } | ||
| // No version info and non-OCI URI — cannot determine, skip | ||
| } | ||
|
|
||
| return result, nil | ||
| } | ||
|
|
||
| // summarizeNodeDrift collapses per-node version/digest tuples to a majority | ||
| // pair and returns the outliers. In single-node mode (empty nodes slice) this | ||
| // returns zero values and a nil drift list. | ||
| func summarizeNodeDrift(nodes []NodeBackendRef) (majority struct{ version, digest string }, drift []NodeDriftInfo) { | ||
| if len(nodes) == 0 { | ||
| return majority, nil | ||
| } | ||
|
|
||
| type key struct{ version, digest string } | ||
| counts := map[key]int{} | ||
| var topKey key | ||
| var topCount int | ||
| for _, n := range nodes { | ||
| k := key{n.Version, n.Digest} | ||
| counts[k]++ | ||
| if counts[k] > topCount { | ||
| topCount = counts[k] | ||
| topKey = k | ||
| } | ||
| } | ||
|
|
||
| majority.version = topKey.version | ||
| majority.digest = topKey.digest | ||
|
|
||
| if len(counts) == 1 { | ||
| return majority, nil // unanimous — no drift | ||
| } | ||
| for _, n := range nodes { | ||
| if n.Version == majority.version && n.Digest == majority.digest { | ||
| continue | ||
| } | ||
| drift = append(drift, NodeDriftInfo{ | ||
| NodeID: n.NodeID, | ||
| NodeName: n.NodeName, | ||
| Version: n.Version, | ||
| Digest: n.Digest, | ||
| }) | ||
| } | ||
| return majority, drift | ||
| } | ||
|
|
||
| // UpgradeBackend upgrades a single backend to the latest gallery version using | ||
| // an atomic swap with backup-based rollback on failure. | ||
| func UpgradeBackend(ctx context.Context, systemState *system.SystemState, modelLoader *model.ModelLoader, galleries []config.Gallery, backendName string, downloadStatus func(string, string, string, float64)) error { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ListBackendsfan-out typically take longer than 10 s on any real cluster. RunningCheckUpgradesbefore workers are visible means the upgrade banner silently shows nothing until the next scheduled check (6 h later). The old 30 s value was closer to the right order of magnitude.