diff --git a/cmd/metrics/loader_legacy.go b/cmd/metrics/loader_legacy.go index 8fdcef1c..428a5517 100644 --- a/cmd/metrics/loader_legacy.go +++ b/cmd/metrics/loader_legacy.go @@ -184,6 +184,10 @@ func isCollectableEvent(event EventDefinition, metadata Metadata) bool { // - their corresponding device is not found // - not in system-wide collection scope if event.Device != "cpu" && event.Device != "" { + if !metadata.SupportsUncore { + slog.Debug("Uncore events not supported on target", slog.String("event", event.Name)) + return false + } if flagScope == scopeProcess || flagScope == scopeCgroup { slog.Debug("Uncore events not supported in process or cgroup scope", slog.String("event", event.Name)) return false diff --git a/cmd/metrics/metadata.go b/cmd/metrics/metadata.go index 87ac4461..31b3072b 100644 --- a/cmd/metrics/metadata.go +++ b/cmd/metrics/metadata.go @@ -48,6 +48,7 @@ const ( scriptKernelVersion = "kernel version" scriptARMSlots = "arm slots" scriptARMCPUID = "arm cpuid" + scriptPerfStatAMDUncoreProbe = "perf stat amd uncore probe" ) // CommonMetadata -- common to all architectures @@ -213,6 +214,13 @@ BEGIN { Architectures: []string{cpus.X86Architecture}, Depends: []string{"perf"}, }, + { + Name: scriptPerfStatAMDUncoreProbe, + ScriptTemplate: `perf stat -a -e "l3/event=0x4,umask=0xff,enallcores=0x1,enallslices=0x1,threadmask=0x3,name='l3_lookup_state.all_coherent_accesses_to_l3'/" sleep 1`, + Architectures: []string{cpus.X86Architecture}, + Vendors: []string{cpus.AMDVendor}, + Depends: []string{"perf"}, + }, { Name: scriptPerfStatFixedInstr, ScriptTemplate: "perf stat -a -e '{{{.InstructionsList}}}' sleep 1", diff --git a/cmd/metrics/metadata_x86.go b/cmd/metrics/metadata_x86.go index 6f6a108a..669d639b 100644 --- a/cmd/metrics/metadata_x86.go +++ b/cmd/metrics/metadata_x86.go @@ -149,6 +149,18 @@ func (c *X86MetadataCollector) CollectMetadata(t target.Target, noRoot bool, noS } metadata.SupportsUncore = c.checkUncoreSupport(metadata.UncoreDeviceIDs, 
isAMDArchitecture) + // On AMD, probe that L3 uncore actually works (e.g. GCP VMs expose l3 in sysfs but perf cannot use it). + // If the probe fails, disable uncore so collection uses core-only events and still produces metrics. + if isAMDArchitecture && metadata.SupportsUncore { + if output, ok := scriptOutputs[scriptPerfStatAMDUncoreProbe]; ok { + if !getSupportsAMDUncore(output) { + slog.Warn("AMD L3 uncore probe failed, disabling uncore metrics", slog.String("stderr", output.Stderr)) + metadata.SupportsUncore = false + removeUncoreDevices(metadata.UncoreDeviceIDs, "l3", "df") + } + } + } + return metadata, nil } @@ -229,6 +241,33 @@ func (c *X86MetadataCollector) checkUncoreSupport(uncoreDeviceIDs map[string][]i return false } +// getSupportsAMDUncore returns true if the AMD uncore probe script succeeded (L3 PMU is usable). +// On some VMs (e.g. GCP AMD Turin), sysfs lists amd_l3 but perf cannot use it. +func getSupportsAMDUncore(output script.ScriptOutput) bool { + if output.Exitcode != 0 { + return false + } + stderr := output.Stderr + if strings.Contains(stderr, "Unable to find PMU or event on a PMU of 'l3'") { + return false + } + if strings.Contains(stderr, "event syntax error") && strings.Contains(stderr, "l3") { + return false + } + // perf stat reports "<not supported>" instead of a count when the event cannot be counted. + if strings.Contains(stderr, "<not supported>") { + return false + } + return true +} + +// removeUncoreDevices removes the given device names from the map (used when uncore probe fails). +func removeUncoreDevices(uncoreDeviceIDs map[string][]int, deviceNames ...string) { + for _, name := range deviceNames { + delete(uncoreDeviceIDs, name) + } +} + // --- x86-specific helper functions --- // getUncoreDeviceIDs returns a map of device type to list of device indices.