From fa661d8208be6f2c9a42f601e43003d6bd26b787 Mon Sep 17 00:00:00 2001 From: Juan Pablo Pino Bravo Date: Fri, 26 Jun 2026 20:36:23 +0200 Subject: [PATCH 1/3] Add NvmeInfo to SystemPerformanceInfo Add an NvmeInfo message (NVMe SSD health, wear, thermal throttling and identity) and nest it in SystemPerformanceInfo (field 14). Jetson-only; i.MX drones have no NVMe device and leave it at defaults. Partition used/free is reported separately via DataStorageSpaceTel, so NvmeInfo carries only NVMe-specific state plus the physical drive capacity. Co-Authored-By: Claude Opus 4.8 (1M context) --- protobuf_definitions/message_formats.proto | 45 ++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/protobuf_definitions/message_formats.proto b/protobuf_definitions/message_formats.proto index 76ca9869..b940c84f 100644 --- a/protobuf_definitions/message_formats.proto +++ b/protobuf_definitions/message_formats.proto @@ -1497,6 +1497,50 @@ message VideoCodecInfo { float vic_load = 16; // Video Image Compositor (VIC) utilization (0..1). } +// NVMe SSD health, wear and identity (Jetson only; i.MX drones boot/store on +// an SD card and have no NVMe device, leaving this at its defaults). +// +// Read from the NVMe SMART/health log plus Identify Controller. Only the +// composite temperature is trustworthy — per-sensor values are vendor-specific +// and may be stuck at placeholder values, so they are not exposed. +message NvmeInfo { + bool present = 1; // True if an NVMe device was found and read. + + // Tier 1 — health & safety. + float composite_temperature_c = 2; // Composite drive temperature (°C). + uint32 critical_warning = 3; // NVMe critical-warning bitfield. + uint32 percentage_used = 4; // Wear indicator (%, may exceed 100). + uint32 available_spare = 5; // Remaining spare capacity (%). + uint32 available_spare_threshold = 6; // Spare alarm threshold (%). + uint64 media_errors = 7; // Uncorrected data-integrity errors. 
+ uint64 unsafe_shutdowns = 8; // Power-loss / unsafe shutdown count. + + // Tier 2 — thermal throttling. + uint32 warning_temp_time_min = 9; // Minutes above the warning temperature. + uint32 critical_temp_time_min = 10; // Minutes above the critical temperature. + uint32 thermal_mgmt_t1_trans_count = 11; // Times entered light throttling. + uint32 thermal_mgmt_t2_trans_count = 12; // Times entered heavy throttling. + uint32 thermal_mgmt_t1_total_time_s = 13; // Total seconds in light throttling. + uint32 thermal_mgmt_t2_total_time_s = 14; // Total seconds in heavy throttling. + + // Tier 3 — endurance / workload. + uint64 data_units_written_bytes = 15; // Host data written (B). + uint64 data_units_read_bytes = 16; // Host data read (B). + uint64 host_write_commands = 17; // Host write command count. + uint64 host_read_commands = 18; // Host read command count. + uint64 power_on_hours = 19; // Power-on hours. + uint64 power_cycles = 20; // Power cycle count. + uint64 controller_busy_time_min = 21; // Controller busy time (min). + uint64 num_err_log_entries = 22; // Error-log entry count. + + // Tier 4 — identity & capacity. + string model = 23; // Drive model. + string serial = 24; // Drive serial number. + string firmware_rev = 25; // Firmware revision. + uint64 capacity_bytes = 26; // Total drive capacity (B). Partition used/free is + // reported separately via DataStorageSpaceTel. +} + // System performance information. // // Comprehensive performance metrics for the drone's compute platform. @@ -1516,6 +1560,7 @@ message SystemPerformanceInfo { float camera_queue_load = 11; // Camera queue load (0..1). float overlay_queue_load = 12; // Overlay queue load (0..1). float position_observer_queue_load = 13; // Position observer queue load (0..1). + NvmeInfo nvme = 14; // NVMe SSD health/identity (Jetson only). } // Surface Unit battery information. 
From e66e428380e79465282228ca39908fecc4afd211 Mon Sep 17 00:00:00 2001 From: Juan Pablo Pino Bravo Date: Fri, 26 Jun 2026 20:53:45 +0200 Subject: [PATCH 2/3] Address review: rename composite_temperature_c; note NVMe in summary Drop the _c suffix (units stay in the comment, consistent with the other temperature fields) and add NVMe to the SystemPerformanceInfo summary. Co-Authored-By: Claude Opus 4.8 (1M context) --- protobuf_definitions/message_formats.proto | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/protobuf_definitions/message_formats.proto b/protobuf_definitions/message_formats.proto index b940c84f..a075a8c1 100644 --- a/protobuf_definitions/message_formats.proto +++ b/protobuf_definitions/message_formats.proto @@ -1507,7 +1507,7 @@ message NvmeInfo { bool present = 1; // True if an NVMe device was found and read. // Tier 1 — health & safety. - float composite_temperature_c = 2; // Composite drive temperature (°C). + float composite_temperature = 2; // Composite drive temperature (°C). uint32 critical_warning = 3; // NVMe critical-warning bitfield. uint32 percentage_used = 4; // Wear indicator (%, may exceed 100). uint32 available_spare = 5; // Remaining spare capacity (%). @@ -1544,8 +1544,9 @@ message NvmeInfo { // System performance information. // // Comprehensive performance metrics for the drone's compute platform. -// Covers CPU, GPU, DLA, memory, thermals, and video codec utilization. -// Fields not applicable to a platform are left at their zero/empty defaults. +// Covers CPU, GPU, DLA, memory, thermals, video codec utilization and NVMe +// SSD state. Fields not applicable to a platform are left at their +// zero/empty defaults. message SystemPerformanceInfo { repeated CpuCoreLoad cpu_cores = 1; // Per-core CPU utilization. float cpu_utilization = 2; // Mean CPU utilization across all cores (0..1). 
From c1476850b3c8a1fb55616f9b84a2dd4ebfa40153 Mon Sep 17 00:00:00 2001 From: Juan Pablo Pino Bravo Date: Mon, 29 Jun 2026 12:54:37 +0200 Subject: [PATCH 3/3] NvmeInfo: drop present field, renumber, trim comment Message presence (has_nvme()) already signals whether NVMe data is available, so the explicit present field is redundant -- remove it and renumber 1..25 (not merged yet, so no wire-compat concern). Also drop the composite-temperature trustworthiness note; it's an SSD-specific implementation detail. Co-Authored-By: Claude Opus 4.8 (1M context) --- protobuf_definitions/message_formats.proto | 58 ++++++++++------------ 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/protobuf_definitions/message_formats.proto b/protobuf_definitions/message_formats.proto index a075a8c1..2ed4b950 100644 --- a/protobuf_definitions/message_formats.proto +++ b/protobuf_definitions/message_formats.proto @@ -1498,46 +1498,40 @@ message VideoCodecInfo { } // NVMe SSD health, wear and identity (Jetson only; i.MX drones boot/store on -// an SD card and have no NVMe device, leaving this at its defaults). -// -// Read from the NVMe SMART/health log plus Identify Controller. Only the -// composite temperature is trustworthy — per-sensor values are vendor-specific -// and may be stuck at placeholder values, so they are not exposed. +// an SD card and have no NVMe device, so this message is omitted there). message NvmeInfo { - bool present = 1; // True if an NVMe device was found and read. - // Tier 1 — health & safety. - float composite_temperature = 2; // Composite drive temperature (°C). - uint32 critical_warning = 3; // NVMe critical-warning bitfield. - uint32 percentage_used = 4; // Wear indicator (%, may exceed 100). - uint32 available_spare = 5; // Remaining spare capacity (%). - uint32 available_spare_threshold = 6; // Spare alarm threshold (%). - uint64 media_errors = 7; // Uncorrected data-integrity errors. 
- uint64 unsafe_shutdowns = 8; // Power-loss / unsafe shutdown count. + float composite_temperature = 1; // Composite drive temperature (°C). + uint32 critical_warning = 2; // NVMe critical-warning bitfield. + uint32 percentage_used = 3; // Wear indicator (%, may exceed 100). + uint32 available_spare = 4; // Remaining spare capacity (%). + uint32 available_spare_threshold = 5; // Spare alarm threshold (%). + uint64 media_errors = 6; // Uncorrected data-integrity errors. + uint64 unsafe_shutdowns = 7; // Power-loss / unsafe shutdown count. // Tier 2 — thermal throttling. - uint32 warning_temp_time_min = 9; // Minutes above the warning temperature. - uint32 critical_temp_time_min = 10; // Minutes above the critical temperature. - uint32 thermal_mgmt_t1_trans_count = 11; // Times entered light throttling. - uint32 thermal_mgmt_t2_trans_count = 12; // Times entered heavy throttling. - uint32 thermal_mgmt_t1_total_time_s = 13; // Total seconds in light throttling. - uint32 thermal_mgmt_t2_total_time_s = 14; // Total seconds in heavy throttling. + uint32 warning_temp_time_min = 8; // Minutes above the warning temperature. + uint32 critical_temp_time_min = 9; // Minutes above the critical temperature. + uint32 thermal_mgmt_t1_trans_count = 10; // Times entered light throttling. + uint32 thermal_mgmt_t2_trans_count = 11; // Times entered heavy throttling. + uint32 thermal_mgmt_t1_total_time_s = 12; // Total seconds in light throttling. + uint32 thermal_mgmt_t2_total_time_s = 13; // Total seconds in heavy throttling. // Tier 3 — endurance / workload. - uint64 data_units_written_bytes = 15; // Host data written (B). - uint64 data_units_read_bytes = 16; // Host data read (B). - uint64 host_write_commands = 17; // Host write command count. - uint64 host_read_commands = 18; // Host read command count. - uint64 power_on_hours = 19; // Power-on hours. - uint64 power_cycles = 20; // Power cycle count. - uint64 controller_busy_time_min = 21; // Controller busy time (min). 
- uint64 num_err_log_entries = 22; // Error-log entry count. + uint64 data_units_written_bytes = 14; // Host data written (B). + uint64 data_units_read_bytes = 15; // Host data read (B). + uint64 host_write_commands = 16; // Host write command count. + uint64 host_read_commands = 17; // Host read command count. + uint64 power_on_hours = 18; // Power-on hours. + uint64 power_cycles = 19; // Power cycle count. + uint64 controller_busy_time_min = 20; // Controller busy time (min). + uint64 num_err_log_entries = 21; // Error-log entry count. // Tier 4 — identity & capacity. - string model = 23; // Drive model. - string serial = 24; // Drive serial number. - string firmware_rev = 25; // Firmware revision. - uint64 capacity_bytes = 26; // Total drive capacity (B). Partition used/free is + string model = 22; // Drive model. + string serial = 23; // Drive serial number. + string firmware_rev = 24; // Firmware revision. + uint64 capacity_bytes = 25; // Total drive capacity (B). Partition used/free is // reported separately via DataStorageSpaceTel. }