From cd3f3a44b61effb7dcf779aa61693944f26decd4 Mon Sep 17 00:00:00 2001 From: Yunze Xu Date: Tue, 12 May 2026 20:57:45 +0800 Subject: [PATCH 1/3] fix: fence the tsc() call and prevent crash from _cycles_per_sec --- src/tsc_now.rs | 49 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 5 deletions(-) diff --git a/src/tsc_now.rs b/src/tsc_now.rs index c4b9d13..c120b28 100644 --- a/src/tsc_now.rs +++ b/src/tsc_now.rs @@ -2,6 +2,10 @@ //! This module will be compiled when it's either linux_x86 or linux_x86_64. +#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] +use core::sync::atomic::compiler_fence; +#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] +use core::sync::atomic::Ordering; use std::cell::UnsafeCell; use std::fs::read_to_string; use std::io::ErrorKind; @@ -163,7 +167,7 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { let mut last_tsc; let mut old_cycles = 0.0; - loop { + 'outer: loop { let (t1, tsc1) = monotonic_with_tsc(); loop { let (t2, tsc2) = monotonic_with_tsc(); @@ -171,7 +175,14 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { last_tsc = tsc2; let elapsed_nanos = (t2 - t1).as_nanos(); if elapsed_nanos > 10_000_000 { - cycles_per_sec = (tsc2 - tsc1) as f64 * 1_000_000_000.0 / elapsed_nanos as f64; + // Even with RDTSCP serialization, tsc2 < tsc1 is still possible + // if the thread migrates to a different CPU core between samples + // (cores may have slightly different TSC offsets). checked_sub + // prevents overflow; we retry from the outer loop with fresh tsc1. + let Some(delta) = tsc2.checked_sub(tsc1) else { + continue 'outer; + }; + cycles_per_sec = delta as f64 * 1_000_000_000.0 / elapsed_nanos as f64; break; } } @@ -192,12 +203,40 @@ fn monotonic_with_tsc() -> (Instant, u64) { (Instant::now(), tsc()) } +// The RDTSCP instruction waits until all previous instructions have been executed before reading +// the counter. 
However, subsequent instructions may begin execution before the read operation is +// performed. Therefore, we need to fence the instruction stream after the RDTSCP to ensure that no +// instructions are executed until the read operation is complete. On x86-64 and x86 with SSE2, we +// can use an LFENCE instruction for this purpose. On x86 without SSE2 there is no LFENCE, so we +// fall back to a compiler fence, which only stops the compiler (not the CPU) from reordering. #[inline] fn tsc() -> u64 { #[cfg(target_arch = "x86")] - use core::arch::x86::_rdtsc; + use core::arch::x86::__rdtscp; #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::_rdtsc; + use core::arch::x86_64::__rdtscp; - unsafe { _rdtsc() } + // Case 1: 64-bit (always has SSE2/lfence) OR 32-bit with SSE2 enabled + #[cfg(any(target_arch = "x86_64", target_feature = "sse2"))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::_mm_lfence; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::_mm_lfence; + let mut aux = 0u32; + unsafe { + let r = __rdtscp(&mut aux); + _mm_lfence(); + r + } + } + + // Case 2: 32-bit WITHOUT SSE2 enabled + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] + { + let mut aux = 0u32; + let r = unsafe { __rdtscp(&mut aux) }; + compiler_fence(Ordering::SeqCst); + r + } } From e96c833da2fb466e1c657a4a1c6f56c9938f3163 Mon Sep 17 00:00:00 2001 From: Yunze Xu Date: Tue, 12 May 2026 21:15:32 +0800 Subject: [PATCH 2/3] fix clippy check --- src/tsc_now.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tsc_now.rs b/src/tsc_now.rs index c120b28..4f76160 100644 --- a/src/tsc_now.rs +++ b/src/tsc_now.rs @@ -136,7 +136,7 @@ fn has_invariant_tsc() -> bool { use core::arch::x86_64::__cpuid; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { + { let cpuid_invariant_tsc_bts = 1 << 8; __cpuid(0x80000000).eax >= 0x80000007 && __cpuid(0x80000007).edx & cpuid_invariant_tsc_bts != 0 From efd6e91332b5a52c55a3e462c56fe34043c70a69 Mon Sep 17 00:00:00 2001 From: Yunze Xu Date: Tue, 12 May 2026 
21:16:29 +0800 Subject: [PATCH 3/3] use unused_unsafe instead --- src/tsc_now.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tsc_now.rs b/src/tsc_now.rs index 4f76160..f1b49b6 100644 --- a/src/tsc_now.rs +++ b/src/tsc_now.rs @@ -136,7 +136,8 @@ fn has_invariant_tsc() -> bool { use core::arch::x86_64::__cpuid; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { + #[allow(unused_unsafe)] + unsafe { let cpuid_invariant_tsc_bts = 1 << 8; __cpuid(0x80000000).eax >= 0x80000007 && __cpuid(0x80000007).edx & cpuid_invariant_tsc_bts != 0