diff --git a/src/tsc_now.rs b/src/tsc_now.rs index c4b9d13..892e168 100644 --- a/src/tsc_now.rs +++ b/src/tsc_now.rs @@ -132,7 +132,7 @@ fn has_invariant_tsc() -> bool { use core::arch::x86_64::__cpuid; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - unsafe { + { let cpuid_invariant_tsc_bts = 1 << 8; __cpuid(0x80000000).eax >= 0x80000007 && __cpuid(0x80000007).edx & cpuid_invariant_tsc_bts != 0 @@ -163,7 +163,7 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { let mut last_tsc; let mut old_cycles = 0.0; - loop { + 'outer: loop { let (t1, tsc1) = monotonic_with_tsc(); loop { let (t2, tsc2) = monotonic_with_tsc(); @@ -171,7 +171,14 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { last_tsc = tsc2; let elapsed_nanos = (t2 - t1).as_nanos(); if elapsed_nanos > 10_000_000 { - cycles_per_sec = (tsc2 - tsc1) as f64 * 1_000_000_000.0 / elapsed_nanos as f64; + // Even with fence added in monotonic_with_tsc(), tsc2 < tsc1 is still possible + // if the thread migrates to a different CPU core between samples + // (cores may have slightly different TSC offsets). checked_sub + // prevents overflow; we retry from the outer loop with fresh tsc1. + let Some(delta) = tsc2.checked_sub(tsc1) else { + continue 'outer; + }; + cycles_per_sec = delta as f64 * 1_000_000_000.0 / elapsed_nanos as f64; break; } } @@ -189,7 +196,23 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { /// get interrupted in half way may happen, they aren't guaranteed /// to represent the same instant. fn monotonic_with_tsc() -> (Instant, u64) { - (Instant::now(), tsc()) + let t = Instant::now(); + // RDTSC is not serializing; LFENCE ensures Instant::now() completes first. + #[cfg(target_feature = "sse2")] + { + #[cfg(target_arch = "x86")] + use std::arch::x86::_mm_lfence; + #[cfg(target_arch = "x86_64")] + use std::arch::x86_64::_mm_lfence; + unsafe { _mm_lfence() }; + } + #[cfg(not(target_feature = "sse2"))] + { + use std::sync::atomic::compiler_fence; + use std::sync::atomic::Ordering; + compiler_fence(Ordering::SeqCst); + } + (t, tsc()) } #[inline]