diff --git a/src/tsc_now.rs b/src/tsc_now.rs index c4b9d13..f1b49b6 100644 --- a/src/tsc_now.rs +++ b/src/tsc_now.rs @@ -2,6 +2,10 @@ //! This module will be compiled when it's either linux_x86 or linux_x86_64. +#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] +use core::sync::atomic::compiler_fence; +#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] +use core::sync::atomic::Ordering; use std::cell::UnsafeCell; use std::fs::read_to_string; use std::io::ErrorKind; @@ -132,6 +136,7 @@ fn has_invariant_tsc() -> bool { use core::arch::x86_64::__cpuid; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[allow(unused_unsafe)] unsafe { let cpuid_invariant_tsc_bts = 1 << 8; __cpuid(0x80000000).eax >= 0x80000007 @@ -163,7 +168,7 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { let mut last_tsc; let mut old_cycles = 0.0; - loop { + 'outer: loop { let (t1, tsc1) = monotonic_with_tsc(); loop { let (t2, tsc2) = monotonic_with_tsc(); @@ -171,7 +176,14 @@ fn _cycles_per_sec() -> (u64, Instant, u64) { last_tsc = tsc2; let elapsed_nanos = (t2 - t1).as_nanos(); if elapsed_nanos > 10_000_000 { - cycles_per_sec = (tsc2 - tsc1) as f64 * 1_000_000_000.0 / elapsed_nanos as f64; + // Even with RDTSCP serialization, tsc2 < tsc1 is still possible + // if the thread migrates to a different CPU core between samples + // (cores may have slightly different TSC offsets). checked_sub + // prevents overflow; we retry from the outer loop with fresh tsc1. + let Some(delta) = tsc2.checked_sub(tsc1) else { + continue 'outer; + }; + cycles_per_sec = delta as f64 * 1_000_000_000.0 / elapsed_nanos as f64; break; } } @@ -192,12 +204,40 @@ fn monotonic_with_tsc() -> (Instant, u64) { (Instant::now(), tsc()) } +// The RDTSCP instruction waits until all previous instructions have been executed before reading +// the counter. However, subsequent instructions may begin execution before the read operation is +// performed. 
Therefore, we need to fence the instruction stream after the RDTSCP to ensure that no +// instructions are executed until the read operation is complete. On x86-64 and x86 with SSE2, we +// can use an LFENCE instruction for this purpose. On x86 without SSE2, LFENCE is unavailable, so +// we fall back to a compiler fence, which only stops the compiler (not the CPU) from reordering. #[inline] fn tsc() -> u64 { #[cfg(target_arch = "x86")] - use core::arch::x86::_rdtsc; + use core::arch::x86::__rdtscp; #[cfg(target_arch = "x86_64")] - use core::arch::x86_64::_rdtsc; + use core::arch::x86_64::__rdtscp; - unsafe { _rdtsc() } + // Case 1: 64-bit (always has SSE2/lfence) OR 32-bit with SSE2 enabled + #[cfg(any(target_arch = "x86_64", target_feature = "sse2"))] + { + #[cfg(target_arch = "x86")] + use core::arch::x86::_mm_lfence; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::_mm_lfence; + let mut aux = 0u32; + unsafe { + let r = __rdtscp(&mut aux); + _mm_lfence(); + r + } + } + + // Case 2: 32-bit WITHOUT SSE2 enabled + #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))] + { + let mut aux = 0u32; + let r = unsafe { __rdtscp(&mut aux) }; + compiler_fence(Ordering::SeqCst); + r + } }