Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 45 additions & 5 deletions src/tsc_now.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

//! This module will be compiled when it's either linux_x86 or linux_x86_64.

#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
use core::sync::atomic::compiler_fence;
#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
use core::sync::atomic::Ordering;
use std::cell::UnsafeCell;
use std::fs::read_to_string;
use std::io::ErrorKind;
Expand Down Expand Up @@ -132,6 +136,7 @@ fn has_invariant_tsc() -> bool {
use core::arch::x86_64::__cpuid;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[allow(unused_unsafe)]
unsafe {
let cpuid_invariant_tsc_bts = 1 << 8;
__cpuid(0x80000000).eax >= 0x80000007
Expand Down Expand Up @@ -163,15 +168,22 @@ fn _cycles_per_sec() -> (u64, Instant, u64) {
let mut last_tsc;
let mut old_cycles = 0.0;

loop {
'outer: loop {
let (t1, tsc1) = monotonic_with_tsc();
loop {
let (t2, tsc2) = monotonic_with_tsc();
last_monotonic = t2;
last_tsc = tsc2;
let elapsed_nanos = (t2 - t1).as_nanos();
if elapsed_nanos > 10_000_000 {
cycles_per_sec = (tsc2 - tsc1) as f64 * 1_000_000_000.0 / elapsed_nanos as f64;
// Even with RDTSCP serialization, tsc2 < tsc1 is still possible
// if the thread migrates to a different CPU core between samples
// (cores may have slightly different TSC offsets). checked_sub
// prevents overflow; we retry from the outer loop with fresh tsc1.
let Some(delta) = tsc2.checked_sub(tsc1) else {
continue 'outer;
};
cycles_per_sec = delta as f64 * 1_000_000_000.0 / elapsed_nanos as f64;
break;
}
}
Expand All @@ -192,12 +204,40 @@ fn monotonic_with_tsc() -> (Instant, u64) {
(Instant::now(), tsc())
}

// The RDTSCP instruction waits until all previous instructions have been executed before reading
// the counter. However, subsequent instructions may begin execution before the read operation is
// performed. Therefore, we fence the instruction stream after the RDTSCP so that no later
// instruction starts until the read operation is complete. On x86-64 and on x86 with SSE2, we use
// an LFENCE instruction for this purpose. On x86 without SSE2, LFENCE is not available, so we fall
// back to a compiler fence; note that this only constrains reordering by the compiler, not by the
// CPU, so it is a best-effort substitute rather than an equivalent of LFENCE.
/// Reads the CPU timestamp counter and returns the raw 64-bit value.
///
/// The read is followed by a fence whose kind is selected at compile time:
/// `_mm_lfence` on x86-64 or on x86 with the `sse2` target feature, and
/// `compiler_fence(Ordering::SeqCst)` on x86 without `sse2`.
#[inline]
fn tsc() -> u64 {
// NOTE(review): this listing comes from a diff view, so the `_rdtsc` imports and
// the bare `unsafe { _rdtsc() }` below appear to be the pre-change lines shown
// next to their `__rdtscp` replacements — confirm against the actual file.
#[cfg(target_arch = "x86")]
use core::arch::x86::_rdtsc;
use core::arch::x86::__rdtscp;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_rdtsc;
use core::arch::x86_64::__rdtscp;

unsafe { _rdtsc() }
// Case 1: 64-bit (always has SSE2/lfence) OR 32-bit with SSE2 enabled
#[cfg(any(target_arch = "x86_64", target_feature = "sse2"))]
{
// Pick the `_mm_lfence` intrinsic from the module matching the target architecture.
#[cfg(target_arch = "x86")]
use core::arch::x86::_mm_lfence;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::_mm_lfence;
// `aux` is written by `__rdtscp` through the `&mut` reference but is never read here.
let mut aux = 0u32;
// NOTE(review): presumably sound only if the CPU supports RDTSCP; no such
// check is visible in this block — confirm where it is established.
unsafe {
let r = __rdtscp(&mut aux);
// Fence issued after the counter read and before `r` is returned.
_mm_lfence();
r
}
}

// Case 2: 32-bit WITHOUT SSE2 enabled
#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
{
// `aux` is written by `__rdtscp` through the `&mut` reference but is never read here.
let mut aux = 0u32;
// NOTE(review): presumably sound only if the CPU supports RDTSCP — confirm.
let r = unsafe { __rdtscp(&mut aux) };
// No `_mm_lfence` on this configuration; a compiler fence is placed after the
// counter read instead.
compiler_fence(Ordering::SeqCst);
r
}
}
Loading