diff --git a/src/tsc_now.rs b/src/tsc_now.rs
index c4b9d13..f1b49b6 100644
--- a/src/tsc_now.rs
+++ b/src/tsc_now.rs
@@ -2,6 +2,10 @@
 
 //! This module will be compiled when it's either linux_x86 or linux_x86_64.
 
+#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+use core::sync::atomic::compiler_fence;
+#[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+use core::sync::atomic::Ordering;
 use std::cell::UnsafeCell;
 use std::fs::read_to_string;
 use std::io::ErrorKind;
@@ -132,6 +136,7 @@ fn has_invariant_tsc() -> bool {
     use core::arch::x86_64::__cpuid;
 
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[allow(unused_unsafe)]
     unsafe {
         let cpuid_invariant_tsc_bts = 1 << 8;
         __cpuid(0x80000000).eax >= 0x80000007
@@ -163,7 +168,7 @@ fn _cycles_per_sec() -> (u64, Instant, u64) {
     let mut last_tsc;
     let mut old_cycles = 0.0;
 
-    loop {
+    'outer: loop {
         let (t1, tsc1) = monotonic_with_tsc();
         loop {
             let (t2, tsc2) = monotonic_with_tsc();
@@ -171,7 +176,14 @@ fn _cycles_per_sec() -> (u64, Instant, u64) {
             last_tsc = tsc2;
             let elapsed_nanos = (t2 - t1).as_nanos();
             if elapsed_nanos > 10_000_000 {
-                cycles_per_sec = (tsc2 - tsc1) as f64 * 1_000_000_000.0 / elapsed_nanos as f64;
+                // RDTSCP is not fully serializing, and even with its ordering guarantees
+                // tsc2 < tsc1 is still possible if the thread migrates to a different CPU core
+                // between samples (cores may have slightly different TSC offsets). checked_sub
+                // avoids a debug panic / wrapped delta; we retry the outer loop with fresh tsc1.
+                let Some(delta) = tsc2.checked_sub(tsc1) else {
+                    continue 'outer;
+                };
+                cycles_per_sec = delta as f64 * 1_000_000_000.0 / elapsed_nanos as f64;
                 break;
             }
         }
@@ -192,12 +204,40 @@ fn monotonic_with_tsc() -> (Instant, u64) {
     (Instant::now(), tsc())
 }
 
+// The RDTSCP instruction waits until all previous instructions have been executed before reading
+// the counter. However, subsequent instructions may begin execution before the read operation is
+// performed. On x86-64 and x86 with SSE2, we therefore issue an LFENCE after the RDTSCP so that no
+// later instruction starts until the read is complete. On x86 without SSE2, LFENCE is unavailable;
+// we fall back to a compiler fence, which emits no code: it only restricts reordering by the
+// compiler and does NOT stop the CPU from executing later instructions early (best effort).
 #[inline]
 fn tsc() -> u64 {
     #[cfg(target_arch = "x86")]
-    use core::arch::x86::_rdtsc;
+    use core::arch::x86::__rdtscp;
     #[cfg(target_arch = "x86_64")]
-    use core::arch::x86_64::_rdtsc;
+    use core::arch::x86_64::__rdtscp;
 
-    unsafe { _rdtsc() }
+    // Case 1: 64-bit (always has SSE2/lfence) OR 32-bit with SSE2 enabled
+    #[cfg(any(target_arch = "x86_64", target_feature = "sse2"))]
+    {
+        #[cfg(target_arch = "x86")]
+        use core::arch::x86::_mm_lfence;
+        #[cfg(target_arch = "x86_64")]
+        use core::arch::x86_64::_mm_lfence;
+        let mut aux = 0u32;
+        unsafe {
+            let r = __rdtscp(&mut aux);
+            _mm_lfence();
+            r
+        }
+    }
+
+    // Case 2: 32-bit WITHOUT SSE2 enabled
+    #[cfg(all(target_arch = "x86", not(target_feature = "sse2")))]
+    {
+        let mut aux = 0u32;
+        let r = unsafe { __rdtscp(&mut aux) };
+        compiler_fence(Ordering::SeqCst);
+        r
+    }
 }