diff --git a/thread.c b/thread.c
index e66352c03f6c4a..870c5177520f25 100644
--- a/thread.c
+++ b/thread.c
@@ -170,7 +170,7 @@ static inline void blocking_region_end(rb_thread_t *th, struct rb_blocking_regio
 #define THREAD_BLOCKING_BEGIN(th) do { \
   struct rb_thread_sched * const sched = TH_SCHED(th); \
   RB_VM_SAVE_MACHINE_CONTEXT(th); \
-  thread_sched_to_waiting((sched), (th));
+  thread_sched_to_waiting((sched), (th), true);
 
 #define THREAD_BLOCKING_END(th) \
   thread_sched_to_running((sched), (th)); \
@@ -194,7 +194,7 @@ static inline void blocking_region_end(rb_thread_t *th, struct rb_blocking_regio
         /* Important that this is inlined into the macro, and not part of \
          * blocking_region_begin - see bug #20493 */ \
         RB_VM_SAVE_MACHINE_CONTEXT(th); \
-        thread_sched_to_waiting(TH_SCHED(th), th); \
+        thread_sched_to_waiting(TH_SCHED(th), th, false); \
         exec; \
         blocking_region_end(th, &__region); \
     }; \
@@ -2092,7 +2092,7 @@ rb_thread_call_with_gvl(void *(*func)(void *), void *data1)
     int released = blocking_region_begin(th, brb, prev_unblock.func, prev_unblock.arg, FALSE);
     RUBY_ASSERT_ALWAYS(released);
     RB_VM_SAVE_MACHINE_CONTEXT(th);
-    thread_sched_to_waiting(TH_SCHED(th), th);
+    thread_sched_to_waiting(TH_SCHED(th), th, true);
 
     return r;
 }
diff --git a/thread_none.c b/thread_none.c
index e6616c05856ff9..6531038482ec41 100644
--- a/thread_none.c
+++ b/thread_none.c
@@ -26,11 +26,11 @@ thread_sched_to_running(struct rb_thread_sched *sched, rb_thread_t *th)
 }
 
 static void
-thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th)
+thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th, bool yield_immediately)
 {
 }
 
-#define thread_sched_to_dead thread_sched_to_waiting
+#define thread_sched_to_dead(a,b) thread_sched_to_waiting((a),(b),true)
 
 static void
 thread_sched_yield(struct rb_thread_sched *sched, rb_thread_t *th)
diff --git a/thread_pthread.c b/thread_pthread.c
index 9c7754067bcdf9..dd97826fe793fd 100644
--- a/thread_pthread.c
+++ b/thread_pthread.c
@@ -698,8 +698,12 @@ thread_sched_readyq_contain_p(struct rb_thread_sched *sched, rb_thread_t *th)
 {
     rb_thread_t *rth;
     ccan_list_for_each(&sched->readyq, rth, sched.node.readyq) {
-        if (rth == th) return true;
+        if (rth == th) {
+            VM_ASSERT(th->sched.node.is_ready);
+            return true;
+        }
     }
+    VM_ASSERT(!th->sched.node.is_ready);
     return false;
 }
 
@@ -720,6 +724,8 @@ thread_sched_deq(struct rb_thread_sched *sched)
     }
     else {
         next_th = ccan_list_pop(&sched->readyq, rb_thread_t, sched.node.readyq);
+        VM_ASSERT(next_th->sched.node.is_ready);
+        next_th->sched.node.is_ready = false;
 
         VM_ASSERT(sched->readyq_cnt > 0);
         sched->readyq_cnt--;
@@ -753,6 +759,7 @@ thread_sched_enq(struct rb_thread_sched *sched, rb_thread_t *ready_th)
     }
 
     ccan_list_add_tail(&sched->readyq, &ready_th->sched.node.readyq);
+    ready_th->sched.node.is_ready = true;
     sched->readyq_cnt++;
 }
 
@@ -836,6 +843,30 @@ thread_sched_wait_running_turn(struct rb_thread_sched *sched, rb_thread_t *th, b
     VM_ASSERT(th == rb_ec_thread_ptr(rb_current_ec_noinline()));
 
     if (th != sched->running) {
+        // TODO: This optimization should also be made to work for MN_THREADS
+        if (th->has_dedicated_nt && th == sched->runnable_hot_th && (sched->running == NULL || sched->running->has_dedicated_nt)) {
+            RUBY_DEBUG_LOG("(nt) stealing: hot-th:%u. running:%u", rb_th_serial(th), rb_th_serial(sched->running));
+
+            // If there is a thread set to run, move it back to the front of the readyq
+            if (sched->running != NULL) {
+                rb_thread_t *running = sched->running;
+                VM_ASSERT(!thread_sched_readyq_contain_p(sched, running));
+                running->sched.node.is_ready = true;
+                ccan_list_add(&sched->readyq, &running->sched.node.readyq);
+                sched->readyq_cnt++;
+            }
+
+            // Pull off the ready queue and start running.
+            if (th->sched.node.is_ready) {
+                VM_ASSERT(thread_sched_readyq_contain_p(sched, th));
+                ccan_list_del_init(&th->sched.node.readyq);
+                th->sched.node.is_ready = false;
+                sched->readyq_cnt--;
+            }
+            thread_sched_set_running(sched, th);
+            rb_ractor_thread_switch(th->ractor, th, false);
+        }
+
         // already deleted from running threads
         // VM_ASSERT(!ractor_sched_running_threads_contain_p(th->vm, th)); // need locking
 
@@ -852,6 +883,15 @@ thread_sched_wait_running_turn(struct rb_thread_sched *sched, rb_thread_t *th, b
         }
 
         thread_sched_set_locked(sched, th);
+        if (sched->runnable_hot_th != NULL) {
+            VM_ASSERT(sched->runnable_hot_th != th);
+            // Give the hot thread a chance to preempt, if it's actively spinning.
+            // On multicore, this reduces the rate of core-switching. On single-core it
+            // should mostly be a nop, since the other thread can't be concurrently spinning.
+            thread_sched_unlock(sched, th);
+            thread_sched_lock(sched, th);
+        }
+
         RUBY_DEBUG_LOG("(nt) wakeup %s", sched->running == th ? "success" : "failed");
         if (th == sched->running) {
             rb_ractor_thread_switch(th->ractor, th, false);
@@ -900,6 +940,10 @@ thread_sched_wait_running_turn(struct rb_thread_sched *sched, rb_thread_t *th, b
         thread_sched_add_running_thread(sched, th);
     }
 
+    // Control transfer to the current thread is now complete. The original thread
+    // cannot steal control at this point.
+    sched->runnable_hot_th = NULL;
+
     // VM_ASSERT(ractor_sched_running_threads_contain_p(th->vm, th)); need locking
     RB_INTERNAL_THREAD_HOOK(RUBY_INTERNAL_THREAD_EVENT_RESUMED, th);
 }
@@ -1000,13 +1044,16 @@ thread_sched_to_dead(struct rb_thread_sched *sched, rb_thread_t *th)
 //
 // This thread will run dedicated task (th->nt->dedicated++).
 static void
-thread_sched_to_waiting_common(struct rb_thread_sched *sched, rb_thread_t *th)
+thread_sched_to_waiting_common(struct rb_thread_sched *sched, rb_thread_t *th, bool yield_immediately)
 {
     RUBY_DEBUG_LOG("th:%u DNT:%d", rb_th_serial(th), th->nt->dedicated);
     RB_INTERNAL_THREAD_HOOK(RUBY_INTERNAL_THREAD_EVENT_SUSPENDED, th);
 
     native_thread_dedicated_inc(th->vm, th->ractor, th->nt);
 
+    if (!yield_immediately) {
+        sched->runnable_hot_th = th;
+    }
     thread_sched_wakeup_next_thread(sched, th, false);
 }
 
@@ -1014,11 +1061,11 @@ thread_sched_to_waiting_common(struct rb_thread_sched *sched, rb_thread_t *th)
 //
 // This thread will run a dedicated task.
 static void
-thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th)
+thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th, bool yield_immediately)
 {
     thread_sched_lock(sched, th);
     {
-        thread_sched_to_waiting_common(sched, th);
+        thread_sched_to_waiting_common(sched, th, yield_immediately);
     }
     thread_sched_unlock(sched, th);
 }
diff --git a/thread_pthread.h b/thread_pthread.h
index 992e9fb0803d67..3d9a278ac85aa8 100644
--- a/thread_pthread.h
+++ b/thread_pthread.h
@@ -56,6 +56,9 @@ struct rb_thread_sched_item {
         // connected to ractor->threads.sched.reqdyq
         // locked by ractor->threads.sched.lock
         struct ccan_list_node readyq;
+        // Indicates whether thread is on the readyq.
+        // There is no clear relationship between this and th->status.
+        bool is_ready;
 
         // connected to vm->ractor.sched.timeslice_threads
         // locked by vm->ractor.sched.lock
@@ -127,6 +130,10 @@ struct rb_thread_sched {
     struct rb_thread_struct *lock_owner;
 #endif
     struct rb_thread_struct *running; // running thread or NULL
+    // Most recently running thread or NULL. If this thread wakes up before the newly running
+    // thread completes the transfer of control, it can interrupt and resume running.
+    // The new thread clears this field when it takes control.
+    struct rb_thread_struct *runnable_hot_th;
     bool is_running;
     bool is_running_timeslice;
     bool enable_mn_threads;
diff --git a/thread_win32.c b/thread_win32.c
index 3fc763924846bd..635f0ea76f6c88 100644
--- a/thread_win32.c
+++ b/thread_win32.c
@@ -132,18 +132,22 @@ thread_sched_to_running(struct rb_thread_sched *sched, rb_thread_t *th)
     if (GVL_DEBUG) fprintf(stderr, "gvl acquire (%p): acquire\n", th);
 }
 
-#define thread_sched_to_dead thread_sched_to_waiting
-
 static void
-thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th)
+thread_sched_to_waiting(struct rb_thread_sched *sched, rb_thread_t *th, bool yield_immediately)
 {
     ReleaseMutex(sched->lock);
 }
 
+static void
+thread_sched_to_dead(struct rb_thread_sched *sched, rb_thread_t *th)
+{
+    thread_sched_to_waiting(sched, th, true);
+}
+
 static void
 thread_sched_yield(struct rb_thread_sched *sched, rb_thread_t *th)
 {
-    thread_sched_to_waiting(sched, th);
+    thread_sched_to_waiting(sched, th, true);
     native_thread_yield();
     thread_sched_to_running(sched, th);
 }
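Not part of the patch: below is a minimal, self-contained C sketch of the `runnable_hot_th` hand-off that the added comments describe, for readers following the scheduler change. All `toy_*` names, the array-backed ready queue, and the `main()` scenario are invented for illustration; the real patch additionally gates the steal on `th->has_dedicated_nt`, runs under the scheduler lock, and leaves MN threads as a TODO.

```c
/* Toy model of the "hot thread" hand-off: a thread that releases the running
 * slot for a blocking region it expects to be short is remembered as
 * runnable_hot_th; if it comes back before the chosen successor has completed
 * the transfer of control, it steals the slot back and pushes the displaced
 * thread to the front of the ready queue. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define READYQ_MAX 8

struct toy_thread {
    const char *name;
    bool is_ready;                       /* mirrors sched.node.is_ready */
};

struct toy_sched {
    struct toy_thread *running;          /* thread that owns the slot, or NULL */
    struct toy_thread *runnable_hot_th;  /* recent releaser that may steal back */
    struct toy_thread *readyq[READYQ_MAX];
    int readyq_cnt;
};

/* Push to the *front* of the ready queue (what the steal path does to the
 * displaced thread, so it keeps its priority). */
static void readyq_push_front(struct toy_sched *s, struct toy_thread *th)
{
    assert(s->readyq_cnt < READYQ_MAX && !th->is_ready);
    for (int i = s->readyq_cnt; i > 0; i--) s->readyq[i] = s->readyq[i - 1];
    s->readyq[0] = th;
    th->is_ready = true;
    s->readyq_cnt++;
}

/* Pop from the front (normal dequeue order). */
static struct toy_thread *readyq_pop_front(struct toy_sched *s)
{
    if (s->readyq_cnt == 0) return NULL;
    struct toy_thread *th = s->readyq[0];
    for (int i = 1; i < s->readyq_cnt; i++) s->readyq[i - 1] = s->readyq[i];
    s->readyq_cnt--;
    th->is_ready = false;
    return th;
}

/* Release the running slot. A short ("hot") release records the thread so it
 * may try to steal the slot back before the hand-off completes. */
static void toy_sched_to_waiting(struct toy_sched *s, struct toy_thread *th,
                                 bool yield_immediately)
{
    assert(s->running == th);
    if (!yield_immediately) s->runnable_hot_th = th;
    s->running = readyq_pop_front(s);    /* simplified: pick successor now */
}

/* Called when `th` wants the slot back. If it is still the hot thread and the
 * hand-off has not completed, it displaces the chosen successor. */
static void toy_sched_wait_running_turn(struct toy_sched *s, struct toy_thread *th)
{
    if (th != s->running && th == s->runnable_hot_th) {
        if (s->running) readyq_push_front(s, s->running);
        s->running = th;
    }
    /* Transfer of control is complete; nobody can steal from `th` now. */
    s->runnable_hot_th = NULL;
}

int main(void)
{
    struct toy_thread a = { "A", false }, b = { "B", false };
    struct toy_sched s = { .running = &a };
    readyq_push_front(&s, &b);

    /* A enters a short blocking region: it stays "hot", B is chosen to run. */
    toy_sched_to_waiting(&s, &a, /* yield_immediately = */ false);
    printf("chosen to run: %s\n", s.running->name);            /* B */

    /* A returns before B finished taking over and steals the slot back. */
    toy_sched_wait_running_turn(&s, &a);
    printf("actually running: %s, front of readyq: %s\n",
           s.running->name, s.readyq[0]->name);                /* A, B */
    return 0;
}
```

The design point mirrored here is that stealing is only possible while `sched->runnable_hot_th` is still set; once the incoming thread finishes the transfer of control it clears the field, so the hot thread can no longer displace it.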