From 91ce7ecc66b68db28d3eba5b503cdbe5b939286a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20=C5=9Awi=C4=85tkowski?= Date: Fri, 17 Apr 2026 20:14:35 +0200 Subject: [PATCH] Improve timeouts handling to make sure Node processes are not hanging This improves how timeouts are handles by: - Removing the race condition from where timeout originates - now it's always coming from GenServer.call - Killing the process on timeout, so the next time the worker is checked out from the pool, it for sure has a fresh and working timeout - Replacing returning strings in error tuples with an atom --- lib/nodejs/supervisor.ex | 37 ++++++++++++++++++------------ lib/nodejs/worker.ex | 18 ++++++--------- test/nodejs_test.exs | 49 +++++++++++++++++++++++++++++++++------- 3 files changed, 71 insertions(+), 33 deletions(-) diff --git a/lib/nodejs/supervisor.ex b/lib/nodejs/supervisor.ex index a39d9b8..25716d2 100644 --- a/lib/nodejs/supervisor.ex +++ b/lib/nodejs/supervisor.ex @@ -34,20 +34,28 @@ defmodule NodeJS.Supervisor do timeout = Keyword.get(opts, :timeout, @timeout) esm = Keyword.get(opts, :esm, module |> elem(0) |> to_string |> String.ends_with?(".mjs")) - func = fn pid -> - try do - GenServer.call(pid, {module, args, [binary: binary, timeout: timeout, esm: esm]}, timeout) - catch - :exit, {:timeout, _} -> - {:error, "Call timed out."} - - :exit, error -> - {:error, {:node_js_worker_exit, error}} - end - end - pool_name = supervisor_pool(opts) - :poolboy.transaction(pool_name, func, timeout) + pid = :poolboy.checkout(pool_name, true, timeout) + + try do + result = GenServer.call(pid, {module, args, [binary: binary, esm: esm]}, timeout) + :poolboy.checkin(pool_name, pid) + result + catch + :exit, {:timeout, _} -> + # Kill the stuck worker — don't check it back in. The worker's Node.js + # runtime may be wedged (infinite loop, blocked event loop), and + # returning it to the pool would hand the next caller a broken runtime. + # Poolboy links to workers, so it sees the EXIT and spawns a fresh + # replacement on its own; checking in would race that cleanup and + # temporarily re-add a dead pid to the pool. + Process.exit(pid, :kill) + {:error, :timeout} + + :exit, error -> + :poolboy.checkin(pool_name, pid) + {:error, {:node_js_worker_exit, error}} + end end defp supervisor_name(opts) do @@ -69,7 +77,7 @@ defmodule NodeJS.Supervisor do run_in_transaction(module, args, opts) catch :exit, {:timeout, _} -> - {:error, "Call timed out."} + {:error, :timeout} end end @@ -78,6 +86,7 @@ defmodule NodeJS.Supervisor do |> call(args, opts) |> case do {:ok, result} -> result + {:error, :timeout} -> raise NodeJS.Error, message: "Call timed out." {:error, message} -> raise NodeJS.Error, message: message end end diff --git a/lib/nodejs/worker.ex b/lib/nodejs/worker.ex index 4a5b329..73a3566 100644 --- a/lib/nodejs/worker.ex +++ b/lib/nodejs/worker.ex @@ -70,14 +70,14 @@ defmodule NodeJS.Worker do ] end - defp get_response(data, timeout, port, expected_uid) do + defp get_response(data, port, expected_uid) do receive do {^port, {:data, {flag, chunk}}} -> data = data ++ chunk case flag do :noeol -> - get_response(data, timeout, port, expected_uid) + get_response(data, port, expected_uid) :eol -> case data do @@ -87,19 +87,16 @@ defmodule NodeJS.Worker do {:ok, response_data} {_stale_uid, _response_data} -> - # Response from a different (likely timed-out) request — discard it - get_response(~c"", timeout, port, expected_uid) + get_response(~c"", port, expected_uid) end _ -> - get_response(~c"", timeout, port, expected_uid) + get_response(~c"", port, expected_uid) end end {^port, {:exit_status, status}} when status != 0 -> {:error, {:exit, status}} - after - timeout -> {:error, :timeout} end end @@ -123,7 +120,6 @@ defmodule NodeJS.Worker do @doc false def handle_call({module, args, opts}, _from, %{port: port, uid_counter: uid_counter} = state) when is_tuple(module) do - timeout = Keyword.get(opts, :timeout) binary = Keyword.get(opts, :binary) esm = Keyword.get(opts, :esm, false) uid = Integer.to_string(uid_counter) @@ -132,7 +128,7 @@ defmodule NodeJS.Worker do state = %{state | uid_counter: uid_counter + 1} - case get_response(~c"", timeout, port, uid) do + case get_response(~c"", port, uid) do {:ok, response} -> decoded_response = response @@ -141,8 +137,8 @@ defmodule NodeJS.Worker do {:reply, decoded_response, state} - {:error, :timeout} -> - {:reply, {:error, :timeout}, state} + {:error, _} = error -> + {:reply, error, state} end end diff --git a/test/nodejs_test.exs b/test/nodejs_test.exs index a9d984d..82a9249 100644 --- a/test/nodejs_test.exs +++ b/test/nodejs_test.exs @@ -164,10 +164,10 @@ defmodule NodeJS.Test do assert_receive :received_timeout_3, 10 assert_receive :received_timeout_4, 10 - assert {:error, "Call timed out."} = Task.await(task1) - assert {:error, "Call timed out."} = Task.await(task2) - assert {:error, "Call timed out."} = Task.await(task3) - assert {:error, "Call timed out."} = Task.await(task4) + assert {:error, :timeout} = Task.await(task1) + assert {:error, :timeout} = Task.await(task2) + assert {:error, :timeout} = Task.await(task3) + assert {:error, :timeout} = Task.await(task4) # We should still get an answer here, before the timeout assert {:ok, 1115} = NodeJS.call("slow-async-echo", [1115, 1]) @@ -190,7 +190,7 @@ defmodule NodeJS.Test do # Send a call that takes 300ms but times out after 50ms. # After timeout, the worker is free but the Node.js response is still pending. - assert {:error, "Call timed out."} = + assert {:error, :timeout} = NodeJS.call("slow-async-echo", [9999, 300], timeout: 50, name: NodeJS.RaceTest @@ -207,9 +207,42 @@ defmodule NodeJS.Test do end end + describe "CPU-bound JS timing out" do + test "doesn't leave the worker pool permanently wedged" do + # A single-worker pool makes the failure mode unambiguous: if the timed-out + # call doesn't tear down the worker, the follow-up has nowhere to run. + path = __ENV__.file |> Path.dirname() |> Path.join("js") + + start_supervised!( + Supervisor.child_spec( + {NodeJS.Supervisor, path: path, name: NodeJS.WedgedTest, pool_size: 1}, + id: NodeJS.WedgedTest + ) + ) + + # block-event-loop busy-waits for 10s, long enough that a wedged Node + # runtime could not possibly service the follow-up call below in time. + # The call is aborted at 50ms; the worker must die so its Node process + # dies with it, freeing the pool for a fresh worker. + assert {:error, :timeout} = + NodeJS.call("block-event-loop", [10_000], + timeout: 50, + name: NodeJS.WedgedTest + ) + + # Must succeed on a freshly spawned worker. If it does not, it means it received + # broken runtime from poolboy. + assert {:ok, 42} = + NodeJS.call("slow-async-echo", [42, 1], + timeout: 2_000, + name: NodeJS.WedgedTest + ) + end + end + describe "overriding call timeout" do test "works, and you can tell because the slow function will time out" do - assert {:error, "Call timed out."} = NodeJS.call("slow-async-echo", [1111], timeout: 0) + assert {:error, :timeout} = NodeJS.call("slow-async-echo", [1111], timeout: 0) assert_raise NodeJS.Error, fn -> NodeJS.call!("slow-async-echo", [1111], timeout: 0) end assert {:ok, 1111} = NodeJS.call("slow-async-echo", [1111]) end @@ -223,9 +256,9 @@ defmodule NodeJS.Test do describe "Implementation details shouldn't leak:" do test "Timeouts do not send stray messages to calling process" do - assert {:error, "Call timed out."} = NodeJS.call("slow-async-echo", [1111], timeout: 0) + assert {:error, :timeout} = NodeJS.call("slow-async-echo", [1111], timeout: 0) - refute_receive {_ref, {:error, "Call timed out."}}, 50 + refute_receive {_ref, {:error, :timeout}}, 50 end test "Crashes do not bring down the calling process" do