From 3302c9b0a8605835ad537b53b4f725b4484f39a6 Mon Sep 17 00:00:00 2001 From: Julian Brost Date: Thu, 30 Apr 2026 13:16:42 +0200 Subject: [PATCH] Fix that NewClientHandler() could hang indefinitely, preventing new connection attempts There is some race condition when the `async_write()`/`async_flush()` operation for the `icinga::Hello` message fails (connection reset by peer for example) around the same time the connect timeout fires and calls `cancel()` on the stream, the following call to `async_shutdown()` may block indefinitely. If that happens, the endpoint remains in the connecting state and no new connection attemps are initiated. This commit fixes the issue by removing the `Defer` containing the `async_shutdown()`. The purpose of `async_shutdown()` is to signal a clean termination of the connection to the peer, which really isn't something that makes sense to to in a `Defer` block that is also executed in case of errors. For the one situation where doing a clean TLS shutdown makes some sense (closing anonymous client connections), a call to GracefulShutdown() is added to that specific code path. A large part of the change is just changing the indentation of the code, given that a now unnecessary `try`/`catch` block is removed. The following Go code creates a TLS server that can be used to demonstrate the issue. Note that given that a race condition is involved, this is not reliable and the sleep duration may need some fine-tuning. For this to work, `ApiListener.tls_handshake_timeout` needs to be set to a large-enough value like 60s to disable the timeout for `async_handshake()` itself so that the overall connect timeout is the one that fires. However, changing the timeout is not a prerequisite for the problem, it just makes it easier to reproduce. The error can also happen with the default timeouts if the TCP connect takes long enough so that the handshake is started late enough that its timeout expires after the connect timeout. package main import ( "crypto/tls" "log" "net" "time" ) func main() { cert, err := tls.LoadX509KeyPair("bad-agent.crt", "bad-agent.key") if err != nil { panic(err) } listener, err := tls.Listen("tcp", ":1337", &tls.Config{ Certificates: []tls.Certificate{cert}, }) if err != nil { panic(err) } log.Println("Listening on", listener.Addr()) for { conn, err := listener.Accept() if err != nil { panic(err) } go handle(conn.(*tls.Conn)) } } func handle(conn *tls.Conn) { addr := conn.RemoteAddr().String() log.Println(addr, "new connection") time.Sleep(15*time.Second - 10*time.Millisecond) log.Println(addr, "SetLinger(0)", conn.NetConn().(*net.TCPConn).SetLinger(0)) log.Println(addr, "Handshake()", conn.Handshake()) log.Println(addr, "conn.NetConn().Close()", conn.NetConn().Close()) } With additional logging in the `catch` block for `boost::system::system_error` and `Defer shutdownSslConn` (both removed by this commit), this showed the following. Note that in particular, `async_shutdown()` never returned, indicating that it hangs in there. [2026-04-24 17:32:56 +0200] information/ApiListener: Reconnecting to endpoint 'bad-agent' via host 'host.docker.internal' and port '1337' [2026-04-24 17:33:11 +0200] critical/ApiListener: Timeout while reconnecting to endpoint 'bad-agent' via host 'host.docker.internal' and port '1337', cancelling attempt [2026-04-24 17:33:11 +0200] information/ApiListener: New client connection for identity 'bad-agent' to [172.17.0.1]:1337 [2026-04-24 17:33:12 +0200] information/ApiListener: rethrowing for bad-agent: Error: Connection reset by peer [system:104 at /usr/include/boost/asio/detail/reactive_socket_send_op.hpp:137 in function 'do_complete'] [2026-04-24 17:33:12 +0200] information/ApiListener: doing async_shutdown for bad-agent --- lib/remote/apilistener.cpp | 112 ++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 65 deletions(-) diff --git a/lib/remote/apilistener.cpp b/lib/remote/apilistener.cpp index aa13eaf5646..47f5f285a73 100644 --- a/lib/remote/apilistener.cpp +++ b/lib/remote/apilistener.cpp @@ -748,16 +748,6 @@ void ApiListener::NewClientHandlerInternal( return; } - Defer shutdownSslConn ([&sslConn, &yc]() { - // Ignore the error, but do not throw an exception being swallowed at all cost. - // https://github.com/Icinga/icinga2/issues/7351 - boost::system::error_code ec; - - // Using async_shutdown() instead of AsioTlsStream::GracefulDisconnect() as this whole function - // is already guarded by a timeout based on the connect timeout. - sslConn.async_shutdown(yc[ec]); - }); - std::shared_ptr cert (sslConn.GetPeerCertificate()); bool verify_ok = false; String identity; @@ -809,8 +799,46 @@ void ApiListener::NewClientHandlerInternal( ClientType ctype; - try { - if (role == RoleClient) { + if (role == RoleClient) { + JsonRpc::SendMessage(client, new Dictionary({ + { "jsonrpc", "2.0" }, + { "method", "icinga::Hello" }, + { "params", new Dictionary({ + { "version", (double)l_AppVersionInt }, + { "capabilities", (double)ApiCapabilities::MyCapabilities } + }) } + }), yc); + + client->async_flush(yc); + + ctype = ClientJsonRpc; + } else { + { + boost::system::error_code ec; + + if (client->async_fill(yc[ec]) == 0u) { + if (identity.IsEmpty()) { + Log(LogInformation, "ApiListener") + << "No data received on new API connection " << conninfo << ": " << ec.message() + << ". Ensure that the remote endpoints are properly configured in a cluster setup."; + } else { + Log(LogWarning, "ApiListener") + << "No data received on new API connection " << conninfo << " for identity '" << identity << "': " << ec.message() + << ". Ensure that the remote endpoints are properly configured in a cluster setup."; + } + + return; + } + } + + char firstByte = 0; + + { + asio::mutable_buffer firstByteBuf (&firstByte, 1); + client->peek(firstByteBuf); + } + + if (firstByte >= '0' && firstByte <= '9') { JsonRpc::SendMessage(client, new Dictionary({ { "jsonrpc", "2.0" }, { "method", "icinga::Hello" }, @@ -824,54 +852,8 @@ void ApiListener::NewClientHandlerInternal( ctype = ClientJsonRpc; } else { - { - boost::system::error_code ec; - - if (client->async_fill(yc[ec]) == 0u) { - if (identity.IsEmpty()) { - Log(LogInformation, "ApiListener") - << "No data received on new API connection " << conninfo << ": " << ec.message() - << ". Ensure that the remote endpoints are properly configured in a cluster setup."; - } else { - Log(LogWarning, "ApiListener") - << "No data received on new API connection " << conninfo << " for identity '" << identity << "': " << ec.message() - << ". Ensure that the remote endpoints are properly configured in a cluster setup."; - } - - return; - } - } - - char firstByte = 0; - - { - asio::mutable_buffer firstByteBuf (&firstByte, 1); - client->peek(firstByteBuf); - } - - if (firstByte >= '0' && firstByte <= '9') { - JsonRpc::SendMessage(client, new Dictionary({ - { "jsonrpc", "2.0" }, - { "method", "icinga::Hello" }, - { "params", new Dictionary({ - { "version", (double)l_AppVersionInt }, - { "capabilities", (double)ApiCapabilities::MyCapabilities } - }) } - }), yc); - - client->async_flush(yc); - - ctype = ClientJsonRpc; - } else { - ctype = ClientHttp; - } - } - } catch (const boost::system::system_error& systemError) { - if (systemError.code() == boost::asio::error::operation_aborted) { - shutdownSslConn.Cancel(); + ctype = ClientHttp; } - - throw; } std::shared_lock wgLock(*m_ListenerWaitGroup, std::try_to_lock); @@ -910,20 +892,20 @@ void ApiListener::NewClientHandlerInternal( << "Ignoring anonymous JSON-RPC connection " << conninfo << ". Max connections (" << GetMaxAnonymousClients() << ") exceeded."; - aclient = nullptr; - } + // Close the connection cleanly, signals the other endpoint + // that the connection didn't break but was closed intentionally. + client->GracefulDisconnect(*strand, yc); - if (aclient) { - aclient->Start(); - shutdownSslConn.Cancel(); + return; } + + aclient->Start(); } else { Log(LogNotice, "ApiListener", "New HTTP client"); HttpServerConnection::Ptr aclient = new HttpServerConnection(m_WaitGroup, identity, verify_ok, client); AddHttpClient(aclient); aclient->Start(); - shutdownSslConn.Cancel(); } }