Skip to content

Commit a194938

Browse files
Merge pull request #13880 from rabbitmq/mergify/bp/v4.1.x/pr-13879
Add health checks for testing readiness to serve clients (backport #13879)
2 parents 12ee638 + 9d02953 commit a194938

5 files changed

+242
-1
lines changed

deps/rabbitmq_management/priv/www/api/index.html

+35
Original file line numberDiff line numberDiff line change
@@ -1252,6 +1252,41 @@ <h2>Reference</h2>
12521252
Service Unavailable.
12531253
</td>
12541254
</tr>
1255+
<tr>
1256+
<td>X</td>
1257+
<td></td>
1258+
<td></td>
1259+
<td></td>
1260+
<td class="path">/api/health/checks/below-node-connection-limit</td>
1261+
<td>
1262+
Responds with a 200 OK if the target node has fewer connections to the AMQP
1263+
and AMQPS ports than the configured maximum, otherwise responds with a
1264+
503 Service Unavailable.
1265+
</td>
1266+
</tr>
1267+
<tr>
1268+
<td>X</td>
1269+
<td></td>
1270+
<td></td>
1271+
<td></td>
1272+
<td class="path">/api/health/checks/ready-to-serve-clients</td>
1273+
<td>
1274+
<p>
1275+
Responds with a 200 OK if the target node is ready to serve clients, otherwise
1276+
responds with a 503 Service Unavailable. This check combines:
1277+
</p>
1278+
<ol>
1279+
<li>/api/health/checks/is-in-service</li>
1280+
<li>/api/health/checks/protocol-listener/amqp or /api/health/checks/protocol-listener/amqps</li>
1281+
<li>/api/health/checks/below-node-connection-limit</li>
1282+
</ol>
1283+
<p>
1284+
So this check will only return 200 OK if the target node is in service,
1285+
an AMQP or AMQPS listener is available and the target node has fewer active
1286+
AMQP and AMQPS connections than its configured limit.
1287+
</p>
1288+
</td>
1289+
</tr>
12551290
<tr>
12561291
<td>X</td>
12571292
<td></td>

deps/rabbitmq_management/src/rabbit_mgmt_dispatcher.erl

+2
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,8 @@ dispatcher() ->
208208
{"/health/checks/quorum-queues-without-elected-leaders/vhost/:vhost/pattern/:pattern", rabbit_mgmt_wm_health_check_quorum_queues_without_elected_leaders, []},
209209
{"/health/checks/node-is-quorum-critical", rabbit_mgmt_wm_health_check_node_is_quorum_critical, []},
210210
{"/health/checks/is-in-service", rabbit_mgmt_wm_health_check_is_in_service, []},
211+
{"/health/checks/below-node-connection-limit", rabbit_mgmt_wm_health_check_below_node_connection_limit, []},
212+
{"/health/checks/ready-to-serve-clients", rabbit_mgmt_wm_health_check_ready_to_serve_clients, []},
211213
{"/reset", rabbit_mgmt_wm_reset, []},
212214
{"/reset/:node", rabbit_mgmt_wm_reset, []},
213215
{"/rebalance/queues", rabbit_mgmt_wm_rebalance_queues, [{queues, all}]},
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
%% This Source Code Form is subject to the terms of the Mozilla Public
2+
%% License, v. 2.0. If a copy of the MPL was not distributed with this
3+
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
%%
5+
%% Copyright (c) 2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
6+
%%
7+
8+
-module(rabbit_mgmt_wm_health_check_below_node_connection_limit).
9+
10+
-export([init/2]).
11+
-export([to_json/2, content_types_provided/2]).
12+
-export([variances/2]).
13+
14+
-include("rabbit_mgmt.hrl").
15+
-include_lib("rabbitmq_management_agent/include/rabbit_mgmt_records.hrl").
16+
17+
%% Cowboy handler entry point. The request is passed through
%% rabbit_mgmt_headers:set_common_permission_headers/2 and then
%% rabbit_mgmt_headers:set_no_cache_headers/2 before handing control to
%% cowboy_rest with a fresh #context{}. The incoming handler state is ignored.
init(Req, _State) ->
    WithPermissionHeaders =
        rabbit_mgmt_headers:set_common_permission_headers(Req, ?MODULE),
    WithNoCacheHeaders =
        rabbit_mgmt_headers:set_no_cache_headers(WithPermissionHeaders, ?MODULE),
    {cowboy_rest, WithNoCacheHeaders, #context{}}.
22+
23+
%% cowboy_rest callback: returns the request header names reported for this
%% resource's variances step, leaving the request and context untouched.
variances(Req, Context) ->
    VaryOn = [<<"accept-encoding">>, <<"origin">>],
    {VaryOn, Req, Context}.
25+
26+
%% cowboy_rest callback: the provided content types are whatever
%% rabbit_mgmt_util:responder_map/1 builds for the to_json/2 responder.
content_types_provided(ReqData, Context) ->
    Responders = rabbit_mgmt_util:responder_map(to_json),
    {Responders, ReqData, Context}.
28+
29+
%% Responder for the below-node-connection-limit health check.
%%
%% Sums the connection counts of the amqp and 'amqp/ssl' listeners and
%% compares the total with the `connection_max` setting of the `rabbit`
%% application (default: infinity). While the total is below the limit a
%% regular reply with status `ok` is produced; otherwise the reply is sent
%% with ?HEALTH_CHECK_FAILURE_STATUS and a body that also carries a reason.
to_json(ReqData, Context) ->
    ActiveConns = lists:sum([protocol_connection_count(Protocol)
                             || Protocol <- [amqp, 'amqp/ssl']]),
    Limit = rabbit_misc:get_env(rabbit, connection_max, infinity),
    %% In Erlang's term order every number sorts before every atom, so the
    %% comparison is always true when the limit is the atom `infinity`.
    if
        ActiveConns < Limit ->
            OkBody = #{status => ok,
                       limit => Limit,
                       connections => ActiveConns},
            rabbit_mgmt_util:reply(OkBody, ReqData, Context);
        true ->
            FailureBody = #{status => failed,
                            reason => <<"node connection limit is reached">>,
                            limit => Limit,
                            connections => ActiveConns},
            {Payload, ReqData1, Context1} =
                rabbit_mgmt_util:reply(FailureBody, ReqData, Context),
            %% Send the encoded body ourselves so the failure status code
            %% can be used, then stop the cowboy_rest flow.
            ReqData2 = cowboy_req:reply(
                         ?HEALTH_CHECK_FAILURE_STATUS, #{}, Payload, ReqData1),
            {stop, ReqData2, Context1}
    end.
55+
56+
%% Number of active connections Ranch reports for the listener of Protocol,
%% or 0 when rabbit_networking knows no Ranch ref for that protocol.
protocol_connection_count(Protocol) ->
    active_connections(rabbit_networking:ranch_ref_of_protocol(Protocol)).

%% Reads `active_connections` from ranch:info/1; `undefined` counts as zero.
active_connections(undefined) ->
    0;
active_connections(RanchRef) ->
    #{active_connections := Count} = ranch:info(RanchRef),
    Count.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
%% This Source Code Form is subject to the terms of the Mozilla Public
2+
%% License, v. 2.0. If a copy of the MPL was not distributed with this
3+
%% file, You can obtain one at https://mozilla.org/MPL/2.0/.
4+
%%
5+
%% Copyright (c) 2025 Broadcom. All Rights Reserved. The term “Broadcom” refers to Broadcom Inc. and/or its subsidiaries. All rights reserved.
6+
%%
7+
8+
%% A composite health check that combines:
9+
%% * GET /api/health/checks/is-in-service
10+
%% * GET /api/health/checks/protocol-listener/amqp or GET /api/health/checks/protocol-listener/amqps
11+
%% * GET /api/health/checks/below-node-connection-limit
12+
13+
-module(rabbit_mgmt_wm_health_check_ready_to_serve_clients).
14+
15+
-export([init/2]).
16+
-export([to_json/2, content_types_provided/2]).
17+
-export([variances/2]).
18+
19+
-include("rabbit_mgmt.hrl").
20+
-include_lib("rabbitmq_management_agent/include/rabbit_mgmt_records.hrl").
21+
22+
%% Cowboy handler entry point. Applies the common permission headers and then
%% the no-cache headers (both via rabbit_mgmt_headers) and switches to the
%% cowboy_rest flow with an empty #context{}. The handler state is unused.
init(Req, _State) ->
    Req0 = rabbit_mgmt_headers:set_common_permission_headers(Req, ?MODULE),
    Req1 = rabbit_mgmt_headers:set_no_cache_headers(Req0, ?MODULE),
    {cowboy_rest, Req1, #context{}}.
27+
28+
%% cowboy_rest callback: header names returned for the variances step.
variances(Req, Context) ->
    Headers = [<<"accept-encoding">>, <<"origin">>],
    {Headers, Req, Context}.
30+
31+
%% cowboy_rest callback: delegates to rabbit_mgmt_util:responder_map/1 so
%% that to_json/2 renders the response.
content_types_provided(ReqData, Context) ->
    ResponderMap = rabbit_mgmt_util:responder_map(to_json),
    {ResponderMap, ReqData, Context}.
33+
34+
%% Responder for the ready-to-serve-clients health check: runs check/0 and
%% turns its tagged result into an HTTP reply.
to_json(ReqData, Context) ->
    reply_for(check(), ReqData, Context).

%% {ok, Body} is answered through the regular reply path. {error, Body} is
%% encoded the same way but sent with ?HEALTH_CHECK_FAILURE_STATUS, after
%% which the cowboy_rest flow is stopped.
reply_for({ok, Body}, ReqData, Context) ->
    rabbit_mgmt_util:reply(Body, ReqData, Context);
reply_for({error, Body}, ReqData, Context) ->
    {Payload, ReqData1, Context1} =
        rabbit_mgmt_util:reply(Body, ReqData, Context),
    ReqData2 = cowboy_req:reply(
                 ?HEALTH_CHECK_FAILURE_STATUS, #{}, Payload, ReqData1),
    {stop, ReqData2, Context1}.
46+
47+
%% Runs the three readiness checks in order and stops at the first failure:
%%   1. the node reports that it is serving (rabbit:is_serving/0),
%%   2. at least one of the amqp / 'amqp/ssl' listeners has a Ranch ref,
%%   3. the summed connection count of those listeners is below the
%%      `connection_max` setting of the `rabbit` application.
%%
%% Returns {ok, Body} when all checks pass and {error, Body} otherwise. Body
%% is the map that to_json/2 sends back as the response.
check() ->
    case rabbit:is_serving() of
        true ->
            check_listeners();
        false ->
            {error, #{status => failed,
                      reason => <<"the rabbit node is not currently available to serve">>}}
    end.

%% Collects the Ranch refs of the AMQP and AMQPS listeners and fails when
%% neither exists; otherwise continues with the connection limit check.
check_listeners() ->
    RanchRefs0 = [
        rabbit_networking:ranch_ref_of_protocol(amqp),
        rabbit_networking:ranch_ref_of_protocol('amqp/ssl')
    ],
    case [R || R <- RanchRefs0, R =/= undefined] of
        [_ | _] = RanchRefs ->
            check_connection_limit(RanchRefs);
        [] ->
            {error, #{status => failed,
                      reason => <<"no active listeners for AMQP/AMQPS">>}}
    end.

%% Compares the total number of active connections across RanchRefs with the
%% configured limit. The default limit is the atom `infinity`, which every
%% integer compares lower than in Erlang's term order.
check_connection_limit(RanchRefs) ->
    ActiveConns = lists:foldl(
                    fun(RanchRef, Acc) ->
                            #{active_connections := Count} = ranch:info(RanchRef),
                            Acc + Count
                    end, 0, RanchRefs),
    Limit = rabbit_misc:get_env(rabbit, connection_max, infinity),
    case ActiveConns < Limit of
        true ->
            {ok, #{status => ok,
                   limit => Limit,
                   connections => ActiveConns}};
        false ->
            {error, #{status => failed,
                      reason => <<"node connection limit is reached">>,
                      limit => Limit,
                      connections => ActiveConns}}
    end.

deps/rabbitmq_management/test/rabbit_mgmt_http_health_checks_SUITE.erl

+61-1
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ all_tests() -> [
5151
protocol_listener_test,
5252
port_listener_test,
5353
certificate_expiration_test,
54-
is_in_service_test
54+
is_in_service_test,
55+
below_node_connection_limit_test,
56+
ready_to_serve_clients_test
5557
].
5658

5759
%% -------------------------------------------------------------------
@@ -470,8 +472,66 @@ is_in_service_test(Config) ->
470472

471473
passed.
472474

475+
%% Verifies /health/checks/below-node-connection-limit: it must succeed on an
%% idle node with no configured limit and fail once `connection_max`
%% connections are open.
below_node_connection_limit_test(Config) ->
    Path = "/health/checks/below-node-connection-limit",
    Check0 = http_get(Config, Path, ?OK),
    ?assertEqual(<<"ok">>, maps:get(status, Check0)),
    ?assertEqual(0, maps:get(connections, Check0)),
    ?assertEqual(<<"infinity">>, maps:get(limit, Check0)),

    %% Set the connection limit low and open 'limit' connections.
    Limit = 10,
    rabbit_ct_broker_helpers:rpc(
      Config, 0, application, set_env, [rabbit, connection_max, Limit]),
    Connections = [rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0)
                   || _ <- lists:seq(1, Limit)],
    try
        true = lists:all(fun(E) -> is_pid(E) end, Connections),
        {error, not_allowed} =
            rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0),

        Body0 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body0)),
        ?assertEqual(Limit, maps:get(<<"limit">>, Body0)),
        ?assertEqual(Limit, maps:get(<<"connections">>, Body0))
    after
        %% Clean up the connections and reset the limit. This runs even when
        %% an assertion above fails so that the lowered limit and the open
        %% connections cannot leak into the test cases that follow. Closing
        %% is best-effort: an entry may be an error tuple rather than a pid.
        lists:foreach(
          fun(C) -> catch rabbit_ct_client_helpers:close_connection(C) end,
          Connections),
        rabbit_ct_broker_helpers:rpc(
          Config, 0, application, set_env, [rabbit, connection_max, infinity])
    end,

    passed.
501+
502+
%% Verifies /health/checks/ready-to-serve-clients: it must succeed on a
%% healthy node, fail while the node is marked as being drained, and fail
%% once `connection_max` connections are open.
ready_to_serve_clients_test(Config) ->
    Path = "/health/checks/ready-to-serve-clients",
    Check0 = http_get(Config, Path, ?OK),
    ?assertEqual(<<"ok">>, maps:get(status, Check0)),

    true = rabbit_ct_broker_helpers:mark_as_being_drained(Config, 0),
    try
        Body0 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body0))
    after
        %% Always take the node out of the drained state, even if the
        %% assertion failed, so later test cases see a serving node.
        true = rabbit_ct_broker_helpers:unmark_as_being_drained(Config, 0)
    end,

    %% Set the connection limit low and open 'limit' connections.
    Limit = 10,
    rabbit_ct_broker_helpers:rpc(
      Config, 0, application, set_env, [rabbit, connection_max, Limit]),
    Connections = [rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0)
                   || _ <- lists:seq(1, Limit)],
    try
        true = lists:all(fun(E) -> is_pid(E) end, Connections),
        {error, not_allowed} =
            rabbit_ct_client_helpers:open_unmanaged_connection(Config, 0),

        Body1 = http_get_failed(Config, Path),
        ?assertEqual(<<"failed">>, maps:get(<<"status">>, Body1)),
        ?assertEqual(Limit, maps:get(<<"limit">>, Body1)),
        ?assertEqual(Limit, maps:get(<<"connections">>, Body1))
    after
        %% Clean up the connections and reset the limit, whether or not the
        %% assertions passed. Closing is best-effort: an entry may be an
        %% error tuple rather than a pid.
        lists:foreach(
          fun(C) -> catch rabbit_ct_client_helpers:close_connection(C) end,
          Connections),
        rabbit_ct_broker_helpers:rpc(
          Config, 0, application, set_env, [rabbit, connection_max, infinity])
    end,

    passed.
531+
473532
%% Issues an authenticated GET for Path, asserts that the response carries
%% ?HEALTH_CHECK_FAILURE_STATUS and returns the decoded JSON body.
http_get_failed(Config, Path) ->
    {ok, {{_, Code, _}, _, ResBody}} = req(Config, get, Path, [auth_header("guest", "guest")]),
    %% ~tp keeps the body readable in the CT log; ~w would print a string
    %% body as a list of character codes.
    ct:pal("GET ~s: ~w ~tp", [Path, Code, ResBody]),
    %% ?assertEqual takes the expected value first; with the arguments the
    %% other way round a failure report labels the two values incorrectly.
    ?assertEqual(?HEALTH_CHECK_FAILURE_STATUS, Code),
    rabbit_json:decode(rabbit_data_coercion:to_binary(ResBody)).
477537

0 commit comments

Comments
 (0)