From 9b1cfe12edfc5ba37352bcf453a8df08875b69ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Filipe=20Caba=C3=A7o?= Date: Wed, 13 Nov 2024 18:12:52 +0000 Subject: [PATCH] fix: Add region to healthcheck; Better failure flow on Connect (#1220) --- lib/realtime/tenants.ex | 49 +++++++++++++++++-- lib/realtime/tenants/connect.ex | 15 ++++-- .../templates/layout/live.html.heex | 4 +- mix.exs | 2 +- test/realtime/tenants/connect_test.exs | 8 +-- .../controllers/tenant_controller_test.exs | 30 ++++++++++-- 6 files changed, 88 insertions(+), 20 deletions(-) diff --git a/lib/realtime/tenants.ex b/lib/realtime/tenants.ex index 609599cb5..8d549daeb 100644 --- a/lib/realtime/tenants.ex +++ b/lib/realtime/tenants.ex @@ -55,13 +55,36 @@ defmodule Realtime.Tenants do {:error, :tenant_not_found | String.t() - | %{connected_cluster: pos_integer, db_connected: false, healthy: false}} - | {:ok, %{connected_cluster: non_neg_integer, db_connected: true, healthy: true}} + | %{ + connected_cluster: pos_integer, + db_connected: false, + healthy: false, + region: String.t(), + node: String.t() + }} + | {:ok, + %{ + connected_cluster: non_neg_integer, + db_connected: true, + healthy: true, + region: String.t(), + node: String.t() + }} def health_check(external_id) when is_binary(external_id) do + region = Application.get_env(:realtime, :region) + node = Node.self() |> to_string() + with %Tenant{} = tenant <- Cache.get_tenant_by_external_id(external_id), {:error, _} <- get_health_conn(tenant), connected_cluster when connected_cluster > 0 <- UsersCounter.tenant_users(external_id) do - {:error, %{healthy: false, db_connected: false, connected_cluster: connected_cluster}} + {:error, + %{ + healthy: false, + db_connected: false, + connected_cluster: connected_cluster, + region: region, + node: node + }} else nil -> {:error, :tenant_not_found} @@ -70,14 +93,30 @@ defmodule Realtime.Tenants do connected_cluster = UsersCounter.tenant_users(external_id) tenant = Cache.get_tenant_by_external_id(external_id) Migrations.maybe_run_migrations(health_conn, tenant) - {:ok, %{healthy: true, db_connected: true, connected_cluster: connected_cluster}} + + {:ok, + %{ + healthy: true, + db_connected: true, + connected_cluster: connected_cluster, + region: region, + node: node + }} connected_cluster when is_integer(connected_cluster) -> tenant = Cache.get_tenant_by_external_id(external_id) {:ok, db_conn} = Database.connect(tenant, "realtime_health_check", 1) Migrations.maybe_run_migrations(db_conn, tenant) Process.alive?(db_conn) && GenServer.stop(db_conn) - {:ok, %{healthy: true, db_connected: false, connected_cluster: connected_cluster}} + + {:ok, + %{ + healthy: true, + db_connected: false, + connected_cluster: connected_cluster, + region: region, + node: node + }} end end diff --git a/lib/realtime/tenants/connect.ex b/lib/realtime/tenants/connect.ex index 86f90429f..b373ab4a2 100644 --- a/lib/realtime/tenants/connect.ex +++ b/lib/realtime/tenants/connect.ex @@ -157,12 +157,17 @@ defmodule Realtime.Tenants.Connect do def handle_continue(:run_migrations, state) do %{tenant: tenant, db_conn_pid: db_conn_pid} = state - :ok = Migrations.maybe_run_migrations(db_conn_pid, tenant) - :ok = Migrations.create_partitions(db_conn_pid) - {:ok, broadcast_changes_pid} = start_replication(tenant) - {:noreply, %{state | broadcast_changes_pid: broadcast_changes_pid}, - {:continue, :setup_connected_user_events}} + with :ok <- Migrations.maybe_run_migrations(db_conn_pid, tenant), + :ok <- Migrations.create_partitions(db_conn_pid), + {:ok, broadcast_changes_pid} <- start_replication(tenant) do + {:noreply, %{state | broadcast_changes_pid: broadcast_changes_pid}, + {:continue, :setup_connected_user_events}} + else + error -> + log_error("MigrationsFailedToRun", error) + {:stop, :shutdown} + end end @impl true diff --git a/lib/realtime_web/templates/layout/live.html.heex b/lib/realtime_web/templates/layout/live.html.heex index 0ae4db1c5..4d6ff9cce 100644 --- a/lib/realtime_web/templates/layout/live.html.heex +++ b/lib/realtime_web/templates/layout/live.html.heex @@ -82,8 +82,8 @@
- - + + <%= @inner_content %>
diff --git a/mix.exs b/mix.exs index 07040b4c3..facad23fa 100644 --- a/mix.exs +++ b/mix.exs @@ -4,7 +4,7 @@ defmodule Realtime.MixProject do def project do [ app: :realtime, - version: "2.33.44", + version: "2.33.45", elixir: "~> 1.16.0", elixirc_paths: elixirc_paths(Mix.env()), start_permanent: Mix.env() == :prod, diff --git a/test/realtime/tenants/connect_test.exs b/test/realtime/tenants/connect_test.exs index 3cba97add..fd3578099 100644 --- a/test/realtime/tenants/connect_test.exs +++ b/test/realtime/tenants/connect_test.exs @@ -31,7 +31,7 @@ defmodule Realtime.Tenants.ConnectTest do test "on database disconnect, returns new connection", %{tenant: tenant} do assert {:ok, old_conn} = Connect.lookup_or_start_connection(tenant.external_id) - + :timer.sleep(500) GenServer.stop(old_conn) :timer.sleep(500) @@ -75,7 +75,7 @@ defmodule Realtime.Tenants.ConnectTest do tenant: %{external_id: tenant_id} } do {:ok, db_conn} = - Connect.lookup_or_start_connection(tenant_id, check_connected_user_interval: 50) + Connect.lookup_or_start_connection(tenant_id, check_connected_user_interval: 200) Sandbox.allow(Repo, self(), db_conn) # Not enough time has passed, connection still alive @@ -184,8 +184,8 @@ defmodule Realtime.Tenants.ConnectTest do with_mock Realtime.Tenants.Migrations, [], maybe_run_migrations: fn _, _ -> raise("error") end do assert {:ok, pid} = Connect.lookup_or_start_connection(tenant.external_id) - Process.alive?(pid) - Process.sleep(1000) + Process.sleep(200) + refute Process.alive?(pid) assert_called(Realtime.Tenants.Migrations.maybe_run_migrations(:_, :_)) end end diff --git a/test/realtime_web/controllers/tenant_controller_test.exs b/test/realtime_web/controllers/tenant_controller_test.exs index 267fbc621..3074c27af 100644 --- a/test/realtime_web/controllers/tenant_controller_test.exs +++ b/test/realtime_web/controllers/tenant_controller_test.exs @@ -201,6 +201,11 @@ defmodule RealtimeWeb.TenantControllerTest do describe "health check tenant" do setup [:create_tenant] + setup do + Application.put_env(:realtime, :region, "us-east-1") + on_exit(fn -> Application.put_env(:realtime, :region, nil) end) + end + test "health check when tenant does not exist", %{conn: conn} do with_mock ChannelsAuthorization, authorize: fn _, _, _ -> {:ok, %{}} end do Routes.tenant_path(conn, :reload, "wrong_external_id") @@ -216,7 +221,14 @@ defmodule RealtimeWeb.TenantControllerTest do with_mock JwtVerification, verify: fn _token, _secret, _jwks -> {:ok, %{}} end do conn = get(conn, Routes.tenant_path(conn, :health, ext_id)) data = json_response(conn, 200)["data"] - assert %{"healthy" => true, "db_connected" => false, "connected_cluster" => 0} = data + + assert %{ + "healthy" => true, + "db_connected" => false, + "connected_cluster" => 0, + "region" => "us-east-1", + "node" => "nonode@nohost" + } == data end end @@ -232,7 +244,13 @@ defmodule RealtimeWeb.TenantControllerTest do conn = get(conn, Routes.tenant_path(conn, :health, ext_id)) data = json_response(conn, 200)["data"] - assert %{"healthy" => false, "db_connected" => false, "connected_cluster" => 1} = data + assert %{ + "healthy" => false, + "db_connected" => false, + "connected_cluster" => 1, + "region" => "us-east-1", + "node" => "nonode@nohost" + } == data end end @@ -255,7 +273,13 @@ defmodule RealtimeWeb.TenantControllerTest do conn = get(conn, Routes.tenant_path(conn, :health, ext_id)) data = json_response(conn, 200)["data"] - assert %{"healthy" => true, "db_connected" => true, "connected_cluster" => 1} = data + assert %{ + "healthy" => true, + "db_connected" => true, + "connected_cluster" => 1, + "region" => "us-east-1", + "node" => "nonode@nohost" + } == data end end