diff --git a/apps/opentelemetry/src/otel_batch_processor.erl b/apps/opentelemetry/src/otel_batch_processor.erl index 8dea9ae6..32a840b6 100644 --- a/apps/opentelemetry/src/otel_batch_processor.erl +++ b/apps/opentelemetry/src/otel_batch_processor.erl @@ -364,10 +364,15 @@ complete_exporting(Data) -> handed_off_table=undefined}}. kill_runner(Data=#data{runner_pid=RunnerPid}) when RunnerPid =/= undefined -> + Mon = erlang:monitor(process, RunnerPid), erlang:unlink(RunnerPid), erlang:exit(RunnerPid, kill), - Data#data{runner_pid=undefined, - handed_off_table=undefined}. + %% Wait for the runner process termination to be sure that + %% the export table is destroyed and can be safely recreated + receive + {'DOWN', Mon, process, RunnerPid, _} -> + Data#data{runner_pid=undefined, handed_off_table=undefined} + end. new_export_table(Name) -> ets:new(Name, [public, diff --git a/apps/opentelemetry/src/otel_simple_processor.erl b/apps/opentelemetry/src/otel_simple_processor.erl index 84b54d56..eaa5fd83 100644 --- a/apps/opentelemetry/src/otel_simple_processor.erl +++ b/apps/opentelemetry/src/otel_simple_processor.erl @@ -195,10 +195,15 @@ complete_exporting(Data=#data{current_from=From, [{reply, From, ok}]}. kill_runner(Data=#data{runner_pid=RunnerPid}) when RunnerPid =/= undefined -> + Mon = erlang:monitor(process, RunnerPid), erlang:unlink(RunnerPid), erlang:exit(RunnerPid, kill), - Data#data{runner_pid=undefined, - handed_off_table=undefined}. + %% Wait for the runner process termination to be sure that + %% the export table is destroyed and can be safely recreated + receive + {'DOWN', Mon, process, RunnerPid, _} -> + Data#data{runner_pid=undefined, handed_off_table=undefined} + end. new_export_table(Name) -> ets:new(Name, [public, diff --git a/apps/opentelemetry/test/otel_batch_processor_SUITE.erl b/apps/opentelemetry/test/otel_batch_processor_SUITE.erl index 4034b8ba..ba16c4ca 100644 --- a/apps/opentelemetry/test/otel_batch_processor_SUITE.erl +++ b/apps/opentelemetry/test/otel_batch_processor_SUITE.erl @@ -5,17 +5,19 @@ -include_lib("stdlib/include/assert.hrl"). -include_lib("common_test/include/ct.hrl"). +-include("otel_span.hrl"). -include_lib("opentelemetry_api/include/opentelemetry.hrl"). all() -> - [exporting_timeout_test]. + [exporting_timeout_test, + exporting_runner_timeout_test]. %% verifies that after the runner has to be killed for taking too long %% that everything is still functional and the exporter does not crash exporting_timeout_test(_Config) -> process_flag(trap_exit, true), - {ok, Pid, _} = otel_batch_processor:start_link(#{reg_name => test_processor, + {ok, Pid, _} = otel_batch_processor:start_link(#{name => test_processor, resource => otel_resource:create([]), exporter => ?MODULE, exporting_timeout_ms => 1, @@ -30,13 +32,47 @@ exporting_timeout_test(_Config) -> ok end. +exporting_runner_timeout_test(_Config) -> + process_flag(trap_exit, true), + + {ok, Pid, #{reg_name := RegName}} = otel_batch_processor:start_link( + #{name => test_processor1, + resource => otel_resource:create([]), + exporter => ?MODULE, + exporting_timeout_ms => 1, + scheduled_delay_ms => 1}), + + %% Insert a few spans to make sure runner process will be spawned and killed + %% because it hangs for 10 minutes (see export/4 below) + true = otel_batch_processor:on_end(generate_span(), #{reg_name => RegName}), + true = otel_batch_processor:on_end(generate_span(), #{reg_name => RegName}), + + receive + {'EXIT', Pid, _} -> + %% test is to ensure we don't hit this + ct:fail(batch_processor_crash) + after + 200 -> + ok + end. + %% exporter behaviour init(_) -> {ok, []}. -export(_, _) -> +export(_, _, _, _) -> timer:sleep(timer:minutes(10)). shutdown(_) -> ok. + +%% helpers + +generate_span() -> + #span{trace_id = otel_id_generator:generate_trace_id(), + span_id = otel_id_generator:generate_span_id(), + name = "test_span", + trace_flags = 1, + is_recording = true, + instrumentation_scope = #instrumentation_scope{name = "test"}}.