Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding support for tracing child models invoked from a BLS model #6063

Merged
merged 15 commits into from
Aug 7, 2023
Merged
5 changes: 0 additions & 5 deletions docs/user_guide/trace.md
Original file line number Diff line number Diff line change
Expand Up @@ -448,11 +448,6 @@ class TritonPythonModel:
...
for request in requests:
...
# Create an InferenceRequest object. `model_name`,
# `requested_output_names`, and `inputs` are the required arguments and
# must be provided when constructing an InferenceRequest object. Make
# sure to replace `inputs` argument with a list of `pb_utils.Tensor`
# objects.
inference_request = pb_utils.InferenceRequest(
model_name='model_name',
requested_output_names=['REQUESTED_OUTPUT_1', 'REQUESTED_OUTPUT_2'],
Expand Down
5 changes: 3 additions & 2 deletions qa/L0_trace/opentelemetry_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import tritonclient.http as httpclient

EXPECTED_NUM_SPANS = 16
NO_PARENT_SPAN = "0000000000000000"
rmccorm4 marked this conversation as resolved.
Show resolved Hide resolved


class OpenTelemetryTest(tu.TestResultCollector):
Expand Down Expand Up @@ -155,7 +156,7 @@ def _check_parent(self, child_span, parent_span):
self.assertEqual(child_span["trace_id"], parent_span["trace_id"])
self.assertNotEqual(
child_span["parent_span_id"],
"0000000000000000",
NO_PARENT_SPAN,
"child span does not have parent span id specified",
)
self.assertEqual(
Expand Down Expand Up @@ -262,7 +263,7 @@ def send_bls_request(model_name="simple"):
with httpclient.InferenceServerClient("localhost:8000") as client:
inputs = prepare_data(httpclient)
inputs.append(httpclient.InferInput("MODEL_NAME", [1], "BYTES"))
inputs[2].set_data_from_numpy(np.array([model_name], dtype=np.object_))
inputs[-1].set_data_from_numpy(np.array([model_name], dtype=np.object_))
client.infer("bls_simple", inputs)


Expand Down
21 changes: 13 additions & 8 deletions qa/L0_trace/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -669,19 +669,25 @@ $TRACE_SUMMARY -t bls_trace.log > summary_bls.log

if [ `grep -c "COMPUTE_INPUT_END" summary_bls.log` != "2" ]; then
cat summary_bls.log
echo -e "\n***\n*** Test Failed\n***"
echo -e "\n***\n*** Test Failed: Unexpected number of traced "COMPUTE_INPUT_END" events.\n***"
RET=1
fi

if [ `grep -c ^simple summary_bls.log` != "1" ]; then
if [ `grep -c ^ensemble_add_sub_int32_int32_int32 summary_bls.log` != "1" ]; then
cat summary_bls.log
echo -e "\n***\n*** Test Failed\n***"
echo -e "\n***\n*** Test Failed: BLS child ensemble model wasn't traced. \n***"
RET=1
fi

if [ `grep -c 'parent id' bls_trace.log` == "1" ]; then
if [ `grep -c ^simple summary_bls.log` != "1" ]; then
cat summary_bls.log
echo -e "\n***\n*** Test Failed\n***"
echo -e "\n***\n*** Test Failed: ensemble's model 'simple' wasn't traced. \n***"
RET=1
fi

if [ `grep -c 'parent id' bls_trace.log` != "2" ]; then
cat bls_trace.log
echo -e "\n***\n*** Test Failed: Unexpected number of 'parent id' fields. \n***"
RET=1
fi

Expand Down Expand Up @@ -760,9 +766,8 @@ fi

set +e
# Preparing traces for unittest.
# Note: need to run this separately, to speed up trace collection.
# Otherwise internal (opentelemetry_unittest.OpenTelemetryTest.setUp) check
# will slow down collection.
# Note: this is run separately so that spans can later be extracted from the
# server log with `grep`.
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.prepare_traces()' >>$CLIENT_LOG 2>&1

Expand Down
102 changes: 60 additions & 42 deletions src/tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,6 @@ TraceManager::Create(
*manager = new TraceManager(
level, rate, count, log_frequency, filepath, mode, config_map);

(*manager)->InitTracer(config_map);

return nullptr; // success
}

Expand All @@ -86,6 +84,8 @@ TraceManager::TraceManager(
false /*filepath_specified*/, false /*mode_specified*/,
false /*config_map_specified*/));
trace_files_.emplace(filepath, file);

InitTracer(config_map);
}

TRITONSERVER_Error*
Expand Down Expand Up @@ -357,58 +357,76 @@ TraceManager::Trace::CaptureTimestamp(
void
TraceManager::InitTracer(const triton::server::TraceConfigMap& config_map)
{
if (global_setting_->mode_ == TRACE_MODE_TRITON) {
return;
}
switch (global_setting_->mode_) {
case TRACE_MODE_OPENTELEMETRY: {
#if !defined(_WIN32) && defined(TRITON_ENABLE_TRACING)
otlp::OtlpHttpExporterOptions opts;
otel_resource::ResourceAttributes attributes = {};
attributes[otel_resource::SemanticConventions::kServiceName] =
"triton-inference-server";
auto mode_key = std::to_string(TRACE_MODE_OPENTELEMETRY);
auto otel_options_it = config_map.find(mode_key);
if (otel_options_it != config_map.end()) {
for (const auto& setting : otel_options_it->second) {
// FIXME add more configuration options of OTLP HTTP Exporter
if (setting.first == "url") {
opts.url = setting.second;
otlp::OtlpHttpExporterOptions opts;
otel_resource::ResourceAttributes attributes = {};
attributes[otel_resource::SemanticConventions::kServiceName] =
"triton-inference-server";
auto mode_key = std::to_string(TRACE_MODE_OPENTELEMETRY);
auto otel_options_it = config_map.find(mode_key);
if (otel_options_it != config_map.end()) {
for (const auto& setting : otel_options_it->second) {
// FIXME add more configuration options of OTLP HTTP Exporter
if (setting.first == "url") {
opts.url = setting.second;
}
if (setting.first == "resource") {
auto pos = setting.second.find('=');
auto key = setting.second.substr(0, pos);
auto value = setting.second.substr(pos + 1);
attributes[key] = value;
}
}
}
if (setting.first == "resource") {
auto pos = setting.second.find('=');
auto key = setting.second.substr(0, pos);
auto value = setting.second.substr(pos + 1);
attributes[key] = value;
auto exporter = otlp::OtlpHttpExporterFactory::Create(opts);
auto test_exporter = triton::server::GetEnvironmentVariableOrDefault(
"TRITON_OPENTELEMETRY_TEST", "false");
if (test_exporter != "false") {
exporter = opentelemetry::exporter::trace::OStreamSpanExporterFactory::
Create();
}
auto processor = otel_trace_sdk::SimpleSpanProcessorFactory::Create(
std::move(exporter));
auto resource = otel_resource::Resource::Create(attributes);
std::shared_ptr<otel_trace_api::TracerProvider> provider =
otel_trace_sdk::TracerProviderFactory::Create(
std::move(processor), resource);

otel_trace_api::Provider::SetTracerProvider(provider);
break;
#else
LOG_ERROR << "Unsupported trace mode: "
<< TraceManager::InferenceTraceModeString(
global_setting_->mode_);
break;
#endif
}
default:
return;
}
auto exporter = otlp::OtlpHttpExporterFactory::Create(opts);
auto test_exporter = triton::server::GetEnvironmentVariableOrDefault(
"TRITON_OPENTELEMETRY_TEST", "false");
if (test_exporter != "false") {
exporter =
opentelemetry::exporter::trace::OStreamSpanExporterFactory::Create();
}
auto processor =
otel_trace_sdk::SimpleSpanProcessorFactory::Create(std::move(exporter));
auto resource = otel_resource::Resource::Create(attributes);
std::shared_ptr<otel_trace_api::TracerProvider> provider =
otel_trace_sdk::TracerProviderFactory::Create(
std::move(processor), resource);

otel_trace_api::Provider::SetTracerProvider(provider);
#endif
}

void
TraceManager::CleanupTracer()
{
if (global_setting_->mode_ == TRACE_MODE_TRITON) {
return;
}
switch (global_setting_->mode_) {
case TRACE_MODE_OPENTELEMETRY: {
#if !defined(_WIN32) && defined(TRITON_ENABLE_TRACING)
std::shared_ptr<otel_trace_api::TracerProvider> none;
otel_trace_api::Provider::SetTracerProvider(none);
std::shared_ptr<otel_trace_api::TracerProvider> none;
otel_trace_api::Provider::SetTracerProvider(none);
break;
#else
LOG_ERROR << "Unsupported trace mode: "
<< TraceManager::InferenceTraceModeString(
global_setting_->mode_);
break;
#endif
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like you should subclass TraceManager for each mode.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate, please?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was mentioned in a previous PR: assuming the mode will not be changed at runtime, you can have

class OpenTelemetryTraceManager : public TraceManager;

which will be created if the mode is OpenTelemetry, so there is no runtime check or branching based on the selected mode.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I can introduce it in a refactoring PR.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For reference: all refactoring suggestions are tracked under this ticket: https://jirasw.nvidia.com/browse/DLIS-4803

}
default:
return;
}
}

#ifndef _WIN32
Expand Down