Skip to content

Commit

Permalink
Revised according to comments, refactored test, added ensemble check
Browse files Browse the repository at this point in the history
  • Loading branch information
oandreeva-nv committed Jul 6, 2023
1 parent dcea33a commit cee5738
Show file tree
Hide file tree
Showing 6 changed files with 110 additions and 131 deletions.
1 change: 1 addition & 0 deletions docs/user_guide/trace.md
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ The meaning of the trace timestamps is:
## OpenTelemetry trace support

Triton provides an option to generate and export traces
for standalone and ensemble models
using [OpenTelemetry APIs and SDKs](https://opentelemetry.io/).

To specify OpenTelemetry mode for tracing, specify the `--trace-config`
Expand Down
89 changes: 71 additions & 18 deletions qa/L0_trace/opentelemetry_unittest.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,28 +27,38 @@
import sys

sys.path.append("../common")

import json
import unittest

import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
import numpy as np
import test_util as tu
import time

EXPECTED_NUM_SPANS = 10

class OpenTelemetryTest(tu.TestResultCollector):

def setUp(self):
with open('trace_collector.log', 'rt') as f:
data = f.read()

while True:
with open('trace_collector.log', 'rt') as f:
data = f.read()
if data.count("resource_spans") != EXPECTED_NUM_SPANS:
time.sleep(5)
continue
else:
break

data = data.split('\n')
full_spans = [entry for entry in data if "resource_spans" in entry]
full_spans = [entry.split('POST')[0] for entry in data if "resource_spans" in entry]
self.spans = []
for span in full_spans:
span = json.loads(span)
self.spans.append(
span["resource_spans"][0]['scope_spans'][0]['spans'][0])

self.model_name = "simple"
self.simple_model_name = "simple"
self.ensemble_model_name = "ensemble_add_sub_int32_int32_int32"
self.root_span = "InferRequest"

def _check_events(self, span_name, events):
Expand Down Expand Up @@ -102,7 +112,7 @@ def _check_events(self, span_name, events):
self.assertFalse(
all(entry in events for entry in compute_events))

elif span_name == self.model_name:
elif span_name == self.simple_model_name:
# Check that all request related events (and only them)
# are recorded in request span
self.assertTrue(all(entry in events for entry in request_events))
Expand Down Expand Up @@ -131,14 +141,15 @@ def test_spans(self):
parsed_spans.append(span_name)

# There should be 6 spans in total:
# 3 for http request and 3 for grpc request.
self.assertEqual(len(self.spans), 6)
# We should have 2 compute spans
self.assertEqual(parsed_spans.count("compute"), 2)
# 2 request spans (named simple - same as our model name)
self.assertEqual(parsed_spans.count(self.model_name), 2)
# 2 root spans
self.assertEqual(parsed_spans.count(self.root_span), 2)
# 3 for http request, 3 for grpc request, 4 for ensemble
self.assertEqual(len(self.spans), 10)
# We should have 3 compute spans
self.assertEqual(parsed_spans.count("compute"), 3)
# 4 request spans (3 named simple - same as our model name, 1 ensemble)
self.assertEqual(parsed_spans.count(self.simple_model_name), 3)
self.assertEqual(parsed_spans.count(self.ensemble_model_name), 1)
# 3 root spans
self.assertEqual(parsed_spans.count(self.root_span), 3)

def test_nested_spans(self):

Expand All @@ -156,9 +167,9 @@ def test_nested_spans(self):
self.spans[2],
"root span has a parent_span_id specified")

# Last 3 spans in `self.spans` belong to GRPC request
# Next 3 spans in `self.spans` belong to GRPC request
# Order of spans and their relationship described earlier
for child, parent in zip(self.spans[3:], self.spans[4:]):
for child, parent in zip(self.spans[3:6], self.spans[4:6]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
Expand All @@ -167,6 +178,48 @@ def test_nested_spans(self):
self.spans[5],
"root span has a parent_span_id specified")

# Final 4 spans in `self.spans` belong to ensemble request
# Order of spans: compute span - request span - request span - root span
for child, parent in zip(self.spans[6:10], self.spans[7:10]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
'parent_span_id',
self.spans[9],
"root span has a parent_span_id specified")

def prepare_data(client):

inputs = []
outputs = []
input0_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.int32)
input1_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.int32)

inputs.append(client.InferInput('INPUT0', [1, 16], "INT32"))
inputs.append(client.InferInput('INPUT1', [1, 16], "INT32"))

# Initialize the data
inputs[0].set_data_from_numpy(input0_data)
inputs[1].set_data_from_numpy(input1_data)

return inputs

def prepare_traces():

triton_client_http = httpclient.InferenceServerClient("localhost:8000",
verbose=True)
triton_client_grpc = grpcclient.InferenceServerClient("localhost:8001",
verbose=True)
inputs = prepare_data(httpclient)
triton_client_http.infer("simple",inputs)

inputs = prepare_data(grpcclient)
triton_client_grpc.infer("simple", inputs)

inputs = prepare_data(httpclient)
triton_client_http.infer("ensemble_add_sub_int32_int32_int32", inputs)


if __name__ == '__main__':
unittest.main()
58 changes: 23 additions & 35 deletions qa/L0_trace/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,19 @@ rm -f *.log
rm -fr $MODELSDIR && mkdir -p $MODELSDIR

# set up simple and global_simple model using MODELBASE
rm -fr $MODELSDIR && mkdir -p $MODELSDIR && \
cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
rm -r $MODELSDIR/simple/2 && rm -r $MODELSDIR/simple/3 && \
(cd $MODELSDIR/simple && \
sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt) && \
cp -r $MODELSDIR/simple $MODELSDIR/global_simple && \
(cd $MODELSDIR/global_simple && \
sed -i "s/^name:.*/name: \"global_simple\"/" config.pbtxt) && \
cp -r $ENSEMBLEDIR/simple_onnx_int32_int32_int32 $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \
rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/2 && \
rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/3 && \
(cd $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \
sed -i "s/^name:.*/name: \"ensemble_add_sub_int32_int32_int32\"/" config.pbtxt && \
sed -i "s/model_name:.*/model_name: \"simple\"/" config.pbtxt)

RET=0

Expand Down Expand Up @@ -661,11 +666,12 @@ wait $SERVER_PID

OPENTELEMETRY_TEST=opentelemetry_unittest.py
OPENTELEMETRY_LOG="opentelemetry_unittest.log"
TEST_RESULT_FILE="opentelemetry_results.txt"
EXPECTED_NUM_TESTS="2"

SERVER_ARGS="--trace-config=triton,file=some_file.log --trace-config=level=TIMESTAMPS \
--trace-config=rate=1 --trace-config=count=6 --trace-config=mode=opentelemetry --trace-config=opentelemetry,url=localhost:$OTLP_PORT --model-repository=$MODELSDIR"
SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_trace_config.log"
run_server
if [ "$SERVER_PID" == "0" ]; then
Expand All @@ -674,37 +680,18 @@ if [ "$SERVER_PID" == "0" ]; then
exit 1
fi

# This is a simple python code that opens port
python $TRACE_COLLECTOR $OTLP_PORT $TRACE_COLLECTOR_LOG &
COLLECTOR_PID=$!

set +e

# To make sure receiver is ready
sleep 3

# Send http request and collect trace
$SIMPLE_HTTP_CLIENT >> client_update.log 2>&1
if [ $? -ne 0 ]; then
cat client_update.log
RET=1
fi

# Send grpc request and collect trace
$SIMPLE_GRPC_CLIENT >> client_update.log 2>&1
if [ $? -ne 0 ]; then
cat client_update.log
RET=1
fi
# To make sure log gets all data
sleep 3

kill $COLLECTOR_PID
wait $COLLECTOR_PID
# Using netcat as trace collector
apt-get update && apt-get install -y netcat
nc -l -k 127.0.0.1 $OTLP_PORT >> $TRACE_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

# Preparing traces for unittest.
# Note: need to run this separately, to speed up trace collection.
# Otherwise internal (opentelemetry_unittest.OpenTelemetryTest.setUp) check
# will slow down collection.
python -c 'import opentelemetry_unittest; opentelemetry_unittest.prepare_traces()' >>$CLIENT_LOG 2>&1

set +e

# Unittest will not start untill expected number of spans is collected.
python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
Expand All @@ -718,11 +705,12 @@ else
fi
fi

kill $COLLECTOR_PID
wait $COLLECTOR_PID

set -e

kill $SERVER_PID
wait $SERVER_PID

set +e

exit $RET
66 changes: 0 additions & 66 deletions qa/L0_trace/trace_collector.py

This file was deleted.

19 changes: 12 additions & 7 deletions src/tracer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -374,8 +374,8 @@ TraceManager::Trace::InitTracer(
std::chrono::steady_clock::now().time_since_epoch())
.count();
auto root_span =
StartSpan("InferRequest", steady_timestamp_ns, true /*is_root_span*/);
// Initializing OTel context and storring "InferRequest" span as a root span
StartSpan("InferRequest", steady_timestamp_ns);
// Initializing OTel context and storing "InferRequest" span as a root span
// to keep it alive for the duration of the request.
otel_context_ = opentelemetry::context::Context({kRootSpan, root_span});
}
Expand All @@ -391,6 +391,8 @@ TraceManager::Trace::StartSpan(
// and can not be a parent of any sub-span. If parent_id==0,
// then current model is either a standalone model, or an ensemble model.
// In both cases, the parent of the new request sub-span is the kRootSpan.
// A request span with trace id = `trace_id` is a parent of a compute span,
// started in the same trace.
// If parent_id > 0, then this is a child trace, spawned from
// the ensamble's main request. For this instance, the parent
// span is the ensembles's request span.
Expand All @@ -399,7 +401,11 @@ TraceManager::Trace::StartSpan(
TRITONSERVER_InferenceTraceParentId(trace, &parent_id),
"getting trace parent id");
std::string parent_span_key =
(parent_id != 0) ? kRequestSpan + std::to_string(parent_id) : kRootSpan;
(parent_id == 0 && activity == TRITONSERVER_TRACE_REQUEST_START)
? kRootSpan
: (activity == TRITONSERVER_TRACE_REQUEST_START)
? kRequestSpan + std::to_string(parent_id)
: kRequestSpan + std::to_string(trace_id);

std::string display_name = "compute";
const char* model_name;
Expand All @@ -410,8 +416,7 @@ TraceManager::Trace::StartSpan(
display_name = model_name;
}

auto span = StartSpan(
display_name, timestamp_ns, false /*is_root_span*/, parent_span_key);
auto span = StartSpan(display_name, timestamp_ns, parent_span_key);

if (activity == TRITONSERVER_TRACE_REQUEST_START) {
int64_t model_version;
Expand All @@ -430,7 +435,7 @@ TraceManager::Trace::StartSpan(
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>
TraceManager::Trace::StartSpan(
std::string display_name, const uint64_t& raw_timestamp_ns,
bool is_root_span, std::string parent_span_key)
std::string parent_span_key)
{
otel_trace_api::StartSpanOptions options;
options.kind = otel_trace_api::SpanKind::kServer;
Expand All @@ -441,7 +446,7 @@ TraceManager::Trace::StartSpan(

// If the new span is a child span, we need to retrieve its parent from
// the context and provide it through StartSpanOptions to the child span
if (!is_root_span && otel_context_.HasKey(parent_span_key)) {
if (!parent_span_key.empty() && otel_context_.HasKey(parent_span_key)) {
auto parent_span = opentelemetry::nostd::get<
opentelemetry::nostd::shared_ptr<otel_trace_api::Span>>(
otel_context_.GetValue(parent_span_key));
Expand Down
8 changes: 3 additions & 5 deletions src/tracer.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,15 +250,13 @@ class TraceManager {
/// OpenTelemetry SystemTimestamp to display span on a timeline, and
/// OpenTelemetry SteadyTimestamp to calculate the duration on the span
/// with better precision.
/// \param is_root_span If true, a root span will be started,
/// i.e. with no parent span specified. If false, a new child span will
/// be started.
/// \param parent_span_key A span key, to find a parent span in the
/// OpenTelemetry context.
/// OpenTelemetry context. If empty, a root span will be started,
/// i.e. with no parent span specified.
/// \return A shared pointer to a newly created OpenTelemetry span.
opentelemetry::nostd::shared_ptr<otel_trace_api::Span> StartSpan(
std::string display_name, const uint64_t& raw_timestamp_ns,
bool is_root_span, std::string parent_span_key = "");
std::string parent_span_key = "");

/// Ends the provided span.
///
Expand Down

0 comments on commit cee5738

Please sign in to comment.