Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabling nested spans for trace mode OpenTelemetry #5928

Merged
merged 21 commits into from
Jul 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/user_guide/trace.md
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,7 @@ The meaning of the trace timestamps is:
## OpenTelemetry trace support

Triton provides an option to generate and export traces
for standalone and ensemble models
using [OpenTelemetry APIs and SDKs](https://opentelemetry.io/).

To specify OpenTelemetry mode for tracing, specify the `--trace-config`
Expand Down
224 changes: 224 additions & 0 deletions qa/L0_trace/opentelemetry_unittest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
# Copyright 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import sys

sys.path.append("../common")
import json
import unittest
import tritonclient.http as httpclient
import tritonclient.grpc as grpcclient
import numpy as np
import test_util as tu
import time

EXPECTED_NUM_SPANS = 10

class OpenTelemetryTest(tu.TestResultCollector):

def setUp(self):
while True:
with open('trace_collector.log', 'rt') as f:
data = f.read()
if data.count("resource_spans") != EXPECTED_NUM_SPANS:
time.sleep(5)
continue
else:
break

data = data.split('\n')
full_spans = [entry.split('POST')[0] for entry in data if "resource_spans" in entry]
self.spans = []
for span in full_spans:
span = json.loads(span)
self.spans.append(
span["resource_spans"][0]['scope_spans'][0]['spans'][0])

self.simple_model_name = "simple"
self.ensemble_model_name = "ensemble_add_sub_int32_int32_int32"
self.root_span = "InferRequest"

def _check_events(self, span_name, events):
root_events_http =\
["HTTP_RECV_START",
"HTTP_RECV_END",
"INFER_RESPONSE_COMPLETE",
"HTTP_SEND_START",
"HTTP_SEND_END"]
root_events_grpc =\
["GRPC_WAITREAD_START",
"GRPC_WAITREAD_END",
"INFER_RESPONSE_COMPLETE",
"GRPC_SEND_START",
"GRPC_SEND_END"]
request_events =\
["REQUEST_START",
"QUEUE_START",
"REQUEST_END"]
compute_events =\
["COMPUTE_START",
"COMPUTE_INPUT_END",
"COMPUTE_OUTPUT_START",
"COMPUTE_END"]

if span_name == "compute":
# Check that all compute related events (and only them)
# are recorded in compute span
self.assertTrue(all(entry in events for entry in compute_events))
self.assertFalse(all(entry in events for entry in request_events))
self.assertFalse(
all(entry in events
for entry in root_events_http + root_events_grpc))

elif span_name == self.root_span:
# Check that root span has INFER_RESPONSE_COMPLETE, _RECV/_WAITREAD
# and _SEND events (and only them)
if "HTTP" in events:
self.assertTrue(
all(entry in events for entry in root_events_http))
self.assertFalse(
all(entry in events for entry in root_events_grpc))

elif "GRPC" in events:
self.assertTrue(
all(entry in events for entry in root_events_grpc))
self.assertFalse(
all(entry in events for entry in root_events_http))
self.assertFalse(
all(entry in events for entry in request_events))
self.assertFalse(
all(entry in events for entry in compute_events))

elif span_name == self.simple_model_name:
# Check that all request related events (and only them)
# are recorded in request span
self.assertTrue(all(entry in events for entry in request_events))
self.assertFalse(
all(entry in events
for entry in root_events_http + root_events_grpc))
self.assertFalse(all(entry in events for entry in compute_events))

def _check_parent(self, child_span, parent_span):
# Check that child and parent span have the same trace_id
# and child's `parent_span_id` is the same as parent's `span_id`
self.assertEqual(child_span['trace_id'], parent_span['trace_id'])
self.assertIn(
'parent_span_id',
child_span,
"child span does not have parent span id specified")
self.assertEqual(child_span['parent_span_id'], parent_span['span_id'])

def test_spans(self):
parsed_spans = []

# Check that collected spans have proper events recorded
for span in self.spans:
span_name = span['name']
self._check_events(span_name, json.dumps(span['events']))
parsed_spans.append(span_name)

# There should be 6 spans in total:
# 3 for http request, 3 for grpc request, 4 for ensemble
self.assertEqual(len(self.spans), 10)
# We should have 3 compute spans
self.assertEqual(parsed_spans.count("compute"), 3)
# 4 request spans (3 named simple - same as our model name, 1 ensemble)
self.assertEqual(parsed_spans.count(self.simple_model_name), 3)
self.assertEqual(parsed_spans.count(self.ensemble_model_name), 1)
# 3 root spans
self.assertEqual(parsed_spans.count(self.root_span), 3)

def test_nested_spans(self):

# First 3 spans in `self.spans` belong to HTTP request
# They are recorded in the following order:
# compute_span [idx 0] , request_span [idx 1], root_span [idx 2].
# compute_span should be a child of request_span
# request_span should be a child of root_span
for child, parent in zip(self.spans[:3], self.spans[1:3]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
'parent_span_id',
self.spans[2],
"root span has a parent_span_id specified")

# Next 3 spans in `self.spans` belong to GRPC request
# Order of spans and their relationship described earlier
for child, parent in zip(self.spans[3:6], self.spans[4:6]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
'parent_span_id',
self.spans[5],
"root span has a parent_span_id specified")

# Final 4 spans in `self.spans` belong to ensemble request
# Order of spans: compute span - request span - request span - root span
for child, parent in zip(self.spans[6:10], self.spans[7:10]):
self._check_parent(child, parent)

# root_span should not have `parent_span_id` field
self.assertNotIn(
'parent_span_id',
self.spans[9],
"root span has a parent_span_id specified")

def prepare_data(client):

inputs = []
input0_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.int32)
input1_data = np.full(shape=(1, 16), fill_value=-1, dtype=np.int32)

inputs.append(client.InferInput('INPUT0', [1, 16], "INT32"))
inputs.append(client.InferInput('INPUT1', [1, 16], "INT32"))

# Initialize the data
inputs[0].set_data_from_numpy(input0_data)
inputs[1].set_data_from_numpy(input1_data)

return inputs

def prepare_traces():

triton_client_http = httpclient.InferenceServerClient("localhost:8000",
verbose=True)
triton_client_grpc = grpcclient.InferenceServerClient("localhost:8001",
verbose=True)
inputs = prepare_data(httpclient)
triton_client_http.infer("simple",inputs)

inputs = prepare_data(grpcclient)
triton_client_grpc.infer("simple", inputs)

inputs = prepare_data(httpclient)
triton_client_http.infer("ensemble_add_sub_int32_int32_int32", inputs)


if __name__ == '__main__':
unittest.main()
101 changes: 46 additions & 55 deletions qa/L0_trace/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,19 @@ rm -f *.log
rm -fr $MODELSDIR && mkdir -p $MODELSDIR

# set up simple and global_simple model using MODELBASE
rm -fr $MODELSDIR && mkdir -p $MODELSDIR && \
cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
cp -r $DATADIR/$MODELBASE $MODELSDIR/simple && \
rm -r $MODELSDIR/simple/2 && rm -r $MODELSDIR/simple/3 && \
(cd $MODELSDIR/simple && \
sed -i "s/^name:.*/name: \"simple\"/" config.pbtxt) && \
cp -r $MODELSDIR/simple $MODELSDIR/global_simple && \
(cd $MODELSDIR/global_simple && \
sed -i "s/^name:.*/name: \"global_simple\"/" config.pbtxt) && \
cp -r $ENSEMBLEDIR/simple_onnx_int32_int32_int32 $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \
rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/2 && \
rm -r $MODELSDIR/ensemble_add_sub_int32_int32_int32/3 && \
(cd $MODELSDIR/ensemble_add_sub_int32_int32_int32 && \
sed -i "s/^name:.*/name: \"ensemble_add_sub_int32_int32_int32\"/" config.pbtxt && \
sed -i "s/model_name:.*/model_name: \"simple\"/" config.pbtxt)

RET=0

Expand Down Expand Up @@ -653,82 +658,68 @@ set -e
kill $SERVER_PID
wait $SERVER_PID

set +e

# Check opentelemetry trace exporter sends proper info.
# A helper python script starts listenning on $OTLP_PORT, where
# OTLP exporter sends traces. It then check that received data contains
# expected entries
# OTLP exporter sends traces.
# Unittests then check that produced spans have expected format and events
# FIXME: Redesign this test to remove time sensitivity
SERVER_ARGS="--trace-config=triton,file=some_file.log --trace-config=level=TIMESTAMPS \
--trace-config=rate=1 --trace-config=count=6 --trace-config=mode=opentelemetry --trace-config=opentelemetry,url=localhost:$OTLP_PORT --model-repository=$MODELSDIR"

OPENTELEMETRY_TEST=opentelemetry_unittest.py
OPENTELEMETRY_LOG="opentelemetry_unittest.log"
EXPECTED_NUM_TESTS="2"

SERVER_ARGS="--trace-config=level=TIMESTAMPS --trace-config=rate=1 \
--trace-config=count=100 --trace-config=mode=opentelemetry \
--trace-config=opentelemetry,url=localhost:$OTLP_PORT \
--model-repository=$MODELSDIR"
SERVER_LOG="./inference_server_trace_config.log"

# Increasing OTLP timeout, since we don't use a valid OTLP collector
# and don't send a proper signal back.
export OTEL_EXPORTER_OTLP_TIMEOUT=50000
export OTEL_EXPORTER_OTLP_TRACES_TIMEOUT=50000

run_server
if [ "$SERVER_PID" == "0" ]; then
echo -e "\n***\n*** Failed to start $SERVER\n***"
cat $SERVER_LOG
exit 1
fi

# This is a simple python code that opens port
python $TRACE_COLLECTOR $OTLP_PORT $TRACE_COLLECTOR_LOG &
COLLECTOR_PID=$!
# Using netcat as trace collector
apt-get update && apt-get install -y netcat
nc -l -k 127.0.0.1 $OTLP_PORT >> $TRACE_COLLECTOR_LOG 2>&1 & COLLECTOR_PID=$!

set +e

# To make sure receiver is ready log gets all data
sleep 3

# Send http request and collect trace
$SIMPLE_HTTP_CLIENT >> client_update.log 2>&1
if [ $? -ne 0 ]; then
cat client_update.log
RET=1
fi

# Send grpc request and collect trace
$SIMPLE_GRPC_CLIENT >> client_update.log 2>&1
# Preparing traces for unittest.
# Note: need to run this separately, to speed up trace collection.
# Otherwise internal (opentelemetry_unittest.OpenTelemetryTest.setUp) check
# will slow down collection.
python -c 'import opentelemetry_unittest; \
opentelemetry_unittest.prepare_traces()' >>$CLIENT_LOG 2>&1

# Unittest will not start untill expected number of spans is collected.
python $OPENTELEMETRY_TEST >>$OPENTELEMETRY_LOG 2>&1
if [ $? -ne 0 ]; then
cat client_update.log
cat $OPENTELEMETRY_LOG
RET=1
else
check_test_results $TEST_RESULT_FILE $EXPECTED_NUM_TESTS
if [ $? -ne 0 ]; then
cat $OPENTELEMETRY_LOG
echo -e "\n***\n*** Test Result Verification Failed\n***"
RET=1
fi
fi
# To make sure log gets all data
sleep 3

kill $COLLECTOR_PID
wait $COLLECTOR_PID

EXPECTED_ENTRIES=${EXPECTED_ENTRIES:="REQUEST_START QUEUE_START INFER_RESPONSE_COMPLETE COMPUTE_START COMPUTE_INPUT_END COMPUTE_OUTPUT_START COMPUTE_END REQUEST_END"}
HTTP_ENTRIES=${HTTP_ENTRIES:="HTTP_RECV_START HTTP_RECV_END HTTP_SEND_START HTTP_SEND_END"}
GRPC_ENTRIES=${GRPC_ENTRIES:="GRPC_WAITREAD_START GRPC_WAITREAD_END GRPC_SEND_START GRPC_SEND_END"}

for ENTRY in $EXPECTED_ENTRIES; do
if [ `grep -c $ENTRY $TRACE_COLLECTOR_LOG` != "2" ]; then
RET=1
fi
done

for ENTRY in $HTTP_ENTRIES; do
if [ `grep -c $ENTRY $TRACE_COLLECTOR_LOG` != "1" ]; then
RET=1
fi
done

for ENTRY in $GRPC_ENTRIES; do
if [ `grep -c $ENTRY $TRACE_COLLECTOR_LOG` != "1" ]; then
RET=1
fi
done

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
fi

set -e

kill $SERVER_PID
wait $SERVER_PID

set +e

exit $RET
Loading