TensorRT EP - timing cache #14767

Merged on Mar 10, 2023 (35 commits)

Changes from all commits

Commits
3ba51a2
add timing cache
chilo-ms Dec 23, 2021
f905d35
enable timing cache for test
chilo-ms Jan 6, 2022
086ba0e
Make it only on Linux
chilo-ms Jan 10, 2022
191424b
undo last commit
chilo-ms Jan 14, 2022
d08ea41
Merge branch 'master' into trt_timing_cache
chilo-ms Jan 14, 2022
4fe5a0a
add 'timing_cache_enable' tensorrt provider options
chilo-ms Jan 14, 2022
83e251d
fix bug
chilo-ms Jan 15, 2022
5b52f63
fix bug
chilo-ms Jan 15, 2022
4ff502d
revert modification
chilo-ms Jan 15, 2022
2dd3194
small modification
chilo-ms Jan 15, 2022
8d75a70
remove instrumentation code for recording engine build latency
chilo-ms Jan 15, 2022
58e37fc
add timing_cache_enable as additional member of internal TensorRT pro…
chilo-ms Jan 15, 2022
b707c65
fix warning
chilo-ms Jan 17, 2022
48ecbeb
enable trt timing cache for model tests
chilo-ms Jan 28, 2022
e248193
Merge branch 'master' into trt_timing_cache
chilo-ms Jan 28, 2022
9dc0d16
enable timing cache for model tests
chilo-ms Jan 29, 2022
0f378b1
change pool
chilo-ms Jan 29, 2022
72b7645
change back previous pool
chilo-ms Jan 29, 2022
2c5ac28
change path of trt_timing_cache
chilo-ms Jan 31, 2022
93e61f0
refactor code
chilo-ms Feb 4, 2022
c9813c2
Add test cases for timing cache
chilo-ms Feb 4, 2022
e69723e
fix bug
chilo-ms Feb 4, 2022
c371a6d
fix bug for CI
chilo-ms Feb 5, 2022
8da55b5
Merge branch 'master' into trt_timing_cache
chilo-ms Feb 5, 2022
e513740
fix bug
chilo-ms Feb 5, 2022
4a45c30
fix bug
chilo-ms Feb 5, 2022
be0da22
Merge branch 'trt_timing_cache'
gedoensmax Feb 1, 2023
e38556a
timing cache test
gedoensmax Feb 8, 2023
bf0b880
append compute capability to cache and add force option
gedoensmax Feb 22, 2023
244b437
Take timing of first warm up inference
gedoensmax Feb 22, 2023
0db18d8
Merge branch 'main' into tensorrt_timing_cache
gedoensmax Feb 22, 2023
5db55ff
detailed build log option
gedoensmax Feb 27, 2023
6b0be1d
format changes and adding force timing cache to provider options
gedoensmax Mar 1, 2023
c1c3f71
fix pybind OrtTensorRTProviderOptionsV2
gedoensmax Mar 6, 2023
b888fc3
reset OrtTensorRTProviderOptions
gedoensmax Mar 8, 2023
2 changes: 1 addition & 1 deletion cmake/external/onnxruntime_external_deps.cmake
@@ -455,7 +455,7 @@ if (onnxruntime_USE_CUDA)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/x64/lib64)
else()
if(onnxruntime_CUDNN_HOME)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib64)
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDNN_HOME}/lib ${onnxruntime_CUDNN_HOME}/lib64)
endif()
list(APPEND onnxruntime_LINK_DIRS ${onnxruntime_CUDA_HOME}/lib64)
endif()
@@ -5,7 +5,7 @@

/// <summary>
/// Options for the TensorRT provider that are passed to SessionOptionsAppendExecutionProvider_TensorRT_V2.
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
/// Please note that this struct is *similar* to OrtTensorRTProviderOptions but only to be used internally.
/// Going forward, new trt provider options are to be supported via this struct and usage of the publicly defined
/// OrtTensorRTProviderOptions will be deprecated over time.
/// User can only get the instance of OrtTensorRTProviderOptionsV2 via CreateTensorRTProviderOptions.
@@ -31,4 +31,7 @@ struct OrtTensorRTProviderOptionsV2 {
int trt_force_sequential_engine_build; // force building TensorRT engine sequentially. Default 0 = false, nonzero = true
int trt_context_memory_sharing_enable; // enable context memory sharing between subgraphs. Default 0 = false, nonzero = true
int trt_layer_norm_fp32_fallback; // force Pow + Reduce ops in layer norm to FP32. Default 0 = false, nonzero = true
int trt_timing_cache_enable; // enable TensorRT timing cache. Default 0 = false, nonzero = true
int trt_force_timing_cache; // force the TensorRT cache to be used even if device profile does not match. Default 0 = false, nonzero = true
int trt_detailed_build_log; // Enable detailed build step logging on TensorRT EP with timing for each engine build. Default 0 = false, nonzero = true
};
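For orientation (not part of this diff): a minimal sketch of how the new fields are expected to be toggled through the string-keyed V2 provider-options API, using the trt_timing_cache_enable and trt_force_timing_cache_match keys referenced elsewhere in this PR. The session setup and cache directory are illustrative assumptions, and error handling is elided.

#include <onnxruntime_c_api.h>

// Sketch only: appends the TensorRT EP with the timing cache enabled.
void AppendTensorrtWithTimingCache(OrtSessionOptions* session_options) {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  api->CreateTensorRTProviderOptions(&trt_options);

  // Keys match tensorrt::provider_option_names used in this PR; "trt_cache" is an example directory.
  const char* keys[] = {"trt_engine_cache_path", "trt_timing_cache_enable", "trt_force_timing_cache_match"};
  const char* values[] = {"trt_cache", "1", "0"};
  api->UpdateTensorRTProviderOptions(trt_options, keys, values, 3);

  api->SessionOptionsAppendExecutionProvider_TensorRT_V2(session_options, trt_options);
  api->ReleaseTensorRTProviderOptions(trt_options);
}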
142 changes: 137 additions & 5 deletions onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -117,6 +117,32 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_map<s
}
return true;
}

inline std::vector<char> loadTimingCacheFile(const std::string inFileName) {
std::ifstream iFile(inFileName, std::ios::in | std::ios::binary);
if (!iFile) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not read timing cache from: " << inFileName
<< ". A new timing cache will be generated and written.";
return std::vector<char>();
}
iFile.seekg(0, std::ifstream::end);
size_t fsize = iFile.tellg();
iFile.seekg(0, std::ifstream::beg);
std::vector<char> content(fsize);
iFile.read(content.data(), fsize);
iFile.close();
return content;
}

inline void saveTimingCacheFile(const std::string outFileName, const nvinfer1::IHostMemory* blob) {
std::ofstream oFile(outFileName, std::ios::out | std::ios::binary);
if (!oFile) {
LOGS_DEFAULT(WARNING) << "[TensorRT EP] Could not write timing cache to: " << outFileName;
return;
}
oFile.write((char*)blob->data(), blob->size());
oFile.close();
}
} // namespace
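
To make the flow below easier to follow, here is a condensed sketch (not part of the diff) of the timing-cache round trip these helpers support, using the same nvinfer1 calls that appear in the engine-build paths further down. Names are illustrative and error handling is abbreviated.

// Condensed sketch of the load -> build -> save round trip.
void BuildWithTimingCache(nvinfer1::IBuilder& builder,
                          nvinfer1::INetworkDefinition& network,
                          nvinfer1::IBuilderConfig& config,
                          const std::string& timing_cache_path,
                          bool force_match) {
  // A missing file yields an empty blob and therefore a fresh cache.
  std::vector<char> blob = loadTimingCacheFile(timing_cache_path);
  std::unique_ptr<nvinfer1::ITimingCache> cache{
      config.createTimingCache(static_cast<const void*>(blob.data()), blob.size())};
  config.setTimingCache(*cache, force_match);  // force_match skips TensorRT's device-profile check

  // The caller would keep/serialize the engine; it is built here only to populate the cache.
  std::unique_ptr<nvinfer1::ICudaEngine> engine{builder.buildEngineWithConfig(network, config)};

  // The build may add new tactic timings; persist the updated cache for the next build.
  std::unique_ptr<nvinfer1::IHostMemory> serialized{config.getTimingCache()->serialize()};
  saveTimingCacheFile(timing_cache_path, serialized.get());
}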

namespace google {
@@ -312,7 +338,10 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
}
dump_subgraphs_ = info.dump_subgraphs;
engine_cache_enable_ = info.engine_cache_enable;
if (engine_cache_enable_ || int8_enable_) {
timing_cache_enable_ = info.timing_cache_enable;
force_timing_cache_match_ = info.force_timing_cache;
detailed_build_log_ = info.detailed_build_log;
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
cache_path_ = info.engine_cache_path;
}
engine_decryption_enable_ = info.engine_decryption_enable;
@@ -386,7 +415,22 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
engine_cache_enable_ = (std::stoi(engine_cache_enable_env) == 0 ? false : true);
}

if (engine_cache_enable_ || int8_enable_) {
const std::string timing_cache_enable_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kTimingCacheEnable);
if (!timing_cache_enable_env.empty()) {
timing_cache_enable_ = (std::stoi(timing_cache_enable_env) == 0 ? false : true);
}

const std::string detailed_build_log_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kDetailedBuildLog);
if (!detailed_build_log_env.empty()) {
detailed_build_log_ = (std::stoi(detailed_build_log_env) == 0 ? false : true);
}

const std::string timing_force_match_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kForceTimingCache);
if (!timing_force_match_env.empty()) {
force_timing_cache_match_ = (std::stoi(timing_force_match_env) == 0 ? false : true);
}

if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
const std::string engine_cache_path = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kEngineCachePath);
cache_path_ = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kCachePath);
if (!engine_cache_path.empty() && cache_path_.empty()) {
@@ -438,7 +482,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
dla_core_ = 0;
}

if (engine_cache_enable_ || int8_enable_) {
if (engine_cache_enable_ || int8_enable_ || timing_cache_enable_) {
if (!cache_path_.empty() && !fs::is_directory(cache_path_)) {
if (!fs::create_directory(cache_path_)) {
throw std::runtime_error("Failed to create directory " + cache_path_);
@@ -1373,6 +1417,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
if (!has_dynamic_shape) {
const std::string cache_path = GetCachePath(cache_path_, trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
std::string timing_cache_path = "";
if (timing_cache_enable_) {
cudaDeviceProp prop;
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
timing_cache_path = GetTimingCachePath(cache_path_, prop);
}
{
// ifstream file check, engine serialization/deserialization and engine build are in critical section. It needs lock protection to prevent race condition when inferencing with multithreading.
auto lock = GetApiLock();
@@ -1419,12 +1469,36 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
}
}

// Load timing cache from file. Create a fresh cache if the file doesn't exist
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
if (timing_cache_enable_) {
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
if (timing_cache == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not create timing cache: " + timing_cache_path);
}
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
}
}

// Build engine
std::chrono::steady_clock::time_point engine_build_start;
if (detailed_build_log_) {
engine_build_start = std::chrono::steady_clock::now();
}
trt_engine = std::unique_ptr<nvinfer1::ICudaEngine>(trt_builder->buildEngineWithConfig(*trt_network, *trt_config));
if (trt_engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not build engine for fused node: " + fused_node.Name());
}
if (detailed_build_log_) {
auto engine_build_stop = std::chrono::steady_clock::now();
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_node_name_with_precision << " took: " <<
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
}
if (engine_cache_enable_) {
std::unique_ptr<nvinfer1::IHostMemory> serializedModel(trt_engine->serialize());
size_t engine_size = serializedModel->size();
@@ -1438,7 +1512,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
std::ofstream file(engine_cache_path, std::ios::binary | std::ios::out);
file.write(reinterpret_cast<char*>(serializedModel->data()), engine_size);
}
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized " + engine_cache_path;
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized engine " + engine_cache_path;
}
// serialize and save timing cache
if (timing_cache_enable_) {
auto timing_cache = trt_config->getTimingCache();
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
if (timingCacheHostData == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
}
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
}
}
}
}
@@ -1504,7 +1591,8 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
input_shape_ranges_[context->node_name], &tensorrt_mu_, fp16_enable_, int8_enable_, int8_calibration_cache_available_,
dla_enable_, dla_core_, &max_workspace_size_, trt_node_name_with_precision, engine_cache_enable_, cache_path_,
runtime_.get(), nullptr, allocator_, context_memory_sharing_enable_, &max_ctx_mem_size_, &context_memory_,
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_};
dynamic_range_map, engine_decryption_enable_, engine_decryption_, engine_encryption_, timing_cache_enable_,
force_timing_cache_match_, detailed_build_log_};
*state = p.release();
return 0;
};
@@ -1545,6 +1633,12 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
const std::string cache_path = GetCachePath(trt_state->engine_cache_path, trt_state->trt_node_name_with_precision);
const std::string engine_cache_path = cache_path + ".engine";
const std::string profile_cache_path = cache_path + ".profile";
std::string timing_cache_path = "";
if (timing_cache_enable_) {
cudaDeviceProp prop;
CUDA_CALL_THROW(cudaGetDeviceProperties(&prop, device_id_));
timing_cache_path = GetTimingCachePath(cache_path_, prop);
}
if (trt_state->engine_cache_enable && trt_engine == nullptr) {
std::ifstream engine_file(engine_cache_path, std::ios::binary | std::ios::in);
std::ifstream profile_file(profile_cache_path, std::ios::binary | std::ios::in);
@@ -1779,11 +1873,35 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
trt_config->setDLACore(trt_state->dla_core);
}

// Load timing cache from file. Create a fresh cache if the file doesn't exist
std::unique_ptr<nvinfer1::ITimingCache> timing_cache = nullptr;
if (trt_state->timing_cache_enable) {
std::vector<char> loaded_timing_cache = loadTimingCacheFile(timing_cache_path);
timing_cache.reset(trt_config->createTimingCache(static_cast<const void*>(loaded_timing_cache.data()), loaded_timing_cache.size()));
if (timing_cache == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not create timing cache: " + timing_cache_path);
}
trt_config->setTimingCache(*timing_cache, force_timing_cache_match_);
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Deserialized timing cache from " + timing_cache_path;
}
}

// Build engine
{
auto lock = GetApiLock();
std::chrono::steady_clock::time_point engine_build_start;
if (detailed_build_log_) {
engine_build_start = std::chrono::steady_clock::now();
}
*(trt_state->engine) = std::unique_ptr<nvinfer1::ICudaEngine>(
trt_builder->buildEngineWithConfig(*trt_state->network->get(), *trt_config));
if (detailed_build_log_) {
auto engine_build_stop = std::chrono::steady_clock::now();
LOGS_DEFAULT(INFO) << "TensorRT engine build for " << trt_state->trt_node_name_with_precision << " took: " <<
std::chrono::duration_cast<std::chrono::milliseconds>(engine_build_stop - engine_build_start).count() << "ms" << std::endl;
}
}
if (trt_state->engine == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP Failed to Build Engine.");
@@ -1809,6 +1927,20 @@ common::Status TensorrtExecutionProvider::Compile(const std::vector<FusedNodeAnd
}
}

// serialize and save timing cache
if (trt_state->timing_cache_enable) {
auto timing_cache = trt_config->getTimingCache();
std::unique_ptr<nvinfer1::IHostMemory> timingCacheHostData{timing_cache->serialize()};
if (timingCacheHostData == nullptr) {
return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"TensorRT EP could not serialize timing cache: " + timing_cache_path);
}
saveTimingCacheFile(timing_cache_path, timingCacheHostData.get());
if (detailed_build_log_) {
LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Serialized timing cache " + timing_cache_path;
}
}

// Build context
if (trt_state->context_memory_sharing_enable) {
*(trt_state->context) = std::unique_ptr<nvinfer1::IExecutionContext>(
@@ -30,6 +30,9 @@ static const std::string kDecryptionLibPath = "ORT_TENSORRT_ENGINE_DECRYPTION_LI
static const std::string kForceSequentialEngineBuild = "ORT_TENSORRT_FORCE_SEQUENTIAL_ENGINE_BUILD";
static const std::string kContextMemorySharingEnable = "ORT_TENSORRT_CONTEXT_MEMORY_SHARING_ENABLE";
static const std::string kLayerNormFP32Fallback = "ORT_TENSORRT_LAYER_NORM_FP32_FALLBACK";
static const std::string kTimingCacheEnable = "ORT_TENSORRT_TIMING_CACHE_ENABLE";
static const std::string kForceTimingCache = "ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE";
static const std::string kDetailedBuildLog = "ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE";
// Old env variable for backward compatibility
static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH";
} // namespace tensorrt_env_vars
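
As a hedged illustration (not part of this diff), the same switches can also be set through the environment before the provider is constructed, since the constructor reads them via onnxruntime::GetEnvironmentVar. The cache directory variable below is assumed to be the pre-existing ORT_TENSORRT_CACHE_PATH used for engine caching.

#include <cstdlib>

// POSIX-only sketch; these must be set before the session / EP is created.
void EnableTrtTimingCacheViaEnv() {
  setenv("ORT_TENSORRT_TIMING_CACHE_ENABLE", "1", /*overwrite=*/1);
  setenv("ORT_TENSORRT_FORCE_TIMING_CACHE_ENABLE", "0", 1);
  setenv("ORT_TENSORRT_DETAILED_BUILD_LOG_ENABLE", "1", 1);
  setenv("ORT_TENSORRT_CACHE_PATH", "trt_cache", 1);  // timing cache file is written under this directory
}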
@@ -114,6 +117,9 @@ struct TensorrtFuncState {
bool engine_decryption_enable = false;
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable = true;
bool force_timing_cache = false;
bool detailed_build_log = false;
};

// Logical device representation.
@@ -176,6 +182,9 @@ class TensorrtExecutionProvider : public IExecutionProvider {
bool engine_decryption_enable_ = false;
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
bool timing_cache_enable_ = false;
bool force_timing_cache_match_ = false;
bool detailed_build_log_ = false;

std::unordered_set<std::string> control_flow_op_set_ = {"If", "Loop", "Scan"};
std::unordered_map<std::string, tensorrt_ptr::unique_pointer<nvonnxparser::IParser>> parsers_;
@@ -27,11 +27,14 @@ constexpr const char* kCachePath = "trt_engine_cache_path"
constexpr const char* kDecryptionEnable = "trt_engine_decryption_enable";
constexpr const char* kDecryptionLibPath = "trt_engine_decryption_lib_path";
constexpr const char* kForceSequentialEngineBuild = "trt_force_sequential_engine_build";
// add new provider option name here.
// add new provider option name here.
constexpr const char* kContextMemorySharingEnable = "trt_context_memory_sharing_enable";
constexpr const char* kLayerNormFP32Fallback = "trt_layer_norm_fp32_fallback";
constexpr const char* kTimingCacheEnable = "trt_timing_cache_enable";
constexpr const char* kForceTimingCacheMatch = "trt_force_timing_cache_match";
constexpr const char* kDetailedBuildLog = "trt_detailed_build_log";
} // namespace provider_option_names
} // namespace tensorrt
} // namespace tensorrt

TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions(const ProviderOptions& options) {
TensorrtExecutionProviderInfo info{};
@@ -57,15 +60,17 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8CalibTable, info.int8_calibration_table_name)
.AddAssignmentToReference(tensorrt::provider_option_names::kInt8UseNativeCalibTable, info.int8_use_native_calibration_table)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLAEnable, info.dla_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
.AddAssignmentToReference(tensorrt::provider_option_names::kDLACore, info.dla_core)
.AddAssignmentToReference(tensorrt::provider_option_names::kDumpSubgraphs, info.dump_subgraphs)
.AddAssignmentToReference(tensorrt::provider_option_names::kEngineCacheEnable, info.engine_cache_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kCachePath, info.engine_cache_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionEnable, info.engine_decryption_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kDecryptionLibPath, info.engine_decryption_lib_path)
.AddAssignmentToReference(tensorrt::provider_option_names::kForceSequentialEngineBuild, info.force_sequential_engine_build)
.AddAssignmentToReference(tensorrt::provider_option_names::kContextMemorySharingEnable, info.context_memory_sharing_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kLayerNormFP32Fallback, info.layer_norm_fp32_fallback)
.AddAssignmentToReference(tensorrt::provider_option_names::kTimingCacheEnable, info.timing_cache_enable)
.AddAssignmentToReference(tensorrt::provider_option_names::kForceTimingCacheMatch, info.force_timing_cache)
.Parse(options)); // add new provider option here.

return info;
@@ -93,6 +98,7 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE
// add new provider option here.
{tensorrt::provider_option_names::kContextMemorySharingEnable, MakeStringWithClassicLocale(info.context_memory_sharing_enable)},
{tensorrt::provider_option_names::kLayerNormFP32Fallback, MakeStringWithClassicLocale(info.layer_norm_fp32_fallback)},
{tensorrt::provider_option_names::kTimingCacheEnable, MakeStringWithClassicLocale(info.timing_cache_enable)},
};
return options;
}
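
A short, test-style sketch (include path and surrounding code are assumptions, not taken from this PR) of how the new string keys round-trip through this parser:

#include "core/providers/tensorrt/tensorrt_execution_provider_info.h"

// Intended only to show the key-to-field mapping added above.
void TimingCacheOptionsRoundTrip() {
  onnxruntime::ProviderOptions options{
      {"trt_timing_cache_enable", "1"},
      {"trt_force_timing_cache_match", "0"},
  };
  auto info = onnxruntime::TensorrtExecutionProviderInfo::FromProviderOptions(options);
  // info.timing_cache_enable is now true and info.force_timing_cache false;
  // ToProviderOptions(info) emits trt_timing_cache_enable back out.
}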