Skip to content

Commit

Permalink
Support specifying an execution provider in benchmark script (#10453)
Browse files: browse the repository at this point in the history
* Support specifying execution providers.

* Change default provider setting to None.

* Add support for bert_perf_test script.

* Fall back to the ROCm EP for the MIGraphX EP, and to the CUDA EP for the TensorRT EP.

* Assert that the fallback EPs are included in the session's providers.

* Add model class AutoModelForCausalLM and other minor updates.

Co-authored-by: Yao Zhang <[email protected]>
  • Loading branch information
zhangyaobit authored Feb 3, 2022
1 parent a405658 commit 239c6ad
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 22 deletions.
16 changes: 12 additions & 4 deletions onnxruntime/python/tools/transformers/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,13 +68,14 @@
from transformers import (AutoConfig, AutoTokenizer, AutoModel, GPT2Model, LxmertConfig)


def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
def run_onnxruntime(use_gpu, provider, model_names, model_class, precision, num_threads, batch_sizes, sequence_lengths,
repeat_times, input_counts, optimize_onnx, validate_onnx, cache_dir, onnx_dir, verbose, overwrite,
disable_ort_io_binding, use_raw_attention_mask, model_fusion_statistics, model_source):
import onnxruntime

results = []
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
if (use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()) and
('ROCMExecutionProvider' not in onnxruntime.get_available_providers())):
logger.error(
"Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
)
Expand Down Expand Up @@ -105,6 +106,7 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b

ort_session = create_onnxruntime_session(onnx_model_file,
use_gpu,
provider,
enable_all_optimization=True,
num_threads=num_threads,
verbose=verbose)
Expand Down Expand Up @@ -425,7 +427,13 @@ def parse_arguments():
default=os.path.join('.', 'onnx_models'),
help="Directory to store onnx models")

parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on cuda device")
parser.add_argument("-g", "--use_gpu", required=False, action="store_true", help="Run on gpu device")

parser.add_argument("--provider",
required=False,
type=str,
default=None,
help="Execution provider to use")

parser.add_argument(
"-p",
Expand Down Expand Up @@ -545,7 +553,7 @@ def main():
if enable_onnxruntime:
try:
use_raw_attention_mask = True
results += run_onnxruntime(args.use_gpu, args.models, args.model_class, args.precision, num_threads,
results += run_onnxruntime(args.use_gpu, args.provider, args.models, args.model_class, args.precision, num_threads,
args.batch_sizes, args.sequence_lengths, args.test_times, args.input_counts,
args.optimize_onnx, args.validate_onnx, args.cache_dir, args.onnx_dir,
args.verbose, args.overwrite, args.disable_ort_io_binding,
Expand Down
18 changes: 13 additions & 5 deletions onnxruntime/python/tools/transformers/benchmark_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def __str__(self):

def create_onnxruntime_session(onnx_model_path,
use_gpu,
provider=None,
enable_all_optimization=True,
num_threads=-1,
enable_profiling=False,
verbose=False,
use_dml=False):
verbose=False):
session = None
try:
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel, __version__ as onnxruntime_version
Expand All @@ -68,8 +68,16 @@ def create_onnxruntime_session(onnx_model_path,

logger.debug(f"Create session for onnx model: {onnx_model_path}")
if use_gpu:
if use_dml:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
else:
Expand All @@ -89,7 +97,7 @@ def setup_logger(verbose=True):
logging.getLogger("transformers").setLevel(logging.WARNING)


def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):
def prepare_environment(cache_dir, output_dir, use_gpu, provider=None):
if cache_dir and not os.path.exists(cache_dir):
os.makedirs(cache_dir)

Expand All @@ -98,7 +106,7 @@ def prepare_environment(cache_dir, output_dir, use_gpu, use_dml=False):

import onnxruntime
if use_gpu:
if use_dml:
if provider == 'dml':
assert 'DmlExecutionProvider' in onnxruntime.get_available_providers(
), "Please install onnxruntime-directml package to test GPU inference."

Expand Down
48 changes: 42 additions & 6 deletions onnxruntime/python/tools/transformers/bert_perf_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class TestSetting:
test_cases: int
test_times: int
use_gpu: bool
provider: str
intra_op_num_threads: int
seed: int
verbose: bool
Expand All @@ -50,7 +51,7 @@ class ModelSetting:
opt_level: int


def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
def create_session(model_path, use_gpu, provider, intra_op_num_threads, graph_optimization_level=None):
import onnxruntime

if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
Expand All @@ -61,8 +62,21 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
if intra_op_num_threads is None and graph_optimization_level is None:
session = onnxruntime.InferenceSession(model_path)
else:
execution_providers = ['CPUExecutionProvider'
] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
if use_gpu:
if provider == 'dml':
execution_providers = ['DmlExecutionProvider', 'CPUExecutionProvider']
elif provider == 'rocm':
execution_providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'migraphx':
execution_providers = ['MIGraphXExecutionProvider', 'ROCMExecutionProvider', 'CPUExecutionProvider']
elif provider == 'cuda':
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
elif provider == 'tensorrt':
execution_providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
else:
execution_providers = ['CPUExecutionProvider']

sess_options = onnxruntime.SessionOptions()
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
Expand All @@ -86,7 +100,23 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
session = onnxruntime.InferenceSession(model_path, sess_options, providers=execution_providers)

if use_gpu:
assert 'CUDAExecutionProvider' in session.get_providers()
if provider == 'dml':
assert 'DmlExecutionProvider' in session.get_providers()
elif provider == 'rocm':
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'migraphx':
assert 'MIGraphXExecutionProvider' in session.get_providers()
assert 'ROCMExecutionProvider' in session.get_providers()
elif provider == 'cuda':
assert 'CUDAExecutionProvider' in session.get_providers()
elif provider == 'tensorrt':
assert 'TensorrtExecutionProvider' in session.get_providers()
assert 'CUDAExecutionProvider' in session.get_providers()
else:
assert 'CUDAExecutionProvider' in session.get_providers()
else:
assert 'CPUExecutionProvider' in session.get_providers()

return session


Expand Down Expand Up @@ -117,7 +147,7 @@ def to_string(model_path, session, test_setting):


def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
session = create_session(model_setting.model_path, test_setting.use_gpu, test_setting.provider, intra_op_num_threads,
model_setting.opt_level)
output_names = [output.name for output in session.get_outputs()]

Expand Down Expand Up @@ -239,6 +269,12 @@ def parse_arguments():
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)

parser.add_argument("--provider",
required=False,
type=str,
default=None,
help="Execution provider to use")

parser.add_argument('-n',
'--intra_op_num_threads',
required=False,
Expand Down Expand Up @@ -276,7 +312,7 @@ def main():

for batch_size in batch_size_set:
test_setting = TestSetting(batch_size, args.sequence_length, args.samples, args.test_times, args.use_gpu,
args.intra_op_num_threads, args.seed, args.verbose)
args.provider, args.intra_op_num_threads, args.seed, args.verbose)

print("test setting", test_setting)
run_performance(model_setting, test_setting, perf_results)
Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/python/tools/transformers/huggingface_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@

# Maps model class name to a tuple of model class
MODEL_CLASSES = [
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering'
'AutoModel', 'AutoModelWithLMHead', 'AutoModelForSequenceClassification', 'AutoModelForQuestionAnswering',
'AutoModelForCausalLM',
]

# List of pretrained models: https://huggingface.co/transformers/pretrained_models.html
Expand Down
1 change: 1 addition & 0 deletions onnxruntime/python/tools/transformers/onnx_exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_
model_class_name = 'TF' + model_class_name

transformers_module = __import__("transformers", fromlist=[model_class_name])
logger.info(f"Model class name: {model_class_name}")
model_class = getattr(transformers_module, model_class_name)

return model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)
Expand Down
15 changes: 9 additions & 6 deletions onnxruntime/python/tools/transformers/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,11 @@ def parse_arguments(argv=None):
parser.add_argument('-g', '--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)

parser.add_argument('-d', '--use_dml', required=False, action='store_true', help="use DML")
parser.set_defaults(use_dml=False)
parser.add_argument('--provider',
required=False,
type=str,
default='cuda',
help="Execution provider to use")

parser.add_argument(
'--basic_optimization',
Expand All @@ -108,15 +111,15 @@ def parse_arguments(argv=None):
return parser.parse_args(argv)


def run_profile(onnx_model_path, use_gpu, basic_optimization, thread_num, all_inputs, use_dml):
def run_profile(onnx_model_path, use_gpu, provider, basic_optimization, thread_num, all_inputs):
from benchmark_helper import create_onnxruntime_session

session = create_onnxruntime_session(onnx_model_path,
use_gpu,
provider,
enable_all_optimization=not basic_optimization,
num_threads=thread_num,
enable_profiling=True,
use_dml=use_dml)
enable_profiling=True)

for inputs in all_inputs:
_ = session.run(None, inputs)
Expand Down Expand Up @@ -604,7 +607,7 @@ def run(args):
else: # default
all_inputs = create_dummy_inputs(onnx_model, args.batch_size, args.sequence_length, args.samples)

profile_file = run_profile(args.model, args.use_gpu, args.basic_optimization, args.thread_num, all_inputs, args.use_dml)
profile_file = run_profile(args.model, args.use_gpu, args.provider, args.basic_optimization, args.thread_num, all_inputs)

return profile_file

Expand Down

0 comments on commit 239c6ad

Please sign in to comment.