diff --git a/onnxruntime/contrib_ops/cuda/collective/distributed_expand.cc b/onnxruntime/contrib_ops/cuda/collective/distributed_expand.cc
index ec1826d1eabd2..3cfa3ab959343 100644
--- a/onnxruntime/contrib_ops/cuda/collective/distributed_expand.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/distributed_expand.cc
@@ -51,10 +51,11 @@ Status DistributedExpand<T>::ComputeInternal(OpKernelContext* context) const {
   TensorShapeVector original_output_dims{p_shape, p_shape + shape_tensor->Shape().Size()};
   TensorShape original_output_shape(original_output_dims);
   ORT_ENFORCE(
-      onnxruntime::cuda::ComputeOutputShape(
-          Node().Name(),
-          original_input_shape,
-          original_output_dims, original_output_shape).IsOK());
+      onnxruntime::cuda::ComputeOutputShape(
+          Node().Name(),
+          original_input_shape,
+          original_output_dims, original_output_shape)
+          .IsOK());
 
   // Compute local output shape.
   const auto local_output_shape = ComputeShardShape(original_output_shape, output_sharding_spec);
@@ -62,11 +63,11 @@ Status DistributedExpand<T>::ComputeInternal(OpKernelContext* context) const {
   auto output_tensor = context->Output(0, local_output_shape);
 
   return FuncExpand(
-      this,
-      context,
-      input_tensor,
-      shape_tensor,
-      output_tensor);
+      this,
+      context,
+      input_tensor,
+      shape_tensor,
+      output_tensor);
 }
 
 ONNX_OPERATOR_TYPED_KERNEL_EX(
diff --git a/onnxruntime/core/providers/cuda/tensor/expand.cc b/onnxruntime/core/providers/cuda/tensor/expand.cc
index 368c167f58641..806ecfa1aab17 100644
--- a/onnxruntime/core/providers/cuda/tensor/expand.cc
+++ b/onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -148,7 +148,6 @@ Status FuncExpand(
     const Tensor* input_data_tensor,
     const Tensor* /*input_shape_tensor*/,
     Tensor* output_tensor) {
-
   TensorShape output_shape = output_tensor->Shape();
 
 #ifdef ENABLE_STRIDED_TENSORS
@@ -203,10 +202,11 @@ std::unique_ptr<Tensor> FuncExpand(
 
   TensorShape output_shape(output_dims);
   ORT_ENFORCE(
-      ComputeOutputShape(
-          cuda_kernel->Node().Name(),
-          input_data_tensor->Shape(),
-          output_dims, output_shape).IsOK());
+      ComputeOutputShape(
+          cuda_kernel->Node().Name(),
+          input_data_tensor->Shape(),
+          output_dims, output_shape)
+          .IsOK());
 
   // Pre-allocate output.
   AllocatorPtr alloc;