Skip to content

Commit

Permalink
Fix conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
alekstheod committed Jan 14, 2025
1 parent 68e1e20 commit afe115b
Show file tree
Hide file tree
Showing 14 changed files with 93 additions and 1,115 deletions.
30 changes: 5 additions & 25 deletions .bazelrc
Original file line number Diff line number Diff line change
Expand Up @@ -843,21 +843,13 @@ test:windows_x86_cpu_wheel_test --build_tests_only --config=windows_x86_cpu_pycp
test:linux_cpu_pycpp_test_filters --test_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
test:linux_cpu_pycpp_test_filters --build_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
test:linux_cpu_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
<<<<<<< HEAD
test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
=======
test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
>>>>>>> upstream/master
test:linux_cpu_pycpp_test --config=linux_cpu_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...

# LINUX CUDA PYCPP:
test:linux_cuda_pycpp_test_filters --test_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-benchmark-test,-v1only,gpu,-no_gpu,-no_gpu_presubmit,-no_cuda11
test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
<<<<<<< HEAD
test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
=======
test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_gpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...
>>>>>>> upstream/master
test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_gpu -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/...

# LINUX ARM64 PYCPP
# In Linux Arm64 presubmit/continuous build, we cross-compile the binaries on
Expand All @@ -872,11 +864,7 @@ build:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-tf_tosa,-no_aar
build:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-tf_tosa,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only
build:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3
# TODO(michaelhudgins): Why do we need to specifically omit go and java here?
<<<<<<< HEAD
build:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/python/tools:aot_compiled_test
=======
build:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/python/tools:aot_compiled_test
>>>>>>> upstream/master
build:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/python/tools:aot_compiled_test
# CROSS-COMPILE ARM64 PYCPP
build:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test
# Tests that fail only when cross-compiled
Expand All @@ -885,22 +873,14 @@ build:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantizat
test:macos_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64
test:macos_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64
test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium
<<<<<<< HEAD
test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test
=======
test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test
>>>>>>> upstream/master
test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... -//tensorflow/core/kernels/image:resize_bicubic_op_test
# MACOS X86 PYCPP
# These are defined as build configs so that we can run a build only job. See
# the note under "ARM64 PYCPP" for more details.
build:macos_x86_pycpp_test_filters --test_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test
build:macos_x86_pycpp_test_filters --build_tag_filters=-no_oss,-tf_tosa,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test
build:macos_x86_pycpp_test_filters --keep_going --test_lang_filters=cc,py --test_size_filters=small,medium
<<<<<<< HEAD
build:macos_x86_pycpp_test --config=macos_x86_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/...
=======
build:macos_x86_pycpp_test --config=macos_x86_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/...
>>>>>>> upstream/master
build:macos_x86_pycpp_test --config=macos_x86_pycpp_test_filters -- //tensorflow/... //tensorflow/tools/pip_package:import_api_packages_test_cpu -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/...
# CROSS-COMPILE MACOS X86 PYCPP
build:cross_compile_macos_x86_pycpp_test --config=macos_x86_pycpp_test
build:cross_compile_macos_x86_pycpp_test -//tensorflow/core/kernels:quantized_conv_ops_test -//tensorflow/core/kernels:quantized_matmul_op_test -//tensorflow/python/ops:quantized_conv_ops_test -//tensorflow/tools/graph_transforms:transforms_test -//tensorflow/python/tools:aot_compiled_test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,29 @@ limitations under the License.
#include <vector>

#include "llvm/Transforms/Utils/Cloning.h"
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h" // from @llvm-project
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project
#include "mlir/Target/LLVMIR/Export.h" // from @llvm-project
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/statusor.h"
#include "xla/debug_options_flags.h"
#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h"
#include "xla/service/gpu/gpu_asm_opts_util.h"
#include "xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h"
#include "xla/service/gpu/target_constants.h"
#include "xla/stream_executor/device_description.h"
#include "xla/xla.pb.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/path.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/statusor.h"

#if GOOGLE_CUDA
#include "xla/service/gpu/llvm_gpu_backend/nvptx_backend.h"
#include "xla/stream_executor/cuda/cuda_asm_compiler.h"
#elif TENSORFLOW_USE_ROCM
#include "xla/stream_executor/gpu/asm_compiler.h"
#include "tensorflow/core/platform/rocm_rocdl_path.h"
#include "xla/stream_executor/gpu/asm_compiler.h"
#endif

namespace mlir {
Expand Down
1 change: 0 additions & 1 deletion tensorflow/core/common_runtime/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ tf_cuda_library(
"//tensorflow/core/profiler/lib:scoped_memory_debug_annotation",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/strings",
"@local_xla//xla/stream_executor",
"@local_xla//xla/stream_executor/gpu:gpu_cudamallocasync_allocator",
"@local_xla//xla/stream_executor/gpu:gpu_init_impl",
"@local_xla//xla/tsl/framework:device_id_utils",
Expand Down
19 changes: 6 additions & 13 deletions tensorflow/core/common_runtime/gpu/gpu_device_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,21 +21,21 @@ limitations under the License.

#include "tensorflow/core/common_runtime/gpu/gpu_device.h"

#include "xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h"
#include "xla/stream_executor/gpu/gpu_init.h"
#include "xla/tests/test_macros.h"
#include "xla/tsl/framework/device_id.h"
#include "xla/tsl/lib/core/status_test_util.h"
#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/random.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/test.h"
#include "xla/stream_executor/gpu/gpu_cudamallocasync_allocator.h"
#include "xla/stream_executor/gpu/gpu_init.h"
#include "xla/tests/test_macros.h"
#include "xla/tsl/framework/device_id.h"
#include "xla/tsl/lib/core/status_test_util.h"

#ifdef TF_GPU_USE_PJRT
#include "xla/pjrt/pjrt_client.h"
#include "tensorflow/core/tfrt/common/pjrt_util.h"
#include "xla/pjrt/pjrt_client.h"
#endif // TF_GPU_USE_PJRT

#if GOOGLE_CUDA
Expand Down Expand Up @@ -201,17 +201,10 @@ TEST_F(GPUDeviceTest, CudaMallocAsync) {
EXPECT_EQ(status.code(), error::OK);
}

<<<<<<< HEAD
TEST_F(GPUDeviceTest, DISABLED_ON_GPU_ROCM(CudaMallocAsyncPreallocate)) {
#ifndef GOOGLE_CUDA
return;
#endif
=======
TEST_F(GPUDeviceTest, CudaMallocAsyncPreallocate) {
if (IsRocm()) {
GTEST_SKIP();
}
>>>>>>> upstream/master
SessionOptions opts = MakeSessionOptions("0", 0, 1, {}, {}, {}, 0,
/*use_cuda_malloc_async=*/true);
setenv("TF_CUDA_MALLOC_ASYNC_SUPPORTED_PREALLOC", "2048", 1);
Expand Down
36 changes: 8 additions & 28 deletions tensorflow/core/kernels/matmul_op_fused.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@ limitations under the License.
#include <vector>

#include "Eigen/Core" // from @eigen_archive
#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
Expand All @@ -51,14 +50,13 @@ limitations under the License.
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/util/matmul_autotune.h"
#include "tensorflow/core/util/tensor_format.h"
#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "xla/tsl/framework/contraction/eigen_contraction_kernel.h"
#endif

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "xla/stream_executor/gpu/redzone_allocator.h"
#include "xla/stream_executor/integrations/tf_allocator_adapter.h"
#include "tensorflow/core/kernels/conv_ops_gpu.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/kernels/matmul_op_impl.h"
Expand All @@ -71,6 +69,8 @@ limitations under the License.
#include "tensorflow/core/util/autotune_maps/conv_parameters.h"
#include "tensorflow/core/util/proto/proto_utils.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "xla/stream_executor/gpu/redzone_allocator.h"
#include "xla/stream_executor/integrations/tf_allocator_adapter.h"
#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {
Expand Down Expand Up @@ -202,7 +202,7 @@ namespace {
/*
Epilogues supported by hipBLASLt:
https://rocm.docs.amd.com/projects/hipBLASLt/en/latest/datatypes.html#hipblasltepilogue-t
*/
*/
StatusOr<se::gpu::BlasLt::Epilogue> GetBlasLtEpilogOp(
FusedComputationType fusion) {
if (fusion == FusedComputationType::kBiasAdd) {
Expand Down Expand Up @@ -484,12 +484,6 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
#if !(GOOGLE_CUDA || TF_HIPBLASLT)
use_cudnn = true;
#endif
const auto& cc = stream->parent()->GetDeviceDescription().
gpu_compute_capability();
if (auto *procm = std::get_if< se::RocmComputeCapability >(&cc)) {
use_cudnn = !procm->gfx9_mi200_or_later();
}

// use_cudnn is for fusions that hipBLASLt doesn't support yet
switch (fusion) {
case FusedComputationType::kBiasAddWithGeluExact:
Expand Down Expand Up @@ -525,9 +519,6 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
default:
use_cudnn = false;
}
<<<<<<< HEAD

=======
#if !(GOOGLE_CUDA || TF_HIPBLASLT)
use_cudnn = true;
#endif
Expand All @@ -537,7 +528,6 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
if (auto* procm = std::get_if<se::RocmComputeCapability>(&cc)) {
use_cudnn = !procm->gfx9_mi200_or_later();
}
>>>>>>> upstream/master
BlasScratchAllocator scratch_allocator(context);

// The Gelu exact fusion is supported by cuDNN.
Expand Down Expand Up @@ -607,11 +597,7 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
epilog_op};
absl::Mutex* pmu;
auto plan_and_algorithms_or =
<<<<<<< HEAD
BlasLtMatmulPlanCache::GetOrCreate(stream, matmul_params, &pmu);
=======
PlanAndAlgorithms::GetOrCreate(stream, matmul_params, &pmu);
>>>>>>> upstream/master
OP_REQUIRES_OK(context, plan_and_algorithms_or.status());
absl::MutexLock lock(pmu);
const auto& entry = *plan_and_algorithms_or.value();
Expand All @@ -621,15 +607,9 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
auto launch_func = [&](BlasScratchAllocator& scratch_allocator,
size_t alg_idx,
se::blas::ProfileResult* profile_result) {
<<<<<<< HEAD
return BlasLtMatmulPlanCache::ExecuteOnStream(
stream, entry, a_ptr, b_ptr, c_ptr, alg_idx,
scratch_allocator, bias_ptr, profile_result);
=======
return plan_and_algorithms->ExecuteOnStream(stream, a_ptr, b_ptr, c_ptr,
alg_idx, scratch_allocator,
bias_ptr, profile_result);
>>>>>>> upstream/master
return BlasLtMatmulPlanCache::ExecuteOnStream(
stream, entry, a_ptr, b_ptr, c_ptr, alg_idx, scratch_allocator,
bias_ptr, profile_result);
};

size_t alg_idx = 0;
Expand All @@ -641,7 +621,7 @@ struct LaunchFusedMatMulOp<GPUDevice, T> {
}

OP_REQUIRES_OK(context, launch_func(scratch_allocator, alg_idx, nullptr));
#endif // GOOGLE_CUDA || TF_HIPBLASLT
#endif // GOOGLE_CUDA || TF_HIPBLASLT
}
};

Expand Down
Loading

0 comments on commit afe115b

Please sign in to comment.